Merge branch 'shadps4-emu:main' into fix-tracy

Stephen Miller 2025-07-03 16:26:45 -05:00 committed by GitHub
commit 688ee12a07
GPG Key ID: B5690EEEBB952194
37 changed files with 740 additions and 352 deletions

View File

@ -689,6 +689,7 @@ set(COMMON src/common/logging/backend.cpp
src/common/recursive_lock.cpp
src/common/recursive_lock.h
src/common/sha1.h
src/common/shared_first_mutex.h
src/common/signal_context.h
src/common/signal_context.cpp
src/common/singleton.h

View File

@ -51,6 +51,8 @@ static bool isShowSplash = false;
static std::string isSideTrophy = "right";
static bool isNullGpu = false;
static bool shouldCopyGPUBuffers = false;
static bool readbacksEnabled = false;
static bool directMemoryAccessEnabled = false;
static bool shouldDumpShaders = false;
static bool shouldPatchShaders = true;
static u32 vblankDivider = 1;
@ -240,6 +242,14 @@ bool copyGPUCmdBuffers() {
return shouldCopyGPUBuffers;
}
bool readbacks() {
return readbacksEnabled;
}
bool directMemoryAccess() {
return directMemoryAccessEnabled;
}
bool dumpShaders() {
return shouldDumpShaders;
}
@ -344,6 +354,14 @@ void setCopyGPUCmdBuffers(bool enable) {
shouldCopyGPUBuffers = enable;
}
void setReadbacks(bool enable) {
readbacksEnabled = enable;
}
void setDirectMemoryAccess(bool enable) {
directMemoryAccessEnabled = enable;
}
void setDumpShaders(bool enable) {
shouldDumpShaders = enable;
}
@ -586,6 +604,8 @@ void load(const std::filesystem::path& path) {
screenHeight = toml::find_or<int>(gpu, "screenHeight", screenHeight);
isNullGpu = toml::find_or<bool>(gpu, "nullGpu", false);
shouldCopyGPUBuffers = toml::find_or<bool>(gpu, "copyGPUBuffers", false);
readbacksEnabled = toml::find_or<bool>(gpu, "readbacks", false);
directMemoryAccessEnabled = toml::find_or<bool>(gpu, "directMemoryAccess", false);
shouldDumpShaders = toml::find_or<bool>(gpu, "dumpShaders", false);
shouldPatchShaders = toml::find_or<bool>(gpu, "patchShaders", true);
vblankDivider = toml::find_or<int>(gpu, "vblankDivider", 1);
@ -735,6 +755,8 @@ void save(const std::filesystem::path& path) {
data["GPU"]["screenHeight"] = screenHeight;
data["GPU"]["nullGpu"] = isNullGpu;
data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers;
data["GPU"]["readbacks"] = readbacksEnabled;
data["GPU"]["directMemoryAccess"] = directMemoryAccessEnabled;
data["GPU"]["dumpShaders"] = shouldDumpShaders;
data["GPU"]["patchShaders"] = shouldPatchShaders;
data["GPU"]["vblankDivider"] = vblankDivider;

View File

@ -45,6 +45,10 @@ bool nullGpu();
void setNullGpu(bool enable);
bool copyGPUCmdBuffers();
void setCopyGPUCmdBuffers(bool enable);
bool readbacks();
void setReadbacks(bool enable);
bool directMemoryAccess();
void setDirectMemoryAccess(bool enable);
bool dumpShaders();
void setDumpShaders(bool enable);
u32 vblankDiv();
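
The two new toggles follow the same getter/setter/TOML pattern as the existing GPU options. A minimal usage sketch (the ApplyGpuSettings call site is hypothetical; only the Config:: functions come from this diff):

#include "common/config.h"

// Hypothetical call site exercising the new accessors; load()/save() persist
// these under [GPU] as "readbacks" and "directMemoryAccess".
void ApplyGpuSettings(bool enable_readbacks, bool enable_dma) {
    Config::setReadbacks(enable_readbacks);
    Config::setDirectMemoryAccess(enable_dma);
    if (Config::readbacks()) {
        // GPU->CPU readbacks will be honored by the buffer cache.
    }
}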

View File

@ -0,0 +1,46 @@
// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <condition_variable>
#include <mutex>
namespace Common {
// Like std::shared_mutex, but readers have priority over writers.
class SharedFirstMutex {
public:
void lock() {
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return !writer_active && readers == 0; });
writer_active = true;
}
void unlock() {
std::lock_guard<std::mutex> lock(mtx);
writer_active = false;
cv.notify_all();
}
void lock_shared() {
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return !writer_active; });
++readers;
}
void unlock_shared() {
std::lock_guard<std::mutex> lock(mtx);
if (--readers == 0) {
cv.notify_all();
}
}
private:
std::mutex mtx;
std::condition_variable cv;
int readers = 0;
bool writer_active = false;
};
} // namespace Common
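
Because the class exposes the lock()/unlock() and lock_shared()/unlock_shared() pairs, it composes with the standard lock wrappers. A minimal usage sketch (the Reader/Writer functions are hypothetical); note that reader priority means a steady stream of readers can starve a writer, which is the trade-off this type opts into:

#include <mutex>        // std::unique_lock
#include <shared_mutex> // std::shared_lock
#include "common/shared_first_mutex.h"

Common::SharedFirstMutex mutex;
int shared_state = 0;

void Reader() {
    std::shared_lock lk(mutex); // lock_shared(): blocks only while a writer holds the lock
    (void)shared_state;
}

void Writer() {
    std::unique_lock lk(mutex); // lock(): waits until no writer and no readers are active
    ++shared_state;
}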

View File

@ -302,14 +302,15 @@ struct AddressSpace::Impl {
new_flags = PAGE_READWRITE;
} else if (read && !write) {
new_flags = PAGE_READONLY;
} else if (execute && !read && not write) {
} else if (execute && !read && !write) {
new_flags = PAGE_EXECUTE;
} else if (!read && !write && !execute) {
new_flags = PAGE_NOACCESS;
} else {
LOG_CRITICAL(Common_Memory,
"Unsupported protection flag combination for address {:#x}, size {}",
virtual_addr, size);
"Unsupported protection flag combination for address {:#x}, size {}, "
"read={}, write={}, execute={}",
virtual_addr, size, read, write, execute);
return;
}

View File

@ -11,6 +11,7 @@
namespace Core {
enum class MemoryPermission : u32 {
None = 0,
Read = 1 << 0,
Write = 1 << 1,
ReadWrite = Read | Write,

View File

@ -2834,7 +2834,7 @@ void RegisterlibSceGnmDriver(Core::Loader::SymbolsResolver* sym) {
}
if (Config::copyGPUCmdBuffers()) {
liverpool->reserveCopyBufferSpace();
liverpool->ReserveCopyBufferSpace();
}
Platform::IrqC::Instance()->Register(Platform::InterruptId::GpuIdle, ResetSubmissionLock,

View File

@ -132,6 +132,8 @@ void Emulator::Run(std::filesystem::path file, const std::vector<std::string> ar
LOG_INFO(Config, "General LogType: {}", Config::getLogType());
LOG_INFO(Config, "General isNeo: {}", Config::isNeoModeConsole());
LOG_INFO(Config, "GPU isNullGpu: {}", Config::nullGpu());
LOG_INFO(Config, "GPU readbacks: {}", Config::readbacks());
LOG_INFO(Config, "GPU directMemoryAccess: {}", Config::directMemoryAccess());
LOG_INFO(Config, "GPU shouldDumpShaders: {}", Config::dumpShaders());
LOG_INFO(Config, "GPU vblankDivider: {}", Config::vblankDiv());
LOG_INFO(Config, "Vulkan gpuId: {}", Config::getGpuId());

View File

@ -200,10 +200,18 @@ Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin);
}
Id EmitBufferAtomicSMin64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin);
}
Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicUMin);
}
Id EmitBufferAtomicUMin64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicUMin);
}
Id EmitBufferAtomicFMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
if (ctx.profile.supports_buffer_fp32_atomic_min_max) {
return BufferAtomicU32<true>(ctx, inst, handle, address, value,

View File

@ -2,6 +2,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/assert.h"
#include "common/config.h"
#include "common/logging/log.h"
#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h"
#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
@ -167,6 +168,9 @@ using PointerSize = EmitContext::PointerSize;
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) {
const u32 flatbuf_off_dw = inst->Flags<u32>();
if (!Config::directMemoryAccess()) {
return ctx.EmitFlatbufferLoad(ctx.ConstU32(flatbuf_off_dw));
}
// We can only provide a fallback for immediate offsets.
if (flatbuf_off_dw == 0) {
return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);

View File

@ -91,7 +91,9 @@ Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicISub32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicSMin64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicUMin64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicFMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicSMax64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
@ -406,14 +408,20 @@ Id EmitULessThan32(EmitContext& ctx, Id lhs, Id rhs);
Id EmitULessThan64(EmitContext& ctx, Id lhs, Id rhs);
Id EmitIEqual32(EmitContext& ctx, Id lhs, Id rhs);
Id EmitIEqual64(EmitContext& ctx, Id lhs, Id rhs);
Id EmitSLessThanEqual(EmitContext& ctx, Id lhs, Id rhs);
Id EmitULessThanEqual(EmitContext& ctx, Id lhs, Id rhs);
Id EmitSGreaterThan(EmitContext& ctx, Id lhs, Id rhs);
Id EmitUGreaterThan(EmitContext& ctx, Id lhs, Id rhs);
Id EmitSLessThanEqual32(EmitContext& ctx, Id lhs, Id rhs);
Id EmitSLessThanEqual64(EmitContext& ctx, Id lhs, Id rhs);
Id EmitULessThanEqual32(EmitContext& ctx, Id lhs, Id rhs);
Id EmitULessThanEqual64(EmitContext& ctx, Id lhs, Id rhs);
Id EmitSGreaterThan32(EmitContext& ctx, Id lhs, Id rhs);
Id EmitSGreaterThan64(EmitContext& ctx, Id lhs, Id rhs);
Id EmitUGreaterThan32(EmitContext& ctx, Id lhs, Id rhs);
Id EmitUGreaterThan64(EmitContext& ctx, Id lhs, Id rhs);
Id EmitINotEqual32(EmitContext& ctx, Id lhs, Id rhs);
Id EmitINotEqual64(EmitContext& ctx, Id lhs, Id rhs);
Id EmitSGreaterThanEqual(EmitContext& ctx, Id lhs, Id rhs);
Id EmitUGreaterThanEqual(EmitContext& ctx, Id lhs, Id rhs);
Id EmitSGreaterThanEqual32(EmitContext& ctx, Id lhs, Id rhs);
Id EmitSGreaterThanEqual64(EmitContext& ctx, Id lhs, Id rhs);
Id EmitUGreaterThanEqual32(EmitContext& ctx, Id lhs, Id rhs);
Id EmitUGreaterThanEqual64(EmitContext& ctx, Id lhs, Id rhs);
Id EmitLogicalOr(EmitContext& ctx, Id a, Id b);
Id EmitLogicalAnd(EmitContext& ctx, Id a, Id b);
Id EmitLogicalXor(EmitContext& ctx, Id a, Id b);

View File

@ -371,19 +371,35 @@ Id EmitIEqual64(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpIEqual(ctx.U1[1], lhs, rhs);
}
Id EmitSLessThanEqual(EmitContext& ctx, Id lhs, Id rhs) {
Id EmitSLessThanEqual32(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpSLessThanEqual(ctx.U1[1], lhs, rhs);
}
Id EmitULessThanEqual(EmitContext& ctx, Id lhs, Id rhs) {
Id EmitSLessThanEqual64(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpSLessThanEqual(ctx.U1[1], lhs, rhs);
}
Id EmitULessThanEqual32(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpULessThanEqual(ctx.U1[1], lhs, rhs);
}
Id EmitSGreaterThan(EmitContext& ctx, Id lhs, Id rhs) {
Id EmitULessThanEqual64(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpULessThanEqual(ctx.U1[1], lhs, rhs);
}
Id EmitSGreaterThan32(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpSGreaterThan(ctx.U1[1], lhs, rhs);
}
Id EmitUGreaterThan(EmitContext& ctx, Id lhs, Id rhs) {
Id EmitSGreaterThan64(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpSGreaterThan(ctx.U1[1], lhs, rhs);
}
Id EmitUGreaterThan32(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpUGreaterThan(ctx.U1[1], lhs, rhs);
}
Id EmitUGreaterThan64(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpUGreaterThan(ctx.U1[1], lhs, rhs);
}
@ -395,11 +411,19 @@ Id EmitINotEqual64(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpINotEqual(ctx.U1[1], lhs, rhs);
}
Id EmitSGreaterThanEqual(EmitContext& ctx, Id lhs, Id rhs) {
Id EmitSGreaterThanEqual32(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpSGreaterThanEqual(ctx.U1[1], lhs, rhs);
}
Id EmitUGreaterThanEqual(EmitContext& ctx, Id lhs, Id rhs) {
Id EmitSGreaterThanEqual64(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpSGreaterThanEqual(ctx.U1[1], lhs, rhs);
}
Id EmitUGreaterThanEqual32(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpUGreaterThanEqual(ctx.U1[1], lhs, rhs);
}
Id EmitUGreaterThanEqual64(EmitContext& ctx, Id lhs, Id rhs) {
return ctx.OpUGreaterThanEqual(ctx.U1[1], lhs, rhs);
}

View File

@ -784,19 +784,6 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
};
void EmitContext::DefineBuffers() {
if (!profile.supports_robust_buffer_access && !info.uses_dma) {
// In case Flatbuf has not already been bound by IR and is needed
// to query buffer sizes, bind it now.
info.buffers.push_back({
.used_types = IR::Type::U32,
// We can't guarantee that flatbuf will not grow past UBO
// limit if there are a lot of ReadConsts. (We could specialize)
.inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
.buffer_type = BufferType::Flatbuf,
});
// In the future we may want to read buffer sizes from GPU memory if available.
// info.readconst_types |= Info::ReadConstType::Immediate;
}
for (const auto& desc : info.buffers) {
const auto buf_sharp = desc.GetSharp(info);
const bool is_storage = desc.IsStorage(buf_sharp, profile);
@ -1219,14 +1206,7 @@ Id EmitContext::DefineReadConst(bool dynamic) {
if (dynamic) {
return u32_zero_value;
} else {
const auto& flatbuf_buffer{buffers[flatbuf_index]};
ASSERT(flatbuf_buffer.binding >= 0 &&
flatbuf_buffer.buffer_type == BufferType::Flatbuf);
const auto [flatbuf_buffer_id, flatbuf_pointer_type] =
flatbuf_buffer.Alias(PointerType::U32);
const auto ptr{OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value,
flatbuf_offset)};
return OpLoad(U32[1], ptr);
return EmitFlatbufferLoad(flatbuf_offset);
}
});

View File

@ -180,6 +180,16 @@ public:
return OpAccessChain(result_type, shared_mem, index);
}
Id EmitFlatbufferLoad(Id flatbuf_offset) {
const auto& flatbuf_buffer{buffers[flatbuf_index]};
ASSERT(flatbuf_buffer.binding >= 0 && flatbuf_buffer.buffer_type == BufferType::Flatbuf);
const auto [flatbuf_buffer_id, flatbuf_pointer_type] =
flatbuf_buffer.aliases[u32(PointerType::U32)];
const auto ptr{
OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value, flatbuf_offset)};
return OpLoad(U32[1], ptr);
}
Info& info;
const RuntimeInfo& runtime_info;
const Profile& profile;

View File

@ -20,7 +20,7 @@ namespace Shader::Gcn {
enum class ConditionOp : u32 {
F,
EQ,
LG,
LG, // NE
GT,
GE,
LT,
@ -230,7 +230,7 @@ public:
// VOPC
void V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst);
void V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst);
void V_CMP_NE_U64(const GcnInst& inst);
void V_CMP_U64(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst);
void V_CMP_CLASS_F32(const GcnInst& inst);
// VOP3a

View File

@ -327,8 +327,10 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
return V_CMP_U32(ConditionOp::TRU, false, true, inst);
// V_CMP_{OP8}_U64
case Opcode::V_CMP_EQ_U64:
return V_CMP_U64(ConditionOp::EQ, false, false, inst);
case Opcode::V_CMP_NE_U64:
return V_CMP_NE_U64(inst);
return V_CMP_U64(ConditionOp::LG, false, false, inst);
case Opcode::V_CMP_CLASS_F32:
return V_CMP_CLASS_F32(inst);
@ -556,27 +558,31 @@ void Translator::V_BCNT_U32_B32(const GcnInst& inst) {
void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) {
if (!is_low) {
// v_mbcnt_hi_u32_b32 v2, -1, 0
// v_mbcnt_hi_u32_b32 vX, -1, 0
if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193 &&
inst.src[1].field == OperandField::ConstZero) {
return;
}
// v_mbcnt_hi_u32_b32 vX, exec_hi, 0
if (inst.src[0].field == OperandField::ExecHi &&
inst.src[1].field == OperandField::ConstZero) {
return;
// v_mbcnt_hi_u32_b32 vX, exec_hi, 0/vZ
if ((inst.src[0].field == OperandField::ExecHi ||
inst.src[0].field == OperandField::VccHi) &&
(inst.src[1].field == OperandField::ConstZero ||
inst.src[1].field == OperandField::VectorGPR)) {
return SetDst(inst.dst[0], GetSrc(inst.src[1]));
}
UNREACHABLE();
} else {
// v_mbcnt_lo_u32_b32 v2, -1, vX
// v_mbcnt_lo_u32_b32 vY, -1, vX
// used in combination with the above to fetch the lane id in non-compute stages
if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193) {
SetDst(inst.dst[0], ir.LaneId());
return SetDst(inst.dst[0], ir.LaneId());
}
// v_mbcnt_lo_u32_b32 v20, exec_lo, vX
// used combined in above for append buffer indexing.
if (inst.src[0].field == OperandField::ExecLo) {
SetDst(inst.dst[0], ir.Imm32(0));
// v_mbcnt_lo_u32_b32 vY, exec_lo, vX
// used in combination with the above for append buffer indexing.
if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo) {
return SetDst(inst.dst[0], GetSrc(inst.src[1]));
}
UNREACHABLE();
}
}
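
For context, the sequences being pattern-matched are the standard GCN idiom for fetching a lane id: mbcnt counts the set bits of a 64-bit mask at positions below the current lane, so with an all-ones mask and a zero addend the result is the lane index. A self-contained model (illustrative, not from the diff):

#include <bit>
#include <cstdint>

// What v_mbcnt_lo + v_mbcnt_hi compute together, for lane < 64:
// the number of set bits in `mask` strictly below this lane's position.
uint32_t MbcntLaneId(uint64_t mask, uint32_t lane) {
    const uint64_t below = mask & ((uint64_t{1} << lane) - 1);
    return static_cast<uint32_t>(std::popcount(below)); // == lane for mask == ~0ull
}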
@ -996,39 +1002,32 @@ void Translator::V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const
}
}
void Translator::V_CMP_NE_U64(const GcnInst& inst) {
const auto get_src = [&](const InstOperand& operand) {
switch (operand.field) {
case OperandField::VccLo:
return ir.GetVcc();
case OperandField::ExecLo:
return ir.GetExec();
case OperandField::ScalarGPR:
return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code));
case OperandField::ConstZero:
return ir.Imm1(false);
void Translator::V_CMP_U64(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst) {
const IR::U64 src0{GetSrc64(inst.src[0])};
const IR::U64 src1{GetSrc64(inst.src[1])};
const IR::U1 result = [&] {
switch (op) {
case ConditionOp::EQ:
return ir.IEqual(src0, src1);
case ConditionOp::LG: // NE
return ir.INotEqual(src0, src1);
default:
UNREACHABLE();
UNREACHABLE_MSG("Unsupported V_CMP_U64 condition operation: {}", u32(op));
}
};
const IR::U1 src0{get_src(inst.src[0])};
auto op = [&inst, this](auto x) {
switch (inst.src[1].field) {
case OperandField::ConstZero:
return x;
case OperandField::SignedConstIntNeg:
return ir.LogicalNot(x);
default:
UNREACHABLE_MSG("unhandled V_CMP_NE_U64 source argument {}", u32(inst.src[1].field));
}
};
}();
if (is_signed) {
UNREACHABLE_MSG("V_CMP_U64 with signed integers is not supported");
}
if (set_exec) {
UNREACHABLE_MSG("Exec setting for V_CMP_U64 is not supported");
}
switch (inst.dst[1].field) {
case OperandField::VccLo:
ir.SetVcc(op(src0));
break;
return ir.SetVcc(result);
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), op(src0));
break;
return ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), result);
default:
UNREACHABLE();
}

View File

@ -74,8 +74,12 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
return BUFFER_ATOMIC(AtomicOp::CmpSwap, inst);
case Opcode::BUFFER_ATOMIC_SMIN:
return BUFFER_ATOMIC(AtomicOp::Smin, inst);
case Opcode::BUFFER_ATOMIC_SMIN_X2:
return BUFFER_ATOMIC<IR::U64>(AtomicOp::Smin, inst);
case Opcode::BUFFER_ATOMIC_UMIN:
return BUFFER_ATOMIC(AtomicOp::Umin, inst);
case Opcode::BUFFER_ATOMIC_UMIN_X2:
return BUFFER_ATOMIC<IR::U64>(AtomicOp::Umin, inst);
case Opcode::BUFFER_ATOMIC_SMAX:
return BUFFER_ATOMIC(AtomicOp::Smax, inst);
case Opcode::BUFFER_ATOMIC_SMAX_X2:

View File

@ -500,8 +500,16 @@ Value IREmitter::BufferAtomicISub(const Value& handle, const Value& address, con
Value IREmitter::BufferAtomicIMin(const Value& handle, const Value& address, const Value& value,
bool is_signed, BufferInstInfo info) {
return is_signed ? Inst(Opcode::BufferAtomicSMin32, Flags{info}, handle, address, value)
: Inst(Opcode::BufferAtomicUMin32, Flags{info}, handle, address, value);
switch (value.Type()) {
case Type::U32:
return is_signed ? Inst(Opcode::BufferAtomicSMin32, Flags{info}, handle, address, value)
: Inst(Opcode::BufferAtomicUMin32, Flags{info}, handle, address, value);
case Type::U64:
return is_signed ? Inst(Opcode::BufferAtomicSMin64, Flags{info}, handle, address, value)
: Inst(Opcode::BufferAtomicUMin64, Flags{info}, handle, address, value);
default:
ThrowInvalidType(value.Type());
}
}
Value IREmitter::BufferAtomicFMin(const Value& handle, const Value& address, const Value& value,
@ -1712,12 +1720,32 @@ U1 IREmitter::IEqual(const U32U64& lhs, const U32U64& rhs) {
}
}
U1 IREmitter::ILessThanEqual(const U32& lhs, const U32& rhs, bool is_signed) {
return Inst<U1>(is_signed ? Opcode::SLessThanEqual : Opcode::ULessThanEqual, lhs, rhs);
U1 IREmitter::ILessThanEqual(const U32U64& lhs, const U32U64& rhs, bool is_signed) {
if (lhs.Type() != rhs.Type()) {
UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type());
}
switch (lhs.Type()) {
case Type::U32:
return Inst<U1>(is_signed ? Opcode::SLessThanEqual32 : Opcode::ULessThanEqual32, lhs, rhs);
case Type::U64:
return Inst<U1>(is_signed ? Opcode::SLessThanEqual64 : Opcode::ULessThanEqual64, lhs, rhs);
default:
ThrowInvalidType(lhs.Type());
}
}
U1 IREmitter::IGreaterThan(const U32& lhs, const U32& rhs, bool is_signed) {
return Inst<U1>(is_signed ? Opcode::SGreaterThan : Opcode::UGreaterThan, lhs, rhs);
U1 IREmitter::IGreaterThan(const U32U64& lhs, const U32U64& rhs, bool is_signed) {
if (lhs.Type() != rhs.Type()) {
UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type());
}
switch (lhs.Type()) {
case Type::U32:
return Inst<U1>(is_signed ? Opcode::SGreaterThan32 : Opcode::UGreaterThan32, lhs, rhs);
case Type::U64:
return Inst<U1>(is_signed ? Opcode::SGreaterThan64 : Opcode::UGreaterThan64, lhs, rhs);
default:
ThrowInvalidType(lhs.Type());
}
}
U1 IREmitter::INotEqual(const U32U64& lhs, const U32U64& rhs) {
@ -1734,8 +1762,20 @@ U1 IREmitter::INotEqual(const U32U64& lhs, const U32U64& rhs) {
}
}
U1 IREmitter::IGreaterThanEqual(const U32& lhs, const U32& rhs, bool is_signed) {
return Inst<U1>(is_signed ? Opcode::SGreaterThanEqual : Opcode::UGreaterThanEqual, lhs, rhs);
U1 IREmitter::IGreaterThanEqual(const U32U64& lhs, const U32U64& rhs, bool is_signed) {
if (lhs.Type() != rhs.Type()) {
UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type());
}
switch (lhs.Type()) {
case Type::U32:
return Inst<U1>(is_signed ? Opcode::SGreaterThanEqual32 : Opcode::UGreaterThanEqual32, lhs,
rhs);
case Type::U64:
return Inst<U1>(is_signed ? Opcode::SGreaterThanEqual64 : Opcode::UGreaterThanEqual64, lhs,
rhs);
default:
ThrowInvalidType(lhs.Type());
}
}
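
With the opcodes split by width, the comparison helpers accept U32U64 and dispatch on the operand type. A hedged fragment in the style of the V_CMP_U64 translator above (surrounding translator context assumed):

// Hypothetical fragment: a 64-bit unsigned compare now lowers to
// UGreaterThanEqual64 instead of hitting an assert on the operand type.
const IR::U64 lhs{GetSrc64(inst.src[0])};
const IR::U64 rhs{GetSrc64(inst.src[1])};
const IR::U1 result = ir.IGreaterThanEqual(lhs, rhs, /*is_signed=*/false);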
U1 IREmitter::LogicalOr(const U1& a, const U1& b) {

View File

@ -299,10 +299,10 @@ public:
[[nodiscard]] U1 ILessThan(const U32U64& lhs, const U32U64& rhs, bool is_signed);
[[nodiscard]] U1 IEqual(const U32U64& lhs, const U32U64& rhs);
[[nodiscard]] U1 ILessThanEqual(const U32& lhs, const U32& rhs, bool is_signed);
[[nodiscard]] U1 IGreaterThan(const U32& lhs, const U32& rhs, bool is_signed);
[[nodiscard]] U1 ILessThanEqual(const U32U64& lhs, const U32U64& rhs, bool is_signed);
[[nodiscard]] U1 IGreaterThan(const U32U64& lhs, const U32U64& rhs, bool is_signed);
[[nodiscard]] U1 INotEqual(const U32U64& lhs, const U32U64& rhs);
[[nodiscard]] U1 IGreaterThanEqual(const U32& lhs, const U32& rhs, bool is_signed);
[[nodiscard]] U1 IGreaterThanEqual(const U32U64& lhs, const U32U64& rhs, bool is_signed);
[[nodiscard]] U1 LogicalOr(const U1& a, const U1& b);
[[nodiscard]] U1 LogicalAnd(const U1& a, const U1& b);

View File

@ -70,7 +70,9 @@ bool Inst::MayHaveSideEffects() const noexcept {
case Opcode::BufferAtomicIAdd64:
case Opcode::BufferAtomicISub32:
case Opcode::BufferAtomicSMin32:
case Opcode::BufferAtomicSMin64:
case Opcode::BufferAtomicUMin32:
case Opcode::BufferAtomicUMin64:
case Opcode::BufferAtomicFMin32:
case Opcode::BufferAtomicSMax32:
case Opcode::BufferAtomicSMax64:

View File

@ -124,7 +124,9 @@ OPCODE(BufferAtomicIAdd32, U32, Opaq
OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 )
OPCODE(BufferAtomicISub32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicSMin64, U64, Opaque, Opaque, U64 )
OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicUMin64, U64, Opaque, Opaque, U64 )
OPCODE(BufferAtomicFMin32, U32, Opaque, Opaque, F32 )
OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicSMax64, U64, Opaque, Opaque, U64 )
@ -382,14 +384,20 @@ OPCODE(ULessThan32, U1, U32,
OPCODE(ULessThan64, U1, U64, U64, )
OPCODE(IEqual32, U1, U32, U32, )
OPCODE(IEqual64, U1, U64, U64, )
OPCODE(SLessThanEqual, U1, U32, U32, )
OPCODE(ULessThanEqual, U1, U32, U32, )
OPCODE(SGreaterThan, U1, U32, U32, )
OPCODE(UGreaterThan, U1, U32, U32, )
OPCODE(SLessThanEqual32, U1, U32, U32, )
OPCODE(SLessThanEqual64, U1, U64, U64, )
OPCODE(ULessThanEqual32, U1, U32, U32, )
OPCODE(ULessThanEqual64, U1, U64, U64, )
OPCODE(SGreaterThan32, U1, U32, U32, )
OPCODE(SGreaterThan64, U1, U64, U64, )
OPCODE(UGreaterThan32, U1, U32, U32, )
OPCODE(UGreaterThan64, U1, U64, U64, )
OPCODE(INotEqual32, U1, U32, U32, )
OPCODE(INotEqual64, U1, U64, U64, )
OPCODE(SGreaterThanEqual, U1, U32, U32, )
OPCODE(UGreaterThanEqual, U1, U32, U32, )
OPCODE(SGreaterThanEqual32, U1, U32, U32, )
OPCODE(SGreaterThanEqual64, U1, U64, U64, )
OPCODE(UGreaterThanEqual32, U1, U32, U32, )
OPCODE(UGreaterThanEqual64, U1, U64, U64, )
// Logical operations
OPCODE(LogicalOr, U1, U1, U1, )

View File

@ -381,24 +381,42 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
case IR::Opcode::ULessThan64:
FoldWhenAllImmediates(inst, [](u64 a, u64 b) { return a < b; });
return;
case IR::Opcode::SLessThanEqual:
case IR::Opcode::SLessThanEqual32:
FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a <= b; });
return;
case IR::Opcode::ULessThanEqual:
case IR::Opcode::SLessThanEqual64:
FoldWhenAllImmediates(inst, [](s64 a, s64 b) { return a <= b; });
return;
case IR::Opcode::ULessThanEqual32:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a <= b; });
return;
case IR::Opcode::SGreaterThan:
case IR::Opcode::ULessThanEqual64:
FoldWhenAllImmediates(inst, [](u64 a, u64 b) { return a <= b; });
return;
case IR::Opcode::SGreaterThan32:
FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a > b; });
return;
case IR::Opcode::UGreaterThan:
case IR::Opcode::SGreaterThan64:
FoldWhenAllImmediates(inst, [](s64 a, s64 b) { return a > b; });
return;
case IR::Opcode::UGreaterThan32:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a > b; });
return;
case IR::Opcode::SGreaterThanEqual:
case IR::Opcode::UGreaterThan64:
FoldWhenAllImmediates(inst, [](u64 a, u64 b) { return a > b; });
return;
case IR::Opcode::SGreaterThanEqual32:
FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a >= b; });
return;
case IR::Opcode::UGreaterThanEqual:
case IR::Opcode::SGreaterThanEqual64:
FoldWhenAllImmediates(inst, [](s64 a, s64 b) { return a >= b; });
return;
case IR::Opcode::UGreaterThanEqual32:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a >= b; });
return;
case IR::Opcode::UGreaterThanEqual64:
FoldWhenAllImmediates(inst, [](u64 a, u64 b) { return a >= b; });
return;
case IR::Opcode::IEqual32:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a == b; });
return;

View File

@ -19,7 +19,7 @@ void ConstantPropagationPass(IR::BlockList& program);
void FlattenExtendedUserdataPass(IR::Program& program);
void ReadLaneEliminationPass(IR::Program& program);
void ResourceTrackingPass(IR::Program& program);
void CollectShaderInfoPass(IR::Program& program);
void CollectShaderInfoPass(IR::Program& program, const Profile& profile);
void LowerBufferFormatToRaw(IR::Program& program);
void LowerFp64ToFp32(IR::Program& program);
void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info);

View File

@ -20,7 +20,9 @@ bool IsBufferAtomic(const IR::Inst& inst) {
case IR::Opcode::BufferAtomicIAdd64:
case IR::Opcode::BufferAtomicISub32:
case IR::Opcode::BufferAtomicSMin32:
case IR::Opcode::BufferAtomicSMin64:
case IR::Opcode::BufferAtomicUMin32:
case IR::Opcode::BufferAtomicUMin64:
case IR::Opcode::BufferAtomicFMin32:
case IR::Opcode::BufferAtomicSMax32:
case IR::Opcode::BufferAtomicSMax64:
@ -97,6 +99,10 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
case IR::Opcode::LoadBufferU64:
case IR::Opcode::StoreBufferU64:
case IR::Opcode::BufferAtomicIAdd64:
case IR::Opcode::BufferAtomicSMax64:
case IR::Opcode::BufferAtomicSMin64:
case IR::Opcode::BufferAtomicUMax64:
case IR::Opcode::BufferAtomicUMin64:
return IR::Type::U64;
case IR::Opcode::LoadBufferFormatF32:
case IR::Opcode::StoreBufferFormatF32:
@ -118,6 +124,10 @@ u32 BufferAddressShift(const IR::Inst& inst, AmdGpu::DataFormat data_format) {
case IR::Opcode::LoadBufferU64:
case IR::Opcode::StoreBufferU64:
case IR::Opcode::BufferAtomicIAdd64:
case IR::Opcode::BufferAtomicSMax64:
case IR::Opcode::BufferAtomicSMin64:
case IR::Opcode::BufferAtomicUMax64:
case IR::Opcode::BufferAtomicUMin64:
return 3;
case IR::Opcode::LoadBufferFormatF32:
case IR::Opcode::StoreBufferFormatF32: {

View File

@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/config.h"
#include "shader_recompiler/ir/program.h"
#include "video_core/buffer_cache/buffer_cache.h"
@ -102,7 +103,9 @@ void Visit(Info& info, const IR::Inst& inst) {
break;
case IR::Opcode::BufferAtomicIAdd64:
case IR::Opcode::BufferAtomicSMax64:
case IR::Opcode::BufferAtomicSMin64:
case IR::Opcode::BufferAtomicUMax64:
case IR::Opcode::BufferAtomicUMin64:
info.uses_buffer_int64_atomics = true;
break;
case IR::Opcode::LaneId:
@ -136,7 +139,7 @@ void Visit(Info& info, const IR::Inst& inst) {
}
}
void CollectShaderInfoPass(IR::Program& program) {
void CollectShaderInfoPass(IR::Program& program, const Profile& profile) {
auto& info = program.info;
for (IR::Block* const block : program.post_order_blocks) {
for (IR::Inst& inst : block->Instructions()) {
@ -144,6 +147,25 @@ void CollectShaderInfoPass(IR::Program& program) {
}
}
// In case Flatbuf has not already been bound by IR and is needed
// to query buffer sizes, bind it now.
if (!profile.supports_robust_buffer_access && !info.uses_dma) {
info.buffers.push_back({
.used_types = IR::Type::U32,
// We can't guarantee that flatbuf will not grow past UBO
// limit if there are a lot of ReadConsts. (We could specialize)
.inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
.buffer_type = BufferType::Flatbuf,
});
// In the future we may want to read buffer sizes from GPU memory if available.
// info.readconst_types |= Info::ReadConstType::Immediate;
}
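// With direct memory access disabled in config, drop any DMA usage the pass
// collected so the DMA support buffer below is never bound.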
if (!Config::directMemoryAccess()) {
info.uses_dma = false;
info.readconst_types = Info::ReadConstType::None;
}
if (info.uses_dma) {
info.buffers.push_back({
.used_types = IR::Type::U64,

View File

@ -84,7 +84,7 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
Shader::Optimization::IdentityRemovalPass(program.blocks);
Shader::Optimization::DeadCodeEliminationPass(program);
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::CollectShaderInfoPass(program);
Shader::Optimization::CollectShaderInfoPass(program, profile);
Shader::IR::DumpProgram(program, info);

View File

@ -72,8 +72,23 @@ Liverpool::~Liverpool() {
process_thread.join();
}
void Liverpool::ProcessCommands() {
// Process incoming commands with high priority
while (num_commands) {
Common::UniqueFunction<void> callback{};
{
std::scoped_lock lk{submit_mutex};
callback = std::move(command_queue.front());
command_queue.pop();
--num_commands;
}
callback();
}
}
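
ProcessCommands pops each callback while holding submit_mutex but invokes it only after releasing the lock, so a callback that itself calls SendCommand (which also takes submit_mutex) cannot self-deadlock. A simplified standalone model of the pattern (not the emulator's exact types):

#include <atomic>
#include <functional>
#include <mutex>
#include <queue>

std::mutex submit_mutex;
std::queue<std::function<void()>> command_queue;
std::atomic<int> num_commands{0};

void DrainCommands() {
    while (num_commands) {
        std::function<void()> callback;
        {
            std::scoped_lock lk{submit_mutex}; // hold the lock only while popping
            callback = std::move(command_queue.front());
            command_queue.pop();
            --num_commands;
        }
        callback(); // executed outside the lock
    }
}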
void Liverpool::Process(std::stop_token stoken) {
Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor");
gpu_id = std::this_thread::get_id();
while (!stoken.stop_requested()) {
{
@ -90,18 +105,7 @@ void Liverpool::Process(std::stop_token stoken) {
curr_qid = -1;
while (num_submits || num_commands) {
// Process incoming commands with high priority
while (num_commands) {
Common::UniqueFunction<void> callback{};
{
std::unique_lock lk{submit_mutex};
callback = std::move(command_queue.front());
command_queue.pop();
--num_commands;
}
callback();
}
ProcessCommands();
curr_qid = (curr_qid + 1) % num_mapped_queues;
@ -147,6 +151,8 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
FIBER_ENTER(ccb_task_name);
while (!ccb.empty()) {
ProcessCommands();
const auto* header = reinterpret_cast<const PM4Header*>(ccb.data());
const u32 type = header->type;
if (type != 3) {
@ -224,6 +230,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
const auto base_addr = reinterpret_cast<uintptr_t>(dcb.data());
while (!dcb.empty()) {
ProcessCommands();
const auto* header = reinterpret_cast<const PM4Header*>(dcb.data());
const u32 type = header->type;
@ -638,9 +646,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
} else if ((dma_data->src_sel == DmaDataSrc::Memory ||
dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
dma_data->dst_sel == DmaDataDst::Gds) {
rasterizer->InlineData(dma_data->dst_addr_lo,
dma_data->SrcAddress<const void*>(),
dma_data->NumBytes(), true);
rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress<VAddr>(),
dma_data->NumBytes(), true, false);
} else if (dma_data->src_sel == DmaDataSrc::Data &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
@ -649,14 +656,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
} else if (dma_data->src_sel == DmaDataSrc::Gds &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
// LOG_WARNING(Render_Vulkan, "GDS memory read");
rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->src_addr_lo,
dma_data->NumBytes(), false, true);
} else if ((dma_data->src_sel == DmaDataSrc::Memory ||
dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
dma_data->SrcAddress<const void*>(),
dma_data->NumBytes(), false);
rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(),
dma_data->SrcAddress<VAddr>(), dma_data->NumBytes(),
false, false);
} else {
UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
@ -702,6 +710,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
break;
}
case PM4ItOpcode::Rewind: {
if (!rasterizer) {
break;
}
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
while (!rewind->Valid()) {
YIELD_GFX();
@ -801,29 +812,32 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
}
template <bool is_indirect>
Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid) {
Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
FIBER_ENTER(acb_task_name[vqid]);
auto& queue = asc_queues[{vqid}];
auto base_addr = reinterpret_cast<VAddr>(acb);
while (acb_dwords > 0) {
auto* header = reinterpret_cast<const PM4Header*>(acb);
auto base_addr = reinterpret_cast<VAddr>(acb.data());
while (!acb.empty()) {
ProcessCommands();
auto* header = reinterpret_cast<const PM4Header*>(acb.data());
u32 next_dw_off = header->type3.NumWords() + 1;
// If we have a buffered packet, use it.
if (queue.tmp_dwords > 0) [[unlikely]] {
header = reinterpret_cast<const PM4Header*>(queue.tmp_packet.data());
next_dw_off = header->type3.NumWords() + 1 - queue.tmp_dwords;
std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb, next_dw_off * sizeof(u32));
std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb.data(),
next_dw_off * sizeof(u32));
queue.tmp_dwords = 0;
}
// If the packet is split across ring boundary, buffer until next submission
if (next_dw_off > acb_dwords) [[unlikely]] {
std::memcpy(queue.tmp_packet.data(), acb, acb_dwords * sizeof(u32));
queue.tmp_dwords = acb_dwords;
if (next_dw_off > acb.size()) [[unlikely]] {
std::memcpy(queue.tmp_packet.data(), acb.data(), acb.size_bytes());
queue.tmp_dwords = acb.size();
if constexpr (!is_indirect) {
*queue.read_addr += acb_dwords;
*queue.read_addr += acb.size();
*queue.read_addr %= queue.ring_size_dw;
}
break;
@ -832,9 +846,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
if (header->type == 2) {
// Type-2 packets are used for padding purposes
next_dw_off = 1;
acb += next_dw_off;
acb_dwords -= next_dw_off;
acb = NextPacket(acb, next_dw_off);
if constexpr (!is_indirect) {
*queue.read_addr += next_dw_off;
*queue.read_addr %= queue.ring_size_dw;
@ -856,8 +868,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
}
case PM4ItOpcode::IndirectBuffer: {
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task = ProcessCompute<true>(indirect_buffer->Address<const u32>(),
indirect_buffer->ib_size, vqid);
auto task = ProcessCompute<true>(
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, vqid);
RESUME_ASC(task, vqid);
while (!task.handle.done()) {
@ -876,8 +888,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
} else if ((dma_data->src_sel == DmaDataSrc::Memory ||
dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
dma_data->dst_sel == DmaDataDst::Gds) {
rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress<const void*>(),
dma_data->NumBytes(), true);
rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress<VAddr>(),
dma_data->NumBytes(), true, false);
} else if (dma_data->src_sel == DmaDataSrc::Data &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
@ -886,14 +898,14 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
} else if (dma_data->src_sel == DmaDataSrc::Gds &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
// LOG_WARNING(Render_Vulkan, "GDS memory read");
rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->src_addr_lo,
dma_data->NumBytes(), false, true);
} else if ((dma_data->src_sel == DmaDataSrc::Memory ||
dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
dma_data->SrcAddress<const void*>(), dma_data->NumBytes(),
false);
rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->SrcAddress<VAddr>(),
dma_data->NumBytes(), false, false);
} else {
UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
@ -904,6 +916,9 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
break;
}
case PM4ItOpcode::Rewind: {
if (!rasterizer) {
break;
}
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
while (!rewind->Valid()) {
YIELD_ASC(vqid);
@ -1016,8 +1031,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
static_cast<u32>(opcode), header->type3.NumWords());
}
acb += next_dw_off;
acb_dwords -= next_dw_off;
acb = NextPacket(acb, next_dw_off);
if constexpr (!is_indirect) {
*queue.read_addr += next_dw_off;
@ -1087,7 +1101,7 @@ void Liverpool::SubmitAsc(u32 gnm_vqid, std::span<const u32> acb) {
auto& queue = mapped_queues[gnm_vqid];
const auto vqid = gnm_vqid - 1;
const auto& task = ProcessCompute(acb.data(), acb.size(), vqid);
const auto& task = ProcessCompute(acb, vqid);
{
std::scoped_lock lock{queue.m_access};
queue.submits.emplace(task.handle);

View File

@ -8,6 +8,7 @@
#include <coroutine>
#include <exception>
#include <mutex>
#include <semaphore>
#include <span>
#include <thread>
#include <vector>
@ -1512,14 +1513,32 @@ public:
rasterizer = rasterizer_;
}
void SendCommand(Common::UniqueFunction<void>&& func) {
std::scoped_lock lk{submit_mutex};
command_queue.emplace(std::move(func));
++num_commands;
submit_cv.notify_one();
template <bool wait_done = false>
void SendCommand(auto&& func) {
if (std::this_thread::get_id() == gpu_id) {
return func();
}
if constexpr (wait_done) {
std::binary_semaphore sem{0};
{
std::scoped_lock lk{submit_mutex};
command_queue.emplace([&sem, &func] {
func();
sem.release();
});
++num_commands;
submit_cv.notify_one();
}
sem.acquire();
} else {
std::scoped_lock lk{submit_mutex};
command_queue.emplace(std::move(func));
++num_commands;
submit_cv.notify_one();
}
}
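
The two dispatch modes in use (call sites hypothetical; BufferCache::ReadMemory later in this commit uses the blocking form):

// Fire-and-forget: enqueue work for the GPU thread and return immediately.
liverpool->SendCommand([=] { /* touch GPU-thread-only state */ });

// Blocking: the caller parks on the internal std::binary_semaphore until the
// GPU thread has run the callback. If we are already on the GPU thread
// (this_thread::get_id() == gpu_id), both forms simply invoke func() inline.
liverpool->SendCommand<true>([&] { /* e.g. flush, then read back results */ });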
void reserveCopyBufferSpace() {
void ReserveCopyBufferSpace() {
GpuQueue& gfx_queue = mapped_queues[GfxQueueId];
std::scoped_lock<std::mutex> lk(gfx_queue.m_access);
@ -1581,8 +1600,9 @@ private:
Task ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb);
Task ProcessCeUpdate(std::span<const u32> ccb);
template <bool is_indirect = false>
Task ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid);
Task ProcessCompute(std::span<const u32> acb, u32 vqid);
void ProcessCommands();
void Process(std::stop_token stoken);
struct GpuQueue {
@ -1626,6 +1646,7 @@ private:
std::mutex submit_mutex;
std::condition_variable_any submit_cv;
std::queue<Common::UniqueFunction<void>> command_queue{};
std::thread::id gpu_id;
int curr_qid{-1};
};

View File

@ -2,6 +2,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <mutex>
#include "common/alignment.h"
#include "common/debug.h"
#include "common/scope_exit.h"
@ -9,6 +10,7 @@
#include "core/memory.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/buffer_cache/memory_tracker.h"
#include "video_core/host_shaders/fault_buffer_process_comp.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_instance.h"
@ -27,10 +29,10 @@ static constexpr size_t DeviceBufferSize = 128_MB;
static constexpr size_t MaxPageFaults = 1024;
BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_,
TextureCache& texture_cache_, PageManager& tracker_)
: instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_},
memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_},
AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
PageManager& tracker)
: instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
memory{Core::Memory::Instance()}, texture_cache{texture_cache_},
staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize},
@ -38,13 +40,14 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal,
0, AllFlags, BDA_PAGETABLE_SIZE},
fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE),
memory_tracker{tracker} {
fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE) {
Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(),
"BDA Page Table Buffer");
Vulkan::SetObjectName(instance.GetDevice(), fault_buffer.Handle(), "Fault Buffer");
memory_tracker = std::make_unique<MemoryTracker>(tracker);
// Ensure the first slot is used for the null buffer
const auto null_id =
slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, 16);
@ -129,22 +132,26 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
BufferCache::~BufferCache() = default;
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) {
const bool is_tracked = IsRegionRegistered(device_addr, size);
if (is_tracked) {
// Mark the page as CPU modified to stop tracking writes.
memory_tracker.MarkRegionAsCpuModified(device_addr, size);
if (unmap) {
return;
}
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
if (!IsRegionRegistered(device_addr, size)) {
return;
}
memory_tracker->InvalidateRegion(
device_addr, size, Config::readbacks(),
[this, device_addr, size] { ReadMemory(device_addr, size, true); });
}
void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) {
liverpool->SendCommand<true>([this, device_addr, size, is_write] {
Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)];
DownloadBufferMemory(buffer, device_addr, size, is_write);
});
}
void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write) {
boost::container::small_vector<vk::BufferCopy, 1> copies;
u64 total_size_bytes = 0;
memory_tracker.ForEachDownloadRange<true>(
memory_tracker->ForEachDownloadRange<false>(
device_addr, size, [&](u64 device_addr_out, u64 range_size) {
const VAddr buffer_addr = buffer.CpuAddr();
const auto add_download = [&](VAddr start, VAddr end) {
@ -155,7 +162,10 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
.dstOffset = total_size_bytes,
.size = new_size,
});
total_size_bytes += new_size;
// Align each range up to 64 bytes (a typical cache line) to avoid cache
// conflicts between consecutive downloads, e.g. new_size = 100 -> 128.
constexpr u64 align = 64ULL;
constexpr u64 mask = ~(align - 1ULL);
total_size_bytes += (new_size + align - 1) & mask;
};
gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download);
gpu_modified_ranges.Subtract(device_addr_out, range_size);
@ -173,10 +183,16 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
scheduler.Finish();
auto* memory = Core::Memory::Instance();
for (const auto& copy : copies) {
const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
const u64 dst_offset = copy.dstOffset - offset;
std::memcpy(std::bit_cast<u8*>(copy_device_addr), download + dst_offset, copy.size);
memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr), download + dst_offset,
copy.size);
}
memory_tracker->UnmarkRegionAsGpuModified(device_addr, size);
if (is_write) {
memory_tracker->MarkRegionAsCpuModified(device_addr, size);
}
}
@ -296,9 +312,11 @@ void BufferCache::BindIndexBuffer(u32 index_offset) {
void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned");
if (!is_gds && !IsRegionGpuModified(address, num_bytes)) {
memcpy(std::bit_cast<void*>(address), value, num_bytes);
return;
if (!is_gds) {
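// Mirror the write into guest backing memory first; if no cached buffer
// covers the range, there is nothing on the GPU left to update.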
ASSERT(memory->TryWriteBacking(std::bit_cast<void*>(address), value, num_bytes));
if (!IsRegionRegistered(address, num_bytes)) {
return;
}
}
Buffer* buffer = [&] {
if (is_gds) {
@ -326,25 +344,107 @@ void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, boo
WriteDataBuffer(*buffer, address, value, num_bytes);
}
void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
if (!dst_gds && !IsRegionGpuModified(dst, num_bytes)) {
if (!src_gds && !IsRegionGpuModified(src, num_bytes)) {
// Neither buffer has been transferred to the GPU yet, so we can safely copy in host memory.
memcpy(std::bit_cast<void*>(dst), std::bit_cast<void*>(src), num_bytes);
return;
}
// Without a readback there is nothing more we can do with this.
// Fall back to creating the dst buffer on the GPU to at least have the data there.
}
auto& src_buffer = [&] -> const Buffer& {
if (src_gds) {
return gds_buffer;
}
// Avoid using ObtainBuffer here as that might give us the stream buffer.
const BufferId buffer_id = FindBuffer(src, num_bytes);
auto& buffer = slot_buffers[buffer_id];
SynchronizeBuffer(buffer, src, num_bytes, false, false);
return buffer;
}();
auto& dst_buffer = [&] -> const Buffer& {
if (dst_gds) {
return gds_buffer;
}
// Prefer using ObtainBuffer here as that will auto-mark the region as GPU modified.
const auto [buffer, offset] = ObtainBuffer(dst, num_bytes, true);
return *buffer;
}();
vk::BufferCopy region{
.srcOffset = src_buffer.Offset(src),
.dstOffset = dst_buffer.Offset(dst),
.size = num_bytes,
};
const vk::BufferMemoryBarrier2 buf_barriers_before[2] = {
{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = dst_buffer.Handle(),
.offset = dst_buffer.Offset(dst),
.size = num_bytes,
},
{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eTransferRead,
.buffer = src_buffer.Handle(),
.offset = src_buffer.Offset(src),
.size = num_bytes,
},
};
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 2,
.pBufferMemoryBarriers = buf_barriers_before,
});
cmdbuf.copyBuffer(src_buffer.Handle(), dst_buffer.Handle(), region);
const vk::BufferMemoryBarrier2 buf_barriers_after[2] = {
{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
.buffer = dst_buffer.Handle(),
.offset = dst_buffer.Offset(dst),
.size = num_bytes,
},
{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eTransferRead,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryWrite,
.buffer = src_buffer.Handle(),
.offset = src_buffer.Offset(src),
.size = num_bytes,
},
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 2,
.pBufferMemoryBarriers = buf_barriers_after,
});
}
std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer, BufferId buffer_id) {
// For small uniform buffers that have not been modified by gpu
// use device local stream buffer to reduce renderpass breaks.
// Maybe we want to modify the threshold now that the page size is 16KB?
static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
// For read-only buffers use device local stream buffer to reduce renderpass breaks.
if (!is_written && size <= CACHING_PAGESIZE && !IsRegionGpuModified(device_addr, size)) {
const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
return {&stream_buffer, offset};
}
if (!buffer_id || slot_buffers[buffer_id].is_deleted) {
if (IsBufferInvalid(buffer_id)) {
buffer_id = FindBuffer(device_addr, size);
}
Buffer& buffer = slot_buffers[buffer_id];
SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer);
SynchronizeBuffer(buffer, device_addr, size, is_written, is_texel_buffer);
if (is_written) {
memory_tracker.MarkRegionAsGpuModified(device_addr, size);
gpu_modified_ranges.Add(device_addr, size);
}
return {&buffer, buffer.Offset(device_addr)};
@ -352,21 +452,17 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 size) {
// Check if any buffer contains the full requested range.
const u64 page = gpu_addr >> CACHING_PAGEBITS;
const BufferId buffer_id = page_table[page].buffer_id;
const BufferId buffer_id = page_table[gpu_addr >> CACHING_PAGEBITS].buffer_id;
if (buffer_id) {
Buffer& buffer = slot_buffers[buffer_id];
if (buffer.IsInBounds(gpu_addr, size)) {
SynchronizeBuffer(buffer, gpu_addr, size, false);
if (Buffer& buffer = slot_buffers[buffer_id]; buffer.IsInBounds(gpu_addr, size)) {
SynchronizeBuffer(buffer, gpu_addr, size, false, false);
return {&buffer, buffer.Offset(gpu_addr)};
}
}
// If no buffer contains the full requested range but some buffer within was GPU-modified,
// fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications.
if (memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
// If some buffer within was GPU modified create a full buffer to avoid losing GPU data.
if (IsRegionGpuModified(gpu_addr, size)) {
return ObtainBuffer(gpu_addr, size, false, false);
}
// In all other cases, just do a CPU copy to the staging buffer.
const auto [data, offset] = staging_buffer.Map(size, 16);
memory->CopySparseMemory(gpu_addr, data, size);
@ -380,11 +476,11 @@ bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
}
bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
return memory_tracker.IsRegionCpuModified(addr, size);
return memory_tracker->IsRegionCpuModified(addr, size);
}
bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) {
return memory_tracker.IsRegionGpuModified(addr, size);
return memory_tracker->IsRegionGpuModified(addr, size);
}
BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
@ -718,56 +814,27 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
}
}
void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer) {
boost::container::small_vector<vk::BufferCopy, 4> copies;
u64 total_size_bytes = 0;
VAddr buffer_start = buffer.CpuAddr();
memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
copies.push_back(vk::BufferCopy{
.srcOffset = total_size_bytes,
.dstOffset = device_addr_out - buffer_start,
.size = range_size,
memory_tracker->ForEachUploadRange(
device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) {
const u64 offset = staging_buffer.Copy(device_addr_out, range_size);
copies.push_back(vk::BufferCopy{
.srcOffset = offset,
.dstOffset = device_addr_out - buffer_start,
.size = range_size,
});
});
total_size_bytes += range_size;
});
SCOPE_EXIT {
if (is_texel_buffer) {
SynchronizeBufferFromImage(buffer, device_addr, size);
}
};
if (total_size_bytes == 0) {
if (copies.empty()) {
return;
}
vk::Buffer src_buffer = staging_buffer.Handle();
if (total_size_bytes < StagingBufferSize) {
const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
for (auto& copy : copies) {
u8* const src_pointer = staging + copy.srcOffset;
const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
// Apply the staging offset
copy.srcOffset += offset;
}
staging_buffer.Commit();
} else {
// For large one time transfers use a temporary host buffer.
// RenderDoc can lag quite a bit if the stream buffer is too large.
Buffer temp_buffer{instance,
scheduler,
MemoryUsage::Upload,
0,
vk::BufferUsageFlagBits::eTransferSrc,
total_size_bytes};
src_buffer = temp_buffer.Handle();
u8* const staging = temp_buffer.mapped_data.data();
for (auto& copy : copies) {
u8* const src_pointer = staging + copy.srcOffset;
const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
}
scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {});
}
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
const vk::BufferMemoryBarrier2 pre_barrier = {
@ -794,7 +861,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies);
cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.buffer, copies);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
@ -925,7 +992,7 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
VAddr start = std::max(buffer.CpuAddr(), device_addr);
VAddr end = std::min(buffer.CpuAddr() + buffer.SizeBytes(), device_addr_end);
u32 size = static_cast<u32>(end - start);
SynchronizeBuffer(buffer, start, size, false);
SynchronizeBuffer(buffer, start, size, false, false);
});
}

View File

@ -9,7 +9,6 @@
#include "common/slot_vector.h"
#include "common/types.h"
#include "video_core/buffer_cache/buffer.h"
#include "video_core/buffer_cache/memory_tracker.h"
#include "video_core/buffer_cache/range_set.h"
#include "video_core/multi_level_page_table.h"
@ -21,13 +20,6 @@ namespace Core {
class MemoryManager;
}
namespace Shader {
namespace Gcn {
struct FetchShaderData;
}
struct Info;
} // namespace Shader
namespace Vulkan {
class GraphicsPipeline;
}
@ -39,6 +31,8 @@ using BufferId = Common::SlotId;
static constexpr BufferId NULL_BUFFER_ID{0};
class TextureCache;
class MemoryTracker;
class PageManager;
class BufferCache {
public:
@ -69,10 +63,16 @@ public:
bool has_stream_leap = false;
};
using IntervalSet =
boost::icl::interval_set<VAddr, std::less,
ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalSet::interval_type;
public:
explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool,
TextureCache& texture_cache, PageManager& tracker);
AmdGpu::Liverpool* liverpool, TextureCache& texture_cache,
PageManager& tracker);
~BufferCache();
/// Returns a pointer to GDS device local buffer.
@ -110,7 +110,10 @@ public:
}
/// Invalidates any buffer in the logical page range.
void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
void InvalidateMemory(VAddr device_addr, u64 size);
/// Waits on pending downloads in the logical page range.
void ReadMemory(VAddr device_addr, u64 size, bool is_write = false);
/// Binds host vertex buffers for the current draw.
void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@ -124,6 +127,9 @@ public:
/// Writes a value to GPU buffer. (uses staging buffer to temporarily store the data)
void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
/// Performs buffer to buffer data copy on the GPU.
void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
/// Obtains a buffer for the specified region.
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
bool is_texel_buffer = false,
@ -166,7 +172,11 @@ private:
});
}
void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);
inline bool IsBufferInvalid(BufferId buffer_id) const {
return !buffer_id || slot_buffers[buffer_id].is_deleted;
}
void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write);
[[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@ -181,7 +191,8 @@ private:
template <bool insert>
void ChangeRegister(BufferId buffer_id);
void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_texel_buffer);
void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer);
bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);
@ -193,11 +204,10 @@ private:
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
Vulkan::Rasterizer& rasterizer;
AmdGpu::Liverpool* liverpool;
Core::MemoryManager* memory;
TextureCache& texture_cache;
PageManager& tracker;
std::unique_ptr<MemoryTracker> memory_tracker;
StreamBuffer staging_buffer;
StreamBuffer stream_buffer;
StreamBuffer download_buffer;
@ -209,7 +219,6 @@ private:
Common::SlotVector<Buffer> slot_buffers;
RangeSet gpu_modified_ranges;
SplitRangeMap<BufferId> buffer_ranges;
MemoryTracker memory_tracker;
PageTable page_table;
vk::UniqueDescriptorSetLayout fault_process_desc_layout;
vk::UniquePipeline fault_process_pipeline;

View File

@ -27,6 +27,7 @@ public:
bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<true>(
query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
return manager->template IsRegionModified<Type::CPU>(offset, size);
});
}
@ -35,6 +36,7 @@ public:
bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<false>(
query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
return manager->template IsRegionModified<Type::GPU>(offset, size);
});
}
@ -43,26 +45,57 @@ public:
void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
IteratePages<false>(dirty_cpu_addr, query_size,
[](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
manager->template ChangeRegionState<Type::CPU, true>(
manager->GetCpuAddr() + offset, size);
});
}
/// Mark region as modified from the host GPU
void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
/// Unmark region as modified from the host GPU
void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
IteratePages<false>(dirty_cpu_addr, query_size,
[](RegionManager* manager, u64 offset, size_t size) {
manager->template ChangeRegionState<Type::GPU, true>(
std::scoped_lock lk{manager->lock};
manager->template ChangeRegionState<Type::GPU, false>(
manager->GetCpuAddr() + offset, size);
});
}
/// Removes all protection from a page and ensures GPU data has been flushed if requested
void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept {
IteratePages<false>(
cpu_addr, size,
[try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) {
const bool should_flush = [&] {
                // Perform both the GPU modification check and the CPU state change while
                // holding the lock, in case we are racing with the GPU thread trying to
                // mark the page as GPU modified. If a flush is needed, the flush function
                // performs the CPU state change itself.
std::scoped_lock lk{manager->lock};
if (try_flush && manager->template IsRegionModified<Type::GPU>(offset, size)) {
return true;
}
manager->template ChangeRegionState<Type::CPU, true>(
manager->GetCpuAddr() + offset, size);
return false;
}();
if (should_flush) {
on_flush();
}
});
}
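    // A condensed restatement of the rule above (hypothetical helper, shown only for
    // illustration): both steps must happen under one region lock, otherwise the GPU
    // thread could mark the page GPU-modified between them and the required flush
    // would be skipped.
    //
    //   bool FlushNeededThenMarkCpu(RegionManager* manager, u64 offset, size_t size) {
    //       std::scoped_lock lk{manager->lock};
    //       if (manager->IsRegionModified<Type::GPU>(offset, size)) {
    //           return true; // caller flushes; the flush path marks the CPU state
    //       }
    //       manager->ChangeRegionState<Type::CPU, true>(manager->GetCpuAddr() + offset, size);
    //       return false;
    //   }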
/// Call 'func' for each CPU modified range and unmark those pages as CPU modified
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) {
IteratePages<true>(query_cpu_range, query_size,
[&func](RegionManager* manager, u64 offset, size_t size) {
[&func, is_written](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
manager->template ForEachModifiedRange<Type::CPU, true>(
manager->GetCpuAddr() + offset, size, func);
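                               // For written buffers, the same pages are marked
                               // GPU-modified under the same lock, so later
                               // invalidations and readbacks see the pending GPU write.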
if (is_written) {
manager->template ChangeRegionState<Type::GPU, true>(
manager->GetCpuAddr() + offset, size);
}
});
}
@ -71,6 +104,7 @@ public:
void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
IteratePages<false>(query_cpu_range, query_size,
[&func](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
manager->template ForEachModifiedRange<Type::GPU, clear>(
manager->GetCpuAddr() + offset, size, func);
});

View File

@ -3,7 +3,6 @@
#pragma once
#include <array>
#include "common/bit_array.h"
#include "common/types.h"
@ -20,9 +19,8 @@ constexpr u64 NUM_PAGES_PER_REGION = TRACKER_HIGHER_PAGE_SIZE / TRACKER_BYTES_PE
enum class Type {
CPU,
GPU,
Writeable,
};
using RegionBits = Common::BitArray<NUM_PAGES_PER_REGION>;
} // namespace VideoCore
} // namespace VideoCore

View File

@ -3,9 +3,9 @@
#pragma once
#include <mutex>
#include <utility>
#include "common/config.h"
#include "common/div_ceil.h"
#include "common/logging/log.h"
#ifdef __linux__
#include "common/adaptive_mutex.h"
@ -19,8 +19,14 @@
namespace VideoCore {
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
using LockType = Common::AdaptiveMutex;
#else
using LockType = Common::SpinLock;
#endif
/**
 * Allows tracking CPU and GPU modification of pages in a contiguous 4MB virtual address region.
 * Allows tracking CPU and GPU modification of pages in a contiguous 16MB virtual address region.
 * Information is stored in bitsets for spatial locality and fast update of single pages.
 * (Assuming 4 KiB tracker pages, a 16 MiB region spans 4096 pages, so each bitset is 4096 bits.)
*/
class RegionManager {
@ -30,6 +36,7 @@ public:
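        // A freshly reset region is fully CPU-dirty, GPU-clean, and unprotected;
        // watchers are only installed once this state first changes.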
cpu.Fill();
gpu.Clear();
writeable.Fill();
readable.Fill();
}
explicit RegionManager() = default;
@ -47,29 +54,19 @@ public:
template <Type type>
RegionBits& GetRegionBits() noexcept {
static_assert(type != Type::Writeable);
if constexpr (type == Type::CPU) {
return cpu;
} else if constexpr (type == Type::GPU) {
return gpu;
} else if constexpr (type == Type::Writeable) {
return writeable;
} else {
static_assert(false, "Invalid type");
}
}
template <Type type>
const RegionBits& GetRegionBits() const noexcept {
static_assert(type != Type::Writeable);
if constexpr (type == Type::CPU) {
return cpu;
} else if constexpr (type == Type::GPU) {
return gpu;
} else if constexpr (type == Type::Writeable) {
return writeable;
} else {
static_assert(false, "Invalid type");
}
}
@ -89,8 +86,6 @@ public:
if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
return;
}
std::scoped_lock lk{lock};
static_assert(type != Type::Writeable);
RegionBits& bits = GetRegionBits<type>();
if constexpr (enable) {
@ -99,7 +94,9 @@ public:
bits.UnsetRange(start_page, end_page);
}
if constexpr (type == Type::CPU) {
UpdateProtection<!enable>();
UpdateProtection<!enable, false>();
} else if (Config::readbacks()) {
UpdateProtection<enable, true>();
}
}
@ -121,27 +118,22 @@ public:
if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
return;
}
std::scoped_lock lk{lock};
static_assert(type != Type::Writeable);
RegionBits& bits = GetRegionBits<type>();
RegionBits mask(bits, start_page, end_page);
// TODO: this will not be needed once we handle readbacks
if constexpr (type == Type::GPU) {
mask &= ~writeable;
if constexpr (clear) {
bits.UnsetRange(start_page, end_page);
if constexpr (type == Type::CPU) {
UpdateProtection<true, false>();
} else if (Config::readbacks()) {
UpdateProtection<false, true>();
}
}
for (const auto& [start, end] : mask) {
func(cpu_addr + start * TRACKER_BYTES_PER_PAGE, (end - start) * TRACKER_BYTES_PER_PAGE);
}
if constexpr (clear) {
bits.UnsetRange(start_page, end_page);
if constexpr (type == Type::CPU) {
UpdateProtection<true>();
}
}
}
/**
@ -151,7 +143,7 @@ public:
* @param size Size in bytes of the region to query for modifications
*/
template <Type type>
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) noexcept {
RENDERER_TRACE;
const size_t start_page = SanitizeAddress(offset) / TRACKER_BYTES_PER_PAGE;
const size_t end_page =
@ -159,20 +151,14 @@ public:
if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
return false;
}
// std::scoped_lock lk{lock}; // Is this needed?
static_assert(type != Type::Writeable);
const RegionBits& bits = GetRegionBits<type>();
RegionBits test(bits, start_page, end_page);
// TODO: this will not be needed once we handle readbacks
if constexpr (type == Type::GPU) {
test &= ~writeable;
}
return test.Any();
}
LockType lock;
private:
/**
* Notify tracker about changes in the CPU tracking state of a word in the buffer
@ -181,31 +167,29 @@ private:
* @param current_bits Current state of the word
* @param new_bits New state of the word
*
* @tparam add_to_tracker True when the tracker should start tracking the new pages
* @tparam track True when the tracker should start tracking the new pages
*/
template <bool add_to_tracker>
template <bool track, bool is_read>
void UpdateProtection() {
RENDERER_TRACE;
RegionBits mask = cpu ^ writeable;
RegionBits mask = is_read ? (~gpu ^ readable) : (cpu ^ writeable);
if (mask.None()) {
return; // No changes to the CPU tracking state
return;
}
writeable = cpu;
tracker->UpdatePageWatchersForRegion<add_to_tracker>(cpu_addr, mask);
if constexpr (is_read) {
readable = ~gpu;
} else {
writeable = cpu;
}
tracker->UpdatePageWatchersForRegion<track, is_read>(cpu_addr, mask);
}
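    // Worked example for the write path above (values hypothetical): with four pages,
    // cpu = 0b1100 and writeable = 0b1010 give mask = cpu ^ writeable = 0b0110, i.e.
    // exactly the pages whose protection no longer matches their CPU dirty state; only
    // those pages get their watchers updated, after which writeable is resynchronized
    // to cpu. The read path applies the same diff to ~gpu and readable.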
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
Common::AdaptiveMutex lock;
#else
Common::SpinLock lock;
#endif
PageManager* tracker;
VAddr cpu_addr = 0;
RegionBits cpu;
RegionBits gpu;
RegionBits writeable;
RegionBits readable;
};
} // namespace VideoCore

View File

@ -13,6 +13,7 @@
#ifndef _WIN64
#include <sys/mman.h>
#include "common/adaptive_mutex.h"
#ifdef ENABLE_USERFAULTFD
#include <thread>
#include <fcntl.h>
@ -23,6 +24,7 @@
#endif
#else
#include <windows.h>
#include "common/spin_lock.h"
#endif
#ifdef __linux__
@ -38,22 +40,45 @@ constexpr size_t PAGE_BITS = 12;
struct PageManager::Impl {
struct PageState {
u8 num_watchers{};
u8 num_write_watchers : 7;
        // At the moment only the buffer cache can request read watchers,
        // and buffers cannot overlap, so at most one can exist per page.
u8 num_read_watchers : 1;
Core::MemoryPermission Perm() const noexcept {
return num_watchers == 0 ? Core::MemoryPermission::ReadWrite
: Core::MemoryPermission::Read;
Core::MemoryPermission WritePerm() const noexcept {
return num_write_watchers == 0 ? Core::MemoryPermission::Write
: Core::MemoryPermission::None;
}
template <s32 delta>
Core::MemoryPermission ReadPerm() const noexcept {
return num_read_watchers == 0 ? Core::MemoryPermission::Read
: Core::MemoryPermission::None;
}
Core::MemoryPermission Perms() const noexcept {
return ReadPerm() | WritePerm();
}
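        // Worked example (hypothetical state): one write watcher and zero read watchers
        // give WritePerm() == None and ReadPerm() == Read, so Perms() == Read; the page
        // stays readable but faults on writes, which are routed to
        // GuestFaultSignalHandler below for invalidation.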
template <s32 delta, bool is_read>
u8 AddDelta() {
if constexpr (delta == 1) {
return ++num_watchers;
} else if constexpr (delta == -1) {
ASSERT_MSG(num_watchers > 0, "Not enough watchers");
return --num_watchers;
if constexpr (is_read) {
if constexpr (delta == 1) {
return ++num_read_watchers;
} else if (delta == -1) {
ASSERT_MSG(num_read_watchers > 0, "Not enough watchers");
return --num_read_watchers;
} else {
return num_read_watchers;
}
} else {
return num_watchers;
if constexpr (delta == 1) {
return ++num_write_watchers;
} else if (delta == -1) {
ASSERT_MSG(num_write_watchers > 0, "Not enough watchers");
return --num_write_watchers;
} else {
return num_write_watchers;
}
}
}
};
@ -176,19 +201,23 @@ struct PageManager::Impl {
RENDERER_TRACE;
auto* memory = Core::Memory::Instance();
auto& impl = memory->GetAddressSpace();
        ASSERT_MSG(perms != Core::MemoryPermission::Write,
                   "Attempted to protect region as write-only, which is not a valid permission");
impl.Protect(address, size, perms);
}
static bool GuestFaultSignalHandler(void* context, void* fault_address) {
const auto addr = reinterpret_cast<VAddr>(fault_address);
if (Common::IsWriteError(context)) {
return rasterizer->InvalidateMemory(addr, 1);
return rasterizer->InvalidateMemory(addr, 8);
} else {
return rasterizer->ReadMemory(addr, 8);
}
return false;
}
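    // Write faults invalidate stale GPU copies before the guest store retries, while
    // read faults flush pending GPU writes back to guest memory first; the 8-byte span
    // is presumably chosen to cover the widest scalar access that can fault at the
    // reported address.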
#endif
template <bool track>
template <bool track, bool is_read>
void UpdatePageWatchers(VAddr addr, u64 size) {
RENDERER_TRACE;
@ -200,7 +229,7 @@ struct PageManager::Impl {
const auto lock_end = locks.begin() + Common::DivCeil(page_end, PAGES_PER_LOCK);
Common::RangeLockGuard lk(lock_start, lock_end);
auto perms = cached_pages[page].Perm();
auto perms = cached_pages[page].Perms();
u64 range_begin = 0;
u64 range_bytes = 0;
u64 potential_range_bytes = 0;
@ -226,9 +255,9 @@ struct PageManager::Impl {
PageState& state = cached_pages[page];
// Apply the change to the page state
const u8 new_count = state.AddDelta<track ? 1 : -1>();
const u8 new_count = state.AddDelta<track ? 1 : -1, is_read>();
if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] {
if (auto new_perms = state.Perms(); new_perms != perms) [[unlikely]] {
// If the protection changed add pending (un)protect action
release_pending();
perms = new_perms;
@ -253,25 +282,23 @@ struct PageManager::Impl {
release_pending();
}
template <bool track>
template <bool track, bool is_read>
void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) {
RENDERER_TRACE;
auto start_range = mask.FirstRange();
auto end_range = mask.LastRange();
if (start_range.second == end_range.second) {
// Optimization: if all pages are contiguous, use the regular UpdatePageWatchers
            // If all pages are contiguous, use the regular UpdatePageWatchers
const VAddr start_addr = base_addr + (start_range.first << PAGE_BITS);
const u64 size = (start_range.second - start_range.first) << PAGE_BITS;
UpdatePageWatchers<track>(start_addr, size);
return;
return UpdatePageWatchers<track, is_read>(start_addr, size);
}
size_t base_page = (base_addr >> PAGE_BITS);
ASSERT(base_page % PAGES_PER_LOCK == 0);
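        // The alignment assert above means the region starts on a lock-bucket boundary;
        // assuming a region never spans more than PAGES_PER_LOCK pages, this single
        // lock covers every page updated below.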
std::scoped_lock lk(locks[base_page / PAGES_PER_LOCK]);
auto perms = cached_pages[base_page + start_range.first].Perm();
auto perms = cached_pages[base_page + start_range.first].Perms();
u64 range_begin = 0;
u64 range_bytes = 0;
u64 potential_range_bytes = 0;
@ -292,9 +319,10 @@ struct PageManager::Impl {
const bool update = mask.Get(page);
// Apply the change to the page state
const u8 new_count = update ? state.AddDelta<track ? 1 : -1>() : state.AddDelta<0>();
const u8 new_count =
update ? state.AddDelta<track ? 1 : -1, is_read>() : state.AddDelta<0, is_read>();
if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] {
if (auto new_perms = state.Perms(); new_perms != perms) [[unlikely]] {
// If the protection changed add pending (un)protect action
release_pending();
perms = new_perms;
@ -348,19 +376,23 @@ void PageManager::OnGpuUnmap(VAddr address, size_t size) {
template <bool track>
void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const {
impl->UpdatePageWatchers<track>(addr, size);
impl->UpdatePageWatchers<track, false>(addr, size);
}
template <bool track>
template <bool track, bool is_read>
void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const {
impl->UpdatePageWatchersForRegion<track>(base_addr, mask);
impl->UpdatePageWatchersForRegion<track, is_read>(base_addr, mask);
}
template void PageManager::UpdatePageWatchers<true>(VAddr addr, u64 size) const;
template void PageManager::UpdatePageWatchers<false>(VAddr addr, u64 size) const;
template void PageManager::UpdatePageWatchersForRegion<true>(VAddr base_addr,
RegionBits& mask) const;
template void PageManager::UpdatePageWatchersForRegion<false>(VAddr base_addr,
RegionBits& mask) const;
template void PageManager::UpdatePageWatchersForRegion<true, true>(VAddr base_addr,
RegionBits& mask) const;
template void PageManager::UpdatePageWatchersForRegion<true, false>(VAddr base_addr,
RegionBits& mask) const;
template void PageManager::UpdatePageWatchersForRegion<false, true>(VAddr base_addr,
RegionBits& mask) const;
template void PageManager::UpdatePageWatchersForRegion<false, false>(VAddr base_addr,
RegionBits& mask) const;
} // namespace VideoCore

View File

@ -37,9 +37,8 @@ public:
template <bool track>
void UpdatePageWatchers(VAddr addr, u64 size) const;
/// Updates watches in the pages touching the specified region
/// using a mask.
template <bool track>
/// Updates watches in the pages touching the specified region using a mask.
template <bool track, bool is_read = false>
void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const;
/// Returns page aligned address.

View File

@ -36,7 +36,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
AmdGpu::Liverpool* liverpool_)
: instance{instance_}, scheduler{scheduler_}, page_manager{this},
buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager},
buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager},
texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_},
memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} {
if (!Config::nullGpu()) {
@ -471,7 +471,7 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
uses_dma |= stage->uses_dma;
}
if (uses_dma && !fault_process_pending) {
if (uses_dma) {
        // We only use the fault buffer for DMA right now.
{
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
@ -945,6 +945,10 @@ void Rasterizer::InlineData(VAddr address, const void* value, u32 num_bytes, boo
buffer_cache.InlineData(address, value, num_bytes, is_gds);
}
void Rasterizer::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
buffer_cache.CopyBuffer(dst, src, num_bytes, dst_gds, src_gds);
}
u32 Rasterizer::ReadDataFromGds(u32 gds_offset) {
auto* gds_buf = buffer_cache.GetGdsBuffer();
u32 value;
@ -957,11 +961,20 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
// Not GPU mapped memory, can skip invalidation logic entirely.
return false;
}
buffer_cache.InvalidateMemory(addr, size, false);
buffer_cache.InvalidateMemory(addr, size);
texture_cache.InvalidateMemory(addr, size);
return true;
}
bool Rasterizer::ReadMemory(VAddr addr, u64 size) {
if (!IsMapped(addr, size)) {
        // Not GPU mapped memory, can skip read-back logic entirely.
return false;
}
buffer_cache.ReadMemory(addr, size);
return true;
}
bool Rasterizer::IsMapped(VAddr addr, u64 size) {
if (size == 0) {
// There is no memory, so not mapped.
@ -982,7 +995,7 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) {
}
void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
buffer_cache.InvalidateMemory(addr, size, true);
buffer_cache.InvalidateMemory(addr, size);
texture_cache.UnmapMemory(addr, size);
page_manager.OnGpuUnmap(addr, size);
{

View File

@ -5,6 +5,7 @@
#include <shared_mutex>
#include "common/recursive_lock.h"
#include "common/shared_first_mutex.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/page_manager.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
@ -56,8 +57,10 @@ public:
bool from_guest = false);
void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
    u32 ReadDataFromGds(u32 gds_offset);
bool InvalidateMemory(VAddr addr, u64 size);
bool ReadMemory(VAddr addr, u64 size);
bool IsMapped(VAddr addr, u64 size);
void MapMemory(VAddr addr, u64 size);
void UnmapMemory(VAddr addr, u64 size);
@ -120,7 +123,7 @@ private:
AmdGpu::Liverpool* liverpool;
Core::MemoryManager* memory;
boost::icl::interval_set<VAddr> mapped_ranges;
std::shared_mutex mapped_ranges_mutex;
Common::SharedFirstMutex mapped_ranges_mutex;
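    // Usage sketch (assuming the mutex meets the BasicLockable/SharedLockable
    // requirements): hot-path readers, e.g. the Common::RecursiveSharedLock taken in
    // BindResources, use the shared side, while MapMemory/UnmapMemory presumably take
    // the exclusive side via std::unique_lock. Unlike std::shared_mutex, this
    // reader-priority mutex never stalls the frequent lookups behind a queued writer.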
PipelineCache pipeline_cache;
boost::container::static_vector<