Merge remote-tracking branch 'origin/main' into np_signed

mailwl 2025-05-22 21:29:25 +03:00
commit abce6471a3
49 changed files with 1699 additions and 329 deletions

View File

@ -203,7 +203,7 @@ execute_process(
# Set Version
set(EMULATOR_VERSION_MAJOR "0")
set(EMULATOR_VERSION_MINOR "8")
set(EMULATOR_VERSION_MINOR "9")
set(EMULATOR_VERSION_PATCH "1")
set_source_files_properties(src/shadps4.rc PROPERTIES COMPILE_DEFINITIONS "EMULATOR_VERSION_MAJOR=${EMULATOR_VERSION_MAJOR};EMULATOR_VERSION_MINOR=${EMULATOR_VERSION_MINOR};EMULATOR_VERSION_PATCH=${EMULATOR_VERSION_PATCH}")
@ -674,6 +674,8 @@ set(COMMON src/common/logging/backend.cpp
src/common/polyfill_thread.h
src/common/rdtsc.cpp
src/common/rdtsc.h
src/common/recursive_lock.cpp
src/common/recursive_lock.h
src/common/sha1.h
src/common/signal_context.h
src/common/signal_context.cpp
@ -864,6 +866,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
src/shader_recompiler/ir/abstract_syntax_list.cpp
src/shader_recompiler/ir/abstract_syntax_list.h
src/shader_recompiler/ir/attribute.cpp
src/shader_recompiler/ir/attribute.h

View File

@ -37,7 +37,10 @@
<category translate="no">Game</category>
</categories>
<releases>
<release version="0.8.0" date="2025-05-23">
<release version="0.9.0" date="2025-05-22">
<url>https://github.com/shadps4-emu/shadPS4/releases/tag/v.0.9.0</url>
</release>
<release version="0.8.0" date="2025-04-23">
<url>https://github.com/shadps4-emu/shadPS4/releases/tag/v.0.8.0</url>
</release>
<release version="0.7.0" date="2025-03-23">

externals/sirit vendored

@ -1 +1 @@
Subproject commit 09a1416ab1b59ddfebd2618412f118f2004f3b2c
Subproject commit 6b450704f6fedb9413d0c89a9eb59d028eb1e6c0

View File

@ -155,7 +155,7 @@ bool GetLoadGameSizeEnabled() {
std::filesystem::path GetSaveDataPath() {
if (save_data_path.empty()) {
return Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir);
return Common::FS::GetUserPath(Common::FS::PathType::UserDir) / "savedata";
}
return save_data_path;
}

View File

@ -128,7 +128,6 @@ static auto UserPaths = [] {
create_path(PathType::LogDir, user_dir / LOG_DIR);
create_path(PathType::ScreenshotsDir, user_dir / SCREENSHOTS_DIR);
create_path(PathType::ShaderDir, user_dir / SHADER_DIR);
create_path(PathType::SaveDataDir, user_dir / SAVEDATA_DIR);
create_path(PathType::GameDataDir, user_dir / GAMEDATA_DIR);
create_path(PathType::TempDataDir, user_dir / TEMPDATA_DIR);
create_path(PathType::SysModuleDir, user_dir / SYSMODULES_DIR);

View File

@ -18,7 +18,6 @@ enum class PathType {
LogDir, // Where log files are stored.
ScreenshotsDir, // Where screenshots are stored.
ShaderDir, // Where shaders are stored.
SaveDataDir, // Where guest save data is stored.
TempDataDir, // Where game temp data is stored.
GameDataDir, // Where game data is stored.
SysModuleDir, // Where system modules are stored.
@ -36,7 +35,6 @@ constexpr auto PORTABLE_DIR = "user";
constexpr auto LOG_DIR = "log";
constexpr auto SCREENSHOTS_DIR = "screenshots";
constexpr auto SHADER_DIR = "shader";
constexpr auto SAVEDATA_DIR = "savedata";
constexpr auto GAMEDATA_DIR = "data";
constexpr auto TEMPDATA_DIR = "temp";
constexpr auto SYSMODULES_DIR = "sys_modules";

View File

@ -0,0 +1,37 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <unordered_map>
#include "common/assert.h"
#include "common/recursive_lock.h"
namespace Common::Detail {
struct RecursiveLockState {
RecursiveLockType type;
int count;
};
thread_local std::unordered_map<void*, RecursiveLockState> g_recursive_locks;
bool IncrementRecursiveLock(void* mutex, RecursiveLockType type) {
auto& state = g_recursive_locks[mutex];
if (state.count == 0) {
ASSERT(state.type == RecursiveLockType::None);
state.type = type;
}
ASSERT(state.type == type);
return state.count++ == 0;
}
bool DecrementRecursiveLock(void* mutex, RecursiveLockType type) {
auto& state = g_recursive_locks[mutex];
ASSERT(state.type == type && state.count > 0);
if (--state.count == 0) {
g_recursive_locks.erase(mutex);
return true;
}
return false;
}
} // namespace Common::Detail
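A minimal sketch of the counters' contract, assuming nothing beyond the implementation above: IncrementRecursiveLock returns true only for the outermost acquisition of a given mutex on the current thread, and DecrementRecursiveLock returns true once that outermost hold is released. The shared_mutex below is just an example key; any lockable address works.

#include <cassert>
#include <shared_mutex>
#include "common/recursive_lock.h"

int main() {
    using namespace Common::Detail;
    std::shared_mutex m;
    assert(IncrementRecursiveLock(&m, RecursiveLockType::Exclusive));  // outermost: caller should lock
    assert(!IncrementRecursiveLock(&m, RecursiveLockType::Exclusive)); // nested: already counted
    assert(!DecrementRecursiveLock(&m, RecursiveLockType::Exclusive)); // still held by the outer scope
    assert(DecrementRecursiveLock(&m, RecursiveLockType::Exclusive));  // outermost release
}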

View File

@ -0,0 +1,67 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <mutex>
#include <optional>
#include <shared_mutex>
namespace Common {
namespace Detail {
enum class RecursiveLockType { None, Shared, Exclusive };
bool IncrementRecursiveLock(void* mutex, RecursiveLockType type);
bool DecrementRecursiveLock(void* mutex, RecursiveLockType type);
} // namespace Detail
template <typename MutexType>
class RecursiveScopedLock {
public:
explicit RecursiveScopedLock(MutexType& mutex) : m_mutex(mutex), m_locked(false) {
if (Detail::IncrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Exclusive)) {
m_locked = true;
m_lock.emplace(m_mutex);
}
}
~RecursiveScopedLock() {
Detail::DecrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Exclusive);
if (m_locked) {
m_lock.reset();
}
}
private:
MutexType& m_mutex;
std::optional<std::unique_lock<MutexType>> m_lock;
bool m_locked = false;
};
template <typename MutexType>
class RecursiveSharedLock {
public:
explicit RecursiveSharedLock(MutexType& mutex) : m_mutex(mutex), m_locked(false) {
if (Detail::IncrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Shared)) {
m_locked = true;
m_lock.emplace(m_mutex);
}
}
~RecursiveSharedLock() {
Detail::DecrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Shared);
if (m_locked) {
m_lock.reset();
}
}
private:
MutexType& m_mutex;
std::optional<std::shared_lock<MutexType>> m_lock;
bool m_locked = false;
};
} // namespace Common
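These guards exist so that a function holding a non-recursive mutex can call another function that takes the same mutex on the same thread; only the outermost guard actually locks. A usage sketch under that assumption (the registry mutex and lookup helpers are invented for illustration):

#include <shared_mutex>
#include "common/recursive_lock.h"

static std::shared_mutex g_registry_mutex;  // hypothetical shared state

int LookupValue(int key) {
    // Nested acquisition: the thread-local count is already non-zero, so no
    // std::shared_lock is taken and there is no self-deadlock.
    Common::RecursiveSharedLock<std::shared_mutex> lk{g_registry_mutex};
    return key * 2;  // placeholder body
}

int LookupPair(int key) {
    // Outermost acquisition: this guard really locks the mutex and releases it
    // when it is destroyed, after both nested lookups have returned.
    Common::RecursiveSharedLock<std::shared_mutex> lk{g_registry_mutex};
    return LookupValue(key) + LookupValue(key + 1);
}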

View File

@ -14,6 +14,9 @@ namespace Common {
struct SlotId {
static constexpr u32 INVALID_INDEX = std::numeric_limits<u32>::max();
SlotId() noexcept = default;
constexpr SlotId(u32 index) noexcept : index(index) {}
constexpr auto operator<=>(const SlotId&) const noexcept = default;
constexpr explicit operator bool() const noexcept {
@ -28,6 +31,63 @@ class SlotVector {
constexpr static std::size_t InitialCapacity = 2048;
public:
template <typename ValueType, typename Pointer, typename Reference>
class Iterator {
public:
using iterator_category = std::forward_iterator_tag;
using value_type = ValueType;
using difference_type = std::ptrdiff_t;
using pointer = Pointer;
using reference = Reference;
Iterator(SlotVector& vector_, SlotId index_) : vector(vector_), slot(index_) {
AdvanceToValid();
}
reference operator*() const {
return vector[slot];
}
pointer operator->() const {
return &vector[slot];
}
Iterator& operator++() {
++slot.index;
AdvanceToValid();
return *this;
}
Iterator operator++(int) {
Iterator temp = *this;
++(*this);
return temp;
}
bool operator==(const Iterator& other) const {
return slot == other.slot;
}
bool operator!=(const Iterator& other) const {
return !(*this == other);
}
private:
void AdvanceToValid() {
while (slot < vector.values_capacity && !vector.ReadStorageBit(slot.index)) {
++slot.index;
}
}
SlotVector& vector;
SlotId slot;
};
using iterator = Iterator<T, T*, T&>;
using const_iterator = Iterator<const T, const T*, const T&>;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
SlotVector() {
Reserve(InitialCapacity);
}
@ -60,7 +120,7 @@ public:
}
template <typename... Args>
[[nodiscard]] SlotId insert(Args&&... args) noexcept {
SlotId insert(Args&&... args) noexcept {
const u32 index = FreeValueIndex();
new (&values[index].object) T(std::forward<Args>(args)...);
SetStorageBit(index);
@ -78,6 +138,54 @@ public:
return values_capacity - free_list.size();
}
iterator begin() noexcept {
return iterator(*this, 0);
}
const_iterator begin() const noexcept {
return const_iterator(*this, 0);
}
const_iterator cbegin() const noexcept {
return begin();
}
iterator end() noexcept {
return iterator(*this, values_capacity);
}
const_iterator end() const noexcept {
return const_iterator(*this, values_capacity);
}
const_iterator cend() const noexcept {
return end();
}
reverse_iterator rbegin() noexcept {
return reverse_iterator(end());
}
const_reverse_iterator rbegin() const noexcept {
return const_reverse_iterator(end());
}
const_reverse_iterator crbegin() const noexcept {
return rbegin();
}
reverse_iterator rend() noexcept {
return reverse_iterator(begin());
}
const_reverse_iterator rend() const noexcept {
return const_reverse_iterator(begin());
}
const_reverse_iterator crend() const noexcept {
return rend();
}
private:
struct NonTrivialDummy {
NonTrivialDummy() noexcept {}
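With the new forward iterator, which skips any slot whose storage bit is clear, a SlotVector can now be traversed with a range-for. A rough sketch of the intended use; the include path, the erase(SlotId) member, and the Widget type are illustrative assumptions, not taken from this diff:

#include "common/slot_vector.h"

struct Widget {
    int value;
};

void BumpAll(Common::SlotVector<Widget>& widgets) {
    // Iteration visits only allocated slots; AdvanceToValid() steps over holes.
    for (Widget& w : widgets) {
        w.value += 10;
    }
}

void Example() {
    Common::SlotVector<Widget> widgets;
    const Common::SlotId first = widgets.insert(Widget{1});
    widgets.insert(Widget{2});  // insert() is no longer [[nodiscard]], so the id may be dropped
    widgets.erase(first);       // assumed member; leaves a hole in the backing storage
    BumpAll(widgets);           // only the second widget is visited
}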

View File

@ -206,6 +206,10 @@ s32 PS4_SYSV_ABI sceNpTrophyDestroyHandle(OrbisNpTrophyHandle handle) {
if (handle == ORBIS_NP_TROPHY_INVALID_HANDLE)
return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
if (handle >= trophy_handles.size()) {
LOG_ERROR(Lib_NpTrophy, "Invalid handle {}", handle);
return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
}
if (!trophy_handles.is_allocated({static_cast<u32>(handle)})) {
return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
}

View File

@ -8,6 +8,7 @@
#include <magic_enum/magic_enum.hpp>
#include "common/assert.h"
#include "common/config.h"
#include "common/cstring.h"
#include "common/elf_info.h"
#include "common/enum.h"
@ -438,7 +439,7 @@ static Error saveDataMount(const OrbisSaveDataMount2* mount_info,
LOG_INFO(Lib_SaveData, "called with invalid block size");
}
const auto root_save = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir);
const auto root_save = Config::GetSaveDataPath();
fs::create_directories(root_save);
const auto available = fs::space(root_save).available;

View File

@ -156,11 +156,9 @@ public:
}
if (selected == openSaveDataFolder) {
QString userPath;
Common::FS::PathToQString(userPath,
Common::FS::GetUserPath(Common::FS::PathType::UserDir));
QString saveDataPath =
userPath + "/savedata/1/" + QString::fromStdString(m_games[itemID].save_dir);
QString saveDataPath;
Common::FS::PathToQString(saveDataPath,
Config::GetSaveDataPath() / "1" / m_games[itemID].save_dir);
QDir(saveDataPath).mkpath(saveDataPath);
QDesktopServices::openUrl(QUrl::fromLocalFile(saveDataPath));
}
@ -485,8 +483,7 @@ public:
dlc_path, Config::getAddonInstallDir() /
Common::FS::PathFromQString(folder_path).parent_path().filename());
Common::FS::PathToQString(save_data_path,
Common::FS::GetUserPath(Common::FS::PathType::UserDir) /
"savedata/1" / m_games[itemID].serial);
Config::GetSaveDataPath() / "1" / m_games[itemID].save_dir);
Common::FS::PathToQString(trophy_data_path,
Common::FS::GetUserPath(Common::FS::PathType::MetaDataDir) /

View File

@ -154,6 +154,7 @@ void Traverse(EmitContext& ctx, const IR::Program& program) {
for (IR::Inst& inst : node.data.block->Instructions()) {
EmitInst(ctx, &inst);
}
ctx.first_to_last_label_map[label.value] = ctx.last_label;
break;
}
case IR::AbstractSyntaxNode::Type::If: {
@ -298,6 +299,10 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
if (stage == LogicalStage::TessellationControl || stage == LogicalStage::TessellationEval) {
ctx.AddCapability(spv::Capability::Tessellation);
}
if (info.dma_types != IR::Type::Void) {
ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses);
ctx.AddExtension("SPV_KHR_physical_storage_buffer");
}
}
void DefineEntryPoint(const Info& info, EmitContext& ctx, Id main) {
@ -387,7 +392,7 @@ void SetupFloatMode(EmitContext& ctx, const Profile& profile, const RuntimeInfo&
void PatchPhiNodes(const IR::Program& program, EmitContext& ctx) {
auto inst{program.blocks.front()->begin()};
size_t block_index{0};
ctx.PatchDeferredPhi([&](size_t phi_arg) {
ctx.PatchDeferredPhi([&](u32 phi_arg, Id first_parent) {
if (phi_arg == 0) {
++inst;
if (inst == program.blocks[block_index]->end() ||
@ -398,7 +403,9 @@ void PatchPhiNodes(const IR::Program& program, EmitContext& ctx) {
} while (inst->GetOpcode() != IR::Opcode::Phi);
}
}
return ctx.Def(inst->Arg(phi_arg));
const Id arg = ctx.Def(inst->Arg(phi_arg));
const Id parent = ctx.first_to_last_label_map[first_parent.value];
return std::make_pair(arg, parent);
});
}
} // Anonymous namespace

View File

@ -60,7 +60,7 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id
address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
}
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
const auto [scope, semantics]{AtomicArgs(ctx)};
return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] {
@ -257,7 +257,7 @@ Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id co
Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
const auto& buffer = ctx.buffers[binding];
const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
const auto [scope, semantics]{AtomicArgs(ctx)};
return ctx.OpAtomicIIncrement(ctx.U32[1], ptr, scope, semantics);
@ -265,7 +265,7 @@ Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding) {
const auto& buffer = ctx.buffers[binding];
const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
const auto [scope, semantics]{AtomicArgs(ctx)};
return ctx.OpAtomicIDecrement(ctx.U32[1], ptr, scope, semantics);

View File

@ -161,33 +161,37 @@ void EmitGetGotoVariable(EmitContext&) {
UNREACHABLE_MSG("Unreachable instruction");
}
using BufferAlias = EmitContext::BufferAlias;
using PointerType = EmitContext::PointerType;
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst) {
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) {
const u32 flatbuf_off_dw = inst->Flags<u32>();
const auto& srt_flatbuf = ctx.buffers.back();
ASSERT(srt_flatbuf.binding >= 0 && flatbuf_off_dw > 0 &&
srt_flatbuf.buffer_type == BufferType::ReadConstUbo);
LOG_DEBUG(Render_Recompiler, "ReadConst from flatbuf dword {}", flatbuf_off_dw);
const auto [id, pointer_type] = srt_flatbuf[BufferAlias::U32];
const Id ptr{
ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(flatbuf_off_dw))};
return ctx.OpLoad(ctx.U32[1], ptr);
// We can only provide a fallback for immediate offsets.
if (flatbuf_off_dw == 0) {
return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
} else {
return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
ctx.ConstU32(flatbuf_off_dw));
}
}
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
template <PointerType type>
Id ReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
const auto& buffer = ctx.buffers[handle];
index = ctx.OpIAdd(ctx.U32[1], index, buffer.offset_dwords);
const auto [id, pointer_type] = buffer[BufferAlias::U32];
const auto [id, pointer_type] = buffer[type];
const auto value_type = type == PointerType::U32 ? ctx.U32[1] : ctx.F32[1];
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
const Id result{ctx.OpLoad(ctx.U32[1], ptr)};
const Id result{ctx.OpLoad(value_type, ptr)};
if (Sirit::ValidId(buffer.size_dwords)) {
const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer.size_dwords);
return ctx.OpSelect(ctx.U32[1], in_bounds, result, ctx.u32_zero_value);
} else {
return result;
return ctx.OpSelect(value_type, in_bounds, result, ctx.u32_zero_value);
}
return result;
}
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
return ReadConstBuffer<PointerType::U32>(ctx, handle, index);
}
Id EmitReadStepRate(EmitContext& ctx, int rate_idx) {
@ -246,7 +250,7 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
ctx.OpUDiv(ctx.U32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id), step_rate),
ctx.ConstU32(param.num_components)),
ctx.ConstU32(comp));
return EmitReadConstBuffer(ctx, param.buffer_handle, offset);
return ReadConstBuffer<PointerType::F32>(ctx, param.buffer_handle, offset);
}
Id result;
@ -432,7 +436,7 @@ static Id EmitLoadBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size,
return result;
}
template <u32 N, BufferAlias alias>
template <u32 N, PointerType alias>
static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
const auto flags = inst->Flags<IR::BufferInstInfo>();
const auto& spv_buffer = ctx.buffers[handle];
@ -440,7 +444,7 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
const auto& data_types = alias == BufferAlias::U32 ? ctx.U32 : ctx.F32;
const auto& data_types = alias == PointerType::U32 ? ctx.U32 : ctx.F32;
const auto [id, pointer_type] = spv_buffer[alias];
boost::container::static_vector<Id, N> ids;
@ -451,7 +455,7 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
if (!flags.typed) {
// Untyped loads have bounds checking per-component.
ids.push_back(EmitLoadBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords,
result_i, alias == BufferAlias::F32));
result_i, alias == PointerType::F32));
} else {
ids.push_back(result_i);
}
@ -461,7 +465,7 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
if (flags.typed) {
// Typed loads have single bounds check for the whole load.
return EmitLoadBufferBoundsCheck<N>(ctx, index, spv_buffer.size_dwords, result,
alias == BufferAlias::F32);
alias == PointerType::F32);
}
return result;
}
@ -471,7 +475,7 @@ Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
if (Sirit::ValidId(spv_buffer.offset)) {
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const auto [id, pointer_type] = spv_buffer[BufferAlias::U8];
const auto [id, pointer_type] = spv_buffer[PointerType::U8];
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, ptr))};
return EmitLoadBufferBoundsCheck<1>(ctx, address, spv_buffer.size, result, false);
@ -482,7 +486,7 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
if (Sirit::ValidId(spv_buffer.offset)) {
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const auto [id, pointer_type] = spv_buffer[BufferAlias::U16];
const auto [id, pointer_type] = spv_buffer[PointerType::U16];
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, ptr))};
@ -490,35 +494,35 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
}
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<1, BufferAlias::U32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<1, PointerType::U32>(ctx, inst, handle, address);
}
Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<2, BufferAlias::U32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<2, PointerType::U32>(ctx, inst, handle, address);
}
Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<3, BufferAlias::U32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<3, PointerType::U32>(ctx, inst, handle, address);
}
Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<4, BufferAlias::U32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address);
}
Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<1, BufferAlias::F32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address);
}
Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<2, BufferAlias::F32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<2, PointerType::F32>(ctx, inst, handle, address);
}
Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<3, BufferAlias::F32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<3, PointerType::F32>(ctx, inst, handle, address);
}
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<4, BufferAlias::F32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<4, PointerType::F32>(ctx, inst, handle, address);
}
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
@ -548,7 +552,7 @@ void EmitStoreBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto
emit_func();
}
template <u32 N, BufferAlias alias>
template <u32 N, PointerType alias>
static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address,
Id value) {
const auto flags = inst->Flags<IR::BufferInstInfo>();
@ -557,7 +561,7 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
const auto& data_types = alias == BufferAlias::U32 ? ctx.U32 : ctx.F32;
const auto& data_types = alias == PointerType::U32 ? ctx.U32 : ctx.F32;
const auto [id, pointer_type] = spv_buffer[alias];
auto store = [&] {
@ -588,7 +592,7 @@ void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id v
if (Sirit::ValidId(spv_buffer.offset)) {
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const auto [id, pointer_type] = spv_buffer[BufferAlias::U8];
const auto [id, pointer_type] = spv_buffer[PointerType::U8];
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
const Id result{ctx.OpUConvert(ctx.U8, value)};
EmitStoreBufferBoundsCheck<1>(ctx, address, spv_buffer.size, [&] { ctx.OpStore(ptr, result); });
@ -599,7 +603,7 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id
if (Sirit::ValidId(spv_buffer.offset)) {
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const auto [id, pointer_type] = spv_buffer[BufferAlias::U16];
const auto [id, pointer_type] = spv_buffer[PointerType::U16];
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
const Id result{ctx.OpUConvert(ctx.U16, value)};
@ -608,35 +612,35 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id
}
void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<1, BufferAlias::U32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<1, PointerType::U32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<2, BufferAlias::U32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<2, PointerType::U32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<3, BufferAlias::U32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<3, PointerType::U32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<4, BufferAlias::U32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<1, BufferAlias::F32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<2, BufferAlias::F32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<2, PointerType::F32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<3, BufferAlias::F32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<3, PointerType::F32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<4, BufferAlias::F32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<4, PointerType::F32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {

View File

@ -61,7 +61,7 @@ void EmitSetVectorRegister(EmitContext& ctx);
void EmitSetGotoVariable(EmitContext& ctx);
void EmitGetGotoVariable(EmitContext& ctx);
void EmitSetScc(EmitContext& ctx);
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst);
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset);
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index);
Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);

View File

@ -7,6 +7,7 @@
#include "shader_recompiler/frontend/fetch_shader.h"
#include "shader_recompiler/runtime_info.h"
#include "video_core/amdgpu/types.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include <boost/container/static_vector.hpp>
#include <fmt/format.h>
@ -70,6 +71,12 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf
Bindings& binding_)
: Sirit::Module(profile_.supported_spirv), info{info_}, runtime_info{runtime_info_},
profile{profile_}, stage{info.stage}, l_stage{info.l_stage}, binding{binding_} {
if (info.dma_types != IR::Type::Void) {
SetMemoryModel(spv::AddressingModel::PhysicalStorageBuffer64, spv::MemoryModel::GLSL450);
} else {
SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450);
}
AddCapability(spv::Capability::Shader);
DefineArithmeticTypes();
DefineInterfaces();
@ -137,9 +144,13 @@ void EmitContext::DefineArithmeticTypes() {
true_value = ConstantTrue(U1[1]);
false_value = ConstantFalse(U1[1]);
u8_one_value = Constant(U8, 1U);
u8_zero_value = Constant(U8, 0U);
u32_one_value = ConstU32(1U);
u32_zero_value = ConstU32(0U);
f32_zero_value = ConstF32(0.0f);
u64_one_value = Constant(U64, 1ULL);
u64_zero_value = Constant(U64, 0ULL);
pi_x2 = ConstF32(2.0f * float{std::numbers::pi});
@ -157,6 +168,35 @@ void EmitContext::DefineArithmeticTypes() {
if (info.uses_fp64) {
frexp_result_f64 = Name(TypeStruct(F64[1], S32[1]), "frexp_result_f64");
}
if (True(info.dma_types & IR::Type::F64)) {
physical_pointer_types[PointerType::F64] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, F64[1]);
}
if (True(info.dma_types & IR::Type::U64)) {
physical_pointer_types[PointerType::U64] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U64);
}
if (True(info.dma_types & IR::Type::F32)) {
physical_pointer_types[PointerType::F32] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, F32[1]);
}
if (True(info.dma_types & IR::Type::U32)) {
physical_pointer_types[PointerType::U32] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U32[1]);
}
if (True(info.dma_types & IR::Type::F16)) {
physical_pointer_types[PointerType::F16] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, F16[1]);
}
if (True(info.dma_types & IR::Type::U16)) {
physical_pointer_types[PointerType::U16] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U16);
}
if (True(info.dma_types & IR::Type::U8)) {
physical_pointer_types[PointerType::U8] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U8);
}
}
void EmitContext::DefineInterfaces() {
@ -195,9 +235,10 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f
}
Id EmitContext::GetBufferSize(const u32 sharp_idx) {
const auto& srt_flatbuf = buffers.back();
ASSERT(srt_flatbuf.buffer_type == BufferType::ReadConstUbo);
const auto [id, pointer_type] = srt_flatbuf[BufferAlias::U32];
// Can this be done with memory access? Like we do now with ReadConst
const auto& srt_flatbuf = buffers[flatbuf_index];
ASSERT(srt_flatbuf.buffer_type == BufferType::Flatbuf);
const auto [id, pointer_type] = srt_flatbuf[PointerType::U32];
const auto rsrc1{
OpLoad(U32[1], OpAccessChain(pointer_type, id, u32_zero_value, ConstU32(sharp_idx + 1)))};
@ -690,8 +731,14 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
case Shader::BufferType::GdsBuffer:
Name(id, "gds_buffer");
break;
case Shader::BufferType::ReadConstUbo:
Name(id, "srt_flatbuf_ubo");
case Shader::BufferType::Flatbuf:
Name(id, "srt_flatbuf");
break;
case Shader::BufferType::BdaPagetable:
Name(id, "bda_pagetable");
break;
case Shader::BufferType::FaultBuffer:
Name(id, "fault_buffer");
break;
case Shader::BufferType::SharedMemory:
Name(id, "ssbo_shmem");
@ -705,35 +752,53 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
};
void EmitContext::DefineBuffers() {
if (!profile.supports_robust_buffer_access && !info.has_readconst) {
// In case ReadConstUbo has not already been bound by IR and is needed
if (!profile.supports_robust_buffer_access &&
info.readconst_types == Info::ReadConstType::None) {
// In case Flatbuf has not already been bound by IR and is needed
// to query buffer sizes, bind it now.
info.buffers.push_back({
.used_types = IR::Type::U32,
.inline_cbuf = AmdGpu::Buffer::Null(),
.buffer_type = BufferType::ReadConstUbo,
// We can't guarantee that flatbuf will not grow past UBO
// limit if there are a lot of ReadConsts. (We could specialize)
.inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
.buffer_type = BufferType::Flatbuf,
});
// In the future we may want to read buffer sizes from GPU memory if available.
// info.readconst_types |= Info::ReadConstType::Immediate;
}
for (const auto& desc : info.buffers) {
const auto buf_sharp = desc.GetSharp(info);
const bool is_storage = desc.IsStorage(buf_sharp, profile);
// Set indexes for special buffers.
if (desc.buffer_type == BufferType::Flatbuf) {
flatbuf_index = buffers.size();
} else if (desc.buffer_type == BufferType::BdaPagetable) {
bda_pagetable_index = buffers.size();
} else if (desc.buffer_type == BufferType::FaultBuffer) {
fault_buffer_index = buffers.size();
}
// Define aliases depending on the shader usage.
auto& spv_buffer = buffers.emplace_back(binding.buffer++, desc.buffer_type);
if (True(desc.used_types & IR::Type::U64)) {
spv_buffer[PointerType::U64] =
DefineBuffer(is_storage, desc.is_written, 3, desc.buffer_type, U64);
}
if (True(desc.used_types & IR::Type::U32)) {
spv_buffer[BufferAlias::U32] =
spv_buffer[PointerType::U32] =
DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, U32[1]);
}
if (True(desc.used_types & IR::Type::F32)) {
spv_buffer[BufferAlias::F32] =
spv_buffer[PointerType::F32] =
DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, F32[1]);
}
if (True(desc.used_types & IR::Type::U16)) {
spv_buffer[BufferAlias::U16] =
spv_buffer[PointerType::U16] =
DefineBuffer(is_storage, desc.is_written, 1, desc.buffer_type, U16);
}
if (True(desc.used_types & IR::Type::U8)) {
spv_buffer[BufferAlias::U8] =
spv_buffer[PointerType::U8] =
DefineBuffer(is_storage, desc.is_written, 0, desc.buffer_type, U8);
}
++binding.unified;
@ -1003,6 +1068,101 @@ Id EmitContext::DefineUfloatM5ToFloat32(u32 mantissa_bits, const std::string_vie
return func;
}
Id EmitContext::DefineGetBdaPointer() {
const auto caching_pagebits{
Constant(U64, static_cast<u64>(VideoCore::BufferCache::CACHING_PAGEBITS))};
const auto caching_pagemask{Constant(U64, VideoCore::BufferCache::CACHING_PAGESIZE - 1)};
const auto func_type{TypeFunction(U64, U64)};
const auto func{OpFunction(U64, spv::FunctionControlMask::MaskNone, func_type)};
const auto address{OpFunctionParameter(U64)};
Name(func, "get_bda_pointer");
AddLabel();
const auto fault_label{OpLabel()};
const auto available_label{OpLabel()};
const auto merge_label{OpLabel()};
// Get page BDA
const auto page{OpShiftRightLogical(U64, address, caching_pagebits)};
const auto page32{OpUConvert(U32[1], page)};
const auto& bda_buffer{buffers[bda_pagetable_index]};
const auto [bda_buffer_id, bda_pointer_type] = bda_buffer[PointerType::U64];
const auto bda_ptr{OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32)};
const auto bda{OpLoad(U64, bda_ptr)};
// Check if page is GPU cached
const auto is_fault{OpIEqual(U1[1], bda, u64_zero_value)};
OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
OpBranchConditional(is_fault, fault_label, available_label);
// First time access, mark as fault
AddLabel(fault_label);
const auto& fault_buffer{buffers[fault_buffer_index]};
const auto [fault_buffer_id, fault_pointer_type] = fault_buffer[PointerType::U8];
const auto page_div8{OpShiftRightLogical(U32[1], page32, ConstU32(3U))};
const auto page_mod8{OpBitwiseAnd(U32[1], page32, ConstU32(7U))};
const auto page_mask{OpShiftLeftLogical(U8, u8_one_value, page_mod8)};
const auto fault_ptr{
OpAccessChain(fault_pointer_type, fault_buffer_id, u32_zero_value, page_div8)};
const auto fault_value{OpLoad(U8, fault_ptr)};
const auto fault_value_masked{OpBitwiseOr(U8, fault_value, page_mask)};
OpStore(fault_ptr, fault_value_masked);
// Return null pointer
const auto fallback_result{u64_zero_value};
OpBranch(merge_label);
// Value is available, compute address
AddLabel(available_label);
const auto offset_in_bda{OpBitwiseAnd(U64, address, caching_pagemask)};
const auto addr{OpIAdd(U64, bda, offset_in_bda)};
OpBranch(merge_label);
// Merge
AddLabel(merge_label);
const auto result{OpPhi(U64, addr, available_label, fallback_result, fault_label)};
OpReturnValue(result);
OpFunctionEnd();
return func;
}
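// For reference, the SPIR-V helper defined above is equivalent to the host-side
// sketch below. This is an illustration, not part of the commit: the CACHING_*
// constants mirror VideoCore::BufferCache, and the two raw arrays stand in for
// the BDA page table and fault bitmap buffers bound earlier.
static u64 GetBdaPointerSketch(u64 address, const u64* bda_pagetable, u8* fault_bitmap) {
    const u64 page = address >> VideoCore::BufferCache::CACHING_PAGEBITS;
    const u64 bda = bda_pagetable[page];
    if (bda == 0) {
        // First access to this page: mark it in the fault bitmap and return a
        // null pointer so the caller takes its fallback path.
        fault_bitmap[page / 8] |= u8(1u << (page % 8));
        return 0;
    }
    // Page is resident: add the in-page offset to the cached device address.
    return bda + (address & (VideoCore::BufferCache::CACHING_PAGESIZE - 1));
}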
Id EmitContext::DefineReadConst(bool dynamic) {
const auto func_type{!dynamic ? TypeFunction(U32[1], U32[2], U32[1], U32[1])
: TypeFunction(U32[1], U32[2], U32[1])};
const auto func{OpFunction(U32[1], spv::FunctionControlMask::MaskNone, func_type)};
const auto base{OpFunctionParameter(U32[2])};
const auto offset{OpFunctionParameter(U32[1])};
const auto flatbuf_offset{!dynamic ? OpFunctionParameter(U32[1]) : Id{}};
Name(func, dynamic ? "read_const_dynamic" : "read_const");
AddLabel();
const auto base_lo{OpUConvert(U64, OpCompositeExtract(U32[1], base, 0))};
const auto base_hi{OpUConvert(U64, OpCompositeExtract(U32[1], base, 1))};
const auto base_shift{OpShiftLeftLogical(U64, base_hi, ConstU32(32U))};
const auto base_addr{OpBitwiseOr(U64, base_lo, base_shift)};
const auto offset_bytes{OpShiftLeftLogical(U32[1], offset, ConstU32(2U))};
const auto addr{OpIAdd(U64, base_addr, OpUConvert(U64, offset_bytes))};
const auto result = EmitMemoryRead(U32[1], addr, [&]() {
if (dynamic) {
return u32_zero_value;
} else {
const auto& flatbuf_buffer{buffers[flatbuf_index]};
ASSERT(flatbuf_buffer.binding >= 0 &&
flatbuf_buffer.buffer_type == BufferType::Flatbuf);
const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32];
const auto ptr{OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value,
flatbuf_offset)};
return OpLoad(U32[1], ptr);
}
});
OpReturnValue(result);
OpFunctionEnd();
return func;
}
void EmitContext::DefineFunctions() {
if (info.uses_pack_10_11_11) {
f32_to_uf11 = DefineFloat32ToUfloatM5(6, "f32_to_uf11");
@ -1012,6 +1172,18 @@ void EmitContext::DefineFunctions() {
uf11_to_f32 = DefineUfloatM5ToFloat32(6, "uf11_to_f32");
uf10_to_f32 = DefineUfloatM5ToFloat32(5, "uf10_to_f32");
}
if (info.dma_types != IR::Type::Void) {
get_bda_pointer = DefineGetBdaPointer();
}
if (True(info.readconst_types & Info::ReadConstType::Immediate)) {
LOG_DEBUG(Render_Recompiler, "Shader {:#x} uses immediate ReadConst", info.pgm_hash);
read_const = DefineReadConst(false);
}
if (True(info.readconst_types & Info::ReadConstType::Dynamic)) {
LOG_DEBUG(Render_Recompiler, "Shader {:#x} uses dynamic ReadConst", info.pgm_hash);
read_const_dynamic = DefineReadConst(true);
}
}
} // namespace Shader::Backend::SPIRV

View File

@ -4,6 +4,7 @@
#pragma once
#include <array>
#include <unordered_map>
#include <sirit/sirit.h>
#include "shader_recompiler/backend/bindings.h"
@ -41,6 +42,17 @@ public:
Bindings& binding);
~EmitContext();
enum class PointerType : u32 {
U8,
U16,
F16,
U32,
F32,
U64,
F64,
NumAlias,
};
Id Def(const IR::Value& value);
void DefineBufferProperties();
@ -133,12 +145,72 @@ public:
return ConstantComposite(type, constituents);
}
inline Id AddLabel() {
last_label = Module::AddLabel();
return last_label;
}
inline Id AddLabel(Id label) {
last_label = Module::AddLabel(label);
return last_label;
}
PointerType PointerTypeFromType(Id type) {
if (type.value == U8.value)
return PointerType::U8;
if (type.value == U16.value)
return PointerType::U16;
if (type.value == F16[1].value)
return PointerType::F16;
if (type.value == U32[1].value)
return PointerType::U32;
if (type.value == F32[1].value)
return PointerType::F32;
if (type.value == U64.value)
return PointerType::U64;
if (type.value == F64[1].value)
return PointerType::F64;
UNREACHABLE_MSG("Unknown type for pointer");
}
Id EmitMemoryRead(Id type, Id address, auto&& fallback) {
const Id available_label = OpLabel();
const Id fallback_label = OpLabel();
const Id merge_label = OpLabel();
const Id addr = OpFunctionCall(U64, get_bda_pointer, address);
const Id is_available = OpINotEqual(U1[1], addr, u64_zero_value);
OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
OpBranchConditional(is_available, available_label, fallback_label);
// Available
AddLabel(available_label);
const auto pointer_type = PointerTypeFromType(type);
const Id pointer_type_id = physical_pointer_types[pointer_type];
const Id addr_ptr = OpConvertUToPtr(pointer_type_id, addr);
const Id result = OpLoad(type, addr_ptr, spv::MemoryAccessMask::Aligned, 4u);
OpBranch(merge_label);
// Fallback
AddLabel(fallback_label);
const Id fallback_result = fallback();
OpBranch(merge_label);
// Merge
AddLabel(merge_label);
const Id final_result =
OpPhi(type, fallback_result, fallback_label, result, available_label);
return final_result;
}
Info& info;
const RuntimeInfo& runtime_info;
const Profile& profile;
Stage stage;
LogicalStage l_stage{};
Id last_label{};
Id void_id{};
Id U8{};
Id S8{};
@ -161,9 +233,13 @@ public:
Id true_value{};
Id false_value{};
Id u8_one_value{};
Id u8_zero_value{};
Id u32_one_value{};
Id u32_zero_value{};
Id f32_zero_value{};
Id u64_one_value{};
Id u64_zero_value{};
Id shared_u8{};
Id shared_u16{};
@ -231,14 +307,6 @@ public:
bool is_storage = false;
};
enum class BufferAlias : u32 {
U8,
U16,
U32,
F32,
NumAlias,
};
struct BufferSpv {
Id id;
Id pointer_type;
@ -252,22 +320,40 @@ public:
Id size;
Id size_shorts;
Id size_dwords;
std::array<BufferSpv, u32(BufferAlias::NumAlias)> aliases;
std::array<BufferSpv, u32(PointerType::NumAlias)> aliases;
const BufferSpv& operator[](BufferAlias alias) const {
const BufferSpv& operator[](PointerType alias) const {
return aliases[u32(alias)];
}
BufferSpv& operator[](BufferAlias alias) {
BufferSpv& operator[](PointerType alias) {
return aliases[u32(alias)];
}
};
struct PhysicalPointerTypes {
std::array<Id, u32(PointerType::NumAlias)> types;
const Id& operator[](PointerType type) const {
return types[u32(type)];
}
Id& operator[](PointerType type) {
return types[u32(type)];
}
};
Bindings& binding;
boost::container::small_vector<Id, 16> buf_type_ids;
boost::container::small_vector<BufferDefinition, 16> buffers;
boost::container::small_vector<TextureDefinition, 8> images;
boost::container::small_vector<Id, 4> samplers;
PhysicalPointerTypes physical_pointer_types;
std::unordered_map<u32, Id> first_to_last_label_map;
size_t flatbuf_index{};
size_t bda_pagetable_index{};
size_t fault_buffer_index{};
Id sampler_type{};
Id sampler_pointer_type{};
@ -292,6 +378,11 @@ public:
Id uf10_to_f32{};
Id f32_to_uf10{};
Id get_bda_pointer{};
Id read_const{};
Id read_const_dynamic{};
private:
void DefineArithmeticTypes();
void DefineInterfaces();
@ -312,6 +403,10 @@ private:
Id DefineFloat32ToUfloatM5(u32 mantissa_bits, std::string_view name);
Id DefineUfloatM5ToFloat32(u32 mantissa_bits, std::string_view name);
Id DefineGetBdaPointer();
Id DefineReadConst(bool dynamic);
Id GetBufferSize(u32 sharp_idx);
};

View File

@ -39,21 +39,22 @@ void Translator::EmitScalarMemory(const GcnInst& inst) {
void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
const auto& smrd = inst.control.smrd;
const u32 dword_offset = [&] -> u32 {
const IR::ScalarReg sbase{inst.src[0].code * 2};
const IR::U32 dword_offset = [&] -> IR::U32 {
if (smrd.imm) {
return smrd.offset;
return ir.Imm32(smrd.offset);
}
if (smrd.offset == SQ_SRC_LITERAL) {
return inst.src[1].code;
return ir.Imm32(inst.src[1].code);
}
UNREACHABLE();
return ir.ShiftRightLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), ir.Imm32(2));
}();
const IR::ScalarReg sbase{inst.src[0].code * 2};
const IR::Value base =
ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1));
IR::ScalarReg dst_reg{inst.dst[0].code};
for (u32 i = 0; i < num_dwords; i++) {
ir.SetScalarReg(dst_reg + i, ir.ReadConst(base, ir.Imm32(dword_offset + i)));
IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i));
ir.SetScalarReg(dst_reg + i, ir.ReadConst(base, index));
}
}

View File

@ -41,7 +41,9 @@ constexpr u32 NUM_TEXTURE_TYPES = 7;
enum class BufferType : u32 {
Guest,
ReadConstUbo,
Flatbuf,
BdaPagetable,
FaultBuffer,
GdsBuffer,
SharedMemory,
};
@ -215,11 +217,18 @@ struct Info {
bool stores_tess_level_outer{};
bool stores_tess_level_inner{};
bool translation_failed{};
bool has_readconst{};
u8 mrt_mask{0u};
bool has_fetch_shader{false};
u32 fetch_shader_sgpr_base{0u};
enum class ReadConstType {
None = 0,
Immediate = 1 << 0,
Dynamic = 1 << 1,
};
ReadConstType readconst_types{};
IR::Type dma_types{IR::Type::Void};
explicit Info(Stage stage_, LogicalStage l_stage_, ShaderParams params)
: stage{stage_}, l_stage{l_stage_}, pgm_hash{params.hash}, pgm_base{params.Base()},
user_data{params.user_data} {}
@ -277,6 +286,7 @@ struct Info {
sizeof(tess_constants));
}
};
DECLARE_ENUM_FLAG_OPERATORS(Info::ReadConstType);
constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept {
return inline_cbuf ? inline_cbuf : info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);

View File

@ -0,0 +1,44 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "abstract_syntax_list.h"
namespace Shader::IR {
std::string DumpASLNode(const AbstractSyntaxNode& node,
const std::map<const Block*, size_t>& block_to_index,
const std::map<const Inst*, size_t>& inst_to_index) {
switch (node.type) {
case AbstractSyntaxNode::Type::Block:
return fmt::format("Block: ${}", block_to_index.at(node.data.block));
case AbstractSyntaxNode::Type::If:
return fmt::format("If: cond = %{}, body = ${}, merge = ${}",
inst_to_index.at(node.data.if_node.cond.Inst()),
block_to_index.at(node.data.if_node.body),
block_to_index.at(node.data.if_node.merge));
case AbstractSyntaxNode::Type::EndIf:
return fmt::format("EndIf: merge = ${}", block_to_index.at(node.data.end_if.merge));
case AbstractSyntaxNode::Type::Loop:
return fmt::format("Loop: body = ${}, continue = ${}, merge = ${}",
block_to_index.at(node.data.loop.body),
block_to_index.at(node.data.loop.continue_block),
block_to_index.at(node.data.loop.merge));
case AbstractSyntaxNode::Type::Repeat:
return fmt::format("Repeat: cond = %{}, header = ${}, merge = ${}",
inst_to_index.at(node.data.repeat.cond.Inst()),
block_to_index.at(node.data.repeat.loop_header),
block_to_index.at(node.data.repeat.merge));
case AbstractSyntaxNode::Type::Break:
return fmt::format("Break: cond = %{}, merge = ${}, skip = ${}",
inst_to_index.at(node.data.break_node.cond.Inst()),
block_to_index.at(node.data.break_node.merge),
block_to_index.at(node.data.break_node.skip));
case AbstractSyntaxNode::Type::Return:
return "Return";
case AbstractSyntaxNode::Type::Unreachable:
return "Unreachable";
};
UNREACHABLE();
}
} // namespace Shader::IR

View File

@ -3,6 +3,7 @@
#pragma once
#include <map>
#include <vector>
#include "shader_recompiler/ir/value.h"
@ -53,4 +54,8 @@ struct AbstractSyntaxNode {
};
using AbstractSyntaxList = std::vector<AbstractSyntaxNode>;
std::string DumpASLNode(const AbstractSyntaxNode& node,
const std::map<const Block*, size_t>& block_to_index,
const std::map<const Inst*, size_t>& inst_to_index);
} // namespace Shader::IR

View File

@ -2,6 +2,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include "shader_recompiler/ir/program.h"
#include "video_core/buffer_cache/buffer_cache.h"
namespace Shader::Optimization {
@ -79,14 +80,21 @@ void Visit(Info& info, const IR::Inst& inst) {
info.uses_lane_id = true;
break;
case IR::Opcode::ReadConst:
if (!info.has_readconst) {
if (info.readconst_types == Info::ReadConstType::None) {
info.buffers.push_back({
.used_types = IR::Type::U32,
.inline_cbuf = AmdGpu::Buffer::Null(),
.buffer_type = BufferType::ReadConstUbo,
// We can't guarantee that flatbuf will not grow past UBO
// limit if there are a lot of ReadConsts. (We could specialize)
.inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
.buffer_type = BufferType::Flatbuf,
});
info.has_readconst = true;
}
if (inst.Flags<u32>() != 0) {
info.readconst_types |= Info::ReadConstType::Immediate;
} else {
info.readconst_types |= Info::ReadConstType::Dynamic;
}
info.dma_types |= IR::Type::U32;
break;
case IR::Opcode::PackUfloat10_11_11:
info.uses_pack_10_11_11 = true;
@ -105,6 +113,21 @@ void CollectShaderInfoPass(IR::Program& program) {
Visit(program.info, inst);
}
}
if (program.info.dma_types != IR::Type::Void) {
program.info.buffers.push_back({
.used_types = IR::Type::U64,
.inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::BDA_PAGETABLE_SIZE),
.buffer_type = BufferType::BdaPagetable,
.is_written = true,
});
program.info.buffers.push_back({
.used_types = IR::Type::U8,
.inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::FAULT_BUFFER_SIZE),
.buffer_type = BufferType::FaultBuffer,
.is_written = true,
});
}
}
} // namespace Shader::Optimization

View File

@ -6,13 +6,30 @@
#include <fmt/format.h>
#include "common/config.h"
#include "common/io_file.h"
#include "common/path_util.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/program.h"
#include "shader_recompiler/ir/value.h"
namespace Shader::IR {
std::string DumpProgram(const Program& program) {
void DumpProgram(const Program& program, const Info& info, const std::string& type) {
using namespace Common::FS;
if (!Config::dumpShaders()) {
return;
}
const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps";
if (!std::filesystem::exists(dump_dir)) {
std::filesystem::create_directories(dump_dir);
}
const auto ir_filename =
fmt::format("{}_{:#018x}.{}irprogram.txt", info.stage, info.pgm_hash, type);
const auto ir_file = IOFile{dump_dir / ir_filename, FileAccessMode::Write, FileType::TextFile};
size_t index{0};
std::map<const IR::Inst*, size_t> inst_to_index;
std::map<const IR::Block*, size_t> block_to_index;
@ -21,11 +38,20 @@ std::string DumpProgram(const Program& program) {
block_to_index.emplace(block, index);
++index;
}
std::string ret;
for (const auto& block : program.blocks) {
ret += IR::DumpBlock(*block, block_to_index, inst_to_index, index) + '\n';
std::string s = IR::DumpBlock(*block, block_to_index, inst_to_index, index) + '\n';
ir_file.WriteString(s);
}
const auto asl_filename = fmt::format("{}_{:#018x}.{}asl.txt", info.stage, info.pgm_hash, type);
const auto asl_file =
IOFile{dump_dir / asl_filename, FileAccessMode::Write, FileType::TextFile};
for (const auto& node : program.syntax_list) {
std::string s = IR::DumpASLNode(node, block_to_index, inst_to_index) + '\n';
asl_file.WriteString(s);
}
return ret;
}
} // namespace Shader::IR

View File

@ -21,6 +21,6 @@ struct Program {
Info& info;
};
[[nodiscard]] std::string DumpProgram(const Program& program);
void DumpProgram(const Program& program, const Info& info, const std::string& type = "");
} // namespace Shader::IR

View File

@ -46,6 +46,10 @@ inline F32 ApplyReadNumberConversion(IREmitter& ir, const F32& value,
const IR::F32 max = ir.Imm32(float(std::numeric_limits<u16>::max()));
return ir.FPDiv(left, max);
}
case AmdGpu::NumberConversion::Uint32ToUnorm: {
const auto float_val = ir.ConvertUToF(32, 32, ir.BitCast<U32>(value));
return ir.FPDiv(float_val, ir.Imm32(static_cast<float>(std::numeric_limits<u32>::max())));
}
default:
UNREACHABLE();
}
@ -92,6 +96,12 @@ inline F32 ApplyWriteNumberConversion(IREmitter& ir, const F32& value,
const IR::U32 raw = ir.ConvertFToS(32, ir.FPDiv(left, ir.Imm32(2.f)));
return ir.BitCast<F32>(raw);
}
case AmdGpu::NumberConversion::Uint32ToUnorm: {
const auto clamped = ir.FPClamp(value, ir.Imm32(0.f), ir.Imm32(1.f));
const auto unnormalized =
ir.FPMul(clamped, ir.Imm32(static_cast<float>(std::numeric_limits<u32>::max())));
return ir.BitCast<F32>(U32{ir.ConvertFToU(32, unnormalized)});
}
default:
UNREACHABLE();
}
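// Scalar reference for the new Uint32ToUnorm paths (a sketch, not part of the
// commit): reads normalize the raw 32-bit value into [0, 1]; writes clamp,
// rescale and convert back. The sketch widens to double so the final cast stays
// well defined; the GPU path does the multiply in 32-bit float and is therefore
// approximate near 1.0.
inline float Uint32ToUnormRead(u32 raw) {
    return static_cast<float>(raw) / static_cast<float>(std::numeric_limits<u32>::max());
}
inline u32 Uint32ToUnormWrite(float value) {
    const float clamped = value < 0.0f ? 0.0f : (value > 1.0f ? 1.0f : value);
    const double scaled =
        static_cast<double>(clamped) * static_cast<double>(std::numeric_limits<u32>::max());
    return static_cast<u32>(scaled);  // at most 2^32 - 1, so the conversion is well defined
}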

View File

@ -85,6 +85,8 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::CollectShaderInfoPass(program);
Shader::IR::DumpProgram(program, info);
return program;
}

View File

@ -133,6 +133,7 @@ void Liverpool::Process(std::stop_token stoken) {
VideoCore::EndCapture();
if (rasterizer) {
rasterizer->ProcessFaults();
rasterizer->Flush();
}
submit_done = false;

View File

@ -37,6 +37,13 @@ struct Buffer {
return buffer;
}
static constexpr Buffer Placeholder(u32 size) {
Buffer buffer{};
buffer.base_address = 1;
buffer.num_records = size;
return buffer;
}
bool Valid() const {
return type == 0u;
}

View File

@ -197,8 +197,9 @@ enum class NumberConversion : u32 {
UintToUscaled = 1,
SintToSscaled = 2,
UnormToUbnorm = 3,
Sint8ToSnormNz = 5,
Sint16ToSnormNz = 6,
Sint8ToSnormNz = 4,
Sint16ToSnormNz = 5,
Uint32ToUnorm = 6,
};
struct CompMapping {
@ -286,6 +287,17 @@ inline DataFormat RemapDataFormat(const DataFormat format) {
inline NumberFormat RemapNumberFormat(const NumberFormat format, const DataFormat data_format) {
switch (format) {
case NumberFormat::Unorm: {
switch (data_format) {
case DataFormat::Format32:
case DataFormat::Format32_32:
case DataFormat::Format32_32_32:
case DataFormat::Format32_32_32_32:
return NumberFormat::Uint;
default:
return format;
}
}
case NumberFormat::Uscaled:
return NumberFormat::Uint;
case NumberFormat::Sscaled:
@ -341,6 +353,17 @@ inline CompMapping RemapSwizzle(const DataFormat format, const CompMapping swizz
inline NumberConversion MapNumberConversion(const NumberFormat num_fmt, const DataFormat data_fmt) {
switch (num_fmt) {
case NumberFormat::Unorm: {
switch (data_fmt) {
case DataFormat::Format32:
case DataFormat::Format32_32:
case DataFormat::Format32_32_32:
case DataFormat::Format32_32_32_32:
return NumberConversion::Uint32ToUnorm;
default:
return NumberConversion::None;
}
}
case NumberFormat::Uscaled:
return NumberConversion::UintToUscaled;
case NumberFormat::Sscaled:

View File

@ -70,8 +70,11 @@ UniqueBuffer::~UniqueBuffer() {
void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
VmaAllocationInfo* out_alloc_info) {
const bool with_bda = bool(buffer_ci.usage & vk::BufferUsageFlagBits::eShaderDeviceAddress);
const VmaAllocationCreateFlags bda_flag =
with_bda ? VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT : 0;
const VmaAllocationCreateInfo alloc_ci = {
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | bda_flag | MemoryUsageVmaFlags(usage),
.usage = MemoryUsageVma(usage),
.requiredFlags = 0,
.preferredFlags = MemoryUsagePreferredVmaFlags(usage),
@ -86,6 +89,15 @@ void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usa
ASSERT_MSG(result == VK_SUCCESS, "Failed allocating buffer with error {}",
vk::to_string(vk::Result{result}));
buffer = vk::Buffer{unsafe_buffer};
if (with_bda) {
vk::BufferDeviceAddressInfo bda_info{
.buffer = buffer,
};
auto bda_result = device.getBufferAddress(bda_info);
ASSERT_MSG(bda_result != 0, "Failed to get buffer device address");
bda_addr = bda_result;
}
}
Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, MemoryUsage usage_,

View File

@ -68,6 +68,7 @@ struct UniqueBuffer {
VmaAllocator allocator;
VmaAllocation allocation;
vk::Buffer buffer{};
vk::DeviceAddress bda_addr = 0;
};
class Buffer {
@ -115,6 +116,11 @@ public:
return buffer;
}
vk::DeviceAddress BufferDeviceAddress() const noexcept {
ASSERT_MSG(buffer.bda_addr != 0, "Can't get BDA from a non BDA buffer");
return buffer.bda_addr;
}
std::optional<vk::BufferMemoryBarrier2> GetBarrier(
vk::Flags<vk::AccessFlagBits2> dst_acess_mask, vk::PipelineStageFlagBits2 dst_stage,
u32 offset = 0) {

View File

@ -3,13 +3,17 @@
#include <algorithm>
#include "common/alignment.h"
#include "common/debug.h"
#include "common/scope_exit.h"
#include "common/types.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/host_shaders/fault_buffer_process_comp.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_util.h"
#include "video_core/texture_cache/texture_cache.h"
namespace VideoCore {
@ -17,17 +21,26 @@ namespace VideoCore {
static constexpr size_t DataShareBufferSize = 64_KB;
static constexpr size_t StagingBufferSize = 512_MB;
static constexpr size_t UboStreamBufferSize = 128_MB;
static constexpr size_t DownloadBufferSize = 128_MB;
static constexpr size_t MaxPageFaults = 1024;
BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
PageManager& tracker_)
: instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_,
TextureCache& texture_cache_, PageManager& tracker_)
: instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_},
texture_cache{texture_cache_}, tracker{tracker_},
staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
download_buffer(instance, scheduler, MemoryUsage::Download, DownloadBufferSize),
gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
memory_tracker{&tracker} {
bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal,
0, AllFlags, BDA_PAGETABLE_SIZE},
fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE),
memory_tracker{tracker} {
Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(),
"BDA Page Table Buffer");
Vulkan::SetObjectName(instance.GetDevice(), fault_buffer.Handle(), "Fault Buffer");
// Ensure the first slot is used for the null buffer
const auto null_id =
@ -35,15 +48,93 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
ASSERT(null_id.index == 0);
const vk::Buffer& null_buffer = slot_buffers[null_id].buffer;
Vulkan::SetObjectName(instance.GetDevice(), null_buffer, "Null Buffer");
// Prepare the fault buffer parsing pipeline
boost::container::static_vector<vk::DescriptorSetLayoutBinding, 2> bindings{
{
.binding = 0,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
{
.binding = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
};
const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = {
.flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR,
.bindingCount = static_cast<u32>(bindings.size()),
.pBindings = bindings.data(),
};
auto [desc_layout_result, desc_layout] =
instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci);
ASSERT_MSG(desc_layout_result == vk::Result::eSuccess,
"Failed to create descriptor set layout: {}", vk::to_string(desc_layout_result));
fault_process_desc_layout = std::move(desc_layout);
const auto& module = Vulkan::Compile(HostShaders::FAULT_BUFFER_PROCESS_COMP,
vk::ShaderStageFlagBits::eCompute, instance.GetDevice());
Vulkan::SetObjectName(instance.GetDevice(), module, "Fault Buffer Parser");
const vk::SpecializationMapEntry specialization_map_entry = {
.constantID = 0,
.offset = 0,
.size = sizeof(u32),
};
const vk::SpecializationInfo specialization_info = {
.mapEntryCount = 1,
.pMapEntries = &specialization_map_entry,
.dataSize = sizeof(u32),
.pData = &CACHING_PAGEBITS,
};
const vk::PipelineShaderStageCreateInfo shader_ci = {
.stage = vk::ShaderStageFlagBits::eCompute,
.module = module,
.pName = "main",
.pSpecializationInfo = &specialization_info,
};
const vk::PipelineLayoutCreateInfo layout_info = {
.setLayoutCount = 1U,
.pSetLayouts = &(*fault_process_desc_layout),
};
auto [layout_result, layout] = instance.GetDevice().createPipelineLayoutUnique(layout_info);
ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}",
vk::to_string(layout_result));
fault_process_pipeline_layout = std::move(layout);
const vk::ComputePipelineCreateInfo pipeline_info = {
.stage = shader_ci,
.layout = *fault_process_pipeline_layout,
};
auto [pipeline_result, pipeline] =
instance.GetDevice().createComputePipelineUnique({}, pipeline_info);
ASSERT_MSG(pipeline_result == vk::Result::eSuccess, "Failed to create compute pipeline: {}",
vk::to_string(pipeline_result));
fault_process_pipeline = std::move(pipeline);
Vulkan::SetObjectName(instance.GetDevice(), *fault_process_pipeline,
"Fault Buffer Parser Pipeline");
instance.GetDevice().destroyShaderModule(module);
}
BufferCache::~BufferCache() = default;
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) {
const bool is_tracked = IsRegionRegistered(device_addr, size);
if (is_tracked) {
// Mark the page as CPU modified to stop tracking writes.
memory_tracker.MarkRegionAsCpuModified(device_addr, size);
if (unmap) {
return;
}
}
}
@ -69,20 +160,20 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
if (total_size_bytes == 0) {
return;
}
const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
const auto [download, offset] = download_buffer.Map(total_size_bytes);
for (auto& copy : copies) {
// Adjust the copies to account for the download buffer offset
copy.dstOffset += offset;
}
staging_buffer.Commit();
download_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyBuffer(buffer.buffer, staging_buffer.Handle(), copies);
cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
scheduler.Finish();
for (const auto& copy : copies) {
const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
const u64 dst_offset = copy.dstOffset - offset;
std::memcpy(std::bit_cast<u8*>(copy_device_addr), staging + dst_offset, copy.size);
std::memcpy(std::bit_cast<u8*>(copy_device_addr), download + dst_offset, copy.size);
}
}
@ -206,58 +297,37 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo
memcpy(std::bit_cast<void*>(address), value, num_bytes);
return;
}
scheduler.EndRendering();
const Buffer* buffer = [&] {
Buffer* buffer = [&] {
if (is_gds) {
return &gds_buffer;
}
const BufferId buffer_id = FindBuffer(address, num_bytes);
return &slot_buffers[buffer_id];
}();
const auto cmdbuf = scheduler.CommandBuffer();
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = buffer->Handle(),
.offset = buffer->Offset(address),
.size = num_bytes,
};
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
.buffer = buffer->Handle(),
.offset = buffer->Offset(address),
.size = num_bytes,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
// vkCmdUpdateBuffer can only copy up to 65536 bytes at a time.
static constexpr u32 UpdateBufferMaxSize = 65536;
const auto dst_offset = buffer->Offset(address);
for (u32 offset = 0; offset < num_bytes; offset += UpdateBufferMaxSize) {
const auto* update_src = static_cast<const u8*>(value) + offset;
const auto update_dst = dst_offset + offset;
const auto update_size = std::min(num_bytes - offset, UpdateBufferMaxSize);
cmdbuf.updateBuffer(buffer->Handle(), update_dst, update_size, update_src);
InlineDataBuffer(*buffer, address, value, num_bytes);
}
void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned");
if (!is_gds && !IsRegionRegistered(address, num_bytes)) {
memcpy(std::bit_cast<void*>(address), value, num_bytes);
return;
}
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
Buffer* buffer = [&] {
if (is_gds) {
return &gds_buffer;
}
const BufferId buffer_id = FindBuffer(address, num_bytes);
return &slot_buffers[buffer_id];
}();
WriteDataBuffer(*buffer, address, value, num_bytes);
}
std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer, BufferId buffer_id) {
// For small uniform buffers that have not been modified by the GPU,
// use the device-local stream buffer to reduce renderpass breaks.
// Maybe we want to modify the threshold now that the page size is 16KB?
static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
@ -280,7 +350,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
// Check if any buffer contains the full requested range.
const u64 page = gpu_addr >> CACHING_PAGEBITS;
const BufferId buffer_id = page_table[page];
const BufferId buffer_id = page_table[page].buffer_id;
if (buffer_id) {
Buffer& buffer = slot_buffers[buffer_id];
if (buffer.IsInBounds(gpu_addr, size)) {
@ -300,24 +370,8 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size,
}
bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
const VAddr end_addr = addr + size;
const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE);
for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page];
if (!buffer_id) {
++page;
continue;
}
std::shared_lock lk{mutex};
Buffer& buffer = slot_buffers[buffer_id];
const VAddr buf_start_addr = buffer.CpuAddr();
const VAddr buf_end_addr = buf_start_addr + buffer.SizeBytes();
if (buf_start_addr < end_addr && addr < buf_end_addr) {
return true;
}
page = Common::DivCeil(buf_end_addr, CACHING_PAGESIZE);
}
return false;
// Check if we are missing some edge case here
return buffer_ranges.Intersects(addr, size);
}
bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
@ -333,7 +387,7 @@ BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
return NULL_BUFFER_ID;
}
const u64 page = device_addr >> CACHING_PAGEBITS;
const BufferId buffer_id = page_table[page];
const BufferId buffer_id = page_table[page].buffer_id;
if (!buffer_id) {
return CreateBuffer(device_addr, size);
}
@ -379,7 +433,7 @@ BufferCache::OverlapResult BufferCache::ResolveOverlaps(VAddr device_addr, u32 w
}
for (; device_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE);
device_addr += CACHING_PAGESIZE) {
const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS];
const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS].buffer_id;
if (!overlap_id) {
continue;
}
@ -480,11 +534,21 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size);
const u32 size = static_cast<u32>(overlap.end - overlap.begin);
const BufferId new_buffer_id = [&] {
std::scoped_lock lk{mutex};
std::scoped_lock lk{slot_buffers_mutex};
return slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin,
AllFlags, size);
AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size);
}();
auto& new_buffer = slot_buffers[new_buffer_id];
boost::container::small_vector<vk::DeviceAddress, 128> bda_addrs;
const u64 start_page = overlap.begin >> CACHING_PAGEBITS;
const u64 size_pages = size >> CACHING_PAGEBITS;
bda_addrs.reserve(size_pages);
for (u64 i = 0; i < size_pages; ++i) {
vk::DeviceAddress addr = new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS);
bda_addrs.push_back(addr);
}
WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(),
bda_addrs.size() * sizeof(vk::DeviceAddress));
const size_t size_bytes = new_buffer.SizeBytes();
const auto cmdbuf = scheduler.CommandBuffer();
scheduler.EndRendering();
@ -496,6 +560,129 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
return new_buffer_id;
}
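Each 16 KiB caching page of the new buffer now has its device address recorded in bda_pagetable_buffer. As a minimal sketch of the lookup this enables (illustrative C++; the real resolution is emitted into shader code), a guest address splits into a page index and an in-page offset:
#include <cstdint>
// Illustrative: resolve a guest address through a table laid out like
// bda_pagetable_buffer, i.e. one 64-bit device address per caching page.
constexpr std::uint64_t kCachingPageBits = 14;
std::uint64_t ResolveGuestAddress(const std::uint64_t* page_table, std::uint64_t guest_addr) {
    const std::uint64_t page = guest_addr >> kCachingPageBits;
    const std::uint64_t offset = guest_addr & ((1ULL << kCachingPageBits) - 1);
    // The entry written in CreateBuffer already points at this page's base
    // inside the backing buffer, so only the in-page offset is added.
    return page_table[page] + offset;
}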
void BufferCache::ProcessFaultBuffer() {
// Run fault processing shader
const auto [mapped, offset] = download_buffer.Map(MaxPageFaults * sizeof(u64));
vk::BufferMemoryBarrier2 fault_buffer_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eShaderWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.dstAccessMask = vk::AccessFlagBits2::eShaderRead,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = FAULT_BUFFER_SIZE,
};
vk::BufferMemoryBarrier2 download_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.dstAccessMask = vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eShaderWrite,
.buffer = download_buffer.Handle(),
.offset = offset,
.size = MaxPageFaults * sizeof(u64),
};
std::array<vk::BufferMemoryBarrier2, 2> barriers{fault_buffer_barrier, download_barrier};
vk::DescriptorBufferInfo fault_buffer_info{
.buffer = fault_buffer.Handle(),
.offset = 0,
.range = FAULT_BUFFER_SIZE,
};
vk::DescriptorBufferInfo download_info{
.buffer = download_buffer.Handle(),
.offset = offset,
.range = MaxPageFaults * sizeof(u64),
};
boost::container::small_vector<vk::WriteDescriptorSet, 2> writes{
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &fault_buffer_info,
},
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 1,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &download_info,
},
};
download_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.fillBuffer(download_buffer.Handle(), offset, MaxPageFaults * sizeof(u64), 0);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 2,
.pBufferMemoryBarriers = barriers.data(),
});
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_process_pipeline);
cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_process_pipeline_layout, 0,
writes);
constexpr u32 num_threads = CACHING_NUMPAGES / 32; // 1 bit per page, one 32-bit word (32 pages) per thread
constexpr u32 num_workgroups = Common::DivCeil(num_threads, 64u);
cmdbuf.dispatch(num_workgroups, 1, 1);
// Reset fault buffer
const vk::BufferMemoryBarrier2 reset_pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.srcAccessMask = vk::AccessFlagBits2::eShaderRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = FAULT_BUFFER_SIZE,
};
const vk::BufferMemoryBarrier2 reset_post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = FAULT_BUFFER_SIZE,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &reset_pre_barrier,
});
cmdbuf.fillBuffer(fault_buffer.buffer, 0, FAULT_BUFFER_SIZE, 0);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &reset_post_barrier,
});
// Defer creating buffers
scheduler.DeferOperation([this, mapped]() {
// Create buffers for the faulted ranges in batches
boost::icl::interval_set<VAddr> fault_ranges;
const u64* fault_ptr = std::bit_cast<const u64*>(mapped);
const u32 fault_count = static_cast<u32>(*(fault_ptr++));
for (u32 i = 0; i < fault_count; ++i) {
const VAddr fault = *(fault_ptr++);
const VAddr fault_end = fault + CACHING_PAGESIZE; // This can be adjusted
fault_ranges +=
boost::icl::interval_set<VAddr>::interval_type::right_open(fault, fault_end);
LOG_INFO(Render_Vulkan, "Accessed non-GPU mapped memory at {:#x}", fault);
}
for (const auto& range : fault_ranges) {
const VAddr start = range.lower();
const VAddr end = range.upper();
const u64 page_start = start >> CACHING_PAGEBITS;
const u64 page_end = Common::DivCeil(end, CACHING_PAGESIZE);
// Buffer sizes are limited to 32 bits
ASSERT_MSG((range.upper() - range.lower()) <= std::numeric_limits<u32>::max(),
"Buffer size is too large");
CreateBuffer(start, static_cast<u32>(end - start));
}
});
}
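For reference, the fault buffer parsed above is a plain bitmap, one bit per caching page packed into 32-bit words; the forward mapping from a guest address to its bit (the inverse of what the parser shader reconstructs) is, as a sketch:
#include <cstdint>
// Illustrative: which fault-buffer word/bit corresponds to a guest address.
// The parser shader reverses this: page = word_index * 32 + bit.
struct FaultBit {
    std::uint64_t word_index; // index into the array of 32-bit words
    std::uint32_t bit_mask;   // bit within that word
};
FaultBit FaultBitFor(std::uint64_t guest_addr, std::uint32_t caching_page_bits = 14) {
    const std::uint64_t page = guest_addr >> caching_page_bits;
    return FaultBit{page / 32, static_cast<std::uint32_t>(1u << (page % 32))};
}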
void BufferCache::Register(BufferId buffer_id) {
ChangeRegister<true>(buffer_id);
}
@ -514,11 +701,16 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
const u64 page_end = Common::DivCeil(device_addr_end, CACHING_PAGESIZE);
for (u64 page = page_begin; page != page_end; ++page) {
if constexpr (insert) {
page_table[page] = buffer_id;
page_table[page].buffer_id = buffer_id;
} else {
page_table[page] = BufferId{};
page_table[page].buffer_id = BufferId{};
}
}
if constexpr (insert) {
buffer_ranges.Add(buffer.CpuAddr(), buffer.SizeBytes(), buffer_id);
} else {
buffer_ranges.Subtract(buffer.CpuAddr(), buffer.SizeBytes());
}
}
void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
@ -697,6 +889,138 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
return true;
}
void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
if (device_addr == 0) {
return;
}
VAddr device_addr_end = device_addr + size;
ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
RENDERER_TRACE;
VAddr start = std::max(buffer.CpuAddr(), device_addr);
VAddr end = std::min(buffer.CpuAddr() + buffer.SizeBytes(), device_addr_end);
u32 size = static_cast<u32>(end - start);
SynchronizeBuffer(buffer, start, size, false);
});
}
void BufferCache::MemoryBarrier() {
// Vulkan doesn't know which buffer we access in a shader if we use
// BufferDeviceAddress. We need a full memory barrier.
// For now, we only read memory using BDA. If we want to write to it,
// we might need to change this.
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
vk::MemoryBarrier2 barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.memoryBarrierCount = 1,
.pMemoryBarriers = &barrier,
});
}
void BufferCache::InlineDataBuffer(Buffer& buffer, VAddr address, const void* value,
u32 num_bytes) {
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = buffer.Handle(),
.offset = buffer.Offset(address),
.size = num_bytes,
};
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
.buffer = buffer.Handle(),
.offset = buffer.Offset(address),
.size = num_bytes,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
// vkCmdUpdateBuffer can only copy up to 65536 bytes at a time.
static constexpr u32 UpdateBufferMaxSize = 65536;
const auto dst_offset = buffer.Offset(address);
for (u32 offset = 0; offset < num_bytes; offset += UpdateBufferMaxSize) {
const auto* update_src = static_cast<const u8*>(value) + offset;
const auto update_dst = dst_offset + offset;
const auto update_size = std::min(num_bytes - offset, UpdateBufferMaxSize);
cmdbuf.updateBuffer(buffer.Handle(), update_dst, update_size, update_src);
}
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
}
void BufferCache::WriteDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes) {
vk::BufferCopy copy = {
.srcOffset = 0,
.dstOffset = buffer.Offset(address),
.size = num_bytes,
};
vk::Buffer src_buffer = staging_buffer.Handle();
if (num_bytes < StagingBufferSize) {
const auto [staging, offset] = staging_buffer.Map(num_bytes);
std::memcpy(staging, value, num_bytes);
copy.srcOffset = offset;
staging_buffer.Commit();
} else {
// For large one-time transfers, use a temporary host buffer.
// RenderDoc can lag quite a bit if the stream buffer is too large.
Buffer temp_buffer{
instance, scheduler, MemoryUsage::Upload, 0, vk::BufferUsageFlagBits::eTransferSrc,
num_bytes};
src_buffer = temp_buffer.Handle();
u8* const staging = temp_buffer.mapped_data.data();
std::memcpy(staging, value, num_bytes);
scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {});
}
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = buffer.Handle(),
.offset = buffer.Offset(address),
.size = num_bytes,
};
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
.buffer = buffer.Handle(),
.offset = buffer.Offset(address),
.size = num_bytes,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
cmdbuf.copyBuffer(src_buffer, buffer.Handle(), copy);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
}
void BufferCache::DeleteBuffer(BufferId buffer_id) {
Buffer& buffer = slot_buffers[buffer_id];
Unregister(buffer_id);

View File

@ -38,14 +38,22 @@ class TextureCache;
class BufferCache {
public:
static constexpr u32 CACHING_PAGEBITS = 12;
static constexpr u32 CACHING_PAGEBITS = 14;
static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
static constexpr u64 DEVICE_PAGESIZE = 4_KB;
static constexpr u64 DEVICE_PAGESIZE = 16_KB;
static constexpr u64 CACHING_NUMPAGES = u64{1} << (40 - CACHING_PAGEBITS);
static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress);
static constexpr u64 FAULT_BUFFER_SIZE = CACHING_NUMPAGES / 8; // Bit per page
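For reference, the sizes these constants work out to with the 40-bit address space and 14-bit caching pages (a standalone check, not part of the header):
#include <cstdint>
constexpr std::uint64_t kPageBits = 14;                       // CACHING_PAGEBITS
constexpr std::uint64_t kNumPages = 1ULL << (40 - kPageBits); // 2^26 = 67,108,864 pages
constexpr std::uint64_t kBdaPagetableBytes = kNumPages * 8;   // one vk::DeviceAddress per page
constexpr std::uint64_t kFaultBufferBytes = kNumPages / 8;    // one bit per page
static_assert(kBdaPagetableBytes == 512ULL << 20, "BDA page table is 512 MiB");
static_assert(kFaultBufferBytes == 8ULL << 20, "fault buffer is 8 MiB");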
struct PageData {
BufferId buffer_id{};
};
struct Traits {
using Entry = BufferId;
using Entry = PageData;
static constexpr size_t AddressSpaceBits = 40;
static constexpr size_t FirstLevelBits = 14;
static constexpr size_t FirstLevelBits = 16;
static constexpr size_t PageBits = CACHING_PAGEBITS;
};
using PageTable = MultiLevelPageTable<Traits>;
@ -59,8 +67,8 @@ public:
public:
explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
AmdGpu::Liverpool* liverpool, TextureCache& texture_cache,
PageManager& tracker);
Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool,
TextureCache& texture_cache, PageManager& tracker);
~BufferCache();
/// Returns a pointer to GDS device local buffer.
@ -73,13 +81,23 @@ public:
return stream_buffer;
}
/// Retrieves the device local BDA page table buffer.
[[nodiscard]] Buffer* GetBdaPageTableBuffer() noexcept {
return &bda_pagetable_buffer;
}
/// Retrieves the fault buffer.
[[nodiscard]] Buffer* GetFaultBuffer() noexcept {
return &fault_buffer;
}
/// Retrieves the buffer with the specified id.
[[nodiscard]] Buffer& GetBuffer(BufferId id) {
return slot_buffers[id];
}
/// Invalidates any buffer in the logical page range.
void InvalidateMemory(VAddr device_addr, u64 size);
void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
/// Binds host vertex buffers for the current draw.
void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@ -87,9 +105,12 @@ public:
/// Bind host index buffer for the current draw.
void BindIndexBuffer(u32 index_offset);
/// Writes a value to GPU buffer.
/// Writes a value to GPU buffer. (uses command buffer to temporarily store the data)
void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
/// Writes a value to GPU buffer. (uses staging buffer to temporarily store the data)
void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
/// Obtains a buffer for the specified region.
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
bool is_texel_buffer = false,
@ -108,24 +129,29 @@ public:
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
[[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size);
/// Return buffer id for the specified region
BufferId FindBuffer(VAddr device_addr, u32 size);
/// Processes the fault buffer.
void ProcessFaultBuffer();
/// Synchronizes all buffers in the specified range.
void SynchronizeBuffersInRange(VAddr device_addr, u64 size);
/// Synchronizes all buffers needed for DMA.
void SynchronizeDmaBuffers();
/// Record memory barrier. Used for buffers when accessed via BDA.
void MemoryBarrier();
private:
template <typename Func>
void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {
const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page];
if (!buffer_id) {
++page;
continue;
}
Buffer& buffer = slot_buffers[buffer_id];
func(buffer_id, buffer);
const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
}
buffer_ranges.ForEachInRange(device_addr, size,
[&](u64 page_start, u64 page_end, BufferId id) {
Buffer& buffer = slot_buffers[id];
func(id, buffer);
});
}
void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);
@ -134,7 +160,7 @@ private:
void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
[[nodiscard]] BufferId CreateBuffer(VAddr device_addr, u32 wanted_size);
BufferId CreateBuffer(VAddr device_addr, u32 wanted_size);
void Register(BufferId buffer_id);
@ -147,21 +173,33 @@ private:
bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);
void InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes);
void WriteDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes);
void DeleteBuffer(BufferId buffer_id);
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
Vulkan::Rasterizer& rasterizer;
AmdGpu::Liverpool* liverpool;
TextureCache& texture_cache;
PageManager& tracker;
StreamBuffer staging_buffer;
StreamBuffer stream_buffer;
StreamBuffer download_buffer;
Buffer gds_buffer;
std::shared_mutex mutex;
Buffer bda_pagetable_buffer;
Buffer fault_buffer;
std::shared_mutex slot_buffers_mutex;
Common::SlotVector<Buffer> slot_buffers;
RangeSet gpu_modified_ranges;
SplitRangeMap<BufferId> buffer_ranges;
MemoryTracker memory_tracker;
PageTable page_table;
vk::UniqueDescriptorSetLayout fault_process_desc_layout;
vk::UniquePipeline fault_process_pipeline;
vk::UniquePipelineLayout fault_process_pipeline_layout;
};
} // namespace VideoCore

View File

@ -7,6 +7,7 @@
#include <deque>
#include <type_traits>
#include <vector>
#include "common/debug.h"
#include "common/types.h"
#include "video_core/buffer_cache/word_manager.h"
@ -19,11 +20,11 @@ public:
static constexpr size_t MANAGER_POOL_SIZE = 32;
public:
explicit MemoryTracker(PageManager* tracker_) : tracker{tracker_} {}
explicit MemoryTracker(PageManager& tracker_) : tracker{&tracker_} {}
~MemoryTracker() = default;
/// Returns true if a region has been modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<true>(
query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
return manager->template IsRegionModified<Type::CPU>(offset, size);
@ -31,7 +32,7 @@ public:
}
/// Returns true if a region has been modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<false>(
query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
return manager->template IsRegionModified<Type::GPU>(offset, size);
@ -57,8 +58,7 @@ public:
}
/// Call 'func' for each CPU modified range and unmark those pages as CPU modified
template <typename Func>
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
IteratePages<true>(query_cpu_range, query_size,
[&func](RegionManager* manager, u64 offset, size_t size) {
manager->template ForEachModifiedRange<Type::CPU, true>(
@ -67,17 +67,12 @@ public:
}
/// Call 'func' for each GPU modified range and unmark those pages as GPU modified
template <bool clear, typename Func>
void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
template <bool clear>
void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
IteratePages<false>(query_cpu_range, query_size,
[&func](RegionManager* manager, u64 offset, size_t size) {
if constexpr (clear) {
manager->template ForEachModifiedRange<Type::GPU, true>(
manager->GetCpuAddr() + offset, size, func);
} else {
manager->template ForEachModifiedRange<Type::GPU, false>(
manager->GetCpuAddr() + offset, size, func);
}
manager->template ForEachModifiedRange<Type::GPU, clear>(
manager->GetCpuAddr() + offset, size, func);
});
}
@ -91,6 +86,7 @@ private:
*/
template <bool create_region_on_fail, typename Func>
bool IteratePages(VAddr cpu_address, size_t size, Func&& func) {
RENDERER_TRACE;
using FuncReturn = typename std::invoke_result<Func, RegionManager*, u64, size_t>::type;
static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
std::size_t remaining_size{size};

View File

@ -3,7 +3,10 @@
#pragma once
#include <boost/icl/discrete_interval.hpp>
#include <boost/icl/interval_map.hpp>
#include <boost/icl/split_interval_map.hpp>
#include <boost/icl/split_interval_set.hpp>
#include <boost/pool/pool.hpp>
#include <boost/pool/pool_alloc.hpp>
#include <boost/pool/poolfwd.hpp>
@ -38,6 +41,22 @@ struct RangeSet {
m_ranges_set.subtract(interval);
}
void Clear() {
m_ranges_set.clear();
}
bool Contains(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::contains(m_ranges_set, interval);
}
bool Intersects(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::intersects(m_ranges_set, interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_set.empty()) {
@ -77,14 +96,29 @@ struct RangeSet {
}
}
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}
base_addr = range_end;
});
if (base_addr != end_addr) {
func(base_addr, end_addr - base_addr);
}
}
IntervalSet m_ranges_set;
};
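A small usage sketch of the boost::icl behaviour the new Contains/Intersects/ForEachNotInRange helpers build on, using a plain boost::icl::interval_set instead of the RangeSet wrapper (values are illustrative):
#include <cstdint>
#include <cstdio>
#include <boost/icl/interval_set.hpp>
int main() {
    using Set = boost::icl::interval_set<std::uint64_t>;
    Set ranges;
    ranges += Set::interval_type::right_open(0x2000, 0x3000);
    ranges += Set::interval_type::right_open(0x5000, 0x6000);
    // Intersects-style query: does [0x2800, 0x5800) overlap any stored range?
    const auto probe = Set::interval_type::right_open(0x2800, 0x5800);
    std::printf("intersects: %s\n", boost::icl::intersects(ranges, probe) ? "yes" : "no");
    // Iterating yields the stored (coalesced) intervals, which is what
    // ForEachInRange/ForEachNotInRange walk to report ranges and gaps.
    for (const auto& iv : ranges) {
        std::printf("[%#llx, %#llx)\n", static_cast<unsigned long long>(iv.lower()),
                    static_cast<unsigned long long>(iv.upper()));
    }
}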
template <typename T>
class RangeMap {
public:
using IntervalMap =
boost::icl::interval_map<VAddr, u64, boost::icl::partial_absorber, std::less,
boost::icl::inplace_plus, boost::icl::inter_section,
boost::icl::interval_map<VAddr, T, boost::icl::total_absorber, std::less,
boost::icl::inplace_identity, boost::icl::inter_section,
ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalMap::interval_type;
@ -99,7 +133,7 @@ public:
RangeMap(RangeMap&& other);
RangeMap& operator=(RangeMap&& other);
void Add(VAddr base_address, size_t size, u64 value) {
void Add(VAddr base_address, size_t size, const T& value) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map.add({interval, value});
@ -111,6 +145,35 @@ public:
m_ranges_map -= interval;
}
void Clear() {
m_ranges_map.clear();
}
bool Contains(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::contains(m_ranges_map, interval);
}
bool Intersects(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::intersects(m_ranges_map, interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
for (const auto& [interval, value] : m_ranges_map) {
const VAddr inter_addr_end = interval.upper();
const VAddr inter_addr = interval.lower();
func(inter_addr, inter_addr_end, value);
}
}
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_map.empty()) {
@ -140,7 +203,111 @@ public:
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, u64) {
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, const T&) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}
base_addr = range_end;
});
if (base_addr != end_addr) {
func(base_addr, end_addr - base_addr);
}
}
private:
IntervalMap m_ranges_map;
};
template <typename T>
class SplitRangeMap {
public:
using IntervalMap = boost::icl::split_interval_map<
VAddr, T, boost::icl::total_absorber, std::less, boost::icl::inplace_identity,
boost::icl::inter_section, ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalMap::interval_type;
public:
SplitRangeMap() = default;
~SplitRangeMap() = default;
SplitRangeMap(SplitRangeMap const&) = delete;
SplitRangeMap& operator=(SplitRangeMap const&) = delete;
SplitRangeMap(SplitRangeMap&& other);
SplitRangeMap& operator=(SplitRangeMap&& other);
void Add(VAddr base_address, size_t size, const T& value) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map.add({interval, value});
}
void Subtract(VAddr base_address, size_t size) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map -= interval;
}
void Clear() {
m_ranges_map.clear();
}
bool Contains(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::contains(m_ranges_map, interval);
}
bool Intersects(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::intersects(m_ranges_map, interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
for (const auto& [interval, value] : m_ranges_map) {
const VAddr inter_addr_end = interval.upper();
const VAddr inter_addr = interval.lower();
func(inter_addr, inter_addr_end, value);
}
}
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
const VAddr start_address = base_addr;
const VAddr end_address = start_address + size;
const IntervalType search_interval{start_address, end_address};
auto it = m_ranges_map.lower_bound(search_interval);
if (it == m_ranges_map.end()) {
return;
}
auto end_it = m_ranges_map.upper_bound(search_interval);
for (; it != end_it; it++) {
VAddr inter_addr_end = it->first.upper();
VAddr inter_addr = it->first.lower();
if (inter_addr_end > end_address) {
inter_addr_end = end_address;
}
if (inter_addr < start_address) {
inter_addr = start_address;
}
func(inter_addr, inter_addr_end, it->second);
}
}
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, const T&) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}
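The reason buffer_ranges uses a split interval map rather than the joining RangeMap above: boost::icl::interval_map merges adjacent intervals with equal values, while split_interval_map preserves the borders of every inserted interval, so neighbouring buffers keep separate segments. A small illustration with default ICL traits and illustrative values:
#include <cstdint>
#include <cstdio>
#include <boost/icl/interval_map.hpp>
#include <boost/icl/split_interval_map.hpp>
int main() {
    using Joining = boost::icl::interval_map<std::uint64_t, int>;
    using Splitting = boost::icl::split_interval_map<std::uint64_t, int>;
    Joining joining;
    joining.add({Joining::interval_type::right_open(0x1000, 0x2000), 7});
    joining.add({Joining::interval_type::right_open(0x2000, 0x3000), 7});
    Splitting splitting;
    splitting.add({Splitting::interval_type::right_open(0x1000, 0x2000), 7});
    splitting.add({Splitting::interval_type::right_open(0x2000, 0x3000), 7});
    // Adjacent intervals with equal values are merged by the joining map only.
    std::printf("joining segments: %zu\n", joining.iterative_size());     // 1
    std::printf("splitting segments: %zu\n", splitting.iterative_size()); // 2
}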

View File

@ -10,8 +10,10 @@
#ifdef __linux__
#include "common/adaptive_mutex.h"
#endif
#else
#include "common/spin_lock.h"
#endif
#include "common/debug.h"
#include "common/types.h"
#include "video_core/page_manager.h"
@ -56,7 +58,7 @@ public:
return cpu_addr;
}
static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) {
static constexpr u64 ExtractBits(u64 word, size_t page_start, size_t page_end) {
constexpr size_t number_bits = sizeof(u64) * 8;
const size_t limit_page_end = number_bits - std::min(page_end, number_bits);
u64 bits = (word >> page_start) << page_start;
@ -64,7 +66,7 @@ public:
return bits;
}
static std::pair<size_t, size_t> GetWordPage(VAddr address) {
static constexpr std::pair<size_t, size_t> GetWordPage(VAddr address) {
const size_t converted_address = static_cast<size_t>(address);
const size_t word_number = converted_address / BYTES_PER_WORD;
const size_t amount_pages = converted_address % BYTES_PER_WORD;
@ -73,6 +75,7 @@ public:
template <typename Func>
void IterateWords(size_t offset, size_t size, Func&& func) const {
RENDERER_TRACE;
using FuncReturn = std::invoke_result_t<Func, std::size_t, u64>;
static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
const size_t start = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset), 0LL));
@ -104,13 +107,13 @@ public:
}
}
template <typename Func>
void IteratePages(u64 mask, Func&& func) const {
void IteratePages(u64 mask, auto&& func) const {
RENDERER_TRACE;
size_t offset = 0;
while (mask != 0) {
const size_t empty_bits = std::countr_zero(mask);
offset += empty_bits;
mask = mask >> empty_bits;
mask >>= empty_bits;
const size_t continuous_bits = std::countr_one(mask);
func(offset, continuous_bits);
@ -155,8 +158,9 @@ public:
* @param size Size in bytes of the CPU range to loop over
* @param func Function to call for each turned off region
*/
template <Type type, bool clear, typename Func>
void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) {
template <Type type, bool clear>
void ForEachModifiedRange(VAddr query_cpu_range, s64 size, auto&& func) {
RENDERER_TRACE;
std::scoped_lock lk{lock};
static_assert(type != Type::Untracked);
@ -170,6 +174,7 @@ public:
(pending_pointer - pending_offset) * BYTES_PER_PAGE);
};
IterateWords(offset, size, [&](size_t index, u64 mask) {
RENDERER_TRACE;
if constexpr (type == Type::GPU) {
mask &= ~untracked[index];
}
@ -177,14 +182,13 @@ public:
if constexpr (clear) {
if constexpr (type == Type::CPU) {
UpdateProtection<true>(index, untracked[index], mask);
}
state_words[index] &= ~mask;
if constexpr (type == Type::CPU) {
untracked[index] &= ~mask;
}
state_words[index] &= ~mask;
}
const size_t base_offset = index * PAGES_PER_WORD;
IteratePages(word, [&](size_t pages_offset, size_t pages_size) {
RENDERER_TRACE;
const auto reset = [&]() {
pending_offset = base_offset + pages_offset;
pending_pointer = base_offset + pages_offset + pages_size;
@ -245,11 +249,13 @@ private:
*/
template <bool add_to_tracker>
void UpdateProtection(u64 word_index, u64 current_bits, u64 new_bits) const {
RENDERER_TRACE;
constexpr s32 delta = add_to_tracker ? 1 : -1;
u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits;
VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
IteratePages(changed_bits, [&](size_t offset, size_t size) {
tracker->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, size * BYTES_PER_PAGE,
add_to_tracker ? 1 : -1);
tracker->UpdatePageWatchers<delta>(addr + offset * BYTES_PER_PAGE,
size * BYTES_PER_PAGE);
});
}
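IteratePages above walks a 64-bit dirty mask as runs of set bits using std::countr_zero/std::countr_one; the same pattern in a standalone form (illustrative, with the loop continuation the hunk elides):
#include <bit>
#include <cstdint>
#include <cstdio>
// Calls func(offset, length) for each contiguous run of set bits in mask.
template <typename Func>
void ForEachBitRun(std::uint64_t mask, Func&& func) {
    std::size_t offset = 0;
    while (mask != 0) {
        const std::size_t zeros = std::countr_zero(mask); // skip the cleared gap
        offset += zeros;
        mask >>= zeros;
        const std::size_t ones = std::countr_one(mask);   // length of the set run
        func(offset, ones);
        offset += ones;
        mask = ones < 64 ? mask >> ones : 0;              // avoid UB on a full-width run
    }
}
int main() {
    // Bits 1-2 and 5-7 are set: prints "run @1 len 2" then "run @5 len 3".
    ForEachBitRun(0b1110'0110ULL, [](std::size_t off, std::size_t len) {
        std::printf("run @%zu len %zu\n", off, len);
    });
}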

View File

@ -11,6 +11,7 @@ set(SHADER_FILES
detilers/micro_32bpp.comp
detilers/micro_64bpp.comp
detilers/micro_8bpp.comp
fault_buffer_process.comp
fs_tri.vert
fsr.comp
post_process.frag

View File

@ -0,0 +1,42 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450
#extension GL_ARB_gpu_shader_int64 : enable
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(std430, binding = 0) buffer input_buf {
uint fault_buffer[];
};
layout(std430, binding = 1) buffer output_buf {
uint64_t download_buffer[];
};
// Overlapping view of the same binding for 32-bit atomics
layout(std430, binding = 1) buffer output_buf32 {
uint download_buffer32[];
};
layout(constant_id = 0) const uint CACHING_PAGEBITS = 0;
void main() {
uint id = gl_GlobalInvocationID.x;
uint word = fault_buffer[id];
if (word == 0u) {
return;
}
// 1 page per bit
uint base_bit = id * 32u;
while (word != 0u) {
uint bit = findLSB(word);
word &= word - 1;
uint page = base_bit + bit;
uint store_index = atomicAdd(download_buffer32[0], 1u) + 1u;
// It is very unlikely, but should we check for overflow?
if (store_index < 1024u) { // only support 1024 page faults
download_buffer[store_index] = uint64_t(page) << CACHING_PAGEBITS;
}
}
}

View File

@ -1,11 +1,9 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <thread>
#include <boost/icl/interval_set.hpp>
#include "common/alignment.h"
#include <boost/container/small_vector.hpp>
#include "common/assert.h"
#include "common/error.h"
#include "common/debug.h"
#include "common/signal_context.h"
#include "core/memory.h"
#include "core/signals.h"
@ -15,23 +13,60 @@
#ifndef _WIN64
#include <sys/mman.h>
#ifdef ENABLE_USERFAULTFD
#include <thread>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <sys/ioctl.h>
#include "common/error.h"
#endif
#else
#include <windows.h>
#endif
#ifdef __linux__
#include "common/adaptive_mutex.h"
#else
#include "common/spin_lock.h"
#endif
namespace VideoCore {
constexpr size_t PAGESIZE = 4_KB;
constexpr size_t PAGEBITS = 12;
constexpr size_t PAGE_SIZE = 4_KB;
constexpr size_t PAGE_BITS = 12;
#ifdef ENABLE_USERFAULTFD
struct PageManager::Impl {
Impl(Vulkan::Rasterizer* rasterizer_) : rasterizer{rasterizer_} {
struct PageState {
u8 num_watchers{};
Core::MemoryPermission Perm() const noexcept {
return num_watchers == 0 ? Core::MemoryPermission::ReadWrite
: Core::MemoryPermission::Read;
}
template <s32 delta>
u8 AddDelta() {
if constexpr (delta == 1) {
return ++num_watchers;
} else {
ASSERT_MSG(num_watchers > 0, "Not enough watchers");
return --num_watchers;
}
}
};
struct UpdateProtectRange {
VAddr addr;
u64 size;
Core::MemoryPermission perms;
};
static constexpr size_t ADDRESS_BITS = 40;
static constexpr size_t NUM_ADDRESS_PAGES = 1ULL << (40 - PAGE_BITS);
inline static Vulkan::Rasterizer* rasterizer;
#ifdef ENABLE_USERFAULTFD
Impl(Vulkan::Rasterizer* rasterizer_) {
rasterizer = rasterizer_;
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
ASSERT_MSG(uffd != -1, "{}", Common::GetLastErrorMsg());
@ -63,7 +98,8 @@ struct PageManager::Impl {
ASSERT_MSG(ret != -1, "Uffdio unregister failed");
}
void Protect(VAddr address, size_t size, bool allow_write) {
void Protect(VAddr address, size_t size, Core::MemoryPermission perms) {
bool allow_write = True(perms & Core::MemoryPermission::Write);
uffdio_writeprotect wp;
wp.range.start = address;
wp.range.len = size;
@ -118,12 +154,9 @@ struct PageManager::Impl {
}
}
Vulkan::Rasterizer* rasterizer;
std::jthread ufd_thread;
int uffd;
};
#else
struct PageManager::Impl {
Impl(Vulkan::Rasterizer* rasterizer_) {
rasterizer = rasterizer_;
@ -141,12 +174,11 @@ struct PageManager::Impl {
// No-op
}
void Protect(VAddr address, size_t size, bool allow_write) {
void Protect(VAddr address, size_t size, Core::MemoryPermission perms) {
RENDERER_TRACE;
auto* memory = Core::Memory::Instance();
auto& impl = memory->GetAddressSpace();
impl.Protect(address, size,
allow_write ? Core::MemoryPermission::ReadWrite
: Core::MemoryPermission::Read);
impl.Protect(address, size, perms);
}
static bool GuestFaultSignalHandler(void* context, void* fault_address) {
@ -157,23 +189,76 @@ struct PageManager::Impl {
return false;
}
inline static Vulkan::Rasterizer* rasterizer;
};
#endif
template <s32 delta>
void UpdatePageWatchers(VAddr addr, u64 size) {
RENDERER_TRACE;
boost::container::small_vector<UpdateProtectRange, 16> update_ranges;
{
std::scoped_lock lk(lock);
size_t page = addr >> PAGE_BITS;
auto perms = cached_pages[page].Perm();
u64 range_begin = 0;
u64 range_bytes = 0;
const auto release_pending = [&] {
if (range_bytes > 0) {
RENDERER_TRACE;
// Add pending (un)protect action
update_ranges.push_back({range_begin << PAGE_BITS, range_bytes, perms});
range_bytes = 0;
}
};
// Iterate requested pages
const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
for (; page != page_end; ++page) {
PageState& state = cached_pages[page];
// Apply the change to the page state
const u8 new_count = state.AddDelta<delta>();
// If the protection changed add pending (un)protect action
if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] {
release_pending();
perms = new_perms;
}
// If the page must be (un)protected, add it to the pending range
if ((new_count == 0 && delta < 0) || (new_count == 1 && delta > 0)) {
if (range_bytes == 0) {
range_begin = page;
}
range_bytes += PAGE_SIZE;
} else {
release_pending();
}
}
// Add pending (un)protect action
release_pending();
}
// Flush deferred protects
for (const auto& range : update_ranges) {
Protect(range.addr, range.size, range.perms);
}
}
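In short, the watcher model above maps a per-page counter to a protection: zero watchers means ReadWrite, the first watcher drops the page to Read so CPU writes fault and get tracked, and consecutive pages whose state flips together are coalesced into a single Protect call. A trivial standalone illustration of the counter-to-permission rule:
#include <cassert>
#include <cstdint>
enum class Perm { Read, ReadWrite };
struct WatcherCount {
    std::uint8_t num_watchers = 0;
    Perm Current() const { return num_watchers == 0 ? Perm::ReadWrite : Perm::Read; }
};
int main() {
    WatcherCount page;
    assert(page.Current() == Perm::ReadWrite); // untracked: CPU writes pass through
    ++page.num_watchers;
    assert(page.Current() == Perm::Read);      // first watcher: write-protect the page
    --page.num_watchers;
    assert(page.Current() == Perm::ReadWrite); // last watcher removed: restore access
}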
std::array<PageState, NUM_ADDRESS_PAGES> cached_pages{};
#ifdef __linux__
Common::AdaptiveMutex lock;
#else
Common::SpinLock lock;
#endif
};
PageManager::PageManager(Vulkan::Rasterizer* rasterizer_)
: impl{std::make_unique<Impl>(rasterizer_)}, rasterizer{rasterizer_} {}
: impl{std::make_unique<Impl>(rasterizer_)} {}
PageManager::~PageManager() = default;
VAddr PageManager::GetPageAddr(VAddr addr) {
return Common::AlignDown(addr, PAGESIZE);
}
VAddr PageManager::GetNextPageAddr(VAddr addr) {
return Common::AlignUp(addr + 1, PAGESIZE);
}
void PageManager::OnGpuMap(VAddr address, size_t size) {
impl->OnMap(address, size);
}
@ -182,41 +267,12 @@ void PageManager::OnGpuUnmap(VAddr address, size_t size) {
impl->OnUnmap(address, size);
}
void PageManager::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) {
static constexpr u64 PageShift = 12;
std::scoped_lock lk{lock};
const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1;
const u64 page_start = addr >> PageShift;
const u64 page_end = page_start + num_pages;
const auto pages_interval =
decltype(cached_pages)::interval_type::right_open(page_start, page_end);
if (delta > 0) {
cached_pages.add({pages_interval, delta});
}
const auto& range = cached_pages.equal_range(pages_interval);
for (const auto& [range, count] : boost::make_iterator_range(range)) {
const auto interval = range & pages_interval;
const VAddr interval_start_addr = boost::icl::first(interval) << PageShift;
const VAddr interval_end_addr = boost::icl::last_next(interval) << PageShift;
const u32 interval_size = interval_end_addr - interval_start_addr;
ASSERT_MSG(rasterizer->IsMapped(interval_start_addr, interval_size),
"Attempted to track non-GPU memory at address {:#x}, size {:#x}.",
interval_start_addr, interval_size);
if (delta > 0 && count == delta) {
impl->Protect(interval_start_addr, interval_size, false);
} else if (delta < 0 && count == -delta) {
impl->Protect(interval_start_addr, interval_size, true);
} else {
ASSERT(count >= 0);
}
}
if (delta < 0) {
cached_pages.add({pages_interval, delta});
}
template <s32 delta>
void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const {
impl->UpdatePageWatchers<delta>(addr, size);
}
template void PageManager::UpdatePageWatchers<1>(VAddr addr, u64 size) const;
template void PageManager::UpdatePageWatchers<-1>(VAddr addr, u64 size) const;
} // namespace VideoCore

View File

@ -4,11 +4,7 @@
#pragma once
#include <memory>
#include <boost/icl/interval_map.hpp>
#ifdef __linux__
#include "common/adaptive_mutex.h"
#endif
#include "common/spin_lock.h"
#include "common/alignment.h"
#include "common/types.h"
namespace Vulkan {
@ -18,6 +14,9 @@ class Rasterizer;
namespace VideoCore {
class PageManager {
static constexpr size_t PAGE_BITS = 12;
static constexpr size_t PAGE_SIZE = 1ULL << PAGE_BITS;
public:
explicit PageManager(Vulkan::Rasterizer* rasterizer);
~PageManager();
@ -28,22 +27,23 @@ public:
/// Unregister a range of gpu memory that was unmapped.
void OnGpuUnmap(VAddr address, size_t size);
/// Increase/decrease the number of surface in pages touching the specified region
void UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta);
/// Updates watchers in the pages touching the specified region.
template <s32 delta>
void UpdatePageWatchers(VAddr addr, u64 size) const;
static VAddr GetPageAddr(VAddr addr);
static VAddr GetNextPageAddr(VAddr addr);
/// Returns page aligned address.
static constexpr VAddr GetPageAddr(VAddr addr) {
return Common::AlignDown(addr, PAGE_SIZE);
}
/// Returns address of the next page.
static constexpr VAddr GetNextPageAddr(VAddr addr) {
return Common::AlignUp(addr + 1, PAGE_SIZE);
}
private:
struct Impl;
std::unique_ptr<Impl> impl;
Vulkan::Rasterizer* rasterizer;
boost::icl::interval_map<VAddr, s32> cached_pages;
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
Common::AdaptiveMutex lock;
#else
Common::SpinLock lock;
#endif
};
} // namespace VideoCore

View File

@ -121,6 +121,7 @@ void SetOutputDir(const std::filesystem::path& path, const std::string& prefix)
if (!rdoc_api) {
return;
}
LOG_WARNING(Common, "RenderDoc capture path: {}", (path / prefix).string());
rdoc_api->SetCaptureFilePathTemplate(fmt::UTF((path / prefix).u8string()).data.data());
}

View File

@ -147,6 +147,7 @@ Instance::Instance(Frontend::WindowSDL& window, s32 physical_device_index,
available_extensions = GetSupportedExtensions(physical_device);
format_properties = GetFormatProperties(physical_device);
properties = physical_device.getProperties();
memory_properties = physical_device.getMemoryProperties();
CollectDeviceParameters();
ASSERT_MSG(properties.apiVersion >= TargetVulkanApiVersion,
"Vulkan {}.{} is required, but only {}.{} is supported by device!",
@ -375,6 +376,7 @@ bool Instance::CreateDevice() {
.separateDepthStencilLayouts = vk12_features.separateDepthStencilLayouts,
.hostQueryReset = vk12_features.hostQueryReset,
.timelineSemaphore = vk12_features.timelineSemaphore,
.bufferDeviceAddress = vk12_features.bufferDeviceAddress,
},
vk::PhysicalDeviceVulkan13Features{
.robustImageAccess = vk13_features.robustImageAccess,
@ -505,6 +507,7 @@ void Instance::CreateAllocator() {
};
const VmaAllocatorCreateInfo allocator_info = {
.flags = VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT,
.physicalDevice = physical_device,
.device = *device,
.pVulkanFunctions = &functions,

View File

@ -286,6 +286,11 @@ public:
return vk12_props;
}
/// Returns the memory properties of the physical device.
const vk::PhysicalDeviceMemoryProperties& GetMemoryProperties() const noexcept {
return memory_properties;
}
/// Returns true if shaders can declare the ClipDistance attribute
bool IsShaderClipDistanceSupported() const {
return features.shaderClipDistance;
@ -335,6 +340,7 @@ private:
vk::PhysicalDevice physical_device;
vk::UniqueDevice device;
vk::PhysicalDeviceProperties properties;
vk::PhysicalDeviceMemoryProperties memory_properties;
vk::PhysicalDeviceVulkan11Properties vk11_props;
vk::PhysicalDeviceVulkan12Properties vk12_props;
vk::PhysicalDevicePushDescriptorPropertiesKHR push_descriptor_props;

View File

@ -36,7 +36,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
AmdGpu::Liverpool* liverpool_)
: instance{instance_}, scheduler{scheduler_}, page_manager{this},
buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager},
buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager},
texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_},
memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} {
if (!Config::nullGpu()) {
@ -439,6 +439,13 @@ void Rasterizer::Finish() {
scheduler.Finish();
}
void Rasterizer::ProcessFaults() {
if (fault_process_pending) {
fault_process_pending = false;
buffer_cache.ProcessFaultBuffer();
}
}
bool Rasterizer::BindResources(const Pipeline* pipeline) {
if (IsComputeMetaClear(pipeline)) {
return false;
@ -449,6 +456,8 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
buffer_infos.clear();
image_infos.clear();
bool uses_dma = false;
// Bind resource buffers and textures.
Shader::Backend::Bindings binding{};
Shader::PushData push_data = MakeUserData(liverpool->regs);
@ -459,9 +468,28 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
stage->PushUd(binding, push_data);
BindBuffers(*stage, binding, push_data);
BindTextures(*stage, binding);
uses_dma |= stage->dma_types != Shader::IR::Type::Void;
}
pipeline->BindResources(set_writes, buffer_barriers, push_data);
if (uses_dma && !fault_process_pending) {
// We only use the fault buffer for DMA right now.
{
// TODO: GPU might have written to memory (for example with EVENT_WRITE_EOP)
// we need to account for that and synchronize.
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
for (auto& range : mapped_ranges) {
buffer_cache.SynchronizeBuffersInRange(range.lower(),
range.upper() - range.lower());
}
}
buffer_cache.MemoryBarrier();
}
fault_process_pending |= uses_dma;
return true;
}
@ -520,12 +548,18 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
if (desc.buffer_type == Shader::BufferType::GdsBuffer) {
const auto* gds_buf = buffer_cache.GetGdsBuffer();
buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes());
} else if (desc.buffer_type == Shader::BufferType::ReadConstUbo) {
} else if (desc.buffer_type == Shader::BufferType::Flatbuf) {
auto& vk_buffer = buffer_cache.GetStreamBuffer();
const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32);
const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size,
instance.UniformMinAlignment());
buffer_infos.emplace_back(vk_buffer.Handle(), offset, ubo_size);
} else if (desc.buffer_type == Shader::BufferType::BdaPagetable) {
const auto* bda_buffer = buffer_cache.GetBdaPageTableBuffer();
buffer_infos.emplace_back(bda_buffer->Handle(), 0, bda_buffer->SizeBytes());
} else if (desc.buffer_type == Shader::BufferType::FaultBuffer) {
const auto* fault_buffer = buffer_cache.GetFaultBuffer();
buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes());
} else if (desc.buffer_type == Shader::BufferType::SharedMemory) {
auto& lds_buffer = buffer_cache.GetStreamBuffer();
const auto& cs_program = liverpool->GetCsRegs();
@ -925,7 +959,7 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
// Not GPU mapped memory, can skip invalidation logic entirely.
return false;
}
buffer_cache.InvalidateMemory(addr, size);
buffer_cache.InvalidateMemory(addr, size, false);
texture_cache.InvalidateMemory(addr, size);
return true;
}
@ -937,24 +971,24 @@ bool Rasterizer::IsMapped(VAddr addr, u64 size) {
}
const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
std::shared_lock lock{mapped_ranges_mutex};
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
return boost::icl::contains(mapped_ranges, range);
}
void Rasterizer::MapMemory(VAddr addr, u64 size) {
{
std::unique_lock lock{mapped_ranges_mutex};
std::scoped_lock lock{mapped_ranges_mutex};
mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
}
page_manager.OnGpuMap(addr, size);
}
void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
buffer_cache.InvalidateMemory(addr, size);
buffer_cache.InvalidateMemory(addr, size, true);
texture_cache.UnmapMemory(addr, size);
page_manager.OnGpuUnmap(addr, size);
{
std::unique_lock lock{mapped_ranges_mutex};
std::scoped_lock lock{mapped_ranges_mutex};
mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
}
}
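Common::RecursiveSharedLock is new in this change but its implementation is not part of this hunk; purely as an assumption, one plausible shape is a shared-lock guard that skips re-locking a mutex the current thread already holds, which is why nested calls such as IsMapped inside BindResources no longer deadlock:
#include <shared_mutex>
#include <unordered_set>
// Hypothetical sketch only; not the project's common/recursive_lock.h.
class RecursiveSharedLockSketch {
public:
    explicit RecursiveSharedLockSketch(std::shared_mutex& mutex) : mutex_{mutex} {
        // Only take the shared lock if this thread does not hold it yet.
        if (Held().insert(&mutex_).second) {
            mutex_.lock_shared();
            owns_ = true;
        }
    }
    ~RecursiveSharedLockSketch() {
        if (owns_) {
            mutex_.unlock_shared();
            Held().erase(&mutex_);
        }
    }
private:
    static std::unordered_set<std::shared_mutex*>& Held() {
        thread_local std::unordered_set<std::shared_mutex*> held;
        return held;
    }
    std::shared_mutex& mutex_;
    bool owns_ = false;
};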

View File

@ -4,7 +4,7 @@
#pragma once
#include <shared_mutex>
#include "common/recursive_lock.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/page_manager.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
@ -65,11 +65,21 @@ public:
void CpSync();
u64 Flush();
void Finish();
void ProcessFaults();
PipelineCache& GetPipelineCache() {
return pipeline_cache;
}
template <typename Func>
void ForEachMappedRangeInRange(VAddr addr, u64 size, Func&& func) {
const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
for (const auto& mapped_range : (mapped_ranges & range)) {
func(mapped_range);
}
}
private:
RenderState PrepareRenderState(u32 mrt_mask);
void BeginRendering(const GraphicsPipeline& pipeline, RenderState& state);
@ -100,6 +110,8 @@ private:
bool IsComputeMetaClear(const Pipeline* pipeline);
private:
friend class VideoCore::BufferCache;
const Instance& instance;
Scheduler& scheduler;
VideoCore::PageManager page_manager;
@ -126,6 +138,7 @@ private:
boost::container::static_vector<BufferBindingInfo, Shader::NumBuffers> buffer_bindings;
using ImageBindingInfo = std::pair<VideoCore::ImageId, VideoCore::TextureCache::TextureDesc>;
boost::container::static_vector<ImageBindingInfo, Shader::NumImages> image_bindings;
bool fault_process_pending{false};
};
} // namespace Vulkan

View File

@ -70,6 +70,11 @@ void Scheduler::Flush(SubmitInfo& info) {
SubmitExecution(info);
}
void Scheduler::Flush() {
SubmitInfo info{};
Flush(info);
}
void Scheduler::Finish() {
// When finishing, we need to wait for the submission to have executed on the device.
const u64 presubmit_tick = CurrentTick();
@ -85,6 +90,15 @@ void Scheduler::Wait(u64 tick) {
Flush(info);
}
master_semaphore.Wait(tick);
// CAUTION: This can introduce unexpected variation in the wait time.
// We don't currently sync the GPU, and some games are very sensitive to this.
// If this becomes a problem, it can be commented out.
// Ideally we would implement proper GPU sync.
while (!pending_ops.empty() && pending_ops.front().gpu_tick <= tick) {
pending_ops.front().callback();
pending_ops.pop();
}
}
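DeferOperation itself is outside this hunk; judging from its call sites (lambdas queued alongside a GPU tick and drained here once that tick completes), a rough shape, stated as an assumption, is:
#include <cstdint>
#include <functional>
#include <queue>
#include <utility>
// Assumed sketch of the queue drained in Scheduler::Wait above. The real
// implementation needs a move-capable callable type, since callers capture
// move-only objects (e.g. a temporary Buffer); std::function keeps the
// sketch short.
struct PendingOp {
    std::uint64_t gpu_tick;
    std::function<void()> callback;
};
class DeferQueueSketch {
public:
    template <typename Func>
    void DeferOperation(std::uint64_t current_tick, Func&& func) {
        pending_ops.push(PendingOp{current_tick, std::forward<Func>(func)});
    }
    void Drain(std::uint64_t completed_tick) {
        while (!pending_ops.empty() && pending_ops.front().gpu_tick <= completed_tick) {
            pending_ops.front().callback();
            pending_ops.pop();
        }
    }
private:
    std::queue<PendingOp> pending_ops;
};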
void Scheduler::AllocateWorkerCommandBuffers() {

View File

@ -307,6 +307,10 @@ public:
/// and increments the scheduler timeline semaphore.
void Flush(SubmitInfo& info);
/// Sends the current execution context to the GPU
/// and increments the scheduler timeline semaphore.
void Flush();
/// Sends the current execution context to the GPU and waits for it to complete.
void Finish();

View File

@ -672,7 +672,7 @@ void TextureCache::TrackImage(ImageId image_id) {
// Re-track the whole image
image.track_addr = image_begin;
image.track_addr_end = image_end;
tracker.UpdatePagesCachedCount(image_begin, image.info.guest_size, 1);
tracker.UpdatePageWatchers<1>(image_begin, image.info.guest_size);
} else {
if (image_begin < image.track_addr) {
TrackImageHead(image_id);
@ -695,7 +695,7 @@ void TextureCache::TrackImageHead(ImageId image_id) {
ASSERT(image.track_addr != 0 && image_begin < image.track_addr);
const auto size = image.track_addr - image_begin;
image.track_addr = image_begin;
tracker.UpdatePagesCachedCount(image_begin, size, 1);
tracker.UpdatePageWatchers<1>(image_begin, size);
}
void TextureCache::TrackImageTail(ImageId image_id) {
@ -711,7 +711,7 @@ void TextureCache::TrackImageTail(ImageId image_id) {
const auto addr = image.track_addr_end;
const auto size = image_end - image.track_addr_end;
image.track_addr_end = image_end;
tracker.UpdatePagesCachedCount(addr, size, 1);
tracker.UpdatePageWatchers<1>(addr, size);
}
void TextureCache::UntrackImage(ImageId image_id) {
@ -724,7 +724,7 @@ void TextureCache::UntrackImage(ImageId image_id) {
image.track_addr = 0;
image.track_addr_end = 0;
if (size != 0) {
tracker.UpdatePagesCachedCount(addr, size, -1);
tracker.UpdatePageWatchers<-1>(addr, size);
}
}
@ -743,7 +743,7 @@ void TextureCache::UntrackImageHead(ImageId image_id) {
// Check its hash later.
MarkAsMaybeDirty(image_id, image);
}
tracker.UpdatePagesCachedCount(image_begin, size, -1);
tracker.UpdatePageWatchers<-1>(image_begin, size);
}
void TextureCache::UntrackImageTail(ImageId image_id) {
@ -762,7 +762,7 @@ void TextureCache::UntrackImageTail(ImageId image_id) {
// Check its hash later.
MarkAsMaybeDirty(image_id, image);
}
tracker.UpdatePagesCachedCount(addr, size, -1);
tracker.UpdatePageWatchers<-1>(addr, size);
}
void TextureCache::DeleteImage(ImageId image_id) {