diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp
index 4db7648c6..3358af2b7 100644
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -239,6 +239,11 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<

         const u32 count = header->type3.NumWords();
         const PM4ItOpcode opcode = header->type3.opcode;
+        const auto predicate = header->type3.predicate;
+        if (predicate == PM4Predicate::PredEnable) {
+            LOG_DEBUG(Render_Vulkan, "PM4 command {} is predicated",
+                      magic_enum::enum_name(opcode));
+        }
         switch (opcode) {
         case PM4ItOpcode::Nop: {
             const auto* nop = reinterpret_cast<const PM4CmdNop*>(header);
@@ -394,7 +399,25 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<
             break;
         }
         case PM4ItOpcode::SetPredication: {
-            LOG_WARNING(Render_Vulkan, "Unimplemented IT_SET_PREDICATION");
+            const auto* predication = reinterpret_cast<const PM4CmdSetPredication*>(header);
+            if (predication->continue_bit.Value()) {
+                LOG_WARNING(Render_Vulkan, "unhandled continue bit in predication command");
+            }
+            if (predication->pred_op.Value() == PredicateOperation::Clear) {
+                if (rasterizer) {
+                    rasterizer->EndPredication();
+                }
+            } else if (predication->pred_op.Value() == PredicateOperation::Zpass) {
+                if (rasterizer) {
+                    rasterizer->StartPredication(
+                        predication->Address(),
+                        predication->action.Value() == Predication::DrawIfVisible,
+                        predication->hint.Value() == PredicationHint::Wait);
+                }
+            } else {
+                LOG_WARNING(Render_Vulkan, "unhandled predicate operation {}",
+                            magic_enum::enum_name(predication->pred_op.Value()));
+            }
             break;
         }
         case PM4ItOpcode::IndexType: {
@@ -595,6 +618,24 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<
         }
         case PM4ItOpcode::EventWrite: {
             const auto* event = reinterpret_cast<const PM4CmdEventWrite*>(header);
+
+            if (event->event_index.Value() == EventIndex::ZpassDone) {
+                if (event->event_type.Value() == EventType::PixelPipeStatControl) {
+
+                } else if (event->event_type.Value() == EventType::PixelPipeStatDump) {
+                    if ((event->Address() & 0x8) == 0) {
+                        // occlusion query start
+                        if (rasterizer) {
+                            rasterizer->StartOcclusionQuery(event->Address());
+                        }
+                    } else {
+                        // occlusion query end
+                        if (rasterizer) {
+                            rasterizer->EndOcclusionQuery(event->Address() & ~0xF);
+                        }
+                    }
+                }
+            }
             break;
         }
         case PM4ItOpcode::EventWriteEos: {
diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h
index 58ecda93e..d4ce9a03a 100644
--- a/src/video_core/amdgpu/pm4_cmds.h
+++ b/src/video_core/amdgpu/pm4_cmds.h
@@ -415,6 +415,13 @@ struct PM4CmdEventWrite {
         BitField<20, 1, u32> inv_l2; ///< Send WBINVL2 op to the TC L2 cache when EVENT_INDEX = 0111
     };
     u32 address[];
+
+    template <typename T = VAddr>
+    T Address() const {
+        ASSERT(event_index.Value() >= EventIndex::ZpassDone &&
+               event_index.Value() <= EventIndex::SampleStreamoutStatSx);
+        return std::bit_cast<T>((u64(address[1]) << 32u) | u64(address[0]));
+    }
 };

 struct PM4CmdEventWriteEop {
@@ -1104,4 +1111,43 @@ struct PM4CmdMemSemaphore {
     }
 };

+enum class Predication : u32 {
+    DrawIfNotVisible = 0,
+    DrawIfVisible = 1,
+};
+
+enum class PredicationHint : u32 {
+    Wait = 0,
+    Draw = 1,
+};
+
+enum class PredicateOperation : u32 {
+    Clear = 0,
+    Zpass = 1,
+    PrimCount = 2,
+    // other values are reserved
+};
+
+struct PM4CmdSetPredication {
+    PM4Type3Header header;
+    union {
+        BitField<4, 28, u32> start_address_lo;
+        u32 raw1;
+    };
+    union {
+        BitField<0, 8, u32> start_address_hi;
+        BitField<8, 1, Predication> action;
+        BitField<12, 1, PredicationHint> hint;
+        BitField<16, 3, PredicateOperation> pred_op;
+        BitField<31, 1, u32> continue_bit;
+        u32 raw2;
+    };
+
+    template <typename T = VAddr>
+    T Address() const {
+        return std::bit_cast<T>(u64(start_address_lo.Value()) << 4 | u64(start_address_hi.Value())
+                                                                         << 32);
+    }
+};
+
 } // namespace AmdGpu
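Note on the packet layout: the compare address carried by SET_PREDICATION is 40 bits wide and split across the two dwords decoded above. Bits [31:4] of the first dword form the low part (so the address is 16-byte aligned) and bits [7:0] of the second dword form the high part, which is what the Address() helper reassembles. A minimal standalone sketch of the same packing, using a hypothetical raw-dword helper instead of the BitField wrappers:

```cpp
#include <cstdint>

// Hypothetical helper mirroring PM4CmdSetPredication::Address(): dword0 carries
// START_ADDRESS_LO in bits [31:4], dword1 carries START_ADDRESS_HI in bits [7:0].
// The result is a 40-bit, 16-byte aligned GPU virtual address.
constexpr uint64_t PredicationAddress(uint32_t dword0, uint32_t dword1) {
    const uint64_t lo = dword0 & 0xFFFF'FFF0u;           // bits [31:4], already in place
    const uint64_t hi = uint64_t(dword1 & 0xFFu) << 32;  // bits [39:32]
    return hi | lo;
}

static_assert(PredicationAddress(0x1234'5670u, 0x02u) == 0x2'1234'5670ull);
```

PM4CmdEventWrite::Address() plays the same role for the ZPASS_DONE write-back address, except that the two dwords there hold a full 64-bit pointer.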
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index 9584329f0..654f611f6 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -212,7 +212,8 @@ bool Instance::CreateDevice() {
         vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT,
         vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT,
         vk::PhysicalDevicePortabilitySubsetFeaturesKHR,
-        vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>();
+        vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT,
+        vk::PhysicalDeviceConditionalRenderingFeaturesEXT>();
     features = feature_chain.get<vk::PhysicalDeviceFeatures2>().features;

     const vk::StructureChain properties_chain = physical_device.getProperties2<
@@ -283,6 +284,7 @@ bool Instance::CreateDevice() {
         LOG_INFO(Render_Vulkan, "- shaderImageFloat32AtomicMinMax: {}",
                  shader_atomic_float2_features.shaderImageFloat32AtomicMinMax);
     }
+    conditional_rendering = add_extension(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME);

     const bool calibrated_timestamps =
         TRACY_GPU_ENABLED ? add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) : false;
@@ -420,6 +422,9 @@ bool Instance::CreateDevice() {
             .shaderImageFloat32AtomicMinMax =
                 shader_atomic_float2_features.shaderImageFloat32AtomicMinMax,
         },
+        vk::PhysicalDeviceConditionalRenderingFeaturesEXT{
+            .conditionalRendering = true,
+        },
 #ifdef __APPLE__
         portability_features,
 #endif
@@ -452,6 +457,9 @@ bool Instance::CreateDevice() {
     if (!shader_atomic_float2) {
         device_chain.unlink<vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>();
     }
+    if (!conditional_rendering) {
+        device_chain.unlink<vk::PhysicalDeviceConditionalRenderingFeaturesEXT>();
+    }

     auto [device_result, dev] = physical_device.createDeviceUnique(device_chain.get());
     if (device_result != vk::Result::eSuccess) {
diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h
index 30848e8b7..6cb550496 100644
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@@ -191,6 +191,11 @@ public:
         return !portability_subset || portability_features.tessellationPointMode;
     }

+    /// Returns true when VK_EXT_conditional_rendering is supported by the device
+    bool IsConditionalRenderingSupported() const {
+        return conditional_rendering;
+    }
+
     /// Returns the vendor ID of the physical device
     u32 GetVendorID() const {
         return properties.vendorID;
@@ -374,6 +379,7 @@ private:
     bool amd_gcn_shader{};
     bool amd_shader_trinary_minmax{};
     bool shader_atomic_float2{};
+    bool conditional_rendering{};
     bool portability_subset{};
 };

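The instance-side changes follow the repository's existing pattern for optional extensions: the feature is read back through the physical-device feature chain, the extension name is added only when the driver advertises it, and the feature struct is unlinked from the device-creation chain otherwise. A rough vulkan-hpp sketch of just the detection step (assumes a Vulkan 1.1+ instance and the default dispatcher; QueryConditionalRendering is a made-up name, not the PR's code):

```cpp
#include <vulkan/vulkan.hpp>

// Sketch: probe VK_EXT_conditional_rendering support before device creation.
bool QueryConditionalRendering(vk::PhysicalDevice physical_device) {
    const auto chain =
        physical_device.getFeatures2<vk::PhysicalDeviceFeatures2,
                                     vk::PhysicalDeviceConditionalRenderingFeaturesEXT>();
    const auto& cr = chain.get<vk::PhysicalDeviceConditionalRenderingFeaturesEXT>();
    // Only request the feature (and the extension name) when the driver reports it;
    // otherwise the struct has to be unlinked from the device-creation chain.
    return cr.conditionalRendering == VK_TRUE;
}
```

The PR performs the equivalent check inside Instance::CreateDevice() and exposes the result through IsConditionalRenderingSupported().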
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index dff4e5a5f..5765950b5 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -17,6 +17,10 @@
 #undef MemoryBarrier
 #endif

+namespace {
+const int OCCLUSION_QUERIES_COUNT = 256;
+}
+
 namespace Vulkan {

 static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
@@ -38,11 +42,25 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
     : instance{instance_}, scheduler{scheduler_}, page_manager{this},
       buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager},
       texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_},
-      memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} {
+      memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool},
+      occlusion_query_buffer{instance,
+                             scheduler,
+                             VideoCore::MemoryUsage::DeviceLocal,
+                             0,
+                             vk::BufferUsageFlagBits::eConditionalRenderingEXT |
+                                 vk::BufferUsageFlagBits::eTransferDst,
+                             sizeof(u32) * OCCLUSION_QUERIES_COUNT} {
     if (!Config::nullGpu()) {
         liverpool->BindRasterizer(this);
     }
     memory->SetRasterizer(this);
+    occlusion_query_pool = Check<"occlusion query pool">(instance.GetDevice().createQueryPool({
+        .queryType = vk::QueryType::eOcclusion,
+        .queryCount = OCCLUSION_QUERIES_COUNT,
+    }));
+    instance.GetDevice().resetQueryPool(occlusion_query_pool, 0, OCCLUSION_QUERIES_COUNT);
+    Vulkan::SetObjectName(instance.GetDevice(), occlusion_query_buffer.Handle(),
+                          "OcclusionQueryBuffer:{:#x}", sizeof(u32) * OCCLUSION_QUERIES_COUNT);
 }

 Rasterizer::~Rasterizer() = default;
@@ -302,6 +320,9 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) {

     const auto cmdbuf = scheduler.CommandBuffer();
     cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle());
+    if (active_predication) {
+        cmdbuf.beginConditionalRenderingEXT(&*active_predication);
+    }
     if (is_indexed) {
         cmdbuf.drawIndexed(regs.num_indices, regs.num_instances.NumInstances(), 0,
                            s32(vertex_offset), instance_offset);
@@ -309,7 +330,9 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
         cmdbuf.draw(regs.num_indices, regs.num_instances.NumInstances(), vertex_offset,
                     instance_offset);
     }
-
+    if (active_predication) {
+        cmdbuf.endConditionalRenderingEXT();
+    }
     ResetBindings();
 }

@@ -354,6 +377,9 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3

     const auto cmdbuf = scheduler.CommandBuffer();
     cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle());
+    if (active_predication) {
+        cmdbuf.beginConditionalRenderingEXT(&*active_predication);
+    }

     if (is_indexed) {
         ASSERT(sizeof(VkDrawIndexedIndirectCommand) == stride);
@@ -373,7 +399,9 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3
             cmdbuf.drawIndirect(buffer->Handle(), base, max_count, stride);
         }
     }
-
+    if (active_predication) {
+        cmdbuf.endConditionalRenderingEXT();
+    }
     ResetBindings();
 }

@@ -1263,4 +1291,102 @@ void Rasterizer::ScopedMarkerInsertColor(const std::string_view& str, const u32
                               (f32)(color & 0xff) / 255.0f, (f32)((color >> 24) & 0xff) / 255.0f})});
 }

+void Rasterizer::StartPredication(VAddr addr, bool draw_if_visible, bool wait_for_result) {
+    if (!instance.IsConditionalRenderingSupported()) {
+        return;
+    }
+
+    ASSERT(!active_predication);
+    ASSERT(occlusion_index_mapping.contains(addr));
+
+    auto index = occlusion_index_mapping[addr];
+    LOG_DEBUG(Render_Vulkan,
+              "addr = {:#x}, index = {}, draw_if_visible = {}, "
+              "wait_for_result = {}",
+              addr, index, draw_if_visible, wait_for_result);
+
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+
+    cmdbuf.copyQueryPoolResults(occlusion_query_pool, index, 1, occlusion_query_buffer.Handle(),
+                                index * sizeof(u32), sizeof(u32),
+                                wait_for_result ? vk::QueryResultFlagBits::eWait
+                                                : vk::QueryResultFlagBits::ePartial);
+
+    const auto pre_barrier = vk::BufferMemoryBarrier2{
+        .srcStageMask = vk::PipelineStageFlagBits2::eCopy,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eCopy,
+        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .buffer = occlusion_query_buffer.Handle(),
+        .offset = index * sizeof(u32),
+        .size = sizeof(u32),
+    };
+
+    const vk::MemoryBarrier2 ib_barrier{
+        .srcStageMask = vk::PipelineStageFlagBits2::eCopy,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eConditionalRenderingEXT,
+        .dstAccessMask = vk::AccessFlagBits2::eConditionalRenderingReadEXT,
+    };
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .memoryBarrierCount = 1,
+        .pMemoryBarriers = &ib_barrier,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &pre_barrier,
+    });
+
+    ScopeMarkerBegin("gfx:{}:predication", fmt::ptr(reinterpret_cast<void*>(addr)));
+    vk::ConditionalRenderingBeginInfoEXT conditional_rendering_info{
+        .buffer = occlusion_query_buffer.Handle(),
+        .offset = index * sizeof(u32),
+        .flags = draw_if_visible ? vk::ConditionalRenderingFlagBitsEXT::eInverted
+                                 : vk::ConditionalRenderingFlagsEXT(),
+    };
+
+    active_predication = conditional_rendering_info;
+}
+
+void Rasterizer::EndPredication() {
+    if (!active_predication) {
+        return;
+    }
+
+    LOG_DEBUG(Render_Vulkan, "");
+
+    scheduler.EndRendering();
+    ScopeMarkerEnd();
+    active_predication = std::nullopt;
+}
+
+void Rasterizer::StartOcclusionQuery(VAddr addr) {
+    LOG_DEBUG(Render_Vulkan, "addr = {:#x}, index = {}", addr, occlusion_current_index);
+
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.resetQueryPool(occlusion_query_pool, occlusion_current_index, 1);
+    ScopeMarkerBegin("gfx:{}:occlusionQuery", fmt::ptr(reinterpret_cast<void*>(addr)));
+    cmdbuf.beginQuery(occlusion_query_pool, occlusion_current_index, vk::QueryControlFlags());
+
+    occlusion_index_mapping.insert_or_assign(addr, occlusion_current_index);
+
+    occlusion_current_index++;
+    if (occlusion_current_index > OCCLUSION_QUERIES_COUNT - 1) {
+        occlusion_current_index = 0;
+    }
+}
+
+void Rasterizer::EndOcclusionQuery(VAddr addr) {
+    ASSERT(occlusion_index_mapping.contains(addr));
+
+    auto index = occlusion_index_mapping[addr];
+    LOG_DEBUG(Render_Vulkan, "addr = {:#x}, index = {}", addr, index);
+
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.endQuery(occlusion_query_pool, index);
+    ScopeMarkerEnd();
+}
+
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index fb9ca4bbe..b0de5e903 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -55,6 +55,11 @@ public:
     void ScopedMarkerInsert(const std::string_view& str, bool from_guest = false);
     void ScopedMarkerInsertColor(const std::string_view& str, const u32 color,
                                  bool from_guest = false);
+    void StartPredication(VAddr addr, bool discard_if_zero, bool wait_for_result);
+    void EndPredication();
+    void StartOcclusionQuery(VAddr addr);
+    void EndOcclusionQuery(VAddr addr);
+
     void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
     u32 ReadDataFromGds(u32 gsd_offset);
     bool InvalidateMemory(VAddr addr, u64 size);
@@ -122,6 +127,11 @@ private:
     boost::icl::interval_set<VAddr> mapped_ranges;
     std::shared_mutex mapped_ranges_mutex;
     PipelineCache pipeline_cache;
+    vk::QueryPool occlusion_query_pool;
+    u32 occlusion_current_index{};
+    std::map<VAddr, u32> occlusion_index_mapping;
+    VideoCore::Buffer occlusion_query_buffer;
+    std::optional<vk::ConditionalRenderingBeginInfoEXT> active_predication;
     boost::container::static_vector<
         std::pair, 8>
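Taken together, the rasterizer changes boil down to a fairly standard Vulkan recording sequence: an occlusion query brackets the guest's ZPASS_DONE begin/end writes, the query result is copied into a buffer created with conditional-rendering usage, and predicated draws are wrapped in a conditional-rendering scope that reads that buffer. A condensed, self-contained sketch of that sequence follows (illustrative names, not the PR's code; assumes vulkan-hpp with designated initializers enabled, as the codebase already uses):

```cpp
#include <vulkan/vulkan.hpp>

// Illustrative recording order for one query slot; 'cmdbuf', 'query_pool',
// 'result_buffer' (created with eConditionalRenderingEXT | eTransferDst usage)
// and 'slot' are assumed to exist already.
void RecordPredicatedDraw(vk::CommandBuffer cmdbuf, vk::QueryPool query_pool,
                          vk::Buffer result_buffer, uint32_t slot) {
    // 1. Occlusion query around the occluder draws (ZPASS_DONE begin/end).
    cmdbuf.resetQueryPool(query_pool, slot, 1);
    cmdbuf.beginQuery(query_pool, slot, vk::QueryControlFlags());
    // ... draw occluder geometry ...
    cmdbuf.endQuery(query_pool, slot);

    // 2. Copy the sample count into the buffer the predicate will read.
    cmdbuf.copyQueryPoolResults(query_pool, slot, 1, result_buffer, slot * sizeof(uint32_t),
                                sizeof(uint32_t), vk::QueryResultFlagBits::eWait);

    // (A buffer barrier from transfer-write to conditional-rendering-read belongs
    //  between steps 2 and 3; omitted here for brevity.)

    // 3. Wrap the dependent draws in a conditional-rendering scope.
    const vk::ConditionalRenderingBeginInfoEXT begin_info{
        .buffer = result_buffer,
        .offset = slot * sizeof(uint32_t),
        .flags = {}, // eInverted draws when the value is zero instead
    };
    cmdbuf.beginConditionalRenderingEXT(begin_info);
    // ... predicated draws ...
    cmdbuf.endConditionalRenderingEXT();
}
```

In the PR itself the copy and the barrier happen lazily in StartPredication(), keyed by the ZPASS_DONE address recorded in occlusion_index_mapping, and the conditional-rendering scope is opened and closed around each individual draw while active_predication is set.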