From 320c81fa4f7f44ccbc172a2d397f0bd6c8c20a6a Mon Sep 17 00:00:00 2001
From: Dario
Date: Mon, 10 Feb 2025 22:23:39 -0300
Subject: [PATCH] Added profiling timestamps to Vulkan. Added more profilers in general.

---
 .../gpu/rhi/plume_render_interface.h       |  10 ++
 .../gpu/rhi/plume_render_interface_types.h |   1 +
 UnleashedRecomp/gpu/rhi/plume_vulkan.cpp   |  92 ++++++++++++
 UnleashedRecomp/gpu/rhi/plume_vulkan.h     |  15 ++
 UnleashedRecomp/gpu/video.cpp              | 137 +++++++++++++-----
 5 files changed, 216 insertions(+), 39 deletions(-)

diff --git a/UnleashedRecomp/gpu/rhi/plume_render_interface.h b/UnleashedRecomp/gpu/rhi/plume_render_interface.h
index 995bc256..e62db052 100644
--- a/UnleashedRecomp/gpu/rhi/plume_render_interface.h
+++ b/UnleashedRecomp/gpu/rhi/plume_render_interface.h
@@ -147,6 +147,8 @@ namespace plume {
         virtual void buildBottomLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, const RenderBottomLevelASBuildInfo &buildInfo) = 0;
         virtual void buildTopLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, RenderBufferReference instancesBuffer, const RenderTopLevelASBuildInfo &buildInfo) = 0;
         virtual void discardTexture(const RenderTexture* texture) = 0; // D3D12 only.
+        virtual void resetQueryPool(const RenderQueryPool *queryPool, uint32_t queryFirstIndex, uint32_t queryCount) = 0;
+        virtual void writeTimestamp(const RenderQueryPool *queryPool, uint32_t queryIndex) = 0;

         // Concrete implementation shortcuts.
         inline void barriers(RenderBarrierStages stages, const RenderBufferBarrier &barrier) {
@@ -208,6 +210,13 @@ namespace plume {
         virtual std::unique_ptr<RenderTexture> createTexture(const RenderTextureDesc &desc) = 0;
     };

+    struct RenderQueryPool {
+        virtual ~RenderQueryPool() { }
+        virtual void queryResults() = 0; // Refreshes the values exposed by getResults().
+        virtual const uint64_t *getResults() const = 0; // One 64-bit value per query.
+        virtual uint32_t getCount() const = 0; // Number of values behind getResults().
+    };
+
     struct RenderDevice {
         virtual ~RenderDevice() { }
         virtual std::unique_ptr<RenderCommandList> createCommandList(RenderCommandListType type) = 0;
@@ -226,6 +235,7 @@ namespace plume {
         virtual std::unique_ptr<RenderCommandFence> createCommandFence() = 0;
         virtual std::unique_ptr<RenderCommandSemaphore> createCommandSemaphore() = 0;
         virtual std::unique_ptr<RenderFramebuffer> createFramebuffer(const RenderFramebufferDesc &desc) = 0;
+        virtual std::unique_ptr<RenderQueryPool> createQueryPool(uint32_t queryCount) = 0;
         virtual void setBottomLevelASBuildInfo(RenderBottomLevelASBuildInfo &buildInfo, const RenderBottomLevelASMesh *meshes, uint32_t meshCount, bool preferFastBuild = true, bool preferFastTrace = false) = 0;
         virtual void setTopLevelASBuildInfo(RenderTopLevelASBuildInfo &buildInfo, const RenderTopLevelASInstance *instances, uint32_t instanceCount, bool preferFastBuild = true, bool preferFastTrace = false) = 0;
         virtual void setShaderBindingTableInfo(RenderShaderBindingTableInfo &tableInfo, const RenderShaderBindingGroups &groups, const RenderPipeline *pipeline, RenderDescriptorSet **descriptorSets, uint32_t descriptorSetCount) = 0;
diff --git a/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h b/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h
index b7551832..36a9c995 100644
--- a/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h
+++ b/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h
@@ -69,6 +69,7 @@ namespace plume {
     struct RenderSampler;
     struct RenderShader;
     struct RenderTexture;
+    struct RenderQueryPool;

     // Enums.

diff --git a/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp b/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp
index 9e497c88..4684d338 100644
--- a/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp
+++ b/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp
@@ -2522,6 +2522,80 @@ namespace plume {
         return (depthAttachment == attachment);
     }

+    // VulkanQueryPool
+
+    VulkanQueryPool::VulkanQueryPool(VulkanDevice *device, uint32_t queryCount) {
+        assert(device != nullptr);
+        assert(queryCount > 0);
+
+        this->device = device;
+
+        VkQueryPoolCreateInfo createInfo = {};
+        createInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
+        createInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
+        createInfo.queryCount = queryCount;
+
+        VkResult res = vkCreateQueryPool(device->vk, &createInfo, nullptr, &vk);
+        if (res != VK_SUCCESS) {
+            fprintf(stderr, "vkCreateQueryPool failed with error code 0x%X.\n", res);
+            return;
+        }
+
+        results.resize(queryCount);
+    }
+
+    VulkanQueryPool::~VulkanQueryPool() {
+        vkDestroyQueryPool(device->vk, vk, nullptr);
+    }
+
+    void VulkanQueryPool::queryResults() {
+        VkResult res = vkGetQueryPoolResults(device->vk, vk, 0, uint32_t(results.size()), sizeof(uint64_t) * results.size(), results.data(), sizeof(uint64_t), VK_QUERY_RESULT_64_BIT);
+        if (res != VK_SUCCESS) {
+            fprintf(stderr, "vkGetQueryPoolResults failed with error code 0x%X.\n", res);
+            return;
+        }
+
+        // 64x64 -> 128-bit multiply (h = high word, l = low word). Conversion sourced from Godot Engine's Vulkan Rendering Driver.
+        auto mult64to128 = [](uint64_t u, uint64_t v, uint64_t &h, uint64_t &l) {
+            uint64_t u1 = (u & 0xffffffff);
+            uint64_t v1 = (v & 0xffffffff);
+            uint64_t t = (u1 * v1);
+            uint64_t w3 = (t & 0xffffffff);
+            uint64_t k = (t >> 32);
+
+            u >>= 32;
+            t = (u * v1) + k;
+            k = (t & 0xffffffff);
+            uint64_t w1 = (t >> 32);
+
+            v >>= 32;
+            t = (u1 * v) + k;
+            k = (t >> 32);
+
+            h = (u * v) + w1 + k;
+            l = (t << 32) + w3;
+        };
+
+        // Convert results to timestamps: raw value * timestampPeriod, carried with shift_bits fractional bits and then shifted back down.
+        constexpr uint64_t shift_bits = 16;
+        double timestampPeriod = double(device->physicalDeviceProperties.limits.timestampPeriod);
+        uint64_t h = 0, l = 0;
+        for (uint64_t &result : results) { // Bind with the vector's element type; size_t is not uint64_t on every target.
+            mult64to128(result, uint64_t(timestampPeriod * double(uint64_t(1) << shift_bits)), h, l);
+            result = l;
+            result >>= shift_bits;
+            result |= h << (64 - shift_bits);
+        }
+    }
+
+    const uint64_t *VulkanQueryPool::getResults() const {
+        return results.data();
+    }
+
+    uint32_t VulkanQueryPool::getCount() const {
+        return uint32_t(results.size());
+    }
+
     // VulkanCommandList

     VulkanCommandList::VulkanCommandList(VulkanDevice *device, RenderCommandListType type) {
@@ -3210,6 +3284,20 @@ namespace plume {
         // Not required in Vulkan.
     }

+    void VulkanCommandList::resetQueryPool(const RenderQueryPool *queryPool, uint32_t queryFirstIndex, uint32_t queryCount) {
+        assert(queryPool != nullptr);
+
+        const VulkanQueryPool *interfaceQueryPool = static_cast<const VulkanQueryPool *>(queryPool);
+        vkCmdResetQueryPool(vk, interfaceQueryPool->vk, queryFirstIndex, queryCount);
+    }
+
+    void VulkanCommandList::writeTimestamp(const RenderQueryPool *queryPool, uint32_t queryIndex) {
+        assert(queryPool != nullptr);
+
+        const VulkanQueryPool *interfaceQueryPool = static_cast<const VulkanQueryPool *>(queryPool);
+        vkCmdWriteTimestamp(vk, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, interfaceQueryPool->vk, queryIndex);
+    }
+
     void VulkanCommandList::checkActiveRenderPass() {
         assert(targetFramebuffer != nullptr);

@@ -3891,6 +3979,10 @@ namespace plume {
         return std::make_unique<VulkanFramebuffer>(this, desc);
     }

+    std::unique_ptr<RenderQueryPool> VulkanDevice::createQueryPool(uint32_t queryCount) {
+        return std::make_unique<VulkanQueryPool>(this, queryCount);
+    }
+
     void VulkanDevice::setBottomLevelASBuildInfo(RenderBottomLevelASBuildInfo &buildInfo, const RenderBottomLevelASMesh *meshes, uint32_t meshCount, bool preferFastBuild, bool preferFastTrace) {
         assert(meshes != nullptr);
         assert(meshCount > 0);
diff --git a/UnleashedRecomp/gpu/rhi/plume_vulkan.h b/UnleashedRecomp/gpu/rhi/plume_vulkan.h
index 59f2aaf0..e25e1869 100644
--- a/UnleashedRecomp/gpu/rhi/plume_vulkan.h
+++ b/UnleashedRecomp/gpu/rhi/plume_vulkan.h
@@ -271,6 +271,18 @@ namespace plume {
         bool contains(const VulkanTexture *attachment) const;
     };

+    struct VulkanQueryPool : RenderQueryPool {
+        VulkanDevice *device = nullptr;
+        std::vector<uint64_t> results; // Filled by queryResults(); stays empty if pool creation failed.
+        VkQueryPool vk = VK_NULL_HANDLE;
+
+        VulkanQueryPool(VulkanDevice *device, uint32_t queryCount);
+        virtual ~VulkanQueryPool() override;
+        virtual void queryResults() override;
+        virtual const uint64_t *getResults() const override;
+        virtual uint32_t getCount() const override;
+    };
+
     struct VulkanCommandList : RenderCommandList {
         VkCommandBuffer vk = VK_NULL_HANDLE;
         VkCommandPool commandPool = VK_NULL_HANDLE;
@@ -319,6 +331,8 @@ namespace plume {
         void buildBottomLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, const RenderBottomLevelASBuildInfo &buildInfo) override;
         void buildTopLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, RenderBufferReference instancesBuffer, const RenderTopLevelASBuildInfo &buildInfo) override;
         void discardTexture(const RenderTexture* texture) override;
+        void resetQueryPool(const RenderQueryPool *queryPool, uint32_t queryFirstIndex, uint32_t queryCount) override;
+        void writeTimestamp(const RenderQueryPool *queryPool, uint32_t queryIndex) override;
         void checkActiveRenderPass();
         void endActiveRenderPass();
         void setDescriptorSet(VkPipelineBindPoint bindPoint, const VulkanPipelineLayout *pipelineLayout, const RenderDescriptorSet *descriptorSet, uint32_t setIndex);
@@ -409,6 +423,7 @@ namespace plume {
         std::unique_ptr<RenderCommandFence> createCommandFence() override;
         std::unique_ptr<RenderCommandSemaphore> createCommandSemaphore() override;
         std::unique_ptr<RenderFramebuffer> createFramebuffer(const RenderFramebufferDesc &desc) override;
+        std::unique_ptr<RenderQueryPool> createQueryPool(uint32_t queryCount) override;
         void setBottomLevelASBuildInfo(RenderBottomLevelASBuildInfo &buildInfo, const RenderBottomLevelASMesh *meshes, uint32_t meshCount, bool preferFastBuild, bool preferFastTrace) override;
         void setTopLevelASBuildInfo(RenderTopLevelASBuildInfo &buildInfo, const RenderTopLevelASInstance *instances, uint32_t instanceCount, bool preferFastBuild, bool preferFastTrace) override;
         void setShaderBindingTableInfo(RenderShaderBindingTableInfo &tableInfo, const RenderShaderBindingGroups &groups, const RenderPipeline *pipeline, RenderDescriptorSet **descriptorSets, uint32_t descriptorSetCount) override;
diff --git a/UnleashedRecomp/gpu/video.cpp b/UnleashedRecomp/gpu/video.cpp
index 7073c865..e2562fbc 100644
--- a/UnleashedRecomp/gpu/video.cpp
+++ b/UnleashedRecomp/gpu/video.cpp
@@ -230,6 +230,54 @@ static void SetDirtyValue(bool& dirtyState, T& dest, const T& src)
     }
 }

+static constexpr size_t PROFILER_VALUE_COUNT = 256;
+static size_t g_profilerValueIndex;
+
+struct Profiler
+{
+    std::atomic<double> value; // Latest sample; DrawProfiler() displays it as milliseconds.
+    double values[PROFILER_VALUE_COUNT]; // Sample history, written at g_profilerValueIndex.
+    std::chrono::steady_clock::time_point start;
+
+    void Begin()
+    {
+        start = std::chrono::steady_clock::now();
+    }
+
+    void End()
+    {
+        value = std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start).count();
+    }
+
+    void Set(double v)
+    {
+        value = v;
+    }
+
+    void Reset()
+    {
+        End();
+        Begin();
+    }
+
+    double UpdateAndReturnAverage()
+    {
+        values[g_profilerValueIndex] = value;
+        return std::accumulate(values, values + PROFILER_VALUE_COUNT, 0.0) / PROFILER_VALUE_COUNT;
+    }
+};
+
+static double g_applicationValues[PROFILER_VALUE_COUNT];
+static Profiler g_gpuFrameProfiler;
+static Profiler g_presentProfiler;
+static Profiler g_renderDirectorProfiler;
+static Profiler g_frameFenceProfiler;
+static Profiler g_presentWaitProfiler;
+static Profiler g_swapChainAcquireProfiler;
+
+static bool g_profilerVisible;
+static bool g_profilerWasToggled;
+
 #ifdef UNLEASHED_RECOMP_D3D12
 static bool g_vulkan = false;
 #else
@@ -245,6 +293,7 @@ static std::unique_ptr<RenderDevice> g_device;
 static RenderDeviceCapabilities g_capabilities;

 static constexpr size_t NUM_FRAMES = 2;
+static constexpr size_t NUM_QUERIES = 2;

 static uint32_t g_frame = 0;
 static uint32_t g_nextFrame = 1;
@@ -252,6 +301,7 @@ static uint32_t g_nextFrame = 1;
 static std::unique_ptr<RenderCommandQueue> g_queue;
 static std::unique_ptr<RenderCommandList> g_commandLists[NUM_FRAMES];
 static std::unique_ptr<RenderCommandFence> g_commandFences[NUM_FRAMES];
+static std::unique_ptr<RenderQueryPool> g_queryPools[NUM_FRAMES];
 static bool g_commandListStates[NUM_FRAMES];

 static Mutex g_copyMutex;
@@ -1476,7 +1526,11 @@ static void CheckSwapChain()
     }

     if (g_swapChainValid)
+    {
+        g_swapChainAcquireProfiler.Begin();
         g_swapChainValid = g_swapChain->acquireTexture(g_acquireSemaphores[g_frame].get(), &g_backBufferIndex);
+        g_swapChainAcquireProfiler.End();
+    }

     if (g_needsResize)
         Video::ComputeViewportDimensions();
@@ -1552,6 +1606,8 @@ static void BeginCommandList()
     auto& commandList = g_commandLists[g_frame];

     commandList->begin();
+    commandList->resetQueryPool(g_queryPools[g_frame].get(), 0, NUM_QUERIES);
+    commandList->writeTimestamp(g_queryPools[g_frame].get(), 0);
     commandList->setGraphicsPipelineLayout(g_pipelineLayout.get());
     commandList->setGraphicsDescriptorSet(g_textureDescriptorSet.get(), 0);
     commandList->setGraphicsDescriptorSet(g_textureDescriptorSet.get(), 1);
@@ -1655,6 +1711,9 @@ bool Video::CreateHostDevice(const char *sdlVideoDriver)
     for (auto& commandFence : g_commandFences)
         commandFence = g_device->createCommandFence();

+    for (auto& queryPool : g_queryPools)
+        queryPool = g_device->createQueryPool(NUM_QUERIES);
+
     g_copyQueue = g_device->createCommandQueue(RenderCommandListType::COPY);
     g_copyCommandList = g_device->createCommandList(RenderCommandListType::COPY);
     g_copyCommandFence = g_device->createCommandFence();
@@ -1875,8 +1934,12 @@ bool Video::CreateHostDevice(const char *sdlVideoDriver)
     return true;
 }

+static uint32_t g_waitForGPUCount = 0;
+
 void Video::WaitForGPU()
 {
+    g_waitForGPUCount++;
+
     if (g_vulkan)
     {
         g_device->waitIdle();
@@ -2125,45 +2188,6 @@ static uint32_t HashVertexDeclaration(uint32_t vertexDeclaration)
     return vertexDeclaration;
 }

-static constexpr size_t PROFILER_VALUE_COUNT = 256;
-static size_t g_profilerValueIndex;
-
-struct Profiler
-{
-    std::atomic<double> value;
-    double values[PROFILER_VALUE_COUNT];
-    std::chrono::steady_clock::time_point start;
-
-    void Begin()
-    {
-        start = std::chrono::steady_clock::now();
-    }
-
-    void End()
-    {
-        value = std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start).count();
-    }
-
-    void Reset()
-    {
-        End();
-        Begin();
-    }
-
-    double UpdateAndReturnAverage()
-    {
-        values[g_profilerValueIndex] = value;
-        return std::accumulate(values, values + PROFILER_VALUE_COUNT, 0.0) / PROFILER_VALUE_COUNT;
-    }
-};
-
-static double g_applicationValues[PROFILER_VALUE_COUNT];
-static Profiler g_presentProfiler;
-static Profiler g_renderDirectorProfiler;
-
-static bool g_profilerVisible;
-static bool g_profilerWasToggled;
-
 static const char *DeviceTypeName(RenderDeviceType type)
 {
     switch (type)
@@ -2203,29 +2227,47 @@ static void DrawProfiler()
         g_applicationValues[g_profilerValueIndex] = App::s_deltaTime * 1000.0;

         const double applicationAvg = std::accumulate(g_applicationValues, g_applicationValues + PROFILER_VALUE_COUNT, 0.0) / PROFILER_VALUE_COUNT;
+        double gpuFrameAvg = g_gpuFrameProfiler.UpdateAndReturnAverage();
         double presentAvg = g_presentProfiler.UpdateAndReturnAverage();
         double renderDirectorAvg = g_renderDirectorProfiler.UpdateAndReturnAverage();
+        double frameFenceAvg = g_frameFenceProfiler.UpdateAndReturnAverage();
+        double presentWaitAvg = g_presentWaitProfiler.UpdateAndReturnAverage();
+        double swapChainAcquireAvg = g_swapChainAcquireProfiler.UpdateAndReturnAverage();

         if (ImPlot::BeginPlot("Frame Time"))
         {
             ImPlot::SetupAxisLimits(ImAxis_Y1, 0.0, 20.0);
             ImPlot::SetupAxis(ImAxis_Y1, "ms", ImPlotAxisFlags_None);
             ImPlot::PlotLine("Application", g_applicationValues, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex);
+            ImPlot::PlotLine("GPU Frame", g_gpuFrameProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex);
             ImPlot::PlotLine("Present", g_presentProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex);
             ImPlot::PlotLine("Render Director", g_renderDirectorProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex);
+            ImPlot::PlotLine("Frame Fence", g_frameFenceProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex);
+            ImPlot::PlotLine("Present Wait", g_presentWaitProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex);
+            ImPlot::PlotLine("Swap Chain Acquire", g_swapChainAcquireProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex);
             ImPlot::EndPlot();
         }

         g_profilerValueIndex = (g_profilerValueIndex + 1) % PROFILER_VALUE_COUNT;

         ImGui::Text("Current Application: %g ms (%g FPS)", App::s_deltaTime * 1000.0, 1.0 / App::s_deltaTime);
+        ImGui::Text("Current GPU Frame: %g ms (%g FPS)", g_gpuFrameProfiler.value.load(), 1000.0 / g_gpuFrameProfiler.value.load());
         ImGui::Text("Current Present: %g ms (%g FPS)", g_presentProfiler.value.load(), 1000.0 / g_presentProfiler.value.load());
         ImGui::Text("Current Render Director: %g ms (%g FPS)", g_renderDirectorProfiler.value.load(), 1000.0 / g_renderDirectorProfiler.value.load());
+        ImGui::Text("Current Frame Fence: %g ms (%g FPS)", g_frameFenceProfiler.value.load(), 1000.0 / g_frameFenceProfiler.value.load());
+        ImGui::Text("Current Present Wait: %g ms (%g FPS)", g_presentWaitProfiler.value.load(), 1000.0 / g_presentWaitProfiler.value.load());
+        ImGui::Text("Current Swap Chain Acquire: %g ms (%g FPS)", g_swapChainAcquireProfiler.value.load(), 1000.0 / g_swapChainAcquireProfiler.value.load());
+
         ImGui::NewLine();

         ImGui::Text("Average Application: %g ms (%g FPS)", applicationAvg, 1000.0 / applicationAvg);
+        ImGui::Text("Average GPU Frame: %g ms (%g FPS)", gpuFrameAvg, 1000.0 / gpuFrameAvg);
         ImGui::Text("Average Present: %g ms (%g FPS)", presentAvg, 1000.0 / presentAvg);
         ImGui::Text("Average Render Director: %g ms (%g FPS)", renderDirectorAvg, 1000.0 / renderDirectorAvg);
+        ImGui::Text("Average Frame Fence: %g ms (%g FPS)", frameFenceAvg, 1000.0 / frameFenceAvg);
+        ImGui::Text("Average Present Wait: %g ms (%g FPS)", presentWaitAvg, 1000.0 / presentWaitAvg);
+        ImGui::Text("Average Swap Chain Acquire: %g ms (%g FPS)", swapChainAcquireAvg, 1000.0 / swapChainAcquireAvg);
+
         ImGui::NewLine();

         O1HeapDiagnostics diagnostics, physicalDiagnostics;
@@ -2240,6 +2282,7 @@ static void DrawProfiler()

         ImGui::Text("Heap Allocated: %d MB", int32_t(diagnostics.allocated / (1024 * 1024)));
         ImGui::Text("Physical Heap Allocated: %d MB", int32_t(physicalDiagnostics.allocated / (1024 * 1024)));
+        ImGui::Text("GPU Waits: %d", int32_t(g_waitForGPUCount));
         ImGui::NewLine();

         ImGui::Text("Present Wait: %s", g_capabilities.presentWait ? "Supported" : "Unsupported");
@@ -2509,7 +2552,11 @@ void Video::WaitOnSwapChain()
     if (g_pendingWaitOnSwapChain)
     {
         if (g_swapChainValid)
+        {
+            g_presentWaitProfiler.Begin();
             g_swapChain->wait();
+            g_presentWaitProfiler.End();
+        }

         g_pendingWaitOnSwapChain = false;
     }
@@ -2542,7 +2589,11 @@ void Video::Present()
     if (g_swapChainValid)
     {
         if (g_pendingWaitOnSwapChain)
+        {
+            g_presentWaitProfiler.Begin();
             g_swapChain->wait(); // Never gonna happen outside loading threads as explained above.
+            g_presentWaitProfiler.End();
+        }

         RenderCommandSemaphore* signalSemaphores[] = { g_renderSemaphores[g_frame].get() };
         g_swapChainValid = g_swapChain->present(g_backBufferIndex, signalSemaphores, std::size(signalSemaphores));
@@ -2555,8 +2606,15 @@ void Video::Present()

     if (g_commandListStates[g_frame])
     {
+        g_frameFenceProfiler.Begin();
         g_queue->waitForCommandFence(g_commandFences[g_frame].get());
+        g_frameFenceProfiler.End();
         g_commandListStates[g_frame] = false;
+
+        // Update the GPU profiler with the results from the timestamps of the frame.
+        g_queryPools[g_frame]->queryResults();
+        const uint64_t *frameTimestamps = g_queryPools[g_frame]->getResults();
+        g_gpuFrameProfiler.Set(double(frameTimestamps[1] - frameTimestamps[0]) / 1000000.0);
     }

     g_dirtyStates = DirtyStates(true);
@@ -2691,6 +2749,7 @@ static void ProcExecuteCommandList(const RenderCommand& cmd)
     }

     auto &commandList = g_commandLists[g_frame];
+    commandList->writeTimestamp(g_queryPools[g_frame].get(), 1);
     commandList->end();

     if (g_swapChainValid)