diff --git a/UnleashedRecomp/gpu/rhi/plume_d3d12.cpp b/UnleashedRecomp/gpu/rhi/plume_d3d12.cpp index 85974e1..6b9eeea 100644 --- a/UnleashedRecomp/gpu/rhi/plume_d3d12.cpp +++ b/UnleashedRecomp/gpu/rhi/plume_d3d12.cpp @@ -1437,6 +1437,52 @@ namespace plume { return height; } + // D3D12QueryPool + + D3D12QueryPool::D3D12QueryPool(D3D12Device *device, uint32_t queryCount) { + assert(device != nullptr); + assert(queryCount > 0); + + this->device = device; + + D3D12_QUERY_HEAP_DESC queryHeapDesc = {}; + queryHeapDesc.Type = D3D12_QUERY_HEAP_TYPE_TIMESTAMP; + queryHeapDesc.Count = queryCount; + + HRESULT res = device->d3d->CreateQueryHeap(&queryHeapDesc, IID_PPV_ARGS(&d3d)); + if (FAILED(res)) { + fprintf(stderr, "CreateQueryHeap failed with error code 0x%lX.\n", res); + return; + } + + readbackBuffer = device->createBuffer(RenderBufferDesc::ReadbackBuffer(sizeof(uint64_t) * queryCount)); + results.resize(queryCount); + } + + D3D12QueryPool::~D3D12QueryPool() { + if (d3d != nullptr) { + d3d->Release(); + } + } + + void D3D12QueryPool::queryResults() { + void *readbackData = readbackBuffer->map(); + memcpy(results.data(), readbackData, sizeof(uint64_t) * results.size()); + readbackBuffer->unmap(); + + for (uint64_t &result : results) { + result = result / double(device->timestampFrequency) * 1000000000.0; + } + } + + const uint64_t *D3D12QueryPool::getResults() const { + return results.data(); + } + + uint32_t D3D12QueryPool::getCount() const { + return uint32_t(results.size()); + } + // D3D12CommandList D3D12CommandList::D3D12CommandList(D3D12Device *device, RenderCommandListType type) { @@ -2004,6 +2050,19 @@ namespace plume { d3d->DiscardResource(interfaceTexture->d3d, nullptr); } + void D3D12CommandList::resetQueryPool(const RenderQueryPool *queryPool, uint32_t queryFirstIndex, uint32_t queryCount) { + // Do nothing. + } + + void D3D12CommandList::writeTimestamp(const RenderQueryPool *queryPool, uint32_t queryIndex) { + assert(queryPool != nullptr); + + const D3D12QueryPool *interfaceQueryPool = static_cast(queryPool); + const D3D12Buffer *readbackBuffer = static_cast(interfaceQueryPool->readbackBuffer.get()); + d3d->EndQuery(interfaceQueryPool->d3d, D3D12_QUERY_TYPE_TIMESTAMP, queryIndex); + d3d->ResolveQueryData(interfaceQueryPool->d3d, D3D12_QUERY_TYPE_TIMESTAMP, queryIndex, 1, readbackBuffer->d3d, queryIndex * sizeof(uint64_t)); + } + void D3D12CommandList::checkDescriptorHeaps() { if (!descriptorHeapsSet) { ID3D12DescriptorHeap *descriptorHeaps[] = { device->viewHeapAllocator->heap, device->samplerHeapAllocator->heap }; @@ -3461,6 +3520,13 @@ namespace plume { samplerHeapAllocator = std::make_unique(this, SamplerDescriptorHeapSize, D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER); colorTargetHeapAllocator = std::make_unique(this, TargetDescriptorHeapSize, D3D12_DESCRIPTOR_HEAP_TYPE_RTV); depthTargetHeapAllocator = std::make_unique(this, TargetDescriptorHeapSize, D3D12_DESCRIPTOR_HEAP_TYPE_DSV); + + // Create a command queue only for retrieving the timestamp frequency. Delete it immediately afterwards. + std::unique_ptr timestampCommandQueue = std::make_unique(this, RenderCommandListType::DIRECT); + res = timestampCommandQueue->d3d->GetTimestampFrequency(×tampFrequency); + if (FAILED(res)) { + fprintf(stderr, "GetTimestampFrequency failed with error code 0x%lX. Timestamps will be inaccurate.\n", res); + } } D3D12Device::~D3D12Device() { @@ -3535,6 +3601,10 @@ namespace plume { return std::make_unique(this, desc); } + std::unique_ptr D3D12Device::createQueryPool(uint32_t queryCount) { + return std::make_unique(this, queryCount); + } + void D3D12Device::setBottomLevelASBuildInfo(RenderBottomLevelASBuildInfo &buildInfo, const RenderBottomLevelASMesh *meshes, uint32_t meshCount, bool preferFastBuild, bool preferFastTrace) { assert(meshes != nullptr); assert(meshCount > 0); diff --git a/UnleashedRecomp/gpu/rhi/plume_d3d12.h b/UnleashedRecomp/gpu/rhi/plume_d3d12.h index 6b00ed7..d4987fb 100644 --- a/UnleashedRecomp/gpu/rhi/plume_d3d12.h +++ b/UnleashedRecomp/gpu/rhi/plume_d3d12.h @@ -144,6 +144,19 @@ namespace plume { uint32_t getHeight() const override; }; + struct D3D12QueryPool : RenderQueryPool { + D3D12Device *device = nullptr; + ID3D12QueryHeap *d3d = nullptr; + std::vector results; + std::unique_ptr readbackBuffer; + + D3D12QueryPool(D3D12Device *device, uint32_t queryCount); + virtual ~D3D12QueryPool() override; + virtual void queryResults() override; + virtual const uint64_t *getResults() const override; + virtual uint32_t getCount() const override; + }; + struct D3D12CommandList : RenderCommandList { ID3D12GraphicsCommandList9 *d3d = nullptr; ID3D12CommandAllocator *commandAllocator = nullptr; @@ -196,6 +209,8 @@ namespace plume { void buildBottomLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, const RenderBottomLevelASBuildInfo &buildInfo) override; void buildTopLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, RenderBufferReference instancesBuffer, const RenderTopLevelASBuildInfo &buildInfo) override; void discardTexture(const RenderTexture* texture) override; + void resetQueryPool(const RenderQueryPool *queryPool, uint32_t queryFirstIndex, uint32_t queryCount) override; + void writeTimestamp(const RenderQueryPool *queryPool, uint32_t queryIndex) override; void checkDescriptorHeaps(); void notifyDescriptorHeapWasChangedExternally(); void checkTopology(); @@ -417,6 +432,7 @@ namespace plume { std::unique_ptr depthTargetHeapAllocator; RenderDeviceCapabilities capabilities; RenderDeviceDescription description; + uint64_t timestampFrequency = 1; D3D12Device(D3D12Interface *renderInterface, const std::string &preferredDeviceName); ~D3D12Device() override; @@ -436,6 +452,7 @@ namespace plume { std::unique_ptr createCommandFence() override; std::unique_ptr createCommandSemaphore() override; std::unique_ptr createFramebuffer(const RenderFramebufferDesc &desc) override; + std::unique_ptr createQueryPool(uint32_t queryCount) override; void setBottomLevelASBuildInfo(RenderBottomLevelASBuildInfo &buildInfo, const RenderBottomLevelASMesh *meshes, uint32_t meshCount, bool preferFastBuild, bool preferFastTrace) override; void setTopLevelASBuildInfo(RenderTopLevelASBuildInfo &buildInfo, const RenderTopLevelASInstance *instances, uint32_t instanceCount, bool preferFastBuild, bool preferFastTrace) override; void setShaderBindingTableInfo(RenderShaderBindingTableInfo &tableInfo, const RenderShaderBindingGroups &groups, const RenderPipeline *pipeline, RenderDescriptorSet **descriptorSets, uint32_t descriptorSetCount) override; diff --git a/UnleashedRecomp/gpu/rhi/plume_render_interface.h b/UnleashedRecomp/gpu/rhi/plume_render_interface.h index 995bc25..e62db05 100644 --- a/UnleashedRecomp/gpu/rhi/plume_render_interface.h +++ b/UnleashedRecomp/gpu/rhi/plume_render_interface.h @@ -147,6 +147,8 @@ namespace plume { virtual void buildBottomLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, const RenderBottomLevelASBuildInfo &buildInfo) = 0; virtual void buildTopLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, RenderBufferReference instancesBuffer, const RenderTopLevelASBuildInfo &buildInfo) = 0; virtual void discardTexture(const RenderTexture* texture) = 0; // D3D12 only. + virtual void resetQueryPool(const RenderQueryPool *queryPool, uint32_t queryFirstIndex, uint32_t queryCount) = 0; + virtual void writeTimestamp(const RenderQueryPool *queryPool, uint32_t queryIndex) = 0; // Concrete implementation shortcuts. inline void barriers(RenderBarrierStages stages, const RenderBufferBarrier &barrier) { @@ -208,6 +210,13 @@ namespace plume { virtual std::unique_ptr createTexture(const RenderTextureDesc &desc) = 0; }; + struct RenderQueryPool { + virtual ~RenderQueryPool() { } + virtual void queryResults() = 0; + virtual const uint64_t *getResults() const = 0; + virtual uint32_t getCount() const = 0; + }; + struct RenderDevice { virtual ~RenderDevice() { } virtual std::unique_ptr createCommandList(RenderCommandListType type) = 0; @@ -226,6 +235,7 @@ namespace plume { virtual std::unique_ptr createCommandFence() = 0; virtual std::unique_ptr createCommandSemaphore() = 0; virtual std::unique_ptr createFramebuffer(const RenderFramebufferDesc &desc) = 0; + virtual std::unique_ptr createQueryPool(uint32_t queryCount) = 0; virtual void setBottomLevelASBuildInfo(RenderBottomLevelASBuildInfo &buildInfo, const RenderBottomLevelASMesh *meshes, uint32_t meshCount, bool preferFastBuild = true, bool preferFastTrace = false) = 0; virtual void setTopLevelASBuildInfo(RenderTopLevelASBuildInfo &buildInfo, const RenderTopLevelASInstance *instances, uint32_t instanceCount, bool preferFastBuild = true, bool preferFastTrace = false) = 0; virtual void setShaderBindingTableInfo(RenderShaderBindingTableInfo &tableInfo, const RenderShaderBindingGroups &groups, const RenderPipeline *pipeline, RenderDescriptorSet **descriptorSets, uint32_t descriptorSetCount) = 0; diff --git a/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h b/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h index b755183..36a9c99 100644 --- a/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h +++ b/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h @@ -69,6 +69,7 @@ namespace plume { struct RenderSampler; struct RenderShader; struct RenderTexture; + struct RenderQueryPool; // Enums. diff --git a/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp b/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp index 9e497c8..a95df55 100644 --- a/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp +++ b/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp @@ -2522,6 +2522,80 @@ namespace plume { return (depthAttachment == attachment); } + // VulkanQueryPool + + VulkanQueryPool::VulkanQueryPool(VulkanDevice *device, uint32_t queryCount) { + assert(device != nullptr); + assert(queryCount > 0); + + this->device = device; + + VkQueryPoolCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + createInfo.queryType = VK_QUERY_TYPE_TIMESTAMP; + createInfo.queryCount = queryCount; + + VkResult res = vkCreateQueryPool(device->vk, &createInfo, nullptr, &vk); + if (res != VK_SUCCESS) { + fprintf(stderr, "vkCreateQueryPool failed with error code 0x%X.\n", res); + return; + } + + results.resize(queryCount); + } + + VulkanQueryPool::~VulkanQueryPool() { + vkDestroyQueryPool(device->vk, vk, nullptr); + } + + void VulkanQueryPool::queryResults() { + VkResult res = vkGetQueryPoolResults(device->vk, vk, 0, uint32_t(results.size()), sizeof(uint64_t) * results.size(), results.data(), sizeof(uint64_t), VK_QUERY_RESULT_64_BIT); + if (res != VK_SUCCESS) { + fprintf(stderr, "vkGetQueryPoolResults failed with error code 0x%X.\n", res); + return; + } + + // Conversion sourced from Godot Engine's Vulkan Rendering Driver. + auto mult64to128 = [](uint64_t u, uint64_t v, uint64_t &h, uint64_t &l) { + uint64_t u1 = (u & 0xffffffff); + uint64_t v1 = (v & 0xffffffff); + uint64_t t = (u1 * v1); + uint64_t w3 = (t & 0xffffffff); + uint64_t k = (t >> 32); + + u >>= 32; + t = (u * v1) + k; + k = (t & 0xffffffff); + uint64_t w1 = (t >> 32); + + v >>= 32; + t = (u1 * v) + k; + k = (t >> 32); + + h = (u * v) + w1 + k; + l = (t << 32) + w3; + }; + + // Convert results to timestamps. + constexpr uint64_t shift_bits = 16; + double timestampPeriod = double(device->physicalDeviceProperties.limits.timestampPeriod); + uint64_t h = 0, l = 0; + for (uint64_t &result : results) { + mult64to128(result, uint64_t(timestampPeriod * double(1 << shift_bits)), h, l); + result = l; + result >>= shift_bits; + result |= h << (64 - shift_bits); + } + } + + const uint64_t *VulkanQueryPool::getResults() const { + return results.data(); + } + + uint32_t VulkanQueryPool::getCount() const { + return uint32_t(results.size()); + } + // VulkanCommandList VulkanCommandList::VulkanCommandList(VulkanDevice *device, RenderCommandListType type) { @@ -3210,6 +3284,20 @@ namespace plume { // Not required in Vulkan. } + void VulkanCommandList::resetQueryPool(const RenderQueryPool *queryPool, uint32_t queryFirstIndex, uint32_t queryCount) { + assert(queryPool != nullptr); + + const VulkanQueryPool *interfaceQueryPool = static_cast(queryPool); + vkCmdResetQueryPool(vk, interfaceQueryPool->vk, queryFirstIndex, queryCount); + } + + void VulkanCommandList::writeTimestamp(const RenderQueryPool *queryPool, uint32_t queryIndex) { + assert(queryPool != nullptr); + + const VulkanQueryPool *interfaceQueryPool = static_cast(queryPool); + vkCmdWriteTimestamp(vk, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, interfaceQueryPool->vk, queryIndex); + } + void VulkanCommandList::checkActiveRenderPass() { assert(targetFramebuffer != nullptr); @@ -3891,6 +3979,10 @@ namespace plume { return std::make_unique(this, desc); } + std::unique_ptr VulkanDevice::createQueryPool(uint32_t queryCount) { + return std::make_unique(this, queryCount); + } + void VulkanDevice::setBottomLevelASBuildInfo(RenderBottomLevelASBuildInfo &buildInfo, const RenderBottomLevelASMesh *meshes, uint32_t meshCount, bool preferFastBuild, bool preferFastTrace) { assert(meshes != nullptr); assert(meshCount > 0); diff --git a/UnleashedRecomp/gpu/rhi/plume_vulkan.h b/UnleashedRecomp/gpu/rhi/plume_vulkan.h index 59f2aaf..e25e186 100644 --- a/UnleashedRecomp/gpu/rhi/plume_vulkan.h +++ b/UnleashedRecomp/gpu/rhi/plume_vulkan.h @@ -271,6 +271,18 @@ namespace plume { bool contains(const VulkanTexture *attachment) const; }; + struct VulkanQueryPool : RenderQueryPool { + VulkanDevice *device = nullptr; + std::vector results; + VkQueryPool vk = VK_NULL_HANDLE; + + VulkanQueryPool(VulkanDevice *device, uint32_t queryCount); + virtual ~VulkanQueryPool() override; + virtual void queryResults() override; + virtual const uint64_t *getResults() const override; + virtual uint32_t getCount() const override; + }; + struct VulkanCommandList : RenderCommandList { VkCommandBuffer vk = VK_NULL_HANDLE; VkCommandPool commandPool = VK_NULL_HANDLE; @@ -319,6 +331,8 @@ namespace plume { void buildBottomLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, const RenderBottomLevelASBuildInfo &buildInfo) override; void buildTopLevelAS(const RenderAccelerationStructure *dstAccelerationStructure, RenderBufferReference scratchBuffer, RenderBufferReference instancesBuffer, const RenderTopLevelASBuildInfo &buildInfo) override; void discardTexture(const RenderTexture* texture) override; + void resetQueryPool(const RenderQueryPool *queryPool, uint32_t queryFirstIndex, uint32_t queryCount) override; + void writeTimestamp(const RenderQueryPool *queryPool, uint32_t queryIndex) override; void checkActiveRenderPass(); void endActiveRenderPass(); void setDescriptorSet(VkPipelineBindPoint bindPoint, const VulkanPipelineLayout *pipelineLayout, const RenderDescriptorSet *descriptorSet, uint32_t setIndex); @@ -409,6 +423,7 @@ namespace plume { std::unique_ptr createCommandFence() override; std::unique_ptr createCommandSemaphore() override; std::unique_ptr createFramebuffer(const RenderFramebufferDesc &desc) override; + std::unique_ptr createQueryPool(uint32_t queryCount) override; void setBottomLevelASBuildInfo(RenderBottomLevelASBuildInfo &buildInfo, const RenderBottomLevelASMesh *meshes, uint32_t meshCount, bool preferFastBuild, bool preferFastTrace) override; void setTopLevelASBuildInfo(RenderTopLevelASBuildInfo &buildInfo, const RenderTopLevelASInstance *instances, uint32_t instanceCount, bool preferFastBuild, bool preferFastTrace) override; void setShaderBindingTableInfo(RenderShaderBindingTableInfo &tableInfo, const RenderShaderBindingGroups &groups, const RenderPipeline *pipeline, RenderDescriptorSet **descriptorSets, uint32_t descriptorSetCount) override; diff --git a/UnleashedRecomp/gpu/video.cpp b/UnleashedRecomp/gpu/video.cpp index 7073c86..0f8d49a 100644 --- a/UnleashedRecomp/gpu/video.cpp +++ b/UnleashedRecomp/gpu/video.cpp @@ -230,6 +230,55 @@ static void SetDirtyValue(bool& dirtyState, T& dest, const T& src) } } +static constexpr size_t PROFILER_VALUE_COUNT = 256; +static size_t g_profilerValueIndex; + +struct Profiler +{ + std::atomic value; + double values[PROFILER_VALUE_COUNT]; + std::chrono::steady_clock::time_point start; + + void Begin() + { + start = std::chrono::steady_clock::now(); + } + + void End() + { + value = std::chrono::duration(std::chrono::steady_clock::now() - start).count(); + } + + void Set(double v) + { + value = v; + } + + void Reset() + { + End(); + Begin(); + } + + double UpdateAndReturnAverage() + { + values[g_profilerValueIndex] = value; + return std::accumulate(values, values + PROFILER_VALUE_COUNT, 0.0) / PROFILER_VALUE_COUNT; + } +}; + +static double g_applicationValues[PROFILER_VALUE_COUNT]; +static Profiler g_gpuFrameProfiler; +static Profiler g_presentProfiler; +static Profiler g_updateDirectorProfiler; +static Profiler g_renderDirectorProfiler; +static Profiler g_frameFenceProfiler; +static Profiler g_presentWaitProfiler; +static Profiler g_swapChainAcquireProfiler; + +static bool g_profilerVisible; +static bool g_profilerWasToggled; + #ifdef UNLEASHED_RECOMP_D3D12 static bool g_vulkan = false; #else @@ -245,6 +294,7 @@ static std::unique_ptr g_device; static RenderDeviceCapabilities g_capabilities; static constexpr size_t NUM_FRAMES = 2; +static constexpr size_t NUM_QUERIES = 2; static uint32_t g_frame = 0; static uint32_t g_nextFrame = 1; @@ -252,6 +302,7 @@ static uint32_t g_nextFrame = 1; static std::unique_ptr g_queue; static std::unique_ptr g_commandLists[NUM_FRAMES]; static std::unique_ptr g_commandFences[NUM_FRAMES]; +static std::unique_ptr g_queryPools[NUM_FRAMES]; static bool g_commandListStates[NUM_FRAMES]; static Mutex g_copyMutex; @@ -1476,7 +1527,11 @@ static void CheckSwapChain() } if (g_swapChainValid) + { + g_swapChainAcquireProfiler.Begin(); g_swapChainValid = g_swapChain->acquireTexture(g_acquireSemaphores[g_frame].get(), &g_backBufferIndex); + g_swapChainAcquireProfiler.End(); + } if (g_needsResize) Video::ComputeViewportDimensions(); @@ -1552,6 +1607,8 @@ static void BeginCommandList() auto& commandList = g_commandLists[g_frame]; commandList->begin(); + commandList->resetQueryPool(g_queryPools[g_frame].get(), 0, NUM_QUERIES); + commandList->writeTimestamp(g_queryPools[g_frame].get(), 0); commandList->setGraphicsPipelineLayout(g_pipelineLayout.get()); commandList->setGraphicsDescriptorSet(g_textureDescriptorSet.get(), 0); commandList->setGraphicsDescriptorSet(g_textureDescriptorSet.get(), 1); @@ -1655,6 +1712,9 @@ bool Video::CreateHostDevice(const char *sdlVideoDriver) for (auto& commandFence : g_commandFences) commandFence = g_device->createCommandFence(); + for (auto& queryPool : g_queryPools) + queryPool = g_device->createQueryPool(NUM_QUERIES); + g_copyQueue = g_device->createCommandQueue(RenderCommandListType::COPY); g_copyCommandList = g_device->createCommandList(RenderCommandListType::COPY); g_copyCommandFence = g_device->createCommandFence(); @@ -1875,8 +1935,12 @@ bool Video::CreateHostDevice(const char *sdlVideoDriver) return true; } +static uint32_t g_waitForGPUCount = 0; + void Video::WaitForGPU() { + g_waitForGPUCount++; + if (g_vulkan) { g_device->waitIdle(); @@ -2125,45 +2189,6 @@ static uint32_t HashVertexDeclaration(uint32_t vertexDeclaration) return vertexDeclaration; } -static constexpr size_t PROFILER_VALUE_COUNT = 256; -static size_t g_profilerValueIndex; - -struct Profiler -{ - std::atomic value; - double values[PROFILER_VALUE_COUNT]; - std::chrono::steady_clock::time_point start; - - void Begin() - { - start = std::chrono::steady_clock::now(); - } - - void End() - { - value = std::chrono::duration(std::chrono::steady_clock::now() - start).count(); - } - - void Reset() - { - End(); - Begin(); - } - - double UpdateAndReturnAverage() - { - values[g_profilerValueIndex] = value; - return std::accumulate(values, values + PROFILER_VALUE_COUNT, 0.0) / PROFILER_VALUE_COUNT; - } -}; - -static double g_applicationValues[PROFILER_VALUE_COUNT]; -static Profiler g_presentProfiler; -static Profiler g_renderDirectorProfiler; - -static bool g_profilerVisible; -static bool g_profilerWasToggled; - static const char *DeviceTypeName(RenderDeviceType type) { switch (type) @@ -2203,29 +2228,51 @@ static void DrawProfiler() g_applicationValues[g_profilerValueIndex] = App::s_deltaTime * 1000.0; const double applicationAvg = std::accumulate(g_applicationValues, g_applicationValues + PROFILER_VALUE_COUNT, 0.0) / PROFILER_VALUE_COUNT; + double gpuFrameAvg = g_gpuFrameProfiler.UpdateAndReturnAverage(); double presentAvg = g_presentProfiler.UpdateAndReturnAverage(); + double updateDirectorAvg = g_updateDirectorProfiler.UpdateAndReturnAverage(); double renderDirectorAvg = g_renderDirectorProfiler.UpdateAndReturnAverage(); + double frameFenceAvg = g_frameFenceProfiler.UpdateAndReturnAverage(); + double presentWaitAvg = g_presentWaitProfiler.UpdateAndReturnAverage(); + double swapChainAcquireAvg = g_swapChainAcquireProfiler.UpdateAndReturnAverage(); if (ImPlot::BeginPlot("Frame Time")) { ImPlot::SetupAxisLimits(ImAxis_Y1, 0.0, 20.0); ImPlot::SetupAxis(ImAxis_Y1, "ms", ImPlotAxisFlags_None); ImPlot::PlotLine("Application", g_applicationValues, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex); + ImPlot::PlotLine("GPU Frame", g_gpuFrameProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex); ImPlot::PlotLine("Present", g_presentProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex); + ImPlot::PlotLine("Update Director", g_updateDirectorProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex); ImPlot::PlotLine("Render Director", g_renderDirectorProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex); + ImPlot::PlotLine("Frame Fence", g_frameFenceProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex); + ImPlot::PlotLine("Present Wait", g_presentWaitProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex); + ImPlot::PlotLine("Swap Chain Acquire", g_swapChainAcquireProfiler.values, PROFILER_VALUE_COUNT, 1.0, 0.0, ImPlotLineFlags_None, g_profilerValueIndex); ImPlot::EndPlot(); } g_profilerValueIndex = (g_profilerValueIndex + 1) % PROFILER_VALUE_COUNT; ImGui::Text("Current Application: %g ms (%g FPS)", App::s_deltaTime * 1000.0, 1.0 / App::s_deltaTime); + ImGui::Text("Current GPU Frame: %g ms (%g FPS)", g_gpuFrameProfiler.value.load(), 1000.0 / g_gpuFrameProfiler.value.load()); ImGui::Text("Current Present: %g ms (%g FPS)", g_presentProfiler.value.load(), 1000.0 / g_presentProfiler.value.load()); + ImGui::Text("Current Update Director: %g ms (%g FPS)", g_updateDirectorProfiler.value.load(), 1000.0 / g_updateDirectorProfiler.value.load()); ImGui::Text("Current Render Director: %g ms (%g FPS)", g_renderDirectorProfiler.value.load(), 1000.0 / g_renderDirectorProfiler.value.load()); + ImGui::Text("Current Frame Fence: %g ms", g_frameFenceProfiler.value.load()); + ImGui::Text("Current Present Wait: %g ms", g_presentWaitProfiler.value.load()); + ImGui::Text("Current Swap Chain Acquire: %g ms", g_swapChainAcquireProfiler.value.load()); + ImGui::NewLine(); ImGui::Text("Average Application: %g ms (%g FPS)", applicationAvg, 1000.0 / applicationAvg); + ImGui::Text("Average GPU Frame: %g ms (%g FPS)", gpuFrameAvg, 1000.0 / gpuFrameAvg); ImGui::Text("Average Present: %g ms (%g FPS)", presentAvg, 1000.0 / presentAvg); + ImGui::Text("Average Update Director: %g ms (%g FPS)", updateDirectorAvg, 1000.0 / updateDirectorAvg); ImGui::Text("Average Render Director: %g ms (%g FPS)", renderDirectorAvg, 1000.0 / renderDirectorAvg); + ImGui::Text("Average Frame Fence: %g ms", frameFenceAvg); + ImGui::Text("Average Present Wait: %g ms", presentWaitAvg); + ImGui::Text("Average Swap Chain Acquire: %g ms", swapChainAcquireAvg); + ImGui::NewLine(); O1HeapDiagnostics diagnostics, physicalDiagnostics; @@ -2240,6 +2287,7 @@ static void DrawProfiler() ImGui::Text("Heap Allocated: %d MB", int32_t(diagnostics.allocated / (1024 * 1024))); ImGui::Text("Physical Heap Allocated: %d MB", int32_t(physicalDiagnostics.allocated / (1024 * 1024))); + ImGui::Text("GPU Waits: %d", int32_t(g_waitForGPUCount)); ImGui::NewLine(); ImGui::Text("Present Wait: %s", g_capabilities.presentWait ? "Supported" : "Unsupported"); @@ -2509,7 +2557,11 @@ void Video::WaitOnSwapChain() if (g_pendingWaitOnSwapChain) { if (g_swapChainValid) + { + g_presentWaitProfiler.Begin(); g_swapChain->wait(); + g_presentWaitProfiler.End(); + } g_pendingWaitOnSwapChain = false; } @@ -2542,7 +2594,11 @@ void Video::Present() if (g_swapChainValid) { if (g_pendingWaitOnSwapChain) + { + g_presentWaitProfiler.Begin(); g_swapChain->wait(); // Never gonna happen outside loading threads as explained above. + g_presentWaitProfiler.End(); + } RenderCommandSemaphore* signalSemaphores[] = { g_renderSemaphores[g_frame].get() }; g_swapChainValid = g_swapChain->present(g_backBufferIndex, signalSemaphores, std::size(signalSemaphores)); @@ -2555,8 +2611,15 @@ void Video::Present() if (g_commandListStates[g_frame]) { + g_frameFenceProfiler.Begin(); g_queue->waitForCommandFence(g_commandFences[g_frame].get()); + g_frameFenceProfiler.End(); g_commandListStates[g_frame] = false; + + // Update the GPU profiler with the results from the timestamps of the frame. + g_queryPools[g_frame]->queryResults(); + const uint64_t *frameTimestamps = g_queryPools[g_frame]->getResults(); + g_gpuFrameProfiler.Set(double(frameTimestamps[1] - frameTimestamps[0]) / 1000000.0); } g_dirtyStates = DirtyStates(true); @@ -2691,6 +2754,7 @@ static void ProcExecuteCommandList(const RenderCommand& cmd) } auto &commandList = g_commandLists[g_frame]; + commandList->writeTimestamp(g_queryPools[g_frame].get(), 1); commandList->end(); if (g_swapChainValid) @@ -5637,8 +5701,6 @@ PPC_FUNC(sub_8258C8A0) PPC_FUNC_IMPL(__imp__sub_8258CAE0); PPC_FUNC(sub_8258CAE0) { - g_renderDirectorProfiler.Begin(); - if (g_needsResize) { // Backup fade values. These get modified by cutscenes, @@ -5700,7 +5762,21 @@ PPC_FUNC(sub_8258CAE0) } __imp__sub_8258CAE0(ctx, base); +} +PPC_FUNC_IMPL(__imp__sub_824EB5B0); +PPC_FUNC(sub_824EB5B0) +{ + g_updateDirectorProfiler.Begin(); + __imp__sub_824EB5B0(ctx, base); + g_updateDirectorProfiler.End(); +} + +PPC_FUNC_IMPL(__imp__sub_824EB290); +PPC_FUNC(sub_824EB290) +{ + g_renderDirectorProfiler.Begin(); + __imp__sub_824EB290(ctx, base); g_renderDirectorProfiler.End(); }