Update tracy client to v0.10

This commit is contained in:
SteelT 2024-01-27 21:06:14 -05:00
parent b323c6bdb7
commit 9db766935c
26 changed files with 1316 additions and 817 deletions

View file

@ -1,4 +1,4 @@
# Tracy Profiler Client 0.9.1 # Tracy Profiler Client 0.10.0
# BSD 3-clause # BSD 3-clause
# Copyright (c) 2017-2023, Bartosz Taudul <wolf@nereid.pl> # Copyright (c) 2017-2023, Bartosz Taudul <wolf@nereid.pl>

View file

@ -22,6 +22,7 @@
#include "common/tracy_lz4.cpp" #include "common/tracy_lz4.cpp"
#include "client/TracyProfiler.cpp" #include "client/TracyProfiler.cpp"
#include "client/TracyCallstack.cpp" #include "client/TracyCallstack.cpp"
#include "client/TracySysPower.cpp"
#include "client/TracySysTime.cpp" #include "client/TracySysTime.cpp"
#include "client/TracySysTrace.cpp" #include "client/TracySysTrace.cpp"
#include "common/TracySocket.cpp" #include "common/TracySocket.cpp"

View file

@ -686,7 +686,9 @@ void InitCallstackCritical()
void InitCallstack() void InitCallstack()
{ {
cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr ); cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr );
#ifndef TRACY_DEMANGLE
___tracy_init_demangle_buffer(); ___tracy_init_demangle_buffer();
#endif
#ifdef __linux #ifdef __linux
InitKernelSymbols(); InitKernelSymbols();
@ -761,7 +763,9 @@ debuginfod_client* GetDebuginfodClient()
void EndCallstack() void EndCallstack()
{ {
#ifndef TRACY_DEMANGLE
___tracy_free_demangle_buffer(); ___tracy_free_demangle_buffer();
#endif
#ifdef TRACY_DEBUGINFOD #ifdef TRACY_DEBUGINFOD
ClearDebugInfoVector( s_di_known ); ClearDebugInfoVector( s_di_known );
debuginfod_end( s_debuginfod ); debuginfod_end( s_debuginfod );

View file

@ -21,7 +21,7 @@ public:
, m_active( false ) , m_active( false )
#endif #endif
{ {
assert( m_id != std::numeric_limits<uint32_t>::max() ); assert( m_id != (std::numeric_limits<uint32_t>::max)() );
auto item = Profiler::QueueSerial(); auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockAnnounce ); MemWrite( &item->hdr.type, QueueType::LockAnnounce );
@ -154,7 +154,7 @@ public:
tracy_force_inline void CustomName( const char* name, size_t size ) tracy_force_inline void CustomName( const char* name, size_t size )
{ {
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size ); auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, name, size ); memcpy( ptr, name, size );
auto item = Profiler::QueueSerial(); auto item = Profiler::QueueSerial();
@ -235,7 +235,7 @@ public:
, m_active( false ) , m_active( false )
#endif #endif
{ {
assert( m_id != std::numeric_limits<uint32_t>::max() ); assert( m_id != (std::numeric_limits<uint32_t>::max)() );
auto item = Profiler::QueueSerial(); auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockAnnounce ); MemWrite( &item->hdr.type, QueueType::LockAnnounce );
@ -450,7 +450,7 @@ public:
tracy_force_inline void CustomName( const char* name, size_t size ) tracy_force_inline void CustomName( const char* name, size_t size )
{ {
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size ); auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, name, size ); memcpy( ptr, name, size );
auto item = Profiler::QueueSerial(); auto item = Profiler::QueueSerial();

View file

@ -83,7 +83,9 @@
#endif #endif
#ifdef __APPLE__ #ifdef __APPLE__
# define TRACY_DELAYED_INIT # ifndef TRACY_DELAYED_INIT
# define TRACY_DELAYED_INIT
# endif
#else #else
# ifdef __GNUC__ # ifdef __GNUC__
# define init_order( val ) __attribute__ ((init_priority(val))) # define init_order( val ) __attribute__ ((init_priority(val)))
@ -1072,7 +1074,9 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ )
} }
closedir( dp ); closedir( dp );
#ifdef TRACY_HAS_CALLSTACK
if( selfTid == s_symbolTid ) s_symbolThreadGone.store( true, std::memory_order_release ); if( selfTid == s_symbolTid ) s_symbolThreadGone.store( true, std::memory_order_release );
#endif
TracyLfqPrepare( QueueType::Crash ); TracyLfqPrepare( QueueType::Crash );
TracyLfqCommit; TracyLfqCommit;
@ -1353,6 +1357,7 @@ Profiler::Profiler()
, m_queryImage( nullptr ) , m_queryImage( nullptr )
, m_queryData( nullptr ) , m_queryData( nullptr )
, m_crashHandlerInstalled( false ) , m_crashHandlerInstalled( false )
, m_programName( nullptr )
{ {
assert( !s_instance ); assert( !s_instance );
s_instance = this; s_instance = this;
@ -1711,6 +1716,9 @@ void Profiler::Worker()
if( m_sock ) break; if( m_sock ) break;
#ifndef TRACY_ON_DEMAND #ifndef TRACY_ON_DEMAND
ProcessSysTime(); ProcessSysTime();
# ifdef TRACY_HAS_SYSPOWER
m_sysPower.Tick();
# endif
#endif #endif
if( m_broadcast ) if( m_broadcast )
@ -1718,6 +1726,14 @@ void Profiler::Worker()
const auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count(); const auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count();
if( t - lastBroadcast > 3000000000 ) // 3s if( t - lastBroadcast > 3000000000 ) // 3s
{ {
m_programNameLock.lock();
if( m_programName )
{
broadcastMsg = GetBroadcastMessage( m_programName, strlen( m_programName ), broadcastLen, dataPort );
m_programName = nullptr;
}
m_programNameLock.unlock();
lastBroadcast = t; lastBroadcast = t;
const auto ts = std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count(); const auto ts = std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count();
broadcastMsg.activeTime = int32_t( ts - m_epoch ); broadcastMsg.activeTime = int32_t( ts - m_epoch );
@ -1828,6 +1844,9 @@ void Profiler::Worker()
for(;;) for(;;)
{ {
ProcessSysTime(); ProcessSysTime();
#ifdef TRACY_HAS_SYSPOWER
m_sysPower.Tick();
#endif
const auto status = Dequeue( token ); const auto status = Dequeue( token );
const auto serialStatus = DequeueSerial(); const auto serialStatus = DequeueSerial();
if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost ) if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost )
@ -4149,6 +4168,7 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_
TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); } TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); }
TRACY_API void ___tracy_emit_plot_float( const char* name, float val ) { tracy::Profiler::PlotData( name, val ); } TRACY_API void ___tracy_emit_plot_float( const char* name, float val ) { tracy::Profiler::PlotData( name, val ); }
TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ) { tracy::Profiler::PlotData( name, val ); } TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ) { tracy::Profiler::PlotData( name, val ); }
// C API entry point: forwards a plot configuration request for the plot `name`
// to tracy::Profiler::ConfigurePlot. The integer `type` is converted to
// tracy::PlotFormatType; `step`, `fill` and `color` are passed through as given.
TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color )
{
    const auto format = tracy::PlotFormatType( type );
    tracy::Profiler::ConfigurePlot( name, format, step, fill, color );
}
TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); } TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); }
TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); } TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); }
TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); } TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); }
@ -4167,7 +4187,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin( const struct ___tracy_gpu_zone_begi
{ {
TracyLfqPrepareC( tracy::QueueType::GpuZoneBegin ); TracyLfqPrepareC( tracy::QueueType::GpuZoneBegin );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@ -4190,7 +4210,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc( const struct ___tracy_gpu_zon
{ {
TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLoc ); TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLoc );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@ -4202,7 +4222,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack( const struct ___tra
tracy::GetProfiler().SendCallstack( data.depth ); tracy::GetProfiler().SendCallstack( data.depth );
TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLocCallstack ); TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLocCallstack );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@ -4292,7 +4312,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_serial( const struct ___tracy_
auto item = tracy::Profiler::QueueSerial(); auto item = tracy::Profiler::QueueSerial();
tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial ); tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@ -4304,7 +4324,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack_serial( const struct
auto item = tracy::Profiler::QueueSerialCallstack( tracy::Callstack( data.depth ) ); auto item = tracy::Profiler::QueueSerialCallstack( tracy::Callstack( data.depth ) );
tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocCallstackSerial ); tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocCallstackSerial );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() ); tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() ); tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc ); tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId ); tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context ); tracy::MemWrite( &item->gpuZoneBegin.context, data.context );

View file

@ -10,6 +10,7 @@
#include "tracy_concurrentqueue.h" #include "tracy_concurrentqueue.h"
#include "tracy_SPSCQueue.h" #include "tracy_SPSCQueue.h"
#include "TracyCallstack.hpp" #include "TracyCallstack.hpp"
#include "TracySysPower.hpp"
#include "TracySysTime.hpp" #include "TracySysTime.hpp"
#include "TracyFastVector.hpp" #include "TracyFastVector.hpp"
#include "../common/TracyQueue.hpp" #include "../common/TracyQueue.hpp"
@ -208,7 +209,22 @@ public:
if( HardwareSupportsInvariantTSC() ) if( HardwareSupportsInvariantTSC() )
{ {
uint64_t rax, rdx; uint64_t rax, rdx;
#ifdef TRACY_PATCHABLE_NOPSLEDS
// Some external tooling (such as rr) wants to patch our rdtsc and replace it by a
// branch to control the external input seen by a program. This kind of patching is
// not generally possible depending on the surrounding code and can lead to significant
// slowdowns if the compiler generated unlucky code and rr and tracy are used together.
// To avoid this, use the rr-safe `nopl 0(%rax, %rax, 1); rdtsc` instruction sequence,
// which rr promises will be patchable independent of the surrounding code.
asm volatile (
// This is nopl 0(%rax, %rax, 1), but assemblers are inconsistent about whether
// they emit that as a 4 or 5 byte sequence and we need to be guaranteed to use
// the 5 byte one.
".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n\t"
"rdtsc" : "=a" (rax), "=d" (rdx) );
#else
asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) ); asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) );
#endif
return (int64_t)(( rdx << 32 ) + rax); return (int64_t)(( rdx << 32 ) + rax);
} }
# else # else
@ -288,7 +304,7 @@ public:
{ {
#ifndef TRACY_NO_FRAME_IMAGE #ifndef TRACY_NO_FRAME_IMAGE
auto& profiler = GetProfiler(); auto& profiler = GetProfiler();
assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < std::numeric_limits<uint32_t>::max() ); assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < (std::numeric_limits<uint32_t>::max)() );
# ifdef TRACY_ON_DEMAND # ifdef TRACY_ON_DEMAND
if( !profiler.IsConnected() ) return; if( !profiler.IsConnected() ) return;
# endif # endif
@ -305,6 +321,12 @@ public:
fi->flip = flip; fi->flip = flip;
profiler.m_fiQueue.commit_next(); profiler.m_fiQueue.commit_next();
profiler.m_fiLock.unlock(); profiler.m_fiLock.unlock();
#else
static_cast<void>(image); // unused
static_cast<void>(w); // unused
static_cast<void>(h); // unused
static_cast<void>(offset); // unused
static_cast<void>(flip); // unused
#endif #endif
} }
@ -362,7 +384,7 @@ public:
static tracy_force_inline void Message( const char* txt, size_t size, int callstack ) static tracy_force_inline void Message( const char* txt, size_t size, int callstack )
{ {
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() ) return; if( !GetProfiler().IsConnected() ) return;
#endif #endif
@ -399,7 +421,7 @@ public:
static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ) static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int callstack )
{ {
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() ) return; if( !GetProfiler().IsConnected() ) return;
#endif #endif
@ -442,7 +464,7 @@ public:
static tracy_force_inline void MessageAppInfo( const char* txt, size_t size ) static tracy_force_inline void MessageAppInfo( const char* txt, size_t size )
{ {
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size ); auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size ); memcpy( ptr, txt, size );
TracyLfqPrepare( QueueType::MessageAppInfo ); TracyLfqPrepare( QueueType::MessageAppInfo );
@ -676,6 +698,13 @@ public:
return m_isConnected.load( std::memory_order_acquire ); return m_isConnected.load( std::memory_order_acquire );
} }
// Stores a new program name pointer while holding m_programNameLock.
// Only the pointer is saved here; the string itself is not copied.
// NOTE(review): the worker's broadcast code later calls strlen() on this pointer,
// so the string presumably has to stay valid until it has been consumed there -
// confirm against callers.
tracy_force_inline void SetProgramName( const char* name )
{
m_programNameLock.lock();
m_programName = name;
m_programNameLock.unlock();
}
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
tracy_force_inline uint64_t ConnectionId() const tracy_force_inline uint64_t ConnectionId() const
{ {
@ -730,7 +759,7 @@ public:
static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz ) static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz )
{ {
const auto sz32 = uint32_t( 2 + 4 + 4 + functionSz + 1 + sourceSz + 1 + nameSz ); const auto sz32 = uint32_t( 2 + 4 + 4 + functionSz + 1 + sourceSz + 1 + nameSz );
assert( sz32 <= std::numeric_limits<uint16_t>::max() ); assert( sz32 <= (std::numeric_limits<uint16_t>::max)() );
const auto sz = uint16_t( sz32 ); const auto sz = uint16_t( sz32 );
auto ptr = (char*)tracy_malloc( sz ); auto ptr = (char*)tracy_malloc( sz );
memcpy( ptr, &sz, 2 ); memcpy( ptr, &sz, 2 );
@ -941,6 +970,10 @@ private:
void ProcessSysTime() {} void ProcessSysTime() {}
#endif #endif
#ifdef TRACY_HAS_SYSPOWER
SysPower m_sysPower;
#endif
ParameterCallback m_paramCallback; ParameterCallback m_paramCallback;
void* m_paramCallbackData; void* m_paramCallbackData;
SourceContentsCallback m_sourceCallback; SourceContentsCallback m_sourceCallback;
@ -959,6 +992,9 @@ private:
} m_prevSignal; } m_prevSignal;
#endif #endif
bool m_crashHandlerInstalled; bool m_crashHandlerInstalled;
const char* m_programName;
TracyMutex m_programNameLock;
}; };
} }

View file

@ -108,7 +108,7 @@ public:
tracy_force_inline void Text( const char* txt, size_t size ) tracy_force_inline void Text( const char* txt, size_t size )
{ {
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
if( !m_active ) return; if( !m_active ) return;
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
if( GetProfiler().ConnectionId() != m_connectionId ) return; if( GetProfiler().ConnectionId() != m_connectionId ) return;
@ -123,7 +123,7 @@ public:
tracy_force_inline void Name( const char* txt, size_t size ) tracy_force_inline void Name( const char* txt, size_t size )
{ {
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
if( !m_active ) return; if( !m_active ) return;
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
if( GetProfiler().ConnectionId() != m_connectionId ) return; if( GetProfiler().ConnectionId() != m_connectionId ) return;

View file

@ -0,0 +1,164 @@
#include "TracySysPower.hpp"
#ifdef TRACY_HAS_SYSPOWER
#include <sys/types.h>
#include <dirent.h>
#include <chrono>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include "TracyDebug.hpp"
#include "TracyProfiler.hpp"
#include "../common/TracyAlloc.hpp"
namespace tracy
{
// Discovers the available power domains by scanning the powercap sysfs tree.
// The scan starts at the intel-rapl root directory, which has no parent domain.
SysPower::SysPower()
    : m_domains( 4 )
    , m_lastTime( 0 )
{
    static constexpr char RaplRoot[] = "/sys/devices/virtual/powercap/intel-rapl";
    static constexpr int NoParent = -1;
    ScanDirectory( RaplRoot, NoParent );
}
// Closes the energy counter stream of every registered power domain.
SysPower::~SysPower()
{
    const auto count = m_domains.size();
    for( size_t idx = 0; idx < count; idx++ )
    {
        // Only the stream is released. The domain name is intentionally left
        // allocated, as it may be still needed.
        fclose( m_domains[idx].handle );
    }
}
// Samples the energy counter of every registered power domain and queues a
// SysPowerReport item with the amount consumed since the previous sample.
// Sampling is rate limited: nothing happens unless more than 10000000 clock
// ticks have elapsed since the last accepted call.
void SysPower::Tick()
{
    auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count();
    if( t - m_lastTime > 10000000 )    // 10 ms
    {
        m_lastTime = t;
        for( auto& v : m_domains )
        {
            char tmp[32];
            // fread() does not terminate the buffer, so keep one byte in reserve
            // for the NUL that atoll() requires.
            const auto sz = fread( tmp, 1, sizeof( tmp ) - 1, v.handle );
            // Go back to the start regardless of the read result. rewind() also
            // clears the EOF and error indicators, so one failed read cannot
            // leave the stream unusable for all following ticks.
            rewind( v.handle );
            if( sz > 0 )
            {
                tmp[sz] = '\0';
                auto p = (uint64_t)atoll( tmp );
                uint64_t delta;
                if( p >= v.value )
                {
                    delta = p - v.value;
                }
                else
                {
                    // The counter went backwards, i.e. it wrapped around at v.overflow.
                    delta = v.overflow - v.value + p;
                }
                v.value = p;

                TracyLfqPrepare( QueueType::SysPowerReport );
                MemWrite( &item->sysPower.time, Profiler::GetTime() );
                MemWrite( &item->sysPower.delta, delta );
                // The name is sent as a pointer value, not as string contents.
                MemWrite( &item->sysPower.name, (uint64_t)v.name );
                TracyLfqCommit;
            }
        }
    }
}
// Inspects one powercap directory and registers it as a power domain when it
// provides all of: a readable "name", a readable "energy_uj" counter and a
// non-zero "max_energy_range_uj". Afterwards recurses into every "intel-rapl:*"
// subdirectory.
//
//   path    directory to scan
//   parent  index in m_domains of the enclosing domain, or a negative value if
//           there is none. The name of a nested domain is built as
//           "<parent name>:<own name>".
void SysPower::ScanDirectory( const char* path, int parent )
{
    DIR* dir = opendir( path );
    if( !dir ) return;
    struct dirent* ent;
    uint64_t maxRange = 0;
    char* name = nullptr;
    FILE* handle = nullptr;
    while( ( ent = readdir( dir ) ) )
    {
        if( ent->d_type == DT_REG )
        {
            if( strcmp( ent->d_name, "max_energy_range_uj" ) == 0 )
            {
                char tmp[PATH_MAX];
                snprintf( tmp, PATH_MAX, "%s/max_energy_range_uj", path );
                FILE* f = fopen( tmp, "r" );
                if( f )
                {
                    // A value that cannot be parsed is treated as a missing range.
                    if( fscanf( f, "%" PRIu64, &maxRange ) != 1 ) maxRange = 0;
                    fclose( f );
                }
            }
            else if( strcmp( ent->d_name, "name" ) == 0 )
            {
                char tmp[PATH_MAX];
                snprintf( tmp, PATH_MAX, "%s/name", path );
                FILE* f = fopen( tmp, "r" );
                if( f )
                {
                    char ntmp[128];
                    if( fgets( ntmp, 128, f ) )
                    {
                        // Strip the trailing newline, but only if it is really there.
                        // Blindly dropping the last character would truncate a name
                        // without a newline and underflow on an empty string.
                        auto sz = strlen( ntmp );
                        if( sz > 0 && ntmp[sz-1] == '\n' ) sz--;
                        if( parent < 0 )
                        {
                            name = (char*)tracy_malloc( sz + 1 );
                            memcpy( name, ntmp, sz );
                            name[sz] = '\0';
                        }
                        else
                        {
                            // Prefix with the parent domain name, separated by a colon.
                            const auto& p = m_domains[parent];
                            const auto psz = strlen( p.name );
                            name = (char*)tracy_malloc( psz + sz + 2 );
                            memcpy( name, p.name, psz );
                            name[psz] = ':';
                            memcpy( name+psz+1, ntmp, sz );
                            name[psz+sz+1] = '\0';
                        }
                    }
                    fclose( f );
                }
            }
            else if( strcmp( ent->d_name, "energy_uj" ) == 0 )
            {
                char tmp[PATH_MAX];
                snprintf( tmp, PATH_MAX, "%s/energy_uj", path );
                // Kept open on success: this stream is stored in the domain entry.
                handle = fopen( tmp, "r" );
            }
        }
        // Everything needed has been collected, no point in reading further entries.
        if( name && handle && maxRange > 0 ) break;
    }
    if( name && handle && maxRange > 0 )
    {
        // From now on this domain is the parent for the subdirectories scanned below.
        parent = (int)m_domains.size();
        Domain* domain = m_domains.push_next();
        domain->value = 0;
        domain->overflow = maxRange;
        domain->handle = handle;
        domain->name = name;
        TracyDebug( "Power domain id %i, %s found at %s\n", parent, name, path );
    }
    else
    {
        // Incomplete domain, release whatever was acquired.
        if( name ) tracy_free( name );
        if( handle ) fclose( handle );
    }

    rewinddir( dir );
    while( ( ent = readdir( dir ) ) )
    {
        if( ent->d_type == DT_DIR && strncmp( ent->d_name, "intel-rapl:", 11 ) == 0 )
        {
            char tmp[PATH_MAX];
            snprintf( tmp, PATH_MAX, "%s/%s", path, ent->d_name );
            ScanDirectory( tmp, parent );
        }
    }
    closedir( dir );
}
}
#endif

View file

@ -0,0 +1,44 @@
#ifndef __TRACYSYSPOWER_HPP__
#define __TRACYSYSPOWER_HPP__
#if defined __linux__
# define TRACY_HAS_SYSPOWER
#endif
#ifdef TRACY_HAS_SYSPOWER
#include <stdint.h>
#include <stdio.h>
#include "TracyFastVector.hpp"
namespace tracy
{
// Periodically samples power domain energy counters and reports the consumed
// energy through the profiler queue.
class SysPower
{
    // State of a single power domain.
    struct Domain
    {
        uint64_t value;     // counter value seen at the previous sample
        uint64_t overflow;  // value read from max_energy_range_uj, used to handle counter wrap-around
        FILE* handle;       // open stream of the energy_uj counter file
        const char* name;   // domain name, prefixed with the parent domain name when nested
    };

public:
    // Scans for available power domains.
    SysPower();
    // Closes the counter streams of all domains.
    ~SysPower();

    // The domains own FILE* handles that the destructor closes, so a copy would
    // close every stream twice.
    SysPower( const SysPower& ) = delete;
    SysPower& operator=( const SysPower& ) = delete;

    // Samples all domains and queues the results. Rate limited internally.
    void Tick();

private:
    // Registers the domain found in `path`, if any, and recurses into nested domains.
    // `parent` is the m_domains index of the enclosing domain, negative if none.
    void ScanDirectory( const char* path, int parent );

    FastVector<Domain> m_domains;
    uint64_t m_lastTime;    // clock reading taken at the last accepted Tick()
};
}
#endif
#endif

View file

@ -770,6 +770,13 @@ bool SysTraceStart( int64_t& samplingPeriod )
TracyDebug( "sched_wakeup id: %i\n", wakeupId ); TracyDebug( "sched_wakeup id: %i\n", wakeupId );
TracyDebug( "drm_vblank_event id: %i\n", vsyncId ); TracyDebug( "drm_vblank_event id: %i\n", vsyncId );
#ifdef TRACY_NO_SAMPLING
const bool noSoftwareSampling = true;
#else
const char* noSoftwareSamplingEnv = GetEnvVar( "TRACY_NO_SAMPLING" );
const bool noSoftwareSampling = noSoftwareSamplingEnv && noSoftwareSamplingEnv[0] == '1';
#endif
#ifdef TRACY_NO_SAMPLE_RETIREMENT #ifdef TRACY_NO_SAMPLE_RETIREMENT
const bool noRetirement = true; const bool noRetirement = true;
#else #else
@ -839,28 +846,31 @@ bool SysTraceStart( int64_t& samplingPeriod )
pe.clockid = CLOCK_MONOTONIC_RAW; pe.clockid = CLOCK_MONOTONIC_RAW;
#endif #endif
TracyDebug( "Setup software sampling\n" ); if( !noSoftwareSampling )
ProbePreciseIp( pe, currentPid );
for( int i=0; i<s_numCpus; i++ )
{ {
int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC ); TracyDebug( "Setup software sampling\n" );
if( fd == -1 ) ProbePreciseIp( pe, currentPid );
for( int i=0; i<s_numCpus; i++ )
{ {
pe.exclude_kernel = 1; int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
ProbePreciseIp( pe, currentPid );
fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
if( fd == -1 ) if( fd == -1 )
{ {
TracyDebug( " Failed to setup!\n"); pe.exclude_kernel = 1;
break; ProbePreciseIp( pe, currentPid );
fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
if( fd == -1 )
{
TracyDebug( " Failed to setup!\n");
break;
}
TracyDebug( " No access to kernel samples\n" );
}
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack );
if( s_ring[s_numBuffers].IsValid() )
{
s_numBuffers++;
TracyDebug( " Core %i ok\n", i );
} }
TracyDebug( " No access to kernel samples\n" );
}
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack );
if( s_ring[s_numBuffers].IsValid() )
{
s_numBuffers++;
TracyDebug( " Core %i ok\n", i );
} }
} }

View file

@ -147,7 +147,7 @@
# if defined(__APPLE__) # if defined(__APPLE__)
# include <TargetConditionals.h> # include <TargetConditionals.h>
# if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR # if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
# include <mach/mach_vm.h> # include <mach/mach.h>
# include <mach/vm_statistics.h> # include <mach/vm_statistics.h>
# endif # endif
# include <pthread.h> # include <pthread.h>

View file

@ -9,14 +9,14 @@ namespace tracy
constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; }
enum : uint32_t { ProtocolVersion = 63 }; enum : uint32_t { ProtocolVersion = 64 };
enum : uint16_t { BroadcastVersion = 3 }; enum : uint16_t { BroadcastVersion = 3 };
using lz4sz_t = uint32_t; using lz4sz_t = uint32_t;
enum { TargetFrameSize = 256 * 1024 }; enum { TargetFrameSize = 256 * 1024 };
enum { LZ4Size = Lz4CompressBound( TargetFrameSize ) }; enum { LZ4Size = Lz4CompressBound( TargetFrameSize ) };
static_assert( LZ4Size <= std::numeric_limits<lz4sz_t>::max(), "LZ4Size greater than lz4sz_t" ); static_assert( LZ4Size <= (std::numeric_limits<lz4sz_t>::max)(), "LZ4Size greater than lz4sz_t" );
static_assert( TargetFrameSize * 2 >= 64 * 1024, "Not enough space for LZ4 stream buffer" ); static_assert( TargetFrameSize * 2 >= 64 * 1024, "Not enough space for LZ4 stream buffer" );
enum { HandshakeShibbolethSize = 8 }; enum { HandshakeShibbolethSize = 8 };

View file

@ -90,6 +90,7 @@ enum class QueueType : uint8_t
GpuNewContext, GpuNewContext,
CallstackFrame, CallstackFrame,
SysTimeReport, SysTimeReport,
SysPowerReport,
TidToPid, TidToPid,
HwSampleCpuCycle, HwSampleCpuCycle,
HwSampleInstructionRetired, HwSampleInstructionRetired,
@ -563,6 +564,13 @@ struct QueueSysTime
float sysTime; float sysTime;
}; };
// Payload of a QueueType::SysPowerReport queue item: one sample of a single
// power domain.
struct QueueSysPower
{
// Time of the sample, as returned by Profiler::GetTime().
int64_t time;
// Increase of the domain's energy counter since the previous sample.
// NOTE(review): presumably in microjoules, as the counter is read from an
// "energy_uj" file - confirm.
uint64_t delta;
// Address of the domain name string, sent as an integer rather than as text.
uint64_t name; // ptr
};
struct QueueContextSwitch struct QueueContextSwitch
{ {
int64_t time; int64_t time;
@ -729,6 +737,7 @@ struct QueueItem
QueueCrashReport crashReport; QueueCrashReport crashReport;
QueueCrashReportThread crashReportThread; QueueCrashReportThread crashReportThread;
QueueSysTime sysTime; QueueSysTime sysTime;
QueueSysPower sysPower;
QueueContextSwitch contextSwitch; QueueContextSwitch contextSwitch;
QueueThreadWakeup threadWakeup; QueueThreadWakeup threadWakeup;
QueueTidToPid tidToPid; QueueTidToPid tidToPid;
@ -832,6 +841,7 @@ static constexpr size_t QueueDataSize[] = {
sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ), sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ),
sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ), sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ),
sizeof( QueueHeader ) + sizeof( QueueSysTime ), sizeof( QueueHeader ) + sizeof( QueueSysTime ),
sizeof( QueueHeader ) + sizeof( QueueSysPower ),
sizeof( QueueHeader ) + sizeof( QueueTidToPid ), sizeof( QueueHeader ) + sizeof( QueueTidToPid ),
sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cpu cycle sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cpu cycle
sizeof( QueueHeader ) + sizeof( QueueHwSample ), // instruction retired sizeof( QueueHeader ) + sizeof( QueueHwSample ), // instruction retired

View file

@ -353,7 +353,7 @@ int Socket::Recv( void* _buf, int len, int timeout )
} }
} }
int Socket::ReadUpTo( void* _buf, int len, int timeout ) int Socket::ReadUpTo( void* _buf, int len )
{ {
const auto sock = m_sock.load( std::memory_order_relaxed ); const auto sock = m_sock.load( std::memory_order_relaxed );
auto buf = (char*)_buf; auto buf = (char*)_buf;
@ -678,10 +678,10 @@ bool UdpListen::Listen( uint16_t port )
#endif #endif
#if defined _WIN32 #if defined _WIN32
unsigned long reuse = 1; unsigned long reuse = 1;
setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) ); setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) );
#else #else
int reuse = 1; int reuse = 1;
setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) ); setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) );
#endif #endif
#if defined _WIN32 #if defined _WIN32
unsigned long broadcast = 1; unsigned long broadcast = 1;

View file

@ -29,7 +29,7 @@ public:
int Send( const void* buf, int len ); int Send( const void* buf, int len );
int GetSendBufSize(); int GetSendBufSize();
int ReadUpTo( void* buf, int len, int timeout ); int ReadUpTo( void* buf, int len );
bool Read( void* buf, int len, int timeout ); bool Read( void* buf, int len, int timeout );
template<typename ShouldExit> template<typename ShouldExit>

View file

@ -213,21 +213,24 @@ TRACY_API const char* GetThreadName( uint32_t id )
# else # else
static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" );
# endif # endif
if( _GetThreadDescription ) if( _GetThreadDescription )
{ {
auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id ); auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id );
if( hnd != 0 ) if( hnd != 0 )
{ {
PWSTR tmp; PWSTR tmp;
_GetThreadDescription( hnd, &tmp ); if( SUCCEEDED( _GetThreadDescription( hnd, &tmp ) ) )
auto ret = wcstombs( buf, tmp, 256 ); {
CloseHandle( hnd ); auto ret = wcstombs( buf, tmp, 256 );
if( ret != 0 ) CloseHandle( hnd );
{ LocalFree( tmp );
return buf; if( ret != static_cast<size_t>( -1 ) )
} {
} return buf;
} }
}
}
}
#elif defined __linux__ #elif defined __linux__
int cs, fd; int cs, fd;
char path[32]; char path[32];

View file

@ -6,8 +6,8 @@ namespace tracy
namespace Version namespace Version
{ {
enum { Major = 0 }; enum { Major = 0 };
enum { Minor = 9 }; enum { Minor = 10 };
enum { Patch = 1 }; enum { Patch = 0 };
} }
} }

View file

@ -1,4 +1,8 @@
#include <limits.h> #include <limits.h>
#if defined(__linux__) && !defined(__GLIBC__) && !defined(__WORDSIZE)
// include __WORDSIZE headers for musl
# include <bits/reg.h>
#endif
#if __WORDSIZE == 64 #if __WORDSIZE == 64
# define BACKTRACE_ELF_SIZE 64 # define BACKTRACE_ELF_SIZE 64
#else #else

View file

@ -473,7 +473,7 @@ enum attr_val_encoding
/* An address. */ /* An address. */
ATTR_VAL_ADDRESS, ATTR_VAL_ADDRESS,
/* An index into the .debug_addr section, whose value is relative to /* An index into the .debug_addr section, whose value is relative to
* the DW_AT_addr_base attribute of the compilation unit. */ the DW_AT_addr_base attribute of the compilation unit. */
ATTR_VAL_ADDRESS_INDEX, ATTR_VAL_ADDRESS_INDEX,
/* A unsigned integer. */ /* A unsigned integer. */
ATTR_VAL_UINT, ATTR_VAL_UINT,
@ -611,8 +611,8 @@ struct function
struct function_addrs struct function_addrs
{ {
/* Range is LOW <= PC < HIGH. */ /* Range is LOW <= PC < HIGH. */
uint64_t low; uintptr_t low;
uint64_t high; uintptr_t high;
/* Function for this address range. */ /* Function for this address range. */
struct function *function; struct function *function;
}; };
@ -693,8 +693,8 @@ struct unit
struct unit_addrs struct unit_addrs
{ {
/* Range is LOW <= PC < HIGH. */ /* Range is LOW <= PC < HIGH. */
uint64_t low; uintptr_t low;
uint64_t high; uintptr_t high;
/* Compilation unit for this address range. */ /* Compilation unit for this address range. */
struct unit *u; struct unit *u;
}; };
@ -1431,7 +1431,7 @@ resolve_addr_index (const struct dwarf_sections *dwarf_sections,
uint64_t addr_base, int addrsize, int is_bigendian, uint64_t addr_base, int addrsize, int is_bigendian,
uint64_t addr_index, uint64_t addr_index,
backtrace_error_callback error_callback, void *data, backtrace_error_callback error_callback, void *data,
uint64_t *address) uintptr_t *address)
{ {
uint64_t offset; uint64_t offset;
struct dwarf_buf addr_buf; struct dwarf_buf addr_buf;
@ -1452,7 +1452,7 @@ resolve_addr_index (const struct dwarf_sections *dwarf_sections,
addr_buf.data = data; addr_buf.data = data;
addr_buf.reported_underflow = 0; addr_buf.reported_underflow = 0;
*address = read_address (&addr_buf, addrsize); *address = (uintptr_t) read_address (&addr_buf, addrsize);
return 1; return 1;
} }
@ -1531,7 +1531,7 @@ function_addrs_search (const void *vkey, const void *ventry)
static int static int
add_unit_addr (struct backtrace_state *state, void *rdata, add_unit_addr (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc, uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback, void *data, backtrace_error_callback error_callback, void *data,
void *pvec) void *pvec)
{ {
@ -1867,10 +1867,10 @@ lookup_abbrev (struct abbrevs *abbrevs, uint64_t code,
lowpc/highpc is set or ranges is set. */ lowpc/highpc is set or ranges is set. */
struct pcrange { struct pcrange {
uint64_t lowpc; /* The low PC value. */ uintptr_t lowpc; /* The low PC value. */
int have_lowpc; /* Whether a low PC value was found. */ int have_lowpc; /* Whether a low PC value was found. */
int lowpc_is_addr_index; /* Whether lowpc is in .debug_addr. */ int lowpc_is_addr_index; /* Whether lowpc is in .debug_addr. */
uint64_t highpc; /* The high PC value. */ uintptr_t highpc; /* The high PC value. */
int have_highpc; /* Whether a high PC value was found. */ int have_highpc; /* Whether a high PC value was found. */
int highpc_is_relative; /* Whether highpc is relative to lowpc. */ int highpc_is_relative; /* Whether highpc is relative to lowpc. */
int highpc_is_addr_index; /* Whether highpc is in .debug_addr. */ int highpc_is_addr_index; /* Whether highpc is in .debug_addr. */
@ -1890,12 +1890,12 @@ update_pcrange (const struct attr* attr, const struct attr_val* val,
case DW_AT_low_pc: case DW_AT_low_pc:
if (val->encoding == ATTR_VAL_ADDRESS) if (val->encoding == ATTR_VAL_ADDRESS)
{ {
pcrange->lowpc = val->u.uint; pcrange->lowpc = (uintptr_t) val->u.uint;
pcrange->have_lowpc = 1; pcrange->have_lowpc = 1;
} }
else if (val->encoding == ATTR_VAL_ADDRESS_INDEX) else if (val->encoding == ATTR_VAL_ADDRESS_INDEX)
{ {
pcrange->lowpc = val->u.uint; pcrange->lowpc = (uintptr_t) val->u.uint;
pcrange->have_lowpc = 1; pcrange->have_lowpc = 1;
pcrange->lowpc_is_addr_index = 1; pcrange->lowpc_is_addr_index = 1;
} }
@ -1904,18 +1904,18 @@ update_pcrange (const struct attr* attr, const struct attr_val* val,
case DW_AT_high_pc: case DW_AT_high_pc:
if (val->encoding == ATTR_VAL_ADDRESS) if (val->encoding == ATTR_VAL_ADDRESS)
{ {
pcrange->highpc = val->u.uint; pcrange->highpc = (uintptr_t) val->u.uint;
pcrange->have_highpc = 1; pcrange->have_highpc = 1;
} }
else if (val->encoding == ATTR_VAL_UINT) else if (val->encoding == ATTR_VAL_UINT)
{ {
pcrange->highpc = val->u.uint; pcrange->highpc = (uintptr_t) val->u.uint;
pcrange->have_highpc = 1; pcrange->have_highpc = 1;
pcrange->highpc_is_relative = 1; pcrange->highpc_is_relative = 1;
} }
else if (val->encoding == ATTR_VAL_ADDRESS_INDEX) else if (val->encoding == ATTR_VAL_ADDRESS_INDEX)
{ {
pcrange->highpc = val->u.uint; pcrange->highpc = (uintptr_t) val->u.uint;
pcrange->have_highpc = 1; pcrange->have_highpc = 1;
pcrange->highpc_is_addr_index = 1; pcrange->highpc_is_addr_index = 1;
} }
@ -1950,16 +1950,16 @@ add_low_high_range (struct backtrace_state *state,
uintptr_t base_address, int is_bigendian, uintptr_t base_address, int is_bigendian,
struct unit *u, const struct pcrange *pcrange, struct unit *u, const struct pcrange *pcrange,
int (*add_range) (struct backtrace_state *state, int (*add_range) (struct backtrace_state *state,
void *rdata, uint64_t lowpc, void *rdata, uintptr_t lowpc,
uint64_t highpc, uintptr_t highpc,
backtrace_error_callback error_callback, backtrace_error_callback error_callback,
void *data, void *vec), void *data, void *vec),
void *rdata, void *rdata,
backtrace_error_callback error_callback, void *data, backtrace_error_callback error_callback, void *data,
void *vec) void *vec)
{ {
uint64_t lowpc; uintptr_t lowpc;
uint64_t highpc; uintptr_t highpc;
lowpc = pcrange->lowpc; lowpc = pcrange->lowpc;
if (pcrange->lowpc_is_addr_index) if (pcrange->lowpc_is_addr_index)
@ -1997,10 +1997,10 @@ add_ranges_from_ranges (
struct backtrace_state *state, struct backtrace_state *state,
const struct dwarf_sections *dwarf_sections, const struct dwarf_sections *dwarf_sections,
uintptr_t base_address, int is_bigendian, uintptr_t base_address, int is_bigendian,
struct unit *u, uint64_t base, struct unit *u, uintptr_t base,
const struct pcrange *pcrange, const struct pcrange *pcrange,
int (*add_range) (struct backtrace_state *state, void *rdata, int (*add_range) (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc, uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback, void *data, backtrace_error_callback error_callback, void *data,
void *vec), void *vec),
void *rdata, void *rdata,
@ -2039,12 +2039,12 @@ add_ranges_from_ranges (
break; break;
if (is_highest_address (low, u->addrsize)) if (is_highest_address (low, u->addrsize))
base = high; base = (uintptr_t) high;
else else
{ {
if (!add_range (state, rdata, if (!add_range (state, rdata,
low + base + base_address, (uintptr_t) low + base + base_address,
high + base + base_address, (uintptr_t) high + base + base_address,
error_callback, data, vec)) error_callback, data, vec))
return 0; return 0;
} }
@ -2064,10 +2064,10 @@ add_ranges_from_rnglists (
struct backtrace_state *state, struct backtrace_state *state,
const struct dwarf_sections *dwarf_sections, const struct dwarf_sections *dwarf_sections,
uintptr_t base_address, int is_bigendian, uintptr_t base_address, int is_bigendian,
struct unit *u, uint64_t base, struct unit *u, uintptr_t base,
const struct pcrange *pcrange, const struct pcrange *pcrange,
int (*add_range) (struct backtrace_state *state, void *rdata, int (*add_range) (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc, uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback, void *data, backtrace_error_callback error_callback, void *data,
void *vec), void *vec),
void *rdata, void *rdata,
@ -2133,8 +2133,8 @@ add_ranges_from_rnglists (
case DW_RLE_startx_endx: case DW_RLE_startx_endx:
{ {
uint64_t index; uint64_t index;
uint64_t low; uintptr_t low;
uint64_t high; uintptr_t high;
index = read_uleb128 (&rnglists_buf); index = read_uleb128 (&rnglists_buf);
if (!resolve_addr_index (dwarf_sections, u->addr_base, if (!resolve_addr_index (dwarf_sections, u->addr_base,
@ -2156,8 +2156,8 @@ add_ranges_from_rnglists (
case DW_RLE_startx_length: case DW_RLE_startx_length:
{ {
uint64_t index; uint64_t index;
uint64_t low; uintptr_t low;
uint64_t length; uintptr_t length;
index = read_uleb128 (&rnglists_buf); index = read_uleb128 (&rnglists_buf);
if (!resolve_addr_index (dwarf_sections, u->addr_base, if (!resolve_addr_index (dwarf_sections, u->addr_base,
@ -2187,16 +2187,16 @@ add_ranges_from_rnglists (
break; break;
case DW_RLE_base_address: case DW_RLE_base_address:
base = read_address (&rnglists_buf, u->addrsize); base = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
break; break;
case DW_RLE_start_end: case DW_RLE_start_end:
{ {
uint64_t low; uintptr_t low;
uint64_t high; uintptr_t high;
low = read_address (&rnglists_buf, u->addrsize); low = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
high = read_address (&rnglists_buf, u->addrsize); high = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
if (!add_range (state, rdata, low + base_address, if (!add_range (state, rdata, low + base_address,
high + base_address, error_callback, data, high + base_address, error_callback, data,
vec)) vec))
@ -2206,11 +2206,11 @@ add_ranges_from_rnglists (
case DW_RLE_start_length: case DW_RLE_start_length:
{ {
uint64_t low; uintptr_t low;
uint64_t length; uintptr_t length;
low = read_address (&rnglists_buf, u->addrsize); low = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
length = read_uleb128 (&rnglists_buf); length = (uintptr_t) read_uleb128 (&rnglists_buf);
low += base_address; low += base_address;
if (!add_range (state, rdata, low, low + length, if (!add_range (state, rdata, low, low + length,
error_callback, data, vec)) error_callback, data, vec))
@ -2240,9 +2240,9 @@ static int
add_ranges (struct backtrace_state *state, add_ranges (struct backtrace_state *state,
const struct dwarf_sections *dwarf_sections, const struct dwarf_sections *dwarf_sections,
uintptr_t base_address, int is_bigendian, uintptr_t base_address, int is_bigendian,
struct unit *u, uint64_t base, const struct pcrange *pcrange, struct unit *u, uintptr_t base, const struct pcrange *pcrange,
int (*add_range) (struct backtrace_state *state, void *rdata, int (*add_range) (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc, uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback, backtrace_error_callback error_callback,
void *data, void *vec), void *data, void *vec),
void *rdata, void *rdata,
@ -3520,7 +3520,7 @@ read_referenced_name (struct dwarf_data *ddata, struct unit *u,
static int static int
add_function_range (struct backtrace_state *state, void *rdata, add_function_range (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc, uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback, void *data, backtrace_error_callback error_callback, void *data,
void *pvec) void *pvec)
{ {
@ -3560,7 +3560,7 @@ add_function_range (struct backtrace_state *state, void *rdata,
static int static int
read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata, read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata,
struct unit *u, uint64_t base, struct dwarf_buf *unit_buf, struct unit *u, uintptr_t base, struct dwarf_buf *unit_buf,
const struct line_header *lhdr, const struct line_header *lhdr,
backtrace_error_callback error_callback, void *data, backtrace_error_callback error_callback, void *data,
struct function_vector *vec_function, struct function_vector *vec_function,
@ -3624,7 +3624,7 @@ read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata,
&& abbrev->attrs[i].name == DW_AT_low_pc) && abbrev->attrs[i].name == DW_AT_low_pc)
{ {
if (val.encoding == ATTR_VAL_ADDRESS) if (val.encoding == ATTR_VAL_ADDRESS)
base = val.u.uint; base = (uintptr_t) val.u.uint;
else if (val.encoding == ATTR_VAL_ADDRESS_INDEX) else if (val.encoding == ATTR_VAL_ADDRESS_INDEX)
{ {
if (!resolve_addr_index (&ddata->dwarf_sections, if (!resolve_addr_index (&ddata->dwarf_sections,

View file

@ -2823,18 +2823,18 @@ elf_zstd_read_fse (const unsigned char **ppin, const unsigned char *pinend,
while ((val & 0xfff) == 0xfff) while ((val & 0xfff) == 0xfff)
{ {
zidx += 3 * 6; zidx += 3 * 6;
if (!elf_fetch_bits (&pin, pinend, &val, &bits))
return 0;
val >>= 12; val >>= 12;
bits -= 12; bits -= 12;
if (!elf_fetch_bits (&pin, pinend, &val, &bits))
return 0;
} }
while ((val & 3) == 3) while ((val & 3) == 3)
{ {
zidx += 3; zidx += 3;
if (!elf_fetch_bits (&pin, pinend, &val, &bits))
return 0;
val >>= 2; val >>= 2;
bits -= 2; bits -= 2;
if (!elf_fetch_bits (&pin, pinend, &val, &bits))
return 0;
} }
/* We have at least 13 bits here, don't need to fetch. */ /* We have at least 13 bits here, don't need to fetch. */
zidx += val & 3; zidx += val & 3;
@ -2964,7 +2964,7 @@ elf_zstd_build_fse (const int16_t *norm, int idx, uint16_t *next,
pos = (pos + step) & mask; pos = (pos + step) & mask;
} }
} }
if (pos != 0) if (unlikely (pos != 0))
{ {
elf_uncompress_failed (); elf_uncompress_failed ();
return 0; return 0;
@ -3440,17 +3440,17 @@ static const struct elf_zstd_fse_baseline_entry elf_zstd_match_table[64] =
static const struct elf_zstd_fse_baseline_entry elf_zstd_offset_table[32] = static const struct elf_zstd_fse_baseline_entry elf_zstd_offset_table[32] =
{ {
{ 1, 0, 5, 0 }, { 64, 6, 4, 0 }, { 512, 9, 5, 0 }, { 1, 0, 5, 0 }, { 61, 6, 4, 0 }, { 509, 9, 5, 0 },
{ 32768, 15, 5, 0 }, { 2097152, 21, 5, 0 }, { 8, 3, 5, 0 }, { 32765, 15, 5, 0 }, { 2097149, 21, 5, 0 }, { 5, 3, 5, 0 },
{ 128, 7, 4, 0 }, { 4096, 12, 5, 0 }, { 262144, 18, 5, 0 }, { 125, 7, 4, 0 }, { 4093, 12, 5, 0 }, { 262141, 18, 5, 0 },
{ 8388608, 23, 5, 0 }, { 32, 5, 5, 0 }, { 256, 8, 4, 0 }, { 8388605, 23, 5, 0 }, { 29, 5, 5, 0 }, { 253, 8, 4, 0 },
{ 16384, 14, 5, 0 }, { 1048576, 20, 5, 0 }, { 4, 2, 5, 0 }, { 16381, 14, 5, 0 }, { 1048573, 20, 5, 0 }, { 1, 2, 5, 0 },
{ 128, 7, 4, 16 }, { 2048, 11, 5, 0 }, { 131072, 17, 5, 0 }, { 125, 7, 4, 16 }, { 2045, 11, 5, 0 }, { 131069, 17, 5, 0 },
{ 4194304, 22, 5, 0 }, { 16, 4, 5, 0 }, { 256, 8, 4, 16 }, { 4194301, 22, 5, 0 }, { 13, 4, 5, 0 }, { 253, 8, 4, 16 },
{ 8192, 13, 5, 0 }, { 524288, 19, 5, 0 }, { 2, 1, 5, 0 }, { 8189, 13, 5, 0 }, { 524285, 19, 5, 0 }, { 2, 1, 5, 0 },
{ 64, 6, 4, 16 }, { 1024, 10, 5, 0 }, { 65536, 16, 5, 0 }, { 61, 6, 4, 16 }, { 1021, 10, 5, 0 }, { 65533, 16, 5, 0 },
{ 268435456, 28, 5, 0 }, { 134217728, 27, 5, 0 }, { 67108864, 26, 5, 0 }, { 268435453, 28, 5, 0 }, { 134217725, 27, 5, 0 }, { 67108861, 26, 5, 0 },
{ 33554432, 25, 5, 0 }, { 16777216, 24, 5, 0 }, { 33554429, 25, 5, 0 }, { 16777213, 24, 5, 0 },
}; };
/* Read a zstd Huffman table and build the decoding table in *TABLE, reading /* Read a zstd Huffman table and build the decoding table in *TABLE, reading
@ -3635,7 +3635,7 @@ elf_zstd_read_huff (const unsigned char **ppin, const unsigned char *pinend,
} }
weight_mark = (uint32_t *) (weights + 256); weight_mark = (uint32_t *) (weights + 256);
memset (weight_mark, 0, 12 * sizeof (uint32_t)); memset (weight_mark, 0, 13 * sizeof (uint32_t));
weight_mask = 0; weight_mask = 0;
for (i = 0; i < count; ++i) for (i = 0; i < count; ++i)
{ {
@ -3702,7 +3702,7 @@ elf_zstd_read_huff (const unsigned char **ppin, const unsigned char *pinend,
/* Change WEIGHT_MARK from a count of weights to the index of the first /* Change WEIGHT_MARK from a count of weights to the index of the first
symbol for that weight. We shift the indexes to also store how many we symbol for that weight. We shift the indexes to also store how many we
hae seen so far, below. */ have seen so far, below. */
{ {
uint32_t next; uint32_t next;
@ -3783,7 +3783,7 @@ elf_zstd_read_literals (const unsigned char **ppin,
{ {
int raw; int raw;
/* Raw_literals_Block or RLE_Literals_Block */ /* Raw_Literals_Block or RLE_Literals_Block */
raw = (hdr & 3) == 0; raw = (hdr & 3) == 0;
@ -3965,7 +3965,7 @@ elf_zstd_read_literals (const unsigned char **ppin,
unsigned int bits; unsigned int bits;
uint32_t i; uint32_t i;
pback = pin + compressed_size - 1; pback = pin + total_streams_size - 1;
pbackend = pin; pbackend = pin;
if (!elf_fetch_backward_init (&pback, pbackend, &val, &bits)) if (!elf_fetch_backward_init (&pback, pbackend, &val, &bits))
return 0; return 0;

View file

@ -109,6 +109,7 @@
#define TracyParameterRegister(x,y) #define TracyParameterRegister(x,y)
#define TracyParameterSetup(x,y,z,w) #define TracyParameterSetup(x,y,z,w)
#define TracyIsConnected false #define TracyIsConnected false
#define TracySetProgramName(x)
#define TracyFiberEnter(x) #define TracyFiberEnter(x)
#define TracyFiberLeave #define TracyFiberLeave
@ -270,6 +271,7 @@
#define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data ) #define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data )
#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ) #define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val )
#define TracyIsConnected tracy::GetProfiler().IsConnected() #define TracyIsConnected tracy::GetProfiler().IsConnected()
#define TracySetProgramName( name ) tracy::GetProfiler().SetProgramName( name );
#ifdef TRACY_FIBERS #ifdef TRACY_FIBERS
# define TracyFiberEnter( fiber ) tracy::Profiler::EnterFiber( fiber ) # define TracyFiberEnter( fiber ) tracy::Profiler::EnterFiber( fiber )

View file

@ -11,6 +11,14 @@
extern "C" { extern "C" {
#endif #endif
enum TracyPlotFormatEnum
{
TracyPlotFormatNumber,
TracyPlotFormatMemory,
TracyPlotFormatPercentage,
TracyPlotFormatWatt
};
TRACY_API void ___tracy_set_thread_name( const char* name ); TRACY_API void ___tracy_set_thread_name( const char* name );
#define TracyCSetThreadName( name ) ___tracy_set_thread_name( name ); #define TracyCSetThreadName( name ) ___tracy_set_thread_name( name );
@ -60,6 +68,8 @@ typedef const void* TracyCZoneCtx;
#define TracyCPlot(x,y) #define TracyCPlot(x,y)
#define TracyCPlotF(x,y) #define TracyCPlotF(x,y)
#define TracyCPlotI(x,y) #define TracyCPlotI(x,y)
#define TracyCPlotConfig(x,y,z,w,a)
#define TracyCMessage(x,y) #define TracyCMessage(x,y)
#define TracyCMessageL(x) #define TracyCMessageL(x)
#define TracyCMessageC(x,y,z) #define TracyCMessageC(x,y,z)
@ -289,11 +299,13 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_
TRACY_API void ___tracy_emit_plot( const char* name, double val ); TRACY_API void ___tracy_emit_plot( const char* name, double val );
TRACY_API void ___tracy_emit_plot_float( const char* name, float val ); TRACY_API void ___tracy_emit_plot_float( const char* name, float val );
TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ); TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val );
TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color );
TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ); TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size );
#define TracyCPlot( name, val ) ___tracy_emit_plot( name, val ); #define TracyCPlot( name, val ) ___tracy_emit_plot( name, val );
#define TracyCPlotF( name, val ) ___tracy_emit_plot_float( name, val ); #define TracyCPlotF( name, val ) ___tracy_emit_plot_float( name, val );
#define TracyCPlotI( name, val ) ___tracy_emit_plot_int( name, val ); #define TracyCPlotI( name, val ) ___tracy_emit_plot_int( name, val );
#define TracyCPlotConfig( name, type, step, fill, color ) ___tracy_emit_plot_config( name, type, step, fill, color );
#define TracyCAppInfo( txt, size ) ___tracy_emit_message_appinfo( txt, size ); #define TracyCAppInfo( txt, size ) ___tracy_emit_message_appinfo( txt, size );

View file

@ -13,13 +13,13 @@
#define TracyD3D11ZoneC(ctx, name, color) #define TracyD3D11ZoneC(ctx, name, color)
#define TracyD3D11NamedZone(ctx, varname, name, active) #define TracyD3D11NamedZone(ctx, varname, name, active)
#define TracyD3D11NamedZoneC(ctx, varname, name, color, active) #define TracyD3D11NamedZoneC(ctx, varname, name, color, active)
#define TracyD3D12ZoneTransient(ctx, varname, name, active) #define TracyD3D11ZoneTransient(ctx, varname, name, active)
#define TracyD3D11ZoneS(ctx, name, depth) #define TracyD3D11ZoneS(ctx, name, depth)
#define TracyD3D11ZoneCS(ctx, name, color, depth) #define TracyD3D11ZoneCS(ctx, name, color, depth)
#define TracyD3D11NamedZoneS(ctx, varname, name, depth, active) #define TracyD3D11NamedZoneS(ctx, varname, name, depth, active)
#define TracyD3D11NamedZoneCS(ctx, varname, name, color, depth, active) #define TracyD3D11NamedZoneCS(ctx, varname, name, color, depth, active)
#define TracyD3D12ZoneTransientS(ctx, varname, name, depth, active) #define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active)
#define TracyD3D11Collect(ctx) #define TracyD3D11Collect(ctx)
@ -39,11 +39,12 @@ using TracyD3D11Ctx = void*;
#include "Tracy.hpp" #include "Tracy.hpp"
#include "../client/TracyProfiler.hpp" #include "../client/TracyProfiler.hpp"
#include "../client/TracyCallstack.hpp" #include "../client/TracyCallstack.hpp"
#include "../common/TracyAlign.hpp" #include "../common/TracyYield.hpp"
#include "../common/TracyAlloc.hpp"
#include <d3d11.h> #include <d3d11.h>
#define TracyD3D11Panic(msg, ...) do { assert(false && "TracyD3D11: " msg); TracyMessageLC("TracyD3D11: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false);
namespace tracy namespace tracy
{ {
@ -51,71 +52,83 @@ class D3D11Ctx
{ {
friend class D3D11ZoneScope; friend class D3D11ZoneScope;
enum { QueryCount = 64 * 1024 }; static constexpr uint32_t MaxQueries = 64 * 1024;
enum CollectMode { POLL, BLOCK };
public: public:
D3D11Ctx( ID3D11Device* device, ID3D11DeviceContext* devicectx ) D3D11Ctx( ID3D11Device* device, ID3D11DeviceContext* devicectx )
: m_device( device )
, m_devicectx( devicectx )
, m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) )
, m_head( 0 )
, m_tail( 0 )
{ {
assert( m_context != 255 ); // TODO: consider calling ID3D11Device::GetImmediateContext() instead of passing it as an argument
m_device = device;
device->AddRef();
m_immediateDevCtx = devicectx;
devicectx->AddRef();
for (int i = 0; i < QueryCount; i++)
{ {
HRESULT hr = S_OK; D3D11_QUERY_DESC desc = { };
D3D11_QUERY_DESC desc;
desc.MiscFlags = 0;
desc.Query = D3D11_QUERY_TIMESTAMP;
hr |= device->CreateQuery(&desc, &m_queries[i]);
desc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT; desc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
hr |= device->CreateQuery(&desc, &m_disjoints[i]); if (FAILED(m_device->CreateQuery(&desc, &m_disjointQuery)))
{
m_disjointMap[i] = nullptr; TracyD3D11Panic("unable to create disjoint timestamp query.", return);
}
assert(SUCCEEDED(hr));
} }
// Force query the initial GPU timestamp (pipeline stall) for (ID3D11Query*& query : m_queries)
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint; {
UINT64 timestamp; D3D11_QUERY_DESC desc = { };
desc.Query = D3D11_QUERY_TIMESTAMP;
if (FAILED(m_device->CreateQuery(&desc, &query)))
{
TracyD3D11Panic("unable to create timestamp query.", return);
}
}
// Calibrate CPU and GPU timestamps
int64_t tcpu = 0;
int64_t tgpu = 0;
for (int attempts = 0; attempts < 50; attempts++) for (int attempts = 0; attempts < 50; attempts++)
{ {
devicectx->Begin(m_disjoints[0]); m_immediateDevCtx->Begin(m_disjointQuery);
devicectx->End(m_queries[0]); m_immediateDevCtx->End(m_queries[0]);
devicectx->End(m_disjoints[0]); m_immediateDevCtx->End(m_disjointQuery);
devicectx->Flush();
while (devicectx->GetData(m_disjoints[0], &disjoint, sizeof(disjoint), 0) == S_FALSE) int64_t tcpu0 = Profiler::GetTime();
/* Nothing */; WaitForQuery(m_disjointQuery);
int64_t tcpu1 = Profiler::GetTime();
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { };
if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), 0) != S_OK)
{
TracyMessageLC("TracyD3D11: unable to query GPU timestamp; retrying...", tracy::Color::Tomato);
continue;
}
if (disjoint.Disjoint) if (disjoint.Disjoint)
continue; continue;
while (devicectx->GetData(m_queries[0], &timestamp, sizeof(timestamp), 0) == S_FALSE) UINT64 timestamp = 0;
/* Nothing */; if (m_immediateDevCtx->GetData(m_queries[0], &timestamp, sizeof(timestamp), 0) != S_OK)
continue; // this should never happen, since the enclosing disjoint query succeeded
tcpu = tcpu0 + (tcpu1 - tcpu0) * 1 / 2;
tgpu = timestamp * (1000000000 / disjoint.Frequency);
break; break;
} }
int64_t tgpu = timestamp * (1000000000ull / disjoint.Frequency); // ready to roll
int64_t tcpu = Profiler::GetTime(); m_contextId = GetGpuCtxCounter().fetch_add(1);
m_immediateDevCtx->Begin(m_disjointQuery);
m_previousCheckpoint = m_nextCheckpoint = 0;
uint8_t flags = 0;
const float period = 1.f;
auto* item = Profiler::QueueSerial(); auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuNewContext ); MemWrite( &item->hdr.type, QueueType::GpuNewContext );
MemWrite( &item->gpuNewContext.cpuTime, tcpu ); MemWrite( &item->gpuNewContext.cpuTime, tcpu );
MemWrite( &item->gpuNewContext.gpuTime, tgpu ); MemWrite( &item->gpuNewContext.gpuTime, tgpu );
memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); MemWrite( &item->gpuNewContext.thread, uint32_t(0) ); // #TODO: why not GetThreadHandle()?
MemWrite( &item->gpuNewContext.period, period ); MemWrite( &item->gpuNewContext.period, 1.0f );
MemWrite( &item->gpuNewContext.context, m_context ); MemWrite( &item->gpuNewContext.context, m_contextId);
MemWrite( &item->gpuNewContext.flags, flags ); MemWrite( &item->gpuNewContext.flags, uint8_t(0) );
MemWrite( &item->gpuNewContext.type, GpuContextType::Direct3D11 ); MemWrite( &item->gpuNewContext.type, GpuContextType::Direct3D11 );
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
@ -127,12 +140,20 @@ public:
~D3D11Ctx() ~D3D11Ctx()
{ {
for (int i = 0; i < QueryCount; i++) // collect all pending timestamps before destroying everything
do
{ {
m_queries[i]->Release(); Collect(BLOCK);
m_disjoints[i]->Release(); } while (m_previousCheckpoint != m_queryCounter);
m_disjointMap[i] = nullptr;
for (ID3D11Query* query : m_queries)
{
query->Release();
} }
m_immediateDevCtx->End(m_disjointQuery);
m_disjointQuery->Release();
m_immediateDevCtx->Release();
m_device->Release();
} }
void Name( const char* name, uint16_t len ) void Name( const char* name, uint16_t len )
@ -142,7 +163,7 @@ public:
auto item = Profiler::QueueSerial(); auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuContextName ); MemWrite( &item->hdr.type, QueueType::GpuContextName );
MemWrite( &item->gpuContextNameFat.context, m_context ); MemWrite( &item->gpuContextNameFat.context, m_contextId );
MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
MemWrite( &item->gpuContextNameFat.size, len ); MemWrite( &item->gpuContextNameFat.size, len );
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
@ -151,217 +172,170 @@ public:
Profiler::QueueSerialFinish(); Profiler::QueueSerialFinish();
} }
void Collect() void Collect(CollectMode mode = POLL)
{ {
ZoneScopedC( Color::Red4 ); ZoneScopedC( Color::Red4 );
if( m_tail == m_head ) return;
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() ) if( !GetProfiler().IsConnected() )
{ {
m_head = m_tail = 0; m_previousCheckpoint = m_nextCheckpoint = m_queryCounter;
return; return;
} }
#endif #endif
auto start = m_tail; if (m_previousCheckpoint == m_nextCheckpoint)
auto end = m_head + QueryCount;
auto cnt = (end - start) % QueryCount;
while (cnt > 1)
{ {
auto mid = start + cnt / 2; uintptr_t nextCheckpoint = m_queryCounter;
if (nextCheckpoint == m_nextCheckpoint)
bool available =
m_devicectx->GetData(m_disjointMap[mid % QueryCount], nullptr, 0, D3D11_ASYNC_GETDATA_DONOTFLUSH) == S_OK &&
m_devicectx->GetData(m_queries[mid % QueryCount], nullptr, 0, D3D11_ASYNC_GETDATA_DONOTFLUSH) == S_OK;
if (available)
{ {
start = mid; return;
} }
else m_nextCheckpoint = nextCheckpoint;
{ m_immediateDevCtx->End(m_disjointQuery);
end = mid;
}
cnt = (end - start) % QueryCount;
} }
start %= QueryCount; if (mode == CollectMode::BLOCK)
while (m_tail != start)
{ {
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint; WaitForQuery(m_disjointQuery);
UINT64 time; }
m_devicectx->GetData(m_disjointMap[m_tail], &disjoint, sizeof(disjoint), 0); D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { };
m_devicectx->GetData(m_queries[m_tail], &time, sizeof(time), 0); if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), D3D11_ASYNC_GETDATA_DONOTFLUSH) != S_OK)
{
return;
}
time *= (1000000000ull / disjoint.Frequency); if (disjoint.Disjoint == TRUE)
{
m_previousCheckpoint = m_nextCheckpoint;
TracyD3D11Panic("disjoint timestamps detected; dropping.");
return;
}
auto begin = m_previousCheckpoint;
auto end = m_nextCheckpoint;
for (auto i = begin; i != end; ++i)
{
uint32_t k = RingIndex(i);
UINT64 timestamp = 0;
if (m_immediateDevCtx->GetData(m_queries[k], &timestamp, sizeof(timestamp), 0) != S_OK)
{
TracyD3D11Panic("timestamp expected to be ready, but it was not!");
break;
}
timestamp *= (1000000000ull / disjoint.Frequency);
auto* item = Profiler::QueueSerial(); auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuTime); MemWrite(&item->hdr.type, QueueType::GpuTime);
MemWrite(&item->gpuTime.gpuTime, (int64_t)time); MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(timestamp));
MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail); MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(k));
MemWrite(&item->gpuTime.context, m_context); MemWrite(&item->gpuTime.context, m_contextId);
Profiler::QueueSerialFinish(); Profiler::QueueSerialFinish();
m_tail = (m_tail + 1) % QueryCount;
} }
// disjoint timestamp queries should only be invoked once per frame or less
// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_query
m_immediateDevCtx->Begin(m_disjointQuery);
m_previousCheckpoint = m_nextCheckpoint;
} }
private: private:
tracy_force_inline unsigned int NextQueryId() tracy_force_inline uint32_t RingIndex(uintptr_t index)
{ {
const auto id = m_head; index %= MaxQueries;
m_head = ( m_head + 1 ) % QueryCount; return static_cast<uint32_t>(index);
assert( m_head != m_tail );
return id;
} }
tracy_force_inline ID3D11Query* TranslateQueryId( unsigned int id ) tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end)
{
// wrap-around safe: all unsigned
uintptr_t count = end - begin;
return static_cast<uint32_t>(count);
}
tracy_force_inline uint32_t NextQueryId()
{
auto id = m_queryCounter++;
if (RingCount(m_previousCheckpoint, id) >= MaxQueries)
{
TracyD3D11Panic("too many pending timestamp queries.");
// #TODO: return some sentinel value; ideally a "hidden" query index
}
return RingIndex(id);
}
tracy_force_inline ID3D11Query* GetQueryObjectFromId(uint32_t id)
{ {
return m_queries[id]; return m_queries[id];
} }
tracy_force_inline ID3D11Query* MapDisjointQueryId( unsigned int id, unsigned int disjointId ) tracy_force_inline void WaitForQuery(ID3D11Query* query)
{ {
m_disjointMap[id] = m_disjoints[disjointId]; m_immediateDevCtx->Flush();
return m_disjoints[disjointId]; while (m_immediateDevCtx->GetData(query, nullptr, 0, 0) != S_OK)
YieldThread(); // busy-wait :-( attempt to reduce power usage with _mm_pause() & friends...
} }
tracy_force_inline uint8_t GetId() const tracy_force_inline uint8_t GetContextId() const
{ {
return m_context; return m_contextId;
} }
ID3D11Device* m_device; ID3D11Device* m_device = nullptr;
ID3D11DeviceContext* m_devicectx; ID3D11DeviceContext* m_immediateDevCtx = nullptr;
ID3D11Query* m_queries[QueryCount]; ID3D11Query* m_queries[MaxQueries];
ID3D11Query* m_disjoints[QueryCount]; ID3D11Query* m_disjointQuery = nullptr;
ID3D11Query* m_disjointMap[QueryCount]; // Multiple time queries can have one disjoint query
uint8_t m_context;
unsigned int m_head; uint8_t m_contextId = 255; // NOTE: apparently, 255 means invalid id; is this documented anywhere?
unsigned int m_tail;
uintptr_t m_queryCounter = 0;
uintptr_t m_previousCheckpoint = 0;
uintptr_t m_nextCheckpoint = 0;
}; };
class D3D11ZoneScope class D3D11ZoneScope
{ {
public: public:
tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool is_active ) tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool active )
#ifdef TRACY_ON_DEMAND : D3D11ZoneScope(ctx, active)
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{ {
if( !m_active ) return; if( !m_active ) return;
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
m_disjointId = queryId;
auto* item = Profiler::QueueSerial(); auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast<uint64_t>(srcloc));
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetId() );
Profiler::QueueSerialFinish();
} }
tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool is_active ) tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool active )
#ifdef TRACY_ON_DEMAND : D3D11ZoneScope(ctx, active)
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
{ {
if( !m_active ) return; if( !m_active ) return;
m_ctx = ctx;
const auto queryId = ctx->NextQueryId(); auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId)); WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast<uint64_t>(srcloc));
ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
m_disjointId = queryId;
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial );
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetId() );
Profiler::QueueSerialFinish();
GetProfiler().SendCallstack( depth );
} }
tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool active) tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool active)
#ifdef TRACY_ON_DEMAND : D3D11ZoneScope(ctx, active)
: m_active(active&& GetProfiler().IsConnected())
#else
: m_active(active)
#endif
{ {
if( !m_active ) return; if( !m_active ) return;
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
m_disjointId = queryId;
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
auto* item = Profiler::QueueSerial(); auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial); WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish();
} }
tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool active) tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool active)
#ifdef TRACY_ON_DEMAND : D3D11ZoneScope(ctx, active)
: m_active(active&& GetProfiler().IsConnected())
#else
: m_active(active)
#endif
{ {
if( !m_active ) return; if( !m_active ) return;
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
m_disjointId = queryId;
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial); WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish();
} }
tracy_force_inline ~D3D11ZoneScope() tracy_force_inline ~D3D11ZoneScope()
@ -369,24 +343,46 @@ public:
if( !m_active ) return; if( !m_active ) return;
const auto queryId = m_ctx->NextQueryId(); const auto queryId = m_ctx->NextQueryId();
m_ctx->m_devicectx->End(m_ctx->TranslateQueryId(queryId)); m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId));
m_ctx->m_devicectx->End(m_ctx->MapDisjointQueryId(queryId, m_disjointId));
auto* item = Profiler::QueueSerial(); auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial );
MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneEnd.context, m_ctx->GetId() ); MemWrite( &item->gpuZoneEnd.context, m_ctx->GetContextId() );
Profiler::QueueSerialFinish(); Profiler::QueueSerialFinish();
} }
private: private:
// Common target of the public (delegating) constructors.
// Decides whether this zone is active and, if so, remembers the owning context.
//   ctx    - context the zone belongs to; stored only for active zones.
//   active - caller's request to record the zone. With TRACY_ON_DEMAND the
//            zone is additionally active only while GetProfiler().IsConnected().
// m_ctx is left unassigned for inactive zones; the destructor returns early
// on !m_active before touching it.
tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, bool active )
#ifdef TRACY_ON_DEMAND
    // Fixed: this initializer used `is_active`, which is not a name in this
    // constructor's scope (the parameter is `active`), breaking on-demand builds.
    : m_active( active && GetProfiler().IsConnected() )
#else
    : m_active( active )
#endif
{
    if( !m_active ) return;
    m_ctx = ctx;
}
// Issues the zone-begin query on the immediate device context and fills in
// the gpuZoneBegin payload of an already-acquired serial queue item.
//   item           - queue slot obtained by the calling constructor
//                    (Profiler::QueueSerial() or Profiler::QueueSerialCallstack()).
//   queueItemType  - written to item->hdr.type (one of the GpuZoneBegin*Serial types
//                    passed by the constructors).
//   sourceLocation - written to gpuZoneBegin.srcloc as-is.
// Finishes by committing the item with Profiler::QueueSerialFinish().
void WriteQueueItem(tracy::QueueItem* item, tracy::QueueType queueItemType, uint64_t sourceLocation)
{
// Reserve a query slot and end that query before the CPU time is sampled below.
// NOTE(review): presumably m_queries holds timestamp queries, for which End() records the timestamp - confirm.
const auto queryId = m_ctx->NextQueryId();
m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId));
MemWrite( &item->hdr.type, queueItemType);
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, sourceLocation );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
// The 32-bit query id is narrowed to the 16-bit wire field.
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, m_ctx->GetContextId() );
Profiler::QueueSerialFinish();
}
const bool m_active; const bool m_active;
D3D11Ctx* m_ctx; D3D11Ctx* m_ctx;
unsigned int m_disjointId;
}; };
static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx ) static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx )
@ -403,38 +399,44 @@ static inline void DestroyD3D11Context( D3D11Ctx* ctx )
} }
} }
#undef TracyD3D11Panic
using TracyD3D11Ctx = tracy::D3D11Ctx*; using TracyD3D11Ctx = tracy::D3D11Ctx*;
#define TracyD3D11Context( device, devicectx ) tracy::CreateD3D11Context( device, devicectx ); #define TracyD3D11Context( device, devicectx ) tracy::CreateD3D11Context( device, devicectx );
#define TracyD3D11Destroy(ctx) tracy::DestroyD3D11Context(ctx); #define TracyD3D11Destroy(ctx) tracy::DestroyD3D11Context(ctx);
#define TracyD3D11ContextName(ctx, name, size) ctx->Name(name, size); #define TracyD3D11ContextName(ctx, name, size) ctx->Name(name, size);
#define TracyD3D11UnnamedZone ___tracy_gpu_d3d11_zone
#define TracyD3D11SrcLocSymbol TracyConcat(__tracy_gpu_d3d11_source_location,TracyLine)
#define TracyD3D11SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D11SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color };
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, ___tracy_gpu_zone, name, TRACY_CALLSTACK, true ) # define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, TRACY_CALLSTACK, true )
# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, ___tracy_gpu_zone, name, color, TRACY_CALLSTACK, true ) # define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, TRACY_CALLSTACK, true )
# define TracyD3D11NamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); # define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active );
# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); # define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active );
# define TracyD3D11ZoneTransient(ctx, varname, name, active) TracyD3D11ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) # define TracyD3D11ZoneTransient(ctx, varname, name, active) TracyD3D11ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active)
#else #else
# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, ___tracy_gpu_zone, name, true ) # define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, TracyD3D11UnnamedZone, name, true )
# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, ___tracy_gpu_zone, name, color, true ) # define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, TracyD3D11UnnamedZone, name, color, true )
# define TracyD3D11NamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); # define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active );
# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); # define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active );
# define TracyD3D11ZoneTransient(ctx, varname, name, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active }; # define TracyD3D11ZoneTransient(ctx, varname, name, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active };
#endif #endif
#ifdef TRACY_HAS_CALLSTACK #ifdef TRACY_HAS_CALLSTACK
# define TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, ___tracy_gpu_zone, name, depth, true ) # define TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, depth, true )
# define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, ___tracy_gpu_zone, name, color, depth, true ) # define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, depth, true )
# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); # define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active );
# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); # define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active );
# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active }; # define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active };
#else #else
# define TracyD3D11ZoneS( ctx, name, depth, active ) TracyD3D11Zone( ctx, name ) # define TracyD3D11ZoneS( ctx, name, depth, active ) TracyD3D11Zone( ctx, name )
# define TracyD3D11ZoneCS( ctx, name, color, depth, active ) TracyD3D11ZoneC( name, color ) # define TracyD3D11ZoneCS( ctx, name, color, depth, active ) TracyD3D11ZoneC( name, color )
# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11NamedZone( ctx, varname, name, active ) # define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11NamedZone( ctx, varname, name, active )
# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11NamedZoneC( ctx, varname, name, color, active ) # define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11NamedZoneC( ctx, varname, name, color, active )
# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) TracyD3D12ZoneTransient(ctx, varname, name, active) # define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) TracyD3D11ZoneTransient(ctx, varname, name, active)
#endif #endif
#define TracyD3D11Collect( ctx ) ctx->Collect(); #define TracyD3D11Collect( ctx ) ctx->Collect();

View file

@ -25,7 +25,7 @@
namespace tracy namespace tracy
{ {
class D3D12ZoneScope {}; class D3D12ZoneScope {};
} }
using TracyD3D12Ctx = void*; using TracyD3D12Ctx = void*;
@ -40,429 +40,419 @@ using TracyD3D12Ctx = void*;
#include <cassert> #include <cassert>
#include <d3d12.h> #include <d3d12.h>
#include <dxgi.h> #include <dxgi.h>
#include <wrl/client.h>
#include <queue> #include <queue>
#define TracyD3D12Panic(msg, ...) do { assert(false && "TracyD3D12: " msg); TracyMessageLC("TracyD3D12: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false);
namespace tracy namespace tracy
{ {
struct D3D12QueryPayload struct D3D12QueryPayload
{ {
uint32_t m_queryIdStart = 0; uint32_t m_queryIdStart = 0;
uint32_t m_queryCount = 0; uint32_t m_queryCount = 0;
}; };
// Command queue context. // Command queue context.
class D3D12QueueCtx class D3D12QueueCtx
{ {
friend class D3D12ZoneScope; friend class D3D12ZoneScope;
static constexpr uint32_t MaxQueries = 64 * 1024; // Queries are begin and end markers, so we can store half as many total time durations. Must be even! ID3D12Device* m_device = nullptr;
ID3D12CommandQueue* m_queue = nullptr;
uint8_t m_contextId = 255; // TODO: apparently, 255 means "invalid id"; is this documented somewhere?
ID3D12QueryHeap* m_queryHeap = nullptr;
ID3D12Resource* m_readbackBuffer = nullptr;
bool m_initialized = false; // In-progress payload.
uint32_t m_queryLimit = 0;
std::atomic<uint32_t> m_queryCounter = 0;
uint32_t m_previousQueryCounter = 0;
ID3D12Device* m_device = nullptr; uint32_t m_activePayload = 0;
ID3D12CommandQueue* m_queue = nullptr; ID3D12Fence* m_payloadFence = nullptr;
uint8_t m_context; std::queue<D3D12QueryPayload> m_payloadQueue;
Microsoft::WRL::ComPtr<ID3D12QueryHeap> m_queryHeap;
Microsoft::WRL::ComPtr<ID3D12Resource> m_readbackBuffer;
// In-progress payload. UINT64 m_prevCalibrationTicksCPU = 0;
uint32_t m_queryLimit = MaxQueries;
std::atomic<uint32_t> m_queryCounter = 0;
uint32_t m_previousQueryCounter = 0;
uint32_t m_activePayload = 0; void RecalibrateClocks()
Microsoft::WRL::ComPtr<ID3D12Fence> m_payloadFence; {
std::queue<D3D12QueryPayload> m_payloadQueue; UINT64 cpuTimestamp;
UINT64 gpuTimestamp;
if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
{
TracyD3D12Panic("failed to obtain queue clock calibration counters.", return);
}
int64_t m_prevCalibration = 0; int64_t cpuDeltaTicks = cpuTimestamp - m_prevCalibrationTicksCPU;
int64_t m_qpcToNs = int64_t{ 1000000000 / GetFrequencyQpc() }; if (cpuDeltaTicks > 0)
{
static const int64_t nanosecodsPerTick = int64_t(1000000000) / GetFrequencyQpc();
int64_t cpuDeltaNS = cpuDeltaTicks * nanosecodsPerTick;
// Save the device cpu timestamp, not the Tracy profiler timestamp:
m_prevCalibrationTicksCPU = cpuTimestamp;
public: cpuTimestamp = Profiler::GetTime();
D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue)
: m_device(device)
, m_queue(queue)
, m_context(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed))
{
// Verify we support timestamp queries on this queue.
if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY) auto* item = Profiler::QueueSerial();
{ MemWrite(&item->hdr.type, QueueType::GpuCalibration);
D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{}; MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp);
MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp);
MemWrite(&item->gpuCalibration.cpuDelta, cpuDeltaNS);
MemWrite(&item->gpuCalibration.context, GetId());
SubmitQueueItem(item);
}
}
bool Success = SUCCEEDED(device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData))); tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item)
assert(Success && featureData.CopyQueueTimestampQueriesSupported && "Platform does not support profiling of copy queues."); {
} #ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem(*item);
#endif
Profiler::QueueSerialFinish();
}
uint64_t timestampFrequency; public:
D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue)
: m_device(device)
, m_queue(queue)
{
// Verify we support timestamp queries on this queue.
if (FAILED(queue->GetTimestampFrequency(&timestampFrequency))) if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY)
{ {
assert(false && "Failed to get timestamp frequency."); D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{};
}
uint64_t cpuTimestamp; HRESULT hr = device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData));
uint64_t gpuTimestamp; if (FAILED(hr) || (featureData.CopyQueueTimestampQueriesSupported == FALSE))
{
TracyD3D12Panic("Platform does not support profiling of copy queues.", return);
}
}
if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) static constexpr uint32_t MaxQueries = 64 * 1024; // Must be even, because queries are (begin, end) pairs
{ m_queryLimit = MaxQueries;
assert(false && "Failed to get queue clock calibration.");
}
// Save the device cpu timestamp, not the profiler's timestamp. D3D12_QUERY_HEAP_DESC heapDesc{};
m_prevCalibration = cpuTimestamp * m_qpcToNs; heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
heapDesc.Count = m_queryLimit;
heapDesc.NodeMask = 0; // #TODO: Support multiple adapters.
cpuTimestamp = Profiler::GetTime(); while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap))))
{
m_queryLimit /= 2;
heapDesc.Count = m_queryLimit;
}
D3D12_QUERY_HEAP_DESC heapDesc{}; // Create a readback buffer, which will be used as a destination for the query data.
heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
heapDesc.Count = m_queryLimit;
heapDesc.NodeMask = 0; // #TODO: Support multiple adapters.
while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap)))) D3D12_RESOURCE_DESC readbackBufferDesc{};
{ readbackBufferDesc.Alignment = 0;
m_queryLimit /= 2; readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
heapDesc.Count = m_queryLimit; readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t);
} readbackBufferDesc.Height = 1;
readbackBufferDesc.DepthOrArraySize = 1;
readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN;
readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major.
readbackBufferDesc.MipLevels = 1;
readbackBufferDesc.SampleDesc.Count = 1;
readbackBufferDesc.SampleDesc.Quality = 0;
readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
// Create a readback buffer, which will be used as a destination for the query data. D3D12_HEAP_PROPERTIES readbackHeapProps{};
readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK;
readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
readbackHeapProps.CreationNodeMask = 0;
readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters.
D3D12_RESOURCE_DESC readbackBufferDesc{}; if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer))))
readbackBufferDesc.Alignment = 0; {
readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; TracyD3D12Panic("Failed to create query readback buffer.", return);
readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t); }
readbackBufferDesc.Height = 1;
readbackBufferDesc.DepthOrArraySize = 1;
readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN;
readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major.
readbackBufferDesc.MipLevels = 1;
readbackBufferDesc.SampleDesc.Count = 1;
readbackBufferDesc.SampleDesc.Quality = 0;
readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
D3D12_HEAP_PROPERTIES readbackHeapProps{}; if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence))))
readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK; {
readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; TracyD3D12Panic("Failed to create payload fence.", return);
readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; }
readbackHeapProps.CreationNodeMask = 0;
readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters.
if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer)))) float period = [queue]()
{ {
assert(false && "Failed to create query readback buffer."); uint64_t timestampFrequency;
} if (FAILED(queue->GetTimestampFrequency(&timestampFrequency)))
{
return 0.0f;
}
return static_cast<float>( 1E+09 / static_cast<double>(timestampFrequency) );
}();
if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence)))) if (period == 0.0f)
{ {
assert(false && "Failed to create payload fence."); TracyD3D12Panic("Failed to get timestamp frequency.", return);
} }
auto* item = Profiler::QueueSerial(); uint64_t cpuTimestamp;
MemWrite(&item->hdr.type, QueueType::GpuNewContext); uint64_t gpuTimestamp;
MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp); if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp); {
memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); TracyD3D12Panic("Failed to get queue clock calibration.", return);
MemWrite(&item->gpuNewContext.period, 1E+09f / static_cast<float>(timestampFrequency)); }
MemWrite(&item->gpuNewContext.context, m_context);
MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); // Save the device cpu timestamp, not the profiler's timestamp.
MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12); m_prevCalibrationTicksCPU = cpuTimestamp;
cpuTimestamp = Profiler::GetTime();
// all checked: ready to roll
m_contextId = GetGpuCtxCounter().fetch_add(1);
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuNewContext);
MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp);
MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp);
MemWrite(&item->gpuNewContext.thread, decltype(item->gpuNewContext.thread)(0)); // #TODO: why 0 instead of GetThreadHandle()?
MemWrite(&item->gpuNewContext.period, period);
MemWrite(&item->gpuNewContext.context, GetId());
MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12);
SubmitQueueItem(item);
}
// Drains outstanding timestamp payloads and releases the D3D12 objects owned
// by this context.
//
// The constructor can bail out early (TracyD3D12Panic(..., return)), leaving
// any of m_queryHeap / m_readbackBuffer / m_payloadFence as nullptr, so each
// pointer is checked before use instead of being dereferenced unconditionally.
~D3D12QueueCtx()
{
    ZoneScopedC(Color::Red4);

    if (m_payloadFence != nullptr)
    {
        // Wait until every signalled payload has completed. '<' rather than
        // '!=' is used because ID3D12Fence::GetCompletedValue() returns
        // UINT64_MAX once the device has been removed; with '!=' that value
        // would never match m_activePayload and the loop would spin forever.
        UINT64 completed = m_payloadFence->GetCompletedValue();
        while (completed < m_activePayload)
        {
            /* busy-wait ... */
            completed = m_payloadFence->GetCompletedValue();
        }

        // collect all pending timestamps (skipped when the device is gone:
        // the fence value is then meaningless for payload accounting)
        if (completed != UINT64_MAX)
        {
            Collect();
        }

        m_payloadFence->Release();
    }

    if (m_readbackBuffer != nullptr)
    {
        m_readbackBuffer->Release();
    }
    if (m_queryHeap != nullptr)
    {
        m_queryHeap->Release();
    }
}
void NewFrame()
{
uint32_t queryCounter = m_queryCounter.exchange(0);
m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter });
m_previousQueryCounter += queryCounter;
if (m_previousQueryCounter >= m_queryLimit)
{
m_previousQueryCounter -= m_queryLimit;
}
m_queue->Signal(m_payloadFence, ++m_activePayload);
}
// Sends a display name for this GPU context to the profiler.
//   name - characters to send; exactly `len` bytes are copied (no terminator is added).
//   len  - number of bytes in `name`.
// The bytes are copied into a tracy_malloc'ed buffer whose address is placed in
// the queue item. NOTE(review): the buffer is not freed here; presumably the
// consumer of the GpuContextName item releases it - confirm.
void Name( const char* name, uint16_t len )
{
    char* const nameCopy = static_cast<char*>( tracy_malloc( len ) );
    memcpy( nameCopy, name, len );

    auto* const queueItem = Profiler::QueueSerial();
    MemWrite( &queueItem->hdr.type, QueueType::GpuContextName );
    MemWrite( &queueItem->gpuContextNameFat.context, GetId() );
    MemWrite( &queueItem->gpuContextNameFat.ptr, reinterpret_cast<uint64_t>( nameCopy ) );
    MemWrite( &queueItem->gpuContextNameFat.size, len );
    SubmitQueueItem( queueItem );
}
void Collect()
{
ZoneScopedC(Color::Red4);
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem(*item); if (!GetProfiler().IsConnected())
{
m_queryCounter = 0;
return;
}
#endif #endif
Profiler::QueueSerialFinish(); // Find out what payloads are available.
const auto newestReadyPayload = m_payloadFence->GetCompletedValue();
const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload);
m_initialized = true; if (!payloadCount)
} {
return; // No payloads are available yet, exit out.
}
void NewFrame() D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) };
{
uint32_t queryCounter = m_queryCounter.exchange(0);
m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter });
m_previousQueryCounter += queryCounter;
if (m_previousQueryCounter >= m_queryLimit) // Map the readback buffer so we can fetch the query data from the GPU.
{ void* readbackBufferMapping = nullptr;
m_previousQueryCounter -= m_queryLimit;
}
m_queue->Signal(m_payloadFence.Get(), ++m_activePayload); if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping)))
} {
TracyD3D12Panic("Failed to map readback buffer.", return);
}
void Name( const char* name, uint16_t len ) auto* timestampData = static_cast<uint64_t*>(readbackBufferMapping);
{
auto ptr = (char*)tracy_malloc( len );
memcpy( ptr, name, len );
auto item = Profiler::QueueSerial(); for (uint32_t i = 0; i < payloadCount; ++i)
MemWrite( &item->hdr.type, QueueType::GpuContextName ); {
MemWrite( &item->gpuContextNameFat.context, m_context ); const auto& payload = m_payloadQueue.front();
MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
MemWrite( &item->gpuContextNameFat.size, len ); for (uint32_t j = 0; j < payload.m_queryCount; ++j)
{
const auto counter = (payload.m_queryIdStart + j) % m_queryLimit;
const auto timestamp = timestampData[counter];
const auto queryId = counter;
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuTime);
MemWrite(&item->gpuTime.gpuTime, timestamp);
MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuTime.context, GetId());
Profiler::QueueSerialFinish();
}
m_payloadQueue.pop();
}
m_readbackBuffer->Unmap(0, nullptr);
// Recalibrate to account for drift.
RecalibrateClocks();
}
private:
// Reserves query ids for one zone and returns the first of them.
// m_queryCounter is advanced atomically by 2 per call (NOTE(review):
// presumably one id for the zone begin and one for the zone end - confirm
// against D3D12ZoneScope). The returned id is the per-frame offset shifted by
// m_previousQueryCounter and wrapped into [0, m_queryLimit).
// If the per-frame offset has reached m_queryLimit, TracyD3D12Panic is
// invoked, but an id is still returned afterwards (see the TODO below).
tracy_force_inline uint32_t NextQueryId()
{
    const uint32_t frameOffset = m_queryCounter.fetch_add(2);
    if (!(frameOffset < m_queryLimit))
    {
        TracyD3D12Panic("Submitted too many GPU queries! Consider increasing MaxQueries.");
        // #TODO: consider returning an invalid id or sentinel value here
    }
    return (m_previousQueryCounter + frameOffset) % m_queryLimit;
}
// Returns the Tracy GPU context id of this queue context.
// m_contextId starts at 255 and is only replaced with a value from
// GetGpuCtxCounter() at the end of a fully successful constructor, so 255 is
// what callers see if construction bailed out early.
tracy_force_inline uint8_t GetId() const
{
return m_contextId;
}
};
class D3D12ZoneScope
{
const bool m_active;
D3D12QueueCtx* m_ctx = nullptr;
ID3D12GraphicsCommandList* m_cmdList = nullptr;
uint32_t m_queryId = 0; // Used for tracking in nested zones.
// Fills in the gpuZoneBegin payload of an already-acquired serial queue item
// and commits it with Profiler::QueueSerialFinish().
//   item        - queue slot to fill.
//   type        - written to item->hdr.type.
//   srcLocation - written to gpuZoneBegin.srcloc as-is.
// Reads m_queryId and m_ctx. NOTE(review): both are expected to have been set
// by the calling constructor before this runs - confirm at the call sites.
tracy_force_inline void WriteQueueItem(QueueItem* item, QueueType type, uint64_t srcLocation)
{
MemWrite(&item->hdr.type, type);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, srcLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
// The 32-bit query id is narrowed to the 16-bit wire field.
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, m_ctx->GetId());
Profiler::QueueSerialFinish();
}
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, bool active)
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item ); : m_active(active&& GetProfiler().IsConnected())
#endif
Profiler::QueueSerialFinish();
}
// Forwards finished GPU timestamp queries to the profiler and then emits a
// CPU/GPU clock calibration sample.
//
// Steps:
//   1. (TRACY_ON_DEMAND only) if no profiler is connected, reset the query
//      counter and return.
//   2. Work out how many queued payloads the fence reports as completed.
//   3. Map the readback buffer, emit one GpuTime item per query of every
//      completed payload, pop those payloads, unmap.
//   4. Query the queue's clock calibration and emit a GpuCalibration item
//      when the CPU timestamp moved since the previous calibration.
//
// Both failure paths (Map / GetClockCalibration) still assert in debug
// builds, but now also return: assert() compiles out under NDEBUG, and
// continuing would dereference a null mapping or read uninitialised
// timestamps respectively.
void Collect()
{
    ZoneScopedC(Color::Red4);

#ifdef TRACY_ON_DEMAND
    if (!GetProfiler().IsConnected())
    {
        m_queryCounter = 0;
        return;
    }
#endif

    // Find out what payloads are available.
    const auto newestReadyPayload = m_payloadFence->GetCompletedValue();
    const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload);

    if (!payloadCount)
    {
        return;  // No payloads are available yet, exit out.
    }

    D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) };

    // Map the readback buffer so we can fetch the query data from the GPU.
    void* readbackBufferMapping = nullptr;

    if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping)))
    {
        assert(false && "Failed to map readback buffer.");
        // Nothing was mapped, so there is nothing to unmap. The payloads stay
        // queued and are retried on the next call.
        return;
    }

    auto* timestampData = static_cast<uint64_t*>(readbackBufferMapping);

    for (uint32_t i = 0; i < payloadCount; ++i)
    {
        const auto& payload = m_payloadQueue.front();

        for (uint32_t j = 0; j < payload.m_queryCount; ++j)
        {
            // Query slots wrap around at m_queryLimit.
            const auto counter = (payload.m_queryIdStart + j) % m_queryLimit;
            const auto timestamp = timestampData[counter];
            const auto queryId = counter;

            auto* item = Profiler::QueueSerial();
            MemWrite(&item->hdr.type, QueueType::GpuTime);
            MemWrite(&item->gpuTime.gpuTime, timestamp);
            MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId));
            MemWrite(&item->gpuTime.context, m_context);

            Profiler::QueueSerialFinish();
        }

        m_payloadQueue.pop();
    }

    m_readbackBuffer->Unmap(0, nullptr);

    // Recalibrate to account for drift.
    uint64_t cpuTimestamp = 0;
    uint64_t gpuTimestamp = 0;

    if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
    {
        assert(false && "Failed to get queue clock calibration.");
        // Without valid timestamps a calibration item would be garbage; skip it.
        return;
    }

    cpuTimestamp *= m_qpcToNs;

    const auto cpuDelta = cpuTimestamp - m_prevCalibration;
    if (cpuDelta > 0)
    {
        m_prevCalibration = cpuTimestamp;
        // The item carries the profiler's own clock, not the raw calibration value.
        cpuTimestamp = Profiler::GetTime();

        auto* item = Profiler::QueueSerial();
        MemWrite(&item->hdr.type, QueueType::GpuCalibration);
        MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp);
        MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp);
        MemWrite(&item->gpuCalibration.cpuDelta, cpuDelta);
        MemWrite(&item->gpuCalibration.context, m_context);

        Profiler::QueueSerialFinish();
    }
}
private:
// Reserves the next pair of query slots and returns the index of the first one.
// The counter advances by two per call; D3D12ZoneScope uses the returned slot
// for its begin timestamp and the slot right after it for the end timestamp.
// The result is offset by m_previousQueryCounter and wrapped at m_queryLimit.
tracy_force_inline uint32_t NextQueryId()
{
    const uint32_t reserved = m_queryCounter.fetch_add(2);
    assert(reserved < m_queryLimit && "Submitted too many GPU queries! Consider increasing MaxQueries.");

    const uint32_t wrapped = (m_previousQueryCounter + reserved) % m_queryLimit;
    return wrapped;
}
// Returns the identifier stored in m_context, the same value Collect() writes
// into the `context` field of the GpuTime and GpuCalibration items.
tracy_force_inline uint8_t GetId() const
{
return m_context;
}
};
class D3D12ZoneScope
{
const bool m_active;
D3D12QueueCtx* m_ctx = nullptr;
ID3D12GraphicsCommandList* m_cmdList = nullptr;
uint32_t m_queryId = 0; // Used for tracking in nested zones.
public:
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active)
#ifdef TRACY_ON_DEMAND
: m_active(active && GetProfiler().IsConnected())
#else #else
: m_active(active) : m_active(active)
#endif #endif
{ {
if (!m_active) return; if (!m_active) return;
m_ctx = ctx; m_ctx = ctx;
m_cmdList = cmdList; m_cmdList = cmdList;
m_queryId = ctx->NextQueryId(); m_queryId = m_ctx->NextQueryId();
cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
}
auto* item = Profiler::QueueSerial(); public:
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial); tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active)
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); : D3D12ZoneScope(ctx, cmdList, active)
MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast<uint64_t>(srcLocation)); {
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); if (!m_active) return;
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish(); auto* item = Profiler::QueueSerial();
} WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast<uint64_t>(srcLocation));
}
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int depth, bool active) tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int depth, bool active)
#ifdef TRACY_ON_DEMAND : D3D12ZoneScope(ctx, cmdList, active)
: m_active(active&& GetProfiler().IsConnected()) {
#else if (!m_active) return;
: m_active(active)
#endif
{
if (!m_active) return;
m_ctx = ctx; auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
m_cmdList = cmdList; WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast<uint64_t>(srcLocation));
}
m_queryId = ctx->NextQueryId(); tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active)
cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); : D3D12ZoneScope(ctx, cmdList, active)
{
if (!m_active) return;
auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast<uint64_t>(srcLocation));
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish(); auto* item = Profiler::QueueSerial();
} WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation);
}
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active) tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active)
#ifdef TRACY_ON_DEMAND : D3D12ZoneScope(ctx, cmdList, active)
: m_active(active&& GetProfiler().IsConnected()) {
#else if (!m_active) return;
: m_active(active)
#endif
{
if (!m_active) return;
m_ctx = ctx; const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
m_cmdList = cmdList;
m_queryId = ctx->NextQueryId(); auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation);
}
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); tracy_force_inline ~D3D12ZoneScope()
{
if (!m_active) return;
auto* item = Profiler::QueueSerial(); const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot.
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial); m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, queryId);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish(); auto* item = Profiler::QueueSerial();
} MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
MemWrite(&item->gpuZoneEnd.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
Profiler::QueueSerialFinish();
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active) m_cmdList->ResolveQueryData(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer, m_queryId * sizeof(uint64_t));
#ifdef TRACY_ON_DEMAND }
: m_active(active&& GetProfiler().IsConnected()) };
#else
: m_active(active)
#endif
{
if (!m_active) return;
m_ctx = ctx; static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue)
m_cmdList = cmdList; {
auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx)));
new (ctx) D3D12QueueCtx{ device, queue };
m_queryId = ctx->NextQueryId(); return ctx;
cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); }
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); static inline void DestroyD3D12Context(D3D12QueueCtx* ctx)
{
auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); ctx->~D3D12QueueCtx();
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial); tracy_free(ctx);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); }
MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish();
}
tracy_force_inline ~D3D12ZoneScope()
{
if (!m_active) return;
const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot.
m_cmdList->EndQuery(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId);
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
MemWrite(&item->gpuZoneEnd.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
Profiler::QueueSerialFinish();
m_cmdList->ResolveQueryData(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer.Get(), m_queryId * sizeof(uint64_t));
}
};
static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue)
{
auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx)));
new (ctx) D3D12QueueCtx{ device, queue };
return ctx;
}
static inline void DestroyD3D12Context(D3D12QueueCtx* ctx)
{
ctx->~D3D12QueueCtx();
tracy_free(ctx);
}
} }
#undef TracyD3D12Panic
using TracyD3D12Ctx = tracy::D3D12QueueCtx*; using TracyD3D12Ctx = tracy::D3D12QueueCtx*;
#define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue); #define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue);
@ -471,25 +461,29 @@ using TracyD3D12Ctx = tracy::D3D12QueueCtx*;
#define TracyD3D12NewFrame(ctx) ctx->NewFrame(); #define TracyD3D12NewFrame(ctx) ctx->NewFrame();
#define TracyD3D12UnnamedZone ___tracy_gpu_d3d12_zone
#define TracyD3D12SrcLocSymbol TracyConcat(__tracy_d3d12_source_location,TracyLine)
#define TracyD3D12SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D12SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color };
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, ___tracy_gpu_zone, cmdList, name, TRACY_CALLSTACK, true) # define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, TRACY_CALLSTACK, true)
# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, ___tracy_gpu_zone, cmdList, name, color, TRACY_CALLSTACK, true) # define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, TRACY_CALLSTACK, true)
# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), TRACY_CALLSTACK, active }; # define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active };
# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), TRACY_CALLSTACK, active }; # define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active };
# define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) # define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active)
#else #else
# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, ___tracy_gpu_zone, cmdList, name, true) # define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, TracyD3D12UnnamedZone, cmdList, name, true)
# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, ___tracy_gpu_zone, cmdList, name, color, true) # define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, TracyD3D12UnnamedZone, cmdList, name, color, true)
# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), active }; # define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active };
# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), active }; # define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active };
# define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, active }; # define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, active };
#endif #endif
#ifdef TRACY_HAS_CALLSTACK #ifdef TRACY_HAS_CALLSTACK
# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, ___tracy_gpu_zone, cmdList, name, depth, true) # define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, depth, true)
# define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, ___tracy_gpu_zone, cmdList, name, color, depth, true) # define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, depth, true)
# define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), depth, active }; # define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active };
# define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), depth, active }; # define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active };
# define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, depth, active }; # define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, depth, active };
#else #else
# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12Zone(ctx, cmdList, name) # define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12Zone(ctx, cmdList, name)

View file

@ -173,10 +173,10 @@ static tracy_force_inline void SendLuaCallstack( lua_State* L, uint32_t depth )
{ {
const uint32_t line = dbg[i].currentline; const uint32_t line = dbg[i].currentline;
memcpy( dst, &line, 4 ); dst += 4; memcpy( dst, &line, 4 ); dst += 4;
assert( fsz[i] <= std::numeric_limits<uint16_t>::max() ); assert( fsz[i] <= (std::numeric_limits<uint16_t>::max)() );
memcpy( dst, fsz+i, 2 ); dst += 2; memcpy( dst, fsz+i, 2 ); dst += 2;
memcpy( dst, func[i], fsz[i] ); dst += fsz[i]; memcpy( dst, func[i], fsz[i] ); dst += fsz[i];
assert( ssz[i] <= std::numeric_limits<uint16_t>::max() ); assert( ssz[i] <= (std::numeric_limits<uint16_t>::max)() );
memcpy( dst, ssz+i, 2 ); dst += 2; memcpy( dst, ssz+i, 2 ); dst += 2;
memcpy( dst, dbg[i].source, ssz[i] ), dst += ssz[i]; memcpy( dst, dbg[i].source, ssz[i] ), dst += ssz[i];
} }
@ -333,7 +333,7 @@ static inline int LuaZoneText( lua_State* L )
auto txt = lua_tostring( L, 1 ); auto txt = lua_tostring( L, 1 );
const auto size = strlen( txt ); const auto size = strlen( txt );
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size ); auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size ); memcpy( ptr, txt, size );
@ -358,7 +358,7 @@ static inline int LuaZoneName( lua_State* L )
auto txt = lua_tostring( L, 1 ); auto txt = lua_tostring( L, 1 );
const auto size = strlen( txt ); const auto size = strlen( txt );
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size ); auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size ); memcpy( ptr, txt, size );
@ -378,7 +378,7 @@ static inline int LuaMessage( lua_State* L )
auto txt = lua_tostring( L, 1 ); auto txt = lua_tostring( L, 1 );
const auto size = strlen( txt ); const auto size = strlen( txt );
assert( size < std::numeric_limits<uint16_t>::max() ); assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size ); auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size ); memcpy( ptr, txt, size );

View file

@ -5,6 +5,9 @@
#define TracyVkContext(x,y,z,w) nullptr #define TracyVkContext(x,y,z,w) nullptr
#define TracyVkContextCalibrated(x,y,z,w,a,b) nullptr #define TracyVkContextCalibrated(x,y,z,w,a,b) nullptr
#if defined VK_EXT_host_query_reset
#define TracyVkContextHostCalibrated(x,y,z,w,a) nullptr
#endif
#define TracyVkDestroy(x) #define TracyVkDestroy(x)
#define TracyVkContextName(c,x,y) #define TracyVkContextName(c,x,y)
#define TracyVkNamedZone(c,x,y,z,w) #define TracyVkNamedZone(c,x,y,z,w)
@ -39,9 +42,47 @@ using TracyVkCtx = void*;
#include "../client/TracyProfiler.hpp" #include "../client/TracyProfiler.hpp"
#include "../client/TracyCallstack.hpp" #include "../client/TracyCallstack.hpp"
#include <atomic>
namespace tracy namespace tracy
{ {
// When TRACY_VK_USE_SYMBOL_TABLE is defined, Vulkan entry points are reached
// through a per-context table of function pointers (m_symbols) rather than by
// calling the global symbols directly.
#if defined TRACY_VK_USE_SYMBOL_TABLE
// X-macro lists: each one applies `Operation` to every Vulkan entry point name
// in the list. They are grouped, as the macro names indicate, into device core,
// device extension and instance core symbols.
#define LoadVkDeviceCoreSymbols(Operation) \
Operation(vkBeginCommandBuffer) \
Operation(vkCmdResetQueryPool) \
Operation(vkCmdWriteTimestamp) \
Operation(vkCreateQueryPool) \
Operation(vkDestroyQueryPool) \
Operation(vkEndCommandBuffer) \
Operation(vkGetQueryPoolResults) \
Operation(vkQueueSubmit) \
Operation(vkQueueWaitIdle) \
Operation(vkResetQueryPool)
#define LoadVkDeviceExtensionSymbols(Operation) \
Operation(vkGetCalibratedTimestampsEXT) \
Operation(vkGetPhysicalDeviceCalibrateableTimeDomainsEXT)
#define LoadVkInstanceCoreSymbols(Operation) \
Operation(vkGetPhysicalDeviceProperties)
// Holds one `PFN_<name> <name>;` function pointer member for every entry point
// named in the three lists above.
struct VkSymbolTable
{
// MAKE_PFN is local to this struct: it is defined here and undefined right
// after the member declarations have been generated.
#define MAKE_PFN(name) PFN_##name name;
LoadVkDeviceCoreSymbols(MAKE_PFN)
LoadVkDeviceExtensionSymbols(MAKE_PFN)
LoadVkInstanceCoreSymbols(MAKE_PFN)
#undef MAKE_PFN
};
// Call wrappers. With the symbol table enabled, the wrapped call (or bare
// function name) is prefixed with `m_symbols.`, or with `m_ctx->m_symbols.`
// for code that only holds a context pointer.
#define VK_FUNCTION_WRAPPER(callSignature) m_symbols.callSignature
#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) m_ctx->m_symbols.callSignature
#else
// Without the symbol table both wrappers expand to their argument unchanged.
#define VK_FUNCTION_WRAPPER(callSignature) callSignature
#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) callSignature
#endif
class VkCtx class VkCtx
{ {
friend class VkCtxScope; friend class VkCtxScope;
@ -49,7 +90,11 @@ class VkCtx
enum { QueryCount = 64 * 1024 }; enum { QueryCount = 64 * 1024 };
public: public:
VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT _vkGetCalibratedTimestampsEXT ) #if defined TRACY_VK_USE_SYMBOL_TABLE
VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr, bool calibrated )
#else
VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT)
#endif
: m_device( device ) : m_device( device )
, m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT ) , m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT )
, m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) ) , m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) )
@ -57,47 +102,28 @@ public:
, m_tail( 0 ) , m_tail( 0 )
, m_oldCnt( 0 ) , m_oldCnt( 0 )
, m_queryCount( QueryCount ) , m_queryCount( QueryCount )
, m_vkGetCalibratedTimestampsEXT( _vkGetCalibratedTimestampsEXT ) #if !defined TRACY_VK_USE_SYMBOL_TABLE
, m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT )
#endif
{ {
assert( m_context != 255 ); assert( m_context != 255 );
if( _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT && _vkGetCalibratedTimestampsEXT ) #if defined TRACY_VK_USE_SYMBOL_TABLE
PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr);
if ( calibrated )
{ {
uint32_t num; m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT;
_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physdev, &num, nullptr ); }
if( num > 4 ) num = 4;
VkTimeDomainEXT data[4];
_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physdev, &num, data );
VkTimeDomainEXT supportedDomain = (VkTimeDomainEXT)-1;
#if defined _WIN32
supportedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW
supportedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;
#endif #endif
for( uint32_t i=0; i<num; i++ )
{
if( data[i] == supportedDomain )
{
m_timeDomain = data[i];
break;
}
}
}
VkPhysicalDeviceProperties prop; if( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) && m_vkGetCalibratedTimestampsEXT )
vkGetPhysicalDeviceProperties( physdev, &prop );
const float period = prop.limits.timestampPeriod;
VkQueryPoolCreateInfo poolInfo = {};
poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
poolInfo.queryCount = m_queryCount;
poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
while( vkCreateQueryPool( device, &poolInfo, nullptr, &m_query ) != VK_SUCCESS )
{ {
m_queryCount /= 2; FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) );
poolInfo.queryCount = m_queryCount;
} }
CreateQueryPool();
VkCommandBufferBeginInfo beginInfo = {}; VkCommandBufferBeginInfo beginInfo = {};
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
@ -107,87 +133,96 @@ public:
submitInfo.commandBufferCount = 1; submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &cmdbuf; submitInfo.pCommandBuffers = &cmdbuf;
vkBeginCommandBuffer( cmdbuf, &beginInfo ); VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) );
vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ); VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) );
vkEndCommandBuffer( cmdbuf ); VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) );
vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ); VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) );
vkQueueWaitIdle( queue ); VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) );
int64_t tcpu, tgpu; int64_t tcpu, tgpu;
if( m_timeDomain == VK_TIME_DOMAIN_DEVICE_EXT ) if( m_timeDomain == VK_TIME_DOMAIN_DEVICE_EXT )
{ {
vkBeginCommandBuffer( cmdbuf, &beginInfo ); VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) );
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 ); VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 ) );
vkEndCommandBuffer( cmdbuf ); VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) );
vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ); VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) );
vkQueueWaitIdle( queue ); VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) );
tcpu = Profiler::GetTime(); tcpu = Profiler::GetTime();
vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT ); VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT ) );
vkBeginCommandBuffer( cmdbuf, &beginInfo ); VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) );
vkCmdResetQueryPool( cmdbuf, m_query, 0, 1 ); VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, 1 ) );
vkEndCommandBuffer( cmdbuf ); VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) );
vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ); VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) );
vkQueueWaitIdle( queue ); VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) );
} }
else else
{ {
enum { NumProbes = 32 }; FindCalibratedTimestampDeviation();
VkCalibratedTimestampInfoEXT spec[2] = {
{ VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT },
{ VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain },
};
uint64_t ts[2];
uint64_t deviation[NumProbes];
for( int i=0; i<NumProbes; i++ )
{
_vkGetCalibratedTimestampsEXT( device, 2, spec, ts, deviation+i );
}
uint64_t minDeviation = deviation[0];
for( int i=1; i<NumProbes; i++ )
{
if( minDeviation > deviation[i] )
{
minDeviation = deviation[i];
}
}
m_deviation = minDeviation * 3 / 2;
#if defined _WIN32
m_qpcToNs = int64_t( 1000000000. / GetFrequencyQpc() );
#endif
Calibrate( device, m_prevCalibration, tgpu ); Calibrate( device, m_prevCalibration, tgpu );
tcpu = Profiler::GetTime(); tcpu = Profiler::GetTime();
} }
uint8_t flags = 0; WriteInitialItem( physdev, tcpu, tgpu );
if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) flags |= GpuContextCalibration;
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuNewContext );
MemWrite( &item->gpuNewContext.cpuTime, tcpu );
MemWrite( &item->gpuNewContext.gpuTime, tgpu );
memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) );
MemWrite( &item->gpuNewContext.period, period );
MemWrite( &item->gpuNewContext.context, m_context );
MemWrite( &item->gpuNewContext.flags, flags );
MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount ); m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount );
} }
// Only compiled when the Vulkan headers define VK_EXT_host_query_reset.
#if defined VK_EXT_host_query_reset
/**
* This alternative constructor does not use command buffers and instead uses functionality from
* VK_EXT_host_query_reset (core with 1.2 and non-optional) and VK_EXT_calibrated_timestamps. This requires
* the physical device to have another time domain apart from DEVICE to be calibrateable.
*
* Two signatures exist: with TRACY_VK_USE_SYMBOL_TABLE the constructor receives the instance plus the
* two proc-addr loaders and passes them to PopulateSymbolTable; otherwise the caller passes the three
* required entry points directly.
*/
#if defined TRACY_VK_USE_SYMBOL_TABLE
VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr )
#else
VkCtx( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT vkResetQueryPool, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT )
#endif
: m_device( device )
// Starts out as the DEVICE domain; FindAvailableTimeDomains() below is
// presumably what switches it to a host domain (helper body not visible here).
, m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT )
, m_context( GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed) )
, m_head( 0 )
, m_tail( 0 )
, m_oldCnt( 0 )
, m_queryCount( QueryCount )
#if !defined TRACY_VK_USE_SYMBOL_TABLE
, m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT )
#endif
{
// The context id is a uint8_t counter value; 255 is rejected here.
assert( m_context != 255);
#if defined TRACY_VK_USE_SYMBOL_TABLE
// In symbol-table builds the calibrated-timestamp pointer is taken from the
// table once it has been populated.
PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr);
m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT;
#endif
// All three entry points are used unconditionally below, so none may be null.
// In non-symbol-table builds these names refer to the constructor parameters.
assert( VK_FUNCTION_WRAPPER( vkResetQueryPool ) != nullptr );
assert( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) != nullptr );
assert( VK_FUNCTION_WRAPPER( vkGetCalibratedTimestampsEXT ) != nullptr );
FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) );
// We require a host time domain to be available to properly calibrate.
FindCalibratedTimestampDeviation();
// Take the initial GPU timestamp via Calibrate() and sample the CPU clock
// immediately afterwards; both values are handed to WriteInitialItem() below.
int64_t tgpu;
Calibrate( device, m_prevCalibration, tgpu );
int64_t tcpu = Profiler::GetTime();
// The whole pool is reset from the host, which is why this constructor needs
// neither a queue nor a command buffer.
CreateQueryPool();
VK_FUNCTION_WRAPPER( vkResetQueryPool( device, m_query, 0, m_queryCount ) );
// NOTE(review): WriteInitialItem() presumably announces the new GPU context to
// the profiler; its body is outside this chunk, confirm there.
WriteInitialItem( physdev, tcpu, tgpu );
// Result buffer sized for one int64_t per query slot; released in ~VkCtx().
m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount );
}
#endif
~VkCtx() ~VkCtx()
{ {
tracy_free( m_res ); tracy_free( m_res );
vkDestroyQueryPool( m_device, m_query, nullptr ); VK_FUNCTION_WRAPPER( vkDestroyQueryPool( m_device, m_query, nullptr ) );
} }
void Name( const char* name, uint16_t len ) void Name( const char* name, uint16_t len )
@ -210,18 +245,23 @@ public:
{ {
ZoneScopedC( Color::Red4 ); ZoneScopedC( Color::Red4 );
if( m_tail == m_head ) return; const uint64_t head = m_head.load(std::memory_order_relaxed);
if( m_tail == head ) return;
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() ) if( !GetProfiler().IsConnected() )
{ {
vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ); VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) );
m_head = m_tail = m_oldCnt = 0; m_tail = head;
m_oldCnt = 0;
int64_t tgpu; int64_t tgpu;
if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) Calibrate( m_device, m_prevCalibration, tgpu ); if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) Calibrate( m_device, m_prevCalibration, tgpu );
return; return;
} }
#endif #endif
assert( head > m_tail );
const unsigned int wrappedTail = (unsigned int)( m_tail % m_queryCount );
unsigned int cnt; unsigned int cnt;
if( m_oldCnt != 0 ) if( m_oldCnt != 0 )
@ -231,10 +271,16 @@ public:
} }
else else
{ {
cnt = m_head < m_tail ? m_queryCount - m_tail : m_head - m_tail; cnt = (unsigned int)( head - m_tail );
assert( cnt <= m_queryCount );
if( wrappedTail + cnt > m_queryCount )
{
cnt = m_queryCount - wrappedTail;
}
} }
if( vkGetQueryPoolResults( m_device, m_query, m_tail, cnt, sizeof( int64_t ) * m_queryCount, m_res, sizeof( int64_t ), VK_QUERY_RESULT_64_BIT ) == VK_NOT_READY )
if( VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( m_device, m_query, wrappedTail, cnt, sizeof( int64_t ) * m_queryCount, m_res, sizeof( int64_t ), VK_QUERY_RESULT_64_BIT ) == VK_NOT_READY ) )
{ {
m_oldCnt = cnt; m_oldCnt = cnt;
return; return;
@ -245,7 +291,7 @@ public:
auto item = Profiler::QueueSerial(); auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuTime ); MemWrite( &item->hdr.type, QueueType::GpuTime );
MemWrite( &item->gpuTime.gpuTime, m_res[idx] ); MemWrite( &item->gpuTime.gpuTime, m_res[idx] );
MemWrite( &item->gpuTime.queryId, uint16_t( m_tail + idx ) ); MemWrite( &item->gpuTime.queryId, uint16_t( wrappedTail + idx ) );
MemWrite( &item->gpuTime.context, m_context ); MemWrite( &item->gpuTime.context, m_context );
Profiler::QueueSerialFinish(); Profiler::QueueSerialFinish();
} }
@ -269,19 +315,16 @@ public:
} }
} }
vkCmdResetQueryPool( cmdbuf, m_query, m_tail, cnt ); VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) );
m_tail += cnt; m_tail += cnt;
if( m_tail == m_queryCount ) m_tail = 0;
} }
private: private:
tracy_force_inline unsigned int NextQueryId() tracy_force_inline unsigned int NextQueryId()
{ {
const auto id = m_head; const uint64_t id = m_head.fetch_add(1, std::memory_order_relaxed);
m_head = ( m_head + 1 ) % m_queryCount; return id % m_queryCount;
assert( m_head != m_tail );
return id;
} }
tracy_force_inline uint8_t GetId() const tracy_force_inline uint8_t GetId() const
@ -315,16 +358,126 @@ private:
#endif #endif
} }
// Creates the timestamp query pool stored in m_query, starting from the
// currently requested m_queryCount. Each time vkCreateQueryPool does not
// report VK_SUCCESS the requested count is halved and creation is retried,
// so on return m_queryCount holds the size that was actually accepted.
// NOTE(review): there is no lower bound on the retry - if creation keeps
// failing the count reaches 0 and stays there; confirm this cannot happen.
tracy_force_inline void CreateQueryPool()
{
    VkQueryPoolCreateInfo createInfo = {};
    createInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
    createInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
    createInfo.queryCount = m_queryCount;

    for(;;)
    {
        const bool failed = VK_FUNCTION_WRAPPER( vkCreateQueryPool( m_device, &createInfo, nullptr, &m_query ) != VK_SUCCESS );
        if( !failed ) break;

        // Ask for a pool half the size on the next attempt.
        m_queryCount /= 2;
        createInfo.queryCount = m_queryCount;
    }
}
// Queries the calibrateable time domains of physicalDevice (at most four are
// inspected) and, if the host domain selected for this platform is among
// them, stores it in m_timeDomain. m_timeDomain is left untouched otherwise.
//   physicalDevice - device whose time domains are enumerated
//   _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT - entry point used for
//       both the count query and the data query
tracy_force_inline void FindAvailableTimeDomains( VkPhysicalDevice physicalDevice, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT )
{
    // Host time domain this platform can read; -1 matches nothing.
#if defined _WIN32
    const VkTimeDomainEXT wantedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW
    const VkTimeDomainEXT wantedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;
#else
    const VkTimeDomainEXT wantedDomain = (VkTimeDomainEXT)-1;
#endif

    constexpr uint32_t MaxDomains = 4;
    VkTimeDomainEXT domains[MaxDomains];

    // First call reports how many domains exist, second call fetches them
    // (clamped to the size of the local buffer).
    uint32_t domainCount;
    _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &domainCount, nullptr );
    if( domainCount > MaxDomains ) domainCount = MaxDomains;
    _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &domainCount, domains );

    for( uint32_t idx = 0; idx < domainCount; idx++ )
    {
        if( domains[idx] != wantedDomain ) continue;
        m_timeDomain = domains[idx];
        break;
    }
}
// Samples the calibrated-timestamp entry point NumProbes times for the pair
// (device domain, m_timeDomain), takes the smallest deviation reported and
// stores 1.5x that value in m_deviation. On Windows it also derives
// m_qpcToNs from GetFrequencyQpc().
// Requires that a host time domain has already been selected, i.e.
// m_timeDomain is not VK_TIME_DOMAIN_DEVICE_EXT (asserted).
tracy_force_inline void FindCalibratedTimestampDeviation()
{
    assert( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT );
    constexpr size_t NumProbes = 32;
    VkCalibratedTimestampInfoEXT spec[2] = {
        { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT },
        { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain },
    };
    uint64_t ts[2];
    uint64_t deviation[NumProbes];

    // Index type matches NumProbes (size_t) to avoid a signed/unsigned comparison.
    for( size_t i=0; i<NumProbes; i++ )
    {
        // Only the reported deviation is kept; the timestamps in ts are discarded.
        m_vkGetCalibratedTimestampsEXT( m_device, 2, spec, ts, deviation + i );
    }

    uint64_t minDeviation = deviation[0];
    for( size_t i=1; i<NumProbes; i++ )
    {
        if( minDeviation > deviation[i] )
        {
            minDeviation = deviation[i];
        }
    }
    // Keep 50% headroom over the best observed deviation.
    m_deviation = minDeviation * 3 / 2;

#if defined _WIN32
    m_qpcToNs = int64_t( 1000000000. / GetFrequencyQpc() );
#endif
}
// Emits the GpuNewContext queue item that announces this Vulkan context.
//   physdev - physical device whose timestampPeriod limit is reported
//   tcpu    - CPU timestamp written to the item
//   tgpu    - GPU timestamp written to the item
// The calibration flag is set whenever m_timeDomain is not the device domain.
// With TRACY_ON_DEMAND the item is additionally handed to DeferItem().
tracy_force_inline void WriteInitialItem( VkPhysicalDevice physdev, int64_t tcpu, int64_t tgpu )
{
    VkPhysicalDeviceProperties deviceProps;
    VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceProperties( physdev, &deviceProps ) );
    const float period = deviceProps.limits.timestampPeriod;

    const bool calibrated = m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT;
    const uint8_t flags = calibrated ? uint8_t( GpuContextCalibration ) : uint8_t( 0 );

    auto item = Profiler::QueueSerial();
    MemWrite( &item->hdr.type, QueueType::GpuNewContext );
    MemWrite( &item->gpuNewContext.cpuTime, tcpu );
    MemWrite( &item->gpuNewContext.gpuTime, tgpu );
    // The thread field is zero-filled for this item.
    memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) );
    MemWrite( &item->gpuNewContext.period, period );
    MemWrite( &item->gpuNewContext.context, m_context );
    MemWrite( &item->gpuNewContext.flags, flags );
    MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan );

#ifdef TRACY_ON_DEMAND
    GetProfiler().DeferItem( *item );
#endif

    Profiler::QueueSerialFinish();
}
#if defined TRACY_VK_USE_SYMBOL_TABLE
// Fills m_symbols with Vulkan entry points resolved at runtime.
//   instance         - instance handle passed to instanceProcAddr
//   instanceProcAddr - loader used for the instance-level symbol list
//   deviceProcAddr   - loader used for the device-level symbol lists (called with m_device)
// The symbol lists themselves come from the LoadVk*Symbols macros, which are
// defined outside this block; each is invoked with a per-symbol loader macro.
void PopulateSymbolTable( VkInstance instance, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr )
{
// Looks up `name` through deviceProcAddr on m_device and casts it to PFN_<name>.
// The definition ends in ';', so the LOAD macro below expands to a statement
// followed by an empty statement.
#define VK_GET_DEVICE_SYMBOL( name ) \
(PFN_##name)deviceProcAddr( m_device, #name );
// Stores the device-level lookup result in the matching m_symbols member.
#define VK_LOAD_DEVICE_SYMBOL( name ) \
m_symbols.name = VK_GET_DEVICE_SYMBOL( name );
// Looks up `name` through instanceProcAddr on `instance` and casts it to PFN_<name>.
#define VK_GET_INSTANCE_SYMBOL( name ) \
(PFN_##name)instanceProcAddr( instance, #name );
// Stores the instance-level lookup result in the matching m_symbols member.
#define VK_LOAD_INSTANCE_SYMBOL( name ) \
m_symbols.name = VK_GET_INSTANCE_SYMBOL( name );
// Device core and device extension symbols use the device loader; instance
// core symbols use the instance loader.
LoadVkDeviceCoreSymbols( VK_LOAD_DEVICE_SYMBOL )
LoadVkDeviceExtensionSymbols( VK_LOAD_DEVICE_SYMBOL )
LoadVkInstanceCoreSymbols( VK_LOAD_INSTANCE_SYMBOL )
// The helper macros are local to this function; remove them again.
#undef VK_GET_DEVICE_SYMBOL
#undef VK_LOAD_DEVICE_SYMBOL
#undef VK_GET_INSTANCE_SYMBOL
#undef VK_LOAD_INSTANCE_SYMBOL
}
#endif
VkDevice m_device; VkDevice m_device;
VkQueryPool m_query; VkQueryPool m_query;
VkTimeDomainEXT m_timeDomain; VkTimeDomainEXT m_timeDomain;
#if defined TRACY_VK_USE_SYMBOL_TABLE
VkSymbolTable m_symbols;
#endif
uint64_t m_deviation; uint64_t m_deviation;
int64_t m_qpcToNs; int64_t m_qpcToNs;
int64_t m_prevCalibration; int64_t m_prevCalibration;
uint8_t m_context; uint8_t m_context;
unsigned int m_head; std::atomic<uint64_t> m_head;
unsigned int m_tail; uint64_t m_tail;
unsigned int m_oldCnt; unsigned int m_oldCnt;
unsigned int m_queryCount; unsigned int m_queryCount;
@ -348,7 +501,7 @@ public:
m_ctx = ctx; m_ctx = ctx;
const auto queryId = ctx->NextQueryId(); const auto queryId = ctx->NextQueryId();
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ); CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
auto item = Profiler::QueueSerial(); auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial );
@ -372,7 +525,7 @@ public:
m_ctx = ctx; m_ctx = ctx;
const auto queryId = ctx->NextQueryId(); const auto queryId = ctx->NextQueryId();
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ); CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
auto item = Profiler::QueueSerialCallstack( Callstack( depth ) ); auto item = Profiler::QueueSerialCallstack( Callstack( depth ) );
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial ); MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial );
@ -396,7 +549,7 @@ public:
m_ctx = ctx; m_ctx = ctx;
const auto queryId = ctx->NextQueryId(); const auto queryId = ctx->NextQueryId();
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ); CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
auto item = Profiler::QueueSerial(); auto item = Profiler::QueueSerial();
@ -421,7 +574,7 @@ public:
m_ctx = ctx; m_ctx = ctx;
const auto queryId = ctx->NextQueryId(); const auto queryId = ctx->NextQueryId();
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ); CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
auto item = Profiler::QueueSerialCallstack( Callstack( depth ) ); auto item = Profiler::QueueSerialCallstack( Callstack( depth ) );
@ -439,7 +592,7 @@ public:
if( !m_active ) return; if( !m_active ) return;
const auto queryId = m_ctx->NextQueryId(); const auto queryId = m_ctx->NextQueryId();
vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId ); CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId ) );
auto item = Profiler::QueueSerial(); auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial );
@ -457,13 +610,38 @@ private:
VkCtx* m_ctx; VkCtx* m_ctx;
}; };
#if defined TRACY_VK_USE_SYMBOL_TABLE
static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr, bool calibrated = false )
#else
static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct ) static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct )
#endif
{ {
auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) ); auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) );
#if defined TRACY_VK_USE_SYMBOL_TABLE
new(ctx) VkCtx( instance, physdev, device, queue, cmdbuf, instanceProcAddr, getDeviceProcAddr, calibrated );
#else
new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct ); new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct );
#endif
return ctx; return ctx;
} }
#if defined VK_EXT_host_query_reset
// Allocates storage with tracy_malloc and placement-constructs a VkCtx in it
// using the constructor that takes no queue or command buffer. Only available
// when VK_EXT_host_query_reset is defined. The parameter list depends on
// TRACY_VK_USE_SYMBOL_TABLE: with the symbol table the context is given the
// instance and the two proc-addr loaders, otherwise the caller passes the
// three extension entry points directly.
#if defined TRACY_VK_USE_SYMBOL_TABLE
static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr )
{
    void* const storage = tracy_malloc( sizeof( VkCtx ) );
    return new( storage ) VkCtx( instance, physdev, device, instanceProcAddr, getDeviceProcAddr );
}
#else
static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT qpreset, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct )
{
    void* const storage = tracy_malloc( sizeof( VkCtx ) );
    return new( storage ) VkCtx( physdev, device, qpreset, gpdctd, gct );
}
#endif
#endif
static inline void DestroyVkContext( VkCtx* ctx ) static inline void DestroyVkContext( VkCtx* ctx )
{ {
ctx->~VkCtx(); ctx->~VkCtx();
@ -474,8 +652,23 @@ static inline void DestroyVkContext( VkCtx* ctx )
using TracyVkCtx = tracy::VkCtx*; using TracyVkCtx = tracy::VkCtx*;
#if defined TRACY_VK_USE_SYMBOL_TABLE
#define TracyVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr );
#else
#define TracyVkContext( physdev, device, queue, cmdbuf ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, nullptr, nullptr ); #define TracyVkContext( physdev, device, queue, cmdbuf ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, nullptr, nullptr );
#endif
#if defined TRACY_VK_USE_SYMBOL_TABLE
#define TracyVkContextCalibrated( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr, true );
#else
#define TracyVkContextCalibrated( physdev, device, queue, cmdbuf, gpdctd, gct ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, gpdctd, gct ); #define TracyVkContextCalibrated( physdev, device, queue, cmdbuf, gpdctd, gct ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, gpdctd, gct );
#endif
#if defined VK_EXT_host_query_reset
#if defined TRACY_VK_USE_SYMBOL_TABLE
#define TracyVkContextHostCalibrated( instance, physdev, device, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, instanceProcAddr, deviceProcAddr );
#else
#define TracyVkContextHostCalibrated( physdev, device, qpreset, gpdctd, gct ) tracy::CreateVkContext( physdev, device, qpreset, gpdctd, gct );
#endif
#endif
#define TracyVkDestroy( ctx ) tracy::DestroyVkContext( ctx ); #define TracyVkDestroy( ctx ) tracy::DestroyVkContext( ctx );
#define TracyVkContextName( ctx, name, size ) ctx->Name( name, size ); #define TracyVkContextName( ctx, name, size ) ctx->Name( name, size );
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK #if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK