Merge branch 'update-tracy' into 'master'

Update tracy client to latest version

See merge request KartKrew/Kart!1877
This commit is contained in:
Oni 2024-01-28 23:30:01 +00:00
commit b254ce51c0
26 changed files with 1316 additions and 817 deletions

View file

@ -1,4 +1,4 @@
# Tracy Profiler Client 0.9.1
# Tracy Profiler Client 0.10.0
# BSD 3-clause
# Copyright (c) 2017-2023, Bartosz Taudul <wolf@nereid.pl>

View file

@ -22,6 +22,7 @@
#include "common/tracy_lz4.cpp"
#include "client/TracyProfiler.cpp"
#include "client/TracyCallstack.cpp"
#include "client/TracySysPower.cpp"
#include "client/TracySysTime.cpp"
#include "client/TracySysTrace.cpp"
#include "common/TracySocket.cpp"

View file

@ -686,7 +686,9 @@ void InitCallstackCritical()
void InitCallstack()
{
cb_bts = backtrace_create_state( nullptr, 0, nullptr, nullptr );
#ifndef TRACY_DEMANGLE
___tracy_init_demangle_buffer();
#endif
#ifdef __linux
InitKernelSymbols();
@ -761,7 +763,9 @@ debuginfod_client* GetDebuginfodClient()
void EndCallstack()
{
#ifndef TRACY_DEMANGLE
___tracy_free_demangle_buffer();
#endif
#ifdef TRACY_DEBUGINFOD
ClearDebugInfoVector( s_di_known );
debuginfod_end( s_debuginfod );

View file

@ -21,7 +21,7 @@ public:
, m_active( false )
#endif
{
assert( m_id != std::numeric_limits<uint32_t>::max() );
assert( m_id != (std::numeric_limits<uint32_t>::max)() );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockAnnounce );
@ -154,7 +154,7 @@ public:
tracy_force_inline void CustomName( const char* name, size_t size )
{
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, name, size );
auto item = Profiler::QueueSerial();
@ -235,7 +235,7 @@ public:
, m_active( false )
#endif
{
assert( m_id != std::numeric_limits<uint32_t>::max() );
assert( m_id != (std::numeric_limits<uint32_t>::max)() );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::LockAnnounce );
@ -450,7 +450,7 @@ public:
tracy_force_inline void CustomName( const char* name, size_t size )
{
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, name, size );
auto item = Profiler::QueueSerial();

View file

@ -83,7 +83,9 @@
#endif
#ifdef __APPLE__
# define TRACY_DELAYED_INIT
# ifndef TRACY_DELAYED_INIT
# define TRACY_DELAYED_INIT
# endif
#else
# ifdef __GNUC__
# define init_order( val ) __attribute__ ((init_priority(val)))
@ -1072,7 +1074,9 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ )
}
closedir( dp );
#ifdef TRACY_HAS_CALLSTACK
if( selfTid == s_symbolTid ) s_symbolThreadGone.store( true, std::memory_order_release );
#endif
TracyLfqPrepare( QueueType::Crash );
TracyLfqCommit;
@ -1353,6 +1357,7 @@ Profiler::Profiler()
, m_queryImage( nullptr )
, m_queryData( nullptr )
, m_crashHandlerInstalled( false )
, m_programName( nullptr )
{
assert( !s_instance );
s_instance = this;
@ -1711,6 +1716,9 @@ void Profiler::Worker()
if( m_sock ) break;
#ifndef TRACY_ON_DEMAND
ProcessSysTime();
# ifdef TRACY_HAS_SYSPOWER
m_sysPower.Tick();
# endif
#endif
if( m_broadcast )
@ -1718,6 +1726,14 @@ void Profiler::Worker()
const auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count();
if( t - lastBroadcast > 3000000000 ) // 3s
{
m_programNameLock.lock();
if( m_programName )
{
broadcastMsg = GetBroadcastMessage( m_programName, strlen( m_programName ), broadcastLen, dataPort );
m_programName = nullptr;
}
m_programNameLock.unlock();
lastBroadcast = t;
const auto ts = std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count();
broadcastMsg.activeTime = int32_t( ts - m_epoch );
@ -1828,6 +1844,9 @@ void Profiler::Worker()
for(;;)
{
ProcessSysTime();
#ifdef TRACY_HAS_SYSPOWER
m_sysPower.Tick();
#endif
const auto status = Dequeue( token );
const auto serialStatus = DequeueSerial();
if( status == DequeueStatus::ConnectionLost || serialStatus == DequeueStatus::ConnectionLost )
@ -4149,6 +4168,7 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_
TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); }
TRACY_API void ___tracy_emit_plot_float( const char* name, float val ) { tracy::Profiler::PlotData( name, val ); }
TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ) { tracy::Profiler::PlotData( name, val ); }
TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color ) { tracy::Profiler::ConfigurePlot( name, tracy::PlotFormatType(type), step, fill, color ); }
TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); }
TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); }
TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); }
@ -4167,7 +4187,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin( const struct ___tracy_gpu_zone_begi
{
TracyLfqPrepareC( tracy::QueueType::GpuZoneBegin );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@ -4190,7 +4210,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc( const struct ___tracy_gpu_zon
{
TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLoc );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@ -4202,7 +4222,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack( const struct ___tra
tracy::GetProfiler().SendCallstack( data.depth );
TracyLfqPrepareC( tracy::QueueType::GpuZoneBeginAllocSrcLocCallstack );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@ -4292,7 +4312,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_serial( const struct ___tracy_
auto item = tracy::Profiler::QueueSerial();
tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context );
@ -4304,7 +4324,7 @@ TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack_serial( const struct
auto item = tracy::Profiler::QueueSerialCallstack( tracy::Callstack( data.depth ) );
tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocCallstackSerial );
tracy::MemWrite( &item->gpuZoneBegin.cpuTime, tracy::Profiler::GetTime() );
tracy::MemWrite( &item->gpuNewContext.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
tracy::MemWrite( &item->gpuZoneBegin.srcloc, data.srcloc );
tracy::MemWrite( &item->gpuZoneBegin.queryId, data.queryId );
tracy::MemWrite( &item->gpuZoneBegin.context, data.context );

View file

@ -10,6 +10,7 @@
#include "tracy_concurrentqueue.h"
#include "tracy_SPSCQueue.h"
#include "TracyCallstack.hpp"
#include "TracySysPower.hpp"
#include "TracySysTime.hpp"
#include "TracyFastVector.hpp"
#include "../common/TracyQueue.hpp"
@ -208,7 +209,22 @@ public:
if( HardwareSupportsInvariantTSC() )
{
uint64_t rax, rdx;
#ifdef TRACY_PATCHABLE_NOPSLEDS
// Some external tooling (such as rr) wants to patch our rdtsc and replace it by a
// branch to control the external input seen by a program. This kind of patching is
// not generally possible depending on the surrounding code and can lead to significant
// slowdowns if the compiler generated unlucky code and rr and tracy are used together.
// To avoid this, use the rr-safe `nopl 0(%rax, %rax, 1); rdtsc` instruction sequence,
// which rr promises will be patchable independent of the surrounding code.
asm volatile (
// This is nopl 0(%rax, %rax, 1), but assemblers are inconsistent about whether
// they emit that as a 4 or 5 byte sequence and we need to be guaranteed to use
// the 5 byte one.
".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n\t"
"rdtsc" : "=a" (rax), "=d" (rdx) );
#else
asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) );
#endif
return (int64_t)(( rdx << 32 ) + rax);
}
# else
@ -288,7 +304,7 @@ public:
{
#ifndef TRACY_NO_FRAME_IMAGE
auto& profiler = GetProfiler();
assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < std::numeric_limits<uint32_t>::max() );
assert( profiler.m_frameCount.load( std::memory_order_relaxed ) < (std::numeric_limits<uint32_t>::max)() );
# ifdef TRACY_ON_DEMAND
if( !profiler.IsConnected() ) return;
# endif
@ -305,6 +321,12 @@ public:
fi->flip = flip;
profiler.m_fiQueue.commit_next();
profiler.m_fiLock.unlock();
#else
static_cast<void>(image); // unused
static_cast<void>(w); // unused
static_cast<void>(h); // unused
static_cast<void>(offset); // unused
static_cast<void>(flip); // unused
#endif
}
@ -362,7 +384,7 @@ public:
static tracy_force_inline void Message( const char* txt, size_t size, int callstack )
{
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
#ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() ) return;
#endif
@ -399,7 +421,7 @@ public:
static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int callstack )
{
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
#ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() ) return;
#endif
@ -442,7 +464,7 @@ public:
static tracy_force_inline void MessageAppInfo( const char* txt, size_t size )
{
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size );
TracyLfqPrepare( QueueType::MessageAppInfo );
@ -676,6 +698,13 @@ public:
return m_isConnected.load( std::memory_order_acquire );
}
tracy_force_inline void SetProgramName( const char* name )
{
m_programNameLock.lock();
m_programName = name;
m_programNameLock.unlock();
}
#ifdef TRACY_ON_DEMAND
tracy_force_inline uint64_t ConnectionId() const
{
@ -730,7 +759,7 @@ public:
static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz )
{
const auto sz32 = uint32_t( 2 + 4 + 4 + functionSz + 1 + sourceSz + 1 + nameSz );
assert( sz32 <= std::numeric_limits<uint16_t>::max() );
assert( sz32 <= (std::numeric_limits<uint16_t>::max)() );
const auto sz = uint16_t( sz32 );
auto ptr = (char*)tracy_malloc( sz );
memcpy( ptr, &sz, 2 );
@ -941,6 +970,10 @@ private:
void ProcessSysTime() {}
#endif
#ifdef TRACY_HAS_SYSPOWER
SysPower m_sysPower;
#endif
ParameterCallback m_paramCallback;
void* m_paramCallbackData;
SourceContentsCallback m_sourceCallback;
@ -959,6 +992,9 @@ private:
} m_prevSignal;
#endif
bool m_crashHandlerInstalled;
const char* m_programName;
TracyMutex m_programNameLock;
};
}

View file

@ -108,7 +108,7 @@ public:
tracy_force_inline void Text( const char* txt, size_t size )
{
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
if( GetProfiler().ConnectionId() != m_connectionId ) return;
@ -123,7 +123,7 @@ public:
tracy_force_inline void Name( const char* txt, size_t size )
{
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
if( !m_active ) return;
#ifdef TRACY_ON_DEMAND
if( GetProfiler().ConnectionId() != m_connectionId ) return;

View file

@ -0,0 +1,164 @@
#include "TracySysPower.hpp"
#ifdef TRACY_HAS_SYSPOWER
#include <sys/types.h>
#include <dirent.h>
#include <chrono>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include "TracyDebug.hpp"
#include "TracyProfiler.hpp"
#include "../common/TracyAlloc.hpp"
namespace tracy
{
SysPower::SysPower()
: m_domains( 4 )
, m_lastTime( 0 )
{
ScanDirectory( "/sys/devices/virtual/powercap/intel-rapl", -1 );
}
SysPower::~SysPower()
{
for( auto& v : m_domains )
{
fclose( v.handle );
// Do not release v.name, as it may be still needed
}
}
void SysPower::Tick()
{
auto t = std::chrono::high_resolution_clock::now().time_since_epoch().count();
if( t - m_lastTime > 10000000 ) // 10 ms
{
m_lastTime = t;
for( auto& v : m_domains )
{
char tmp[32];
if( fread( tmp, 1, 32, v.handle ) > 0 )
{
rewind( v.handle );
auto p = (uint64_t)atoll( tmp );
uint64_t delta;
if( p >= v.value )
{
delta = p - v.value;
}
else
{
delta = v.overflow - v.value + p;
}
v.value = p;
TracyLfqPrepare( QueueType::SysPowerReport );
MemWrite( &item->sysPower.time, Profiler::GetTime() );
MemWrite( &item->sysPower.delta, delta );
MemWrite( &item->sysPower.name, (uint64_t)v.name );
TracyLfqCommit;
}
}
}
}
void SysPower::ScanDirectory( const char* path, int parent )
{
DIR* dir = opendir( path );
if( !dir ) return;
struct dirent* ent;
uint64_t maxRange = 0;
char* name = nullptr;
FILE* handle = nullptr;
while( ( ent = readdir( dir ) ) )
{
if( ent->d_type == DT_REG )
{
if( strcmp( ent->d_name, "max_energy_range_uj" ) == 0 )
{
char tmp[PATH_MAX];
snprintf( tmp, PATH_MAX, "%s/max_energy_range_uj", path );
FILE* f = fopen( tmp, "r" );
if( f )
{
fscanf( f, "%" PRIu64, &maxRange );
fclose( f );
}
}
else if( strcmp( ent->d_name, "name" ) == 0 )
{
char tmp[PATH_MAX];
snprintf( tmp, PATH_MAX, "%s/name", path );
FILE* f = fopen( tmp, "r" );
if( f )
{
char ntmp[128];
if( fgets( ntmp, 128, f ) )
{
// Last character is newline, skip it
const auto sz = strlen( ntmp ) - 1;
if( parent < 0 )
{
name = (char*)tracy_malloc( sz + 1 );
memcpy( name, ntmp, sz );
name[sz] = '\0';
}
else
{
const auto p = m_domains[parent];
const auto psz = strlen( p.name );
name = (char*)tracy_malloc( psz + sz + 2 );
memcpy( name, p.name, psz );
name[psz] = ':';
memcpy( name+psz+1, ntmp, sz );
name[psz+sz+1] = '\0';
}
}
fclose( f );
}
}
else if( strcmp( ent->d_name, "energy_uj" ) == 0 )
{
char tmp[PATH_MAX];
snprintf( tmp, PATH_MAX, "%s/energy_uj", path );
handle = fopen( tmp, "r" );
}
}
if( name && handle && maxRange > 0 ) break;
}
if( name && handle && maxRange > 0 )
{
parent = (int)m_domains.size();
Domain* domain = m_domains.push_next();
domain->value = 0;
domain->overflow = maxRange;
domain->handle = handle;
domain->name = name;
TracyDebug( "Power domain id %i, %s found at %s\n", parent, name, path );
}
else
{
if( name ) tracy_free( name );
if( handle ) fclose( handle );
}
rewinddir( dir );
while( ( ent = readdir( dir ) ) )
{
if( ent->d_type == DT_DIR && strncmp( ent->d_name, "intel-rapl:", 11 ) == 0 )
{
char tmp[PATH_MAX];
snprintf( tmp, PATH_MAX, "%s/%s", path, ent->d_name );
ScanDirectory( tmp, parent );
}
}
closedir( dir );
}
}
#endif

View file

@ -0,0 +1,44 @@
#ifndef __TRACYSYSPOWER_HPP__
#define __TRACYSYSPOWER_HPP__
#if defined __linux__
# define TRACY_HAS_SYSPOWER
#endif
#ifdef TRACY_HAS_SYSPOWER
#include <stdint.h>
#include <stdio.h>
#include "TracyFastVector.hpp"
namespace tracy
{
class SysPower
{
struct Domain
{
uint64_t value;
uint64_t overflow;
FILE* handle;
const char* name;
};
public:
SysPower();
~SysPower();
void Tick();
private:
void ScanDirectory( const char* path, int parent );
FastVector<Domain> m_domains;
uint64_t m_lastTime;
};
}
#endif
#endif

View file

@ -770,6 +770,13 @@ bool SysTraceStart( int64_t& samplingPeriod )
TracyDebug( "sched_wakeup id: %i\n", wakeupId );
TracyDebug( "drm_vblank_event id: %i\n", vsyncId );
#ifdef TRACY_NO_SAMPLING
const bool noSoftwareSampling = true;
#else
const char* noSoftwareSamplingEnv = GetEnvVar( "TRACY_NO_SAMPLING" );
const bool noSoftwareSampling = noSoftwareSamplingEnv && noSoftwareSamplingEnv[0] == '1';
#endif
#ifdef TRACY_NO_SAMPLE_RETIREMENT
const bool noRetirement = true;
#else
@ -839,28 +846,31 @@ bool SysTraceStart( int64_t& samplingPeriod )
pe.clockid = CLOCK_MONOTONIC_RAW;
#endif
TracyDebug( "Setup software sampling\n" );
ProbePreciseIp( pe, currentPid );
for( int i=0; i<s_numCpus; i++ )
if( !noSoftwareSampling )
{
int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
if( fd == -1 )
TracyDebug( "Setup software sampling\n" );
ProbePreciseIp( pe, currentPid );
for( int i=0; i<s_numCpus; i++ )
{
pe.exclude_kernel = 1;
ProbePreciseIp( pe, currentPid );
fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
if( fd == -1 )
{
TracyDebug( " Failed to setup!\n");
break;
pe.exclude_kernel = 1;
ProbePreciseIp( pe, currentPid );
fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
if( fd == -1 )
{
TracyDebug( " Failed to setup!\n");
break;
}
TracyDebug( " No access to kernel samples\n" );
}
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack );
if( s_ring[s_numBuffers].IsValid() )
{
s_numBuffers++;
TracyDebug( " Core %i ok\n", i );
}
TracyDebug( " No access to kernel samples\n" );
}
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack );
if( s_ring[s_numBuffers].IsValid() )
{
s_numBuffers++;
TracyDebug( " Core %i ok\n", i );
}
}

View file

@ -147,7 +147,7 @@
# if defined(__APPLE__)
# include <TargetConditionals.h>
# if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
# include <mach/mach_vm.h>
# include <mach/mach.h>
# include <mach/vm_statistics.h>
# endif
# include <pthread.h>

View file

@ -9,14 +9,14 @@ namespace tracy
constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; }
enum : uint32_t { ProtocolVersion = 63 };
enum : uint32_t { ProtocolVersion = 64 };
enum : uint16_t { BroadcastVersion = 3 };
using lz4sz_t = uint32_t;
enum { TargetFrameSize = 256 * 1024 };
enum { LZ4Size = Lz4CompressBound( TargetFrameSize ) };
static_assert( LZ4Size <= std::numeric_limits<lz4sz_t>::max(), "LZ4Size greater than lz4sz_t" );
static_assert( LZ4Size <= (std::numeric_limits<lz4sz_t>::max)(), "LZ4Size greater than lz4sz_t" );
static_assert( TargetFrameSize * 2 >= 64 * 1024, "Not enough space for LZ4 stream buffer" );
enum { HandshakeShibbolethSize = 8 };

View file

@ -90,6 +90,7 @@ enum class QueueType : uint8_t
GpuNewContext,
CallstackFrame,
SysTimeReport,
SysPowerReport,
TidToPid,
HwSampleCpuCycle,
HwSampleInstructionRetired,
@ -563,6 +564,13 @@ struct QueueSysTime
float sysTime;
};
struct QueueSysPower
{
int64_t time;
uint64_t delta;
uint64_t name; // ptr
};
struct QueueContextSwitch
{
int64_t time;
@ -729,6 +737,7 @@ struct QueueItem
QueueCrashReport crashReport;
QueueCrashReportThread crashReportThread;
QueueSysTime sysTime;
QueueSysPower sysPower;
QueueContextSwitch contextSwitch;
QueueThreadWakeup threadWakeup;
QueueTidToPid tidToPid;
@ -832,6 +841,7 @@ static constexpr size_t QueueDataSize[] = {
sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ),
sizeof( QueueHeader ) + sizeof( QueueCallstackFrame ),
sizeof( QueueHeader ) + sizeof( QueueSysTime ),
sizeof( QueueHeader ) + sizeof( QueueSysPower ),
sizeof( QueueHeader ) + sizeof( QueueTidToPid ),
sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cpu cycle
sizeof( QueueHeader ) + sizeof( QueueHwSample ), // instruction retired

View file

@ -353,7 +353,7 @@ int Socket::Recv( void* _buf, int len, int timeout )
}
}
int Socket::ReadUpTo( void* _buf, int len, int timeout )
int Socket::ReadUpTo( void* _buf, int len )
{
const auto sock = m_sock.load( std::memory_order_relaxed );
auto buf = (char*)_buf;
@ -678,10 +678,10 @@ bool UdpListen::Listen( uint16_t port )
#endif
#if defined _WIN32
unsigned long reuse = 1;
setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) );
setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof( reuse ) );
#else
int reuse = 1;
setsockopt( m_sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) );
setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof( reuse ) );
#endif
#if defined _WIN32
unsigned long broadcast = 1;

View file

@ -29,7 +29,7 @@ public:
int Send( const void* buf, int len );
int GetSendBufSize();
int ReadUpTo( void* buf, int len, int timeout );
int ReadUpTo( void* buf, int len );
bool Read( void* buf, int len, int timeout );
template<typename ShouldExit>

View file

@ -213,21 +213,24 @@ TRACY_API const char* GetThreadName( uint32_t id )
# else
static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" );
# endif
if( _GetThreadDescription )
{
auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id );
if( hnd != 0 )
{
PWSTR tmp;
_GetThreadDescription( hnd, &tmp );
auto ret = wcstombs( buf, tmp, 256 );
CloseHandle( hnd );
if( ret != 0 )
{
return buf;
}
}
}
if( _GetThreadDescription )
{
auto hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, (DWORD)id );
if( hnd != 0 )
{
PWSTR tmp;
if( SUCCEEDED( _GetThreadDescription( hnd, &tmp ) ) )
{
auto ret = wcstombs( buf, tmp, 256 );
CloseHandle( hnd );
LocalFree( tmp );
if( ret != static_cast<size_t>( -1 ) )
{
return buf;
}
}
}
}
#elif defined __linux__
int cs, fd;
char path[32];

View file

@ -6,8 +6,8 @@ namespace tracy
namespace Version
{
enum { Major = 0 };
enum { Minor = 9 };
enum { Patch = 1 };
enum { Minor = 10 };
enum { Patch = 0 };
}
}

View file

@ -1,4 +1,8 @@
#include <limits.h>
#if defined(__linux__) && !defined(__GLIBC__) && !defined(__WORDSIZE)
// include __WORDSIZE headers for musl
# include <bits/reg.h>
#endif
#if __WORDSIZE == 64
# define BACKTRACE_ELF_SIZE 64
#else

View file

@ -473,7 +473,7 @@ enum attr_val_encoding
/* An address. */
ATTR_VAL_ADDRESS,
/* An index into the .debug_addr section, whose value is relative to
* the DW_AT_addr_base attribute of the compilation unit. */
the DW_AT_addr_base attribute of the compilation unit. */
ATTR_VAL_ADDRESS_INDEX,
/* A unsigned integer. */
ATTR_VAL_UINT,
@ -611,8 +611,8 @@ struct function
struct function_addrs
{
/* Range is LOW <= PC < HIGH. */
uint64_t low;
uint64_t high;
uintptr_t low;
uintptr_t high;
/* Function for this address range. */
struct function *function;
};
@ -693,8 +693,8 @@ struct unit
struct unit_addrs
{
/* Range is LOW <= PC < HIGH. */
uint64_t low;
uint64_t high;
uintptr_t low;
uintptr_t high;
/* Compilation unit for this address range. */
struct unit *u;
};
@ -1431,7 +1431,7 @@ resolve_addr_index (const struct dwarf_sections *dwarf_sections,
uint64_t addr_base, int addrsize, int is_bigendian,
uint64_t addr_index,
backtrace_error_callback error_callback, void *data,
uint64_t *address)
uintptr_t *address)
{
uint64_t offset;
struct dwarf_buf addr_buf;
@ -1452,7 +1452,7 @@ resolve_addr_index (const struct dwarf_sections *dwarf_sections,
addr_buf.data = data;
addr_buf.reported_underflow = 0;
*address = read_address (&addr_buf, addrsize);
*address = (uintptr_t) read_address (&addr_buf, addrsize);
return 1;
}
@ -1531,7 +1531,7 @@ function_addrs_search (const void *vkey, const void *ventry)
static int
add_unit_addr (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc,
uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback, void *data,
void *pvec)
{
@ -1867,10 +1867,10 @@ lookup_abbrev (struct abbrevs *abbrevs, uint64_t code,
lowpc/highpc is set or ranges is set. */
struct pcrange {
uint64_t lowpc; /* The low PC value. */
uintptr_t lowpc; /* The low PC value. */
int have_lowpc; /* Whether a low PC value was found. */
int lowpc_is_addr_index; /* Whether lowpc is in .debug_addr. */
uint64_t highpc; /* The high PC value. */
uintptr_t highpc; /* The high PC value. */
int have_highpc; /* Whether a high PC value was found. */
int highpc_is_relative; /* Whether highpc is relative to lowpc. */
int highpc_is_addr_index; /* Whether highpc is in .debug_addr. */
@ -1890,12 +1890,12 @@ update_pcrange (const struct attr* attr, const struct attr_val* val,
case DW_AT_low_pc:
if (val->encoding == ATTR_VAL_ADDRESS)
{
pcrange->lowpc = val->u.uint;
pcrange->lowpc = (uintptr_t) val->u.uint;
pcrange->have_lowpc = 1;
}
else if (val->encoding == ATTR_VAL_ADDRESS_INDEX)
{
pcrange->lowpc = val->u.uint;
pcrange->lowpc = (uintptr_t) val->u.uint;
pcrange->have_lowpc = 1;
pcrange->lowpc_is_addr_index = 1;
}
@ -1904,18 +1904,18 @@ update_pcrange (const struct attr* attr, const struct attr_val* val,
case DW_AT_high_pc:
if (val->encoding == ATTR_VAL_ADDRESS)
{
pcrange->highpc = val->u.uint;
pcrange->highpc = (uintptr_t) val->u.uint;
pcrange->have_highpc = 1;
}
else if (val->encoding == ATTR_VAL_UINT)
{
pcrange->highpc = val->u.uint;
pcrange->highpc = (uintptr_t) val->u.uint;
pcrange->have_highpc = 1;
pcrange->highpc_is_relative = 1;
}
else if (val->encoding == ATTR_VAL_ADDRESS_INDEX)
{
pcrange->highpc = val->u.uint;
pcrange->highpc = (uintptr_t) val->u.uint;
pcrange->have_highpc = 1;
pcrange->highpc_is_addr_index = 1;
}
@ -1950,16 +1950,16 @@ add_low_high_range (struct backtrace_state *state,
uintptr_t base_address, int is_bigendian,
struct unit *u, const struct pcrange *pcrange,
int (*add_range) (struct backtrace_state *state,
void *rdata, uint64_t lowpc,
uint64_t highpc,
void *rdata, uintptr_t lowpc,
uintptr_t highpc,
backtrace_error_callback error_callback,
void *data, void *vec),
void *rdata,
backtrace_error_callback error_callback, void *data,
void *vec)
{
uint64_t lowpc;
uint64_t highpc;
uintptr_t lowpc;
uintptr_t highpc;
lowpc = pcrange->lowpc;
if (pcrange->lowpc_is_addr_index)
@ -1997,10 +1997,10 @@ add_ranges_from_ranges (
struct backtrace_state *state,
const struct dwarf_sections *dwarf_sections,
uintptr_t base_address, int is_bigendian,
struct unit *u, uint64_t base,
struct unit *u, uintptr_t base,
const struct pcrange *pcrange,
int (*add_range) (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc,
uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback, void *data,
void *vec),
void *rdata,
@ -2039,12 +2039,12 @@ add_ranges_from_ranges (
break;
if (is_highest_address (low, u->addrsize))
base = high;
base = (uintptr_t) high;
else
{
if (!add_range (state, rdata,
low + base + base_address,
high + base + base_address,
(uintptr_t) low + base + base_address,
(uintptr_t) high + base + base_address,
error_callback, data, vec))
return 0;
}
@ -2064,10 +2064,10 @@ add_ranges_from_rnglists (
struct backtrace_state *state,
const struct dwarf_sections *dwarf_sections,
uintptr_t base_address, int is_bigendian,
struct unit *u, uint64_t base,
struct unit *u, uintptr_t base,
const struct pcrange *pcrange,
int (*add_range) (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc,
uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback, void *data,
void *vec),
void *rdata,
@ -2133,8 +2133,8 @@ add_ranges_from_rnglists (
case DW_RLE_startx_endx:
{
uint64_t index;
uint64_t low;
uint64_t high;
uintptr_t low;
uintptr_t high;
index = read_uleb128 (&rnglists_buf);
if (!resolve_addr_index (dwarf_sections, u->addr_base,
@ -2156,8 +2156,8 @@ add_ranges_from_rnglists (
case DW_RLE_startx_length:
{
uint64_t index;
uint64_t low;
uint64_t length;
uintptr_t low;
uintptr_t length;
index = read_uleb128 (&rnglists_buf);
if (!resolve_addr_index (dwarf_sections, u->addr_base,
@ -2187,16 +2187,16 @@ add_ranges_from_rnglists (
break;
case DW_RLE_base_address:
base = read_address (&rnglists_buf, u->addrsize);
base = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
break;
case DW_RLE_start_end:
{
uint64_t low;
uint64_t high;
uintptr_t low;
uintptr_t high;
low = read_address (&rnglists_buf, u->addrsize);
high = read_address (&rnglists_buf, u->addrsize);
low = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
high = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
if (!add_range (state, rdata, low + base_address,
high + base_address, error_callback, data,
vec))
@ -2206,11 +2206,11 @@ add_ranges_from_rnglists (
case DW_RLE_start_length:
{
uint64_t low;
uint64_t length;
uintptr_t low;
uintptr_t length;
low = read_address (&rnglists_buf, u->addrsize);
length = read_uleb128 (&rnglists_buf);
low = (uintptr_t) read_address (&rnglists_buf, u->addrsize);
length = (uintptr_t) read_uleb128 (&rnglists_buf);
low += base_address;
if (!add_range (state, rdata, low, low + length,
error_callback, data, vec))
@ -2240,9 +2240,9 @@ static int
add_ranges (struct backtrace_state *state,
const struct dwarf_sections *dwarf_sections,
uintptr_t base_address, int is_bigendian,
struct unit *u, uint64_t base, const struct pcrange *pcrange,
struct unit *u, uintptr_t base, const struct pcrange *pcrange,
int (*add_range) (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc,
uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback,
void *data, void *vec),
void *rdata,
@ -3520,7 +3520,7 @@ read_referenced_name (struct dwarf_data *ddata, struct unit *u,
static int
add_function_range (struct backtrace_state *state, void *rdata,
uint64_t lowpc, uint64_t highpc,
uintptr_t lowpc, uintptr_t highpc,
backtrace_error_callback error_callback, void *data,
void *pvec)
{
@ -3560,7 +3560,7 @@ add_function_range (struct backtrace_state *state, void *rdata,
static int
read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata,
struct unit *u, uint64_t base, struct dwarf_buf *unit_buf,
struct unit *u, uintptr_t base, struct dwarf_buf *unit_buf,
const struct line_header *lhdr,
backtrace_error_callback error_callback, void *data,
struct function_vector *vec_function,
@ -3624,7 +3624,7 @@ read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata,
&& abbrev->attrs[i].name == DW_AT_low_pc)
{
if (val.encoding == ATTR_VAL_ADDRESS)
base = val.u.uint;
base = (uintptr_t) val.u.uint;
else if (val.encoding == ATTR_VAL_ADDRESS_INDEX)
{
if (!resolve_addr_index (&ddata->dwarf_sections,

View file

@ -2823,18 +2823,18 @@ elf_zstd_read_fse (const unsigned char **ppin, const unsigned char *pinend,
while ((val & 0xfff) == 0xfff)
{
zidx += 3 * 6;
if (!elf_fetch_bits (&pin, pinend, &val, &bits))
return 0;
val >>= 12;
bits -= 12;
if (!elf_fetch_bits (&pin, pinend, &val, &bits))
return 0;
}
while ((val & 3) == 3)
{
zidx += 3;
if (!elf_fetch_bits (&pin, pinend, &val, &bits))
return 0;
val >>= 2;
bits -= 2;
if (!elf_fetch_bits (&pin, pinend, &val, &bits))
return 0;
}
/* We have at least 13 bits here, don't need to fetch. */
zidx += val & 3;
@ -2964,7 +2964,7 @@ elf_zstd_build_fse (const int16_t *norm, int idx, uint16_t *next,
pos = (pos + step) & mask;
}
}
if (pos != 0)
if (unlikely (pos != 0))
{
elf_uncompress_failed ();
return 0;
@ -3440,17 +3440,17 @@ static const struct elf_zstd_fse_baseline_entry elf_zstd_match_table[64] =
static const struct elf_zstd_fse_baseline_entry elf_zstd_offset_table[32] =
{
{ 1, 0, 5, 0 }, { 64, 6, 4, 0 }, { 512, 9, 5, 0 },
{ 32768, 15, 5, 0 }, { 2097152, 21, 5, 0 }, { 8, 3, 5, 0 },
{ 128, 7, 4, 0 }, { 4096, 12, 5, 0 }, { 262144, 18, 5, 0 },
{ 8388608, 23, 5, 0 }, { 32, 5, 5, 0 }, { 256, 8, 4, 0 },
{ 16384, 14, 5, 0 }, { 1048576, 20, 5, 0 }, { 4, 2, 5, 0 },
{ 128, 7, 4, 16 }, { 2048, 11, 5, 0 }, { 131072, 17, 5, 0 },
{ 4194304, 22, 5, 0 }, { 16, 4, 5, 0 }, { 256, 8, 4, 16 },
{ 8192, 13, 5, 0 }, { 524288, 19, 5, 0 }, { 2, 1, 5, 0 },
{ 64, 6, 4, 16 }, { 1024, 10, 5, 0 }, { 65536, 16, 5, 0 },
{ 268435456, 28, 5, 0 }, { 134217728, 27, 5, 0 }, { 67108864, 26, 5, 0 },
{ 33554432, 25, 5, 0 }, { 16777216, 24, 5, 0 },
{ 1, 0, 5, 0 }, { 61, 6, 4, 0 }, { 509, 9, 5, 0 },
{ 32765, 15, 5, 0 }, { 2097149, 21, 5, 0 }, { 5, 3, 5, 0 },
{ 125, 7, 4, 0 }, { 4093, 12, 5, 0 }, { 262141, 18, 5, 0 },
{ 8388605, 23, 5, 0 }, { 29, 5, 5, 0 }, { 253, 8, 4, 0 },
{ 16381, 14, 5, 0 }, { 1048573, 20, 5, 0 }, { 1, 2, 5, 0 },
{ 125, 7, 4, 16 }, { 2045, 11, 5, 0 }, { 131069, 17, 5, 0 },
{ 4194301, 22, 5, 0 }, { 13, 4, 5, 0 }, { 253, 8, 4, 16 },
{ 8189, 13, 5, 0 }, { 524285, 19, 5, 0 }, { 2, 1, 5, 0 },
{ 61, 6, 4, 16 }, { 1021, 10, 5, 0 }, { 65533, 16, 5, 0 },
{ 268435453, 28, 5, 0 }, { 134217725, 27, 5, 0 }, { 67108861, 26, 5, 0 },
{ 33554429, 25, 5, 0 }, { 16777213, 24, 5, 0 },
};
/* Read a zstd Huffman table and build the decoding table in *TABLE, reading
@ -3635,7 +3635,7 @@ elf_zstd_read_huff (const unsigned char **ppin, const unsigned char *pinend,
}
weight_mark = (uint32_t *) (weights + 256);
memset (weight_mark, 0, 12 * sizeof (uint32_t));
memset (weight_mark, 0, 13 * sizeof (uint32_t));
weight_mask = 0;
for (i = 0; i < count; ++i)
{
@ -3702,7 +3702,7 @@ elf_zstd_read_huff (const unsigned char **ppin, const unsigned char *pinend,
/* Change WEIGHT_MARK from a count of weights to the index of the first
symbol for that weight. We shift the indexes to also store how many we
hae seen so far, below. */
have seen so far, below. */
{
uint32_t next;
@ -3783,7 +3783,7 @@ elf_zstd_read_literals (const unsigned char **ppin,
{
int raw;
/* Raw_literals_Block or RLE_Literals_Block */
/* Raw_Literals_Block or RLE_Literals_Block */
raw = (hdr & 3) == 0;
@ -3965,7 +3965,7 @@ elf_zstd_read_literals (const unsigned char **ppin,
unsigned int bits;
uint32_t i;
pback = pin + compressed_size - 1;
pback = pin + total_streams_size - 1;
pbackend = pin;
if (!elf_fetch_backward_init (&pback, pbackend, &val, &bits))
return 0;

View file

@ -109,6 +109,7 @@
#define TracyParameterRegister(x,y)
#define TracyParameterSetup(x,y,z,w)
#define TracyIsConnected false
#define TracySetProgramName(x)
#define TracyFiberEnter(x)
#define TracyFiberLeave
@ -270,6 +271,7 @@
#define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data )
#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val )
#define TracyIsConnected tracy::GetProfiler().IsConnected()
#define TracySetProgramName( name ) tracy::GetProfiler().SetProgramName( name );
#ifdef TRACY_FIBERS
# define TracyFiberEnter( fiber ) tracy::Profiler::EnterFiber( fiber )

View file

@ -11,6 +11,14 @@
extern "C" {
#endif
enum TracyPlotFormatEnum
{
TracyPlotFormatNumber,
TracyPlotFormatMemory,
TracyPlotFormatPercentage,
TracyPlotFormatWatt
};
TRACY_API void ___tracy_set_thread_name( const char* name );
#define TracyCSetThreadName( name ) ___tracy_set_thread_name( name );
@ -60,6 +68,8 @@ typedef const void* TracyCZoneCtx;
#define TracyCPlot(x,y)
#define TracyCPlotF(x,y)
#define TracyCPlotI(x,y)
#define TracyCPlotConfig(x,y,z,w,a)
#define TracyCMessage(x,y)
#define TracyCMessageL(x)
#define TracyCMessageC(x,y,z)
@ -289,11 +299,13 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_
TRACY_API void ___tracy_emit_plot( const char* name, double val );
TRACY_API void ___tracy_emit_plot_float( const char* name, float val );
TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val );
TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color );
TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size );
#define TracyCPlot( name, val ) ___tracy_emit_plot( name, val );
#define TracyCPlotF( name, val ) ___tracy_emit_plot_float( name, val );
#define TracyCPlotI( name, val ) ___tracy_emit_plot_int( name, val );
#define TracyCPlotConfig( name, type, step, fill, color ) ___tracy_emit_plot_config( name, type, step, fill, color );
#define TracyCAppInfo( txt, size ) ___tracy_emit_message_appinfo( txt, size );

View file

@ -13,13 +13,13 @@
#define TracyD3D11ZoneC(ctx, name, color)
#define TracyD3D11NamedZone(ctx, varname, name, active)
#define TracyD3D11NamedZoneC(ctx, varname, name, color, active)
#define TracyD3D12ZoneTransient(ctx, varname, name, active)
#define TracyD3D11ZoneTransient(ctx, varname, name, active)
#define TracyD3D11ZoneS(ctx, name, depth)
#define TracyD3D11ZoneCS(ctx, name, color, depth)
#define TracyD3D11NamedZoneS(ctx, varname, name, depth, active)
#define TracyD3D11NamedZoneCS(ctx, varname, name, color, depth, active)
#define TracyD3D12ZoneTransientS(ctx, varname, name, depth, active)
#define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active)
#define TracyD3D11Collect(ctx)
@ -39,11 +39,12 @@ using TracyD3D11Ctx = void*;
#include "Tracy.hpp"
#include "../client/TracyProfiler.hpp"
#include "../client/TracyCallstack.hpp"
#include "../common/TracyAlign.hpp"
#include "../common/TracyAlloc.hpp"
#include "../common/TracyYield.hpp"
#include <d3d11.h>
#define TracyD3D11Panic(msg, ...) do { assert(false && "TracyD3D11: " msg); TracyMessageLC("TracyD3D11: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false);
namespace tracy
{
@ -51,71 +52,83 @@ class D3D11Ctx
{
friend class D3D11ZoneScope;
enum { QueryCount = 64 * 1024 };
static constexpr uint32_t MaxQueries = 64 * 1024;
enum CollectMode { POLL, BLOCK };
public:
D3D11Ctx( ID3D11Device* device, ID3D11DeviceContext* devicectx )
: m_device( device )
, m_devicectx( devicectx )
, m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) )
, m_head( 0 )
, m_tail( 0 )
{
assert( m_context != 255 );
// TODO: consider calling ID3D11Device::GetImmediateContext() instead of passing it as an argument
m_device = device;
device->AddRef();
m_immediateDevCtx = devicectx;
devicectx->AddRef();
for (int i = 0; i < QueryCount; i++)
{
HRESULT hr = S_OK;
D3D11_QUERY_DESC desc;
desc.MiscFlags = 0;
desc.Query = D3D11_QUERY_TIMESTAMP;
hr |= device->CreateQuery(&desc, &m_queries[i]);
D3D11_QUERY_DESC desc = { };
desc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
hr |= device->CreateQuery(&desc, &m_disjoints[i]);
m_disjointMap[i] = nullptr;
assert(SUCCEEDED(hr));
if (FAILED(m_device->CreateQuery(&desc, &m_disjointQuery)))
{
TracyD3D11Panic("unable to create disjoint timestamp query.", return);
}
}
// Force query the initial GPU timestamp (pipeline stall)
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint;
UINT64 timestamp;
for (ID3D11Query*& query : m_queries)
{
D3D11_QUERY_DESC desc = { };
desc.Query = D3D11_QUERY_TIMESTAMP;
if (FAILED(m_device->CreateQuery(&desc, &query)))
{
TracyD3D11Panic("unable to create timestamp query.", return);
}
}
// Calibrate CPU and GPU timestamps
int64_t tcpu = 0;
int64_t tgpu = 0;
for (int attempts = 0; attempts < 50; attempts++)
{
devicectx->Begin(m_disjoints[0]);
devicectx->End(m_queries[0]);
devicectx->End(m_disjoints[0]);
devicectx->Flush();
m_immediateDevCtx->Begin(m_disjointQuery);
m_immediateDevCtx->End(m_queries[0]);
m_immediateDevCtx->End(m_disjointQuery);
while (devicectx->GetData(m_disjoints[0], &disjoint, sizeof(disjoint), 0) == S_FALSE)
/* Nothing */;
int64_t tcpu0 = Profiler::GetTime();
WaitForQuery(m_disjointQuery);
int64_t tcpu1 = Profiler::GetTime();
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { };
if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), 0) != S_OK)
{
TracyMessageLC("TracyD3D11: unable to query GPU timestamp; retrying...", tracy::Color::Tomato);
continue;
}
if (disjoint.Disjoint)
continue;
while (devicectx->GetData(m_queries[0], &timestamp, sizeof(timestamp), 0) == S_FALSE)
/* Nothing */;
UINT64 timestamp = 0;
if (m_immediateDevCtx->GetData(m_queries[0], &timestamp, sizeof(timestamp), 0) != S_OK)
continue; // this should never happen, since the enclosing disjoint query succeeded
tcpu = tcpu0 + (tcpu1 - tcpu0) * 1 / 2;
tgpu = timestamp * (1000000000 / disjoint.Frequency);
break;
}
int64_t tgpu = timestamp * (1000000000ull / disjoint.Frequency);
int64_t tcpu = Profiler::GetTime();
// ready to roll
m_contextId = GetGpuCtxCounter().fetch_add(1);
m_immediateDevCtx->Begin(m_disjointQuery);
m_previousCheckpoint = m_nextCheckpoint = 0;
uint8_t flags = 0;
const float period = 1.f;
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuNewContext );
MemWrite( &item->gpuNewContext.cpuTime, tcpu );
MemWrite( &item->gpuNewContext.gpuTime, tgpu );
memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
MemWrite( &item->gpuNewContext.period, period );
MemWrite( &item->gpuNewContext.context, m_context );
MemWrite( &item->gpuNewContext.flags, flags );
MemWrite( &item->gpuNewContext.thread, uint32_t(0) ); // #TODO: why not GetThreadHandle()?
MemWrite( &item->gpuNewContext.period, 1.0f );
MemWrite( &item->gpuNewContext.context, m_contextId);
MemWrite( &item->gpuNewContext.flags, uint8_t(0) );
MemWrite( &item->gpuNewContext.type, GpuContextType::Direct3D11 );
#ifdef TRACY_ON_DEMAND
@ -127,12 +140,20 @@ public:
~D3D11Ctx()
{
for (int i = 0; i < QueryCount; i++)
// collect all pending timestamps before destroying everything
do
{
m_queries[i]->Release();
m_disjoints[i]->Release();
m_disjointMap[i] = nullptr;
Collect(BLOCK);
} while (m_previousCheckpoint != m_queryCounter);
for (ID3D11Query* query : m_queries)
{
query->Release();
}
m_immediateDevCtx->End(m_disjointQuery);
m_disjointQuery->Release();
m_immediateDevCtx->Release();
m_device->Release();
}
void Name( const char* name, uint16_t len )
@ -142,7 +163,7 @@ public:
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuContextName );
MemWrite( &item->gpuContextNameFat.context, m_context );
MemWrite( &item->gpuContextNameFat.context, m_contextId );
MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
MemWrite( &item->gpuContextNameFat.size, len );
#ifdef TRACY_ON_DEMAND
@ -151,217 +172,170 @@ public:
Profiler::QueueSerialFinish();
}
void Collect()
void Collect(CollectMode mode = POLL)
{
ZoneScopedC( Color::Red4 );
if( m_tail == m_head ) return;
#ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() )
{
m_head = m_tail = 0;
m_previousCheckpoint = m_nextCheckpoint = m_queryCounter;
return;
}
#endif
auto start = m_tail;
auto end = m_head + QueryCount;
auto cnt = (end - start) % QueryCount;
while (cnt > 1)
if (m_previousCheckpoint == m_nextCheckpoint)
{
auto mid = start + cnt / 2;
bool available =
m_devicectx->GetData(m_disjointMap[mid % QueryCount], nullptr, 0, D3D11_ASYNC_GETDATA_DONOTFLUSH) == S_OK &&
m_devicectx->GetData(m_queries[mid % QueryCount], nullptr, 0, D3D11_ASYNC_GETDATA_DONOTFLUSH) == S_OK;
if (available)
uintptr_t nextCheckpoint = m_queryCounter;
if (nextCheckpoint == m_nextCheckpoint)
{
start = mid;
return;
}
else
{
end = mid;
}
cnt = (end - start) % QueryCount;
m_nextCheckpoint = nextCheckpoint;
m_immediateDevCtx->End(m_disjointQuery);
}
start %= QueryCount;
while (m_tail != start)
if (mode == CollectMode::BLOCK)
{
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint;
UINT64 time;
WaitForQuery(m_disjointQuery);
}
m_devicectx->GetData(m_disjointMap[m_tail], &disjoint, sizeof(disjoint), 0);
m_devicectx->GetData(m_queries[m_tail], &time, sizeof(time), 0);
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { };
if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), D3D11_ASYNC_GETDATA_DONOTFLUSH) != S_OK)
{
return;
}
time *= (1000000000ull / disjoint.Frequency);
if (disjoint.Disjoint == TRUE)
{
m_previousCheckpoint = m_nextCheckpoint;
TracyD3D11Panic("disjoint timestamps detected; dropping.");
return;
}
auto begin = m_previousCheckpoint;
auto end = m_nextCheckpoint;
for (auto i = begin; i != end; ++i)
{
uint32_t k = RingIndex(i);
UINT64 timestamp = 0;
if (m_immediateDevCtx->GetData(m_queries[k], &timestamp, sizeof(timestamp), 0) != S_OK)
{
TracyD3D11Panic("timestamp expected to be ready, but it was not!");
break;
}
timestamp *= (1000000000ull / disjoint.Frequency);
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuTime);
MemWrite(&item->gpuTime.gpuTime, (int64_t)time);
MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail);
MemWrite(&item->gpuTime.context, m_context);
MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(timestamp));
MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(k));
MemWrite(&item->gpuTime.context, m_contextId);
Profiler::QueueSerialFinish();
m_tail = (m_tail + 1) % QueryCount;
}
// disjoint timestamp queries should only be invoked once per frame or less
// https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_query
m_immediateDevCtx->Begin(m_disjointQuery);
m_previousCheckpoint = m_nextCheckpoint;
}
private:
tracy_force_inline unsigned int NextQueryId()
tracy_force_inline uint32_t RingIndex(uintptr_t index)
{
const auto id = m_head;
m_head = ( m_head + 1 ) % QueryCount;
assert( m_head != m_tail );
return id;
index %= MaxQueries;
return static_cast<uint32_t>(index);
}
tracy_force_inline ID3D11Query* TranslateQueryId( unsigned int id )
tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end)
{
// wrap-around safe: all unsigned
uintptr_t count = end - begin;
return static_cast<uint32_t>(count);
}
tracy_force_inline uint32_t NextQueryId()
{
auto id = m_queryCounter++;
if (RingCount(m_previousCheckpoint, id) >= MaxQueries)
{
TracyD3D11Panic("too many pending timestamp queries.");
// #TODO: return some sentinel value; ideally a "hidden" query index
}
return RingIndex(id);
}
tracy_force_inline ID3D11Query* GetQueryObjectFromId(uint32_t id)
{
return m_queries[id];
}
tracy_force_inline ID3D11Query* MapDisjointQueryId( unsigned int id, unsigned int disjointId )
tracy_force_inline void WaitForQuery(ID3D11Query* query)
{
m_disjointMap[id] = m_disjoints[disjointId];
return m_disjoints[disjointId];
m_immediateDevCtx->Flush();
while (m_immediateDevCtx->GetData(query, nullptr, 0, 0) != S_OK)
YieldThread(); // busy-wait :-( attempt to reduce power usage with _mm_pause() & friends...
}
tracy_force_inline uint8_t GetId() const
tracy_force_inline uint8_t GetContextId() const
{
return m_context;
return m_contextId;
}
ID3D11Device* m_device;
ID3D11DeviceContext* m_devicectx;
ID3D11Device* m_device = nullptr;
ID3D11DeviceContext* m_immediateDevCtx = nullptr;
ID3D11Query* m_queries[QueryCount];
ID3D11Query* m_disjoints[QueryCount];
ID3D11Query* m_disjointMap[QueryCount]; // Multiple time queries can have one disjoint query
uint8_t m_context;
ID3D11Query* m_queries[MaxQueries];
ID3D11Query* m_disjointQuery = nullptr;
unsigned int m_head;
unsigned int m_tail;
uint8_t m_contextId = 255; // NOTE: apparently, 255 means invalid id; is this documented anywhere?
uintptr_t m_queryCounter = 0;
uintptr_t m_previousCheckpoint = 0;
uintptr_t m_nextCheckpoint = 0;
};
class D3D11ZoneScope
{
public:
tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool is_active )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool active )
: D3D11ZoneScope(ctx, active)
{
if( !m_active ) return;
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
m_disjointId = queryId;
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial );
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetId() );
Profiler::QueueSerialFinish();
WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast<uint64_t>(srcloc));
}
tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool is_active )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( is_active )
#endif
tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool active )
: D3D11ZoneScope(ctx, active)
{
if( !m_active ) return;
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
m_disjointId = queryId;
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial );
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, ctx->GetId() );
Profiler::QueueSerialFinish();
GetProfiler().SendCallstack( depth );
auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast<uint64_t>(srcloc));
}
tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool active)
#ifdef TRACY_ON_DEMAND
: m_active(active&& GetProfiler().IsConnected())
#else
: m_active(active)
#endif
: D3D11ZoneScope(ctx, active)
{
if( !m_active ) return;
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
m_disjointId = queryId;
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish();
WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation);
}
tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool active)
#ifdef TRACY_ON_DEMAND
: m_active(active&& GetProfiler().IsConnected())
#else
: m_active(active)
#endif
: D3D11ZoneScope(ctx, active)
{
if( !m_active ) return;
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
ctx->m_devicectx->Begin(ctx->MapDisjointQueryId(queryId, queryId));
ctx->m_devicectx->End(ctx->TranslateQueryId(queryId));
m_disjointId = queryId;
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish();
WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation);
}
tracy_force_inline ~D3D11ZoneScope()
@ -369,24 +343,46 @@ public:
if( !m_active ) return;
const auto queryId = m_ctx->NextQueryId();
m_ctx->m_devicectx->End(m_ctx->TranslateQueryId(queryId));
m_ctx->m_devicectx->End(m_ctx->MapDisjointQueryId(queryId, m_disjointId));
m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId));
auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial );
MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneEnd.context, m_ctx->GetId() );
MemWrite( &item->gpuZoneEnd.context, m_ctx->GetContextId() );
Profiler::QueueSerialFinish();
}
private:
tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, bool active )
#ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() )
#else
: m_active( active )
#endif
{
if( !m_active ) return;
m_ctx = ctx;
}
void WriteQueueItem(tracy::QueueItem* item, tracy::QueueType queueItemType, uint64_t sourceLocation)
{
const auto queryId = m_ctx->NextQueryId();
m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId));
MemWrite( &item->hdr.type, queueItemType);
MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() );
MemWrite( &item->gpuZoneBegin.srcloc, sourceLocation );
MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() );
MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) );
MemWrite( &item->gpuZoneBegin.context, m_ctx->GetContextId() );
Profiler::QueueSerialFinish();
}
const bool m_active;
D3D11Ctx* m_ctx;
unsigned int m_disjointId;
};
static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx )
@ -403,38 +399,44 @@ static inline void DestroyD3D11Context( D3D11Ctx* ctx )
}
}
#undef TracyD3D11Panic
using TracyD3D11Ctx = tracy::D3D11Ctx*;
#define TracyD3D11Context( device, devicectx ) tracy::CreateD3D11Context( device, devicectx );
#define TracyD3D11Destroy(ctx) tracy::DestroyD3D11Context(ctx);
#define TracyD3D11ContextName(ctx, name, size) ctx->Name(name, size);
#define TracyD3D11UnnamedZone ___tracy_gpu_d3d11_zone
#define TracyD3D11SrcLocSymbol TracyConcat(__tracy_gpu_d3d11_source_location,TracyLine)
#define TracyD3D11SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D11SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color };
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, ___tracy_gpu_zone, name, TRACY_CALLSTACK, true )
# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, ___tracy_gpu_zone, name, color, TRACY_CALLSTACK, true )
# define TracyD3D11NamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active );
# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active );
# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, TRACY_CALLSTACK, true )
# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, TRACY_CALLSTACK, true )
# define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active );
# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active );
# define TracyD3D11ZoneTransient(ctx, varname, name, active) TracyD3D11ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active)
#else
# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, ___tracy_gpu_zone, name, true )
# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, ___tracy_gpu_zone, name, color, true )
# define TracyD3D11NamedZone( ctx, varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active );
# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active );
# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, TracyD3D11UnnamedZone, name, true )
# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, TracyD3D11UnnamedZone, name, color, true )
# define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active );
# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active );
# define TracyD3D11ZoneTransient(ctx, varname, name, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active };
#endif
#ifdef TRACY_HAS_CALLSTACK
# define TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, ___tracy_gpu_zone, name, depth, true )
# define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, ___tracy_gpu_zone, name, color, depth, true )
# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active );
# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D11ZoneScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active );
# define TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, depth, true )
# define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, depth, true )
# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active );
# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active );
# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active };
#else
# define TracyD3D11ZoneS( ctx, name, depth, active ) TracyD3D11Zone( ctx, name )
# define TracyD3D11ZoneCS( ctx, name, color, depth, active ) TracyD3D11ZoneC( name, color )
# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11NamedZone( ctx, varname, name, active )
# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11NamedZoneC( ctx, varname, name, color, active )
# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) TracyD3D12ZoneTransient(ctx, varname, name, active)
# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) TracyD3D11ZoneTransient(ctx, varname, name, active)
#endif
#define TracyD3D11Collect( ctx ) ctx->Collect();

View file

@ -25,7 +25,7 @@
namespace tracy
{
class D3D12ZoneScope {};
class D3D12ZoneScope {};
}
using TracyD3D12Ctx = void*;
@ -40,429 +40,419 @@ using TracyD3D12Ctx = void*;
#include <cassert>
#include <d3d12.h>
#include <dxgi.h>
#include <wrl/client.h>
#include <queue>
#define TracyD3D12Panic(msg, ...) do { assert(false && "TracyD3D12: " msg); TracyMessageLC("TracyD3D12: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false);
namespace tracy
{
struct D3D12QueryPayload
{
uint32_t m_queryIdStart = 0;
uint32_t m_queryCount = 0;
};
struct D3D12QueryPayload
{
uint32_t m_queryIdStart = 0;
uint32_t m_queryCount = 0;
};
// Command queue context.
class D3D12QueueCtx
{
friend class D3D12ZoneScope;
// Command queue context.
class D3D12QueueCtx
{
friend class D3D12ZoneScope;
static constexpr uint32_t MaxQueries = 64 * 1024; // Queries are begin and end markers, so we can store half as many total time durations. Must be even!
ID3D12Device* m_device = nullptr;
ID3D12CommandQueue* m_queue = nullptr;
uint8_t m_contextId = 255; // TODO: apparently, 255 means "invalid id"; is this documented somewhere?
ID3D12QueryHeap* m_queryHeap = nullptr;
ID3D12Resource* m_readbackBuffer = nullptr;
bool m_initialized = false;
// In-progress payload.
uint32_t m_queryLimit = 0;
std::atomic<uint32_t> m_queryCounter = 0;
uint32_t m_previousQueryCounter = 0;
ID3D12Device* m_device = nullptr;
ID3D12CommandQueue* m_queue = nullptr;
uint8_t m_context;
Microsoft::WRL::ComPtr<ID3D12QueryHeap> m_queryHeap;
Microsoft::WRL::ComPtr<ID3D12Resource> m_readbackBuffer;
uint32_t m_activePayload = 0;
ID3D12Fence* m_payloadFence = nullptr;
std::queue<D3D12QueryPayload> m_payloadQueue;
// In-progress payload.
uint32_t m_queryLimit = MaxQueries;
std::atomic<uint32_t> m_queryCounter = 0;
uint32_t m_previousQueryCounter = 0;
UINT64 m_prevCalibrationTicksCPU = 0;
uint32_t m_activePayload = 0;
Microsoft::WRL::ComPtr<ID3D12Fence> m_payloadFence;
std::queue<D3D12QueryPayload> m_payloadQueue;
void RecalibrateClocks()
{
UINT64 cpuTimestamp;
UINT64 gpuTimestamp;
if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
{
TracyD3D12Panic("failed to obtain queue clock calibration counters.", return);
}
int64_t m_prevCalibration = 0;
int64_t m_qpcToNs = int64_t{ 1000000000 / GetFrequencyQpc() };
int64_t cpuDeltaTicks = cpuTimestamp - m_prevCalibrationTicksCPU;
if (cpuDeltaTicks > 0)
{
static const int64_t nanosecodsPerTick = int64_t(1000000000) / GetFrequencyQpc();
int64_t cpuDeltaNS = cpuDeltaTicks * nanosecodsPerTick;
// Save the device cpu timestamp, not the Tracy profiler timestamp:
m_prevCalibrationTicksCPU = cpuTimestamp;
public:
D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue)
: m_device(device)
, m_queue(queue)
, m_context(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed))
{
// Verify we support timestamp queries on this queue.
cpuTimestamp = Profiler::GetTime();
if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY)
{
D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{};
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuCalibration);
MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp);
MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp);
MemWrite(&item->gpuCalibration.cpuDelta, cpuDeltaNS);
MemWrite(&item->gpuCalibration.context, GetId());
SubmitQueueItem(item);
}
}
bool Success = SUCCEEDED(device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData)));
assert(Success && featureData.CopyQueueTimestampQueriesSupported && "Platform does not support profiling of copy queues.");
}
tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item)
{
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem(*item);
#endif
Profiler::QueueSerialFinish();
}
uint64_t timestampFrequency;
public:
D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue)
: m_device(device)
, m_queue(queue)
{
// Verify we support timestamp queries on this queue.
if (FAILED(queue->GetTimestampFrequency(&timestampFrequency)))
{
assert(false && "Failed to get timestamp frequency.");
}
if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY)
{
D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{};
uint64_t cpuTimestamp;
uint64_t gpuTimestamp;
HRESULT hr = device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData));
if (FAILED(hr) || (featureData.CopyQueueTimestampQueriesSupported == FALSE))
{
TracyD3D12Panic("Platform does not support profiling of copy queues.", return);
}
}
if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
{
assert(false && "Failed to get queue clock calibration.");
}
static constexpr uint32_t MaxQueries = 64 * 1024; // Must be even, because queries are (begin, end) pairs
m_queryLimit = MaxQueries;
// Save the device cpu timestamp, not the profiler's timestamp.
m_prevCalibration = cpuTimestamp * m_qpcToNs;
D3D12_QUERY_HEAP_DESC heapDesc{};
heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
heapDesc.Count = m_queryLimit;
heapDesc.NodeMask = 0; // #TODO: Support multiple adapters.
cpuTimestamp = Profiler::GetTime();
while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap))))
{
m_queryLimit /= 2;
heapDesc.Count = m_queryLimit;
}
D3D12_QUERY_HEAP_DESC heapDesc{};
heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
heapDesc.Count = m_queryLimit;
heapDesc.NodeMask = 0; // #TODO: Support multiple adapters.
// Create a readback buffer, which will be used as a destination for the query data.
while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap))))
{
m_queryLimit /= 2;
heapDesc.Count = m_queryLimit;
}
D3D12_RESOURCE_DESC readbackBufferDesc{};
readbackBufferDesc.Alignment = 0;
readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t);
readbackBufferDesc.Height = 1;
readbackBufferDesc.DepthOrArraySize = 1;
readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN;
readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major.
readbackBufferDesc.MipLevels = 1;
readbackBufferDesc.SampleDesc.Count = 1;
readbackBufferDesc.SampleDesc.Quality = 0;
readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
// Create a readback buffer, which will be used as a destination for the query data.
D3D12_HEAP_PROPERTIES readbackHeapProps{};
readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK;
readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
readbackHeapProps.CreationNodeMask = 0;
readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters.
D3D12_RESOURCE_DESC readbackBufferDesc{};
readbackBufferDesc.Alignment = 0;
readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t);
readbackBufferDesc.Height = 1;
readbackBufferDesc.DepthOrArraySize = 1;
readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN;
readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major.
readbackBufferDesc.MipLevels = 1;
readbackBufferDesc.SampleDesc.Count = 1;
readbackBufferDesc.SampleDesc.Quality = 0;
readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer))))
{
TracyD3D12Panic("Failed to create query readback buffer.", return);
}
D3D12_HEAP_PROPERTIES readbackHeapProps{};
readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK;
readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
readbackHeapProps.CreationNodeMask = 0;
readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters.
if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence))))
{
TracyD3D12Panic("Failed to create payload fence.", return);
}
if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer))))
{
assert(false && "Failed to create query readback buffer.");
}
float period = [queue]()
{
uint64_t timestampFrequency;
if (FAILED(queue->GetTimestampFrequency(&timestampFrequency)))
{
return 0.0f;
}
return static_cast<float>( 1E+09 / static_cast<double>(timestampFrequency) );
}();
if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence))))
{
assert(false && "Failed to create payload fence.");
}
if (period == 0.0f)
{
TracyD3D12Panic("Failed to get timestamp frequency.", return);
}
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuNewContext);
MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp);
MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp);
memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
MemWrite(&item->gpuNewContext.period, 1E+09f / static_cast<float>(timestampFrequency));
MemWrite(&item->gpuNewContext.context, m_context);
MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12);
uint64_t cpuTimestamp;
uint64_t gpuTimestamp;
if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
{
TracyD3D12Panic("Failed to get queue clock calibration.", return);
}
// Save the device cpu timestamp, not the profiler's timestamp.
m_prevCalibrationTicksCPU = cpuTimestamp;
cpuTimestamp = Profiler::GetTime();
// all checked: ready to roll
m_contextId = GetGpuCtxCounter().fetch_add(1);
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuNewContext);
MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp);
MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp);
MemWrite(&item->gpuNewContext.thread, decltype(item->gpuNewContext.thread)(0)); // #TODO: why 0 instead of GetThreadHandle()?
MemWrite(&item->gpuNewContext.period, period);
MemWrite(&item->gpuNewContext.context, GetId());
MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12);
SubmitQueueItem(item);
}
~D3D12QueueCtx()
{
ZoneScopedC(Color::Red4);
// collect all pending timestamps
while (m_payloadFence->GetCompletedValue() != m_activePayload)
/* busy-wait ... */;
Collect();
m_payloadFence->Release();
m_readbackBuffer->Release();
m_queryHeap->Release();
}
void NewFrame()
{
uint32_t queryCounter = m_queryCounter.exchange(0);
m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter });
m_previousQueryCounter += queryCounter;
if (m_previousQueryCounter >= m_queryLimit)
{
m_previousQueryCounter -= m_queryLimit;
}
m_queue->Signal(m_payloadFence, ++m_activePayload);
}
void Name( const char* name, uint16_t len )
{
auto ptr = (char*)tracy_malloc( len );
memcpy( ptr, name, len );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuContextName );
MemWrite( &item->gpuContextNameFat.context, GetId());
MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
MemWrite( &item->gpuContextNameFat.size, len );
SubmitQueueItem(item);
}
void Collect()
{
ZoneScopedC(Color::Red4);
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem(*item);
if (!GetProfiler().IsConnected())
{
m_queryCounter = 0;
return;
}
#endif
Profiler::QueueSerialFinish();
// Find out what payloads are available.
const auto newestReadyPayload = m_payloadFence->GetCompletedValue();
const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload);
m_initialized = true;
}
if (!payloadCount)
{
return; // No payloads are available yet, exit out.
}
void NewFrame()
{
uint32_t queryCounter = m_queryCounter.exchange(0);
m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter });
m_previousQueryCounter += queryCounter;
D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) };
if (m_previousQueryCounter >= m_queryLimit)
{
m_previousQueryCounter -= m_queryLimit;
}
// Map the readback buffer so we can fetch the query data from the GPU.
void* readbackBufferMapping = nullptr;
m_queue->Signal(m_payloadFence.Get(), ++m_activePayload);
}
if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping)))
{
TracyD3D12Panic("Failed to map readback buffer.", return);
}
void Name( const char* name, uint16_t len )
{
auto ptr = (char*)tracy_malloc( len );
memcpy( ptr, name, len );
auto* timestampData = static_cast<uint64_t*>(readbackBufferMapping);
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuContextName );
MemWrite( &item->gpuContextNameFat.context, m_context );
MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
MemWrite( &item->gpuContextNameFat.size, len );
for (uint32_t i = 0; i < payloadCount; ++i)
{
const auto& payload = m_payloadQueue.front();
for (uint32_t j = 0; j < payload.m_queryCount; ++j)
{
const auto counter = (payload.m_queryIdStart + j) % m_queryLimit;
const auto timestamp = timestampData[counter];
const auto queryId = counter;
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuTime);
MemWrite(&item->gpuTime.gpuTime, timestamp);
MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuTime.context, GetId());
Profiler::QueueSerialFinish();
}
m_payloadQueue.pop();
}
m_readbackBuffer->Unmap(0, nullptr);
// Recalibrate to account for drift.
RecalibrateClocks();
}
private:
tracy_force_inline uint32_t NextQueryId()
{
uint32_t queryCounter = m_queryCounter.fetch_add(2);
if (queryCounter >= m_queryLimit)
{
TracyD3D12Panic("Submitted too many GPU queries! Consider increasing MaxQueries.");
// #TODO: consider returning an invalid id or sentinel value here
}
const uint32_t id = (m_previousQueryCounter + queryCounter) % m_queryLimit;
return id;
}
tracy_force_inline uint8_t GetId() const
{
return m_contextId;
}
};
class D3D12ZoneScope
{
const bool m_active;
D3D12QueueCtx* m_ctx = nullptr;
ID3D12GraphicsCommandList* m_cmdList = nullptr;
uint32_t m_queryId = 0; // Used for tracking in nested zones.
tracy_force_inline void WriteQueueItem(QueueItem* item, QueueType type, uint64_t srcLocation)
{
MemWrite(&item->hdr.type, type);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, srcLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, m_ctx->GetId());
Profiler::QueueSerialFinish();
}
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, bool active)
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
}
void Collect()
{
ZoneScopedC(Color::Red4);
#ifdef TRACY_ON_DEMAND
if (!GetProfiler().IsConnected())
{
m_queryCounter = 0;
return;
}
#endif
// Find out what payloads are available.
const auto newestReadyPayload = m_payloadFence->GetCompletedValue();
const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload);
if (!payloadCount)
{
return; // No payloads are available yet, exit out.
}
D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) };
// Map the readback buffer so we can fetch the query data from the GPU.
void* readbackBufferMapping = nullptr;
if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping)))
{
assert(false && "Failed to map readback buffer.");
}
auto* timestampData = static_cast<uint64_t*>(readbackBufferMapping);
for (uint32_t i = 0; i < payloadCount; ++i)
{
const auto& payload = m_payloadQueue.front();
for (uint32_t j = 0; j < payload.m_queryCount; ++j)
{
const auto counter = (payload.m_queryIdStart + j) % m_queryLimit;
const auto timestamp = timestampData[counter];
const auto queryId = counter;
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuTime);
MemWrite(&item->gpuTime.gpuTime, timestamp);
MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuTime.context, m_context);
Profiler::QueueSerialFinish();
}
m_payloadQueue.pop();
}
m_readbackBuffer->Unmap(0, nullptr);
// Recalibrate to account for drift.
uint64_t cpuTimestamp;
uint64_t gpuTimestamp;
if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
{
assert(false && "Failed to get queue clock calibration.");
}
cpuTimestamp *= m_qpcToNs;
const auto cpuDelta = cpuTimestamp - m_prevCalibration;
if (cpuDelta > 0)
{
m_prevCalibration = cpuTimestamp;
cpuTimestamp = Profiler::GetTime();
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuCalibration);
MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp);
MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp);
MemWrite(&item->gpuCalibration.cpuDelta, cpuDelta);
MemWrite(&item->gpuCalibration.context, m_context);
Profiler::QueueSerialFinish();
}
}
private:
tracy_force_inline uint32_t NextQueryId()
{
uint32_t queryCounter = m_queryCounter.fetch_add(2);
assert(queryCounter < m_queryLimit && "Submitted too many GPU queries! Consider increasing MaxQueries.");
const uint32_t id = (m_previousQueryCounter + queryCounter) % m_queryLimit;
return id;
}
tracy_force_inline uint8_t GetId() const
{
return m_context;
}
};
class D3D12ZoneScope
{
const bool m_active;
D3D12QueueCtx* m_ctx = nullptr;
ID3D12GraphicsCommandList* m_cmdList = nullptr;
uint32_t m_queryId = 0; // Used for tracking in nested zones.
public:
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active)
#ifdef TRACY_ON_DEMAND
: m_active(active && GetProfiler().IsConnected())
: m_active(active&& GetProfiler().IsConnected())
#else
: m_active(active)
: m_active(active)
#endif
{
if (!m_active) return;
{
if (!m_active) return;
m_ctx = ctx;
m_cmdList = cmdList;
m_ctx = ctx;
m_cmdList = cmdList;
m_queryId = ctx->NextQueryId();
cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
m_queryId = m_ctx->NextQueryId();
m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
}
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast<uint64_t>(srcLocation));
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
public:
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active)
: D3D12ZoneScope(ctx, cmdList, active)
{
if (!m_active) return;
Profiler::QueueSerialFinish();
}
auto* item = Profiler::QueueSerial();
WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast<uint64_t>(srcLocation));
}
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int depth, bool active)
#ifdef TRACY_ON_DEMAND
: m_active(active&& GetProfiler().IsConnected())
#else
: m_active(active)
#endif
{
if (!m_active) return;
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int depth, bool active)
: D3D12ZoneScope(ctx, cmdList, active)
{
if (!m_active) return;
m_ctx = ctx;
m_cmdList = cmdList;
auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast<uint64_t>(srcLocation));
}
m_queryId = ctx->NextQueryId();
cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active)
: D3D12ZoneScope(ctx, cmdList, active)
{
if (!m_active) return;
auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast<uint64_t>(srcLocation));
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
Profiler::QueueSerialFinish();
}
auto* item = Profiler::QueueSerial();
WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation);
}
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active)
#ifdef TRACY_ON_DEMAND
: m_active(active&& GetProfiler().IsConnected())
#else
: m_active(active)
#endif
{
if (!m_active) return;
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active)
: D3D12ZoneScope(ctx, cmdList, active)
{
if (!m_active) return;
m_ctx = ctx;
m_cmdList = cmdList;
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
m_queryId = ctx->NextQueryId();
cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation);
}
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
tracy_force_inline ~D3D12ZoneScope()
{
if (!m_active) return;
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot.
m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, queryId);
Profiler::QueueSerialFinish();
}
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
MemWrite(&item->gpuZoneEnd.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
Profiler::QueueSerialFinish();
tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active)
#ifdef TRACY_ON_DEMAND
: m_active(active&& GetProfiler().IsConnected())
#else
: m_active(active)
#endif
{
if (!m_active) return;
m_cmdList->ResolveQueryData(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer, m_queryId * sizeof(uint64_t));
}
};
m_ctx = ctx;
m_cmdList = cmdList;
static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue)
{
auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx)));
new (ctx) D3D12QueueCtx{ device, queue };
m_queryId = ctx->NextQueryId();
cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId);
return ctx;
}
const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz);
auto* item = Profiler::QueueSerialCallstack(Callstack(depth));
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, sourceLocation);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, static_cast<uint16_t>(m_queryId));
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish();
}
tracy_force_inline ~D3D12ZoneScope()
{
if (!m_active) return;
const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot.
m_cmdList->EndQuery(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId);
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
MemWrite(&item->gpuZoneEnd.queryId, static_cast<uint16_t>(queryId));
MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
Profiler::QueueSerialFinish();
m_cmdList->ResolveQueryData(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer.Get(), m_queryId * sizeof(uint64_t));
}
};
static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue)
{
auto* ctx = static_cast<D3D12QueueCtx*>(tracy_malloc(sizeof(D3D12QueueCtx)));
new (ctx) D3D12QueueCtx{ device, queue };
return ctx;
}
static inline void DestroyD3D12Context(D3D12QueueCtx* ctx)
{
ctx->~D3D12QueueCtx();
tracy_free(ctx);
}
static inline void DestroyD3D12Context(D3D12QueueCtx* ctx)
{
ctx->~D3D12QueueCtx();
tracy_free(ctx);
}
}
#undef TracyD3D12Panic
using TracyD3D12Ctx = tracy::D3D12QueueCtx*;
#define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue);
@ -471,25 +461,29 @@ using TracyD3D12Ctx = tracy::D3D12QueueCtx*;
#define TracyD3D12NewFrame(ctx) ctx->NewFrame();
#define TracyD3D12UnnamedZone ___tracy_gpu_d3d12_zone
#define TracyD3D12SrcLocSymbol TracyConcat(__tracy_d3d12_source_location,TracyLine)
#define TracyD3D12SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D12SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color };
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, ___tracy_gpu_zone, cmdList, name, TRACY_CALLSTACK, true)
# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, ___tracy_gpu_zone, cmdList, name, color, TRACY_CALLSTACK, true)
# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), TRACY_CALLSTACK, active };
# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), TRACY_CALLSTACK, active };
# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, TRACY_CALLSTACK, true)
# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, TRACY_CALLSTACK, true)
# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active };
# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active };
# define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active)
#else
# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, ___tracy_gpu_zone, cmdList, name, true)
# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, ___tracy_gpu_zone, cmdList, name, color, true)
# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), active };
# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), active };
# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, TracyD3D12UnnamedZone, cmdList, name, true)
# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, TracyD3D12UnnamedZone, cmdList, name, color, true)
# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active };
# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active };
# define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, active };
#endif
#ifdef TRACY_HAS_CALLSTACK
# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, ___tracy_gpu_zone, cmdList, name, depth, true)
# define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, ___tracy_gpu_zone, cmdList, name, color, depth, true)
# define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), depth, active };
# define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, TracyLine), depth, active };
# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, depth, true)
# define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, depth, true)
# define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active };
# define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active };
# define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, depth, active };
#else
# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12Zone(ctx, cmdList, name)

View file

@ -173,10 +173,10 @@ static tracy_force_inline void SendLuaCallstack( lua_State* L, uint32_t depth )
{
const uint32_t line = dbg[i].currentline;
memcpy( dst, &line, 4 ); dst += 4;
assert( fsz[i] <= std::numeric_limits<uint16_t>::max() );
assert( fsz[i] <= (std::numeric_limits<uint16_t>::max)() );
memcpy( dst, fsz+i, 2 ); dst += 2;
memcpy( dst, func[i], fsz[i] ); dst += fsz[i];
assert( ssz[i] <= std::numeric_limits<uint16_t>::max() );
assert( ssz[i] <= (std::numeric_limits<uint16_t>::max)() );
memcpy( dst, ssz+i, 2 ); dst += 2;
memcpy( dst, dbg[i].source, ssz[i] ), dst += ssz[i];
}
@ -333,7 +333,7 @@ static inline int LuaZoneText( lua_State* L )
auto txt = lua_tostring( L, 1 );
const auto size = strlen( txt );
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size );
@ -358,7 +358,7 @@ static inline int LuaZoneName( lua_State* L )
auto txt = lua_tostring( L, 1 );
const auto size = strlen( txt );
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size );
@ -378,7 +378,7 @@ static inline int LuaMessage( lua_State* L )
auto txt = lua_tostring( L, 1 );
const auto size = strlen( txt );
assert( size < std::numeric_limits<uint16_t>::max() );
assert( size < (std::numeric_limits<uint16_t>::max)() );
auto ptr = (char*)tracy_malloc( size );
memcpy( ptr, txt, size );

View file

@ -5,6 +5,9 @@
#define TracyVkContext(x,y,z,w) nullptr
#define TracyVkContextCalibrated(x,y,z,w,a,b) nullptr
#if defined VK_EXT_host_query_reset
#define TracyVkContextHostCalibrated(x,y,z,w,a) nullptr
#endif
#define TracyVkDestroy(x)
#define TracyVkContextName(c,x,y)
#define TracyVkNamedZone(c,x,y,z,w)
@ -39,9 +42,47 @@ using TracyVkCtx = void*;
#include "../client/TracyProfiler.hpp"
#include "../client/TracyCallstack.hpp"
#include <atomic>
namespace tracy
{
#if defined TRACY_VK_USE_SYMBOL_TABLE
#define LoadVkDeviceCoreSymbols(Operation) \
Operation(vkBeginCommandBuffer) \
Operation(vkCmdResetQueryPool) \
Operation(vkCmdWriteTimestamp) \
Operation(vkCreateQueryPool) \
Operation(vkDestroyQueryPool) \
Operation(vkEndCommandBuffer) \
Operation(vkGetQueryPoolResults) \
Operation(vkQueueSubmit) \
Operation(vkQueueWaitIdle) \
Operation(vkResetQueryPool)
#define LoadVkDeviceExtensionSymbols(Operation) \
Operation(vkGetCalibratedTimestampsEXT) \
Operation(vkGetPhysicalDeviceCalibrateableTimeDomainsEXT)
#define LoadVkInstanceCoreSymbols(Operation) \
Operation(vkGetPhysicalDeviceProperties)
struct VkSymbolTable
{
#define MAKE_PFN(name) PFN_##name name;
LoadVkDeviceCoreSymbols(MAKE_PFN)
LoadVkDeviceExtensionSymbols(MAKE_PFN)
LoadVkInstanceCoreSymbols(MAKE_PFN)
#undef MAKE_PFN
};
#define VK_FUNCTION_WRAPPER(callSignature) m_symbols.callSignature
#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) m_ctx->m_symbols.callSignature
#else
#define VK_FUNCTION_WRAPPER(callSignature) callSignature
#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) callSignature
#endif
class VkCtx
{
friend class VkCtxScope;
@ -49,7 +90,11 @@ class VkCtx
enum { QueryCount = 64 * 1024 };
public:
VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT _vkGetCalibratedTimestampsEXT )
#if defined TRACY_VK_USE_SYMBOL_TABLE
VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr, bool calibrated )
#else
VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT)
#endif
: m_device( device )
, m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT )
, m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) )
@ -57,47 +102,28 @@ public:
, m_tail( 0 )
, m_oldCnt( 0 )
, m_queryCount( QueryCount )
, m_vkGetCalibratedTimestampsEXT( _vkGetCalibratedTimestampsEXT )
#if !defined TRACY_VK_USE_SYMBOL_TABLE
, m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT )
#endif
{
assert( m_context != 255 );
if( _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT && _vkGetCalibratedTimestampsEXT )
#if defined TRACY_VK_USE_SYMBOL_TABLE
PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr);
if ( calibrated )
{
uint32_t num;
_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physdev, &num, nullptr );
if( num > 4 ) num = 4;
VkTimeDomainEXT data[4];
_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physdev, &num, data );
VkTimeDomainEXT supportedDomain = (VkTimeDomainEXT)-1;
#if defined _WIN32
supportedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW
supportedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;
m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT;
}
#endif
for( uint32_t i=0; i<num; i++ )
{
if( data[i] == supportedDomain )
{
m_timeDomain = data[i];
break;
}
}
}
VkPhysicalDeviceProperties prop;
vkGetPhysicalDeviceProperties( physdev, &prop );
const float period = prop.limits.timestampPeriod;
VkQueryPoolCreateInfo poolInfo = {};
poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
poolInfo.queryCount = m_queryCount;
poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
while( vkCreateQueryPool( device, &poolInfo, nullptr, &m_query ) != VK_SUCCESS )
if( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) && m_vkGetCalibratedTimestampsEXT )
{
m_queryCount /= 2;
poolInfo.queryCount = m_queryCount;
FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) );
}
CreateQueryPool();
VkCommandBufferBeginInfo beginInfo = {};
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
@ -107,87 +133,96 @@ public:
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &cmdbuf;
vkBeginCommandBuffer( cmdbuf, &beginInfo );
vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount );
vkEndCommandBuffer( cmdbuf );
vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE );
vkQueueWaitIdle( queue );
VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) );
VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) );
VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) );
VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) );
VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) );
int64_t tcpu, tgpu;
if( m_timeDomain == VK_TIME_DOMAIN_DEVICE_EXT )
{
vkBeginCommandBuffer( cmdbuf, &beginInfo );
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 );
vkEndCommandBuffer( cmdbuf );
vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE );
vkQueueWaitIdle( queue );
VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) );
VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 ) );
VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) );
VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) );
VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) );
tcpu = Profiler::GetTime();
vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT );
VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT ) );
vkBeginCommandBuffer( cmdbuf, &beginInfo );
vkCmdResetQueryPool( cmdbuf, m_query, 0, 1 );
vkEndCommandBuffer( cmdbuf );
vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE );
vkQueueWaitIdle( queue );
VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) );
VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, 1 ) );
VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) );
VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) );
VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) );
}
else
{
enum { NumProbes = 32 };
VkCalibratedTimestampInfoEXT spec[2] = {
{ VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT },
{ VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain },
};
uint64_t ts[2];
uint64_t deviation[NumProbes];
for( int i=0; i<NumProbes; i++ )
{
_vkGetCalibratedTimestampsEXT( device, 2, spec, ts, deviation+i );
}
uint64_t minDeviation = deviation[0];
for( int i=1; i<NumProbes; i++ )
{
if( minDeviation > deviation[i] )
{
minDeviation = deviation[i];
}
}
m_deviation = minDeviation * 3 / 2;
#if defined _WIN32
m_qpcToNs = int64_t( 1000000000. / GetFrequencyQpc() );
#endif
FindCalibratedTimestampDeviation();
Calibrate( device, m_prevCalibration, tgpu );
tcpu = Profiler::GetTime();
}
uint8_t flags = 0;
if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) flags |= GpuContextCalibration;
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuNewContext );
MemWrite( &item->gpuNewContext.cpuTime, tcpu );
MemWrite( &item->gpuNewContext.gpuTime, tgpu );
memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) );
MemWrite( &item->gpuNewContext.period, period );
MemWrite( &item->gpuNewContext.context, m_context );
MemWrite( &item->gpuNewContext.flags, flags );
MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
WriteInitialItem( physdev, tcpu, tgpu );
m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount );
}
#if defined VK_EXT_host_query_reset
/**
* This alternative constructor does not use command buffers and instead uses functionality from
* VK_EXT_host_query_reset (core with 1.2 and non-optional) and VK_EXT_calibrated_timestamps. This requires
* the physical device to have another time domain apart from DEVICE to be calibrateable.
*/
#if defined TRACY_VK_USE_SYMBOL_TABLE
VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr )
#else
VkCtx( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT vkResetQueryPool, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT )
#endif
: m_device( device )
, m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT )
, m_context( GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed) )
, m_head( 0 )
, m_tail( 0 )
, m_oldCnt( 0 )
, m_queryCount( QueryCount )
#if !defined TRACY_VK_USE_SYMBOL_TABLE
, m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT )
#endif
{
assert( m_context != 255);
#if defined TRACY_VK_USE_SYMBOL_TABLE
PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr);
m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT;
#endif
assert( VK_FUNCTION_WRAPPER( vkResetQueryPool ) != nullptr );
assert( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) != nullptr );
assert( VK_FUNCTION_WRAPPER( vkGetCalibratedTimestampsEXT ) != nullptr );
FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) );
// We require a host time domain to be available to properly calibrate.
FindCalibratedTimestampDeviation();
int64_t tgpu;
Calibrate( device, m_prevCalibration, tgpu );
int64_t tcpu = Profiler::GetTime();
CreateQueryPool();
VK_FUNCTION_WRAPPER( vkResetQueryPool( device, m_query, 0, m_queryCount ) );
WriteInitialItem( physdev, tcpu, tgpu );
m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount );
}
#endif
~VkCtx()
{
tracy_free( m_res );
vkDestroyQueryPool( m_device, m_query, nullptr );
VK_FUNCTION_WRAPPER( vkDestroyQueryPool( m_device, m_query, nullptr ) );
}
void Name( const char* name, uint16_t len )
@ -210,18 +245,23 @@ public:
{
ZoneScopedC( Color::Red4 );
if( m_tail == m_head ) return;
const uint64_t head = m_head.load(std::memory_order_relaxed);
if( m_tail == head ) return;
#ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() )
{
vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount );
m_head = m_tail = m_oldCnt = 0;
VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) );
m_tail = head;
m_oldCnt = 0;
int64_t tgpu;
if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) Calibrate( m_device, m_prevCalibration, tgpu );
return;
}
#endif
assert( head > m_tail );
const unsigned int wrappedTail = (unsigned int)( m_tail % m_queryCount );
unsigned int cnt;
if( m_oldCnt != 0 )
@ -231,10 +271,16 @@ public:
}
else
{
cnt = m_head < m_tail ? m_queryCount - m_tail : m_head - m_tail;
cnt = (unsigned int)( head - m_tail );
assert( cnt <= m_queryCount );
if( wrappedTail + cnt > m_queryCount )
{
cnt = m_queryCount - wrappedTail;
}
}
if( vkGetQueryPoolResults( m_device, m_query, m_tail, cnt, sizeof( int64_t ) * m_queryCount, m_res, sizeof( int64_t ), VK_QUERY_RESULT_64_BIT ) == VK_NOT_READY )
if( VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( m_device, m_query, wrappedTail, cnt, sizeof( int64_t ) * m_queryCount, m_res, sizeof( int64_t ), VK_QUERY_RESULT_64_BIT ) == VK_NOT_READY ) )
{
m_oldCnt = cnt;
return;
@ -245,7 +291,7 @@ public:
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuTime );
MemWrite( &item->gpuTime.gpuTime, m_res[idx] );
MemWrite( &item->gpuTime.queryId, uint16_t( m_tail + idx ) );
MemWrite( &item->gpuTime.queryId, uint16_t( wrappedTail + idx ) );
MemWrite( &item->gpuTime.context, m_context );
Profiler::QueueSerialFinish();
}
@ -269,19 +315,16 @@ public:
}
}
vkCmdResetQueryPool( cmdbuf, m_query, m_tail, cnt );
VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) );
m_tail += cnt;
if( m_tail == m_queryCount ) m_tail = 0;
}
private:
tracy_force_inline unsigned int NextQueryId()
{
const auto id = m_head;
m_head = ( m_head + 1 ) % m_queryCount;
assert( m_head != m_tail );
return id;
const uint64_t id = m_head.fetch_add(1, std::memory_order_relaxed);
return id % m_queryCount;
}
tracy_force_inline uint8_t GetId() const
@ -315,16 +358,126 @@ private:
#endif
}
tracy_force_inline void CreateQueryPool()
{
VkQueryPoolCreateInfo poolInfo = {};
poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
poolInfo.queryCount = m_queryCount;
poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
while ( VK_FUNCTION_WRAPPER( vkCreateQueryPool( m_device, &poolInfo, nullptr, &m_query ) != VK_SUCCESS ) )
{
m_queryCount /= 2;
poolInfo.queryCount = m_queryCount;
}
}
tracy_force_inline void FindAvailableTimeDomains( VkPhysicalDevice physicalDevice, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT )
{
uint32_t num;
_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, nullptr );
if(num > 4) num = 4;
VkTimeDomainEXT data[4];
_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, data );
VkTimeDomainEXT supportedDomain = (VkTimeDomainEXT)-1;
#if defined _WIN32
supportedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT;
#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW
supportedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT;
#endif
for( uint32_t i=0; i<num; i++ ) {
if(data[i] == supportedDomain) {
m_timeDomain = data[i];
break;
}
}
}
tracy_force_inline void FindCalibratedTimestampDeviation()
{
assert( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT );
constexpr size_t NumProbes = 32;
VkCalibratedTimestampInfoEXT spec[2] = {
{ VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT },
{ VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain },
};
uint64_t ts[2];
uint64_t deviation[NumProbes];
for( int i=0; i<NumProbes; i++ ) {
m_vkGetCalibratedTimestampsEXT( m_device, 2, spec, ts, deviation + i );
}
uint64_t minDeviation = deviation[0];
for( int i=1; i<NumProbes; i++ ) {
if ( minDeviation > deviation[i] ) {
minDeviation = deviation[i];
}
}
m_deviation = minDeviation * 3 / 2;
#if defined _WIN32
m_qpcToNs = int64_t( 1000000000. / GetFrequencyQpc() );
#endif
}
tracy_force_inline void WriteInitialItem( VkPhysicalDevice physdev, int64_t tcpu, int64_t tgpu )
{
uint8_t flags = 0;
if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) flags |= GpuContextCalibration;
VkPhysicalDeviceProperties prop;
VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceProperties( physdev, &prop ) );
const float period = prop.limits.timestampPeriod;
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuNewContext );
MemWrite( &item->gpuNewContext.cpuTime, tcpu );
MemWrite( &item->gpuNewContext.gpuTime, tgpu );
memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) );
MemWrite( &item->gpuNewContext.period, period );
MemWrite( &item->gpuNewContext.context, m_context );
MemWrite( &item->gpuNewContext.flags, flags );
MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
}
#if defined TRACY_VK_USE_SYMBOL_TABLE
void PopulateSymbolTable( VkInstance instance, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr )
{
#define VK_GET_DEVICE_SYMBOL( name ) \
(PFN_##name)deviceProcAddr( m_device, #name );
#define VK_LOAD_DEVICE_SYMBOL( name ) \
m_symbols.name = VK_GET_DEVICE_SYMBOL( name );
#define VK_GET_INSTANCE_SYMBOL( name ) \
(PFN_##name)instanceProcAddr( instance, #name );
#define VK_LOAD_INSTANCE_SYMBOL( name ) \
m_symbols.name = VK_GET_INSTANCE_SYMBOL( name );
LoadVkDeviceCoreSymbols( VK_LOAD_DEVICE_SYMBOL )
LoadVkDeviceExtensionSymbols( VK_LOAD_DEVICE_SYMBOL )
LoadVkInstanceCoreSymbols( VK_LOAD_INSTANCE_SYMBOL )
#undef VK_GET_DEVICE_SYMBOL
#undef VK_LOAD_DEVICE_SYMBOL
#undef VK_GET_INSTANCE_SYMBOL
#undef VK_LOAD_INSTANCE_SYMBOL
}
#endif
VkDevice m_device;
VkQueryPool m_query;
VkTimeDomainEXT m_timeDomain;
#if defined TRACY_VK_USE_SYMBOL_TABLE
VkSymbolTable m_symbols;
#endif
uint64_t m_deviation;
int64_t m_qpcToNs;
int64_t m_prevCalibration;
uint8_t m_context;
unsigned int m_head;
unsigned int m_tail;
std::atomic<uint64_t> m_head;
uint64_t m_tail;
unsigned int m_oldCnt;
unsigned int m_queryCount;
@ -348,7 +501,7 @@ public:
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId );
CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial );
@ -372,7 +525,7 @@ public:
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId );
CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
auto item = Profiler::QueueSerialCallstack( Callstack( depth ) );
MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial );
@ -396,7 +549,7 @@ public:
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId );
CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
auto item = Profiler::QueueSerial();
@ -421,7 +574,7 @@ public:
m_ctx = ctx;
const auto queryId = ctx->NextQueryId();
vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId );
CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) );
const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz );
auto item = Profiler::QueueSerialCallstack( Callstack( depth ) );
@ -439,7 +592,7 @@ public:
if( !m_active ) return;
const auto queryId = m_ctx->NextQueryId();
vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId );
CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId ) );
auto item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial );
@ -457,13 +610,38 @@ private:
VkCtx* m_ctx;
};
#if defined TRACY_VK_USE_SYMBOL_TABLE
static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr, bool calibrated = false )
#else
static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct )
#endif
{
auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) );
#if defined TRACY_VK_USE_SYMBOL_TABLE
new(ctx) VkCtx( instance, physdev, device, queue, cmdbuf, instanceProcAddr, getDeviceProcAddr, calibrated );
#else
new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct );
#endif
return ctx;
}
#if defined VK_EXT_host_query_reset
#if defined TRACY_VK_USE_SYMBOL_TABLE
static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr )
#else
static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT qpreset, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct )
#endif
{
auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) );
#if defined TRACY_VK_USE_SYMBOL_TABLE
new(ctx) VkCtx( instance, physdev, device, instanceProcAddr, getDeviceProcAddr );
#else
new(ctx) VkCtx( physdev, device, qpreset, gpdctd, gct );
#endif
return ctx;
}
#endif
static inline void DestroyVkContext( VkCtx* ctx )
{
ctx->~VkCtx();
@ -474,8 +652,23 @@ static inline void DestroyVkContext( VkCtx* ctx )
using TracyVkCtx = tracy::VkCtx*;
#if defined TRACY_VK_USE_SYMBOL_TABLE
#define TracyVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr );
#else
#define TracyVkContext( physdev, device, queue, cmdbuf ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, nullptr, nullptr );
#endif
#if defined TRACY_VK_USE_SYMBOL_TABLE
#define TracyVkContextCalibrated( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr, true );
#else
#define TracyVkContextCalibrated( physdev, device, queue, cmdbuf, gpdctd, gct ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, gpdctd, gct );
#endif
#if defined VK_EXT_host_query_reset
#if defined TRACY_VK_USE_SYMBOL_TABLE
#define TracyVkContextHostCalibrated( instance, physdev, device, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, instanceProcAddr, deviceProcAddr );
#else
#define TracyVkContextHostCalibrated( physdev, device, qpreset, gpdctd, gct ) tracy::CreateVkContext( physdev, device, qpreset, gpdctd, gct );
#endif
#endif
#define TracyVkDestroy( ctx ) tracy::DestroyVkContext( ctx );
#define TracyVkContextName( ctx, name, size ) ctx->Name( name, size );
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK