最近在工作中,我在虚幻源码中插的桩导致了崩溃:
{
SCOPED_NAMED_EVENT(FFrameEndSync_Sync, FColor::MakeRandomColor());
// This could be perhaps moved down to get greater parallelism
// Sync game and render/RHI threads.
FFrameEndSync::Sync();
}
我来探究一下插桩的实现原理,进而剖析一下崩溃原因;
宏展开
// SCOPED_NAMED_EVENT emits two independent instrumentations for one scope:
//  1) an FScopedNamedEventStatic RAII object for external profilers (VTune,
//     PIX, ...), with a unique local variable name built via ANONYMOUS_VARIABLE;
//  2) a TRACE_CPUPROFILER_EVENT_SCOPE for the Unreal Insights CPU trace channel.
#define SCOPED_NAMED_EVENT(Name, Color)\
FScopedNamedEventStatic ANONYMOUS_VARIABLE(NamedEvent_##Name##_)(Color, NAMED_EVENT_STR(#Name));\
TRACE_CPUPROFILER_EVENT_SCOPE(Name);
以上是SCOPED_NAMED_EVENT的定义
ANONYMOUS_VARIABLE展开
#ifdef __COUNTER__
// Created a variable with a unique name
#define ANONYMOUS_VARIABLE( Name ) PREPROCESSOR_JOIN(Name, __COUNTER__)
#else
...
#endif
...
// Two levels of indirection before the actual ## paste: this forces macro
// arguments (e.g. __COUNTER__ or __LINE__) to be expanded to their values
// first, instead of being pasted as the literal tokens "__COUNTER__" etc.
#define PREPROCESSOR_JOIN(TokenA, TokenB) UE_JOIN(TokenA, TokenB)
#define UE_JOIN(TokenA, TokenB) UE_PRIVATE_JOIN(TokenA, TokenB)
// Innermost macro performs the real token pasting.
#define UE_PRIVATE_JOIN(TokenA, TokenB) TokenA##TokenB
会发现ANONYMOUS_VARIABLE(NamedEvent_##Name##_)会先把Name拼接进参数,得到NamedEvent_FFrameEndSync_Sync_,再经由PREPROCESSOR_JOIN与__COUNTER__拼接。在虚幻中,__COUNTER__
是编译器提供的每次展开时递增的整数,所以拼起来就是NamedEvent_FFrameEndSync_Sync_<N>,比如NamedEvent_FFrameEndSync_Sync_0
NAMED_EVENT_STR展开
#if PLATFORM_USES_ANSI_STRING_FOR_EXTERNAL_PROFILING
// ANSI platforms: pass the string literal through unchanged.
#define NAMED_EVENT_STR(x) x
#else
// Wide-string platforms: widen the literal (TEXT(x) typically pastes an L prefix).
#define NAMED_EVENT_STR(x) TEXT(x)
#endif
展开到最后要么是它本身( x ),要么是 L ## x(即宽字符串)。注意实际传入的是 #Name,即字符串化后的字面量;我这里的结果就是它本身,在这次的例子中,就是 "FFrameEndSync_Sync" 。
目前第一行展开为:FScopedNamedEventStatic NamedEvent_FFrameEndSync_Sync_0(FColor::MakeRandomColor(), "FFrameEndSync_Sync");(为了下文叙述简洁,假设颜色参数为FColor::Blue)
TRACE_CPUPROFILER_EVENT_SCOPE展开
// Entry point: unconditional scope, forwards with Condition = true.
#define TRACE_CPUPROFILER_EVENT_SCOPE(Name) \
TRACE_CPUPROFILER_EVENT_SCOPE_CONDITIONAL(Name, true)
// Stringizes Name (#Name) before forwarding, so the event name becomes a
// string literal.
#define TRACE_CPUPROFILER_EVENT_SCOPE_CONDITIONAL(Name, Condition) \
TRACE_CPUPROFILER_EVENT_SCOPE_STR_CONDITIONAL(#Name, (Condition))
// Declares a spec-id variable and a scope object, both named with __LINE__
// so several trace scopes can coexist in the same function.
#define TRACE_CPUPROFILER_EVENT_SCOPE_STR_CONDITIONAL(NameStr, Condition) \
TRACE_CPUPROFILER_EVENT_DECLARE(PREPROCESSOR_JOIN(__CpuProfilerEventSpecId, __LINE__)); \
TRACE_CPUPROFILER_EVENT_SCOPE_USE(PREPROCESSOR_JOIN(__CpuProfilerEventSpecId, __LINE__), NameStr, PREPROCESSOR_JOIN(__CpuProfilerEventScope, __LINE__), (Condition));
// Static storage for the lazily-assigned event type id (stays 0 until the
// first execution registers the event type).
#define TRACE_CPUPROFILER_EVENT_DECLARE(DeclName) \
static uint32 DeclName;
// Constructs the FEventScope object with the spec id, name, condition and
// source location; presumably the matching end is emitted by its destructor
// (destructor not shown in this excerpt).
#define TRACE_CPUPROFILER_EVENT_SCOPE_USE(DeclName, NameStr, ScopeName, Condition) \
FCpuProfilerTrace::FEventScope ScopeName(DeclName, NameStr, Condition, __FILE__, __LINE__);
先一步一步来看,首先,比较方便看出来可以展开成这样:TRACE_CPUPROFILER_EVENT_SCOPE_STR_CONDITIONAL("FFrameEndSync_Sync", true);
进而得到这两条语句
TRACE_CPUPROFILER_EVENT_DECLARE(PREPROCESSOR_JOIN(__CpuProfilerEventSpecId, __LINE__));
TRACE_CPUPROFILER_EVENT_SCOPE_USE(PREPROCESSOR_JOIN(__CpuProfilerEventSpecId, __LINE__), "FFrameEndSync_Sync", PREPROCESSOR_JOIN(__CpuProfilerEventScope, __LINE__), (true));
这里的 __LINE__
是当前代码行号,会在预处理时被替换成具体数值。
由之前提供的关于PREPROCESSOR_JOIN的一系列展开可以得到以下两行:
TRACE_CPUPROFILER_EVENT_DECLARE(__CpuProfilerEventSpecId<Line>);
TRACE_CPUPROFILER_EVENT_SCOPE_USE(__CpuProfilerEventSpecId<Line>, "FFrameEndSync_Sync", __CpuProfilerEventScope<Line>, true);
最终可以展开为:
static uint32 __CpuProfilerEventSpecId123; // 假设Line为123,__CpuProfilerEventSpecId<Line>
FCpuProfilerTrace::FEventScope __CpuProfilerEventScope123(__CpuProfilerEventSpecId123, "FFrameEndSync_Sync", true, __FILE__, __LINE__);
最终展开
FScopedNamedEventStatic NamedEvent_FFrameEndSync_Sync_0(FColor::Blue, "FFrameEndSync_Sync"); //假设颜色为Blue(源码中实际传入的是FColor::MakeRandomColor())
static uint32 __CpuProfilerEventSpecId123; // 假设Line为123,__CpuProfilerEventSpecId<Line>
FCpuProfilerTrace::FEventScope __CpuProfilerEventScope123(__CpuProfilerEventSpecId123, "FFrameEndSync_Sync", true, __FILE__, 123);
实现分析
第一句
FScopedNamedEventStatic NamedEvent_FFrameEndSync_Sync_0(FColor::Blue, "FFrameEndSync_Sync");
// RAII wrapper used by SCOPED_NAMED_EVENT: the constructor begins a named
// event in the attached external profiler and the destructor ends it, so the
// event spans exactly the enclosing C++ scope.
class FScopedNamedEventStatic
{
public:

// Wide-char overload. When the platform implements the "Static" variant it
// is preferred — the name suggests the platform may keep the pointer and
// skip copying the text (assumption; confirm against FPlatformMisc).
FScopedNamedEventStatic(const struct FColor& Color, const TCHAR* Text)
{
#if PLATFORM_IMPLEMENTS_BeginNamedEventStatic
FPlatformMisc::BeginNamedEventStatic(Color, Text);
#else
FPlatformMisc::BeginNamedEvent(Color, Text);
#endif
}

// ANSI overload — selected in this article's example because
// NAMED_EVENT_STR(x) expands to plain x on the author's platform.
FScopedNamedEventStatic(const struct FColor& Color, const ANSICHAR* Text)
{
#if PLATFORM_IMPLEMENTS_BeginNamedEventStatic
FPlatformMisc::BeginNamedEventStatic(Color, Text);
#else
FPlatformMisc::BeginNamedEvent(Color, Text);
#endif
}

// Ends the event begun by whichever constructor ran.
~FScopedNamedEventStatic()
{
FPlatformMisc::EndNamedEvent();
}
};
可见是通过类的构造函数和析构函数来实现的自动开始和结束
CPU分析代码
事件的开始和结束,在使用 Intel 开发的 VTune CPU 性能分析器的时候,可以溯源到这两个函数
// VTune (ITT API) backend for a named-event begin. Note: the Color parameter
// is accepted for interface compatibility but ignored by this backend.
void StartScopedEvent(const struct FColor& Color, const TCHAR* Text) override
{
// Hash of the event name ("Stri" suggests case-insensitive — confirm) keys a
// cache of ITT string handles so each distinct name is registered with VTune
// only once. NOTE(review): a hash collision would silently reuse another
// event's handle.
uint32 Hash = FCrc::Strihash_DEPRECATED(Text);
TMap<uint32, __itt_string_handle *>& VTuneHandleMap = FVTunePerThreadHandleMap::Get().VTuneHandleMap;
if (!VTuneHandleMap.Contains(Hash))
{
VTuneHandleMap.Emplace(Hash, __itt_string_handle_createW(Text));
}
//Activate timer
__itt_string_handle *Handle = VTuneHandleMap[Hash];
__itt_task_begin(Domain, __itt_null, __itt_null, Handle);
}
// Ends the most recently begun ITT task on the current thread; pairs with
// the __itt_task_begin issued in StartScopedEvent.
void EndScopedEvent() override
{
//Deactivate last event
__itt_task_end(Domain);
}
GPU分析代码
在使用微软提供的PIX分析器(GPU)的情况下,可以溯源到这两个函数:
// Writes a PIX "begin event" record into the calling thread's event buffer.
// Fast path: serialize directly when the buffer has room; slow path:
// PIXBeginEventAllocate obtains fresh buffer space first.
template<typename STR, typename... ARGS>
void PIXBeginEvent(UINT64 color, STR formatString, ARGS... args)
{
PIXEventsThreadInfo* threadInfo = PIXGetThreadInfo();
// A null biasedLimit skips all recording — presumably capture is disabled
// for this thread in that case.
UINT64* limit = threadInfo->biasedLimit;
if (limit != nullptr)
{
UINT64* destination = threadInfo->destination;
if (destination < limit)
{
// The stored limit is biased low; un-bias it so the copy below can use
// the reserved safe fast-copy space at the end of the block.
limit += PIXEventsSafeFastCopySpaceQwords;
UINT64 time = PIXGetTimestampCounter();
// Reserve the first qword for the event header; it is filled in last,
// once the final event size is known.
UINT64* eventDestination = destination++;
*destination++ = color;
PIXCopyStringArguments(destination, limit, formatString, args...);
*destination = PIXEventsBlockEndMarker;
const UINT8 eventSize = PIXGetEventSize(destination, threadInfo->destination);
const UINT8 eventMetadata =
PIX_EVENT_METADATA_HAS_COLOR |
PIXEncodeStringIsAnsi<STR>();
// Header combines timestamp, event kind, size and metadata in one qword.
*eventDestination = PIXEncodeEventInfo(time, PIXEvent_BeginEvent, eventSize, eventMetadata);
// Publish the new write cursor only after the record is complete.
threadInfo->destination = destination;
}
else
{
PIXBeginEventAllocate(threadInfo, color, formatString, args...);
}
}
}
// Writes a PIX "end event" record — a fixed-size (one qword payload) record,
// mirroring PIXBeginEvent's fast/slow-path structure.
inline void PIXEndEvent()
{
PIXEventsThreadInfo* threadInfo = PIXGetThreadInfo();
// Null biasedLimit: recording disabled for this thread, do nothing.
UINT64* limit = threadInfo->biasedLimit;
if (limit != nullptr)
{
UINT64* destination = threadInfo->destination;
if (destination < limit)
{
limit += PIXEventsSafeFastCopySpaceQwords;
UINT64 time = PIXGetTimestampCounter();
// End events carry no payload: size 1, no metadata.
const UINT8 eventSize = 1;
const UINT8 eventMetadata = PIX_EVENT_METADATA_NONE;
*destination++ = PIXEncodeEventInfo(time, PIXEvent_EndEvent, eventSize, eventMetadata);
*destination = PIXEventsBlockEndMarker;
threadInfo->destination = destination;
}
else
{
// Buffer full: allocate a new block and retry there.
PIXEndEventAllocate(threadInfo);
}
}
}
第二句&&第三句
static uint32 __CpuProfilerEventSpecId123; // 假设Line为123,__CpuProfilerEventSpecId<Line>
FCpuProfilerTrace::FEventScope __CpuProfilerEventScope123(__CpuProfilerEventSpecId123, "FFrameEndSync_Sync", true, __FILE__, 123);
// Begins a CPU-profiler trace scope. bEnabled latches the caller's condition
// AND the CpuChannel trace-channel state once at construction; when false,
// BeginEventCommon is a no-op.
FORCEINLINE FEventScope(uint32& InOutSpecId, const ANSICHAR* InEventString, bool bInCondition, const ANSICHAR* File, uint32 Line)
: bEnabled(bInCondition && CpuChannel)
{
BeginEventCommon(InOutSpecId, InEventString, File, Line);
}
// Lazily registers the event type on first use (spec id 0 means "not yet
// registered"), then emits the begin event. No-op when the scope was
// constructed disabled.
FORCEINLINE void BeginEventCommon(uint32& InOutSpecId, const ANSICHAR* InEventString, const ANSICHAR* File, uint32 Line)
{
if (bEnabled)
{
// We only do relaxed here to avoid barrier cost as the worst case that can happen is multiple threads could each create an event type.
// At some point the last thread in the race will set the output event and no more thread will try to create new ones from then on.
// We don't care which event type wins as long as all threads eventually converge and stop creating new ones.
if (FPlatformAtomics::AtomicRead_Relaxed((volatile int32*)&InOutSpecId) == 0)
{
FPlatformAtomics::AtomicStore_Relaxed((volatile int32*)&InOutSpecId, FCpuProfilerTrace::OutputEventType(InEventString, File, Line));
}
// Re-read so this thread begins with whichever spec id finally stuck.
OutputBeginEvent(FPlatformAtomics::AtomicRead_Relaxed((volatile int32*)&InOutSpecId));
}
}
其中AtomicStore_Relaxed的定义为:
// Relaxed atomic store implemented as a plain assignment: no memory barrier
// is emitted. This relies on the platform guaranteeing that an aligned
// 32-bit store is a single indivisible write (true on x86/x64 — TODO confirm
// for other targets this overload is used on).
static FORCEINLINE void AtomicStore_Relaxed(volatile int32* Src, int32 Val)
{
*Src = Val;
}
这部分干的事情就是检测事件标识符是否已设置:
使用 FPlatformAtomics::AtomicRead_Relaxed
读取传入的 InOutSpecId
。如果这个值为 0,说明还没有为当前事件分配标识符。
当 InOutSpecId
为 0 时,调用 FCpuProfilerTrace::OutputEventType(InEventString, File, Line)
生成一个新的事件类型标识符。然后通过 FPlatformAtomics::AtomicStore_Relaxed
(在这个实现里就是直接赋值)将这个标识符存入 InOutSpecId
。这里使用的是 "Relaxed" 内存序,不会引入内存屏障,因为即使有多个线程同时执行,最终只要有一个线程写入了正确的标识符即可(多个线程各自创建标识符时,最后一次写入的值会被所有线程统一使用)。
接着看FCpuProfilerTrace::OutputBeginEvent
// Appends a "begin" record for SpecId to the calling thread's trace buffer.
// The body is split across two macros because the PROLOGUE declares locals
// (notably ThreadBuffer) that the EPILOGUE then uses.
void FCpuProfilerTrace::OutputBeginEvent(uint32 SpecId)
{
CPUPROFILERTRACE_OUTPUTBEGINEVENT_PROLOGUE();
CPUPROFILERTRACE_OUTPUTBEGINEVENT_EPILOGUE();
}
// PROLOGUE: bump this thread's scope depth counter and fetch (or lazily
// create) the per-thread trace buffer, leaving `ThreadBuffer` in scope for
// the EPILOGUE macro that follows it.
// NOTE(review): as quoted here, the trailing '\' after the PROLOGUE's closing
// brace would splice the next #define into this macro — in the real engine
// source a blank line separates the two definitions; transcription artifact.
#define CPUPROFILERTRACE_OUTPUTBEGINEVENT_PROLOGUE() \
++FCpuProfilerTraceInternal::ThreadDepth; \
FCpuProfilerTraceInternal::FThreadBuffer* ThreadBuffer = FCpuProfilerTraceInternal::ThreadBuffer; \
if (!ThreadBuffer) \
{ \
ThreadBuffer = FCpuProfilerTraceInternal::CreateThreadBuffer(); \
} \
// EPILOGUE: append the record — the cycle delta since the previous event
// (shifted left 2, with |1 tagging the record type) and the spec id, both
// 7-bit varint encoded — then flush the buffer when it is nearly full.
#define CPUPROFILERTRACE_OUTPUTBEGINEVENT_EPILOGUE() \
uint64 Cycle = FPlatformTime::Cycles64(); \
uint64 CycleDiff = Cycle - ThreadBuffer->LastCycle; \
ThreadBuffer->LastCycle = Cycle; \
uint8* BufferPtr = ThreadBuffer->Buffer + ThreadBuffer->BufferSize; \
FTraceUtils::Encode7bit((CycleDiff << 2) | 1ull, BufferPtr); \
FTraceUtils::Encode7bit(SpecId, BufferPtr); \
ThreadBuffer->BufferSize = (uint16)(BufferPtr - ThreadBuffer->Buffer); \
if (ThreadBuffer->BufferSize >= FCpuProfilerTraceInternal::FullBufferThreshold) \
{ \
FCpuProfilerTraceInternal::FlushThreadBuffer(ThreadBuffer); \
}
这里的++FCpuProfilerTraceInternal::ThreadDepth; 我估计是维护的Stack信息,以及用来存显示在Unrealinsights中的Depth。
而对于这一段:
FTraceUtils::Encode7bit((CycleDiff << 2) | 1ull, BufferPtr); \
FTraceUtils::Encode7bit(SpecId, BufferPtr); \
我之前在读 Unrealinsights 源码的时候, Session->StartAnalysis() 后,在 FCpuProfilerAnalyzer 的 ProcessBuffer 和 ProcessBufferV2 中,里面也提到了7bit,在解码 utrace 文件的时候用到了 Decode7bit ,是对一个 Buffer 进行 Decode。这与CPUPROFILERTRACE_OUTPUTBEGINEVENT_EPILOGUE宏中的逻辑吻合。
总结
第一句:FScopedNamedEventStatic NamedEvent_FFrameEndSync_Sync_0(FColor::Blue, "FFrameEndSync_Sync");
利用 RAII(构造即开始,析构即结束)的机制,在进入当前代码作用域时调用构造函数开始一个命名事件(通过调用 FPlatformMisc::BeginNamedEvent 或 BeginNamedEventStatic),在离开作用域时自动调用析构函数结束事件(调用 FPlatformMisc::EndNamedEvent)。这对 GPU 和 CPU 的分析工具都能记录到这个事件。
第二、三句:FCpuProfilerTrace::FEventScope __CpuProfilerEventScope123(__CpuProfilerEventSpecId123, "FFrameEndSync_Sync", true, __FILE__, 123);
创建了一个 CPU 事件作用域对象。构造函数内部会检查静态的事件标识符是否已设置(如果未设置则通过调用 FCpuProfilerTrace::OutputEventType 分配一个新的标识符),然后调用 OutputBeginEvent 将事件的开始信息记录到 CPU 分析的线程缓冲区中。缓冲区记录的内容(例如经过 7 位编码的时间差和事件标识符)会用于Unrealinsights的解码。
FFrameEndSync::Sync()解读
// The "r.OneFrameThreadLag" cvar forces a full sync, meaning the game thread will
// not start work until all the rendering work for the previous frame has completed.
bFullSync |= CVarAllowOneFrameThreadLag.GetValueOnAnyThread() <= 0;
r.OneFrameThreadLag
是一个控制台变量(cvar),用于控制GameThread和RenderThread之间的同步策略。结合上面的代码可知:当该变量的值<=0(即禁用一帧延迟)时,bFullSync被置为true,GameThread在开始新一帧的处理前,会等待RenderThreadFences中的所有渲染工作完全完成;当该变量启用(>0)时,允许GameThread领先RenderThread一帧,即最多保留一个未完成的Fence。
ENQUEUE_RENDER_COMMAND(FrameDebugInfo)(
[CurrentFrameCounter = GFrameCounter, CurrentInputTime = GInputTime](FRHICommandListImmediate& RHICmdList)
{
RHICmdList.EnqueueLambda(
[CurrentFrameCounter, CurrentInputTime](FRHICommandListImmediate&)
{
// Set the FrameCount and InputTime for input latency stats and flip debugging.
RHISetFrameDebugInfo(GRHIPresentCounter - 1, CurrentFrameCounter, CurrentInputTime);
});
});
这是插入调试信息
// Always sync with the render thread (either current frame, or N-1 frame)
RenderThreadFences.Emplace();
while (RenderThreadFences.Num() > (bFullSync ? 0 : 1))
{
RenderThreadFences.RemoveAt(0);
}
Global:TArray<FRenderThreadFence, TInlineAllocator<2>> RenderThreadFences;
这里的RemoveAt会触发FRenderThreadFence的析构函数
// Destructor blocks the calling (game) thread until the render thread has
// passed this fence. The `true` argument presumably means "process other
// tasks while waiting" — confirm against the Wait() signature.
~FRenderThreadFence()
{
Fence.Wait(true);
}
进而触发
static void GameThreadWaitForTask(const UE::Tasks::FTask& Task, bool bEmptyGameThreadTasks = false)
{
TRACE_CPUPROFILER_EVENT_SCOPE(GameThreadWaitForTask);
SCOPE_TIME_GUARD(TEXT("GameThreadWaitForTask"));
...
}
之后通过参数和VSync状态确定NumFramesOverlap和SyncDepth的值,调用PipelineFences.Emplace_GetRef().BeginFence(SyncDepth);
PipelineFences的定义为:
// Additional fences to await. These sync with either the RHI thread or swapchain,
// and are used to prevent the game thread running too far ahead of presented frames.
TArray<FRenderCommandFence, TInlineAllocator<3>> PipelineFences;
下面这一段则是:首先确保GameThread处理了一次游戏线程上的任务,之后处理旧的 Fence
if (!FTaskGraphInterface::Get().IsThreadProcessingTasks(ENamedThreads::GameThread))
{
// need to process gamethread tasks at least once a frame no matter what
FTaskGraphInterface::Get().ProcessThreadUntilIdle(ENamedThreads::GameThread);
}
while (PipelineFences.Num() > NumFramesOverlap)
{
PipelineFences[0].Wait(true);
PipelineFences.RemoveAt(0);
}
总结
在 Sync 函数这样一个对线程同步和命令调度非常敏感的关键路径上,调用 SCOPED_NAMED_EVENT 会引入额外的工作和操作,这可能会在某些测试环境或边界条件下导致崩溃。尤其是帧率极低或任务排队过多时。