最近在工作中,我在虚幻源码中插的桩导致了崩溃:

{
  SCOPED_NAMED_EVENT(FFrameEndSync_Sync, FColor::MakeRandomColor());
  // This could be perhaps moved down to get greater parallelism
  // Sync game and render/RHI threads.
  FFrameEndSync::Sync();
}

我来探究一下插桩的实现原理,进而剖析一下崩溃原因;

宏展开

#define SCOPED_NAMED_EVENT(Name, Color)\
    FScopedNamedEventStatic ANONYMOUS_VARIABLE(NamedEvent_##Name##_)(Color, NAMED_EVENT_STR(#Name));\
    TRACE_CPUPROFILER_EVENT_SCOPE(Name);

以上是SCOPED_NAMED_EVENT的定义

ANONYMOUS_VARIABLE展开

#ifdef __COUNTER__
    // Created a variable with a unique name
    #define ANONYMOUS_VARIABLE( Name ) PREPROCESSOR_JOIN(Name, __COUNTER__)
#else
    ...
#endif
...
#define PREPROCESSOR_JOIN(TokenA, TokenB)             UE_JOIN(TokenA, TokenB)
#define UE_JOIN(TokenA, TokenB) UE_PRIVATE_JOIN(TokenA, TokenB)
#define UE_PRIVATE_JOIN(TokenA, TokenB) TokenA##TokenB

会发现到最后ANONYMOUS_VARIABLE(NamedEvent_##Name##_)被展开为NamedEvent_##__COUNTER__,在虚幻中,__COUNTER__ 是编译器提供的每次展开时递增的整数,所以拼起来就是NamedEvent_<N>,比如NamedEvent_0

NAMED_EVENT_STR展开

#if PLATFORM_USES_ANSI_STRING_FOR_EXTERNAL_PROFILING
    #define NAMED_EVENT_STR(x) x
#else
    #define NAMED_EVENT_STR(x) TEXT(x)
#endif

解到最后要么是他自己( x ),要么是 L ## x,我这里的结果就是他本身,在这次的例子中,就是 FFrameEndSync_Sync 。

目前第一行展开为:FScopedNamedEventStatic NamedEvent_0(FColor::Blue, "FFrameEndSync_Sync");

TRACE_CPUPROFILER_EVENT_SCOPE展开

#define TRACE_CPUPROFILER_EVENT_SCOPE(Name) \
    TRACE_CPUPROFILER_EVENT_SCOPE_CONDITIONAL(Name, true)

#define TRACE_CPUPROFILER_EVENT_SCOPE_CONDITIONAL(Name, Condition) \
	TRACE_CPUPROFILER_EVENT_SCOPE_STR_CONDITIONAL(#Name, (Condition))

#define TRACE_CPUPROFILER_EVENT_SCOPE_STR_CONDITIONAL(NameStr, Condition) \
	TRACE_CPUPROFILER_EVENT_DECLARE(PREPROCESSOR_JOIN(__CpuProfilerEventSpecId, __LINE__)); \
	TRACE_CPUPROFILER_EVENT_SCOPE_USE(PREPROCESSOR_JOIN(__CpuProfilerEventSpecId, __LINE__), NameStr, PREPROCESSOR_JOIN(__CpuProfilerEventScope, __LINE__), (Condition));

#define TRACE_CPUPROFILER_EVENT_DECLARE(DeclName) \
	static uint32 DeclName;

#define TRACE_CPUPROFILER_EVENT_SCOPE_USE(DeclName, NameStr, ScopeName, Condition) \
	FCpuProfilerTrace::FEventScope ScopeName(DeclName, NameStr, Condition, __FILE__, __LINE__);

先一步一步来看,首先,比较方便看出来可以展开成这样:TRACE_CPUPROFILER_EVENT_SCOPE_STR_CONDITIONAL("FFrameEndSync_Sync", true);

进而得到这两条语句

TRACE_CPUPROFILER_EVENT_DECLARE(PREPROCESSOR_JOIN(__CpuProfilerEventSpecId, __LINE__));
TRACE_CPUPROFILER_EVENT_SCOPE_USE(PREPROCESSOR_JOIN(__CpuProfilerEventSpecId, __LINE__), "FFrameEndSync_Sync", PREPROCESSOR_JOIN(__CpuProfilerEventScope, __LINE__), (true));

这里的 __LINE__ 是当前代码行号,会在预处理时被替换成具体数值。

由之前提供的关于PREPROCESSOR_JOIN的一系列展开可以得到以下两行:

TRACE_CPUPROFILER_EVENT_DECLARE(__CpuProfilerEventSpecId<Line>);
TRACE_CPUPROFILER_EVENT_SCOPE_USE(__CpuProfilerEventSpecId<Line>, "FFrameEndSync_Sync", __CpuProfilerEventSpecId<Line>, true);

最终可以展开为:

static uint32 __CpuProfilerEventSpecId123; // 假设Line为123,__CpuProfilerEventSpecId<Line>
FCpuProfilerTrace::FEventScope __CpuProfilerEventScope123(__CpuProfilerEventSpecId123, "FFrameEndSync_Sync", true, __FILE__, __LINE__);

最终展开

FScopedNamedEventStatic NamedEvent_0(FColor::Blue, "FFrameEndSync_Sync"); //假设颜色为Blue
static uint32 __CpuProfilerEventSpecId123; // 假设Line为123,__CpuProfilerEventSpecId<Line>
FCpuProfilerTrace::FEventScope __CpuProfilerEventScope123(__CpuProfilerEventSpecId123, "FFrameEndSync_Sync", true, __FILE__, 123);

实现分析

第一句

FScopedNamedEventStatic NamedEvent_0(FColor::Blue, "FFrameEndSync_Sync");

class FScopedNamedEventStatic
{
public:

    FScopedNamedEventStatic(const struct FColor& Color, const TCHAR* Text)
    {
#if PLATFORM_IMPLEMENTS_BeginNamedEventStatic
       FPlatformMisc::BeginNamedEventStatic(Color, Text);
#else
       FPlatformMisc::BeginNamedEvent(Color, Text);
#endif
    }

    FScopedNamedEventStatic(const struct FColor& Color, const ANSICHAR* Text)
    {
#if PLATFORM_IMPLEMENTS_BeginNamedEventStatic
       FPlatformMisc::BeginNamedEventStatic(Color, Text);
#else
       FPlatformMisc::BeginNamedEvent(Color, Text);
#endif
    }

    ~FScopedNamedEventStatic()
    {
       FPlatformMisc::EndNamedEvent();
    }
};

可见是通过类的构造函数和析构函数来实现的自动开始和结束

CPU分析代码

事件的开始和结束,在使用 Intel 开发的 VTune CPU 性能分析器的时候,可以溯源到这两个函数

void StartScopedEvent(const struct FColor& Color, const TCHAR* Text) override
{
    uint32 Hash = FCrc::Strihash_DEPRECATED(Text);

    TMap<uint32, __itt_string_handle *>& VTuneHandleMap = FVTunePerThreadHandleMap::Get().VTuneHandleMap;

    if (!VTuneHandleMap.Contains(Hash))
    {
       VTuneHandleMap.Emplace(Hash, __itt_string_handle_createW(Text));
    }

    //Activate timer
    __itt_string_handle *Handle = VTuneHandleMap[Hash];
    __itt_task_begin(Domain, __itt_null, __itt_null, Handle);
}

void EndScopedEvent() override
{
    //Deactivate last event
    __itt_task_end(Domain);
}

GPU分析代码

在使用微软提供的PIX分析器(GPU)的情况下,可以溯源到这两个函数:

template<typename STR, typename... ARGS>
void PIXBeginEvent(UINT64 color, STR formatString, ARGS... args)
{
    PIXEventsThreadInfo* threadInfo = PIXGetThreadInfo();
    UINT64* limit = threadInfo->biasedLimit;
    if (limit != nullptr)
    {
        UINT64* destination = threadInfo->destination;
        if (destination < limit)
        {
            limit += PIXEventsSafeFastCopySpaceQwords;
            UINT64 time = PIXGetTimestampCounter();
            UINT64* eventDestination = destination++;
            *destination++ = color;

            PIXCopyStringArguments(destination, limit, formatString, args...);
            *destination = PIXEventsBlockEndMarker;

            const UINT8 eventSize = PIXGetEventSize(destination, threadInfo->destination);
            const UINT8 eventMetadata =
                PIX_EVENT_METADATA_HAS_COLOR |
                PIXEncodeStringIsAnsi<STR>();
            *eventDestination = PIXEncodeEventInfo(time, PIXEvent_BeginEvent, eventSize, eventMetadata);

            threadInfo->destination = destination;
        }
        else
        {
            PIXBeginEventAllocate(threadInfo, color, formatString, args...);
        }
    }
}

inline void PIXEndEvent()
    {
        PIXEventsThreadInfo* threadInfo = PIXGetThreadInfo();
        UINT64* limit = threadInfo->biasedLimit;
        if (limit != nullptr)
        {
            UINT64* destination = threadInfo->destination;
            if (destination < limit)
            {
                limit += PIXEventsSafeFastCopySpaceQwords;
                UINT64 time = PIXGetTimestampCounter();
                const UINT8 eventSize = 1;
                const UINT8 eventMetadata = PIX_EVENT_METADATA_NONE;
                *destination++ = PIXEncodeEventInfo(time, PIXEvent_EndEvent, eventSize, eventMetadata);
                *destination = PIXEventsBlockEndMarker;

                threadInfo->destination = destination;
            }
            else
            {
                PIXEndEventAllocate(threadInfo);
            }
        }
    }

第二句&&第三句

static uint32 __CpuProfilerEventSpecId123; // 假设Line为123,__CpuProfilerEventSpecId<Line>
FCpuProfilerTrace::FEventScope __CpuProfilerEventScope123(__CpuProfilerEventSpecId123, "FFrameEndSync_Sync", true, __FILE__, 123);

FORCEINLINE FEventScope(uint32& InOutSpecId, const ANSICHAR* InEventString, bool bInCondition, const ANSICHAR* File, uint32 Line)
    : bEnabled(bInCondition && CpuChannel)
{
    BeginEventCommon(InOutSpecId, InEventString, File, Line);
}

FORCEINLINE void BeginEventCommon(uint32& InOutSpecId, const ANSICHAR* InEventString, const ANSICHAR* File, uint32 Line)
		{
			if (bEnabled)
			{
				// We only do relaxed here to avoid barrier cost as the worst case that can happen is multiple threads could each create an event type.
				// At some point the last thread in the race will set the output event and no more thread will try to create new ones from then on.
				// We don't care which event type wins as long as all threads eventually converge and stop creating new ones.
				if (FPlatformAtomics::AtomicRead_Relaxed((volatile int32*)&InOutSpecId) == 0)
				{
					FPlatformAtomics::AtomicStore_Relaxed((volatile int32*)&InOutSpecId, FCpuProfilerTrace::OutputEventType(InEventString, File, Line));
				}
				OutputBeginEvent(FPlatformAtomics::AtomicRead_Relaxed((volatile int32*)&InOutSpecId));
			}
		}

其中AtomicStore_Relaxed的定义为:

static FORCEINLINE void AtomicStore_Relaxed(volatile int32* Src, int32 Val)
{
    *Src = Val;
}

这部分干的事情就是检测事件标识符是否已设置:
使用 FPlatformAtomics::AtomicRead_Relaxed 读取传入的 InOutSpecId。如果这个值为 0,说明还没有为当前事件分配标识符。
InOutSpecId 为 0 时,调用 FCpuProfilerTrace::OutputEventType(InEventString, File, Line) 生成一个新的事件类型标识符。然后通过 FPlatformAtomics::AtomicStore_Relaxed(在这个实现里就是直接赋值)将这个标识符存入 InOutSpecId。这里使用的是 ”Relax“,不会引入内存屏障,因为即使有多个线程同时执行,最终只要有一个线程写入了正确的标识符即可(多个线程创建标识符时,最后一次写入的值会被统一使用)。

接着看FCpuProfilerTrace::OutputBeginEvent

void FCpuProfilerTrace::OutputBeginEvent(uint32 SpecId)
{
    CPUPROFILERTRACE_OUTPUTBEGINEVENT_PROLOGUE();
    CPUPROFILERTRACE_OUTPUTBEGINEVENT_EPILOGUE();
}

#define CPUPROFILERTRACE_OUTPUTBEGINEVENT_PROLOGUE() \
	++FCpuProfilerTraceInternal::ThreadDepth; \
	FCpuProfilerTraceInternal::FThreadBuffer* ThreadBuffer = FCpuProfilerTraceInternal::ThreadBuffer; \
	if (!ThreadBuffer) \
	{ \
		ThreadBuffer = FCpuProfilerTraceInternal::CreateThreadBuffer(); \
	} \

#define CPUPROFILERTRACE_OUTPUTBEGINEVENT_EPILOGUE() \
	uint64 Cycle = FPlatformTime::Cycles64(); \
	uint64 CycleDiff = Cycle - ThreadBuffer->LastCycle; \
	ThreadBuffer->LastCycle = Cycle; \
	uint8* BufferPtr = ThreadBuffer->Buffer + ThreadBuffer->BufferSize; \
	FTraceUtils::Encode7bit((CycleDiff << 2) | 1ull, BufferPtr); \
	FTraceUtils::Encode7bit(SpecId, BufferPtr); \
	ThreadBuffer->BufferSize = (uint16)(BufferPtr - ThreadBuffer->Buffer); \
	if (ThreadBuffer->BufferSize >= FCpuProfilerTraceInternal::FullBufferThreshold) \
	{ \
		FCpuProfilerTraceInternal::FlushThreadBuffer(ThreadBuffer); \
	}

这里的++FCpuProfilerTraceInternal::ThreadDepth; 我估计是维护的Stack信息,以及用来存显示在Unrealinsights中的Depth。

而对于这一段:
FTraceUtils::Encode7bit((CycleDiff << 2) | 1ull, BufferPtr); \ FTraceUtils::Encode7bit(SpecId, BufferPtr); \

我之前在读 Unrealinsights 源码的时候, Session->StartAnalysis() 后,在 FCpuProfilerAnalyzer 的 ProcessBuffer 和 ProcessBufferV2 中,里面也提到了7bit,在解码 utrace 文件的时候用到了 Decode7bit ,是对一个 Buffer 进行 Decode。这与CPUPROFILERTRACE_OUTPUTBEGINEVENT_EPILOGUE宏中的逻辑吻合。

总结

第一句:
FScopedNamedEventStatic NamedEvent_0(FColor::Blue, "FFrameEndSync_Sync");
利用 RAII(构造即开始,析构即结束)的机制,在进入当前代码作用域时调用构造函数开始一个命名事件(通过调用 FPlatformMisc::BeginNamedEvent 或 BeginNamedEventStatic),在离开作用域时自动调用析构函数结束事件(调用 FPlatformMisc::EndNamedEvent)。这对 GPU 和 CPU 的分析工具都能记录到这个事件。

第二、三句:
FCpuProfilerTrace::FEventScope __CpuProfilerEventScope123(__CpuProfilerEventSpecId123, "FFrameEndSync_Sync", true, __FILE__, 123);
创建了一个 CPU 事件作用域对象。构造函数内部会检查静态的事件标识符是否已设置(如果未设置则通过调用 FCpuProfilerTrace::OutputEventType 分配一个新的标识符),然后调用 OutputBeginEvent 将事件的开始信息记录到 CPU 分析的线程缓冲区中。缓冲区记录的内容(例如经过 7 位编码的时间差和事件标识符)会用于Unrealinsights的解码。

FFrameEndSync::Sync()解读

// The "r.OneFrameThreadLag" cvar forces a full sync, meaning the game thread will
// not start work until all the rendering work for the previous frame has completed.
bFullSync |= CVarAllowOneFrameThreadLag.GetValueOnAnyThread() <= 0;

r.OneFrameThreadLag 是一个控制台变量(cvar),用于管理GameThread和RenderThread之间的同步策略。当启用这个变量(或设置其值使其生效)时,GameThread在开始新一帧的处理前,会等待RenderThreadFences的所有渲染工作完全完成,否则就留一个任务不完成。

ENQUEUE_RENDER_COMMAND(FrameDebugInfo)(
    [CurrentFrameCounter = GFrameCounter, CurrentInputTime = GInputTime](FRHICommandListImmediate& RHICmdList)
{
    RHICmdList.EnqueueLambda(
       [CurrentFrameCounter, CurrentInputTime](FRHICommandListImmediate&)
    {
       // Set the FrameCount and InputTime for input latency stats and flip debugging.
       RHISetFrameDebugInfo(GRHIPresentCounter - 1, CurrentFrameCounter, CurrentInputTime);
    });
});

这是插入调试信息

// Always sync with the render thread (either current frame, or N-1 frame)
RenderThreadFences.Emplace();
while (RenderThreadFences.Num() > (bFullSync ? 0 : 1))
{
    RenderThreadFences.RemoveAt(0);
}

Global:TArray<FRenderThreadFence, TInlineAllocator<2>> RenderThreadFences;

这里的RemoveAt会触发FRenderThreadFence的析构函数

~FRenderThreadFence()
{
    Fence.Wait(true);
}

进而触发

static void GameThreadWaitForTask(const UE::Tasks::FTask& Task, bool bEmptyGameThreadTasks = false)
{
    TRACE_CPUPROFILER_EVENT_SCOPE(GameThreadWaitForTask);
    SCOPE_TIME_GUARD(TEXT("GameThreadWaitForTask"));
    ...
}

之后通过参数和VSync状态确定NumFramesOverlap和SyncDepth的值,调用PipelineFences.Emplace_GetRef().BeginFence(SyncDepth);

PipelineFences的定义为:

// Additional fences to await. These sync with either the RHI thread or swapchain,
// and are used to prevent the game thread running too far ahead of presented frames.
TArray<FRenderCommandFence, TInlineAllocator<3>> PipelineFences;

下面这一段则是:首先确保GameThread处理了一次游戏线程上的任务,之后处理旧的 Fence

if (!FTaskGraphInterface::Get().IsThreadProcessingTasks(ENamedThreads::GameThread))
{
    // need to process gamethread tasks at least once a frame no matter what
    FTaskGraphInterface::Get().ProcessThreadUntilIdle(ENamedThreads::GameThread);
}

while (PipelineFences.Num() > NumFramesOverlap)
{
    PipelineFences[0].Wait(true);
    PipelineFences.RemoveAt(0);
}

总结

在 Sync 函数这样一个对线程同步和命令调度非常敏感的关键路径上,调用 SCOPED_NAMED_EVENT 会引入额外的工作和操作,这可能会在某些测试环境或边界条件下导致崩溃。尤其是帧率极低或任务排队过多时。

Life is a Rainmeter