rhi: d3d12: Implement lastCompletedGpuTime via timestamp queries

Change-Id: I5f2588268cf4d52025f9b1c8d94cdcd9a742531c
Reviewed-by: Andy Nichols <andy.nichols@qt.io>
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
This commit is contained in:
Laszlo Agocs 2023-07-17 12:28:13 +02:00
parent 95ed8d1fd9
commit eb99d19cc4
2 changed files with 168 additions and 3 deletions

View File

@ -397,6 +397,9 @@ bool QRhiD3D12::create(QRhi::Flags flags)
qWarning("Could not create host-visible staging area");
return false;
}
QString decoratedName = QLatin1String("Small staging area buffer/");
decoratedName += QString::number(i);
smallStagingAreas[i].mem.buffer->SetName(reinterpret_cast<LPCWSTR>(decoratedName.utf16()));
}
if (!shaderVisibleCbvSrvUavHeap.create(dev,
@ -407,6 +410,45 @@ bool QRhiD3D12::create(QRhi::Flags flags)
return false;
}
if (flags.testFlag(QRhi::EnableTimestamps)) {
static bool wantsStablePowerState = qEnvironmentVariableIntValue("QT_D3D_STABLE_POWER_STATE");
//
// https://learn.microsoft.com/en-us/windows/win32/api/d3d12/nf-d3d12-id3d12device-setstablepowerstate
//
// NB! This is a _global_ setting, affecting other processes (and 3D
// APIs such as Vulkan), as long as this application is running. Hence
// making it an env.var. for now. Never enable it in production. But
// extremely useful for the GPU timings with NVIDIA at least; the
// timestamps become stable and smooth, making the number readable and
// actually useful e.g. in Quick 3D's DebugView when this is enabled.
// (otherwise the number's all over the place)
//
// See also
// https://developer.nvidia.com/blog/advanced-api-performance-setstablepowerstate/
// for possible other approaches.
//
if (wantsStablePowerState)
dev->SetStablePowerState(TRUE);
hr = cmdQueue->GetTimestampFrequency(&timestampTicksPerSecond);
if (FAILED(hr)) {
qWarning("Failed to query timestamp frequency: %s",
qPrintable(QSystemError::windowsComString(hr)));
return false;
}
if (!timestampQueryHeap.create(dev, QD3D12_FRAMES_IN_FLIGHT * 2, D3D12_QUERY_HEAP_TYPE_TIMESTAMP)) {
qWarning("Failed to create timestamp query pool");
return false;
}
const quint32 readbackBufSize = QD3D12_FRAMES_IN_FLIGHT * 2 * sizeof(quint64);
if (!timestampReadbackArea.create(this, readbackBufSize, D3D12_HEAP_TYPE_READBACK)) {
qWarning("Failed to create timestamp readback buffer");
return false;
}
timestampReadbackArea.mem.buffer->SetName(L"Timestamp readback buffer");
memset(timestampReadbackArea.mem.p, 0, readbackBufSize);
}
D3D12_FEATURE_DATA_D3D12_OPTIONS3 options3 = {};
if (SUCCEEDED(dev->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &options3, sizeof(options3))))
caps.multiView = options3.ViewInstancingTier != D3D12_VIEW_INSTANCING_TIER_NOT_SUPPORTED;
@ -439,6 +481,9 @@ void QRhiD3D12::destroy()
}
}
timestampQueryHeap.destroy();
timestampReadbackArea.destroy();
shaderVisibleCbvSrvUavHeap.destroy();
for (int i = 0; i < QD3D12_FRAMES_IN_FLIGHT; ++i)
@ -574,7 +619,7 @@ bool QRhiD3D12::isFeatureSupported(QRhi::Feature feature) const
return false;
#endif
case QRhi::Timestamps:
return false; // ###
return true;
case QRhi::Instancing:
return true;
case QRhi::CustomInstanceStepRate:
@ -1389,8 +1434,24 @@ void QRhiD3D12::endExternal(QRhiCommandBuffer *cb)
// Returns the GPU time, in seconds, that was last stored for the command
// buffer cb. The value is 0 until a pair of timestamps has been read back
// and converted for this command buffer.
double QRhiD3D12::lastCompletedGpuTime(QRhiCommandBuffer *cb)
{
    // No early "return 0" here: the result comes from the backend-specific
    // command buffer, which caches the most recently calculated value.
    QD3D12CommandBuffer *cbD = QRHI_RES(QD3D12CommandBuffer, cb);
    return cbD->lastGpuTime;
}
// Reads the start/end timestamp pair that begins at query index
// timestampPairStartIndex in the readback buffer, converts the elapsed tick
// count to seconds using timestampTicksPerSecond, and stores the result in
// cbD->lastGpuTime.
//
// cbD->lastGpuTime is left untouched when the tick frequency is zero or when
// the end timestamp is not greater than the start timestamp.
static void calculateGpuTime(QD3D12CommandBuffer *cbD,
                             int timestampPairStartIndex,
                             const quint8 *readbackBufPtr,
                             quint64 timestampTicksPerSecond)
{
    // Not every call site checks the frequency beforehand; dividing by zero
    // below would store infinity as the GPU time.
    if (!timestampTicksPerSecond)
        return;

    // Each query result occupies one 64-bit slot in the readback buffer.
    const size_t byteOffset = timestampPairStartIndex * sizeof(quint64);
    const quint64 *p = reinterpret_cast<const quint64 *>(readbackBufPtr + byteOffset);
    const quint64 startTime = *p++;
    const quint64 endTime = *p;

    // Only a strictly increasing pair is meaningful; anything else keeps the
    // previously stored value.
    if (startTime < endTime) {
        const quint64 ticks = endTime - startTime;
        const double timeSec = ticks / double(timestampTicksPerSecond);
        cbD->lastGpuTime = timeSec;
    }
}
QRhi::FrameOpResult QRhiD3D12::beginFrame(QRhiSwapChain *swapChain, QRhi::BeginFrameFlags flags)
@ -1453,6 +1514,20 @@ QRhi::FrameOpResult QRhiD3D12::beginFrame(QRhiSwapChain *swapChain, QRhi::BeginF
finishActiveReadbacks(); // last, in case the readback-completed callback issues rhi calls
if (timestampQueryHeap.isValid() && timestampTicksPerSecond) {
// Read the timestamps for the previous frame for this slot. (the
// ResolveQuery() should have completed by now due to the wait above)
const int timestampPairStartIndex = currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT;
calculateGpuTime(cbD,
timestampPairStartIndex,
timestampReadbackArea.mem.p,
timestampTicksPerSecond);
// Write the start timestamp for this frame for this slot.
cbD->cmdList->EndQuery(timestampQueryHeap.heap,
D3D12_QUERY_TYPE_TIMESTAMP,
timestampPairStartIndex);
}
return QRhi::FrameOpSuccess;
}
@ -1477,6 +1552,19 @@ QRhi::FrameOpResult QRhiD3D12::endFrame(QRhiSwapChain *swapChain, QRhi::EndFrame
barrierGen.addTransitionBarrier(backBufferResourceHandle, D3D12_RESOURCE_STATE_PRESENT);
barrierGen.enqueueBufferedTransitionBarriers(cbD);
if (timestampQueryHeap.isValid()) {
const int timestampPairStartIndex = currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT;
cbD->cmdList->EndQuery(timestampQueryHeap.heap,
D3D12_QUERY_TYPE_TIMESTAMP,
timestampPairStartIndex + 1);
cbD->cmdList->ResolveQueryData(timestampQueryHeap.heap,
D3D12_QUERY_TYPE_TIMESTAMP,
timestampPairStartIndex,
2,
timestampReadbackArea.mem.buffer,
timestampPairStartIndex * sizeof(quint64));
}
ID3D12GraphicsCommandList1 *cmdList = cbD->cmdList;
HRESULT hr = cmdList->Close();
if (FAILED(hr)) {
@ -1561,6 +1649,12 @@ QRhi::FrameOpResult QRhiD3D12::beginOffscreenFrame(QRhiCommandBuffer **cb, QRhi:
bindShaderVisibleHeaps(cbD);
if (timestampQueryHeap.isValid() && timestampTicksPerSecond) {
cbD->cmdList->EndQuery(timestampQueryHeap.heap,
D3D12_QUERY_TYPE_TIMESTAMP,
currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT);
}
offscreenActive = true;
*cb = cbD;
@ -1574,6 +1668,19 @@ QRhi::FrameOpResult QRhiD3D12::endOffscreenFrame(QRhi::EndFrameFlags flags)
offscreenActive = false;
QD3D12CommandBuffer *cbD = offscreenCb[currentFrameSlot];
if (timestampQueryHeap.isValid()) {
const int timestampPairStartIndex = currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT;
cbD->cmdList->EndQuery(timestampQueryHeap.heap,
D3D12_QUERY_TYPE_TIMESTAMP,
timestampPairStartIndex + 1);
cbD->cmdList->ResolveQueryData(timestampQueryHeap.heap,
D3D12_QUERY_TYPE_TIMESTAMP,
timestampPairStartIndex,
2,
timestampReadbackArea.mem.buffer,
timestampPairStartIndex * sizeof(quint64));
}
ID3D12GraphicsCommandList1 *cmdList = cbD->cmdList;
HRESULT hr = cmdList->Close();
if (FAILED(hr)) {
@ -1594,6 +1701,14 @@ QRhi::FrameOpResult QRhiD3D12::endOffscreenFrame(QRhi::EndFrameFlags flags)
// previous) frame is safe since we waited for completion above.
finishActiveReadbacks(true);
// the timestamp query results should be available too, given the wait
if (timestampQueryHeap.isValid()) {
calculateGpuTime(cbD,
currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT,
timestampReadbackArea.mem.p,
timestampTicksPerSecond);
}
return QRhi::FrameOpSuccess;
}
@ -2058,6 +2173,36 @@ void QD3D12CpuDescriptorPool::release(const QD3D12Descriptor &descriptor, quint3
quint64(descriptor.cpuHandle.ptr));
}
// Creates a query heap of type heapType with room for queryCount queries on
// the given device. Returns true on success. On failure a warning is printed,
// heap is reset to null, capacity to 0, and false is returned.
bool QD3D12QueryHeap::create(ID3D12Device *device,
                             quint32 queryCount,
                             D3D12_QUERY_HEAP_TYPE heapType)
{
    capacity = queryCount;

    D3D12_QUERY_HEAP_DESC desc = {};
    desc.Type = heapType;
    desc.Count = capacity;

    const HRESULT result = device->CreateQueryHeap(&desc,
                                                   __uuidof(ID3D12QueryHeap),
                                                   reinterpret_cast<void **>(&heap));
    if (SUCCEEDED(result))
        return true;

    // Creation failed: report it and leave the object in the invalid state.
    qWarning("Failed to create query heap: %s", qPrintable(QSystemError::windowsComString(result)));
    heap = nullptr;
    capacity = 0;
    return false;
}
// Releases the query heap, if there is one, and resets the object to its
// invalid state (null heap, zero capacity). Calling it without a heap only
// resets the capacity.
void QD3D12QueryHeap::destroy()
{
    capacity = 0;

    if (!heap)
        return;

    heap->Release();
    heap = nullptr;
}
bool QD3D12StagingArea::create(QRhiD3D12 *rhi, quint32 capacity, D3D12_HEAP_TYPE heapType)
{
Q_ASSERT(heapType == D3D12_HEAP_TYPE_UPLOAD || heapType == D3D12_HEAP_TYPE_READBACK);

View File

@ -119,6 +119,18 @@ struct QD3D12CpuDescriptorPool
const char *debugName;
};
// Holds an ID3D12QueryHeap together with the number of queries it was
// created for.
struct QD3D12QueryHeap
{
    // True when a heap object is present and its query count is non-zero.
    bool isValid() const { return heap != nullptr && capacity != 0; }

    // Creates the heap on device with queryCount queries of type heapType;
    // returns false when creation fails.
    bool create(ID3D12Device *device,
                quint32 queryCount,
                D3D12_QUERY_HEAP_TYPE heapType);

    // Releases the heap (if any) and resets capacity to 0.
    void destroy();

    ID3D12QueryHeap *heap = nullptr; // null while no heap exists
    quint32 capacity = 0;            // number of queries in the heap
};
struct QD3D12StagingArea
{
static const quint32 ALIGNMENT = D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT; // 512 so good enough both for cb and texdata
@ -931,9 +943,11 @@ struct QD3D12CommandBuffer : public QRhiCommandBuffer
currentVertexOffsets = {};
}
// per-frame
PassType recordingPass;
QRhiRenderTarget *currentTarget;
// per-pass
QD3D12GraphicsPipeline *currentGraphicsPipeline;
QD3D12ComputePipeline *currentComputePipeline;
uint currentPipelineGeneration;
@ -945,6 +959,9 @@ struct QD3D12CommandBuffer : public QRhiCommandBuffer
DXGI_FORMAT currentIndexFormat;
std::array<QD3D12ObjectHandle, D3D12_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT> currentVertexBuffers;
std::array<quint32, D3D12_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT> currentVertexOffsets;
// global
double lastGpuTime = 0;
};
struct QD3D12SwapChain : public QRhiSwapChain
@ -1169,6 +1186,9 @@ public:
QD3D12MipmapGenerator mipmapGen;
QD3D12StagingArea smallStagingAreas[QD3D12_FRAMES_IN_FLIGHT];
QD3D12ShaderVisibleDescriptorHeap shaderVisibleCbvSrvUavHeap;
UINT64 timestampTicksPerSecond = 0;
QD3D12QueryHeap timestampQueryHeap;
QD3D12StagingArea timestampReadbackArea;
IDCompositionDevice *dcompDevice = nullptr;
QD3D12SwapChain *currentSwapChain = nullptr;
QSet<QD3D12SwapChain *> swapchains;