rhi: d3d12: Implement lastCompletedGpuTime via timestamp queries
Change-Id: I5f2588268cf4d52025f9b1c8d94cdcd9a742531c Reviewed-by: Andy Nichols <andy.nichols@qt.io> Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
This commit is contained in:
parent
95ed8d1fd9
commit
eb99d19cc4
@ -397,6 +397,9 @@ bool QRhiD3D12::create(QRhi::Flags flags)
|
||||
qWarning("Could not create host-visible staging area");
|
||||
return false;
|
||||
}
|
||||
QString decoratedName = QLatin1String("Small staging area buffer/");
|
||||
decoratedName += QString::number(i);
|
||||
smallStagingAreas[i].mem.buffer->SetName(reinterpret_cast<LPCWSTR>(decoratedName.utf16()));
|
||||
}
|
||||
|
||||
if (!shaderVisibleCbvSrvUavHeap.create(dev,
|
||||
@ -407,6 +410,45 @@ bool QRhiD3D12::create(QRhi::Flags flags)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (flags.testFlag(QRhi::EnableTimestamps)) {
|
||||
static bool wantsStablePowerState = qEnvironmentVariableIntValue("QT_D3D_STABLE_POWER_STATE");
|
||||
//
|
||||
// https://learn.microsoft.com/en-us/windows/win32/api/d3d12/nf-d3d12-id3d12device-setstablepowerstate
|
||||
//
|
||||
// NB! This is a _global_ setting, affecting other processes (and 3D
|
||||
// APIs such as Vulkan), as long as this application is running. Hence
|
||||
// making it an env.var. for now. Never enable it in production. But
|
||||
// extremely useful for the GPU timings with NVIDIA at least; the
|
||||
// timestamps become stable and smooth, making the number readable and
|
||||
// actually useful e.g. in Quick 3D's DebugView when this is enabled.
|
||||
// (otherwise the number's all over the place)
|
||||
//
|
||||
// See also
|
||||
// https://developer.nvidia.com/blog/advanced-api-performance-setstablepowerstate/
|
||||
// for possible other approaches.
|
||||
//
|
||||
if (wantsStablePowerState)
|
||||
dev->SetStablePowerState(TRUE);
|
||||
|
||||
hr = cmdQueue->GetTimestampFrequency(×tampTicksPerSecond);
|
||||
if (FAILED(hr)) {
|
||||
qWarning("Failed to query timestamp frequency: %s",
|
||||
qPrintable(QSystemError::windowsComString(hr)));
|
||||
return false;
|
||||
}
|
||||
if (!timestampQueryHeap.create(dev, QD3D12_FRAMES_IN_FLIGHT * 2, D3D12_QUERY_HEAP_TYPE_TIMESTAMP)) {
|
||||
qWarning("Failed to create timestamp query pool");
|
||||
return false;
|
||||
}
|
||||
const quint32 readbackBufSize = QD3D12_FRAMES_IN_FLIGHT * 2 * sizeof(quint64);
|
||||
if (!timestampReadbackArea.create(this, readbackBufSize, D3D12_HEAP_TYPE_READBACK)) {
|
||||
qWarning("Failed to create timestamp readback buffer");
|
||||
return false;
|
||||
}
|
||||
timestampReadbackArea.mem.buffer->SetName(L"Timestamp readback buffer");
|
||||
memset(timestampReadbackArea.mem.p, 0, readbackBufSize);
|
||||
}
|
||||
|
||||
D3D12_FEATURE_DATA_D3D12_OPTIONS3 options3 = {};
|
||||
if (SUCCEEDED(dev->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &options3, sizeof(options3))))
|
||||
caps.multiView = options3.ViewInstancingTier != D3D12_VIEW_INSTANCING_TIER_NOT_SUPPORTED;
|
||||
@ -439,6 +481,9 @@ void QRhiD3D12::destroy()
|
||||
}
|
||||
}
|
||||
|
||||
timestampQueryHeap.destroy();
|
||||
timestampReadbackArea.destroy();
|
||||
|
||||
shaderVisibleCbvSrvUavHeap.destroy();
|
||||
|
||||
for (int i = 0; i < QD3D12_FRAMES_IN_FLIGHT; ++i)
|
||||
@ -574,7 +619,7 @@ bool QRhiD3D12::isFeatureSupported(QRhi::Feature feature) const
|
||||
return false;
|
||||
#endif
|
||||
case QRhi::Timestamps:
|
||||
return false; // ###
|
||||
return true;
|
||||
case QRhi::Instancing:
|
||||
return true;
|
||||
case QRhi::CustomInstanceStepRate:
|
||||
@ -1389,8 +1434,24 @@ void QRhiD3D12::endExternal(QRhiCommandBuffer *cb)
|
||||
|
||||
double QRhiD3D12::lastCompletedGpuTime(QRhiCommandBuffer *cb)
|
||||
{
|
||||
Q_UNUSED(cb);
|
||||
return 0;
|
||||
QD3D12CommandBuffer *cbD = QRHI_RES(QD3D12CommandBuffer, cb);
|
||||
return cbD->lastGpuTime;
|
||||
}
|
||||
|
||||
static void calculateGpuTime(QD3D12CommandBuffer *cbD,
|
||||
int timestampPairStartIndex,
|
||||
const quint8 *readbackBufPtr,
|
||||
quint64 timestampTicksPerSecond)
|
||||
{
|
||||
const size_t byteOffset = timestampPairStartIndex * sizeof(quint64);
|
||||
const quint64 *p = reinterpret_cast<const quint64 *>(readbackBufPtr + byteOffset);
|
||||
const quint64 startTime = *p++;
|
||||
const quint64 endTime = *p;
|
||||
if (startTime < endTime) {
|
||||
const quint64 ticks = endTime - startTime;
|
||||
const double timeSec = ticks / double(timestampTicksPerSecond);
|
||||
cbD->lastGpuTime = timeSec;
|
||||
}
|
||||
}
|
||||
|
||||
QRhi::FrameOpResult QRhiD3D12::beginFrame(QRhiSwapChain *swapChain, QRhi::BeginFrameFlags flags)
|
||||
@ -1453,6 +1514,20 @@ QRhi::FrameOpResult QRhiD3D12::beginFrame(QRhiSwapChain *swapChain, QRhi::BeginF
|
||||
|
||||
finishActiveReadbacks(); // last, in case the readback-completed callback issues rhi calls
|
||||
|
||||
if (timestampQueryHeap.isValid() && timestampTicksPerSecond) {
|
||||
// Read the timestamps for the previous frame for this slot. (the
|
||||
// ResolveQuery() should have completed by now due to the wait above)
|
||||
const int timestampPairStartIndex = currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT;
|
||||
calculateGpuTime(cbD,
|
||||
timestampPairStartIndex,
|
||||
timestampReadbackArea.mem.p,
|
||||
timestampTicksPerSecond);
|
||||
// Write the start timestamp for this frame for this slot.
|
||||
cbD->cmdList->EndQuery(timestampQueryHeap.heap,
|
||||
D3D12_QUERY_TYPE_TIMESTAMP,
|
||||
timestampPairStartIndex);
|
||||
}
|
||||
|
||||
return QRhi::FrameOpSuccess;
|
||||
}
|
||||
|
||||
@ -1477,6 +1552,19 @@ QRhi::FrameOpResult QRhiD3D12::endFrame(QRhiSwapChain *swapChain, QRhi::EndFrame
|
||||
barrierGen.addTransitionBarrier(backBufferResourceHandle, D3D12_RESOURCE_STATE_PRESENT);
|
||||
barrierGen.enqueueBufferedTransitionBarriers(cbD);
|
||||
|
||||
if (timestampQueryHeap.isValid()) {
|
||||
const int timestampPairStartIndex = currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT;
|
||||
cbD->cmdList->EndQuery(timestampQueryHeap.heap,
|
||||
D3D12_QUERY_TYPE_TIMESTAMP,
|
||||
timestampPairStartIndex + 1);
|
||||
cbD->cmdList->ResolveQueryData(timestampQueryHeap.heap,
|
||||
D3D12_QUERY_TYPE_TIMESTAMP,
|
||||
timestampPairStartIndex,
|
||||
2,
|
||||
timestampReadbackArea.mem.buffer,
|
||||
timestampPairStartIndex * sizeof(quint64));
|
||||
}
|
||||
|
||||
ID3D12GraphicsCommandList1 *cmdList = cbD->cmdList;
|
||||
HRESULT hr = cmdList->Close();
|
||||
if (FAILED(hr)) {
|
||||
@ -1561,6 +1649,12 @@ QRhi::FrameOpResult QRhiD3D12::beginOffscreenFrame(QRhiCommandBuffer **cb, QRhi:
|
||||
|
||||
bindShaderVisibleHeaps(cbD);
|
||||
|
||||
if (timestampQueryHeap.isValid() && timestampTicksPerSecond) {
|
||||
cbD->cmdList->EndQuery(timestampQueryHeap.heap,
|
||||
D3D12_QUERY_TYPE_TIMESTAMP,
|
||||
currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT);
|
||||
}
|
||||
|
||||
offscreenActive = true;
|
||||
*cb = cbD;
|
||||
|
||||
@ -1574,6 +1668,19 @@ QRhi::FrameOpResult QRhiD3D12::endOffscreenFrame(QRhi::EndFrameFlags flags)
|
||||
offscreenActive = false;
|
||||
|
||||
QD3D12CommandBuffer *cbD = offscreenCb[currentFrameSlot];
|
||||
if (timestampQueryHeap.isValid()) {
|
||||
const int timestampPairStartIndex = currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT;
|
||||
cbD->cmdList->EndQuery(timestampQueryHeap.heap,
|
||||
D3D12_QUERY_TYPE_TIMESTAMP,
|
||||
timestampPairStartIndex + 1);
|
||||
cbD->cmdList->ResolveQueryData(timestampQueryHeap.heap,
|
||||
D3D12_QUERY_TYPE_TIMESTAMP,
|
||||
timestampPairStartIndex,
|
||||
2,
|
||||
timestampReadbackArea.mem.buffer,
|
||||
timestampPairStartIndex * sizeof(quint64));
|
||||
}
|
||||
|
||||
ID3D12GraphicsCommandList1 *cmdList = cbD->cmdList;
|
||||
HRESULT hr = cmdList->Close();
|
||||
if (FAILED(hr)) {
|
||||
@ -1594,6 +1701,14 @@ QRhi::FrameOpResult QRhiD3D12::endOffscreenFrame(QRhi::EndFrameFlags flags)
|
||||
// previous) frame is safe since we waited for completion above.
|
||||
finishActiveReadbacks(true);
|
||||
|
||||
// the timestamp query results should be available too, given the wait
|
||||
if (timestampQueryHeap.isValid()) {
|
||||
calculateGpuTime(cbD,
|
||||
currentFrameSlot * QD3D12_FRAMES_IN_FLIGHT,
|
||||
timestampReadbackArea.mem.p,
|
||||
timestampTicksPerSecond);
|
||||
}
|
||||
|
||||
return QRhi::FrameOpSuccess;
|
||||
}
|
||||
|
||||
@ -2058,6 +2173,36 @@ void QD3D12CpuDescriptorPool::release(const QD3D12Descriptor &descriptor, quint3
|
||||
quint64(descriptor.cpuHandle.ptr));
|
||||
}
|
||||
|
||||
bool QD3D12QueryHeap::create(ID3D12Device *device,
|
||||
quint32 queryCount,
|
||||
D3D12_QUERY_HEAP_TYPE heapType)
|
||||
{
|
||||
capacity = queryCount;
|
||||
|
||||
D3D12_QUERY_HEAP_DESC heapDesc = {};
|
||||
heapDesc.Type = heapType;
|
||||
heapDesc.Count = capacity;
|
||||
|
||||
HRESULT hr = device->CreateQueryHeap(&heapDesc, __uuidof(ID3D12QueryHeap), reinterpret_cast<void **>(&heap));
|
||||
if (FAILED(hr)) {
|
||||
qWarning("Failed to create query heap: %s", qPrintable(QSystemError::windowsComString(hr)));
|
||||
heap = nullptr;
|
||||
capacity = 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void QD3D12QueryHeap::destroy()
|
||||
{
|
||||
if (heap) {
|
||||
heap->Release();
|
||||
heap = nullptr;
|
||||
}
|
||||
capacity = 0;
|
||||
}
|
||||
|
||||
bool QD3D12StagingArea::create(QRhiD3D12 *rhi, quint32 capacity, D3D12_HEAP_TYPE heapType)
|
||||
{
|
||||
Q_ASSERT(heapType == D3D12_HEAP_TYPE_UPLOAD || heapType == D3D12_HEAP_TYPE_READBACK);
|
||||
|
@ -119,6 +119,18 @@ struct QD3D12CpuDescriptorPool
|
||||
const char *debugName;
|
||||
};
|
||||
|
||||
struct QD3D12QueryHeap
|
||||
{
|
||||
bool isValid() const { return heap && capacity; }
|
||||
bool create(ID3D12Device *device,
|
||||
quint32 queryCount,
|
||||
D3D12_QUERY_HEAP_TYPE heapType);
|
||||
void destroy();
|
||||
|
||||
ID3D12QueryHeap *heap = nullptr;
|
||||
quint32 capacity = 0;
|
||||
};
|
||||
|
||||
struct QD3D12StagingArea
|
||||
{
|
||||
static const quint32 ALIGNMENT = D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT; // 512 so good enough both for cb and texdata
|
||||
@ -931,9 +943,11 @@ struct QD3D12CommandBuffer : public QRhiCommandBuffer
|
||||
currentVertexOffsets = {};
|
||||
}
|
||||
|
||||
// per-frame
|
||||
PassType recordingPass;
|
||||
QRhiRenderTarget *currentTarget;
|
||||
|
||||
// per-pass
|
||||
QD3D12GraphicsPipeline *currentGraphicsPipeline;
|
||||
QD3D12ComputePipeline *currentComputePipeline;
|
||||
uint currentPipelineGeneration;
|
||||
@ -945,6 +959,9 @@ struct QD3D12CommandBuffer : public QRhiCommandBuffer
|
||||
DXGI_FORMAT currentIndexFormat;
|
||||
std::array<QD3D12ObjectHandle, D3D12_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT> currentVertexBuffers;
|
||||
std::array<quint32, D3D12_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT> currentVertexOffsets;
|
||||
|
||||
// global
|
||||
double lastGpuTime = 0;
|
||||
};
|
||||
|
||||
struct QD3D12SwapChain : public QRhiSwapChain
|
||||
@ -1169,6 +1186,9 @@ public:
|
||||
QD3D12MipmapGenerator mipmapGen;
|
||||
QD3D12StagingArea smallStagingAreas[QD3D12_FRAMES_IN_FLIGHT];
|
||||
QD3D12ShaderVisibleDescriptorHeap shaderVisibleCbvSrvUavHeap;
|
||||
UINT64 timestampTicksPerSecond = 0;
|
||||
QD3D12QueryHeap timestampQueryHeap;
|
||||
QD3D12StagingArea timestampReadbackArea;
|
||||
IDCompositionDevice *dcompDevice = nullptr;
|
||||
QD3D12SwapChain *currentSwapChain = nullptr;
|
||||
QSet<QD3D12SwapChain *> swapchains;
|
||||
|
Loading…
Reference in New Issue
Block a user