From ee061291b755959d5615205fbd6eaafbb32db515 Mon Sep 17 00:00:00 2001 From: Takahito Tejima Date: Thu, 8 May 2014 17:20:54 -0700 Subject: [PATCH] Interleaved buffer support in OsdCompute. Removed OsdVertexDescriptor and replaced it with OsdVertexBufferDescriptor. All kernels take offset/length/stride so that subdivision can be applied to a subset of each vertex's elements. The offset can also be used for client-based VBO aggregation without modifying index buffers. This is useful for topology sharing, in conjunction with glDrawElementsBaseVertex etc. However, the Gregory patch shader fetches the vertex buffer via a texture buffer, so its indices must be offset as well. Although the gl_BaseVertexARB extension should be able to do that job, it's a relatively new extension. So we use an OsdBaseVertex() call to mitigate the compatibility issue, since clients can provide it in their own way, at least for the time being. --- examples/facePartition/shader.glsl | 4 + examples/glBatchViewer/shader.glsl | 4 + examples/glViewer/shader.glsl | 4 + examples/paintTest/shader.glsl | 4 + examples/ptexViewer/shader.glsl | 4 + examples/uvViewer/shader.glsl | 4 + opensubdiv/osd/clComputeController.cpp | 219 +++++---- opensubdiv/osd/clComputeController.h | 73 ++- opensubdiv/osd/clKernel.cl | 221 +++++---- opensubdiv/osd/clKernelBundle.cpp | 31 +- opensubdiv/osd/clKernelBundle.h | 28 +- opensubdiv/osd/cpuComputeController.cpp | 59 ++- opensubdiv/osd/cpuComputeController.h | 74 ++- opensubdiv/osd/cpuKernel.cpp | 283 ++++++++---- opensubdiv/osd/cpuKernel.h | 41 +- opensubdiv/osd/cudaComputeController.cpp | 134 ++++-- opensubdiv/osd/cudaComputeController.h | 74 ++- opensubdiv/osd/cudaKernel.cu | 426 ++++++++++-------- opensubdiv/osd/d3d11ComputeController.cpp | 101 +++-- opensubdiv/osd/d3d11ComputeController.h | 79 +++- opensubdiv/osd/d3d11KernelBundle.cpp | 79 +++- opensubdiv/osd/d3d11KernelBundle.h | 57 ++- opensubdiv/osd/d3d11Mesh.h | 14 + opensubdiv/osd/gcdComputeController.cpp | 47 +- opensubdiv/osd/gcdComputeController.h | 
63 ++- opensubdiv/osd/gcdKernel.cpp | 128 ++++-- opensubdiv/osd/gcdKernel.h | 43 +- opensubdiv/osd/glMesh.h | 18 + opensubdiv/osd/glslComputeController.cpp | 101 +++-- opensubdiv/osd/glslComputeController.h | 81 +++- opensubdiv/osd/glslComputeKernel.glsl | 33 +- opensubdiv/osd/glslKernelBundle.cpp | 61 ++- opensubdiv/osd/glslKernelBundle.h | 34 +- opensubdiv/osd/glslPatchCommon.glsl | 1 + opensubdiv/osd/glslPatchGregory.glsl | 72 +-- ...glslTransformFeedbackComputeController.cpp | 133 +++--- .../glslTransformFeedbackComputeController.h | 91 ++-- .../osd/glslTransformFeedbackKernel.glsl | 16 +- .../osd/glslTransformFeedbackKernelBundle.cpp | 304 ++++++++----- .../osd/glslTransformFeedbackKernelBundle.h | 108 +++-- opensubdiv/osd/hlslComputeKernel.hlsl | 14 +- opensubdiv/osd/mesh.h | 12 + opensubdiv/osd/ompComputeController.cpp | 47 +- opensubdiv/osd/ompComputeController.h | 73 ++- opensubdiv/osd/ompKernel.cpp | 285 +++++++++--- opensubdiv/osd/ompKernel.h | 42 +- opensubdiv/osd/tbbComputeController.cpp | 60 ++- opensubdiv/osd/tbbComputeController.h | 75 ++- opensubdiv/osd/tbbKernel.cpp | 299 +++++++----- opensubdiv/osd/tbbKernel.h | 43 +- opensubdiv/osd/vertexDescriptor.h | 156 +------ 51 files changed, 2850 insertions(+), 1607 deletions(-) mode change 100644 => 100755 opensubdiv/osd/cpuKernel.cpp mode change 100644 => 100755 opensubdiv/osd/d3d11ComputeController.cpp mode change 100644 => 100755 opensubdiv/osd/d3d11ComputeController.h mode change 100644 => 100755 opensubdiv/osd/d3d11KernelBundle.h mode change 100644 => 100755 opensubdiv/osd/d3d11Mesh.h diff --git a/examples/facePartition/shader.glsl b/examples/facePartition/shader.glsl index 570cb7a7..e93b097e 100644 --- a/examples/facePartition/shader.glsl +++ b/examples/facePartition/shader.glsl @@ -67,6 +67,10 @@ int OsdPrimitiveIdBase() { return PrimitiveIdBase; } +int OsdBaseVertex() +{ + return 0; +} //-------------------------------------------------------------- // Vertex Shader diff --git 
a/examples/glBatchViewer/shader.glsl b/examples/glBatchViewer/shader.glsl index ff222b8d..a4e72606 100644 --- a/examples/glBatchViewer/shader.glsl +++ b/examples/glBatchViewer/shader.glsl @@ -99,6 +99,10 @@ int OsdPrimitiveIdBase() { return PrimitiveIdBase; } +int OsdBaseVertex() +{ + return 0; +} //-------------------------------------------------------------- // Vertex Shader diff --git a/examples/glViewer/shader.glsl b/examples/glViewer/shader.glsl index 6ce7581c..0b437acb 100644 --- a/examples/glViewer/shader.glsl +++ b/examples/glViewer/shader.glsl @@ -97,6 +97,10 @@ int OsdPrimitiveIdBase() { return PrimitiveIdBase; } +int OsdBaseVertex() +{ + return 0; +} //-------------------------------------------------------------- // Vertex Shader diff --git a/examples/paintTest/shader.glsl b/examples/paintTest/shader.glsl index 9ac8cf5c..e239983f 100644 --- a/examples/paintTest/shader.glsl +++ b/examples/paintTest/shader.glsl @@ -98,6 +98,10 @@ int OsdPrimitiveIdBase() { return PrimitiveIdBase; } +int OsdBaseVertex() +{ + return 0; +} //-------------------------------------------------------------- // Geometry Shader diff --git a/examples/ptexViewer/shader.glsl b/examples/ptexViewer/shader.glsl index b4c33e6e..3ee67f02 100644 --- a/examples/ptexViewer/shader.glsl +++ b/examples/ptexViewer/shader.glsl @@ -130,6 +130,10 @@ int OsdPrimitiveIdBase() { return PrimitiveIdBase; } +int OsdBaseVertex() +{ + return 0; +} //-------------------------------------------------------------- // Vertex Shader diff --git a/examples/uvViewer/shader.glsl b/examples/uvViewer/shader.glsl index 26df14e5..e2a34e9b 100644 --- a/examples/uvViewer/shader.glsl +++ b/examples/uvViewer/shader.glsl @@ -91,6 +91,10 @@ int OsdPrimitiveIdBase() { return PrimitiveIdBase; } +int OsdBaseVertex() +{ + return 0; +} //-------------------------------------------------------------- // Vertex Shader diff --git a/opensubdiv/osd/clComputeController.cpp b/opensubdiv/osd/clComputeController.cpp index 
39ed8302..3d7621f7 100644 --- a/opensubdiv/osd/clComputeController.cpp +++ b/opensubdiv/osd/clComputeController.cpp @@ -53,9 +53,7 @@ namespace OPENSUBDIV_VERSION { OsdCLComputeController::OsdCLComputeController(cl_context clContext, cl_command_queue queue) : - _clContext(clContext), _clQueue(queue), - _currentVertexBuffer(0), _currentVaryingBuffer(0), - _currentKernelBundle(NULL) { + _clContext(clContext), _clQueue(queue) { } OsdCLComputeController::~OsdCLComputeController() { @@ -73,21 +71,23 @@ OsdCLComputeController::Synchronize() { } OsdCLKernelBundle * -OsdCLComputeController::getKernelBundle(int numVertexElements, - int numVaryingElements) { +OsdCLComputeController::getKernelBundle( + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc) { std::vector::iterator it = std::find_if(_kernelRegistry.begin(), _kernelRegistry.end(), - OsdCLKernelBundle::Match(numVertexElements, - numVaryingElements)); + OsdCLKernelBundle::Match(vertexDesc, + varyingDesc)); + if (it != _kernelRegistry.end()) { return *it; } else { OsdCLKernelBundle *kernelBundle = new OsdCLKernelBundle(); _kernelRegistry.push_back(kernelBundle); kernelBundle->Compile(_clContext, - numVertexElements, - numVaryingElements); + vertexDesc, + varyingDesc); return kernelBundle; } } @@ -107,17 +107,19 @@ OsdCLComputeController::ApplyBilinearEdgeVerticesKernel( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; - cl_kernel kernel = _currentKernelBundle->GetBilinearEdgeKernel(); + cl_kernel kernel = _currentBindState.kernelBundle->GetBilinearEdgeKernel(); cl_mem E_IT = context->GetTable(FarSubdivisionTables::E_IT)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); 
clSetKernelArg(kernel, 2, sizeof(cl_mem), &E_IT); - clSetKernelArg(kernel, 3, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 4, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 3, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetEndPtr()); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -132,17 +134,19 @@ OsdCLComputeController::ApplyBilinearVertexVerticesKernel( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; - cl_kernel kernel = _currentKernelBundle->GetBilinearVertexKernel(); + cl_kernel kernel = _currentBindState.kernelBundle->GetBilinearVertexKernel(); cl_mem V_ITa = context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); - clSetKernelArg(kernel, 3, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 4, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 3, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 4, sizeof(int), 
&_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetEndPtr()); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); @@ -157,19 +161,21 @@ OsdCLComputeController::ApplyCatmarkFaceVerticesKernel( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; - cl_kernel kernel = _currentKernelBundle->GetCatmarkFaceKernel(); + cl_kernel kernel = _currentBindState.kernelBundle->GetCatmarkFaceKernel(); cl_mem F_IT = context->GetTable(FarSubdivisionTables::F_IT)->GetDevicePtr(); cl_mem F_ITa = context->GetTable(FarSubdivisionTables::F_ITa)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); clSetKernelArg(kernel, 2, sizeof(cl_mem), &F_IT); clSetKernelArg(kernel, 3, sizeof(cl_mem), &F_ITa); - clSetKernelArg(kernel, 4, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); ciErrNum = clEnqueueNDRangeKernel(_clQueue, 
kernel, 1, NULL, globalWorkSize, @@ -185,19 +191,21 @@ OsdCLComputeController::ApplyCatmarkEdgeVerticesKernel( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; - cl_kernel kernel = _currentKernelBundle->GetCatmarkEdgeKernel(); + cl_kernel kernel = _currentBindState.kernelBundle->GetCatmarkEdgeKernel(); cl_mem E_IT = context->GetTable(FarSubdivisionTables::E_IT)->GetDevicePtr(); cl_mem E_W = context->GetTable(FarSubdivisionTables::E_W)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); clSetKernelArg(kernel, 2, sizeof(cl_mem), &E_IT); clSetKernelArg(kernel, 3, sizeof(cl_mem), &E_W); - clSetKernelArg(kernel, 4, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, @@ -213,21 +221,23 @@ OsdCLComputeController::ApplyCatmarkVertexVerticesKernelB( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; - cl_kernel kernel = _currentKernelBundle->GetCatmarkVertexKernelB(); + cl_kernel kernel = _currentBindState.kernelBundle->GetCatmarkVertexKernelB(); cl_mem V_ITa = 
context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); cl_mem V_IT = context->GetTable(FarSubdivisionTables::V_IT)->GetDevicePtr(); cl_mem V_W = context->GetTable(FarSubdivisionTables::V_W)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); clSetKernelArg(kernel, 3, sizeof(cl_mem), &V_IT); clSetKernelArg(kernel, 4, sizeof(cl_mem), &V_W); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 8, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 10, sizeof(int), batch.GetEndPtr()); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, @@ -244,20 +254,22 @@ OsdCLComputeController::ApplyCatmarkVertexVerticesKernelA1( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; int ipass = false; - cl_kernel kernel = _currentKernelBundle->GetCatmarkVertexKernelA(); + cl_kernel kernel = _currentBindState.kernelBundle->GetCatmarkVertexKernelA(); cl_mem V_ITa = context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); cl_mem V_W = context->GetTable(FarSubdivisionTables::V_W)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - 
clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); clSetKernelArg(kernel, 3, sizeof(cl_mem), &V_W); - clSetKernelArg(kernel, 4, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetEndPtr()); - clSetKernelArg(kernel, 8, sizeof(int), &ipass); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 10, sizeof(int), &ipass); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, @@ -274,20 +286,22 @@ OsdCLComputeController::ApplyCatmarkVertexVerticesKernelA2( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; int ipass = true; - cl_kernel kernel = _currentKernelBundle->GetCatmarkVertexKernelA(); + cl_kernel kernel = _currentBindState.kernelBundle->GetCatmarkVertexKernelA(); cl_mem V_ITa = context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); cl_mem V_W = context->GetTable(FarSubdivisionTables::V_W)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); 
clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); clSetKernelArg(kernel, 3, sizeof(cl_mem), &V_W); - clSetKernelArg(kernel, 4, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetEndPtr()); - clSetKernelArg(kernel, 8, sizeof(int), &ipass); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 10, sizeof(int), &ipass); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, @@ -303,19 +317,21 @@ OsdCLComputeController::ApplyLoopEdgeVerticesKernel( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; - cl_kernel kernel = _currentKernelBundle->GetLoopEdgeKernel(); + cl_kernel kernel = _currentBindState.kernelBundle->GetLoopEdgeKernel(); cl_mem E_IT = context->GetTable(FarSubdivisionTables::E_IT)->GetDevicePtr(); cl_mem E_W = context->GetTable(FarSubdivisionTables::E_W)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); clSetKernelArg(kernel, 2, sizeof(cl_mem), &E_IT); clSetKernelArg(kernel, 3, sizeof(cl_mem), &E_W); - clSetKernelArg(kernel, 4, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), 
batch.GetStartPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, @@ -331,21 +347,23 @@ OsdCLComputeController::ApplyLoopVertexVerticesKernelB( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; - cl_kernel kernel = _currentKernelBundle->GetLoopVertexKernelB(); + cl_kernel kernel = _currentBindState.kernelBundle->GetLoopVertexKernelB(); cl_mem V_ITa = context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); cl_mem V_IT = context->GetTable(FarSubdivisionTables::V_IT)->GetDevicePtr(); cl_mem V_W = context->GetTable(FarSubdivisionTables::V_W)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); clSetKernelArg(kernel, 3, sizeof(cl_mem), &V_IT); clSetKernelArg(kernel, 4, sizeof(cl_mem), &V_W); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 8, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), &_currentBindState.varyingDesc.offset); + 
clSetKernelArg(kernel, 7, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 10, sizeof(int), batch.GetEndPtr()); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, @@ -362,20 +380,22 @@ OsdCLComputeController::ApplyLoopVertexVerticesKernelA1( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; int ipass = false; - cl_kernel kernel = _currentKernelBundle->GetLoopVertexKernelA(); + cl_kernel kernel = _currentBindState.kernelBundle->GetLoopVertexKernelA(); cl_mem V_ITa = context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); cl_mem V_W = context->GetTable(FarSubdivisionTables::V_W)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); clSetKernelArg(kernel, 3, sizeof(cl_mem), &V_W); - clSetKernelArg(kernel, 4, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetEndPtr()); - clSetKernelArg(kernel, 8, sizeof(int), &ipass); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 10, 
sizeof(int), &ipass); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, @@ -392,20 +412,22 @@ OsdCLComputeController::ApplyLoopVertexVerticesKernelA2( cl_int ciErrNum; size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; int ipass = true; - cl_kernel kernel = _currentKernelBundle->GetLoopVertexKernelA(); + cl_kernel kernel = _currentBindState.kernelBundle->GetLoopVertexKernelA(); cl_mem V_ITa = context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); cl_mem V_W = context->GetTable(FarSubdivisionTables::V_W)->GetDevicePtr(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentVaryingBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); clSetKernelArg(kernel, 3, sizeof(cl_mem), &V_W); - clSetKernelArg(kernel, 4, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetEndPtr()); - clSetKernelArg(kernel, 8, sizeof(int), &ipass); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 10, sizeof(int), &ipass); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, @@ -434,17 +456,18 @@ OsdCLComputeController::ApplyVertexEdits( int primvarWidth = edit->GetPrimvarWidth(); if (edit->GetOperation() == 
FarVertexEdit::Add) { - cl_kernel kernel = _currentKernelBundle->GetVertexEditAdd(); + cl_kernel kernel = _currentBindState.kernelBundle->GetVertexEditAdd(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentVertexBuffer); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); clSetKernelArg(kernel, 1, sizeof(cl_mem), &indices); clSetKernelArg(kernel, 2, sizeof(cl_mem), &values); - clSetKernelArg(kernel, 3, sizeof(int), &primvarOffset); - clSetKernelArg(kernel, 4, sizeof(int), &primvarWidth); - clSetKernelArg(kernel, 5, sizeof(int), batch.GetVertexOffsetPtr()); - clSetKernelArg(kernel, 6, sizeof(int), batch.GetTableOffsetPtr()); - clSetKernelArg(kernel, 7, sizeof(int), batch.GetStartPtr()); - clSetKernelArg(kernel, 8, sizeof(int), batch.GetEndPtr()); + clSetKernelArg(kernel, 3, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 4, sizeof(int), &primvarOffset); + clSetKernelArg(kernel, 5, sizeof(int), &primvarWidth); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); ciErrNum = clEnqueueNDRangeKernel(_clQueue, kernel, 1, NULL, globalWorkSize, diff --git a/opensubdiv/osd/clComputeController.h b/opensubdiv/osd/clComputeController.h index e97922a5..55f5b15b 100644 --- a/opensubdiv/osd/clComputeController.h +++ b/opensubdiv/osd/clComputeController.h @@ -29,6 +29,7 @@ #include "../far/dispatcher.h" #include "../osd/clComputeContext.h" +#include "../osd/vertexDescriptor.h" #if defined(__APPLE__) #include @@ -79,15 +80,25 @@ public: /// /// @param varyingBuffer varying-interpolated data buffer /// + /// @param vertexDesc the descriptor of vertex elements to be refined. + /// if it's null, all primvars in the vertex buffer + /// will be refined. 
+ /// + /// @param varyingDesc the descriptor of varying elements to be refined. + /// if it's null, all primvars in the varying buffer + /// will be refined. + /// template void Refine(ComputeContext const *context, FarKernelBatchVector const &batches, VERTEX_BUFFER *vertexBuffer, - VARYING_BUFFER *varyingBuffer) { + VARYING_BUFFER *varyingBuffer, + OsdVertexBufferDescriptor const *vertexDesc=NULL, + OsdVertexBufferDescriptor const *varyingDesc=NULL) { if (batches.empty()) return; - bind(vertexBuffer, varyingBuffer); + bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc); FarDispatcher::Refine(this, context, batches, /*maxlevel*/-1); @@ -152,33 +163,63 @@ protected: void ApplyVertexEdits(FarKernelBatch const &batch, ComputeContext const *context) const; - OsdCLKernelBundle * getKernelBundle(int numVertexElements, - int numVaryingElements); + OsdCLKernelBundle * getKernelBundle( + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc); template - void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying) { + void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, + OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { - int numVertexElements = vertex ? vertex->GetNumElements() : 0; - int numVaryingElements = varying ? varying->GetNumElements() : 0; + // if the vertex buffer descriptor is specified, use it. + // otherwise, assumes the data is tightly packed in the vertex buffer. + if (vertexDesc) { + _currentBindState.vertexDesc = *vertexDesc; + } else { + int numElements = vertex ? vertex->GetNumElements() : 0; + _currentBindState.vertexDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + if (varyingDesc) { + _currentBindState.varyingDesc = *varyingDesc; + } else { + int numElements = varying ? varying->GetNumElements() : 0; + _currentBindState.varyingDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } - _currentVertexBuffer = vertex ? 
vertex->BindCLBuffer(_clQueue) : NULL; - _currentVaryingBuffer = varying ? varying->BindCLBuffer(_clQueue) : NULL; - _currentKernelBundle = getKernelBundle(numVertexElements, numVaryingElements); + _currentBindState.vertexBuffer = vertex ? vertex->BindCLBuffer(_clQueue) : 0; + _currentBindState.varyingBuffer = varying ? varying->BindCLBuffer(_clQueue) : 0; + _currentBindState.kernelBundle = getKernelBundle(_currentBindState.vertexDesc, + _currentBindState.varyingDesc); } void unbind() { - _currentVertexBuffer = NULL; - _currentVaryingBuffer = NULL; - _currentKernelBundle = NULL; + _currentBindState.Reset(); } private: + struct BindState { + BindState() : vertexBuffer(NULL), varyingBuffer(NULL), kernelBundle(NULL) {} + void Reset() { + vertexBuffer = varyingBuffer = NULL; + vertexDesc.Reset(); + varyingDesc.Reset(); + kernelBundle = NULL; + } + cl_mem vertexBuffer; + cl_mem varyingBuffer; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + OsdCLKernelBundle *kernelBundle; + }; + + BindState _currentBindState; + cl_context _clContext; cl_command_queue _clQueue; std::vector _kernelRegistry; - - cl_mem _currentVertexBuffer, _currentVaryingBuffer; - OsdCLKernelBundle *_currentKernelBundle; }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/clKernel.cl b/opensubdiv/osd/clKernel.cl index 24a5ba64..79000f8d 100644 --- a/opensubdiv/osd/clKernel.cl +++ b/opensubdiv/osd/clKernel.cl @@ -28,12 +28,12 @@ struct Vertex { - float v[NUM_VERTEX_ELEMENTS]; + float v[VERTEX_STRIDE]; }; struct Varying { - float v[NUM_VARYING_ELEMENTS]; + float v[VARYING_STRIDE]; }; static void clearVertex(struct Vertex *vertex) { @@ -49,86 +49,121 @@ static void clearVarying(struct Varying *varying) { } } -static void addWithWeight(struct Vertex *dst, __global struct Vertex *src, float weight) { +static void addWithWeight(struct Vertex *dst, + __global float *srcOrigin, + int index, float weight) { - for (int i = 0; i < NUM_VERTEX_ELEMENTS; i++) { - 
dst->v[i] += src->v[i] * weight; + __global float *src = srcOrigin + index * VERTEX_STRIDE; + for (int i = 0; i < NUM_VERTEX_ELEMENTS; ++i) { + dst->v[i] += src[i] * weight; } } -static void addVaryingWithWeight(struct Varying *dst, __global struct Varying *src, float weight) { +static void addVaryingWithWeight(struct Varying *dst, + __global float *srcOrigin, + int index, float weight) { - for (int i = 0; i < NUM_VARYING_ELEMENTS; i++) { - dst->v[i] += src->v[i] * weight; + __global float *src = srcOrigin + index * VARYING_STRIDE; + for (int i = 0; i < NUM_VARYING_ELEMENTS; ++i) { + dst->v[i] += src[i] * weight; } } -__kernel void computeBilinearEdge(__global struct Vertex *vertex, - __global struct Varying *varying, +static void writeVertex(__global float *dstOrigin, + int index, + struct Vertex *src) { + + __global float *dst = dstOrigin + index * VERTEX_STRIDE; + for (int i = 0; i < NUM_VERTEX_ELEMENTS; ++i) { + dst[i] = src->v[i]; + } +} + +static void writeVarying(__global float *dstOrigin, + int index, + struct Varying *src) { + + __global float *dst = dstOrigin + index * VARYING_STRIDE; + for (int i = 0; i < NUM_VARYING_ELEMENTS; ++i) { + dst[i] = src->v[i]; + } +} + +__kernel void computeBilinearEdge(__global float *vertex, + __global float *varying, __global int *E_IT, - int vertexOffset, int tableOffset, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { int i = start + get_global_id(0) + tableOffset; - int vid = start + get_global_id(0) + vertexOffset; + int vid = start + get_global_id(0) + offset; int eidx0 = E_IT[2*i+0]; int eidx1 = E_IT[2*i+1]; + vertex += vertexOffset; + varying += (varying ? 
varyingOffset :0); struct Vertex dst; struct Varying dstVarying; clearVertex(&dst); clearVarying(&dstVarying); - addWithWeight(&dst, &vertex[eidx0], 0.5f); - addWithWeight(&dst, &vertex[eidx1], 0.5f); + addWithWeight(&dst, vertex, eidx0, 0.5f); + addWithWeight(&dst, vertex, eidx1, 0.5f); - vertex[vid] = dst; + writeVertex(vertex, vid, &dst); if (varying) { - addVaryingWithWeight(&dstVarying, &varying[eidx0], 0.5f); - addVaryingWithWeight(&dstVarying, &varying[eidx1], 0.5f); - varying[vid] = dstVarying; + addVaryingWithWeight(&dstVarying, varying, eidx0, 0.5f); + addVaryingWithWeight(&dstVarying, varying, eidx1, 0.5f); + writeVarying(varying, vid, &dstVarying); } } -__kernel void computeBilinearVertex(__global struct Vertex *vertex, - __global struct Varying *varying, +__kernel void computeBilinearVertex(__global float *vertex, + __global float *varying, __global int *V_ITa, - int vertexOffset, int tableOffset, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { int i = start + get_global_id(0) + tableOffset; - int vid = start + get_global_id(0) + vertexOffset; + int vid = start + get_global_id(0) + offset; + vertex += vertexOffset; + varying += (varying ? 
varyingOffset :0); int p = V_ITa[i]; struct Vertex dst; clearVertex(&dst); - addWithWeight(&dst, &vertex[p], 1.0f); + addWithWeight(&dst, vertex, p, 1.0f); - vertex[vid] = dst; + writeVertex(vertex, vid, &dst); if (varying) { struct Varying dstVarying; clearVarying(&dstVarying); - addVaryingWithWeight(&dstVarying, &varying[p], 1.0f); - varying[vid] = dstVarying; + addVaryingWithWeight(&dstVarying, varying, p, 1.0f); + writeVarying(varying, vid, &dstVarying); } } -// ---------------------------------------------------------------------------------------- +// --------------------------------------------------------------------------- -__kernel void computeFace(__global struct Vertex *vertex, - __global struct Varying *varying, +__kernel void computeFace(__global float *vertex, + __global float *varying, __global int *F_IT, __global int *F_ITa, - int vertexOffset, int tableOffset, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { int i = start + get_global_id(0) + tableOffset; - int vid = start + get_global_id(0) + vertexOffset; + int vid = start + get_global_id(0) + offset; int h = F_ITa[2*i]; int n = F_ITa[2*i+1]; + vertex += vertexOffset; + varying += (varying ? 
varyingOffset :0); float weight = 1.0f/n; @@ -138,26 +173,31 @@ __kernel void computeFace(__global struct Vertex *vertex, clearVarying(&dstVarying); for (int j=0; j -1) { float faceWeight = E_W[i*2+1]; - addWithWeight(&dst, &vertex[eidx2], faceWeight); - addWithWeight(&dst, &vertex[eidx3], faceWeight); + addWithWeight(&dst, vertex, eidx2, faceWeight); + addWithWeight(&dst, vertex, eidx3, faceWeight); } - vertex[vid] = dst; + writeVertex(vertex, vid, &dst); if (varying) { - addVaryingWithWeight(&dstVarying, &varying[eidx0], 0.5f); - addVaryingWithWeight(&dstVarying, &varying[eidx1], 0.5f); - varying[vid] = dstVarying; + addVaryingWithWeight(&dstVarying, varying, eidx0, 0.5f); + addVaryingWithWeight(&dstVarying, varying, eidx1, 0.5f); + writeVarying(varying, vid, &dstVarying); } } -__kernel void computeVertexA(__global struct Vertex *vertex, - __global struct Varying *varying, +__kernel void computeVertexA(__global float *vertex, + __global float *varying, __global int *V_ITa, __global float *V_W, - int vertexOffset, int tableOffset, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end, int pass) { int i = start + get_global_id(0) + tableOffset; - int vid = start + get_global_id(0) + vertexOffset; + int vid = start + get_global_id(0) + offset; int n = V_ITa[5*i+1]; int p = V_ITa[5*i+2]; int eidx0 = V_ITa[5*i+3]; int eidx1 = V_ITa[5*i+4]; + vertex += vertexOffset; + varying += (varying ? varyingOffset :0); float weight = (pass==1) ? V_W[i] : 1.0f - V_W[i]; @@ -209,41 +252,43 @@ __kernel void computeVertexA(__global struct Vertex *vertex, weight=1.0f-weight; struct Vertex dst; - if (! 
pass) - clearVertex(&dst); - else - dst = vertex[vid]; + clearVertex(&dst); + if (pass) + addWithWeight(&dst, vertex, vid, 1.0f); // copy previous result if (eidx0==-1 || (pass==0 && (n==-1)) ) { - addWithWeight(&dst, &vertex[p], weight); + addWithWeight(&dst, vertex, p, weight); } else { - addWithWeight(&dst, &vertex[p], weight * 0.75f); - addWithWeight(&dst, &vertex[eidx0], weight * 0.125f); - addWithWeight(&dst, &vertex[eidx1], weight * 0.125f); + addWithWeight(&dst, vertex, p, weight * 0.75f); + addWithWeight(&dst, vertex, eidx0, weight * 0.125f); + addWithWeight(&dst, vertex, eidx1, weight * 0.125f); } - vertex[vid] = dst; + writeVertex(vertex, vid, &dst); if (! pass && varying) { struct Varying dstVarying; clearVarying(&dstVarying); - addVaryingWithWeight(&dstVarying, &varying[p], 1.0f); - varying[vid] = dstVarying; + addVaryingWithWeight(&dstVarying, varying, p, 1.0f); + writeVarying(varying, vid, &dstVarying); } } -__kernel void computeVertexB(__global struct Vertex *vertex, - __global struct Varying *varying, +__kernel void computeVertexB(__global float *vertex, + __global float *varying, __global int *V_ITa, __global int *V_IT, __global float *V_W, - int vertexOffset, int tableOffset, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { int i = start + get_global_id(0) + tableOffset; - int vid = start + get_global_id(0) + vertexOffset; + int vid = start + get_global_id(0) + offset; int h = V_ITa[5*i]; int n = V_ITa[5*i+1]; int p = V_ITa[5*i+2]; + vertex += vertexOffset; + varying += (varying ? 
varyingOffset :0); float weight = V_W[i]; float wp = 1.0f/(float)(n*n); @@ -252,35 +297,38 @@ __kernel void computeVertexB(__global struct Vertex *vertex, struct Vertex dst; clearVertex(&dst); - addWithWeight(&dst, &vertex[p], weight * wv); + addWithWeight(&dst, vertex, p, weight * wv); for (int j = 0; j < n; ++j) { - addWithWeight(&dst, &vertex[V_IT[h+j*2]], weight * wp); - addWithWeight(&dst, &vertex[V_IT[h+j*2+1]], weight * wp); + addWithWeight(&dst, vertex, V_IT[h+j*2], weight * wp); + addWithWeight(&dst, vertex, V_IT[h+j*2+1], weight * wp); } - vertex[vid] = dst; + writeVertex(vertex, vid, &dst); if (varying) { struct Varying dstVarying; clearVarying(&dstVarying); - addVaryingWithWeight(&dstVarying, &varying[p], 1.0f); - varying[vid] = dstVarying; + addVaryingWithWeight(&dstVarying, varying, p, 1.0f); + writeVarying(varying, vid, &dstVarying); } } -__kernel void computeLoopVertexB(__global struct Vertex *vertex, - __global struct Varying *varying, +__kernel void computeLoopVertexB(__global float *vertex, + __global float *varying, __global int *V_ITa, __global int *V_IT, __global float *V_W, - int vertexOffset, int tableOffset, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { int i = start + get_global_id(0) + tableOffset; - int vid = start + get_global_id(0) + vertexOffset; + int vid = start + get_global_id(0) + offset; int h = V_ITa[5*i]; int n = V_ITa[5*i+1]; int p = V_ITa[5*i+2]; + vertex += vertexOffset; + varying += (varying ? 
varyingOffset :0); float weight = V_W[i]; float wp = 1.0f/(float)(n); @@ -290,36 +338,37 @@ __kernel void computeLoopVertexB(__global struct Vertex *vertex, struct Vertex dst; clearVertex(&dst); - addWithWeight(&dst, &vertex[p], weight * (1.0f - (beta * n))); + addWithWeight(&dst, vertex, p, weight * (1.0f - (beta * n))); for (int j = 0; j < n; ++j) { - addWithWeight(&dst, &vertex[V_IT[h+j]], weight * beta); + addWithWeight(&dst, vertex, V_IT[h+j], weight * beta); } - vertex[vid] = dst; + writeVertex(vertex, vid, &dst); if (varying) { struct Varying dstVarying; clearVarying(&dstVarying); - addVaryingWithWeight(&dstVarying, &varying[p], 1.0f); - varying[vid] = dstVarying; + addVaryingWithWeight(&dstVarying, varying, p, 1.0f); + writeVarying(varying, vid, &dstVarying); } } -__kernel void editVertexAdd(__global struct Vertex *vertex, +__kernel void editVertexAdd(__global float *vertex, __global int *editIndices, __global float *editValues, + int vertexOffset, int primVarOffset, int primVarWidth, - int vertexOffset, int tableOffset, + int offset, int tableOffset, int start, int end) { int i = start + get_global_id(0) + tableOffset; int v = editIndices[i]; int eid = start + get_global_id(0); - struct Vertex dst = vertex[v]; + vertex += vertexOffset; + vertex += v * VERTEX_STRIDE + primVarOffset; for (int j = 0; j < primVarWidth; ++j) { - dst.v[j+primVarOffset] += editValues[eid*primVarWidth + j]; + vertex[j] += editValues[eid*primVarWidth + j]; } - vertex[v] = dst; } diff --git a/opensubdiv/osd/clKernelBundle.cpp b/opensubdiv/osd/clKernelBundle.cpp index 32515171..e24ec200 100644 --- a/opensubdiv/osd/clKernelBundle.cpp +++ b/opensubdiv/osd/clKernelBundle.cpp @@ -28,6 +28,8 @@ #include "../osd/error.h" #include +#include + #ifdef _MSC_VER #define snprintf _snprintf #endif @@ -54,8 +56,11 @@ OsdCLKernelBundle::OsdCLKernelBundle() : _clCatmarkVertexB(NULL), _clLoopEdge(NULL), _clLoopVertexA(NULL), - _clLoopVertexB(NULL) - { + _clLoopVertexB(NULL), + _numVertexElements(0), 
+ _vertexStride(0), + _numVaryingElements(0), + _varyingStride(0) { } OsdCLKernelBundle::~OsdCLKernelBundle() { @@ -97,19 +102,24 @@ static cl_kernel buildKernel(cl_program prog, const char * name) { bool OsdCLKernelBundle::Compile(cl_context clContext, - int numVertexElements, int numVaryingElements) { + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc) { cl_int ciErrNum; - _vdesc.Set( numVertexElements, numVaryingElements ); + _numVertexElements = vertexDesc.length; + _vertexStride = vertexDesc.stride; + _numVaryingElements = varyingDesc.length; + _varyingStride = varyingDesc.stride; - char constantDefine[256]; - snprintf(constantDefine, sizeof(constantDefine), - "#define NUM_VERTEX_ELEMENTS %d\n" - "#define NUM_VARYING_ELEMENTS %d\n", - numVertexElements, numVaryingElements); + std::ostringstream defines; + defines << "#define NUM_VERTEX_ELEMENTS " << _numVertexElements << "\n" + << "#define VERTEX_STRIDE " << _vertexStride << "\n" + << "#define NUM_VARYING_ELEMENTS " << _numVaryingElements << "\n" + << "#define VARYING_STRIDE " << _varyingStride << "\n"; + std::string defineStr = defines.str(); - const char *sources[] = { constantDefine, clSource }; + const char *sources[] = { defineStr.c_str(), clSource }; _clProgram = clCreateProgramWithSource(clContext, 2, sources, 0, &ciErrNum); CL_CHECK_ERROR(ciErrNum, "clCreateProgramWithSource\n"); @@ -131,6 +141,7 @@ OsdCLKernelBundle::Compile(cl_context clContext, OsdError(OSD_CL_PROGRAM_BUILD_ERROR, cBuildLog); } delete[] devices; + return false; } diff --git a/opensubdiv/osd/clKernelBundle.h b/opensubdiv/osd/clKernelBundle.h index 2e086644..efa2c79a 100644 --- a/opensubdiv/osd/clKernelBundle.h +++ b/opensubdiv/osd/clKernelBundle.h @@ -47,7 +47,8 @@ public: ~OsdCLKernelBundle(); bool Compile(cl_context clContext, - int numVertexElements, int numVaryingElements); + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc); cl_kernel 
GetBilinearEdgeKernel() const { return _clBilinearEdge; } @@ -70,17 +71,23 @@ public: cl_kernel GetVertexEditAdd() const { return _clVertexEditAdd; } struct Match { - /// Constructor - Match(int numVertexElements, int numVaryingElements) - : vdesc(numVertexElements, numVaryingElements) { + Match(OsdVertexBufferDescriptor const &vertex, + OsdVertexBufferDescriptor const &varying) + : vertexDesc(vertex), varyingDesc(varying) { } - + bool operator() (OsdCLKernelBundle const *kernel) { - return vdesc == kernel->_vdesc; + // offset is dynamic. just comparing length and stride here, + // returns true if they are equal + return (vertexDesc.length == kernel->_numVertexElements and + vertexDesc.stride == kernel->_vertexStride and + varyingDesc.length == kernel->_numVaryingElements and + varyingDesc.stride == kernel->_varyingStride); } - - OsdVertexDescriptor vdesc; + + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; }; friend struct Match; @@ -99,7 +106,10 @@ protected: _clLoopVertexB, _clVertexEditAdd; - OsdVertexDescriptor _vdesc; + int _numVertexElements; + int _vertexStride; + int _numVaryingElements; + int _varyingStride; }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/cpuComputeController.cpp b/opensubdiv/osd/cpuComputeController.cpp index 3c072011..69c1973e 100644 --- a/opensubdiv/osd/cpuComputeController.cpp +++ b/opensubdiv/osd/cpuComputeController.cpp @@ -30,8 +30,7 @@ namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { -OsdCpuComputeController::OsdCpuComputeController() : - _currentVertexBuffer(NULL), _currentVaryingBuffer(NULL) { +OsdCpuComputeController::OsdCpuComputeController() { } OsdCpuComputeController::~OsdCpuComputeController() { @@ -44,7 +43,8 @@ OsdCpuComputeController::ApplyBilinearFaceVerticesKernel( assert(context); OsdCpuComputeFace( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, 
_currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::F_IT)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::F_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -57,7 +57,8 @@ OsdCpuComputeController::ApplyBilinearEdgeVerticesKernel( assert(context); OsdCpuComputeBilinearEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -69,7 +70,8 @@ OsdCpuComputeController::ApplyBilinearVertexVerticesKernel( assert(context); OsdCpuComputeBilinearVertex( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -81,7 +83,8 @@ OsdCpuComputeController::ApplyCatmarkFaceVerticesKernel( assert(context); OsdCpuComputeFace( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::F_IT)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::F_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -94,7 +97,8 @@ OsdCpuComputeController::ApplyCatmarkEdgeVerticesKernel( assert(context); OsdCpuComputeEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, 
_currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::E_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -107,7 +111,8 @@ OsdCpuComputeController::ApplyCatmarkVertexVerticesKernelB( assert(context); OsdCpuComputeVertexB( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), @@ -121,7 +126,8 @@ OsdCpuComputeController::ApplyCatmarkVertexVerticesKernelA1( assert(context); OsdCpuComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); @@ -134,7 +140,8 @@ OsdCpuComputeController::ApplyCatmarkVertexVerticesKernelA2( assert(context); OsdCpuComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); @@ -147,7 +154,8 @@ OsdCpuComputeController::ApplyLoopEdgeVerticesKernel( assert(context); OsdCpuComputeEdge( - _vdesc, 
_currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::E_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -160,7 +168,8 @@ OsdCpuComputeController::ApplyLoopVertexVerticesKernelB( assert(context); OsdCpuComputeLoopVertexB( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), @@ -174,7 +183,8 @@ OsdCpuComputeController::ApplyLoopVertexVerticesKernelA1( assert(context); OsdCpuComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); @@ -187,7 +197,8 @@ OsdCpuComputeController::ApplyLoopVertexVerticesKernelA2( assert(context); OsdCpuComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), 
true); @@ -206,24 +217,24 @@ OsdCpuComputeController::ApplyVertexEdits( const OsdCpuTable * editValues = edit->GetEditValues(); if (edit->GetOperation() == FarVertexEdit::Add) { - OsdCpuEditVertexAdd(_vdesc, - _currentVertexBuffer, + OsdCpuEditVertexAdd(_currentBindState.vertexBuffer, + _currentBindState.vertexDesc, edit->GetPrimvarOffset(), edit->GetPrimvarWidth(), - batch.GetVertexOffset(), - batch.GetTableOffset(), - batch.GetStart(), + batch.GetVertexOffset(), + batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), static_cast(primvarIndices->GetBuffer()), static_cast(editValues->GetBuffer())); } else if (edit->GetOperation() == FarVertexEdit::Set) { - OsdCpuEditVertexSet(_vdesc, - _currentVertexBuffer, + OsdCpuEditVertexSet(_currentBindState.vertexBuffer, + _currentBindState.vertexDesc, edit->GetPrimvarOffset(), edit->GetPrimvarWidth(), - batch.GetVertexOffset(), - batch.GetTableOffset(), - batch.GetStart(), + batch.GetVertexOffset(), + batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), static_cast(primvarIndices->GetBuffer()), static_cast(editValues->GetBuffer())); diff --git a/opensubdiv/osd/cpuComputeController.h b/opensubdiv/osd/cpuComputeController.h index 38c70e67..bbfc0ca6 100644 --- a/opensubdiv/osd/cpuComputeController.h +++ b/opensubdiv/osd/cpuComputeController.h @@ -29,6 +29,7 @@ #include "../far/dispatcher.h" #include "../osd/cpuComputeContext.h" +#include "../osd/vertexDescriptor.h" namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { @@ -64,15 +65,25 @@ public: /// /// @param varyingBuffer varying-interpolated data buffer /// + /// @param vertexDesc the descriptor of vertex elements to be refined. + /// if it's null, all primvars in the vertex buffer + /// will be refined. + /// + /// @param varyingDesc the descriptor of varying elements to be refined. + /// if it's null, all primvars in the varying buffer + /// will be refined. 
+ /// template void Refine(OsdCpuComputeContext const *context, FarKernelBatchVector const & batches, VERTEX_BUFFER *vertexBuffer, - VARYING_BUFFER *varyingBuffer) { + VARYING_BUFFER *varyingBuffer, + OsdVertexBufferDescriptor const *vertexDesc=NULL, + OsdVertexBufferDescriptor const *varyingDesc=NULL) { if (batches.empty()) return; - bind(vertexBuffer, varyingBuffer); + bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc); FarDispatcher::Refine(this, context, batches, /*maxlevel*/-1); @@ -130,25 +141,62 @@ protected: void ApplyVertexEdits(FarKernelBatch const &batch, ComputeContext const *context) const; template - void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying) { + void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, + OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { - _currentVertexBuffer = vertex ? vertex->BindCpuBuffer() : 0; - _currentVaryingBuffer = varying ? varying->BindCpuBuffer() : 0; + // if the vertex buffer descriptor is specified, use it. + // otherwise, assumes the data is tightly packed in the vertex buffer. + if (vertexDesc) { + _currentBindState.vertexDesc = *vertexDesc; + } else { + int numElements = vertex ? vertex->GetNumElements() : 0; + _currentBindState.vertexDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + if (varyingDesc) { + _currentBindState.varyingDesc = *varyingDesc; + } else { + int numElements = varying ? varying->GetNumElements() : 0; + _currentBindState.varyingDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } - int numVertexElements = vertex ? vertex->GetNumElements() : 0; - int numVaryingElements = varying ? 
varying->GetNumElements() : 0; - _vdesc.Set(numVertexElements, numVaryingElements); + // apply vertex offset here + if (vertex) { + _currentBindState.vertexBuffer = + vertex->BindCpuBuffer() + _currentBindState.vertexDesc.offset; + } else { + _currentBindState.vertexBuffer = NULL; + } + if (varying) { + _currentBindState.varyingBuffer = + varying->BindCpuBuffer() + _currentBindState.varyingDesc.offset; + } else { + _currentBindState.varyingBuffer = NULL; + } } void unbind() { - _currentVertexBuffer = 0; - _currentVaryingBuffer = 0; - _vdesc.Reset(); + _currentBindState.Reset(); } private: - float *_currentVertexBuffer, *_currentVaryingBuffer; - OsdVertexDescriptor _vdesc; + // Bind state is a transient state used during refinement. + // It doesn't take ownership of the vertex buffers. + struct BindState { + BindState() : vertexBuffer(NULL), varyingBuffer(NULL) {} + void Reset() { + vertexBuffer = varyingBuffer = NULL; + vertexDesc.Reset(); + varyingDesc.Reset(); + } + float *vertexBuffer; + float *varyingBuffer; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + }; + BindState _currentBindState; }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/cpuKernel.cpp b/opensubdiv/osd/cpuKernel.cpp old mode 100644 new mode 100755 index b84e04a0..3012181e --- a/opensubdiv/osd/cpuKernel.cpp +++ b/opensubdiv/osd/cpuKernel.cpp @@ -25,54 +25,101 @@ #include "../osd/cpuKernel.h" #include "../osd/vertexDescriptor.h" +#include +#include +#include + namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { +static inline void +clear(float *dst, OsdVertexBufferDescriptor const &desc) { + + memset(dst, 0, desc.length*sizeof(float)); +} + +static inline void +addWithWeight(float *dst, const float *srcOrigin, int srcIndex, float weight, + OsdVertexBufferDescriptor const &desc) { + + if (srcOrigin && dst) { + const float *src = srcOrigin + srcIndex * desc.stride; + for (int k = 0; k < desc.length; ++k) { + dst[k] += src[k] * weight; + } + } +} + 
+static inline void +copy(float *dstOrigin, const float *src, int dstIndex, + OsdVertexBufferDescriptor const &desc) { + + if (dstOrigin && src) { + float *dst = dstOrigin + dstIndex * desc.stride; + memcpy(dst, src, desc.length*sizeof(float)); + } +} + void OsdCpuComputeFace( - OsdVertexDescriptor const &vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *F_IT, const int *F_ITa, int vertexOffset, int tableOffset, int start, int end) { - if(vdesc.numVertexElements == 4 && varying == NULL) { + if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeFaceKernel<4> (vertex, F_IT, F_ITa, vertexOffset, tableOffset, start, end); - } else if(vdesc.numVertexElements == 8 && varying == NULL) { + } else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeFaceKernel<8> (vertex, F_IT, F_ITa, vertexOffset, tableOffset, start, end); } else { + float *vertexResults = (float*)alloca(vertexDesc.length * sizeof(float)); + float *varyingResults = (float*)alloca(varyingDesc.length * sizeof(float)); + for (int i = start + tableOffset; i < end + tableOffset; i++) { int h = F_ITa[2*i]; int n = F_ITa[2*i+1]; float weight = 1.0f/n; - - // XXX: should use local vertex struct variable instead of - // accumulating directly into global memory. 
int dstIndex = i + vertexOffset - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); + // clear + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + // accum for (int j = 0; j < n; ++j) { int index = F_IT[h+j]; - vdesc.AddWithWeight(vertex, dstIndex, index, weight); - vdesc.AddVaryingWithWeight(varying, dstIndex, index, weight); + addWithWeight(vertexResults, vertex, index, weight, vertexDesc); + addWithWeight(varyingResults, varying, index, weight, varyingDesc); } - } + + // write results + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } } } void OsdCpuComputeEdge( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, const float *E_W, int vertexOffset, int tableOffset, int start, int end) { - if(vdesc.numVertexElements == 4 && varying == NULL) { + if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeEdgeKernel<4>(vertex, E_IT, E_W, vertexOffset, tableOffset, start, end); } - else if(vdesc.numVertexElements == 8 && varying == NULL) { + else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeEdgeKernel<8>(vertex, E_IT, E_W, vertexOffset, tableOffset, - start, end); + start, end); } else { + float *vertexResults = (float*)alloca(vertexDesc.length * sizeof(float)); + float *varyingResults = (float*)alloca(varyingDesc.length * sizeof(float)); + for (int i = start + tableOffset; i < end + tableOffset; i++) { int eidx0 = E_IT[4*i+0]; int eidx1 = E_IT[4*i+1]; @@ -82,37 +129,46 @@ void OsdCpuComputeEdge( float vertWeight = E_W[i*2+0]; int dstIndex = i + vertexOffset - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); - vdesc.AddWithWeight(vertex, dstIndex, eidx0, vertWeight); - 
vdesc.AddWithWeight(vertex, dstIndex, eidx1, vertWeight); + addWithWeight(vertexResults, vertex, eidx0, vertWeight, vertexDesc); + addWithWeight(vertexResults, vertex, eidx1, vertWeight, vertexDesc); if (eidx2 != -1) { float faceWeight = E_W[i*2+1]; - vdesc.AddWithWeight(vertex, dstIndex, eidx2, faceWeight); - vdesc.AddWithWeight(vertex, dstIndex, eidx3, faceWeight); + addWithWeight(vertexResults, vertex, eidx2, faceWeight, vertexDesc); + addWithWeight(vertexResults, vertex, eidx3, faceWeight, vertexDesc); } - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx0, 0.5f); - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx1, 0.5f); - } + addWithWeight(varyingResults, varying, eidx0, 0.5f, varyingDesc); + addWithWeight(varyingResults, varying, eidx1, 0.5f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } } } void OsdCpuComputeVertexA( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const float *V_W, int vertexOffset, int tableOffset, int start, int end, int pass) { - if(vdesc.numVertexElements == 4 && varying == NULL) { + if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeVertexAKernel<4>(vertex, V_ITa, V_W, vertexOffset, tableOffset, start, end, pass); } - else if (vdesc.numVertexElements == 8 && varying == NULL) { + else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeVertexAKernel<8>(vertex, V_ITa, V_W, vertexOffset, tableOffset, start, end, pass); - } + } else { + float *vertexResults = (float*)alloca(vertexDesc.length * sizeof(float)); + float *varyingResults = (float*)alloca(varyingDesc.length * sizeof(float)); + for (int i = start + tableOffset; i < end + tableOffset; i++) { int n = V_ITa[5*i+1]; int p = V_ITa[5*i+2]; @@ -129,36 +185,48 @@ void 
OsdCpuComputeVertexA( int dstIndex = i + vertexOffset - tableOffset; - if (not pass) - vdesc.Clear(vertex, varying, dstIndex); - - if (eidx0 == -1 || (pass == 0 && (n == -1))) { - vdesc.AddWithWeight(vertex, dstIndex, p, weight); - } else { - vdesc.AddWithWeight(vertex, dstIndex, p, weight * 0.75f); - vdesc.AddWithWeight(vertex, dstIndex, eidx0, weight * 0.125f); - vdesc.AddWithWeight(vertex, dstIndex, eidx1, weight * 0.125f); + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + if (pass) { + // copy previous results + addWithWeight(vertexResults, vertex, dstIndex, 1.0f, vertexDesc); } - if (not pass) - vdesc.AddVaryingWithWeight(varying, dstIndex, p, 1.0f); - } + if (eidx0 == -1 || (pass == 0 && (n == -1))) { + addWithWeight(vertexResults, vertex, p, weight, vertexDesc); + } else { + addWithWeight(vertexResults, vertex, p, weight * 0.75f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx0, weight * 0.125f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx1, weight * 0.125f, vertexDesc); + } + + copy(vertex, vertexResults, dstIndex, vertexDesc); + if (not pass) { + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } + } } } void OsdCpuComputeVertexB( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end) { - if(vdesc.numVertexElements == 4 && varying == NULL) { + if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeVertexBKernel<4>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, start, end); } - else if(vdesc.numVertexElements == 8 && varying == NULL) { + else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeVertexBKernel<8>(vertex, V_ITa, V_IT, V_W, 
vertexOffset, tableOffset, start, end); - } + } else { + float *vertexResults = (float*)alloca(vertexDesc.length * sizeof(float)); + float *varyingResults = (float*)alloca(varyingDesc.length * sizeof(float)); + for (int i = start + tableOffset; i < end + tableOffset; i++) { int h = V_ITa[5*i]; int n = V_ITa[5*i+1]; @@ -169,32 +237,41 @@ void OsdCpuComputeVertexB( float wv = (n-2.0f) * n * wp; int dstIndex = i + vertexOffset - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); - vdesc.AddWithWeight(vertex, dstIndex, p, weight * wv); + addWithWeight(vertexResults, vertex, p, weight * wv, vertexDesc); for (int j = 0; j < n; ++j) { - vdesc.AddWithWeight(vertex, dstIndex, V_IT[h+j*2], weight * wp); - vdesc.AddWithWeight(vertex, dstIndex, V_IT[h+j*2+1], weight * wp); + addWithWeight(vertexResults, vertex, V_IT[h+j*2], weight * wp, vertexDesc); + addWithWeight(vertexResults, vertex, V_IT[h+j*2+1], weight * wp, vertexDesc); } - vdesc.AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); } - } + } } void OsdCpuComputeLoopVertexB( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end) { - if(vdesc.numVertexElements == 4 && varying == NULL) { + if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeLoopVertexBKernel<4>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, start, end); } - else if(vdesc.numVertexElements == 8 && varying == NULL) { + else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeLoopVertexBKernel<8>(vertex, 
V_ITa, V_IT, V_W, vertexOffset, tableOffset, start, end); } else { + float *vertexResults = (float*)alloca(vertexDesc.length * sizeof(float)); + float *varyingResults = (float*)alloca(varyingDesc.length * sizeof(float)); + for (int i = start + tableOffset; i < end + tableOffset; i++) { int h = V_ITa[5*i]; int n = V_ITa[5*i+1]; @@ -207,94 +284,120 @@ void OsdCpuComputeLoopVertexB( beta = (0.625f - beta) * wp; int dstIndex = i + vertexOffset - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); - vdesc.AddWithWeight(vertex, dstIndex, p, weight * (1.0f - (beta * n))); + addWithWeight(vertexResults, vertex, p, weight * (1.0f - (beta * n)), vertexDesc); for (int j = 0; j < n; ++j) - vdesc.AddWithWeight(vertex, dstIndex, V_IT[h+j], weight * beta); + addWithWeight(vertexResults, vertex, V_IT[h+j], weight * beta, vertexDesc); - vdesc.AddVaryingWithWeight(varying, dstIndex, p, 1.0f); - } - } + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } + } } void OsdCpuComputeBilinearEdge( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, int vertexOffset, int tableOffset, int start, int end) { - if(vdesc.numVertexElements == 4 && varying == NULL) { + if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeBilinearEdgeKernel<4>(vertex, E_IT, vertexOffset, tableOffset, start, end); } - else if(vdesc.numVertexElements == 8 && varying == NULL) { + else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeBilinearEdgeKernel<8>(vertex, E_IT, vertexOffset, tableOffset, start, end); } else { + float *vertexResults = (float*)alloca(vertexDesc.length * sizeof(float)); + float 
*varyingResults = (float*)alloca(varyingDesc.length * sizeof(float)); + for (int i = start + tableOffset; i < end + tableOffset; i++) { int eidx0 = E_IT[2*i+0]; int eidx1 = E_IT[2*i+1]; int dstIndex = i + vertexOffset - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); - vdesc.AddWithWeight(vertex, dstIndex, eidx0, 0.5f); - vdesc.AddWithWeight(vertex, dstIndex, eidx1, 0.5f); + addWithWeight(vertexResults, vertex, eidx0, 0.5f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx1, 0.5f, vertexDesc); - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx0, 0.5f); - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx1, 0.5f); - } + addWithWeight(varyingResults, varying, eidx0, 0.5f, varyingDesc); + addWithWeight(varyingResults, varying, eidx1, 0.5f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } } } void OsdCpuComputeBilinearVertex( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, int vertexOffset, int tableOffset, int start, int end) { - int numVertexElements = vdesc.numVertexElements; - int numVaryingElements = vdesc.numVaryingElements; - float *src, *des; + + float *src, *des; for (int i = start + tableOffset; i < end + tableOffset; i++) { int p = V_ITa[i]; - int dstIndex = i + vertexOffset - tableOffset; - src = vertex + p * numVertexElements; - des = vertex + dstIndex * numVertexElements; - memcpy(des, src, sizeof(float)*numVertexElements); - if(varying) { - src = varying + p * numVaryingElements; - des = varying + dstIndex * numVaryingElements; - memcpy(des, src, sizeof(float)*numVaryingElements); + int dstIndex = i + vertexOffset - tableOffset; + if (vertex) { + src = vertex + p * vertexDesc.stride; + des = vertex + dstIndex * 
vertexDesc.stride; + memcpy(des, src, sizeof(float)*vertexDesc.length); + } + if (varying) { + src = varying + p * varyingDesc.stride; + des = varying + dstIndex * varyingDesc.stride; + memcpy(des, src, sizeof(float)*varyingDesc.length); } } } void OsdCpuEditVertexAdd( - OsdVertexDescriptor const &vdesc, float *vertex, + float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, const unsigned int *editIndices, const float *editValues) { for (int i = start+tableOffset; i < end+tableOffset; i++) { - vdesc.ApplyVertexEditAdd(vertex, - primVarOffset, - primVarWidth, - editIndices[i] + vertexOffset, - &editValues[i*primVarWidth]); + + if (vertex) { + int editIndex = editIndices[i] + vertexOffset; + float *dst = vertex + editIndex * vertexDesc.stride + primVarOffset; + /* edit i owns primVarWidth consecutive floats of editValues, as in the removed &editValues[i*primVarWidth] call */ + for (int j = 0; j < primVarWidth; ++j) { + dst[j] += editValues[i*primVarWidth + j]; + } + } } } void OsdCpuEditVertexSet( - OsdVertexDescriptor const &vdesc, float *vertex, + float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, const unsigned int *editIndices, const float *editValues) { for (int i = start+tableOffset; i < end+tableOffset; i++) { - vdesc.ApplyVertexEditSet(vertex, - primVarOffset, - primVarWidth, - editIndices[i] + vertexOffset, - &editValues[i*primVarWidth]); + + if (vertex) { + int editIndex = editIndices[i] + vertexOffset; + float *dst = vertex + editIndex * vertexDesc.stride + primVarOffset; + /* edit i owns primVarWidth consecutive floats of editValues, as in the removed &editValues[i*primVarWidth] call */ + for (int j = 0; j < primVarWidth; ++j) { + dst[j] = editValues[i*primVarWidth + j]; + } + } } } diff --git a/opensubdiv/osd/cpuKernel.h b/opensubdiv/osd/cpuKernel.h index 3b962f38..b3dd8fe2 100644 --- a/opensubdiv/osd/cpuKernel.h +++ b/opensubdiv/osd/cpuKernel.h @@ -88,8 +88,9 @@ void ComputeFaceKernel(float *vertex, memcpy(des, result1, sizeof(float)*numVertexElements); } } -void OsdCpuComputeFace(OsdVertexDescriptor const &vdesc,
- float * vertex, float * varying, +void OsdCpuComputeFace(float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *F_IT, const int *F_ITa, int vertexOffset, int tableOffset, int start, int end); @@ -147,8 +148,9 @@ void ComputeEdgeKernel( float *vertex, memcpy(des, result1, sizeof(float)*numVertexElements); } } -void OsdCpuComputeEdge(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdCpuComputeEdge(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, const float *E_ITa, int vertexOffset, int tableOffset, int start, int end); @@ -230,8 +232,9 @@ void ComputeVertexAKernel( float *vertex, memcpy(des, result1, sizeof(float)*numVertexElements); } } -void OsdCpuComputeVertexA(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdCpuComputeVertexA(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const float *V_IT, int vertexOffset, int tableOffset, int start, int end, int pass); @@ -291,8 +294,9 @@ void ComputeVertexBKernel( float *vertex, } } -void OsdCpuComputeVertexB(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdCpuComputeVertexB(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end); @@ -350,8 +354,9 @@ void ComputeLoopVertexBKernel( float *vertex, memcpy(des, result1, sizeof(float)*numVertexElements); } } -void OsdCpuComputeLoopVertexB(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdCpuComputeLoopVertexB(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const 
&varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, @@ -385,26 +390,30 @@ void ComputeBilinearEdgeKernel( float *vertex, memcpy(des, result, sizeof(float)*numVertexElements); } } -void OsdCpuComputeBilinearEdge(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdCpuComputeBilinearEdge(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, int vertexOffset, int tableOffset, int start, int end); -void OsdCpuComputeBilinearVertex(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdCpuComputeBilinearVertex(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, int vertexOffset, int tableOffset, int start, int end); -void OsdCpuEditVertexAdd(OsdVertexDescriptor const &vdesc, float *vertex, +void OsdCpuEditVertexAdd(float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, const unsigned int *editIndices, const float *editValues); -void OsdCpuEditVertexSet(OsdVertexDescriptor const &vdesc, float *vertex, +void OsdCpuEditVertexSet(float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, diff --git a/opensubdiv/osd/cudaComputeController.cpp b/opensubdiv/osd/cudaComputeController.cpp index 357c37ea..d900485f 100644 --- a/opensubdiv/osd/cudaComputeController.cpp +++ b/opensubdiv/osd/cudaComputeController.cpp @@ -31,42 +31,47 @@ extern "C" { void OsdCudaComputeFace(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end); void 
OsdCudaComputeEdge(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *E_IT, float *E_W, int offset, int tableOffset, int start, int end); void OsdCudaComputeVertexA(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *V_ITa, float *V_W, int offset, int tableOffset, int start, int end, int pass); void OsdCudaComputeVertexB(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end); void OsdCudaComputeLoopVertexB(float *vertex, float *varying, - int numUserVertexElements, - int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end); void OsdCudaComputeBilinearEdge(float *vertex, float *varying, - int numUserVertexElements, - int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *E_IT, int offset, int tableOffset, int start, int end); void OsdCudaComputeBilinearVertex(float *vertex, float *varying, - int numUserVertexElements, - int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *V_ITa, int offset, int tableOffset, int start, int end); -void OsdCudaEditVertexAdd(float *vertex, int numUserVertexElements, +void OsdCudaEditVertexAdd(float *vertex, + int vertexLength, int vertexStride, int primVarOffset, int primVarWidth, - int vertexOffset, int tableOffset, + int offset, int tableOffset, int start, int end, int *editIndices, float *editValues); } @@ -74,8 +79,7 @@ void OsdCudaEditVertexAdd(float *vertex, int numUserVertexElements, 
namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { -OsdCudaComputeController::OsdCudaComputeController() : - _currentVertexBuffer(NULL), _currentVaryingBuffer(NULL) { +OsdCudaComputeController::OsdCudaComputeController() { } OsdCudaComputeController::~OsdCudaComputeController() { @@ -92,9 +96,13 @@ OsdCudaComputeController::ApplyBilinearFaceVerticesKernel( assert(F_IT); assert(F_ITa); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeFace( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(F_IT->GetCudaMemory()), static_cast(F_ITa->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -109,9 +117,13 @@ OsdCudaComputeController::ApplyBilinearEdgeVerticesKernel( const OsdCudaTable * E_IT = context->GetTable(FarSubdivisionTables::E_IT); assert(E_IT); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeBilinearEdge( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(E_IT->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -125,9 +137,13 @@ OsdCudaComputeController::ApplyBilinearVertexVerticesKernel( const OsdCudaTable * V_ITa = context->GetTable(FarSubdivisionTables::V_ITa); assert(V_ITa); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = 
_currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeBilinearVertex( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(V_ITa->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -143,9 +159,13 @@ OsdCudaComputeController::ApplyCatmarkFaceVerticesKernel( assert(F_IT); assert(F_ITa); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeFace( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(F_IT->GetCudaMemory()), static_cast(F_ITa->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -162,9 +182,13 @@ OsdCudaComputeController::ApplyCatmarkEdgeVerticesKernel( assert(E_IT); assert(E_W); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeEdge( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(E_IT->GetCudaMemory()), static_cast(E_W->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -183,9 +207,13 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelB( assert(V_IT); assert(V_W); + float *vertex = 
_currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeVertexB( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(V_ITa->GetCudaMemory()), static_cast(V_IT->GetCudaMemory()), static_cast(V_W->GetCudaMemory()), @@ -203,9 +231,13 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA1( assert(V_ITa); assert(V_W); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeVertexA( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(V_ITa->GetCudaMemory()), static_cast(V_W->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); @@ -222,9 +254,13 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA2( assert(V_ITa); assert(V_W); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeVertexA( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(V_ITa->GetCudaMemory()), static_cast(V_W->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); @@ -241,9 +277,13 @@ 
OsdCudaComputeController::ApplyLoopEdgeVerticesKernel( assert(E_IT); assert(E_W); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeEdge( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(E_IT->GetCudaMemory()), static_cast(E_W->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -262,9 +302,13 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelB( assert(V_IT); assert(V_W); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeLoopVertexB( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(V_ITa->GetCudaMemory()), static_cast(V_IT->GetCudaMemory()), static_cast(V_W->GetCudaMemory()), @@ -282,9 +326,13 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA1( assert(V_ITa); assert(V_W); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeVertexA( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(V_ITa->GetCudaMemory()), static_cast(V_W->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), 
batch.GetEnd(), false); @@ -301,9 +349,13 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA2( assert(V_ITa); assert(V_W); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + OsdCudaComputeVertexA( - _currentVertexBuffer, _currentVaryingBuffer, - _vdesc.numVertexElements-3, _vdesc.numVaryingElements, + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, static_cast(V_ITa->GetCudaMemory()), static_cast(V_W->GetCudaMemory()), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); @@ -321,10 +373,12 @@ OsdCudaComputeController::ApplyVertexEdits( const OsdCudaTable * primvarIndices = edit->GetPrimvarIndices(); const OsdCudaTable * editValues = edit->GetEditValues(); + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + if (edit->GetOperation() == FarVertexEdit::Add) { OsdCudaEditVertexAdd( - _currentVertexBuffer, - _vdesc.numVertexElements-3, + vertex, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, edit->GetPrimvarOffset(), edit->GetPrimvarWidth(), batch.GetVertexOffset(), diff --git a/opensubdiv/osd/cudaComputeController.h b/opensubdiv/osd/cudaComputeController.h index 2bb47460..ffacb7ac 100644 --- a/opensubdiv/osd/cudaComputeController.h +++ b/opensubdiv/osd/cudaComputeController.h @@ -29,6 +29,7 @@ #include "../far/dispatcher.h" #include "../osd/cudaComputeContext.h" +#include "../osd/vertexDescriptor.h" namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { @@ -64,15 +65,25 @@ public: /// /// @param varyingBuffer varying-interpolated data buffer /// + /// @param vertexDesc the descriptor of vertex elements to be refined. + /// if it's null, all primvars in the vertex buffer + /// will be refined. + /// + /// @param varyingDesc the descriptor of varying elements to be refined. 
+ /// if it's null, all primvars in the varying buffer + /// will be refined. + /// template void Refine(OsdCudaComputeContext const *context, FarKernelBatchVector const &batches, VERTEX_BUFFER *vertexBuffer, - VARYING_BUFFER *varyingBuffer) { + VARYING_BUFFER *varyingBuffer, + OsdVertexBufferDescriptor const *vertexDesc=NULL, + OsdVertexBufferDescriptor const *varyingDesc=NULL) { if (batches.empty()) return; - bind(vertexBuffer, varyingBuffer); + bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc); FarDispatcher::Refine(this, context, batches, /*maxlevel*/-1); @@ -130,37 +141,60 @@ protected: void ApplyVertexEdits(FarKernelBatch const &batch, ComputeContext const *context) const; template - void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying) { + void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, + OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { - if (vertex) { - _currentVertexBuffer = static_cast(vertex->BindCudaBuffer()); - _vdesc.numVertexElements = vertex->GetNumElements(); + // if the vertex buffer descriptor is specified, use it. + // otherwise, assumes the data is tightly packed in the vertex buffer. + if (vertexDesc) { + _currentBindState.vertexDesc = *vertexDesc; } else { - _currentVertexBuffer = 0; - _vdesc.numVertexElements = 0; + int numElements = vertex ? vertex->GetNumElements() : 0; + _currentBindState.vertexDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + if (varyingDesc) { + _currentBindState.varyingDesc = *varyingDesc; + } else { + int numElements = varying ? varying->GetNumElements() : 0; + _currentBindState.varyingDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); } - if (varying) { - _currentVaryingBuffer = static_cast(varying->BindCudaBuffer()); - _vdesc.numVaryingElements = varying->GetNumElements(); - } else { - _currentVaryingBuffer = 0; - _vdesc.numVaryingElements = 0; - } + _currentBindState.vertexBuffer = vertex ? 
+ static_cast(vertex->BindCudaBuffer()) : 0; + _currentBindState.varyingBuffer = varying ? + static_cast(varying->BindCudaBuffer()) : 0; } /// Unbinds any previously bound vertex and varying data buffers. void unbind() { - _currentVertexBuffer = 0; - _currentVaryingBuffer = 0; + _currentBindState.Reset(); } private: - float *_currentVertexBuffer, // cuda buffers - *_currentVaryingBuffer; + struct BindState { + BindState() : vertexBuffer(NULL), varyingBuffer(NULL) {} + void Reset() { + vertexBuffer = varyingBuffer = NULL; + vertexDesc.Reset(); + varyingDesc.Reset(); + } + float *GetOffsettedVertexBuffer() const { + return vertexBuffer ? vertexBuffer + vertexDesc.offset : 0; + } + float *GetOffsettedVaryingBuffer() const { + return varyingBuffer ? varyingBuffer + varyingDesc.offset : 0; + } - OsdVertexDescriptor _vdesc; + float *vertexBuffer; // cuda buffers + float *varyingBuffer; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + }; + BindState _currentBindState; }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/cudaKernel.cu b/opensubdiv/osd/cudaKernel.cu index 71547e15..e74a7b4d 100644 --- a/opensubdiv/osd/cudaKernel.cu +++ b/opensubdiv/osd/cudaKernel.cu @@ -25,37 +25,18 @@ #include template struct DeviceVertex -{ - float pos[3]; - float userVertexData[N]; - - __device__ void addWithWeight(const DeviceVertex *src, float weight) { - pos[0] += src->pos[0] * weight; - pos[1] += src->pos[1] * weight; - pos[2] += src->pos[2] * weight; - - for(int i = 0; i < N; ++i){ - userVertexData[i] += src->userVertexData[i] * weight; - } - } - __device__ void clear() { - pos[0] = pos[1] = pos[2] = 0.0f; - for(int i = 0; i < N; ++i){ - userVertexData[i] = 0.0f; - } - } -}; - -template struct DeviceVarying { float v[N]; - __device__ void addVaryingWithWeight(const DeviceVarying *src, float weight) { + __device__ void addWithWeight(const DeviceVertex *src, float weight) { +#pragma unroll for(int i = 0; i < N; ++i){ v[i] += src->v[i] 
* weight; } } + __device__ void clear() { +#pragma unroll for(int i = 0; i < N; ++i){ v[i] = 0.0f; } @@ -64,9 +45,9 @@ template struct DeviceVarying // Specialize DeviceVarying for N=0 to avoid compile error: // "flexible array member in otherwise empty struct" -template<> struct DeviceVarying<0> +template<> struct DeviceVertex<0> { - __device__ void addVaryingWithWeight(const DeviceVarying<0> *src, float weight) { + __device__ void addWithWeight(const DeviceVertex<0> *src, float weight) { } __device__ void clear() { } @@ -94,32 +75,30 @@ __device__ void addWithWeight(float *dst, float *src, float weight, int count) for(int i = 0; i < count; ++i) dst[i] += src[i] * weight; } -__device__ void addVaryingWithWeight(float *dst, float *src, float weight, int count) -{ - for(int i = 0; i < count; ++i) dst[i] += src[i] * weight; -} - -template __global__ void +template __global__ void computeFace(float *fVertex, float *fVaryings, int *F0_IT, int *F0_ITa, int offset, int tableOffset, int start, int end) { - DeviceVertex *vertex = (DeviceVertex*)fVertex; - DeviceVarying *varyings = (DeviceVarying*)fVaryings; - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ + DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + int h = F0_ITa[2*i]; int n = F0_ITa[2*i+1]; float weight = 1.0f/n; - DeviceVertex dst; + DeviceVertex dst; dst.clear(); if(NUM_VARYING_ELEMENTS > 0){ - DeviceVarying dstVarying; + DeviceVertex dstVarying; dstVarying.clear(); for(int j=0; j __global__ void +template __global__ void computeEdge(float *fVertex, float *fVaryings, int *E0_IT, float *E0_S, int offset, int tableOffset, int start, int end) { - DeviceVertex *vertex = (DeviceVertex*)fVertex; - DeviceVarying *varyings = (DeviceVarying*)fVaryings; - for(int i 
= start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i+= blockDim.x * gridDim.x){ + DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i+= blockDim.x * gridDim.x){ + int eidx0 = E0_IT[4*i+0]; int eidx1 = E0_IT[4*i+1]; int eidx2 = E0_IT[4*i+2]; @@ -170,7 +158,7 @@ computeEdge(float *fVertex, float *fVaryings, int *E0_IT, float *E0_S, int offse float vertWeight = E0_S[i*2+0]; // Fully sharp edge : vertWeight = 0.5f; - DeviceVertex dst; + DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[eidx0], vertWeight); @@ -185,20 +173,24 @@ computeEdge(float *fVertex, float *fVaryings, int *E0_IT, float *E0_S, int offse vertex[offset+i-tableOffset] = dst; if(NUM_VARYING_ELEMENTS > 0){ - DeviceVarying dstVarying; + DeviceVertex dstVarying; dstVarying.clear(); - dstVarying.addVaryingWithWeight(&varyings[eidx0], 0.5f); - dstVarying.addVaryingWithWeight(&varyings[eidx1], 0.5f); + dstVarying.addWithWeight(&varyings[eidx0], 0.5f); + dstVarying.addWithWeight(&varyings[eidx1], 0.5f); varyings[offset+i-tableOffset] = dstVarying; } } } __global__ void -computeEdge(float *fVertex, int numVertexElements, float *fVarying, int numVaryingElements, +computeEdge(float *fVertex, float *fVarying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *E0_IT, float *E0_S, int offset, int tableOffset, int start, int end) { - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i+= blockDim.x * gridDim.x){ + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset;i+= blockDim.x * gridDim.x) { + int eidx0 = E0_IT[4*i+0]; int eidx1 = E0_IT[4*i+1]; int eidx2 = E0_IT[4*i+2]; @@ -207,35 +199,38 @@ computeEdge(float *fVertex, int numVertexElements, float *fVarying, int numVaryi float vertWeight = E0_S[i*2+0]; // 
Fully sharp edge : vertWeight = 0.5f; - float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; - clear(dstVertex, numVertexElements); + float *dstVertex = fVertex + (i+offset-tableOffset)*vertexStride; + clear(dstVertex, vertexLength); - addWithWeight(dstVertex, fVertex + eidx0*numVertexElements, vertWeight, numVertexElements); - addWithWeight(dstVertex, fVertex + eidx1*numVertexElements, vertWeight, numVertexElements); + addWithWeight(dstVertex, fVertex + eidx0*vertexStride, vertWeight, vertexLength); + addWithWeight(dstVertex, fVertex + eidx1*vertexStride, vertWeight, vertexLength); if(eidx2 > -1){ float faceWeight = E0_S[i*2+1]; - addWithWeight(dstVertex, fVertex + eidx2*numVertexElements, faceWeight, numVertexElements); - addWithWeight(dstVertex, fVertex + eidx3*numVertexElements, faceWeight, numVertexElements); + addWithWeight(dstVertex, fVertex + eidx2*vertexStride, faceWeight, vertexLength); + addWithWeight(dstVertex, fVertex + eidx3*vertexStride, faceWeight, vertexLength); } - if(numVaryingElements > 0){ - float *dstVarying = fVarying + (i+offset-tableOffset)*numVaryingElements; - clear(dstVarying, numVaryingElements); + if (varyingLength > 0){ + float *dstVarying = fVarying + (i+offset-tableOffset)*varyingStride; + clear(dstVarying, varyingLength); - addVaryingWithWeight(dstVarying, fVarying + eidx0*numVaryingElements, 0.5f, numVaryingElements); - addVaryingWithWeight(dstVarying, fVarying + eidx1*numVaryingElements, 0.5f, numVaryingElements); + addWithWeight(dstVarying, fVarying + eidx0*varyingStride, 0.5f, varyingLength); + addWithWeight(dstVarying, fVarying + eidx1*varyingStride, 0.5f, varyingLength); } } } -template __global__ void +template __global__ void computeVertexA(float *fVertex, float *fVaryings, int *V0_ITa, float *V0_S, int offset, int tableOffset, int start, int end, int pass) { - DeviceVertex *vertex = (DeviceVertex*)fVertex; - DeviceVarying *varyings = (DeviceVarying*)fVaryings; - for(int i = start + tableOffset + 
threadIdx.x + blockIdx.x*blockDim.x; i < end+tableOffset; i += blockDim.x * gridDim.x){ + DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end+tableOffset; + i += blockDim.x * gridDim.x) { + int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; int eidx0 = V0_ITa[5*i+3]; @@ -249,7 +244,7 @@ computeVertexA(float *fVertex, float *fVaryings, int *V0_ITa, float *V0_S, int o if (weight>0.0f && weight<1.0f && n > 0) weight=1.0f-weight; - DeviceVertex dst; + DeviceVertex dst; if (not pass) { dst.clear(); } else { @@ -267,9 +262,9 @@ computeVertexA(float *fVertex, float *fVaryings, int *V0_ITa, float *V0_S, int o if(NUM_VARYING_ELEMENTS > 0){ if(not pass){ - DeviceVarying dstVarying; + DeviceVertex dstVarying; dstVarying.clear(); - dstVarying.addVaryingWithWeight(&varyings[p], 1.0f); + dstVarying.addWithWeight(&varyings[p], 1.0f); varyings[i+offset-tableOffset] = dstVarying; } } @@ -277,10 +272,15 @@ computeVertexA(float *fVertex, float *fVaryings, int *V0_ITa, float *V0_S, int o } __global__ void -computeVertexA(float *fVertex, int numVertexElements, float *fVaryings, int numVaryingElements, +computeVertexA(float *fVertex, float *fVaryings, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *V0_ITa, float *V0_S, int offset, int tableOffset, int start, int end, int pass) { - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x){ + int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; int eidx0 = V0_ITa[5*i+3]; @@ -294,24 +294,24 @@ computeVertexA(float *fVertex, int numVertexElements, float *fVaryings, int numV if (weight>0.0f && weight<1.0f && n > 0) weight=1.0f-weight; - float *dstVertex = fVertex + 
(i+offset-tableOffset)*numVertexElements; + float *dstVertex = fVertex + (i+offset-tableOffset)*vertexStride; if (not pass) { - clear(dstVertex, numVertexElements); + clear(dstVertex, vertexLength); } if (eidx0==-1 || (pass==0 && (n==-1)) ) { - addWithWeight(dstVertex, fVertex + p*numVertexElements, weight, numVertexElements); + addWithWeight(dstVertex, fVertex + p*vertexStride, weight, vertexLength); } else { - addWithWeight(dstVertex, fVertex + p*numVertexElements, weight*0.75f, numVertexElements); - addWithWeight(dstVertex, fVertex + eidx0*numVertexElements, weight*0.125f, numVertexElements); - addWithWeight(dstVertex, fVertex + eidx1*numVertexElements, weight*0.125f, numVertexElements); + addWithWeight(dstVertex, fVertex + p*vertexStride, weight*0.75f, vertexLength); + addWithWeight(dstVertex, fVertex + eidx0*vertexStride, weight*0.125f, vertexLength); + addWithWeight(dstVertex, fVertex + eidx1*vertexStride, weight*0.125f, vertexLength); } - if(numVaryingElements > 0){ + if(varyingLength > 0){ if(not pass){ - float *dstVarying = fVaryings + (i+offset-tableOffset)*numVaryingElements; - clear(dstVarying, numVaryingElements); - addVaryingWithWeight(dstVarying, fVaryings + p*numVaryingElements, 1.0f, numVaryingElements); + float *dstVarying = fVaryings + (i+offset-tableOffset)*varyingStride; + clear(dstVarying, varyingLength); + addWithWeight(dstVarying, fVaryings + p*varyingStride, 1.0f, varyingLength); } } } @@ -321,13 +321,16 @@ computeVertexA(float *fVertex, int numVertexElements, float *fVaryings, int numV //texture texV0_IT; -template __global__ void +template __global__ void computeVertexB(float *fVertex, float *fVaryings, const int *V0_ITa, const int *V0_IT, const float *V0_S, int offset, int tableOffset, int start, int end) { - DeviceVertex *vertex = (DeviceVertex*)fVertex; - DeviceVarying *varyings = (DeviceVarying*)fVaryings; - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ + 
DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + int h = V0_ITa[5*i]; int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; @@ -336,11 +339,11 @@ computeVertexB(float *fVertex, float *fVaryings, float wp = 1.0f/float(n*n); float wv = (n-2.0f) * n * wp; - DeviceVertex dst; + DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[p], weight * wv); - for(int j = 0; j < n; ++j){ + for (int j = 0; j < n; ++j) { dst.addWithWeight(&vertex[V0_IT[h+j*2]], weight * wp); dst.addWithWeight(&vertex[V0_IT[h+j*2+1]], weight * wp); // int idx0 = tex1Dfetch(texV0_IT, h+j*2); @@ -351,19 +354,24 @@ computeVertexB(float *fVertex, float *fVaryings, vertex[i+offset-tableOffset] = dst; if(NUM_VARYING_ELEMENTS > 0){ - DeviceVarying dstVarying; + DeviceVertex dstVarying; dstVarying.clear(); - dstVarying.addVaryingWithWeight(&varyings[p], 1.0f); + dstVarying.addWithWeight(&varyings[p], 1.0f); varyings[i+offset-tableOffset] = dstVarying; } } } __global__ void -computeVertexB(float *fVertex, int numVertexElements, float *fVaryings, int numVaryingElements, +computeVertexB(float *fVertex, float *fVarying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, const int *V0_ITa, const int *V0_IT, const float *V0_S, int offset, int tableOffset, int start, int end) { - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + int h = V0_ITa[5*i]; int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; @@ -372,19 +380,19 @@ computeVertexB(float *fVertex, int numVertexElements, float *fVaryings, int numV float wp = 1.0f/float(n*n); float wv = (n-2.0f) * n * wp; - float *dstVertex = fVertex + 
(i+offset-tableOffset)*numVertexElements; - clear(dstVertex, numVertexElements); - addWithWeight(dstVertex, fVertex + p*numVertexElements, weight*wv, numVertexElements); + float *dstVertex = fVertex + (i+offset-tableOffset)*vertexStride; + clear(dstVertex, vertexLength); + addWithWeight(dstVertex, fVertex + p*vertexStride, weight*wv, vertexLength); - for(int j = 0; j < n; ++j){ - addWithWeight(dstVertex, fVertex + V0_IT[h+j*2]*numVertexElements, weight*wp, numVertexElements); - addWithWeight(dstVertex, fVertex + V0_IT[h+j*2+1]*numVertexElements, weight*wp, numVertexElements); + for (int j = 0; j < n; ++j) { + addWithWeight(dstVertex, fVertex + V0_IT[h+j*2]*vertexStride, weight*wp, vertexLength); + addWithWeight(dstVertex, fVertex + V0_IT[h+j*2+1]*vertexStride, weight*wp, vertexLength); } - if(numVaryingElements > 0){ - float *dstVarying = fVaryings + (i+offset-tableOffset)*numVaryingElements; - clear(dstVarying, numVaryingElements); - addVaryingWithWeight(dstVarying, fVaryings + p*numVaryingElements, 1.0f, numVaryingElements); + if (varyingLength > 0) { + float *dstVarying = fVarying + (i+offset-tableOffset)*varyingStride; + clear(dstVarying, varyingLength); + addWithWeight(dstVarying, fVarying + p*varyingStride, 1.0f, varyingLength); } } } @@ -392,12 +400,15 @@ computeVertexB(float *fVertex, int numVertexElements, float *fVaryings, int numV // -------------------------------------------------------------------------------------------- -template __global__ void +template __global__ void computeLoopVertexB(float *fVertex, float *fVaryings, int *V0_ITa, int *V0_IT, float *V0_S, int offset, int tableOffset, int start, int end) { - DeviceVertex *vertex = (DeviceVertex*)fVertex; - DeviceVarying *varyings = (DeviceVarying*)fVaryings; - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ + DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + for (int i 
= start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + int h = V0_ITa[5*i]; int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; @@ -408,30 +419,35 @@ computeLoopVertexB(float *fVertex, float *fVaryings, int *V0_ITa, int *V0_IT, fl beta = beta * beta; beta = (0.625f - beta) * wp; - DeviceVertex dst; + DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[p], weight * (1.0f - (beta * n))); - for(int j = 0; j < n; ++j){ + for (int j = 0; j < n; ++j) { dst.addWithWeight(&vertex[V0_IT[h+j]], weight * beta); } vertex[i+offset-tableOffset] = dst; - if(NUM_VARYING_ELEMENTS > 0){ - DeviceVarying dstVarying; + if (NUM_VARYING_ELEMENTS > 0) { + DeviceVertex dstVarying; dstVarying.clear(); - dstVarying.addVaryingWithWeight(&varyings[p], 1.0f); + dstVarying.addWithWeight(&varyings[p], 1.0f); varyings[i+offset-tableOffset] = dstVarying; } } } __global__ void -computeLoopVertexB(float *fVertex, int numVertexElements, float *fVaryings, int numVaryingElements, +computeLoopVertexB(float *fVertex, float *fVarying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, const int *V0_ITa, const int *V0_IT, const float *V0_S, int offset, int tableOffset, int start, int end) { - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + int h = V0_ITa[5*i]; int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; @@ -442,34 +458,37 @@ computeLoopVertexB(float *fVertex, int numVertexElements, float *fVaryings, int beta = beta * beta; beta = (0.625f - beta) * wp; - float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; - clear(dstVertex, numVertexElements); - addWithWeight(dstVertex, fVertex + p*numVertexElements, weight*(1.0f-(beta*n)), numVertexElements); + float *dstVertex = fVertex + 
(i+offset-tableOffset)*vertexStride; + clear(dstVertex, vertexLength); + addWithWeight(dstVertex, fVertex + p*vertexStride, weight*(1.0f-(beta*n)), vertexLength); - for(int j = 0; j < n; ++j){ - addWithWeight(dstVertex, fVertex + V0_IT[h+j]*numVertexElements, weight*beta, numVertexElements); + for (int j = 0; j < n; ++j) { + addWithWeight(dstVertex, fVertex + V0_IT[h+j]*vertexStride, weight*beta, vertexLength); } - if(numVaryingElements > 0){ - float *dstVarying = fVaryings + (i+offset-tableOffset)*numVaryingElements; - clear(dstVarying, numVaryingElements); - addVaryingWithWeight(dstVarying, fVaryings + p*numVaryingElements, 1.0f, numVaryingElements); + if (varyingLength > 0) { + float *dstVarying = fVarying + (i+offset-tableOffset)*varyingStride; + clear(dstVarying, varyingLength); + addWithWeight(dstVarying, fVarying + p*varyingStride, 1.0f, varyingLength); } } } // -------------------------------------------------------------------------------------------- -template __global__ void +template __global__ void computeBilinearEdge(float *fVertex, float *fVaryings, int *E0_IT, int offset, int tableOffset, int start, int end) { - DeviceVertex *vertex = (DeviceVertex*)fVertex; - DeviceVarying *varyings = (DeviceVarying*)fVaryings; - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i+= blockDim.x * gridDim.x){ + DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i+= blockDim.x * gridDim.x) { + int eidx0 = E0_IT[2*i+0]; int eidx1 = E0_IT[2*i+1]; - DeviceVertex dst; + DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[eidx0], 0.5f); @@ -477,78 +496,91 @@ computeBilinearEdge(float *fVertex, float *fVaryings, int *E0_IT, int offset, in vertex[offset+i-tableOffset] = dst; - if(NUM_VARYING_ELEMENTS > 0){ - DeviceVarying dstVarying; + if (NUM_VARYING_ELEMENTS > 0) { + 
DeviceVertex dstVarying; dstVarying.clear(); - dstVarying.addVaryingWithWeight(&varyings[eidx0], 0.5f); - dstVarying.addVaryingWithWeight(&varyings[eidx1], 0.5f); + dstVarying.addWithWeight(&varyings[eidx0], 0.5f); + dstVarying.addWithWeight(&varyings[eidx1], 0.5f); varyings[offset+i-tableOffset] = dstVarying; } } } __global__ void -computeBilinearEdge(float *fVertex, int numVertexElements, float *fVarying, int numVaryingElements, +computeBilinearEdge(float *fVertex, float *fVarying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *E0_IT, int offset, int tableOffset, int start, int end) { - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i+= blockDim.x * gridDim.x){ + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i+= blockDim.x * gridDim.x) { + int eidx0 = E0_IT[2*i+0]; int eidx1 = E0_IT[2*i+1]; - float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; - clear(dstVertex, numVertexElements); + float *dstVertex = fVertex + (i+offset-tableOffset)*vertexStride; + clear(dstVertex, vertexLength); - addWithWeight(dstVertex, fVertex + eidx0*numVertexElements, 0.5f, numVertexElements); - addWithWeight(dstVertex, fVertex + eidx1*numVertexElements, 0.5f, numVertexElements); + addWithWeight(dstVertex, fVertex + eidx0*vertexStride, 0.5f, vertexLength); + addWithWeight(dstVertex, fVertex + eidx1*vertexStride, 0.5f, vertexLength); - if(numVaryingElements > 0){ - float *dstVarying = fVarying + (i+offset-tableOffset)*numVaryingElements; - clear(dstVarying, numVaryingElements); + if (varyingLength > 0) { + float *dstVarying = fVarying + (i+offset-tableOffset)*varyingStride; + clear(dstVarying, varyingLength); - addVaryingWithWeight(dstVarying, fVarying + eidx0*numVaryingElements, 0.5f, numVaryingElements); - addVaryingWithWeight(dstVarying, fVarying + eidx1*numVaryingElements, 0.5f, numVaryingElements); + addWithWeight(dstVarying, 
fVarying + eidx0*varyingStride, 0.5f, varyingLength); + addWithWeight(dstVarying, fVarying + eidx1*varyingStride, 0.5f, varyingLength); } } } -template __global__ void +template __global__ void computeBilinearVertex(float *fVertex, float *fVaryings, int *V0_ITa, int offset, int tableOffset, int start, int end) { - DeviceVertex *vertex = (DeviceVertex*)fVertex; - DeviceVarying *varyings = (DeviceVarying*)fVaryings; - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ + DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + int p = V0_ITa[i]; - DeviceVertex dst; + DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[p], 1.0f); vertex[i+offset-tableOffset] = dst; - if(NUM_VARYING_ELEMENTS > 0){ - DeviceVarying dstVarying; + if (NUM_VARYING_ELEMENTS > 0) { + DeviceVertex dstVarying; dstVarying.clear(); - dstVarying.addVaryingWithWeight(&varyings[p], 1.0f); + dstVarying.addWithWeight(&varyings[p], 1.0f); varyings[i+offset-tableOffset] = dstVarying; } } } __global__ void -computeBilinearVertex(float *fVertex, int numVertexElements, float *fVaryings, int numVaryingElements, +computeBilinearVertex(float *fVertex, float *fVarying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, const int *V0_ITa, int offset, int tableOffset, int start, int end) { - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + int p = V0_ITa[i]; - float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; - clear(dstVertex, numVertexElements); - addWithWeight(dstVertex, fVertex + p*numVertexElements, 1.0f, 
numVertexElements); + float *dstVertex = fVertex + (i+offset-tableOffset)*vertexStride; + clear(dstVertex, vertexLength); + addWithWeight(dstVertex, fVertex + p*vertexStride, 1.0f, vertexLength); - if(numVaryingElements > 0){ - float *dstVarying = fVaryings + (i+offset-tableOffset)*numVaryingElements; - clear(dstVarying, numVaryingElements); - addVaryingWithWeight(dstVarying, fVaryings + p*numVaryingElements, 1.0f, numVaryingElements); + if (varyingLength > 0) { + float *dstVarying = fVarying + (i+offset-tableOffset)*varyingStride; + clear(dstVarying, varyingLength); + addWithWeight(dstVarying, fVarying + p*varyingStride, 1.0f, varyingLength); } } } @@ -556,15 +588,16 @@ computeBilinearVertex(float *fVertex, int numVertexElements, float *fVaryings, i // -------------------------------------------------------------------------------------------- __global__ void -editVertexAdd(float *fVertex, int numVertexElements, int primVarOffset, int primVarWidth, +editVertexAdd(float *fVertex, int vertexLength, int vertexStride, + int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, const int *editIndices, const float *editValues) { - for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; - i < end + tableOffset; - i += blockDim.x * gridDim.x) { + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { - float *dstVertex = fVertex + (editIndices[i] + vertexOffset) * numVertexElements + primVarOffset; + float *dstVertex = fVertex + (editIndices[i] + vertexOffset) * vertexStride + primVarOffset; for(int j = 0; j < primVarWidth; j++) { *dstVertex++ += editValues[i*primVarWidth + j]; @@ -579,16 +612,19 @@ editVertexAdd(float *fVertex, int numVertexElements, int primVarOffset, int prim // XXX: this macro usage is tentative. Since cuda kernel can't be dynamically configured, // still trying to find better way to have optimized kernel.. 
-#define OPT_KERNEL(NUM_USER_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, ARG) \ - if(numUserVertexElements == NUM_USER_VERTEX_ELEMENTS && \ - numVaryingElements == NUM_VARYING_ELEMENTS) \ - { KERNEL<<>>ARG; \ - return; } +#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, ARG) \ + if(vertexLength == NUM_VERTEX_ELEMENTS && \ + varyingLength == NUM_VARYING_ELEMENTS && \ + vertexStride == vertexLength && \ + varyingStride == varyingLength) \ + { KERNEL<<>>ARG; \ + return; } extern "C" { void OsdCudaComputeFace(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end) { //computeFace<3, 0><<<512,32>>>(vertex, varying, F_IT, F_ITa, offset, start, end); @@ -598,12 +634,15 @@ void OsdCudaComputeFace(float *vertex, float *varying, OPT_KERNEL(3, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); // fallback kernel (slow) - computeFace<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, + computeFace<<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, varyingStride, F_IT, F_ITa, offset, tableOffset, start, end); } + void OsdCudaComputeEdge(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *E_IT, float *E_W, int offset, int tableOffset, int start, int end) { //computeEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, E_W, offset, start, end); @@ -612,12 +651,14 @@ void OsdCudaComputeEdge(float *vertex, float *varying, OPT_KERNEL(3, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); - computeEdge<<<512, 32>>>(vertex, 
3+numUserVertexElements, varying, numVaryingElements, + computeEdge<<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, varyingStride, E_IT, E_W, offset, tableOffset, start, end); } void OsdCudaComputeVertexA(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *V_ITa, float *V_W, int offset, int tableOffset, int start, int end, int pass) { // computeVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass); @@ -626,12 +667,14 @@ void OsdCudaComputeVertexA(float *vertex, float *varying, OPT_KERNEL(3, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); OPT_KERNEL(3, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); - computeVertexA<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, + computeVertexA<<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, V_W, offset, tableOffset, start, end, pass); } void OsdCudaComputeVertexB(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end) { // computeVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end); @@ -640,12 +683,14 @@ void OsdCudaComputeVertexB(float *vertex, float *varying, OPT_KERNEL(3, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - computeVertexB<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, + computeVertexB<<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, 
varyingStride, V_ITa, V_IT, V_W, offset, tableOffset, start, end); } void OsdCudaComputeLoopVertexB(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end) { // computeLoopVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end); @@ -654,12 +699,14 @@ void OsdCudaComputeLoopVertexB(float *vertex, float *varying, OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - computeLoopVertexB<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, + computeLoopVertexB<<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, V_IT, V_W, offset, tableOffset, start, end); } void OsdCudaComputeBilinearEdge(float *vertex, float *varying, - int numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *E_IT, int offset, int tableOffset, int start, int end) { //computeBilinearEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end); @@ -668,12 +715,14 @@ void OsdCudaComputeBilinearEdge(float *vertex, float *varying, OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); - computeBilinearEdge<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, + computeBilinearEdge<<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, varyingStride, E_IT, offset, tableOffset, start, end); } void OsdCudaComputeBilinearVertex(float *vertex, float *varying, - int 
numUserVertexElements, int numVaryingElements, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, int *V_ITa, int offset, int tableOffset, int start, int end) { // computeBilinearVertex<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end); @@ -682,16 +731,17 @@ void OsdCudaComputeBilinearVertex(float *vertex, float *varying, OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); - computeBilinearVertex<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, + computeBilinearVertex<<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, offset, tableOffset, start, end); } -void OsdCudaEditVertexAdd(float *vertex, int numUserVertexElements, +void OsdCudaEditVertexAdd(float *vertex, int vertexLength, int vertexStride, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, int *editIndices, float *editValues) { - editVertexAdd<<<512, 32>>>(vertex, 3+numUserVertexElements, primVarOffset, primVarWidth, + editVertexAdd<<<512, 32>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth, vertexOffset, tableOffset, start, end, editIndices, editValues); } diff --git a/opensubdiv/osd/d3d11ComputeController.cpp b/opensubdiv/osd/d3d11ComputeController.cpp old mode 100644 new mode 100755 index 126d8f77..077517f0 --- a/opensubdiv/osd/d3d11ComputeController.cpp +++ b/opensubdiv/osd/d3d11ComputeController.cpp @@ -38,9 +38,7 @@ namespace OPENSUBDIV_VERSION { OsdD3D11ComputeController::OsdD3D11ComputeController( ID3D11DeviceContext *deviceContext) - : _deviceContext(deviceContext), _query(0), - _currentVertexBufferUAV(0), _currentVaryingBufferUAV(0), - _currentKernelBundle(NULL) { + : _deviceContext(deviceContext), _query(0) { } OsdD3D11ComputeController::~OsdD3D11ComputeController() 
{ @@ -72,20 +70,21 @@ OsdD3D11ComputeController::Synchronize() { } OsdD3D11ComputeKernelBundle * -OsdD3D11ComputeController::getKernels(int numVertexElements, - int numVaryingElements) { +OsdD3D11ComputeController::getKernels(OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc) { std::vector::iterator it = std::find_if(_kernelRegistry.begin(), _kernelRegistry.end(), - OsdD3D11ComputeKernelBundle::Match(numVertexElements, - numVaryingElements)); + OsdD3D11ComputeKernelBundle::Match( + vertexDesc, varyingDesc)); + if (it != _kernelRegistry.end()) { return *it; } else { OsdD3D11ComputeKernelBundle *kernelBundle = new OsdD3D11ComputeKernelBundle(_deviceContext); _kernelRegistry.push_back(kernelBundle); - kernelBundle->Compile(numVertexElements, numVaryingElements); + kernelBundle->Compile(vertexDesc, varyingDesc); return kernelBundle; } } @@ -102,11 +101,11 @@ OsdD3D11ComputeController::bindShaderResources() ID3D11ShaderResourceView *NULLSRV = 0; _deviceContext->VSSetShaderResources(0, 1, &NULLSRV); - if (_currentVertexBufferUAV) - _deviceContext->CSSetUnorderedAccessViews(0, 1, &_currentVertexBufferUAV, 0); // u0 + if (_currentBindState.vertexBuffer) + _deviceContext->CSSetUnorderedAccessViews(0, 1, &_currentBindState.vertexBuffer, 0); // u0 - if (_currentVaryingBufferUAV) - _deviceContext->CSSetUnorderedAccessViews(1, 1, &_currentVaryingBufferUAV, 0); // u1 + if (_currentBindState.varyingBuffer) + _deviceContext->CSSetUnorderedAccessViews(1, 1, &_currentBindState.varyingBuffer, 0); // u1 } void @@ -122,8 +121,10 @@ OsdD3D11ComputeController::ApplyBilinearFaceVerticesKernel( assert(context); - _currentKernelBundle->ApplyBilinearFaceVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyBilinearFaceVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, 
_currentBindState.varyingDesc.offset); } void @@ -132,8 +133,10 @@ OsdD3D11ComputeController::ApplyBilinearEdgeVerticesKernel( assert(context); - _currentKernelBundle->ApplyBilinearEdgeVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyBilinearEdgeVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -142,8 +145,10 @@ OsdD3D11ComputeController::ApplyBilinearVertexVerticesKernel( assert(context); - _currentKernelBundle->ApplyBilinearVertexVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyBilinearVertexVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -152,8 +157,10 @@ OsdD3D11ComputeController::ApplyCatmarkFaceVerticesKernel( assert(context); - _currentKernelBundle->ApplyCatmarkFaceVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyCatmarkFaceVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } @@ -164,8 +171,10 @@ OsdD3D11ComputeController::ApplyCatmarkEdgeVerticesKernel( assert(context); - _currentKernelBundle->ApplyCatmarkEdgeVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyCatmarkEdgeVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -174,8 +183,10 @@ 
OsdD3D11ComputeController::ApplyCatmarkVertexVerticesKernelB( assert(context); - _currentKernelBundle->ApplyCatmarkVertexVerticesKernelB( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyCatmarkVertexVerticesKernelB( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -184,8 +195,10 @@ OsdD3D11ComputeController::ApplyCatmarkVertexVerticesKernelA1( assert(context); - _currentKernelBundle->ApplyCatmarkVertexVerticesKernelA( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); + _currentBindState.kernelBundle->ApplyCatmarkVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), false, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -194,8 +207,10 @@ OsdD3D11ComputeController::ApplyCatmarkVertexVerticesKernelA2( assert(context); - _currentKernelBundle->ApplyCatmarkVertexVerticesKernelA( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); + _currentBindState.kernelBundle->ApplyCatmarkVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), true, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -204,8 +219,10 @@ OsdD3D11ComputeController::ApplyLoopEdgeVerticesKernel( assert(context); - _currentKernelBundle->ApplyLoopEdgeVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyLoopEdgeVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -214,8 +231,10 @@ 
OsdD3D11ComputeController::ApplyLoopVertexVerticesKernelB( assert(context); - _currentKernelBundle->ApplyLoopVertexVerticesKernelB( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyLoopVertexVerticesKernelB( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -224,8 +243,10 @@ OsdD3D11ComputeController::ApplyLoopVertexVerticesKernelA1( assert(context); - _currentKernelBundle->ApplyLoopVertexVerticesKernelA( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); + _currentBindState.kernelBundle->ApplyLoopVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), false, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -234,8 +255,10 @@ OsdD3D11ComputeController::ApplyLoopVertexVerticesKernelA2( assert(context); - _currentKernelBundle->ApplyLoopVertexVerticesKernelA( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); + _currentBindState.kernelBundle->ApplyLoopVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), true, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } void @@ -253,11 +276,13 @@ OsdD3D11ComputeController::ApplyVertexEdits( int primvarWidth = edit->GetPrimvarWidth(); if (edit->GetOperation() == FarVertexEdit::Add) { - _currentKernelBundle->ApplyEditAdd(primvarOffset, primvarWidth, + _currentBindState.kernelBundle->ApplyEditAdd(primvarOffset, primvarWidth, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), - batch.GetEnd()); + batch.GetEnd(), + _currentBindState.vertexDesc.offset, + _currentBindState.varyingDesc.offset); } else { // XXX: edit SET is not implemented yet. 
} diff --git a/opensubdiv/osd/d3d11ComputeController.h b/opensubdiv/osd/d3d11ComputeController.h old mode 100644 new mode 100755 index f2f44988..ed3abf94 --- a/opensubdiv/osd/d3d11ComputeController.h +++ b/opensubdiv/osd/d3d11ComputeController.h @@ -29,6 +29,7 @@ #include "../far/dispatcher.h" #include "../osd/d3d11ComputeContext.h" +#include "../osd/vertexDescriptor.h" #include @@ -75,15 +76,25 @@ public: /// /// @param varyingBuffer varying-interpolated data buffer /// + /// @param vertexDesc the descriptor of vertex elements to be refined. + /// if it's null, all primvars in the vertex buffer + /// will be refined. + /// + /// @param varyingDesc the descriptor of varying elements to be refined. + /// if it's null, all primvars in the varying buffer + /// will be refined. + /// template void Refine(OsdD3D11ComputeContext const *context, FarKernelBatchVector const &batches, VERTEX_BUFFER *vertexBuffer, - VARYING_BUFFER *varyingBuffer) { + VARYING_BUFFER *varyingBuffer, + OsdVertexBufferDescriptor const *vertexDesc=NULL, + OsdVertexBufferDescriptor const *varyingDesc=NULL) { if (batches.empty()) return; - bind(vertexBuffer, varyingBuffer); + bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc); context->BindShaderStorageBuffers(_deviceContext); FarDispatcher::Refine(this, @@ -145,48 +156,68 @@ protected: void ApplyVertexEdits(FarKernelBatch const &batch, ComputeContext const *context) const; - OsdD3D11ComputeKernelBundle * getKernels(int numVertexElements, - int numVaryingElements); + OsdD3D11ComputeKernelBundle * getKernels(OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc); void bindShaderResources(); void unbindShaderResources(); template - void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying) { + void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, + OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { - _currentVertexBufferUAV = vertex ? 
vertex->BindD3D11UAV(_deviceContext) : 0; - _currentVaryingBufferUAV = varying ? varying->BindD3D11UAV(_deviceContext) : 0; - - _vdesc.numVertexElements = vertex ? vertex->GetNumElements() : 0; - _vdesc.numVaryingElements = varying ? varying->GetNumElements() : 0; - - _currentKernelBundle = getKernels(_vdesc.numVertexElements, - _vdesc.numVaryingElements); + // if the vertex buffer descriptor is specified, use it. + // otherwise, assumes the data is tightly packed in the vertex buffer. + if (vertexDesc) { + _currentBindState.vertexDesc = *vertexDesc; + } else { + int numElements = vertex ? vertex->GetNumElements() : 0; + _currentBindState.vertexDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + if (varyingDesc) { + _currentBindState.varyingDesc = *varyingDesc; + } else { + int numElements = varying ? varying->GetNumElements() : 0; + _currentBindState.varyingDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + _currentBindState.vertexBuffer = vertex ? vertex->BindD3D11UAV(_deviceContext) : 0; + _currentBindState.varyingBuffer = varying ? 
varying->BindD3D11UAV(_deviceContext) : 0; + _currentBindState.kernelBundle = getKernels(_currentBindState.vertexDesc, + _currentBindState.varyingDesc); bindShaderResources(); } void unbind() { - _currentVertexBufferUAV = 0; - _currentVaryingBufferUAV = 0; - _currentKernelBundle = 0; + _currentBindState.Reset(); unbindShaderResources(); } private: + struct BindState { + BindState() : vertexBuffer(0), varyingBuffer(0), kernelBundle(NULL) {} + void Reset() { + vertexBuffer = varyingBuffer = 0; + vertexDesc.Reset(); + varyingDesc.Reset(); + } + ID3D11UnorderedAccessView *vertexBuffer; + ID3D11UnorderedAccessView *varyingBuffer; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + OsdD3D11ComputeKernelBundle *kernelBundle; + }; + + BindState _currentBindState; + ID3D11DeviceContext *_deviceContext; ID3D11Query *_query; std::vector _kernelRegistry; - - OsdVertexDescriptor _vdesc; - - ID3D11UnorderedAccessView * _currentVertexBufferUAV, - * _currentVaryingBufferUAV; - - OsdD3D11ComputeKernelBundle * _currentKernelBundle; - }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/d3d11KernelBundle.cpp b/opensubdiv/osd/d3d11KernelBundle.cpp index d125a825..ac2c9209 100644 --- a/opensubdiv/osd/d3d11KernelBundle.cpp +++ b/opensubdiv/osd/d3d11KernelBundle.cpp @@ -79,10 +79,14 @@ OsdD3D11ComputeKernelBundle::~OsdD3D11ComputeKernelBundle() { } bool -OsdD3D11ComputeKernelBundle::Compile(int numVertexElements, - int numVaryingElements) { +OsdD3D11ComputeKernelBundle::Compile( + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc) { - _vdesc.Set( numVertexElements, numVaryingElements ); + _numVertexElements = vertexDesc.length; + _vertexStride = vertexDesc.stride; + _numVaryingElements = varyingDesc.length; + _varyingStride = varyingDesc.stride; DWORD dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS; #ifdef _DEBUG @@ -90,18 +94,26 @@ OsdD3D11ComputeKernelBundle::Compile(int numVertexElements, #endif 
std::ostringstream ss; - ss << numVertexElements; + ss << _numVertexElements; std::string numVertexElementsStr(ss.str()); ss.str(""); - ss << numVaryingElements; + ss << _numVaryingElements; std::string numVaryingElementsStr(ss.str()); ss.str(""); + ss << _vertexStride; + std::string vertexStrideStr(ss.str()); + ss.str(""); + ss << _varyingStride; + std::string varyingStrideStr(ss.str()); + ss.str(""); ss << _workGroupSize; std::string workGroupSizeStr(ss.str()); D3D_SHADER_MACRO shaderDefines[] = { "NUM_VERTEX_ELEMENTS", numVertexElementsStr.c_str(), + "VERTEX_STRIDE", vertexStrideStr.c_str(), "NUM_VARYING_ELEMENTS", numVaryingElementsStr.c_str(), + "VARYING_STRIDE", varyingStrideStr.c_str(), "WORK_GROUP_SIZE", workGroupSizeStr.c_str(), 0, 0 }; @@ -183,6 +195,8 @@ struct OsdD3D11ComputeKernelBundle::KernelCB { int tableOffset; // offset of subdivision table int indexStart; // start index relative to tableOffset int indexEnd; // end index relative to tableOffset + int vertexBaseOffset; // base vbo offset of the vertex buffer + int varyingBaseOffset; // base vbo offset of the varying buffer BOOL vertexPass; // 4-byte bool // vertex edit kernel @@ -225,7 +239,8 @@ OsdD3D11ComputeKernelBundle::dispatchCompute( void OsdD3D11ComputeKernelBundle::ApplyBilinearFaceVerticesKernel( - int vertexOffset, int tableOffset, int start, int end) { + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -233,12 +248,15 @@ OsdD3D11ComputeKernelBundle::ApplyBilinearFaceVerticesKernel( args.tableOffset = tableOffset; args.indexStart = start; args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeFace, args); } void OsdD3D11ComputeKernelBundle::ApplyBilinearEdgeVerticesKernel( - int vertexOffset, int tableOffset, int start, int end) { + int vertexOffset, int tableOffset, int start, int end, + int 
vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -246,12 +264,15 @@ OsdD3D11ComputeKernelBundle::ApplyBilinearEdgeVerticesKernel( args.tableOffset = tableOffset; args.indexStart = start; args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeBilinearEdge, args); } void OsdD3D11ComputeKernelBundle::ApplyBilinearVertexVerticesKernel( - int vertexOffset, int tableOffset, int start, int end) { + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -259,13 +280,16 @@ OsdD3D11ComputeKernelBundle::ApplyBilinearVertexVerticesKernel( args.tableOffset = tableOffset; args.indexStart = start; args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeVertex, args); } void OsdD3D11ComputeKernelBundle::ApplyCatmarkFaceVerticesKernel( - int vertexOffset, int tableOffset, int start, int end) { + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -273,12 +297,15 @@ OsdD3D11ComputeKernelBundle::ApplyCatmarkFaceVerticesKernel( args.tableOffset = tableOffset; args.indexStart = start; args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeFace, args); } void OsdD3D11ComputeKernelBundle::ApplyCatmarkEdgeVerticesKernel( - int vertexOffset, int tableOffset, int start, int end) { + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -286,12 +313,15 @@ OsdD3D11ComputeKernelBundle::ApplyCatmarkEdgeVerticesKernel( args.tableOffset = tableOffset; args.indexStart = start; args.indexEnd = end; + 
args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeEdge, args); } void OsdD3D11ComputeKernelBundle::ApplyCatmarkVertexVerticesKernelB( - int vertexOffset, int tableOffset, int start, int end) { + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -299,12 +329,15 @@ OsdD3D11ComputeKernelBundle::ApplyCatmarkVertexVerticesKernelB( args.tableOffset = tableOffset; args.indexStart = start; args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeCatmarkVertexB, args); } void OsdD3D11ComputeKernelBundle::ApplyCatmarkVertexVerticesKernelA( - int vertexOffset, int tableOffset, int start, int end, bool pass) { + int vertexOffset, int tableOffset, int start, int end, bool pass, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -313,12 +346,15 @@ OsdD3D11ComputeKernelBundle::ApplyCatmarkVertexVerticesKernelA( args.indexStart = start; args.indexEnd = end; args.vertexPass = pass ? 
1 : 0; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeVertexA, args); } void OsdD3D11ComputeKernelBundle::ApplyLoopEdgeVerticesKernel( - int vertexOffset, int tableOffset, int start, int end) { + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -326,12 +362,15 @@ OsdD3D11ComputeKernelBundle::ApplyLoopEdgeVerticesKernel( args.tableOffset = tableOffset; args.indexStart = start; args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeEdge, args); } void OsdD3D11ComputeKernelBundle::ApplyLoopVertexVerticesKernelB( - int vertexOffset, int tableOffset, int start, int end) { + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -339,12 +378,15 @@ OsdD3D11ComputeKernelBundle::ApplyLoopVertexVerticesKernelB( args.tableOffset = tableOffset; args.indexStart = start; args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeLoopVertexB, args); } void OsdD3D11ComputeKernelBundle::ApplyLoopVertexVerticesKernelA( - int vertexOffset, int tableOffset, int start, int end, bool pass) { + int vertexOffset, int tableOffset, int start, int end, bool pass, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -353,13 +395,16 @@ OsdD3D11ComputeKernelBundle::ApplyLoopVertexVerticesKernelA( args.indexStart = start; args.indexEnd = end; args.vertexPass = pass ? 
1 : 0; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelComputeVertexA, args); } void OsdD3D11ComputeKernelBundle::ApplyEditAdd( int primvarOffset, int primvarWidth, - int vertexOffset, int tableOffset, int start, int end) { + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { KernelCB args; ZeroMemory(&args, sizeof(args)); @@ -369,6 +414,8 @@ OsdD3D11ComputeKernelBundle::ApplyEditAdd( args.indexEnd = end; args.editPrimVarOffset = primvarOffset; args.editPrimVarWidth = primvarWidth; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; dispatchCompute(_kernelEditAdd, args); } diff --git a/opensubdiv/osd/d3d11KernelBundle.h b/opensubdiv/osd/d3d11KernelBundle.h old mode 100644 new mode 100755 index c77219f8..1dcb6727 --- a/opensubdiv/osd/d3d11KernelBundle.h +++ b/opensubdiv/osd/d3d11KernelBundle.h @@ -48,53 +48,71 @@ public: /// Destructor ~OsdD3D11ComputeKernelBundle(); - bool Compile(int numVertexElements, int numVaryingElements); + bool Compile(OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc); void ApplyBilinearFaceVerticesKernel( - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); void ApplyBilinearEdgeVerticesKernel( - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); void ApplyBilinearVertexVerticesKernel( - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); void ApplyCatmarkFaceVerticesKernel( - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int 
varyingBaseOffset); void ApplyCatmarkEdgeVerticesKernel( - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); void ApplyCatmarkVertexVerticesKernelB( - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); void ApplyCatmarkVertexVerticesKernelA( - int vertexOffset, int tableOffset, int start, int end, bool pass); + int vertexOffset, int tableOffset, int start, int end, bool pass, + int vertexBaseOffset, int varyingBaseOffset); void ApplyLoopEdgeVerticesKernel( - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); void ApplyLoopVertexVerticesKernelB( - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); void ApplyLoopVertexVerticesKernelA( - int vertexOffset, int tableOffset, int start, int end, bool pass); + int vertexOffset, int tableOffset, int start, int end, bool pass, + int vertexBaseOffset, int varyingBaseOffset); void ApplyEditAdd(int primvarOffset, int primvarWidth, - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); struct Match { - /// Constructor - Match(int numVertexElements, int numVaryingElements) - : vdesc(numVertexElements, numVaryingElements) { + Match(OsdVertexBufferDescriptor const &vertex, + OsdVertexBufferDescriptor const &varying) + : vertexDesc(vertex), varyingDesc(varying) { } bool operator() (OsdD3D11ComputeKernelBundle const *kernel) { - return vdesc == kernel->_vdesc; + // offset is dynamic. 
just comparing length and stride here, + // returns true if they are equal + return (vertexDesc.length == kernel->_numVertexElements and + vertexDesc.stride == kernel->_vertexStride and + varyingDesc.length == kernel->_numVaryingElements and + varyingDesc.stride == kernel->_varyingStride); } - OsdVertexDescriptor vdesc; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; }; friend struct Match; @@ -130,7 +148,10 @@ protected: int _workGroupSize; - OsdVertexDescriptor _vdesc; + int _numVertexElements; + int _vertexStride; + int _numVaryingElements; + int _varyingStride; }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/d3d11Mesh.h b/opensubdiv/osd/d3d11Mesh.h old mode 100644 new mode 100755 index ecffaeb3..ea1ae166 --- a/opensubdiv/osd/d3d11Mesh.h +++ b/opensubdiv/osd/d3d11Mesh.h @@ -124,6 +124,13 @@ public: virtual void Refine() { _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), _vertexBuffer, _varyingBuffer); } + virtual void Refine(OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc, + bool interleaved) { + _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), + _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer), + vertexDesc, varyingDesc); + } virtual void Synchronize() { _computeController->Synchronize(); } @@ -265,6 +272,13 @@ public: virtual void Refine() { _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), _vertexBuffer, _varyingBuffer); } + virtual void Refine(OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc, + bool interleaved) { + _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), + _vertexBuffer, (interleaved ? 
_vertexBuffer : _varyingBuffer), + vertexDesc, varyingDesc); + } virtual void Synchronize() { _computeController->Synchronize(); } diff --git a/opensubdiv/osd/gcdComputeController.cpp b/opensubdiv/osd/gcdComputeController.cpp index 77c603fe..796c506f 100644 --- a/opensubdiv/osd/gcdComputeController.cpp +++ b/opensubdiv/osd/gcdComputeController.cpp @@ -30,8 +30,7 @@ namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { -OsdGcdComputeController::OsdGcdComputeController() : - _currentVertexBuffer(0), _currentVaryingBuffer(0) { +OsdGcdComputeController::OsdGcdComputeController() { _gcd_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); } @@ -42,7 +41,8 @@ OsdGcdComputeController::ApplyBilinearFaceVerticesKernel( assert(context); OsdGcdComputeFace( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::F_IT)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::F_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), @@ -56,7 +56,8 @@ OsdGcdComputeController::ApplyBilinearEdgeVerticesKernel( assert(context); OsdGcdComputeBilinearEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), _gcd_queue); @@ -69,7 +70,8 @@ OsdGcdComputeController::ApplyBilinearVertexVerticesKernel( assert(context); OsdGcdComputeBilinearVertex( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const 
int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), _gcd_queue); @@ -82,7 +84,8 @@ OsdGcdComputeController::ApplyCatmarkFaceVerticesKernel( assert(context); OsdGcdComputeFace( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::F_IT)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::F_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), @@ -96,7 +99,8 @@ OsdGcdComputeController::ApplyCatmarkEdgeVerticesKernel( assert(context); OsdGcdComputeEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::E_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), @@ -110,7 +114,8 @@ OsdGcdComputeController::ApplyCatmarkVertexVerticesKernelB( assert(context); OsdGcdComputeVertexB( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), @@ -125,7 +130,8 @@ OsdGcdComputeController::ApplyCatmarkVertexVerticesKernelA1( assert(context); OsdGcdComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + 
_currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false, @@ -139,7 +145,8 @@ OsdGcdComputeController::ApplyCatmarkVertexVerticesKernelA2( assert(context); OsdGcdComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true, @@ -153,7 +160,8 @@ OsdGcdComputeController::ApplyLoopEdgeVerticesKernel( assert(context); OsdGcdComputeEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::E_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), @@ -167,7 +175,8 @@ OsdGcdComputeController::ApplyLoopVertexVerticesKernelB( assert(context); OsdGcdComputeLoopVertexB( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), @@ -182,7 +191,8 @@ OsdGcdComputeController::ApplyLoopVertexVerticesKernelA1( assert(context); OsdGcdComputeVertexA( 
- _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false, @@ -196,7 +206,8 @@ OsdGcdComputeController::ApplyLoopVertexVerticesKernelA2( assert(context); OsdGcdComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true, @@ -216,8 +227,8 @@ OsdGcdComputeController::ApplyVertexEdits( const OsdCpuTable * editValues = edit->GetEditValues(); if (edit->GetOperation() == FarVertexEdit::Add) { - OsdGcdEditVertexAdd(_vdesc, - _currentVertexBuffer, + OsdGcdEditVertexAdd(_currentBindState.vertexBuffer, + _currentBindState.vertexDesc, edit->GetPrimvarOffset(), edit->GetPrimvarWidth(), batch.GetVertexOffset(), @@ -228,8 +239,8 @@ OsdGcdComputeController::ApplyVertexEdits( static_cast(editValues->GetBuffer()), _gcd_queue); } else if (edit->GetOperation() == FarVertexEdit::Set) { - OsdGcdEditVertexSet(_vdesc, - _currentVertexBuffer, + OsdGcdEditVertexSet(_currentBindState.vertexBuffer, + _currentBindState.vertexDesc, edit->GetPrimvarOffset(), edit->GetPrimvarWidth(), batch.GetVertexOffset(), diff --git a/opensubdiv/osd/gcdComputeController.h b/opensubdiv/osd/gcdComputeController.h index e209c4a2..8df8ac40 100644 --- a/opensubdiv/osd/gcdComputeController.h +++ b/opensubdiv/osd/gcdComputeController.h @@ -29,6 +29,7 @@ #include "../far/dispatcher.h" #include 
"../osd/cpuComputeContext.h" +#include "../osd/vertexDescriptor.h" #include @@ -64,15 +65,25 @@ public: /// /// @param varyingBuffer varying-interpolated data buffer /// + /// @param vertexDesc the descriptor of vertex elements to be refined. + /// if it's null, all primvars in the vertex buffer + /// will be refined. + /// + /// @param varyingDesc the descriptor of varying elements to be refined. + /// if it's null, all primvars in the varying buffer + /// will be refined. + /// template void Refine(OsdCpuComputeContext const *context, FarKernelBatchVector const & batches, VERTEX_BUFFER *vertexBuffer, - VARYING_BUFFER *varyingBuffer) { + VARYING_BUFFER *varyingBuffer, + OsdVertexBufferDescriptor const *vertexDesc=NULL, + OsdVertexBufferDescriptor const *varyingDesc=NULL) { if (batches.empty()) return; - bind(vertexBuffer, varyingBuffer); + bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc); FarDispatcher::Refine(this, context, batches, /*maxlevel*/-1); @@ -130,26 +141,50 @@ protected: void ApplyVertexEdits(FarKernelBatch const &batch, ComputeContext const *context) const; template - void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying) { + void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, + OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { - _currentVertexBuffer = vertex ? vertex->BindCpuBuffer() : 0; - _currentVaryingBuffer = varying ? varying->BindCpuBuffer() : 0; + // if the vertex buffer descriptor is specified, use it. + // otherwise, assumes the data is tightly packed in the vertex buffer. + if (vertexDesc) { + _currentBindState.vertexDesc = *vertexDesc; + } else { + int numElements = vertex ? vertex->GetNumElements() : 0; + _currentBindState.vertexDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + if (varyingDesc) { + _currentBindState.varyingDesc = *varyingDesc; + } else { + int numElements = varying ? 
varying->GetNumElements() : 0; + _currentBindState.varyingDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } - int numVertexElements = vertex ? vertex->GetNumElements() : 0; - int numVaryingElements = varying ? varying->GetNumElements() : 0; - _vdesc.Set(numVertexElements, numVaryingElements); + _currentBindState.vertexBuffer = vertex ? vertex->BindCpuBuffer() : 0; + _currentBindState.varyingBuffer = varying ? varying->BindCpuBuffer() : 0; } void unbind() { - _currentVertexBuffer = 0; - _currentVaryingBuffer = 0; - _vdesc.Reset(); + _currentBindState.Reset(); } private: - dispatch_queue_t _gcd_queue; + struct BindState { + BindState() : vertexBuffer(NULL), varyingBuffer(NULL) {} + void Reset() { + vertexBuffer = varyingBuffer = NULL; + vertexDesc.Reset(); + varyingDesc.Reset(); + } + float *vertexBuffer; + float *varyingBuffer; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + }; - float *_currentVertexBuffer, *_currentVaryingBuffer; - OsdVertexDescriptor _vdesc; + BindState _currentBindState; + dispatch_queue_t _gcd_queue; }; diff --git a/opensubdiv/osd/gcdKernel.cpp b/opensubdiv/osd/gcdKernel.cpp index 3bddd4df..52a3c1c8 100644 --- a/opensubdiv/osd/gcdKernel.cpp +++ b/opensubdiv/osd/gcdKernel.cpp @@ -33,9 +33,32 @@ namespace OPENSUBDIV_VERSION { const int GCD_WORK_STRIDE = 32; +static inline void +clear(float *origin, int index, OsdVertexBufferDescriptor const &desc) { + + if (origin) { + float *dst = origin + index * desc.stride + desc.offset; + memset(dst, 0, desc.length * sizeof(float)); + } +} + +static inline void +addWithWeight(float *origin, int dstIndex, int srcIndex, + float weight, OsdVertexBufferDescriptor const &desc) { + + if (origin) { + const float *src = origin + srcIndex * desc.stride + desc.offset; + float *dst = origin + dstIndex * desc.stride + desc.offset; + for (int k = 0; k < desc.length; ++k) { + dst[k] += src[k] * weight; + } + } +} void OsdGcdComputeFace( - OsdVertexDescriptor const 
&vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *F_IT, const int *F_ITa, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq) { @@ -44,18 +67,22 @@ void OsdGcdComputeFace( dispatch_apply(workSize/GCD_WORK_STRIDE, gcdq, ^(size_t blockIdx){ const int start_i = start + blockIdx*GCD_WORK_STRIDE; const int end_i = start_i + GCD_WORK_STRIDE; - OsdCpuComputeFace(vdesc, vertex, varying, F_IT, F_ITa, + OsdCpuComputeFace(vertex, varying, vertexDesc, varyingDesc, + F_IT, F_ITa, vertexOffset, tableOffset, start_i, end_i); }); const int start_e = end - workSize%GCD_WORK_STRIDE; const int end_e = end; if (start_e < end_e) - OsdCpuComputeFace(vdesc, vertex, varying, F_IT, F_ITa, + OsdCpuComputeFace(vertex, varying, vertexDesc, varyingDesc, + F_IT, F_ITa, vertexOffset, tableOffset, start_e, end_e); } void OsdGcdComputeEdge( - OsdVertexDescriptor const &vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, const float *E_W, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq) { @@ -64,18 +91,22 @@ void OsdGcdComputeEdge( dispatch_apply(workSize/GCD_WORK_STRIDE, gcdq, ^(size_t blockIdx){ const int start_i = start + blockIdx*GCD_WORK_STRIDE; const int end_i = start_i + GCD_WORK_STRIDE; - OsdCpuComputeEdge(vdesc, vertex, varying, E_IT, E_W, + OsdCpuComputeEdge(vertex, varying, vertexDesc, varyingDesc, + E_IT, E_W, vertexOffset, tableOffset, start_i, end_i); }); const int start_e = end - workSize%GCD_WORK_STRIDE; const int end_e = end; if (start_e < end_e) - OsdCpuComputeEdge(vdesc, vertex, varying, E_IT, E_W, + OsdCpuComputeEdge(vertex, varying, vertexDesc, varyingDesc, + E_IT, E_W, vertexOffset, tableOffset, start_e, end_e); } void OsdGcdComputeVertexA( - 
OsdVertexDescriptor const &vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const float *V_W, int vertexOffset, int tableOffset, int start, int end, int pass, dispatch_queue_t gcdq) { @@ -84,18 +115,22 @@ void OsdGcdComputeVertexA( dispatch_apply(workSize/GCD_WORK_STRIDE, gcdq, ^(size_t blockIdx){ const int start_i = start + blockIdx*GCD_WORK_STRIDE; const int end_i = start_i + GCD_WORK_STRIDE; - OsdCpuComputeVertexA(vdesc, vertex, varying, V_ITa, V_W, + OsdCpuComputeVertexA(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_W, vertexOffset, tableOffset, start_i, end_i, pass); }); const int start_e = end - workSize%GCD_WORK_STRIDE; const int end_e = end; if (start_e < end_e) - OsdCpuComputeVertexA(vdesc, vertex, varying, V_ITa, V_W, + OsdCpuComputeVertexA(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_W, vertexOffset, tableOffset, start_e, end_e, pass); } void OsdGcdComputeVertexB( - OsdVertexDescriptor const &vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq) { @@ -104,18 +139,22 @@ void OsdGcdComputeVertexB( dispatch_apply(workSize/GCD_WORK_STRIDE, gcdq, ^(size_t blockIdx){ const int start_i = start + blockIdx*GCD_WORK_STRIDE; const int end_i = start_i + GCD_WORK_STRIDE; - OsdCpuComputeVertexB(vdesc, vertex, varying, V_ITa, V_IT, V_W, + OsdCpuComputeVertexB(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_IT, V_W, vertexOffset, tableOffset, start_i, end_i); }); const int start_e = end - workSize%GCD_WORK_STRIDE; const int end_e = end; if (start_e < end_e) - OsdCpuComputeVertexB(vdesc, vertex, varying, V_ITa, V_IT, V_W, + OsdCpuComputeVertexB(vertex, varying, vertexDesc, 
varyingDesc, + V_ITa, V_IT, V_W, vertexOffset, tableOffset, start_e, end_e); } void OsdGcdComputeLoopVertexB( - OsdVertexDescriptor const &vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq) { @@ -133,19 +172,22 @@ void OsdGcdComputeLoopVertexB( beta = (0.625f - beta) * wp; int dstIndex = vertexOffset + i - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); - vdesc.AddWithWeight(vertex, dstIndex, p, weight * (1.0f - (beta * n))); + addWithWeight(vertex, dstIndex, p, weight * (1.0f - (beta * n)), vertexDesc); for (int j = 0; j < n; ++j) - vdesc.AddWithWeight(vertex, dstIndex, V_IT[h+j], weight * beta); + addWithWeight(vertex, dstIndex, V_IT[h+j], weight * beta, vertexDesc); - vdesc.AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + addWithWeight(varying, dstIndex, p, 1.0f, varyingDesc); }); } void OsdGcdComputeBilinearEdge( - OsdVertexDescriptor const &vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq) { @@ -156,18 +198,21 @@ void OsdGcdComputeBilinearEdge( int eidx1 = E_IT[2*i+1]; int dstIndex = vertexOffset + i - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); - vdesc.AddWithWeight(vertex, dstIndex, eidx0, 0.5f); - vdesc.AddWithWeight(vertex, dstIndex, eidx1, 0.5f); + addWithWeight(vertex, dstIndex, eidx0, 0.5f, vertexDesc); + addWithWeight(vertex, dstIndex, eidx1, 0.5f, vertexDesc); - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx0, 
0.5f); - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx1, 0.5f); + addWithWeight(varying, dstIndex, eidx0, 0.5f, varyingDesc); + addWithWeight(varying, dstIndex, eidx1, 0.5f, varyingDesc); }); } void OsdGcdComputeBilinearVertex( - OsdVertexDescriptor const &vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq) { @@ -177,15 +222,17 @@ void OsdGcdComputeBilinearVertex( int p = V_ITa[i]; int dstIndex = vertexOffset + i - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); - vdesc.AddWithWeight(vertex, dstIndex, p, 1.0f); - vdesc.AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + addWithWeight(vertex, dstIndex, p, 1.0f, vertexDesc); + addWithWeight(varying, dstIndex, p, 1.0f, varyingDesc); }); } void OsdGcdEditVertexAdd( - OsdVertexDescriptor const &vdesc, float * vertex, + float * vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, @@ -195,14 +242,20 @@ void OsdGcdEditVertexAdd( int vertexCount = end - start; dispatch_apply(vertexCount, gcdq, ^(size_t blockIdx){ int i = start + blockIdx + tableOffset; - vdesc.ApplyVertexEditAdd(vertex, primVarOffset, primVarWidth, - editIndices[i] + vertexOffset, - &editValues[i*primVarWidth]); + + if (vertex) { + int editIndex = editIndices[i] + vertexOffset; + float *dst = vertex + editIndex * vertexDesc.stride + + vertexDesc.offset + primVarOffset; + + dst[i] += editValues[i]; + } }); } void OsdGcdEditVertexSet( - OsdVertexDescriptor const &vdesc, float * vertex, + float * vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, @@ -212,9 +265,14 @@ void 
OsdGcdEditVertexSet( int vertexCount = end - start; dispatch_apply(vertexCount, gcdq, ^(size_t blockIdx){ int i = start + blockIdx + tableOffset; - vdesc.ApplyVertexEditSet(vertex, primVarOffset, primVarWidth, - editIndices[i] + vertexOffset, - &editValues[i*primVarWidth]); + + if (vertex) { + int editIndex = editIndices[i] + vertexOffset; + float *dst = vertex + editIndex * vertexDesc.stride + + vertexDesc.offset + primVarOffset; + + dst[i] = editValues[i]; + } }); } diff --git a/opensubdiv/osd/gcdKernel.h b/opensubdiv/osd/gcdKernel.h index e50ba731..63184dfd 100644 --- a/opensubdiv/osd/gcdKernel.h +++ b/opensubdiv/osd/gcdKernel.h @@ -32,66 +32,75 @@ namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { -struct OsdVertexDescriptor; +struct OsdVertexBufferDescriptor; -void OsdGcdComputeFace(OsdVertexDescriptor const &vdesc, - float * vertex, float * varying, +void OsdGcdComputeFace(float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *F_IT, const int *F_ITa, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq); -void OsdGcdComputeEdge(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdGcdComputeEdge(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, const float *E_ITa, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq); -void OsdGcdComputeVertexA(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdGcdComputeVertexA(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const float *V_IT, int vertexOffset, int tableOffset, int start, int end, int pass, dispatch_queue_t gcdq); -void OsdGcdComputeVertexB(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdGcdComputeVertexB(float 
*vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq); -void OsdGcdComputeLoopVertexB(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdGcdComputeLoopVertexB(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq); -void OsdGcdComputeBilinearEdge(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdGcdComputeBilinearEdge(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq); -void OsdGcdComputeBilinearVertex(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdGcdComputeBilinearVertex(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, int vertexOffset, int tableOffset, int start, int end, dispatch_queue_t gcdq); -void OsdGcdEditVertexAdd(OsdVertexDescriptor const &vdesc, float *vertex, +void OsdGcdEditVertexAdd(float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, const unsigned int *editIndices, const float *editValues, dispatch_queue_t gcdq); -void OsdGcdEditVertexSet(OsdVertexDescriptor const &vdesc, float *vertex, +void OsdGcdEditVertexSet(float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, diff --git a/opensubdiv/osd/glMesh.h 
b/opensubdiv/osd/glMesh.h index eedfb9cf..ae43b238 100644 --- a/opensubdiv/osd/glMesh.h +++ b/opensubdiv/osd/glMesh.h @@ -29,6 +29,7 @@ #include "../osd/mesh.h" #include "../osd/glDrawContext.h" +#include "../osd/vertexDescriptor.h" #ifdef OPENSUBDIV_HAS_OPENCL #if defined(__APPLE__) @@ -125,6 +126,14 @@ public: virtual void Refine() { _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), _vertexBuffer, _varyingBuffer); } + virtual void Refine(OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc, + bool interleaved) { + _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), + _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer), + vertexDesc, varyingDesc); + } + virtual void Synchronize() { _computeController->Synchronize(); } @@ -250,6 +259,7 @@ public: virtual ~OsdMesh() { delete _farMesh; delete _vertexBuffer; + delete _varyingBuffer; delete _computeContext; delete _drawContext; } @@ -265,6 +275,14 @@ public: virtual void Refine() { _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), _vertexBuffer, _varyingBuffer); } + virtual void Refine(OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc, + bool interleaved) { + + _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), + _vertexBuffer, (interleaved ? 
_vertexBuffer : _varyingBuffer), + vertexDesc, varyingDesc); + } virtual void Synchronize() { _computeController->Synchronize(); } diff --git a/opensubdiv/osd/glslComputeController.cpp b/opensubdiv/osd/glslComputeController.cpp index 6e24cda7..d9f5f40f 100644 --- a/opensubdiv/osd/glslComputeController.cpp +++ b/opensubdiv/osd/glslComputeController.cpp @@ -34,8 +34,7 @@ namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { -OsdGLSLComputeController::OsdGLSLComputeController() - : _currentVertexBuffer(0), _currentVaryingBuffer(0), _currentKernelBundle(NULL) { +OsdGLSLComputeController::OsdGLSLComputeController() { } OsdGLSLComputeController::~OsdGLSLComputeController() { @@ -54,20 +53,21 @@ OsdGLSLComputeController::Synchronize() { } OsdGLSLComputeKernelBundle * -OsdGLSLComputeController::getKernels(int numVertexElements, - int numVaryingElements) { +OsdGLSLComputeController::getKernels( + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc) { std::vector::iterator it = std::find_if(_kernelRegistry.begin(), _kernelRegistry.end(), - OsdGLSLComputeKernelBundle::Match(numVertexElements, - numVaryingElements)); + OsdGLSLComputeKernelBundle::Match(vertexDesc, + varyingDesc)); if (it != _kernelRegistry.end()) { return *it; } else { OsdGLSLComputeKernelBundle *kernelBundle = new OsdGLSLComputeKernelBundle(); _kernelRegistry.push_back(kernelBundle); - kernelBundle->Compile(numVertexElements, numVaryingElements); + kernelBundle->Compile(vertexDesc, varyingDesc); return kernelBundle; } } @@ -75,18 +75,21 @@ OsdGLSLComputeController::getKernels(int numVertexElements, void OsdGLSLComputeController::bindBufferAndProgram() { - if (_currentVertexBuffer) - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, _currentVertexBuffer); + if (_currentBindState.vertexBuffer) + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, _currentBindState.vertexBuffer); - if (_currentVaryingBuffer) - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, _currentVaryingBuffer); + 
if (_currentBindState.varyingBuffer) + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, _currentBindState.varyingBuffer); - _currentKernelBundle->UseProgram(); + _currentBindState.kernelBundle->UseProgram(_currentBindState.vertexDesc.offset, + _currentBindState.varyingDesc.offset); + glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT); } void OsdGLSLComputeController::unbindBufferAndProgram() { + glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, 0); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, 0); glUseProgram(0); @@ -98,8 +101,9 @@ OsdGLSLComputeController::ApplyBilinearFaceVerticesKernel( assert(context); - _currentKernelBundle->ApplyBilinearFaceVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyBilinearFaceVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); } void @@ -108,8 +112,9 @@ OsdGLSLComputeController::ApplyBilinearEdgeVerticesKernel( assert(context); - _currentKernelBundle->ApplyBilinearEdgeVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyBilinearEdgeVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); } void @@ -118,8 +123,9 @@ OsdGLSLComputeController::ApplyBilinearVertexVerticesKernel( assert(context); - _currentKernelBundle->ApplyBilinearVertexVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyBilinearVertexVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); } void @@ -128,20 +134,20 @@ OsdGLSLComputeController::ApplyCatmarkFaceVerticesKernel( assert(context); - _currentKernelBundle->ApplyCatmarkFaceVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), 
batch.GetEnd()); + _currentBindState.kernelBundle->ApplyCatmarkFaceVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); } - - void OsdGLSLComputeController::ApplyCatmarkEdgeVerticesKernel( FarKernelBatch const &batch, OsdGLSLComputeContext const *context) const { assert(context); - _currentKernelBundle->ApplyCatmarkEdgeVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyCatmarkEdgeVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); } void @@ -150,8 +156,9 @@ OsdGLSLComputeController::ApplyCatmarkVertexVerticesKernelB( assert(context); - _currentKernelBundle->ApplyCatmarkVertexVerticesKernelB( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyCatmarkVertexVerticesKernelB( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); } void @@ -160,8 +167,9 @@ OsdGLSLComputeController::ApplyCatmarkVertexVerticesKernelA1( assert(context); - _currentKernelBundle->ApplyCatmarkVertexVerticesKernelA( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); + _currentBindState.kernelBundle->ApplyCatmarkVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), false); } void @@ -170,8 +178,9 @@ OsdGLSLComputeController::ApplyCatmarkVertexVerticesKernelA2( assert(context); - _currentKernelBundle->ApplyCatmarkVertexVerticesKernelA( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); + _currentBindState.kernelBundle->ApplyCatmarkVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), true); } void @@ -180,8 +189,9 @@ OsdGLSLComputeController::ApplyLoopEdgeVerticesKernel( assert(context); - 
_currentKernelBundle->ApplyLoopEdgeVerticesKernel( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyLoopEdgeVerticesKernel( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); } void @@ -190,8 +200,9 @@ OsdGLSLComputeController::ApplyLoopVertexVerticesKernelB( assert(context); - _currentKernelBundle->ApplyLoopVertexVerticesKernelB( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); + _currentBindState.kernelBundle->ApplyLoopVertexVerticesKernelB( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); } void @@ -200,8 +211,9 @@ OsdGLSLComputeController::ApplyLoopVertexVerticesKernelA1( assert(context); - _currentKernelBundle->ApplyLoopVertexVerticesKernelA( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); + _currentBindState.kernelBundle->ApplyLoopVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), false); } void @@ -210,8 +222,9 @@ OsdGLSLComputeController::ApplyLoopVertexVerticesKernelA2( assert(context); - _currentKernelBundle->ApplyLoopVertexVerticesKernelA( - batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); + _currentBindState.kernelBundle->ApplyLoopVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), true); } void @@ -229,12 +242,12 @@ OsdGLSLComputeController::ApplyVertexEdits( int primvarWidth = edit->GetPrimvarWidth(); if (edit->GetOperation() == FarVertexEdit::Add) { - _currentKernelBundle->ApplyEditAdd( primvarOffset, - primvarWidth, - batch.GetVertexOffset(), - batch.GetTableOffset(), - batch.GetStart(), - batch.GetEnd()); + _currentBindState.kernelBundle->ApplyEditAdd(primvarOffset, + primvarWidth, + batch.GetVertexOffset(), + batch.GetTableOffset(), + batch.GetStart(), + 
batch.GetEnd()); } else { // XXX: edit SET is not implemented yet. } diff --git a/opensubdiv/osd/glslComputeController.h b/opensubdiv/osd/glslComputeController.h index f052dbdc..e1e4c652 100644 --- a/opensubdiv/osd/glslComputeController.h +++ b/opensubdiv/osd/glslComputeController.h @@ -29,6 +29,7 @@ #include "../far/dispatcher.h" #include "../osd/glslComputeContext.h" +#include "../osd/vertexDescriptor.h" #include @@ -69,18 +70,25 @@ public: /// /// @param varyingBuffer varying-interpolated data buffer /// + /// @param vertexDesc the descriptor of vertex elements to be refined. + /// if it's null, all primvars in the vertex buffer + /// will be refined. + /// + /// @param varyingDesc the descriptor of varying elements to be refined. + /// if it's null, all primvars in the varying buffer + /// will be refined. + /// template void Refine(OsdGLSLComputeContext const *context, FarKernelBatchVector const &batches, VERTEX_BUFFER *vertexBuffer, - VARYING_BUFFER *varyingBuffer) { + VARYING_BUFFER *varyingBuffer, + OsdVertexBufferDescriptor const *vertexDesc=NULL, + OsdVertexBufferDescriptor const *varyingDesc=NULL) { if (batches.empty()) return; - int numVertexElements = vertexBuffer ? vertexBuffer->GetNumElements() : 0; - int numVaryingElements = varyingBuffer ? varyingBuffer->GetNumElements() : 0; - - bind(vertexBuffer, varyingBuffer, getKernels(numVertexElements, numVaryingElements)); + bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc); // bind table buffers. 
context->BindShaderStorageBuffers(); @@ -141,42 +149,69 @@ protected: void ApplyVertexEdits(FarKernelBatch const &batch, ComputeContext const *context) const; - OsdGLSLComputeKernelBundle * getKernels(int numVertexElements, - int numVaryingElements); + OsdGLSLComputeKernelBundle * getKernels( + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc); void bindBufferAndProgram(); void unbindBufferAndProgram(); template - void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, OsdGLSLComputeKernelBundle *kernelBundle) { + void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, + OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { - _currentVertexBuffer = vertex ? vertex->BindVBO() : 0; - _currentVaryingBuffer = varying ? varying->BindVBO() : 0; + // if the vertex buffer descriptor is specified, use it. + // otherwise, assumes the data is tightly packed in the vertex buffer. + if (vertexDesc) { + _currentBindState.vertexDesc = *vertexDesc; + } else { + int numElements = vertex ? vertex->GetNumElements() : 0; + _currentBindState.vertexDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + if (varyingDesc) { + _currentBindState.varyingDesc = *varyingDesc; + } else { + int numElements = varying ? varying->GetNumElements() : 0; + _currentBindState.varyingDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } - _vdesc.numVertexElements = vertex ? vertex->GetNumElements() : 0; - _vdesc.numVaryingElements = varying ? varying->GetNumElements() : 0; - - _currentKernelBundle = kernelBundle; + _currentBindState.vertexBuffer = vertex ? vertex->BindVBO() : 0; + _currentBindState.varyingBuffer = varying ? varying->BindVBO() : 0; + _currentBindState.kernelBundle = getKernels(_currentBindState.vertexDesc, + _currentBindState.varyingDesc); bindBufferAndProgram(); } /// Unbinds any previously bound vertex and varying data buffers. 
void unbind() { - _currentVertexBuffer = 0; - _currentVaryingBuffer = 0; + _currentBindState.Reset(); + + unbindBufferAndProgram(); } private: + struct BindState { + BindState() : vertexBuffer(0), varyingBuffer(0), kernelBundle(NULL) {} + void Reset() { + vertexBuffer = varyingBuffer = 0; + vertexDesc.Reset(); + varyingDesc.Reset(); + } + GLuint vertexBuffer; + GLuint varyingBuffer; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + OsdGLSLComputeKernelBundle *kernelBundle; + }; + + BindState _currentBindState; + std::vector _kernelRegistry; - - GLuint _currentVertexBuffer, _currentVaryingBuffer; - - OsdVertexDescriptor _vdesc; - - OsdGLSLComputeKernelBundle * _currentKernelBundle; - }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/glslComputeKernel.glsl b/opensubdiv/osd/glslComputeKernel.glsl index 08c8316b..2a24c245 100644 --- a/opensubdiv/osd/glslComputeKernel.glsl +++ b/opensubdiv/osd/glslComputeKernel.glsl @@ -31,6 +31,8 @@ uniform int vertexOffset = 0; // vertex index offset for the batch uniform int tableOffset = 0; // offset of subdivision table uniform int indexStart = 0; // start index relative to tableOffset uniform int indexEnd = 0; // end index relative to tableOffset +uniform int vertexBaseOffset = 0; // base vbo offset of the vertex buffer +uniform int varyingBaseOffset = 0; // base vbo offset of the varying buffer uniform bool vertexPass; /* @@ -40,6 +42,22 @@ uniform bool vertexPass; ^ ^ ^ vertexOffset | | indexStart indexEnd + + + +interleaved buffer example + +---------------------------+ + | x | y | z | r | g | b | a | + +---------------------------+ + ^ + vertexBaseOffset + ^ + varyingBaseOffset + +NUM_VERTEX_ELEMENTS = 3 +NUM_VARYING_ELEMENTS = 4 +VERTEX_STRIDE = VARYING_STRIDE = 7 + */ layout(binding=0) buffer vertex_buffer { float vertexBuffer[]; }; @@ -86,13 +104,15 @@ Vertex readVertex(int index) Vertex v; #if NUM_VERTEX_ELEMENTS > 0 + int vertexIndex = index * VERTEX_STRIDE + 
vertexBaseOffset; for (int i = 0; i < NUM_VERTEX_ELEMENTS; i++) { - v.vertexData[i] = vertexBuffer[index*NUM_VERTEX_ELEMENTS+i]; + v.vertexData[i] = vertexBuffer[vertexIndex + i]; } #endif #if NUM_VARYING_ELEMENTS > 0 + int varyingIndex = index * VARYING_STRIDE + varyingBaseOffset; for (int i = 0; i < NUM_VARYING_ELEMENTS; i++) { - v.varyingData[i] = varyingBuffer[index*NUM_VARYING_ELEMENTS+i]; + v.varyingData[i] = varyingBuffer[varyingIndex + i]; } #endif return v; @@ -101,13 +121,15 @@ Vertex readVertex(int index) void writeVertex(int index, Vertex v) { #if NUM_VERTEX_ELEMENTS > 0 + int vertexIndex = index * VERTEX_STRIDE + vertexBaseOffset; for (int i = 0; i < NUM_VERTEX_ELEMENTS; i++) { - vertexBuffer[index*NUM_VERTEX_ELEMENTS+i] = v.vertexData[i]; + vertexBuffer[vertexIndex + i] = v.vertexData[i]; } #endif #if NUM_VARYING_ELEMENTS > 0 + int varyingIndex = index * VARYING_STRIDE + varyingBaseOffset; for (int i = 0; i < NUM_VARYING_ELEMENTS; i++) { - varyingBuffer[index*NUM_VARYING_ELEMENTS+i] = v.varyingData[i]; + varyingBuffer[varyingIndex + i] = v.varyingData[i]; } #endif } @@ -152,6 +174,7 @@ void catmarkComputeFace() addWithWeight(dst, readVertex(index), weight); addVaryingWithWeight(dst, readVertex(index), weight); } + writeVertex(vid, dst); } @@ -356,6 +379,7 @@ void editAdd() // seemingly we can't iterate dynamically over vertexData[n] // due to mysterious glsl runtime limitation...? 
+#if NUM_VERTEX_ELEMENTS > 0 for (int j = 0; j < NUM_VERTEX_ELEMENTS; ++j) { float editValue = _editValues[i*editPrimVarWidth + min(j, editPrimVarWidth)]; editValue *= float(j >= editPrimVarOffset); @@ -363,6 +387,7 @@ void editAdd() dst.vertexData[j] += editValue; } writeVertex(v + vertexOffset, dst); +#endif } void main() diff --git a/opensubdiv/osd/glslKernelBundle.cpp b/opensubdiv/osd/glslKernelBundle.cpp index 94107c8e..aa140911 100644 --- a/opensubdiv/osd/glslKernelBundle.cpp +++ b/opensubdiv/osd/glslKernelBundle.cpp @@ -37,6 +37,7 @@ #include "../osd/opengl.h" #include +#include namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { @@ -46,7 +47,11 @@ static const char *shaderSource = ; OsdGLSLComputeKernelBundle::OsdGLSLComputeKernelBundle() - : _program(0) { + : _program(0), + _numVertexElements(0), + _vertexStride(0), + _numVaryingElements(0), + _varyingStride(0) { // XXX: too rough! _workGroupSize = 64; @@ -58,9 +63,14 @@ OsdGLSLComputeKernelBundle::~OsdGLSLComputeKernelBundle() { } bool -OsdGLSLComputeKernelBundle::Compile(int numVertexElements, int numVaryingElements) { +OsdGLSLComputeKernelBundle::Compile( + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc) { - _vdesc.Set(numVertexElements, numVaryingElements ); + _numVertexElements = vertexDesc.length; + _vertexStride = vertexDesc.stride; + _numVaryingElements = varyingDesc.length; + _varyingStride = varyingDesc.stride; if (_program) { glDeleteProgram(_program); @@ -70,15 +80,16 @@ OsdGLSLComputeKernelBundle::Compile(int numVertexElements, int numVaryingElement GLuint shader = glCreateShader(GL_COMPUTE_SHADER); - char constantDefine[256]; - snprintf(constantDefine, 256, - "#define NUM_VERTEX_ELEMENTS %d\n" - "#define NUM_VARYING_ELEMENTS %d\n" - "#define WORK_GROUP_SIZE %d\n", - numVertexElements, numVaryingElements, _workGroupSize); + std::ostringstream defines; + defines << "#define NUM_VERTEX_ELEMENTS " << _numVertexElements << "\n" + << "#define 
VERTEX_STRIDE " << _vertexStride << "\n" + << "#define NUM_VARYING_ELEMENTS " << _numVaryingElements << "\n" + << "#define VARYING_STRIDE " << _varyingStride << "\n" + << "#define WORK_GROUP_SIZE " << _workGroupSize << "\n"; + std::string defineStr = defines.str(); const char *shaderSources[3]; - shaderSources[0] = constantDefine; + shaderSources[0] = defineStr.c_str(); shaderSources[1] = shaderSource; glShaderSource(shader, 2, shaderSources, NULL); glCompileShader(shader); @@ -98,9 +109,6 @@ OsdGLSLComputeKernelBundle::Compile(int numVertexElements, int numVaryingElement glDeleteProgram(_program); _program = 0; - // XXX ERROR HANDLE - printf("%s\n", constantDefine); - assert(false); return false; } @@ -129,11 +137,13 @@ OsdGLSLComputeKernelBundle::Compile(int numVertexElements, int numVaryingElement "loopComputeVertexB"); // set uniform locations for compute - _uniformVertexPass = glGetUniformLocation(_program, "vertexPass"); - _uniformVertexOffset = glGetUniformLocation(_program, "vertexOffset"); - _uniformTableOffset = glGetUniformLocation(_program, "tableOffset"); - _uniformIndexStart = glGetUniformLocation(_program, "indexStart"); - _uniformIndexEnd = glGetUniformLocation(_program, "indexEnd"); + _uniformVertexPass = glGetUniformLocation(_program, "vertexPass"); + _uniformVertexOffset = glGetUniformLocation(_program, "vertexOffset"); + _uniformTableOffset = glGetUniformLocation(_program, "tableOffset"); + _uniformIndexStart = glGetUniformLocation(_program, "indexStart"); + _uniformIndexEnd = glGetUniformLocation(_program, "indexEnd"); + _uniformVertexBaseOffset = glGetUniformLocation(_program, "vertexBaseOffset"); + _uniformVaryingBaseOffset = glGetUniformLocation(_program, "varyingBaseOffset"); _tableUniforms[FarSubdivisionTables::F_IT] = glGetUniformLocation(_program, "_F0_IT"); _tableUniforms[FarSubdivisionTables::F_ITa] = glGetUniformLocation(_program, "_F0_ITa"); @@ -176,8 +186,7 @@ OsdGLSLComputeKernelBundle::dispatchCompute( // we found a problem (issue 
#295) with nvidia driver 331.49 / Quadro4000 // resulting invalid vertices. // Apparently adding TEXTURE_FETCH_BARRIER after face kernel fixes it. - // We'll revisit this later. - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + // The workaround is commented out, since it looks fixed at driver 334.xx. } void @@ -186,6 +195,8 @@ OsdGLSLComputeKernelBundle::ApplyBilinearFaceVerticesKernel( glUniformSubroutinesuiv(GL_COMPUTE_SHADER, 1, &_subComputeFace); dispatchCompute(vertexOffset, tableOffset, start, end); + + // glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT); } void @@ -213,8 +224,8 @@ OsdGLSLComputeKernelBundle::ApplyCatmarkFaceVerticesKernel( dispatchCompute(vertexOffset, tableOffset, start, end); // see the comment in dispatchCompute() - // this workaround could be a performance problem - glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT); + // this workaround causes a performance problem. + // glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT); } void @@ -279,9 +290,13 @@ OsdGLSLComputeKernelBundle::ApplyEditAdd( } void -OsdGLSLComputeKernelBundle::UseProgram() const +OsdGLSLComputeKernelBundle::UseProgram(int vertexBaseOffset, + int varyingBaseOffset) const { glUseProgram(_program); + + glUniform1i(_uniformVertexBaseOffset, vertexBaseOffset); + glUniform1i(_uniformVaryingBaseOffset, varyingBaseOffset); } } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/glslKernelBundle.h b/opensubdiv/osd/glslKernelBundle.h index fbd36e91..b79030d9 100644 --- a/opensubdiv/osd/glslKernelBundle.h +++ b/opensubdiv/osd/glslKernelBundle.h @@ -42,7 +42,8 @@ public: OsdGLSLComputeKernelBundle(); ~OsdGLSLComputeKernelBundle(); - bool Compile(int numVertexElements, int numVaryingElements); + bool Compile(OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc); void ApplyBilinearFaceVerticesKernel( int vertexOffset, int tableOffset, int start, int end); @@ -75,32 +76,40 @@ public: int vertexOffset, int tableOffset, int start, int end, bool 
pass); void ApplyEditAdd(int primvarOffset, int primvarWidth, - int vertexOffset, int tableOffset, int start, int end); + int vertexOffset, int tableOffset, + int start, int end); - void UseProgram() const; + void UseProgram(int vertexBaseOffset, int varyingBaseOffset) const; GLuint GetTableUniformLocation(int tableIndex) const { return _tableUniforms[tableIndex]; } struct Match { - /// Constructor - Match(int numVertexElements, int numVaryingElements) - : vdesc(numVertexElements, numVaryingElements) { + Match(OsdVertexBufferDescriptor const &vertex, + OsdVertexBufferDescriptor const &varying) + : vertexDesc(vertex), varyingDesc(varying) { } bool operator() (OsdGLSLComputeKernelBundle const *kernel) { - return vdesc == kernel->_vdesc; + // offset is dynamic. just comparing length and stride here, + // returns true if they are equal + return (vertexDesc.length == kernel->_numVertexElements and + vertexDesc.stride == kernel->_vertexStride and + varyingDesc.length == kernel->_numVaryingElements and + varyingDesc.stride == kernel->_varyingStride); } - OsdVertexDescriptor vdesc; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; }; friend struct Match; protected: - void dispatchCompute(int vertexOffset, int tableOffset, int start, int end) const; + void dispatchCompute(int vertexOffset, int tableOffset, + int start, int end) const ; GLuint _program; @@ -111,6 +120,8 @@ protected: GLuint _uniformTableOffset; GLuint _uniformIndexStart; GLuint _uniformIndexEnd; + GLuint _uniformVertexBaseOffset; + GLuint _uniformVaryingBaseOffset; // uniform locations for vertex edit GLuint _uniformEditPrimVarOffset; @@ -135,7 +146,10 @@ protected: int _workGroupSize; - OsdVertexDescriptor _vdesc; + int _numVertexElements; + int _vertexStride; + int _numVaryingElements; + int _varyingStride; }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/glslPatchCommon.glsl b/opensubdiv/osd/glslPatchCommon.glsl index 591dedbd..fe95985e 100644 --- 
a/opensubdiv/osd/glslPatchCommon.glsl +++ b/opensubdiv/osd/glslPatchCommon.glsl @@ -140,6 +140,7 @@ mat4 OsdModelViewProjectionMatrix(); float OsdTessLevel(); int OsdGregoryQuadOffsetBase(); int OsdPrimitiveIdBase(); +int OsdBaseVertex(); float GetTessLevel(int patchLevel) { diff --git a/opensubdiv/osd/glslPatchGregory.glsl b/opensubdiv/osd/glslPatchGregory.glsl index e228a3dd..4e377ac5 100644 --- a/opensubdiv/osd/glslPatchGregory.glsl +++ b/opensubdiv/osd/glslPatchGregory.glsl @@ -68,6 +68,14 @@ out block { OSD_USER_VARYING_DECLARE } outpt; +vec3 readVertex(uint vertexIndex) +{ + vertexIndex += OsdBaseVertex(); + return vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*vertexIndex)).x, + texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*vertexIndex+1)).x, + texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*vertexIndex+2)).x); +} + void main() { int vID = gl_VertexID; @@ -122,38 +130,23 @@ void main() } #endif - vec3 neighbor = - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_neighbor)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_neighbor+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_neighbor+2)).x); + vec3 neighbor = readVertex(idx_neighbor); uint idx_diagonal = uint(texelFetch(OsdValenceBuffer, int(vID * (2*OSD_MAX_VALENCE+1) + 2*i + 1 + 1)).x); - vec3 diagonal = - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_diagonal)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_diagonal+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_diagonal+2)).x); + vec3 diagonal = readVertex(idx_diagonal); uint idx_neighbor_p = uint(texelFetch(OsdValenceBuffer, int(vID * (2*OSD_MAX_VALENCE+1) + 2*ip + 0 + 1)).x); - vec3 neighbor_p = - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_neighbor_p)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_neighbor_p+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_neighbor_p+2)).x); + vec3 neighbor_p = readVertex(idx_neighbor_p); uint 
idx_neighbor_m = uint(texelFetch(OsdValenceBuffer, int(vID * (2*OSD_MAX_VALENCE+1) + 2*im + 0 + 1)).x); - vec3 neighbor_m = - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_neighbor_m)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_neighbor_m+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_neighbor_m+2)).x); + vec3 neighbor_m = readVertex(idx_neighbor_m); uint idx_diagonal_m = uint(texelFetch(OsdValenceBuffer, int(vID * (2*OSD_MAX_VALENCE+1) + 2*im + 1 + 1)).x); - vec3 diagonal_m = - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_diagonal_m)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_diagonal_m+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_diagonal_m+2)).x); + vec3 diagonal_m = readVertex(idx_diagonal_m); f[i] = (pos * float(valence) + (neighbor_p + neighbor)*2.0f + diagonal) / (float(valence)+5.0f); @@ -186,24 +179,16 @@ void main() if (ivalence < 0) { if (valence > 2) { outpt.v.position = ( - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[0])).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[0]+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[0]+2)).x) + - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[1])).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[1]+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[1]+2)).x) + + readVertex(boundaryEdgeNeighbors[0]) + + readVertex(boundaryEdgeNeighbors[1]) + 4.0f * pos)/6.0f; } else { outpt.v.position = pos; } outpt.v.e0 = ( - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[0])).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[0]+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[0]+2)).x) - - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[1])).x, - 
texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[1]+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[1]+2)).x) + readVertex(boundaryEdgeNeighbors[0]) - + readVertex(boundaryEdgeNeighbors[1]) )/6.0; float k = float(float(valence) - 1.0f); //k is the number of faces @@ -216,18 +201,11 @@ void main() int idx_diagonal = texelFetch(OsdValenceBuffer,int((vID) * (2*OSD_MAX_VALENCE+1) + 2*zerothNeighbor + 1 + 1)).x; idx_diagonal = abs(idx_diagonal); - vec3 diagonal = - vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_diagonal)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_diagonal+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*idx_diagonal+2)).x); + vec3 diagonal = readVertex(idx_diagonal); outpt.v.e1 = gamma * pos + - alpha_0k * vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[0])).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[0]+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[0]+2)).x) + - alpha_0k * vec3(texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[1])).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[1]+1)).x, - texelFetch(OsdVertexBuffer, int(OSD_NUM_ELEMENTS*boundaryEdgeNeighbors[1]+2)).x) + + alpha_0k * readVertex(boundaryEdgeNeighbors[0]) + + alpha_0k * readVertex(boundaryEdgeNeighbors[1]) + beta_0 * diagonal; for (uint x=1; x::iterator it = std::find_if(_kernelRegistry.begin(), _kernelRegistry.end(), - OsdGLSLTransformFeedbackKernelBundle::Match(numVertexElements, - numVaryingElements)); + OsdGLSLTransformFeedbackKernelBundle::Match( + vertexDesc, varyingDesc, interleaved)); + if (it != _kernelRegistry.end()) { return *it; } else { - OsdGLSLTransformFeedbackKernelBundle *kernelBundle = new OsdGLSLTransformFeedbackKernelBundle(); + OsdGLSLTransformFeedbackKernelBundle *kernelBundle = + new OsdGLSLTransformFeedbackKernelBundle(); 
_kernelRegistry.push_back(kernelBundle); - kernelBundle->Compile(numVertexElements, numVaryingElements); + kernelBundle->Compile(vertexDesc, varyingDesc, interleaved); return kernelBundle; } } @@ -86,54 +88,59 @@ bindTexture(GLint samplerUniform, GLuint texture, int unit) { } void -OsdGLSLTransformFeedbackComputeController::bindTextures() { +OsdGLSLTransformFeedbackComputeController::bindResources() { glEnable(GL_RASTERIZER_DISCARD); - _currentKernelBundle->UseProgram(); + _currentBindState.kernelBundle->UseProgram(_currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); // bind vertex texture - if (_currentVertexBuffer) { + if (_currentBindState.vertexBuffer) { if (not _vertexTexture) glGenTextures(1, &_vertexTexture); #if defined(GL_EXT_direct_state_access) if (glTextureBufferEXT) { - glTextureBufferEXT(_vertexTexture, GL_TEXTURE_BUFFER, GL_R32F, _currentVertexBuffer); + glTextureBufferEXT(_vertexTexture, GL_TEXTURE_BUFFER, GL_R32F, _currentBindState.vertexBuffer); } else { #else { #endif glBindTexture(GL_TEXTURE_BUFFER, _vertexTexture); - glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, _currentVertexBuffer); + glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, _currentBindState.vertexBuffer); glBindTexture(GL_TEXTURE_BUFFER, 0); } } - if (_currentVaryingBuffer) { + if (_currentBindState.varyingBuffer) { if (not _varyingTexture) glGenTextures(1, &_varyingTexture); #if defined(GL_EXT_direct_state_access) if (glTextureBufferEXT) { - glTextureBufferEXT(_varyingTexture, GL_TEXTURE_BUFFER, GL_R32F, _currentVaryingBuffer); + glTextureBufferEXT(_varyingTexture, GL_TEXTURE_BUFFER, GL_R32F, _currentBindState.varyingBuffer); } else { #else { #endif glBindTexture(GL_TEXTURE_BUFFER, _varyingTexture); - glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, _currentVaryingBuffer); + glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, _currentBindState.varyingBuffer); glBindTexture(GL_TEXTURE_BUFFER, 0); } } if (_vertexTexture) - bindTexture(_currentKernelBundle->GetVertexUniformLocation(), 
_vertexTexture, 0); + bindTexture(_currentBindState.kernelBundle->GetVertexUniformLocation(), _vertexTexture, 0); if (_varyingTexture) - bindTexture(_currentKernelBundle->GetVaryingUniformLocation(), _varyingTexture, 1); + bindTexture(_currentBindState.kernelBundle->GetVaryingUniformLocation(), _varyingTexture, 1); // bind vertex texture image (for edit kernel) - glUniform1i(_currentKernelBundle->GetVertexBufferImageUniformLocation(), 0); + glUniform1i(_currentBindState.kernelBundle->GetVertexBufferImageUniformLocation(), 0); glBindImageTexture(0, _vertexTexture, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32F); + + // bind vertex array + // always create new one, to be safe with multiple contexts. + glGenVertexArrays(1, &_vao); + glBindVertexArray(_vao); } void -OsdGLSLTransformFeedbackComputeController::unbindTextures() { +OsdGLSLTransformFeedbackComputeController::unbindResources() { glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_BUFFER, 0); @@ -146,6 +153,10 @@ OsdGLSLTransformFeedbackComputeController::unbindTextures() { glDisable(GL_RASTERIZER_DISCARD); glUseProgram(0); glActiveTexture(GL_TEXTURE0); + + // unbind vertex array + glBindVertexArray(0); + glDeleteVertexArrays(1, &_vao); } void @@ -154,9 +165,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyBilinearFaceVerticesKernel( assert(context); - _currentKernelBundle->ApplyBilinearFaceVerticesKernel( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyBilinearFaceVerticesKernel( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -166,9 +177,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyBilinearEdgeVerticesKernel( assert(context); - _currentKernelBundle->ApplyBilinearEdgeVerticesKernel( - _currentVertexBuffer, 
_vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyBilinearEdgeVerticesKernel( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -178,9 +189,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyBilinearVertexVerticesKernel( assert(context); - _currentKernelBundle->ApplyBilinearVertexVerticesKernel( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyBilinearVertexVerticesKernel( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -190,9 +201,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyCatmarkFaceVerticesKernel( assert(context); - _currentKernelBundle->ApplyCatmarkFaceVerticesKernel( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyCatmarkFaceVerticesKernel( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -204,9 +215,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyCatmarkEdgeVerticesKernel( assert(context); - _currentKernelBundle->ApplyCatmarkEdgeVerticesKernel( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyCatmarkEdgeVerticesKernel( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, 
batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -216,9 +227,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyCatmarkVertexVerticesKernelB( assert(context); - _currentKernelBundle->ApplyCatmarkVertexVerticesKernelB( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyCatmarkVertexVerticesKernelB( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -228,9 +239,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyCatmarkVertexVerticesKernelA1( assert(context); - _currentKernelBundle->ApplyCatmarkVertexVerticesKernelA( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyCatmarkVertexVerticesKernelA( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); } @@ -240,9 +251,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyCatmarkVertexVerticesKernelA2( assert(context); - _currentKernelBundle->ApplyCatmarkVertexVerticesKernelA( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyCatmarkVertexVerticesKernelA( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); } @@ -252,9 +263,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyLoopEdgeVerticesKernel( assert(context); - _currentKernelBundle->ApplyLoopEdgeVerticesKernel( - 
_currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyLoopEdgeVerticesKernel( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -264,9 +275,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyLoopVertexVerticesKernelB( assert(context); - _currentKernelBundle->ApplyLoopVertexVerticesKernelB( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyLoopVertexVerticesKernelB( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -276,9 +287,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyLoopVertexVerticesKernelA1( assert(context); - _currentKernelBundle->ApplyLoopVertexVerticesKernelA( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyLoopVertexVerticesKernelA( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); } @@ -288,9 +299,9 @@ OsdGLSLTransformFeedbackComputeController::ApplyLoopVertexVerticesKernelA2( assert(context); - _currentKernelBundle->ApplyLoopVertexVerticesKernelA( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyLoopVertexVerticesKernelA( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, 
_currentBindState.varyingDesc.offset, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); } @@ -303,15 +314,15 @@ OsdGLSLTransformFeedbackComputeController::ApplyVertexEdits( const OsdGLSLTransformFeedbackHEditTable * edit = context->GetEditTable(batch.GetTableIndex()); assert(edit); - context->BindEditTextures(batch.GetTableIndex(), _currentKernelBundle); + context->BindEditTextures(batch.GetTableIndex(), _currentBindState.kernelBundle); int primvarOffset = edit->GetPrimvarOffset(); int primvarWidth = edit->GetPrimvarWidth(); if (edit->GetOperation() == FarVertexEdit::Add) { - _currentKernelBundle->ApplyEditAdd( - _currentVertexBuffer, _vdesc.numVertexElements, - _currentVaryingBuffer, _vdesc.numVaryingElements, + _currentBindState.kernelBundle->ApplyEditAdd( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, primvarOffset, primvarWidth, batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } else { diff --git a/opensubdiv/osd/glslTransformFeedbackComputeController.h b/opensubdiv/osd/glslTransformFeedbackComputeController.h index 5a4ca903..08239f72 100644 --- a/opensubdiv/osd/glslTransformFeedbackComputeController.h +++ b/opensubdiv/osd/glslTransformFeedbackComputeController.h @@ -29,6 +29,7 @@ #include "../far/dispatcher.h" #include "../osd/glslTransformFeedbackComputeContext.h" +#include "../osd/vertexDescriptor.h" #include @@ -69,16 +70,26 @@ public: /// /// @param varyingBuffer varying-interpolated data buffer /// + /// @param vertexDesc the descriptor of vertex elements to be refined. + /// if it's null, all primvars in the vertex buffer + /// will be refined. + /// + /// @param varyingDesc the descriptor of varying elements to be refined. + /// if it's null, all primvars in the varying buffer + /// will be refined. 
+ /// template void Refine(OsdGLSLTransformFeedbackComputeContext const *context, FarKernelBatchVector const &batches, VERTEX_BUFFER *vertexBuffer, - VARYING_BUFFER *varyingBuffer) { + VARYING_BUFFER *varyingBuffer, + OsdVertexBufferDescriptor const *vertexDesc=NULL, + OsdVertexBufferDescriptor const *varyingDesc=NULL) { if (batches.empty()) return; - bind(vertexBuffer, varyingBuffer); - context->BindTableTextures(_currentKernelBundle); + bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc); + context->BindTableTextures(_currentBindState.kernelBundle); FarDispatcher::Refine(this, context, batches, /*maxlevel*/-1); @@ -136,47 +147,73 @@ protected: void ApplyVertexEdits(FarKernelBatch const &batch, ComputeContext const *context) const; - OsdGLSLTransformFeedbackKernelBundle * getKernels(int numVertexElements, - int numVaryingElements); + OsdGLSLTransformFeedbackKernelBundle * getKernels( + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + bool interleaved); - void bindTextures(); + void bindResources(); - void unbindTextures(); + void unbindResources(); template - void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying) { + void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, + OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { - _currentVertexBuffer = vertex ? vertex->BindVBO() : 0; - _currentVaryingBuffer = varying ? varying->BindVBO() : 0; + // if the vertex buffer descriptor is specified, use it. + // otherwise, assumes the data is tightly packed in the vertex buffer. + if (vertexDesc) { + _currentBindState.vertexDesc = *vertexDesc; + } else { + int numElements = vertex ? vertex->GetNumElements() : 0; + _currentBindState.vertexDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + if (varyingDesc) { + _currentBindState.varyingDesc = *varyingDesc; + } else { + int numElements = varying ? 
varying->GetNumElements() : 0; + _currentBindState.varyingDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } - _vdesc.numVertexElements = vertex ? vertex->GetNumElements() : 0; - _vdesc.numVaryingElements = varying ? varying->GetNumElements() : 0; + bool interleaved = (vertex and varying and (vertex == varying)); + _currentBindState.vertexBuffer = vertex ? vertex->BindVBO() : 0; + _currentBindState.varyingBuffer = varying ? varying->BindVBO() : 0; + _currentBindState.kernelBundle = getKernels(_currentBindState.vertexDesc, + _currentBindState.varyingDesc, + interleaved); - _currentKernelBundle = - getKernels(_vdesc.numVertexElements, _vdesc.numVaryingElements); - - bindTextures(); + bindResources(); } /// Unbinds any previously bound vertex and varying data buffers. void unbind() { - _currentVertexBuffer = 0; - _currentVaryingBuffer = 0; - _currentKernelBundle = NULL; + _currentBindState.Reset(); - unbindTextures(); + unbindResources(); } private: + struct BindState { + BindState() : vertexBuffer(0), varyingBuffer(0), kernelBundle(NULL) {} + void Reset() { + vertexBuffer = varyingBuffer = 0; + vertexDesc.Reset(); + varyingDesc.Reset(); + } + GLuint vertexBuffer; + GLuint varyingBuffer; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + OsdGLSLTransformFeedbackKernelBundle *kernelBundle; + }; + BindState _currentBindState; + std::vector _kernelRegistry; - GLuint _vertexTexture, _varyingTexture; - GLuint _currentVertexBuffer, _currentVaryingBuffer; - - OsdVertexDescriptor _vdesc; - - OsdGLSLTransformFeedbackKernelBundle * _currentKernelBundle; - + GLuint _vao; }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/glslTransformFeedbackKernel.glsl b/opensubdiv/osd/glslTransformFeedbackKernel.glsl index f81c7e3e..b1b9a084 100644 --- a/opensubdiv/osd/glslTransformFeedbackKernel.glsl +++ b/opensubdiv/osd/glslTransformFeedbackKernel.glsl @@ -41,6 +41,8 @@ layout(size1x32) uniform imageBuffer 
_vertexBufferImage; uniform int vertexOffset = 0; // vertex index offset for the batch uniform int tableOffset = 0; // offset of subdivision table uniform int indexStart = 0; // start index relative to tableOffset +uniform int vertexBaseOffset = 0; // base vbo offset of the vertex buffer +uniform int varyingBaseOffset = 0; // base vbo offset of the varying buffer uniform bool vertexPass; /* @@ -50,6 +52,12 @@ uniform bool vertexPass; ^ ^ vertexOffset | indexStart + + +NUM_VERTEX_ELEMENTS = 3 +NUM_VARYING_ELEMENTS = 4 +VERTEX_STRIDE = VARYING_STRIDE = 7 + */ //-------------------------------------------------------------------------------- @@ -100,13 +108,15 @@ Vertex readVertex(int index) // unpacking #if NUM_VERTEX_ELEMENTS > 0 + int vertexIndex = index * VERTEX_STRIDE; for(int i = 0; i < NUM_VERTEX_ELEMENTS; i++) { - v.vertexData[i] = texelFetch(vertexData, index*NUM_VERTEX_ELEMENTS+i).x; + v.vertexData[i] = texelFetch(vertexData, vertexIndex+i+vertexBaseOffset).x; } #endif #if NUM_VARYING_ELEMENTS > 0 + int varyingIndex = index * VARYING_STRIDE; for(int i = 0; i < NUM_VARYING_ELEMENTS; i++){ - v.varyingData[i] = texelFetch(varyingData, index*NUM_VARYING_ELEMENTS+i).x; + v.varyingData[i] = texelFetch(varyingData, varyingIndex+i+varyingBaseOffset).x; } #endif return v; @@ -130,7 +140,7 @@ void writeVertex(Vertex v) void writeVertexByImageStore(Vertex v, int index) { #if NUM_VERTEX_ELEMENTS > 0 - int p = index * NUM_VERTEX_ELEMENTS; + int p = index * VERTEX_STRIDE + vertexBaseOffset; for(int i = 0; i < NUM_VERTEX_ELEMENTS; i++) { imageStore(_vertexBufferImage, p+i, vec4(v.vertexData[i], 0, 0, 0)); } diff --git a/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp b/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp index e32b4b07..20fd98a1 100644 --- a/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp +++ b/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp @@ -40,6 +40,7 @@ #include #include +#include namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { 
@@ -61,7 +62,12 @@ static const char *shaderDefines = "" ; OsdGLSLTransformFeedbackKernelBundle::OsdGLSLTransformFeedbackKernelBundle() - : _program(0) { + : _program(0), + _numVertexElements(0), + _vertexStride(0), + _numVaryingElements(0), + _varyingStride(0), + _interleaved(false) { } OsdGLSLTransformFeedbackKernelBundle::~OsdGLSLTransformFeedbackKernelBundle() { @@ -70,24 +76,34 @@ OsdGLSLTransformFeedbackKernelBundle::~OsdGLSLTransformFeedbackKernelBundle() { } bool -OsdGLSLTransformFeedbackKernelBundle::Compile(int numVertexElements, int numVaryingElements) { +OsdGLSLTransformFeedbackKernelBundle::Compile( + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + bool interleaved) { - assert(numVertexElements >= 3); // at least xyz required (for performance reason) + _numVertexElements = vertexDesc.length; + _vertexStride = vertexDesc.stride; + _numVaryingElements = varyingDesc.length; + _varyingStride = varyingDesc.stride; + _interleaved = interleaved; + + // modulo of vbo offset + _vertexOffsetMod = (_vertexStride ? vertexDesc.offset % _vertexStride : 0); + _varyingOffsetMod = (_varyingStride ? 
varyingDesc.offset % _varyingStride : 0); - _vdesc.Set(numVertexElements, numVaryingElements); - _program = glCreateProgram(); GLuint shader = glCreateShader(GL_VERTEX_SHADER); - char constantDefine[256]; - snprintf(constantDefine, 256, - "#define NUM_VERTEX_ELEMENTS %d\n" - "#define NUM_VARYING_ELEMENTS %d\n", - numVertexElements, numVaryingElements); + std::ostringstream defines; + defines << "#define NUM_VERTEX_ELEMENTS " << _numVertexElements << "\n" + << "#define VERTEX_STRIDE " << _vertexStride << "\n" + << "#define NUM_VARYING_ELEMENTS " << _numVaryingElements << "\n" + << "#define VARYING_STRIDE " << _varyingStride << "\n"; + std::string defineStr = defines.str(); const char *shaderSources[3]; - shaderSources[0] = constantDefine; + shaderSources[0] = defineStr.c_str(); shaderSources[1] = shaderDefines; shaderSources[2] = shaderSource; glShaderSource(shader, 3, shaderSources, NULL); @@ -96,21 +112,85 @@ OsdGLSLTransformFeedbackKernelBundle::Compile(int numVertexElements, int numVary std::vector outputs; - // position and custom vertex data are stored same buffer whereas varying data - // exists on another buffer. "gl_NextBuffer" identifier helps to split them. - for (int i = 0; i < numVertexElements; ++i) { + /* + output attribute array + + - interleaved + outVertexData[0] + outVertexData[1] + outVertexData[2] + (gl_SkipComponents1) + outVaryingData[0] + outVaryingData[1] + outVaryingData[2] + outVaryingData[3] + (gl_SkipComponents1) + ... 
+ + + - non-interleaved + outVertexData[0] + outVertexData[1] + outVertexData[2] + gl_NextBuffer + outVaryingData[0] + outVaryingData[1] + outVaryingData[2] + outVaryingData[3] + + */ + + if (_interleaved) { + assert(_vertexStride == _varyingStride); + assert(_numVertexElements + _numVaryingElements <= _vertexStride); char attrName[32]; - snprintf(attrName, 32, "outVertexData[%d]", i); - outputs.push_back(attrName); - } - for (int i = 0; i < numVaryingElements; ++i) { - if (i == 0 and (not outputs.empty())) { + + for (int i = 0; i < _vertexStride; ++i) { + int vertexElem = i - _vertexOffsetMod; + int varyingElem = i - _varyingOffsetMod; + + if (vertexElem >= 0 and vertexElem < _numVertexElements) { + snprintf(attrName, 32, "outVertexData[%d]", vertexElem); + outputs.push_back(attrName); + } else if (varyingElem >= 0 and varyingElem <= _numVaryingElements) { + snprintf(attrName, 32, "outVaryingData[%d]", varyingElem); + outputs.push_back(attrName); + } else { + outputs.push_back("gl_SkipComponents1"); + } + } + } else { + // non-interleaved + char attrName[32]; + + // vertex data (may include custom vertex data) and varying data + // are stored into the same buffer, interleaved. 
+ for (int i = 0; i < _vertexOffsetMod; ++i) + outputs.push_back("gl_SkipComponents1"); + for (int i = 0; i < _numVertexElements; ++i) { + snprintf(attrName, 32, "outVertexData[%d]", i); + outputs.push_back(attrName); + } + for (int i = _numVertexElements + _vertexOffsetMod; i < _vertexStride; ++i) + outputs.push_back("gl_SkipComponents1"); + + // varying + if (_numVaryingElements) { outputs.push_back("gl_NextBuffer"); } - char attrName[32]; - snprintf(attrName, 32, "outVaryingData[%d]", i); - outputs.push_back(attrName); + for (int i = 0; i < _varyingOffsetMod; ++i) { + outputs.push_back("gl_SkipComponents1"); + } + for (int i = 0; i < _numVaryingElements; ++i) { + snprintf(attrName, 32, "outVaryingData[%d]", i); + outputs.push_back(attrName); + } + for (int i = _numVaryingElements + _varyingOffsetMod; i < _varyingStride; ++i) { + outputs.push_back("gl_SkipComponents1"); + } } + + // convert to char* array std::vector pOutputs; for (size_t i = 0; i < outputs.size(); ++i) { pOutputs.push_back(&outputs[i][0]); @@ -156,6 +236,8 @@ OsdGLSLTransformFeedbackKernelBundle::Compile(int numVertexElements, int numVary _uniformVertexOffset = glGetUniformLocation(_program, "vertexOffset"); _uniformTableOffset = glGetUniformLocation(_program, "tableOffset"); _uniformIndexStart = glGetUniformLocation(_program, "indexStart"); + _uniformVertexBaseOffset = glGetUniformLocation(_program, "vertexBaseOffset"); + _uniformVaryingBaseOffset = glGetUniformLocation(_program, "varyingBaseOffset"); _uniformTables[FarSubdivisionTables::F_IT] = glGetUniformLocation(_program, "_F0_IT"); _uniformTables[FarSubdivisionTables::F_ITa] = glGetUniformLocation(_program, "_F0_ITa"); @@ -181,32 +263,44 @@ OsdGLSLTransformFeedbackKernelBundle::Compile(int numVertexElements, int numVary void OsdGLSLTransformFeedbackKernelBundle::transformGpuBufferData( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) const 
{ + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) const { int count = end - start; if (count <= 0) return; // set batch range glUniform1i(_uniformIndexStart, start); - glUniform1i(_uniformVertexOffset, vertexOffset); + glUniform1i(_uniformVertexOffset, offset); glUniform1i(_uniformTableOffset, tableOffset); + // XXX: end is not used here now OSD_DEBUG_CHECK_GL_ERROR("Uniform index set at offset=%d. start=%d\n", - vertexOffset, start); + offset, start); + + int vertexOrigin = vertexOffset - _vertexOffsetMod; + int varyingOrigin = varyingOffset - _varyingOffsetMod; // set transform feedback buffer - if (vertexBuffer) { - int vertexStride = numVertexElements*sizeof(float); + if (_interleaved) { + int vertexStride = _vertexStride*sizeof(float); glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, 0, vertexBuffer, - (start + vertexOffset)*vertexStride, count*vertexStride); - } - - if (varyingBuffer){ - int varyingStride = numVaryingElements*sizeof(float); - glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, 1, varyingBuffer, - (start + vertexOffset)*varyingStride, count*varyingStride); + (start + offset)*vertexStride + vertexOrigin*sizeof(float), + count*vertexStride); + } else { + if (vertexBuffer) { + int vertexStride = _vertexStride*sizeof(float); + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, 0, vertexBuffer, + (start + offset)*vertexStride + vertexOrigin*sizeof(float), + count*vertexStride); + } + if (varyingBuffer){ + int varyingStride = _varyingStride*sizeof(float); + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, 1, varyingBuffer, + (start + offset)*varyingStride + varyingOrigin*sizeof(float), + count*varyingStride); + } } OSD_DEBUG_CHECK_GL_ERROR("transformGpuBufferData glBindBufferRange\n"); @@ -222,142 +316,138 @@ OsdGLSLTransformFeedbackKernelBundle::transformGpuBufferData( glEndTransformFeedback(); glBindBuffer(GL_TRANSFORM_FEEDBACK_BUFFER, 0); - - GLsync sync = 
glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glWaitSync(sync, 0, GL_TIMEOUT_IGNORED); - glDeleteSync(sync); } void OsdGLSLTransformFeedbackKernelBundle::ApplyBilinearFaceVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) { + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeFace); - transformGpuBufferData(vertexBuffer, numVertexElements, - varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyBilinearEdgeVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) { + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeBilinearEdge); - transformGpuBufferData(vertexBuffer, numVertexElements, - varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyBilinearVertexVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) { + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeVertex); - transformGpuBufferData(vertexBuffer, numVertexElements, - 
varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkFaceVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) { + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeFace); - transformGpuBufferData(vertexBuffer, numVertexElements, - varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkEdgeVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) { + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeEdge); - transformGpuBufferData(vertexBuffer, numVertexElements, - varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkVertexVerticesKernelB( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) { + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, 
&_subComputeCatmarkVertexB); - transformGpuBufferData(vertexBuffer, numVertexElements, - varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkVertexVerticesKernelA( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end, bool pass) { + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end, bool pass) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeVertexA); glUniform1i(_uniformVertexPass, pass ? 1 : 0); - transformGpuBufferData(vertexBuffer, numVertexElements, - varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyLoopEdgeVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) { + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeEdge); - transformGpuBufferData(vertexBuffer, numVertexElements, - varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyLoopVertexVerticesKernelB( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) { + GLuint vertexBuffer, GLuint 
varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeLoopVertexB); - transformGpuBufferData(vertexBuffer, numVertexElements, - varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyLoopVertexVerticesKernelA( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end, bool pass) { + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end, bool pass) { glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeVertexA); glUniform1i(_uniformVertexPass, pass ? 1 : 0); - transformGpuBufferData(vertexBuffer, numVertexElements, - varyingBuffer, numVaryingElements, - vertexOffset, tableOffset, start, end); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); } void OsdGLSLTransformFeedbackKernelBundle::ApplyEditAdd( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, int primvarOffset, int primvarWidth, - int vertexOffset, int tableOffset, int start, int end) { + int offset, int tableOffset, int start, int end) { if (end - start <= 0) return; glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subEditAdd); @@ -365,15 +455,19 @@ OsdGLSLTransformFeedbackKernelBundle::ApplyEditAdd( glUniform1i(_uniformEditPrimVarWidth, primvarWidth); glUniform1i(_uniformIndexStart, start); - glUniform1i(_uniformVertexOffset, vertexOffset); + glUniform1i(_uniformVertexOffset, offset); glUniform1i(_uniformTableOffset, tableOffset); 
glDrawArrays(GL_POINTS, 0, end - start); } void -OsdGLSLTransformFeedbackKernelBundle::UseProgram() const +OsdGLSLTransformFeedbackKernelBundle::UseProgram(int vertexBaseOffset, + int varyingBaseOffset) const { glUseProgram(_program); + + glUniform1i(_uniformVertexBaseOffset, vertexBaseOffset); + glUniform1i(_uniformVaryingBaseOffset, varyingBaseOffset); } diff --git a/opensubdiv/osd/glslTransformFeedbackKernelBundle.h b/opensubdiv/osd/glslTransformFeedbackKernelBundle.h index af25b522..f9817ddb 100644 --- a/opensubdiv/osd/glslTransformFeedbackKernelBundle.h +++ b/opensubdiv/osd/glslTransformFeedbackKernelBundle.h @@ -44,65 +44,67 @@ public: ~OsdGLSLTransformFeedbackKernelBundle(); - bool Compile(int numVertexElements, int numVaryingElements); + bool Compile(OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + bool interleaved); void ApplyBilinearFaceVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); void ApplyBilinearEdgeVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); void ApplyBilinearVertexVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); void ApplyCatmarkFaceVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int 
vertexOffset, int tableOffset, int start, int end); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); void ApplyCatmarkEdgeVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); void ApplyCatmarkVertexVerticesKernelB( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); void ApplyCatmarkVertexVerticesKernelA( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end, bool pass); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end, bool pass); void ApplyLoopEdgeVerticesKernel( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); void ApplyLoopVertexVerticesKernelB( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); void ApplyLoopVertexVerticesKernelA( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int 
tableOffset, int start, int end, bool pass); + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end, bool pass); void ApplyEditAdd( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, int primvarOffset, int primvarWidth, - int vertexOffset, int tableOffset, int start, int end); + int offset, int tableOffset, int start, int end); - void UseProgram() const; + void UseProgram(int vertexBaseOffset, int varyingBaseOffset) const; GLint GetTableUniformLocation(int tableIndex) const { return _uniformTables[tableIndex]; @@ -124,26 +126,35 @@ public: } struct Match { - /// Constructor - Match(int numVertexElements, int numVaryingElements) - : vdesc(numVertexElements, numVaryingElements) { + Match(OsdVertexBufferDescriptor const &vertex, + OsdVertexBufferDescriptor const &varying, + bool interleaved) + : vertexDesc(vertex), varyingDesc(varying), interleaved(interleaved) { } bool operator() (OsdGLSLTransformFeedbackKernelBundle const *kernel) { - return vdesc == kernel->_vdesc; + // offset is dynamic. 
just comparing length and stride here, + // returns true if they are equal + return (vertexDesc.length == kernel->_numVertexElements and + vertexDesc.stride == kernel->_vertexStride and + varyingDesc.length == kernel->_numVaryingElements and + varyingDesc.stride == kernel->_varyingStride and + interleaved == kernel->_interleaved); } - OsdVertexDescriptor vdesc; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + bool interleaved; }; friend struct Match; protected: void transformGpuBufferData( - GLuint vertexBuffer, int numVertexElements, - GLuint varyingBuffer, int numVaryingElements, - int vertexOffset, int tableOffset, int start, int end) const; + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) const; GLuint _program; @@ -153,6 +164,8 @@ protected: GLint _uniformVertexOffset; GLint _uniformTableOffset; GLint _uniformIndexStart; + GLint _uniformVertexBaseOffset; + GLint _uniformVaryingBaseOffset; GLint _uniformVertexBuffer; GLint _uniformVaryingBuffer; @@ -182,7 +195,14 @@ protected: GLuint _subEditAdd; // hedit kernel (add) - OsdVertexDescriptor _vdesc; + // kernelbundle discriminators + int _numVertexElements; + int _vertexStride; + int _numVaryingElements; + int _varyingStride; + int _vertexOffsetMod; + int _varyingOffsetMod; + bool _interleaved; }; } // end namespace OPENSUBDIV_VERSION diff --git a/opensubdiv/osd/hlslComputeKernel.hlsl b/opensubdiv/osd/hlslComputeKernel.hlsl index ab1c38ea..f265d894 100644 --- a/opensubdiv/osd/hlslComputeKernel.hlsl +++ b/opensubdiv/osd/hlslComputeKernel.hlsl @@ -32,6 +32,8 @@ cbuffer KernelCB : register( b0 ) { int tableOffset; // offset of subdivision table int indexStart; // start index relative to tableOffset int indexEnd; // end index relative to tableOffset + int vertexBaseOffset; // base vbo offset of the vertex buffer + int varyingBaseOffset; // base vbo offset of the varying buffer bool vertexPass; // 
vertex edit kernel @@ -91,13 +93,15 @@ Vertex readVertex(int index) Vertex v; #if NUM_VERTEX_ELEMENTS > 0 + int vertexIndex = index * VERTEX_STRIDE + vertexBaseOffset; for (int i = 0; i < NUM_VERTEX_ELEMENTS; i++) { - v.vertexData[i] = vertexBuffer[index*NUM_VERTEX_ELEMENTS+i]; + v.vertexData[i] = vertexBuffer[vertexIndex + i]; } #endif #if NUM_VARYING_ELEMENTS > 0 + int varyingIndex = index * VARYING_STRIDE + varyingBaseOffset; for (int i = 0; i < NUM_VARYING_ELEMENTS; i++) { - v.varyingData[i] = varyingBuffer[index*NUM_VARYING_ELEMENTS+i]; + v.varyingData[i] = varyingBuffer[varyingIndex + i]; } #endif return v; @@ -106,13 +110,15 @@ Vertex readVertex(int index) void writeVertex(int index, Vertex v) { #if NUM_VERTEX_ELEMENTS > 0 + int vertexIndex = index * VERTEX_STRIDE + vertexBaseOffset; for (int i = 0; i < NUM_VERTEX_ELEMENTS; i++) { - vertexBuffer[index*NUM_VERTEX_ELEMENTS+i] = v.vertexData[i]; + vertexBuffer[vertexIndex + i] = v.vertexData[i]; } #endif #if NUM_VARYING_ELEMENTS > 0 + int varyingIndex = index * VARYING_STRIDE + varyingBaseOffset; for (int i = 0; i < NUM_VARYING_ELEMENTS; i++) { - varyingBuffer[index*NUM_VARYING_ELEMENTS+i] = v.varyingData[i]; + varyingBuffer[varyingIndex + i] = v.varyingData[i]; } #endif } diff --git a/opensubdiv/osd/mesh.h b/opensubdiv/osd/mesh.h index c9f19fd6..7df546e2 100644 --- a/opensubdiv/osd/mesh.h +++ b/opensubdiv/osd/mesh.h @@ -33,6 +33,7 @@ #include "../hbr/mesh.h" #include "../osd/vertex.h" +#include "../osd/vertexDescriptor.h" #include @@ -68,6 +69,10 @@ public: virtual void Refine() = 0; + virtual void Refine(OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc, + bool interleaved) = 0; + virtual void Synchronize() = 0; virtual DrawContext * GetDrawContext() = 0; @@ -158,6 +163,13 @@ public: virtual void Refine() { _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), _vertexBuffer, _varyingBuffer); } + virtual void Refine(OsdVertexBufferDescriptor const 
*vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { + _computeController->Refine(_computeContext, _farMesh->GetKernelBatches(), + _vertexBuffer, _varyingBuffer, + vertexDesc, varyingDesc); + } + virtual void Synchronize() { _computeController->Synchronize(); } diff --git a/opensubdiv/osd/ompComputeController.cpp b/opensubdiv/osd/ompComputeController.cpp index d69a7a60..64768e18 100644 --- a/opensubdiv/osd/ompComputeController.cpp +++ b/opensubdiv/osd/ompComputeController.cpp @@ -34,8 +34,7 @@ namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { -OsdOmpComputeController::OsdOmpComputeController(int numThreads) : - _currentVertexBuffer(NULL), _currentVaryingBuffer(NULL) { +OsdOmpComputeController::OsdOmpComputeController(int numThreads) { _numThreads = (numThreads == -1) ? omp_get_max_threads() : numThreads; } @@ -48,7 +47,8 @@ OsdOmpComputeController::ApplyBilinearFaceVerticesKernel( assert(context); OsdOmpComputeFace( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::F_IT)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::F_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -61,7 +61,8 @@ OsdOmpComputeController::ApplyBilinearEdgeVerticesKernel( assert(context); OsdOmpComputeBilinearEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -73,7 +74,8 @@ OsdOmpComputeController::ApplyBilinearVertexVerticesKernel( assert(context); OsdOmpComputeBilinearVertex( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, 
+ _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -85,7 +87,8 @@ OsdOmpComputeController::ApplyCatmarkFaceVerticesKernel( assert(context); OsdOmpComputeFace( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::F_IT)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::F_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -98,7 +101,8 @@ OsdOmpComputeController::ApplyCatmarkEdgeVerticesKernel( assert(context); OsdOmpComputeEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::E_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -111,7 +115,8 @@ OsdOmpComputeController::ApplyCatmarkVertexVerticesKernelB( assert(context); OsdOmpComputeVertexB( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), @@ -125,7 +130,8 @@ OsdOmpComputeController::ApplyCatmarkVertexVerticesKernelA1( assert(context); OsdOmpComputeVertexA( - _vdesc, 
_currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); @@ -138,7 +144,8 @@ OsdOmpComputeController::ApplyCatmarkVertexVerticesKernelA2( assert(context); OsdOmpComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); @@ -151,7 +158,8 @@ OsdOmpComputeController::ApplyLoopEdgeVerticesKernel( assert(context); OsdOmpComputeEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::E_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -164,7 +172,8 @@ OsdOmpComputeController::ApplyLoopVertexVerticesKernelB( assert(context); OsdOmpComputeLoopVertexB( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), 
@@ -178,7 +187,8 @@ OsdOmpComputeController::ApplyLoopVertexVerticesKernelA1( assert(context); OsdOmpComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); @@ -191,7 +201,8 @@ OsdOmpComputeController::ApplyLoopVertexVerticesKernelA2( assert(context); OsdOmpComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); @@ -210,8 +221,8 @@ OsdOmpComputeController::ApplyVertexEdits( const OsdCpuTable * editValues = edit->GetEditValues(); if (edit->GetOperation() == FarVertexEdit::Add) { - OsdOmpEditVertexAdd(_vdesc, - _currentVertexBuffer, + OsdOmpEditVertexAdd(_currentBindState.vertexBuffer, + _currentBindState.vertexDesc, edit->GetPrimvarOffset(), edit->GetPrimvarWidth(), batch.GetVertexOffset(), @@ -221,8 +232,8 @@ OsdOmpComputeController::ApplyVertexEdits( static_cast(primvarIndices->GetBuffer()), static_cast(editValues->GetBuffer())); } else if (edit->GetOperation() == FarVertexEdit::Set) { - OsdOmpEditVertexSet(_vdesc, - _currentVertexBuffer, + OsdOmpEditVertexSet(_currentBindState.vertexBuffer, + _currentBindState.vertexDesc, edit->GetPrimvarOffset(), edit->GetPrimvarWidth(), batch.GetVertexOffset(), diff --git a/opensubdiv/osd/ompComputeController.h b/opensubdiv/osd/ompComputeController.h index 81c90bf1..507aaa9d 
100644 --- a/opensubdiv/osd/ompComputeController.h +++ b/opensubdiv/osd/ompComputeController.h @@ -29,6 +29,7 @@ #include "../far/dispatcher.h" #include "../osd/cpuComputeContext.h" +#include "../osd/vertexDescriptor.h" #ifdef OPENSUBDIV_HAS_OPENMP #include @@ -69,17 +70,27 @@ public: /// /// @param varyingBuffer varying-interpolated data buffer /// + /// @param vertexDesc the descriptor of vertex elements to be refined. + /// if it's null, all primvars in the vertex buffer + /// will be refined. + /// + /// @param varyingDesc the descriptor of varying elements to be refined. + /// if it's null, all primvars in the varying buffer + /// will be refined. + /// template void Refine(OsdCpuComputeContext const *context, FarKernelBatchVector const & batches, VERTEX_BUFFER * vertexBuffer, - VARYING_BUFFER * varyingBuffer) { + VARYING_BUFFER * varyingBuffer, + OsdVertexBufferDescriptor const *vertexDesc=NULL, + OsdVertexBufferDescriptor const *varyingDesc=NULL) { if (batches.empty()) return; omp_set_num_threads(_numThreads); - bind(vertexBuffer, varyingBuffer); + bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc); FarDispatcher::Refine(this, context, batches, /*maxlevel*/-1); @@ -137,24 +148,60 @@ protected: void ApplyVertexEdits(FarKernelBatch const &batch, ComputeContext const *context) const; template - void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying) { + void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, + OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { - _currentVertexBuffer = vertex ? vertex->BindCpuBuffer() : 0; - _currentVaryingBuffer = varying ? varying->BindCpuBuffer() : 0; + // if the vertex buffer descriptor is specified, use it. + // otherwise, assumes the data is tightly packed in the vertex buffer. + if (vertexDesc) { + _currentBindState.vertexDesc = *vertexDesc; + } else { + int numElements = vertex ? 
vertex->GetNumElements() : 0; + _currentBindState.vertexDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + if (varyingDesc) { + _currentBindState.varyingDesc = *varyingDesc; + } else { + int numElements = varying ? varying->GetNumElements() : 0; + _currentBindState.varyingDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } - int numVertexElements = vertex ? vertex->GetNumElements() : 0; - int numVaryingElements = varying ? varying->GetNumElements() : 0; - _vdesc.Set(numVertexElements, numVaryingElements); + // apply vertex offset here + if (vertex) { + _currentBindState.vertexBuffer = + vertex->BindCpuBuffer() + _currentBindState.vertexDesc.offset; + } else { + _currentBindState.vertexBuffer = NULL; + } + if (varying) { + _currentBindState.varyingBuffer = + varying->BindCpuBuffer() + _currentBindState.varyingDesc.offset; + } else { + _currentBindState.varyingBuffer = NULL; + } } void unbind() { - _currentVertexBuffer = 0; - _currentVaryingBuffer = 0; - _vdesc.Reset(); + _currentBindState.Reset(); } private: - float *_currentVertexBuffer, *_currentVaryingBuffer; - OsdVertexDescriptor _vdesc; + struct BindState { + BindState() : vertexBuffer(NULL), varyingBuffer(NULL) {} + void Reset() { + vertexBuffer = varyingBuffer = NULL; + vertexDesc.Reset(); + varyingDesc.Reset(); + } + float *vertexBuffer; + float *varyingBuffer; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + }; + + BindState _currentBindState; int _numThreads; }; diff --git a/opensubdiv/osd/ompKernel.cpp b/opensubdiv/osd/ompKernel.cpp index 2e61005c..257a00b2 100644 --- a/opensubdiv/osd/ompKernel.cpp +++ b/opensubdiv/osd/ompKernel.cpp @@ -25,40 +25,94 @@ #include "../osd/ompKernel.h" #include "../osd/vertexDescriptor.h" -#include +#include +#include +#include #include namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { +static inline void +clear(float *dst, OsdVertexBufferDescriptor const &desc) { + + if (dst) { + memset(dst, 0, 
desc.length*sizeof(float)); + } +} + +static inline void +addWithWeight(float *dst, const float *srcOrigin, int srcIndex, float weight, + OsdVertexBufferDescriptor const &desc) { + + if (srcOrigin && dst) { + const float *src = srcOrigin + srcIndex * desc.stride; + for (int k = 0; k < desc.length; ++k) { + dst[k] += src[k] * weight; + } + } +} + +static inline void +copy(float *dstOrigin, const float *src, int dstIndex, + OsdVertexBufferDescriptor const &desc) { + + if (dstOrigin && src) { + float *dst = dstOrigin + dstIndex * desc.stride; + memcpy(dst, src, desc.length*sizeof(float)); + } +} + void OsdOmpComputeFace( - OsdVertexDescriptor const &vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *F_IT, const int *F_ITa, int offset, int tableOffset, int start, int end) { + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + #pragma omp parallel for for (int i = start + tableOffset; i < end + tableOffset; i++) { int h = F_ITa[2*i]; int n = F_ITa[2*i+1]; float weight = 1.0f/n; - - // XXX: should use local vertex struct variable instead of - // accumulating directly into global memory. 
int dstIndex = offset + i - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); + + int threadId = omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + varyingDesc.length * threadId; + + // clear + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); for (int j = 0; j < n; ++j) { int index = F_IT[h+j]; - vdesc.AddWithWeight(vertex, dstIndex, index, weight); - vdesc.AddVaryingWithWeight(varying, dstIndex, index, weight); + addWithWeight(vertexResults, vertex, index, weight, vertexDesc); + addWithWeight(varyingResults, varying, index, weight, varyingDesc); } + + // write results + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); } } void OsdOmpComputeEdge( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, const float *E_W, int offset, int tableOffset, int start, int end) { + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + #pragma omp parallel for for (int i = start + tableOffset; i < end + tableOffset; i++) { int eidx0 = E_IT[4*i+0]; @@ -67,30 +121,47 @@ void OsdOmpComputeEdge( int eidx3 = E_IT[4*i+3]; float vertWeight = E_W[i*2+0]; - int dstIndex = offset + i - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); - vdesc.AddWithWeight(vertex, dstIndex, eidx0, vertWeight); - vdesc.AddWithWeight(vertex, dstIndex, eidx1, vertWeight); + int threadId = omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + varyingDesc.length * threadId; + + // clear + 
clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, eidx0, vertWeight, vertexDesc); + addWithWeight(vertexResults, vertex, eidx1, vertWeight, vertexDesc); if (eidx2 != -1) { float faceWeight = E_W[i*2+1]; - vdesc.AddWithWeight(vertex, dstIndex, eidx2, faceWeight); - vdesc.AddWithWeight(vertex, dstIndex, eidx3, faceWeight); + addWithWeight(vertexResults, vertex, eidx2, faceWeight, vertexDesc); + addWithWeight(vertexResults, vertex, eidx3, faceWeight, vertexDesc); } - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx0, 0.5f); - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx1, 0.5f); + addWithWeight(varyingResults, varying, eidx0, 0.5f, varyingDesc); + addWithWeight(varyingResults, varying, eidx1, 0.5f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); } } void OsdOmpComputeVertexA( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const float *V_W, int offset, int tableOffset, int start, int end, int pass) { + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + #pragma omp parallel for for (int i = start + tableOffset; i < end + tableOffset; i++) { int n = V_ITa[5*i+1]; @@ -107,27 +178,47 @@ void OsdOmpComputeVertexA( weight = 1.0f - weight; int dstIndex = offset + i - tableOffset; - if (not pass) - vdesc.Clear(vertex, varying, dstIndex); - if (eidx0 == -1 || (pass == 0 && (n == -1))) { - vdesc.AddWithWeight(vertex, dstIndex, p, weight); - } else { - vdesc.AddWithWeight(vertex, dstIndex, p, weight * 0.75f); - vdesc.AddWithWeight(vertex, dstIndex, eidx0, weight * 0.125f); - 
vdesc.AddWithWeight(vertex, dstIndex, eidx1, weight * 0.125f); + int threadId = omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + varyingDesc.length * threadId; + + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + if (pass) { + // copy previous results + addWithWeight(vertexResults, vertex, dstIndex, 1.0f, vertexDesc); } - if (not pass) - vdesc.AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + if (eidx0 == -1 || (pass == 0 && (n == -1))) { + addWithWeight(vertexResults, vertex, p, weight, vertexDesc); + } else { + addWithWeight(vertexResults, vertex, p, weight * 0.75f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx0, weight * 0.125f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx1, weight * 0.125f, vertexDesc); + } + + copy(vertex, vertexResults, dstIndex, vertexDesc); + if (not pass) { + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } } } void OsdOmpComputeVertexB( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int offset, int tableOffset, int start, int end) { + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + #pragma omp parallel for for (int i = start + tableOffset; i < end + tableOffset; i++) { int h = V_ITa[5*i]; @@ -139,23 +230,40 @@ void OsdOmpComputeVertexB( float wv = (n-2.0f) * n * wp; int dstIndex = offset + i - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); - vdesc.AddWithWeight(vertex, dstIndex, p, weight * wv); + int threadId = 
omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + varyingDesc.length * threadId; + + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, p, weight * wv, vertexDesc); for (int j = 0; j < n; ++j) { - vdesc.AddWithWeight(vertex, dstIndex, V_IT[h+j*2], weight * wp); - vdesc.AddWithWeight(vertex, dstIndex, V_IT[h+j*2+1], weight * wp); + addWithWeight(vertexResults, vertex, V_IT[h+j*2], weight * wp, vertexDesc); + addWithWeight(vertexResults, vertex, V_IT[h+j*2+1], weight * wp, vertexDesc); } - vdesc.AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); } } void OsdOmpComputeLoopVertexB( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end) { + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + #pragma omp parallel for for (int i = start + tableOffset; i < end + tableOffset; i++) { int h = V_ITa[5*i]; @@ -169,82 +277,137 @@ void OsdOmpComputeLoopVertexB( beta = (0.625f - beta) * wp; int dstIndex = i + vertexOffset - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); - vdesc.AddWithWeight(vertex, dstIndex, p, weight * (1.0f - (beta * n))); + int threadId = omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + 
varyingDesc.length * threadId; + + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, p, weight * (1.0f - (beta * n)), vertexDesc); for (int j = 0; j < n; ++j) - vdesc.AddWithWeight(vertex, dstIndex, V_IT[h+j], weight * beta); + addWithWeight(vertexResults, vertex, V_IT[h+j], weight * beta, vertexDesc); - vdesc.AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); } } void OsdOmpComputeBilinearEdge( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, int vertexOffset, int tableOffset, int start, int end) { + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + #pragma omp parallel for for (int i = start + tableOffset; i < end + tableOffset; i++) { int eidx0 = E_IT[2*i+0]; int eidx1 = E_IT[2*i+1]; int dstIndex = i + vertexOffset - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); - vdesc.AddWithWeight(vertex, dstIndex, eidx0, 0.5f); - vdesc.AddWithWeight(vertex, dstIndex, eidx1, 0.5f); + int threadId = omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + varyingDesc.length * threadId; - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx0, 0.5f); - vdesc.AddVaryingWithWeight(varying, dstIndex, eidx1, 0.5f); + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, eidx0, 0.5f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx1, 0.5f, 
vertexDesc); + + addWithWeight(varyingResults, varying, eidx0, 0.5f, varyingDesc); + addWithWeight(varyingResults, varying, eidx1, 0.5f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); } } void OsdOmpComputeBilinearVertex( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, int vertexOffset, int tableOffset, int start, int end) { + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + #pragma omp parallel for for (int i = start + tableOffset; i < end + tableOffset; i++) { int p = V_ITa[i]; int dstIndex = i + vertexOffset - tableOffset; - vdesc.Clear(vertex, varying, dstIndex); - vdesc.AddWithWeight(vertex, dstIndex, p, 1.0f); - vdesc.AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + int threadId = omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + varyingDesc.length * threadId; + + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, p, 1.0f, vertexDesc); + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); } } void OsdOmpEditVertexAdd( - OsdVertexDescriptor const &vdesc, float *vertex, + float * vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, const unsigned int *editIndices, const float *editValues) { #pragma omp parallel for for (int i = start+tableOffset; i < end+tableOffset; 
i++) { - vdesc.ApplyVertexEditAdd(vertex, - primVarOffset, - primVarWidth, - editIndices[i] + vertexOffset, - &editValues[i*primVarWidth]); + + if (vertex) { + int editIndex = editIndices[i] + vertexOffset; + float *dst = vertex + editIndex * vertexDesc.stride + primVarOffset; + // edit i owns primVarWidth consecutive floats in editValues + for (int k = 0; k < primVarWidth; ++k) { + dst[k] += editValues[i*primVarWidth + k]; + } + } } } void OsdOmpEditVertexSet( - OsdVertexDescriptor const &vdesc, float *vertex, + float * vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, const unsigned int *editIndices, const float *editValues) { #pragma omp parallel for for (int i = start+tableOffset; i < end+tableOffset; i++) { - vdesc.ApplyVertexEditSet(vertex, - primVarOffset, - primVarWidth, - editIndices[i] + vertexOffset, - &editValues[i*primVarWidth]); + + if (vertex) { + int editIndex = editIndices[i] + vertexOffset; + float *dst = vertex + editIndex * vertexDesc.stride + primVarOffset; + // edit i owns primVarWidth consecutive floats in editValues + for (int k = 0; k < primVarWidth; ++k) { + dst[k] = editValues[i*primVarWidth + k]; + } + } } } diff --git a/opensubdiv/osd/ompKernel.h b/opensubdiv/osd/ompKernel.h index 4118712f..4477f0fb 100644 --- a/opensubdiv/osd/ompKernel.h +++ b/opensubdiv/osd/ompKernel.h @@ -26,63 +26,73 @@ #define OSD_OMP_KERNEL_H #include "../version.h" +#include "../osd/vertexDescriptor.h" namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { struct OsdVertexDescriptor; -void OsdOmpComputeFace(OsdVertexDescriptor const &vdesc, - float * vertex, float * varying, +void OsdOmpComputeFace(float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *F_IT, const int *F_ITa, int vertexOffset, int tableOffset, int start, int end); -void OsdOmpComputeEdge(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdOmpComputeEdge(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, +
OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, const float *E_ITa, int vertexOffset, int tableOffset, int start, int end); -void OsdOmpComputeVertexA(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdOmpComputeVertexA(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const float *V_IT, int vertexOffset, int tableOffset, int start, int end, int pass); -void OsdOmpComputeVertexB(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdOmpComputeVertexB(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end); -void OsdOmpComputeLoopVertexB(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdOmpComputeLoopVertexB(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const float *V_W, int vertexOffset, int tableOffset, int start, int end); -void OsdOmpComputeBilinearEdge(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdOmpComputeBilinearEdge(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, int vertexOffset, int tableOffset, int start, int end); -void OsdOmpComputeBilinearVertex(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdOmpComputeBilinearVertex(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, int vertexOffset, int tableOffset, int start, int end); -void OsdOmpEditVertexAdd(OsdVertexDescriptor const &vdesc, float *vertex, +void OsdOmpEditVertexAdd(float *vertex, 
+ OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, const unsigned int *editIndices, const float *editValues); -void OsdOmpEditVertexSet(OsdVertexDescriptor const &vdesc, float *vertex, +void OsdOmpEditVertexSet(float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, diff --git a/opensubdiv/osd/tbbComputeController.cpp b/opensubdiv/osd/tbbComputeController.cpp index e19cd741..b8bca97b 100644 --- a/opensubdiv/osd/tbbComputeController.cpp +++ b/opensubdiv/osd/tbbComputeController.cpp @@ -37,9 +37,7 @@ namespace OPENSUBDIV_VERSION { OsdTbbComputeController::OsdTbbComputeController(int numThreads) - : _currentVertexBuffer(NULL), - _currentVaryingBuffer(NULL), - _numThreads(numThreads) { + : _numThreads(numThreads) { if(_numThreads == -1) tbb::task_scheduler_init init; @@ -55,7 +53,8 @@ OsdTbbComputeController::ApplyBilinearFaceVerticesKernel( assert(context); OsdTbbComputeFace( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::F_IT)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::F_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -68,7 +67,8 @@ OsdTbbComputeController::ApplyBilinearEdgeVerticesKernel( assert(context); OsdTbbComputeBilinearEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -80,7 +80,8 @@ 
OsdTbbComputeController::ApplyBilinearVertexVerticesKernel( assert(context); OsdTbbComputeBilinearVertex( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); } @@ -92,7 +93,8 @@ OsdTbbComputeController::ApplyCatmarkFaceVerticesKernel( assert(context); OsdTbbComputeFace( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::F_IT)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::F_ITa)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -105,7 +107,8 @@ OsdTbbComputeController::ApplyCatmarkEdgeVerticesKernel( assert(context); OsdTbbComputeEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::E_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -118,7 +121,8 @@ OsdTbbComputeController::ApplyCatmarkVertexVerticesKernelB( assert(context); OsdTbbComputeVertexB( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), (const 
float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), @@ -132,7 +136,8 @@ OsdTbbComputeController::ApplyCatmarkVertexVerticesKernelA1( assert(context); OsdTbbComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); @@ -145,7 +150,8 @@ OsdTbbComputeController::ApplyCatmarkVertexVerticesKernelA2( assert(context); OsdTbbComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); @@ -158,7 +164,8 @@ OsdTbbComputeController::ApplyLoopEdgeVerticesKernel( assert(context); OsdTbbComputeEdge( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::E_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::E_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); @@ -171,7 +178,8 @@ OsdTbbComputeController::ApplyLoopVertexVerticesKernelB( assert(context); OsdTbbComputeLoopVertexB( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const 
int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), @@ -185,7 +193,8 @@ OsdTbbComputeController::ApplyLoopVertexVerticesKernelA1( assert(context); OsdTbbComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), false); @@ -198,7 +207,8 @@ OsdTbbComputeController::ApplyLoopVertexVerticesKernelA2( assert(context); OsdTbbComputeVertexA( - _vdesc, _currentVertexBuffer, _currentVaryingBuffer, + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), (const float*)context->GetTable(FarSubdivisionTables::V_W)->GetBuffer(), batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); @@ -217,24 +227,24 @@ OsdTbbComputeController::ApplyVertexEdits( const OsdCpuTable * editValues = edit->GetEditValues(); if (edit->GetOperation() == FarVertexEdit::Add) { - OsdTbbEditVertexAdd(_vdesc, - _currentVertexBuffer, + OsdTbbEditVertexAdd(_currentBindState.vertexBuffer, + _currentBindState.vertexDesc, edit->GetPrimvarOffset(), edit->GetPrimvarWidth(), - batch.GetVertexOffset(), - batch.GetTableOffset(), - batch.GetStart(), + batch.GetVertexOffset(), + batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), static_cast(primvarIndices->GetBuffer()), static_cast(editValues->GetBuffer())); } else if (edit->GetOperation() == FarVertexEdit::Set) { - OsdTbbEditVertexSet(_vdesc, - 
_currentVertexBuffer, + OsdTbbEditVertexSet(_currentBindState.vertexBuffer, + _currentBindState.vertexDesc, edit->GetPrimvarOffset(), edit->GetPrimvarWidth(), - batch.GetVertexOffset(), - batch.GetTableOffset(), - batch.GetStart(), + batch.GetVertexOffset(), + batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), static_cast(primvarIndices->GetBuffer()), static_cast(editValues->GetBuffer())); diff --git a/opensubdiv/osd/tbbComputeController.h b/opensubdiv/osd/tbbComputeController.h index 441ee5cc..c2a827ce 100644 --- a/opensubdiv/osd/tbbComputeController.h +++ b/opensubdiv/osd/tbbComputeController.h @@ -29,6 +29,7 @@ #include "../far/dispatcher.h" #include "../osd/cpuComputeContext.h" +#include "../osd/vertexDescriptor.h" namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { @@ -65,13 +66,23 @@ public: /// /// @param varyingBuffer varying-interpolated data buffer /// + /// @param vertexDesc the descriptor of vertex elements to be refined. + /// if it's null, all primvars in the vertex buffer + /// will be refined. + /// + /// @param varyingDesc the descriptor of varying elements to be refined. + /// if it's null, all primvars in the varying buffer + /// will be refined. 
+ /// template void Refine(OsdCpuComputeContext const *context, FarKernelBatchVector const & batches, VERTEX_BUFFER * vertexBuffer, - VARYING_BUFFER * varyingBuffer) { + VARYING_BUFFER * varyingBuffer, + OsdVertexBufferDescriptor const *vertexDesc=NULL, + OsdVertexBufferDescriptor const *varyingDesc=NULL) { - bind(vertexBuffer, varyingBuffer); + bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc); FarDispatcher::Refine(this, context, batches, /*maxlevel*/-1); @@ -128,25 +139,61 @@ protected: void ApplyVertexEdits(FarKernelBatch const &batch, ComputeContext const *context) const; -private: template - void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying) { + void bind(VERTEX_BUFFER *vertex, VARYING_BUFFER *varying, + OsdVertexBufferDescriptor const *vertexDesc, + OsdVertexBufferDescriptor const *varyingDesc) { - _currentVertexBuffer = vertex ? vertex->BindCpuBuffer() : 0; - _currentVaryingBuffer = varying ? varying->BindCpuBuffer() : 0; + // if the vertex buffer descriptor is specified, use it. + // otherwise, assumes the data is tightly packed in the vertex buffer. + if (vertexDesc) { + _currentBindState.vertexDesc = *vertexDesc; + } else { + int numElements = vertex ? vertex->GetNumElements() : 0; + _currentBindState.vertexDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } + if (varyingDesc) { + _currentBindState.varyingDesc = *varyingDesc; + } else { + int numElements = varying ? varying->GetNumElements() : 0; + _currentBindState.varyingDesc = OsdVertexBufferDescriptor( + 0, numElements, numElements); + } - int numVertexElements = vertex ? vertex->GetNumElements() : 0; - int numVaryingElements = varying ? 
varying->GetNumElements() : 0; - _vdesc.Set(numVertexElements, numVaryingElements); + // apply vertex offset here + if (vertex) { + _currentBindState.vertexBuffer = + vertex->BindCpuBuffer() + _currentBindState.vertexDesc.offset; + } else { + _currentBindState.vertexBuffer = NULL; + } + if (varying) { + _currentBindState.varyingBuffer = + varying->BindCpuBuffer() + _currentBindState.varyingDesc.offset; + } else { + _currentBindState.varyingBuffer = NULL; + } } void unbind() { - _currentVertexBuffer = 0; - _currentVaryingBuffer = 0; - _vdesc.Reset(); + _currentBindState.Reset(); } - float *_currentVertexBuffer, *_currentVaryingBuffer; - OsdVertexDescriptor _vdesc; +private: + struct BindState { + BindState() : vertexBuffer(NULL), varyingBuffer(NULL) {} + void Reset() { + vertexBuffer = varyingBuffer = NULL; + vertexDesc.Reset(); + varyingDesc.Reset(); + } + float *vertexBuffer; + float *varyingBuffer; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + }; + + BindState _currentBindState; int _numThreads; }; diff --git a/opensubdiv/osd/tbbKernel.cpp b/opensubdiv/osd/tbbKernel.cpp index eb5916d7..939a7a10 100644 --- a/opensubdiv/osd/tbbKernel.cpp +++ b/opensubdiv/osd/tbbKernel.cpp @@ -34,10 +34,33 @@ namespace OPENSUBDIV_VERSION { #define grain_size 200 +static inline void +clear(float *origin, int index, OsdVertexBufferDescriptor const &desc) { + + if (origin) { + float *dst = origin + index * desc.stride; + memset(dst, 0, desc.length * sizeof(float)); + } +} + +static inline void +addWithWeight(float *origin, int dstIndex, int srcIndex, + float weight, OsdVertexBufferDescriptor const &desc) { + + if (origin) { + const float *src = origin + srcIndex * desc.stride; + float *dst = origin + dstIndex * desc.stride; + for (int k = 0; k < desc.length; ++k) { + dst[k] += src[k] * weight; + } + } +} + class TBBFaceKernel { - OsdVertexDescriptor const *vdesc; float *vertex; float *varying; + OsdVertexBufferDescriptor vertexDesc; + 
OsdVertexBufferDescriptor varyingDesc; int const *F_IT; int const *F_ITa; int vertexOffset; @@ -45,10 +68,10 @@ class TBBFaceKernel { public: void operator() (tbb::blocked_range const &r) const { - if(vdesc->numVertexElements == 4 && varying == NULL) { + if(vertexDesc.length == 4 && varying == NULL) { ComputeFaceKernel<4> (vertex, F_IT, F_ITa, vertexOffset, tableOffset, r.begin(), r.end()); - } else if(vdesc->numVertexElements == 8 && varying == NULL) { + } else if(vertexDesc.length == 8 && varying == NULL) { ComputeFaceKernel<8> (vertex, F_IT, F_ITa, vertexOffset, tableOffset, r.begin(), r.end()); } @@ -62,12 +85,14 @@ public: // XXX: should use local vertex struct variable instead of // accumulating directly into global memory. int dstIndex = i + vertexOffset - tableOffset; - vdesc->Clear(vertex, varying, dstIndex); + + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); for (int j = 0; j < n; ++j) { int index = F_IT[h+j]; - vdesc->AddWithWeight(vertex, dstIndex, index, weight); - vdesc->AddVaryingWithWeight(varying, dstIndex, index, weight); + addWithWeight(vertex, dstIndex, index, weight, vertexDesc); + addWithWeight(varying, dstIndex, index, weight, varyingDesc); } } } @@ -75,25 +100,28 @@ public: TBBFaceKernel(TBBFaceKernel const &other) { - this->vdesc = other.vdesc; this->vertex = other.vertex; this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; this->F_IT = other.F_IT; this->F_ITa = other.F_ITa; this->vertexOffset = other.vertexOffset; this->tableOffset = other.tableOffset; } - TBBFaceKernel(OsdVertexDescriptor const *vdesc_in, - float *vertex_in, + TBBFaceKernel(float *vertex_in, float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, int const *F_IT_in, int const *F_ITa_in, int vertexOffset_in, int tableOffset_in) : - vdesc (vdesc_in), vertex (vertex_in), varying(varying_in), + vertexDesc(vertexDesc_in), + 
varyingDesc(varyingDesc_in), F_IT (F_IT_in), F_ITa (F_ITa_in), vertexOffset(vertexOffset_in), @@ -102,20 +130,23 @@ public: }; void OsdTbbComputeFace( - OsdVertexDescriptor const &vdesc, float * vertex, float * varying, + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *F_IT, int const *F_ITa, int vertexOffset, int tableOffset, int start, int end) { - TBBFaceKernel kernel(&vdesc, vertex, varying, F_IT, F_ITa, + TBBFaceKernel kernel(vertex, varying, vertexDesc, varyingDesc, F_IT, F_ITa, vertexOffset, tableOffset); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } class TBBEdgeKernel { - OsdVertexDescriptor const *vdesc; float *vertex; float *varying; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; int const *E_IT; float const *E_W; int vertexOffset; @@ -123,11 +154,11 @@ class TBBEdgeKernel { public: void operator() (tbb::blocked_range const &r) const { - if(vdesc->numVertexElements == 4 && varying == NULL) { + if(vertexDesc.length == 4 && varying == NULL) { ComputeEdgeKernel<4>(vertex, E_IT, E_W, vertexOffset, tableOffset, r.begin(), r.end()); } - else if(vdesc->numVertexElements == 8 && varying == NULL) { + else if(vertexDesc.length == 8 && varying == NULL) { ComputeEdgeKernel<8>(vertex, E_IT, E_W, vertexOffset, tableOffset, r.begin(), r.end()); } @@ -141,45 +172,49 @@ public: float vertWeight = E_W[i*2+0]; int dstIndex = i + vertexOffset - tableOffset; - vdesc->Clear(vertex, varying, dstIndex); + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); - vdesc->AddWithWeight(vertex, dstIndex, eidx0, vertWeight); - vdesc->AddWithWeight(vertex, dstIndex, eidx1, vertWeight); + addWithWeight(vertex, dstIndex, eidx0, vertWeight, vertexDesc); + addWithWeight(vertex, dstIndex, eidx1, vertWeight, vertexDesc); if (eidx2 != -1) { float faceWeight = E_W[i*2+1]; - vdesc->AddWithWeight(vertex, 
dstIndex, eidx2, faceWeight); - vdesc->AddWithWeight(vertex, dstIndex, eidx3, faceWeight); + addWithWeight(vertex, dstIndex, eidx2, faceWeight, vertexDesc); + addWithWeight(vertex, dstIndex, eidx3, faceWeight, vertexDesc); } - vdesc->AddVaryingWithWeight(varying, dstIndex, eidx0, 0.5f); - vdesc->AddVaryingWithWeight(varying, dstIndex, eidx1, 0.5f); + addWithWeight(varying, dstIndex, eidx0, 0.5f, varyingDesc); + addWithWeight(varying, dstIndex, eidx1, 0.5f, varyingDesc); } } } TBBEdgeKernel(TBBEdgeKernel const &other) { - this->vdesc = other.vdesc; this->vertex = other.vertex; this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; this->E_IT = other.E_IT; this->E_W = other.E_W; this->vertexOffset = other.vertexOffset; this->tableOffset = other.tableOffset; } - TBBEdgeKernel(OsdVertexDescriptor const *vdesc_in, - float *vertex_in, + TBBEdgeKernel(float *vertex_in, float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, int const *E_IT_in, float const *E_W_in, int vertexOffset_in, int tableOffset_in) : - vdesc (vdesc_in), vertex (vertex_in), varying(varying_in), + vertexDesc(vertexDesc_in), + varyingDesc(varyingDesc_in), E_IT (E_IT_in), E_W (E_W_in), vertexOffset(vertexOffset_in), @@ -189,19 +224,22 @@ public: void OsdTbbComputeEdge( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *E_IT, float const *E_W, int vertexOffset, int tableOffset, int start, int end) { tbb::blocked_range range(start, end, grain_size); - TBBEdgeKernel kernel(&vdesc, vertex, varying, E_IT, E_W, + TBBEdgeKernel kernel(vertex, varying, vertexDesc, varyingDesc, E_IT, E_W, vertexOffset, tableOffset); tbb::parallel_for(range, kernel); } class TBBVertexKernelA { - OsdVertexDescriptor const *vdesc; float *vertex; float *varying; + 
OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; int const *V_ITa; float const *V_W; int vertexOffset; @@ -210,11 +248,11 @@ class TBBVertexKernelA { public: void operator() (tbb::blocked_range const &r) const { - if(vdesc->numVertexElements == 4 && varying == NULL) { + if(vertexDesc.length == 4 && varying == NULL) { ComputeVertexAKernel<4>(vertex, V_ITa, V_W, vertexOffset, tableOffset, r.begin(), r.end(), pass); } - else if (vdesc->numVertexElements == 8 && varying == NULL) { + else if (vertexDesc.length == 8 && varying == NULL) { ComputeVertexAKernel<8>(vertex, V_ITa, V_W, vertexOffset, tableOffset, r.begin(), r.end(), pass); } @@ -235,28 +273,31 @@ public: int dstIndex = i + vertexOffset - tableOffset; - if (not pass) - vdesc->Clear(vertex, varying, dstIndex); + if (not pass) { + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); + } if (eidx0 == -1 || (pass == 0 && (n == -1))) { - vdesc->AddWithWeight(vertex, dstIndex, p, weight); + addWithWeight(vertex, dstIndex, p, weight, vertexDesc); } else { - vdesc->AddWithWeight(vertex, dstIndex, p, weight * 0.75f); - vdesc->AddWithWeight(vertex, dstIndex, eidx0, weight * 0.125f); - vdesc->AddWithWeight(vertex, dstIndex, eidx1, weight * 0.125f); + addWithWeight(vertex, dstIndex, p, weight * 0.75f, vertexDesc); + addWithWeight(vertex, dstIndex, eidx0, weight * 0.125f, vertexDesc); + addWithWeight(vertex, dstIndex, eidx1, weight * 0.125f, vertexDesc); } if (not pass) - vdesc->AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + addWithWeight(varying, dstIndex, p, 1.0f, varyingDesc); } } } TBBVertexKernelA(TBBVertexKernelA const &other) { - this->vdesc = other.vdesc; this->vertex = other.vertex; this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; this->V_ITa = other.V_ITa; this->V_W = other.V_W; this->vertexOffset = other.vertexOffset; @@ -264,17 +305,19 @@ public: this->pass = other.pass; } - 
TBBVertexKernelA(OsdVertexDescriptor const *vdesc_in, - float *vertex_in, + TBBVertexKernelA(float *vertex_in, float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, int const *V_ITa_in, float const *V_W_in, int vertexOffset_in, int tableOffset_in, int pass_in) : - vdesc (vdesc_in), vertex (vertex_in), varying(varying_in), + vertexDesc(vertexDesc_in), + varyingDesc(varyingDesc_in), V_ITa (V_ITa_in), V_W (V_W_in), vertexOffset(vertexOffset_in), @@ -284,19 +327,23 @@ public: }; void OsdTbbComputeVertexA( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *V_ITa, float const *V_W, int vertexOffset, int tableOffset, int start, int end, int pass) { tbb::blocked_range range(start, end, grain_size); - TBBVertexKernelA kernel(&vdesc, vertex, varying, V_ITa, V_W, + TBBVertexKernelA kernel(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_W, vertexOffset, tableOffset, pass); tbb::parallel_for(range, kernel); } class TBBVertexKernelB { - OsdVertexDescriptor const *vdesc; float *vertex; float *varying; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; int const *V_ITa; int const *V_IT; float const *V_W; @@ -305,11 +352,11 @@ class TBBVertexKernelB { public: void operator() (tbb::blocked_range const &r) const { - if(vdesc->numVertexElements == 4 && varying == NULL) { + if(vertexDesc.length == 4 && varying == NULL) { ComputeVertexBKernel<4>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, r.begin(), r.end()); } - else if(vdesc->numVertexElements == 8 && varying == NULL) { + else if(vertexDesc.length == 8 && varying == NULL) { ComputeVertexBKernel<8>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, r.begin(), r.end()); } @@ -324,24 +371,26 @@ public: float wv = (n-2.0f) * n * wp; int dstIndex = i + vertexOffset - 
tableOffset; - vdesc->Clear(vertex, varying, dstIndex); + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); - vdesc->AddWithWeight(vertex, dstIndex, p, weight * wv); + addWithWeight(vertex, dstIndex, p, weight * wv, vertexDesc); for (int j = 0; j < n; ++j) { - vdesc->AddWithWeight(vertex, dstIndex, V_IT[h+j*2], weight * wp); - vdesc->AddWithWeight(vertex, dstIndex, V_IT[h+j*2+1], weight * wp); + addWithWeight(vertex, dstIndex, V_IT[h+j*2], weight * wp, vertexDesc); + addWithWeight(vertex, dstIndex, V_IT[h+j*2+1], weight * wp, vertexDesc); } - vdesc->AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + addWithWeight(varying, dstIndex, p, 1.0f, varyingDesc); } } } TBBVertexKernelB(TBBVertexKernelB const &other) { - this->vdesc = other.vdesc; this->vertex = other.vertex; this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; this->V_ITa = other.V_ITa; this->V_IT = other.V_IT; this->V_W = other.V_W; @@ -349,17 +398,19 @@ public: this->tableOffset = other.tableOffset; } - TBBVertexKernelB(OsdVertexDescriptor const *vdesc_in, - float *vertex_in, + TBBVertexKernelB(float *vertex_in, float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, int const *V_ITa_in, int const *V_IT_in, float const *V_W_in, int vertexOffset_in, int tableOffset_in) : - vdesc (vdesc_in), vertex (vertex_in), varying(varying_in), + vertexDesc(vertexDesc_in), + varyingDesc(varyingDesc_in), V_ITa (V_ITa_in), V_IT (V_IT_in), V_W (V_W_in), @@ -369,20 +420,24 @@ public: }; void OsdTbbComputeVertexB( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *V_ITa, int const *V_IT, float const *V_W, int vertexOffset, int tableOffset, int start, int end) { tbb::blocked_range range(start, end, grain_size); - 
TBBVertexKernelB kernel(&vdesc, vertex, varying, V_ITa, V_IT, V_W, + TBBVertexKernelB kernel(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_IT, V_W, vertexOffset, tableOffset); tbb::parallel_for(range, kernel); } class TBBLoopVertexKernelB { - OsdVertexDescriptor const *vdesc; float *vertex; float *varying; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; int const *V_ITa; int const *V_IT; float const *V_W; @@ -391,11 +446,11 @@ class TBBLoopVertexKernelB { public: void operator() (tbb::blocked_range const &r) const { - if(vdesc->numVertexElements == 4 && varying == NULL) { + if(vertexDesc.length == 4 && varying == NULL) { ComputeLoopVertexBKernel<4>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, r.begin(), r.end()); } - else if(vdesc->numVertexElements == 8 && varying == NULL) { + else if(vertexDesc.length == 8 && varying == NULL) { ComputeLoopVertexBKernel<8>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, r.begin(), r.end()); } @@ -412,23 +467,25 @@ public: beta = (0.625f - beta) * wp; int dstIndex = i + vertexOffset - tableOffset; - vdesc->Clear(vertex, varying, dstIndex); + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); - vdesc->AddWithWeight(vertex, dstIndex, p, weight * (1.0f - (beta * n))); + addWithWeight(vertex, dstIndex, p, weight * (1.0f - (beta * n)), vertexDesc); for (int j = 0; j < n; ++j) - vdesc->AddWithWeight(vertex, dstIndex, V_IT[h+j], weight * beta); + addWithWeight(vertex, dstIndex, V_IT[h+j], weight * beta, vertexDesc); - vdesc->AddVaryingWithWeight(varying, dstIndex, p, 1.0f); + addWithWeight(varying, dstIndex, p, 1.0f, varyingDesc); } } } TBBLoopVertexKernelB(TBBLoopVertexKernelB const &other) { - this->vdesc = other.vdesc; this->vertex = other.vertex; this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; this->V_ITa = other.V_ITa; this->V_IT = other.V_IT; this->V_W = other.V_W; @@ -436,17 +493,19 @@ 
public: this->tableOffset = other.tableOffset; } - TBBLoopVertexKernelB(OsdVertexDescriptor const *vdesc_in, - float *vertex_in, + TBBLoopVertexKernelB(float *vertex_in, float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, int const *V_ITa_in, int const *V_IT_in, float const *V_W_in, int vertexOffset_in, int tableOffset_in) : - vdesc (vdesc_in), vertex (vertex_in), varying(varying_in), + vertexDesc(vertexDesc_in), + varyingDesc(varyingDesc_in), V_ITa (V_ITa_in), V_IT (V_IT_in), V_W (V_W_in), @@ -456,31 +515,35 @@ public: }; void OsdTbbComputeLoopVertexB( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *V_ITa, int const *V_IT, float const *V_W, int vertexOffset, int tableOffset, int start, int end) { tbb::blocked_range range(start, end, grain_size); - TBBLoopVertexKernelB kernel(&vdesc, vertex, varying, V_ITa, V_IT, V_W, + TBBLoopVertexKernelB kernel(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_IT, V_W, vertexOffset, tableOffset); tbb::parallel_for(range, kernel); } class TBBBilinearEdgeKernel { - OsdVertexDescriptor const *vdesc; float *vertex; float *varying; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; int const *E_IT; int vertexOffset; int tableOffset; public: void operator() (tbb::blocked_range const &r) const { - if(vdesc->numVertexElements == 4 && varying == NULL) { + if(vertexDesc.length == 4 && varying == NULL) { ComputeBilinearEdgeKernel<4>(vertex, E_IT, vertexOffset, tableOffset, r.begin(), r.end()); } - else if(vdesc->numVertexElements == 8 && varying == NULL) { + else if(vertexDesc.length == 8 && varying == NULL) { ComputeBilinearEdgeKernel<8>(vertex, E_IT, vertexOffset, tableOffset, r.begin(), r.end()); } @@ -490,36 +553,40 @@ public: int eidx1 = E_IT[2*i+1]; int dstIndex = i + 
vertexOffset - tableOffset; - vdesc->Clear(vertex, varying, dstIndex); + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); - vdesc->AddWithWeight(vertex, dstIndex, eidx0, 0.5f); - vdesc->AddWithWeight(vertex, dstIndex, eidx1, 0.5f); + addWithWeight(vertex, dstIndex, eidx0, 0.5f, vertexDesc); + addWithWeight(vertex, dstIndex, eidx1, 0.5f, vertexDesc); - vdesc->AddVaryingWithWeight(varying, dstIndex, eidx0, 0.5f); - vdesc->AddVaryingWithWeight(varying, dstIndex, eidx1, 0.5f); + addWithWeight(varying, dstIndex, eidx0, 0.5f, varyingDesc); + addWithWeight(varying, dstIndex, eidx1, 0.5f, varyingDesc); } } } TBBBilinearEdgeKernel(TBBBilinearEdgeKernel const &other) { - this->vdesc = other.vdesc; this->vertex = other.vertex; this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; this->E_IT = other.E_IT; this->vertexOffset = other.vertexOffset; this->tableOffset = other.tableOffset; } - TBBBilinearEdgeKernel(OsdVertexDescriptor const *vdesc_in, - float *vertex_in, + TBBBilinearEdgeKernel(float *vertex_in, float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, int const *E_IT_in, int vertexOffset_in, int tableOffset_in) : - vdesc (vdesc_in), vertex (vertex_in), varying(varying_in), + vertexDesc(vertexDesc_in), + varyingDesc(varyingDesc_in), E_IT (E_IT_in), vertexOffset(vertexOffset_in), tableOffset(tableOffset_in) @@ -527,25 +594,29 @@ public: }; void OsdTbbComputeBilinearEdge( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *E_IT, int vertexOffset, int tableOffset, int start, int end) { tbb::blocked_range range(start, end, grain_size); - TBBBilinearEdgeKernel kernel(&vdesc, vertex, varying, E_IT, vertexOffset, tableOffset); + TBBBilinearEdgeKernel kernel(vertex, varying, 
vertexDesc, varyingDesc, + E_IT, vertexOffset, tableOffset); tbb::parallel_for(range, kernel); } class TBBBilinearVertexKernel { - OsdVertexDescriptor const *vdesc; float *vertex; float *varying; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; int const *V_ITa; int vertexOffset; int tableOffset; public: void operator() (tbb::blocked_range const &r) const { - int numVertexElements = vdesc->numVertexElements; - int numVaryingElements = vdesc->numVaryingElements; + int numVertexElements = vertexDesc.length; + int numVaryingElements = varyingDesc.length; float *src, *des; for (int i = r.begin() + tableOffset; i < r.end() + tableOffset; i++) { int p = V_ITa[i]; @@ -564,23 +635,26 @@ public: TBBBilinearVertexKernel(TBBBilinearVertexKernel const &other) { - this->vdesc = other.vdesc; this->vertex = other.vertex; this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; this->V_ITa = other.V_ITa; this->vertexOffset = other.vertexOffset; this->tableOffset = other.tableOffset; } - TBBBilinearVertexKernel(OsdVertexDescriptor const *vdesc_in, - float *vertex_in, + TBBBilinearVertexKernel(float *vertex_in, float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, int const *V_ITa_in, int vertexOffset_in, int tableOffset_in) : - vdesc (vdesc_in), vertex (vertex_in), varying(varying_in), + vertexDesc(vertexDesc_in), + varyingDesc(varyingDesc_in), V_ITa (V_ITa_in), vertexOffset(vertexOffset_in), tableOffset(tableOffset_in) @@ -588,40 +662,53 @@ public: }; void OsdTbbComputeBilinearVertex( - OsdVertexDescriptor const &vdesc, float *vertex, float *varying, + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *V_ITa, int vertexOffset, int tableOffset, int start, int end) { tbb::blocked_range range(start, end, grain_size); - TBBBilinearVertexKernel 
kernel(&vdesc, vertex, varying, V_ITa, vertexOffset, tableOffset); + TBBBilinearVertexKernel kernel(vertex, varying, vertexDesc, varyingDesc, + V_ITa, vertexOffset, tableOffset); tbb::parallel_for(range, kernel); } void OsdTbbEditVertexAdd( - OsdVertexDescriptor const &vdesc, float *vertex, + float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, unsigned int const *editIndices, float const *editValues) { for (int i = start+tableOffset; i < end+tableOffset; i++) { - vdesc.ApplyVertexEditAdd(vertex, - primVarOffset, - primVarWidth, - editIndices[i] + vertexOffset, - &editValues[i*primVarWidth]); + + if (vertex) { + int editIndex = editIndices[i] + vertexOffset; + float *dst = vertex + editIndex * vertexDesc.stride + primVarOffset; + + for (int i = 0; i < primVarWidth; ++i) { + dst[i] += editValues[i]; + } + } } } void OsdTbbEditVertexSet( - OsdVertexDescriptor const &vdesc, float *vertex, + float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, unsigned int const *editIndices, float const *editValues) { for (int i = start+tableOffset; i < end+tableOffset; i++) { - vdesc.ApplyVertexEditSet(vertex, - primVarOffset, - primVarWidth, - editIndices[i] + vertexOffset, - &editValues[i*primVarWidth]); + + if (vertex) { + int editIndex = editIndices[i] + vertexOffset; + float *dst = vertex + editIndex * vertexDesc.stride + primVarOffset; + + for (int i = 0; i < primVarWidth; ++i) { + dst[i] = editValues[i]; + } + } } } diff --git a/opensubdiv/osd/tbbKernel.h b/opensubdiv/osd/tbbKernel.h index 698dffd8..11882bed 100644 --- a/opensubdiv/osd/tbbKernel.h +++ b/opensubdiv/osd/tbbKernel.h @@ -30,59 +30,68 @@ namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { -struct OsdVertexDescriptor; +struct OsdVertexBufferDescriptor; -void OsdTbbComputeFace(OsdVertexDescriptor const &vdesc, - 
float * vertex, float * varying, +void OsdTbbComputeFace(float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *F_IT, int const *F_ITa, int vertexOffset, int tableOffset, int start, int end); -void OsdTbbComputeEdge(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdTbbComputeEdge(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *E_IT, float const *E_ITa, int vertexOffset, int tableOffset, int start, int end); -void OsdTbbComputeVertexA(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdTbbComputeVertexA(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *V_ITa, float const *V_IT, int vertexOffset, int tableOffset, int start, int end, int pass); -void OsdTbbComputeVertexB(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdTbbComputeVertexB(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *V_ITa, int const *V_IT, float const *V_W, int vertexOffset, int tableOffset, int start, int end); -void OsdTbbComputeLoopVertexB(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdTbbComputeLoopVertexB(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *V_ITa, int const *V_IT, float const *V_W, int vertexOffset, int tableOffset, int start, int end); -void OsdTbbComputeBilinearEdge(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdTbbComputeBilinearEdge(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *E_IT, int vertexOffset, int tableOffset, 
int start, int end); -void OsdTbbComputeBilinearVertex(OsdVertexDescriptor const &vdesc, - float *vertex, float * varying, +void OsdTbbComputeBilinearVertex(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, int const *V_ITa, int vertexOffset, int tableOffset, int start, int end); -void OsdTbbEditVertexAdd(OsdVertexDescriptor const &vdesc, float *vertex, +void OsdTbbEditVertexAdd(float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, unsigned int const *editIndices, float const *editValues); -void OsdTbbEditVertexSet(OsdVertexDescriptor const &vdesc, float *vertex, +void OsdTbbEditVertexSet(float *vertex, + OsdVertexBufferDescriptor const &vertexDesc, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, diff --git a/opensubdiv/osd/vertexDescriptor.h b/opensubdiv/osd/vertexDescriptor.h index 75443dcb..68da8d10 100644 --- a/opensubdiv/osd/vertexDescriptor.h +++ b/opensubdiv/osd/vertexDescriptor.h @@ -31,155 +31,6 @@ namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { -struct OsdVertexDescriptor { - - /// Constructor - OsdVertexDescriptor() : numVertexElements(0), numVaryingElements(0) {} - - /// Constructor - /// - /// @param numVertexElem number of vertex-interpolated data elements (floats) - /// - /// @param numVaryingElem number of varying-interpolated data elements (floats) - /// - OsdVertexDescriptor(int numVertexElem, int numVaryingElem) - : numVertexElements(numVertexElem), - numVaryingElements(numVaryingElem) { } - - /// Sets descriptor - /// - /// @param numVertexElem number of vertex-interpolated data elements (floats) - /// - /// @param numVaryingElem number of varying-interpolated data elements (floats) - /// - void Set(int numVertexElem, int numVaryingElem) { - numVertexElements = numVertexElem; - numVaryingElements = numVaryingElem; - } - - 
/// Resets the descriptor - void Reset() { - numVertexElements = numVaryingElements = 0; - } - - /// Returns the total number of elements (vertex + varying) - int GetNumElements() const { - return numVertexElements + numVaryingElements; - } - - bool operator == (OsdVertexDescriptor const & other) { - return (numVertexElements == other.numVertexElements and - numVaryingElements == other.numVaryingElements); - } - - /// Resets the contents of vertex & varying primvar data buffers for a given - /// vertex. - /// - /// @param vertex The float array containing the vertex-interpolated primvar - /// data that needs to be reset. - /// - /// @param varying The float array containing the varying-interpolated primvar - /// data that needs to be reset. - /// - /// @param index Vertex index in the buffer. - /// - void Clear(float *vertex, float *varying, int index) const { - if (vertex) { - memset(vertex+index*numVertexElements, 0, sizeof(float)*numVertexElements); - } - - if (varying) { - memset(varying+index*numVaryingElements, 0, sizeof(float)*numVaryingElements); - - } - } - - /// Applies "dst += src*weight" to "vertex" primvar data in a vertex buffer. - /// - /// @param vertex The VertexData buffer - /// - /// @param dstIndex Index of the destination vertex. - /// - /// @param srcIndex Index of the origin vertex. - /// - /// @param weight Weight applied to the primvar data. - /// - inline - void AddWithWeight(float *vertex, int dstIndex, int srcIndex, float weight) const { - int d = dstIndex * numVertexElements; - int s = srcIndex * numVertexElements; -#if defined ( __INTEL_COMPILER ) or defined ( __ICC ) - #pragma ivdep - #pragma vector aligned -#endif - for (int i = 0; i < numVertexElements; ++i) - vertex[d++] += vertex[s++] * weight; - } - - /// Applies "dst += src*weight" to "varying" primvar data in a vertex buffer. - /// - /// @param varying The VaryingData buffer - /// - /// @param dstIndex Index of the destination vertex. 
- /// - /// @param srcIndex Index of the source vertex. - /// - /// @param weight Weight applied to the primvar data. - /// - inline - void AddVaryingWithWeight(float *varying, int dstIndex, int srcIndex, float weight) const { - int d = dstIndex * numVaryingElements; - int s = srcIndex * numVaryingElements; -#if defined ( __INTEL_COMPILER ) or defined ( __ICC ) - #pragma ivdep - #pragma vector aligned -#endif - for (int i = 0; i < numVaryingElements; ++i) - varying[d++] += varying[s++] * weight; - } - - /// Applies an "add" vertex edit - /// - /// @param vertex The primvar data buffer. - /// - /// @param primVarOffset Offset to the primvar datum. - /// - /// @param primVarWidth Length of the primvar datum. - /// - /// @param editIndex The location of the vertex in the buffer. - /// - /// @param editValues The values to add to the primvar datum. - /// - void ApplyVertexEditAdd(float *vertex, int primVarOffset, int primVarWidth, int editIndex, const float *editValues) const { - int d = editIndex * numVertexElements + primVarOffset; - for (int i = 0; i < primVarWidth; ++i) { - vertex[d++] += editValues[i]; - } - } - - /// Applies a "set" vertex edit - /// - /// @param vertex The primvar data buffer. - /// - /// @param primVarOffset Offset to the primvar datum. - /// - /// @param primVarWidth Length of the primvar datum. - /// - /// @param editIndex The location of the vertex in the buffer. - /// - /// @param editValues The values to add to the primvar datum. 
- /// - void ApplyVertexEditSet(float *vertex, int primVarOffset, int primVarWidth, int editIndex, const float *editValues) const { - int d = editIndex * numVertexElements + primVarOffset; - for (int i = 0; i < primVarWidth; ++i) { - vertex[d++] = editValues[i]; - } - } - - int numVertexElements; - int numVaryingElements; -}; - /// \brief Describes vertex elements in interleaved data buffers struct OsdVertexBufferDescriptor { @@ -207,6 +58,13 @@ struct OsdVertexBufferDescriptor { offset = length = stride = 0; } + /// True if the descriptors are identical + bool operator == ( OsdVertexBufferDescriptor const other ) const { + return (offset == other.offset and + length == other.length and + stride == other.stride); + } + int offset; // offset to desired element data int length; // number or length of the data int stride; // stride to the next element