From b7a763853c542580caf3d16f190eb2b001d8edd0 Mon Sep 17 00:00:00 2001 From: Nathan Litke Date: Tue, 10 Jun 2014 16:31:44 -0700 Subject: [PATCH] Added the CATMARK_RESTRICTED_VERT_VERTEX_A, CATMARK_RESTRICTED_VERT_VERTEX_B1, and CATMARK_RESTRICTED_VERT_VERTEX_B2 kernels which compute vertices resulting from the refinement of a smooth or (fully) sharp vertex. * CATMARK_RESTRICTED_VERT_VERTEX_A handles k_Crease and k_Corner rules * CATMARK_RESTRICTED_VERT_VERTEX_B1 handles regular k_Smooth and k_Dart rules * CATMARK_RESTRICTED_VERT_VERTEX_B2 handles irregular k_Smooth and k_Dart rules --- .../far/catmarkSubdivisionTablesFactory.h | 88 +++++-- opensubdiv/far/dispatcher.h | 63 +++++ opensubdiv/far/kernelBatch.h | 3 + opensubdiv/far/kernelBatchFactory.h | 77 +++++- opensubdiv/far/subdivisionTables.h | 86 +++++++ opensubdiv/far/subdivisionTablesFactory.h | 38 ++- opensubdiv/osd/clComputeController.cpp | 88 +++++++ opensubdiv/osd/clComputeController.h | 6 + opensubdiv/osd/clKernel.cl | 101 ++++++++ opensubdiv/osd/clKernelBundle.cpp | 12 + opensubdiv/osd/clKernelBundle.h | 9 + opensubdiv/osd/cpuComputeController.cpp | 41 +++ opensubdiv/osd/cpuComputeController.h | 6 + opensubdiv/osd/cpuKernel.cpp | 94 +++++++ opensubdiv/osd/cpuKernel.h | 21 ++ opensubdiv/osd/cudaComputeController.cpp | 84 +++++++ opensubdiv/osd/cudaComputeController.h | 6 + opensubdiv/osd/cudaKernel.cu | 234 ++++++++++++++++++ opensubdiv/osd/d3d11ComputeController.cpp | 36 +++ opensubdiv/osd/d3d11ComputeController.h | 6 + opensubdiv/osd/d3d11KernelBundle.cpp | 63 +++++ opensubdiv/osd/d3d11KernelBundle.h | 18 ++ opensubdiv/osd/gcdComputeController.cpp | 44 ++++ opensubdiv/osd/gcdComputeController.h | 5 + opensubdiv/osd/gcdKernel.cpp | 72 ++++++ opensubdiv/osd/gcdKernel.h | 24 ++ opensubdiv/osd/glslComputeController.cpp | 33 +++ opensubdiv/osd/glslComputeController.h | 6 + opensubdiv/osd/glslComputeKernel.glsl | 76 ++++++ opensubdiv/osd/glslKernelBundle.cpp | 93 ++++--- opensubdiv/osd/glslKernelBundle.h | 15 ++ ...glslTransformFeedbackComputeController.cpp | 36 +++ .../glslTransformFeedbackComputeController.h | 6 + .../osd/glslTransformFeedbackKernel.glsl | 88 +++++++ .../osd/glslTransformFeedbackKernelBundle.cpp | 39 +++ .../osd/glslTransformFeedbackKernelBundle.h | 21 ++ opensubdiv/osd/hlslComputeKernel.hlsl | 83 +++++++ opensubdiv/osd/ompComputeController.cpp | 41 +++ opensubdiv/osd/ompComputeController.h | 6 + opensubdiv/osd/ompKernel.cpp | 120 +++++++++ opensubdiv/osd/ompKernel.h | 25 +- opensubdiv/osd/tbbComputeController.cpp | 41 +++ opensubdiv/osd/tbbComputeController.h | 6 + opensubdiv/osd/tbbKernel.cpp | 220 ++++++++++++++++ opensubdiv/osd/tbbKernel.h | 23 +- 45 files changed, 2238 insertions(+), 65 deletions(-) diff --git a/opensubdiv/far/catmarkSubdivisionTablesFactory.h b/opensubdiv/far/catmarkSubdivisionTablesFactory.h index bdd674c7..3a46b712 100644 --- a/opensubdiv/far/catmarkSubdivisionTablesFactory.h +++ b/opensubdiv/far/catmarkSubdivisionTablesFactory.h @@ -56,6 +56,10 @@ protected: /// will reserve and append refinement tasks /// static FarSubdivisionTables * Create( FarMeshFactory * meshFactory, FarKernelBatchVector *batches ); + + // Compares vertices based on their topological configuration + // (see subdivisionTables::GetMaskRanking for more details) + static bool CompareVertices( HbrVertex const *x, HbrVertex const *y ); }; // This factory walks the Hbr vertices and accumulates the weights and adjacency @@ -97,9 +101,18 @@ FarCatmarkSubdivisionTablesFactory::Create( FarMeshFactory * meshFacto typename HbrCatmarkSubdivision::TriangleSubdivision triangleMethod = dynamic_cast *>(meshFactory->GetHbrMesh()->GetSubdivision())->GetTriangleSubdivisionMethod(); bool hasFractionalEdgeSharpness = tablesFactory.HasFractionalEdgeSharpness(); - bool useRestrictedEdgeVertexKernel = meshFactory->IsKernelTypeSupported(FarKernelBatch::CATMARK_RESTRICTED_EDGE_VERTEX);; + bool useRestrictedEdgeVertexKernel = meshFactory->IsKernelTypeSupported(FarKernelBatch::CATMARK_RESTRICTED_EDGE_VERTEX); useRestrictedEdgeVertexKernel &= !hasFractionalEdgeSharpness && triangleMethod != HbrCatmarkSubdivision::k_New; + bool hasFractionalVertexSharpness = tablesFactory.HasFractionalVertexSharpness(); + bool hasStandardVertexVertexKernels = meshFactory->IsKernelTypeSupported(FarKernelBatch::CATMARK_VERT_VERTEX_A1) && + meshFactory->IsKernelTypeSupported(FarKernelBatch::CATMARK_VERT_VERTEX_A2) && + meshFactory->IsKernelTypeSupported(FarKernelBatch::CATMARK_VERT_VERTEX_B); + bool useRestrictedVertexVertexKernels = meshFactory->IsKernelTypeSupported(FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_A) && + meshFactory->IsKernelTypeSupported(FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_B1) && + meshFactory->IsKernelTypeSupported(FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_B2); + useRestrictedVertexVertexKernels &= !hasFractionalVertexSharpness && !hasFractionalEdgeSharpness; + // Allocate memory for the indexing tables result->_F_ITa.resize(F_ITa_size); result->_F_IT.resize(F_IT_size); @@ -111,8 +124,9 @@ FarCatmarkSubdivisionTablesFactory::Create( FarMeshFactory * meshFacto result->_V_ITa.resize((tablesFactory.GetNumVertexVerticesTotal(maxlevel) - tablesFactory.GetNumVertexVerticesTotal(0))*5); // subtract coarse cage vertices result->_V_IT.resize(tablesFactory.GetVertVertsValenceSum()*2); - result->_V_W.resize(tablesFactory.GetNumVertexVerticesTotal(maxlevel) - - tablesFactory.GetNumVertexVerticesTotal(0)); + if (!useRestrictedVertexVertexKernels) + result->_V_W.resize(tablesFactory.GetNumVertexVerticesTotal(maxlevel) + - tablesFactory.GetNumVertexVerticesTotal(0)); // Prepare batch table batches->reserve(maxlevel*5); @@ -382,30 +396,49 @@ FarCatmarkSubdivisionTablesFactory::Create( FarMeshFactory * meshFacto break; } case HbrVertex::k_Corner : - // in the case of a k_Crease / k_Corner pass combination, we - // need to set the valence to -1 to tell the "B" Kernel to - // switch to k_Corner rule (as edge indices won't be -1) - if (V_ITa[5*i+1]==0) - V_ITa[5*i+1] = -1; + if (!useRestrictedVertexVertexKernels) { + // in the case of a k_Crease / k_Corner pass combination, we + // need to set the valence to -1 to tell the "B" Kernel to + // switch to k_Corner rule (as edge indices won't be -1) + if (V_ITa[5*i+1]==0) + V_ITa[5*i+1] = -1; + } else { + // in the case of a k_Corner, repeat the vertex + V_ITa[5*i+3] = V_ITa[5*i+2]; + V_ITa[5*i+4] = V_ITa[5*i+2]; + } default : break; } - if (rank>7) - // the k_Corner and k_Crease single-pass cases apply a weight of 1.0 - // but this value is inverted in the kernel - V_W[i] = 0.0; - else - V_W[i] = weights[0]; + if (!useRestrictedVertexVertexKernels) { + if (rank>7) + // the k_Corner and k_Crease single-pass cases apply a weight of 1.0 + // but this value is inverted in the kernel + V_W[i] = 0.0; + else + V_W[i] = weights[0]; + } - batchFactory.AddVertex( i, rank ); + if (!useRestrictedVertexVertexKernels) + batchFactory.AddVertex( i, rank ); + else + batchFactory.AddCatmarkRestrictedVertex( i, rank, V_ITa[5*i+1] ); } V_ITa += nVertVertices*5; - V_W += nVertVertices; + if (!useRestrictedVertexVertexKernels) + V_W += nVertVertices; // add batches for vert vertices - if (nVertVertices > 0) - batchFactory.AppendCatmarkBatches(level, vertTableOffset, vertexOffset, batches); + if (nVertVertices > 0) { + if (!useRestrictedVertexVertexKernels) { + assert(hasStandardVertexVertexKernels); + batchFactory.AppendCatmarkBatches(level, vertTableOffset, vertexOffset, batches); + } else { + batchFactory.AppendCatmarkRestrictedBatches(level, vertTableOffset, vertexOffset, batches); + } + } + vertexOffset += nVertVertices; vertTableOffset += nVertVertices; } @@ -413,6 +446,25 @@ FarCatmarkSubdivisionTablesFactory::Create( FarMeshFactory * meshFacto return result; } +template bool +FarCatmarkSubdivisionTablesFactory::CompareVertices( HbrVertex const * x, HbrVertex const * y ) { + + // Masks of the parent vertex decide for the current vertex. + HbrVertex * px=x->GetParentVertex(), + * py=y->GetParentVertex(); + + int rankx = GetMaskRanking(px->GetMask(false), px->GetMask(true) ); + int ranky = GetMaskRanking(py->GetMask(false), py->GetMask(true) ); + + assert( rankx!=0xFF and ranky!=0xFF ); + + // Arrange regular vertices before irregular vertices within the same kernel + if ((rankx <= 2 && ranky <= 2) || (rankx >= 3 && rankx <= 7 && ranky >= 3 && ranky <= 7) || (rankx >= 8 && ranky >= 8)) + return px->GetValence() == 4 && py->GetValence() != 4; + else + return rankx < ranky; +} + } // end namespace OPENSUBDIV_VERSION using namespace OPENSUBDIV_VERSION; diff --git a/opensubdiv/far/dispatcher.h b/opensubdiv/far/dispatcher.h index 5b7af34f..8d5ace73 100644 --- a/opensubdiv/far/dispatcher.h +++ b/opensubdiv/far/dispatcher.h @@ -108,6 +108,15 @@ FarDispatcher::ApplyKernel(CONTROLLER *controller, CONTEXT *context, FarKernelBa case FarKernelBatch::CATMARK_VERT_VERTEX_A2: controller->ApplyCatmarkVertexVerticesKernelA2(batch, context); break; + case FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_B1: + controller->ApplyCatmarkRestrictedVertexVerticesKernelB1(batch, context); + break; + case FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_B2: + controller->ApplyCatmarkRestrictedVertexVerticesKernelB2(batch, context); + break; + case FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_A: + controller->ApplyCatmarkRestrictedVertexVerticesKernelA(batch, context); + break; case FarKernelBatch::LOOP_EDGE_VERTEX: controller->ApplyLoopEdgeVerticesKernel(batch, context); @@ -200,6 +209,15 @@ public: template void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, CONTEXT *context) const; + template + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, CONTEXT *context) const; + + template + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, CONTEXT *context) const; + + template + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, CONTEXT *context) const; + template void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, CONTEXT *context) const; @@ -393,6 +411,51 @@ FarComputeController::ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &b vsrc ); } +template void +FarComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, CONTEXT *context) const { + + typename CONTEXT::VertexType *vsrc = &context->GetVertices().at(0); + + FarSubdivisionTables const * subdivision = context->GetSubdivisionTables(); + assert(subdivision); + + subdivision->computeCatmarkRestrictedVertexPointsB1( batch.GetVertexOffset(), + batch.GetTableOffset(), + batch.GetStart(), + batch.GetEnd(), + vsrc ); +} + +template void +FarComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, CONTEXT *context) const { + + typename CONTEXT::VertexType *vsrc = &context->GetVertices().at(0); + + FarSubdivisionTables const * subdivision = context->GetSubdivisionTables(); + assert(subdivision); + + subdivision->computeCatmarkRestrictedVertexPointsB2( batch.GetVertexOffset(), + batch.GetTableOffset(), + batch.GetStart(), + batch.GetEnd(), + vsrc ); +} + +template void +FarComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, CONTEXT *context) const { + + typename CONTEXT::VertexType *vsrc = &context->GetVertices().at(0); + + FarSubdivisionTables const * subdivision = context->GetSubdivisionTables(); + assert(subdivision); + + subdivision->computeCatmarkRestrictedVertexPointsA( batch.GetVertexOffset(), + batch.GetTableOffset(), + batch.GetStart(), + batch.GetEnd(), + vsrc ); +} + template void FarComputeController::ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, CONTEXT *context) const { diff --git a/opensubdiv/far/kernelBatch.h b/opensubdiv/far/kernelBatch.h index 4fd85569..260e640a 100644 --- a/opensubdiv/far/kernelBatch.h +++ b/opensubdiv/far/kernelBatch.h @@ -74,6 +74,9 @@ public: CATMARK_VERT_VERTEX_A1, CATMARK_VERT_VERTEX_A2, CATMARK_VERT_VERTEX_B, + CATMARK_RESTRICTED_VERT_VERTEX_A, + CATMARK_RESTRICTED_VERT_VERTEX_B1, + CATMARK_RESTRICTED_VERT_VERTEX_B2, LOOP_EDGE_VERTEX, LOOP_VERT_VERTEX_A1, LOOP_VERT_VERTEX_A2, diff --git a/opensubdiv/far/kernelBatchFactory.h b/opensubdiv/far/kernelBatchFactory.h index 167b8b01..c987d7d7 100644 --- a/opensubdiv/far/kernelBatchFactory.h +++ b/opensubdiv/far/kernelBatchFactory.h @@ -46,6 +46,8 @@ public: FarVertexKernelBatchFactory(int start, int end) { kernelB.start = kernelA1.start = kernelA2.start = start; kernelB.end = kernelA1.end = kernelA2.end = end; + restrictedKernelB1.start = restrictedKernelB2.start = restrictedKernelA.start = start; + restrictedKernelB1.end = restrictedKernelB2.end = restrictedKernelA.end = end; } @@ -97,6 +99,17 @@ public: void AddVertex( int index, int rank ); + /// \brief Adds a vertex-vertex to the appropriate restricted compute batch based on "Rank" and valence. + /// + /// @param index the index of the vertex + /// + /// @param rank the rank of the vertex (see + /// FarSubdivisionTables::GetMaskRanking()) + /// + /// @param valence the valence of the vertex + /// + void AddCatmarkRestrictedVertex( int index, int rank, int valence ); + /// \brief Appends a FarKernelBatch to a vector of batches for Catmark subdivision /// @@ -111,6 +124,18 @@ public: void AppendCatmarkBatches(int level, int tableOffset, int vertexOffset, FarKernelBatchVector *result); + /// \brief Appends a restricted FarKernelBatch to a vector of batches for Catmark subdivision + /// + /// @param level the subdivision level of the vertices in the batch + /// + /// @param tableOffset XXXX + /// + /// @param vertexOffset XXXX + /// + /// @param result the expanded batch vector + /// + void AppendCatmarkRestrictedBatches(int level, int tableOffset, int vertexOffset, FarKernelBatchVector *result); + /// \brief Appends a FarKernelBatch to a vector of batches for Loop subdivision /// @@ -131,9 +156,12 @@ private: end; }; - Range kernelB; // vertex batch range (kernel B) - Range kernelA1; // vertex batch range (kernel A pass 1) - Range kernelA2; // vertex batch range (kernel A pass 2) + Range kernelB; // vertex batch range (kernel B) + Range kernelA1; // vertex batch range (kernel A pass 1) + Range kernelA2; // vertex batch range (kernel A pass 2) + Range restrictedKernelB1; // vertex batch range (restricted kernel B regular) + Range restrictedKernelB2; // vertex batch range (restricted kernel B irregular) + Range restrictedKernelA; // vertex batch range (restricted kernel A) }; inline void @@ -160,6 +188,29 @@ FarVertexKernelBatchFactory::AddVertex( int index, int rank ) { } } +inline void +FarVertexKernelBatchFactory::AddCatmarkRestrictedVertex( int index, int rank, int valence ) { + + assert(rank <= 2 || rank >= 8); + + if (rank <= 2 && valence == 4) { + if (index < restrictedKernelB1.start) + restrictedKernelB1.start=index; + if (index > restrictedKernelB1.end) + restrictedKernelB1.end=index; + } else if (rank <= 2 && valence != 4) { + if (index < restrictedKernelB2.start) + restrictedKernelB2.start=index; + if (index > restrictedKernelB2.end) + restrictedKernelB2.end=index; + } else if (rank >= 8) { + if (index < restrictedKernelA.start) + restrictedKernelA.start=index; + if (index > restrictedKernelA.end) + restrictedKernelA.end=index; + } +} + inline void FarVertexKernelBatchFactory::AppendCatmarkBatches(int level, int tableOffset, @@ -180,6 +231,26 @@ FarVertexKernelBatchFactory::AppendCatmarkBatches(int level, tableOffset, vertexOffset) ); } +inline void +FarVertexKernelBatchFactory::AppendCatmarkRestrictedBatches(int level, + int tableOffset, + int vertexOffset, + FarKernelBatchVector *result) { + + if (restrictedKernelB1.end >= restrictedKernelB1.start) + result->push_back(FarKernelBatch( FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_B1, level, 0, + restrictedKernelB1.start, restrictedKernelB1.end+1, + tableOffset, vertexOffset)); + if (restrictedKernelB2.end >= restrictedKernelB2.start) + result->push_back(FarKernelBatch( FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_B2, level, 0, + restrictedKernelB2.start, restrictedKernelB2.end+1, + tableOffset, vertexOffset) ); + if (restrictedKernelA.end >= restrictedKernelA.start) + result->push_back(FarKernelBatch( FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_A, level, 0, + restrictedKernelA.start, restrictedKernelA.end+1, + tableOffset, vertexOffset) ); +} + inline void FarVertexKernelBatchFactory::AppendLoopBatches(int level, int tableOffset, diff --git a/opensubdiv/far/subdivisionTables.h b/opensubdiv/far/subdivisionTables.h index 37b1fa9f..bd0d5fee 100644 --- a/opensubdiv/far/subdivisionTables.h +++ b/opensubdiv/far/subdivisionTables.h @@ -189,6 +189,22 @@ public: template void computeCatmarkVertexPointsB(int vertexOffset, int tableOffset, int start, int end, U * vsrc) const; + // Compute-kernel applied to vertices resulting from the refinement of a smooth or sharp vertex + // Kernel "A" handles the k_Crease and k_Corner rules + template + void computeCatmarkRestrictedVertexPointsA(int vertexOffset, int tableOffset, int start, int end, U * vsrc) const; + + // Compute-kernel applied to vertices resulting from the refinement of a smooth or sharp vertex + // Kernel "B1" handles the regular k_Smooth and k_Dart rules + template + void computeCatmarkRestrictedVertexPointsB1(int vertexOffset, int tableOffset, int start, int end, U * vsrc) const; + + // Compute-kernel applied to vertices resulting from the refinement of a smooth or sharp vertex + // Kernel "B2" handles the irregular k_Smooth and k_Dart rules + template + void computeCatmarkRestrictedVertexPointsB2(int vertexOffset, int tableOffset, int start, int end, U * vsrc) const; + + // ------------------------------------------------------------------------- // Loop scheme @@ -573,6 +589,76 @@ FarSubdivisionTables::computeCatmarkVertexPointsB( int vertexOffset, int tableOf } } +// single-pass kernel handling k_Crease and k_Corner rules +template void +FarSubdivisionTables::computeCatmarkRestrictedVertexPointsA( int vertexOffset, int tableOffset, int start, int end, U * vsrc ) const { + + U * vdst = vsrc + vertexOffset + start; + + for (int i=start+tableOffset; iClear(); + + int p=this->_V_ITa[5*i+2], // index of the parent vertex + eidx0=this->_V_ITa[5*i+3], // index of the first crease rule edge + eidx1=this->_V_ITa[5*i+4]; // index of the second crease rule edge + + vdst->AddWithWeight( vsrc[p], 0.75f ); + vdst->AddWithWeight( vsrc[eidx0], 0.125f ); + vdst->AddWithWeight( vsrc[eidx1], 0.125f ); + vdst->AddVaryingWithWeight( vsrc[p], 1.0f ); + } +} + +// single-pass kernel handling regular k_Smooth and k_Dart rules +template void +FarSubdivisionTables::computeCatmarkRestrictedVertexPointsB1( int vertexOffset, int tableOffset, int start, int end, U * vsrc ) const { + + U * vdst = vsrc + vertexOffset + start; + + for (int i=start+tableOffset; iClear(); + + int h = this->_V_ITa[5*i ], // offset of the vertices in the _V_IT array + p = this->_V_ITa[5*i+2]; // index of the parent vertex + + vdst->AddWithWeight( vsrc[p], 0.5f ); + + for (int j=0; j<8; ++j, ++h) + vdst->AddWithWeight( vsrc[this->_V_IT[h]], 0.0625f ); + + vdst->AddVaryingWithWeight( vsrc[p], 1.0f ); + } +} + +// single-pass kernel handling irregular k_Smooth and k_Dart rules +template void +FarSubdivisionTables::computeCatmarkRestrictedVertexPointsB2( int vertexOffset, int tableOffset, int start, int end, U * vsrc ) const { + + U * vdst = vsrc + vertexOffset + start; + + for (int i=start+tableOffset; iClear(); + + int h = this->_V_ITa[5*i ], // offset of the vertices in the _V_IT array + n = this->_V_ITa[5*i+1], // number of vertices in the _V_IT array (valence) + p = this->_V_ITa[5*i+2]; // index of the parent vertex + + float wp = 1.0f/(n*n), + wv = (n-2.0f)*n*wp; + + vdst->AddWithWeight( vsrc[p], wv ); + + for (int j=0; jAddWithWeight( vsrc[this->_V_IT[h+j*2 ]], wp ); + vdst->AddWithWeight( vsrc[this->_V_IT[h+j*2+1]], wp ); + } + vdst->AddVaryingWithWeight( vsrc[p], 1.0f ); + } +} + // // Edge-vertices compute Kernel - completely re-entrant // diff --git a/opensubdiv/far/subdivisionTablesFactory.h b/opensubdiv/far/subdivisionTablesFactory.h index 3083829d..7f9b2691 100644 --- a/opensubdiv/far/subdivisionTablesFactory.h +++ b/opensubdiv/far/subdivisionTablesFactory.h @@ -62,11 +62,13 @@ protected: template friend class FarMeshFactory; + typedef bool (*CompareVerticesOperator)(const HbrVertex *, const HbrVertex *); + // This factory accumulates vertex topology data that will be shared among the // specialized subdivision scheme factories (Bilinear / Catmark / Loop). // It also populates the FarMeshFactory vertex remapping vector that ties the // Hbr vertex indices to the FarVertexEdit tables. - FarSubdivisionTablesFactory( HbrMesh const * mesh, int maxlevel, std::vector & remapTable ); + FarSubdivisionTablesFactory( HbrMesh const * mesh, int maxlevel, std::vector & remapTable, CompareVerticesOperator compareVertices = CompareVertices ); // Returns the number of coarse vertices found in the mesh int GetNumCoarseVertices() const { @@ -108,6 +110,15 @@ protected: bool HasFractionalEdgeSharpness() const { return _hasFractionalEdgeSharpness; } + bool HasFractionalVertexSharpness() const { return _hasFractionalVertexSharpness; } + + // Compares vertices based on their topological configuration + // (see subdivisionTables::GetMaskRanking for more details) + static bool CompareVertices( HbrVertex const *x, HbrVertex const *y ); + + // Compare vertices operator + CompareVerticesOperator _compareVertices; + // Per-level counters and offsets for each type of vertex (face,edge,vert) std::vector _faceVertIdx, _edgeVertIdx, @@ -130,8 +141,9 @@ protected: // Number of coarse triangle faces int _numCoarseTriangleFaces; - // Indicates if an edge has a fractional (non-integer) sharpness - bool _hasFractionalEdgeSharpness; + // Indicates if an edge or vertex has a fractional (non-integer) sharpness + bool _hasFractionalEdgeSharpness, + _hasFractionalVertexSharpness; private: @@ -142,14 +154,11 @@ private: // Sums the number of adjacent vertices required to interpolate a Vert-Vertex static int sumVertVertexValence(HbrVertex * vertex); - - // Compares vertices based on their topological configuration - // (see subdivisionTables::GetMaskRanking for more details) - static bool compareVertices( HbrVertex const *x, HbrVertex const *y ); }; template -FarSubdivisionTablesFactory::FarSubdivisionTablesFactory( HbrMesh const * mesh, int maxlevel, std::vector & remapTable ) : +FarSubdivisionTablesFactory::FarSubdivisionTablesFactory( HbrMesh const * mesh, int maxlevel, std::vector & remapTable, CompareVerticesOperator compareVertices ) : + _compareVertices(compareVertices), _faceVertIdx(maxlevel+1,0), _edgeVertIdx(maxlevel+1,0), _vertVertIdx(maxlevel+1,0), @@ -161,7 +170,8 @@ FarSubdivisionTablesFactory::FarSubdivisionTablesFactory( HbrMesh const _minCoarseFaceValence(0), _maxCoarseFaceValence(0), _numCoarseTriangleFaces(0), - _hasFractionalEdgeSharpness(false) + _hasFractionalEdgeSharpness(false), + _hasFractionalVertexSharpness(false) { assert( mesh ); @@ -215,6 +225,9 @@ FarSubdivisionTablesFactory::FarSubdivisionTablesFactory( HbrMesh const } else if (v->GetParentVertex()) { vertCounts[depth]++; _vertVertsValenceSum+=sumVertVertexValence(v); + float sharpness = v->GetParentVertex()->GetSharpness(); + if (sharpness > 0.0f && sharpness < 1.0f) + _hasFractionalVertexSharpness = true; } } @@ -278,7 +291,7 @@ FarSubdivisionTablesFactory::FarSubdivisionTablesFactory( HbrMesh const // mask. The masks combinations are ordered so as to minimize the compute // kernel switching.(see subdivisionTables::GetMaskRanking for more details) for (size_t i=1; i<_vertVertsList.size(); ++i) - std::sort( _vertVertsList[i].begin(), _vertVertsList[i].end(), compareVertices ); + std::sort( _vertVertsList[i].begin(), _vertVertsList[i].end(), _compareVertices ); // These vertices still need a remapped index @@ -384,7 +397,7 @@ FarSubdivisionTablesFactory::sumVertVertexValence(HbrVertex * vertex) { // The vertices should be sorted so as to minimize the number execution calls of // these kernels to match the 2 pass interpolation scheme used in Hbr. template bool -FarSubdivisionTablesFactory::compareVertices( HbrVertex const * x, HbrVertex const * y ) { +FarSubdivisionTablesFactory::CompareVertices( HbrVertex const * x, HbrVertex const * y ) { // Masks of the parent vertex decide for the current vertex. HbrVertex * px=x->GetParentVertex(), @@ -604,6 +617,9 @@ FarSubdivisionTablesFactory::Splice(FarMeshVector const &meshes, FarKernelB } else if (batch._kernelType == FarKernelBatch::CATMARK_VERT_VERTEX_A1 or batch._kernelType == FarKernelBatch::CATMARK_VERT_VERTEX_A2 or batch._kernelType == FarKernelBatch::CATMARK_VERT_VERTEX_B or + batch._kernelType == FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_A or + batch._kernelType == FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_B1 or + batch._kernelType == FarKernelBatch::CATMARK_RESTRICTED_VERT_VERTEX_B2 or batch._kernelType == FarKernelBatch::LOOP_VERT_VERTEX_A1 or batch._kernelType == FarKernelBatch::LOOP_VERT_VERTEX_A2 or batch._kernelType == FarKernelBatch::LOOP_VERT_VERTEX_B or diff --git a/opensubdiv/osd/clComputeController.cpp b/opensubdiv/osd/clComputeController.cpp index f8a844b4..b46ddc33 100644 --- a/opensubdiv/osd/clComputeController.cpp +++ b/opensubdiv/osd/clComputeController.cpp @@ -390,6 +390,94 @@ OsdCLComputeController::ApplyCatmarkVertexVerticesKernelA2( CL_CHECK_ERROR(ciErrNum, "vertex kernel A2 %d\n", ciErrNum); } +void +OsdCLComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( + FarKernelBatch const &batch, OsdCLComputeContext const *context) const { + + assert(context); + + cl_int ciErrNum; + size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; + cl_kernel kernel = _currentBindState.kernelBundle->GetCatmarkRestrictedVertexKernelB1(); + + cl_mem V_ITa = context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); + cl_mem V_IT = context->GetTable(FarSubdivisionTables::V_IT)->GetDevicePtr(); + + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &V_IT); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); + + ciErrNum = clEnqueueNDRangeKernel(_clQueue, + kernel, 1, NULL, globalWorkSize, + NULL, 0, NULL, NULL); + CL_CHECK_ERROR(ciErrNum, "restricted vertex kernel B1 %d\n", ciErrNum); +} + +void +OsdCLComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( + FarKernelBatch const &batch, OsdCLComputeContext const *context) const { + + assert(context); + + cl_int ciErrNum; + size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; + cl_kernel kernel = _currentBindState.kernelBundle->GetCatmarkRestrictedVertexKernelB2(); + + cl_mem V_ITa = context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); + cl_mem V_IT = context->GetTable(FarSubdivisionTables::V_IT)->GetDevicePtr(); + + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &V_IT); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 9, sizeof(int), batch.GetEndPtr()); + + ciErrNum = clEnqueueNDRangeKernel(_clQueue, + kernel, 1, NULL, globalWorkSize, + NULL, 0, NULL, NULL); + CL_CHECK_ERROR(ciErrNum, "restricted vertex kernel B2 %d\n", ciErrNum); +} + +void +OsdCLComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( + FarKernelBatch const &batch, OsdCLComputeContext const *context) const { + + assert(context); + + cl_int ciErrNum; + size_t globalWorkSize[1] = { (size_t)(batch.GetEnd() - batch.GetStart()) }; + cl_kernel kernel = _currentBindState.kernelBundle->GetCatmarkRestrictedVertexKernelA(); + + cl_mem V_ITa = context->GetTable(FarSubdivisionTables::V_ITa)->GetDevicePtr(); + + clSetKernelArg(kernel, 0, sizeof(cl_mem), &_currentBindState.vertexBuffer); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &_currentBindState.varyingBuffer); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &V_ITa); + clSetKernelArg(kernel, 3, sizeof(int), &_currentBindState.vertexDesc.offset); + clSetKernelArg(kernel, 4, sizeof(int), &_currentBindState.varyingDesc.offset); + clSetKernelArg(kernel, 5, sizeof(int), batch.GetVertexOffsetPtr()); + clSetKernelArg(kernel, 6, sizeof(int), batch.GetTableOffsetPtr()); + clSetKernelArg(kernel, 7, sizeof(int), batch.GetStartPtr()); + clSetKernelArg(kernel, 8, sizeof(int), batch.GetEndPtr()); + + ciErrNum = clEnqueueNDRangeKernel(_clQueue, + kernel, 1, NULL, globalWorkSize, + NULL, 0, NULL, NULL); + CL_CHECK_ERROR(ciErrNum, "restricted vertex kernel A %d\n", ciErrNum); +} + void OsdCLComputeController::ApplyLoopEdgeVerticesKernel( FarKernelBatch const &batch, OsdCLComputeContext const *context) const { diff --git a/opensubdiv/osd/clComputeController.h b/opensubdiv/osd/clComputeController.h index 6ad9d87f..d268a59d 100644 --- a/opensubdiv/osd/clComputeController.h +++ b/opensubdiv/osd/clComputeController.h @@ -151,6 +151,12 @@ protected: void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const; diff --git a/opensubdiv/osd/clKernel.cl b/opensubdiv/osd/clKernel.cl index 37fafc7e..0ab77fad 100644 --- a/opensubdiv/osd/clKernel.cl +++ b/opensubdiv/osd/clKernel.cl @@ -428,6 +428,107 @@ __kernel void computeVertexB(__global float *vertex, } } +__kernel void computeRestrictedVertexA(__global float *vertex, + __global float *varying, + __global int *V_ITa, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, + int start, int end) { + + int i = start + get_global_id(0) + tableOffset; + int vid = start + get_global_id(0) + offset; + int p = V_ITa[5*i+2]; + int eidx0 = V_ITa[5*i+3]; + int eidx1 = V_ITa[5*i+4]; + vertex += vertexOffset; + varying += (varying ? varyingOffset :0); + + struct Vertex dst; + clearVertex(&dst); + addWithWeight(&dst, vertex, p, 0.75f); + addWithWeight(&dst, vertex, eidx0, 0.125f); + addWithWeight(&dst, vertex, eidx1, 0.125f); + writeVertex(vertex, vid, &dst); + + if (varying) { + struct Varying dstVarying; + clearVarying(&dstVarying); + addVaryingWithWeight(&dstVarying, varying, p, 1.0f); + writeVarying(varying, vid, &dstVarying); + } +} + +__kernel void computeRestrictedVertexB1(__global float *vertex, + __global float *varying, + __global int *V_ITa, + __global int *V_IT, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, + int start, int end) { + + int i = start + get_global_id(0) + tableOffset; + int vid = start + get_global_id(0) + offset; + int h = V_ITa[5*i]; + int p = V_ITa[5*i+2]; + vertex += vertexOffset; + varying += (varying ? varyingOffset :0); + + struct Vertex dst; + clearVertex(&dst); + + addWithWeight(&dst, vertex, p, 0.5f); + + for (int j = 0; j < 8; ++j, ++h) { + addWithWeight(&dst, vertex, V_IT[h], 0.0625f); + } + writeVertex(vertex, vid, &dst); + + if (varying) { + struct Varying dstVarying; + clearVarying(&dstVarying); + addVaryingWithWeight(&dstVarying, varying, p, 1.0f); + writeVarying(varying, vid, &dstVarying); + } +} + +__kernel void computeRestrictedVertexB2(__global float *vertex, + __global float *varying, + __global int *V_ITa, + __global int *V_IT, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, + int start, int end) { + + int i = start + get_global_id(0) + tableOffset; + int vid = start + get_global_id(0) + offset; + int h = V_ITa[5*i]; + int n = V_ITa[5*i+1]; + int p = V_ITa[5*i+2]; + vertex += vertexOffset; + varying += (varying ? varyingOffset :0); + + float wp = 1.0f/(float)(n*n); + float wv = (n-2.0f) * n * wp; + + struct Vertex dst; + clearVertex(&dst); + + addWithWeight(&dst, vertex, p, wv); + + for (int j = 0; j < n; ++j) { + addWithWeight(&dst, vertex, V_IT[h+j*2], wp); + addWithWeight(&dst, vertex, V_IT[h+j*2+1], wp); + } + writeVertex(vertex, vid, &dst); + + if (varying) { + struct Varying dstVarying; + clearVarying(&dstVarying); + addVaryingWithWeight(&dstVarying, varying, p, 1.0f); + writeVarying(varying, vid, &dstVarying); + } +} + __kernel void computeLoopVertexB(__global float *vertex, __global float *varying, __global int *V_ITa, diff --git a/opensubdiv/osd/clKernelBundle.cpp b/opensubdiv/osd/clKernelBundle.cpp index e0f7f1d9..b9cc23a6 100644 --- a/opensubdiv/osd/clKernelBundle.cpp +++ b/opensubdiv/osd/clKernelBundle.cpp @@ -57,6 +57,9 @@ OsdCLKernelBundle::OsdCLKernelBundle() : _clCatmarkRestrictedEdge(NULL), _clCatmarkVertexA(NULL), _clCatmarkVertexB(NULL), + _clCatmarkRestrictedVertexA(NULL), + _clCatmarkRestrictedVertexB1(NULL), + _clCatmarkRestrictedVertexB2(NULL), _clLoopEdge(NULL), _clLoopVertexA(NULL), _clLoopVertexB(NULL), @@ -87,6 +90,12 @@ OsdCLKernelBundle::~OsdCLKernelBundle() { clReleaseKernel(_clCatmarkVertexA); if (_clCatmarkVertexB) clReleaseKernel(_clCatmarkVertexB); + if (_clCatmarkRestrictedVertexA) + clReleaseKernel(_clCatmarkRestrictedVertexA); + if (_clCatmarkRestrictedVertexB1) + clReleaseKernel(_clCatmarkRestrictedVertexB1); + if (_clCatmarkRestrictedVertexB2) + clReleaseKernel(_clCatmarkRestrictedVertexB2); if (_clLoopEdge) clReleaseKernel(_clLoopEdge); @@ -163,6 +172,9 @@ OsdCLKernelBundle::Compile(cl_context clContext, _clCatmarkRestrictedEdge = buildKernel(_clProgram, "computeRestrictedEdge"); _clCatmarkVertexA = buildKernel(_clProgram, "computeVertexA"); _clCatmarkVertexB = buildKernel(_clProgram, "computeVertexB"); + _clCatmarkRestrictedVertexA = buildKernel(_clProgram, "computeRestrictedVertexA"); + _clCatmarkRestrictedVertexB1 = buildKernel(_clProgram, "computeRestrictedVertexB1"); + _clCatmarkRestrictedVertexB2 = buildKernel(_clProgram, "computeRestrictedVertexB2"); _clLoopEdge = buildKernel(_clProgram, "computeEdge"); _clLoopVertexA = buildKernel(_clProgram, "computeVertexA"); _clLoopVertexB = buildKernel(_clProgram, "computeLoopVertexB"); diff --git a/opensubdiv/osd/clKernelBundle.h b/opensubdiv/osd/clKernelBundle.h index eb6ff8eb..44b47dda 100644 --- a/opensubdiv/osd/clKernelBundle.h +++ b/opensubdiv/osd/clKernelBundle.h @@ -62,6 +62,12 @@ public: cl_kernel GetCatmarkVertexKernelB() const { return _clCatmarkVertexB; } + cl_kernel GetCatmarkRestrictedVertexKernelA() const { return _clCatmarkRestrictedVertexA; } + + cl_kernel GetCatmarkRestrictedVertexKernelB1() const { return _clCatmarkRestrictedVertexB1; } + + cl_kernel GetCatmarkRestrictedVertexKernelB2() const { return _clCatmarkRestrictedVertexB2; } + cl_kernel GetLoopEdgeKernel() const { return _clLoopEdge; } cl_kernel GetLoopVertexKernelA() const { return _clLoopVertexA; } @@ -104,6 +110,9 @@ protected: _clCatmarkRestrictedEdge, _clCatmarkVertexA, _clCatmarkVertexB, + _clCatmarkRestrictedVertexA, + _clCatmarkRestrictedVertexB1, + _clCatmarkRestrictedVertexB2, _clLoopEdge, _clLoopVertexA, _clLoopVertexB, diff --git a/opensubdiv/osd/cpuComputeController.cpp b/opensubdiv/osd/cpuComputeController.cpp index 1e9b5b17..747650cf 100644 --- a/opensubdiv/osd/cpuComputeController.cpp +++ b/opensubdiv/osd/cpuComputeController.cpp @@ -186,6 +186,47 @@ OsdCpuComputeController::ApplyCatmarkVertexVerticesKernelA2( batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); } +void +OsdCpuComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdCpuComputeRestrictedVertexB1( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdCpuComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdCpuComputeRestrictedVertexB2( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdCpuComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdCpuComputeRestrictedVertexA( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + void OsdCpuComputeController::ApplyLoopEdgeVerticesKernel( FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { diff --git a/opensubdiv/osd/cpuComputeController.h b/opensubdiv/osd/cpuComputeController.h index 298fe3a1..fb8ba4f6 100644 --- a/opensubdiv/osd/cpuComputeController.h +++ b/opensubdiv/osd/cpuComputeController.h @@ -134,6 +134,12 @@ protected: void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const; diff --git a/opensubdiv/osd/cpuKernel.cpp b/opensubdiv/osd/cpuKernel.cpp index 13745ef3..4fc069f0 100755 --- a/opensubdiv/osd/cpuKernel.cpp +++ b/opensubdiv/osd/cpuKernel.cpp @@ -367,6 +367,100 @@ void OsdCpuComputeVertexB( } } +void OsdCpuComputeRestrictedVertexB1( + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, int start, int end) { + + float *vertexResults = (float*)alloca(vertexDesc.length * sizeof(float)); + float *varyingResults = (float*)alloca(varyingDesc.length * sizeof(float)); + + for (int i = start + tableOffset; i < end + tableOffset; i++) { + int h = V_ITa[5*i]; + int p = V_ITa[5*i+2]; + + int dstIndex = i + vertexOffset - tableOffset; + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, p, 0.5f, vertexDesc); + + for (int j = 0; j < 8; ++j, ++h) + addWithWeight(vertexResults, vertex, V_IT[h], 0.0625f, vertexDesc); + + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } +} + +void OsdCpuComputeRestrictedVertexB2( + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, int start, int end) { + + float *vertexResults = (float*)alloca(vertexDesc.length * sizeof(float)); + float *varyingResults = (float*)alloca(varyingDesc.length * sizeof(float)); + + for (int i = start + tableOffset; i < end + tableOffset; i++) { + int h = V_ITa[5*i]; + int n = V_ITa[5*i+1]; + int p = V_ITa[5*i+2]; + + float wp = 1.0f/static_cast(n*n); + float wv = (n-2.0f) * n * wp; + + int dstIndex = i + vertexOffset - tableOffset; + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, p, wv, vertexDesc); + + for (int j = 0; j < n; ++j) { + addWithWeight(vertexResults, vertex, V_IT[h+j*2], wp, vertexDesc); + addWithWeight(vertexResults, vertex, V_IT[h+j*2+1], wp, vertexDesc); + } + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } +} + +void OsdCpuComputeRestrictedVertexA( + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, + int vertexOffset, int tableOffset, int start, int end) { + + float *vertexResults = (float*)alloca(vertexDesc.length * sizeof(float)); + float *varyingResults = (float*)alloca(varyingDesc.length * sizeof(float)); + + for (int i = start + tableOffset; i < end + tableOffset; i++) { + int p = V_ITa[5*i+2]; + int eidx0 = V_ITa[5*i+3]; + int eidx1 = V_ITa[5*i+4]; + + int dstIndex = i + vertexOffset - tableOffset; + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, p, 0.75f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx0, 0.125f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx1, 0.125f, vertexDesc); + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } +} + void OsdCpuComputeLoopVertexB( float *vertex, float *varying, OsdVertexBufferDescriptor const &vertexDesc, diff --git a/opensubdiv/osd/cpuKernel.h b/opensubdiv/osd/cpuKernel.h index 151c1bd7..bd405c93 100644 --- a/opensubdiv/osd/cpuKernel.h +++ b/opensubdiv/osd/cpuKernel.h @@ -322,6 +322,27 @@ void OsdCpuComputeVertexB(float *vertex, float * varying, int vertexOffset, int tableOffset, int start, int end); +void OsdCpuComputeRestrictedVertexB1(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, + int start, int end); + +void OsdCpuComputeRestrictedVertexB2(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, + int start, int end); + +void OsdCpuComputeRestrictedVertexA(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, + int vertexOffset, int tableOffset, + int start, int end); + template void ComputeLoopVertexBKernel( float *vertex, const int *V_ITa, diff --git a/opensubdiv/osd/cudaComputeController.cpp b/opensubdiv/osd/cudaComputeController.cpp index 7df4ea4d..d93f7a62 100644 --- a/opensubdiv/osd/cudaComputeController.cpp +++ b/opensubdiv/osd/cudaComputeController.cpp @@ -67,6 +67,24 @@ void OsdCudaComputeVertexB(float *vertex, float *varying, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end); +void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, + int *V_ITa, int offset, int tableOffset, + int start, int end); + +void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, + int *V_ITa, int *V_IT, int offset, int tableOffset, + int start, int end); + +void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, + int *V_ITa, int *V_IT, int offset, int tableOffset, + int start, int end); + void OsdCudaComputeLoopVertexB(float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, @@ -341,6 +359,72 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA2( batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); } +void +OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( + FarKernelBatch const &batch, OsdCudaComputeContext const *context) const { + + assert(context); + + const OsdCudaTable * V_ITa = context->GetTable(FarSubdivisionTables::V_ITa); + const OsdCudaTable * V_IT = context->GetTable(FarSubdivisionTables::V_IT); + assert(V_ITa); + assert(V_IT); + + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + + OsdCudaComputeRestrictedVertexB1( + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, + static_cast(V_ITa->GetCudaMemory()), + static_cast(V_IT->GetCudaMemory()), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( + FarKernelBatch const &batch, OsdCudaComputeContext const *context) const { + + assert(context); + + const OsdCudaTable * V_ITa = context->GetTable(FarSubdivisionTables::V_ITa); + const OsdCudaTable * V_IT = context->GetTable(FarSubdivisionTables::V_IT); + assert(V_ITa); + assert(V_IT); + + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + + OsdCudaComputeRestrictedVertexB2( + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, + static_cast(V_ITa->GetCudaMemory()), + static_cast(V_IT->GetCudaMemory()), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( + FarKernelBatch const &batch, OsdCudaComputeContext const *context) const { + + assert(context); + + const OsdCudaTable * V_ITa = context->GetTable(FarSubdivisionTables::V_ITa); + assert(V_ITa); + + float *vertex = _currentBindState.GetOffsettedVertexBuffer(); + float *varying = _currentBindState.GetOffsettedVaryingBuffer(); + + OsdCudaComputeRestrictedVertexA( + vertex, varying, + _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, + _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, + static_cast(V_ITa->GetCudaMemory()), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + void OsdCudaComputeController::ApplyLoopEdgeVerticesKernel( FarKernelBatch const &batch, OsdCudaComputeContext const *context) const { diff --git a/opensubdiv/osd/cudaComputeController.h b/opensubdiv/osd/cudaComputeController.h index 8cb5fc46..2a04551d 100644 --- a/opensubdiv/osd/cudaComputeController.h +++ b/opensubdiv/osd/cudaComputeController.h @@ -134,6 +134,12 @@ protected: void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const; diff --git a/opensubdiv/osd/cudaKernel.cu b/opensubdiv/osd/cudaKernel.cu index c91e55c6..143ff5e9 100644 --- a/opensubdiv/osd/cudaKernel.cu +++ b/opensubdiv/osd/cudaKernel.cu @@ -619,6 +619,192 @@ computeVertexB(float *fVertex, float *fVarying, } } +template __global__ void +computeRestrictedVertexA(float *fVertex, float *fVaryings, int *V0_ITa, int offset, int tableOffset, int start, int end) +{ + DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end+tableOffset; + i += blockDim.x * gridDim.x) { + + int p = V0_ITa[5*i+2]; + int eidx0 = V0_ITa[5*i+3]; + int eidx1 = V0_ITa[5*i+4]; + + DeviceVertex dst; + dst.clear(); + + dst.addWithWeight(&vertex[p], 0.75f); + dst.addWithWeight(&vertex[eidx0], 0.125f); + dst.addWithWeight(&vertex[eidx1], 0.125f); + vertex[i+offset-tableOffset] = dst; + + if(NUM_VARYING_ELEMENTS > 0){ + DeviceVertex dstVarying; + dstVarying.clear(); + dstVarying.addWithWeight(&varyings[p], 1.0f); + varyings[i+offset-tableOffset] = dstVarying; + } + } +} + +__global__ void +computeRestrictedVertexA(float *fVertex, float *fVaryings, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, + int *V0_ITa, int offset, int tableOffset, int start, int end) +{ + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x){ + + int p = V0_ITa[5*i+2]; + int eidx0 = V0_ITa[5*i+3]; + int eidx1 = V0_ITa[5*i+4]; + + float *dstVertex = fVertex + (i+offset-tableOffset)*vertexStride; + clear(dstVertex, vertexLength); + + addWithWeight(dstVertex, fVertex + p*vertexStride, 0.75f, vertexLength); + addWithWeight(dstVertex, fVertex + eidx0*vertexStride, 0.125f, vertexLength); + addWithWeight(dstVertex, fVertex + eidx1*vertexStride, 0.125f, vertexLength); + + if(varyingLength > 0){ + float *dstVarying = fVaryings + (i+offset-tableOffset)*varyingStride; + clear(dstVarying, varyingLength); + addWithWeight(dstVarying, fVaryings + p*varyingStride, 1.0f, varyingLength); + } + } +} + +template __global__ void +computeRestrictedVertexB1(float *fVertex, float *fVaryings, + const int *V0_ITa, const int *V0_IT, int offset, int tableOffset, int start, int end) +{ + DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + + int h = V0_ITa[5*i]; + int p = V0_ITa[5*i+2]; + + DeviceVertex dst; + dst.clear(); + dst.addWithWeight(&vertex[p], 0.5f); + + for (int j = 0; j < 8; ++j) + dst.addWithWeight(&vertex[V0_IT[h+j]], 0.0625f); + vertex[i+offset-tableOffset] = dst; + + if(NUM_VARYING_ELEMENTS > 0){ + DeviceVertex dstVarying; + dstVarying.clear(); + dstVarying.addWithWeight(&varyings[p], 1.0f); + varyings[i+offset-tableOffset] = dstVarying; + } + } +} + +__global__ void +computeRestrictedVertexB1(float *fVertex, float *fVarying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, + const int *V0_ITa, const int *V0_IT, int offset, int tableOffset, int start, int end) +{ + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + + int h = V0_ITa[5*i]; + int p = V0_ITa[5*i+2]; + + float *dstVertex = fVertex + (i+offset-tableOffset)*vertexStride; + clear(dstVertex, vertexLength); + addWithWeight(dstVertex, fVertex + p*vertexStride, 0.5f, vertexLength); + + for (int j = 0; j < 8; ++j) + addWithWeight(dstVertex, fVertex + V0_IT[h+j]*vertexStride, 0.0625f, vertexLength); + + if (varyingLength > 0) { + float *dstVarying = fVarying + (i+offset-tableOffset)*varyingStride; + clear(dstVarying, varyingLength); + addWithWeight(dstVarying, fVarying + p*varyingStride, 1.0f, varyingLength); + } + } +} + +template __global__ void +computeRestrictedVertexB2(float *fVertex, float *fVaryings, + const int *V0_ITa, const int *V0_IT, int offset, int tableOffset, int start, int end) +{ + DeviceVertex *vertex = (DeviceVertex*)fVertex; + DeviceVertex *varyings = (DeviceVertex*)fVaryings; + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + + int h = V0_ITa[5*i]; + int n = V0_ITa[5*i+1]; + int p = V0_ITa[5*i+2]; + + float wp = 1.0f/float(n*n); + float wv = (n-2.0f) * n * wp; + + DeviceVertex dst; + dst.clear(); + dst.addWithWeight(&vertex[p], wv); + + for (int j = 0; j < n; ++j) { + dst.addWithWeight(&vertex[V0_IT[h+j*2]], wp); + dst.addWithWeight(&vertex[V0_IT[h+j*2+1]], wp); + } + vertex[i+offset-tableOffset] = dst; + + if(NUM_VARYING_ELEMENTS > 0){ + DeviceVertex dstVarying; + dstVarying.clear(); + dstVarying.addWithWeight(&varyings[p], 1.0f); + varyings[i+offset-tableOffset] = dstVarying; + } + } +} + +__global__ void +computeRestrictedVertexB2(float *fVertex, float *fVarying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, + const int *V0_ITa, const int *V0_IT, int offset, int tableOffset, int start, int end) +{ + for (int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; + i < end + tableOffset; + i += blockDim.x * gridDim.x) { + + int h = V0_ITa[5*i]; + int n = V0_ITa[5*i+1]; + int p = V0_ITa[5*i+2]; + + float wp = 1.0f/float(n*n); + float wv = (n-2.0f) * n * wp; + + float *dstVertex = fVertex + (i+offset-tableOffset)*vertexStride; + clear(dstVertex, vertexLength); + addWithWeight(dstVertex, fVertex + p*vertexStride, wv, vertexLength); + + for (int j = 0; j < n; ++j) { + addWithWeight(dstVertex, fVertex + V0_IT[h+j*2]*vertexStride, wp, vertexLength); + addWithWeight(dstVertex, fVertex + V0_IT[h+j*2+1]*vertexStride, wp, vertexLength); + } + + if (varyingLength > 0) { + float *dstVarying = fVarying + (i+offset-tableOffset)*varyingStride; + clear(dstVarying, varyingLength); + addWithWeight(dstVarying, fVarying + p*varyingStride, 1.0f, varyingLength); + } + } +} // -------------------------------------------------------------------------------------------- @@ -960,6 +1146,54 @@ void OsdCudaComputeVertexB(float *vertex, float *varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end); } +void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, + int *V_ITa, int offset, int tableOffset, int start, int end, int pass) +{ +// computeRestrictedVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end); + OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + + computeRestrictedVertexA<<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, varyingStride, + V_ITa, offset, tableOffset, start, end); +} + +void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, + int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end) +{ +// computeRestrictedVertexB1<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end); + OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + + computeRestrictedVertexB1 <<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, varyingStride, + V_ITa, V_IT, offset, tableOffset, start, end); +} + +void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying, + int vertexLength, int vertexStride, + int varyingLength, int varyingStride, + int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end) +{ +// computeRestrictedVertexB2<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end); + OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + + computeRestrictedVertexB2 <<<512, 32>>>(vertex, varying, + vertexLength, vertexStride, varyingLength, varyingStride, + V_ITa, V_IT, offset, tableOffset, start, end); +} + void OsdCudaComputeLoopVertexB(float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, diff --git a/opensubdiv/osd/d3d11ComputeController.cpp b/opensubdiv/osd/d3d11ComputeController.cpp index 9894e9c9..f49ef38f 100755 --- a/opensubdiv/osd/d3d11ComputeController.cpp +++ b/opensubdiv/osd/d3d11ComputeController.cpp @@ -247,6 +247,42 @@ OsdD3D11ComputeController::ApplyCatmarkVertexVerticesKernelA2( _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); } +void +OsdD3D11ComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( + FarKernelBatch const &batch, OsdD3D11ComputeContext const *context) const { + + assert(context); + + _currentBindState.kernelBundle->ApplyCatmarkRestrictedVertexVerticesKernelB1( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); +} + +void +OsdD3D11ComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( + FarKernelBatch const &batch, OsdD3D11ComputeContext const *context) const { + + assert(context); + + _currentBindState.kernelBundle->ApplyCatmarkRestrictedVertexVerticesKernelB2( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); +} + +void +OsdD3D11ComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( + FarKernelBatch const &batch, OsdD3D11ComputeContext const *context) const { + + assert(context); + + _currentBindState.kernelBundle->ApplyCatmarkRestrictedVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd(), + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset); +} + void OsdD3D11ComputeController::ApplyLoopEdgeVerticesKernel( FarKernelBatch const &batch, OsdD3D11ComputeContext const *context) const { diff --git a/opensubdiv/osd/d3d11ComputeController.h b/opensubdiv/osd/d3d11ComputeController.h index 1d1bd327..2446091e 100755 --- a/opensubdiv/osd/d3d11ComputeController.h +++ b/opensubdiv/osd/d3d11ComputeController.h @@ -150,6 +150,12 @@ protected: void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const; diff --git a/opensubdiv/osd/d3d11KernelBundle.cpp b/opensubdiv/osd/d3d11KernelBundle.cpp index 90c48ec5..b4ff03e7 100644 --- a/opensubdiv/osd/d3d11KernelBundle.cpp +++ b/opensubdiv/osd/d3d11KernelBundle.cpp @@ -60,6 +60,9 @@ OsdD3D11ComputeKernelBundle::OsdD3D11ComputeKernelBundle( _kernelComputeVertex(0), _kernelComputeVertexA(0), _kernelComputeCatmarkVertexB(0), + _kernelComputeCatmarkRestrictedVertexA(0), + _kernelComputeCatmarkRestrictedVertexB1(0), + _kernelComputeCatmarkRestrictedVertexB2(0), _kernelComputeLoopVertexB(0), _kernelEditAdd(0) { @@ -80,6 +83,9 @@ OsdD3D11ComputeKernelBundle::~OsdD3D11ComputeKernelBundle() { SAFE_RELEASE(_kernelComputeVertex); SAFE_RELEASE(_kernelComputeVertexA); SAFE_RELEASE(_kernelComputeCatmarkVertexB); + SAFE_RELEASE(_kernelComputeCatmarkRestrictedVertexA); + SAFE_RELEASE(_kernelComputeCatmarkRestrictedVertexB1); + SAFE_RELEASE(_kernelComputeCatmarkRestrictedVertexB2); SAFE_RELEASE(_kernelComputeLoopVertexB); SAFE_RELEASE(_kernelEditAdd); } @@ -193,6 +199,15 @@ OsdD3D11ComputeKernelBundle::Compile( _classLinkage->GetClassInstance( "catmarkComputeVertexB", 0, &_kernelComputeCatmarkVertexB); assert(_kernelComputeCatmarkVertexB); + _classLinkage->GetClassInstance( + "catmarkComputeRestrictedVertexA", 0, &_kernelComputeCatmarkRestrictedVertexA); + assert(_kernelComputeCatmarkRestrictedVertexA); + _classLinkage->GetClassInstance( + "catmarkComputeRestrictedVertexB1", 0, &_kernelComputeCatmarkRestrictedVertexB1); + assert(_kernelComputeCatmarkRestrictedVertexB1); + _classLinkage->GetClassInstance( + "catmarkComputeRestrictedVertexB2", 0, &_kernelComputeCatmarkRestrictedVertexB2); + assert(_kernelComputeCatmarkRestrictedVertexA); _classLinkage->GetClassInstance( "loopComputeVertexB", 0, &_kernelComputeLoopVertexB); assert(_kernelComputeLoopVertexB); @@ -414,6 +429,54 @@ OsdD3D11ComputeKernelBundle::ApplyCatmarkVertexVerticesKernelA( dispatchCompute(_kernelComputeVertexA, args); } +void +OsdD3D11ComputeKernelBundle::ApplyCatmarkRestrictedVertexVerticesKernelB1( + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { + + KernelCB args; + ZeroMemory(&args, sizeof(args)); + args.vertexOffset = vertexOffset; + args.tableOffset = tableOffset; + args.indexStart = start; + args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; + dispatchCompute(_kernelComputeCatmarkRestrictedVertexB1, args); +} + +void +OsdD3D11ComputeKernelBundle::ApplyCatmarkRestrictedVertexVerticesKernelB2( + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { + + KernelCB args; + ZeroMemory(&args, sizeof(args)); + args.vertexOffset = vertexOffset; + args.tableOffset = tableOffset; + args.indexStart = start; + args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; + dispatchCompute(_kernelComputeCatmarkRestrictedVertexB2, args); +} + +void +OsdD3D11ComputeKernelBundle::ApplyCatmarkRestrictedVertexVerticesKernelA( + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset) { + + KernelCB args; + ZeroMemory(&args, sizeof(args)); + args.vertexOffset = vertexOffset; + args.tableOffset = tableOffset; + args.indexStart = start; + args.indexEnd = end; + args.vertexBaseOffset = vertexBaseOffset; + args.varyingBaseOffset = varyingBaseOffset; + dispatchCompute(_kernelComputeCatmarkRestrictedVertexA, args); +} + void OsdD3D11ComputeKernelBundle::ApplyLoopEdgeVerticesKernel( int vertexOffset, int tableOffset, int start, int end, diff --git a/opensubdiv/osd/d3d11KernelBundle.h b/opensubdiv/osd/d3d11KernelBundle.h index 56a27480..78a2476f 100755 --- a/opensubdiv/osd/d3d11KernelBundle.h +++ b/opensubdiv/osd/d3d11KernelBundle.h @@ -91,6 +91,18 @@ public: int vertexOffset, int tableOffset, int start, int end, bool pass, int vertexBaseOffset, int varyingBaseOffset); + void ApplyCatmarkRestrictedVertexVerticesKernelB1( + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); + + void ApplyCatmarkRestrictedVertexVerticesKernelB2( + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); + + void ApplyCatmarkRestrictedVertexVerticesKernelA( + int vertexOffset, int tableOffset, int start, int end, + int vertexBaseOffset, int varyingBaseOffset); + void ApplyLoopEdgeVerticesKernel( int vertexOffset, int tableOffset, int start, int end, int vertexBaseOffset, int varyingBaseOffset); @@ -160,6 +172,12 @@ protected: ID3D11ClassInstance * _kernelComputeCatmarkVertexB; // vertex-vertex kernel B (catmark scheme) + ID3D11ClassInstance * _kernelComputeCatmarkRestrictedVertexA; // restricted vertex-vertex kernel A (catmark scheme) + + ID3D11ClassInstance * _kernelComputeCatmarkRestrictedVertexB1; // restricted vertex-vertex kernel B1 (catmark scheme) + + ID3D11ClassInstance * _kernelComputeCatmarkRestrictedVertexB2; // restricted vertex-vertex kernel B2 (catmark scheme) + ID3D11ClassInstance * _kernelComputeLoopVertexB; // vertex-vertex kernel B (loop scheme) ID3D11ClassInstance * _kernelEditAdd; // hedit kernel (add) diff --git a/opensubdiv/osd/gcdComputeController.cpp b/opensubdiv/osd/gcdComputeController.cpp index 3f2154f8..3239123e 100644 --- a/opensubdiv/osd/gcdComputeController.cpp +++ b/opensubdiv/osd/gcdComputeController.cpp @@ -195,6 +195,50 @@ OsdGcdComputeController::ApplyCatmarkVertexVerticesKernelA2( _gcd_queue); } +void +OsdGcdComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdGcdComputeRestrictedVertexB1( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), + _gcd_queue); +} + +void +OsdGcdComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdGcdComputeRestrictedVertexB2( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), + _gcd_queue); +} + +void +OsdGcdComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdGcdComputeRestrictedVertexA( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), + _gcd_queue); +} + void OsdGcdComputeController::ApplyLoopEdgeVerticesKernel( FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { diff --git a/opensubdiv/osd/gcdComputeController.h b/opensubdiv/osd/gcdComputeController.h index c7046b13..ecf24ee7 100644 --- a/opensubdiv/osd/gcdComputeController.h +++ b/opensubdiv/osd/gcdComputeController.h @@ -134,6 +134,11 @@ protected: void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, ComputeContext const *context) const; void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const; diff --git a/opensubdiv/osd/gcdKernel.cpp b/opensubdiv/osd/gcdKernel.cpp index 2db1ac60..af777189 100644 --- a/opensubdiv/osd/gcdKernel.cpp +++ b/opensubdiv/osd/gcdKernel.cpp @@ -222,6 +222,78 @@ void OsdGcdComputeVertexB( vertexOffset, tableOffset, start_e, end_e); } +void OsdGcdComputeRestrictedVertexA( + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, + int vertexOffset, int tableOffset, int start, int end, + dispatch_queue_t gcdq) { + + const int workSize = end-start; + dispatch_apply(workSize/GCD_WORK_STRIDE, gcdq, ^(size_t blockIdx){ + const int start_i = start + blockIdx*GCD_WORK_STRIDE; + const int end_i = start_i + GCD_WORK_STRIDE; + OsdCpuComputeRestrictedVertexA(vertex, varying, vertexDesc, varyingDesc, + V_ITa, + vertexOffset, tableOffset, start_i, end_i); + }); + const int start_e = end - workSize%GCD_WORK_STRIDE; + const int end_e = end; + if (start_e < end_e) + OsdCpuComputeRestrictedVertexA(vertex, varying, vertexDesc, varyingDesc, + V_ITa, + vertexOffset, tableOffset, start_e, end_e); +} + +void OsdGcdComputeRestrictedVertexB1( + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, int start, int end, + dispatch_queue_t gcdq) { + + const int workSize = end-start; + dispatch_apply(workSize/GCD_WORK_STRIDE, gcdq, ^(size_t blockIdx){ + const int start_i = start + blockIdx*GCD_WORK_STRIDE; + const int end_i = start_i + GCD_WORK_STRIDE; + OsdCpuComputeRestrictedVertexB1(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_IT, + vertexOffset, tableOffset, start_i, end_i); + }); + const int start_e = end - workSize%GCD_WORK_STRIDE; + const int end_e = end; + if (start_e < end_e) + OsdCpuComputeRestrictedVertexB1(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_IT, + vertexOffset, tableOffset, start_e, end_e); +} + +void OsdGcdComputeRestrictedVertexB2( + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, int start, int end, + dispatch_queue_t gcdq) { + + const int workSize = end-start; + dispatch_apply(workSize/GCD_WORK_STRIDE, gcdq, ^(size_t blockIdx){ + const int start_i = start + blockIdx*GCD_WORK_STRIDE; + const int end_i = start_i + GCD_WORK_STRIDE; + OsdCpuComputeRestrictedVertexB2(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_IT, + vertexOffset, tableOffset, start_i, end_i); + }); + const int start_e = end - workSize%GCD_WORK_STRIDE; + const int end_e = end; + if (start_e < end_e) + OsdCpuComputeRestrictedVertexB2(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_IT, + vertexOffset, tableOffset, start_e, end_e); +} + void OsdGcdComputeLoopVertexB( float * vertex, float * varying, OsdVertexBufferDescriptor const &vertexDesc, diff --git a/opensubdiv/osd/gcdKernel.h b/opensubdiv/osd/gcdKernel.h index 2cbb0769..8aa04e0d 100644 --- a/opensubdiv/osd/gcdKernel.h +++ b/opensubdiv/osd/gcdKernel.h @@ -90,6 +90,30 @@ void OsdGcdComputeVertexB(float *vertex, float * varying, int start, int end, dispatch_queue_t gcdq); +void OsdGcdComputeRestrictedVertexA(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, + int vertexOffset, int tableOffset, + int start, int end, + dispatch_queue_t gcdq); + +void OsdGcdComputeRestrictedVertexB1(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, + int start, int end, + dispatch_queue_t gcdq); + +void OsdGcdComputeRestrictedVertexB2(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, + int start, int end, + dispatch_queue_t gcdq); + void OsdGcdComputeLoopVertexB(float *vertex, float * varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, diff --git a/opensubdiv/osd/glslComputeController.cpp b/opensubdiv/osd/glslComputeController.cpp index d13f6504..3508b972 100644 --- a/opensubdiv/osd/glslComputeController.cpp +++ b/opensubdiv/osd/glslComputeController.cpp @@ -216,6 +216,39 @@ OsdGLSLComputeController::ApplyCatmarkVertexVerticesKernelA2( batch.GetStart(), batch.GetEnd(), true); } +void +OsdGLSLComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( + FarKernelBatch const &batch, OsdGLSLComputeContext const *context) const { + + assert(context); + + _currentBindState.kernelBundle->ApplyCatmarkRestrictedVertexVerticesKernelB1( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); +} + +void +OsdGLSLComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( + FarKernelBatch const &batch, OsdGLSLComputeContext const *context) const { + + assert(context); + + _currentBindState.kernelBundle->ApplyCatmarkRestrictedVertexVerticesKernelB2( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); +} + +void +OsdGLSLComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( + FarKernelBatch const &batch, OsdGLSLComputeContext const *context) const { + + assert(context); + + _currentBindState.kernelBundle->ApplyCatmarkRestrictedVertexVerticesKernelA( + batch.GetVertexOffset(), batch.GetTableOffset(), + batch.GetStart(), batch.GetEnd()); +} + void OsdGLSLComputeController::ApplyLoopEdgeVerticesKernel( FarKernelBatch const &batch, OsdGLSLComputeContext const *context) const { diff --git a/opensubdiv/osd/glslComputeController.h b/opensubdiv/osd/glslComputeController.h index 1a95508b..6b095f7d 100644 --- a/opensubdiv/osd/glslComputeController.h +++ b/opensubdiv/osd/glslComputeController.h @@ -143,6 +143,12 @@ protected: void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const; diff --git a/opensubdiv/osd/glslComputeKernel.glsl b/opensubdiv/osd/glslComputeKernel.glsl index 3a863a89..60951e36 100644 --- a/opensubdiv/osd/glslComputeKernel.glsl +++ b/opensubdiv/osd/glslComputeKernel.glsl @@ -416,6 +416,82 @@ void catmarkComputeVertexB() writeVertex(vid, dst); } +// Restricted vertex-vertices compute Kernels 'A' / k_Crease and k_Corner rules +subroutine(computeKernelType) +void catmarkComputeRestrictedVertexA() +{ + int i = int(gl_GlobalInvocationID.x) + indexStart; + if (i >= indexEnd) return; + int vid = i + vertexOffset; + i += tableOffset; + + int p = _V_ITa[5*i+2]; + int eidx0 = _V_ITa[5*i+3]; + int eidx1 = _V_ITa[5*i+4]; + + Vertex dst; + clear(dst); + + addWithWeight(dst, readVertex(p), 0.75f); + addWithWeight(dst, readVertex(eidx0), 0.125f); + addWithWeight(dst, readVertex(eidx1), 0.125f); + addVaryingWithWeight(dst, readVertex(p), 1); + + writeVertex(vid, dst); +} + +// Vertex-vertices compute Kernels 'B' / regular k_Dart and k_Smooth rules +subroutine(computeKernelType) +void catmarkComputeRestrictedVertexB1() +{ + int i = int(gl_GlobalInvocationID.x) + indexStart; + if (i >= indexEnd) return; + int vid = i + vertexOffset; + i += tableOffset; + + int h = _V_ITa[5*i]; + int p = _V_ITa[5*i+2]; + + Vertex dst; + clear(dst); + + addWithWeight(dst, readVertex(p), 0.5f); + + for(int j = 0; j < 8; ++j) + addWithWeight(dst, readVertex(_V_IT[h+j]), 0.0625f); + addVaryingWithWeight(dst, readVertex(p), 1); + writeVertex(vid, dst); +} + +// Vertex-vertices compute Kernels 'B' / irregular k_Dart and k_Smooth rules +subroutine(computeKernelType) +void catmarkComputeRestrictedVertexB2() +{ + int i = int(gl_GlobalInvocationID.x) + indexStart; + if (i >= indexEnd) return; + int vid = i + vertexOffset; + i += tableOffset; + + int h = _V_ITa[5*i]; + int n = _V_ITa[5*i+1]; + int p = _V_ITa[5*i+2]; + + float wp = 1.0/float(n*n); + float wv = (n-2.0) * n * wp; + + Vertex dst; + clear(dst); + + addWithWeight(dst, readVertex(p), wv); + + for(int j = 0; j < n; ++j){ + addWithWeight(dst, readVertex(_V_IT[h+j*2]), wp); + addWithWeight(dst, readVertex(_V_IT[h+j*2+1]), wp); + } + addVaryingWithWeight(dst, readVertex(p), 1); + writeVertex(vid, dst); +} + // Vertex-vertices compute Kernels 'B' / k_Dart and k_Smooth rules subroutine(computeKernelType) void loopComputeVertexB() diff --git a/opensubdiv/osd/glslKernelBundle.cpp b/opensubdiv/osd/glslKernelBundle.cpp index 31fcacad..d7ede402 100644 --- a/opensubdiv/osd/glslKernelBundle.cpp +++ b/opensubdiv/osd/glslKernelBundle.cpp @@ -114,36 +114,45 @@ OsdGLSLComputeKernelBundle::Compile( glDeleteShader(shader); - _subComputeFace = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "catmarkComputeFace"); - _subComputeQuadFace = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "catmarkComputeQuadFace"); - _subComputeTriQuadFace = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "catmarkComputeTriQuadFace"); - _subComputeEdge = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "catmarkComputeEdge"); - _subComputeRestrictedEdge = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "catmarkComputeRestrictedEdge"); - _subComputeBilinearEdge = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "bilinearComputeEdge"); - _subComputeVertex = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "bilinearComputeVertex"); - _subComputeVertexA = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "catmarkComputeVertexA"); - _subComputeCatmarkVertexB = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "catmarkComputeVertexB"); - _subComputeLoopVertexB = glGetSubroutineIndex(_program, - GL_COMPUTE_SHADER, - "loopComputeVertexB"); + _subComputeFace = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeFace"); + _subComputeQuadFace = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeQuadFace"); + _subComputeTriQuadFace = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeTriQuadFace"); + _subComputeEdge = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeEdge"); + _subComputeRestrictedEdge = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeRestrictedEdge"); + _subComputeRestrictedVertexA = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeRestrictedVertexA"); + _subComputeRestrictedVertexB1 = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeRestrictedVertexB1"); + _subComputeRestrictedVertexB2 = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeRestrictedVertexB2"); + _subComputeBilinearEdge = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "bilinearComputeEdge"); + _subComputeVertex = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "bilinearComputeVertex"); + _subComputeVertexA = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeVertexA"); + _subComputeCatmarkVertexB = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "catmarkComputeVertexB"); + _subComputeLoopVertexB = glGetSubroutineIndex(_program, + GL_COMPUTE_SHADER, + "loopComputeVertexB"); // set uniform locations for compute _uniformVertexPass = glGetUniformLocation(_program, "vertexPass"); @@ -287,6 +296,30 @@ OsdGLSLComputeKernelBundle::ApplyCatmarkVertexVerticesKernelA( dispatchCompute(vertexOffset, tableOffset, start, end); } +void +OsdGLSLComputeKernelBundle::ApplyCatmarkRestrictedVertexVerticesKernelB1( + int vertexOffset, int tableOffset, int start, int end) { + + glUniformSubroutinesuiv(GL_COMPUTE_SHADER, 1, &_subComputeRestrictedVertexB1); + dispatchCompute(vertexOffset, tableOffset, start, end); +} + +void +OsdGLSLComputeKernelBundle::ApplyCatmarkRestrictedVertexVerticesKernelB2( + int vertexOffset, int tableOffset, int start, int end) { + + glUniformSubroutinesuiv(GL_COMPUTE_SHADER, 1, &_subComputeRestrictedVertexB2); + dispatchCompute(vertexOffset, tableOffset, start, end); +} + +void +OsdGLSLComputeKernelBundle::ApplyCatmarkRestrictedVertexVerticesKernelA( + int vertexOffset, int tableOffset, int start, int end) { + + glUniformSubroutinesuiv(GL_COMPUTE_SHADER, 1, &_subComputeRestrictedVertexA); + dispatchCompute(vertexOffset, tableOffset, start, end); +} + void OsdGLSLComputeKernelBundle::ApplyLoopEdgeVerticesKernel( int vertexOffset, int tableOffset, int start, int end) { diff --git a/opensubdiv/osd/glslKernelBundle.h b/opensubdiv/osd/glslKernelBundle.h index 2235fcc8..20a0e7eb 100644 --- a/opensubdiv/osd/glslKernelBundle.h +++ b/opensubdiv/osd/glslKernelBundle.h @@ -78,6 +78,15 @@ public: void ApplyLoopEdgeVerticesKernel( int vertexOffset, int tableOffset, int start, int end); + void ApplyCatmarkRestrictedVertexVerticesKernelB1( + int vertexOffset, int tableOffset, int start, int end); + + void ApplyCatmarkRestrictedVertexVerticesKernelB2( + int vertexOffset, int tableOffset, int start, int end); + + void ApplyCatmarkRestrictedVertexVerticesKernelA( + int vertexOffset, int tableOffset, int start, int end); + void ApplyLoopVertexVerticesKernelB( int vertexOffset, int tableOffset, int start, int end); @@ -147,6 +156,12 @@ protected: GLuint _subComputeRestrictedEdge; // restricted edge-vertex kernel (catmark scheme) + GLuint _subComputeRestrictedVertexA; // restricted vertex-vertex kernel A (catmark scheme) + + GLuint _subComputeRestrictedVertexB1; // restricted vertex-vertex kernel B1 (catmark scheme) + + GLuint _subComputeRestrictedVertexB2; // restricted vertex-vertex kernel B2 (catmark scheme) + GLuint _subComputeBilinearEdge; // edge-vertex kernel (bilinear scheme) GLuint _subComputeVertex; // vertex-vertex kernel (bilinear scheme) diff --git a/opensubdiv/osd/glslTransformFeedbackComputeController.cpp b/opensubdiv/osd/glslTransformFeedbackComputeController.cpp index cf439534..70adbf07 100644 --- a/opensubdiv/osd/glslTransformFeedbackComputeController.cpp +++ b/opensubdiv/osd/glslTransformFeedbackComputeController.cpp @@ -291,6 +291,42 @@ OsdGLSLTransformFeedbackComputeController::ApplyCatmarkVertexVerticesKernelA2( batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); } +void +OsdGLSLTransformFeedbackComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( + FarKernelBatch const &batch, OsdGLSLTransformFeedbackComputeContext const *context) const { + + assert(context); + + _currentBindState.kernelBundle->ApplyCatmarkRestrictedVertexVerticesKernelB1( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdGLSLTransformFeedbackComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( + FarKernelBatch const &batch, OsdGLSLTransformFeedbackComputeContext const *context) const { + + assert(context); + + _currentBindState.kernelBundle->ApplyCatmarkRestrictedVertexVerticesKernelB2( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdGLSLTransformFeedbackComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( + FarKernelBatch const &batch, OsdGLSLTransformFeedbackComputeContext const *context) const { + + assert(context); + + _currentBindState.kernelBundle->ApplyCatmarkRestrictedVertexVerticesKernelA( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset, + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + void OsdGLSLTransformFeedbackComputeController::ApplyLoopEdgeVerticesKernel( FarKernelBatch const &batch, OsdGLSLTransformFeedbackComputeContext const *context) const { diff --git a/opensubdiv/osd/glslTransformFeedbackComputeController.h b/opensubdiv/osd/glslTransformFeedbackComputeController.h index 2f3c8189..8c30e0f8 100644 --- a/opensubdiv/osd/glslTransformFeedbackComputeController.h +++ b/opensubdiv/osd/glslTransformFeedbackComputeController.h @@ -141,6 +141,12 @@ protected: void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const; diff --git a/opensubdiv/osd/glslTransformFeedbackKernel.glsl b/opensubdiv/osd/glslTransformFeedbackKernel.glsl index f593faa2..a6749749 100644 --- a/opensubdiv/osd/glslTransformFeedbackKernel.glsl +++ b/opensubdiv/osd/glslTransformFeedbackKernel.glsl @@ -433,6 +433,94 @@ void catmarkComputeVertexB() writeVertex(dst); } +// Restricted vertex-vertices compute Kernels 'A' / k_Crease and k_Corner rules +subroutine(computeKernelType) +void catmarkComputeRestrictedVertexA() +{ + int i = gl_VertexID + indexStart + tableOffset; + int vid = gl_VertexID + indexStart + vertexOffset; + + int p = texelFetch(_V0_ITa, 5*i+2).x; + int eidx0 = texelFetch(_V0_ITa, 5*i+3).x; + int eidx1 = texelFetch(_V0_ITa, 5*i+4).x; + + Vertex dst; + clear(dst); + + addWithWeight(dst, readVertex(p), 0.75f); + addWithWeight(dst, readVertex(eidx0), 0.125f); + addWithWeight(dst, readVertex(eidx1), 0.125f); + addVaryingWithWeight(dst, readVertex(p), 1.0f); + + writeVertex(dst); +} + +// Restricted vertex-vertices compute Kernels 'B' / regular k_Dart and k_Smooth rules +subroutine(computeKernelType) +void catmarkComputeRestrictedVertexB1() +{ + int i = gl_VertexID + indexStart + tableOffset; + + int h = texelFetch(_V0_ITa, 5*i).x; +#ifdef OPT_CATMARK_V_IT_VEC2 + int h2 = h/2; +#endif + int p = texelFetch(_V0_ITa, 5*i+2).x; + + Vertex dst; + clear(dst); + + addWithWeight(dst, readVertex(p), 0.5f); + + for(int j = 0; j < 4; ++j){ +#ifdef OPT_CATMARK_V_IT_VEC2 + ivec2 v0it = texelFetch(_V0_IT, h2+j).xy; + addWithWeight(dst, readVertex(v0it.x), 0.0625f); + addWithWeight(dst, readVertex(v0it.y), 0.0625f); +#else + addWithWeight(dst, readVertex(texelFetch(_V0_IT, h+j*2).x), 0.0625f); + addWithWeight(dst, readVertex(texelFetch(_V0_IT, h+j*2+1).x), 0.0625f); +#endif + } + addVaryingWithWeight(dst, readVertex(p), 1.0f); + writeVertex(dst); +} + +// Restricted vertex-vertices compute Kernels 'B' / irregular k_Dart and k_Smooth rules +subroutine(computeKernelType) +void catmarkComputeRestrictedVertexB2() +{ + int i = gl_VertexID + indexStart + tableOffset; + + int h = texelFetch(_V0_ITa, 5*i).x; +#ifdef OPT_CATMARK_V_IT_VEC2 + int h2 = h/2; +#endif + int n = texelFetch(_V0_ITa, 5*i+1).x; + int p = texelFetch(_V0_ITa, 5*i+2).x; + + float wp = 1.0/float(n*n); + float wv = (n-2.0) * n * wp; + + Vertex dst; + clear(dst); + + addWithWeight(dst, readVertex(p), wv); + + for(int j = 0; j < n; ++j){ +#ifdef OPT_CATMARK_V_IT_VEC2 + ivec2 v0it = texelFetch(_V0_IT, h2+j).xy; + addWithWeight(dst, readVertex(v0it.x), wp); + addWithWeight(dst, readVertex(v0it.y), wp); +#else + addWithWeight(dst, readVertex(texelFetch(_V0_IT, h+j*2).x), wp); + addWithWeight(dst, readVertex(texelFetch(_V0_IT, h+j*2+1).x), wp); +#endif + } + addVaryingWithWeight(dst, readVertex(p), 1.0f); + writeVertex(dst); +} + // Vertex-vertices compute Kernels 'B' / k_Dart and k_Smooth rules subroutine(computeKernelType) void loopComputeVertexB() diff --git a/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp b/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp index fb125a69..faba4f1a 100644 --- a/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp +++ b/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp @@ -233,6 +233,9 @@ OsdGLSLTransformFeedbackKernelBundle::Compile( _subComputeVertex = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "bilinearComputeVertex"); _subComputeVertexA = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "catmarkComputeVertexA"); _subComputeCatmarkVertexB = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "catmarkComputeVertexB"); + _subComputeRestrictedVertexA = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "catmarkComputeRestrictedVertexA"); + _subComputeRestrictedVertexB1 = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "catmarkComputeRestrictedVertexB1"); + _subComputeRestrictedVertexB2 = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "catmarkComputeRestrictedVertexB2"); _subComputeLoopVertexB = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "loopComputeVertexB"); _uniformVertexPass = glGetUniformLocation(_program, "vertexPass"); @@ -444,6 +447,42 @@ OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkVertexVerticesKernelA( offset, tableOffset, start, end); } +void +OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkRestrictedVertexVerticesKernelB1( + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { + + glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeRestrictedVertexB1); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); +} + +void +OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkRestrictedVertexVerticesKernelB2( + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { + + glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeRestrictedVertexB2); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); +} + +void +OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkRestrictedVertexVerticesKernelA( + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end) { + + glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeRestrictedVertexA); + transformGpuBufferData(vertexBuffer, varyingBuffer, + vertexOffset, varyingOffset, + offset, tableOffset, start, end); +} + void OsdGLSLTransformFeedbackKernelBundle::ApplyLoopEdgeVerticesKernel( GLuint vertexBuffer, GLuint varyingBuffer, diff --git a/opensubdiv/osd/glslTransformFeedbackKernelBundle.h b/opensubdiv/osd/glslTransformFeedbackKernelBundle.h index e354aa4e..43e7191e 100644 --- a/opensubdiv/osd/glslTransformFeedbackKernelBundle.h +++ b/opensubdiv/osd/glslTransformFeedbackKernelBundle.h @@ -98,6 +98,21 @@ public: int vertexOffset, int varyingOffset, int offset, int tableOffset, int start, int end, bool pass); + void ApplyCatmarkRestrictedVertexVerticesKernelB1( + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); + + void ApplyCatmarkRestrictedVertexVerticesKernelB2( + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); + + void ApplyCatmarkRestrictedVertexVerticesKernelA( + GLuint vertexBuffer, GLuint varyingBuffer, + int vertexOffset, int varyingOffset, + int offset, int tableOffset, int start, int end); + void ApplyLoopEdgeVerticesKernel( GLuint vertexBuffer, GLuint varyingBuffer, int vertexOffset, int varyingOffset, @@ -212,6 +227,12 @@ protected: GLuint _subComputeCatmarkVertexB;// vertex-vertex kernel B (catmark scheme) + GLuint _subComputeRestrictedVertexA; // restricted vertex-vertex kernel A (catmark scheme) + + GLuint _subComputeRestrictedVertexB1; // restricted vertex-vertex kernel B1 (catmark scheme) + + GLuint _subComputeRestrictedVertexB2; // restricted vertex-vertex kernel B2 (catmark scheme) + GLuint _subComputeLoopVertexB; // vertex-vertex kernel B (loop scheme) GLuint _subEditAdd; // hedit kernel (add) diff --git a/opensubdiv/osd/hlslComputeKernel.hlsl b/opensubdiv/osd/hlslComputeKernel.hlsl index 8660df2f..fc5df746 100644 --- a/opensubdiv/osd/hlslComputeKernel.hlsl +++ b/opensubdiv/osd/hlslComputeKernel.hlsl @@ -427,6 +427,86 @@ void runKernel( uint3 ID ) } }; +// Restricted vertex-vertices compute Kernels 'A' / k_Crease and k_Corner rules +class CatmarkComputeRestrictedVertexA : IComputeKernel { +int placeholder; +void runKernel( uint3 ID ) +{ + int i = int(ID.x) + indexStart; + if (i >= indexEnd) return; + int vid = i + vertexOffset; + i += tableOffset; + + int p = _V_ITa[5*i+2]; + int eidx0 = _V_ITa[5*i+3]; + int eidx1 = _V_ITa[5*i+4]; + + Vertex dst; + clear(dst); + + addWithWeight(dst, readVertex(p), 0.75f); + addWithWeight(dst, readVertex(eidx0), 0.125f); + addWithWeight(dst, readVertex(eidx1), 0.125f); + addVaryingWithWeight(dst, readVertex(p), 1); + + writeVertex(vid, dst); +} +}; + +// Restricted vertex-vertices compute Kernels 'B' / regular k_Dart and k_Smooth rules +class CatmarkComputeRestrictedVertexB1 : IComputeKernel { +int placeholder; +void runKernel( uint3 ID ) +{ + int i = int(ID.x) + indexStart; + if (i >= indexEnd) return; + int vid = i + vertexOffset; + i += tableOffset; + + int h = _V_ITa[5*i]; + int p = _V_ITa[5*i+2]; + + Vertex dst; + clear(dst); + + addWithWeight(dst, readVertex(p), 0.5f); + for(int j = 0; j < 8; ++j) + addWithWeight(dst, readVertex(_V_IT[h+j]), 0.0625f); + addVaryingWithWeight(dst, readVertex(p), 1); + writeVertex(vid, dst); +} +}; + +// Restricted vertex-vertices compute Kernels 'B' / irregular k_Dart and k_Smooth rules +class CatmarkComputeRestrictedVertexB2 : IComputeKernel { +int placeholder; +void runKernel( uint3 ID ) +{ + int i = int(ID.x) + indexStart; + if (i >= indexEnd) return; + int vid = i + vertexOffset; + i += tableOffset; + + int h = _V_ITa[5*i]; + int n = _V_ITa[5*i+1]; + int p = _V_ITa[5*i+2]; + + float wp = 1.0/float(n*n); + float wv = (n-2.0) * n * wp; + + Vertex dst; + clear(dst); + + addWithWeight(dst, readVertex(p), wv); + for(int j = 0; j < n; ++j){ + addWithWeight(dst, readVertex(_V_IT[h+j*2]), wp); + addWithWeight(dst, readVertex(_V_IT[h+j*2+1]), wp); + } + addVaryingWithWeight(dst, readVertex(p), 1); + writeVertex(vid, dst); +} +}; + // Vertex-vertices compute Kernels 'B' / k_Dart and k_Smooth rules class LoopComputeVertexB : IComputeKernel { int placeholder; @@ -493,6 +573,9 @@ BilinearComputeEdge bilinearComputeEdge; BilinearComputeVertex bilinearComputeVertex; CatmarkComputeVertexA catmarkComputeVertexA; CatmarkComputeVertexB catmarkComputeVertexB; +CatmarkComputeRestrictedVertexA catmarkComputeRestrictedVertexA; +CatmarkComputeRestrictedVertexB1 catmarkComputeRestrictedVertexB1; +CatmarkComputeRestrictedVertexB2 catmarkComputeRestrictedVertexB2; LoopComputeVertexB loopComputeVertexB; EditAdd editAdd; diff --git a/opensubdiv/osd/ompComputeController.cpp b/opensubdiv/osd/ompComputeController.cpp index 10e78207..2170a42e 100644 --- a/opensubdiv/osd/ompComputeController.cpp +++ b/opensubdiv/osd/ompComputeController.cpp @@ -190,6 +190,47 @@ OsdOmpComputeController::ApplyCatmarkVertexVerticesKernelA2( batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); } +void +OsdOmpComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdOmpComputeRestrictedVertexB1( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdOmpComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdOmpComputeRestrictedVertexB2( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdOmpComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdOmpComputeRestrictedVertexA( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + void OsdOmpComputeController::ApplyLoopEdgeVerticesKernel( FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { diff --git a/opensubdiv/osd/ompComputeController.h b/opensubdiv/osd/ompComputeController.h index ad425b2a..fae794e8 100644 --- a/opensubdiv/osd/ompComputeController.h +++ b/opensubdiv/osd/ompComputeController.h @@ -142,6 +142,12 @@ protected: void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const; diff --git a/opensubdiv/osd/ompKernel.cpp b/opensubdiv/osd/ompKernel.cpp index 1c094dae..0fcd6c0c 100644 --- a/opensubdiv/osd/ompKernel.cpp +++ b/opensubdiv/osd/ompKernel.cpp @@ -386,6 +386,126 @@ void OsdOmpComputeVertexB( } } +void OsdOmpComputeRestrictedVertexA( + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, + int offset, int tableOffset, int start, int end) { + + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + +#pragma omp parallel for + for (int i = start + tableOffset; i < end + tableOffset; i++) { + int p = V_ITa[5*i+2]; + int eidx0 = V_ITa[5*i+3]; + int eidx1 = V_ITa[5*i+4]; + + int dstIndex = offset + i - tableOffset; + + int threadId = omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + varyingDesc.length * threadId; + + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, p, 0.75f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx0, 0.125f, vertexDesc); + addWithWeight(vertexResults, vertex, eidx1, 0.125f, vertexDesc); + copy(vertex, vertexResults, dstIndex, vertexDesc); + + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } +} + +void OsdOmpComputeRestrictedVertexB1( + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int offset, int tableOffset, int start, int end) { + + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + +#pragma omp parallel for + for (int i = start + tableOffset; i < end + tableOffset; i++) { + int h = V_ITa[5*i]; + int p = V_ITa[5*i+2]; + + int dstIndex = offset + i - tableOffset; + + int threadId = omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + varyingDesc.length * threadId; + + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, p, 0.5f, vertexDesc); + + for (int j = 0; j < 8; ++j) + addWithWeight(vertexResults, vertex, V_IT[h+j], 0.0625f, vertexDesc); + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } +} + +void OsdOmpComputeRestrictedVertexB2( + float * vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int offset, int tableOffset, int start, int end) { + + int numThreads = omp_get_max_threads(); + float *vertexResultsArray = (float*)alloca(vertexDesc.length * sizeof(float) * numThreads); + float *varyingResultsArray = (float*)alloca(varyingDesc.length * sizeof(float) * numThreads); + +#pragma omp parallel for + for (int i = start + tableOffset; i < end + tableOffset; i++) { + int h = V_ITa[5*i]; + int n = V_ITa[5*i+1]; + int p = V_ITa[5*i+2]; + + float wp = 1.0f/static_cast(n*n); + float wv = (n-2.0f) * n * wp; + + int dstIndex = offset + i - tableOffset; + + int threadId = omp_get_thread_num(); + float *vertexResults = vertexResultsArray + + vertexDesc.length * threadId; + float *varyingResults = varyingResultsArray + + varyingDesc.length * threadId; + + clear(vertexResults, vertexDesc); + clear(varyingResults, varyingDesc); + + addWithWeight(vertexResults, vertex, p, wv, vertexDesc); + + for (int j = 0; j < n; ++j) { + addWithWeight(vertexResults, vertex, V_IT[h+j*2], wp, vertexDesc); + addWithWeight(vertexResults, vertex, V_IT[h+j*2+1], wp, vertexDesc); + } + addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); + + copy(vertex, vertexResults, dstIndex, vertexDesc); + copy(varying, varyingResults, dstIndex, varyingDesc); + } +} + void OsdOmpComputeLoopVertexB( float * vertex, float * varying, OsdVertexBufferDescriptor const &vertexDesc, diff --git a/opensubdiv/osd/ompKernel.h b/opensubdiv/osd/ompKernel.h index 6e56fbfa..280f1173 100644 --- a/opensubdiv/osd/ompKernel.h +++ b/opensubdiv/osd/ompKernel.h @@ -57,7 +57,7 @@ void OsdOmpComputeTriQuadFace(float * vertex, float * varying, void OsdOmpComputeEdge(float *vertex, float * varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, - const int *E_IT, const float *E_ITa, + const int *E_IT, const float *E_W, int vertexOffset, int tableOffset, int start, int end); @@ -71,7 +71,7 @@ void OsdOmpComputeRestrictedEdge(float *vertex, float * varying, void OsdOmpComputeVertexA(float *vertex, float * varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, - const int *V_ITa, const float *V_IT, + const int *V_ITa, const float *V_W, int vertexOffset, int tableOffset, int start, int end, int pass); @@ -82,6 +82,27 @@ void OsdOmpComputeVertexB(float *vertex, float * varying, int vertexOffset, int tableOffset, int start, int end); +void OsdOmpComputeRestrictedVertexA(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, + int vertexOffset, int tableOffset, + int start, int end); + +void OsdOmpComputeRestrictedVertexB1(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, + int start, int end); + +void OsdOmpComputeRestrictedVertexB2(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + const int *V_ITa, const int *V_IT, + int vertexOffset, int tableOffset, + int start, int end); + void OsdOmpComputeLoopVertexB(float *vertex, float * varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, diff --git a/opensubdiv/osd/tbbComputeController.cpp b/opensubdiv/osd/tbbComputeController.cpp index 405cc892..34ba5f87 100644 --- a/opensubdiv/osd/tbbComputeController.cpp +++ b/opensubdiv/osd/tbbComputeController.cpp @@ -196,6 +196,47 @@ OsdTbbComputeController::ApplyCatmarkVertexVerticesKernelA2( batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd(), true); } +void +OsdTbbComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdTbbComputeRestrictedVertexB1( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdTbbComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdTbbComputeRestrictedVertexB2( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + (const int*)context->GetTable(FarSubdivisionTables::V_IT)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + +void +OsdTbbComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( + FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { + + assert(context); + + OsdTbbComputeRestrictedVertexA( + _currentBindState.vertexBuffer, _currentBindState.varyingBuffer, + _currentBindState.vertexDesc, _currentBindState.varyingDesc, + (const int*)context->GetTable(FarSubdivisionTables::V_ITa)->GetBuffer(), + batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd()); +} + void OsdTbbComputeController::ApplyLoopEdgeVerticesKernel( FarKernelBatch const &batch, OsdCpuComputeContext const *context) const { diff --git a/opensubdiv/osd/tbbComputeController.h b/opensubdiv/osd/tbbComputeController.h index bc6b4fb2..fb5756c9 100644 --- a/opensubdiv/osd/tbbComputeController.h +++ b/opensubdiv/osd/tbbComputeController.h @@ -134,6 +134,12 @@ protected: void ApplyCatmarkVertexVerticesKernelA2(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyCatmarkRestrictedVertexVerticesKernelB1(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelB2(FarKernelBatch const &batch, ComputeContext const *context) const; + + void ApplyCatmarkRestrictedVertexVerticesKernelA(FarKernelBatch const &batch, ComputeContext const *context) const; + void ApplyLoopEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const; diff --git a/opensubdiv/osd/tbbKernel.cpp b/opensubdiv/osd/tbbKernel.cpp index fed5785a..51ee600e 100644 --- a/opensubdiv/osd/tbbKernel.cpp +++ b/opensubdiv/osd/tbbKernel.cpp @@ -658,6 +658,226 @@ void OsdTbbComputeVertexB( tbb::parallel_for(range, kernel); } +class TBBRestrictedVertexKernelA { + float *vertex; + float *varying; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + int const *V_ITa; + int vertexOffset; + int tableOffset; + +public: + void operator() (tbb::blocked_range const &r) const { + for (int i = r.begin() + tableOffset; i < r.end() + tableOffset; i++) { + int p = V_ITa[5*i+2]; + int eidx0 = V_ITa[5*i+3]; + int eidx1 = V_ITa[5*i+4]; + + int dstIndex = i + vertexOffset - tableOffset; + + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); + addWithWeight(vertex, dstIndex, p, 0.75f, vertexDesc); + addWithWeight(vertex, dstIndex, eidx0, 0.125f, vertexDesc); + addWithWeight(vertex, dstIndex, eidx1, 0.125f, vertexDesc); + addWithWeight(varying, dstIndex, p, 1.0f, varyingDesc); + } + } + + TBBRestrictedVertexKernelA(TBBRestrictedVertexKernelA const &other) + { + this->vertex = other.vertex; + this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; + this->V_ITa = other.V_ITa; + this->vertexOffset = other.vertexOffset; + this->tableOffset = other.tableOffset; + } + + TBBRestrictedVertexKernelA(float *vertex_in, + float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, + int const *V_ITa_in, + int vertexOffset_in, + int tableOffset_in) : + vertex (vertex_in), + varying(varying_in), + vertexDesc(vertexDesc_in), + varyingDesc(varyingDesc_in), + V_ITa (V_ITa_in), + vertexOffset(vertexOffset_in), + tableOffset(tableOffset_in) + {}; +}; + +void OsdTbbComputeRestrictedVertexA( + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + int const *V_ITa, int vertexOffset, int tableOffset, + int start, int end) { + tbb::blocked_range range(start, end, grain_size); + TBBRestrictedVertexKernelA kernel(vertex, varying, vertexDesc, varyingDesc, + V_ITa, + vertexOffset, tableOffset); + tbb::parallel_for(range, kernel); +} + +class TBBRestrictedVertexKernelB1 { + float *vertex; + float *varying; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + int const *V_ITa; + int const *V_IT; + int vertexOffset; + int tableOffset; + +public: + void operator() (tbb::blocked_range const &r) const { + for (int i = r.begin() + tableOffset; i < r.end() + tableOffset; i++) { + int h = V_ITa[5*i]; + int p = V_ITa[5*i+2]; + + int dstIndex = i + vertexOffset - tableOffset; + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); + + addWithWeight(vertex, dstIndex, p, 0.5f, vertexDesc); + + for (int j = 0; j < 8; ++j) + addWithWeight(vertex, dstIndex, V_IT[h+j], 0.0625f, vertexDesc); + addWithWeight(varying, dstIndex, p, 1.0f, varyingDesc); + } + } + + TBBRestrictedVertexKernelB1(TBBRestrictedVertexKernelB1 const &other) + { + this->vertex = other.vertex; + this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; + this->V_ITa = other.V_ITa; + this->V_IT = other.V_IT; + this->vertexOffset = other.vertexOffset; + this->tableOffset = other.tableOffset; + } + + TBBRestrictedVertexKernelB1(float *vertex_in, + float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, + int const *V_ITa_in, + int const *V_IT_in, + int vertexOffset_in, + int tableOffset_in) : + vertex (vertex_in), + varying(varying_in), + vertexDesc(vertexDesc_in), + varyingDesc(varyingDesc_in), + V_ITa (V_ITa_in), + V_IT (V_IT_in), + vertexOffset(vertexOffset_in), + tableOffset(tableOffset_in) + {}; +}; + +void OsdTbbComputeRestrictedVertexB1( + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + int const *V_ITa, int const *V_IT, + int vertexOffset, int tableOffset, int start, int end) { + + tbb::blocked_range range(start, end, grain_size); + TBBRestrictedVertexKernelB1 kernel(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_IT, + vertexOffset, tableOffset); + tbb::parallel_for(range, kernel); +} + +class TBBRestrictedVertexKernelB2 { + float *vertex; + float *varying; + OsdVertexBufferDescriptor vertexDesc; + OsdVertexBufferDescriptor varyingDesc; + int const *V_ITa; + int const *V_IT; + int vertexOffset; + int tableOffset; + +public: + void operator() (tbb::blocked_range const &r) const { + for (int i = r.begin() + tableOffset; i < r.end() + tableOffset; i++) { + int h = V_ITa[5*i]; + int n = V_ITa[5*i+1]; + int p = V_ITa[5*i+2]; + + float wp = 1.0f/static_cast(n*n); + float wv = (n-2.0f) * n * wp; + + int dstIndex = i + vertexOffset - tableOffset; + clear(vertex, dstIndex, vertexDesc); + clear(varying, dstIndex, varyingDesc); + + addWithWeight(vertex, dstIndex, p, wv, vertexDesc); + + for (int j = 0; j < n; ++j) { + addWithWeight(vertex, dstIndex, V_IT[h+j*2], wp, vertexDesc); + addWithWeight(vertex, dstIndex, V_IT[h+j*2+1], wp, vertexDesc); + } + addWithWeight(varying, dstIndex, p, 1.0f, varyingDesc); + } + } + + TBBRestrictedVertexKernelB2(TBBRestrictedVertexKernelB2 const &other) + { + this->vertex = other.vertex; + this->varying= other.varying; + this->vertexDesc = other.vertexDesc; + this->varyingDesc = other.varyingDesc; + this->V_ITa = other.V_ITa; + this->V_IT = other.V_IT; + this->vertexOffset = other.vertexOffset; + this->tableOffset = other.tableOffset; + } + + TBBRestrictedVertexKernelB2(float *vertex_in, + float *varying_in, + OsdVertexBufferDescriptor const &vertexDesc_in, + OsdVertexBufferDescriptor const &varyingDesc_in, + int const *V_ITa_in, + int const *V_IT_in, + int vertexOffset_in, + int tableOffset_in) : + vertex (vertex_in), + varying(varying_in), + vertexDesc(vertexDesc_in), + varyingDesc(varyingDesc_in), + V_ITa (V_ITa_in), + V_IT (V_IT_in), + vertexOffset(vertexOffset_in), + tableOffset(tableOffset_in) + {}; +}; + +void OsdTbbComputeRestrictedVertexB2( + float *vertex, float *varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + int const *V_ITa, int const *V_IT, + int vertexOffset, int tableOffset, int start, int end) { + + tbb::blocked_range range(start, end, grain_size); + TBBRestrictedVertexKernelB2 kernel(vertex, varying, vertexDesc, varyingDesc, + V_ITa, V_IT, + vertexOffset, tableOffset); + tbb::parallel_for(range, kernel); +} + class TBBLoopVertexKernelB { float *vertex; float *varying; diff --git a/opensubdiv/osd/tbbKernel.h b/opensubdiv/osd/tbbKernel.h index 48646036..4656e059 100644 --- a/opensubdiv/osd/tbbKernel.h +++ b/opensubdiv/osd/tbbKernel.h @@ -70,7 +70,7 @@ void OsdTbbComputeRestrictedEdge(float *vertex, float * varying, void OsdTbbComputeVertexA(float *vertex, float * varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, - int const *V_ITa, float const *V_IT, + int const *V_ITa, float const *V_W, int vertexOffset, int tableOffset, int start, int end, int pass); @@ -81,6 +81,27 @@ void OsdTbbComputeVertexB(float *vertex, float * varying, int vertexOffset, int tableOffset, int start, int end); +void OsdTbbComputeRestrictedVertexA(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + int const *V_ITa, + int vertexOffset, int tableOffset, + int start, int end); + +void OsdTbbComputeRestrictedVertexB1(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + int const *V_ITa, int const *V_IT, + int vertexOffset, int tableOffset, + int start, int end); + +void OsdTbbComputeRestrictedVertexB2(float *vertex, float * varying, + OsdVertexBufferDescriptor const &vertexDesc, + OsdVertexBufferDescriptor const &varyingDesc, + int const *V_ITa, int const *V_IT, + int vertexOffset, int tableOffset, + int start, int end); + void OsdTbbComputeLoopVertexB(float *vertex, float * varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc,