Implemented CATMARK_QUAD_FACE_VERTEX and CATMARK_TRI_QUAD_FACE_VERTEX kernels for CUDA, GLSL Transform Feedback, and GLSL Compute platforms.

2024-11-23 12:10:08 +00:00 · 2014-05-28 16:46:38 -07:00 · 2014-05-28 16:46:38 -07:00 · b769f4f60d
commit b769f4f60d
parent 064115bbe8
13 changed files with 481 additions and 0 deletions
--- a/opensubdiv/osd/cudaComputeController.cpp
+++ b/opensubdiv/osd/cudaComputeController.cpp
@ -35,6 +35,16 @@ void OsdCudaComputeFace(float *vertex, float *varying,
                        int varyingLength, int varyingStride,
                        int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end);

+void OsdCudaComputeQuadFace(float *vertex, float *varying,
+                            int vertexLength, int vertexStride,
+                            int varyingLength, int varyingStride,
+                            int *F_IT, int offset, int tableOffset, int start, int end);
+
+void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
+                               int vertexLength, int vertexStride,
+                               int varyingLength, int varyingStride,
+                               int *F_IT, int offset, int tableOffset, int start, int end);
+
 void OsdCudaComputeEdge(float *vertex, float *varying,
                        int vertexLength, int vertexStride,
                        int varyingLength, int varyingStride,
@ -171,6 +181,46 @@ OsdCudaComputeController::ApplyCatmarkFaceVerticesKernel(
        batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd());
 }

+void
+OsdCudaComputeController::ApplyCatmarkQuadFaceVerticesKernel(
+    FarKernelBatch const &batch, OsdCudaComputeContext const *context) const {
+
+    assert(context);
+
+    const OsdCudaTable * F_IT = context->GetTable(FarSubdivisionTables::F_IT);
+    assert(F_IT);
+
+    float *vertex = _currentBindState.GetOffsettedVertexBuffer();
+    float *varying = _currentBindState.GetOffsettedVaryingBuffer();
+
+    OsdCudaComputeQuadFace(
+        vertex, varying,
+        _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
+        _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
+        static_cast<int*>(F_IT->GetCudaMemory()),
+        batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd());
+}
+
+void
+OsdCudaComputeController::ApplyCatmarkTriQuadFaceVerticesKernel(
+    FarKernelBatch const &batch, OsdCudaComputeContext const *context) const {
+
+    assert(context);
+
+    const OsdCudaTable * F_IT = context->GetTable(FarSubdivisionTables::F_IT);
+    assert(F_IT);
+
+    float *vertex = _currentBindState.GetOffsettedVertexBuffer();
+    float *varying = _currentBindState.GetOffsettedVaryingBuffer();
+
+    OsdCudaComputeTriQuadFace(
+        vertex, varying,
+        _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
+        _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
+        static_cast<int*>(F_IT->GetCudaMemory()),
+        batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd());
+}
+
 void
 OsdCudaComputeController::ApplyCatmarkEdgeVerticesKernel(
    FarKernelBatch const &batch, OsdCudaComputeContext const *context) const {
--- a/opensubdiv/osd/cudaComputeController.h
+++ b/opensubdiv/osd/cudaComputeController.h
@ -120,6 +120,10 @@ protected:

    void ApplyCatmarkFaceVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;

+    void ApplyCatmarkQuadFaceVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;
+
+    void ApplyCatmarkTriQuadFaceVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;
+
    void ApplyCatmarkEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;

    void ApplyCatmarkVertexVerticesKernelB(FarKernelBatch const &batch, ComputeContext const *context) const;
--- a/opensubdiv/osd/cudaKernel.cu
+++ b/opensubdiv/osd/cudaKernel.cu
@ -140,6 +140,163 @@ computeFace(float *fVertex, float *fVarying,
    }
 }

+template <int NUM_VERTEX_ELEMENTS, int NUM_VARYING_ELEMENTS> __global__ void
+computeQuadFace(float *fVertex, float *fVaryings, int *F0_IT, int offset, int tableOffset, int start, int end)
+{
+    DeviceVertex<NUM_VERTEX_ELEMENTS> *vertex = (DeviceVertex<NUM_VERTEX_ELEMENTS>*)fVertex;
+    DeviceVertex<NUM_VARYING_ELEMENTS> *varyings = (DeviceVertex<NUM_VARYING_ELEMENTS>*)fVaryings;
+    for (int i = start + threadIdx.x + blockIdx.x*blockDim.x;
+         i < end;
+         i += blockDim.x * gridDim.x) {
+
+        int fidx0 = F0_IT[tableOffset + 4 * i + 0];
+        int fidx1 = F0_IT[tableOffset + 4 * i + 1];
+        int fidx2 = F0_IT[tableOffset + 4 * i + 2];
+        int fidx3 = F0_IT[tableOffset + 4 * i + 3];
+
+        DeviceVertex<NUM_VERTEX_ELEMENTS> dst;
+        dst.clear();
+
+        if(NUM_VARYING_ELEMENTS > 0){
+            DeviceVertex<NUM_VARYING_ELEMENTS> dstVarying;
+            dstVarying.clear();
+
+            dst.addWithWeight(&vertex[fidx0], 0.25f);
+            dst.addWithWeight(&vertex[fidx1], 0.25f);
+            dst.addWithWeight(&vertex[fidx2], 0.25f);
+            dst.addWithWeight(&vertex[fidx3], 0.25f);
+            dstVarying.addWithWeight(&varyings[fidx0], 0.25f);
+            dstVarying.addWithWeight(&varyings[fidx1], 0.25f);
+            dstVarying.addWithWeight(&varyings[fidx2], 0.25f);
+            dstVarying.addWithWeight(&varyings[fidx3], 0.25f);
+            vertex[offset + i] = dst;
+            varyings[offset + i] = dstVarying;
+        }else{
+            dst.addWithWeight(&vertex[fidx0], 0.25f);
+            dst.addWithWeight(&vertex[fidx1], 0.25f);
+            dst.addWithWeight(&vertex[fidx2], 0.25f);
+            dst.addWithWeight(&vertex[fidx3], 0.25f);
+            vertex[offset + i] = dst;
+        }
+    }
+}
+
+__global__ void
+computeQuadFace(float *fVertex, float *fVarying,
+                int vertexLength, int vertexStride,
+                int varyingLength, int varyingStride,
+                int *F0_IT, int offset, int tableOffset, int start, int end)
+{
+    for (int i = start +threadIdx.x + blockIdx.x*blockDim.x;
+        i < end;
+        i += blockDim.x * gridDim.x){
+
+        int fidx0 = F0_IT[tableOffset + 4 * i + 0];
+        int fidx1 = F0_IT[tableOffset + 4 * i + 1];
+        int fidx2 = F0_IT[tableOffset + 4 * i + 2];
+        int fidx3 = F0_IT[tableOffset + 4 * i + 3];
+
+        // XXX: can we use local stack like alloca?
+        float *dstVertex = fVertex + (i+offset)*vertexStride;
+        clear(dstVertex, vertexLength);
+        float *dstVarying = fVarying + (i+offset)*varyingStride;
+        clear(dstVarying, varyingLength);
+
+        addWithWeight(dstVertex, fVertex + fidx0*vertexStride, 0.25f, vertexLength);
+        addWithWeight(dstVertex, fVertex + fidx1*vertexStride, 0.25f, vertexLength);
+        addWithWeight(dstVertex, fVertex + fidx2*vertexStride, 0.25f, vertexLength);
+        addWithWeight(dstVertex, fVertex + fidx3*vertexStride, 0.25f, vertexLength);
+        addWithWeight(dstVarying, fVarying + fidx0*varyingStride, 0.25f, varyingLength);
+        addWithWeight(dstVarying, fVarying + fidx1*varyingStride, 0.25f, varyingLength);
+        addWithWeight(dstVarying, fVarying + fidx2*varyingStride, 0.25f, varyingLength);
+        addWithWeight(dstVarying, fVarying + fidx3*varyingStride, 0.25f, varyingLength);
+    }
+}
+
+template <int NUM_VERTEX_ELEMENTS, int NUM_VARYING_ELEMENTS> __global__ void
+computeTriQuadFace(float *fVertex, float *fVaryings, int *F0_IT, int offset, int tableOffset, int start, int end)
+{
+    DeviceVertex<NUM_VERTEX_ELEMENTS> *vertex = (DeviceVertex<NUM_VERTEX_ELEMENTS>*)fVertex;
+    DeviceVertex<NUM_VARYING_ELEMENTS> *varyings = (DeviceVertex<NUM_VARYING_ELEMENTS>*)fVaryings;
+    for (int i = start + threadIdx.x + blockIdx.x*blockDim.x;
+         i < end;
+         i += blockDim.x * gridDim.x) {
+
+        int fidx0 = F0_IT[tableOffset + 4 * i + 0];
+        int fidx1 = F0_IT[tableOffset + 4 * i + 1];
+        int fidx2 = F0_IT[tableOffset + 4 * i + 2];
+        int fidx3 = F0_IT[tableOffset + 4 * i + 3];
+
+        bool triangle = (fidx2 == fidx3);
+        float weight = triangle ? 1.0f / 3.0f : 1.0f / 4.0f;
+
+        DeviceVertex<NUM_VERTEX_ELEMENTS> dst;
+        dst.clear();
+
+        if(NUM_VARYING_ELEMENTS > 0){
+            DeviceVertex<NUM_VARYING_ELEMENTS> dstVarying;
+            dstVarying.clear();
+
+            dst.addWithWeight(&vertex[fidx0], weight);
+            dst.addWithWeight(&vertex[fidx1], weight);
+            dst.addWithWeight(&vertex[fidx2], weight);
+            dstVarying.addWithWeight(&varyings[fidx0], weight);
+            dstVarying.addWithWeight(&varyings[fidx1], weight);
+            dstVarying.addWithWeight(&varyings[fidx2], weight);
+            if (!triangle) {
+                dst.addWithWeight(&vertex[fidx3], weight);
+                dstVarying.addWithWeight(&varyings[fidx3], 0.25f);
+            }
+            vertex[offset + i] = dst;
+            varyings[offset + i] = dstVarying;
+        }else{
+            dst.addWithWeight(&vertex[fidx0], weight);
+            dst.addWithWeight(&vertex[fidx1], weight);
+            dst.addWithWeight(&vertex[fidx2], weight);
+            if (!triangle)
+                dst.addWithWeight(&vertex[fidx3], weight);
+            vertex[offset + i] = dst;
+        }
+    }
+}
+
+__global__ void
+computeTriQuadFace(float *fVertex, float *fVarying,
+                   int vertexLength, int vertexStride,
+                   int varyingLength, int varyingStride,
+                   int *F0_IT, int offset, int tableOffset, int start, int end)
+{
+    for (int i = start +threadIdx.x + blockIdx.x*blockDim.x;
+        i < end;
+        i += blockDim.x * gridDim.x){
+
+        int fidx0 = F0_IT[tableOffset + 4 * i + 0];
+        int fidx1 = F0_IT[tableOffset + 4 * i + 1];
+        int fidx2 = F0_IT[tableOffset + 4 * i + 2];
+        int fidx3 = F0_IT[tableOffset + 4 * i + 3];
+
+        bool triangle = (fidx2 == fidx3);
+        float weight = triangle ? 1.0f / 3.0f : 1.0f / 4.0f;
+
+        // XXX: can we use local stack like alloca?
+        float *dstVertex = fVertex + (i+offset)*vertexStride;
+        clear(dstVertex, vertexLength);
+        float *dstVarying = fVarying + (i+offset)*varyingStride;
+        clear(dstVarying, varyingLength);
+
+        addWithWeight(dstVertex, fVertex + fidx0*vertexStride, weight, vertexLength);
+        addWithWeight(dstVertex, fVertex + fidx1*vertexStride, weight, vertexLength);
+        addWithWeight(dstVertex, fVertex + fidx2*vertexStride, weight, vertexLength);
+        addWithWeight(dstVarying, fVarying + fidx0*varyingStride, weight, varyingLength);
+        addWithWeight(dstVarying, fVarying + fidx1*varyingStride, weight, varyingLength);
+        addWithWeight(dstVarying, fVarying + fidx2*varyingStride, weight, varyingLength);
+        if (!triangle) {
+            addWithWeight(dstVertex, fVertex + fidx3*vertexStride, weight, vertexLength);
+            addWithWeight(dstVarying, fVarying + fidx3*varyingStride, weight, varyingLength);
+        }
+    }
+}
+
 template <int NUM_VERTEX_ELEMENTS, int NUM_VARYING_ELEMENTS> __global__ void
 computeEdge(float *fVertex, float *fVaryings, int *E0_IT, float *E0_S, int offset, int tableOffset, int start, int end)
 {
@ -639,6 +796,40 @@ void OsdCudaComputeFace(float *vertex, float *varying,
                             F_IT, F_ITa, offset, tableOffset, start, end);
 }

+void OsdCudaComputeQuadFace(float *vertex, float *varying,
+                            int vertexLength, int vertexStride,
+                            int varyingLength, int varyingStride,
+                            int *F_IT, int offset, int tableOffset, int start, int end)
+{
+    //computeQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
+    OPT_KERNEL(0, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
+    OPT_KERNEL(0, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
+    OPT_KERNEL(3, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
+    OPT_KERNEL(3, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
+
+    // fallback kernel (slow)
+    computeQuadFace<<<512, 32>>>(vertex, varying,
+                                 vertexLength, vertexStride, varyingLength, varyingStride,
+                                 F_IT, offset, tableOffset, start, end);
+}
+
+void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
+                               int vertexLength, int vertexStride,
+                               int varyingLength, int varyingStride,
+                               int *F_IT, int offset, int tableOffset, int start, int end)
+{
+    //computeTriQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
+    OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
+    OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
+    OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
+    OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
+
+    // fallback kernel (slow)
+    computeTriQuadFace<<<512, 32>>>(vertex, varying,
+                                    vertexLength, vertexStride, varyingLength, varyingStride,
+                                    F_IT, offset, tableOffset, start, end);
+}
+

 void OsdCudaComputeEdge(float *vertex, float *varying,
                        int vertexLength, int vertexStride,
--- a/opensubdiv/osd/glslComputeController.cpp
+++ b/opensubdiv/osd/glslComputeController.cpp
@ -139,6 +139,28 @@ OsdGLSLComputeController::ApplyCatmarkFaceVerticesKernel(
        batch.GetStart(), batch.GetEnd());
 }

+void
+OsdGLSLComputeController::ApplyCatmarkQuadFaceVerticesKernel(
+    FarKernelBatch const &batch, OsdGLSLComputeContext const *context) const {
+
+    assert(context);
+
+    _currentBindState.kernelBundle->ApplyCatmarkQuadFaceVerticesKernel(
+        batch.GetVertexOffset(), batch.GetTableOffset(),
+        batch.GetStart(), batch.GetEnd());
+}
+
+void
+OsdGLSLComputeController::ApplyCatmarkTriQuadFaceVerticesKernel(
+    FarKernelBatch const &batch, OsdGLSLComputeContext const *context) const {
+
+    assert(context);
+
+    _currentBindState.kernelBundle->ApplyCatmarkTriQuadFaceVerticesKernel(
+        batch.GetVertexOffset(), batch.GetTableOffset(),
+        batch.GetStart(), batch.GetEnd());
+}
+
 void
 OsdGLSLComputeController::ApplyCatmarkEdgeVerticesKernel(
    FarKernelBatch const &batch, OsdGLSLComputeContext const *context) const {
--- a/opensubdiv/osd/glslComputeController.h
+++ b/opensubdiv/osd/glslComputeController.h
@ -129,6 +129,10 @@ protected:

    void ApplyCatmarkFaceVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;

+    void ApplyCatmarkQuadFaceVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;
+
+    void ApplyCatmarkTriQuadFaceVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;
+
    void ApplyCatmarkEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;

    void ApplyCatmarkVertexVerticesKernelB(FarKernelBatch const &batch, ComputeContext const *context) const;
--- a/opensubdiv/osd/glslComputeKernel.glsl
+++ b/opensubdiv/osd/glslComputeKernel.glsl
@ -178,6 +178,64 @@ void catmarkComputeFace()
    writeVertex(vid, dst);
 }

+// Quad face-vertices compute Kernel
+subroutine(computeKernelType)
+void catmarkComputeQuadFace()
+{
+    int i = int(gl_GlobalInvocationID.x) + indexStart;
+    if (i >= indexEnd) return;
+    int vid = i + vertexOffset;
+    int fidx0 = _F_IT[tableOffset + i * 4 + 0];
+    int fidx1 = _F_IT[tableOffset + i * 4 + 1];
+    int fidx2 = _F_IT[tableOffset + i * 4 + 2];
+    int fidx3 = _F_IT[tableOffset + i * 4 + 3];
+
+    Vertex dst;
+    clear(dst);
+    addWithWeight(dst, readVertex(fidx0), 0.25);
+    addWithWeight(dst, readVertex(fidx1), 0.25);
+    addWithWeight(dst, readVertex(fidx2), 0.25);
+    addWithWeight(dst, readVertex(fidx3), 0.25);
+    addVaryingWithWeight(dst, readVertex(fidx0), 0.25);
+    addVaryingWithWeight(dst, readVertex(fidx1), 0.25);
+    addVaryingWithWeight(dst, readVertex(fidx2), 0.25);
+    addVaryingWithWeight(dst, readVertex(fidx3), 0.25);
+
+    writeVertex(vid, dst);
+}
+
+// Tri-quad face-vertices compute Kernel
+subroutine(computeKernelType)
+void catmarkComputeTriQuadFace()
+{
+    int i = int(gl_GlobalInvocationID.x) + indexStart;
+    if (i >= indexEnd) return;
+    int vid = i + vertexOffset;
+    int fidx0 = _F_IT[tableOffset + i * 4 + 0];
+    int fidx1 = _F_IT[tableOffset + i * 4 + 1];
+    int fidx2 = _F_IT[tableOffset + i * 4 + 2];
+    int fidx3 = _F_IT[tableOffset + i * 4 + 3];
+
+    bool triangle = (fidx2 == fidx3);
+    float weight = triangle ? 1.0f / 3.0f : 1.0f / 4.0f;
+
+    Vertex dst;
+    clear(dst);
+    addWithWeight(dst, readVertex(fidx0), weight);
+    addWithWeight(dst, readVertex(fidx1), weight);
+    addWithWeight(dst, readVertex(fidx2), weight);
+    if (!triangle)
+        addWithWeight(dst, readVertex(fidx3), weight);
+
+    addVaryingWithWeight(dst, readVertex(fidx0), weight);
+    addVaryingWithWeight(dst, readVertex(fidx1), weight);
+    addVaryingWithWeight(dst, readVertex(fidx2), weight);
+    if (!triangle)
+        addVaryingWithWeight(dst, readVertex(fidx3), weight);
+
+    writeVertex(vid, dst);
+}
+
 // Edge-vertices compute Kernepl
 subroutine(computeKernelType)
 void catmarkComputeEdge()
--- a/opensubdiv/osd/glslKernelBundle.cpp
+++ b/opensubdiv/osd/glslKernelBundle.cpp
@ -117,6 +117,12 @@ OsdGLSLComputeKernelBundle::Compile(
    _subComputeFace           = glGetSubroutineIndex(_program,
                                                     GL_COMPUTE_SHADER,
                                                     "catmarkComputeFace");
+    _subComputeQuadFace       = glGetSubroutineIndex(_program,
+                                                     GL_COMPUTE_SHADER,
+                                                     "catmarkComputeQuadFace");
+    _subComputeTriQuadFace    = glGetSubroutineIndex(_program,
+                                                     GL_COMPUTE_SHADER,
+                                                     "catmarkComputeTriQuadFace");
    _subComputeEdge           = glGetSubroutineIndex(_program,
                                                     GL_COMPUTE_SHADER,
                                                     "catmarkComputeEdge");
@ -229,6 +235,22 @@ OsdGLSLComputeKernelBundle::ApplyCatmarkFaceVerticesKernel(
    // glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
 }

+void
+OsdGLSLComputeKernelBundle::ApplyCatmarkQuadFaceVerticesKernel(
+    int vertexOffset, int tableOffset, int start, int end) {
+
+    glUniformSubroutinesuiv(GL_COMPUTE_SHADER, 1, &_subComputeQuadFace);
+    dispatchCompute(vertexOffset, tableOffset, start, end);
+}
+
+void
+OsdGLSLComputeKernelBundle::ApplyCatmarkTriQuadFaceVerticesKernel(
+    int vertexOffset, int tableOffset, int start, int end) {
+
+    glUniformSubroutinesuiv(GL_COMPUTE_SHADER, 1, &_subComputeTriQuadFace);
+    dispatchCompute(vertexOffset, tableOffset, start, end);
+}
+
 void
 OsdGLSLComputeKernelBundle::ApplyCatmarkEdgeVerticesKernel(
    int vertexOffset, int tableOffset, int start, int end) {
--- a/opensubdiv/osd/glslKernelBundle.h
+++ b/opensubdiv/osd/glslKernelBundle.h
@ -57,6 +57,12 @@ public:
    void ApplyCatmarkFaceVerticesKernel(
        int vertexOffset, int tableOffset, int start, int end);

+    void ApplyCatmarkQuadFaceVerticesKernel(
+        int vertexOffset, int tableOffset, int start, int end);
+
+    void ApplyCatmarkTriQuadFaceVerticesKernel(
+        int vertexOffset, int tableOffset, int start, int end);
+
    void ApplyCatmarkEdgeVerticesKernel(
        int vertexOffset, int tableOffset, int start, int end);

@ -130,6 +136,10 @@ protected:
    
    GLuint _subComputeFace; // general face-vertex kernel (all schemes)

+    GLuint _subComputeQuadFace; // quad face-vertex kernel (catmark)
+
+    GLuint _subComputeTriQuadFace; // tri-quad face-vertex kernel (catmark)
+
    GLuint _subComputeEdge; // edge-vertex kernel (catmark + loop schemes)

    GLuint _subComputeBilinearEdge; // edge-vertex kernel (bilinear scheme)
--- a/opensubdiv/osd/glslTransformFeedbackComputeController.cpp
+++ b/opensubdiv/osd/glslTransformFeedbackComputeController.cpp
@ -207,6 +207,29 @@ OsdGLSLTransformFeedbackComputeController::ApplyCatmarkFaceVerticesKernel(
        batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd());
 }

+void
+OsdGLSLTransformFeedbackComputeController::ApplyCatmarkQuadFaceVerticesKernel(
+    FarKernelBatch const &batch, OsdGLSLTransformFeedbackComputeContext const *context) const {
+
+    assert(context);
+
+    _currentBindState.kernelBundle->ApplyCatmarkQuadFaceVerticesKernel(
+        _currentBindState.vertexBuffer, _currentBindState.varyingBuffer,
+        _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset,
+        batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd());
+}
+
+void
+OsdGLSLTransformFeedbackComputeController::ApplyCatmarkTriQuadFaceVerticesKernel(
+    FarKernelBatch const &batch, OsdGLSLTransformFeedbackComputeContext const *context) const {
+
+    assert(context);
+
+    _currentBindState.kernelBundle->ApplyCatmarkTriQuadFaceVerticesKernel(
+        _currentBindState.vertexBuffer, _currentBindState.varyingBuffer,
+        _currentBindState.vertexDesc.offset, _currentBindState.varyingDesc.offset,
+        batch.GetVertexOffset(), batch.GetTableOffset(), batch.GetStart(), batch.GetEnd());
+}


 void
--- a/opensubdiv/osd/glslTransformFeedbackComputeController.h
+++ b/opensubdiv/osd/glslTransformFeedbackComputeController.h
@ -127,6 +127,10 @@ protected:

    void ApplyCatmarkFaceVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;

+    void ApplyCatmarkQuadFaceVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;
+
+    void ApplyCatmarkTriQuadFaceVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;
+
    void ApplyCatmarkEdgeVerticesKernel(FarKernelBatch const &batch, ComputeContext const *context) const;

    void ApplyCatmarkVertexVerticesKernelB(FarKernelBatch const &batch, ComputeContext const *context) const;
--- a/opensubdiv/osd/glslTransformFeedbackKernel.glsl
+++ b/opensubdiv/osd/glslTransformFeedbackKernel.glsl
@ -186,6 +186,59 @@ void catmarkComputeFace()
    writeVertex(dst);
 }

+// Quad face-vertices compute Kernel
+subroutine(computeKernelType)
+void catmarkComputeQuadFace()
+{
+    int i = gl_VertexID + indexStart;
+    int fidx0 = texelFetch(_F0_IT, tableOffset + 4 * i + 0);
+    int fidx1 = texelFetch(_F0_IT, tableOffset + 4 * i + 0);
+    int fidx2 = texelFetch(_F0_IT, tableOffset + 4 * i + 0);
+    int fidx3 = texelFetch(_F0_IT, tableOffset + 4 * i + 0);
+
+    Vertex dst;
+    clear(dst);
+    addWithWeight(dst, readVertex(fidx0), 0.25);
+    addWithWeight(dst, readVertex(fidx1), 0.25);
+    addWithWeight(dst, readVertex(fidx2), 0.25);
+    addWithWeight(dst, readVertex(fidx3), 0.25);
+    addVaryingWithWeight(dst, readVertex(fidx0), 0.25);
+    addVaryingWithWeight(dst, readVertex(fidx1), 0.25);
+    addVaryingWithWeight(dst, readVertex(fidx2), 0.25);
+    addVaryingWithWeight(dst, readVertex(fidx3), 0.25);
+    writeVertex(dst);
+}
+
+// Tri-quad face-vertices compute Kernel
+subroutine(computeKernelType)
+void catmarkComputeTriQuadFace()
+{
+    int i = gl_VertexID + indexStart;
+    int fidx0 = texelFetch(_F0_IT, tableOffset + 4 * i + 0);
+    int fidx1 = texelFetch(_F0_IT, tableOffset + 4 * i + 0);
+    int fidx2 = texelFetch(_F0_IT, tableOffset + 4 * i + 0);
+    int fidx3 = texelFetch(_F0_IT, tableOffset + 4 * i + 0);
+
+    bool triangle = (fidx2 == fidx3);
+    float weight = triangle ? 1.0f / 3.0f : 1.0f / 4.0f;
+
+    Vertex dst;
+    clear(dst);
+    addWithWeight(dst, readVertex(fidx0), weight);
+    addWithWeight(dst, readVertex(fidx1), weight);
+    addWithWeight(dst, readVertex(fidx2), weight);
+    if (!triangle)
+        addWithWeight(dst, readVertex(fidx3), weight);
+
+    addVaryingWithWeight(dst, readVertex(fidx0), weight);
+    addVaryingWithWeight(dst, readVertex(fidx1), weight);
+    addVaryingWithWeight(dst, readVertex(fidx2), weight);
+    if (!triangle)
+        addVaryingWithWeight(dst, readVertex(fidx3), weight);
+
+    writeVertex(dst);
+}
+
 // Edge-vertices compute Kernel
 subroutine(computeKernelType)
 void catmarkComputeEdge()
--- a/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp
+++ b/opensubdiv/osd/glslTransformFeedbackKernelBundle.cpp
@ -225,6 +225,8 @@ OsdGLSLTransformFeedbackKernelBundle::Compile(
    _uniformVaryingBuffer        = glGetUniformLocation(_program, "varyingData");

    _subComputeFace = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "catmarkComputeFace");
+    _subComputeQuadFace = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "catmarkComputeQuadFace");
+    _subComputeTriQuadFace = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "catmarkComputeTriQuadFace");
    _subComputeEdge = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "catmarkComputeEdge");
    _subComputeBilinearEdge = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "bilinearComputeEdge");
    _subComputeVertex = glGetSubroutineIndex(_program, GL_VERTEX_SHADER, "bilinearComputeVertex");
@ -356,6 +358,30 @@ OsdGLSLTransformFeedbackKernelBundle::ApplyBilinearVertexVerticesKernel(
 }


+void
+OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkQuadFaceVerticesKernel(
+    GLuint vertexBuffer, GLuint varyingBuffer,
+    int vertexOffset, int varyingOffset,
+    int offset, int tableOffset, int start, int end) {
+
+    glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeQuadFace);
+    transformGpuBufferData(vertexBuffer, varyingBuffer,
+                           vertexOffset, varyingOffset,
+                           offset, tableOffset, start, end);
+}
+
+void
+OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkTriQuadFaceVerticesKernel(
+    GLuint vertexBuffer, GLuint varyingBuffer,
+    int vertexOffset, int varyingOffset,
+    int offset, int tableOffset, int start, int end) {
+
+    glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subComputeTriQuadFace);
+    transformGpuBufferData(vertexBuffer, varyingBuffer,
+                           vertexOffset, varyingOffset,
+                           offset, tableOffset, start, end);
+}
+
 void
 OsdGLSLTransformFeedbackKernelBundle::ApplyCatmarkFaceVerticesKernel(
    GLuint vertexBuffer, GLuint varyingBuffer,
--- a/opensubdiv/osd/glslTransformFeedbackKernelBundle.h
+++ b/opensubdiv/osd/glslTransformFeedbackKernelBundle.h
@ -68,6 +68,16 @@ public:
        int vertexOffset, int varyingOffset,
        int offset, int tableOffset, int start, int end);

+    void ApplyCatmarkQuadFaceVerticesKernel(
+        GLuint vertexBuffer, GLuint varyingBuffer,
+        int vertexOffset, int varyingOffset,
+        int offset, int tableOffset, int start, int end);
+
+    void ApplyCatmarkTriQuadFaceVerticesKernel(
+        GLuint vertexBuffer, GLuint varyingBuffer,
+        int vertexOffset, int varyingOffset,
+        int offset, int tableOffset, int start, int end);
+
    void ApplyCatmarkEdgeVerticesKernel(
        GLuint vertexBuffer, GLuint varyingBuffer,
        int vertexOffset, int varyingOffset,
@ -181,6 +191,10 @@ protected:

    GLuint _subComputeFace; // general face-vertex kernel (all schemes)

+    GLuint _subComputeQuadFace; // quad face-vertex kernel (catmark)
+
+    GLuint _subComputeTriQuadFace; // tri-quad face-vertex kernel (catmark)
+
    GLuint _subComputeEdge; // edge-vertex kernel (catmark + loop schemes)

    GLuint _subComputeBilinearEdge; // edge-vertex kernel (bilinear scheme)