Merge pull request #862 from davidgyu/patch_eval_osd

Osd Varying and FaceVarying Patch Evaluation
2025-01-05 14:31:07 +00:00 · 2016-09-29 10:54:56 -07:00 · 2016-09-29 10:54:56 -07:00 · feda3cb054
commit feda3cb054
parent 0e0974188d 236afb2c06
27 changed files with 1818 additions and 305 deletions
--- a/opensubdiv/osd/clEvaluator.cpp
+++ b/opensubdiv/osd/clEvaluator.cpp
@ -41,6 +41,9 @@ namespace Osd {
 static const char *clSource =
 #include "clKernel.gen.h"
 ;
+static const char *patchBasisSource =
+#include "patchBasisCommon.gen.h"
+;

 // ----------------------------------------------------------------------------

@ -140,11 +143,12 @@ CLEvaluator::Compile(BufferDescriptor const &srcDesc,
    std::ostringstream defines;
    defines << "#define LENGTH "     << srcDesc.length << "\n"
            << "#define SRC_STRIDE " << srcDesc.stride << "\n"
-            << "#define DST_STRIDE " << dstDesc.stride << "\n";
+            << "#define DST_STRIDE " << dstDesc.stride << "\n"
+            << "#define OSD_PATCH_BASIS_OPENCL\n";
    std::string defineStr = defines.str();

-    const char *sources[] = { defineStr.c_str(), clSource };
-    _program = clCreateProgramWithSource(_clContext, 2, sources, 0, &errNum);
+    const char *sources[] = { defineStr.c_str(), patchBasisSource, clSource };
+    _program = clCreateProgramWithSource(_clContext, 3, sources, 0, &errNum);
    if (errNum != CL_SUCCESS) {
        Far::Error(Far::FAR_RUNTIME_ERROR,
                   "clCreateProgramWithSource (%d)", errNum);
--- a/opensubdiv/osd/clEvaluator.h
+++ b/opensubdiv/osd/clEvaluator.h
@ -724,6 +724,316 @@ public:
                     const cl_event* startEvents=NULL,
                     cl_event* endEvent=NULL) const;

+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindCLBuffer() method returning an
+    ///                       array of PatchCoord struct.
+    ///
+    /// @param patchTable     CLPatchTable or equivalent
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  client providing context class which supports
+    ///                         cL_context GetContext()
+    ///                         cl_command_queue GetCommandQueue()
+    ///                       methods.
+    ///
+    /// @param numStartEvents the number of events in the array pointed to by
+    ///                       startEvents.
+    ///
+    /// @param startEvents    points to an array of cl_event which will determine
+    ///                       when it is safe for the OpenCL device to begin work
+    ///                       or NULL if it can begin immediately.
+    ///
+    /// @param endEvent       pointer to a cl_event which will recieve a copy of
+    ///                       the cl_event which indicates when all work for this
+    ///                       call has completed.  This cl_event has an incremented
+    ///                       reference count and should be released via
+    ///                       clReleaseEvent().  NULL if not required.
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE,
+              typename DEVICE_CONTEXT>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CLEvaluator const *instance,
+        DEVICE_CONTEXT deviceContext,
+        unsigned int numStartEvents=0,
+        const cl_event* startEvents=NULL,
+        cl_event* endEvent=NULL) {
+
+        if (instance) {
+            return instance->EvalPatchesVarying(
+                                         srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable,
+                                         numStartEvents, startEvents, endEvent);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc,
+                              BufferDescriptor(),
+                              BufferDescriptor(),
+                              deviceContext);
+            if (instance) {
+                bool r = instance->EvalPatchesVarying(
+                                               srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable,
+                                               numStartEvents, startEvents, endEvent);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindCLBuffer() method returning an
+    ///                       array of PatchCoord struct.
+    ///
+    /// @param patchTable     CLPatchTable or equivalent
+    ///
+    /// @param numStartEvents the number of events in the array pointed to by
+    ///                       startEvents.
+    ///
+    /// @param startEvents    points to an array of cl_event which will determine
+    ///                       when it is safe for the OpenCL device to begin work
+    ///                       or NULL if it can begin immediately.
+    ///
+    /// @param endEvent       pointer to a cl_event which will recieve a copy of
+    ///                       the cl_event which indicates when all work for this
+    ///                       call has completed.  This cl_event has an incremented
+    ///                       reference count and should be released via
+    ///                       clReleaseEvent().  NULL if not required.
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        unsigned int numStartEvents=0,
+        const cl_event* startEvents=NULL,
+        cl_event* endEvent=NULL) const {
+
+        return EvalPatches(srcBuffer->BindCLBuffer(_clCommandQueue), srcDesc,
+                           dstBuffer->BindCLBuffer(_clCommandQueue), dstDesc,
+                           0, BufferDescriptor(),
+                           0, BufferDescriptor(),
+                           numPatchCoords,
+                           patchCoords->BindCLBuffer(_clCommandQueue),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer(),
+                           numStartEvents, startEvents, endEvent);
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindCLBuffer() method returning an
+    ///                       array of PatchCoord struct.
+    ///
+    /// @param patchTable     CLPatchTable or equivalent
+    ///
+    /// @param fvarChannel    face-varying channel
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  client providing context class which supports
+    ///                         cL_context GetContext()
+    ///                         cl_command_queue GetCommandQueue()
+    ///                       methods.
+    ///
+    /// @param numStartEvents the number of events in the array pointed to by
+    ///                       startEvents.
+    ///
+    /// @param startEvents    points to an array of cl_event which will determine
+    ///                       when it is safe for the OpenCL device to begin work
+    ///                       or NULL if it can begin immediately.
+    ///
+    /// @param endEvent       pointer to a cl_event which will recieve a copy of
+    ///                       the cl_event which indicates when all work for this
+    ///                       call has completed.  This cl_event has an incremented
+    ///                       reference count and should be released via
+    ///                       clReleaseEvent().  NULL if not required.
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE,
+              typename DEVICE_CONTEXT>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        CLEvaluator const *instance,
+        DEVICE_CONTEXT deviceContext,
+        unsigned int numStartEvents=0,
+        const cl_event* startEvents=NULL,
+        cl_event* endEvent=NULL) {
+
+        if (instance) {
+            return instance->EvalPatchesFaceVarying(
+                                         srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable, fvarChannel,
+                                         numStartEvents, startEvents, endEvent);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc,
+                              BufferDescriptor(),
+                              BufferDescriptor(),
+                              deviceContext);
+            if (instance) {
+                bool r = instance->EvalPatchesFaceVarying(
+                                               srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable, fvarChannel,
+                                               numStartEvents, startEvents, endEvent);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindCLBuffer() method returning an
+    ///                       array of PatchCoord struct.
+    ///
+    /// @param patchTable     CLPatchTable or equivalent
+    ///
+    /// @param fvarChannel    face-varying channel
+    ///
+    /// @param numStartEvents the number of events in the array pointed to by
+    ///                       startEvents.
+    ///
+    /// @param startEvents    points to an array of cl_event which will determine
+    ///                       when it is safe for the OpenCL device to begin work
+    ///                       or NULL if it can begin immediately.
+    ///
+    /// @param endEvent       pointer to a cl_event which will recieve a copy of
+    ///                       the cl_event which indicates when all work for this
+    ///                       call has completed.  This cl_event has an incremented
+    ///                       reference count and should be released via
+    ///                       clReleaseEvent().  NULL if not required.
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel = 0,
+        unsigned int numStartEvents=0,
+        const cl_event* startEvents=NULL,
+        cl_event* endEvent=NULL) const {
+
+        return EvalPatches(srcBuffer->BindCLBuffer(_clCommandQueue), srcDesc,
+                           dstBuffer->BindCLBuffer(_clCommandQueue), dstDesc,
+                           0, BufferDescriptor(),
+                           0, BufferDescriptor(),
+                           numPatchCoords,
+                           patchCoords->BindCLBuffer(_clCommandQueue),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel),
+                           numStartEvents, startEvents, endEvent);
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/clKernel.cl
+++ b/opensubdiv/osd/clKernel.cl
@ -162,50 +162,6 @@ struct PatchParam {
    float sharpness;
 };

-static void getBSplineWeights(float t, float *point, float *deriv) {
-    // The four uniform cubic B-Spline basis functions evaluated at t:
-    float one6th = 1.0f / 6.0f;
-
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    point[0] = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
-    point[1] = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
-    point[2] = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
-    point[3] = one6th * (                                 t3);
-
-    // Derivatives of the above four basis functions at t:
-    deriv[0] = -0.5f*t2 +      t - 0.5f;
-    deriv[1] =  1.5f*t2 - 2.0f*t;
-    deriv[2] = -1.5f*t2 +      t + 0.5f;
-    deriv[3] =  0.5f*t2;
-}
-
-static void adjustBoundaryWeights(uint bits, float *sWeights, float *tWeights) {
-    int boundary = ((bits >> 8) & 0xf);
-
-    if (boundary & 1) {
-        tWeights[2] -= tWeights[0];
-        tWeights[1] += 2*tWeights[0];
-        tWeights[0] = 0;
-    }
-    if (boundary & 2) {
-        sWeights[1] -= sWeights[3];
-        sWeights[2] += 2*sWeights[3];
-        sWeights[3] = 0;
-    }
-    if (boundary & 4) {
-        tWeights[1] -= tWeights[3];
-        tWeights[2] += 2*tWeights[3];
-        tWeights[3] = 0;
-    }
-    if (boundary & 8) {
-        sWeights[2] -= sWeights[0];
-        sWeights[1] += 2*sWeights[0];
-        sWeights[0] = 0;
-    }
-}
-
 static int getDepth(uint patchBits) {
    return (patchBits & 0xf);
 }
@ -235,6 +191,16 @@ static void normalizePatchCoord(uint patchBits, float *uv) {
    uv[1] = (uv[1] - pv) / frac;
 }

+static bool isRegular(uint patchBits) {
+    return ((patchBits >> 5) & 0x1) != 0;
+}
+
+static int getNumControlVertices(int patchType) {
+    return (patchType == 3) ? 4 :
+           (patchType == 6) ? 16 :
+           (patchType == 9) ? 20 : 0;
+}
+
 __kernel void computePatches(__global float *src, int srcOffset,
                             __global float *dst, int dstOffset,
                             __global float *du,  int duOffset, int duStride,
@ -253,35 +219,34 @@ __kernel void computePatches(__global float *src, int srcOffset,
    struct PatchCoord coord = patchCoords[current];
    struct PatchArray array = patchArrayBuffer[coord.arrayIndex];

-    int patchType = 6; // array.patchType XXX: REGULAR only for now.
-    int numControlVertices = 16;
    uint patchBits = patchParamBuffer[coord.patchIndex].field1;
+    int patchType = isRegular(patchBits) ? 6 : array.patchType;

    float uv[2] = {coord.s, coord.t};
    normalizePatchCoord(patchBits, uv);
    float dScale = (float)(1 << getDepth(patchBits));
+    int boundary = (patchBits >> 8) & 0xf;

-    float wP[20], wDs[20], wDt[20];
-    if (patchType == 6) {  // REGULAR
-        float sWeights[4], tWeights[4], dsWeights[4], dtWeights[4];
-        getBSplineWeights(uv[0], sWeights, dsWeights);
-        getBSplineWeights(uv[1], tWeights, dtWeights);
+    float wP[20], wDs[20], wDt[20], wDss[20], wDst[20], wDtt[20];

-        adjustBoundaryWeights(patchBits, sWeights, tWeights);
-        adjustBoundaryWeights(patchBits, dsWeights, dtWeights);
-
-        for (int k = 0; k < 4; ++k) {
-            for (int l = 0; l < 4; ++l) {
-                wP[4*k+l]  = sWeights[l]  * tWeights[k];
-                wDs[4*k+l] = dsWeights[l] * tWeights[k]  * dScale;
-                wDt[4*k+l] = sWeights[l]  * dtWeights[k] * dScale;
-            }
-        }
-    } else {
-        // TODO: GREGORY BASIS
+    int numControlVertices = 0;
+    if (patchType == 3) {
+        OsdGetBilinearPatchWeights(uv[0], uv[1], dScale,
+            wP, wDs, wDt, wDss, wDst, wDtt);
+        numControlVertices = 4;
+    } else if (patchType == 6) {
+        OsdGetBSplinePatchWeights(uv[0], uv[1], dScale, boundary,
+            wP, wDs, wDt, wDss, wDst, wDtt);
+        numControlVertices = 16;
+    } else if (patchType == 9) {
+        OsdGetGregoryPatchWeights(uv[0], uv[1], dScale,
+            wP, wDs, wDt, wDss, wDst, wDtt);
+        numControlVertices = 20;
    }

-    int indexBase = array.indexBase + coord.vertIndex;
+    int indexStride = getNumControlVertices(array.patchType);
+    int indexBase = array.indexBase + indexStride *
+            (coord.patchIndex - array.primitiveIdBase);

    struct Vertex v;
    clear(&v);
--- a/opensubdiv/osd/clPatchTable.cpp
+++ b/opensubdiv/osd/clPatchTable.cpp
@ -42,6 +42,17 @@ CLPatchTable::~CLPatchTable() {
    if (_patchArrays) clReleaseMemObject(_patchArrays);
    if (_indexBuffer) clReleaseMemObject(_indexBuffer);
    if (_patchParamBuffer) clReleaseMemObject(_patchParamBuffer);
+    if (_varyingPatchArrays) clReleaseMemObject(_varyingPatchArrays);
+    if (_varyingIndexBuffer) clReleaseMemObject(_varyingIndexBuffer);
+    for (int fvc=0; fvc<(int)_fvarPatchArrays.size(); ++fvc) {
+        if (_fvarPatchArrays[fvc]) clReleaseMemObject(_fvarPatchArrays[fvc]);
+    }
+    for (int fvc=0; fvc<(int)_fvarIndexBuffers.size(); ++fvc) {
+        if (_fvarIndexBuffers[fvc]) clReleaseMemObject(_fvarIndexBuffers[fvc]);
+    }
+    for (int fvc=0; fvc<(int)_fvarParamBuffers.size(); ++fvc) {
+        if (_fvarParamBuffers[fvc]) clReleaseMemObject(_fvarParamBuffers[fvc]);
+    }
 }

 CLPatchTable *
@ -91,6 +102,63 @@ CLPatchTable::allocate(Far::PatchTable const *farPatchTable, cl_context clContex
        Far::Error(Far::FAR_RUNTIME_ERROR, "clCreateBuffer: %d", err);
        return false;
    }
+
+    _varyingPatchArrays = clCreateBuffer(clContext,
+                                  CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                                  numPatchArrays * sizeof(Osd::PatchArray),
+                                  (void*)patchTable.GetVaryingPatchArrayBuffer(),
+                                  &err);
+    if (err != CL_SUCCESS) {
+        Far::Error(Far::FAR_RUNTIME_ERROR, "clCreateBuffer: %d", err);
+        return false;
+    }
+
+    _varyingIndexBuffer = clCreateBuffer(clContext,
+                                  CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                                  patchTable.GetVaryingPatchIndexSize() * sizeof(int),
+                                  (void*)patchTable.GetVaryingPatchIndexBuffer(),
+                                  &err);
+    if (err != CL_SUCCESS) {
+        Far::Error(Far::FAR_RUNTIME_ERROR, "clCreateBuffer: %d", err);
+        return false;
+    }
+
+    size_t numFVarChannels = patchTable.GetNumFVarChannels();
+    _fvarPatchArrays.resize(numFVarChannels, 0);
+    _fvarIndexBuffers.resize(numFVarChannels, 0);
+    _fvarParamBuffers.resize(numFVarChannels, 0);
+    for (int fvc=0; fvc<(int)numFVarChannels; ++fvc) {
+        _fvarPatchArrays[fvc] = clCreateBuffer(clContext,
+                                  CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                                  numPatchArrays * sizeof(Osd::PatchArray),
+                                  (void*)patchTable.GetFVarPatchArrayBuffer(fvc),
+                                  &err);
+        if (err != CL_SUCCESS) {
+            Far::Error(Far::FAR_RUNTIME_ERROR, "clCreateBuffer: %d", err);
+            return false;
+        }
+
+        _fvarIndexBuffers[fvc] = clCreateBuffer(clContext,
+                                  CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                                  patchTable.GetFVarPatchIndexSize(fvc) * sizeof(int),
+                                  (void*)patchTable.GetFVarPatchIndexBuffer(fvc),
+                                  &err);
+        if (err != CL_SUCCESS) {
+            Far::Error(Far::FAR_RUNTIME_ERROR, "clCreateBuffer: %d", err);
+            return false;
+        }
+
+        _fvarParamBuffers[fvc] = clCreateBuffer(clContext,
+                                   CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+                                   patchTable.GetFVarPatchParamSize(fvc) * sizeof(Osd::PatchParam),
+                                   (void*)patchTable.GetFVarPatchParamBuffer(fvc),
+                                   &err);
+        if (err != CL_SUCCESS) {
+            Far::Error(Far::FAR_RUNTIME_ERROR, "clCreateBuffer: %d", err);
+            return false;
+        }
+    }
+
    return true;
 }

--- a/opensubdiv/osd/clPatchTable.h
+++ b/opensubdiv/osd/clPatchTable.h
@ -31,6 +31,8 @@
 #include "../osd/nonCopyable.h"
 #include "../osd/types.h"

+#include <vector>
+
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {

@ -71,6 +73,21 @@ public:
    /// Returns the CL memory of the array of Osd::PatchParam buffer
    cl_mem GetPatchParamBuffer() const { return _patchParamBuffer; }

+    /// Returns the CL memory of the array of Osd::PatchArray buffer
+    cl_mem GetVaryingPatchArrayBuffer() const { return _varyingPatchArrays; }
+
+    /// Returns the CL memory of the varying control vertices
+    cl_mem GetVaryingPatchIndexBuffer() const { return _varyingIndexBuffer; }
+
+    /// Returns the CL memory of the array of Osd::PatchArray buffer
+    cl_mem GetFVarPatchArrayBuffer(int fvarChannel = 0) const { return _fvarPatchArrays[fvarChannel]; }
+
+    /// Returns the CL memory of the face-varying control vertices
+    cl_mem GetFVarPatchIndexBuffer(int fvarChannel = 0) const { return _fvarIndexBuffers[fvarChannel]; }
+
+    /// Returns the CL memory of the array of Osd::PatchParam buffer
+    cl_mem GetFVarPatchParamBuffer(int fvarChannel = 0) const { return _fvarParamBuffers[fvarChannel]; }
+
 protected:
    CLPatchTable();

@ -79,6 +96,14 @@ protected:
    cl_mem _patchArrays;
    cl_mem _indexBuffer;
    cl_mem _patchParamBuffer;
+
+    cl_mem _varyingPatchArrays;
+    cl_mem _varyingIndexBuffer;
+
+    std::vector<cl_mem> _fvarPatchArrays;
+    std::vector<cl_mem> _fvarIndexBuffers;
+    std::vector<cl_mem> _fvarParamBuffers;
+
 };

 }  // end namespace Osd
--- a/opensubdiv/osd/cpuEvaluator.cpp
+++ b/opensubdiv/osd/cpuEvaluator.cpp
@ -141,12 +141,11 @@ CpuEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
        PatchCoord const &coord = patchCoords[i];
        PatchArray const &array = patchArrays[coord.handle.arrayIndex];

-        int patchType = array.GetPatchType();
-        // XXX: patchIndex is absolute. not sure it's consistent.
-        //      (should be offsetted by array.primitiveIdBase?)
-        //    patchParamBuffer[array.primitiveIdBase + coord.handle.patchIndex]
        Far::PatchParam const & param =
            patchParamBuffer[coord.handle.patchIndex];
+        int patchType = param.IsRegular()
+            ? Far::PatchDescriptor::REGULAR
+            : array.GetPatchType();

        int numControlVertices = 0;
        if (patchType == Far::PatchDescriptor::REGULAR) {
@ -165,8 +164,12 @@ CpuEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
            assert(0);
            return false;
        }
-        const int *cvs =
-            &patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+
+        int indexStride = Far::PatchDescriptor(array.GetPatchType()).GetNumControlVertices();
+        int indexBase = array.GetIndexBase() + indexStride *
+                (coord.handle.patchIndex - array.GetPrimitiveIdBase());
+
+        const int *cvs = &patchIndexBuffer[indexBase];

        dstT.Clear();
        for (int j = 0; j < numControlVertices; ++j) {
@ -217,9 +220,11 @@ CpuEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
        PatchCoord const &coord = patchCoords[i];
        PatchArray const &array = patchArrays[coord.handle.arrayIndex];

-        int patchType = array.GetPatchType();
        Far::PatchParam const & param =
            patchParamBuffer[coord.handle.patchIndex];
+        int patchType = param.IsRegular()
+            ? Far::PatchDescriptor::REGULAR
+            : array.GetPatchType();

        int numControlVertices = 0;
        if (patchType == Far::PatchDescriptor::REGULAR) {
@ -237,8 +242,12 @@ CpuEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
        } else {
            assert(0);
        }
-        const int *cvs =
-            &patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+
+        int indexStride = Far::PatchDescriptor(array.GetPatchType()).GetNumControlVertices();
+        int indexBase = array.GetIndexBase() + indexStride *
+                (coord.handle.patchIndex - array.GetPrimitiveIdBase());
+
+        const int *cvs = &patchIndexBuffer[indexBase];

        dstT.Clear();
        duT.Clear();
--- a/opensubdiv/osd/cpuEvaluator.h
+++ b/opensubdiv/osd/cpuEvaluator.h
@ -28,7 +28,6 @@
 #include "../version.h"

 #include <cstddef>
-#include <vector>
 #include "../osd/bufferDescriptor.h"
 #include "../osd/types.h"

@ -458,6 +457,111 @@ public:
        const int *patchIndexBuffer,
        PatchParam const *patchParamBuffer);

+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CpuEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        CpuEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/cpuPatchTable.cpp
+++ b/opensubdiv/osd/cpuPatchTable.cpp
@ -23,6 +23,9 @@
 //

 #include "../osd/cpuPatchTable.h"
+#include "../far/patchDescriptor.h"
+
+#include <iostream>

 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
@ -43,6 +46,20 @@ CpuPatchTable::CpuPatchTable(const Far::PatchTable *farPatchTable) {
    }
    _patchArrays.reserve(nPatchArrays);
    _indexBuffer.reserve(numIndices);
+
+    _varyingPatchArrays.reserve(nPatchArrays);
+    _varyingIndexBuffer.reserve(
+        numPatches*farPatchTable->GetVaryingPatchDescriptor().GetNumControlVertices());
+
+    _fvarPatchArrays.resize(farPatchTable->GetNumFVarChannels());
+    _fvarIndexBuffers.resize(farPatchTable->GetNumFVarChannels());
+    _fvarParamBuffers.resize(farPatchTable->GetNumFVarChannels());
+    for (int fvc=0; fvc<farPatchTable->GetNumFVarChannels(); ++fvc) {
+        _fvarPatchArrays[fvc].reserve(nPatchArrays);
+        _fvarIndexBuffers[fvc].reserve(
+            numPatches*farPatchTable->GetFVarChannelPatchDescriptor(fvc).GetNumControlVertices());
+        _fvarParamBuffers[fvc].reserve(numPatches);
+    }
    _patchParamBuffer.reserve(numPatches);

    // for each patchArray
@ -55,8 +72,39 @@ CpuPatchTable::CpuPatchTable(const Far::PatchTable *farPatchTable) {

        // indices
        Far::ConstIndexArray indices = farPatchTable->GetPatchArrayVertices(j);
-        for (int k = 0; k < indices.size(); ++k) {
-            _indexBuffer.push_back(indices[k]);
+        _indexBuffer.insert(_indexBuffer.end(), indices.begin(), indices.end());
+
+        // varying
+        PatchArray varyingPatchArray(
+            farPatchTable->GetVaryingPatchDescriptor(), numPatches, 0, 0);
+        _varyingPatchArrays.push_back(varyingPatchArray);
+        Far::ConstIndexArray
+            varyingIndices = farPatchTable->GetPatchArrayVaryingVertices(j);
+        _varyingIndexBuffer.insert(_varyingIndexBuffer.end(),
+                varyingIndices.begin(), varyingIndices.end());
+
+        // face-varying
+        for (int fvc=0; fvc<farPatchTable->GetNumFVarChannels(); ++fvc) {
+            PatchArray fvarPatchArray(
+                farPatchTable->GetFVarChannelPatchDescriptor(fvc), numPatches, 0, 0);
+            _fvarPatchArrays[fvc].push_back(fvarPatchArray);
+
+            Far::ConstIndexArray
+                fvarIndices = farPatchTable->GetPatchArrayFVarValues(j, fvc);
+            _fvarIndexBuffers[fvc].insert(_fvarIndexBuffers[fvc].end(),
+                    fvarIndices.begin(), fvarIndices.end());
+
+            // face-varying param
+            Far::ConstPatchParamArray
+                fvarParam = farPatchTable->GetPatchArrayFVarPatchParam(j, fvc);
+            for (int k = 0; k < numPatches; ++k) {
+                PatchParam param;
+                //param.patchParam = patchParamTable[patchIndex];
+                param.field0 = fvarParam[k].field0;
+                param.field1 = fvarParam[k].field1;
+                param.sharpness = 0.0f;
+                _fvarParamBuffers[fvc].push_back(param);
+            }
        }

        // patchParams bundling
--- a/opensubdiv/osd/cpuPatchTable.h
+++ b/opensubdiv/osd/cpuPatchTable.h
@ -32,6 +32,8 @@
 #include "../osd/nonCopyable.h"
 #include "../osd/types.h"

+#include <vector>
+
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {

@ -85,10 +87,52 @@ public:
        return _patchParamBuffer.size();
    }

+    const PatchArray *GetVaryingPatchArrayBuffer() const {
+        if (_varyingPatchArrays.empty()) {
+            return NULL;
+        }
+        return &_varyingPatchArrays[0];
+    }
+    const int *GetVaryingPatchIndexBuffer() const {
+        if (_varyingIndexBuffer.empty()) {
+            return NULL;
+        }
+        return &_varyingIndexBuffer[0];
+    }
+    size_t GetVaryingPatchIndexSize() const {
+        return _varyingIndexBuffer.size();
+    }
+
+    int GetNumFVarChannels() const {
+        return (int)_fvarPatchArrays.size();
+    }
+    const PatchArray *GetFVarPatchArrayBuffer(int fvarChannel = 0) const {
+        return &_fvarPatchArrays[fvarChannel][0];
+    }
+    const int *GetFVarPatchIndexBuffer(int fvarChannel = 0) const {
+        return &_fvarIndexBuffers[fvarChannel][0];
+    }
+    size_t GetFVarPatchIndexSize(int fvarChannel = 0) const {
+        return _fvarIndexBuffers[fvarChannel].size();
+    }
+    const PatchParam *GetFVarPatchParamBuffer(int fvarChannel= 0) const {
+        return &_fvarParamBuffers[fvarChannel][0];
+    }
+    size_t GetFVarPatchParamSize(int fvarChannel = 0) const {
+        return _fvarParamBuffers[fvarChannel].size();
+    }
+
 protected:
    PatchArrayVector _patchArrays;
    std::vector<int> _indexBuffer;
    PatchParamVector _patchParamBuffer;
+
+    PatchArrayVector _varyingPatchArrays;
+    std::vector<int> _varyingIndexBuffer;
+
+    std::vector< PatchArrayVector > _fvarPatchArrays;
+    std::vector< std::vector<int> > _fvarIndexBuffers;
+    std::vector< PatchParamVector > _fvarParamBuffers;
 };

 }  // end namespace Osd
--- a/opensubdiv/osd/cudaEvaluator.cpp
+++ b/opensubdiv/osd/cudaEvaluator.cpp
@ -61,6 +61,7 @@ extern "C" {
        const void *patchArrays,
        const int *patchIndices,
        const void *patchParams);
+
 }

 namespace OpenSubdiv {
--- a/opensubdiv/osd/cudaEvaluator.h
+++ b/opensubdiv/osd/cudaEvaluator.h
@ -494,6 +494,111 @@ public:
        const int *patchIndices,
        const PatchParam *patchParams);

+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///                         must have BindCudaBuffer() method returning an
+    ///                         array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable       CudaPatchTable or equivalent
+    ///
+    /// @param instance         not used in the cuda evaluator
+    ///
+    /// @param deviceContext    not used in the cuda evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CudaEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        (void)instance;   // unused
+        (void)deviceContext;   // unused
+
+        return EvalPatches(srcBuffer->BindCudaBuffer(), srcDesc,
+                           dstBuffer->BindCudaBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord *)patchCoords->BindCudaBuffer(),
+                           (const PatchArray *)patchTable->GetVaryingPatchArrayBuffer(),
+                           (const int *)patchTable->GetVaryingPatchIndexBuffer(),
+                           (const PatchParam *)patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///                         must have BindCudaBuffer() method returning an
+    ///                         array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable       CudaPatchTable or equivalent
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the cuda evaluator
+    ///
+    /// @param deviceContext    not used in the cuda evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        CudaEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        (void)instance;   // unused
+        (void)deviceContext;   // unused
+
+        return EvalPatches(srcBuffer->BindCudaBuffer(), srcDesc,
+                           dstBuffer->BindCudaBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord *)patchCoords->BindCudaBuffer(),
+                           (const PatchArray *)patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           (const int *)patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           (const PatchParam *)patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/cudaKernel.cu
+++ b/opensubdiv/osd/cudaKernel.cu
@ -23,6 +23,8 @@
 //

 #include <assert.h>
+#define OSD_PATCH_BASIS_CUDA
+#include "../osd/patchBasisCommon.h"

 // -----------------------------------------------------------------------------
 template<int N> struct DeviceVertex {
@ -258,54 +260,6 @@ struct PatchParam {
    float sharpness;
 };

-__device__ void
-getBSplineWeights(float t, float point[4], float deriv[4]) {
-    // The four uniform cubic B-Spline basis functions evaluated at t:
-    float const one6th = 1.0f / 6.0f;
-
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    point[0] = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
-    point[1] = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
-    point[2] = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
-    point[3] = one6th * (                                 t3);
-
-    // Derivatives of the above four basis functions at t:
-    if (deriv) {
-        deriv[0] = -0.5f*t2 +      t - 0.5f;
-        deriv[1] =  1.5f*t2 - 2.0f*t;
-        deriv[2] = -1.5f*t2 +      t + 0.5f;
-        deriv[3] =  0.5f*t2;
-    }
-}
-
-__device__ void
-adjustBoundaryWeights(unsigned int bits, float sWeights[4], float tWeights[4]) {
-    int boundary = ((bits >> 8) & 0xf);  // far/patchParam.h
-
-    if (boundary & 1) {
-        tWeights[2] -= tWeights[0];
-        tWeights[1] += 2*tWeights[0];
-        tWeights[0] = 0;
-    }
-    if (boundary & 2) {
-        sWeights[1] -= sWeights[3];
-        sWeights[2] += 2*sWeights[3];
-        sWeights[3] = 0;
-    }
-    if (boundary & 4) {
-        tWeights[1] -= tWeights[3];
-        tWeights[2] += 2*tWeights[3];
-        tWeights[3] = 0;
-    }
-    if (boundary & 8) {
-        sWeights[2] -= sWeights[0];
-        sWeights[1] += 2*sWeights[0];
-        sWeights[0] = 0;
-    }
-}
-
 __device__
 int getDepth(unsigned int patchBits) {
    return (patchBits & 0xf);
@ -338,6 +292,18 @@ void normalizePatchCoord(unsigned int patchBits, float *u, float *v) {
    *v = (*v - pv) / frac;
 }

+__device__
+bool isRegular(unsigned int patchBits) {
+    return ((patchBits >> 5) & 0x1) != 0;
+}
+
+__device__
+int getNumControlVertices(int patchType) {
+    return (patchType == 3) ? 4 :
+           (patchType == 6) ? 16 :
+           (patchType == 9) ? 20 : 0;
+}
+
 __global__ void
 computePatches(const float *src, float *dst, float *dstDu, float *dstDv,
               int length, int srcStride, int dstStride, int dstDuStride, int dstDvStride,
@ -350,46 +316,43 @@ computePatches(const float *src, float *dst, float *dstDu, float *dstDv,

    // PERFORMANCE: not yet optimized

-    float wP[20], wDs[20], wDt[20];
+    float wP[20], wDs[20], wDt[20], wDss[20], wDst[20], wDtt[20];

    for (int i = first; i < numPatchCoords; i += blockDim.x * gridDim.x) {

        PatchCoord const &coord = patchCoords[i];
        PatchArray const &array = patchArrayBuffer[coord.arrayIndex];

-        int patchType = 6; // array.patchType XXX: REGULAR only for now.
-        int numControlVertices = 16;
-        // note: patchIndex is absolute.
        unsigned int patchBits = patchParamBuffer[coord.patchIndex].field1;
+        int patchType = isRegular(patchBits) ? 6 : array.patchType;

        // normalize
        float s = coord.s;
        float t = coord.t;
        normalizePatchCoord(patchBits, &s, &t);
        float dScale = (float)(1 << getDepth(patchBits));
+        int boundary = int((patchBits >> 8) & 0xfU);

-        if (patchType == 6) {
-            float sWeights[4], tWeights[4], dsWeights[4], dtWeights[4];
-            getBSplineWeights(s, sWeights, dsWeights);
-            getBSplineWeights(t, tWeights, dtWeights);
-
-            // Compute the tensor product weight of the (s,t) basis function
-            // corresponding to each control vertex:
-            adjustBoundaryWeights(patchBits, sWeights, tWeights);
-            adjustBoundaryWeights(patchBits, dsWeights, dtWeights);
-
-            for (int k = 0; k < 4; ++k) {
-                for (int l = 0; l < 4; ++l) {
-                    wP[4*k+l]  = sWeights[l]  * tWeights[k];
-                    wDs[4*k+l] = dsWeights[l] * tWeights[k]  * dScale;
-                    wDt[4*k+l] = sWeights[l]  * dtWeights[k] * dScale;
-                }
-            }
-        } else {
-            // TODO: Gregory Basis.
-            continue;
+        int numControlVertices = 0;
+        if (patchType == 3) {
+            OsdGetBilinearPatchWeights(s, t, dScale,
+                wP, wDs, wDt, wDss, wDst, wDtt);
+            numControlVertices = 4;
+        } else if (patchType == 6) {
+            OsdGetBSplinePatchWeights(s, t, dScale, boundary,
+                wP, wDs, wDt, wDss, wDst, wDtt);
+            numControlVertices = 16;
+        } else if (patchType == 9) {
+            OsdGetGregoryPatchWeights(s, t, dScale,
+                wP, wDs, wDt, wDss, wDst, wDtt);
+            numControlVertices = 20;
        }
-        const int *cvs = patchIndexBuffer + array.indexBase + coord.vertIndex;
+
+        int indexStride = getNumControlVertices(array.patchType);
+        int indexBase = array.indexBase + indexStride *
+                (coord.patchIndex - array.primitiveIdBase);
+
+        const int *cvs = patchIndexBuffer + indexBase;

        float * dstVert = dst + i * dstStride;
        clear(dstVert, length);
--- a/opensubdiv/osd/cudaPatchTable.cpp
+++ b/opensubdiv/osd/cudaPatchTable.cpp
@ -35,13 +35,25 @@ namespace OPENSUBDIV_VERSION {
 namespace Osd {

 CudaPatchTable::CudaPatchTable() :
-    _patchArrays(NULL), _indexBuffer(NULL), _patchParamBuffer(NULL) {
+    _patchArrays(NULL), _indexBuffer(NULL), _patchParamBuffer(NULL),
+    _varyingPatchArrays(NULL), _varyingIndexBuffer(NULL) {
 }

 CudaPatchTable::~CudaPatchTable() {
    if (_patchArrays) cudaFree(_patchArrays);
    if (_indexBuffer) cudaFree(_indexBuffer);
    if (_patchParamBuffer) cudaFree(_patchParamBuffer);
+    if (_varyingPatchArrays) cudaFree(_varyingPatchArrays);
+    if (_varyingIndexBuffer) cudaFree(_varyingIndexBuffer);
+    for (int fvc=0; fvc<(int)_fvarPatchArrays.size(); ++fvc) {
+        if (_fvarPatchArrays[fvc]) cudaFree(_fvarPatchArrays[fvc]);
+    }
+    for (int fvc=0; fvc<(int)_fvarIndexBuffers.size(); ++fvc) {
+        if (_fvarIndexBuffers[fvc]) cudaFree(_fvarIndexBuffers[fvc]);
+    }
+    for (int fvc=0; fvc<(int)_fvarParamBuffers.size(); ++fvc) {
+        if (_fvarParamBuffers[fvc]) cudaFree(_fvarParamBuffers[fvc]);
+    }
 }

 CudaPatchTable *
@ -71,6 +83,48 @@ CudaPatchTable::allocate(Far::PatchTable const *farPatchTable) {
    err = cudaMalloc(&_patchParamBuffer, patchParamSize * sizeof(Osd::PatchParam));
    if (err != cudaSuccess) return false;

+    err = cudaMalloc(&_varyingPatchArrays, numPatchArrays * sizeof(Osd::PatchArray));
+    if (err != cudaSuccess) return false;
+
+    size_t varyingIndexSize = patchTable.GetVaryingPatchIndexSize();
+    err = cudaMalloc(&_varyingIndexBuffer, varyingIndexSize * sizeof(int));
+    if (err != cudaSuccess) return false;
+
+    size_t numFVarChannels = patchTable.GetNumFVarChannels();
+    _fvarPatchArrays.resize(numFVarChannels, 0);
+    _fvarIndexBuffers.resize(numFVarChannels, 0);
+    _fvarParamBuffers.resize(numFVarChannels, 0);
+    for (int fvc=0; fvc<(int)numFVarChannels; ++fvc) {
+        err = cudaMalloc(&_fvarPatchArrays[fvc], numPatchArrays * sizeof(Osd::PatchArray));
+        if (err != cudaSuccess) return false;
+
+        err = cudaMemcpy(_fvarPatchArrays[fvc],
+                         patchTable.GetFVarPatchArrayBuffer(fvc),
+                         numPatchArrays * sizeof(Osd::PatchArray),
+                         cudaMemcpyHostToDevice);
+        if (err != cudaSuccess) return false;
+
+        size_t fvarIndexSize = patchTable.GetFVarPatchIndexSize(fvc);
+        err = cudaMalloc(&_fvarIndexBuffers[fvc], fvarIndexSize * sizeof(int));
+        if (err != cudaSuccess) return false;
+
+        err = cudaMemcpy(_fvarIndexBuffers[fvc],
+                         patchTable.GetFVarPatchIndexBuffer(fvc),
+                         indexSize * sizeof(int),
+                         cudaMemcpyHostToDevice);
+        if (err != cudaSuccess) return false;
+
+        size_t fvarParamSize = patchTable.GetFVarPatchParamSize(fvc);
+        err = cudaMalloc(&_fvarParamBuffers[fvc], fvarParamSize * sizeof(Osd::PatchParam));
+        if (err != cudaSuccess) return false;
+
+        err = cudaMemcpy(_fvarParamBuffers[fvc],
+                         patchTable.GetFVarPatchParamBuffer(fvc),
+                         patchParamSize * sizeof(PatchParam),
+                         cudaMemcpyHostToDevice);
+        if (err != cudaSuccess) return false;
+    }
+
    // copy patch array
    err = cudaMemcpy(_patchArrays,
                     patchTable.GetPatchArrayBuffer(),
@ -92,6 +146,18 @@ CudaPatchTable::allocate(Far::PatchTable const *farPatchTable) {
                     cudaMemcpyHostToDevice);
    if (err != cudaSuccess) return false;

+    // copy varying patch arrays and index buffer
+    err = cudaMemcpy(_varyingPatchArrays,
+                     patchTable.GetVaryingPatchArrayBuffer(),
+                     numPatchArrays * sizeof(Osd::PatchArray),
+                     cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) return false;
+    err = cudaMemcpy(_varyingIndexBuffer,
+                     patchTable.GetVaryingPatchIndexBuffer(),
+                     varyingIndexSize * sizeof(int),
+                     cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) return false;
+
    return true;
 }

--- a/opensubdiv/osd/cudaPatchTable.h
+++ b/opensubdiv/osd/cudaPatchTable.h
@ -30,6 +30,8 @@
 #include "../osd/nonCopyable.h"
 #include "../osd/types.h"

+#include <vector>
+
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {

@ -63,6 +65,30 @@ public:
    /// Returns the cuda memory of the array of Osd::PatchParam buffer
    void *GetPatchParamBuffer() const { return _patchParamBuffer; }

+    /// Returns the cuda memory of the array of Osd::PatchArray buffer
+    void *GetVaryingPatchArrayBuffer() const {
+        return _varyingPatchArrays;
+    }
+    /// Returns the cuda memory of the array of varying control vertices
+    void *GetVaryingPatchIndexBuffer() const {
+        return _varyingIndexBuffer;
+    }
+
+    /// Returns the cuda memory of the array of Osd::PatchArray buffer
+    void *GetFVarPatchArrayBuffer(int fvarChannel) const {
+        return _fvarPatchArrays[fvarChannel];
+    }
+
+    /// Returns the cuda memory of the array of face-varying control vertices
+    void *GetFVarPatchIndexBuffer(int fvarChannel = 0) const {
+        return _fvarIndexBuffers[fvarChannel];
+    }
+
+    /// Returns the cuda memory of the array of face-varying param
+    void *GetFVarPatchParamBuffer(int fvarChannel = 0) const {
+        return _fvarParamBuffers[fvarChannel];
+    }
+
 protected:
    CudaPatchTable();

@ -71,6 +97,13 @@ protected:
    void *_patchArrays;
    void *_indexBuffer;
    void *_patchParamBuffer;
+
+    void *_varyingPatchArrays;
+    void *_varyingIndexBuffer;
+
+    std::vector<void *> _fvarPatchArrays;
+    std::vector<void *> _fvarIndexBuffers;
+    std::vector<void *> _fvarParamBuffers;
 };

 }  // end namespace Osd
--- a/opensubdiv/osd/glComputeEvaluator.cpp
+++ b/opensubdiv/osd/glComputeEvaluator.cpp
@ -23,6 +23,7 @@
 //

 #include "../osd/glComputeEvaluator.h"
+#include "../osd/glslPatchShaderSource.h"

 #include <cassert>
 #include <sstream>
@ -127,18 +128,25 @@ compileKernel(BufferDescriptor const &srcDesc,

    GLuint shader = glCreateShader(GL_COMPUTE_SHADER);

+    std::string patchBasisShaderSource =
+        GLSLPatchShaderSource::GetPatchBasisShaderSource();
+    const char *patchBasisShaderSourceDefine = "#define OSD_PATCH_BASIS_GLSL\n";
+
    std::ostringstream defines;
    defines << "#define LENGTH "     << srcDesc.length << "\n"
            << "#define SRC_STRIDE " << srcDesc.stride << "\n"
            << "#define DST_STRIDE " << dstDesc.stride << "\n"
            << "#define WORK_GROUP_SIZE " << workGroupSize << "\n"
-            << kernelDefine << "\n";
+            << kernelDefine << "\n"
+            << patchBasisShaderSourceDefine << "\n";
    std::string defineStr = defines.str();

-    const char *shaderSources[3] = {"#version 430\n", 0, 0};
+    const char *shaderSources[4] = {"#version 430\n", 0, 0, 0};
+
    shaderSources[1] = defineStr.c_str();
-    shaderSources[2] = shaderSource;
-    glShaderSource(shader, 3, shaderSources, NULL);
+    shaderSources[2] = patchBasisShaderSource.c_str();
+    shaderSources[3] = shaderSource;
+    glShaderSource(shader, 4, shaderSources, NULL);
    glCompileShader(shader);
    glAttachShader(program, shader);

--- a/opensubdiv/osd/glComputeEvaluator.h
+++ b/opensubdiv/osd/glComputeEvaluator.h
@ -306,7 +306,7 @@ public:
    ///   Limit evaluations with PatchTable
    ///
    /// ----------------------------------------------------------------------
-    ///
+
    /// \brief Generic limit eval function. This function has a same
    ///        signature as other device kernels have so that it can be called
    ///        in the same way.
@ -561,6 +561,236 @@ public:
                     GLuint patchIndexBuffer,
                     GLuint patchParamsBuffer) const;

+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  not used in the GLXFB evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        GLComputeEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        if (instance) {
+            return instance->EvalPatchesVarying(
+                                         srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc,
+                              BufferDescriptor(),
+                              BufferDescriptor());
+            if (instance) {
+                bool r = instance->EvalPatchesVarying(
+                                               srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBOBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable) const {
+
+        return EvalPatches(srcBuffer->BindVBO(), srcDesc,
+                           dstBuffer->BindVBO(), dstDesc,
+                           0, BufferDescriptor(),
+                           0, BufferDescriptor(),
+                           numPatchCoords,
+                           patchCoords->BindVBO(),
+                           patchTable->GetVaryingPatchArrays(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param fvarChannel    face-varying channel
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  not used in the GLXFB evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        GLComputeEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        if (instance) {
+            return instance->EvalPatchesFaceVarying(
+                                         srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable, fvarChannel);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc,
+                              BufferDescriptor(),
+                              BufferDescriptor());
+            if (instance) {
+                bool r = instance->EvalPatchesFaceVarying(
+                                               srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable, fvarChannel);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBOBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param fvarChannel    face-varying channel
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel = 0) const {
+
+        return EvalPatches(srcBuffer->BindVBO(), srcDesc,
+                           dstBuffer->BindVBO(), dstDesc,
+                           0, BufferDescriptor(),
+                           0, BufferDescriptor(),
+                           numPatchCoords,
+                           patchCoords->BindVBO(),
+                           patchTable->GetFVarPatchArrays(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/glPatchTable.cpp
+++ b/opensubdiv/osd/glPatchTable.cpp
@ -43,6 +43,14 @@ GLPatchTable::~GLPatchTable() {
    if (_patchParamBuffer) glDeleteBuffers(1, &_patchParamBuffer);
    if (_patchIndexTexture) glDeleteTextures(1, &_patchIndexTexture);
    if (_patchParamTexture) glDeleteTextures(1, &_patchParamTexture);
+    if (_varyingIndexBuffer) glDeleteBuffers(1, &_varyingIndexBuffer);
+    if (_varyingIndexTexture) glDeleteTextures(1, &_varyingIndexTexture);
+    for (int fvc=0; fvc<(int)_fvarIndexBuffers.size(); ++fvc) {
+        if (_fvarIndexBuffers[fvc]) glDeleteBuffers(1, &_fvarIndexBuffers[fvc]);
+    }
+    for (int fvc=0; fvc<(int)_fvarIndexTextures.size(); ++fvc) {
+        if (_fvarIndexTextures[fvc]) glDeleteTextures(1, &_fvarIndexTextures[fvc]);
+    }
 }

 GLPatchTable *
@ -93,6 +101,60 @@ GLPatchTable::allocate(Far::PatchTable const *farPatchTable) {

    glBindTexture(GL_TEXTURE_BUFFER, _patchParamTexture);
    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGB32I, _patchParamBuffer);
+
+    // varying
+    _varyingPatchArrays.assign(
+        patchTable.GetVaryingPatchArrayBuffer(),
+        patchTable.GetVaryingPatchArrayBuffer() + numPatchArrays);
+
+    glGenBuffers(1, &_varyingIndexBuffer);
+    glBindBuffer(GL_ARRAY_BUFFER, _varyingIndexBuffer);
+    glBufferData(GL_ARRAY_BUFFER,
+                 patchTable.GetVaryingPatchIndexSize() * sizeof(GLint),
+                 patchTable.GetVaryingPatchIndexBuffer(),
+                 GL_STATIC_DRAW);
+
+    glGenTextures(1, &_varyingIndexTexture);
+    glBindTexture(GL_TEXTURE_BUFFER, _varyingIndexTexture);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32I, _varyingIndexBuffer);
+
+    // face-varying
+    int numFVarChannels = patchTable.GetNumFVarChannels();
+    _fvarPatchArrays.resize(numFVarChannels);
+    _fvarIndexBuffers.resize(numFVarChannels);
+    _fvarIndexTextures.resize(numFVarChannels);
+    _fvarParamBuffers.resize(numFVarChannels);
+    _fvarParamTextures.resize(numFVarChannels);
+    for (int fvc=0; fvc<numFVarChannels; ++fvc) {
+        _fvarPatchArrays[fvc].assign(
+            patchTable.GetFVarPatchArrayBuffer(fvc),
+            patchTable.GetFVarPatchArrayBuffer(fvc) + numPatchArrays);
+
+        glGenBuffers(1, &_fvarIndexBuffers[fvc]);
+        glBindBuffer(GL_ARRAY_BUFFER, _fvarIndexBuffers[fvc]);
+        glBufferData(GL_ARRAY_BUFFER,
+                     patchTable.GetFVarPatchIndexSize(fvc) * sizeof(GLint),
+                     patchTable.GetFVarPatchIndexBuffer(fvc),
+                 GL_STATIC_DRAW);
+
+        glGenTextures(1, &_fvarIndexTextures[fvc]);
+        glBindTexture(GL_TEXTURE_BUFFER, _fvarIndexTextures[fvc]);
+        glTexBuffer(GL_TEXTURE_BUFFER, GL_R32I, _fvarIndexBuffers[fvc]);
+
+        glGenBuffers(1, &_fvarParamBuffers[fvc]);
+        glBindBuffer(GL_ARRAY_BUFFER, _fvarParamBuffers[fvc]);
+        glBufferData(GL_ARRAY_BUFFER,
+                     patchTable.GetFVarPatchParamSize(fvc) * sizeof(PatchParam),
+                     patchTable.GetFVarPatchParamBuffer(fvc),
+                 GL_STATIC_DRAW);
+
+        glGenTextures(1, &_fvarParamTextures[fvc]);
+        glBindTexture(GL_TEXTURE_BUFFER, _fvarParamTextures[fvc]);
+        glTexBuffer(GL_TEXTURE_BUFFER, GL_RGB32I, _fvarParamBuffers[fvc]);
+        glBindTexture(GL_TEXTURE_BUFFER, 0);
+    }
+
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glBindTexture(GL_TEXTURE_BUFFER, 0);

    return true;
--- a/opensubdiv/osd/glPatchTable.h
+++ b/opensubdiv/osd/glPatchTable.h
@ -31,6 +31,8 @@
 #include "../osd/opengl.h"
 #include "../osd/types.h"

+#include <vector>
+
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {

@ -73,6 +75,38 @@ public:
        return _patchParamTexture;
    }

+    PatchArrayVector const &GetVaryingPatchArrays() const {
+        return _varyingPatchArrays;
+    }
+
+    /// Returns the GL index buffer containing the varying control vertices
+    GLuint GetVaryingPatchIndexBuffer() const {
+        return _varyingIndexBuffer;
+    }
+
+    /// Returns the GL texture buffer containing the varying control vertices
+    GLuint GetVaryingPatchIndexTextureBuffer() const {
+        return _varyingIndexTexture;
+    }
+
+    PatchArrayVector const &GetFVarPatchArrays(int fvarChannel = 0) const {
+        return _fvarPatchArrays[fvarChannel];
+    }
+
+    /// Returns the GL texture buffer containing the face-varying control vertices
+    GLuint GetFVarPatchIndexBuffer(int fvarChannel = 0) const {
+        return _fvarIndexBuffers[fvarChannel];
+    }
+    GLuint GetFVarPatchIndexTextureBuffer(int fvarChannel = 0) const {
+        return _fvarIndexTextures[fvarChannel];
+    }
+    GLuint GetFVarPatchParamBuffer(int fvarChannel = 0) const {
+        return _fvarParamBuffers[fvarChannel];
+    }
+    GLuint GetFVarPatchParamTextureBuffer(int fvarChannel = 0) const {
+        return _fvarParamTextures[fvarChannel];
+    }
+
 protected:
    GLPatchTable();

@ -86,6 +120,17 @@ protected:

    GLuint _patchIndexTexture;
    GLuint _patchParamTexture;
+
+    PatchArrayVector _varyingPatchArrays;
+    GLuint _varyingIndexBuffer;
+    GLuint _varyingIndexTexture;
+
+    std::vector<PatchArrayVector> _fvarPatchArrays;
+    std::vector<GLuint> _fvarIndexBuffers;
+    std::vector<GLuint> _fvarIndexTextures;
+
+    std::vector<GLuint> _fvarParamBuffers;
+    std::vector<GLuint> _fvarParamTextures;
 };


--- a/opensubdiv/osd/glXFBEvaluator.cpp
+++ b/opensubdiv/osd/glXFBEvaluator.cpp
@ -23,6 +23,7 @@
 //

 #include "../osd/glXFBEvaluator.h"
+#include "../osd/glslPatchShaderSource.h"

 #include <sstream>
 #include <string>
@ -154,18 +155,25 @@ compileKernel(BufferDescriptor const &srcDesc,

    GLuint vertexShader = glCreateShader(GL_VERTEX_SHADER);

+    std::string patchBasisShaderSource =
+        GLSLPatchShaderSource::GetPatchBasisShaderSource();
+    const char *patchBasisShaderSourceDefine = "#define OSD_PATCH_BASIS_GLSL\n";
+
    std::ostringstream defines;
    defines << "#define LENGTH " << srcDesc.length << "\n"
            << "#define SRC_STRIDE " << srcDesc.stride << "\n"
            << "#define VERTEX_SHADER\n"
-            << kernelDefine << "\n";
+            << kernelDefine << "\n"
+            << patchBasisShaderSourceDefine << "\n";
    std::string defineStr = defines.str();

-    const char *shaderSources[3] = {"#version 410\n", NULL, NULL};
+
+    const char *shaderSources[4] = {"#version 410\n", NULL, NULL, NULL};

    shaderSources[1] = defineStr.c_str();
-    shaderSources[2] = shaderSource;
-    glShaderSource(vertexShader, 3, shaderSources, NULL);
+    shaderSources[2] = patchBasisShaderSource.c_str();
+    shaderSources[3] = shaderSource;
+    glShaderSource(vertexShader, 4, shaderSources, NULL);
    glCompileShader(vertexShader);
    glAttachShader(program, vertexShader);

--- a/opensubdiv/osd/glXFBEvaluator.h
+++ b/opensubdiv/osd/glXFBEvaluator.h
@ -380,7 +380,7 @@ public:
    ///   Limit evaluations with PatchTable
    ///
    /// ----------------------------------------------------------------------
-    ///
+
    /// \brief Generic limit eval function. This function has a same
    ///        signature as other device kernels have so that it can be called
    ///        in the same way.
@ -530,13 +530,13 @@ public:
    ///        in the same way.
    ///
    /// @param srcBuffer      Input primvar buffer.
-    ///                       must have BindCudaBuffer() method returning a
+    ///                       must have BindVBO() method returning a
    ///                       const float pointer for read
    ///
    /// @param srcDesc        vertex buffer descriptor for the input buffer
    ///
    /// @param dstBuffer      Output primvar buffer
-    ///                       must have BindCudaBuffer() method returning a
+    ///                       must have BindVBO() method returning a
    ///                       float pointer for write
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
@ -544,7 +544,7 @@ public:
    /// @param numPatchCoords number of patchCoords.
    ///
    /// @param patchCoords    array of locations to be evaluated.
-    ///                       must have BindCudaBuffer() method returning an
+    ///                       must have BindVBO() method returning an
    ///                       array of PatchCoord struct in cuda memory.
    ///
    /// @param patchTable     GLPatchTable or equivalent
@ -574,25 +574,25 @@ public:
    ///        called in the same way.
    ///
    /// @param srcBuffer        Input primvar buffer.
-    ///                         must have BindCudaBuffer() method returning a
+    ///                         must have BindVBO() method returning a
    ///                         const float pointer for read
    ///
    /// @param srcDesc          vertex buffer descriptor for the input buffer
    ///
    /// @param dstBuffer        Output primvar buffer
-    ///                         must have BindCudaBuffer() method returning a
+    ///                         must have BindVBO() method returning a
    ///                         float pointer for write
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
    /// @param duBuffer         Output s-derivatives buffer
-    ///                         must have BindCudaBuffer() method returning a
+    ///                         must have BindVBO() method returning a
    ///                         float pointer for write
    ///
    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
    /// @param dvBuffer         Output t-derivatives buffer
-    ///                         must have BindCudaBuffer() method returning a
+    ///                         must have BindVBO() method returning a
    ///                         float pointer for write
    ///
    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
@ -635,6 +635,238 @@ public:
                     GLuint patchIndexBuffer,
                     GLuint patchParamsBuffer) const;

+    ///
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  not used in the GLXFB evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        GLXFBEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        if (instance) {
+            return instance->EvalPatchesVarying(
+                                         srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc,
+                              BufferDescriptor(),
+                              BufferDescriptor());
+            if (instance) {
+                bool r = instance->EvalPatchesVarying(
+                                               srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable) const {
+
+        return EvalPatches(srcBuffer->BindVBO(), srcDesc,
+                           dstBuffer->BindVBO(), dstDesc,
+                           0, BufferDescriptor(),
+                           0, BufferDescriptor(),
+                           numPatchCoords,
+                           patchCoords->BindVBO(),
+                           patchTable->GetVaryingPatchArrays(),
+                           patchTable->GetVaryingPatchIndexTextureBuffer(),
+                           patchTable->GetPatchParamTextureBuffer());
+    }
+
+    ///
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param fvarChannel    face-varying channel
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  not used in the GLXFB evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        GLXFBEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        if (instance) {
+            return instance->EvalPatchesFaceVarying(
+                                         srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable, fvarChannel);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc,
+                              BufferDescriptor(),
+                              BufferDescriptor());
+            if (instance) {
+                bool r = instance->EvalPatchesFaceVarying(
+                                               srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable, fvarChannel);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param fvarChannel    face-varying channel
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel = 0) const {
+
+        return EvalPatches(srcBuffer->BindVBO(), srcDesc,
+                           dstBuffer->BindVBO(), dstDesc,
+                           0, BufferDescriptor(),
+                           0, BufferDescriptor(),
+                           numPatchCoords,
+                           patchCoords->BindVBO(),
+                           patchTable->GetFVarPatchArrays(fvarChannel),
+                           patchTable->GetFVarPatchIndexTextureBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamTextureBuffer(fvarChannel));
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/glslComputeKernel.glsl
+++ b/opensubdiv/osd/glslComputeKernel.glsl
@ -195,25 +195,6 @@ void main() {
 //};
 // # of patcharrays is 1 or 2.

-void getBSplineWeights(float t, inout vec4 point, inout vec4 deriv) {
-    // The four uniform cubic B-Spline basis functions evaluated at t:
-    float one6th = 1.0f / 6.0f;
-
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    point.x = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
-    point.y = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
-    point.z = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
-    point.w = one6th * (                                 t3);
-
-    // Derivatives of the above four basis functions at t:
-    deriv.x = -0.5f*t2 +      t - 0.5f;
-    deriv.y =  1.5f*t2 - 2.0f*t;
-    deriv.z = -1.5f*t2 +      t + 0.5f;
-    deriv.w =  0.5f*t2;
-}
-
 uint getDepth(uint patchBits) {
    return (patchBits & 0xf);
 }
@ -242,29 +223,14 @@ vec2 normalizePatchCoord(uint patchBits, vec2 uv) {
    return vec2((uv.x - pu) / frac, (uv.y - pv) / frac);
 }

-void adjustBoundaryWeights(uint bits, inout vec4 sWeights, inout vec4 tWeights) {
-    uint boundary = ((bits >> 8) & 0xf);
+bool isRegular(uint patchBits) {
+    return (((patchBits >> 5) & 0x1u) != 0);
+}

-    if ((boundary & 1) != 0) {
-        tWeights[2] -= tWeights[0];
-        tWeights[1] += 2*tWeights[0];
-        tWeights[0] = 0;
-    }
-    if ((boundary & 2) != 0) {
-        sWeights[1] -= sWeights[3];
-        sWeights[2] += 2*sWeights[3];
-        sWeights[3] = 0;
-    }
-    if ((boundary & 4) != 0) {
-        tWeights[1] -= tWeights[3];
-        tWeights[2] += 2*tWeights[3];
-        tWeights[3] = 0;
-    }
-    if ((boundary & 8) != 0) {
-        sWeights[2] -= sWeights[0];
-        sWeights[1] += 2*sWeights[0];
-        sWeights[0] = 0;
-    }
+int getNumControlVertices(int patchType) {
+    return (patchType == 3) ? 4 :
+           (patchType == 6) ? 16 :
+           (patchType == 9) ? 20 : 0;
 }

 void main() {
@ -275,31 +241,38 @@ void main() {
    int patchIndex = coord.patchIndex;

    ivec4 array = patchArray[coord.arrayIndex];
-    int patchType = 6; // array.x XXX: REGULAR only for now.
-    int numControlVertices = 16;

    uint patchBits = patchParamBuffer[patchIndex].field1;
+    int patchType = isRegular(patchBits) ? 6 : array.x;
+
    vec2 uv = normalizePatchCoord(patchBits, vec2(coord.s, coord.t));
    float dScale = float(1 << getDepth(patchBits));
+    int boundary = int((patchBits >> 8) & 0xfU);

-    float wP[20], wDs[20], wDt[20];
-    if (patchType == 6) {  // REGULAR
-        vec4 sWeights, tWeights, dsWeights, dtWeights;
-        getBSplineWeights(uv.x, sWeights, dsWeights);
-        getBSplineWeights(uv.y, tWeights, dtWeights);
+    float wP[20], wDs[20], wDt[20], wDss[20], wDst[20], wDtt[20];

-        adjustBoundaryWeights(patchBits, sWeights, tWeights);
-        adjustBoundaryWeights(patchBits, dsWeights, dtWeights);
-
-        for (int k = 0; k < 4; ++k) {
-            for (int l = 0; l < 4; ++l) {
-                wP[4*k+l]  = sWeights[l]  * tWeights[k];
-                wDs[4*k+l] = dsWeights[l] * tWeights[k]  * dScale;
-                wDt[4*k+l] = sWeights[l]  * dtWeights[k] * dScale;
-            }
+    int numControlVertices = 0;
+    if (patchType == 3) {
+        float wP4[4], wDs4[4], wDt4[4], wDss4[4], wDst4[4], wDtt4[4];
+        OsdGetBilinearPatchWeights(uv.s, uv.t, dScale, wP4, wDs4, wDt4, wDss4, wDst4, wDtt4);
+        numControlVertices = 4;
+        for (int i=0; i<numControlVertices; ++i) {
+            wP[i] = wP4[i];
+            wDs[i] = wDs4[i];
+            wDt[i] = wDt4[i];
        }
-    } else {
-        // TODO: GREGORY BASIS
+    } else if (patchType == 6) {
+        float wP16[16], wDs16[16], wDt16[16], wDss16[16], wDst16[16], wDtt16[16];
+        OsdGetBSplinePatchWeights(uv.s, uv.t, dScale, boundary, wP16, wDs16, wDt16, wDss16, wDst16, wDtt16);
+        numControlVertices = 16;
+        for (int i=0; i<numControlVertices; ++i) {
+            wP[i] = wP16[i];
+            wDs[i] = wDs16[i];
+            wDt[i] = wDt16[i];
+        }
+    } else if (patchType == 9) {
+        OsdGetGregoryPatchWeights(uv.s, uv.t, dScale, wP, wDs, wDt, wDss, wDst, wDtt);
+        numControlVertices = 20;
    }

    Vertex dst, du, dv;
@ -307,7 +280,9 @@ void main() {
    clear(du);
    clear(dv);

-    int indexBase = array.z + coord.vertIndex;
+    int indexStride = getNumControlVertices(array.x);
+    int indexBase = array.z + indexStride * (patchIndex - array.w);
+
    for (int cv = 0; cv < numControlVertices; ++cv) {
        int index = patchIndexBuffer[indexBase + cv];
        addWithWeight(dst, readVertex(index), wP[cv]);
--- a/opensubdiv/osd/glslXFBKernel.glsl
+++ b/opensubdiv/osd/glslXFBKernel.glsl
@ -153,25 +153,6 @@ uniform ivec4 patchArray[2];
 uniform isamplerBuffer patchParamBuffer;
 uniform isamplerBuffer patchIndexBuffer;

-void getBSplineWeights(float t, inout vec4 point, inout vec4 deriv) {
-    // The four uniform cubic B-Spline basis functions evaluated at t:
-    float one6th = 1.0f / 6.0f;
-
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    point.x = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
-    point.y = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
-    point.z = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
-    point.w = one6th * (                                 t3);
-
-    // Derivatives of the above four basis functions at t:
-    deriv.x = -0.5f*t2 +      t - 0.5f;
-    deriv.y =  1.5f*t2 - 2.0f*t;
-    deriv.z = -1.5f*t2 +      t + 0.5f;
-    deriv.w =  0.5f*t2;
-}
-
 uint getDepth(uint patchBits) {
    return (patchBits & 0xfU);
 }
@ -200,29 +181,14 @@ vec2 normalizePatchCoord(uint patchBits, vec2 uv) {
    return vec2((uv.x - pu) / frac, (uv.y - pv) / frac);
 }

-void adjustBoundaryWeights(uint bits, inout vec4 sWeights, inout vec4 tWeights) {
-    uint boundary = ((bits >> 8) & 0xfU);
+bool isRegular(uint patchBits) {
+    return (((patchBits >> 5) & 0x1u) != 0);
+}

-    if ((boundary & 1U) != 0) {
-        tWeights[2] -= tWeights[0];
-        tWeights[1] += 2*tWeights[0];
-        tWeights[0] = 0;
-    }
-    if ((boundary & 2U) != 0) {
-        sWeights[1] -= sWeights[3];
-        sWeights[2] += 2*sWeights[3];
-        sWeights[3] = 0;
-    }
-    if ((boundary & 4U) != 0) {
-        tWeights[1] -= tWeights[3];
-        tWeights[2] += 2*tWeights[3];
-        tWeights[3] = 0;
-    }
-    if ((boundary & 8U) != 0) {
-        sWeights[2] -= sWeights[0];
-        sWeights[1] += 2*sWeights[0];
-        sWeights[0] = 0;
-    }
+int getNumControlVertices(int patchType) {
+    return (patchType == 3) ? 4 :
+           (patchType == 6) ? 16 :
+           (patchType == 9) ? 20 : 0;
 }

 void main() {
@ -233,32 +199,39 @@ void main() {

    vec2 coord = patchCoords;
    ivec4 array = patchArray[handle.x];
-    int patchType = array.x;
-    int numControlVertices = 16;

    uint patchBits = texelFetch(patchParamBuffer, patchIndex).y;
+    int patchType = isRegular(patchBits) ? 6 : array.x;

    // normalize
    coord = normalizePatchCoord(patchBits, coord);
    float dScale = float(1 << getDepth(patchBits));
+    int boundary = int((patchBits >> 8) & 0xfU);

-    // if regular
-    float wP[20], wDs[20], wDt[20];
-    {
-        vec4 sWeights, tWeights, dsWeights, dtWeights;
-        getBSplineWeights(coord.s, sWeights, dsWeights);
-        getBSplineWeights(coord.t, tWeights, dtWeights);
+    float wP[20], wDs[20], wDt[20], wDss[20], wDst[20], wDtt[20];

-        adjustBoundaryWeights(patchBits, sWeights, tWeights);
-        adjustBoundaryWeights(patchBits, dsWeights, dtWeights);
-
-        for (int k = 0; k < 4; ++k) {
-            for (int l = 0; l < 4; ++l) {
-                wP[4*k+l]  = sWeights[l]  * tWeights[k];
-                wDs[4*k+l] = dsWeights[l] * tWeights[k]  * dScale;
-                wDt[4*k+l] = sWeights[l]  * dtWeights[k] * dScale;
-            }
+    int numControlVertices = 0;
+    if (patchType == 3) {
+        float wP4[4], wDs4[4], wDt4[4], wDss4[4], wDst4[4], wDtt4[4];
+        OsdGetBilinearPatchWeights(coord.s, coord.t, dScale, wP4, wDs4, wDt4, wDss4, wDst4, wDtt4);
+        numControlVertices = 4;
+        for (int i=0; i<numControlVertices; ++i) {
+            wP[i] = wP4[i];
+            wDs[i] = wDs4[i];
+            wDt[i] = wDt4[i];
        }
+    } else if (patchType == 6) {
+        float wP16[16], wDs16[16], wDt16[16], wDss16[16], wDst16[16], wDtt16[16];
+        OsdGetBSplinePatchWeights(coord.s, coord.t, dScale, boundary, wP16, wDs16, wDt16, wDss16, wDst16, wDtt16);
+        numControlVertices = 16;
+        for (int i=0; i<numControlVertices; ++i) {
+            wP[i] = wP16[i];
+            wDs[i] = wDs16[i];
+            wDt[i] = wDt16[i];
+        }
+    } else if (patchType == 9) {
+        OsdGetGregoryPatchWeights(coord.s, coord.t, dScale, wP, wDs, wDt, wDss, wDst, wDtt);
+        numControlVertices = 20;
    }

    Vertex dst, du, dv;
@ -266,7 +239,9 @@ void main() {
    clear(du);
    clear(dv);

-    int indexBase = array.z + handle.z;
+    int indexStride = getNumControlVertices(array.x);
+    int indexBase = array.z + indexStride * (patchIndex - array.w);
+
    for (int cv = 0; cv < numControlVertices; ++cv) {
        int index = texelFetch(patchIndexBuffer, indexBase + cv).x;
        addWithWeight(dst, readVertex(index), wP[cv]);
--- a/opensubdiv/osd/ompEvaluator.cpp
+++ b/opensubdiv/osd/ompEvaluator.cpp
@ -138,12 +138,11 @@ OmpEvaluator::EvalPatches(
        PatchCoord const &coord = patchCoords[i];
        PatchArray const &array = patchArrays[coord.handle.arrayIndex];

-        int patchType = array.GetPatchType();
-        // XXX: patchIndex is absolute. not sure it's consistent.
-        //      (should be offsetted by array.primitiveIdBase?)
-        //    patchParamBuffer[array.primitiveIdBase + coord.handle.patchIndex]
        Far::PatchParam const & param =
            patchParamBuffer[coord.handle.patchIndex];
+        int patchType = param.IsRegular()
+            ? Far::PatchDescriptor::REGULAR
+            : array.GetPatchType();

        int numControlVertices = 0;
        if (patchType == Far::PatchDescriptor::REGULAR) {
@ -161,8 +160,12 @@ OmpEvaluator::EvalPatches(
        } else {
            continue;
        }
-        const int *cvs =
-            &patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+
+        int indexStride = Far::PatchDescriptor(array.GetPatchType()).GetNumControlVertices();
+        int indexBase = array.GetIndexBase() + indexStride *
+                (coord.handle.patchIndex - array.GetPrimitiveIdBase());
+
+        const int *cvs = &patchIndexBuffer[indexBase];

        dstT.Clear();
        for (int j = 0; j < numControlVertices; ++j) {
@ -202,9 +205,11 @@ OmpEvaluator::EvalPatches(
        PatchCoord const &coord = patchCoords[i];
        PatchArray const &array = patchArrays[coord.handle.arrayIndex];

-        int patchType = array.GetPatchType();
        Far::PatchParam const & param =
            patchParamBuffer[coord.handle.patchIndex];
+        int patchType = param.IsRegular()
+            ? Far::PatchDescriptor::REGULAR
+            : array.GetPatchType();

        int numControlVertices = 0;
        if (patchType == Far::PatchDescriptor::REGULAR) {
@ -222,8 +227,12 @@ OmpEvaluator::EvalPatches(
        } else {
            continue;
        }
-        const int *cvs =
-            &patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+
+        int indexStride = Far::PatchDescriptor(array.GetPatchType()).GetNumControlVertices();
+        int indexBase = array.GetIndexBase() + indexStride *
+                (coord.handle.patchIndex - array.GetPrimitiveIdBase());
+
+        const int *cvs = &patchIndexBuffer[indexBase];

        dstT.Clear();
        duT.Clear();
@ -240,6 +249,7 @@ OmpEvaluator::EvalPatches(
    return true;
 }

+
 /* static */
 void
 OmpEvaluator::Synchronize(void * /*deviceContext*/) {
--- a/opensubdiv/osd/ompEvaluator.h
+++ b/opensubdiv/osd/ompEvaluator.h
@ -28,8 +28,8 @@
 #include "../version.h"

 #include <cstddef>
-#include "../osd/types.h"
 #include "../osd/bufferDescriptor.h"
+#include "../osd/types.h"

 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
@ -457,6 +457,111 @@ public:
        const int *patchIndexBuffer,
        PatchParam const *patchParamBuffer);

+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the omp evaluator
+    ///
+    /// @param deviceContext    not used in the omp evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        OmpEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the omp evaluator
+    ///
+    /// @param deviceContext    not used in the omp evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        OmpEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/tbbEvaluator.h
+++ b/opensubdiv/osd/tbbEvaluator.h
@ -454,6 +454,111 @@ public:
        const int *patchIndexBuffer,
        const PatchParam *patchParamBuffer);

+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       Far::PatchTable
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        TbbEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(),
+                           srcDesc,
+                           dstBuffer->BindCpuBuffer(),
+                           dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       Far::PatchTable
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        TbbEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(),
+                           srcDesc,
+                           dstBuffer->BindCpuBuffer(),
+                           dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/tbbKernel.cpp
+++ b/opensubdiv/osd/tbbKernel.cpp
@ -316,9 +316,11 @@ public:
            PatchCoord const &coord = _patchCoords[i];
            PatchArray const &array = _patchArrayBuffer[coord.handle.arrayIndex];

-            int patchType = array.GetPatchType();
            Far::PatchParam const & param =
                _patchParamBuffer[coord.handle.patchIndex];
+            int patchType = param.IsRegular()
+                ? Far::PatchDescriptor::REGULAR
+                : array.GetPatchType();

            int numControlVertices = 0;
            if (patchType == Far::PatchDescriptor::REGULAR) {
@ -337,8 +339,11 @@ public:
                assert(0);
            }

-            const int *cvs =
-                &_patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+            int indexStride = Far::PatchDescriptor(array.GetPatchType()).GetNumControlVertices();
+            int indexBase = array.GetIndexBase() + indexStride *
+                    (coord.handle.patchIndex - array.GetPrimitiveIdBase());
+
+            const int *cvs = &_patchIndexBuffer[indexBase];

            dstT.Clear();
            for (int j = 0; j < numControlVertices; ++j) {
@ -370,9 +375,11 @@ public:
            PatchCoord const &coord = _patchCoords[i];
            PatchArray const &array = _patchArrayBuffer[coord.handle.arrayIndex];

-            int patchType = array.GetPatchType();
            Far::PatchParam const & param =
                _patchParamBuffer[coord.handle.patchIndex];
+            int patchType = param.IsRegular()
+                ? Far::PatchDescriptor::REGULAR
+                : array.GetPatchType();

            int numControlVertices = 0;
            if (patchType == Far::PatchDescriptor::REGULAR) {
@ -391,8 +398,11 @@ public:
                assert(0);
            }

-            const int *cvs =
-                &_patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+            int indexStride = Far::PatchDescriptor(array.GetPatchType()).GetNumControlVertices();
+            int indexBase = array.GetIndexBase() + indexStride *
+                    (coord.handle.patchIndex - array.GetPrimitiveIdBase());
+
+            const int *cvs = &_patchIndexBuffer[indexBase];

            dstT.Clear();
            dstDuT.Clear();
@ -433,6 +443,7 @@ TbbEvalPatches(float const *src, BufferDescriptor const &srcDesc,

 }

+
 }  // end namespace Osd

 }  // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/osd/tbbKernel.h
+++ b/opensubdiv/osd/tbbKernel.h
@ -26,6 +26,8 @@
 #define OPENSUBDIV3_OSD_TBB_KERNEL_H

 #include "../version.h"
+#include "../far/patchDescriptor.h"
+#include "../far/patchParam.h"

 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {