Merge branch 'release/v3_2_0'

2024-11-23 20:20:09 +00:00 · 2017-01-31 13:45:44 -08:00 · 2017-01-31 13:45:44 -08:00 · a00df9344c
commit a00df9344c
parent ce5f7e9320 62e1a62842
128 changed files with 10038 additions and 985 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -143,7 +143,7 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

 set(OSD_COMPILER_FLAGS)

-# Disable spurrious warnings in gcc builds and clang
+# Disable spurious warnings in gcc builds and clang
 if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_CLANGCC OR CMAKE_COMPILER_IS_ICC )

    # Turn on all warnings
@ -154,7 +154,7 @@ if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_CLANGCC OR CMAKE_COMPILER_IS_IC
    endif()

    # HBR uses the offsetof macro on a templated struct, which appears
-    # to spurriously set off this warning in both gccc and Clang
+    # to spuriously set off this warning in both gcc and Clang
    list(APPEND OSD_COMPILER_FLAGS -Wno-invalid-offsetof)

    # HBR uses unions as an optimization for its memory allocation.
@ -164,7 +164,7 @@ if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_CLANGCC OR CMAKE_COMPILER_IS_IC
    list(APPEND OSD_COMPILER_FLAGS -Wno-strict-aliasing)

    # FAR and OSD have templated virtual function implementations that trigger
-    # a lot of hidden virtual function overloads (some of them spurrious).
+    # a lot of hidden virtual function overloads (some of them spurious).
    # Disable those for now in Clang.
    if(CMAKE_COMPILER_IS_CLANGCC)
        list(APPEND OSD_COMPILER_FLAGS -Wno-overloaded-virtual)
@ -244,11 +244,24 @@ elseif(MSVC)
                    #/D_HAS_ITERATOR_DEBUGGING=0
    )

+    option(MSVC_STATIC_CRT "Statically link MSVC CRT" OFF)
+
+    if(MSVC_STATIC_CRT)
+        message(STATUS "Using static MSVC CRT")
+        # http://stackoverflow.com/a/32128977/486990
+        add_compile_options(
+            "$<$<CONFIG:Debug>:/MTd>"
+            "$<$<CONFIG:RelWithDebInfo>:/MT>"
+            "$<$<CONFIG:Release>:/MT>"
+            "$<$<CONFIG:MinSizeRel>:/MT>"
+        )
+    else()
        # Turn off a duplicate LIBCMT linker warning
        set(CMAKE_EXE_LINKER_FLAGS
            "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:libcmt.lib")
        set(CMAKE_SHARED_LINKER_FLAGS
            "${CMAKE_SHARED_LINKER_FLAGS} /NODEFAULTLIB:libcmt.lib")
+    endif()

 endif()

--- a/documentation/CMakeLists.txt
+++ b/documentation/CMakeLists.txt
@ -93,6 +93,7 @@ if (DOCUTILS_FOUND AND PYTHONINTERP_FOUND)
        references.rst
        release_30.rst
        release_31.rst
+        release_32.rst
        release_notes.rst
        release_notes_2x.rst
        sdc_overview.rst
--- a/documentation/images/far_legacy_sharp_corner_patch_false.png
+++ b/documentation/images/far_legacy_sharp_corner_patch_false.png
--- a/documentation/images/far_legacy_sharp_corner_patch_true.png
+++ b/documentation/images/far_legacy_sharp_corner_patch_true.png
--- a/documentation/images/osd_eval_1st_deriv_normal.png
+++ b/documentation/images/osd_eval_1st_deriv_normal.png
--- a/documentation/images/osd_eval_2nd_deriv_curvature.png
+++ b/documentation/images/osd_eval_2nd_deriv_curvature.png
--- a/documentation/nav_template.txt
+++ b/documentation/nav_template.txt
@ -96,6 +96,7 @@
            <p></p>
            <li><a href="release_notes.html">Releases</a>
                <ul>
+                    <li><a href="release_32.html">Release 3.2</a></li>
                    <li><a href="release_31.html">Release 3.1</a></li>
                    <li><a href="release_30.html">Release 3.0</a></li>
                        <ul>
--- a/documentation/release_32.rst
+++ b/documentation/release_32.rst
@ -0,0 +1,127 @@
+..
+     Copyright 2017 Pixar
+
+     Licensed under the Apache License, Version 2.0 (the "Apache License")
+     with the following modification; you may not use this file except in
+     compliance with the Apache License and the following modification to it:
+     Section 6. Trademarks. is deleted and replaced with:
+
+     6. Trademarks. This License does not grant permission to use the trade
+        names, trademarks, service marks, or product names of the Licensor
+        and its affiliates, except as required to comply with Section 4(c) of
+        the License and to reproduce the content of the NOTICE file.
+
+     You may obtain a copy of the Apache License at
+
+         http://www.apache.org/licenses/LICENSE-2.0
+
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the Apache License with the above modification is
+     distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+     KIND, either express or implied. See the Apache License for the specific
+     language governing permissions and limitations under the Apache License.
+
+
+Overview of Release 3.2
+=======================
+
+.. contents::
+   :local:
+   :backlinks: none
+
+New Features
+------------
+
+Face-Varying Stencil Evaluation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Face-Varying primvar values may now be refined using stencil tables.
+
+The stencil table for a face-varying channel is created by specifying the desired fvarChannel and setting
+the Far::StencilTableFactory::Options interpolationMode to INTERPOLATE_FACE_VARYING when creating the stencil table.
+
+1st and 2nd Derivative Evaluation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Osd Evaluator API has been extended to support 1st derivative and 2nd partial derivative evaluation for stencils and patches.
+
++----------------------------------------------------+------------------------------------------------------+
+| .. image:: images/osd_eval_1st_deriv_normal.png    | .. image:: images/osd_eval_2nd_deriv_curvature.png   |
+|    :align:  center                                 |    :align:  center                                   |
+|    :width:  75%                                    |    :width:  75%                                      |
+|    :target: images/osd_eval_1st_deriv_normal.png   |    :target: images/osd_eval_2nd_deriv_curvature.png  |
+|                                                    |                                                      |
+| 1st Derivative Surface Normal                      | 2nd Derivative Surface Curvature                     |
++----------------------------------------------------+------------------------------------------------------+
+
+On the left is an example of computing a surface normal at each point using the evaluated 1st derivatives,
+while on the right is an example of computing surface curvature at each point using the evaluated 2nd partial derivatives.
+
+Smooth Corner Patch
+~~~~~~~~~~~~~~~~~~~
+
+An option has been added to disable the legacy behavior of generating a sharp-corner patch at a smooth corner.
+Corners which are actually sharp will continue to generate sharp-corner patches.
+
+The difference between the two methods is most apparent at low levels of feature isolation.
+
+This feature is controlled by the generateLegacySharpCornerPatches option added to Far::PatchTableFactory::Options.
+
++------------------------------------------------------------+-------------------------------------------------------------+
+| .. image:: images/far_legacy_sharp_corner_patch_true.png   | .. image:: images/far_legacy_sharp_corner_patch_false.png   |
+|    :align:  center                                         |    :align:  center                                          |
+|    :width:  75%                                            |    :width:  75%                                             |
+|    :target: images/far_legacy_sharp_corner_patch_true.png  |    :target: images/far_legacy_sharp_corner_patch_false.png  |
+|                                                            |                                                             |
+| Sharp Corner Patch (legacy behavior)                       | Smooth Corner Patch                                         |
++------------------------------------------------------------+-------------------------------------------------------------+
+
+On the left is the legacy behavior of generating sharp corner patches at smooth corners.
+The image on the right shows the correct smooth corner patches generated when this legacy behavior is disabled.
+
+API Additions
+-------------
+
+See associated `Doxygen <doxy_html/index.html>`__ for full details.
+
+Osd::CpuEvaluator, GLComputeEvaluator, etc
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    - Create()
+    - EvalStencils()
+    - EvalPatches()
+    - EvalPatchesVarying()
+    - EvalPatchesFaceVarying()
+
+Osd::Mesh
+~~~~~~~~~
+    - Create()
+
+Osd::MeshBits
+~~~~~~~~~~~~~
+    - member MeshUseSmoothCornerPatch
+
+Far::PatchTableFactory::Options
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    - member generateLegacySharpCornerPatches
+
+Far::StencilTableFactory
+~~~~~~~~~~~~~~~~~~~~~~~~
+    - enumeration Mode::INTERPOLATE_FACE_VARYING
+    - AppendLocalPointStencilTableFaceVarying()
+
+Far::StencilTableFactory::Options
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    - member fvarChannel
+
+Other Changes
+-------------
+
+Improvements
+~~~~~~~~~~~~
+    - Corrected numerous spelling errors in doxygen comments
+    - Updated glFVarViewer with improved error detection and command line parsing
+    - Added option to build using MSVC with static CRT
+
+Bug Fixes
+~~~~~~~~~
+    - Fixed a double delete of GL program in Osd::GLComputeEvaluator
--- a/documentation/release_notes.rst
+++ b/documentation/release_notes.rst
@ -22,7 +22,7 @@
     language governing permissions and limitations under the Apache License.


-3.0 - 3.1 Release Notes
+3.0 - 3.2 Release Notes
 -----------------------

 .. contents::
@ -31,6 +31,24 @@

 ----

+Release 3.2.0
+=============
+
+Release 3.2.0 is a minor release containing API additions and bug fixes.
+
+**New Features**
+    - Extended Far::StencilTableFactory to support face-varying interpolation
+    - Extended Osd Evaluator classes to support evaluation of 1st and 2nd derivatives
+    - Added an option to disable generation of legacy sharp corner patches
+
+**Changes**
+    - Corrected numerous spelling errors in doxygen comments
+    - Updated glFVarViewer with improved error detection and command line parsing
+    - Added option to build using MSVC with static CRT
+
+**Bug Fixes**
+    - Fixed a double delete of GL program in Osd::GLComputeEvaluator
+
 Release 3.1.1
 =============

--- a/examples/common/clDeviceContext.cpp
+++ b/examples/common/clDeviceContext.cpp
@ -124,8 +124,8 @@ findExtensionSupportedDevice(cl_device_id *clDevices,
            std::string extString(extensions);
            delete[] extensions;

-            // parse string. This is bit deficient since the extentions
-            // is space separated.
+            // parse string. This is a bit deficient since the extensions
+            // string is space separated.
            //
            // The actual string would be "cl_khr_d3d11_sharing"
            //                         or "cl_nv_d3d11_sharing"
--- a/examples/common/cudaDeviceContext.h
+++ b/examples/common/cudaDeviceContext.h
@ -32,10 +32,10 @@ public:
    CudaDeviceContext();
    ~CudaDeviceContext();

-    /// Initialze cuda device from the current GL context
+    /// Initialize cuda device from the current GL context
    bool Initialize();

-    /// Initialze cuda device from the ID3D11Device
+    /// Initialize cuda device from the ID3D11Device
    bool Initialize(ID3D11Device *device);

    /// Returns true if the cuda device has already been initialized
--- a/examples/common/d3d11PtexMipmapTexture.h
+++ b/examples/common/d3d11PtexMipmapTexture.h
@ -63,7 +63,7 @@ public:
 private:
    D3D11PtexMipmapTexture();

-    int _width,   // widht / height / depth of the 3D texel buffer
+    int _width,   // width / height / depth of the 3D texel buffer
        _height,
        _depth;

--- a/examples/common/glPtexMipmapTexture.h
+++ b/examples/common/glPtexMipmapTexture.h
@ -47,7 +47,7 @@ public:
    /// Returns the texels texture array.
    GLuint GetTexelsTexture() const { return _texels; }

-    /// Returns the amount of allocated memory (in byte)
+    /// Returns the amount of allocated memory (in bytes)
    size_t GetMemoryUsage() const { return _memoryUsage; }

    ~GLPtexMipmapTexture();
@ -55,7 +55,7 @@ public:
 private:
    GLPtexMipmapTexture();

-    GLsizei _width,   // widht / height / depth of the 3D texel buffer
+    GLsizei _width,   // width / height / depth of the 3D texel buffer
            _height,
            _depth;

--- a/examples/common/glUtils.cpp
+++ b/examples/common/glUtils.cpp
@ -260,9 +260,8 @@ GetShaderVersion(){
    return shader_version;
 }

-/* Generates the version defintion needed by the glsl shaders based on the 
- * opengl string
-*/
+// Generates the version definition needed by the glsl shaders based on the
+// opengl string
 std::string GetShaderVersionInclude(){
    return "#version " + GetShaderVersion() + "\n";
 }
@ -295,4 +294,4 @@ bool GL_ARBComputeShaderOrGL_VERSION_4_3() {

 #undef IS_SUPPORTED

-}   // namesapce GLUtils
+}   // namespace GLUtils
--- a/examples/common/ptexMipmapTextureLoader.cpp
+++ b/examples/common/ptexMipmapTextureLoader.cpp
@ -110,7 +110,7 @@ PtexMipmapTextureLoader::Block::guttering(PtexMipmapTextureLoader *loader,
                        (everything else, including boundary)
                    Since guttering pixels are placed on the border of each
                    ptex faces, it's not possible to store more than 4 pixels
-                    at a coner for a reasonable interpolation.
+                    at a corner for a reasonable interpolation.
                    In this case, we need to average all corner pixels and
                    overwrite with an averaged value, so that every face
                    vertex picks the same value.
@ -452,7 +452,7 @@ PtexMipmapTextureLoader::PtexMipmapTextureLoader(PtexTexture *ptex,
    _pageWidth(0), _pageHeight(0),
    _texelBuffer(NULL), _layoutBuffer(NULL), _memoryUsage(0)
 {
-    // byte per pixel
+    // bytes per pixel
    _bpp = ptex->numChannels() * Ptex::DataSize(ptex->dataType());

    int numFaces = ptex->numFaces();
--- a/examples/common/stb_image_write.h
+++ b/examples/common/stb_image_write.h
@ -19,7 +19,7 @@ ABOUT:

   The PNG output is not optimal; it is 20-50% larger than the file
   written by a decent optimizing implementation. This library is designed
-   for source code compactness and simplicitly, not optimal image file size
+   for source code compactness and simplicity, not optimal image file size
   or run-time performance.

 USAGE:
--- a/examples/dxPtexViewer/dxPtexViewer.cpp
+++ b/examples/dxPtexViewer/dxPtexViewer.cpp
@ -1937,7 +1937,7 @@ WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPTSTR lpCmdLine, int nCmd
    wcex.lpszClassName  = szWindowClass;
    RegisterClass(&wcex);

-    // crete window
+    // create window
    RECT rect = { 0, 0, g_width, g_height };
    AdjustWindowRect(&rect, WS_OVERLAPPEDWINDOW, FALSE);

--- a/examples/dxViewer/dxviewer.cpp
+++ b/examples/dxViewer/dxviewer.cpp
@ -1544,7 +1544,7 @@ WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPTSTR lpCmdLine, int nCmd
    wcex.lpszClassName  = szWindowClass;
    RegisterClass(&wcex);

-    // crete window
+    // create window
    RECT rect = { 0, 0, g_width, g_height };
    AdjustWindowRect(&rect, WS_OVERLAPPEDWINDOW, FALSE);

--- a/examples/farViewer/farViewer.cpp
+++ b/examples/farViewer/farViewer.cpp
@ -1387,7 +1387,7 @@ int main(int argc, char ** argv)
    }
    glfwMakeContextCurrent(g_window);

-    // accommocate high DPI displays (e.g. mac retina displays)
+    // accommodate high DPI displays (e.g. mac retina displays)
    glfwGetFramebufferSize(g_window, &g_width, &g_height);
    glfwSetFramebufferSizeCallback(g_window, reshape);

@ -1408,7 +1408,7 @@ int main(int argc, char ** argv)
        exit(1);
    }
 #ifdef CORE_PROFILE
-    // clear GL errors which was generated during glewInit()
+    // clear GL errors which were generated during glewInit()
    glGetError();
 #endif
 #endif
--- a/examples/glEvalLimit/glEvalLimit.cpp
+++ b/examples/glEvalLimit/glEvalLimit.cpp
@ -121,7 +121,8 @@ enum DrawMode { kUV,
                kVARYING,
                kNORMAL,
                kSHADE,
-                kFACEVARYING };
+                kFACEVARYING,
+                kMEAN_CURVATURE };

 std::vector<float> g_orgPositions,
                   g_positions,
@ -192,8 +193,11 @@ struct Program {
    GLuint uniformDrawMode;
    GLuint attrPosition;
    GLuint attrColor;
-    GLuint attrTangentU;
-    GLuint attrTangentV;
+    GLuint attrDu;
+    GLuint attrDv;
+    GLuint attrDuu;
+    GLuint attrDuv;
+    GLuint attrDvv;
    GLuint attrPatchCoord;
    GLuint attrFVarData;
 } g_defaultProgram;
@ -221,15 +225,18 @@ public:
    virtual ~EvalOutputBase() {}
    virtual GLuint BindSourceData() const = 0;
    virtual GLuint BindVertexData() const = 0;
-    virtual GLuint BindDerivatives() const = 0;
+    virtual GLuint Bind1stDerivatives() const = 0;
+    virtual GLuint Bind2ndDerivatives() const = 0;
    virtual GLuint BindFaceVaryingData() const = 0;
    virtual GLuint BindPatchCoords() const = 0;
    virtual void UpdateData(const float *src, int startVertex, int numVertices) = 0;
    virtual void UpdateVaryingData(const float *src, int startVertex, int numVertices) = 0;
    virtual void UpdateFaceVaryingData(const float *src, int startVertex, int numVertices) = 0;
+    virtual bool HasFaceVaryingData() const = 0;
    virtual void Refine() = 0;
    virtual void EvalPatches() = 0;
-    virtual void EvalPatchesWithDerivatives() = 0;
+    virtual void EvalPatchesWith1stDerivatives() = 0;
+    virtual void EvalPatchesWith2ndDerivatives() = 0;
    virtual void EvalPatchesVarying() = 0;
    virtual void EvalPatchesFaceVarying() = 0;
    virtual void UpdatePatchCoords(
@ -251,37 +258,58 @@ public:
    EvalOutput(Far::StencilTable const *vertexStencils,
               Far::StencilTable const *varyingStencils,
               Far::StencilTable const *faceVaryingStencils,
-               int numCoarseVerts, int numTotalVerts,
-               int numCoarseFVarVerts, int numTotalFVarVerts,
+               int fvarChannel, int fvarWidth,
               int numParticles, Far::PatchTable const *patchTable,
               EvaluatorCache *evaluatorCache = NULL,
               DEVICE_CONTEXT *deviceContext = NULL)
        : _srcDesc(       /*offset*/ 0, /*length*/ 3, /*stride*/ 3),
          _srcVaryingDesc(/*offset*/ 0, /*length*/ 3, /*stride*/ 3),
-          _srcFVarDesc(   /*offset*/ 0, /*length*/ 2, /*stride*/ 2),
-          _vertexDesc(    /*offset*/ 0, /*legnth*/ 3, /*stride*/ 6),
-          _varyingDesc(   /*offset*/ 3, /*legnth*/ 3, /*stride*/ 6),
-          _fvarDesc(      /*offset*/ 0, /*legnth*/ 2, /*stride*/ 2),
-          _duDesc(        /*offset*/ 0, /*legnth*/ 3, /*stride*/ 6),
-          _dvDesc(        /*offset*/ 3, /*legnth*/ 3, /*stride*/ 6),
+          _srcFVarDesc(   /*offset*/ 0, /*length*/ fvarWidth, /*stride*/ fvarWidth),
+          _vertexDesc(    /*offset*/ 0, /*length*/ 3, /*stride*/ 6),
+          _varyingDesc(   /*offset*/ 3, /*length*/ 3, /*stride*/ 6),
+          _fvarDesc(      /*offset*/ 0, /*length*/ fvarWidth, /*stride*/ fvarWidth),
+          _duDesc(        /*offset*/ 0, /*length*/ 3, /*stride*/ 6),
+          _dvDesc(        /*offset*/ 3, /*length*/ 3, /*stride*/ 6),
+          _duuDesc(       /*offset*/ 0, /*length*/ 3, /*stride*/ 9),
+          _duvDesc(       /*offset*/ 3, /*length*/ 3, /*stride*/ 9),
+          _dvvDesc(       /*offset*/ 6, /*length*/ 3, /*stride*/ 9),
          _deviceContext(deviceContext) {
+
+        // total number of vertices = coarse points + refined points + local points
+        int numTotalVerts = vertexStencils->GetNumControlVertices()
+                          + vertexStencils->GetNumStencils();
+
        _srcData = SRC_VERTEX_BUFFER::Create(3, numTotalVerts, _deviceContext);
        _srcVaryingData = SRC_VERTEX_BUFFER::Create(3, numTotalVerts, _deviceContext);
-        _srcFVarData = EVAL_VERTEX_BUFFER::Create(2, numTotalFVarVerts, _deviceContext);
        _vertexData = EVAL_VERTEX_BUFFER::Create(6, numParticles, _deviceContext);
-        _derivatives = EVAL_VERTEX_BUFFER::Create(6, numParticles, _deviceContext);
-        _fvarData = EVAL_VERTEX_BUFFER::Create(2, numParticles, _deviceContext);
+        _deriv1 = EVAL_VERTEX_BUFFER::Create(6, numParticles, _deviceContext);
+        _deriv2 = EVAL_VERTEX_BUFFER::Create(9, numParticles, _deviceContext);
        _patchTable = PATCH_TABLE::Create(patchTable, _deviceContext);
        _patchCoords = NULL;
-        _numCoarseVerts = numCoarseVerts;
-        _numCoarseFVarVerts = numCoarseFVarVerts;
+        _numCoarseVerts = vertexStencils->GetNumControlVertices();
        _vertexStencils =
            Osd::convertToCompatibleStencilTable<STENCIL_TABLE>(vertexStencils, _deviceContext);
        _varyingStencils =
            Osd::convertToCompatibleStencilTable<STENCIL_TABLE>(varyingStencils, _deviceContext);
-        _faceVaryingStencils = (faceVaryingStencils)
-            ? Osd::convertToCompatibleStencilTable<STENCIL_TABLE>(faceVaryingStencils, _deviceContext)
-            : NULL;
+
+        if (faceVaryingStencils) {
+            _numCoarseFVarVerts = faceVaryingStencils->GetNumControlVertices();
+            int numTotalFVarVerts = faceVaryingStencils->GetNumControlVertices()
+                                  + faceVaryingStencils->GetNumStencils();
+            _srcFVarData = EVAL_VERTEX_BUFFER::Create(fvarWidth, numTotalFVarVerts, _deviceContext);  // width must match _srcFVarDesc
+            _fvarData = EVAL_VERTEX_BUFFER::Create(fvarWidth, numParticles, _deviceContext);
+            _faceVaryingStencils =
+                Osd::convertToCompatibleStencilTable<STENCIL_TABLE>(faceVaryingStencils, _deviceContext);
+            _fvarChannel = fvarChannel;
+            _fvarWidth = fvarWidth;
+        } else {
+            _numCoarseFVarVerts = 0;
+            _srcFVarData = NULL;
+            _fvarData = NULL;
+            _faceVaryingStencils = NULL;
+            _fvarChannel = 0;
+            _fvarWidth = 0;
+        }
        _evaluatorCache = evaluatorCache;
    }
    ~EvalOutput() {
@ -289,7 +317,8 @@ public:
        delete _srcVaryingData;
        delete _srcFVarData;
        delete _vertexData;
-        delete _derivatives;
+        delete _deriv1;
+        delete _deriv2;
        delete _fvarData;
        delete _patchTable;
        delete _patchCoords;
@ -303,8 +332,11 @@ public:
    virtual GLuint BindVertexData() const {
        return _vertexData->BindVBO();
    }
-    virtual GLuint BindDerivatives() const {
-        return _derivatives->BindVBO();
+    virtual GLuint Bind1stDerivatives() const {
+        return _deriv1->BindVBO();
+    }
+    virtual GLuint Bind2ndDerivatives() const {
+        return _deriv2->BindVBO();
    }
    virtual GLuint BindFaceVaryingData() const {
        return _fvarData->BindVBO();
@ -321,6 +353,9 @@ public:
    virtual void UpdateFaceVaryingData(const float *src, int startVertex, int numVertices) {
        _srcFVarData->UpdateData(src, startVertex, numVertices, _deviceContext);
    }
+    virtual bool HasFaceVaryingData() const {
+        return _faceVaryingStencils != NULL;
+    }
    virtual void Refine() {
        Osd::BufferDescriptor dstDesc = _srcDesc;
        dstDesc.offset += _numCoarseVerts * _srcDesc.stride;
@ -345,10 +380,10 @@ public:
                                evalInstance,
                                _deviceContext);

-        if (_faceVaryingStencils) {
-            int const fvarWidth = 2;
-            Osd::BufferDescriptor dstFVarDesc(_numCoarseFVarVerts*fvarWidth,
-                                              fvarWidth, fvarWidth);
+        if (HasFaceVaryingData()) {
+            Osd::BufferDescriptor dstFVarDesc = _srcFVarDesc;
+            dstFVarDesc.offset += _numCoarseFVarVerts * _srcFVarDesc.stride;
+
            evalInstance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>(
                _evaluatorCache, _srcFVarDesc, dstFVarDesc, _deviceContext);

@ -371,14 +406,31 @@ public:
            _patchCoords,
            _patchTable, evalInstance, _deviceContext);
    }
-    virtual void EvalPatchesWithDerivatives() {
+    virtual void EvalPatchesWith1stDerivatives() {
        EVALUATOR const *evalInstance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>(
            _evaluatorCache, _srcDesc, _vertexDesc, _duDesc, _dvDesc, _deviceContext);
        EVALUATOR::EvalPatches(
            _srcData, _srcDesc,
            _vertexData, _vertexDesc,
-            _derivatives, _duDesc,
-            _derivatives, _dvDesc,
+            _deriv1, _duDesc,
+            _deriv1, _dvDesc,
+            _patchCoords->GetNumVertices(),
+            _patchCoords,
+            _patchTable, evalInstance, _deviceContext);
+    }
+    virtual void EvalPatchesWith2ndDerivatives() {
+        EVALUATOR const *evalInstance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>(
+            _evaluatorCache, _srcDesc, _vertexDesc,
+            _duDesc, _dvDesc, _duuDesc, _duvDesc, _dvvDesc,
+            _deviceContext);
+        EVALUATOR::EvalPatches(
+            _srcData, _srcDesc,
+            _vertexData, _vertexDesc,
+            _deriv1, _duDesc,
+            _deriv1, _dvDesc,
+            _deriv2, _duuDesc,
+            _deriv2, _duvDesc,
+            _deriv2, _dvvDesc,
            _patchCoords->GetNumVertices(),
            _patchCoords,
            _patchTable, evalInstance, _deviceContext);
@ -389,7 +441,7 @@ public:

        EVALUATOR::EvalPatchesVarying(
            _srcVaryingData, _srcVaryingDesc,
-            // varyingdata is interleved in vertexData.
+            // varying data is interleaved in vertexData.
            _vertexData, _varyingDesc,
            _patchCoords->GetNumVertices(),
            _patchCoords,
@ -404,7 +456,7 @@ public:
            _fvarData, _fvarDesc,
            _patchCoords->GetNumVertices(),
            _patchCoords,
-            _patchTable, /*fvarChannel=*/0, evalInstance, _deviceContext);
+            _patchTable, _fvarChannel, evalInstance, _deviceContext);
    }
    virtual void UpdatePatchCoords(
        std::vector<Osd::PatchCoord> const &patchCoords) {
@ -425,7 +477,8 @@ private:
    SRC_VERTEX_BUFFER *_srcVaryingData;
    EVAL_VERTEX_BUFFER *_srcFVarData;
    EVAL_VERTEX_BUFFER *_vertexData;
-    EVAL_VERTEX_BUFFER *_derivatives;
+    EVAL_VERTEX_BUFFER *_deriv1;
+    EVAL_VERTEX_BUFFER *_deriv2;
    EVAL_VERTEX_BUFFER *_fvarData;
    EVAL_VERTEX_BUFFER *_patchCoords;
    PATCH_TABLE *_patchTable;
@ -437,6 +490,9 @@ private:
    Osd::BufferDescriptor _fvarDesc;
    Osd::BufferDescriptor _duDesc;
    Osd::BufferDescriptor _dvDesc;
+    Osd::BufferDescriptor _duuDesc;
+    Osd::BufferDescriptor _duvDesc;
+    Osd::BufferDescriptor _dvvDesc;
    int _numCoarseVerts;
    int _numCoarseFVarVerts;

@ -444,10 +500,22 @@ private:
    STENCIL_TABLE const *_varyingStencils;
    STENCIL_TABLE const *_faceVaryingStencils;

+    int _fvarChannel;
+    int _fvarWidth;
+
    EvaluatorCache *_evaluatorCache;
    DEVICE_CONTEXT *_deviceContext;
 };

+// This example uses one shared interleaved buffer for evaluated
+// 1st derivatives and a second shared interleaved buffer for
+// evaluated 2nd derivatives. We use this specialized device
+// context to allow the XFB evaluator to take advantage of this
+// and make more efficient use of available XFB buffer bindings.
+struct XFBDeviceContext {
+    bool AreInterleavedDerivativeBuffers() const { return true; }
+} g_xfbDeviceContext;
+
 EvalOutputBase *g_evalOutput = NULL;
 STParticles * g_particles=0;

@ -508,9 +576,12 @@ updateGeom() {
    g_evalOutput->UpdatePatchCoords(patchCoords);

    // Evaluate the positions of the samples on the limit surface
-    if (g_drawMode == kNORMAL || g_drawMode == kSHADE) {
-        // evaluate positions and derivatives
-        g_evalOutput->EvalPatchesWithDerivatives();
+    if (g_drawMode == kMEAN_CURVATURE) {
+        // evaluate positions and 2nd derivatives
+        g_evalOutput->EvalPatchesWith2ndDerivatives();
+    } else if (g_drawMode == kNORMAL || g_drawMode == kSHADE) {
+        // evaluate positions and 1st derivatives
+        g_evalOutput->EvalPatchesWith1stDerivatives();
    } else {
        // evaluate positions
        g_evalOutput->EvalPatches();
@ -519,7 +590,7 @@ updateGeom() {
    // color
    if (g_drawMode == kVARYING) {
        g_evalOutput->EvalPatchesVarying();
-    } else if (g_drawMode == kFACEVARYING) {
+    } else if (g_drawMode == kFACEVARYING && g_evalOutput->HasFaceVaryingData()) {
        g_evalOutput->EvalPatchesFaceVarying();
    }

@ -579,8 +650,10 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
    Far::StencilTable const * vertexStencils = NULL;
    Far::StencilTable const * varyingStencils = NULL;
    Far::StencilTable const * faceVaryingStencils = NULL;
-    std::vector<float> fvarData;
-    int nverts=0, nTotalfvarVerts=0;
+
+    int fvarChannel = 0;
+    int fvarWidth = shape->GetFVarWidth();
+    bool hasFVarData = !shape->uvs.empty();

    {
        bool adaptive = (sdctype == OpenSubdiv::Sdc::SCHEME_CATMARK);
@ -590,7 +663,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
            // Apply feature adaptive refinement to the mesh so that we can use the
            // limit evaluation API features.
            Far::TopologyRefiner::AdaptiveOptions options(level);
-            options.considerFVarChannels = true;
+            options.considerFVarChannels = hasFVarData;
            options.useInfSharpPatch = doInfSharpPatch;
            topologyRefiner->RefineAdaptive(options);
        } else {
@ -609,10 +682,16 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
            Far::StencilTableFactory::Create(*topologyRefiner, soptions);

        soptions.interpolationMode = Far::StencilTableFactory::INTERPOLATE_VARYING;
-
        varyingStencils =
            Far::StencilTableFactory::Create(*topologyRefiner, soptions);

+        if (hasFVarData) {
+            soptions.interpolationMode = Far::StencilTableFactory::INTERPOLATE_FACE_VARYING;
+            soptions.fvarChannel = fvarChannel;
+            faceVaryingStencils =
+                Far::StencilTableFactory::Create(*topologyRefiner, soptions);
+        }
+
        // Generate bi-cubic patch table for the limit surface
        Far::PatchTableFactory::Options poptions(level);
        if (g_endCap == kEndCapBSplineBasis) {
@ -623,7 +702,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
                Far::PatchTableFactory::Options::ENDCAP_GREGORY_BASIS);
        }
        poptions.useInfSharpPatch = doInfSharpPatch;
-        poptions.generateFVarTables = true;
+        poptions.generateFVarTables = hasFVarData;
        poptions.generateFVarLegacyLinearPatches = false;

        Far::PatchTable const * patchTable =
@ -649,28 +728,18 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
        }
        if (Far::StencilTable const *localPointFaceVaryingStencilTable =
            patchTable->GetLocalPointFaceVaryingStencilTable()) {
-            faceVaryingStencils = localPointFaceVaryingStencilTable;
+            Far::StencilTable const *table =
+                Far::StencilTableFactory::AppendLocalPointStencilTableFaceVarying(
+                    *topologyRefiner,
+                    faceVaryingStencils, localPointFaceVaryingStencilTable);
+            delete faceVaryingStencils;
+            faceVaryingStencils = table;
        }

-        // total number of vertices = coarse verts + refined verts + gregory basis verts
-        nverts = vertexStencils->GetNumControlVertices() +
-            vertexStencils->GetNumStencils();
-
-        nTotalfvarVerts = topologyRefiner->GetNumFVarValuesTotal(0) +
-            patchTable->GetNumLocalPointsFaceVarying(0);
-
-        InterpolateFVarData(*topologyRefiner, *shape, fvarData);
-
        if (g_patchTable) delete g_patchTable;
        g_patchTable = patchTable;
    }

-    delete shape;
-
-    // note that for patch eval we need coarse+refined combined buffer.
-    int nCoarseVertices = topologyRefiner->GetLevel(0).GetNumVertices();
-    int nCoarseFVarVertices = (int)fvarData.size()/2;
-
    // In following template instantiations, same type of vertex buffers are
    // used for both source and destination (first and second template
    // parameters), since we'd like to draw control mesh wireframe too in
@ -686,8 +755,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
                                      Osd::CpuPatchTable,
                                      Osd::CpuEvaluator>
            (vertexStencils, varyingStencils, faceVaryingStencils,
-             nCoarseVertices, nverts,
-             nCoarseFVarVertices, nTotalfvarVerts,
+             fvarChannel, fvarWidth,
             g_nParticles, g_patchTable);
 #ifdef OPENSUBDIV_HAS_OPENMP
    } else if (g_kernel == kOPENMP) {
@ -697,8 +765,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
                                      Osd::CpuPatchTable,
                                      Osd::OmpEvaluator>
            (vertexStencils, varyingStencils, faceVaryingStencils,
-            nCoarseVertices, nverts,
-            nCoarseFVarVertices, nTotalfvarVerts,
+            fvarChannel, fvarWidth,
            g_nParticles, g_patchTable);
 #endif
 #ifdef OPENSUBDIV_HAS_TBB
@ -709,8 +776,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
                                      Osd::CpuPatchTable,
                                      Osd::TbbEvaluator>
            (vertexStencils, varyingStencils, faceVaryingStencils,
-            nCoarseVertices, nverts,
-            nCoarseFVarVertices, nTotalfvarVerts,
+            fvarChannel, fvarWidth,
            g_nParticles, g_patchTable);
 #endif
 #ifdef OPENSUBDIV_HAS_CUDA
@ -721,8 +787,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
                                      Osd::CudaPatchTable,
                                      Osd::CudaEvaluator>
            (vertexStencils, varyingStencils, faceVaryingStencils,
-            nCoarseVertices, nverts,
-            nCoarseFVarVertices, nTotalfvarVerts,
+            fvarChannel, fvarWidth,
            g_nParticles, g_patchTable);
 #endif
 #ifdef OPENSUBDIV_HAS_OPENCL
@ -735,8 +800,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
                                      Osd::CLEvaluator,
                                      CLDeviceContext>
            (vertexStencils, varyingStencils, faceVaryingStencils,
-            nCoarseVertices, nverts,
-            nCoarseFVarVertices, nTotalfvarVerts,
+            fvarChannel, fvarWidth,
            g_nParticles, g_patchTable,
            &clEvaluatorCache, &g_clDeviceContext);
 #endif
@ -747,12 +811,12 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
                                      Osd::GLVertexBuffer,
                                      Osd::GLStencilTableTBO,
                                      Osd::GLPatchTable,
-                                      Osd::GLXFBEvaluator>
+                                      Osd::GLXFBEvaluator,
+                                      XFBDeviceContext>
            (vertexStencils, varyingStencils, faceVaryingStencils,
-            nCoarseVertices, nverts,
-            nCoarseFVarVertices, nTotalfvarVerts,
+            fvarChannel, fvarWidth,
            g_nParticles, g_patchTable,
-             &glXFBEvaluatorCache);
+             &glXFBEvaluatorCache, &g_xfbDeviceContext);
 #endif
 #ifdef OPENSUBDIV_HAS_GLSL_COMPUTE
    } else if (g_kernel == kGLCompute) {
@ -763,14 +827,18 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
                                      Osd::GLPatchTable,
                                      Osd::GLComputeEvaluator>
            (vertexStencils, varyingStencils, faceVaryingStencils,
-            nCoarseVertices, nverts,
-            nCoarseFVarVertices, nTotalfvarVerts,
+            fvarChannel, fvarWidth,
            g_nParticles, g_patchTable,
             &glComputeEvaluatorCache);
 #endif
    }

-    g_evalOutput->UpdateFaceVaryingData(&fvarData[0], 0, (int)fvarData.size()/2);
+    if (g_evalOutput->HasFaceVaryingData()) {
+        g_evalOutput->UpdateFaceVaryingData(
+            &shape->uvs[0], 0, (int)shape->uvs.size()/shape->GetFVarWidth());
+    }
+
+    delete shape;

    // Create the 'uv particles' manager - this class manages the limit
    // location samples (ptex face index, (s,t)) and updates them between frames.
@ -803,8 +871,11 @@ linkDefaultProgram() {
        GLSL_VERSION_DEFINE
        "in vec3 position;\n"
        "in vec3 color;\n"
-        "in vec3 tangentU;\n"
-        "in vec3 tangentV;\n"
+        "in vec3 du;\n"
+        "in vec3 dv;\n"
+        "in vec3 duu;\n"
+        "in vec3 duv;\n"
+        "in vec3 dvv;\n"
        "in vec2 patchCoord;\n"
        "in vec2 fvarData;\n"
        "out vec4 fragColor;\n"
@ -813,7 +884,7 @@ linkDefaultProgram() {
        "uniform int DrawMode;\n"
        "void main() {\n"
        "  vec3 normal = (ModelViewMatrix * "
-        "               vec4(normalize(cross(tangentU, tangentV)), 0)).xyz;\n"
+        "               vec4(normalize(cross(du, dv)), 0)).xyz;\n"
        "  gl_Position = ProjectionMatrix * ModelViewMatrix * "
        "                  vec4(position, 1);\n"
        "  if (DrawMode == 0) {\n" // UV
@ -826,6 +897,16 @@ linkDefaultProgram() {
        "    // generating a checkerboard pattern\n"
        "    int checker = int(floor(20*fvarData.r)+floor(20*fvarData.g))&1;\n"
        "    fragColor = vec4(fvarData.rg*checker, 1-checker, 1);\n"
+        "  } else if (DrawMode == 5) {\n"  // mean curvature
+        // E,F,G / e,f,g are the coefficients of the first / second fundamental
+        // forms. Mean curvature is (E*g - 2*F*f + G*e) / (2*(E*G - F*F)); the
+        // G*e term must be added, otherwise umbilic points (e.g. a sphere)
+        // evaluate to zero.
+        // NOTE(review): the outer 0.5 is kept from the original and looks like
+        // a display scale -- confirm.
+        "    vec3 N = normalize(cross(du, dv));\n"
+        "    float E = dot(du, du);\n"
+        "    float F = dot(du, dv);\n"
+        "    float G = dot(dv, dv);\n"
+        "    float e = dot(N, duu);\n"
+        "    float f = dot(N, duv);\n"
+        "    float g = dot(N, dvv);\n"
+        "    float H = 0.5 * abs(0.5 * (E*g - 2*F*f + G*e) / (E*G - F*F));\n"
+        "    fragColor = vec4(H, H, H, 1.0);\n"
        "  } else {\n" // varying
        "    fragColor = vec4(color, 1);\n"
        "  }\n"
@ -848,10 +929,13 @@ linkDefaultProgram() {

    glBindAttribLocation(program, 0, "position");
    glBindAttribLocation(program, 1, "color");
-    glBindAttribLocation(program, 2, "tangentU");
-    glBindAttribLocation(program, 3, "tangentV");
-    glBindAttribLocation(program, 4, "patchCoord");
-    glBindAttribLocation(program, 5, "fvarData");
+    glBindAttribLocation(program, 2, "du");
+    glBindAttribLocation(program, 3, "dv");
+    glBindAttribLocation(program, 4, "duu");
+    glBindAttribLocation(program, 5, "duv");
+    glBindAttribLocation(program, 6, "dvv");
+    glBindAttribLocation(program, 7, "patchCoord");
+    glBindAttribLocation(program, 8, "fvarData");
    glBindFragDataLocation(program, 0, "color");

    glLinkProgram(program);
@ -877,8 +961,11 @@ linkDefaultProgram() {
        glGetUniformLocation(program, "DrawMode");
    g_defaultProgram.attrPosition = glGetAttribLocation(program, "position");
    g_defaultProgram.attrColor = glGetAttribLocation(program, "color");
-    g_defaultProgram.attrTangentU = glGetAttribLocation(program, "tangentU");
-    g_defaultProgram.attrTangentV = glGetAttribLocation(program, "tangentV");
+    g_defaultProgram.attrDu = glGetAttribLocation(program, "du");
+    g_defaultProgram.attrDv = glGetAttribLocation(program, "dv");
+    g_defaultProgram.attrDuu = glGetAttribLocation(program, "duu");
+    g_defaultProgram.attrDuv = glGetAttribLocation(program, "duv");
+    g_defaultProgram.attrDvv = glGetAttribLocation(program, "dvv");
    g_defaultProgram.attrPatchCoord = glGetAttribLocation(program, "patchCoord");
    g_defaultProgram.attrFVarData = glGetAttribLocation(program, "fvarData");

@ -900,31 +987,33 @@ drawSamples() {

    glEnableVertexAttribArray(g_defaultProgram.attrPosition);
    glEnableVertexAttribArray(g_defaultProgram.attrColor);
-    glEnableVertexAttribArray(g_defaultProgram.attrTangentU);
-    glEnableVertexAttribArray(g_defaultProgram.attrTangentV);
-    glEnableVertexAttribArray(g_defaultProgram.attrPatchCoord);
-    glEnableVertexAttribArray(g_defaultProgram.attrFVarData);
-
    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->BindVertexData());
    glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, 0);
    glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, (float*)12);

-    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->BindDerivatives());
+    glEnableVertexAttribArray(g_defaultProgram.attrDu);
+    glEnableVertexAttribArray(g_defaultProgram.attrDv);
+    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->Bind1stDerivatives());
    glVertexAttribPointer(2, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, 0);
    glVertexAttribPointer(3, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, (float*)12);

-    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->BindPatchCoords());
-    glVertexAttribPointer(4, 2, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 5, (float*)12);
+    glEnableVertexAttribArray(g_defaultProgram.attrDuu);
+    glEnableVertexAttribArray(g_defaultProgram.attrDuv);
+    glEnableVertexAttribArray(g_defaultProgram.attrDvv);
+    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->Bind2ndDerivatives());
+    glVertexAttribPointer(4, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 9, 0);
+    glVertexAttribPointer(5, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 9, (float*)12);
+    glVertexAttribPointer(6, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 9, (float*)24);

-    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->BindFaceVaryingData());
-    glVertexAttribPointer(5, 2, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 2, 0);
-
-    glEnableVertexAttribArray(g_defaultProgram.attrPosition);
-    glEnableVertexAttribArray(g_defaultProgram.attrColor);
-    glEnableVertexAttribArray(g_defaultProgram.attrTangentU);
-    glEnableVertexAttribArray(g_defaultProgram.attrTangentV);
    glEnableVertexAttribArray(g_defaultProgram.attrPatchCoord);
+    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->BindPatchCoords());
+    glVertexAttribPointer(7, 2, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 5, (float*)12);
+
+    if (g_evalOutput->HasFaceVaryingData()) {
        glEnableVertexAttribArray(g_defaultProgram.attrFVarData);
+        glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->BindFaceVaryingData());
+        glVertexAttribPointer(8, 2, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 2, 0);
+    }

    glPointSize(2.0f);
    int nPatchCoords = (int)g_particles->GetPatchCoords().size();
@ -933,8 +1022,11 @@ drawSamples() {

    glDisableVertexAttribArray(g_defaultProgram.attrPosition);
    glDisableVertexAttribArray(g_defaultProgram.attrColor);
-    glDisableVertexAttribArray(g_defaultProgram.attrTangentU);
-    glDisableVertexAttribArray(g_defaultProgram.attrTangentV);
+    glDisableVertexAttribArray(g_defaultProgram.attrDu);
+    glDisableVertexAttribArray(g_defaultProgram.attrDv);
+    glDisableVertexAttribArray(g_defaultProgram.attrDuu);
+    glDisableVertexAttribArray(g_defaultProgram.attrDuv);
+    glDisableVertexAttribArray(g_defaultProgram.attrDvv);
    glDisableVertexAttribArray(g_defaultProgram.attrPatchCoord);
    glDisableVertexAttribArray(g_defaultProgram.attrFVarData);

@ -1297,6 +1389,7 @@ initHUD() {
    g_hud.AddPullDownButton(shading_pulldown, "Normal", kNORMAL, g_drawMode==kNORMAL);
    g_hud.AddPullDownButton(shading_pulldown, "Shade", kSHADE, g_drawMode==kSHADE);
    g_hud.AddPullDownButton(shading_pulldown, "FaceVarying", kFACEVARYING, g_drawMode==kFACEVARYING);
+    g_hud.AddPullDownButton(shading_pulldown, "Mean Curvature", kMEAN_CURVATURE, g_drawMode==kMEAN_CURVATURE);

    for (int i = 1; i < 11; ++i) {
        char level[16];
--- a/examples/glEvalLimit/init_shapes.h
+++ b/examples/glEvalLimit/init_shapes.h
@ -47,6 +47,7 @@ static void initShapes() {
    g_defaultShapes.push_back( ShapeDesc("catmark_cube_corner4",     catmark_cube_corner4,     kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_cube_creases0",    catmark_cube_creases0,    kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_cube_creases1",    catmark_cube_creases1,    kCatmark ) );
+    g_defaultShapes.push_back( ShapeDesc("catmark_cube_creases2",    catmark_cube_creases2,    kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_cube",             catmark_cube,             kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_dart_edgecorner",  catmark_dart_edgecorner,  kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_dart_edgeonly",    catmark_dart_edgeonly,    kCatmark ) );
@ -55,6 +56,8 @@ static void initShapes() {
    g_defaultShapes.push_back( ShapeDesc("catmark_chaikin0",         catmark_chaikin0,         kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_chaikin1",         catmark_chaikin1,         kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_chaikin2",         catmark_chaikin2,         kCatmark ) );
+    g_defaultShapes.push_back( ShapeDesc("catmark_single_crease",    catmark_single_crease,    kCatmark ) );
+    g_defaultShapes.push_back( ShapeDesc("catmark_inf_crease0",      catmark_inf_crease0,      kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_fan",              catmark_fan,              kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_flap",             catmark_flap,             kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_flap2",            catmark_flap2,            kCatmark ) );
--- a/examples/glEvalLimit/particles.h
+++ b/examples/glEvalLimit/particles.h
@ -39,7 +39,7 @@
 // particle is assigned a location on the subdivision surface limit that is
 // composed of a unique ptex face index, with a local (s,t) parametric pair.
 //
-// The system also generates an array of parametric velocties (ds, dt) for each
+// The system also generates an array of parametric velocities (ds, dt) for each
 // particle. An Update() function then applies the velocities to the locations and
 // moves the points along the parametric space.
 //
@ -49,7 +49,7 @@
 // bounces it, if the edge happens to be a boundary.
 // 
 // Note: currently the adjacency code does not handle 'diagonal' crossings, nor
-// crossings between quand and non-quad faces.
+// crossings between quad and non-quad faces.
 //
 class STParticles {

--- a/examples/glFVarViewer/glFVarViewer.cpp
+++ b/examples/glFVarViewer/glFVarViewer.cpp
@ -156,22 +156,39 @@ struct FVarData
            glDeleteTextures(1, &textureParamBuffer);
        textureParamBuffer = 0;
    }
-    void Create(OpenSubdiv::Far::PatchTable const *patchTable,
-                int fvarWidth, std::vector<float> const & fvarSrcData) {
+    void Create(OpenSubdiv::Far::TopologyRefiner const *refiner,
+                OpenSubdiv::Far::PatchTable const *patchTable,
+                std::vector<float> const & fvarSrcData,
+                int fvarWidth, int fvarChannel = 0) {

        using namespace OpenSubdiv;

        Release();
-        Far::ConstIndexArray indices = patchTable->GetFVarValues();

-        const float * fvarSrcDataPtr = &fvarSrcData[0];
-        Osd::CpuVertexBuffer *fvarBuffer = NULL;
+        Far::StencilTableFactory::Options soptions;
+        soptions.interpolationMode = Far::StencilTableFactory::INTERPOLATE_FACE_VARYING;
+        soptions.fvarChannel = fvarChannel;
+        soptions.generateOffsets = true;
+        soptions.generateIntermediateLevels = !refiner->IsUniform();
+        Far::StencilTable const *fvarStencils =
+            Far::StencilTableFactory::Create(*refiner, soptions);
+
+        if (Far::StencilTable const *fvarStencilsWithLocalPoints =
+            Far::StencilTableFactory::AppendLocalPointStencilTableFaceVarying(
+                *refiner,
+                fvarStencils,
+                patchTable->GetLocalPointFaceVaryingStencilTable(),
+                fvarChannel)) {
+            delete fvarStencils;
+            fvarStencils = fvarStencilsWithLocalPoints;
+        }

-        int numLocalFVarPoints = patchTable->GetNumLocalPointsFaceVarying();
-        if (numLocalFVarPoints > 0) {
        int numSrcFVarPoints = (int)fvarSrcData.size() / fvarWidth;
-            fvarBuffer = Osd::CpuVertexBuffer::Create(
-                fvarWidth, numSrcFVarPoints + numLocalFVarPoints);
+        int numFVarPoints = numSrcFVarPoints
+                          + fvarStencils->GetNumStencils();
+
+        Osd::CpuVertexBuffer *fvarBuffer =
+            Osd::CpuVertexBuffer::Create(fvarWidth, numFVarPoints);
        fvarBuffer->UpdateData(&fvarSrcData[0], 0, numSrcFVarPoints);

        Osd::BufferDescriptor srcDesc(0, fvarWidth, fvarWidth);
@ -180,10 +197,12 @@ struct FVarData

        Osd::CpuEvaluator::EvalStencils(fvarBuffer, srcDesc,
                                        fvarBuffer, dstDesc,
-                                            patchTable->GetLocalPointFaceVaryingStencilTable());
+                                        fvarStencils);

-            fvarSrcDataPtr = fvarBuffer->BindCpuBuffer();
-        }
+        Far::ConstIndexArray indices = patchTable->GetFVarValues();
+        const float * fvarSrcDataPtr = !refiner->IsUniform()
+            ? fvarBuffer->BindCpuBuffer()
+            : fvarBuffer->BindCpuBuffer() + numSrcFVarPoints * fvarWidth;

        // expand fvardata to per-patch array
        std::vector<float> data;
@ -201,9 +220,7 @@ struct FVarData
        glBufferData(GL_ARRAY_BUFFER, data.size()*sizeof(float),
                     &data[0], GL_STATIC_DRAW);

-        if (fvarBuffer) {
        delete fvarBuffer;
-        }

        glGenTextures(1, &textureBuffer);
        glBindTexture(GL_TEXTURE_BUFFER, textureBuffer);
@ -390,6 +407,11 @@ rebuildMesh() {

    Shape * shape = Shape::parseObj(shapeDesc.data.c_str(), shapeDesc.scheme);

+    if (!shape->HasUV()) {
+        printf("Error: shape %s does not contain face-varying UVs\n", shapeDesc.name.c_str());
+        exit(1);
+    }
+
    // create Far mesh (topology)
    OpenSubdiv::Sdc::SchemeType sdctype = GetSdcType(*shape);
    OpenSubdiv::Sdc::Options sdcoptions = GetSdcOptions(*shape);
@ -435,13 +457,9 @@ rebuildMesh() {
                                           numVaryingElements,
                                           level, bits);

-    std::vector<float> fvarData;
-
-    InterpolateFVarData(*refiner, *shape, fvarData);
-
    // set fvardata to texture buffer
-    g_fvarData.Create(g_mesh->GetFarPatchTable(),
-                      shape->GetFVarWidth(), fvarData);
+    g_fvarData.Create(refiner, g_mesh->GetFarPatchTable(),
+                      shape->uvs, shape->GetFVarWidth());

    delete shape;

@ -816,7 +834,7 @@ display() {

    glEnable(GL_DEPTH_TEST);

-    // make sure that the vertex buffer is interoped back as a GL resources.
+    // make sure that the vertex buffer is interoped back as a GL resource.
    GLuint vbo = g_mesh->BindVertexBuffer();

    glBindVertexArray(g_vao);
@ -1096,7 +1114,7 @@ initHUD() {
    for (int i = 1; i < 11; ++i) {
        char level[16];
        sprintf(level, "Lv. %d", i);
-        g_hud.AddRadioButton(3, level, i == 2, 10, 270 + i*20, callbackLevel, i, '0'+(i%10));
+        g_hud.AddRadioButton(3, level, i == g_level, 10, 270 + i*20, callbackLevel, i, '0'+(i%10));
    }

    typedef OpenSubdiv::Sdc::Options SdcOptions;
@ -1162,26 +1180,42 @@ callbackErrorGLFW(int error, const char* description) {
    fprintf(stderr, "GLFW Error (%d) : %s\n", error, description);
 }

+//------------------------------------------------------------------------------
+// Parses 'argString' as a base-10 integer command-line option parameter.
+//
+// Returns the parsed value when the entire string is an integer that fits
+// in an int; otherwise prints a warning and returns 'dfltValue'.
+static int
+parseIntArg(const char* argString, int dfltValue = 0) {
+    char *argEndptr = 0;
+    long parsedValue = strtol(argString, &argEndptr, 10);
+    int argValue = (int)parsedValue;
+
+    // For an empty string strtol() consumes nothing and leaves
+    // argEndptr == argString, so checking only "*argEndptr != 0" would
+    // accept "" as 0. Also reject values that do not survive the
+    // long -> int narrowing.
+    if (argEndptr == argString || *argEndptr != 0 ||
+        (long)argValue != parsedValue) {
+        printf("Warning: non-integer option parameter '%s' ignored\n", argString);
+        argValue = dfltValue;
+    }
+    return argValue;
+}
+
 //------------------------------------------------------------------------------
 int main(int argc, char ** argv) {

    bool fullscreen = false;
    std::string str;
    for (int i = 1; i < argc; ++i) {
-        if (!strcmp(argv[i], "-d"))
-            g_level = atoi(argv[++i]);
-        else if (!strcmp(argv[i], "-c"))
-            g_repeatCount = atoi(argv[++i]);
-        else if (!strcmp(argv[i], "-f"))
+        if (!strcmp(argv[i], "-d")) {
+            if (++i < argc) g_level = parseIntArg(argv[i], g_level);
+        } else if (!strcmp(argv[i], "-c")) {
+            if (++i < argc) g_repeatCount = parseIntArg(argv[i], g_repeatCount);
+        } else if (!strcmp(argv[i], "-f")) {
            fullscreen = true;
-        else {
-            std::ifstream ifs(argv[1]);
+        } else if (argv[i][0] == '-') {
+            printf("Warning: unrecognized option '%s' ignored\n", argv[i]);
+        } else {
+            std::ifstream ifs(argv[i]);
            if (ifs) {
                std::stringstream ss;
                ss << ifs.rdbuf();
                ifs.close();
                str = ss.str();
-                g_defaultShapes.push_back(ShapeDesc(argv[1], str.c_str(), kCatmark));
+                g_defaultShapes.push_back(ShapeDesc(argv[i], str.c_str(), kCatmark));
+            } else {
+                printf("Warning: cannot open shape file '%s'\n", argv[i]);
            }
        }
    }
@ -1250,7 +1284,7 @@ int main(int argc, char ** argv) {
        exit(1);
    }
 #ifdef CORE_PROFILE
-    // clear GL errors which was generated during glewInit()
+    // clear GL errors which were generated during glewInit()
    glGetError();
 #endif
 #endif
--- a/examples/glFVarViewer/init_shapes.h
+++ b/examples/glFVarViewer/init_shapes.h
@ -40,6 +40,10 @@ static std::vector<ShapeDesc> g_defaultShapes;
 //------------------------------------------------------------------------------
 static void initShapes() {

+    //
+    //  Note that any shapes added here must have UVs -- loading a shape without UVs is a fatal
+    //  error and will result in termination when it is selected.
+    //
    g_defaultShapes.push_back( ShapeDesc("catmark_cube_corner0",     catmark_cube_corner0,     kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_cube_corner1",     catmark_cube_corner1,     kCatmark ) );
    g_defaultShapes.push_back( ShapeDesc("catmark_cube_corner2",     catmark_cube_corner2,     kCatmark ) );
--- a/examples/glPaintTest/glPaintTest.cpp
+++ b/examples/glPaintTest/glPaintTest.cpp
@ -648,7 +648,7 @@ display() {

    bindTextures(effect);

-    // make sure that the vertex buffer is interoped back as a GL resources.
+    // make sure that the vertex buffer is interoped back as a GL resource.
    g_mesh->BindVertexBuffer();

    glBindVertexArray(g_vao);
@ -694,7 +694,7 @@ display() {
        g_fpsTimer.Stop();
        double fps = 1.0/g_fpsTimer.GetElapsed();
        g_fpsTimer.Start();
-        // Avereage fps over a defined number of time samples for
+        // Average fps over a defined number of time samples for
        // easier reading in the HUD
        g_fpsTimeSamples[g_currentFpsTimeSample++] = float(fps);
        if (g_currentFpsTimeSample >= NUM_FPS_TIME_SAMPLES)
@ -776,7 +776,7 @@ drawStroke(int x, int y) {

    glBindBufferBase(GL_UNIFORM_BUFFER, g_tessellationBinding, g_tessellationUB);

-    // make sure that the vertex buffer is interoped back as a GL resources.
+    // make sure that the vertex buffer is interoped back as a GL resource.
    g_mesh->BindVertexBuffer();

    glBindVertexArray(g_vao);
@ -1153,7 +1153,7 @@ int main(int argc, char ** argv) {
        exit(1);
    }
 #ifdef CORE_PROFILE
-    // clear GL errors which was generated during glewInit()
+    // clear GL errors which were generated during glewInit()
    glGetError();
 #endif
 #endif
--- a/examples/glPtexViewer/glPtexViewer.cpp
+++ b/examples/glPtexViewer/glPtexViewer.cpp
@ -1117,7 +1117,7 @@ updateConstantUniformBlock() {
    memcpy(g_modelViewProjection, constantData.ModelViewProjectionMatrix,
           16*sizeof(float));

-    // lighs
+    // lights
    Constant::Light light0 = {  { 0.6f, 1.0f, 0.6f, 0.0f },
                                { 0.1f, 0.1f, 0.1f, 1.0f },
                                { 1.7f, 1.3f, 1.1f, 1.0f },
@ -1401,7 +1401,7 @@ display() {
    if (g_hud.IsVisible()) {
        double fps = 1.0/elapsed;

-        // Avereage fps over a defined number of time samples for
+        // Average fps over a defined number of time samples for
        // easier reading in the HUD
        g_fpsTimeSamples[g_currentFpsTimeSample++] = float(fps);
        if (g_currentFpsTimeSample >= NUM_FPS_TIME_SAMPLES)
@ -1695,9 +1695,9 @@ void usage(const char *program) {
    printf("          -e <specularEnvMap.hdr> : specular environment map for IBL\n");
    printf("          -s <shaderfile.glsl>    : custom shader file\n");
    printf("          -y                      : Y-up model\n");
-    printf("          -m level                : max mimmap level (default=10)\n");
+    printf("          -m level                : max mipmap level (default=10)\n");
    printf("          -x <ptex limit MB>      : ptex target memory size\n");
-    printf("          --disp <scale>          : Displacment scale\n");
+    printf("          --disp <scale>          : Displacement scale\n");
 }

 //------------------------------------------------------------------------------
--- a/examples/glShareTopology/glShareTopology.cpp
+++ b/examples/glShareTopology/glShareTopology.cpp
@ -635,7 +635,7 @@ display() {

    glEnable(GL_DEPTH_TEST);

-    // make sure that the vertex buffer is interoped back as a GL resources.
+    // make sure that the vertex buffer is interoped back as a GL resource.
    g_scene->BindVertexBuffer();

    glBindVertexArray(g_vao);
@ -1193,7 +1193,7 @@ int main(int argc, char ** argv) {
    glfwMakeContextCurrent(g_window);
    GLUtils::PrintGLVersion();

-    // accommocate high DPI displays (e.g. mac retina displays)
+    // accommodate high DPI displays (e.g. mac retina displays)
    glfwGetFramebufferSize(g_window, &g_width, &g_height);
    glfwSetFramebufferSizeCallback(g_window, reshape);

@ -1212,7 +1212,7 @@ int main(int argc, char ** argv) {
        exit(1);
    }
 #ifdef CORE_PROFILE
-    // clear GL errors which was generated during glewInit()
+    // clear GL errors which were generated during glewInit()
    glGetError();
 #endif
 #endif
--- a/examples/glShareTopology/sceneBase.cpp
+++ b/examples/glShareTopology/sceneBase.cpp
@ -169,7 +169,7 @@ SceneBase::createStencilTable(Shape const *shape, int level, bool varying,

    _stencilTableSize = createMeshRefiner(vertexStencils, varyingStencils,
                                          numControlVertices);
-    // note: refiner takes ownerships of vertexStencils/ varyingStencils, patchTable
+    // note: refiner takes ownership of vertexStencils, varyingStencils, patchTable

    delete refiner;
    return numControlVertices + vertexStencils->GetNumStencils();
--- a/examples/glStencilViewer/glStencilViewer.cpp
+++ b/examples/glStencilViewer/glStencilViewer.cpp
@ -1130,7 +1130,7 @@ int main(int argc, char **argv) {
        exit(1);
    }
 #ifdef CORE_PROFILE
-    // clear GL errors which was generated during glewInit()
+    // clear GL errors which were generated during glewInit()
    glGetError();
 #endif
 #endif
--- a/examples/glViewer/glViewer.cpp
+++ b/examples/glViewer/glViewer.cpp
@ -85,10 +85,10 @@ OpenSubdiv::Osd::GLLegacyGregoryPatchTable *g_legacyGregoryPatchTable = NULL;


 /* Function to get the correct shader file based on the opengl version.
-  The implentation varies depending if glew is available or not. In case
+  The implementation varies depending on whether glew is available. In case it
  is available the capabilities are queried during execution and the correct
-  source is returned. If glew in not available during compile time the version
-  is determined*/
+  source is returned. If glew is not available the version is determined at
+  compile time */
 static const char *shaderSource(){
 #if ! defined(OSD_USES_GLEW)

@ -110,7 +110,7 @@ static const char *res =
                ;
            //Determine the shader file to use. Since some opengl implementations
            //define that an extension is available but not an implementation 
-            //for it you cannnot trust in the glew header definitions to know that is 
+            //for it you cannot trust in the glew header definitions to know that it is
            //available, but you need to query it during runtime.
            if (GLUtils::SupportsAdaptiveTessellation())
                res = gen;
@ -162,6 +162,7 @@ enum HudCheckBox { kHUD_CB_DISPLAY_CONTROL_MESH_EDGES,
                   kHUD_CB_FREEZE,
                   kHUD_CB_DISPLAY_PATCH_COUNTS,
                   kHUD_CB_ADAPTIVE,
+                   kHUD_CB_SMOOTH_CORNER_PATCH,
                   kHUD_CB_SINGLE_CREASE_PATCH,
                   kHUD_CB_INF_SHARP_PATCH };

@ -182,6 +183,7 @@ int   g_fullscreen = 0,
      g_displayStyle = kDisplayStyleWireOnShaded,
      g_adaptive = 1,
      g_endCap = kEndCapBSplineBasis,
+      g_smoothCornerPatch = 0,
      g_singleCreasePatch = 1,
      g_infSharpPatch = 0,
      g_mbutton[3] = {0, 0, 0},
@ -447,11 +449,13 @@ rebuildMesh() {
    // Adaptive refinement currently supported only for catmull-clark scheme
    bool doAdaptive = (g_adaptive!=0 && scheme==kCatmark);
    bool interleaveVarying = g_shadingMode == kShadingInterleavedVaryingColor;
+    bool doSmoothCornerPatch = (g_smoothCornerPatch!=0 && scheme==kCatmark);
    bool doSingleCreasePatch = (g_singleCreasePatch!=0 && scheme==kCatmark);
    bool doInfSharpPatch = (g_infSharpPatch!=0 && scheme==kCatmark);

    Osd::MeshBitset bits;
    bits.set(Osd::MeshAdaptive, doAdaptive);
+    bits.set(Osd::MeshUseSmoothCornerPatch, doSmoothCornerPatch);
    bits.set(Osd::MeshUseSingleCreasePatch, doSingleCreasePatch);
    bits.set(Osd::MeshUseInfSharpPatch, doInfSharpPatch);
    bits.set(Osd::MeshInterleaveVarying, interleaveVarying);
@ -1084,7 +1088,7 @@ display() {
    inverseMatrix(g_transformData.ModelViewInverseMatrix,
                  g_transformData.ModelViewMatrix);

-    // make sure that the vertex buffer is interoped back as a GL resources.
+    // make sure that the vertex buffer is interoped back as a GL resource.
    GLuint vbo = g_mesh->BindVertexBuffer();

    // vertex texture update for legacy gregory drawing
@ -1402,6 +1406,10 @@ callbackCheckBox(bool checked, int button) {
            g_adaptive = checked;
            rebuildMesh();
            return;
+        case kHUD_CB_SMOOTH_CORNER_PATCH:
+            g_smoothCornerPatch = checked;
+            rebuildMesh();
+            return;
        case kHUD_CB_SINGLE_CREASE_PATCH:
            g_singleCreasePatch = checked;
            rebuildMesh();
@ -1541,13 +1549,15 @@ initHUD() {
    if (GLUtils::SupportsAdaptiveTessellation()) {
        g_hud.AddCheckBox("Adaptive (`)", g_adaptive!=0,
                          10, 190, callbackCheckBox, kHUD_CB_ADAPTIVE, '`');
+        g_hud.AddCheckBox("Smooth Corner Patch (O)", g_smoothCornerPatch!=0,
+                          10, 210, callbackCheckBox, kHUD_CB_SMOOTH_CORNER_PATCH, 'o');
        g_hud.AddCheckBox("Single Crease Patch (S)", g_singleCreasePatch!=0,
-                          10, 210, callbackCheckBox, kHUD_CB_SINGLE_CREASE_PATCH, 's');
+                          10, 230, callbackCheckBox, kHUD_CB_SINGLE_CREASE_PATCH, 's');
        g_hud.AddCheckBox("Inf Sharp Patch (I)", g_infSharpPatch!=0,
-                          10, 230, callbackCheckBox, kHUD_CB_INF_SHARP_PATCH, 'i');
+                          10, 250, callbackCheckBox, kHUD_CB_INF_SHARP_PATCH, 'i');

        int endcap_pulldown = g_hud.AddPullDown(
-            "End cap (E)", 10, 250, 200, callbackEndCap, 'e');
+            "End cap (E)", 10, 270, 200, callbackEndCap, 'e');
        g_hud.AddPullDownButton(endcap_pulldown,"None",
                                kEndCapNone,
                                g_endCap == kEndCapNone);
@ -1709,7 +1719,7 @@ int main(int argc, char ** argv) {
    glfwMakeContextCurrent(g_window);
    GLUtils::PrintGLVersion();

-    // accommocate high DPI displays (e.g. mac retina displays)
+    // accommodate high DPI displays (e.g. mac retina displays)
    glfwGetFramebufferSize(g_window, &g_width, &g_height);
    glfwSetFramebufferSizeCallback(g_window, reshape);

@ -1728,7 +1738,7 @@ int main(int argc, char ** argv) {
        exit(1);
    }
 #ifdef CORE_PROFILE
-    // clear GL errors which was generated during glewInit()
+    // clear GL errors which were generated during glewInit()
    glGetError();
 #endif
 #endif
--- a/opensubdiv/far/endCapBSplineBasisPatchFactory.cpp
+++ b/opensubdiv/far/endCapBSplineBasisPatchFactory.cpp
@ -132,7 +132,7 @@ EndCapBSplineBasisPatchFactory::getPatchPointsFromGregoryBasis(

    // XXX: For now, always create new 16 indices for each patch.
    // we'll optimize later to share all regular control points with
-    // other patches as well as to try to make extra ordinary verts watertight.
+    // other patches as well as try to make extraordinary verts watertight.

    int offset = (fvarChannel < 0)
               ? _refiner->GetNumVerticesTotal()
@ -298,12 +298,12 @@ EndCapBSplineBasisPatchFactory::getPatchPoints(
    //
    //  This function assumes the patch is not on boundary
    //  and it contains only 1 extraordinary vertex.
-    //  The location of the extraoridnary vertex can be one of
+    //  The location of the extraordinary vertex can be one of
    //  0-ring quad corner.
    //
-    //  B-Spline control point gathering indice
+    //  B-Spline control point gathering indices
    //
-    //     [5]   (4)---(15)--(14)    0 : extraoridnary vertex
+    //     [5]   (4)---(15)--(14)    0 : extraordinary vertex
    //            |     |     |
    //            |     |     |      1,2,3,9,10,11,12,13 :
    //     (6)----0-----3-----13       B-Spline control points, gathered by
@ -467,7 +467,7 @@ EndCapBSplineBasisPatchFactory::getPatchPoints(
    X5.AddWithWeight(X7,                             -1.0f);
    X5.AddWithWeight(X15,                            -1.0f);

-    //     [5]   (4)---(15)--(14)    0 : extraoridnary vertex
+    //     [5]   (4)---(15)--(14)    0 : extraordinary vertex
    //            |     |     |
    //            |     |     |      1,2,3,9,10,11,12,13 :
    //     (6)----0-----3-----13       B-Spline control points, gathered by
--- a/opensubdiv/far/endCapBSplineBasisPatchFactory.h
+++ b/opensubdiv/far/endCapBSplineBasisPatchFactory.h
@ -45,7 +45,7 @@ class TopologyRefiner;
 class EndCapBSplineBasisPatchFactory {

 public:
-    /// \brief This factory accumulates vertex for bspline basis end cap
+    /// \brief This factory accumulates vertices for bspline basis end cap
    ///
    /// @param refiner                TopologyRefiner from which to generate patches
    ///
@ -66,7 +66,7 @@ public:
    /// \brief Returns end patch point indices for \a faceIndex of \a level.
    ///        Note that end patch points are not included in the vertices in
    ///        the topologyRefiner, they're expected to come after the end.
-    ///        The returning indices are offsetted by refiner->GetNumVerticesTotal.
+    ///        The returned indices are offset by refiner->GetNumVerticesTotal.
    ///
    /// @param level            vtr refinement level
    ///
--- a/opensubdiv/far/endCapGregoryBasisPatchFactory.cpp
+++ b/opensubdiv/far/endCapGregoryBasisPatchFactory.cpp
@ -190,7 +190,7 @@ EndCapGregoryBasisPatchFactory::GetPatchPoints(
                int aedge = aedges.FindIndexIn4Tuple(edge);
                assert(aedge!=Vtr::INDEX_INVALID);

-                // Find index of basis in the list of basis already generated
+                // Find index of basis in the list of bases already generated
                unsigned int adjLevelAndFaceIndex = LevelAndFaceIndex::create(levelIndex, adjFaceIndex);
                unsigned int * ptr = (unsigned int *)std::bsearch(&adjLevelAndFaceIndex,
                                                                  &_levelAndFaceIndices[0],
@ -212,7 +212,7 @@ EndCapGregoryBasisPatchFactory::GetPatchPoints(
                Index * src = &_patchPoints[adjPatchIndex*20];
                for (int j=0; j<4; ++j) {
                    // invert direction
-                    // note that src  indices have already been offsetted.
+                    // note that src indices have already been offset.
                    dest[gregoryEdgeVerts[i][3-j]] = src[gregoryEdgeVerts[aedge][j]];
                }
            }
--- a/opensubdiv/far/endCapGregoryBasisPatchFactory.h
+++ b/opensubdiv/far/endCapGregoryBasisPatchFactory.h
@ -72,7 +72,7 @@ public:
    /// \brief Returns end patch point indices for \a faceIndex of \a level.
    ///        Note that end patch points are not included in the vertices in
    ///        the topologyRefiner, they're expected to come after the end.
-    ///        The returning indices are offsetted by refiner->GetNumVerticesTotal.
+    ///        The returned indices are offset by refiner->GetNumVerticesTotal.
    ///
    /// @param level            vtr refinement level
    ///
@ -91,7 +91,7 @@ public:
 private:

    /// Creates a basis for the vertices specified in mask on the face and
-    /// accumates it
+    /// accumulates it
    bool addPatchBasis(Vtr::internal::Level const & level, Index faceIndex,
                       Vtr::internal::Level::VSpan const cornerSpans[],
                       bool newVerticesMask[4][5],
--- a/opensubdiv/far/endCapLegacyGregoryPatchFactory.cpp
+++ b/opensubdiv/far/endCapLegacyGregoryPatchFactory.cpp
@ -140,7 +140,7 @@ EndCapLegacyGregoryPatchFactory::Finalize(
    //      - it allocates 2*maxvalence+1 for ALL vertices
    //      - it initializes the one-ring for ALL vertices
    //  We use the full size expected (not sure what else relies on that) but 
-    //  we avoiding initializing
+    //  we avoid initializing
    //  the vast majority of vertices that are not associated with gregory 
    //  patches -- by having previously marked those that are associated above
    //  and skipping all others.
--- a/opensubdiv/far/endCapLegacyGregoryPatchFactory.h
+++ b/opensubdiv/far/endCapLegacyGregoryPatchFactory.h
@ -50,7 +50,7 @@ public:
    /// \brief Returns end patch point indices for \a faceIndex of \a level.
    ///        Note that legacy gregory patch points exist in the max level
    ///        of subdivision in the topologyRefiner.
-    ///        The returning indices are offsetted by levelVertOffset
+    ///        The returned indices are offset by levelVertOffset
    ///
    /// @param level            vtr refinement level
    ///
--- a/opensubdiv/far/patchParam.h
+++ b/opensubdiv/far/patchParam.h
@ -173,7 +173,7 @@ struct PatchParam {
    /// \brief Resets everything to 0
    void Clear() { field0 = field1 = 0; }

-    /// \brief Retuns the faceid
+    /// \brief Returns the faceid
    Index GetFaceId() const { return Index(unpack(field0,28,0)); }

    /// \brief Returns the log2 value of the u parameter at
--- a/opensubdiv/far/patchTable.h
+++ b/opensubdiv/far/patchTable.h
@ -42,7 +42,7 @@ namespace Far {

 /// \brief Container for arrays of parametric patches
 ///
-/// PatchTable contain topology and parametric information about the patches
+/// PatchTable contains topology and parametric information about the patches
 /// generated by the Refinement process. Patches in the table are sorted into
 /// arrays based on their PatchDescriptor Type.
 ///
@ -371,7 +371,7 @@ public:
    /// \brief Evaluate basis functions for position and derivatives at a
    /// given (u,v) parametric location of a patch.
    ///
-    /// @param handle  A patch handle indentifying the sub-patch containing the
+    /// @param handle  A patch handle identifying the sub-patch containing the
    ///                (u,v) location
    ///
    /// @param u       Patch coordinate (in base face normalized space)
@ -397,7 +397,7 @@ public:
    /// \brief Evaluate basis functions for a varying value and
    /// derivatives at a given (u,v) parametric location of a patch.
    ///
-    /// @param handle  A patch handle indentifying the sub-patch containing the
+    /// @param handle  A patch handle identifying the sub-patch containing the
    ///                (u,v) location
    ///
    /// @param u       Patch coordinate (in base face normalized space)
@ -423,7 +423,7 @@ public:
    /// \brief Evaluate basis functions for a face-varying value and
    /// derivatives at a given (u,v) parametric location of a patch.
    ///
-    /// @param handle  A patch handle indentifying the sub-patch containing the
+    /// @param handle  A patch handle identifying the sub-patch containing the
    ///                (u,v) location
    ///
    /// @param u       Patch coordinate (in base face normalized space)
--- a/opensubdiv/far/patchTableFactory.cpp
+++ b/opensubdiv/far/patchTableFactory.cpp
@ -312,9 +312,6 @@ public:

    Options const options;

-    //  Additional options eventually to be made public in Options above:
-    bool options_approxSmoothCornerWithSharp;
-
    PtexIndices const ptexIndices;

    // Counters accumulating each type of patch during topology traversal
@ -340,9 +337,6 @@ PatchTableFactory::BuilderContext::BuilderContext(
    numRegularPatches(0), numIrregularPatches(0),
    numIrregularBoundaryPatches(0) {

-    //  Eventually to be passed in as Options and assigned to member...
-    options_approxSmoothCornerWithSharp = true;
-
    if (options.generateFVarTables) {
        // If client-code does not select specific channels, default to all
        // the channels in the refiner.
@ -623,7 +617,7 @@ PatchTableFactory::BuilderContext::IsPatchRegular(
    }

    //  Legacy option -- reinterpret an irregular smooth corner as sharp if specified:
-    if (!isRegular && options_approxSmoothCornerWithSharp) {
+    if (!isRegular && options.generateLegacySharpCornerPatches) {
        if (fCompVTag._xordinary && fCompVTag._boundary && !fCompVTag._nonManifold) {
            isRegular = IsPatchSmoothCorner(levelIndex, faceIndex, fvcRefiner);
        }
@ -756,7 +750,7 @@ PatchTableFactory::BuilderContext::GetIrregularPatchCornerSpans(
        }

        //  Legacy option -- reinterpret an irregular smooth corner as sharp if specified:
-        if (!cornerSpans[i]._sharp && options_approxSmoothCornerWithSharp) {
+        if (!cornerSpans[i]._sharp && options.generateLegacySharpCornerPatches) {
            if (vTags[i]._xordinary && vTags[i]._boundary && !vTags[i]._nonManifold) {
                    int nFaces = cornerSpans[i].isAssigned() ? cornerSpans[i]._numFaces
                               : level.getVertexFaces(fVerts[i]).size();
--- a/opensubdiv/far/patchTableFactory.h
+++ b/opensubdiv/far/patchTableFactory.h
@ -65,6 +65,7 @@ public:
             shareEndCapPatchPoints(true),
             generateFVarTables(false),
             generateFVarLegacyLinearPatches(true),
+             generateLegacySharpCornerPatches(true),
             numFVarChannels(-1),
             fvarChannelIndices(0)
        { }
@ -88,7 +89,11 @@ public:

                     // face-varying
                     generateFVarTables  : 1, ///< Generate face-varying patch tables
-                     generateFVarLegacyLinearPatches : 1; ///< Generate all linear face-varying patches (legacy)
+
+                     // legacy behaviors (default to true)
+                     generateFVarLegacyLinearPatches  : 1, ///< Generate all linear face-varying patches (legacy)
+                     generateLegacySharpCornerPatches : 1; ///< Generate sharp regular patches at smooth corners (legacy)
+
        int          numFVarChannels;          ///< Number of channel indices and interpolation modes passed
        int const *  fvarChannelIndices;       ///< List containing the indices of the channels selected for the factory
    };
--- a/opensubdiv/far/primvarRefiner.h
+++ b/opensubdiv/far/primvarRefiner.h
@ -92,14 +92,14 @@ public:
    ///       (ex. std::vector<MyVertex>).
    ///       Some interpolation methods however allow passing the buffers by
    ///       reference: this allows to work transparently with arrays and
-    ///       containers (or other scheme that overload the '[]' operator)
+    ///       containers (or other schemes that overload the '[]' operator)
    ///       <br><br>
    ///       See the <a href=http://graphics.pixar.com/opensubdiv/docs/tutorials.html>
    ///       Far tutorials</a> for code examples.
    ///

    /// \brief Apply vertex interpolation weights to a primvar buffer for a single
-    ///        level level of refinement.
+    ///        level of refinement.
    ///
    /// The destination buffer must allocate an array of data for all the
    /// refined vertices, i.e. at least refiner.GetLevel(level).GetNumVertices()
@ -113,7 +113,7 @@ public:
    template <class T, class U> void Interpolate(int level, T const & src, U & dst) const;

    /// \brief Apply only varying interpolation weights to a primvar buffer
-    ///        for a single level level of refinement.
+    ///        for a single level of refinement.
    ///
    /// This method can useful if the varying primvar data does not need to be
    /// re-computed over time.
@ -207,7 +207,7 @@ private:

 private:
    //
-    //  Local class to fulfil interface for <typename MASK> in the Scheme mask queries:
+    //  Local class to fulfill interface for <typename MASK> in the Scheme mask queries:
    //
    class Mask {
    public:
@ -410,7 +410,7 @@ PrimvarRefiner::InterpolateVarying(int level, T const & src, U & dst) const {
    Vtr::internal::Level const &      parent     = refinement.parent();

    //
-    //  Group values to interolate based on origin -- note that there may
+    //  Group values to interpolate based on origin -- note that there may
    //  be none originating from faces:
    //
    if (refinement.getNumChildVerticesFromFaces() > 0) {
@ -608,7 +608,7 @@ PrimvarRefiner::interpFromVerts(int level, T const & src, U & dst) const {
        //  Apply the weights to the parent vertex, the vertices opposite its incident
        //  edges, and the child vertices of its incident faces:
        //
-        //  In order to improve numerical precision, its better to apply smaller weights
+        //  In order to improve numerical precision, it's better to apply smaller weights
        //  first, so begin with the face-weights followed by the edge-weights and the
        //  vertex weight last.
        dst[cVert].Clear();
@ -705,7 +705,7 @@ PrimvarRefiner::interpFVarFromEdges(int level, T const & src, U & dst, int chann
    Vtr::internal::FVarLevel const &      childFVar  = childLevel.getFVarLevel(channel);

    //
-    //  Allocate and intialize (if linearly interpolated) interpolation weights for
+    //  Allocate and initialize (if linearly interpolated) interpolation weights for
    //  the edge mask:
    //
    float                               eVertWeights[2];
@ -748,7 +748,7 @@ PrimvarRefiner::interpFVarFromEdges(int level, T const & src, U & dst, int chann
                scheme.ComputeEdgeVertexMask(eHood, eMask, pRule, cRule);
            }

-            //  Apply the weights to the parent edges's vertices and (if applicable) to
+            //  Apply the weights to the parent edge's vertices and (if applicable) to
            //  the child vertices of its incident faces:
            //
            //  Even though the face-varying topology matches the vertex topology, we need
@ -915,7 +915,7 @@ PrimvarRefiner::interpFVarFromVerts(int level, T const & src, U & dst, int chann
            //  it matches.
            //
            //  As with applying the mask to vertex data, in order to improve numerical
-            //  precision, its better to apply smaller weights first, so begin with the
+            //  precision, it's better to apply smaller weights first, so begin with the
            //  face-weights followed by the edge-weights and the vertex weight last.
            //
            Vtr::Index pVertValue = pVertValues[0];
@ -978,7 +978,7 @@ PrimvarRefiner::interpFVarFromVerts(int level, T const & src, U & dst, int chann
                    float eWeight = 0.125f;

                    //
-                    //  If semisharp we need to apply fractional weighting -- if made sharp because
+                    //  If semi-sharp we need to apply fractional weighting -- if made sharp because
                    //  of the other sibling (dependent-sharp) use the fractional weight from that
                    //  other sibling (should only occur when there are 2):
                    //
@ -1093,7 +1093,7 @@ PrimvarRefiner::limit(T const & src, U & dstPos, U1 * dstTan1Ptr, U2 * dstTan2Pt

        //
        //  Combine the weights and indices for position and tangents.  As with applying
-        //  refinment masks to vertex data, in order to improve numerical precision, its
+        //  refinement masks to vertex data, in order to improve numerical precision, it's
        //  better to apply smaller weights first, so begin with the face-weights followed
        //  by the edge-weights and the vertex weight last.
        //
@ -1108,7 +1108,7 @@ PrimvarRefiner::limit(T const & src, U & dstPos, U1 * dstTan1Ptr, U2 * dstTan2Pt

        //
        //  Apply the tangent masks -- both will have the same number of weights and 
-        //  indices (one tangent may be "padded" to accomodate the other), but these
+        //  indices (one tangent may be "padded" to accommodate the other), but these
        //  may differ from those of the position:
        //
        if (hasTangents) {
--- a/opensubdiv/far/ptexIndices.cpp
+++ b/opensubdiv/far/ptexIndices.cpp
@ -163,7 +163,7 @@ PtexIndices::GetAdjacency(
            adjEdges[2] = 1;
        }

-        {   // resolve neighbor outisde the sub-face (edge 0)
+        {   // resolve neighbor outside the sub-face (edge 0)
            int edge0 = fedges[quadrant];
            Index adjface0 = getAdjacentFace(level, edge0, face);
            if (adjface0==-1) {
@ -182,7 +182,7 @@ PtexIndices::GetAdjacency(
                assert(adjFaces[0]!=-1);
            }

-            // resolve neighbor outisde the sub-face (edge 3)
+            // resolve neighbor outside the sub-face (edge 3)
            int edge3 = fedges[prevQuadrant];
            Index adjface3 = getAdjacentFace(level, edge3, face);
            if (adjface3==-1) {
--- a/opensubdiv/far/stencilBuilder.h
+++ b/opensubdiv/far/stencilBuilder.h
@ -53,7 +53,7 @@ public:

    void SetCoarseVertCount(int numVerts);

-    // Mapping from stencil[i] to it's starting offset in the sources[] and weights[] arrays;
+    // Mapping from stencil[i] to its starting offset in the sources[] and weights[] arrays;
    std::vector<int> const& GetStencilOffsets() const;

    // The number of contributing sources and weights in stencil[i]
--- a/opensubdiv/far/stencilTable.h
+++ b/opensubdiv/far/stencilTable.h
@ -83,7 +83,7 @@ public:
        return _size;
    }

-    /// \brief Returns the control vertices indices
+    /// \brief Returns the control vertices' indices
    Index const * GetVertexIndices() const {
        return _indices;
    }
@ -112,7 +112,7 @@ protected:

 /// \brief Table of subdivision stencils.
 ///
-/// Stencils are the most direct methods of evaluation of locations on the limit
+/// Stencils are the most direct method of evaluation of locations on the limit
 /// of a surface. Every point of a limit surface can be computed by linearly
 /// blending a collection of coarse control vertices.
 ///
@ -230,7 +230,7 @@ protected:

    int _numControlVertices;              // number of control vertices

-    std::vector<int> _sizes;    // number of coeffiecient for each stencil
+    std::vector<int>           _sizes;    // number of coefficients for each stencil
    std::vector<Index>         _offsets,  // offset to the start of each stencil
                               _indices;  // indices of contributing coarse vertices
    std::vector<float>         _weights;  // stencil weight coefficients
@ -449,7 +449,7 @@ private:
 };


-// Update values by appling cached stencil weights to new control values
+// Update values by applying cached stencil weights to new control values
 template <class T> void
 StencilTable::update(T const *controlValues, T *values,
    std::vector<float> const &valueWeights, Index start, Index end) const {
@ -476,7 +476,7 @@ StencilTable::update(T const *controlValues, T *values,
        // Zero out the result accumulators
        values[i].Clear();

-        // For each element in the array, add the coefs contribution
+        // For each element in the array, add the coef's contribution
        for (int j=0; j<*sizes; ++j, ++indices, ++weights) {
            values[i].AddWithWeight( controlValues[*indices], *weights );
        }
--- a/opensubdiv/far/stencilTableFactory.cpp
+++ b/opensubdiv/far/stencilTableFactory.cpp
@ -75,15 +75,22 @@ StencilTable const *
 StencilTableFactory::Create(TopologyRefiner const & refiner,
    Options options) {

+    bool interpolateVertex = options.interpolationMode==INTERPOLATE_VERTEX;
+    bool interpolateVarying = options.interpolationMode==INTERPOLATE_VARYING;
+    bool interpolateFaceVarying = options.interpolationMode==INTERPOLATE_FACE_VARYING;
+
+    int numControlVertices = !interpolateFaceVarying
+        ? refiner.GetLevel(0).GetNumVertices()
+        : refiner.GetLevel(0).GetNumFVarValues(options.fvarChannel);
+
    int maxlevel = std::min(int(options.maxLevel), refiner.GetMaxLevel());
    if (maxlevel==0 && (! options.generateControlVerts)) {
        StencilTable * result = new StencilTable;
-        result->_numControlVertices = refiner.GetLevel(0).GetNumVertices();
+        result->_numControlVertices = numControlVertices;
        return result;
    }

-    bool interpolateVarying = options.interpolationMode==INTERPOLATE_VARYING;
-    internal::StencilBuilder builder(refiner.GetLevel(0).GetNumVertices(),
+    internal::StencilBuilder builder(numControlVertices,
                                /*genControlVerts*/ true,
                                /*compactWeights*/  true);

@ -94,21 +101,25 @@ StencilTableFactory::Create(TopologyRefiner const & refiner,
    PrimvarRefiner primvarRefiner(refiner);

    internal::StencilBuilder::Index srcIndex(&builder, 0);
-    internal::StencilBuilder::Index dstIndex(&builder, 
-                                    refiner.GetLevel(0).GetNumVertices());
+    internal::StencilBuilder::Index dstIndex(&builder, numControlVertices);

    for (int level=1; level<=maxlevel; ++level) {
-        if (! interpolateVarying) {
+        if (interpolateVertex) {
            primvarRefiner.Interpolate(level, srcIndex, dstIndex);
-        } else {
+        } else if (interpolateVarying) {
            primvarRefiner.InterpolateVarying(level, srcIndex, dstIndex);
+        } else {
+            primvarRefiner.InterpolateFaceVarying(level, srcIndex, dstIndex, options.fvarChannel);
        }

        if (options.factorizeIntermediateLevels) {
            srcIndex = dstIndex;
        }

-        dstIndex = dstIndex[refiner.GetLevel(level).GetNumVertices()];
+        int dstVertex = !interpolateFaceVarying
+            ? refiner.GetLevel(level).GetNumVertices()
+            : refiner.GetLevel(level).GetNumFVarValues(options.fvarChannel);
+        dstIndex = dstIndex[dstVertex];

        if (! options.factorizeIntermediateLevels) {
            // All previous verts are considered as coarse verts, as a
@ -118,14 +129,14 @@ StencilTableFactory::Create(TopologyRefiner const & refiner,
        }
    }

-    size_t firstOffset = refiner.GetLevel(0).GetNumVertices();
+    size_t firstOffset = numControlVertices;
    if (! options.generateIntermediateLevels)
        firstOffset = srcIndex.GetOffset();
 
    // Copy stencils from the StencilBuilder into the StencilTable.
    // Always initialize numControlVertices (useful for torus case)
    StencilTable * result = 
-                        new StencilTable(refiner.GetLevel(0).GetNumVertices(),
+                        new StencilTable(numControlVertices,
                                          builder.GetStencilOffsets(),
                                          builder.GetStencilSizes(),
                                          builder.GetStencilSources(),
@ -210,6 +221,38 @@ StencilTableFactory::AppendLocalPointStencilTable(
    StencilTable const * localPointStencilTable,
    bool factorize) {

+    return appendLocalPointStencilTable(
+        refiner,
+        baseStencilTable,
+        localPointStencilTable,
+        /*channel*/-1,
+        factorize);
+}
+
+StencilTable const *
+StencilTableFactory::AppendLocalPointStencilTableFaceVarying(
+    TopologyRefiner const &refiner,
+    StencilTable const * baseStencilTable,
+    StencilTable const * localPointStencilTable,
+    int channel,
+    bool factorize) {
+
+    return appendLocalPointStencilTable(
+        refiner,
+        baseStencilTable,
+        localPointStencilTable,
+        channel,
+        factorize);
+}
+
+StencilTable const *
+StencilTableFactory::appendLocalPointStencilTable(
+    TopologyRefiner const &refiner,
+    StencilTable const * baseStencilTable,
+    StencilTable const * localPointStencilTable,
+    int channel,
+    bool factorize) {
+
    // factorize and append.
    if (baseStencilTable == NULL ||
        localPointStencilTable == NULL ||
@ -218,14 +261,20 @@ StencilTableFactory::AppendLocalPointStencilTable(
    // baseStencilTable can be built with or without singular stencils
    // (single weight of 1.0f) as place-holders for coarse mesh vertices.

+    int nControlVerts = channel < 0
+        ? refiner.GetLevel(0).GetNumVertices()
+        : refiner.GetLevel(0).GetNumFVarValues(channel);
+
    int controlVertsIndexOffset = 0;
    int nBaseStencils = baseStencilTable->GetNumStencils();
    int nBaseStencilsElements = (int)baseStencilTable->_indices.size();
    {
-        int nverts = refiner.GetNumVerticesTotal();
+        int nverts = channel < 0
+            ? refiner.GetNumVerticesTotal()
+            : refiner.GetNumFVarValuesTotal(channel);
        if (nBaseStencils == nverts) {

-            // the table contain stencils for the control vertices
+            // the table contains stencils for the control vertices
            //
            //  <-----------------  nverts ------------------>
            //
@ -240,7 +289,7 @@ StencilTableFactory::AppendLocalPointStencilTable(
            //
            controlVertsIndexOffset = 0;

-        } else if (nBaseStencils == (nverts -refiner.GetLevel(0).GetNumVertices())) {
+        } else if (nBaseStencils == (nverts - nControlVerts)) {

            // the table does not contain stencils for the control vertices
            //
@ -256,7 +305,7 @@ StencilTableFactory::AppendLocalPointStencilTable(
            //  <-------------->
            //                 controlVertsIndexOffset
            //
-            controlVertsIndexOffset = refiner.GetLevel(0).GetNumVertices();
+            controlVertsIndexOffset = nControlVerts;

        } else {
            // these are not the stencils you are looking for.
@ -265,11 +314,11 @@ StencilTableFactory::AppendLocalPointStencilTable(
        }
    }

-    // copy all local points stencils to proto stencils, and factorize if needed.
+    // copy all local point stencils to proto stencils, and factorize if needed.
    int nLocalPointStencils = localPointStencilTable->GetNumStencils();
    int nLocalPointStencilsElements = 0;

-    internal::StencilBuilder builder(refiner.GetLevel(0).GetNumVertices(),
+    internal::StencilBuilder builder(nControlVerts,
                                /*genControlVerts*/ false,
                                /*compactWeights*/  factorize);
    internal::StencilBuilder::Index origin(&builder, 0);
@ -302,7 +351,7 @@ StencilTableFactory::AppendLocalPointStencilTable(

    // create new stencil table
    StencilTable * result = new StencilTable;
-    result->_numControlVertices = refiner.GetLevel(0).GetNumVertices();
+    result->_numControlVertices = nControlVerts;
    result->resize(nBaseStencils + nLocalPointStencils,
                   nBaseStencilsElements + nLocalPointStencilsElements);

--- a/opensubdiv/far/stencilTableFactory.h
+++ b/opensubdiv/far/stencilTableFactory.h
@ -50,8 +50,9 @@ class StencilTableFactory {
 public:

    enum Mode {
-        INTERPOLATE_VERTEX=0,
-        INTERPOLATE_VARYING
+        INTERPOLATE_VERTEX=0,           ///< vertex primvar stencils
+        INTERPOLATE_VARYING,            ///< varying primvar stencils
+        INTERPOLATE_FACE_VARYING        ///< face-varying primvar stencils
    };

    struct Options {
@ -61,7 +62,8 @@ public:
                    generateControlVerts(false),
                    generateIntermediateLevels(true),
                    factorizeIntermediateLevels(true),
-                    maxLevel(10) { }
+                    maxLevel(10),
+                    fvarChannel(0) { }

        unsigned int interpolationMode           : 2, ///< interpolation mode
                     generateOffsets             : 1, ///< populate optional "_offsets" field
@ -71,6 +73,8 @@ public:
                                                      ///  vertices or from the stencils of the
                                                      ///  previous level
                     maxLevel                    : 4; ///< generate stencils up to 'maxLevel'
+        unsigned int fvarChannel;                     ///< face-varying channel to use
+                                                      ///  when generating face-varying stencils
    };

    /// \brief Instantiates StencilTable from TopologyRefiner that have been
@ -89,9 +93,9 @@ public:


    /// \brief Instantiates StencilTable by concatenating an array of existing
-    ///        stencil table.
+    ///        stencil tables.
    ///
-    /// \note This factory checks that the stencil table point to the same set
+    /// \note This factory checks that the stencil tables point to the same set
    ///       of supporting control vertices - no re-indexing is done.
    ///       GetNumControlVertices() *must* return the same value for all input
    ///       tables.
@ -112,7 +116,7 @@ public:
    /// @param localPointStencilTable
    ///                             StencilTable for the change of basis patch points.
    ///
-    /// @param factorize            If factorize sets to true, endcap stencils will be
+    /// @param factorize            If factorize is set to true, endcap stencils will be
    ///                             factorized with supporting vertices from baseStencil
    ///                             table so that the endcap points can be computed
    ///                             directly from control vertices.
@ -123,10 +127,42 @@ public:
        StencilTable const *localPointStencilTable,
        bool factorize = true);

+    /// \brief Utility function for stencil splicing for local point
+    /// face-varying stencils.
+    ///
+    /// @param refiner              The TopologyRefiner containing the topology
+    ///
+    /// @param baseStencilTable     Input StencilTable for refined vertices
+    ///
+    /// @param localPointStencilTable
+    ///                             StencilTable for the change of basis patch points.
+    ///
+    /// @param channel              face-varying channel
+    ///
+    /// @param factorize            If factorize is set to true, endcap stencils will be
+    ///                             factorized with supporting vertices from baseStencil
+    ///                             table so that the endcap points can be computed
+    ///                             directly from control vertices.
+    ///
+    static StencilTable const * AppendLocalPointStencilTableFaceVarying(
+        TopologyRefiner const &refiner,
+        StencilTable const *baseStencilTable,
+        StencilTable const *localPointStencilTable,
+        int channel = 0,
+        bool factorize = true);
+
 private:

    // Generate stencils for the coarse control-vertices (single weight = 1.0f)
    static void generateControlVertStencils(int numControlVerts, Stencil & dst);
+
+    // Internal method to splice local point stencils
+    static StencilTable const * appendLocalPointStencilTable(
+        TopologyRefiner const &refiner,
+        StencilTable const * baseStencilTable,
+        StencilTable const * localPointStencilTable,
+        int channel,
+        bool factorize);
 };

 /// \brief A specialized factory for LimitStencilTable
--- a/opensubdiv/far/topologyLevel.h
+++ b/opensubdiv/far/topologyLevel.h
@ -43,7 +43,7 @@ namespace Far {
 /// TopologyLevel provides an interface to data in a specific level of a topology hierarchy.
 /// Instances of TopologyLevel are created and owned by a TopologyRefiner,
 /// which will return const-references to them.  Such references are only valid during the
-/// lifetime of TopologyRefiner that created and returned them, and only for a given refinement,
+/// lifetime of the TopologyRefiner that created and returned them, and only for a given refinement,
 /// i.e. if the TopologyRefiner is re-refined, any references to TopoologyLevels are invalidated.
 ///
 class TopologyLevel {
@ -167,7 +167,7 @@ public:
    /// unspecified.
    ///
    /// A face-varying channel is composed of a set of values that may be shared
-    /// by faces meeting at a common vertex.  Just as there are set of vertices
+    /// by faces meeting at a common vertex.  Just as there are sets of vertices
    /// that are associated with faces by index (ranging from 0 to
    /// num-vertices - 1), face-varying values are also referenced by index
    /// (ranging from 0 to num-values -1).
--- a/opensubdiv/far/topologyRefiner.cpp
+++ b/opensubdiv/far/topologyRefiner.cpp
@ -94,7 +94,7 @@ TopologyRefiner::Unrefine() {


 //
-//  Intializing and updating the component inventory:
+//  Initializing and updating the component inventory:
 //
 void
 TopologyRefiner::initializeInventory() {
--- a/opensubdiv/far/topologyRefinerFactory.h
+++ b/opensubdiv/far/topologyRefinerFactory.h
@ -161,7 +161,7 @@ protected:
    ///  the vertices for that face.
    ///
    ///  If a full boundary representation with all neighborhood information is not
-    ///  available, e.g. faces and vertices are avaible but not edges, only the
+    ///  available, e.g. faces and vertices are available but not edges, only the
    ///  face-vertices should be specified.  The remaining topological relationships
    ///  will be constructed later in the assembly (though at greater cost than if
    ///  specified directly).
@ -170,13 +170,13 @@ protected:
    ///  specified in order, i.e. the number of face-vertices for each successive face.
    ///

-    /// \brief Specify the number of vertices to be accomodated
+    /// \brief Specify the number of vertices to be accommodated
    static void setNumBaseVertices(TopologyRefiner & newRefiner, int count);

-    /// \brief Specify the number of faces to be accomodated
+    /// \brief Specify the number of faces to be accommodated
    static void setNumBaseFaces(TopologyRefiner & newRefiner, int count);

-    /// \brief Specify the number of edges to be accomodated
+    /// \brief Specify the number of edges to be accommodated
    static void setNumBaseEdges(TopologyRefiner & newRefiner, int count);

    /// \brief Specify the number of vertices incident each face
@ -260,10 +260,10 @@ protected:
    ///
    /// These methods are used to assign edge or vertex sharpness, for tagging faces
    /// as holes, etc.  Unlike topological assignment, only those components that
-    /// posses a feature of interest need be explicitly assigned.
+    /// possess a feature of interest need be explicitly assigned.
    ///
    /// Since topological construction is largely complete by this point, a method is
-    /// availble to identify an edge for sharpness assignment given a pair of vertices.
+    /// available to identify an edge for sharpness assignment given a pair of vertices.
    ///

    /// \brief Identify an edge to be assigned a sharpness value given a vertex pair
@ -286,7 +286,7 @@ protected:
    /// topology is assigned -- indices for face-varying values are assigned to the
    /// corners of each face just as indices for vertices were assigned.
    ///
-    /// Independent sets of face-varying data is stored in channels.  The identifier
+    /// Independent sets of face-varying data are stored in channels.  The identifier
    /// of each channel (an integer) is expected whenever referring to face-varying
    /// data in any form.
    ///
@ -614,7 +614,7 @@ TopologyRefinerFactory<MESH>::assignComponentTopology(TopologyRefiner& /* refine
    //  or, if the mesh is manifold, explicit assignment of these can be deferred and
    //  all can be determined by calling:
    //
-    //      void populateBaseLocalIndices(TopologyRefiner& newRefiner, )
+    //      void populateBaseLocalIndices(TopologyRefiner& newRefiner)
    //
    //  All components are assumed to be locally manifold and ordering of components in
    //  the above relations is expected to be counter-clockwise.
@ -628,7 +628,7 @@ TopologyRefinerFactory<MESH>::assignComponentTopology(TopologyRefiner& /* refine
    //      void setBaseVertexNonManifold(TopologyRefiner& newRefiner, Index vertex, bool b);
    //
    //  Also consider using TopologyLevel::ValidateTopology() when debugging to ensure
-    //  that topolology has been completely and correctly specified.
+    //  that topology has been completely and correctly specified.
    //
    return false;
 }
--- a/opensubdiv/far/types.h
+++ b/opensubdiv/far/types.h
@ -35,7 +35,7 @@ namespace OPENSUBDIV_VERSION {
 namespace Far {

 //
-//  Typedef's for indices that are inherited from the Vtr level -- eventually
+//  Typedefs for indices that are inherited from the Vtr level -- eventually
 //  these primitive Vtr types may be declared at a lower, more public level.
 //
 typedef Vtr::Index       Index;
--- a/opensubdiv/osd/clD3D11VertexBuffer.h
+++ b/opensubdiv/osd/clD3D11VertexBuffer.h
@ -45,7 +45,7 @@ namespace OPENSUBDIV_VERSION {
 namespace Osd {

 ///
-/// \brief Concrete vertex buffer class for OpenCL subvision and DirectX
+/// \brief Concrete vertex buffer class for OpenCL subdivision and DirectX
 /// drawing.
 ///
 /// D3D11VertexBuffer implements CLVertexBufferInterface and
--- a/opensubdiv/osd/clEvaluator.cpp
+++ b/opensubdiv/osd/clEvaluator.cpp
@ -49,6 +49,10 @@ static const char *patchBasisSource =

 template <class T> cl_mem
 createCLBuffer(std::vector<T> const & src, cl_context clContext) {
+    if (src.empty()) {
+        return NULL;
+    }
+
    cl_int errNum = 0;
    cl_mem devicePtr = clCreateBuffer(clContext,
                                      CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
@ -76,9 +80,11 @@ CLStencilTable::CLStencilTable(Far::StencilTable const *stencilTable,
                                  clContext);
        _weights = createCLBuffer(stencilTable->GetWeights(), clContext);
        _duWeights = _dvWeights = NULL;
+        _duuWeights = _duvWeights = _dvvWeights = NULL;
    } else {
        _sizes = _offsets = _indices = _weights = NULL;
        _duWeights = _dvWeights = NULL;
+        _duuWeights = _duvWeights = _dvvWeights = NULL;
    }
 }

@ -96,9 +102,16 @@ CLStencilTable::CLStencilTable(Far::LimitStencilTable const *limitStencilTable,
            limitStencilTable->GetDuWeights(), clContext);
        _dvWeights = createCLBuffer(
            limitStencilTable->GetDvWeights(), clContext);
+        _duuWeights = createCLBuffer(
+            limitStencilTable->GetDuuWeights(), clContext);
+        _duvWeights = createCLBuffer(
+            limitStencilTable->GetDuvWeights(), clContext);
+        _dvvWeights = createCLBuffer(
+            limitStencilTable->GetDvvWeights(), clContext);
    } else {
        _sizes = _offsets = _indices = _weights = NULL;
        _duWeights = _dvWeights = NULL;
+        _duuWeights = _duvWeights = _dvvWeights = NULL;
    }
 }

@ -109,6 +122,9 @@ CLStencilTable::~CLStencilTable() {
    if (_weights) clReleaseMemObject(_weights);
    if (_duWeights) clReleaseMemObject(_duWeights);
    if (_dvWeights) clReleaseMemObject(_dvWeights);
+    if (_duuWeights) clReleaseMemObject(_duuWeights);
+    if (_duvWeights) clReleaseMemObject(_duvWeights);
+    if (_dvvWeights) clReleaseMemObject(_dvvWeights);
 }

 // ---------------------------------------------------------------------------
@ -130,7 +146,10 @@ bool
 CLEvaluator::Compile(BufferDescriptor const &srcDesc,
                     BufferDescriptor const &dstDesc,
                     BufferDescriptor const & /*duDesc*/,
-                     BufferDescriptor const & /*dvDesc*/) {
+                     BufferDescriptor const & /*dvDesc*/,
+                     BufferDescriptor const & /*duuDesc*/,
+                     BufferDescriptor const & /*duvDesc*/,
+                     BufferDescriptor const & /*dvvDesc*/) {
    if (srcDesc.length > dstDesc.length) {
        Far::Error(Far::FAR_RUNTIME_ERROR,
                   "srcDesc length must be less than or equal to "
@ -263,6 +282,7 @@ CLEvaluator::EvalStencils(cl_mem src, BufferDescriptor const &srcDesc,

    size_t globalWorkSize = (size_t)(end - start);

+    BufferDescriptor empty;
    clSetKernelArg(_stencilDerivKernel,  0, sizeof(cl_mem), &src);
    clSetKernelArg(_stencilDerivKernel,  1, sizeof(int), &srcDesc.offset);
    clSetKernelArg(_stencilDerivKernel,  2, sizeof(cl_mem), &dst);
@ -273,14 +293,26 @@ CLEvaluator::EvalStencils(cl_mem src, BufferDescriptor const &srcDesc,
    clSetKernelArg(_stencilDerivKernel,  7, sizeof(cl_mem), &dv);
    clSetKernelArg(_stencilDerivKernel,  8, sizeof(int), &dvDesc.offset);
    clSetKernelArg(_stencilDerivKernel,  9, sizeof(int), &dvDesc.stride);
-    clSetKernelArg(_stencilDerivKernel, 10, sizeof(cl_mem), &sizes);
-    clSetKernelArg(_stencilDerivKernel, 11, sizeof(cl_mem), &offsets);
-    clSetKernelArg(_stencilDerivKernel, 12, sizeof(cl_mem), &indices);
-    clSetKernelArg(_stencilDerivKernel, 13, sizeof(cl_mem), &weights);
-    clSetKernelArg(_stencilDerivKernel, 14, sizeof(cl_mem), &duWeights);
-    clSetKernelArg(_stencilDerivKernel, 15, sizeof(cl_mem), &dvWeights);
-    clSetKernelArg(_stencilDerivKernel, 16, sizeof(int), &start);
-    clSetKernelArg(_stencilDerivKernel, 17, sizeof(int), &end);
+    clSetKernelArg(_stencilDerivKernel, 10, sizeof(cl_mem), NULL);
+    clSetKernelArg(_stencilDerivKernel, 11, sizeof(int), &empty.offset);
+    clSetKernelArg(_stencilDerivKernel, 12, sizeof(int), &empty.stride);
+    clSetKernelArg(_stencilDerivKernel, 13, sizeof(cl_mem), NULL);
+    clSetKernelArg(_stencilDerivKernel, 14, sizeof(int), &empty.offset);
+    clSetKernelArg(_stencilDerivKernel, 15, sizeof(int), &empty.stride);
+    clSetKernelArg(_stencilDerivKernel, 16, sizeof(cl_mem), NULL);
+    clSetKernelArg(_stencilDerivKernel, 17, sizeof(int), &empty.offset);
+    clSetKernelArg(_stencilDerivKernel, 18, sizeof(int), &empty.stride);
+    clSetKernelArg(_stencilDerivKernel, 19, sizeof(cl_mem), &sizes);
+    clSetKernelArg(_stencilDerivKernel, 20, sizeof(cl_mem), &offsets);
+    clSetKernelArg(_stencilDerivKernel, 21, sizeof(cl_mem), &indices);
+    clSetKernelArg(_stencilDerivKernel, 22, sizeof(cl_mem), &weights);
+    clSetKernelArg(_stencilDerivKernel, 23, sizeof(cl_mem), &duWeights);
+    clSetKernelArg(_stencilDerivKernel, 24, sizeof(cl_mem), &dvWeights);
+    clSetKernelArg(_stencilDerivKernel, 25, sizeof(cl_mem), NULL);
+    clSetKernelArg(_stencilDerivKernel, 26, sizeof(cl_mem), NULL);
+    clSetKernelArg(_stencilDerivKernel, 27, sizeof(cl_mem), NULL);
+    clSetKernelArg(_stencilDerivKernel, 28, sizeof(int), &start);
+    clSetKernelArg(_stencilDerivKernel, 29, sizeof(int), &end);

    cl_int errNum = clEnqueueNDRangeKernel(
        _clCommandQueue, _stencilDerivKernel, 1, NULL,
@ -292,8 +324,79 @@ CLEvaluator::EvalStencils(cl_mem src, BufferDescriptor const &srcDesc,
        return false;
    }

-    if (endEvent == NULL)
-    {
+    if (endEvent == NULL) {
+        clFinish(_clCommandQueue);
+    }
+    return true;
+}
+
+bool
+CLEvaluator::EvalStencils(cl_mem src, BufferDescriptor const &srcDesc,
+                          cl_mem dst, BufferDescriptor const &dstDesc,
+                          cl_mem du,  BufferDescriptor const &duDesc,
+                          cl_mem dv,  BufferDescriptor const &dvDesc,
+                          cl_mem duu, BufferDescriptor const &duuDesc,
+                          cl_mem duv, BufferDescriptor const &duvDesc,
+                          cl_mem dvv, BufferDescriptor const &dvvDesc,
+                          cl_mem sizes,
+                          cl_mem offsets,
+                          cl_mem indices,
+                          cl_mem weights,
+                          cl_mem duWeights,
+                          cl_mem dvWeights,
+                          cl_mem duuWeights,
+                          cl_mem duvWeights,
+                          cl_mem dvvWeights,
+                          int start, int end,
+                          unsigned int numStartEvents,
+                          const cl_event* startEvents,
+                          cl_event* endEvent) const {
+    if (end <= start) return true;
+
+    size_t globalWorkSize = (size_t)(end - start);
+
+    clSetKernelArg(_stencilDerivKernel,  0, sizeof(cl_mem), &src);
+    clSetKernelArg(_stencilDerivKernel,  1, sizeof(int), &srcDesc.offset);
+    clSetKernelArg(_stencilDerivKernel,  2, sizeof(cl_mem), &dst);
+    clSetKernelArg(_stencilDerivKernel,  3, sizeof(int), &dstDesc.offset);
+    clSetKernelArg(_stencilDerivKernel,  4, sizeof(cl_mem), &du);
+    clSetKernelArg(_stencilDerivKernel,  5, sizeof(int), &duDesc.offset);
+    clSetKernelArg(_stencilDerivKernel,  6, sizeof(int), &duDesc.stride);
+    clSetKernelArg(_stencilDerivKernel,  7, sizeof(cl_mem), &dv);
+    clSetKernelArg(_stencilDerivKernel,  8, sizeof(int), &dvDesc.offset);
+    clSetKernelArg(_stencilDerivKernel,  9, sizeof(int), &dvDesc.stride);
+    clSetKernelArg(_stencilDerivKernel, 10, sizeof(cl_mem), &duu);
+    clSetKernelArg(_stencilDerivKernel, 11, sizeof(int), &duuDesc.offset);
+    clSetKernelArg(_stencilDerivKernel, 12, sizeof(int), &duuDesc.stride);
+    clSetKernelArg(_stencilDerivKernel, 13, sizeof(cl_mem), &duv);
+    clSetKernelArg(_stencilDerivKernel, 14, sizeof(int), &duvDesc.offset);
+    clSetKernelArg(_stencilDerivKernel, 15, sizeof(int), &duvDesc.stride);
+    clSetKernelArg(_stencilDerivKernel, 16, sizeof(cl_mem), &dvv);
+    clSetKernelArg(_stencilDerivKernel, 17, sizeof(int), &dvvDesc.offset);
+    clSetKernelArg(_stencilDerivKernel, 18, sizeof(int), &dvvDesc.stride);
+    clSetKernelArg(_stencilDerivKernel, 19, sizeof(cl_mem), &sizes);
+    clSetKernelArg(_stencilDerivKernel, 20, sizeof(cl_mem), &offsets);
+    clSetKernelArg(_stencilDerivKernel, 21, sizeof(cl_mem), &indices);
+    clSetKernelArg(_stencilDerivKernel, 22, sizeof(cl_mem), &weights);
+    clSetKernelArg(_stencilDerivKernel, 23, sizeof(cl_mem), &duWeights);
+    clSetKernelArg(_stencilDerivKernel, 24, sizeof(cl_mem), &dvWeights);
+    clSetKernelArg(_stencilDerivKernel, 25, sizeof(cl_mem), &duuWeights);
+    clSetKernelArg(_stencilDerivKernel, 26, sizeof(cl_mem), &duvWeights);
+    clSetKernelArg(_stencilDerivKernel, 27, sizeof(cl_mem), &dvvWeights);
+    clSetKernelArg(_stencilDerivKernel, 28, sizeof(int), &start);
+    clSetKernelArg(_stencilDerivKernel, 29, sizeof(int), &end);
+
+    cl_int errNum = clEnqueueNDRangeKernel(
+        _clCommandQueue, _stencilDerivKernel, 1, NULL,
+        &globalWorkSize, NULL, numStartEvents, startEvents, endEvent);
+
+    if (errNum != CL_SUCCESS) {
+        Far::Error(Far::FAR_RUNTIME_ERROR,
+                   "ApplyStencilKernel (%d) ", errNum);
+        return false;
+    }
+
+    if (endEvent == NULL) {
        clFinish(_clCommandQueue);
    }
    return true;
@ -315,6 +418,7 @@ CLEvaluator::EvalPatches(cl_mem src, BufferDescriptor const &srcDesc,

    size_t globalWorkSize = (size_t)(numPatchCoords);

+    BufferDescriptor empty;
    clSetKernelArg(_patchKernel,  0, sizeof(cl_mem), &src);
    clSetKernelArg(_patchKernel,  1, sizeof(int),    &srcDesc.offset);
    clSetKernelArg(_patchKernel,  2, sizeof(cl_mem), &dst);
@ -325,10 +429,19 @@ CLEvaluator::EvalPatches(cl_mem src, BufferDescriptor const &srcDesc,
    clSetKernelArg(_patchKernel,  7, sizeof(cl_mem), &dv);
    clSetKernelArg(_patchKernel,  8, sizeof(int),    &dvDesc.offset);
    clSetKernelArg(_patchKernel,  9, sizeof(int),    &dvDesc.stride);
-    clSetKernelArg(_patchKernel, 10, sizeof(cl_mem), &patchCoordsBuffer);
-    clSetKernelArg(_patchKernel, 11, sizeof(cl_mem), &patchArrayBuffer);
-    clSetKernelArg(_patchKernel, 12, sizeof(cl_mem), &patchIndexBuffer);
-    clSetKernelArg(_patchKernel, 13, sizeof(cl_mem), &patchParamBuffer);
+    clSetKernelArg(_patchKernel, 10, sizeof(cl_mem), NULL);
+    clSetKernelArg(_patchKernel, 11, sizeof(int),    &empty.offset);
+    clSetKernelArg(_patchKernel, 12, sizeof(int),    &empty.stride);
+    clSetKernelArg(_patchKernel, 13, sizeof(cl_mem), NULL);
+    clSetKernelArg(_patchKernel, 14, sizeof(int),    &empty.offset);
+    clSetKernelArg(_patchKernel, 15, sizeof(int),    &empty.stride);
+    clSetKernelArg(_patchKernel, 16, sizeof(cl_mem), NULL);
+    clSetKernelArg(_patchKernel, 17, sizeof(int),    &empty.offset);
+    clSetKernelArg(_patchKernel, 18, sizeof(int),    &empty.stride);
+    clSetKernelArg(_patchKernel, 19, sizeof(cl_mem), &patchCoordsBuffer);
+    clSetKernelArg(_patchKernel, 20, sizeof(cl_mem), &patchArrayBuffer);
+    clSetKernelArg(_patchKernel, 21, sizeof(cl_mem), &patchIndexBuffer);
+    clSetKernelArg(_patchKernel, 22, sizeof(cl_mem), &patchParamBuffer);

    cl_int errNum = clEnqueueNDRangeKernel(
        _clCommandQueue, _patchKernel, 1, NULL,
@ -340,13 +453,70 @@ CLEvaluator::EvalPatches(cl_mem src, BufferDescriptor const &srcDesc,
        return false;
    }

-    if (endEvent == NULL)
-    {
+    if (endEvent == NULL) {
        clFinish(_clCommandQueue);
    }
    return true;
 }

+// Dispatches _patchKernel with one work item per patch coordinate, writing
+// the evaluated values to dst and the first (du, dv) and second
+// (duu, duv, dvv) derivatives to their respective buffers. Any of the output
+// buffers may be NULL, in which case the kernel skips that output.
+//
+// Returns true on success (including the trivial case of no patch coords)
+// and false if the kernel could not be enqueued. When endEvent is NULL the
+// call blocks until the command queue has finished; otherwise completion is
+// signalled through endEvent.
+bool
+CLEvaluator::EvalPatches(cl_mem src, BufferDescriptor const &srcDesc,
+                         cl_mem dst, BufferDescriptor const &dstDesc,
+                         cl_mem du,  BufferDescriptor const &duDesc,
+                         cl_mem dv,  BufferDescriptor const &dvDesc,
+                         cl_mem duu, BufferDescriptor const &duuDesc,
+                         cl_mem duv, BufferDescriptor const &duvDesc,
+                         cl_mem dvv, BufferDescriptor const &dvvDesc,
+                         int numPatchCoords,
+                         cl_mem patchCoordsBuffer,
+                         cl_mem patchArrayBuffer,
+                         cl_mem patchIndexBuffer,
+                         cl_mem patchParamBuffer,
+                         unsigned int numStartEvents,
+                         const cl_event* startEvents,
+                         cl_event* endEvent) const {
+
+    // Nothing to evaluate. A zero global work size is rejected by
+    // clEnqueueNDRangeKernel on OpenCL versions before 2.1, and a negative
+    // count would wrap to a huge size_t, so bail out before dispatching
+    // (mirrors the empty-range early return in EvalStencils).
+    if (numPatchCoords <= 0) return true;
+
+    size_t globalWorkSize = (size_t)(numPatchCoords);
+
+    // Argument indices follow the parameter order of the computePatches
+    // kernel: source, destination, the five derivative outputs (buffer,
+    // offset, stride each), then the patch description buffers.
+    clSetKernelArg(_patchKernel,  0, sizeof(cl_mem), &src);
+    clSetKernelArg(_patchKernel,  1, sizeof(int),    &srcDesc.offset);
+    clSetKernelArg(_patchKernel,  2, sizeof(cl_mem), &dst);
+    clSetKernelArg(_patchKernel,  3, sizeof(int),    &dstDesc.offset);
+    clSetKernelArg(_patchKernel,  4, sizeof(cl_mem), &du);
+    clSetKernelArg(_patchKernel,  5, sizeof(int),    &duDesc.offset);
+    clSetKernelArg(_patchKernel,  6, sizeof(int),    &duDesc.stride);
+    clSetKernelArg(_patchKernel,  7, sizeof(cl_mem), &dv);
+    clSetKernelArg(_patchKernel,  8, sizeof(int),    &dvDesc.offset);
+    clSetKernelArg(_patchKernel,  9, sizeof(int),    &dvDesc.stride);
+    clSetKernelArg(_patchKernel, 10, sizeof(cl_mem), &duu);
+    clSetKernelArg(_patchKernel, 11, sizeof(int),    &duuDesc.offset);
+    clSetKernelArg(_patchKernel, 12, sizeof(int),    &duuDesc.stride);
+    clSetKernelArg(_patchKernel, 13, sizeof(cl_mem), &duv);
+    clSetKernelArg(_patchKernel, 14, sizeof(int),    &duvDesc.offset);
+    clSetKernelArg(_patchKernel, 15, sizeof(int),    &duvDesc.stride);
+    clSetKernelArg(_patchKernel, 16, sizeof(cl_mem), &dvv);
+    clSetKernelArg(_patchKernel, 17, sizeof(int),    &dvvDesc.offset);
+    clSetKernelArg(_patchKernel, 18, sizeof(int),    &dvvDesc.stride);
+    clSetKernelArg(_patchKernel, 19, sizeof(cl_mem), &patchCoordsBuffer);
+    clSetKernelArg(_patchKernel, 20, sizeof(cl_mem), &patchArrayBuffer);
+    clSetKernelArg(_patchKernel, 21, sizeof(cl_mem), &patchIndexBuffer);
+    clSetKernelArg(_patchKernel, 22, sizeof(cl_mem), &patchParamBuffer);
+
+    cl_int errNum = clEnqueueNDRangeKernel(
+        _clCommandQueue, _patchKernel, 1, NULL,
+        &globalWorkSize, NULL, numStartEvents, startEvents, endEvent);
+
+    if (errNum != CL_SUCCESS) {
+        Far::Error(Far::FAR_RUNTIME_ERROR,
+                   "ApplyPatchKernel (%d) ", errNum);
+        return false;
+    }
+
+    // Without an event for the caller to wait on, make the call synchronous.
+    if (endEvent == NULL) {
+        clFinish(_clCommandQueue);
+    }
+    return true;
+}


 /* static */
--- a/opensubdiv/osd/clEvaluator.h
+++ b/opensubdiv/osd/clEvaluator.h
--- a/opensubdiv/osd/clGLVertexBuffer.h
+++ b/opensubdiv/osd/clGLVertexBuffer.h
@ -36,7 +36,7 @@ namespace OPENSUBDIV_VERSION {
 namespace Osd {

 ///
-/// \brief Concrete vertex buffer class for OpenCL subvision and OpenGL drawing.
+/// \brief Concrete vertex buffer class for OpenCL subdivision and OpenGL drawing.
 ///
 /// CLGLVertexBuffer implements CLVertexBufferInterface and
 /// GLVertexBufferInterface.
@ -93,7 +93,7 @@ protected:
    /// Returns true if success.
    bool allocate(cl_context clContext);

-    /// Acqures a resource from GL.
+    /// Acquires a resource from GL.
    void map(cl_command_queue queue);

    /// Releases a resource to GL.
--- a/opensubdiv/osd/clKernel.cl
+++ b/opensubdiv/osd/clKernel.cl
@ -99,12 +99,18 @@ __kernel void computeStencilsDerivatives(
    __global float * dst, int dstOffset,
    __global float * du,  int duOffset, int duStride,
    __global float * dv,  int dvOffset, int dvStride,
+    __global float * duu, int duuOffset, int duuStride,
+    __global float * duv, int duvOffset, int duvStride,
+    __global float * dvv, int dvvOffset, int dvvStride,
    __global int * sizes,
    __global int * offsets,
    __global int * indices,
    __global float * weights,
    __global float * duWeights,
    __global float * dvWeights,
+    __global float * duuWeights,
+    __global float * duvWeights,
+    __global float * dvvWeights,
    int batchStart, int batchEnd) {

    int current = get_global_id(0) + batchStart;
@ -113,10 +119,13 @@ __kernel void computeStencilsDerivatives(
        return;
    }

-    struct Vertex v, vdu, vdv;
+    struct Vertex v, vdu, vdv, vduu, vduv, vdvv;
    clear(&v);
    clear(&vdu);
    clear(&vdv);
+    clear(&vduu);
+    clear(&vduv);
+    clear(&vdvv);

    int size = sizes[current],
        offset = offsets[current];
@ -125,6 +134,9 @@ __kernel void computeStencilsDerivatives(
    if (dst) dst += dstOffset;
    if (du)  du  += duOffset;
    if (dv)  dv  += dvOffset;
+    if (duu) duu += duuOffset;
+    if (duv) duv += duvOffset;
+    if (dvv) dvv += dvvOffset;

    for (int i=0; i<size; ++i) {
        int ofs = offset + i;
@ -132,11 +144,17 @@ __kernel void computeStencilsDerivatives(
        if (weights)   addWithWeight(  &v, src, vid,   weights[ofs]);
        if (duWeights) addWithWeight(&vdu, src, vid, duWeights[ofs]);
        if (dvWeights) addWithWeight(&vdv, src, vid, dvWeights[ofs]);
+        if (duuWeights) addWithWeight(&vduu, src, vid, duuWeights[ofs]);
+        if (duvWeights) addWithWeight(&vduv, src, vid, duvWeights[ofs]);
+        if (dvvWeights) addWithWeight(&vdvv, src, vid, dvvWeights[ofs]);
    }

    if (dst) writeVertex      (dst, current, &v);
    if (du)  writeVertexStride(du,  current, &vdu, duStride);
    if (dv)  writeVertexStride(dv,  current, &vdv, dvStride);
+    if (duu) writeVertexStride(duu, current, &vduu, duuStride);
+    if (duv) writeVertexStride(duv, current, &vduv, duvStride);
+    if (dvv) writeVertexStride(dvv, current, &vdvv, dvvStride);
 }

 // ---------------------------------------------------------------------------
@ -205,6 +223,9 @@ __kernel void computePatches(__global float *src, int srcOffset,
                             __global float *dst, int dstOffset,
                             __global float *du,  int duOffset, int duStride,
                             __global float *dv,  int dvOffset, int dvStride,
+                             __global float *duu, int duuOffset, int duuStride,
+                             __global float *duv, int duvOffset, int duvStride,
+                             __global float *dvv, int dvvOffset, int dvvStride,
                             __global struct PatchCoord *patchCoords,
                             __global struct PatchArray *patchArrayBuffer,
                             __global int *patchIndexBuffer,
@ -215,6 +236,9 @@ __kernel void computePatches(__global float *src, int srcOffset,
    if (dst) dst += dstOffset;
    if (du)  du  += duOffset;
    if (dv)  dv  += dvOffset;
+    if (duu) duu += duuOffset;
+    if (duv) duv += duvOffset;
+    if (dvv) dvv += dvvOffset;

    struct PatchCoord coord = patchCoords[current];
    struct PatchArray array = patchArrayBuffer[coord.arrayIndex];
@ -274,5 +298,31 @@ __kernel void computePatches(__global float *src, int srcOffset,
        }
        writeVertexStride(dv, current, &vdv, dvStride);
    }
-
+    if (duu) {
+        struct Vertex vduu;
+        clear(&vduu);
+        for (int i = 0; i < numControlVertices; ++i) {
+            int index = patchIndexBuffer[indexBase + i];
+            addWithWeight(&vduu, src, index, wDss[i]);
+        }
+        writeVertexStride(duu, current, &vduu, duuStride);
+    }
+    if (duv) {
+        struct Vertex vduv;
+        clear(&vduv);
+        for (int i = 0; i < numControlVertices; ++i) {
+            int index = patchIndexBuffer[indexBase + i];
+            addWithWeight(&vduv, src, index, wDst[i]);
+        }
+        writeVertexStride(duv, current, &vduv, duvStride);
+    }
+    if (dvv) {
+        struct Vertex vdvv;
+        clear(&vdvv);
+        for (int i = 0; i < numControlVertices; ++i) {
+            int index = patchIndexBuffer[indexBase + i];
+            addWithWeight(&vdvv, src, index, wDtt[i]);
+        }
+        writeVertexStride(dvv, current, &vdvv, dvvStride);
+    }
 }
--- a/opensubdiv/osd/clVertexBuffer.h
+++ b/opensubdiv/osd/clVertexBuffer.h
@ -34,7 +34,7 @@ namespace OPENSUBDIV_VERSION {
 namespace Osd {

 ///
-/// \brief Concrete vertex buffer class for OpenCL subvision.
+/// \brief Concrete vertex buffer class for OpenCL subdivision.
 ///
 /// CLVertexBuffer implements CLVertexBufferInterface. An instance of this
 /// buffer class can be passed to CLEvaluator
--- a/opensubdiv/osd/cpuD3D11VertexBuffer.h
+++ b/opensubdiv/osd/cpuD3D11VertexBuffer.h
@ -38,7 +38,7 @@ namespace OPENSUBDIV_VERSION {
 namespace Osd {

 ///
-/// \brief Concrete vertex buffer class for Cpu subvision and DirectX drawing.
+/// \brief Concrete vertex buffer class for Cpu subdivision and DirectX drawing.
 ///
 /// CpuD3D11VertexBuffer implements CpuVertexBufferInterface and
 /// D3D11VertexBufferInterface.
--- a/opensubdiv/osd/cpuEvaluator.cpp
+++ b/opensubdiv/osd/cpuEvaluator.cpp
@ -82,6 +82,48 @@ CpuEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
    return true;
 }

+/* static */
+bool
+CpuEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
+                           float *dst,       BufferDescriptor const &dstDesc,
+                           float *du,        BufferDescriptor const &duDesc,
+                           float *dv,        BufferDescriptor const &dvDesc,
+                           float *duu,       BufferDescriptor const &duuDesc,
+                           float *duv,       BufferDescriptor const &duvDesc,
+                           float *dvv,       BufferDescriptor const &dvvDesc,
+                           const int * sizes,
+                           const int * offsets,
+                           const int * indices,
+                           const float * weights,
+                           const float * duWeights,
+                           const float * dvWeights,
+                           const float * duuWeights,
+                           const float * duvWeights,
+                           const float * dvvWeights,
+                           int start, int end) {
+    // An empty stencil range is trivially successful.
+    if (start >= end) {
+        return true;
+    }
+
+    // All six output descriptors must describe elements of the same length
+    // as the source; otherwise the request is rejected without evaluating.
+    if (srcDesc.length != dstDesc.length ||
+        srcDesc.length != duDesc.length  ||
+        srcDesc.length != dvDesc.length  ||
+        srcDesc.length != duuDesc.length ||
+        srcDesc.length != duvDesc.length ||
+        srcDesc.length != dvvDesc.length) {
+        return false;
+    }
+
+    // Hand the validated buffers to the CPU stencil kernel, which evaluates
+    // the value plus first and second derivatives for [start, end).
+    CpuEvalStencils(src, srcDesc,
+                    dst, dstDesc,
+                    du,  duDesc,
+                    dv,  dvDesc,
+                    duu, duuDesc,
+                    duv, duvDesc,
+                    dvv, dvvDesc,
+                    sizes, offsets, indices,
+                    weights,
+                    duWeights, dvWeights,
+                    duuWeights, duvWeights, dvvWeights,
+                    start, end);
+    return true;
+}
+
 template <typename T>
 struct BufferAdapter {
    BufferAdapter(T *p, int length, int stride) :
@ -264,6 +306,120 @@ CpuEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
    return true;
 }

+/* static */
+bool
+CpuEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
+                          float *dst,       BufferDescriptor const &dstDesc,
+                          float *du,        BufferDescriptor const &duDesc,
+                          float *dv,        BufferDescriptor const &dvDesc,
+                          float *duu,       BufferDescriptor const &duuDesc,
+                          float *duv,       BufferDescriptor const &duvDesc,
+                          float *dvv,       BufferDescriptor const &dvvDesc,
+                          int numPatchCoords,
+                          const PatchCoord *patchCoords,
+                          const PatchArray *patchArrays,
+                          const int *patchIndexBuffer,
+                          const PatchParam *patchParamBuffer) {
+    // A source buffer is mandatory; every output buffer is optional.
+    if (!src) {
+        return false;
+    }
+    src += srcDesc.offset;
+
+    // Each output that was supplied must match the source element length and
+    // is then advanced to the offset given by its descriptor.
+    if (dst) {
+        if (dstDesc.length != srcDesc.length) return false;
+        dst += dstDesc.offset;
+    }
+    if (du) {
+        if (duDesc.length != srcDesc.length) return false;
+        du += duDesc.offset;
+    }
+    if (dv) {
+        if (dvDesc.length != srcDesc.length) return false;
+        dv += dvDesc.offset;
+    }
+    if (duu) {
+        if (duuDesc.length != srcDesc.length) return false;
+        duu += duuDesc.offset;
+    }
+    if (duv) {
+        if (duvDesc.length != srcDesc.length) return false;
+        duv += duvDesc.offset;
+    }
+    if (dvv) {
+        if (dvvDesc.length != srcDesc.length) return false;
+        dvv += dvvDesc.offset;
+    }
+
+    BufferAdapter<const float> srcT(src, srcDesc.length, srcDesc.stride);
+    BufferAdapter<float>       dstT(dst, dstDesc.length, dstDesc.stride);
+    BufferAdapter<float>       duT(du,   duDesc.length,  duDesc.stride);
+    BufferAdapter<float>       dvT(dv,   dvDesc.length,  dvDesc.stride);
+    BufferAdapter<float>       duuT(duu, duuDesc.length, duuDesc.stride);
+    BufferAdapter<float>       duvT(duv, duvDesc.length, duvDesc.stride);
+    BufferAdapter<float>       dvvT(dvv, dvvDesc.length, dvvDesc.stride);
+
+    // Basis weights for the value and each derivative; 20 entries is the
+    // largest control-vertex count used below (Gregory basis).
+    float wP[20];
+    float wDu[20], wDv[20];
+    float wDuu[20], wDuv[20], wDvv[20];
+
+    for (int coordIndex = 0; coordIndex < numPatchCoords; ++coordIndex) {
+        PatchCoord const &coord = patchCoords[coordIndex];
+        PatchArray const &array = patchArrays[coord.handle.arrayIndex];
+        Far::PatchParam const &param =
+            patchParamBuffer[coord.handle.patchIndex];
+
+        // A patch flagged regular is evaluated as a B-spline regardless of
+        // the type recorded on its patch array.
+        int patchType;
+        if (param.IsRegular()) {
+            patchType = Far::PatchDescriptor::REGULAR;
+        } else {
+            patchType = array.GetPatchType();
+        }
+
+        // Compute the basis weights and the number of control vertices that
+        // contribute for this patch type.
+        int numControlVertices = 0;
+        switch (patchType) {
+        case Far::PatchDescriptor::REGULAR:
+            Far::internal::GetBSplineWeights(param,
+                                             coord.s, coord.t, wP, wDu, wDv,
+                                             wDuu, wDuv, wDvv);
+            numControlVertices = 16;
+            break;
+        case Far::PatchDescriptor::GREGORY_BASIS:
+            Far::internal::GetGregoryWeights(param,
+                                             coord.s, coord.t, wP, wDu, wDv,
+                                             wDuu, wDuv, wDvv);
+            numControlVertices = 20;
+            break;
+        case Far::PatchDescriptor::QUADS:
+            Far::internal::GetBilinearWeights(param,
+                                              coord.s, coord.t, wP, wDu, wDv,
+                                              wDuu, wDuv, wDvv);
+            numControlVertices = 4;
+            break;
+        default:
+            // Unsupported patch type.
+            assert(0);
+            break;
+        }
+
+        // The index stride comes from the array's own patch type, not the
+        // (possibly overridden) type used for evaluation above.
+        int indexStride =
+            Far::PatchDescriptor(array.GetPatchType()).GetNumControlVertices();
+        int indexBase = array.GetIndexBase() + indexStride *
+                (coord.handle.patchIndex - array.GetPrimitiveIdBase());
+
+        const int *cvs = &patchIndexBuffer[indexBase];
+
+        // Reset the current output elements, then accumulate the weighted
+        // control vertices into each of them.
+        dstT.Clear();
+        duT.Clear();
+        dvT.Clear();
+        duuT.Clear();
+        duvT.Clear();
+        dvvT.Clear();
+        for (int cv = 0; cv < numControlVertices; ++cv) {
+            const int srcIndex = cvs[cv];
+            dstT.AddWithWeight(srcT[srcIndex], wP[cv]);
+            duT.AddWithWeight(srcT[srcIndex], wDu[cv]);
+            dvT.AddWithWeight(srcT[srcIndex], wDv[cv]);
+            duuT.AddWithWeight(srcT[srcIndex], wDuu[cv]);
+            duvT.AddWithWeight(srcT[srcIndex], wDuv[cv]);
+            dvvT.AddWithWeight(srcT[srcIndex], wDvv[cv]);
+        }
+
+        // Step every output adapter to the element for the next coordinate.
+        ++dstT;
+        ++duT;
+        ++dvT;
+        ++duuT;
+        ++duvT;
+        ++dvvT;
+    }
+    return true;
+}
+

 }  // end namespace Osd

--- a/opensubdiv/osd/cpuEvaluator.h
+++ b/opensubdiv/osd/cpuEvaluator.h
@ -26,11 +26,11 @@
 #define OPENSUBDIV3_OSD_CPU_EVALUATOR_H

 #include "../version.h"
-
-#include <cstddef>
 #include "../osd/bufferDescriptor.h"
 #include "../osd/types.h"

+#include <cstddef>
+
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {

@ -107,7 +107,6 @@ public:
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
    /// @param sizes          pointer to the sizes buffer of the stencil table
-    ///                       to apply for the range [start, end)
    ///
    /// @param offsets        pointer to the offsets buffer of the stencil table
    ///
@ -145,17 +144,17 @@ public:
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
-    /// @param duBuffer       Output U-derivative buffer
+    /// @param duBuffer       Output buffer derivative wrt u
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
-    /// @param duDesc         vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
    ///
-    /// @param dvBuffer       Output V-derivative buffer
+    /// @param dvBuffer       Output buffer derivative wrt v
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
-    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
    ///
    /// @param stencilTable   Far::StencilTable or equivalent
    ///
@ -206,15 +205,15 @@ public:
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
-    /// @param du             Output U-derivatives pointer. An offset of
+    /// @param du             Output pointer derivative wrt u. An offset of
    ///                       duDesc will be applied internally.
    ///
-    /// @param duDesc         vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
    ///
-    /// @param dv             Output V-derivatives pointer. An offset of
+    /// @param dv             Output pointer derivative wrt v. An offset of
    ///                       dvDesc will be applied internally.
    ///
-    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
    ///
    /// @param sizes          pointer to the sizes buffer of the stencil table
    ///
@ -245,6 +244,177 @@ public:
        const float * dvWeights,
        int start, int end);

+    /// \brief Generic static eval stencils function with 1st and 2nd
+    ///        derivatives. This function has the same signature as the
+    ///        other device kernels so that it can be called in the same
+    ///        way from the OsdMesh template interface.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer       Output buffer for the 1st derivative wrt u
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer       Output buffer for the 1st derivative wrt v
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer      Output buffer for the 2nd derivative wrt u
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duuDesc        vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer      Output buffer for the 2nd derivative wrt u and v
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duvDesc        vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer      Output buffer for the 2nd derivative wrt v
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvvDesc        vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param stencilTable   Far::StencilTable or equivalent
+    ///
+    /// @param instance       not used in the cpu kernel
+    ///                       (declared as a typed pointer to prevent
+    ///                        undesirable template resolution)
+    ///
+    /// @param deviceContext  not used in the cpu kernel
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        STENCIL_TABLE const *stencilTable,
+        const CpuEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // not used by the cpu kernel
+        (void)deviceContext;  // not used by the cpu kernel
+        // NOTE(review): &...[0] below presumably needs non-empty arrays - confirm.
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
+                            duBuffer->BindCpuBuffer(),  duDesc,
+                            dvBuffer->BindCpuBuffer(),  dvDesc,
+                            duuBuffer->BindCpuBuffer(), duuDesc,
+                            duvBuffer->BindCpuBuffer(), duvDesc,
+                            dvvBuffer->BindCpuBuffer(), dvvDesc,
+                            &stencilTable->GetSizes()[0],
+                            &stencilTable->GetOffsets()[0],
+                            &stencilTable->GetControlIndices()[0],
+                            &stencilTable->GetWeights()[0],
+                            &stencilTable->GetDuWeights()[0],
+                            &stencilTable->GetDvWeights()[0],
+                            &stencilTable->GetDuuWeights()[0],
+                            &stencilTable->GetDuvWeights()[0],
+                            &stencilTable->GetDvvWeights()[0],
+                            /*start = */ 0,
+                            /*end   = */ stencilTable->GetNumStencils());
+    }
+
+    /// \brief Static eval stencils function with 1st and 2nd derivatives,
+    ///        which takes raw CPU pointers for input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param du             Output pointer for the 1st derivative wrt u.
+    ///                       An offset of duDesc will be applied internally.
+    ///
+    /// @param duDesc         vertex buffer descriptor for the du output
+    ///
+    /// @param dv             Output pointer for the 1st derivative wrt v.
+    ///                       An offset of dvDesc will be applied internally.
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dv output
+    ///
+    /// @param duu            Output pointer for the 2nd derivative wrt u.
+    ///                       An offset of duuDesc will be applied internally.
+    ///
+    /// @param duuDesc        vertex buffer descriptor for the duu output
+    ///
+    /// @param duv            Output pointer for the 2nd derivative wrt u and v.
+    ///                       An offset of duvDesc will be applied internally.
+    ///
+    /// @param duvDesc        vertex buffer descriptor for the duv output
+    ///
+    /// @param dvv            Output pointer for the 2nd derivative wrt v.
+    ///                       An offset of dvvDesc will be applied internally.
+    ///
+    /// @param dvvDesc        vertex buffer descriptor for the dvv output
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param duWeights      pointer to the du-weights buffer of the stencil table
+    ///
+    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
+    ///
+    /// @param duuWeights     pointer to the duu-weights buffer of the stencil table
+    ///
+    /// @param duvWeights     pointer to the duv-weights buffer of the stencil table
+    ///
+    /// @param dvvWeights     pointer to the dvv-weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        float *duu,       BufferDescriptor const &duuDesc,
+        float *duv,       BufferDescriptor const &duvDesc,
+        float *dvv,       BufferDescriptor const &dvvDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        const float * duWeights,
+        const float * dvWeights,
+        const float * duuWeights,
+        const float * duvWeights,
+        const float * dvvWeights,
+        int start, int end);
+
    /// ----------------------------------------------------------------------
    ///
    ///   Limit evaluations with PatchTable
@ -318,13 +488,13 @@ public:
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
-    /// @param duBuffer         Output U-derivatives buffer
+    /// @param duBuffer         Output buffer derivative wrt u
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
-    /// @param dvBuffer         Output V-derivatives buffer
+    /// @param dvBuffer         Output buffer derivative wrt v
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
@ -354,6 +524,7 @@ public:
        PATCH_TABLE *patchTable,
        CpuEvaluator const *instance = NULL,
        void * deviceContext = NULL) {
+
        (void)instance;       // unused
        (void)deviceContext;  // unused

@ -373,6 +544,102 @@ public:
                           patchTable->GetPatchParamBuffer());
    }

+    /// \brief Generic limit eval function with 1st and 2nd derivatives.
+    ///        This function has the same signature as the other device
+    ///        kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer for the 1st derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer for the 1st derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer for the 2nd derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer for the 2nd derivative wrt u and v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer for the 2nd derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CpuEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // not used by the cpu evaluator
+        (void)deviceContext;  // not used by the cpu evaluator
+
+        // XXX: PatchCoords is somewhat abusing vertex primvar buffer interop.
+        //      Ideally all buffer classes would be templated by datatype
+        //      so that the downcast to PatchCoord isn't needed here.
+        //      (e.g. Osd::CpuBuffer<PatchCoord> )
+        //
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           duuBuffer->BindCpuBuffer(), duuDesc,
+                           duvBuffer->BindCpuBuffer(), duvDesc,
+                           dvvBuffer->BindCpuBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
    /// \brief Static limit eval function. It takes an array of PatchCoord
    ///        and evaluate limit values on given PatchTable.
    ///
@ -423,15 +690,15 @@ public:
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
-    /// @param du               Output U-derivatives pointer. An offset of
+    /// @param du               Output pointer derivative wrt u. An offset of
    ///                         duDesc will be applied internally.
    ///
-    /// @param duDesc           vertex buffer descriptor for the du buffer
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
-    /// @param dv               Output V-derivatives pointer. An offset of
+    /// @param dv               Output pointer derivative wrt v. An offset of
    ///                         dvDesc will be applied internally.
    ///
-    /// @param dvDesc           vertex buffer descriptor for the dv buffer
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
    ///
    /// @param numPatchCoords   number of patchCoords.
    ///
@ -457,6 +724,72 @@ public:
        const int *patchIndexBuffer,
        PatchParam const *patchParamBuffer);

+    /// \brief Static limit eval function with 1st and 2nd derivatives. It takes
+    ///        an array of PatchCoord and evaluates them on the given patch table.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param du               Output pointer for the 1st derivative wrt u.
+    ///                         An offset of duDesc will be applied internally.
+    ///
+    /// @param duDesc           vertex buffer descriptor for the du output
+    ///
+    /// @param dv               Output pointer for the 1st derivative wrt v.
+    ///                         An offset of dvDesc will be applied internally.
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dv output
+    ///
+    /// @param duu              Output pointer for the 2nd derivative wrt u.
+    ///                         An offset of duuDesc will be applied internally.
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duu output
+    ///
+    /// @param duv              Output pointer for the 2nd derivative wrt u and v.
+    ///                         An offset of duvDesc will be applied internally.
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duv output
+    ///
+    /// @param dvv              Output pointer for the 2nd derivative wrt v.
+    ///                         An offset of dvvDesc will be applied internally.
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvv output
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        float *duu,       BufferDescriptor const &duuDesc,
+        float *duv,       BufferDescriptor const &duvDesc,
+        float *dvv,       BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PatchCoord const *patchCoords,
+        PatchArray const *patchArrays,
+        const int *patchIndexBuffer,
+        PatchParam const *patchParamBuffer);
+
    /// \brief Generic limit eval function. This function has a same
    ///        signature as other device kernels have so that it can be called
    ///        in the same way.
@ -508,6 +841,164 @@ public:
                           patchTable->GetPatchParamBuffer());
    }

+    /// \brief Generic limit eval function with 1st derivatives that reads the
+    ///        varying patch arrays and indices of the patch table. It has the
+    ///        same signature as the other device kernels.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer for the 1st derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer for the 1st derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CpuEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // not used by the cpu evaluator
+        (void)deviceContext;  // not used by the cpu evaluator
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function with 1st and 2nd derivatives that
+    ///        reads the varying patch arrays and indices of the patch table.
+    ///        It has the same signature as the other device kernels.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer for the 1st derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer for the 1st derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer for the 2nd derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer for the 2nd derivative wrt u and v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer for the 2nd derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CpuEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // not used by the cpu evaluator
+        (void)deviceContext;  // not used by the cpu evaluator
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           duuBuffer->BindCpuBuffer(), duuDesc,
+                           duvBuffer->BindCpuBuffer(), duvDesc,
+                           dvvBuffer->BindCpuBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
    /// \brief Generic limit eval function. This function has a same
    ///        signature as other device kernels have so that it can be called
    ///        in the same way.
@ -562,6 +1053,170 @@ public:
                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
    }

+    /// \brief Generic limit eval function for face-varying data with 1st
+    ///        derivatives. This function has the same signature as other
+    ///        device kernels have so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        CpuEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+        // Forward to EvalPatches using the patch buffers of the requested fvar channel.
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
+    /// \brief Generic limit eval function for face-varying data with 1st
+    ///        and 2nd derivatives. This function has the same signature as
+    ///        other device kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        CpuEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+        // Forward to EvalPatches using the patch buffers of the requested fvar channel.
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           duuBuffer->BindCpuBuffer(), duuDesc,
+                           duvBuffer->BindCpuBuffer(), duvDesc,
+                           dvvBuffer->BindCpuBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/cpuGLVertexBuffer.h
+++ b/opensubdiv/osd/cpuGLVertexBuffer.h
@ -36,7 +36,7 @@ namespace OPENSUBDIV_VERSION {
 namespace Osd {

 ///
-/// \brief Concrete vertex buffer class for cpu subvision and OpenGL drawing.
+/// \brief Concrete vertex buffer class for cpu subdivision and OpenGL drawing.
 ///
 /// CpuGLVertexBuffer implements CpuVertexBufferInterface and
 /// GLVertexBufferInterface.
--- a/opensubdiv/osd/cpuKernel.cpp
+++ b/opensubdiv/osd/cpuKernel.cpp
@ -169,6 +169,76 @@ CpuEvalStencils(float const * src, BufferDescriptor const &srcDesc,
    }
 }

+void
+CpuEvalStencils(float const * src, BufferDescriptor const &srcDesc,
+                float * dst,       BufferDescriptor const &dstDesc,
+                float * dstDu,     BufferDescriptor const &dstDuDesc,
+                float * dstDv,     BufferDescriptor const &dstDvDesc,
+                float * dstDuu,    BufferDescriptor const &dstDuuDesc,
+                float * dstDuv,    BufferDescriptor const &dstDuvDesc,
+                float * dstDvv,    BufferDescriptor const &dstDvvDesc,
+                int const * sizes,
+                int const * offsets,
+                int const * indices,
+                float const * weights,
+                float const * duWeights,
+                float const * dvWeights,
+                float const * duuWeights,
+                float const * duvWeights,
+                float const * dvvWeights,
+                int start, int end) {
+    if (start > 0) {  // skip the table entries of the stencils preceding 'start'
+        sizes += start;
+        indices += offsets[start];
+        weights += offsets[start];
+        duWeights += offsets[start];
+        dvWeights += offsets[start];
+        duuWeights += offsets[start];
+        duvWeights += offsets[start];
+        dvvWeights += offsets[start];
+    }
+    // Apply each descriptor's offset to its buffer pointer.
+    src += srcDesc.offset;
+    dst += dstDesc.offset;
+    dstDu += dstDuDesc.offset;
+    dstDv += dstDvDesc.offset;
+    dstDuu += dstDuuDesc.offset;
+    dstDuv += dstDuvDesc.offset;
+    dstDvv += dstDvvDesc.offset;
+    // Single alloca() scratch block, split into one accumulator per output.
+    int nOutLength = dstDesc.length + dstDuDesc.length + dstDvDesc.length
+                   + dstDuuDesc.length + dstDuvDesc.length + dstDvvDesc.length;
+    float * result   = (float*)alloca(nOutLength * sizeof(float));
+    float * resultDu = result + dstDesc.length;
+    float * resultDv = resultDu + dstDuDesc.length;
+    float * resultDuu = resultDv + dstDvDesc.length;
+    float * resultDuv = resultDuu + dstDuuDesc.length;
+    float * resultDvv = resultDuv + dstDuvDesc.length;
+    // NOTE(review): scratch is sized by dst lengths but filled via srcDesc; confirm they agree.
+    int nStencils = end - start;
+    for (int i = 0; i < nStencils; ++i, ++sizes) {
+
+        // clear all six accumulators
+        memset(result, 0, nOutLength * sizeof(float));
+
+        for (int j=0; j<*sizes; ++j) {
+            addWithWeight(result,   src, *indices, *weights++,   srcDesc);
+            addWithWeight(resultDu, src, *indices, *duWeights++, srcDesc);
+            addWithWeight(resultDv, src, *indices, *dvWeights++, srcDesc);
+            addWithWeight(resultDuu, src, *indices, *duuWeights++, srcDesc);
+            addWithWeight(resultDuv, src, *indices, *duvWeights++, srcDesc);
+            addWithWeight(resultDvv, src, *indices, *dvvWeights++, srcDesc);
+            ++indices;
+        }
+        copy(dst,   i, result, dstDesc);
+        copy(dstDu, i, resultDu, dstDuDesc);
+        copy(dstDv, i, resultDv, dstDvDesc);
+        copy(dstDuu, i, resultDuu, dstDuuDesc);
+        copy(dstDuv, i, resultDuv, dstDuvDesc);
+        copy(dstDvv, i, resultDvv, dstDvvDesc);
+    }
+}
+
 }  // end namespace Osd

 }  // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/osd/cpuKernel.h
+++ b/opensubdiv/osd/cpuKernel.h
@ -57,6 +57,25 @@ CpuEvalStencils(float const * src, BufferDescriptor const &srcDesc,
                float const * dvWeights,
                int start, int end);

+void
+CpuEvalStencils(float const * src, BufferDescriptor const &srcDesc,
+                float * dst,       BufferDescriptor const &dstDesc,
+                float * dstDu,     BufferDescriptor const &dstDuDesc,
+                float * dstDv,     BufferDescriptor const &dstDvDesc,
+                float * dstDuu,    BufferDescriptor const &dstDuuDesc,
+                float * dstDuv,    BufferDescriptor const &dstDuvDesc,
+                float * dstDvv,    BufferDescriptor const &dstDvvDesc,
+                int const * sizes,    // number of (index, weight) entries per stencil
+                int const * offsets,  // first entry of each stencil in indices/weights
+                int const * indices,
+                float const * weights,
+                float const * duWeights,
+                float const * dvWeights,
+                float const * duuWeights,
+                float const * duvWeights,
+                float const * dvvWeights,
+                int start, int end);  // evaluates stencils in [start, end)
+
 //
 // SIMD ICC optimization of the stencil kernel
 //
--- a/opensubdiv/osd/cpuVertexBuffer.h
+++ b/opensubdiv/osd/cpuVertexBuffer.h
@ -34,7 +34,7 @@ namespace OPENSUBDIV_VERSION {

 namespace Osd {

-/// \brief Concrete vertex buffer class for cpu subvision.
+/// \brief Concrete vertex buffer class for CPU subdivision.
 ///
 /// CpuVertexBuffer implements the VertexBufferInterface. An instance
 /// of this buffer class can be passed to CpuEvaluator
--- a/opensubdiv/osd/cudaD3D11VertexBuffer.h
+++ b/opensubdiv/osd/cudaD3D11VertexBuffer.h
@ -38,7 +38,7 @@ namespace OPENSUBDIV_VERSION {

 namespace Osd {

-/// \brief Concrete vertex buffer class for cuda subvision and D3D11 drawing.
+/// \brief Concrete vertex buffer class for cuda subdivision and D3D11 drawing.
 ///
 /// CudaD3D11VertexBuffer implements CudaVertexBufferInterface and
 /// D3D11VertexBufferInterface.
@ -85,7 +85,7 @@ protected:

    bool allocate(ID3D11Device *device);

-    // Acqures a cuda resource from DX11
+    // Acquires a cuda resource from DX11
    void map();

    // Releases a cuda resource to DX11
--- a/opensubdiv/osd/cudaEvaluator.cpp
+++ b/opensubdiv/osd/cudaEvaluator.cpp
@ -53,9 +53,12 @@ extern "C" {
        const void *patchParams);

    void CudaEvalPatchesWithDerivatives(
-        const float *src, float *dst, float *du, float *dv,
-        int length,
-        int srcStride, int dstStride, int dvStride, int duStride,
+        const float *src, float *dst,
+        float *du, float *dv,
+        float *duu, float *duv, float *dvv,
+        int length, int srcStride, int dstStride,
+        int duStride, int dvStride,
+        int duuStride, int duvStride, int dvvStride,
        int numPatchCoords,
        const void *patchCoords,
        const void *patchArrays,
@ -71,6 +74,10 @@ namespace Osd {

 template <class T> void *
 createCudaBuffer(std::vector<T> const & src) {
+    if (src.empty()) {
+        return NULL;
+    }
+
    void * devicePtr = 0;

    size_t size = src.size()*sizeof(T);
@ -98,9 +105,11 @@ CudaStencilTable::CudaStencilTable(Far::StencilTable const *stencilTable) {
        _indices = createCudaBuffer(stencilTable->GetControlIndices());
        _weights = createCudaBuffer(stencilTable->GetWeights());
        _duWeights = _dvWeights = NULL;
+        _duuWeights = _duvWeights = _dvvWeights = NULL;
    } else {
        _sizes = _offsets = _indices = _weights = NULL;
        _duWeights = _dvWeights = NULL;
+        _duuWeights = _duvWeights = _dvvWeights = NULL;
    }
 }

@ -113,9 +122,13 @@ CudaStencilTable::CudaStencilTable(Far::LimitStencilTable const *limitStencilTab
        _weights = createCudaBuffer(limitStencilTable->GetWeights());
        _duWeights = createCudaBuffer(limitStencilTable->GetDuWeights());
        _dvWeights = createCudaBuffer(limitStencilTable->GetDvWeights());
+        _duuWeights = createCudaBuffer(limitStencilTable->GetDuuWeights());
+        _duvWeights = createCudaBuffer(limitStencilTable->GetDuvWeights());
+        _dvvWeights = createCudaBuffer(limitStencilTable->GetDvvWeights());
    } else {
        _sizes = _offsets = _indices = _weights = NULL;
        _duWeights = _dvWeights = NULL;
+        _duuWeights = _duvWeights = _dvvWeights = NULL;
    }
 }

@ -126,6 +139,9 @@ CudaStencilTable::~CudaStencilTable() {
    if (_weights) cudaFree(_weights);
    if (_duWeights) cudaFree(_duWeights);
    if (_dvWeights) cudaFree(_dvWeights);
+    if (_duuWeights) cudaFree(_duuWeights);
+    if (_duvWeights) cudaFree(_duvWeights);
+    if (_dvvWeights) cudaFree(_dvvWeights);
 }

 // ---------------------------------------------------------------------------
@ -197,6 +213,84 @@ CudaEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
    return true;
 }

+/* static */
+bool
+CudaEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
+                            float *dst,       BufferDescriptor const &dstDesc,
+                            float *du,        BufferDescriptor const &duDesc,
+                            float *dv,        BufferDescriptor const &dvDesc,
+                            float *duu,       BufferDescriptor const &duuDesc,
+                            float *duv,       BufferDescriptor const &duvDesc,
+                            float *dvv,       BufferDescriptor const &dvvDesc,
+                            const int * sizes,
+                            const int * offsets,
+                            const int * indices,
+                            const float * weights,
+                            const float * duWeights,
+                            const float * dvWeights,
+                            const float * duuWeights,
+                            const float * duvWeights,
+                            const float * dvvWeights,
+                            int start,
+                            int end) {
+    // PERFORMANCE: need to combine these launches (one per non-NULL output, up to 6)
+    if (dst) {
+        CudaEvalStencils(src + srcDesc.offset,
+                         dst + dstDesc.offset,
+                         srcDesc.length,
+                         srcDesc.stride,
+                         dstDesc.stride,
+                         sizes, offsets, indices, weights,
+                         start, end);
+    }
+    if (du) {  // NOTE(review): the derivative weight pointers are not NULL-checked
+        CudaEvalStencils(src + srcDesc.offset,
+                         du  +  duDesc.offset,
+                         srcDesc.length,
+                         srcDesc.stride,
+                         duDesc.stride,
+                         sizes, offsets, indices, duWeights,
+                         start, end);
+    }
+    if (dv) {
+        CudaEvalStencils(src + srcDesc.offset,
+                         dv  + dvDesc.offset,
+                         srcDesc.length,
+                         srcDesc.stride,
+                         dvDesc.stride,
+                         sizes, offsets, indices, dvWeights,
+                         start, end);
+    }
+    if (duu) {
+        CudaEvalStencils(src + srcDesc.offset,
+                         duu +  duuDesc.offset,
+                         srcDesc.length,
+                         srcDesc.stride,
+                         duuDesc.stride,
+                         sizes, offsets, indices, duuWeights,
+                         start, end);
+    }
+    if (duv) {
+        CudaEvalStencils(src + srcDesc.offset,
+                         duv +  duvDesc.offset,
+                         srcDesc.length,
+                         srcDesc.stride,
+                         duvDesc.stride,
+                         sizes, offsets, indices, duvWeights,
+                         start, end);
+    }
+    if (dvv) {
+        CudaEvalStencils(src + srcDesc.offset,
+                         dvv + dvvDesc.offset,
+                         srcDesc.length,
+                         srcDesc.stride,
+                         dvvDesc.stride,
+                         sizes, offsets, indices, dvvWeights,
+                         start, end);
+    }
+    return true;
+}
+
 /* static */
 bool
 CudaEvaluator::EvalPatches(const float *src,
@ -237,9 +331,42 @@ CudaEvaluator::EvalPatches(
    if (dv)  dv  += dvDesc.offset;

    CudaEvalPatchesWithDerivatives(
-        src, dst, du, dv,
-        srcDesc.length, srcDesc.stride,
-        dstDesc.stride, duDesc.stride, dvDesc.stride,
+        src, dst, du, dv, NULL, NULL, NULL,
+        srcDesc.length, srcDesc.stride, dstDesc.stride,
+        duDesc.stride, dvDesc.stride, 0, 0, 0,
+        numPatchCoords, patchCoords, patchArrays, patchIndices, patchParams);
+    return true;
+}
+
+/* static */
+bool
+CudaEvaluator::EvalPatches(
+    const float *src, BufferDescriptor const &srcDesc,
+    float *dst,       BufferDescriptor const &dstDesc,
+    float *du,        BufferDescriptor const &duDesc,
+    float *dv,        BufferDescriptor const &dvDesc,
+    float *duu,       BufferDescriptor const &duuDesc,
+    float *duv,       BufferDescriptor const &duvDesc,
+    float *dvv,       BufferDescriptor const &dvvDesc,
+    int numPatchCoords,
+    const PatchCoord *patchCoords,
+    const PatchArray *patchArrays,
+    const int *patchIndices,
+    const PatchParam *patchParams) {
+    // Apply the descriptor offsets; NULL pointers are passed through unchanged.
+    if (src) src += srcDesc.offset;
+    if (dst) dst += dstDesc.offset;
+    if (du)  du  += duDesc.offset;
+    if (dv)  dv  += dvDesc.offset;
+    if (duu) duu += duuDesc.offset;
+    if (duv) duv += duvDesc.offset;
+    if (dvv) dvv += dvvDesc.offset;
+
+    CudaEvalPatchesWithDerivatives(
+        src, dst, du, dv, duu, duv, dvv,
+        srcDesc.length, srcDesc.stride, dstDesc.stride,
+        duDesc.stride, dvDesc.stride,
+        duuDesc.stride, duvDesc.stride, dvvDesc.stride,
        numPatchCoords, patchCoords, patchArrays, patchIndices, patchParams);
    return true;
 }
--- a/opensubdiv/osd/cudaEvaluator.h
+++ b/opensubdiv/osd/cudaEvaluator.h
@ -73,6 +73,9 @@ public:
    void *GetWeightsBuffer() const { return _weights; }
    void *GetDuWeightsBuffer() const { return _duWeights; }
    void *GetDvWeightsBuffer() const { return _dvWeights; }
+    void *GetDuuWeightsBuffer() const { return _duuWeights; }
+    void *GetDuvWeightsBuffer() const { return _duvWeights; }
+    void *GetDvvWeightsBuffer() const { return _dvvWeights; }
    int GetNumStencils() const { return _numStencils; }

 private:
@ -81,7 +84,10 @@ private:
         * _indices,
         * _weights,
         * _duWeights,
-         * _dvWeights;
+         * _dvWeights,
+         * _duuWeights,
+         * _duvWeights,
+         * _dvvWeights;
    int _numStencils;
 };

@ -188,17 +194,17 @@ public:
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
-    /// @param duBuffer       Output U-derivative buffer
+    /// @param duBuffer       Output buffer derivative wrt u
    ///                       must have BindCudaBuffer() method returning a
    ///                       float pointer for write
    ///
-    /// @param duDesc         vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
    ///
-    /// @param dvBuffer       Output V-derivative buffer
+    /// @param dvBuffer       Output buffer derivative wrt v
    ///                       must have BindCudaBuffer() method returning a
    ///                       float pointer for write
    ///
-    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
    ///
    /// @param stencilTable   stencil table to be applied.
    ///
@ -249,15 +255,15 @@ public:
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
-    /// @param du             Output U-derivatives pointer. An offset of
+    /// @param du             Output pointer derivative wrt u. An offset of
    ///                       duDesc will be applied internally.
    ///
-    /// @param duDesc         vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
    ///
-    /// @param dv             Output V-derivatives pointer. An offset of
+    /// @param dv             Output pointer derivative wrt v. An offset of
    ///                       dvDesc will be applied internally.
    ///
-    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
    ///
    /// @param sizes          pointer to the sizes buffer of the stencil table
    ///
@ -288,6 +294,177 @@ public:
        const float * dvWeights,
        int start, int end);

+    /// \brief Generic static eval stencils function with 1st and 2nd
+    ///        derivatives. This function has the same signature as other
+    ///        device kernels have so that it can be called in the same way
+    ///        from OsdMesh template interface.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer       Output buffer derivative wrt u
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer       Output buffer derivative wrt v
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer      Output buffer 2nd derivative wrt u
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duuDesc        vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer      Output buffer 2nd derivative wrt u and v
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duvDesc        vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer      Output buffer 2nd derivative wrt v
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvvDesc        vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param stencilTable   stencil table to be applied.
+    ///
+    /// @param instance       not used in the cuda kernel
+    ///                       (declared as a typed pointer to prevent
+    ///                        undesirable template resolution)
+    ///
+    /// @param deviceContext  not used in the cuda kernel
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        STENCIL_TABLE const *stencilTable,
+        const CudaEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+        // Bind all buffers and apply the whole table, i.e. stencils [0, GetNumStencils()).
+        return EvalStencils(srcBuffer->BindCudaBuffer(), srcDesc,
+                            dstBuffer->BindCudaBuffer(), dstDesc,
+                            duBuffer->BindCudaBuffer(),  duDesc,
+                            dvBuffer->BindCudaBuffer(),  dvDesc,
+                            duuBuffer->BindCudaBuffer(), duuDesc,
+                            duvBuffer->BindCudaBuffer(), duvDesc,
+                            dvvBuffer->BindCudaBuffer(), dvvDesc,
+                            (int const *)stencilTable->GetSizesBuffer(),
+                            (int const *)stencilTable->GetOffsetsBuffer(),
+                            (int const *)stencilTable->GetIndicesBuffer(),
+                            (float const *)stencilTable->GetWeightsBuffer(),
+                            (float const *)stencilTable->GetDuWeightsBuffer(),
+                            (float const *)stencilTable->GetDvWeightsBuffer(),
+                            (float const *)stencilTable->GetDuuWeightsBuffer(),
+                            (float const *)stencilTable->GetDuvWeightsBuffer(),
+                            (float const *)stencilTable->GetDvvWeightsBuffer(),
+                            /*start = */ 0,
+                            /*end   = */ stencilTable->GetNumStencils());
+    }
+
+    /// \brief Static eval stencils function with 1st and 2nd derivatives,
+    ///        which takes raw cuda pointers for input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param du             Output pointer derivative wrt u. An offset of
+    ///                       duDesc will be applied internally.
+    ///
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dv             Output pointer derivative wrt v. An offset of
+    ///                       dvDesc will be applied internally.
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duu            Output pointer 2nd derivative wrt u. An offset of
+    ///                       duuDesc will be applied internally.
+    ///
+    /// @param duuDesc        vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duv            Output pointer 2nd derivative wrt u and v. An offset of
+    ///                       duvDesc will be applied internally.
+    ///
+    /// @param duvDesc        vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvv            Output pointer 2nd derivative wrt v. An offset of
+    ///                       dvvDesc will be applied internally.
+    ///
+    /// @param dvvDesc        vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param duWeights      pointer to the du-weights buffer of the stencil table
+    ///
+    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
+    ///
+    /// @param duuWeights     pointer to the duu-weights buffer of the stencil table
+    ///
+    /// @param duvWeights     pointer to the duv-weights buffer of the stencil table
+    ///
+    /// @param dvvWeights     pointer to the dvv-weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        float *duu,       BufferDescriptor const &duuDesc,
+        float *duv,       BufferDescriptor const &duvDesc,
+        float *dvv,       BufferDescriptor const &dvvDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        const float * duWeights,
+        const float * dvWeights,
+        const float * duuWeights,
+        const float * duvWeights,
+        const float * dvvWeights,
+        int start, int end);
+
    /// ----------------------------------------------------------------------
    ///
    ///   Limit evaluations with PatchTable
@ -361,13 +538,13 @@ public:
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
-    /// @param duBuffer         Output U-derivatives buffer
+    /// @param duBuffer         Output buffer derivative wrt u
    ///                         must have BindCudaBuffer() method returning a
    ///                         float pointer for write
    ///
    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
-    /// @param dvBuffer         Output V-derivatives buffer
+    /// @param dvBuffer         Output buffer derivative wrt v
    ///                         must have BindCudaBuffer() method returning a
    ///                         float pointer for write
    ///
@ -410,6 +587,95 @@ public:
                           (const PatchParam *)patchTable->GetPatchParamBuffer());
    }

+    /// \brief Generic limit eval function with derivatives. This function has
+    ///        the same signature as other device kernels so that it can be
+    ///        called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CudaPatchTable or equivalent
+    ///
+    /// @param instance         not used in the cuda evaluator
+    ///
+    /// @param deviceContext    not used in the cuda evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CudaEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCudaBuffer(), srcDesc,
+                           dstBuffer->BindCudaBuffer(), dstDesc,
+                           duBuffer->BindCudaBuffer(),  duDesc,
+                           dvBuffer->BindCudaBuffer(),  dvDesc,
+                           duuBuffer->BindCudaBuffer(), duuDesc,
+                           duvBuffer->BindCudaBuffer(), duvDesc,
+                           dvvBuffer->BindCudaBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord *)patchCoords->BindCudaBuffer(),
+                           (const PatchArray *)patchTable->GetPatchArrayBuffer(),
+                           (const int *)patchTable->GetPatchIndexBuffer(),
+                           (const PatchParam *)patchTable->GetPatchParamBuffer());
+    }
+
    /// \brief Static limit eval function. It takes an array of PatchCoord
    ///        and evaluate limit values on given PatchTable.
    ///
@ -460,15 +726,15 @@ public:
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
-    /// @param du               Output U-derivatives pointer. An offset of
+    /// @param du               Output pointer derivative wrt u. An offset of
    ///                         duDesc will be applied internally.
    ///
-    /// @param duDesc           vertex buffer descriptor for the du buffer
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
-    /// @param dv               Output V-derivatives pointer. An offset of
+    /// @param dv               Output pointer derivative wrt v. An offset of
    ///                         dvDesc will be applied internally.
    ///
-    /// @param dvDesc           vertex buffer descriptor for the dv buffer
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
    ///
    /// @param numPatchCoords   number of patchCoords.
    ///
@ -489,10 +755,76 @@ public:
        float *du,        BufferDescriptor const &duDesc,
        float *dv,        BufferDescriptor const &dvDesc,
        int numPatchCoords,
-        const PatchCoord *patchCoords,
-        const PatchArray *patchArrays,
+        PatchCoord const *patchCoords,
+        PatchArray const *patchArrays,
        const int *patchIndices,
-        const PatchParam *patchParams);
+        PatchParam const *patchParams);
+
+    /// \brief Static limit eval function. It takes an array of PatchCoord
+    ///        and evaluates limit values on the given PatchTable.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param du               Output pointer derivative wrt u. An offset of
+    ///                         duDesc will be applied internally.
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dv               Output pointer derivative wrt v. An offset of
+    ///                         dvDesc will be applied internally.
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duu              Output pointer 2nd derivative wrt u. An offset of
+    ///                         duuDesc will be applied internally.
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duv              Output pointer 2nd derivative wrt u and v. An offset of
+    ///                         duvDesc will be applied internally.
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvv              Output pointer 2nd derivative wrt v. An offset of
+    ///                         dvvDesc will be applied internally.
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndices     an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParams      an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        float *duu,       BufferDescriptor const &duuDesc,
+        float *duv,       BufferDescriptor const &duvDesc,
+        float *dvv,       BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PatchCoord const *patchCoords,
+        PatchArray const *patchArrays,
+        const int *patchIndices,
+        PatchParam const *patchParams);

    /// \brief Generic limit eval function. This function has a same
    ///        signature as other device kernels have so that it can be called
@ -545,6 +877,164 @@ public:
                           (const PatchParam *)patchTable->GetPatchParamBuffer());
    }

+    /// \brief Generic limit eval function. This function has the same
+    ///        signature as other device kernels so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///                         must have BindCudaBuffer() method returning an
+    ///                         array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable       CudaPatchTable or equivalent
+    ///
+    /// @param instance         not used in the cuda evaluator
+    ///
+    /// @param deviceContext    not used in the cuda evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CudaEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCudaBuffer(), srcDesc,
+                           dstBuffer->BindCudaBuffer(), dstDesc,
+                           duBuffer->BindCudaBuffer(), duDesc,
+                           dvBuffer->BindCudaBuffer(), dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord *)patchCoords->BindCudaBuffer(),
+                           (const PatchArray *)patchTable->GetVaryingPatchArrayBuffer(),
+                           (const int *)patchTable->GetVaryingPatchIndexBuffer(),
+                           (const PatchParam *)patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function. This function has the same
+    ///        signature as other device kernels so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///                         must have BindCudaBuffer() method returning an
+    ///                         array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable       CudaPatchTable or equivalent
+    ///
+    /// @param instance         not used in the cuda evaluator
+    ///
+    /// @param deviceContext    not used in the cuda evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CudaEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCudaBuffer(), srcDesc,
+                           dstBuffer->BindCudaBuffer(), dstDesc,
+                           duBuffer->BindCudaBuffer(), duDesc,
+                           dvBuffer->BindCudaBuffer(), dvDesc,
+                           duuBuffer->BindCudaBuffer(), duuDesc,
+                           duvBuffer->BindCudaBuffer(), duvDesc,
+                           dvvBuffer->BindCudaBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord *)patchCoords->BindCudaBuffer(),
+                           (const PatchArray *)patchTable->GetVaryingPatchArrayBuffer(),
+                           (const int *)patchTable->GetVaryingPatchIndexBuffer(),
+                           (const PatchParam *)patchTable->GetPatchParamBuffer());
+    }
+
    /// \brief Generic limit eval function. This function has a same
    ///        signature as other device kernels have so that it can be called
    ///        in the same way.
@ -599,6 +1089,170 @@ public:
                           (const PatchParam *)patchTable->GetFVarPatchParamBuffer(fvarChannel));
    }

+    /// \brief Generic limit eval function. This function has the same
+    ///        signature as other device kernels so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///                         must have BindCudaBuffer() method returning an
+    ///                         array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable       CudaPatchTable or equivalent
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the cuda evaluator
+    ///
+    /// @param deviceContext    not used in the cuda evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer, BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer, BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        CudaEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        (void)instance;   // unused
+        (void)deviceContext;   // unused
+
+        return EvalPatches(srcBuffer->BindCudaBuffer(), srcDesc,
+                           dstBuffer->BindCudaBuffer(), dstDesc,
+                           duBuffer->BindCudaBuffer(), duDesc,
+                           dvBuffer->BindCudaBuffer(), dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord *)patchCoords->BindCudaBuffer(),
+                           (const PatchArray *)patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           (const int *)patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           (const PatchParam *)patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
+    /// \brief Generic limit eval function. This function has the same
+    ///        signature as other device kernels so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///                         must have BindCudaBuffer() method returning an
+    ///                         array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable       CudaPatchTable or equivalent
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the cuda evaluator
+    ///
+    /// @param deviceContext    not used in the cuda evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        CudaEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCudaBuffer(), srcDesc,
+                           dstBuffer->BindCudaBuffer(), dstDesc,
+                           duBuffer->BindCudaBuffer(), duDesc,
+                           dvBuffer->BindCudaBuffer(), dvDesc,
+                           duuBuffer->BindCudaBuffer(), duuDesc,
+                           duvBuffer->BindCudaBuffer(), duvDesc,
+                           dvvBuffer->BindCudaBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord *)patchCoords->BindCudaBuffer(),
+                           (const PatchArray *)patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           (const int *)patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           (const PatchParam *)patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/cudaGLVertexBuffer.h
+++ b/opensubdiv/osd/cudaGLVertexBuffer.h
@ -40,7 +40,7 @@ namespace OPENSUBDIV_VERSION {

 namespace Osd {

-/// \brief Concrete vertex buffer class for cuda subvision and OpenGL drawing.
+/// \brief Concrete vertex buffer class for cuda subdivision and OpenGL drawing.
 ///
 /// CudaGLVertexBuffer implements CudaVertexBufferInterface and
 /// GLVertexBufferInterface.
--- a/opensubdiv/osd/cudaKernel.cu
+++ b/opensubdiv/osd/cudaKernel.cu
@ -305,8 +305,12 @@ int getNumControlVertices(int patchType) {
 }

 __global__ void
-computePatches(const float *src, float *dst, float *dstDu, float *dstDv,
-               int length, int srcStride, int dstStride, int dstDuStride, int dstDvStride,
+computePatches(const float *src, float *dst,
+               float *dstDu, float *dstDv,
+               float *dstDuu, float *dstDuv, float *dstDvv,
+               int length, int srcStride, int dstStride,
+               int dstDuStride, int dstDvStride,
+               int dstDuuStride, int dstDuvStride, int dstDvvStride,
               int numPatchCoords, const PatchCoord *patchCoords,
               const PatchArray *patchArrayBuffer,
               const int *patchIndexBuffer,
@ -376,6 +380,30 @@ computePatches(const float *src, float *dst, float *dstDu, float *dstDv,
                addWithWeight(d, srcVert, wDt[j], length);
            }
        }
+        if (dstDuu) {
+            float *d = dstDuu + i * dstDuuStride;
+            clear(d, length);
+            for (int j = 0; j < numControlVertices; ++j) {
+                const float * srcVert = src + cvs[j] * srcStride;
+                addWithWeight(d, srcVert, wDss[j], length);
+            }
+        }
+        if (dstDuv) {
+            float *d = dstDuv + i * dstDuvStride;
+            clear(d, length);
+            for (int j = 0; j < numControlVertices; ++j) {
+                const float * srcVert = src + cvs[j] * srcStride;
+                addWithWeight(d, srcVert, wDst[j], length);
+            }
+        }
+        if (dstDvv) {
+            float *d = dstDvv + i * dstDvvStride;
+            clear(d, length);
+            for (int j = 0; j < numControlVertices; ++j) {
+                const float * srcVert = src + cvs[j] * srcStride;
+                addWithWeight(d, srcVert, wDtt[j], length);
+            }
+        }
    }
 }

@ -447,14 +475,19 @@ void CudaEvalPatches(
    // PERFORMANCE: not optimized at all

    computePatches <<<512, 32>>>(
-        src, dst, NULL, NULL, length, srcStride, dstStride, 0, 0,
+        src, dst, NULL, NULL, NULL, NULL, NULL,
+        length, srcStride, dstStride, 0, 0, 0, 0, 0,
        numPatchCoords, patchCoords,
        patchArrayBuffer, patchIndexBuffer, patchParamBuffer);
 }

 void CudaEvalPatchesWithDerivatives(
-    const float *src, float *dst, float *dstDu, float *dstDv,
-    int length, int srcStride, int dstStride, int dstDuStride, int dstDvStride,
+    const float *src, float *dst,
+    float *dstDu, float *dstDv,
+    float *dstDuu, float *dstDuv, float *dstDvv,
+    int length, int srcStride, int dstStride,
+    int dstDuStride, int dstDvStride,
+    int dstDuuStride, int dstDuvStride, int dstDvvStride,
    int numPatchCoords, const PatchCoord *patchCoords,
    const PatchArray *patchArrayBuffer,
    const int *patchIndexBuffer,
@ -463,7 +496,9 @@ void CudaEvalPatchesWithDerivatives(
    // PERFORMANCE: not optimized at all

    computePatches <<<512, 32>>>(
-        src, dst, dstDu, dstDv, length, srcStride, dstStride, dstDuStride, dstDvStride,
+        src, dst, dstDu, dstDv, dstDuu, dstDuv, dstDvv,
+        length, srcStride, dstStride,
+        dstDuStride, dstDvStride, dstDuuStride, dstDuvStride, dstDvvStride,
        numPatchCoords, patchCoords,
        patchArrayBuffer, patchIndexBuffer, patchParamBuffer);
 }
--- a/opensubdiv/osd/cudaVertexBuffer.h
+++ b/opensubdiv/osd/cudaVertexBuffer.h
@ -34,7 +34,7 @@ namespace OPENSUBDIV_VERSION {

 namespace Osd {

-/// \brief Concrete vertex buffer class for Cuda subvision.
+/// \brief Concrete vertex buffer class for Cuda subdivision.
 ///
 /// CudaVertexBuffer implements CudaVertexBufferInterface.
 /// An instance of this buffer class can be passed to CudaEvaluator
--- a/opensubdiv/osd/d3d11ComputeEvaluator.cpp
+++ b/opensubdiv/osd/d3d11ComputeEvaluator.cpp
@ -172,6 +172,22 @@ D3D11ComputeEvaluator::Create(BufferDescriptor const &srcDesc,
                              BufferDescriptor const &duDesc,
                              BufferDescriptor const &dvDesc,
                              ID3D11DeviceContext *deviceContext) {
+    return Create(srcDesc, dstDesc, duDesc, dvDesc,
+                  BufferDescriptor(),
+                  BufferDescriptor(),
+                  BufferDescriptor(),
+                  deviceContext);
+}
+
+D3D11ComputeEvaluator *
+D3D11ComputeEvaluator::Create(BufferDescriptor const &srcDesc,
+                              BufferDescriptor const &dstDesc,
+                              BufferDescriptor const &duDesc,
+                              BufferDescriptor const &dvDesc,
+                              BufferDescriptor const &duuDesc,
+                              BufferDescriptor const &duvDesc,
+                              BufferDescriptor const &dvvDesc,
+                              ID3D11DeviceContext *deviceContext) {
    (void)deviceContext;  // not used

    // TODO: implements derivatives
--- a/opensubdiv/osd/d3d11ComputeEvaluator.h
+++ b/opensubdiv/osd/d3d11ComputeEvaluator.h
@ -102,6 +102,15 @@ public:
                                          BufferDescriptor const &dvDesc,
                                          ID3D11DeviceContext *deviceContext);

+    static D3D11ComputeEvaluator * Create(BufferDescriptor const &srcDesc,
+                                          BufferDescriptor const &dstDesc,
+                                          BufferDescriptor const &duDesc,
+                                          BufferDescriptor const &dvDesc,
+                                          BufferDescriptor const &duuDesc,
+                                          BufferDescriptor const &duvDesc,
+                                          BufferDescriptor const &dvvDesc,
+                                          ID3D11DeviceContext *deviceContext);
+
    /// Constructor.
    D3D11ComputeEvaluator();

@ -148,7 +157,7 @@ public:
                                          stencilTable,
                                          deviceContext);
        } else {
-            // Create an instace on demand (slow)
+            // Create an instance on demand (slow)
            (void)deviceContext;  // unused
            instance = Create(srcDesc, dstDesc,
                              BufferDescriptor(),
@ -212,7 +221,7 @@ private:
    ID3D11ClassLinkage  * _classLinkage;
    ID3D11ClassInstance * _singleBufferKernel;
    ID3D11ClassInstance * _separateBufferKernel;
-    ID3D11Buffer        * _uniformArgs; // uniform paramaeters for kernels
+    ID3D11Buffer        * _uniformArgs; // uniform parameters for kernels

    int _workGroupSize;
 };
--- a/opensubdiv/osd/d3d11VertexBuffer.h
+++ b/opensubdiv/osd/d3d11VertexBuffer.h
@ -38,7 +38,7 @@ namespace OPENSUBDIV_VERSION {
 namespace Osd {

 ///
-/// \brief Concrete vertex buffer class for DirectX subvision and DirectX drawing.
+/// \brief Concrete vertex buffer class for DirectX subdivision and DirectX drawing.
 ///
 /// D3D11VertexBuffer implements D3D11VertexBufferInterface. An instance
 /// of this buffer class can be passed to D3D11ComputeEvaluator.
--- a/opensubdiv/osd/glComputeEvaluator.cpp
+++ b/opensubdiv/osd/glComputeEvaluator.cpp
@ -44,6 +44,10 @@ static const char *shaderSource =

 template <class T> GLuint
 createSSBO(std::vector<T> const & src) {
+    if (src.empty()) {
+        return 0;
+    }
+
    GLuint devicePtr = 0;
    glGenBuffers(1, &devicePtr);

@ -75,9 +79,11 @@ GLStencilTableSSBO::GLStencilTableSSBO(
        _indices = createSSBO(stencilTable->GetControlIndices());
        _weights = createSSBO(stencilTable->GetWeights());
        _duWeights = _dvWeights = 0;
+        _duuWeights = _duvWeights = _dvvWeights = 0;
    } else {
        _sizes = _offsets = _indices = _weights = 0;
        _duWeights = _dvWeights = 0;
+        _duuWeights = _duvWeights = _dvvWeights = 0;
    }
 }

@ -91,9 +97,13 @@ GLStencilTableSSBO::GLStencilTableSSBO(
        _weights = createSSBO(limitStencilTable->GetWeights());
        _duWeights = createSSBO(limitStencilTable->GetDuWeights());
        _dvWeights = createSSBO(limitStencilTable->GetDvWeights());
+        _duuWeights = createSSBO(limitStencilTable->GetDuuWeights());
+        _duvWeights = createSSBO(limitStencilTable->GetDuvWeights());
+        _dvvWeights = createSSBO(limitStencilTable->GetDvvWeights());
    } else {
        _sizes = _offsets = _indices = _weights = 0;
        _duWeights = _dvWeights = 0;
+        _duuWeights = _duvWeights = _dvvWeights = 0;
    }
 }

@ -104,6 +114,9 @@ GLStencilTableSSBO::~GLStencilTableSSBO() {
    if (_weights) glDeleteBuffers(1, &_weights);
    if (_duWeights) glDeleteBuffers(1, &_duWeights);
    if (_dvWeights) glDeleteBuffers(1, &_dvWeights);
+    if (_duuWeights) glDeleteBuffers(1, &_duuWeights);
+    if (_duvWeights) glDeleteBuffers(1, &_duvWeights);
+    if (_dvvWeights) glDeleteBuffers(1, &_dvvWeights);
 }

 // ---------------------------------------------------------------------------
@ -120,8 +133,11 @@ GLComputeEvaluator::~GLComputeEvaluator() {
 static GLuint
 compileKernel(BufferDescriptor const &srcDesc,
              BufferDescriptor const &dstDesc,
-              BufferDescriptor const & /* duDesc */,
-              BufferDescriptor const & /* dvDesc */,
+              BufferDescriptor const & duDesc,
+              BufferDescriptor const & dvDesc,
+              BufferDescriptor const & duuDesc,
+              BufferDescriptor const & duvDesc,
+              BufferDescriptor const & dvvDesc,
              const char *kernelDefine,
              int workGroupSize) {
    GLuint program = glCreateProgram();
@ -139,6 +155,16 @@ compileKernel(BufferDescriptor const &srcDesc,
            << "#define WORK_GROUP_SIZE " << workGroupSize << "\n"
            << kernelDefine << "\n"
            << patchBasisShaderSourceDefine << "\n";
+
+    bool deriv1 = (duDesc.length > 0 || dvDesc.length > 0);
+    bool deriv2 = (duuDesc.length > 0 || duvDesc.length > 0 || dvvDesc.length > 0);
+    if (deriv1) {
+        defines << "#define OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES\n";
+    }
+    if (deriv2) {
+        defines << "#define OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES\n";
+    }
+
    std::string defineStr = defines.str();

    const char *shaderSources[4] = {"#version 430\n", 0, 0, 0};
@ -175,16 +201,23 @@ bool
 GLComputeEvaluator::Compile(BufferDescriptor const &srcDesc,
                            BufferDescriptor const &dstDesc,
                            BufferDescriptor const &duDesc,
-                            BufferDescriptor const &dvDesc) {
+                            BufferDescriptor const &dvDesc,
+                            BufferDescriptor const &duuDesc,
+                            BufferDescriptor const &duvDesc,
+                            BufferDescriptor const &dvvDesc) {

    // create a stencil kernel
-    if (!_stencilKernel.Compile(srcDesc, dstDesc, duDesc, dvDesc,
+    if (!_stencilKernel.Compile(srcDesc, dstDesc,
+                                duDesc, dvDesc,
+                                duuDesc, duvDesc, dvvDesc,
                                _workGroupSize)) {
        return false;
    }

    // create a patch kernel
-    if (!_patchKernel.Compile(srcDesc, dstDesc, duDesc, dvDesc,
+    if (!_patchKernel.Compile(srcDesc, dstDesc,
+                              duDesc, dvDesc,
+                              duuDesc, duvDesc, dvvDesc,
                              _workGroupSize)) {
        return false;
    }
@ -214,6 +247,40 @@ GLComputeEvaluator::EvalStencils(
    GLuint dvWeightsBuffer,
    int start, int end) const {

+    return EvalStencils(srcBuffer, srcDesc,
+                        dstBuffer, dstDesc,
+                        duBuffer, duDesc,
+                        dvBuffer, dvDesc,
+                        0, BufferDescriptor(),
+                        0, BufferDescriptor(),
+                        0, BufferDescriptor(),
+                        sizesBuffer, offsetsBuffer, indicesBuffer,
+                        weightsBuffer,
+                        duWeightsBuffer, dvWeightsBuffer,
+                        0, 0, 0,
+                        start, end);
+}
+
+bool
+GLComputeEvaluator::EvalStencils(
+    GLuint srcBuffer, BufferDescriptor const &srcDesc,
+    GLuint dstBuffer, BufferDescriptor const &dstDesc,
+    GLuint duBuffer,  BufferDescriptor const &duDesc,
+    GLuint dvBuffer,  BufferDescriptor const &dvDesc,
+    GLuint duuBuffer, BufferDescriptor const &duuDesc,
+    GLuint duvBuffer, BufferDescriptor const &duvDesc,
+    GLuint dvvBuffer, BufferDescriptor const &dvvDesc,
+    GLuint sizesBuffer,
+    GLuint offsetsBuffer,
+    GLuint indicesBuffer,
+    GLuint weightsBuffer,
+    GLuint duWeightsBuffer,
+    GLuint dvWeightsBuffer,
+    GLuint duuWeightsBuffer,
+    GLuint duvWeightsBuffer,
+    GLuint dvvWeightsBuffer,
+    int start, int end) const {
+
    if (!_stencilKernel.program) return false;
    int count = end - start;
    if (count <= 0) {
@ -224,6 +291,9 @@ GLComputeEvaluator::EvalStencils(
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, dstBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, duBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, dvBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, duuBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, duvBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dvvBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, sizesBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, offsetsBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, indicesBuffer);
@ -232,6 +302,12 @@ GLComputeEvaluator::EvalStencils(
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 8, duWeightsBuffer);
    if (dvWeightsBuffer)
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, dvWeightsBuffer);
+    if (duuWeightsBuffer)
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 13, duuWeightsBuffer);
+    if (duvWeightsBuffer)
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 14, duvWeightsBuffer);
+    if (dvvWeightsBuffer)
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 15, dvvWeightsBuffer);

    glUseProgram(_stencilKernel.program);

@ -247,13 +323,25 @@ GLComputeEvaluator::EvalStencils(
        glUniform3i(_stencilKernel.uniformDvDesc,
                    dvDesc.offset, dvDesc.length, dvDesc.stride);
    }
+    if (_stencilKernel.uniformDuuDesc > 0) {
+        glUniform3i(_stencilKernel.uniformDuuDesc,
+                    duuDesc.offset, duuDesc.length, duuDesc.stride);
+    }
+    if (_stencilKernel.uniformDuvDesc > 0) {
+        glUniform3i(_stencilKernel.uniformDuvDesc,
+                    duvDesc.offset, duvDesc.length, duvDesc.stride);
+    }
+    if (_stencilKernel.uniformDvvDesc > 0) {
+        glUniform3i(_stencilKernel.uniformDvvDesc,
+                    dvvDesc.offset, dvvDesc.length, dvvDesc.stride);
+    }

    glDispatchCompute((count + _workGroupSize - 1) / _workGroupSize, 1, 1);

    glUseProgram(0);

    glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
-    for (int i = 0; i < 10; ++i) {
+    for (int i = 0; i < 16; ++i) {
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, i, 0);
    }

@ -272,12 +360,44 @@ GLComputeEvaluator::EvalPatches(
    GLuint patchIndexBuffer,
    GLuint patchParamsBuffer) const {

+    return EvalPatches(srcBuffer, srcDesc,
+                       dstBuffer, dstDesc,
+                       duBuffer, duDesc,
+                       dvBuffer, dvDesc,
+                       0, BufferDescriptor(),
+                       0, BufferDescriptor(),
+                       0, BufferDescriptor(),
+                       numPatchCoords,
+                       patchCoordsBuffer,
+                       patchArrays,
+                       patchIndexBuffer,
+                       patchParamsBuffer);
+}
+
+bool
+GLComputeEvaluator::EvalPatches(
+    GLuint srcBuffer, BufferDescriptor const &srcDesc,
+    GLuint dstBuffer, BufferDescriptor const &dstDesc,
+    GLuint duBuffer,  BufferDescriptor const &duDesc,
+    GLuint dvBuffer,  BufferDescriptor const &dvDesc,
+    GLuint duuBuffer, BufferDescriptor const &duuDesc,
+    GLuint duvBuffer, BufferDescriptor const &duvDesc,
+    GLuint dvvBuffer, BufferDescriptor const &dvvDesc,
+    int numPatchCoords,
+    GLuint patchCoordsBuffer,
+    const PatchArrayVector &patchArrays,
+    GLuint patchIndexBuffer,
+    GLuint patchParamsBuffer) const {
+
    if (!_patchKernel.program) return false;

    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, srcBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, dstBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, duBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, dvBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, duuBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, duvBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dvvBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, patchCoordsBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, patchIndexBuffer);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, patchParamsBuffer);
@ -288,8 +408,27 @@ GLComputeEvaluator::EvalPatches(
    glUniform1i(_patchKernel.uniformDstOffset, dstDesc.offset);
    glUniform4iv(_patchKernel.uniformPatchArray, (int)patchArrays.size(),
                 (const GLint*)&patchArrays[0]);
-    glUniform3i(_patchKernel.uniformDuDesc, duDesc.offset, duDesc.length, duDesc.stride);
-    glUniform3i(_patchKernel.uniformDvDesc, dvDesc.offset, dvDesc.length, dvDesc.stride);
+
+    if (_patchKernel.uniformDuDesc > 0) {
+        glUniform3i(_patchKernel.uniformDuDesc,
+                    duDesc.offset, duDesc.length, duDesc.stride);
+    }
+    if (_patchKernel.uniformDvDesc > 0) {
+        glUniform3i(_patchKernel.uniformDvDesc,
+                    dvDesc.offset, dvDesc.length, dvDesc.stride);
+    }
+    if (_patchKernel.uniformDuuDesc > 0) {
+        glUniform3i(_patchKernel.uniformDuuDesc,
+                    duuDesc.offset, duuDesc.length, duuDesc.stride);
+    }
+    if (_patchKernel.uniformDuvDesc > 0) {
+        glUniform3i(_patchKernel.uniformDuvDesc,
+                    duvDesc.offset, duvDesc.length, duvDesc.stride);
+    }
+    if (_patchKernel.uniformDvvDesc > 0) {
+        glUniform3i(_patchKernel.uniformDvvDesc,
+                    dvvDesc.offset, dvvDesc.length, dvvDesc.stride);
+    }

    glDispatchCompute((numPatchCoords + _workGroupSize - 1) / _workGroupSize, 1, 1);

@ -303,6 +442,10 @@ GLComputeEvaluator::EvalPatches(
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, 0);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, 0);

+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, 0);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, 0);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, 0);
+
    return true;
 }
 // ---------------------------------------------------------------------------
@ -320,23 +463,21 @@ GLComputeEvaluator::_StencilKernel::Compile(BufferDescriptor const &srcDesc,
                                            BufferDescriptor const &dstDesc,
                                            BufferDescriptor const &duDesc,
                                            BufferDescriptor const &dvDesc,
+                                            BufferDescriptor const &duuDesc,
+                                            BufferDescriptor const &duvDesc,
+                                            BufferDescriptor const &dvvDesc,
                                            int workGroupSize) {
    // create stencil kernel
    if (program) {
        glDeleteProgram(program);
    }

-    bool derivatives = (duDesc.length > 0 || dvDesc.length > 0);
-    const char *kernelDef = derivatives
-        ? "#define OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS\n"
-          "#define OPENSUBDIV_GLSL_COMPUTE_USE_DERIVATIVES\n"
-        : "#define OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS\n";
+    const char * kernelDefine =
+        "#define OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS\n";

-    if (program) {
-        glDeleteProgram(program);
-    }
-    program = compileKernel(srcDesc, dstDesc, duDesc, dvDesc, kernelDef,
-                            workGroupSize);
+    program = compileKernel(srcDesc, dstDesc,
+                            duDesc, dvDesc, duuDesc, duvDesc, dvvDesc,
+                            kernelDefine, workGroupSize);
    if (program == 0) return false;

    // cache uniform locations (TODO: use uniform block)
@ -346,6 +487,9 @@ GLComputeEvaluator::_StencilKernel::Compile(BufferDescriptor const &srcDesc,
    uniformDstOffset = glGetUniformLocation(program, "dstOffset");
    uniformDuDesc    = glGetUniformLocation(program, "duDesc");
    uniformDvDesc    = glGetUniformLocation(program, "dvDesc");
+    uniformDuuDesc   = glGetUniformLocation(program, "duuDesc");
+    uniformDuvDesc   = glGetUniformLocation(program, "duvDesc");
+    uniformDvvDesc   = glGetUniformLocation(program, "dvvDesc");

    return true;
 }
@ -365,23 +509,21 @@ GLComputeEvaluator::_PatchKernel::Compile(BufferDescriptor const &srcDesc,
                                          BufferDescriptor const &dstDesc,
                                          BufferDescriptor const &duDesc,
                                          BufferDescriptor const &dvDesc,
+                                          BufferDescriptor const &duuDesc,
+                                          BufferDescriptor const &duvDesc,
+                                          BufferDescriptor const &dvvDesc,
                                          int workGroupSize) {
    // create stencil kernel
    if (program) {
        glDeleteProgram(program);
    }

-    bool derivatives = (duDesc.length > 0 || dvDesc.length > 0);
-    const char *kernelDef = derivatives
-        ? "#define OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_PATCHES\n"
-          "#define OPENSUBDIV_GLSL_COMPUTE_USE_DERIVATIVES\n"
-        : "#define OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_PATCHES\n";
+    const char * kernelDefine =
+        "#define OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_PATCHES\n";

-    if (program) {
-        glDeleteProgram(program);
-    }
-    program = compileKernel(srcDesc, dstDesc, duDesc, dvDesc, kernelDef,
-                            workGroupSize);
+    program = compileKernel(srcDesc, dstDesc,
+                            duDesc, dvDesc, duuDesc, duvDesc, dvvDesc,
+                            kernelDefine, workGroupSize);
    if (program == 0) return false;

    // cache uniform locations
@ -390,6 +532,9 @@ GLComputeEvaluator::_PatchKernel::Compile(BufferDescriptor const &srcDesc,
    uniformPatchArray = glGetUniformLocation(program, "patchArray");
    uniformDuDesc     = glGetUniformLocation(program, "duDesc");
    uniformDvDesc     = glGetUniformLocation(program, "dvDesc");
+    uniformDuuDesc    = glGetUniformLocation(program, "duuDesc");
+    uniformDuvDesc    = glGetUniformLocation(program, "duvDesc");
+    uniformDvvDesc    = glGetUniformLocation(program, "dvvDesc");

    return true;
 }
--- a/opensubdiv/osd/glComputeEvaluator.h
+++ b/opensubdiv/osd/glComputeEvaluator.h
--- a/opensubdiv/osd/glVertexBuffer.h
+++ b/opensubdiv/osd/glVertexBuffer.h
@ -36,7 +36,7 @@ namespace OPENSUBDIV_VERSION {
 namespace Osd {

 ///
-/// \brief Concrete vertex buffer class for GLSL subvision and OpenGL drawing.
+/// \brief Concrete vertex buffer class for GLSL subdivision and OpenGL drawing.
 ///
 /// GLVertexBuffer implements GLVertexBufferInterface. An instance
 /// of this buffer class can be passed to OsdGLComputeEvaluator.
--- a/opensubdiv/osd/glXFBEvaluator.cpp
+++ b/opensubdiv/osd/glXFBEvaluator.cpp
@ -48,6 +48,10 @@ static const char *shaderSource =

 template <class T> GLuint
 createGLTextureBuffer(std::vector<T> const & src, GLenum type) {
+    if (src.empty()) {
+        return 0;
+    }
+
    GLint size = static_cast<int>(src.size()*sizeof(T));
    void const * ptr = &src.at(0);

@ -95,9 +99,11 @@ GLStencilTableTBO::GLStencilTableTBO(
            stencilTable->GetControlIndices(), GL_R32I);
        _weights = createGLTextureBuffer(stencilTable->GetWeights(), GL_R32F);
        _duWeights = _dvWeights = 0;
+        _duuWeights = _duvWeights = _dvvWeights = 0;
    } else {
        _sizes = _offsets = _indices = _weights = 0;
        _duWeights = _dvWeights = 0;
+        _duuWeights = _duvWeights = _dvvWeights = 0;
    }
 }

@ -118,9 +124,16 @@ GLStencilTableTBO::GLStencilTableTBO(
            limitStencilTable->GetDuWeights(), GL_R32F);
        _dvWeights = createGLTextureBuffer(
            limitStencilTable->GetDvWeights(), GL_R32F);
+        _duuWeights = createGLTextureBuffer(
+            limitStencilTable->GetDuuWeights(), GL_R32F);
+        _duvWeights = createGLTextureBuffer(
+            limitStencilTable->GetDuvWeights(), GL_R32F);
+        _dvvWeights = createGLTextureBuffer(
+            limitStencilTable->GetDvvWeights(), GL_R32F);
    } else {
        _sizes = _offsets = _indices = _weights = 0;
        _duWeights = _dvWeights = 0;
+        _duuWeights = _duvWeights = _dvvWeights = 0;
    }
 }

@ -131,11 +144,16 @@ GLStencilTableTBO::~GLStencilTableTBO() {
    if (_weights) glDeleteTextures(1, &_weights);
    if (_duWeights) glDeleteTextures(1, &_duWeights);
    if (_dvWeights) glDeleteTextures(1, &_dvWeights);
+    if (_duuWeights) glDeleteTextures(1, &_duuWeights);
+    if (_duvWeights) glDeleteTextures(1, &_duvWeights);
+    if (_dvvWeights) glDeleteTextures(1, &_dvvWeights);
 }

 // ---------------------------------------------------------------------------

-GLXFBEvaluator::GLXFBEvaluator() : _srcBufferTexture(0) {
+GLXFBEvaluator::GLXFBEvaluator(bool interleavedDerivativeBuffers)
+    : _srcBufferTexture(0),
+      _interleavedDerivativeBuffers(interleavedDerivativeBuffers) {
 }

 GLXFBEvaluator::~GLXFBEvaluator() {
@ -149,7 +167,11 @@ compileKernel(BufferDescriptor const &srcDesc,
              BufferDescriptor const &dstDesc,
              BufferDescriptor const &duDesc,
              BufferDescriptor const &dvDesc,
-              const char *kernelDefine) {
+              BufferDescriptor const &duuDesc,
+              BufferDescriptor const &duvDesc,
+              BufferDescriptor const &dvvDesc,
+              const char *kernelDefine,
+              bool interleavedDerivativeBuffers) {

    GLuint program = glCreateProgram();

@ -165,8 +187,25 @@ compileKernel(BufferDescriptor const &srcDesc,
            << "#define VERTEX_SHADER\n"
            << kernelDefine << "\n"
            << patchBasisShaderSourceDefine << "\n";
-    std::string defineStr = defines.str();

+    bool deriv1 = (duDesc.length > 0 || dvDesc.length > 0);
+    bool deriv2 = (duuDesc.length > 0 || duvDesc.length > 0 || dvvDesc.length > 0);
+    if (deriv1) {
+        defines << "#define OPENSUBDIV_GLSL_XFB_USE_1ST_DERIVATIVES\n";
+        if (interleavedDerivativeBuffers) {
+            defines <<
+                "#define OPENSUBDIV_GLSL_XFB_INTERLEAVED_1ST_DERIVATIVE_BUFFERS\n";
+        }
+    }
+    if (deriv2) {
+        defines << "#define OPENSUBDIV_GLSL_XFB_USE_2ND_DERIVATIVES\n";
+        if (interleavedDerivativeBuffers) {
+            defines <<
+                "#define OPENSUBDIV_GLSL_XFB_INTERLEAVED_2ND_DERIVATIVE_BUFFERS\n";
+        }
+    }
+
+    std::string defineStr = defines.str();

    const char *shaderSources[4] = {"#version 410\n", NULL, NULL, NULL};

@ -204,15 +243,43 @@ compileKernel(BufferDescriptor const &srcDesc,
            outputs.push_back("gl_SkipComponents1");
        }
    }
-    if (duDesc.length) {
    //
    // For derivatives, we use another buffer bindings so gl_NextBuffer
    // is inserted here to switch the destination of transform feedback.
    //
-        // Note that the destination buffers may or may not be shared between
+    // Note that the destination buffers may or may not be interleaved between
    // vertex and each derivatives. gl_NextBuffer seems still works well
    // in either case.
    //
+    // If we know that the buffers for derivatives are interleaved, then we
+    // can use fewer buffer bindings. This can be important, since most GL
+    // implementations will support only up to 4 transform feedback bindings.
+    //
+    if (deriv1 && interleavedDerivativeBuffers) {
+        outputs.push_back("gl_NextBuffer");
+
+        int primvar1Offset = (duDesc.offset % duDesc.stride);
+        int primvar2Offset = (dvDesc.offset % dvDesc.stride);
+
+        for (int i = 0; i < primvar1Offset; ++i) {
+            outputs.push_back("gl_SkipComponents1");
+        }
+        for (int i = 0; i < duDesc.length; ++i) {
+            snprintf(attrName, sizeof(attrName), "outDeriv1Buffer[%d]", i);
+            outputs.push_back(attrName);
+        }
+        for (int i = primvar1Offset + duDesc.length; i < primvar2Offset; ++i) {
+            outputs.push_back("gl_SkipComponents1");
+        }
+        for (int i = 0; i < dvDesc.length; ++i) {
+            snprintf(attrName, sizeof(attrName), "outDeriv1Buffer[%d]", i+duDesc.length);
+            outputs.push_back(attrName);
+        }
+        for (int i = primvar2Offset + dvDesc.length; i < dvDesc.stride; ++i) {
+            outputs.push_back("gl_SkipComponents1");
+        }
+    } else {
+        if (duDesc.length) {
            outputs.push_back("gl_NextBuffer");
            int primvarOffset = (duDesc.offset % duDesc.stride);
            for (int i = 0; i < primvarOffset; ++i) {
@ -240,6 +307,85 @@ compileKernel(BufferDescriptor const &srcDesc,
                outputs.push_back("gl_SkipComponents1");
            }
        }
+    }
+    if (deriv2 && interleavedDerivativeBuffers) {
+        outputs.push_back("gl_NextBuffer");
+
+        int primvar1Offset = (duuDesc.offset % duuDesc.stride);
+        int primvar2Offset = (duvDesc.offset % duvDesc.stride);
+        int primvar3Offset = (dvvDesc.offset % dvvDesc.stride);
+
+        for (int i = 0; i < primvar1Offset; ++i) {
+            outputs.push_back("gl_SkipComponents1");
+        }
+        for (int i = 0; i < duuDesc.length; ++i) {
+            snprintf(attrName, sizeof(attrName), "outDeriv2Buffer[%d]", i);
+            outputs.push_back(attrName);
+        }
+
+        for (int i = primvar1Offset + duuDesc.length; i < primvar2Offset; ++i) {
+            outputs.push_back("gl_SkipComponents1");
+        }
+        for (int i = 0; i < duvDesc.length; ++i) {
+            snprintf(attrName, sizeof(attrName), "outDeriv2Buffer[%d]", i+duuDesc.length);
+            outputs.push_back(attrName);
+        }
+
+        for (int i = primvar2Offset + duvDesc.length; i < primvar3Offset; ++i) {
+            outputs.push_back("gl_SkipComponents1");
+        }
+        for (int i = 0; i < dvvDesc.length; ++i) {
+            snprintf(attrName, sizeof(attrName), "outDeriv2Buffer[%d]", i+duuDesc.length+duvDesc.length);
+            outputs.push_back(attrName);
+        }
+
+        for (int i = primvar3Offset + dvvDesc.length; i < dvvDesc.stride; ++i) {
+            outputs.push_back("gl_SkipComponents1");
+        }
+    } else {
+        if (duuDesc.length) {
+            outputs.push_back("gl_NextBuffer");
+            int primvarOffset = (duuDesc.offset % duuDesc.stride);
+            for (int i = 0; i < primvarOffset; ++i) {
+                outputs.push_back("gl_SkipComponents1");
+            }
+            for (int i = 0; i < duuDesc.length; ++i) {
+                snprintf(attrName, sizeof(attrName), "outDuuBuffer[%d]", i);
+                outputs.push_back(attrName);
+            }
+            for (int i = primvarOffset + duuDesc.length; i < duuDesc.stride; ++i) {
+                outputs.push_back("gl_SkipComponents1");
+            }
+        }
+        if (duvDesc.length) {
+            outputs.push_back("gl_NextBuffer");
+            int primvarOffset = (duvDesc.offset % duvDesc.stride);
+            for (int i = 0; i < primvarOffset; ++i) {
+                outputs.push_back("gl_SkipComponents1");
+            }
+            for (int i = 0; i < duvDesc.length; ++i) {
+                snprintf(attrName, sizeof(attrName), "outDuvBuffer[%d]", i);
+                outputs.push_back(attrName);
+            }
+            for (int i = primvarOffset + duvDesc.length; i < duvDesc.stride; ++i) {
+                outputs.push_back("gl_SkipComponents1");
+            }
+        }
+        if (dvvDesc.length) {
+            outputs.push_back("gl_NextBuffer");
+            int primvarOffset = (dvvDesc.offset % dvvDesc.stride);
+            for (int i = 0; i < primvarOffset; ++i) {
+                outputs.push_back("gl_SkipComponents1");
+            }
+            for (int i = 0; i < dvvDesc.length; ++i) {
+                snprintf(attrName, sizeof(attrName), "outDvvBuffer[%d]", i);
+                outputs.push_back(attrName);
+            }
+            for (int i = primvarOffset + dvvDesc.length; i < dvvDesc.stride; ++i) {
+                outputs.push_back("gl_SkipComponents1");
+            }
+        }
+    }
    // convert to char* array
    std::vector<const char *> pOutputs;
    for (size_t i = 0; i < outputs.size(); ++i) {
@ -274,13 +420,20 @@ bool
 GLXFBEvaluator::Compile(BufferDescriptor const &srcDesc,
                        BufferDescriptor const &dstDesc,
                        BufferDescriptor const &duDesc,
-                        BufferDescriptor const &dvDesc) {
+                        BufferDescriptor const &dvDesc,
+                        BufferDescriptor const &duuDesc,
+                        BufferDescriptor const &duvDesc,
+                        BufferDescriptor const &dvvDesc) {

    // create a stencil kernel
-    _stencilKernel.Compile(srcDesc, dstDesc, duDesc, dvDesc);
+    _stencilKernel.Compile(srcDesc, dstDesc, duDesc, dvDesc,
+                           duuDesc, duvDesc, dvvDesc,
+                           _interleavedDerivativeBuffers);

    // create a patch kernel
-    _patchKernel.Compile(srcDesc, dstDesc, duDesc, dvDesc);
+    _patchKernel.Compile(srcDesc, dstDesc, duDesc, dvDesc,
+                         duuDesc, duvDesc, dvvDesc,
+                         _interleavedDerivativeBuffers);

    // create a texture for input buffer
    if (!_srcBufferTexture) {
@ -314,12 +467,46 @@ GLXFBEvaluator::EvalStencils(
    GLuint dstBuffer, BufferDescriptor const &dstDesc,
    GLuint duBuffer,  BufferDescriptor const &duDesc,
    GLuint dvBuffer,  BufferDescriptor const &dvDesc,
+    GLuint sizesBuffer,
+    GLuint offsetsBuffer,
+    GLuint indicesBuffer,
+    GLuint weightsBuffer,
+    GLuint duWeightsBuffer,
+    GLuint dvWeightsBuffer,
+    int start, int end) const {
+
+    return EvalStencils(srcBuffer, srcDesc,
+                        dstBuffer, dstDesc,
+                        duBuffer, duDesc,
+                        dvBuffer, dvDesc,
+                        0, BufferDescriptor(),
+                        0, BufferDescriptor(),
+                        0, BufferDescriptor(),
+                        sizesBuffer, offsetsBuffer, indicesBuffer,
+                        weightsBuffer,
+                        duWeightsBuffer, dvWeightsBuffer,
+                        0, 0, 0,
+                        start, end);
+}
+
+bool
+GLXFBEvaluator::EvalStencils(
+    GLuint srcBuffer, BufferDescriptor const &srcDesc,
+    GLuint dstBuffer, BufferDescriptor const &dstDesc,
+    GLuint duBuffer,  BufferDescriptor const &duDesc,
+    GLuint dvBuffer,  BufferDescriptor const &dvDesc,
+    GLuint duuBuffer, BufferDescriptor const &duuDesc,
+    GLuint duvBuffer, BufferDescriptor const &duvDesc,
+    GLuint dvvBuffer, BufferDescriptor const &dvvDesc,
    GLuint sizesTexture,
    GLuint offsetsTexture,
    GLuint indicesTexture,
    GLuint weightsTexture,
    GLuint duWeightsTexture,
    GLuint dvWeightsTexture,
+    GLuint duuWeightsTexture,
+    GLuint duvWeightsTexture,
+    GLuint dvvWeightsTexture,
    int start, int end) const {

    if (!_stencilKernel.program) return false;
@ -353,6 +540,12 @@ GLXFBEvaluator::EvalStencils(
        bindTexture(_stencilKernel.uniformDuWeightsTexture, duWeightsTexture, 5);
    if (_stencilKernel.uniformDvWeightsTexture >= 0 && dvWeightsTexture)
        bindTexture(_stencilKernel.uniformDvWeightsTexture, dvWeightsTexture, 6);
+    if (_stencilKernel.uniformDuuWeightsTexture >= 0 && duuWeightsTexture)
+        bindTexture(_stencilKernel.uniformDuuWeightsTexture, duuWeightsTexture, 7);
+    if (_stencilKernel.uniformDuvWeightsTexture >= 0 && duvWeightsTexture)
+        bindTexture(_stencilKernel.uniformDuvWeightsTexture, duvWeightsTexture, 8);
+    if (_stencilKernel.uniformDvvWeightsTexture >= 0 && dvvWeightsTexture)
+        bindTexture(_stencilKernel.uniformDvvWeightsTexture, dvvWeightsTexture, 9);

    // set batch range
    glUniform1i(_stencilKernel.uniformStart,     start);
@ -392,6 +585,12 @@ GLXFBEvaluator::EvalStencils(
        (duDesc.offset - (duDesc.offset % duDesc.stride)) : 0;
    int dvBufferBindOffset = dvDesc.stride ?
        (dvDesc.offset - (dvDesc.offset % dvDesc.stride)) : 0;
+    int duuBufferBindOffset = duuDesc.stride ?
+        (duuDesc.offset - (duuDesc.offset % duuDesc.stride)) : 0;
+    int duvBufferBindOffset = duvDesc.stride ?
+        (duvDesc.offset - (duvDesc.offset % duvDesc.stride)) : 0;
+    int dvvBufferBindOffset = dvvDesc.stride ?
+        (dvvDesc.offset - (dvvDesc.offset % dvvDesc.stride)) : 0;

    // bind destination buffer
    glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
@ -399,6 +598,12 @@ GLXFBEvaluator::EvalStencils(
                      dstBufferBindOffset * sizeof(float),
                      count * dstDesc.stride * sizeof(float));

+    if ((duDesc.length > 0) && _interleavedDerivativeBuffers) {
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                          1, duBuffer,
+                          duBufferBindOffset * sizeof(float),
+                          count * duDesc.stride * sizeof(float));
+    } else {
        if (duDesc.length > 0) {
            glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
                              1, duBuffer,
@ -412,6 +617,35 @@ GLXFBEvaluator::EvalStencils(
                              dvBufferBindOffset * sizeof(float),
                              count * dvDesc.stride * sizeof(float));
        }
+    }
+
+    if ((duuDesc.length > 0) && _interleavedDerivativeBuffers) {
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                          2, duuBuffer,
+                          duuBufferBindOffset * sizeof(float),
+                          count * duuDesc.stride * sizeof(float));
+    } else {
+        if (duuDesc.length > 0) {
+            glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                              3, duuBuffer,
+                              duuBufferBindOffset * sizeof(float),
+                              count * duuDesc.stride * sizeof(float));
+        }
+
+        if (duvDesc.length > 0) {
+            glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                              4, duvBuffer,
+                              duvBufferBindOffset * sizeof(float),
+                              count * duvDesc.stride * sizeof(float));
+        }
+
+        if (dvvDesc.length > 0) {
+            glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                              5, dvvBuffer,
+                              dvvBufferBindOffset * sizeof(float),
+                              count * dvvDesc.stride * sizeof(float));
+        }
+    }

    glBeginTransformFeedback(GL_POINTS);
    glDrawArrays(GL_POINTS, 0, count);
@ -419,7 +653,7 @@ GLXFBEvaluator::EvalStencils(

    glBindBuffer(GL_TRANSFORM_FEEDBACK_BUFFER, 0);

-    for (int i = 0; i < 5; ++i) {
+    for (int i = 0; i < 10; ++i) {  // reset units 0..9: the weight textures above are bound up to unit 9 (dvvWeights)
        glActiveTexture(GL_TEXTURE0 + i);
        glBindTexture(GL_TEXTURE_BUFFER, 0);
    }
@ -448,7 +682,36 @@ GLXFBEvaluator::EvalPatches(
    GLuint patchIndexTexture,
    GLuint patchParamTexture) const {

-    bool derivatives = (duDesc.length > 0 || dvDesc.length > 0);
+    return EvalPatches(srcBuffer, srcDesc,
+                       dstBuffer, dstDesc,
+                       duBuffer, duDesc,
+                       dvBuffer, dvDesc,
+                       0, BufferDescriptor(),
+                       0, BufferDescriptor(),
+                       0, BufferDescriptor(),
+                       numPatchCoords,
+                       patchCoordsBuffer, patchArrays,
+                       patchIndexTexture,
+                       patchParamTexture);
+}
+
+bool
+GLXFBEvaluator::EvalPatches(
+    GLuint srcBuffer, BufferDescriptor const &srcDesc,
+    GLuint dstBuffer, BufferDescriptor const &dstDesc,
+    GLuint duBuffer,  BufferDescriptor const &duDesc,
+    GLuint dvBuffer,  BufferDescriptor const &dvDesc,
+    GLuint duuBuffer, BufferDescriptor const &duuDesc,
+    GLuint duvBuffer, BufferDescriptor const &duvDesc,
+    GLuint dvvBuffer, BufferDescriptor const &dvvDesc,
+    int numPatchCoords,
+    GLuint patchCoordsBuffer,
+    const PatchArrayVector &patchArrays,
+    GLuint patchIndexTexture,
+    GLuint patchParamTexture) const {
+
+    bool deriv1 = (duDesc.length > 0 || dvDesc.length > 0);
+    bool deriv2 = (duuDesc.length > 0 || duvDesc.length > 0 || dvvDesc.length > 0);

    if (!_patchKernel.program) return false;

@ -493,6 +756,15 @@ GLXFBEvaluator::EvalPatches(
    int dvBufferBindOffset = dvDesc.stride
        ? (dvDesc.offset - (dvDesc.offset % dvDesc.stride))
        : 0;
+    int duuBufferBindOffset = duuDesc.stride
+        ? (duuDesc.offset - (duuDesc.offset % duuDesc.stride))
+        : 0;
+    int duvBufferBindOffset = duvDesc.stride
+        ? (duvDesc.offset - (duvDesc.offset % duvDesc.stride))
+        : 0;
+    int dvvBufferBindOffset = dvvDesc.stride
+        ? (dvvDesc.offset - (dvvDesc.offset % dvvDesc.stride))
+        : 0;

    // bind destination buffer
    glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
@ -500,7 +772,12 @@ GLXFBEvaluator::EvalPatches(
                      dstBufferBindOffset * sizeof(float),
                      numPatchCoords * dstDesc.stride * sizeof(float));

-    if (derivatives) {
+    if (deriv1 && _interleavedDerivativeBuffers) {
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                          1, duBuffer,
+                          duBufferBindOffset * sizeof(float),
+                          numPatchCoords * duDesc.stride * sizeof(float));
+    } else if (deriv1) {
        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
                          1, duBuffer,
                          duBufferBindOffset * sizeof(float),
@ -510,7 +787,27 @@ GLXFBEvaluator::EvalPatches(
                          2, dvBuffer,
                          dvBufferBindOffset * sizeof(float),
                          numPatchCoords * dvDesc.stride * sizeof(float));
+    }
+    if (deriv2 && _interleavedDerivativeBuffers) {
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                          2, duuBuffer,
+                          duuBufferBindOffset * sizeof(float),
+                          numPatchCoords * duuDesc.stride * sizeof(float));
+    } else if (deriv2) {
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                          3, duuBuffer,
+                          duuBufferBindOffset * sizeof(float),
+                          numPatchCoords * duuDesc.stride * sizeof(float));

+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                          4, duvBuffer,
+                          duvBufferBindOffset * sizeof(float),
+                          numPatchCoords * duvDesc.stride * sizeof(float));
+
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                          5, dvvBuffer,
+                          dvvBufferBindOffset * sizeof(float),
+                          numPatchCoords * dvvDesc.stride * sizeof(float));
    }

    glBeginTransformFeedback(GL_POINTS);
@ -520,7 +817,7 @@ GLXFBEvaluator::EvalPatches(
    glBindBuffer(GL_TRANSFORM_FEEDBACK_BUFFER, 0);

    // unbind textures
-    for (int i = 0; i < 3; ++i) {
+    for (int i = 0; i < 6; ++i) {
        glActiveTexture(GL_TEXTURE0 + i);
        glBindTexture(GL_TEXTURE_BUFFER, 0);
    }
@ -536,7 +833,6 @@ GLXFBEvaluator::EvalPatches(
    glBindVertexArray(0);
    glDeleteVertexArrays(1, &vao);

-
    return true;
 }

@ -554,19 +850,22 @@ bool
 GLXFBEvaluator::_StencilKernel::Compile(BufferDescriptor const &srcDesc,
                                        BufferDescriptor const &dstDesc,
                                        BufferDescriptor const &duDesc,
-                                        BufferDescriptor const &dvDesc) {
+                                        BufferDescriptor const &dvDesc,
+                                        BufferDescriptor const &duuDesc,
+                                        BufferDescriptor const &duvDesc,
+                                        BufferDescriptor const &dvvDesc,
+                                        bool interleavedDerivativeBuffers) {
    // create stencil kernel
    if (program) {
        glDeleteProgram(program);
    }

-    bool derivatives = (duDesc.length > 0 || dvDesc.length > 0);
-    const char *kernelDef = derivatives
-        ? "#define OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_STENCILS\n"
-          "#define OPENSUBDIV_GLSL_XFB_USE_DERIVATIVES\n"
-        : "#define OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_STENCILS\n";
+    const char * kernelDefines =
+        "#define OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_STENCILS\n";

-    program = compileKernel(srcDesc, dstDesc, duDesc, dvDesc, kernelDef);
+    program = compileKernel(srcDesc, dstDesc, duDesc, dvDesc,
+                            duuDesc, duvDesc, dvvDesc,
+                            kernelDefines, interleavedDerivativeBuffers);
    if (program == 0) return false;

    // cache uniform locations (TODO: use uniform block)
@ -578,6 +877,9 @@ GLXFBEvaluator::_StencilKernel::Compile(BufferDescriptor const &srcDesc,
    uniformWeightsTexture    = glGetUniformLocation(program, "weights");
    uniformDuWeightsTexture  = glGetUniformLocation(program, "duWeights");
    uniformDvWeightsTexture  = glGetUniformLocation(program, "dvWeights");
+    uniformDuuWeightsTexture = glGetUniformLocation(program, "duuWeights");
+    uniformDuvWeightsTexture = glGetUniformLocation(program, "duvWeights");
+    uniformDvvWeightsTexture = glGetUniformLocation(program, "dvvWeights");
    uniformStart             = glGetUniformLocation(program, "batchStart");
    uniformEnd               = glGetUniformLocation(program, "batchEnd");

@ -598,19 +900,22 @@ bool
 GLXFBEvaluator::_PatchKernel::Compile(BufferDescriptor const &srcDesc,
                                      BufferDescriptor const &dstDesc,
                                      BufferDescriptor const &duDesc,
-                                      BufferDescriptor const &dvDesc) {
+                                      BufferDescriptor const &dvDesc,
+                                      BufferDescriptor const &duuDesc,
+                                      BufferDescriptor const &duvDesc,
+                                      BufferDescriptor const &dvvDesc,
+                                      bool interleavedDerivativeBuffers) {
    // create patch kernel
    if (program) {
        glDeleteProgram(program);
    }

-    bool derivatives = (duDesc.length > 0 || dvDesc.length > 0);
-    const char *kernelDef = derivatives
-        ? "#define OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_PATCHES\n"
-          "#define OPENSUBDIV_GLSL_XFB_USE_DERIVATIVES\n"
-        : "#define OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_PATCHES\n";
+    const char * kernelDefines =
+        "#define OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_PATCHES\n";

-    program = compileKernel(srcDesc, dstDesc, duDesc, dvDesc, kernelDef);
+    program = compileKernel(srcDesc, dstDesc, duDesc, dvDesc,
+                            duuDesc, duvDesc, dvvDesc,
+                            kernelDefines, interleavedDerivativeBuffers);
    if (program == 0) return false;

    // cache uniform locations
@ -623,7 +928,6 @@ GLXFBEvaluator::_PatchKernel::Compile(BufferDescriptor const &srcDesc,
    return true;
 }

-
 }  // end namespace Osd

 }  // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/osd/glXFBEvaluator.h
+++ b/opensubdiv/osd/glXFBEvaluator.h
--- a/opensubdiv/osd/glslComputeKernel.glsl
+++ b/opensubdiv/osd/glslComputeKernel.glsl
@ -37,13 +37,22 @@ layout(binding=1) buffer dst_buffer      { float    dstVertexBuffer[]; };

 // derivative buffers (if needed)

-#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
 uniform ivec3 duDesc;
 uniform ivec3 dvDesc;
 layout(binding=2) buffer du_buffer   { float duBuffer[]; };
 layout(binding=3) buffer dv_buffer   { float dvBuffer[]; };
 #endif

+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
+uniform ivec3 duuDesc;
+uniform ivec3 duvDesc;
+uniform ivec3 dvvDesc;
+layout(binding=10) buffer duu_buffer   { float duuBuffer[]; };
+layout(binding=11) buffer duv_buffer   { float duvBuffer[]; };
+layout(binding=12) buffer dvv_buffer   { float dvvBuffer[]; };
+#endif
+
 // stencil buffers

 #if defined(OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS)
@ -55,11 +64,17 @@ layout(binding=5) buffer stencilOffsets  { int      _offsets[]; };
 layout(binding=6) buffer stencilIndices  { int      _indices[]; };
 layout(binding=7) buffer stencilWeights  { float    _weights[]; };

-#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
 layout(binding=8) buffer stencilDuWeights { float  _duWeights[]; };
 layout(binding=9) buffer stencilDvWeights { float  _dvWeights[]; };
 #endif

+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
+layout(binding=13) buffer stencilDuuWeights { float  _duuWeights[]; };
+layout(binding=14) buffer stencilDuvWeights { float  _duvWeights[]; };
+layout(binding=15) buffer stencilDvvWeights { float  _dvvWeights[]; };
+#endif
+
 #endif

 // patch buffers
@ -119,7 +134,7 @@ void addWithWeight(inout Vertex v, const Vertex src, float weight) {
    }
 }

-#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
 void writeDu(int index, Vertex du) {
    int duIndex = duDesc.x + index * duDesc.z;
    for (int i = 0; i < LENGTH; ++i) {
@ -135,6 +150,29 @@ void writeDv(int index, Vertex dv) {
 }
 #endif

+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
+void writeDuu(int index, Vertex duu) {  // copy the LENGTH floats of duu into duuBuffer for element `index`
+    int duuIndex = duuDesc.x + index * duuDesc.z;  // first float of the element; .x/.z look like offset/stride -- TODO confirm against the host code that uploads duuDesc
+    for (int i = 0; i < LENGTH; ++i) {
+        duuBuffer[duuIndex + i] = duu.vertexData[i];
+    }
+}
+
+void writeDuv(int index, Vertex duv) {  // copy the LENGTH floats of duv into duvBuffer for element `index`
+    int duvIndex = duvDesc.x + index * duvDesc.z;  // same addressing as writeDuu, driven by duvDesc
+    for (int i = 0; i < LENGTH; ++i) {
+        duvBuffer[duvIndex + i] = duv.vertexData[i];
+    }
+}
+
+void writeDvv(int index, Vertex dvv) {  // copy the LENGTH floats of dvv into dvvBuffer for element `index`
+    int dvvIndex = dvvDesc.x + index * dvvDesc.z;  // same addressing as writeDuu, driven by dvvDesc
+    for (int i = 0; i < LENGTH; ++i) {
+        dvvBuffer[dvvIndex + i] = dvv.vertexData[i];
+    }
+}
+#endif
+
 //------------------------------------------------------------------------------
 #if defined(OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS)

@ -160,7 +198,7 @@ void main() {

    writeVertex(current, dst);

-#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
    Vertex du, dv;
    clear(du);
    clear(dv);
@ -178,6 +216,29 @@ void main() {
        writeDv(current, dv);
    }
 #endif
+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
+    Vertex duu, duv, dvv;
+    clear(duu);
+    clear(duv);
+    clear(dvv);
+    for (int i=0; i<size; ++i) {
+        // expects the compiler to optimize readVertex out here.
+        Vertex src = readVertex(_indices[offset+i]);
+        addWithWeight(duu, src, _duuWeights[offset+i]);
+        addWithWeight(duv, src, _duvWeights[offset+i]);
+        addWithWeight(dvv, src, _dvvWeights[offset+i]);
+    }
+
+    if (duuDesc.y > 0) { // length
+        writeDuu(current, duu);
+    }
+    if (duvDesc.y > 0) {
+        writeDuv(current, duv);
+    }
+    if (dvvDesc.y > 0) {
+        writeDvv(current, dvv);
+    }
+#endif
 }

 #endif
@ -260,6 +321,9 @@ void main() {
            wP[i] = wP4[i];
            wDs[i] = wDs4[i];
            wDt[i] = wDt4[i];
+            wDss[i] = wDss4[i];
+            wDst[i] = wDst4[i];
+            wDtt[i] = wDtt4[i];
        }
    } else if (patchType == 6) {
        float wP16[16], wDs16[16], wDt16[16], wDss16[16], wDst16[16], wDtt16[16];
@ -269,16 +333,22 @@ void main() {
            wP[i] = wP16[i];
            wDs[i] = wDs16[i];
            wDt[i] = wDt16[i];
+            wDss[i] = wDss16[i];
+            wDst[i] = wDst16[i];
+            wDtt[i] = wDtt16[i];
        }
    } else if (patchType == 9) {
        OsdGetGregoryPatchWeights(uv.s, uv.t, dScale, wP, wDs, wDt, wDss, wDst, wDtt);
        numControlVertices = 20;
    }

-    Vertex dst, du, dv;
+    Vertex dst, du, dv, duu, duv, dvv;
    clear(dst);
    clear(du);
    clear(dv);
+    clear(duu);
+    clear(duv);
+    clear(dvv);

    int indexStride = getNumControlVertices(array.x);
    int indexBase = array.z + indexStride * (patchIndex - array.w);
@ -288,10 +358,13 @@ void main() {
        addWithWeight(dst, readVertex(index), wP[cv]);
        addWithWeight(du, readVertex(index), wDs[cv]);
        addWithWeight(dv, readVertex(index), wDt[cv]);
+        addWithWeight(duu, readVertex(index), wDss[cv]);
+        addWithWeight(duv, readVertex(index), wDst[cv]);
+        addWithWeight(dvv, readVertex(index), wDtt[cv]);
    }
    writeVertex(current, dst);

-#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_1ST_DERIVATIVES)
    if (duDesc.y > 0) { // length
        writeDu(current, du);
    }
@ -299,6 +372,17 @@ void main() {
        writeDv(current, dv);
    }
 #endif
+#if defined(OPENSUBDIV_GLSL_COMPUTE_USE_2ND_DERIVATIVES)
+    if (duuDesc.y > 0) { // length
+        writeDuu(current, duu);
+    }
+    if (duvDesc.y > 0) { // length
+        writeDuv(current, duv);
+    }
+    if (dvvDesc.y > 0) {
+        writeDvv(current, dvv);
+    }
+#endif
 }

 #endif
--- a/opensubdiv/osd/glslPatchCommon.glsl
+++ b/opensubdiv/osd/glslPatchCommon.glsl
@ -304,7 +304,7 @@ uniform samplerBuffer OsdFVarDataBuffer;
    }

 // ------ extract from triangles (loop) ---------
-// XXX: no interpolation supproted
+// XXX: no interpolation supported

 #define OSD_COMPUTE_FACE_VARYING_TRI_1(result, fvarOffset, triVert)     \
    {                                                                   \
@ -575,7 +575,7 @@ OsdComputeBSplineBoundaryPoints(inout vec3 cpt[16], ivec3 patchParam)
 // (labeled vv* and ev* respectively).
 //
 // The two segments of each transition edge are labeled Lo and Hi,
-// with the Lo segment occuring before the Hi segment along the
+// with the Lo segment occurring before the Hi segment along the
 // transition edge's domain parameterization. These Lo and Hi segment
 // tessellation levels determine how domain evaluation coordinates
 // are remapped along transition edges. The Hi segment value will
@ -632,7 +632,7 @@ float OsdComputeTessLevel(vec3 p0, vec3 p1)

    // We restrict adaptive tessellation levels to half of the device
    // supported maximum because transition edges are split into two
-    // halfs and the sum of the two corresponding levels must not exceed
+    // halves and the sum of the two corresponding levels must not exceed
    // the device maximum. We impose this limit even for non-transition
    // edges because a non-transition edge must be able to match up with
    // one half of the transition edge of an adjacent transition patch.
--- a/opensubdiv/osd/glslXFBKernel.glsl
+++ b/opensubdiv/osd/glslXFBKernel.glsl
@ -63,7 +63,22 @@ void writeVertex(Vertex v) {

 //------------------------------------------------------------------------------

-#if defined(OPENSUBDIV_GLSL_XFB_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_XFB_USE_1ST_DERIVATIVES) && \
+    defined(OPENSUBDIV_GLSL_XFB_INTERLEAVED_1ST_DERIVATIVE_BUFFERS)
+out float outDeriv1Buffer[2*LENGTH];  // single output holding both first derivatives: du in [0, LENGTH), dv in [LENGTH, 2*LENGTH)
+
+void writeDu(Vertex v) {  // fill the first LENGTH slots of outDeriv1Buffer from v
+    for(int i = 0; i < LENGTH; i++) {
+        outDeriv1Buffer[i] = v.vertexData[i];
+    }
+}
+
+void writeDv(Vertex v) {  // fill the second LENGTH slots of outDeriv1Buffer from v
+    for(int i = 0; i < LENGTH; i++) {
+        outDeriv1Buffer[i+LENGTH] = v.vertexData[i];  // shifted by LENGTH so dv lands after du
+    }
+}
+#elif defined(OPENSUBDIV_GLSL_XFB_USE_1ST_DERIVATIVES)
 out float outDuBuffer[LENGTH];
 out float outDvBuffer[LENGTH];

@ -80,6 +95,51 @@ void writeDv(Vertex v) {
 }
 #endif

+#if defined(OPENSUBDIV_GLSL_XFB_USE_2ND_DERIVATIVES) && \
+    defined(OPENSUBDIV_GLSL_XFB_INTERLEAVED_2ND_DERIVATIVE_BUFFERS)
+out float outDeriv2Buffer[3*LENGTH];  // interleaved variant: duu in [0, LENGTH), duv in [LENGTH, 2*LENGTH), dvv in [2*LENGTH, 3*LENGTH)
+
+void writeDuu(Vertex v) {  // fill the first LENGTH slots of outDeriv2Buffer from v
+    for(int i = 0; i < LENGTH; i++) {
+        outDeriv2Buffer[i] = v.vertexData[i];
+    }
+}
+
+void writeDuv(Vertex v) {  // fill the second LENGTH slots of outDeriv2Buffer from v
+    for(int i = 0; i < LENGTH; i++) {
+        outDeriv2Buffer[i+LENGTH] = v.vertexData[i];
+    }
+}
+
+void writeDvv(Vertex v) {  // fill the third LENGTH slots of outDeriv2Buffer from v
+    for(int i = 0; i < LENGTH; i++) {
+        outDeriv2Buffer[i+2*LENGTH] = v.vertexData[i];
+    }
+}
+#elif defined(OPENSUBDIV_GLSL_XFB_USE_2ND_DERIVATIVES)  // separate variant: one LENGTH-sized output per second derivative
+out float outDuuBuffer[LENGTH];
+out float outDuvBuffer[LENGTH];
+out float outDvvBuffer[LENGTH];
+
+void writeDuu(Vertex v) {  // copy v into outDuuBuffer
+    for(int i = 0; i < LENGTH; i++) {
+        outDuuBuffer[i] = v.vertexData[i];
+    }
+}
+
+void writeDuv(Vertex v) {  // copy v into outDuvBuffer
+    for(int i = 0; i < LENGTH; i++) {
+        outDuvBuffer[i] = v.vertexData[i];
+    }
+}
+
+void writeDvv(Vertex v) {  // copy v into outDvvBuffer
+    for(int i = 0; i < LENGTH; i++) {
+        outDvvBuffer[i] = v.vertexData[i];
+    }
+}
+#endif
+
 //------------------------------------------------------------------------------

 #if defined(OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_STENCILS)
@ -89,11 +149,17 @@ uniform isamplerBuffer offsets;
 uniform isamplerBuffer indices;
 uniform samplerBuffer  weights;

-#if defined(OPENSUBDIV_GLSL_XFB_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_XFB_USE_1ST_DERIVATIVES)
 uniform samplerBuffer  duWeights;
 uniform samplerBuffer  dvWeights;
 #endif

+#if defined(OPENSUBDIV_GLSL_XFB_USE_2ND_DERIVATIVES)
+uniform samplerBuffer  duuWeights;
+uniform samplerBuffer  duvWeights;
+uniform samplerBuffer  dvvWeights;
+#endif
+
 uniform int batchStart = 0;
 uniform int batchEnd = 0;

@ -104,10 +170,13 @@ void main() {
        return;
    }

-    Vertex dst, du, dv;
+    Vertex dst, du, dv, duu, duv, dvv;
    clear(dst);
    clear(du);
    clear(dv);
+    clear(duu);
+    clear(duv);
+    clear(dvv);

    int offset = texelFetch(offsets, current).x;
    uint size = texelFetch(sizes, current).x;
@ -117,19 +186,32 @@ void main() {
        float weight = texelFetch(weights, offset+stencil).x;
        addWithWeight(dst, readVertex( index ), weight);

-#if defined(OPENSUBDIV_GLSL_XFB_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_XFB_USE_1ST_DERIVATIVES)
        float duWeight = texelFetch(duWeights, offset+stencil).x;
        float dvWeight = texelFetch(dvWeights, offset+stencil).x;
        addWithWeight(du,  readVertex(index), duWeight);
        addWithWeight(dv,  readVertex(index), dvWeight);
+#endif
+#if defined(OPENSUBDIV_GLSL_XFB_USE_2ND_DERIVATIVES)
+        float duuWeight = texelFetch(duuWeights, offset+stencil).x;
+        float duvWeight = texelFetch(duvWeights, offset+stencil).x;
+        float dvvWeight = texelFetch(dvvWeights, offset+stencil).x;
+        addWithWeight(duu,  readVertex(index), duuWeight);
+        addWithWeight(duv,  readVertex(index), duvWeight);
+        addWithWeight(dvv,  readVertex(index), dvvWeight);
 #endif
    }
    writeVertex(dst);

-#if defined(OPENSUBDIV_GLSL_XFB_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_XFB_USE_1ST_DERIVATIVES)
    writeDu(du);
    writeDv(dv);
 #endif
+#if defined(OPENSUBDIV_GLSL_XFB_USE_2ND_DERIVATIVES)
+    writeDuu(duu);
+    writeDuv(duv);
+    writeDvv(dvv);
+#endif
 }

 #endif
@ -213,31 +295,43 @@ void main() {
    int numControlVertices = 0;
    if (patchType == 3) {
        float wP4[4], wDs4[4], wDt4[4], wDss4[4], wDst4[4], wDtt4[4];
-        OsdGetBilinearPatchWeights(coord.s, coord.t, dScale, wP4, wDs4, wDt4, wDss4, wDst4, wDtt4);
+        OsdGetBilinearPatchWeights(coord.s, coord.t, dScale, wP4,
+                                   wDs4, wDt4, wDss4, wDst4, wDtt4);
        numControlVertices = 4;
        for (int i=0; i<numControlVertices; ++i) {
            wP[i] = wP4[i];
            wDs[i] = wDs4[i];
            wDt[i] = wDt4[i];
+            wDss[i] = wDss4[i];
+            wDst[i] = wDst4[i];
+            wDtt[i] = wDtt4[i];
        }
    } else if (patchType == 6) {
        float wP16[16], wDs16[16], wDt16[16], wDss16[16], wDst16[16], wDtt16[16];
-        OsdGetBSplinePatchWeights(coord.s, coord.t, dScale, boundary, wP16, wDs16, wDt16, wDss16, wDst16, wDtt16);
+        OsdGetBSplinePatchWeights(coord.s, coord.t, dScale, boundary, wP16,
+                                  wDs16, wDt16, wDss16, wDst16, wDtt16);
        numControlVertices = 16;
        for (int i=0; i<numControlVertices; ++i) {
            wP[i] = wP16[i];
            wDs[i] = wDs16[i];
            wDt[i] = wDt16[i];
+            wDss[i] = wDss16[i];
+            wDst[i] = wDst16[i];
+            wDtt[i] = wDtt16[i];
        }
    } else if (patchType == 9) {
-        OsdGetGregoryPatchWeights(coord.s, coord.t, dScale, wP, wDs, wDt, wDss, wDst, wDtt);
+        OsdGetGregoryPatchWeights(coord.s, coord.t, dScale, wP,
+                                  wDs, wDt, wDss, wDst, wDtt);
        numControlVertices = 20;
    }

-    Vertex dst, du, dv;
+    Vertex dst, du, dv, duu, duv, dvv;
    clear(dst);
    clear(du);
    clear(dv);
+    clear(duu);
+    clear(duv);
+    clear(dvv);

    int indexStride = getNumControlVertices(array.x);
    int indexBase = array.z + indexStride * (patchIndex - array.w);
@ -247,15 +341,22 @@ void main() {
        addWithWeight(dst, readVertex(index), wP[cv]);
        addWithWeight(du,  readVertex(index), wDs[cv]);
        addWithWeight(dv,  readVertex(index), wDt[cv]);
+        addWithWeight(duu, readVertex(index), wDss[cv]);
+        addWithWeight(duv, readVertex(index), wDst[cv]);
+        addWithWeight(dvv, readVertex(index), wDtt[cv]);
    }

    writeVertex(dst);

-#if defined(OPENSUBDIV_GLSL_XFB_USE_DERIVATIVES)
+#if defined(OPENSUBDIV_GLSL_XFB_USE_1ST_DERIVATIVES)
    writeDu(du);
    writeDv(dv);
 #endif
-
+#if defined(OPENSUBDIV_GLSL_XFB_USE_2ND_DERIVATIVES)
+    writeDuu(duu);
+    writeDuv(duv);
+    writeDvv(dvv);
+#endif
 }

 #endif
--- a/opensubdiv/osd/hlslPatchCommon.hlsl
+++ b/opensubdiv/osd/hlslPatchCommon.hlsl
@ -448,7 +448,7 @@ OsdComputeBSplineBoundaryPoints(inout float3 cpt[16], int3 patchParam)
 // (labeled vv* and ev* respectively).
 //
 // The two segments of each transition edge are labeled Lo and Hi,
-// with the Lo segment occuring before the Hi segment along the
+// with the Lo segment occurring before the Hi segment along the
 // transition edge's domain parameterization. These Lo and Hi segment
 // tessellation levels determine how domain evaluation coordinates
 // are remapped along transition edges. The Hi segment value will
@ -505,7 +505,7 @@ float OsdComputeTessLevel(float3 p0, float3 p1)

    // We restrict adaptive tessellation levels to half of the device
    // supported maximum because transition edges are split into two
-    // halfs and the sum of the two corresponding levels must not exceed
+    // halves and the sum of the two corresponding levels must not exceed
    // the device maximum. We impose this limit even for non-transition
    // edges because a non-transition edge must be able to match up with
    // one half of the transition edge of an adjacent transition patch.
--- a/opensubdiv/osd/mesh.h
+++ b/opensubdiv/osd/mesh.h
@ -51,12 +51,13 @@ enum MeshBits {
    MeshInterleaveVarying    = 1,
    MeshFVarData             = 2,
    MeshFVarAdaptive         = 3,
-    MeshUseSingleCreasePatch = 4,
-    MeshUseInfSharpPatch     = 5,
-    MeshEndCapBSplineBasis   = 6,  // exclusive
-    MeshEndCapGregoryBasis   = 7,  // exclusive
-    MeshEndCapLegacyGregory  = 8,  // exclusive
-    NUM_MESH_BITS            = 9,
+    MeshUseSmoothCornerPatch = 4,
+    MeshUseSingleCreasePatch = 5,
+    MeshUseInfSharpPatch     = 6,
+    MeshEndCapBSplineBasis   = 7,  // exclusive
+    MeshEndCapGregoryBasis   = 8,  // exclusive
+    MeshEndCapLegacyGregory  = 9,  // exclusive
+    NUM_MESH_BITS            = 10,
 };
 typedef std::bitset<NUM_MESH_BITS> MeshBitset;

@ -175,7 +176,7 @@ convertToCompatibleStencilTable<Far::StencilTable, Far::StencilTable, ID3D11Devi
 // ---------------------------------------------------------------------------

 // Osd evaluator cache: for the GPU backends require compiled instance
-//   (GLXFB, GLCompue, CL)
+//   (GLXFB, GLCompute, CL)
 //
 // note: this is just an example usage and client applications are supposed
 //       to implement their own structure for Evaluator instance.
@ -197,8 +198,27 @@ public:
              BufferDescriptor const &duDescArg,
              BufferDescriptor const &dvDescArg,
              EVALUATOR *evalArg) : srcDesc(srcDescArg), dstDesc(dstDescArg),
-                              duDesc(duDescArg), dvDesc(dvDescArg), evaluator(evalArg) {}
-        BufferDescriptor srcDesc, dstDesc, duDesc, dvDesc;
+                              duDesc(duDescArg), dvDesc(dvDescArg),
+                              duuDesc(BufferDescriptor()),
+                              duvDesc(BufferDescriptor()),
+                              dvvDesc(BufferDescriptor()),
+                              evaluator(evalArg) {}
+        Entry(BufferDescriptor const &srcDescArg,  // constructor taking all seven descriptors (src, dst, du, dv, duu, duv, dvv) plus the evaluator
+              BufferDescriptor const &dstDescArg,
+              BufferDescriptor const &duDescArg,
+              BufferDescriptor const &dvDescArg,
+              BufferDescriptor const &duuDescArg,
+              BufferDescriptor const &duvDescArg,
+              BufferDescriptor const &dvvDescArg,
+              EVALUATOR *evalArg) : srcDesc(srcDescArg), dstDesc(dstDescArg),  // each argument is copied into the member of the same name
+                              duDesc(duDescArg), dvDesc(dvDescArg),
+                              duuDesc(duuDescArg),
+                              duvDesc(duvDescArg),
+                              dvvDesc(dvvDescArg),
+                              evaluator(evalArg) {}  // only the pointer is stored here
+        BufferDescriptor srcDesc, dstDesc;
+        BufferDescriptor duDesc, dvDesc;
+        BufferDescriptor duuDesc, duvDesc, dvvDesc;
        EVALUATOR *evaluator;
    };
    typedef std::vector<Entry> Evaluators;
@ -208,6 +228,9 @@ public:
                            BufferDescriptor const &dstDesc,
                            DEVICE_CONTEXT *deviceContext) {
        return GetEvaluator(srcDesc, dstDesc,
+                            BufferDescriptor(),
+                            BufferDescriptor(),
+                            BufferDescriptor(),
                            BufferDescriptor(),
                            BufferDescriptor(),
                            deviceContext);
@ -219,20 +242,43 @@ public:
                            BufferDescriptor const &duDesc,
                            BufferDescriptor const &dvDesc,
                            DEVICE_CONTEXT *deviceContext) {
+        return GetEvaluator(srcDesc, dstDesc,
+                            duDesc, dvDesc,
+                            BufferDescriptor(),
+                            BufferDescriptor(),
+                            BufferDescriptor(),
+                            deviceContext);
+    }
+
+    template <typename DEVICE_CONTEXT>
+    EVALUATOR *GetEvaluator(BufferDescriptor const &srcDesc,  // return the cached evaluator whose seven descriptors all match, creating and caching one on a miss
+                            BufferDescriptor const &dstDesc,
+                            BufferDescriptor const &duDesc,
+                            BufferDescriptor const &dvDesc,
+                            BufferDescriptor const &duuDesc,
+                            BufferDescriptor const &duvDesc,
+                            BufferDescriptor const &dvvDesc,
+                            DEVICE_CONTEXT *deviceContext) {  // deviceContext is only forwarded to EVALUATOR::Create

        for(typename Evaluators::iterator it = _evaluators.begin();  // linear scan over every cached entry
            it != _evaluators.end(); ++it) {
            if (isEqual(srcDesc, it->srcDesc) &&  // a hit requires isEqual to hold for every descriptor pair
                isEqual(dstDesc, it->dstDesc) &&
                isEqual(duDesc,  it->duDesc) &&
-                isEqual(dvDesc, it->dvDesc)) {
+                isEqual(dvDesc,  it->dvDesc) &&
+                isEqual(duuDesc, it->duuDesc) &&
+                isEqual(duvDesc, it->duvDesc) &&
+                isEqual(dvvDesc, it->dvvDesc)) {
                return it->evaluator;
            }
        }
        EVALUATOR *e = EVALUATOR::Create(srcDesc, dstDesc,  // cache miss: build a new evaluator for this descriptor set
                                         duDesc, dvDesc,
+                                         duuDesc, duvDesc, dvvDesc,
                                         deviceContext);
-        _evaluators.push_back(Entry(srcDesc, dstDesc, duDesc, dvDesc, e));
+        _evaluators.push_back(Entry(srcDesc, dstDesc,  // stored without checking e, so whatever Create returned is what later lookups get
+                                    duDesc, dvDesc,
+                                    duuDesc, duvDesc, dvvDesc, e));
        return e;
    }

@ -272,6 +318,25 @@ struct enable_if<false, T> { };
 /// @endcond

 // extract a kernel from cache if available
+template <typename EVALUATOR, typename DEVICE_CONTEXT>
+static EVALUATOR *GetEvaluator(
+    EvaluatorCacheT<EVALUATOR> *cache,
+    BufferDescriptor const &srcDesc,
+    BufferDescriptor const &dstDesc,
+    BufferDescriptor const &duDesc,
+    BufferDescriptor const &dvDesc,
+    BufferDescriptor const &duuDesc,
+    BufferDescriptor const &duvDesc,
+    BufferDescriptor const &dvvDesc,
+    DEVICE_CONTEXT deviceContext,
+    typename enable_if<instantiatable<EVALUATOR>::value, void>::type*t=0) {
+    (void)t;
+    if (cache == NULL) return NULL;
+    return cache->GetEvaluator(srcDesc, dstDesc,
+                               duDesc, dvDesc, duuDesc, duvDesc, dvvDesc,
+                               deviceContext);
+}
+
 template <typename EVALUATOR, typename DEVICE_CONTEXT>
 static EVALUATOR *GetEvaluator(
    EvaluatorCacheT<EVALUATOR> *cache,
@ -302,6 +367,22 @@ static EVALUATOR *GetEvaluator(
 }

 // fallback
+template <typename EVALUATOR, typename DEVICE_CONTEXT>
+static EVALUATOR *GetEvaluator(
+    EvaluatorCacheT<EVALUATOR> *,
+    BufferDescriptor const &,
+    BufferDescriptor const &,
+    BufferDescriptor const &,
+    BufferDescriptor const &,
+    BufferDescriptor const &,
+    BufferDescriptor const &,
+    BufferDescriptor const &,
+    DEVICE_CONTEXT,
+    typename enable_if<!instantiatable<EVALUATOR>::value, void>::type*t=0) {
+    (void)t;
+    return NULL;
+}
+
 template <typename EVALUATOR, typename DEVICE_CONTEXT>
 static EVALUATOR *GetEvaluator(
    EvaluatorCacheT<EVALUATOR> *,
@ -529,6 +610,7 @@ private:
        Far::PatchTableFactory::Options poptions(level);
        poptions.generateFVarTables = bits.test(MeshFVarData);
        poptions.generateFVarLegacyLinearPatches = !bits.test(MeshFVarAdaptive);
+        poptions.generateLegacySharpCornerPatches = !bits.test(MeshUseSmoothCornerPatch);
        poptions.useSingleCreasePatch = bits.test(MeshUseSingleCreasePatch);
        poptions.useInfSharpPatch = bits.test(MeshUseInfSharpPatch);

--- a/opensubdiv/osd/ompEvaluator.cpp
+++ b/opensubdiv/osd/ompEvaluator.cpp
@ -84,6 +84,50 @@ OmpEvaluator::EvalStencils(
    return true;
 }

+/* static */
+bool
+OmpEvaluator::EvalStencils(
+    const float *src, BufferDescriptor const &srcDesc,
+    float *dst,       BufferDescriptor const &dstDesc,
+    float *du,        BufferDescriptor const &duDesc,
+    float *dv,        BufferDescriptor const &dvDesc,
+    float *duu,       BufferDescriptor const &duuDesc,
+    float *duv,       BufferDescriptor const &duvDesc,
+    float *dvv,       BufferDescriptor const &dvvDesc,
+    const int * sizes,
+    const int * offsets,
+    const int * indices,
+    const float * weights,
+    const float * duWeights,
+    const float * dvWeights,
+    const float * duuWeights,
+    const float * duvWeights,
+    const float * dvvWeights,
+    int start, int end) {
+
+    if (end <= start) return true;
+    if (srcDesc.length != dstDesc.length) return false;
+    if (srcDesc.length != duDesc.length) return false;
+    if (srcDesc.length != dvDesc.length) return false;
+    if (srcDesc.length != duuDesc.length) return false;
+    if (srcDesc.length != duvDesc.length) return false;
+    if (srcDesc.length != dvvDesc.length) return false;
+
+    OmpEvalStencils(src, srcDesc,
+                    dst, dstDesc,
+                    du,  duDesc,
+                    dv,  dvDesc,
+                    duu, duuDesc,
+                    duv, duvDesc,
+                    dvv, dvvDesc,
+                    sizes, offsets, indices,
+                    weights, duWeights, dvWeights,
+                    duuWeights, duvWeights, dvvWeights,
+                    start, end);
+
+    return true;
+}
+
 template <typename T>
 struct BufferAdapter {
    BufferAdapter(T *p, int length, int stride) :
@ -197,7 +241,7 @@ OmpEvaluator::EvalPatches(

 #pragma omp parallel for
    for (int i = 0; i < numPatchCoords; ++i) {
-        float wP[20], wDs[20], wDt[20];
+        float wP[20], wDu[20], wDv[20];
        BufferAdapter<float> dstT(dst + dstDesc.stride*i, dstDesc.length, dstDesc.stride);
        BufferAdapter<float> duT(du   + duDesc.stride*i, duDesc.length, duDesc.stride);
        BufferAdapter<float> dvT(dv   + dvDesc.stride*i, dvDesc.length, dvDesc.stride);
@ -214,15 +258,15 @@ OmpEvaluator::EvalPatches(
        int numControlVertices = 0;
        if (patchType == Far::PatchDescriptor::REGULAR) {
            Far::internal::GetBSplineWeights(param,
-                                             coord.s, coord.t, wP, wDs, wDt);
+                                             coord.s, coord.t, wP, wDu, wDv);
            numControlVertices = 16;
        } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
            Far::internal::GetGregoryWeights(param,
-                                             coord.s, coord.t, wP, wDs, wDt);
+                                             coord.s, coord.t, wP, wDu, wDv);
            numControlVertices = 20;
        } else if (patchType == Far::PatchDescriptor::QUADS) {
            Far::internal::GetBilinearWeights(param,
-                                              coord.s, coord.t, wP, wDs, wDt);
+                                              coord.s, coord.t, wP, wDu, wDv);
            numControlVertices = 4;
        } else {
            continue;
@ -239,8 +283,8 @@ OmpEvaluator::EvalPatches(
        dvT.Clear();
        for (int j = 0; j < numControlVertices; ++j) {
            dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
-            duT.AddWithWeight(srcT[cvs[j]], wDs[j]);
-            dvT.AddWithWeight(srcT[cvs[j]], wDt[j]);
+            duT.AddWithWeight(srcT[cvs[j]], wDu[j]);
+            dvT.AddWithWeight(srcT[cvs[j]], wDv[j]);
        }
        ++dstT;
        ++duT;
@ -249,6 +293,101 @@ OmpEvaluator::EvalPatches(
    return true;
 }

+/* static */
+bool
+OmpEvaluator::EvalPatches(
+    const float *src, BufferDescriptor const &srcDesc,
+    float *dst,       BufferDescriptor const &dstDesc,
+    float *du,        BufferDescriptor const &duDesc,
+    float *dv,        BufferDescriptor const &dvDesc,
+    float *duu,       BufferDescriptor const &duuDesc,
+    float *duv,       BufferDescriptor const &duvDesc,
+    float *dvv,       BufferDescriptor const &dvvDesc,
+    int numPatchCoords,
+    PatchCoord const *patchCoords,
+    PatchArray const *patchArrays,
+    const int *patchIndexBuffer,
+    PatchParam const *patchParamBuffer) {
+
+    src += srcDesc.offset;
+    if (dst) dst += dstDesc.offset;
+    if (du)  du += duDesc.offset;
+    if (dv)  dv += dvDesc.offset;
+    if (duu) duu += duuDesc.offset;
+    if (duv) duv += duvDesc.offset;
+    if (dvv) dvv += dvvDesc.offset;
+
+    BufferAdapter<const float> srcT(src, srcDesc.length, srcDesc.stride);
+
+#pragma omp parallel for
+    for (int i = 0; i < numPatchCoords; ++i) {
+        float wP[20], wDu[20], wDv[20], wDuu[20], wDuv[20], wDvv[20];
+        BufferAdapter<float> dstT(dst + dstDesc.stride*i, dstDesc.length, dstDesc.stride);
+        BufferAdapter<float> duT(du   + duDesc.stride*i, duDesc.length, duDesc.stride);
+        BufferAdapter<float> dvT(dv   + dvDesc.stride*i, dvDesc.length, dvDesc.stride);
+        BufferAdapter<float> duuT(duu + duuDesc.stride*i, duuDesc.length, duuDesc.stride);
+        BufferAdapter<float> duvT(duv + duvDesc.stride*i, duvDesc.length, duvDesc.stride);
+        BufferAdapter<float> dvvT(dvv + dvvDesc.stride*i, dvvDesc.length, dvvDesc.stride);
+
+        PatchCoord const &coord = patchCoords[i];
+        PatchArray const &array = patchArrays[coord.handle.arrayIndex];
+
+        Far::PatchParam const & param =
+            patchParamBuffer[coord.handle.patchIndex];
+        int patchType = param.IsRegular()
+            ? Far::PatchDescriptor::REGULAR
+            : array.GetPatchType();
+
+        int numControlVertices = 0;
+        if (patchType == Far::PatchDescriptor::REGULAR) {
+            Far::internal::GetBSplineWeights(param,
+                                             coord.s, coord.t, wP,
+                                             wDu, wDv, wDuu, wDuv, wDvv);
+            numControlVertices = 16;
+        } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+            Far::internal::GetGregoryWeights(param,
+                                             coord.s, coord.t, wP,
+                                             wDu, wDv, wDuu, wDuv, wDvv);
+            numControlVertices = 20;
+        } else if (patchType == Far::PatchDescriptor::QUADS) {
+            Far::internal::GetBilinearWeights(param,
+                                              coord.s, coord.t, wP,
+                                              wDu, wDv, wDuu, wDuv, wDvv);
+            numControlVertices = 4;
+        } else {
+            continue;
+        }
+
+        int indexStride = Far::PatchDescriptor(array.GetPatchType()).GetNumControlVertices();
+        int indexBase = array.GetIndexBase() + indexStride *
+                (coord.handle.patchIndex - array.GetPrimitiveIdBase());
+
+        const int *cvs = &patchIndexBuffer[indexBase];
+
+        dstT.Clear();
+        duT.Clear();
+        dvT.Clear();
+        duuT.Clear();
+        duvT.Clear();
+        dvvT.Clear();
+        for (int j = 0; j < numControlVertices; ++j) {
+            dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
+            duT.AddWithWeight(srcT[cvs[j]], wDu[j]);
+            dvT.AddWithWeight(srcT[cvs[j]], wDv[j]);
+            duuT.AddWithWeight(srcT[cvs[j]], wDuu[j]);
+            duvT.AddWithWeight(srcT[cvs[j]], wDuv[j]);
+            dvvT.AddWithWeight(srcT[cvs[j]], wDvv[j]);
+        }
+        ++dstT;
+        ++duT;
+        ++dvT;
+        ++duuT;
+        ++duvT;
+        ++dvvT;
+    }
+    return true;
+}
+

 /* static */
 void
--- a/opensubdiv/osd/ompEvaluator.h
+++ b/opensubdiv/osd/ompEvaluator.h
@ -26,11 +26,11 @@
 #define OPENSUBDIV3_OSD_OMP_EVALUATOR_H

 #include "../version.h"
-
-#include <cstddef>
 #include "../osd/bufferDescriptor.h"
 #include "../osd/types.h"

+#include <cstddef>
+
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {

@ -107,7 +107,6 @@ public:
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
    /// @param sizes          pointer to the sizes buffer of the stencil table
-    ///                       to apply for the range [start, end)
    ///
    /// @param offsets        pointer to the offsets buffer of the stencil table
    ///
@ -145,17 +144,17 @@ public:
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
-    /// @param duBuffer       Output U-derivative buffer
+    /// @param duBuffer       Output buffer derivative wrt u
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
-    /// @param duDesc         vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
    ///
-    /// @param dvBuffer       Output V-derivative buffer
+    /// @param dvBuffer       Output buffer derivative wrt v
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
-    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
    ///
    /// @param stencilTable   Far::StencilTable or equivalent
    ///
@ -206,15 +205,15 @@ public:
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
-    /// @param du             Output U-derivatives pointer. An offset of
+    /// @param du             Output pointer derivative wrt u. An offset of
    ///                       duDesc will be applied internally.
    ///
-    /// @param duDesc         vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
    ///
-    /// @param dv             Output V-derivatives pointer. An offset of
+    /// @param dv             Output pointer derivative wrt v. An offset of
    ///                       dvDesc will be applied internally.
    ///
-    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
    ///
    /// @param sizes          pointer to the sizes buffer of the stencil table
    ///
@ -245,6 +244,177 @@ public:
        const float * dvWeights,
        int start, int end);

+    /// \brief Generic static eval stencils function with derivatives.
+    ///        This function has the same signature as other device
+    ///        kernels so that it can be called in the same way from the
+    ///        OsdMesh template interface.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer       Output buffer derivative wrt u
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer       Output buffer derivative wrt v
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer      Output buffer 2nd derivative wrt u
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duuDesc        vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer      Output buffer 2nd derivative wrt u and v
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duvDesc        vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer      Output buffer 2nd derivative wrt v
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvvDesc        vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param stencilTable   Far::StencilTable or equivalent
+    ///
+    /// @param instance       not used in the omp kernel
+    ///                       (declared as a typed pointer to prevent
+    ///                        undesirable template resolution)
+    ///
+    /// @param deviceContext  not used in the omp kernel
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        STENCIL_TABLE const *stencilTable,
+        const OmpEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
+                            duBuffer->BindCpuBuffer(),  duDesc,
+                            dvBuffer->BindCpuBuffer(),  dvDesc,
+                            duuBuffer->BindCpuBuffer(), duuDesc,
+                            duvBuffer->BindCpuBuffer(), duvDesc,
+                            dvvBuffer->BindCpuBuffer(), dvvDesc,
+                            &stencilTable->GetSizes()[0],
+                            &stencilTable->GetOffsets()[0],
+                            &stencilTable->GetControlIndices()[0],
+                            &stencilTable->GetWeights()[0],
+                            &stencilTable->GetDuWeights()[0],
+                            &stencilTable->GetDvWeights()[0],
+                            &stencilTable->GetDuuWeights()[0],
+                            &stencilTable->GetDuvWeights()[0],
+                            &stencilTable->GetDvvWeights()[0],
+                            /*start = */ 0,
+                            /*end   = */ stencilTable->GetNumStencils());
+    }
+
+    /// \brief Static eval stencils function with derivatives, which takes
+    ///        raw CPU pointers for input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param du             Output pointer derivative wrt u. An offset of
+    ///                       duDesc will be applied internally.
+    ///
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dv             Output pointer derivative wrt v. An offset of
+    ///                       dvDesc will be applied internally.
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duu            Output pointer 2nd derivative wrt u. An offset of
+    ///                       duuDesc will be applied internally.
+    ///
+    /// @param duuDesc        vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duv            Output pointer 2nd derivative wrt u and v. An offset of
+    ///                       duvDesc will be applied internally.
+    ///
+    /// @param duvDesc        vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvv            Output pointer 2nd derivative wrt v. An offset of
+    ///                       dvvDesc will be applied internally.
+    ///
+    /// @param dvvDesc        vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param duWeights      pointer to the du-weights buffer of the stencil table
+    ///
+    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
+    ///
+    /// @param duuWeights     pointer to the duu-weights buffer of the stencil table
+    ///
+    /// @param duvWeights     pointer to the duv-weights buffer of the stencil table
+    ///
+    /// @param dvvWeights     pointer to the dvv-weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        float *duu,       BufferDescriptor const &duuDesc,
+        float *duv,       BufferDescriptor const &duvDesc,
+        float *dvv,       BufferDescriptor const &dvvDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        const float * duWeights,
+        const float * dvWeights,
+        const float * duuWeights,
+        const float * duvWeights,
+        const float * dvvWeights,
+        int start, int end);
+
    /// ----------------------------------------------------------------------
    ///
    ///   Limit evaluations with PatchTable
@ -318,13 +488,13 @@ public:
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
-    /// @param duBuffer         Output U-derivatives buffer
+    /// @param duBuffer         Output buffer derivative wrt u
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
-    /// @param dvBuffer         Output V-derivatives buffer
+    /// @param dvBuffer         Output buffer derivative wrt v
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
@ -354,6 +524,7 @@ public:
        PATCH_TABLE *patchTable,
        OmpEvaluator const *instance = NULL,
        void * deviceContext = NULL) {
+
        (void)instance;       // unused
        (void)deviceContext;  // unused

@ -373,6 +544,102 @@ public:
                           patchTable->GetPatchParamBuffer());
    }

+    /// \brief Generic limit eval function with derivatives. This function has
+    ///        the same signature as other device kernels so that it can be
+    ///        called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the omp evaluator
+    ///
+    /// @param deviceContext    not used in the omp evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        OmpEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        // XXX: PatchCoords is somewhat abusing vertex primvar buffer interop.
+        //      Ideally all buffer classes should be templated by datatype
+        //      so that the downcast isn't needed here.
+        //      (e.g. Osd::CpuBuffer<PatchCoord> )
+        //
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           duuBuffer->BindCpuBuffer(), duuDesc,
+                           duvBuffer->BindCpuBuffer(), duvDesc,
+                           dvvBuffer->BindCpuBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
    /// \brief Static limit eval function. It takes an array of PatchCoord
    ///        and evaluate limit values on given PatchTable.
    ///
@ -423,15 +690,15 @@ public:
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
-    /// @param du               Output U-derivatives pointer. An offset of
+    /// @param du               Output pointer derivative wrt u. An offset of
    ///                         duDesc will be applied internally.
    ///
-    /// @param duDesc           vertex buffer descriptor for the du buffer
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
-    /// @param dv               Output V-derivatives pointer. An offset of
+    /// @param dv               Output pointer derivative wrt v. An offset of
    ///                         dvDesc will be applied internally.
    ///
-    /// @param dvDesc           vertex buffer descriptor for the dv buffer
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
    ///
    /// @param numPatchCoords   number of patchCoords.
    ///
@ -457,6 +724,72 @@ public:
        const int *patchIndexBuffer,
        PatchParam const *patchParamBuffer);

+    /// \brief Static limit eval function with derivatives. It takes an array
+    ///        of PatchCoord and evaluates limit values on the given PatchTable.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param du               Output pointer derivative wrt u. An offset of
+    ///                         duDesc will be applied internally.
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dv               Output pointer derivative wrt v. An offset of
+    ///                         dvDesc will be applied internally.
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duu              Output pointer 2nd derivative wrt u. An offset of
+    ///                         duuDesc will be applied internally.
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duv              Output pointer 2nd derivative wrt u and v. An offset of
+    ///                         duvDesc will be applied internally.
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvv              Output pointer 2nd derivative wrt v. An offset of
+    ///                         dvvDesc will be applied internally.
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        float *duu,       BufferDescriptor const &duuDesc,
+        float *duv,       BufferDescriptor const &duvDesc,
+        float *dvv,       BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PatchCoord const *patchCoords,
+        PatchArray const *patchArrays,
+        const int *patchIndexBuffer,
+        PatchParam const *patchParamBuffer);
+
    /// \brief Generic limit eval function. This function has a same
    ///        signature as other device kernels have so that it can be called
    ///        in the same way.
@ -508,6 +841,164 @@ public:
                           patchTable->GetPatchParamBuffer());
    }

+    /// \brief Generic limit eval function with derivatives for varying
+    ///        patches. This function has the same signature as other device
+    ///        kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the omp evaluator
+    ///
+    /// @param deviceContext    not used in the omp evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        OmpEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+        // Delegate to EvalPatches using the varying patch array/index buffers.
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function with 1st and 2nd derivatives for
+    ///        varying patches. This function has the same signature as other
+    ///        device kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the omp evaluator
+    ///
+    /// @param deviceContext    not used in the omp evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        OmpEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+        // Delegate to EvalPatches using the varying patch array/index buffers.
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           duuBuffer->BindCpuBuffer(), duuDesc,
+                           duvBuffer->BindCpuBuffer(), duvDesc,
+                           dvvBuffer->BindCpuBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
    /// \brief Generic limit eval function. This function has a same
    ///        signature as other device kernels have so that it can be called
    ///        in the same way.
@ -562,6 +1053,170 @@ public:
                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
    }

+    /// \brief Generic limit eval function with derivatives for face-varying
+    ///        patches. This function has the same signature as other device
+    ///        kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the omp evaluator
+    ///
+    /// @param deviceContext    not used in the omp evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        OmpEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+        // Delegate to EvalPatches using this channel's face-varying buffers.
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
+    /// \brief Generic limit eval function with 1st and 2nd derivatives for
+    ///        face-varying patches. This function has the same signature as
+    ///        other device kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the omp evaluator
+    ///
+    /// @param deviceContext    not used in the omp evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        OmpEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+        // Delegate to EvalPatches using this channel's face-varying buffers.
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           duuBuffer->BindCpuBuffer(), duuDesc,
+                           duvBuffer->BindCpuBuffer(), duvDesc,
+                           dvvBuffer->BindCpuBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
--- a/opensubdiv/osd/ompKernel.cpp
+++ b/opensubdiv/osd/ompKernel.cpp
@ -177,6 +177,99 @@ OmpEvalStencils(float const * src, BufferDescriptor const &srcDesc,

 }

+void
+OmpEvalStencils(float const * src, BufferDescriptor const &srcDesc,
+                float * dst,       BufferDescriptor const &dstDesc,
+                float * dstDu,     BufferDescriptor const &dstDuDesc,
+                float * dstDv,     BufferDescriptor const &dstDvDesc,
+                float * dstDuu,    BufferDescriptor const &dstDuuDesc,
+                float * dstDuv,    BufferDescriptor const &dstDuvDesc,
+                float * dstDvv,    BufferDescriptor const &dstDvvDesc,
+                int const * sizes,
+                int const * offsets,
+                int const * indices,
+                float const * weights,
+                float const * duWeights,
+                float const * dvWeights,
+                float const * duuWeights,
+                float const * duvWeights,
+                float const * dvvWeights,
+                int start, int end) {
+    start = (start > 0 ? start : 0);  // a negative start is treated as 0
+    // Advance every buffer pointer by the offset stored in its descriptor.
+    src += srcDesc.offset;
+    dst += dstDesc.offset;
+    dstDu += dstDuDesc.offset;
+    dstDv += dstDvDesc.offset;
+    dstDuu += dstDuuDesc.offset;
+    dstDuv += dstDuvDesc.offset;
+    dstDvv += dstDvvDesc.offset;
+
+    int numThreads = omp_get_max_threads();
+    int n = end - start;
+    // Scratch accumulators: srcDesc.length floats per thread for each output.
+    float * result = (float*)alloca(srcDesc.length * numThreads * sizeof(float));
+    float * resultDu = (float*)alloca(srcDesc.length * numThreads * sizeof(float));
+    float * resultDv = (float*)alloca(srcDesc.length * numThreads * sizeof(float));
+    float * resultDuu = (float*)alloca(srcDesc.length * numThreads * sizeof(float));
+    float * resultDuv = (float*)alloca(srcDesc.length * numThreads * sizeof(float));
+    float * resultDvv = (float*)alloca(srcDesc.length * numThreads * sizeof(float));
+    // NOTE(review): assumes every dst*Desc.length equals srcDesc.length.
+#pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+
+        int index = i + start; // Stencil index
+
+        // Slices of the index/weight arrays belonging to this stencil
+        int const           * threadIndices = indices + offsets[index];
+        float const         * threadWeights = weights + offsets[index];
+        float const         * threadWeightsDu = duWeights + offsets[index];
+        float const         * threadWeightsDv = dvWeights + offsets[index];
+        float const         * threadWeightsDuu = duuWeights + offsets[index];
+        float const         * threadWeightsDuv = duvWeights + offsets[index];
+        float const         * threadWeightsDvv = dvvWeights + offsets[index];
+
+        int threadId = omp_get_thread_num();
+        // Scratch slot of the thread executing this iteration
+        float * threadResult = result + threadId*srcDesc.length;
+        float * threadResultDu = resultDu + threadId*srcDesc.length;
+        float * threadResultDv = resultDv + threadId*srcDesc.length;
+        float * threadResultDuu = resultDuu + threadId*srcDesc.length;
+        float * threadResultDuv = resultDuv + threadId*srcDesc.length;
+        float * threadResultDvv = resultDvv + threadId*srcDesc.length;
+
+        clear(threadResult, dstDesc);
+        clear(threadResultDu, dstDuDesc);
+        clear(threadResultDv, dstDvDesc);
+        clear(threadResultDuu, dstDuuDesc);
+        clear(threadResultDuv, dstDuvDesc);
+        clear(threadResultDvv, dstDvvDesc);
+        // One addWithWeight() per output for each of the sizes[index] entries
+        for (int j=0; j<(int)sizes[index]; ++j) {
+            addWithWeight(threadResult, src,
+                threadIndices[j], threadWeights[j], srcDesc);
+            addWithWeight(threadResultDu, src,
+                threadIndices[j], threadWeightsDu[j], srcDesc);
+            addWithWeight(threadResultDv, src,
+                threadIndices[j], threadWeightsDv[j], srcDesc);
+            addWithWeight(threadResultDuu, src,
+                threadIndices[j], threadWeightsDuu[j], srcDesc);
+            addWithWeight(threadResultDuv, src,
+                threadIndices[j], threadWeightsDuv[j], srcDesc);
+            addWithWeight(threadResultDvv, src,
+                threadIndices[j], threadWeightsDvv[j], srcDesc);
+        }
+        // copy() is passed i (relative to start), not the stencil index
+        copy(dst, i, threadResult, dstDesc);
+        copy(dstDu, i, threadResultDu, dstDuDesc);
+        copy(dstDv, i, threadResultDv, dstDvDesc);
+        copy(dstDuu, i, threadResultDuu, dstDuuDesc);
+        copy(dstDuv, i, threadResultDuv, dstDuvDesc);
+        copy(dstDvv, i, threadResultDvv, dstDvvDesc);
+    }
+
+}
+
 }  // end namespace Osd

 }  // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/osd/ompKernel.h
+++ b/opensubdiv/osd/ompKernel.h
@ -56,6 +56,25 @@ OmpEvalStencils(float const * src, BufferDescriptor const &srcDesc,
                float const * dvWeights,
                int start, int end);

+void  // overload taking 1st and 2nd derivative outputs and weights
+OmpEvalStencils(float const * src, BufferDescriptor const &srcDesc,
+                float * dst,       BufferDescriptor const &dstDesc,
+                float * dstDu,     BufferDescriptor const &dstDuDesc,
+                float * dstDv,     BufferDescriptor const &dstDvDesc,
+                float * dstDuu,    BufferDescriptor const &dstDuuDesc,
+                float * dstDuv,    BufferDescriptor const &dstDuvDesc,
+                float * dstDvv,    BufferDescriptor const &dstDvvDesc,
+                int const * sizes,     // number of entries of each stencil
+                int const * offsets,   // start of each stencil in indices/weights
+                int const * indices,
+                float const * weights,
+                float const * duWeights,
+                float const * dvWeights,
+                float const * duuWeights,
+                float const * duvWeights,
+                float const * dvvWeights,
+                int start, int end);   // stencils in [start, end) are evaluated
+
 } // end namespace Osd

 }  // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/osd/tbbEvaluator.cpp
+++ b/opensubdiv/osd/tbbEvaluator.cpp
@ -75,8 +75,55 @@ TbbEvaluator::EvalStencils(
                    dst, dstDesc,
                    du,  duDesc,
                    dv,  dvDesc,
+                    NULL, BufferDescriptor(),
+                    NULL, BufferDescriptor(),
+                    NULL, BufferDescriptor(),
+                    sizes, offsets, indices,
+                    weights, duWeights, dvWeights, NULL, NULL, NULL,
+                    start, end);
+
+    return true;
+}
+
+/* static */
+bool
+TbbEvaluator::EvalStencils(
+    const float *src, BufferDescriptor const &srcDesc,
+    float *dst,       BufferDescriptor const &dstDesc,
+    float *du,        BufferDescriptor const &duDesc,
+    float *dv,        BufferDescriptor const &dvDesc,
+    float *duu,       BufferDescriptor const &duuDesc,
+    float *duv,       BufferDescriptor const &duvDesc,
+    float *dvv,       BufferDescriptor const &dvvDesc,
+    const int * sizes,
+    const int * offsets,
+    const int * indices,
+    const float * weights,
+    const float * duWeights,
+    const float * dvWeights,
+    const float * duuWeights,
+    const float * duvWeights,
+    const float * dvvWeights,
+    int start, int end) {
+
+    if (end <= start) return true;
+    if (srcDesc.length != dstDesc.length) return false;
+    if (srcDesc.length != duDesc.length) return false;
+    if (srcDesc.length != dvDesc.length) return false;
+    if (srcDesc.length != duuDesc.length) return false;
+    if (srcDesc.length != duvDesc.length) return false;
+    if (srcDesc.length != dvvDesc.length) return false;
+
+    TbbEvalStencils(src, srcDesc,
+                    dst, dstDesc,
+                    du,  duDesc,
+                    dv,  dvDesc,
+                    duu, duuDesc,
+                    duv, duvDesc,
+                    dvv, dvvDesc,
                    sizes, offsets, indices,
                    weights, duWeights, dvWeights,
+                    duuWeights, duvWeights, dvvWeights,
                    start, end);

    return true;
@ -96,6 +143,9 @@ TbbEvaluator::EvalPatches(
    if (srcDesc.length != dstDesc.length) return false;

    TbbEvalPatches(src, srcDesc, dst, dstDesc,
+                   NULL, BufferDescriptor(),
+                   NULL, BufferDescriptor(),
+                   NULL, BufferDescriptor(),
                   NULL, BufferDescriptor(),
                   NULL, BufferDescriptor(),
                   numPatchCoords, patchCoords,
@ -121,6 +171,36 @@ TbbEvaluator::EvalPatches(

    TbbEvalPatches(src, srcDesc, dst, dstDesc,
                   du,  duDesc,  dv,  dvDesc,
+                   NULL, BufferDescriptor(),
+                   NULL, BufferDescriptor(),
+                   NULL, BufferDescriptor(),
+                   numPatchCoords, patchCoords,
+                   patchArrayBuffer, patchIndexBuffer, patchParamBuffer);
+
+    return true;
+}
+
+/* static */
+bool
+TbbEvaluator::EvalPatches(
+    const float *src, BufferDescriptor const &srcDesc,
+    float *dst,       BufferDescriptor const &dstDesc,
+    float *du,        BufferDescriptor const &duDesc,
+    float *dv,        BufferDescriptor const &dvDesc,
+    float *duu,       BufferDescriptor const &duuDesc,
+    float *duv,       BufferDescriptor const &duvDesc,
+    float *dvv,       BufferDescriptor const &dvvDesc,
+    int numPatchCoords,
+    const PatchCoord *patchCoords,
+    const PatchArray *patchArrayBuffer,
+    const int *patchIndexBuffer,
+    const PatchParam *patchParamBuffer) {
+
+    if (srcDesc.length != dstDesc.length) return false;
+
+    TbbEvalPatches(src, srcDesc, dst, dstDesc,
+                   du,  duDesc,  dv,  dvDesc,
+                   duu, duuDesc, duv, duvDesc, dvv, dvvDesc,
                   numPatchCoords, patchCoords,
                   patchArrayBuffer, patchIndexBuffer, patchParamBuffer);

--- a/opensubdiv/osd/tbbEvaluator.h
+++ b/opensubdiv/osd/tbbEvaluator.h
@ -26,9 +26,8 @@
 #define OPENSUBDIV3_OSD_TBB_EVALUATOR_H

 #include "../version.h"
-#include "../osd/types.h"
 #include "../osd/bufferDescriptor.h"
-#include "../far/patchTable.h"
+#include "../osd/types.h"

 #include <cstddef>

@ -61,7 +60,7 @@ public:
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
-    /// @param stencilTable   stencil table to be applied.
+    /// @param stencilTable   Far::StencilTable or equivalent
    ///
    /// @param instance       not used in the tbb kernel
    ///                       (declared as a typed pointer to prevent
@ -108,7 +107,6 @@ public:
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
    /// @param sizes          pointer to the sizes buffer of the stencil table
-    ///                       to apply for the range [start, end)
    ///
    /// @param offsets        pointer to the offsets buffer of the stencil table
    ///
@ -146,19 +144,19 @@ public:
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
-    /// @param duBuffer       Output U-derivative buffer
+    /// @param duBuffer       Output buffer derivative wrt u
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
-    /// @param duDesc         vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
    ///
-    /// @param dvBuffer       Output V-derivative buffer
+    /// @param dvBuffer       Output buffer derivative wrt v
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
-    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
    ///
-    /// @param stencilTable   stencil table to be applied.
+    /// @param stencilTable   Far::StencilTable or equivalent
    ///
    /// @param instance       not used in the tbb kernel
    ///                       (declared as a typed pointer to prevent
@ -207,18 +205,17 @@ public:
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
-    /// @param du             Output s-derivatives pointer. An offset of
+    /// @param du             Output pointer derivative wrt u. An offset of
    ///                       duDesc will be applied internally.
    ///
-    /// @param duDesc         vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
    ///
-    /// @param dv             Output t-derivatives pointer. An offset of
+    /// @param dv             Output pointer derivative wrt v. An offset of
    ///                       dvDesc will be applied internally.
    ///
-    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
    ///
    /// @param sizes          pointer to the sizes buffer of the stencil table
-    ///                       to apply for the range [start, end)
    ///
    /// @param offsets        pointer to the offsets buffer of the stencil table
    ///
@ -226,9 +223,9 @@ public:
    ///
    /// @param weights        pointer to the weights buffer of the stencil table
    ///
-    /// @param duWeights      pointer to the u-weights buffer of the stencil table
+    /// @param duWeights      pointer to the du-weights buffer of the stencil table
    ///
-    /// @param dvWeights      pointer to the v-weights buffer of the stencil table
+    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
    ///
    /// @param start          start index of stencil table
    ///
@ -247,6 +244,177 @@ public:
        const float * dvWeights,
        int start, int end);

+    /// \brief Generic static eval stencils function with 1st and 2nd
+    ///        derivatives. This function has the same signature as the
+    ///        other device kernels so that it can be called in the same
+    ///        way from the OsdMesh template interface.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer       Output buffer derivative wrt u
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer       Output buffer derivative wrt v
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer      Output buffer 2nd derivative wrt u
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duuDesc        vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer      Output buffer 2nd derivative wrt u and v
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duvDesc        vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer      Output buffer 2nd derivative wrt v
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvvDesc        vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param stencilTable   Far::StencilTable or equivalent
+    ///
+    /// @param instance       not used in the tbb kernel
+    ///                       (declared as a typed pointer to prevent
+    ///                        undesirable template resolution)
+    ///
+    /// @param deviceContext  not used in the tbb kernel
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        STENCIL_TABLE const *stencilTable,
+        const TbbEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
+                            duBuffer->BindCpuBuffer(),  duDesc,
+                            dvBuffer->BindCpuBuffer(),  dvDesc,
+                            duuBuffer->BindCpuBuffer(), duuDesc,
+                            duvBuffer->BindCpuBuffer(), duvDesc,
+                            dvvBuffer->BindCpuBuffer(), dvvDesc,
+                            &stencilTable->GetSizes()[0],
+                            &stencilTable->GetOffsets()[0],
+                            &stencilTable->GetControlIndices()[0],
+                            &stencilTable->GetWeights()[0],
+                            &stencilTable->GetDuWeights()[0],
+                            &stencilTable->GetDvWeights()[0],
+                            &stencilTable->GetDuuWeights()[0],
+                            &stencilTable->GetDuvWeights()[0],
+                            &stencilTable->GetDvvWeights()[0],
+                            /*start = */ 0,
+                            /*end   = */ stencilTable->GetNumStencils());
+    }
+
+    /// \brief Static eval stencils function with 1st and 2nd derivatives,
+    ///        which takes raw CPU pointers for input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param du             Output pointer derivative wrt u. An offset of
+    ///                       duDesc will be applied internally.
+    ///
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dv             Output pointer derivative wrt v. An offset of
+    ///                       dvDesc will be applied internally.
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duu            Output pointer 2nd derivative wrt u. An offset of
+    ///                       duuDesc will be applied internally.
+    ///
+    /// @param duuDesc        vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duv            Output pointer 2nd derivative wrt u and v. An offset of
+    ///                       duvDesc will be applied internally.
+    ///
+    /// @param duvDesc        vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvv            Output pointer 2nd derivative wrt v. An offset of
+    ///                       dvvDesc will be applied internally.
+    ///
+    /// @param dvvDesc        vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param duWeights      pointer to the du-weights buffer of the stencil table
+    ///
+    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
+    ///
+    /// @param duuWeights     pointer to the duu-weights buffer of the stencil table
+    ///
+    /// @param duvWeights     pointer to the duv-weights buffer of the stencil table
+    ///
+    /// @param dvvWeights     pointer to the dvv-weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        float *duu,       BufferDescriptor const &duuDesc,
+        float *duv,       BufferDescriptor const &duvDesc,
+        float *dvv,       BufferDescriptor const &dvvDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        const float * duWeights,
+        const float * dvWeights,
+        const float * duuWeights,
+        const float * duvWeights,
+        const float * dvvWeights,
+        int start, int end);
+
    /// ----------------------------------------------------------------------
    ///
    ///   Limit evaluations with PatchTable
@ -273,7 +441,9 @@ public:
    ///
    /// @param patchCoords      array of locations to be evaluated.
    ///
-    /// @param patchTable       Far::PatchTable
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
    ///
    /// @param instance         not used in the cpu evaluator
    ///
@ -293,10 +463,8 @@ public:
        (void)instance;       // unused
        (void)deviceContext;  // unused

-        return EvalPatches(srcBuffer->BindCpuBuffer(),
-                           srcDesc,
-                           dstBuffer->BindCpuBuffer(),
-                           dstDesc,
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
                           numPatchCoords,
                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
                           patchTable->GetPatchArrayBuffer(),
@ -320,13 +488,13 @@ public:
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
-    /// @param duBuffer         Output s-derivatives buffer
+    /// @param duBuffer         Output buffer derivative wrt u
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
-    /// @param dvBuffer         Output t-derivatives buffer
+    /// @param dvBuffer         Output buffer derivative wrt v
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
@ -336,7 +504,9 @@ public:
    ///
    /// @param patchCoords      array of locations to be evaluated.
    ///
-    /// @param patchTable       Far::PatchTable
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
    ///
    /// @param instance         not used in the cpu evaluator
    ///
@ -358,8 +528,12 @@ public:
        (void)instance;       // unused
        (void)deviceContext;  // unused

-        return EvalPatches(
-            srcBuffer->BindCpuBuffer(), srcDesc,
+        // XXX: PatchCoords somewhat abuses the vertex primvar buffer interop.
+        //      Ideally all buffer classes would be templated by datatype
+        //      so that a downcast isn't needed there
+        //      (e.g. Osd::CpuBuffer<PatchCoord> ).
+        //
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
                           dstBuffer->BindCpuBuffer(), dstDesc,
                           duBuffer->BindCpuBuffer(),  duDesc,
                           dvBuffer->BindCpuBuffer(),  dvDesc,
@ -370,6 +544,102 @@ public:
                           patchTable->GetPatchParamBuffer());
    }

+    /// \brief Generic limit eval function with 1st and 2nd derivatives. This
+    ///        function has the same signature as the other device kernels so
+    ///        that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the tbb evaluator
+    ///
+    /// @param deviceContext    not used in the tbb evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        TbbEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        // XXX: PatchCoords somewhat abuses the vertex primvar buffer interop.
+        //      Ideally all buffer classes would be templated by datatype
+        //      so that the cast to PatchCoord below isn't needed
+        //      (e.g. Osd::CpuBuffer<PatchCoord> ).
+        //
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           duuBuffer->BindCpuBuffer(), duuDesc,
+                           duvBuffer->BindCpuBuffer(), duvDesc,
+                           dvvBuffer->BindCpuBuffer(), dvvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
    /// \brief Static limit eval function. It takes an array of PatchCoord
    ///        and evaluate limit values on given PatchTable.
    ///
@ -420,15 +690,15 @@ public:
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
-    /// @param du               Output s-derivatives pointer. An offset of
+    /// @param du               Output pointer derivative wrt u. An offset of
    ///                         duDesc will be applied internally.
    ///
-    /// @param duDesc           vertex buffer descriptor for the du buffer
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
-    /// @param dv               Output t-derivatives pointer. An offset of
+    /// @param dv               Output pointer derivative wrt v. An offset of
    ///                         dvDesc will be applied internally.
    ///
-    /// @param dvDesc           vertex buffer descriptor for the dv buffer
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
    ///
    /// @param numPatchCoords   number of patchCoords.
    ///
@ -449,10 +719,76 @@ public:
        float *du,        BufferDescriptor const &duDesc,
        float *dv,        BufferDescriptor const &dvDesc,
        int numPatchCoords,
-        const PatchCoord *patchCoords,
-        const PatchArray *patchArrays,
+        PatchCoord const *patchCoords,
+        PatchArray const *patchArrays,
        const int *patchIndexBuffer,
-        const PatchParam *patchParamBuffer);
+        PatchParam const *patchParamBuffer);
+
+    /// \brief Static limit eval function with 1st and 2nd derivatives. It takes
+    ///        an array of PatchCoord and evaluates limit values on the PatchTable.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param du               Output pointer derivative wrt u. An offset of
+    ///                         duDesc will be applied internally.
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dv               Output pointer derivative wrt v. An offset of
+    ///                         dvDesc will be applied internally.
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duu              Output pointer 2nd derivative wrt u. An offset of
+    ///                         duuDesc will be applied internally.
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duv              Output pointer 2nd derivative wrt u and v. An offset of
+    ///                         duvDesc will be applied internally.
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvv              Output pointer 2nd derivative wrt v. An offset of
+    ///                         dvvDesc will be applied internally.
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        float *duu,       BufferDescriptor const &duuDesc,
+        float *duv,       BufferDescriptor const &duvDesc,
+        float *dvv,       BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PatchCoord const *patchCoords,
+        PatchArray const *patchArrays,
+        const int *patchIndexBuffer,
+        PatchParam const *patchParamBuffer);

    /// \brief Generic limit eval function. This function has a same
    ///        signature as other device kernels have so that it can be called
@ -474,7 +810,9 @@ public:
    ///
    /// @param patchCoords      array of locations to be evaluated.
    ///
-    /// @param patchTable       Far::PatchTable
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
    ///
    /// @param instance         not used in the cpu evaluator
    ///
@ -494,10 +832,166 @@ public:
        (void)instance;       // unused
        (void)deviceContext;  // unused

-        return EvalPatches(srcBuffer->BindCpuBuffer(),
-                           srcDesc,
-                           dstBuffer->BindCpuBuffer(),
-                           dstDesc,
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function for varying data with 1st
+    ///        derivatives. It has the same signature as the other device
+    ///        kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the tbb evaluator
+    ///
+    /// @param deviceContext    not used in the tbb evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        TbbEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetVaryingPatchArrayBuffer(),
+                           patchTable->GetVaryingPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function for varying data with 1st and 2nd
+    ///        derivatives. It has the same signature as the other device
+    ///        kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the tbb evaluator
+    ///
+    /// @param deviceContext    not used in the tbb evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        TbbEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           duuBuffer->BindCpuBuffer(), duuDesc,
+                           duvBuffer->BindCpuBuffer(), duvDesc,
+                           dvvBuffer->BindCpuBuffer(), dvvDesc,
                           numPatchCoords,
                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
                           patchTable->GetVaryingPatchArrayBuffer(),
@ -525,7 +1019,9 @@ public:
    ///
    /// @param patchCoords      array of locations to be evaluated.
    ///
-    /// @param patchTable       Far::PatchTable
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
    ///
    /// @param fvarChannel      face-varying channel
    ///
@ -548,10 +1044,172 @@ public:
        (void)instance;       // unused
        (void)deviceContext;  // unused

-        return EvalPatches(srcBuffer->BindCpuBuffer(),
-                           srcDesc,
-                           dstBuffer->BindCpuBuffer(),
-                           dstDesc,
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
+    /// \brief Generic limit eval function for face-varying data with 1st
+    ///        derivatives. This function has the same signature as other
+    ///        device kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param fvarChannel      face-varying channel whose patch buffers are read
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        TbbEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+        // Unwrap the buffers via BindCpuBuffer() and defer to the pointer-based EvalPatches.
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
+                           patchTable->GetFVarPatchIndexBuffer(fvarChannel),
+                           patchTable->GetFVarPatchParamBuffer(fvarChannel));
+    }
+
+    /// \brief Generic limit eval function for face-varying data with 1st and
+    ///        2nd derivatives. This function has the same signature as other
+    ///        device kernels so that it can be called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output buffer derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output buffer derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param duuBuffer        Output buffer 2nd derivative wrt u
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duuDesc          vertex buffer descriptor for the duuBuffer
+    ///
+    /// @param duvBuffer        Output buffer 2nd derivative wrt u and v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duvDesc          vertex buffer descriptor for the duvBuffer
+    ///
+    /// @param dvvBuffer        Output buffer 2nd derivative wrt v
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvvDesc          vertex buffer descriptor for the dvvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param fvarChannel      face-varying channel
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatchesFaceVarying(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        DST_BUFFER *duuBuffer, BufferDescriptor const &duuDesc,
+        DST_BUFFER *duvBuffer, BufferDescriptor const &duvDesc,
+        DST_BUFFER *dvvBuffer, BufferDescriptor const &dvvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        int fvarChannel,
+        TbbEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           duuBuffer->BindCpuBuffer(), duuDesc,
+                           duvBuffer->BindCpuBuffer(), duvDesc,
+                           dvvBuffer->BindCpuBuffer(), dvvDesc,
                           numPatchCoords,
                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
                           patchTable->GetFVarPatchArrayBuffer(fvarChannel),
--- a/opensubdiv/osd/tbbKernel.cpp
+++ b/opensubdiv/osd/tbbKernel.cpp
@ -219,6 +219,78 @@ TbbEvalStencils(float const * src, BufferDescriptor const &srcDesc,
        tbb::blocked_range<int> range(start, end, grain_size);
        tbb::parallel_for(range, kernel);
    }
+
+}
+
+void
+TbbEvalStencils(float const * src, BufferDescriptor const &srcDesc,
+                float * dst,       BufferDescriptor const &dstDesc,
+                float * du,        BufferDescriptor const &duDesc,
+                float * dv,        BufferDescriptor const &dvDesc,
+                float * duu,       BufferDescriptor const &duuDesc,
+                float * duv,       BufferDescriptor const &duvDesc,
+                float * dvv,       BufferDescriptor const &dvvDesc,
+                int const * sizes,
+                int const * offsets,
+                int const * indices,
+                float const * weights,
+                float const * duWeights,
+                float const * dvWeights,
+                float const * duuWeights,
+                float const * duvWeights,
+                float const * dvvWeights,
+                int start, int end) {
+    // Fills each non-null output (dst, du, dv, duu, duv, dvv) from src using its own weight array.
+    if (src) src += srcDesc.offset;  // apply each descriptor's offset, skipping NULL buffers
+    if (dst) dst += dstDesc.offset;
+    if (du)  du  += duDesc.offset;
+    if (dv)  dv  += dvDesc.offset;
+    if (duu) duu += duuDesc.offset;
+    if (duv) duv += duvDesc.offset;
+    if (dvv) dvv += dvvDesc.offset;
+
+    // PERFORMANCE: need to combine these launches (up to 6) together
+    if (dst) {  // primvar values
+        TBBStencilKernel kernel(src, srcDesc, dst, dstDesc,
+                                sizes, offsets, indices, weights);
+        tbb::blocked_range<int> range(start, end, grain_size);
+        tbb::parallel_for(range, kernel);
+    }
+
+    if (du) {  // 1st derivative output, weighted by duWeights
+        TBBStencilKernel kernel(src, srcDesc, du, duDesc,
+                                sizes, offsets, indices, duWeights);
+        tbb::blocked_range<int> range(start, end, grain_size);
+        tbb::parallel_for(range, kernel);
+    }
+
+    if (dv) {  // 1st derivative output, weighted by dvWeights
+        TBBStencilKernel kernel(src, srcDesc, dv, dvDesc,
+                                sizes, offsets, indices, dvWeights);
+        tbb::blocked_range<int> range(start, end, grain_size);
+        tbb::parallel_for(range, kernel);
+    }
+
+    if (duu) {  // 2nd derivative output, weighted by duuWeights
+        TBBStencilKernel kernel(src, srcDesc, duu, duuDesc,
+                                sizes, offsets, indices, duuWeights);
+        tbb::blocked_range<int> range(start, end, grain_size);
+        tbb::parallel_for(range, kernel);
+    }
+
+    if (duv) {  // 2nd derivative output, weighted by duvWeights
+        TBBStencilKernel kernel(src, srcDesc, duv, duvDesc,
+                                sizes, offsets, indices, duvWeights);
+        tbb::blocked_range<int> range(start, end, grain_size);
+        tbb::parallel_for(range, kernel);
+    }
+
+    if (dvv) {  // 2nd derivative output, weighted by dvvWeights
+        TBBStencilKernel kernel(src, srcDesc, dvv, dvvDesc,
+                                sizes, offsets, indices, dvvWeights);
+        tbb::blocked_range<int> range(start, end, grain_size);
+        tbb::parallel_for(range, kernel);
+    }
 }

 // ---------------------------------------------------------------------------
@ -257,10 +329,16 @@ class TbbEvalPatchesKernel {
    BufferDescriptor _dstDesc;
    BufferDescriptor _dstDuDesc;
    BufferDescriptor _dstDvDesc;
+    BufferDescriptor _dstDuuDesc;
+    BufferDescriptor _dstDuvDesc;
+    BufferDescriptor _dstDvvDesc;
    float const * _src;
    float * _dst;
    float * _dstDu;
    float * _dstDv;
+    float * _dstDuu;
+    float * _dstDuv;
+    float * _dstDvv;
    int _numPatchCoords;
    const PatchCoord *_patchCoords;
    const PatchArray *_patchArrayBuffer;
@ -272,6 +350,9 @@ public:
                         float *dst,       BufferDescriptor dstDesc,
                         float *dstDu,     BufferDescriptor dstDuDesc,
                         float *dstDv,     BufferDescriptor dstDvDesc,
+                         float *dstDuu,    BufferDescriptor dstDuuDesc,
+                         float *dstDuv,    BufferDescriptor dstDuvDesc,
+                         float *dstDvv,    BufferDescriptor dstDvvDesc,
                         int numPatchCoords,
                         const PatchCoord *patchCoords,
                         const PatchArray *patchArrayBuffer,
@ -279,7 +360,10 @@ public:
                         const PatchParam *patchParamBuffer) :
        _srcDesc(srcDesc), _dstDesc(dstDesc),
        _dstDuDesc(dstDuDesc), _dstDvDesc(dstDvDesc),
-        _src(src), _dst(dst), _dstDu(dstDu), _dstDv(dstDv),
+        _dstDuuDesc(dstDuuDesc), _dstDuvDesc(dstDuvDesc), _dstDvvDesc(dstDvvDesc),
+        _src(src), _dst(dst),
+        _dstDu(dstDu), _dstDv(dstDv),
+        _dstDuu(dstDuu), _dstDuv(dstDuv), _dstDvv(dstDvv),
        _numPatchCoords(numPatchCoords),
        _patchCoords(patchCoords),
        _patchArrayBuffer(patchArrayBuffer),
@ -290,13 +374,15 @@ public:
    void operator() (tbb::blocked_range<int> const &r) const {
        if (_dstDu == NULL && _dstDv == NULL) {
            compute(r);
+        } else if (_dstDuu == NULL && _dstDuv == NULL && _dstDvv == NULL) {
+            computeWith1stDerivative(r);
        } else {
-            computeWithDerivative(r);
+            computeWith2ndDerivative(r);
        }
    }

    void compute(tbb::blocked_range<int> const &r) const {
-        float wP[20], wDs[20], wDt[20];
+        float wP[20], wDu[20], wDv[20];
        BufferAdapter<const float> srcT(_src + _srcDesc.offset,
                                        _srcDesc.length,
                                        _srcDesc.stride);
@ -305,12 +391,6 @@ public:
                                  _dstDesc.length,
                                  _dstDesc.stride);

-        BufferAdapter<float> dstDuT(_dstDu,
-                                    _dstDuDesc.length,
-                                    _dstDuDesc.stride);
-        BufferAdapter<float> dstDvT(_dstDv,
-                                    _dstDvDesc.length,
-                                    _dstDvDesc.stride);

        for (int i = r.begin(); i < r.end(); ++i) {
            PatchCoord const &coord = _patchCoords[i];
@ -325,15 +405,18 @@ public:
            int numControlVertices = 0;
            if (patchType == Far::PatchDescriptor::REGULAR) {
                Far::internal::GetBSplineWeights(param,
-                                                 coord.s, coord.t, wP, wDs, wDt);
+                                                 coord.s, coord.t, wP,
+                                                 wDu, wDv);
                numControlVertices = 16;
            } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
                Far::internal::GetGregoryWeights(param,
-                                                 coord.s, coord.t, wP, wDs, wDt);
+                                                 coord.s, coord.t, wP,
+                                                 wDu, wDv);
                numControlVertices = 20;
            } else if (patchType == Far::PatchDescriptor::QUADS) {
                Far::internal::GetBilinearWeights(param,
-                                                  coord.s, coord.t, wP, wDs, wDt);
+                                                  coord.s, coord.t, wP,
+                                                  wDu, wDv);
                numControlVertices = 4;
            } else {
                assert(0);
@ -353,8 +436,8 @@ public:
        }
    }

-    void computeWithDerivative(tbb::blocked_range<int> const &r) const {
-        float wP[20], wDs[20], wDt[20];
+    void computeWith1stDerivative(tbb::blocked_range<int> const &r) const {
+        float wP[20], wDu[20], wDv[20];
        BufferAdapter<const float> srcT(_src + _srcDesc.offset,
                                        _srcDesc.length,
                                        _srcDesc.stride);
@ -384,15 +467,18 @@ public:
            int numControlVertices = 0;
            if (patchType == Far::PatchDescriptor::REGULAR) {
                Far::internal::GetBSplineWeights(param,
-                                                 coord.s, coord.t, wP, wDs, wDt);
+                                                 coord.s, coord.t, wP,
+                                                 wDu, wDv);
                numControlVertices = 16;
            } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
                Far::internal::GetGregoryWeights(param,
-                                                 coord.s, coord.t, wP, wDs, wDt);
+                                                 coord.s, coord.t, wP,
+                                                 wDu, wDv);
                numControlVertices = 20;
            } else if (patchType == Far::PatchDescriptor::QUADS) {
                Far::internal::GetBilinearWeights(param,
-                                                  coord.s, coord.t, wP, wDs, wDt);
+                                                  coord.s, coord.t,
+                                                  wP, wDu, wDv);
                numControlVertices = 4;
            } else {
                assert(0);
@ -409,14 +495,103 @@ public:
            dstDvT.Clear();
            for (int j = 0; j < numControlVertices; ++j) {
                dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
-                dstDuT.AddWithWeight(srcT[cvs[j]], wDs[j]);
-                dstDvT.AddWithWeight(srcT[cvs[j]], wDt[j]);
+                dstDuT.AddWithWeight(srcT[cvs[j]], wDu[j]);
+                dstDvT.AddWithWeight(srcT[cvs[j]], wDv[j]);
            }
            ++dstT;
            ++dstDuT;
            ++dstDvT;
        }
    }
+
+    void computeWith2ndDerivative(tbb::blocked_range<int> const &r) const {
+        float wP[20], wDu[20], wDv[20], wDuu[20], wDuv[20], wDvv[20];  // 20 = largest CV count used below
+        BufferAdapter<const float> srcT(_src + _srcDesc.offset,
+                                        _srcDesc.length,
+                                        _srcDesc.stride);
+        BufferAdapter<float> dstT(_dst + _dstDesc.offset
+                                       + r.begin() * _dstDesc.stride,
+                                  _dstDesc.length,
+                                  _dstDesc.stride);
+        BufferAdapter<float> dstDuT(_dstDu + _dstDuDesc.offset
+                                       + r.begin() * _dstDuDesc.stride,
+                                  _dstDuDesc.length,
+                                  _dstDuDesc.stride);
+        BufferAdapter<float> dstDvT(_dstDv + _dstDvDesc.offset
+                                       + r.begin() * _dstDvDesc.stride,
+                                  _dstDvDesc.length,
+                                  _dstDvDesc.stride);
+        BufferAdapter<float> dstDuuT(_dstDuu + _dstDuuDesc.offset
+                                       + r.begin() * _dstDuuDesc.stride,
+                                  _dstDuuDesc.length,
+                                  _dstDuuDesc.stride);
+        BufferAdapter<float> dstDuvT(_dstDuv + _dstDuvDesc.offset
+                                       + r.begin() * _dstDuvDesc.stride,
+                                  _dstDuvDesc.length,
+                                  _dstDuvDesc.stride);
+        BufferAdapter<float> dstDvvT(_dstDvv + _dstDvvDesc.offset
+                                       + r.begin() * _dstDvvDesc.stride,
+                                  _dstDvvDesc.length,
+                                  _dstDvvDesc.stride);
+        // For each patch coord in r, accumulate the value plus 1st/2nd derivatives; outputs start at r.begin().
+        for (int i = r.begin(); i < r.end(); ++i) {
+            PatchCoord const &coord = _patchCoords[i];
+            PatchArray const &array = _patchArrayBuffer[coord.handle.arrayIndex];
+
+            Far::PatchParam const & param =
+                _patchParamBuffer[coord.handle.patchIndex];
+            int patchType = param.IsRegular()  // a regular patch overrides the array's patch type
+                ? Far::PatchDescriptor::REGULAR
+                : array.GetPatchType();
+
+            int numControlVertices = 0;
+            if (patchType == Far::PatchDescriptor::REGULAR) {
+                Far::internal::GetBSplineWeights(param,
+                                                 coord.s, coord.t, wP,
+                                                 wDu, wDv, wDuu, wDuv, wDvv);
+                numControlVertices = 16;
+            } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+                Far::internal::GetGregoryWeights(param,
+                                                 coord.s, coord.t, wP,
+                                                 wDu, wDv, wDuu, wDuv, wDvv);
+                numControlVertices = 20;
+            } else if (patchType == Far::PatchDescriptor::QUADS) {
+                Far::internal::GetBilinearWeights(param,
+                                                  coord.s, coord.t, wP,
+                                                 wDu, wDv, wDuu, wDuv, wDvv);
+                numControlVertices = 4;
+            } else {
+                assert(0);  // patch type not handled by this kernel
+            }
+            // Index stride comes from the array's own patch type, not the per-patch override above.
+            int indexStride = Far::PatchDescriptor(array.GetPatchType()).GetNumControlVertices();
+            int indexBase = array.GetIndexBase() + indexStride *
+                    (coord.handle.patchIndex - array.GetPrimitiveIdBase());
+
+            const int *cvs = &_patchIndexBuffer[indexBase];
+
+            dstT.Clear();
+            dstDuT.Clear();
+            dstDvT.Clear();
+            dstDuuT.Clear();
+            dstDuvT.Clear();
+            dstDvvT.Clear();
+            for (int j = 0; j < numControlVertices; ++j) {
+                dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
+                dstDuT.AddWithWeight(srcT[cvs[j]], wDu[j]);
+                dstDvT.AddWithWeight(srcT[cvs[j]], wDv[j]);
+                dstDuuT.AddWithWeight(srcT[cvs[j]], wDuu[j]);
+                dstDuvT.AddWithWeight(srcT[cvs[j]], wDuv[j]);
+                dstDvvT.AddWithWeight(srcT[cvs[j]], wDvv[j]);
+            }
+            ++dstT;  // advance every output adapter to the next patch coord
+            ++dstDuT;
+            ++dstDvT;
+            ++dstDuuT;
+            ++dstDuvT;
+            ++dstDvvT;
+        }
+    }
 };


@ -433,6 +608,39 @@ TbbEvalPatches(float const *src, BufferDescriptor const &srcDesc,

    TbbEvalPatchesKernel kernel(src, srcDesc, dst, dstDesc,
                                dstDu, dstDuDesc, dstDv, dstDvDesc,
+                                NULL, BufferDescriptor(),
+                                NULL, BufferDescriptor(),
+                                NULL, BufferDescriptor(),
+                                numPatchCoords, patchCoords,
+                                patchArrayBuffer,
+                                patchIndexBuffer,
+                                patchParamBuffer);
+
+    tbb::blocked_range<int> range(0, numPatchCoords, grain_size);
+    tbb::parallel_for(range, kernel);
+
+}
+
+
+void
+TbbEvalPatches(float const *src, BufferDescriptor const &srcDesc,
+               float *dst,       BufferDescriptor const &dstDesc,
+               float *dstDu,     BufferDescriptor const &dstDuDesc,
+               float *dstDv,     BufferDescriptor const &dstDvDesc,
+               float *dstDuu,    BufferDescriptor const &dstDuuDesc,
+               float *dstDuv,    BufferDescriptor const &dstDuvDesc,
+               float *dstDvv,    BufferDescriptor const &dstDvvDesc,
+               int numPatchCoords,
+               const PatchCoord *patchCoords,
+               const PatchArray *patchArrayBuffer,
+               const int *patchIndexBuffer,
+               const PatchParam *patchParamBuffer) {
+
+    TbbEvalPatchesKernel kernel(src, srcDesc, dst, dstDesc,
+                                dstDu, dstDuDesc, dstDv, dstDvDesc,
+                                dstDuu, dstDuuDesc,
+                                dstDuv, dstDuvDesc,
+                                dstDvv, dstDvvDesc,
                                numPatchCoords, patchCoords,
                                patchArrayBuffer,
                                patchIndexBuffer,
--- a/opensubdiv/osd/tbbKernel.h
+++ b/opensubdiv/osd/tbbKernel.h
@ -61,6 +61,25 @@ TbbEvalStencils(float const * src, BufferDescriptor const &srcDesc,
                float const * dvWeights,
                int start, int end);

+void
+TbbEvalStencils(float const * src, BufferDescriptor const &srcDesc,
+                float * dst,       BufferDescriptor const &dstDesc,
+                float * dstDu,     BufferDescriptor const &dstDuDesc,
+                float * dstDv,     BufferDescriptor const &dstDvDesc,
+                float * dstDuu,    BufferDescriptor const &dstDuuDesc,
+                float * dstDuv,    BufferDescriptor const &dstDuvDesc,
+                float * dstDvv,    BufferDescriptor const &dstDvvDesc,
+                int const * sizes,
+                int const * offsets,
+                int const * indices,
+                float const * weights,     // weights for dst
+                float const * duWeights,   // weights for dstDu
+                float const * dvWeights,   // weights for dstDv
+                float const * duuWeights,  // weights for dstDuu
+                float const * duvWeights,  // weights for dstDuv
+                float const * dvvWeights,  // weights for dstDvv
+                int start, int end);
+
 void
 TbbEvalPatches(float const *src, BufferDescriptor const &srcDesc,
               float *dst,       BufferDescriptor const &dstDesc,
@ -72,6 +91,20 @@ TbbEvalPatches(float const *src, BufferDescriptor const &srcDesc,
               const int *patchIndexBuffer,
               const PatchParam *patchParamBuffer);

+void
+TbbEvalPatches(float const *src, BufferDescriptor const &srcDesc,
+               float *dst,       BufferDescriptor const &dstDesc,
+               float *dstDu,     BufferDescriptor const &dstDuDesc,   // 1st derivative wrt u
+               float *dstDv,     BufferDescriptor const &dstDvDesc,   // 1st derivative wrt v
+               float *dstDuu,    BufferDescriptor const &dstDuuDesc,  // 2nd derivative wrt u
+               float *dstDuv,    BufferDescriptor const &dstDuvDesc,  // 2nd derivative wrt u and v
+               float *dstDvv,    BufferDescriptor const &dstDvvDesc,  // 2nd derivative wrt v
+               int numPatchCoords,
+               const PatchCoord *patchCoords,
+               const PatchArray *patchArrayBuffer,
+               const int *patchIndexBuffer,
+               const PatchParam *patchParamBuffer);
+
 }  // end namespace Osd

 }  // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/sdc/bilinearScheme.h
+++ b/opensubdiv/sdc/bilinearScheme.h
@ -109,7 +109,7 @@ Scheme<SCHEME_BILINEAR>::assignSmoothLimitMask(VERTEX const& vertex, MASK& posMa
 }

 //
-//  Limit masks for tangents -- these are ambibuous around all vertices.  Provide
+//  Limit masks for tangents -- these are ambiguous around all vertices.  Provide
 //  the tangents based on the incident edges of the first face.
 //
 template <>
--- a/opensubdiv/sdc/catmarkScheme.h
+++ b/opensubdiv/sdc/catmarkScheme.h
@ -362,7 +362,7 @@ Scheme<SCHEME_CATMARK>::assignCreaseLimitTangentMasks(VERTEX const& vertex,

    //
    //  Second, the tangent across the interior faces:
-    //      Note this is ambigous for an interior vertex.  We currently return
+    //      Note this is ambiguous for an interior vertex.  We currently return
    //  the tangent for the surface in the counter-clockwise span between the
    //  leading and trailing edges that form the crease.  Given the expected
    //  computation of a surface normal as Tan1 X Tan2, this tangent should be
--- a/opensubdiv/sdc/crease.h
+++ b/opensubdiv/sdc/crease.h
@ -53,7 +53,7 @@ namespace Sdc {
 ///  users will be expected to provided them -- particularly when they expect the mask queries
 ///  to do all of the work (just determining if a vertex is smooth will require inspection of
 ///  incident edge sharpness).
-///      Mask queries will occassionally require the subdivided sharpness values around the
+///      Mask queries will occasionally require the subdivided sharpness values around the
 ///  child vertex.  So users will be expected to either provide them up front when known, or to be
 ///  gathered on demand.  Any implementation of subdivision with creasing cannot avoid subdividing
 ///  the sharpness values first, so keeping them available for re-use is a worthwhile consideration.
@ -97,7 +97,7 @@ public:
    //@{
    ///  Optional sharp features:
    ///      Since options treat certain topological features as infinitely sharp -- boundaries
-    ///  or (in future) nonmanifold features -- sharpness values should be adjust before use.
+    ///  or (in future) non-manifold features -- sharpness values should be adjusted before use.
    ///  The following methods will adjust (by return) specific values according to the options
    ///  applied.
    ///
@ -190,7 +190,7 @@ Crease::SharpenBoundaryEdge(float /* edgeSharpness */) const {

    //
    //  Despite the presence of the BOUNDARY_NONE option, boundary edges are always sharpened.
-    //  Much of the code relies on sharpess to indicate boundaries to avoid the more complex
+    //  Much of the code relies on sharpness to indicate boundaries to avoid the more complex
    //  topological inspection
    //
    return SHARPNESS_INFINITE;
--- a/opensubdiv/sdc/loopScheme.h
+++ b/opensubdiv/sdc/loopScheme.h
@ -60,7 +60,7 @@ inline int Scheme<SCHEME_LOOP>::GetLocalNeighborhoodSize() { return 1; }
 //  Protected methods to assign the two types of masks for an edge-vertex --
 //  Crease and Smooth.
 //
-//  The Crease case does not really need to be speciailized, though it may be
+//  The Crease case does not really need to be specialized, though it may be
 //  preferable to define all explicitly here.
 //
 template <>
@ -132,7 +132,7 @@ Scheme<SCHEME_LOOP>::assignSmoothMaskForEdge(EDGE const& edge, MASK& mask) const
 //  Protected methods to assign the three types of masks for a vertex-vertex --
 //  Corner, Crease and Smooth (Dart is the same as Smooth).
 //
-//  Corner and Crease do not really need to be speciailized, though it may be
+//  Corner and Crease do not really need to be specialized, though it may be
 //  preferable to define all explicitly here.
 //
 template <>
@ -313,7 +313,7 @@ Scheme<SCHEME_LOOP>::assignSmoothLimitMask(VERTEX const& vertex, MASK& posMask)
 //
 //  A note on tangent magnitudes:
 //
-//  Several formulae exist for limit tangents at a vertex to accomodate the
+//  Several formulae exist for limit tangents at a vertex to accommodate the
 //  different topological configurations around the vertex.  While these produce
 //  the desired direction, there is inconsistency in the resulting magnitudes.
 //  Ideally a regular mesh of uniformly shaped triangles with similar edge lengths
@ -322,7 +322,7 @@ Scheme<SCHEME_LOOP>::assignSmoothLimitMask(VERTEX const& vertex, MASK& posMask)
 //  scale factors.
 //
 //  For uses where magnitude does not matter, this scaling should be irrelevant.
-//  But just as with patches, where the magnitudes of partial derivates are
+//  But just as with patches, where the magnitudes of partial derivatives are
 //  consistent between similar patches, the magnitudes of limit tangents should
 //  also be similar.
 //
@ -349,9 +349,9 @@ Scheme<SCHEME_LOOP>::assignSmoothLimitMask(VERTEX const& vertex, MASK& posMask)
 //  where v5 = v0 + (v4 - v3) and v6 = v0 + v1 - v2.
 //
 //  When the standard limit tangent mask is applied, the cosines of increments
-//  of pi/3 gives us coefficients that are mutliples of 1/2, leading to the first
+//  of pi/3 give us coefficients that are multiples of 1/2, leading to the first
 //  tangent T1 = 3/2 * (v1 - v4), rather than the widely used T1 = v1 - v4.  So
-//  this scale factor of 3/2 is applied to insure tangents along the boundaries
+//  this scale factor of 3/2 is applied to ensure tangents along the boundaries
 //  are of similar magnitude as tangents in the immediate interior (which may be
 //  parallel).
 //
@ -442,7 +442,7 @@ Scheme<SCHEME_LOOP>::assignCreaseLimitTangentMasks(VERTEX const& vertex,

    //
    //  Second, the tangent across the interior faces:
-    //      Note this is ambigous for an interior vertex.  We currently return
+    //      Note this is ambiguous for an interior vertex.  We currently return
    //  the tangent for the surface in the counter-clockwise span between the
    //  leading and trailing edges that form the crease.  Given the expected
    //  computation of a surface normal as Tan1 X Tan2, this tangent should be
--- a/opensubdiv/sdc/options.h
+++ b/opensubdiv/sdc/options.h
@ -42,7 +42,7 @@ namespace Sdc {
 ///  limit surface, including the "shape" of primitive variable data associated with
 ///  it.
 ///
-///  The intent is that these sets of options be defined at a high-level and
+///  The intent is that these sets of options be defined at a high level and
 ///  propagated into the lowest-level computation in support of each subdivision
 ///  scheme.  Ideally it remains a set of bit-fields (essentially an int) and so
 ///  remains lightweight and easily passed around by value.
@ -83,10 +83,10 @@ public:
    //  Trivial get/set methods:
    //

-    /// \brief Set vertex boundary interpolation rule
+    /// \brief Get vertex boundary interpolation rule
    VtxBoundaryInterpolation GetVtxBoundaryInterpolation() const { return (VtxBoundaryInterpolation) _vtxBoundInterp; }

-    /// \brief Get vertex boundary interpolation rule
+    /// \brief Set vertex boundary interpolation rule
    void SetVtxBoundaryInterpolation(VtxBoundaryInterpolation b) { _vtxBoundInterp = b; }

    /// \brief Get face-varying interpolation rule
@ -101,10 +101,10 @@ public:
    /// \brief Set edge crease rule
    void SetCreasingMethod(CreasingMethod c) { _creasingMethod = c; }

-    /// \brief Get triangle subdivsion weights rule (Catmark scheme only !)
+    /// \brief Get triangle subdivision weights rule (Catmark scheme only !)
    TriangleSubdivision GetTriangleSubdivision() const { return (TriangleSubdivision) _triangleSub; }

-    /// \brief Set triangle subdivsion weights rule (Catmark scheme only !)
+    /// \brief Set triangle subdivision weights rule (Catmark scheme only !)
    void SetTriangleSubdivision(TriangleSubdivision t) { _triangleSub = t; }

 private:
--- a/opensubdiv/sdc/scheme.h
+++ b/opensubdiv/sdc/scheme.h
@ -131,7 +131,7 @@ public:
    ///  edge while T2 points inward across the limit surface.
    ///
    ///  As for magnitude, no assumptions should be made of the magnitudes of the resulting
-    ///  tanget vectors.  Common formulae often factor out scale factors that contribute to
+    ///  tangent vectors.  Common formulae often factor out scale factors that contribute to
    ///  magnitude.  While some attempt has been made to make magnitudes more consistent
    ///  between regular corners, boundaries and the interior, the same has not been done at
    ///  irregular vertices -- at least not yet.  This may be addressed in future, as having
@ -201,9 +201,9 @@ protected:
    //  Internal implementation support:
    //
    //  We need a local "mask" class to be declared locally within the vertex-vertex mask query
-    //  to hold one of the two possible mask required and to combine the local mask with the mask
+    //  to hold one of the two possible masks required and to combine the local mask with the mask
    //  the caller provides.  It has been parameterized by <WEIGHT> so that a version compatible
-    //  with the callers mask class is created.
+    //  with the caller's mask class is created.
    //
    template <typename WEIGHT>
    class LocalMask {
@ -366,7 +366,7 @@ Scheme<SCHEME>::ComputeFaceVertexMask(FACE const& face, MASK& mask) const {
 //  determine if smooth or a crease, and also to detect and apply a transition from a
 //  crease to smooth.  Using the protected methods to assign the specific masks (only
 //  two -- smooth or crease) this implementation should serve all non-linear schemes
-//  (currently Catmark and Loop) and only need to be specialized it for Bilinear to
+//  (currently Catmark and Loop) and only needs to be specialized for Bilinear to
 //  trivialize it to the crease case.
 //
 //  The implementation here is slightly complicated by combining two scenarios into a
@ -446,7 +446,7 @@ Scheme<SCHEME>::ComputeEdgeVertexMask(EDGE const&     edge,
    }

    //
-    //  We are now left with have the Crease-to-Smooth case -- compute the Smooth mask
+    //  We are now left with the Crease-to-Smooth case -- compute the Smooth mask
    //  for the child and augment it with the transitional Crease of the parent.
    //
    //  A general combination of separately assigned masks here (as done in the vertex-
@ -475,7 +475,7 @@ Scheme<SCHEME>::ComputeEdgeVertexMask(EDGE const&     edge,
 //  to determine what subdivision Rules apply to the parent and its child vertex, and also to
 //  detect and apply a transition between two differing Rules.  Using the protected methods to
 //  assign specific masks, this implementation should serve all non-linear schemes (currently
-//  Catmark and Loop) and only need to be specialized for Bilinear to remove all unnecessary
+//  Catmark and Loop) and only needs to be specialized for Bilinear to remove all unnecessary
 //  complexity relating to creasing, Rules, etc.
 //
 //  The implementation here is slightly complicated by combining two scenarios into one --
@ -484,7 +484,7 @@ Scheme<SCHEME>::ComputeEdgeVertexMask(EDGE const&     edge,
 //  provided though, there are cases where the parent and child sharpness values need to be
 //  identified, so accounting for the unknown Rules too is not much of an added complication.
 //
-//  The benefit of supporting specified Rules is that they can often often be trivially
+//  The benefit of supporting specified Rules is that they can often be trivially
 //  determined from context (e.g. a vertex derived from a face at a previous level will always
 //  be smooth) rather than more generally, and at greater cost, inspecting neighboring
 //  components, and they are often the same for parent and child.
@ -572,7 +572,7 @@ Scheme<SCHEME>::ComputeVertexVertexMask(VERTEX const&   vertex,
    }

    //
-    //  Intialize a local child mask, compute the fractional weight from parent and child
+    //  Initialize a local child mask, compute the fractional weight from parent and child
    //  sharpness values and combine the two masks:
    //
    typedef typename MASK::Weight Weight;
--- a/opensubdiv/sdc/types.h
+++ b/opensubdiv/sdc/types.h
@ -32,7 +32,7 @@ namespace OPENSUBDIV_VERSION {
 namespace Sdc {

 ///
-///  \brief Enumerated type for all subdivisions schemes supported by OpenSubdiv
+///  \brief Enumerated type for all subdivision schemes supported by OpenSubdiv
 ///
 enum SchemeType {
    SCHEME_BILINEAR,
@ -42,7 +42,7 @@ enum SchemeType {


 ///
-///  \brief Enumerated type for all face splitting scheme
+///  \brief Enumerated type for all face splitting schemes
 ///
 enum Split {
    SPLIT_TO_QUADS,  ///< Used by Catmark and Bilinear
@ -51,8 +51,8 @@ enum Split {
 };

 ///
-///  \brief Traits associated the types of all subdivision schemes -- parameterized by
-///  the scheme type.  All traits are also defined on the scheme itself.
+///  \brief Traits associated with the types of all subdivision schemes -- parameterized by
+///  the scheme type.  All traits are also defined in the scheme itself.
 ///
 struct SchemeTypeTraits {

--- a/opensubdiv/version.h
+++ b/opensubdiv/version.h
@ -25,13 +25,13 @@
 #ifndef OPENSUBDIV3_VERSION_H
 #define OPENSUBDIV3_VERSION_H

-#define OPENSUBDIV_VERSION v3_1_1
+#define OPENSUBDIV_VERSION v3_2_0

-#define OPENSUBDIV_VERSION_NUMBER 30101
+#define OPENSUBDIV_VERSION_NUMBER 30200

 #define OPENSUBDIV_VERSION_MAJOR 3
-#define OPENSUBDIV_VERSION_MINOR 1
-#define OPENSUBDIV_VERSION_PATCH 1
+#define OPENSUBDIV_VERSION_MINOR 2
+#define OPENSUBDIV_VERSION_PATCH 0

 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
--- a/opensubdiv/vtr/componentInterfaces.h
+++ b/opensubdiv/vtr/componentInterfaces.h
@ -46,7 +46,7 @@ namespace internal {
 //
 //  These are not used with Vtr but arguably belong with it as the details to
 //  write these efficiently depend very much on intimate details of Vtr's
-//  implmentation, e.g. the use of tag bits, subdivision Rules, etc.
+//  implementation, e.g. the use of tag bits, subdivision Rules, etc.
 //


--- a/opensubdiv/vtr/fvarLevel.cpp
+++ b/opensubdiv/vtr/fvarLevel.cpp
@ -139,7 +139,7 @@ FVarLevel::resizeValues(int valueCount) {
 //
 //  Once values have been identified for each vertex and tagged, refinement propagates
 //  the tags to child values using more simplified logic (child values inherit the
-//  topology of their parent) and no futher analysis is required.
+//  topology of their parent) and no further analysis is required.
 //
 void
 FVarLevel::completeTopologyFromFaceValues(int regularBoundaryValence) {
@ -177,7 +177,7 @@ FVarLevel::completeTopologyFromFaceValues(int regularBoundaryValence) {


    //
-    //  Its awkward and potentially inefficient to try and accomplish everything in one
+    //  It's awkward and potentially inefficient to try and accomplish everything in one
    //  pass over the vertices...
    //
    //  Make a first pass through the vertices to identify discontinuous edges and to determine
@ -413,7 +413,7 @@ FVarLevel::completeTopologyFromFaceValues(int regularBoundaryValence) {

    //
    //  Now that we know the total number of additional sibling values (M values in addition
-    //  to the N vertex values) allocate space to accomodate all N + M vertex values.
+    //  to the N vertex values) allocate space to accommodate all N + M vertex values.
    //
    //  Then make the second pass through the vertices to identify the values associated with
    //  each and to inspect and tag local face-varying topology for those that don't match:
@ -582,7 +582,7 @@ FVarLevel::completeTopologyFromFaceValues(int regularBoundaryValence) {

 //
 //  Values tagged as creases have their two "end values" identified relative to the incident
-//  faces of the vertex for compact storage and quick retrieval.  This methods identifies the
+//  faces of the vertex for compact storage and quick retrieval.  This method identifies the
 //  values for the two ends of such a crease value:
 //
 void
--- a/Show More
+++ b/Show More