diff --git a/examples/dxPtexViewer/dxPtexViewer.cpp b/examples/dxPtexViewer/dxPtexViewer.cpp
old mode 100755
new mode 100644
index 448d6e69..5dd60e61
--- a/examples/dxPtexViewer/dxPtexViewer.cpp
+++ b/examples/dxPtexViewer/dxPtexViewer.cpp
@@ -819,7 +819,7 @@ createOsdMesh(int level, int kernel) {
 
 //------------------------------------------------------------------------------
 static void
-bindProgram(Effect effect, OpenSubdiv::Osd::D3D11PatchTable::PatchArray const & patch) {
+bindProgram(Effect effect, OpenSubdiv::Osd::PatchArray const & patch) {
 
     EffectDesc effectDesc(patch.GetDescriptor(), effect);
 
@@ -984,7 +984,7 @@ drawModel() {
     UINT hOffsets = 0;
     g_pd3dDeviceContext->IASetVertexBuffers(0, 1, &buffer, &hStrides, &hOffsets);
 
-    OpenSubdiv::Osd::D3D11PatchTable::PatchArrayVector const & patches =
+    OpenSubdiv::Osd::PatchArrayVector const & patches =
         g_mesh->GetPatchTable()->GetPatchArrays();
 
     g_pd3dDeviceContext->IASetIndexBuffer(
@@ -993,7 +993,7 @@ drawModel() {
 
     // patch drawing
     for (int i = 0; i < (int)patches.size(); ++i) {
-        OpenSubdiv::Osd::D3D11PatchTable::PatchArray const & patch = patches[i];
+        OpenSubdiv::Osd::PatchArray const & patch = patches[i];
         OpenSubdiv::Far::PatchDescriptor desc = patch.GetDescriptor();
         OpenSubdiv::Far::PatchDescriptor::Type patchType = desc.GetType();
 
diff --git a/examples/dxViewer/dxviewer.cpp b/examples/dxViewer/dxviewer.cpp
index d3189e5d..5e59b1eb 100644
--- a/examples/dxViewer/dxviewer.cpp
+++ b/examples/dxViewer/dxviewer.cpp
@@ -630,7 +630,7 @@ ShaderCache g_shaderCache;
 
 //------------------------------------------------------------------------------
 static void
-bindProgram(Effect effect, OpenSubdiv::Osd::D3D11PatchTable::PatchArray const & patch) {
+bindProgram(Effect effect, OpenSubdiv::Osd::PatchArray const & patch) {
 
     EffectDesc effectDesc(patch.GetDescriptor(), effect);
     typedef OpenSubdiv::Far::PatchDescriptor Descriptor;
@@ -822,7 +822,7 @@ display() {
     UINT hOffsets = 0;
     g_pd3dDeviceContext->IASetVertexBuffers(0, 1, &buffer, &hStrides, &hOffsets);
 
-    OpenSubdiv::Osd::D3D11PatchTable::PatchArrayVector const & patches =
+    OpenSubdiv::Osd::PatchArrayVector const & patches =
         g_mesh->GetPatchTable()->GetPatchArrays();
 
     g_pd3dDeviceContext->IASetIndexBuffer(
@@ -834,7 +834,7 @@ display() {
     int numDrawCalls = 0;
 
     for (int i=0; i<(int)patches.size(); ++i) {
-        OpenSubdiv::Osd::D3D11PatchTable::PatchArray const & patch = patches[i];
+        OpenSubdiv::Osd::PatchArray const & patch = patches[i];
 
         OpenSubdiv::Far::PatchDescriptor desc = patch.GetDescriptor();
         OpenSubdiv::Far::PatchDescriptor::Type patchType = desc.GetType();
diff --git a/examples/glEvalLimit/glEvalLimit.cpp b/examples/glEvalLimit/glEvalLimit.cpp
old mode 100755
new mode 100644
index ad6df89d..2dc54d9f
--- a/examples/glEvalLimit/glEvalLimit.cpp
+++ b/examples/glEvalLimit/glEvalLimit.cpp
@@ -44,15 +44,54 @@ GLFWmonitor* g_primary=0;
 
 #include <osd/cpuEvaluator.h>
 #include <osd/cpuVertexBuffer.h>
+#include <osd/cpuPatchTable.h>
 #include <osd/cpuGLVertexBuffer.h>
 #include <osd/mesh.h>
 
+#ifdef OPENSUBDIV_HAS_TBB
+    #include <osd/tbbEvaluator.h>
+#endif
+
+#ifdef OPENSUBDIV_HAS_OPENMP
+    #include <osd/ompEvaluator.h>
+#endif
+
+#ifdef OPENSUBDIV_HAS_CUDA
+    #include <osd/cudaEvaluator.h>
+    #include <osd/cudaVertexBuffer.h>
+    #include <osd/cudaGLVertexBuffer.h>
+    #include <osd/cudaPatchTable.h>
+    #include "../common/cudaDeviceContext.h"
+
+    CudaDeviceContext g_cudaDeviceContext;
+#endif
+
+#ifdef OPENSUBDIV_HAS_OPENCL
+    #include <osd/clVertexBuffer.h>
+    #include <osd/clGLVertexBuffer.h>
+    #include <osd/clEvaluator.h>
+    #include <osd/clPatchTable.h>
+    #include "../common/clDeviceContext.h"
+    CLDeviceContext g_clDeviceContext;
+#endif
+
+#ifdef OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK
+    #include <osd/glXFBEvaluator.h>
+    #include <osd/glVertexBuffer.h>
+    #include <osd/glPatchTable.h>
+#endif
+
+#ifdef OPENSUBDIV_HAS_GLSL_COMPUTE
+    #include <osd/glComputeEvaluator.h>
+    #include <osd/glVertexBuffer.h>
+    #include <osd/glPatchTable.h>
+#endif
+
 #include <far/gregoryBasis.h>
 #include <far/endCapGregoryBasisPatchFactory.h>
 #include <far/topologyRefiner.h>
 #include <far/stencilTableFactory.h>
 #include <far/patchTableFactory.h>
-#include <far/patchMap.h>
 
 #include <far/error.h>
 
@@ -75,32 +114,44 @@ GLFWmonitor* g_primary=0;
 using namespace OpenSubdiv;
 
 //------------------------------------------------------------------------------
+enum KernelType { kCPU = 0,
+                  kOPENMP = 1,
+                  kTBB = 2,
+                  kCUDA = 3,
+                  kCL = 4,
+                  kGLXFB = 5,
+                  kGLCompute = 6 };
+
+enum EndCap      { kEndCapBSplineBasis,
+                   kEndCapGregoryBasis };
+
+enum DrawMode { kUV,
+                kVARYING,
+                kNORMAL,
+                kSHADE,
+                kFACEVARYING };
+
 std::vector<float> g_orgPositions,
                    g_positions,
                    g_varyingColors;
 
 int g_currentShape = 0,
     g_level = 3,
+    g_kernel = kCPU,
+    g_endCap = kEndCapBSplineBasis,
     g_numElements = 3;
 
 std::vector<int>   g_coarseEdges;
 std::vector<float> g_coarseEdgeSharpness;
 std::vector<float> g_coarseVertexSharpness;
 
-enum DrawMode { kRANDOM=0,
-                kUV,
-                kVARYING,
-                kNORMAL,
-                kSHADE,
-                kFACEVARYING };
-
 int   g_running = 1,
       g_width = 1024,
       g_height = 1024,
       g_fullscreen = 0,
       g_drawCageEdges = 1,
       g_drawCageVertices = 1,
-      g_drawMode = kVARYING,
+      g_drawMode = kUV,
       g_prev_x = 0,
       g_prev_y = 0,
       g_mbutton[3] = {0, 0, 0},
@@ -131,11 +182,9 @@ float g_computeTime = 0;
 Stopwatch g_fpsTimer;
 
 //------------------------------------------------------------------------------
-int g_nparticles=0,
-    g_nsamples=101,
-    g_nsamplesFound=0;
+int g_nParticles = 65536;
 
-bool g_randomStart=false;
+bool g_randomStart = true;//false;
 
 GLuint g_cageEdgeVAO = 0,
        g_cageEdgeVBO = 0,
@@ -155,6 +204,7 @@ struct Program {
     GLuint attrColor;
     GLuint attrTangentU;
     GLuint attrTangentV;
+    GLuint attrPatchCoord;
 } g_defaultProgram;
 
 //------------------------------------------------------------------------------
@@ -204,37 +254,189 @@ createCoarseMesh(OpenSubdiv::Far::TopologyRefiner const & refiner) {
 }
 
 //------------------------------------------------------------------------------
-Far::TopologyRefiner * g_topologyRefiner = 0;
-
-Far::StencilTable const * g_vertexStencils = NULL;
-Far::StencilTable const * g_varyingStencils = NULL;
-
 Far::PatchTable const * g_patchTable = NULL;
-Far::PatchMap const * g_patchMap = NULL;
-std::vector<Osd::PatchCoord> g_patchCoords;
 
-Osd::VertexBufferDescriptor g_idesc(/*offset*/ 0, /*legnth*/ 3, /*stride*/ 3),
-                            g_odesc(/*offset*/ 0, /*legnth*/ 3, /*stride*/ 6),
-                            g_vdesc(/*offset*/ 3, /*legnth*/ 3, /*stride*/ 6),
-                            g_duDesc(/*offset*/ 0, /*legnth*/ 3, /*stride*/ 6),
-                            g_dvDesc(/*offset*/ 3, /*legnth*/ 3, /*stride*/ 6),
-                            g_fvidesc(/*offset*/ 0, /*legnth*/ 2, /*stride*/ 2),
-                            g_fvodesc(/*offset*/ 3, /*legnth*/ 2, /*stride*/ 6);
+// input and output vertex data
+class EvalOutputBase {
+public:
+    virtual ~EvalOutputBase() {}
+    virtual GLuint BindVertexData() const = 0;
+    virtual GLuint BindDerivatives() const = 0;
+    virtual GLuint BindPatchCoords() const = 0;
+    virtual void UpdateData(const float *src, int startVertex, int numVertices) = 0;
+    virtual void UpdateVaryingData(const float *src, int startVertex, int numVertices) = 0;
+    virtual void Refine() = 0;
+    virtual void EvalPatches() = 0;
+    virtual void EvalPatchesWithDerivatives() = 0;
+    virtual void EvalPatchesVarying() = 0;
+    virtual void UpdatePatchCoords(
+        std::vector<Osd::PatchCoord> const &patchCoords) = 0;
+};
 
-// input vertex data (coarse + refined)
-Osd::CpuVertexBuffer * g_vertexData = 0;
-Osd::CpuVertexBuffer * g_varyingData = 0;
+// note: Since we don't have a class for device-patchcoord container in osd,
+// we cheat to use vertexbuffer as a patch-coord (5int) container.
+//
+// Please don't follow the pattern in your actual application.
+//
+template<typename SRC_VERTEX_BUFFER, typename EVAL_VERTEX_BUFFER,
+         typename STENCIL_TABLE, typename PATCH_TABLE, typename EVALUATOR,
+         typename DEVICE_CONTEXT = void>
+class EvalOutput : public EvalOutputBase {
+public:
+    typedef OpenSubdiv::Osd::EvaluatorCacheT<EVALUATOR> EvaluatorCache;
 
-// output vertex data (limit locations)
-Osd::CpuGLVertexBuffer * g_outVertexData = NULL;
-Osd::CpuGLVertexBuffer * g_outDerivatives = NULL;
+    EvalOutput(Far::StencilTable const *vertexStencils,
+               Far::StencilTable const *varyingStencils,
+               int numCoarseVerts, int numTotalVerts, int numParticles,
+               Far::PatchTable const *patchTable,
+               EvaluatorCache *evaluatorCache = NULL,
+               DEVICE_CONTEXT *deviceContext = NULL)
+        : _srcDesc(       /*offset*/ 0, /*length*/ 3, /*stride*/ 3),
+          _srcVaryingDesc(/*offset*/ 0, /*length*/ 3, /*stride*/ 3),
+          _vertexDesc(    /*offset*/ 0, /*legnth*/ 3, /*stride*/ 6),
+          _varyingDesc(   /*offset*/ 3, /*legnth*/ 3, /*stride*/ 6),
+          _duDesc(        /*offset*/ 0, /*legnth*/ 3, /*stride*/ 6),
+          _dvDesc(        /*offset*/ 3, /*legnth*/ 3, /*stride*/ 6),
+          _deviceContext(deviceContext) {
+        _srcData = SRC_VERTEX_BUFFER::Create(3, numTotalVerts, _deviceContext);
+        _srcVaryingData = SRC_VERTEX_BUFFER::Create(3, numTotalVerts, _deviceContext);
+        _vertexData = EVAL_VERTEX_BUFFER::Create(6, numParticles, _deviceContext);
+        _derivatives = EVAL_VERTEX_BUFFER::Create(6, numParticles, _deviceContext);
+        _patchTable = PATCH_TABLE::Create(patchTable, _deviceContext);
+        _patchCoords = NULL;
+        _numCoarseVerts = numCoarseVerts;
+        _vertexStencils =
+            Osd::convertToCompatibleStencilTable<STENCIL_TABLE>(vertexStencils, _deviceContext);
+        _varyingStencils =
+            Osd::convertToCompatibleStencilTable<STENCIL_TABLE>(varyingStencils, _deviceContext);
+        _evaluatorCache = evaluatorCache;
+    }
+    ~EvalOutput() {
+        delete _srcData;
+        delete _srcVaryingData;
+        delete _vertexData;
+        delete _derivatives;
+        delete _patchTable;
+        delete _patchCoords;
+    }
+    virtual GLuint BindVertexData() const {
+        return _vertexData->BindVBO();
+    }
+    virtual GLuint BindDerivatives() const {
+        return _derivatives->BindVBO();
+    }
+    virtual GLuint BindPatchCoords() const {
+        return _patchCoords->BindVBO();
+    }
+    virtual void UpdateData(const float *src, int startVertex, int numVertices) {
+        _srcData->UpdateData(src, startVertex, numVertices, _deviceContext);
+    }
+    virtual void UpdateVaryingData(const float *src, int startVertex, int numVertices) {
+        _srcVaryingData->UpdateData(src, startVertex, numVertices, _deviceContext);
+    }
+    virtual void Refine() {
+        Osd::VertexBufferDescriptor dstDesc = _srcDesc;
+        dstDesc.offset += _numCoarseVerts * _srcDesc.stride;
+
+        EVALUATOR const *evalInstance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>(
+            _evaluatorCache, _srcDesc, dstDesc, _deviceContext);
+
+        EVALUATOR::EvalStencils(_srcData, _srcDesc,
+                                _srcData, dstDesc,
+                                _vertexStencils,
+                                evalInstance,
+                                _deviceContext);
+
+        dstDesc = _srcVaryingDesc;
+        dstDesc.offset += _numCoarseVerts * _srcVaryingDesc.stride;
+        evalInstance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>(
+            _evaluatorCache, _srcVaryingDesc, dstDesc, _deviceContext);
+
+        EVALUATOR::EvalStencils(_srcVaryingData, _srcVaryingDesc,
+                                _srcVaryingData, dstDesc,
+                                _varyingStencils,
+                                evalInstance,
+                                _deviceContext);
+    }
+    virtual void EvalPatches() {
+        EVALUATOR const *evalInstance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>(
+            _evaluatorCache, _srcDesc, _vertexDesc, _deviceContext);
+
+        EVALUATOR::EvalPatches(
+            _srcData, _srcDesc,
+            _vertexData, _vertexDesc,
+            _patchCoords->GetNumVertices(),
+            _patchCoords,
+            _patchTable, evalInstance, _deviceContext);
+    }
+    virtual void EvalPatchesWithDerivatives() {
+        EVALUATOR const *evalInstance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>(
+            _evaluatorCache, _srcDesc, _vertexDesc, _deviceContext);
+        EVALUATOR::EvalPatches(
+            _srcData, _srcDesc,
+            _vertexData, _vertexDesc,
+            _derivatives, _duDesc,
+            _derivatives, _dvDesc,
+            _patchCoords->GetNumVertices(),
+            _patchCoords,
+            _patchTable, evalInstance, _deviceContext);
+    }
+    virtual void EvalPatchesVarying() {
+        EVALUATOR const *evalInstance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>(
+            _evaluatorCache, _srcVaryingDesc, _varyingDesc, _deviceContext);
+
+        EVALUATOR::EvalPatches(
+            _srcVaryingData, _srcVaryingDesc,
+            // varyingdata is interleved in vertexData.
+            _vertexData, _varyingDesc,
+            _patchCoords->GetNumVertices(),
+            _patchCoords,
+            _patchTable, evalInstance, _deviceContext);
+    }
+    virtual void UpdatePatchCoords(
+        std::vector<Osd::PatchCoord> const &patchCoords) {
+        if (_patchCoords and
+            _patchCoords->GetNumVertices() != (int)patchCoords.size()) {
+            delete _patchCoords;
+            _patchCoords = NULL;
+        }
+        if (not _patchCoords) {
+            _patchCoords = EVAL_VERTEX_BUFFER::Create(5,
+                                                      (int)patchCoords.size(),
+                                                      _deviceContext);
+        }
+        _patchCoords->UpdateData((float*)&patchCoords[0], 0, (int)patchCoords.size(), _deviceContext);
+    }
+private:
+    SRC_VERTEX_BUFFER *_srcData;
+    SRC_VERTEX_BUFFER *_srcVaryingData;
+    EVAL_VERTEX_BUFFER *_vertexData;
+    EVAL_VERTEX_BUFFER *_derivatives;
+    EVAL_VERTEX_BUFFER *_varyingData;
+    EVAL_VERTEX_BUFFER *_patchCoords;
+    PATCH_TABLE *_patchTable;
+    Osd::VertexBufferDescriptor _srcDesc;
+    Osd::VertexBufferDescriptor _srcVaryingDesc;
+    Osd::VertexBufferDescriptor _vertexDesc;
+    Osd::VertexBufferDescriptor _varyingDesc;
+    Osd::VertexBufferDescriptor _duDesc;
+    Osd::VertexBufferDescriptor _dvDesc;
+    int _numCoarseVerts;
+
+    STENCIL_TABLE const *_vertexStencils;
+    STENCIL_TABLE const *_varyingStencils;
+
+    EvaluatorCache *_evaluatorCache;
+    DEVICE_CONTEXT *_deviceContext;
+};
+
+EvalOutputBase *g_evalOutput = NULL;
 
 STParticles * g_particles=0;
 
 //------------------------------------------------------------------------------
 static void
 updateGeom() {
-
     int nverts = (int)g_orgPositions.size() / 3;
 
     const float *p = &g_orgPositions[0];
@@ -255,28 +457,18 @@ updateGeom() {
     Stopwatch s;
     s.Start();
 
-    g_vertexData->UpdateData( &g_positions[0], 0, nverts);
+    // update coarse vertices
+    g_evalOutput->UpdateData(&g_positions[0], 0, nverts);
 
-    if (! g_topologyRefiner) return;
+    // update coarse varying
+    if (g_drawMode == kVARYING) {
+        g_evalOutput->UpdateVaryingData(&g_varyingColors[0], 0, nverts);
 
-    // note that for patch eval we need coarse+refined combined buffer.
-    int nCoarseVertices = g_topologyRefiner->GetLevel(0).GetNumVertices();
-    Osd::CpuEvaluator::EvalStencils(g_vertexData,
-                                    Osd::VertexBufferDescriptor(0, 3, 3),
-                                    g_vertexData,
-                                    Osd::VertexBufferDescriptor(
-                                        nCoarseVertices*3, 3, 3),
-                                    g_vertexStencils);
-
-    if (g_varyingData) {
-        Osd::CpuEvaluator::EvalStencils(g_varyingData,
-                                        Osd::VertexBufferDescriptor(0, 3, 3),
-                                        g_varyingData,
-                                        Osd::VertexBufferDescriptor(
-                                            nCoarseVertices*3, 3, 3),
-                                        g_varyingStencils);
     }
 
+    // Refine
+    g_evalOutput->Refine();
+
     s.Stop();
     g_computeTime = float(s.GetElapsed() * 1000.0f);
 
@@ -287,60 +479,28 @@ updateGeom() {
 
     // Apply 'dynamics' update
     assert(g_particles);
+
     g_particles->Update(g_evalTime); // XXXX g_evalTime is not really elapsed time...
 
+    std::vector<OpenSubdiv::Osd::PatchCoord> const &patchCoords
+        = g_particles->GetPatchCoords();
 
-    // resolve particle positions into patch handles
-    // XXX: this process should be handled by OsdKernel in parallel
-    g_patchCoords.clear();
-    for (int i = 0; i < g_particles->GetNumParticles(); ++i) {
-        STParticles::Position const &position = g_particles->GetPositions()[i];
-        Far::PatchTable::PatchHandle const *handle =
-            g_patchMap->FindPatch(position.ptexIndex, position.s, position.t);
-        if (handle) {
-            g_patchCoords.push_back(Osd::PatchCoord(
-                                        *handle, position.s, position.t));
-        }
-    }
+    // update patchcoord to be evaluated
+    g_evalOutput->UpdatePatchCoords(patchCoords);
 
     // Evaluate the positions of the samples on the limit surface
     if (g_drawMode == kNORMAL || g_drawMode == kSHADE) {
         // evaluate positions and derivatives
-        g_nsamplesFound = Osd::CpuEvaluator::EvalPatches(
-            g_vertexData,      g_idesc,
-            g_outVertexData,   g_odesc,
-            g_outDerivatives,  g_duDesc,
-            g_outDerivatives,  g_dvDesc,
-            (int)g_patchCoords.size(),
-            &g_patchCoords[0],
-            g_patchTable, NULL);
+        g_evalOutput->EvalPatchesWithDerivatives();
     } else {
         // evaluate positions
-        g_nsamplesFound = Osd::CpuEvaluator::EvalPatches(
-            g_vertexData,     g_idesc,
-            g_outVertexData,  g_odesc,
-            (int)g_patchCoords.size(),
-            &g_patchCoords[0],
-            g_patchTable, NULL);
+        g_evalOutput->EvalPatches();
     }
 
     // color
-    if (g_drawMode == kUV) {
-        // store patchCoords as colors
-        float *p = g_outVertexData->BindCpuBuffer() + g_vdesc.offset;
-        for (int i = 0; i < (int)g_patchCoords.size(); ++i) {
-            p[0] = g_patchCoords[i].s;
-            p[1] = g_patchCoords[i].t;
-            p[2] = 0;
-            p += g_vdesc.stride;
-        }
-    } else if (g_drawMode == kVARYING) {
+    if (g_drawMode == kVARYING) {
         // XXX: is this really varying?
-        Osd::CpuEvaluator::EvalPatches(g_varyingData,   g_idesc,
-                                       g_outVertexData, g_vdesc,
-                                       (int)g_patchCoords.size(),
-                                       &g_patchCoords[0],
-                                       g_patchTable, NULL);
+        g_evalOutput->EvalPatchesVarying();
     }
 
     s.Stop();
@@ -359,8 +519,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
     OpenSubdiv::Sdc::SchemeType sdctype = GetSdcType(*shape);
     OpenSubdiv::Sdc::Options sdcoptions = GetSdcOptions(*shape);
 
-    delete g_topologyRefiner;
-    g_topologyRefiner =
+    Far::TopologyRefiner *topologyRefiner =
         OpenSubdiv::Far::TopologyRefinerFactory<Shape>::Create(*shape,
             OpenSubdiv::Far::TopologyRefinerFactory<Shape>::Options(sdctype, sdcoptions));
 
@@ -371,22 +530,16 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
 
     float speed = g_particles ? g_particles->GetSpeed() : 0.2f;
 
-    // Create the 'uv particles' manager - this class manages the limit
-    // location samples (ptex face index, (s,t) and updates them between frames.
-    // Note: the number of limit locations can be entirely arbitrary
-    delete g_particles;
-    g_particles = new STParticles(*g_topologyRefiner, g_nsamples, !g_randomStart);
-    g_nparticles = g_particles->GetNumParticles();
-    g_particles->SetSpeed(speed);
-
-    createCoarseMesh(*g_topologyRefiner);
+    createCoarseMesh(*topologyRefiner);
 
+    Far::StencilTable const * vertexStencils = NULL;
+    Far::StencilTable const * varyingStencils = NULL;
     int nverts=0;
     {
         // Apply feature adaptive refinement to the mesh so that we can use the
         // limit evaluation API features.
         Far::TopologyRefiner::AdaptiveOptions options(level);
-        g_topologyRefiner->RefineAdaptive(options);
+        topologyRefiner->RefineAdaptive(options);
 
         // Generate stencil table to update the bi-cubic patches control
         // vertices after they have been re-posed (both for vertex & varying
@@ -395,27 +548,33 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
         soptions.generateOffsets=true;
         soptions.generateIntermediateLevels=true;
 
-        Far::StencilTable const * vertexStencils =
-            Far::StencilTableFactory::Create(*g_topologyRefiner, soptions);
+        vertexStencils =
+            Far::StencilTableFactory::Create(*topologyRefiner, soptions);
 
         soptions.interpolationMode = Far::StencilTableFactory::INTERPOLATE_VARYING;
-        Far::StencilTable const * varyingStencils =
-            Far::StencilTableFactory::Create(*g_topologyRefiner, soptions);
+
+        varyingStencils =
+            Far::StencilTableFactory::Create(*topologyRefiner, soptions);
 
         // Generate bi-cubic patch table for the limit surface
         Far::PatchTableFactory::Options poptions;
-        poptions.SetEndCapType(
-            Far::PatchTableFactory::Options::ENDCAP_GREGORY_BASIS);
+        if (g_endCap == kEndCapBSplineBasis) {
+            poptions.SetEndCapType(
+                Far::PatchTableFactory::Options::ENDCAP_BSPLINE_BASIS);
+        } else {
+            poptions.SetEndCapType(
+                Far::PatchTableFactory::Options::ENDCAP_GREGORY_BASIS);
+        }
 
         Far::PatchTable const * patchTable =
-            Far::PatchTableFactory::Create(*g_topologyRefiner, poptions);
+            Far::PatchTableFactory::Create(*topologyRefiner, poptions);
 
         // append endcap stencils
         if (Far::StencilTable const *endCapVertexStencilTable =
             patchTable->GetEndCapVertexStencilTable()) {
             Far::StencilTable const *table =
                 Far::StencilTableFactory::AppendEndCapStencilTable(
-                    *g_topologyRefiner,
+                    *topologyRefiner,
                     vertexStencils, endCapVertexStencilTable);
             delete vertexStencils;
             vertexStencils = table;
@@ -424,7 +583,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
             patchTable->GetEndCapVaryingStencilTable()) {
             Far::StencilTable const *table =
                 Far::StencilTableFactory::AppendEndCapStencilTable(
-                    *g_topologyRefiner,
+                    *topologyRefiner,
                     varyingStencils, endCapVaryingStencilTable);
             delete varyingStencils;
             varyingStencils = table;
@@ -434,45 +593,100 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
         nverts = vertexStencils->GetNumControlVertices() +
             vertexStencils->GetNumStencils();
 
-        if (g_vertexStencils) delete g_vertexStencils;
-        g_vertexStencils = vertexStencils;
-        if (g_varyingStencils) delete g_varyingStencils;
-        g_varyingStencils = varyingStencils;
-
         if (g_patchTable) delete g_patchTable;
         g_patchTable = patchTable;
-
-        // Create a far patch map
-        if (g_patchMap) delete g_patchMap;
-        g_patchMap = new Far::PatchMap(*g_patchTable);
     }
 
-    {   // Create vertex primvar buffer for the CVs
-        delete g_vertexData;
-        g_vertexData = Osd::CpuVertexBuffer::Create(3, nverts);
+    // note that for patch eval we need coarse+refined combined buffer.
+    int nCoarseVertices = topologyRefiner->GetLevel(0).GetNumVertices();
 
-        // Create varying primvar buffer for the CVs with random colors.
-        // These are immediately interpolated (once) and saved for display.
-        delete g_varyingData; g_varyingData = 0;
-        if (g_drawMode==kVARYING) {
-            g_varyingData = Osd::CpuVertexBuffer::Create(3, nverts);
-            g_varyingData->UpdateData( 
-                &g_varyingColors[0], 0, (int)g_varyingColors.size()/3 );
-        }
-
-        // Create output buffers for the limit samples (position & tangents)
-        delete g_outVertexData;
-        g_outVertexData = Osd::CpuGLVertexBuffer::Create(6, g_nparticles);
-        memset(g_outVertexData->BindCpuBuffer(), 0, g_nparticles*6*sizeof(float));
-        if (g_drawMode==kRANDOM) {
-            createRandomColors(g_nparticles, 6, g_outVertexData->BindCpuBuffer()+3);
-        }
-
-        delete g_outDerivatives;
-        g_outDerivatives = Osd::CpuGLVertexBuffer::Create(6, g_nparticles);
-        memset(g_outDerivatives->BindCpuBuffer(), 0, g_nparticles*6*sizeof(float));
+    delete g_evalOutput;
+    if (g_kernel == kCPU) {
+        g_evalOutput = new EvalOutput<Osd::CpuVertexBuffer,
+                                      Osd::CpuGLVertexBuffer,
+                                      Far::StencilTable,
+                                      Osd::CpuPatchTable,
+                                      Osd::CpuEvaluator>
+            (vertexStencils, varyingStencils,
+             nCoarseVertices, nverts, g_nParticles, g_patchTable);
+#ifdef OPENSUBDIV_HAS_OPENMP
+    } else if (g_kernel == kOPENMP) {
+        g_evalOutput = new EvalOutput<Osd::CpuVertexBuffer,
+                                      Osd::CpuGLVertexBuffer,
+                                      Far::StencilTable,
+                                      Osd::CpuPatchTable,
+                                      Osd::OmpEvaluator>
+            (vertexStencils, varyingStencils,
+            nCoarseVertices, nverts, g_nParticles, g_patchTable);
+#endif
+#ifdef OPENSUBDIV_HAS_TBB
+    } else if (g_kernel == kTBB) {
+        g_evalOutput = new EvalOutput<Osd::CpuVertexBuffer,
+                                      Osd::CpuGLVertexBuffer,
+                                      Far::StencilTable,
+                                      Osd::CpuPatchTable,
+                                      Osd::TbbEvaluator>
+            (vertexStencils, varyingStencils,
+            nCoarseVertices, nverts, g_nParticles, g_patchTable);
+#endif
+#ifdef OPENSUBDIV_HAS_CUDA
+    } else if (g_kernel == kCUDA) {
+        g_evalOutput = new EvalOutput<Osd::CudaVertexBuffer,
+                                      Osd::CudaGLVertexBuffer,
+                                      Osd::CudaStencilTable,
+                                      Osd::CudaPatchTable,
+                                      Osd::CudaEvaluator>
+            (vertexStencils, varyingStencils,
+            nCoarseVertices, nverts, g_nParticles, g_patchTable);
+#endif
+#ifdef OPENSUBDIV_HAS_OPENCL
+    } else if (g_kernel == kCL) {
+        static Osd::EvaluatorCacheT<Osd::CLEvaluator> clEvaluatorCache;
+        g_evalOutput = new EvalOutput<Osd::CLVertexBuffer,
+                                      Osd::CLGLVertexBuffer,
+                                      Osd::CLStencilTable,
+                                      Osd::CLPatchTable,
+                                      Osd::CLEvaluator,
+                                      CLDeviceContext>
+            (vertexStencils, varyingStencils,
+            nCoarseVertices, nverts, g_nParticles, g_patchTable,
+            &clEvaluatorCache, &g_clDeviceContext);
+#endif
+#ifdef OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK
+    } else if (g_kernel == kGLXFB) {
+        static Osd::EvaluatorCacheT<Osd::GLXFBEvaluator> glXFBEvaluatorCache;
+        g_evalOutput = new EvalOutput<Osd::GLVertexBuffer,
+                                      Osd::GLVertexBuffer,
+                                      Osd::GLStencilTableTBO,
+                                      Osd::GLPatchTable,
+                                      Osd::GLXFBEvaluator>
+            (vertexStencils, varyingStencils,
+             nCoarseVertices, nverts, g_nParticles, g_patchTable,
+             &glXFBEvaluatorCache);
+#endif
+#ifdef OPENSUBDIV_HAS_GLSL_COMPUTE
+    } else if (g_kernel == kGLCompute) {
+        static Osd::EvaluatorCacheT<Osd::GLComputeEvaluator> glComputeEvaluatorCache;
+        g_evalOutput = new EvalOutput<Osd::GLVertexBuffer,
+                                      Osd::GLVertexBuffer,
+                                      Osd::GLStencilTableSSBO,
+                                      Osd::GLPatchTable,
+                                      Osd::GLComputeEvaluator>
+            (vertexStencils, varyingStencils,
+             nCoarseVertices, nverts, g_nParticles, g_patchTable,
+             &glComputeEvaluatorCache);
+#endif
     }
 
+    // Create the 'uv particles' manager - this class manages the limit
+    // location samples (ptex face index, (s,t) and updates them between frames.
+    // Note: the number of limit locations can be entirely arbitrary
+    delete g_particles;
+    g_particles = new STParticles(*topologyRefiner, g_patchTable,
+                                  g_nParticles, !g_randomStart);
+    g_nParticles = g_particles->GetNumParticles();
+    g_particles->SetSpeed(speed);
+
     updateGeom();
 
 }
@@ -505,32 +719,33 @@ linkDefaultProgram() {
         "in vec3 color;\n"
         "in vec3 tangentU;\n"
         "in vec3 tangentV;\n"
+        "in vec2 patchCoord;\n"
         "out vec4 fragColor;\n"
-        "out vec3 normal;\n"
         "uniform mat4 ModelViewMatrix;\n"
         "uniform mat4 ProjectionMatrix;\n"
+        "uniform int DrawMode;\n"
         "void main() {\n"
-        "  fragColor = vec4(color, 1);\n"
-        // XXX: fix the normal transform
-        "  normal = (ModelViewMatrix * vec4(normalize(cross(tangentU, tangentV)), 0)).xyz;\n"
+        "  vec3 normal = (ModelViewMatrix * "
+        "               vec4(normalize(cross(tangentU, tangentV)), 0)).xyz;\n"
         "  gl_Position = ProjectionMatrix * ModelViewMatrix * "
         "                  vec4(position, 1);\n"
+        "  if (DrawMode == 0) {\n" // UV
+        "    fragColor = vec4(patchCoord.x, patchCoord.y, 0, 1);\n"
+        "  } else if (DrawMode == 2) {\n"
+        "    fragColor = vec4(normal*0.5+vec3(0.5), 1);\n"
+        "  } else if (DrawMode == 3) {\n"
+        "    fragColor = vec4(vec3(1)*dot(normal, vec3(0,0,1)), 1);\n"
+        "  } else {\n" // varying
+        "    fragColor = vec4(color, 1);\n"
+        "  }\n"
         "}\n";
 
     static const char *fsSrc =
         GLSL_VERSION_DEFINE
         "in vec4 fragColor;\n"
-        "in vec3 normal;\n"
-        "uniform int DrawMode;\n"
         "out vec4 color;\n"
         "void main() {\n"
-        "  if (DrawMode == 3) {\n"
-        "    color = vec4(normal*0.5+vec3(0.5), 1);\n"
-        "  } else if (DrawMode == 4) {\n"
-        "    color = vec4(vec3(1)*dot(normal, vec3(0,0,1)), 1);\n"
-        "  } else {\n"
-        "    color = fragColor;\n"
-        "  }\n"
+        "  color = fragColor;\n"
         "}\n";
 
     GLuint program = glCreateProgram();
@@ -544,6 +759,7 @@ linkDefaultProgram() {
     glBindAttribLocation(program, 1, "color");
     glBindAttribLocation(program, 2, "tangentU");
     glBindAttribLocation(program, 3, "tangentV");
+    glBindAttribLocation(program, 4, "patchCoord");
     glBindFragDataLocation(program, 0, "color");
 
     glLinkProgram(program);
@@ -571,6 +787,7 @@ linkDefaultProgram() {
     g_defaultProgram.attrColor = glGetAttribLocation(program, "color");
     g_defaultProgram.attrTangentU = glGetAttribLocation(program, "tangentU");
     g_defaultProgram.attrTangentV = glGetAttribLocation(program, "tangentV");
+    g_defaultProgram.attrPatchCoord = glGetAttribLocation(program, "patchCoord");
 
     return true;
 }
@@ -621,6 +838,7 @@ drawCageEdges() {
     glEnableVertexAttribArray(g_defaultProgram.attrColor);
     glDisableVertexAttribArray(g_defaultProgram.attrTangentU);
     glDisableVertexAttribArray(g_defaultProgram.attrTangentV);
+    glDisableVertexAttribArray(g_defaultProgram.attrPatchCoord);
     glVertexAttribPointer(g_defaultProgram.attrPosition,
                           3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, 0);
     glVertexAttribPointer(g_defaultProgram.attrColor,
@@ -680,6 +898,7 @@ drawCageVertices() {
     glEnableVertexAttribArray(g_defaultProgram.attrColor);
     glDisableVertexAttribArray(g_defaultProgram.attrTangentU);
     glDisableVertexAttribArray(g_defaultProgram.attrTangentV);
+    glDisableVertexAttribArray(g_defaultProgram.attrPatchCoord);
     glVertexAttribPointer(g_defaultProgram.attrPosition,
                           3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, 0);
     glVertexAttribPointer(g_defaultProgram.attrColor,
@@ -711,27 +930,33 @@ drawSamples() {
     glEnableVertexAttribArray(g_defaultProgram.attrTangentU);
     glEnableVertexAttribArray(g_defaultProgram.attrTangentV);
 
-    glBindBuffer(GL_ARRAY_BUFFER, g_outVertexData->BindVBO());
+    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->BindVertexData());
     glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, 0);
     glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, (float*)12);
 
-    glBindBuffer(GL_ARRAY_BUFFER, g_outDerivatives->BindVBO());
+    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->BindDerivatives());
     glVertexAttribPointer(2, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, 0);
     glVertexAttribPointer(3, 3, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 6, (float*)12);
 
+    glBindBuffer(GL_ARRAY_BUFFER, g_evalOutput->BindPatchCoords());
+    glVertexAttribPointer(4, 2, GL_FLOAT, GL_FALSE, sizeof (GLfloat) * 5, (float*)12);
+
     glEnableVertexAttribArray(g_defaultProgram.attrPosition);
     glEnableVertexAttribArray(g_defaultProgram.attrColor);
     glEnableVertexAttribArray(g_defaultProgram.attrTangentU);
     glEnableVertexAttribArray(g_defaultProgram.attrTangentV);
+    glEnableVertexAttribArray(g_defaultProgram.attrPatchCoord);
 
     glPointSize(2.0f);
-    glDrawArrays(GL_POINTS, 0, g_nparticles);
+    int nPatchCoords = (int)g_particles->GetPatchCoords().size();
+    glDrawArrays(GL_POINTS, 0, nPatchCoords);
     glPointSize(1.0f);
 
     glDisableVertexAttribArray(g_defaultProgram.attrPosition);
     glDisableVertexAttribArray(g_defaultProgram.attrColor);
     glDisableVertexAttribArray(g_defaultProgram.attrTangentU);
     glDisableVertexAttribArray(g_defaultProgram.attrTangentV);
+    glDisableVertexAttribArray(g_defaultProgram.attrPatchCoord);
 
     glBindVertexArray(0);
 
@@ -789,9 +1014,10 @@ display() {
         double fps = 1.0/g_fpsTimer.GetElapsed();
         g_fpsTimer.Start();
 
+        int nPatchCoords = (int)g_particles->GetPatchCoords().size();
+
         g_hud.DrawString(10, -150, "Particle Speed ([) (]): %.1f", g_particles->GetSpeed());
-        g_hud.DrawString(10, -120, "# Samples  : (%d/%d)",
-                         g_nsamplesFound, g_outVertexData->GetNumVertices());
+        g_hud.DrawString(10, -120, "# Samples  : (%d / %d)", nPatchCoords, g_nParticles);
         g_hud.DrawString(10, -100, "Compute    : %.3f ms", g_computeTime);
         g_hud.DrawString(10, -80,  "Eval       : %.3f ms", g_evalTime * 1000.f);
         g_hud.DrawString(10, -60,  "GPU Draw   : %.3f ms", drawGpuTime);
@@ -884,9 +1110,11 @@ void windowClose(GLFWwindow*) {
 //------------------------------------------------------------------------------
 static void
 setSamples(bool add) {
-    g_nsamples += add ? 50 : -50;
-
-    g_nsamples = std::max(0, g_nsamples);
+    if (add) {
+        g_nParticles = g_nParticles * 2;
+    } else {
+        g_nParticles = std::max(1, g_nParticles / 2);
+    }
 
     createOsdMesh(g_defaultShapes[g_currentShape], g_level);
 }
@@ -936,6 +1164,40 @@ callbackModel(int m) {
     createOsdMesh(g_defaultShapes[g_currentShape], g_level);
 }
 
+//------------------------------------------------------------------------------
+static void
+callbackEndCap(int endCap) {
+    g_endCap = endCap;
+    createOsdMesh(g_defaultShapes[g_currentShape], g_level);
+}
+
+//------------------------------------------------------------------------------
+static void
+callbackKernel(int k) {
+
+    g_kernel = k;
+
+#ifdef OPENSUBDIV_HAS_OPENCL
+    if (g_kernel == kCL and (not g_clDeviceContext.IsInitialized())) {
+        if (g_clDeviceContext.Initialize() == false) {
+            printf("Error in initializing OpenCL\n");
+            exit(1);
+        }
+    }
+#endif
+#ifdef OPENSUBDIV_HAS_CUDA
+    if (g_kernel == kCUDA and (not g_cudaDeviceContext.IsInitialized())) {
+        if (g_cudaDeviceContext.Initialize() == false) {
+            printf("Error in initializing Cuda\n");
+            exit(1);
+        }
+    }
+#endif
+
+    createOsdMesh(g_defaultShapes[g_currentShape], g_level);
+
+}
+
 //------------------------------------------------------------------------------
 static void
 callbackLevel(int l) {
@@ -1001,10 +1263,40 @@ initHUD() {
     g_hud.AddCheckBox("Animate vertices (M)", g_moveScale != 0, 10, 50, callbackAnimate, 0, 'm');
     g_hud.AddCheckBox("Freeze (spc)", false, 10, 70, callbackFreeze, 0, ' ');
 
-    g_hud.AddCheckBox("Random Start", false, 10, 120, callbackCentered, g_randomStart);
+    g_hud.AddCheckBox("Random Start", g_randomStart, 10, 120, callbackCentered, 0);
+
+    int compute_pulldown = g_hud.AddPullDown("Compute (K)", 475, 10, 300,
+                                             callbackKernel, 'k');
+    g_hud.AddPullDownButton(compute_pulldown, "CPU", kCPU);
+#ifdef OPENSUBDIV_HAS_OPENMP
+    g_hud.AddPullDownButton(compute_pulldown, "OPENMP", kOPENMP);
+#endif
+#ifdef OPENSUBDIV_HAS_TBB
+    g_hud.AddPullDownButton(compute_pulldown, "TBB", kTBB);
+#endif
+#ifdef OPENSUBDIV_HAS_CUDA
+    g_hud.AddPullDownButton(compute_pulldown, "CUDA", kCUDA);
+#endif
+#ifdef OPENSUBDIV_HAS_OPENCL
+    g_hud.AddPullDownButton(compute_pulldown, "OpenCL", kCL);
+#endif
+#ifdef OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK
+    g_hud.AddPullDownButton(compute_pulldown, "GL XFB", kGLXFB);
+#endif
+#ifdef OPENSUBDIV_HAS_GLSL_COMPUTE
+    g_hud.AddPullDownButton(compute_pulldown, "GL Compute", kGLCompute);
+#endif
+
+    int endcap_pulldown = g_hud.AddPullDown("End cap (E)", 10, 140, 200,
+                                            callbackEndCap, 'e');
+    g_hud.AddPullDownButton(endcap_pulldown, "BSpline",
+        kEndCapBSplineBasis,
+        g_endCap == kEndCapBSplineBasis);
+    g_hud.AddPullDownButton(endcap_pulldown, "GregoryBasis",
+        kEndCapGregoryBasis,
+        g_endCap == kEndCapGregoryBasis);
 
     int shading_pulldown = g_hud.AddPullDown("Shading (W)", 250, 10, 250, callbackDisplayVaryingColors, 'w');
-    g_hud.AddPullDownButton(shading_pulldown, "Random", kRANDOM, g_drawMode==kRANDOM);
     g_hud.AddPullDownButton(shading_pulldown, "(u,v)", kUV, g_drawMode==kUV);
     g_hud.AddPullDownButton(shading_pulldown, "Varying", kVARYING, g_drawMode==kVARYING);
     g_hud.AddPullDownButton(shading_pulldown, "Normal", kNORMAL, g_drawMode==kNORMAL);
diff --git a/examples/glEvalLimit/particles.cpp b/examples/glEvalLimit/particles.cpp
index 5dee362d..fccbae97 100644
--- a/examples/glEvalLimit/particles.cpp
+++ b/examples/glEvalLimit/particles.cpp
@@ -25,38 +25,104 @@
 #include "particles.h"
 
 #include <far/ptexIndices.h>
+#include <far/patchMap.h>
+
+#ifdef OPENSUBDIV_HAS_TBB
+#include <tbb/parallel_for.h>
+#include <tbb/atomic.h>
+tbb::atomic<int> g_tbbCounter;
+class TbbUpdateKernel {
+public:
+    TbbUpdateKernel(float speed,
+                    STParticles::Position *positions,
+                    float *velocities,
+                    std::vector<STParticles::FaceInfo> const &adjacency,
+                    OpenSubdiv::Osd::PatchCoord *patchCoords,
+                    OpenSubdiv::Far::PatchMap const *patchMap) :
+        _speed(speed), _positions(positions), _velocities(velocities),
+        _adjacency(adjacency), _patchCoords(patchCoords), _patchMap(patchMap) {
+    }
+
+    void operator () (tbb::blocked_range<int> const &r) const {
+        for (int i = r.begin(); i < r.end(); ++i) {
+            STParticles::Position * p = _positions + i;
+            float    *dp = _velocities + i*2;
+
+            // apply velocity
+            p->s += dp[0] * _speed;
+            p->t += dp[1] * _speed;
+
+            // make sure particles can't skip more than 1 face boundary at a time
+            assert((p->s>-2.0f) and (p->s<2.0f) and (p->t>-2.0f) and (p->t<2.0f));
+
+            // check if the particle is jumping a boundary
+            // note: a particle can jump 2 edges at a time (a "diagonal" jump)
+            //       this is not treated here.
+            int edge = -1;
+            if (p->s >= 1.0f) edge = 1;
+            if (p->s <= 0.0f) edge = 3;
+            if (p->t >= 1.0f) edge = 2;
+            if (p->t <= 0.0f) edge = 0;
+
+            if (edge>=0) {
+                // warp the particle to the other side of the boundary
+                STParticles::WarpParticle(_adjacency, edge, p, dp);
+            }
+            assert((p->s>=0.0f) and (p->s<=1.0f) and (p->t>=0.0f) and (p->t<=1.0f));
+
+            // resolve particle positions into patch handles
+            OpenSubdiv::Far::PatchTable::PatchHandle const *handle =
+                _patchMap->FindPatch(p->ptexIndex, p->s, p->t);
+            if (handle) {
+                int index = g_tbbCounter.fetch_and_add(1);
+                _patchCoords[index] =
+                    OpenSubdiv::Osd::PatchCoord(*handle, p->s, p->t);
+            }
+        }
+    }
+private:
+    float _speed;
+    STParticles::Position *_positions;
+    float *_velocities;
+    std::vector<STParticles::FaceInfo> const &_adjacency;
+    OpenSubdiv::Osd::PatchCoord *_patchCoords;
+    OpenSubdiv::Far::PatchMap const *_patchMap;
+};
+#endif
 
 #include <cassert>
 
-STParticles::STParticles(Refiner const & refiner, int nparticles, bool centered) :
+STParticles::STParticles(Refiner const & refiner,
+                         PatchTable const *patchTable,
+                         int nParticles, bool centered) :
     _speed(1.0f) {
 
     OpenSubdiv::Far::PtexIndices ptexIndices(refiner);
 
-    int nptexfaces = ptexIndices.GetNumFaces(),
-        nsamples = nptexfaces * nparticles;
+    // Create a far patch map
+    _patchMap = new OpenSubdiv::Far::PatchMap(*patchTable);
+
+    int nPtexFaces = ptexIndices.GetNumFaces();
 
     srand(static_cast<int>(2147483647));
 
     {   // initialize positions
 
-        _positions.resize(nsamples);
+        _positions.resize(nParticles);
         Position * pos = &_positions[0];
 
-        for (int i=0; i<nptexfaces; ++i) {
-            for (int j=0; j<nparticles; ++j) {
-                pos->ptexIndex = i;
-                pos->s = centered ? 0.5f : (float)rand()/(float)RAND_MAX;
-                pos->t = centered ? 0.5f : (float)rand()/(float)RAND_MAX;
-                ++pos;
-            }
+        for (int i = 0; i < nParticles; ++i) {
+            pos->ptexIndex = (int)(((float)rand()/(float)RAND_MAX) * nPtexFaces);
+            pos->s = centered ? 0.5f : (float)rand()/(float)RAND_MAX;
+            pos->t = centered ? 0.5f : (float)rand()/(float)RAND_MAX;
+            ++pos;
         }
     }
 
     {   // initialize velocities
-        _velocities.resize(nsamples*2);
+        _velocities.resize(nParticles * 2);
 
-        for (int i=0; i<nsamples; ++i) {
+        for (int i = 0; i < nParticles; ++i) {
             // initialize normalized random directions
             float s = 2.0f*(float)rand()/(float)RAND_MAX - 1.0f,
                   t = 2.0f*(float)rand()/(float)RAND_MAX - 1.0f,
@@ -68,7 +134,7 @@ STParticles::STParticles(Refiner const & refiner, int nparticles, bool centered)
     }
 
     {   // initialize topology adjacency
-        _adjacency.resize(nptexfaces);
+        _adjacency.resize(nPtexFaces);
 
         OpenSubdiv::Far::TopologyLevel const & refBaseLevel = refiner.GetLevel(0);
 
@@ -97,42 +163,6 @@ STParticles::STParticles(Refiner const & refiner, int nparticles, bool centered)
     //std::cout << *this;
 }
 
-void
-STParticles::Update(float deltaTime) {
-
-    float speed = GetSpeed() * std::max(0.001f, std::min(deltaTime, 0.5f));
-
-    Position *  p = &_positions[0];
-    float    * dp = &_velocities[0];
-
-    for (int i=0; i<GetNumParticles(); ++i, ++p, dp+=2) {
-
-        // apply velocity
-        p->s += dp[0] * speed;
-        p->t += dp[1] * speed;
-        
-        // make sure particles can't skip more than 1 face boundary at a time
-        assert((p->s>-2.0f) and (p->s<2.0f) and (p->t>-2.0f) and (p->t<2.0f));
-        
-        // check if the particle is jumping a boundary
-        // note: a particle can jump 2 edges at a time (a "diagonal" jump)
-        //       this is not treated here.
-        int edge = -1;
-
-        if (p->s >= 1.0f) edge = 1; 
-        if (p->s <= 0.0f) edge = 3; 
-        if (p->t >= 1.0f) edge = 2; 
-        if (p->t <= 0.0f) edge = 0; 
-        
-        if (edge>=0) {
-            // warp the particle to the other side of the boundary
-            warpParticle(edge, p, dp);
-        }
-        assert((p->s>=0.0f) and (p->s<=1.0f) and (p->t>=0.0f) and (p->t<=1.0f));
-    }
-    
-}
-
 inline void
 FlipS(STParticles::Position * p, float * dp) {
     p->s = 1.0f-p->s;
@@ -163,6 +193,15 @@ Rotate(int rot, STParticles::Position * p, float * dp) {
     assert((p->s>=0.0f) and (p->s<=1.0f) and (p->t>=0.0f) and (p->t<=1.0f));
 }
 
+inline void
+Trim(STParticles::Position * p) {
+    if (p->s <0.0f) p->s = 1.0f + p->s;
+    if (p->s>=1.0f) p->s = p->s - 1.0f;
+    if (p->t <0.0f) p->t = 1.0f + p->t;
+    if (p->t>=1.0f) p->t = p->t - 1.0f;
+    assert((p->s>=0.0f) and (p->s<=1.0f) and (p->t>=0.0f) and (p->t<=1.0f));
+}
+
 inline void
 Clamp(STParticles::Position * p) {
          if (p->s<0.0f) {
@@ -192,21 +231,12 @@ Bounce(int edge, STParticles::Position * p, float * dp) {
     assert((p->s>=0.0f) and (p->s<=1.0f) and (p->t>=0.0f) and (p->t<=1.0f));
 }
 
-inline void
-Trim(STParticles::Position * p) {
-    if (p->s <0.0f) p->s = 1.0f + p->s;
-    if (p->s>=1.0f) p->s = p->s - 1.0f;
-    if (p->t <0.0f) p->t = 1.0f + p->t;
-    if (p->t>=1.0f) p->t = p->t - 1.0f;
-    assert((p->s>=0.0f) and (p->s<=1.0f) and (p->t>=0.0f) and (p->t<=1.0f));
-}
-
 void
-STParticles::warpParticle(int edge, Position * p, float * dp) {
-
-    assert(p->ptexIndex<(int)_adjacency.size() and (edge>=0 and edge<4));
+STParticles::WarpParticle(std::vector<FaceInfo> const &adjacency,
+                          int edge, Position * p, float * dp) {
+    assert(p->ptexIndex<(int)adjacency.size() and (edge>=0 and edge<4));
     
-    FaceInfo const & f = _adjacency[p->ptexIndex];
+    FaceInfo const & f = adjacency[p->ptexIndex];
 
     int afid = f.adjface(edge),
         aeid = f.adjedge(edge);
@@ -215,7 +245,7 @@ STParticles::warpParticle(int edge, Position * p, float * dp) {
         // boundary detected: bounce the particle
         Bounce(edge, p, dp);
     } else {
-        FaceInfo const & af = _adjacency[afid];
+        FaceInfo const & af = adjacency[afid];
         int rot = edge - aeid + 2;
 
         bool fIsSubface = f.isSubface(),
@@ -233,6 +263,66 @@ STParticles::warpParticle(int edge, Position * p, float * dp) {
     assert((p->s>=0.0f) and (p->s<=1.0f) and (p->t>=0.0f) and (p->t<=1.0f));
 }
 
+STParticles::~STParticles() {
+    delete _patchMap;
+}
+
+void
+STParticles::Update(float deltaTime) {
+
+    if (fabs(GetSpeed()) < 0.001f) return;
+    float speed = GetSpeed() * std::max(0.001f, std::min(deltaTime, 0.5f));
+
+    _patchCoords.clear();
+
+    // XXX: this process should be parallelized.
+#ifdef OPENSUBDIV_HAS_TBB
+
+    _patchCoords.resize((int)GetNumParticles());
+    TbbUpdateKernel kernel(speed, &_positions[0], &_velocities[0],
+                           _adjacency, &_patchCoords[0], _patchMap);;
+    g_tbbCounter = 0;
+    tbb::blocked_range<int> range(0, GetNumParticles(), 256);
+    tbb::parallel_for(range, kernel);
+    _patchCoords.resize(g_tbbCounter);
+#else
+    Position *  p = &_positions[0];
+    float    * dp = &_velocities[0];
+    for (int i=0; i<GetNumParticles(); ++i, ++p, dp+=2) {
+        // apply velocity
+        p->s += dp[0] * speed;
+        p->t += dp[1] * speed;
+
+        // make sure particles can't skip more than 1 face boundary at a time
+        assert((p->s>-2.0f) and (p->s<2.0f) and (p->t>-2.0f) and (p->t<2.0f));
+
+        // check if the particle is jumping a boundary
+        // note: a particle can jump 2 edges at a time (a "diagonal" jump)
+        //       this is not treated here.
+        int edge = -1;
+
+        if (p->s >= 1.0f) edge = 1;
+        if (p->s <= 0.0f) edge = 3;
+        if (p->t >= 1.0f) edge = 2;
+        if (p->t <= 0.0f) edge = 0;
+
+        if (edge>=0) {
+            // warp the particle to the other side of the boundary
+            WarpParticle(_adjacency, edge, p, dp);
+        }
+        assert((p->s>=0.0f) and (p->s<=1.0f) and (p->t>=0.0f) and (p->t<=1.0f));
+
+        // resolve particle positions into patch handles
+        OpenSubdiv::Far::PatchTable::PatchHandle const *handle =
+            _patchMap->FindPatch(p->ptexIndex, p->s, p->t);
+        if (handle) {
+            _patchCoords.push_back(
+                OpenSubdiv::Osd::PatchCoord(*handle, p->s, p->t));
+        }
+    }
+#endif
+}
+
 // Dump adjacency info
 std::ostream & operator << (std::ostream & os,
     STParticles::FaceInfo const & f) {
diff --git a/examples/glEvalLimit/particles.h b/examples/glEvalLimit/particles.h
index ea1f798d..887add25 100644
--- a/examples/glEvalLimit/particles.h
+++ b/examples/glEvalLimit/particles.h
@@ -26,7 +26,8 @@
 #define ST_PARTICLES_H
 
 #include <far/topologyRefiner.h>
-
+#include <far/patchMap.h>
+#include <osd/types.h>
 #include <iostream>
 
 //
@@ -72,47 +73,6 @@ public:
         float s, t;         ///< parametric location on face
     };
 
-    typedef OpenSubdiv::Far::TopologyRefiner Refiner;
-
-    STParticles(Refiner const & refiner, int nparticles, bool centered=false);
-
-    void Update(float deltaTime);
-
-    int GetNumParticles() const {
-        return (int)_positions.size();
-    }
-
-    void SetSpeed(float speed) {
-        _speed = std::max(-1.0f, std::min(1.0f, speed));
-    }
-    
-    float GetSpeed() const {
-        return _speed;
-    }
-    
-    std::vector<Position> & GetPositions() {
-        return _positions;
-    }
-
-    std::vector<float> & GetVelocities() {
-        return _velocities;
-    }
-
-    friend std::ostream & operator << (std::ostream & os, STParticles const & f);
-
-private:
-
-    //
-    // Particle "Dynamics"
-    //
-    std::vector<Position> _positions;
-
-    std::vector<float> _velocities;
-
-    float _speed;  // velocity multiplier
-
-private:
-
     //
     // Topology adjacency (borrowed from Ptexture.h)
     //
@@ -152,11 +112,63 @@ private:
                  int adjfaces[4];
     };
 
-    void warpParticle(int edge, Position * p, float * dp);
+    typedef OpenSubdiv::Far::TopologyRefiner Refiner;
+    typedef OpenSubdiv::Far::PatchTable PatchTable;
+
+    STParticles(Refiner const & refiner, PatchTable const *patchTable,
+                int nparticles, bool centered=false);
+
+    ~STParticles();
+
+    void Update(float deltaTime);
+
+    int GetNumParticles() const {
+        return (int)_positions.size();
+    }
+
+    void SetSpeed(float speed) {
+        _speed = std::max(-1.0f, std::min(1.0f, speed));
+    }
+    
+    float GetSpeed() const {
+        return _speed;
+    }
+    
+    std::vector<Position> & GetPositions() {
+        return _positions;
+    }
+
+    std::vector<float> & GetVelocities() {
+        return _velocities;
+    }
+
+    std::vector<OpenSubdiv::Osd::PatchCoord> GetPatchCoords() const {
+        return _patchCoords;
+    }
+
+    friend std::ostream & operator << (std::ostream & os, STParticles const & f);
+
+    static void WarpParticle(std::vector<FaceInfo> const &adjacency,
+                             int edge, Position * p, float * dp);
+
+private:
+
+    //
+    // Particle "Dynamics"
+    //
+    std::vector<Position> _positions;
+
+    std::vector<float> _velocities;
+
+    std::vector<OpenSubdiv::Osd::PatchCoord> _patchCoords;
+
+    float _speed;  // velocity multiplier
 
     friend std::ostream & operator << (std::ostream & os, FaceInfo const & f);
 
+
     std::vector<FaceInfo> _adjacency;
+    OpenSubdiv::Far::PatchMap const *_patchMap;
 };
 
 #endif // ST_PARTICLES_H
diff --git a/examples/glFVarViewer/glFVarViewer.cpp b/examples/glFVarViewer/glFVarViewer.cpp
index faea611a..954d263e 100644
--- a/examples/glFVarViewer/glFVarViewer.cpp
+++ b/examples/glFVarViewer/glFVarViewer.cpp
@@ -786,7 +786,7 @@ bindTextures() {
 }
 
 static GLenum
-bindProgram(Effect effect, OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch) {
+bindProgram(Effect effect, OpenSubdiv::Osd::PatchArray const & patch) {
 
     EffectDesc effectDesc(patch.GetDescriptor(), effect);
 
@@ -868,7 +868,7 @@ display() {
 
     glBindVertexArray(g_vao);
 
-    OpenSubdiv::Osd::GLPatchTable::PatchArrayVector const & patches =
+    OpenSubdiv::Osd::PatchArrayVector const & patches =
         g_mesh->GetPatchTable()->GetPatchArrays();
 
     if (g_displayStyle == kWire)
@@ -879,7 +879,7 @@ display() {
 
     // patch drawing
     for (int i = 0; i < (int)patches.size(); ++i) {
-        OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch = patches[i];
+        OpenSubdiv::Osd::PatchArray const & patch = patches[i];
 
         GLenum primType = bindProgram(GetEffect(), patch);
 
@@ -909,7 +909,7 @@ display() {
     glPolygonMode(GL_FRONT_AND_BACK, GL_LINE);
 
     for (int i = 0; i < (int)patches.size(); ++i) {
-        OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch = patches[i];
+        OpenSubdiv::Osd::PatchArray const & patch = patches[i];
 
         GLenum primType = bindProgram(GetEffect(/*uvDraw=*/ true), patch);
 
diff --git a/examples/glImaging/glImaging.cpp b/examples/glImaging/glImaging.cpp
old mode 100755
new mode 100644
index 1e0c38fb..a58e2dab
--- a/examples/glImaging/glImaging.cpp
+++ b/examples/glImaging/glImaging.cpp
@@ -422,11 +422,11 @@ void runTest(ShapeDesc const &shapeDesc, std::string const &kernel,
                       mesh->GetPatchTable()->GetPatchParamTextureBuffer());
     }
 
-    Osd::GLPatchTable::PatchArrayVector const & patches =
+    Osd::PatchArrayVector const & patches =
         mesh->GetPatchTable()->GetPatchArrays();
 
     for (int i=0; i<(int)patches.size(); ++i) {
-        Osd::GLPatchTable::PatchArray const & patch = patches[i];
+        Osd::PatchArray const & patch = patches[i];
         Far::PatchDescriptor desc = patch.GetDescriptor();
         Far::PatchDescriptor::Type patchType = desc.GetType();
 
diff --git a/examples/glPaintTest/glPaintTest.cpp b/examples/glPaintTest/glPaintTest.cpp
index d75f1039..4e13a61e 100644
--- a/examples/glPaintTest/glPaintTest.cpp
+++ b/examples/glPaintTest/glPaintTest.cpp
@@ -597,7 +597,7 @@ static void bindTextures(Effect effect) {
 }
 
 static GLuint
-bindProgram(Effect effect, OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch) {
+bindProgram(Effect effect, OpenSubdiv::Osd::PatchArray const & patch) {
 
     EffectDesc effectDesc(patch.GetDescriptor(), effect);
 
@@ -670,12 +670,12 @@ display() {
 
     glBindVertexArray(g_vao);
 
-    OpenSubdiv::Osd::GLPatchTable::PatchArrayVector const & patches =
+    OpenSubdiv::Osd::PatchArrayVector const & patches =
         g_mesh->GetPatchTable()->GetPatchArrays();
 
     // patch drawing
     for (int i=0; i<(int)patches.size(); ++i) {
-        OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch = patches[i];
+        OpenSubdiv::Osd::PatchArray const & patch = patches[i];
         OpenSubdiv::Far::PatchDescriptor desc = patch.GetDescriptor();
 
         GLenum primType = GL_PATCHES;
@@ -807,13 +807,13 @@ drawStroke(int x, int y) {
     effect.paint = 1;
     bindTextures(effect);
 
-    OpenSubdiv::Osd::GLPatchTable::PatchArrayVector const & patches =
+    OpenSubdiv::Osd::PatchArrayVector const & patches =
         g_mesh->GetPatchTable()->GetPatchArrays();
 
     // patch drawing
     for (int i=0; i<(int)patches.size(); ++i) {
 
-        OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch = patches[i];
+        OpenSubdiv::Osd::PatchArray const & patch = patches[i];
         OpenSubdiv::Far::PatchDescriptor desc = patch.GetDescriptor();
 
         GLenum primType = GL_PATCHES;
diff --git a/examples/glPtexViewer/glPtexViewer.cpp b/examples/glPtexViewer/glPtexViewer.cpp
index 1a27fd98..67e4bb31 100644
--- a/examples/glPtexViewer/glPtexViewer.cpp
+++ b/examples/glPtexViewer/glPtexViewer.cpp
@@ -1249,7 +1249,7 @@ bindTextures() {
 //------------------------------------------------------------------------------
 static GLenum
 bindProgram(Effect effect,
-            OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch) {
+            OpenSubdiv::Osd::PatchArray const & patch) {
     EffectDesc effectDesc(patch.GetDescriptor(), effect);
 
     GLDrawConfig *config = g_shaderCache.GetDrawConfig(effectDesc);
@@ -1296,10 +1296,10 @@ drawModel() {
     glBindVertexArray(g_vao);
 
     // patch drawing
-    OpenSubdiv::Osd::GLPatchTable::PatchArrayVector const & patches =
+    OpenSubdiv::Osd::PatchArrayVector const & patches =
         g_mesh->GetPatchTable()->GetPatchArrays();
     for (int i = 0; i < (int)patches.size(); ++i) {
-        OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch = patches[i];
+        OpenSubdiv::Osd::PatchArray const & patch = patches[i];
 
         Effect effect;
         effect.value = 0;
diff --git a/examples/glViewer/glViewer.cpp b/examples/glViewer/glViewer.cpp
index cf2663c8..38f7af7a 100644
--- a/examples/glViewer/glViewer.cpp
+++ b/examples/glViewer/glViewer.cpp
@@ -1181,7 +1181,7 @@ bindTextures() {
 
 static GLenum
 bindProgram(Effect effect,
-            OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch) {
+            OpenSubdiv::Osd::PatchArray const & patch) {
     EffectDesc effectDesc(patch.GetDescriptor(), effect);
 
     // only legacy gregory needs maxValence and numElements
@@ -1299,7 +1299,7 @@ display() {
 
     glBindVertexArray(g_vao);
 
-    OpenSubdiv::Osd::GLPatchTable::PatchArrayVector const & patches =
+    OpenSubdiv::Osd::PatchArrayVector const & patches =
         g_mesh->GetPatchTable()->GetPatchArrays();
 
     // patch drawing
@@ -1316,7 +1316,7 @@ display() {
 
     // core draw-calls
     for (int i=0; i<(int)patches.size(); ++i) {
-        OpenSubdiv::Osd::GLPatchTable::PatchArray const & patch = patches[i];
+        OpenSubdiv::Osd::PatchArray const & patch = patches[i];
 
         OpenSubdiv::Far::PatchDescriptor desc = patch.GetDescriptor();
         OpenSubdiv::Far::PatchDescriptor::Type patchType = desc.GetType();
diff --git a/opensubdiv/osd/CMakeLists.txt b/opensubdiv/osd/CMakeLists.txt
index 4994ef64..134e6b30 100644
--- a/opensubdiv/osd/CMakeLists.txt
+++ b/opensubdiv/osd/CMakeLists.txt
@@ -29,6 +29,7 @@
 set(CPU_SOURCE_FILES
     cpuEvaluator.cpp
     cpuKernel.cpp
+    cpuPatchTable.cpp
     cpuVertexBuffer.cpp
 )
 
@@ -43,10 +44,12 @@ set(PRIVATE_HEADER_FILES
 
 set(PUBLIC_HEADER_FILES
     cpuEvaluator.h
+    cpuPatchTable.h
     cpuVertexBuffer.h
     mesh.h
     nonCopyable.h
     opengl.h
+    types.h
     vertexDescriptor.h
 )
 
@@ -216,6 +219,7 @@ list(APPEND DOXY_HEADER_FILES ${DXSDK_PUBLIC_HEADERS})
 # OpenCL code & dependencies
 set(OPENCL_PUBLIC_HEADERS
     clEvaluator.h
+    clPatchTable.h
     clVertexBuffer.h
     opencl.h
 )
@@ -223,6 +227,7 @@ set(OPENCL_PUBLIC_HEADERS
 if ( OPENCL_FOUND )
     list(APPEND GPU_SOURCE_FILES
         clEvaluator.cpp
+        clPatchTable.cpp
         clVertexBuffer.cpp
     )
     list(APPEND PUBLIC_HEADER_FILES ${OPENCL_PUBLIC_HEADERS})
@@ -254,12 +259,14 @@ list(APPEND DOXY_HEADER_FILES ${OPENCL_PUBLIC_HEADERS})
 # CUDA code & dependencies
 set(CUDA_PUBLIC_HEADERS
     cudaEvaluator.h
+    cudaPatchTable.h
     cudaVertexBuffer.h
 )
 
 if( CUDA_FOUND )
     list(APPEND GPU_SOURCE_FILES
         cudaEvaluator.cpp
+        cudaPatchTable.cpp
         cudaVertexBuffer.cpp
     )
     list(APPEND PUBLIC_HEADER_FILES ${CUDA_PUBLIC_HEADERS})
diff --git a/opensubdiv/osd/clEvaluator.cpp b/opensubdiv/osd/clEvaluator.cpp
index 4c2efcf8..8708fd20 100644
--- a/opensubdiv/osd/clEvaluator.cpp
+++ b/opensubdiv/osd/clEvaluator.cpp
@@ -27,6 +27,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <cstdio>
 
 #include "../osd/opencl.h"
 #include "../far/error.h"
@@ -87,11 +88,12 @@ CLStencilTable::~CLStencilTable() {
 
 CLEvaluator::CLEvaluator(cl_context context, cl_command_queue queue)
     : _clContext(context), _clCommandQueue(queue),
-      _program(NULL), _stencilsKernel(NULL) {
+      _program(NULL), _stencilKernel(NULL), _patchKernel(NULL) {
 }
 
 CLEvaluator::~CLEvaluator() {
-    if (_stencilsKernel) clReleaseKernel(_stencilsKernel);
+    if (_stencilKernel) clReleaseKernel(_stencilKernel);
+    if (_patchKernel) clReleaseKernel(_patchKernel);
     if (_program) clReleaseProgram(_program);
 }
 
@@ -145,7 +147,13 @@ CLEvaluator::Compile(VertexBufferDescriptor const &srcDesc,
         return false;
     }
 
-    _stencilsKernel = clCreateKernel(_program, "computeStencils", &errNum);
+    _stencilKernel = clCreateKernel(_program, "computeStencils", &errNum);
+    if (errNum != CL_SUCCESS) {
+        Far::Error(Far::FAR_RUNTIME_ERROR, "buildKernel (%d)\n", errNum);
+        return false;
+    }
+
+    _patchKernel = clCreateKernel(_program, "computePatches", &errNum);
 
     if (errNum != CL_SUCCESS) {
         Far::Error(Far::FAR_RUNTIME_ERROR, "buildKernel (%d)\n", errNum);
@@ -169,24 +177,24 @@ CLEvaluator::EvalStencils(cl_mem src,
 
     size_t globalWorkSize = (size_t)(end - start);
 
-    clSetKernelArg(_stencilsKernel, 0, sizeof(cl_mem), &src);
-    clSetKernelArg(_stencilsKernel, 1, sizeof(int), &srcDesc.offset);
-    clSetKernelArg(_stencilsKernel, 2, sizeof(cl_mem), &dst);
-    clSetKernelArg(_stencilsKernel, 3, sizeof(int), &dstDesc.offset);
-    clSetKernelArg(_stencilsKernel, 4, sizeof(cl_mem), &sizes);
-    clSetKernelArg(_stencilsKernel, 5, sizeof(cl_mem), &offsets);
-    clSetKernelArg(_stencilsKernel, 6, sizeof(cl_mem), &indices);
-    clSetKernelArg(_stencilsKernel, 7, sizeof(cl_mem), &weights);
-    clSetKernelArg(_stencilsKernel, 8, sizeof(int), &start);
-    clSetKernelArg(_stencilsKernel, 9, sizeof(int), &end);
+    clSetKernelArg(_stencilKernel, 0, sizeof(cl_mem), &src);
+    clSetKernelArg(_stencilKernel, 1, sizeof(int), &srcDesc.offset);
+    clSetKernelArg(_stencilKernel, 2, sizeof(cl_mem), &dst);
+    clSetKernelArg(_stencilKernel, 3, sizeof(int), &dstDesc.offset);
+    clSetKernelArg(_stencilKernel, 4, sizeof(cl_mem), &sizes);
+    clSetKernelArg(_stencilKernel, 5, sizeof(cl_mem), &offsets);
+    clSetKernelArg(_stencilKernel, 6, sizeof(cl_mem), &indices);
+    clSetKernelArg(_stencilKernel, 7, sizeof(cl_mem), &weights);
+    clSetKernelArg(_stencilKernel, 8, sizeof(int), &start);
+    clSetKernelArg(_stencilKernel, 9, sizeof(int), &end);
 
     cl_int errNum = clEnqueueNDRangeKernel(
-        _clCommandQueue, _stencilsKernel, 1, NULL,
+        _clCommandQueue, _stencilKernel, 1, NULL,
         &globalWorkSize, NULL, 0, NULL, NULL);
 
     if (errNum != CL_SUCCESS) {
         Far::Error(Far::FAR_RUNTIME_ERROR,
-                   "ApplyStencilTableKernel (%d) ", errNum);
+                   "ApplyStencilKernel (%d) ", errNum);
         return false;
     }
 
@@ -194,6 +202,51 @@ CLEvaluator::EvalStencils(cl_mem src,
     return true;
 }
 
+bool
+CLEvaluator::EvalPatches(cl_mem src, VertexBufferDescriptor const &srcDesc,
+                         cl_mem dst, VertexBufferDescriptor const &dstDesc,
+                         cl_mem du,  VertexBufferDescriptor const &duDesc,
+                         cl_mem dv,  VertexBufferDescriptor const &dvDesc,
+                         int numPatchCoords,
+                         cl_mem patchCoordsBuffer,
+                         cl_mem patchArrayBuffer,
+                         cl_mem patchIndexBuffer,
+                         cl_mem patchParamBuffer) const {
+
+    size_t globalWorkSize = (size_t)(numPatchCoords);
+
+    clSetKernelArg(_patchKernel,  0, sizeof(cl_mem), &src);
+    clSetKernelArg(_patchKernel,  1, sizeof(int),    &srcDesc.offset);
+    clSetKernelArg(_patchKernel,  2, sizeof(cl_mem), &dst);
+    clSetKernelArg(_patchKernel,  3, sizeof(int),    &dstDesc.offset);
+    clSetKernelArg(_patchKernel,  4, sizeof(int),    &numPatchCoords);
+    // clSetKernelArg(_patchKernel,  4, sizeof(cl_mem), &du);
+    // clSetKernelArg(_patchKernel,  5, sizeof(int),    &duDesc.offset);
+    // clSetKernelArg(_patchKernel,  6, sizeof(int),    &duDesc.stride);
+    // clSetKernelArg(_patchKernel,  7, sizeof(cl_mem), &dv);
+    // clSetKernelArg(_patchKernel,  8, sizeof(int),    &dvDesc.offset);
+    // clSetKernelArg(_patchKernel,  9, sizeof(int),    &dvDesc.stride);
+    clSetKernelArg(_patchKernel, 5, sizeof(cl_mem), &patchCoordsBuffer);
+    clSetKernelArg(_patchKernel, 6, sizeof(cl_mem), &patchArrayBuffer);
+    clSetKernelArg(_patchKernel, 7, sizeof(cl_mem), &patchIndexBuffer);
+    clSetKernelArg(_patchKernel, 8, sizeof(cl_mem), &patchParamBuffer);
+
+    cl_int errNum = clEnqueueNDRangeKernel(
+        _clCommandQueue, _patchKernel, 1, NULL,
+        &globalWorkSize, NULL, 0, NULL, NULL);
+
+    if (errNum != CL_SUCCESS) {
+        Far::Error(Far::FAR_RUNTIME_ERROR,
+                   "ApplyPatchKernel (%d) ", errNum);
+        return false;
+    }
+
+    clFinish(_clCommandQueue);
+    return true;
+}
+
+
+
 /* static */
 void
 CLEvaluator::Synchronize(cl_command_queue clCommandQueue) {
diff --git a/opensubdiv/osd/clEvaluator.h b/opensubdiv/osd/clEvaluator.h
index 62780fe5..e2857110 100644
--- a/opensubdiv/osd/clEvaluator.h
+++ b/opensubdiv/osd/clEvaluator.h
@@ -28,6 +28,7 @@
 #include "../version.h"
 
 #include "../osd/opencl.h"
+#include "../osd/types.h"
 #include "../osd/vertexDescriptor.h"
 
 namespace OpenSubdiv {
@@ -75,9 +76,6 @@ private:
 
 // ---------------------------------------------------------------------------
 
-/// \brief OpenCL stencil kernel
-///
-///
 class CLEvaluator {
 public:
     typedef bool Instantiatable;
@@ -107,6 +105,12 @@ public:
         return NULL;
     }
 
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Stencil evaluations with StencilTable
+    ///
+    /// ----------------------------------------------------------------------
+
     /// \brief Generic static compute function. This function has a same
     ///        signature as other device kernels have so that it can be called
     ///        transparently from OsdMesh template interface.
@@ -124,7 +128,7 @@ public:
     /// @param dstDesc        vertex buffer descriptor for the output buffer
     ///
     /// @param stencilTable   stencil table to be applied. The table must have
-    ///                       OpenCL memory interfaces.
+    ///                       SSBO interfaces.
     ///
     /// @param instance       cached compiled instance. Clients are supposed to
     ///                       pre-compile an instance of this class and provide
@@ -137,25 +141,25 @@ public:
     ///                         cl_command_queue GetCommandQueue()
     ///                       methods.
     ///
-    template <typename VERTEX_BUFFER, typename STENCIL_TABLE,
-              typename DEVICE_CONTEXT>
-    static bool EvalStencils(VERTEX_BUFFER *srcVertexBuffer,
-                             VertexBufferDescriptor const &srcDesc,
-                             VERTEX_BUFFER *dstVertexBuffer,
-                             VertexBufferDescriptor const &dstDesc,
-                             STENCIL_TABLE const *stencilTable,
-                             CLEvaluator const *instance,
-                             DEVICE_CONTEXT deviceContext) {
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename STENCIL_TABLE, typename DEVICE_CONTEXT>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable,
+        CLEvaluator const *instance,
+        DEVICE_CONTEXT deviceContext) {
+
         if (instance) {
-            return instance->EvalStencils(srcVertexBuffer, srcDesc,
-                                          dstVertexBuffer, dstDesc,
+            return instance->EvalStencils(srcBuffer, srcDesc,
+                                          dstBuffer, dstDesc,
                                           stencilTable);
         } else {
             // Create an instance on demand (slow)
             instance = Create(srcDesc, dstDesc, deviceContext);
             if (instance) {
-                bool r = instance->EvalStencils(srcVertexBuffer, srcDesc,
-                                                dstVertexBuffer, dstDesc,
+                bool r = instance->EvalStencils(srcBuffer, srcDesc,
+                                                dstBuffer, dstDesc,
                                                 stencilTable);
                 delete instance;
                 return r;
@@ -167,15 +171,14 @@ public:
     /// Generic compute function.
     /// Dispatch the CL compute kernel asynchronously.
     /// Returns false if the kernel hasn't been compiled yet.
-    template <typename VERTEX_BUFFER, typename STENCIL_TABLE>
-    bool EvalStencils(VERTEX_BUFFER *srcVertexBuffer,
-                      VertexBufferDescriptor const &srcDesc,
-                      VERTEX_BUFFER *dstVertexBuffer,
-                      VertexBufferDescriptor const &dstDesc,
-                      STENCIL_TABLE const *stencilTable) const {
-        return EvalStencils(srcVertexBuffer->BindCLBuffer(_clCommandQueue),
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable) const {
+        return EvalStencils(srcBuffer->BindCLBuffer(_clCommandQueue),
                             srcDesc,
-                            dstVertexBuffer->BindCLBuffer(_clCommandQueue),
+                            dstBuffer->BindCLBuffer(_clCommandQueue),
                             dstDesc,
                             stencilTable->GetSizesBuffer(),
                             stencilTable->GetOffsetsBuffer(),
@@ -187,10 +190,8 @@ public:
 
     /// Dispatch the CL compute kernel asynchronously.
     /// returns false if the kernel hasn't been compiled yet.
-    bool EvalStencils(cl_mem src,
-                      VertexBufferDescriptor const &srcDesc,
-                      cl_mem dst,
-                      VertexBufferDescriptor const &dstDesc,
+    bool EvalStencils(cl_mem src, VertexBufferDescriptor const &srcDesc,
+                      cl_mem dst, VertexBufferDescriptor const &dstDesc,
                       cl_mem sizes,
                       cl_mem offsets,
                       cl_mem indices,
@@ -198,6 +199,278 @@ public:
                       int start,
                       int end) const;
 
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Limit evaluations with PatchTable
+    ///
+    /// ----------------------------------------------------------------------
+    ///
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindCLBuffer() method returning an
+    ///                       array of PatchCoord struct.
+    ///
+    /// @param patchTable     CLPatchTable or equivalent
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  client providing context class which supports
+    ///                         cL_context GetContext()
+    ///                         cl_command_queue GetCommandQueue()
+    ///                       methods.
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE,
+              typename DEVICE_CONTEXT>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CLEvaluator const *instance,
+        DEVICE_CONTEXT deviceContext) {
+
+        if (instance) {
+            return instance->EvalPatches(srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc, deviceContext);
+            if (instance) {
+                bool r = instance->EvalPatches(srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer
+    ///
+    /// @param duDesc
+    ///
+    /// @param dvBuffer
+    ///
+    /// @param dvDesc
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindCLBuffer() method returning an
+    ///                       array of PatchCoord struct
+    ///
+    /// @param patchTable     CLPatchTable or equivalent
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  client providing context class which supports
+    ///                         cL_context GetContext()
+    ///                         cl_command_queue GetCommandQueue()
+    ///                       methods.
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE,
+              typename DEVICE_CONTEXT>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CLEvaluator const *instance,
+        DEVICE_CONTEXT deviceContext) {
+
+        if (instance) {
+            return instance->EvalPatches(srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         duBuffer, duDesc,
+                                         dvBuffer, dvDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc, deviceContext);
+            if (instance) {
+                bool r = instance->EvalPatches(srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               duBuffer, duDesc,
+                                               dvBuffer, dvDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCLBuffer() method returning a CL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindCLBuffer() method returning an
+    ///                       array of PatchCoord struct.
+    ///
+    /// @param patchTable     CLPatchTable or equivalent
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable) const {
+
+        return EvalPatches(srcBuffer->BindCLBuffer(_clCommandQueue), srcDesc,
+                           dstBuffer->BindCLBuffer(_clCommandQueue), dstDesc,
+                           0, VertexBufferDescriptor(),
+                           0, VertexBufferDescriptor(),
+                           numPatchCoords,
+                           patchCoords->BindCLBuffer(_clCommandQueue),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function with derivatives. This function has
+    ///        a same signature as other device kernels have so that it can be
+    ///        called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCLBuffer() method returning a CL
+    ///                         buffer object of source data
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCLBuffer() method returning a CL
+    ///                         buffer object of destination data
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output U-derivatives buffer
+    ///                         must have BindCLBuffer() method returning a CL
+    ///                         buffer object of destination data of Du
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output V-derivatives buffer
+    ///                         must have BindCLBuffer() method returning a CL
+    ///                         buffer object of destination data of Dv
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CLPatchTable or equivalent
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable) const {
+
+        return EvalPatches(srcBuffer->BindCLBuffer(_clCommandQueue), srcDesc,
+                           dstBuffer->BindCLBuffer(_clCommandQueue), dstDesc,
+                           duBuffer->BindCLBuffer(_clCommandQueue),  duDesc,
+                           dvBuffer->BindCLBuffer(_clCommandQueue),  dvDesc,
+                           numPatchCoords,
+                           patchCoords->BindCLBuffer(_clCommandQueue),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    bool EvalPatches(cl_mem src, VertexBufferDescriptor const &srcDesc,
+                     cl_mem dst, VertexBufferDescriptor const &dstDesc,
+                     cl_mem du,  VertexBufferDescriptor const &duDesc,
+                     cl_mem dv,  VertexBufferDescriptor const &dvDesc,
+                     int numPatchCoords,
+                     cl_mem patchCoordsBuffer,
+                     cl_mem patchArrayBuffer,
+                     cl_mem patchIndexBuffer,
+                     cl_mem patchParamsBuffer) const;
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Other methods
+    ///
+    /// ----------------------------------------------------------------------
+
     /// Configure OpenCL kernel.
     /// Returns false if it fails to compile the kernel.
     bool Compile(VertexBufferDescriptor const &srcDesc,
@@ -215,7 +488,8 @@ private:
     cl_context _clContext;
     cl_command_queue _clCommandQueue;
     cl_program _program;
-    cl_kernel _stencilsKernel;
+    cl_kernel _stencilKernel;
+    cl_kernel _patchKernel;
 };
 
 
diff --git a/opensubdiv/osd/clKernel.cl b/opensubdiv/osd/clKernel.cl
index 38c2c51e..372887e5 100644
--- a/opensubdiv/osd/clKernel.cl
+++ b/opensubdiv/osd/clKernel.cl
@@ -85,3 +85,118 @@ __kernel void computeStencils(__global float * src,
 
     writeVertex(dst, current, &v);
 }
+
+// ---------------------------------------------------------------------------
+
+struct PatchArray {
+    int patchType;
+    int numPatches;
+    int indexBase;        // an offset within the index buffer
+    int primitiveIdBase;  // an offset within the patch param buffer
+};
+
+struct PatchCoord {
+   int arrayIndex;
+   int patchIndex;
+   int vertIndex;
+   float s;
+   float t;
+};
+
+struct PatchParam {
+    int faceIndex;
+    uint patchBits;
+    float sharpness;
+};
+
+static void getBSplineWeights(float t, float *point, float *deriv) {
+    // The four uniform cubic B-Spline basis functions evaluated at t:
+    float one6th = 1.0f / 6.0f;
+
+    float t2 = t * t;
+    float t3 = t * t2;
+
+    point[0] = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
+    point[1] = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
+    point[2] = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
+    point[3] = one6th * (                                 t3);
+
+    // Derivatives of the above four basis functions at t:
+    deriv[0] = -0.5f*t2 +      t - 0.5f;
+    deriv[1] =  1.5f*t2 - 2.0f*t;
+    deriv[2] = -1.5f*t2 +      t + 0.5f;
+    deriv[3] =  0.5f*t2;
+}
+
+__kernel void computePatches(__global float *src, int srcOffset,
+                             __global float *dst, int dstOffset,
+//                             __global float *du,  int duOffset, int duStride,
+//                             __global float *dv,  int dvOffset, int dvStride,
+                             int numPatchCoords,
+                             __global struct PatchCoord *patchCoords,
+                             __global struct PatchArray *patchArrayBuffer,
+                             __global int *patchIndexBuffer,
+                             __global struct PatchParam *patchParamBuffer) {
+    int current = get_global_id(0);
+
+    if (current > numPatchCoords) return;
+
+    src += srcOffset;
+    dst += dstOffset;
+    // du += duOffset;
+    // dv += dvOffset;
+
+    struct PatchCoord coord = patchCoords[current];
+    int patchIndex = coord.patchIndex;
+//    struct PatchArray array = patchArrayBuffer[coord.arrayIndex];
+    struct PatchArray array = patchArrayBuffer[0];
+
+    int patchType = 6; // array.x XXX: REGULAR only for now.
+    int numControlVertices = 16;
+
+    uint patchBits = patchParamBuffer[patchIndex].patchBits;
+//    vec2 uv = normalizePatchCoord(patchBits, vec2(coord.s, coord.t));
+    float dScale = 1.0f;//float(1 << getDepth(patchBits));
+
+    float uv[2] = {coord.s, coord.t};
+
+    float wP[20], wDs[20], wDt[20];
+    if (patchType == 6) {  // REGULAR
+        float sWeights[4], tWeights[4], dsWeights[4], dtWeights[4];
+        getBSplineWeights(uv[0], sWeights, dsWeights);
+        getBSplineWeights(uv[1], tWeights, dtWeights);
+
+//        adjustBoundaryWeights(patchBits, sWeights, tWeights);
+//        adjustBoundaryWeights(patchBits, dsWeights, dtWeights);
+
+        for (int k = 0; k < 4; ++k) {
+            for (int l = 0; l < 4; ++l) {
+                wP[4*k+l]  = sWeights[l]  * tWeights[k];
+                wDs[4*k+l] = dsWeights[l] * tWeights[k]  * dScale;
+                wDt[4*k+l] = sWeights[l]  * dtWeights[k] * dScale;
+            }
+        }
+    } else {
+        // TODO: GREGORY BASIS
+    }
+
+    struct Vertex v;
+    clear(&v);
+
+#if 1
+    // debug
+    v.v[0] = uv[0];
+    v.v[1] = uv[1];
+    v.v[2] = patchIndexBuffer[current] * 0.1;
+    writeVertex(dst, current, &v);
+    return;
+#endif
+
+    int indexBase = array.indexBase + coord.vertIndex;
+    for (int i = 0; i < numControlVertices; ++i) {
+        int index = patchIndexBuffer[indexBase + i];
+        if (index < 0) index = 0;
+        addWithWeight(&v, src, index, wP[i]);
+    }
+    writeVertex(dst, current, &v);
+}
diff --git a/opensubdiv/osd/clPatchTable.cpp b/opensubdiv/osd/clPatchTable.cpp
new file mode 100644
index 00000000..2fb03730
--- /dev/null
+++ b/opensubdiv/osd/clPatchTable.cpp
@@ -0,0 +1,99 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#include "../osd/clPatchTable.h"
+
+#include "../far/error.h"
+#include "../far/patchTable.h"
+#include "../osd/opencl.h"
+#include "../osd/cpuPatchTable.h"
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+
+namespace Osd {
+
+CLPatchTable::CLPatchTable() :
+    _patchArrays(NULL), _indexBuffer(NULL), _patchParamBuffer(NULL) {
+}
+
+CLPatchTable::~CLPatchTable() {
+    if (_patchArrays) clReleaseMemObject(_patchArrays);
+    if (_indexBuffer) clReleaseMemObject(_indexBuffer);
+    if (_patchParamBuffer) clReleaseMemObject(_patchParamBuffer);
+}
+
+CLPatchTable *
+CLPatchTable::Create(Far::PatchTable const *farPatchTable,
+                     cl_context clContext) {
+    CLPatchTable *instance = new CLPatchTable();
+    if (instance->allocate(farPatchTable, clContext)) return instance;
+    delete instance;
+    return 0;
+}
+
+bool
+CLPatchTable::allocate(Far::PatchTable const *farPatchTable, cl_context clContext) {
+    CpuPatchTable patchTable(farPatchTable);
+
+    size_t numPatchArrays = patchTable.GetNumPatchArrays();
+    size_t indexSize = patchTable.GetPatchIndexSize();
+    size_t patchParamSize = patchTable.GetPatchParamSize();
+
+    cl_int err = 0;
+    _patchArrays = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                                  numPatchArrays * sizeof(Osd::PatchArray),
+                                  (void*)patchTable.GetPatchArrayBuffer(),
+                                  &err);
+    if (err != CL_SUCCESS) {
+        Far::Error(Far::FAR_RUNTIME_ERROR, "clCreateBuffer: %d", err);
+        return false;
+    }
+
+    _indexBuffer = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                                  indexSize * sizeof(int),
+                                  (void*)patchTable.GetPatchIndexBuffer(),
+                                  &err);
+    if (err != CL_SUCCESS) {
+        Far::Error(Far::FAR_RUNTIME_ERROR, "clCreateBuffer: %d", err);
+        return false;
+    }
+
+    _patchParamBuffer = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                                       patchParamSize * sizeof(Osd::PatchParam),
+                                       (void*)patchTable.GetPatchParamBuffer(),
+                                       &err);
+    if (err != CL_SUCCESS) {
+        Far::Error(Far::FAR_RUNTIME_ERROR, "clCreateBuffer: %d", err);
+        return false;
+    }
+    return true;
+}
+
+
+}  // end namespace Osd
+
+}  // end namespace OPENSUBDIV_VERSION
+}  // end namespace OpenSubdiv
+
diff --git a/opensubdiv/osd/clPatchTable.h b/opensubdiv/osd/clPatchTable.h
new file mode 100644
index 00000000..1a90ec6a
--- /dev/null
+++ b/opensubdiv/osd/clPatchTable.h
@@ -0,0 +1,91 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#ifndef OPENSUBDIV3_OSD_CL_PATCH_TABLE_H
+#define OPENSUBDIV3_OSD_CL_PATCH_TABLE_H
+
+#include "../version.h"
+
+#include "../osd/opencl.h"
+#include "../osd/nonCopyable.h"
+#include "../osd/types.h"
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+
+namespace Far{
+    class PatchTable;
+};
+
+namespace Osd {
+
+/// \brief CL patch table
+///
+/// This class is a CL buffer representation of Far::PatchTable.
+///
+/// CLEvaluator consumes this table to evaluate on the patches.
+///
+///
+class CLPatchTable : private NonCopyable<CLPatchTable> {
+public:
+    /// Creator. Returns NULL if error
+    static CLPatchTable *Create(Far::PatchTable const *patchTable,
+                                cl_context clContext);
+
+    template <typename DEVICE_CONTEXT>
+    static CLPatchTable * Create(Far::PatchTable const *patchTable,
+                                 DEVICE_CONTEXT context) {
+        return Create(patchTable, context->GetContext());
+    }
+
+    /// Destructor
+    ~CLPatchTable();
+
+    /// Returns the CL memory of the array of Osd::PatchArray buffer
+    cl_mem GetPatchArrayBuffer() const { return _patchArrays; }
+
+    /// Returns the CL memory of the patch control vertices
+    cl_mem GetPatchIndexBuffer() const { return _indexBuffer; }
+
+    /// Returns the CL memory of the array of Osd::PatchParam buffer
+    cl_mem GetPatchParamBuffer() const { return _patchParamBuffer; }
+
+protected:
+    CLPatchTable();
+
+    bool allocate(Far::PatchTable const *patchTable, cl_context clContext);
+
+    cl_mem _patchArrays;
+    cl_mem _indexBuffer;
+    cl_mem _patchParamBuffer;
+};
+
+}  // end namespace Osd
+
+}  // end namespace OPENSUBDIV_VERSION
+using namespace OPENSUBDIV_VERSION;
+
+}  // end namespace OpenSubdiv
+
+#endif  // OPENSUBDIV3_OSD_CL_PATCH_TABLE_H
diff --git a/opensubdiv/osd/clVertexBuffer.h b/opensubdiv/osd/clVertexBuffer.h
index 0b8336e9..de203ab0 100644
--- a/opensubdiv/osd/clVertexBuffer.h
+++ b/opensubdiv/osd/clVertexBuffer.h
@@ -45,6 +45,12 @@ public:
     /// Creator. Returns NULL if error.
     static CLVertexBuffer * Create(int numElements, int numVertices, cl_context clContext);
 
+    template <typename DEVICE_CONTEXT>
+    static CLVertexBuffer * Create(int numElements, int numVertices,
+                                   DEVICE_CONTEXT context) {
+        return Create(numElements, numVertices, context->GetContext());
+    }
+
     /// Destructor.
     ~CLVertexBuffer();
 
@@ -52,6 +58,12 @@ public:
     /// vertices data to Osd.
     void UpdateData(const float *src, int startVertex, int numVertices, cl_command_queue clQueue);
 
+    template<typename DEVICE_CONTEXT>
+    void UpdateData(const float *src, int startVertex, int numVertices,
+                    DEVICE_CONTEXT context) {
+        UpdateData(src, startVertex, numVertices, context->GetCommandQueue());
+    }
+
     /// Returns how many elements defined in this vertex buffer.
     int GetNumElements() const;
 
diff --git a/opensubdiv/osd/cpuEvaluator.cpp b/opensubdiv/osd/cpuEvaluator.cpp
index e15f09bd..3c8d761a 100644
--- a/opensubdiv/osd/cpuEvaluator.cpp
+++ b/opensubdiv/osd/cpuEvaluator.cpp
@@ -24,6 +24,7 @@
 
 #include "../osd/cpuEvaluator.h"
 #include "../osd/cpuKernel.h"
+#include "../far/patchBasis.h"
 
 #include <cstdlib>
 
@@ -34,15 +35,15 @@ namespace Osd {
 
 /* static */
 bool
-CpuEvaluator::EvalStencils(const float *src,
-                           VertexBufferDescriptor const &srcDesc,
-                           float *dst,
-                           VertexBufferDescriptor const &dstDesc,
-                           const int * sizes,
-                           const int * offsets,
-                           const int * indices,
-                           const float * weights,
-                           int start, int end) {
+CpuEvaluator::EvalStencils(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    const int * sizes,
+    const int * offsets,
+    const int * indices,
+    const float * weights,
+    int start, int end) {
+
     if (end <= start) return true;
     if (srcDesc.length != dstDesc.length) return false;
 
@@ -55,30 +56,28 @@ CpuEvaluator::EvalStencils(const float *src,
 
 /* static */
 bool
-CpuEvaluator::EvalStencils(const float *src,
-                           VertexBufferDescriptor const &srcDesc,
-                           float *dst,
-                           VertexBufferDescriptor const &dstDesc,
-                           float *dstDs,
-                           VertexBufferDescriptor const &dstDsDesc,
-                           float *dstDt,
-                           VertexBufferDescriptor const &dstDtDesc,
-                           const int * sizes,
-                           const int * offsets,
-                           const int * indices,
-                           const float * weights,
-                           const float * duWeights,
-                           const float * dvWeights,
-                           int start, int end) {
+CpuEvaluator::EvalStencils(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    float *du,        VertexBufferDescriptor const &duDesc,
+    float *dv,        VertexBufferDescriptor const &dvDesc,
+    const int * sizes,
+    const int * offsets,
+    const int * indices,
+    const float * weights,
+    const float * duWeights,
+    const float * dvWeights,
+    int start, int end) {
+
     if (end <= start) return true;
     if (srcDesc.length != dstDesc.length) return false;
-    if (srcDesc.length != dstDsDesc.length) return false;
-    if (srcDesc.length != dstDtDesc.length) return false;
+    if (srcDesc.length != duDesc.length) return false;
+    if (srcDesc.length != dvDesc.length) return false;
 
     CpuEvalStencils(src, srcDesc,
                     dst, dstDesc,
-                    dstDs, dstDsDesc,
-                    dstDt, dstDtDesc,
+                    du,  duDesc,
+                    dv,  dvDesc,
                     sizes, offsets, indices,
                     weights, duWeights, dvWeights,
                     start, end);
@@ -123,10 +122,13 @@ CpuEvaluator::EvalPatches(const float *src,
                           float *dst,
                           VertexBufferDescriptor const &dstDesc,
                           int numPatchCoords,
-                          PatchCoord const *patchCoords,
-                          Far::PatchTable const *patchTable) {
+                          const PatchCoord *patchCoords,
+                          const PatchArray *patchArrays,
+                          const int *patchIndexBuffer,
+                          const PatchParam *patchParamBuffer){
     src += srcDesc.offset;
     if (dst) dst += dstDesc.offset;
+    else return false;
 
     BufferAdapter<const float> srcT(src, srcDesc.length, srcDesc.stride);
     BufferAdapter<float>       dstT(dst, dstDesc.length, dstDesc.stride);
@@ -134,14 +136,38 @@ CpuEvaluator::EvalPatches(const float *src,
     float wP[20], wDs[20], wDt[20];
 
     for (int i = 0; i < numPatchCoords; ++i) {
-        PatchCoord const &coords = patchCoords[i];
+        PatchCoord const &coord = patchCoords[i];
+        PatchArray const &array = patchArrays[coord.handle.arrayIndex];
 
-        patchTable->EvaluateBasis(coords.handle, coords.s, coords.t, wP, wDs, wDt);
+        int patchType = array.GetPatchType();
+        // XXX: patchIndex is absolute. not sure it's consistent.
+        //      (should be offsetted by array.primitiveIdBase?)
+        //    patchParamBuffer[array.primitiveIdBase + coord.handle.patchIndex]
+        Far::PatchParam::BitField patchBits = *(Far::PatchParam::BitField*)
+            &patchParamBuffer[coord.handle.patchIndex].patchBits;
 
-        Far::ConstIndexArray cvs = patchTable->GetPatchVertices(coords.handle);
+        int numControlVertices = 0;
+        if (patchType == Far::PatchDescriptor::REGULAR) {
+            Far::internal::GetBSplineWeights(patchBits,
+                                             coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 16;
+        } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+            Far::internal::GetGregoryWeights(patchBits,
+                                             coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 20;
+        } else if (patchType == Far::PatchDescriptor::QUADS) {
+            Far::internal::GetBilinearWeights(patchBits,
+                                              coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 4;
+        } else {
+            assert(0);
+            return false;
+        }
+        const int *cvs =
+            &patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
 
         dstT.Clear();
-        for (int j = 0; j < cvs.size(); ++j) {
+        for (int j = 0; j < numControlVertices; ++j) {
             dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
         }
         ++dstT;
@@ -151,47 +177,67 @@ CpuEvaluator::EvalPatches(const float *src,
 
 /* static */
 bool
-CpuEvaluator::EvalPatches(const float *src,
-                          VertexBufferDescriptor const &srcDesc,
-                          float *dst,
-                          VertexBufferDescriptor const &dstDesc,
-                          float *dstDs,
-                          VertexBufferDescriptor const &dstDsDesc,
-                          float *dstDt,
-                          VertexBufferDescriptor const &dstDtDesc,
-                          int numPatchCoords,
-                          PatchCoord const *patchCoords,
-                          Far::PatchTable const *patchTable) {
+CpuEvaluator::EvalPatches(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    float *du,        VertexBufferDescriptor const &duDesc,
+    float *dv,        VertexBufferDescriptor const &dvDesc,
+    int numPatchCoords,
+    PatchCoord const *patchCoords,
+    PatchArray const *patchArrays,
+    const int *patchIndexBuffer,
+    PatchParam const *patchParamBuffer) {
+
     src += srcDesc.offset;
     if (dst) dst += dstDesc.offset;
-    if (dstDs) dstDs += dstDsDesc.offset;
-    if (dstDt) dstDt += dstDtDesc.offset;
+    if (du)  du  += duDesc.offset;
+    if (dv)  dv  += dvDesc.offset;
 
     BufferAdapter<const float> srcT(src, srcDesc.length, srcDesc.stride);
-    BufferAdapter<float> dstT(dst, dstDesc.length, dstDesc.stride);
-    BufferAdapter<float> dstDsT(dstDs, dstDsDesc.length, dstDsDesc.stride);
-    BufferAdapter<float> dstDtT(dstDt, dstDtDesc.length, dstDtDesc.stride);
+    BufferAdapter<float>       dstT(dst, dstDesc.length, dstDesc.stride);
+    BufferAdapter<float>       duT (du,  duDesc.length,  duDesc.stride);
+    BufferAdapter<float>       dvT (dv,  dvDesc.length,  dvDesc.stride);
 
     float wP[20], wDs[20], wDt[20];
 
     for (int i = 0; i < numPatchCoords; ++i) {
-        PatchCoord const &coords = patchCoords[i];
+        PatchCoord const &coord = patchCoords[i];
+        PatchArray const &array = patchArrays[coord.handle.arrayIndex];
 
-        patchTable->EvaluateBasis(coords.handle, coords.s, coords.t, wP, wDs, wDt);
+        int patchType = array.GetPatchType();
+        Far::PatchParam::BitField patchBits = *(Far::PatchParam::BitField*)
+            &patchParamBuffer[coord.handle.patchIndex].patchBits;
 
-        Far::ConstIndexArray cvs = patchTable->GetPatchVertices(coords.handle);
+        int numControlVertices = 0;
+        if (patchType == Far::PatchDescriptor::REGULAR) {
+            Far::internal::GetBSplineWeights(patchBits,
+                                             coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 16;
+        } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+            Far::internal::GetGregoryWeights(patchBits,
+                                             coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 20;
+        } else if (patchType == Far::PatchDescriptor::QUADS) {
+            Far::internal::GetBilinearWeights(patchBits,
+                                              coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 4;
+        } else {
+            assert(0);
+        }
+        const int *cvs =
+            &patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
 
         dstT.Clear();
-        dstDsT.Clear();
-        dstDtT.Clear();
-        for (int j = 0; j < cvs.size(); ++j) {
+        duT.Clear();
+        dvT.Clear();
+        for (int j = 0; j < numControlVertices; ++j) {
             dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
-            dstDsT.AddWithWeight(srcT[cvs[j]], wDs[j]);
-            dstDtT.AddWithWeight(srcT[cvs[j]], wDt[j]);
+            duT.AddWithWeight (srcT[cvs[j]], wDs[j]);
+            dvT.AddWithWeight (srcT[cvs[j]], wDt[j]);
         }
         ++dstT;
-        ++dstDsT;
-        ++dstDtT;
+        ++duT;
+        ++dvT;
     }
     return true;
 }
diff --git a/opensubdiv/osd/cpuEvaluator.h b/opensubdiv/osd/cpuEvaluator.h
index e03be1ee..5c402485 100644
--- a/opensubdiv/osd/cpuEvaluator.h
+++ b/opensubdiv/osd/cpuEvaluator.h
@@ -29,33 +29,14 @@
 
 #include <cstddef>
 #include <vector>
+#include "../osd/types.h"
 #include "../osd/vertexDescriptor.h"
-#include "../far/patchTable.h"
 
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
 
 namespace Osd {
 
-/// \brief Coordinates set on a patch table
-///     XXX: this is a temporary structure, exists during Osd refactoring work.
-///
-struct PatchCoord {
-    /// \brief Constructor
-    ///
-    /// @param p patch handle
-    ///
-    /// @param s parametric location on the patch
-    ///
-    /// @param t parametric location on the patch
-    ///
-    PatchCoord(Far::PatchTable::PatchHandle handle, float s, float t) :
-        handle(handle), s(s), t(t) { }
-
-    Far::PatchTable::PatchHandle handle; ///< patch handle
-    float s, t;              ///< parametric location on patch
-};
-
 class CpuEvaluator {
 public:
     /// ----------------------------------------------------------------------
@@ -80,7 +61,7 @@ public:
     ///
     /// @param dstDesc        vertex buffer descriptor for the output buffer
     ///
-    /// @param stencilTable   stencil table to be applied.
+    /// @param stencilTable   Far::StencilTable or equivalent
     ///
     /// @param instance       not used in the cpu kernel
     ///                       (declared as a typed pointer to prevent
@@ -89,20 +70,18 @@ public:
     /// @param deviceContext  not used in the cpu kernel
     ///
     template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
-    static bool EvalStencils(SRC_BUFFER *srcBuffer,
-                             VertexBufferDescriptor const &srcDesc,
-                             DST_BUFFER *dstBuffer,
-                             VertexBufferDescriptor const &dstDesc,
-                             STENCIL_TABLE const *stencilTable,
-                             const CpuEvaluator *instance = NULL,
-                             void * deviceContext = NULL) {
-        (void)instance;   // unused
-        (void)deviceContext;   // unused
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable,
+        const CpuEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
 
-        return EvalStencils(srcBuffer->BindCpuBuffer(),
-                            srcDesc,
-                            dstBuffer->BindCpuBuffer(),
-                            dstDesc,
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
                             &stencilTable->GetSizes()[0],
                             &stencilTable->GetOffsets()[0],
                             &stencilTable->GetControlIndices()[0],
@@ -125,24 +104,27 @@ public:
     ///
     /// @param dstDesc        vertex buffer descriptor for the output buffer
     ///
-    /// @param stencilTable   stencil table to be applied.
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///                       to apply for the range [start, end)
     ///
-    /// @param instance       not used in the cpu kernel
-    ///                       (declared as a typed pointer to prevent
-    ///                        undesirable template resolution)
+    /// @param offsets        pointer to the offsets buffer of the stencil table
     ///
-    /// @param deviceContext  not used in the cpu kernel
+    /// @param indices        pointer to the indices buffer of the stencil table
     ///
-    static bool EvalStencils(const float *src,
-                             VertexBufferDescriptor const &srcDesc,
-                             float *dst,
-                             VertexBufferDescriptor const &dstDesc,
-                             const int * sizes,
-                             const int * offsets,
-                             const int * indices,
-                             const float * weights,
-                             int start,
-                             int end);
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src,  VertexBufferDescriptor const &srcDesc,
+        float *dst,        VertexBufferDescriptor const &dstDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        int start, int end);
 
     /// \brief Generic static eval stencils function with derivatives.
     ///        This function has a same signature as other device kernels
@@ -161,19 +143,19 @@ public:
     ///
     /// @param dstDesc        vertex buffer descriptor for the output buffer
     ///
-    /// @param dstDsBuffer    Output s-derivative buffer
+    /// @param duBuffer       Output U-derivative buffer
     ///                       must have BindCpuBuffer() method returning a
     ///                       float pointer for write
     ///
-    /// @param dstDsDesc      vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the output buffer
     ///
-    /// @param dstDtBuffer    Output t-derivative buffer
+    /// @param dvBuffer       Output V-derivative buffer
     ///                       must have BindCpuBuffer() method returning a
     ///                       float pointer for write
     ///
-    /// @param dstDtDesc      vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the output buffer
     ///
-    /// @param stencilTable   stencil table to be applied.
+    /// @param stencilTable   Far::StencilTable or equivalent
     ///
     /// @param instance       not used in the cpu kernel
     ///                       (declared as a typed pointer to prevent
@@ -182,28 +164,22 @@ public:
     /// @param deviceContext  not used in the cpu kernel
     ///
     template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
-    static bool EvalStencils(SRC_BUFFER *srcBuffer,
-                             VertexBufferDescriptor const &srcDesc,
-                             DST_BUFFER *dstBuffer,
-                             VertexBufferDescriptor const &dstDesc,
-                             DST_BUFFER *dstDsBuffer,
-                             VertexBufferDescriptor const &dstDsDesc,
-                             DST_BUFFER *dstDtBuffer,
-                             VertexBufferDescriptor const &dstDtDesc,
-                             STENCIL_TABLE const *stencilTable,
-                             const CpuEvaluator *evaluator = NULL,
-                             void * deviceContext = NULL) {
-        (void)evaluator;   // unused
-        (void)deviceContext;   // unused
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        STENCIL_TABLE const *stencilTable,
+        const CpuEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
 
-        return EvalStencils(srcBuffer->BindCpuBuffer(),
-                            srcDesc,
-                            dstBuffer->BindCpuBuffer(),
-                            dstDesc,
-                            dstDsBuffer->BindCpuBuffer(),
-                            dstDsDesc,
-                            dstDtBuffer->BindCpuBuffer(),
-                            dstDtDesc,
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
+                            duBuffer->BindCpuBuffer(),  duDesc,
+                            dvBuffer->BindCpuBuffer(),  dvDesc,
                             &stencilTable->GetSizes()[0],
                             &stencilTable->GetOffsets()[0],
                             &stencilTable->GetControlIndices()[0],
@@ -228,40 +204,44 @@ public:
     ///
     /// @param dstDesc        vertex buffer descriptor for the output buffer
     ///
-    /// @param dstDs          Output s-derivatives pointer. An offset of
-    ///                       dstDsDesc will be applied internally.
+    /// @param du             Output U-derivatives pointer. An offset of
+    ///                       duDesc will be applied internally.
     ///
-    /// @param dstDsDesc      vertex buffer descriptor for the output buffer
+    /// @param duDesc         vertex buffer descriptor for the output buffer
     ///
-    /// @param dstDt          Output t-derivatives pointer. An offset of
-    ///                       dstDtDesc will be applied internally.
+    /// @param dv             Output V-derivatives pointer. An offset of
+    ///                       dvDesc will be applied internally.
     ///
-    /// @param dstDtDesc      vertex buffer descriptor for the output buffer
+    /// @param dvDesc         vertex buffer descriptor for the output buffer
     ///
-    /// @param stencilTable   stencil table to be applied.
+    /// @param sizes          pointer to the sizes buffer of the stencil table
     ///
-    /// @param instance       not used in the cpu kernel
-    ///                       (declared as a typed pointer to prevent
-    ///                        undesirable template resolution)
+    /// @param offsets        pointer to the offsets buffer of the stencil table
     ///
-    /// @param deviceContext  not used in the cpu kernel
+    /// @param indices        pointer to the indices buffer of the stencil table
     ///
-    static bool EvalStencils(const float *src,
-                             VertexBufferDescriptor const &srcDesc,
-                             float *dst,
-                             VertexBufferDescriptor const &dstDesc,
-                             float *dstDs,
-                             VertexBufferDescriptor const &dstDsDesc,
-                             float *dstDt,
-                             VertexBufferDescriptor const &dstDtDesc,
-                             const int * sizes,
-                             const int * offsets,
-                             const int * indices,
-                             const float * weights,
-                             const float * duWeights,
-                             const float * dvWeights,
-                             int start,
-                             int end);
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param duWeights      pointer to the du-weights buffer of the stencil table
+    ///
+    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        float *du,        VertexBufferDescriptor const &duDesc,
+        float *dv,        VertexBufferDescriptor const &dvDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        const float * duWeights,
+        const float * dvWeights,
+        int start, int end);
 
     /// ----------------------------------------------------------------------
     ///
@@ -289,32 +269,35 @@ public:
     ///
     /// @param patchCoords      array of locations to be evaluated.
     ///
-    /// @param patchTable       Far::PatchTable
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
     ///
     /// @param instance         not used in the cpu evaluator
     ///
     /// @param deviceContext    not used in the cpu evaluator
     ///
-    template <typename SRC_BUFFER, typename DST_BUFFER>
-    static bool EvalPatches(SRC_BUFFER *srcBuffer,
-                            VertexBufferDescriptor const &srcDesc,
-                            DST_BUFFER *dstBuffer,
-                            VertexBufferDescriptor const &dstDesc,
-                            int numPatchCoords,
-                            PatchCoord const *patchCoords,
-                            Far::PatchTable const *patchTable,
-                            CpuEvaluator const *instance,
-                            void * deviceContext = NULL) {
-        (void)instance;   // unused
-        (void)deviceContext;   // unused
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CpuEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
 
-        return EvalPatches(srcBuffer->BindCpuBuffer(),
-                           srcDesc,
-                           dstBuffer->BindCpuBuffer(),
-                           dstDesc,
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
                            numPatchCoords,
-                           patchCoords,
-                           patchTable);
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
     }
 
     /// \brief Generic limit eval function with derivatives. This function has
@@ -333,56 +316,59 @@ public:
     ///
     /// @param dstDesc          vertex buffer descriptor for the output buffer
     ///
-    /// @param dstDsBuffer      Output s-derivatives buffer
+    /// @param duBuffer         Output U-derivatives buffer
     ///                         must have BindCpuBuffer() method returning a
     ///                         float pointer for write
     ///
-    /// @param dstDsDesc        vertex buffer descriptor for the dstDsBuffer
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
     ///
-    /// @param dstDtBuffer      Output t-derivatives buffer
+    /// @param dvBuffer         Output V-derivatives buffer
     ///                         must have BindCpuBuffer() method returning a
     ///                         float pointer for write
     ///
-    /// @param dstDtDesc        vertex buffer descriptor for the dstDtBuffer
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
     ///
     /// @param numPatchCoords   number of patchCoords.
     ///
     /// @param patchCoords      array of locations to be evaluated.
     ///
-    /// @param patchTable       Far::PatchTable
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
     ///
     /// @param instance         not used in the cpu evaluator
     ///
     /// @param deviceContext    not used in the cpu evaluator
     ///
-    template <typename SRC_BUFFER, typename DST_BUFFER>
-    static bool EvalPatches(SRC_BUFFER *srcBuffer,
-                            VertexBufferDescriptor const &srcDesc,
-                            DST_BUFFER *dstBuffer,
-                            VertexBufferDescriptor const &dstDesc,
-                            DST_BUFFER *dstDsBuffer,
-                            VertexBufferDescriptor const &dstDsDesc,
-                            DST_BUFFER *dstDtBuffer,
-                            VertexBufferDescriptor const &dstDtDesc,
-                            int numPatchCoords,
-                            PatchCoord const *patchCoords,
-                            Far::PatchTable const *patchTable,
-                            CpuEvaluator const *instance,
-                            void * deviceContext = NULL) {
-        (void)instance;   // unused
-        (void)deviceContext;   // unused
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CpuEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
 
-        return EvalPatches(srcBuffer->BindCpuBuffer(),
-                           srcDesc,
-                           dstBuffer->BindCpuBuffer(),
-                           dstDesc,
-                           dstDsBuffer->BindCpuBuffer(),
-                           dstDsDesc,
-                           dstDtBuffer->BindCpuBuffer(),
-                           dstDtDesc,
+        // XXX: PatchCoords is somewhat abusing vertex primvar buffer interop.
+        //      ideally all buffer classes should have templated by datatype
+        //      so that downcast isn't needed there.
+        //      (e.g. Osd::CpuBuffer<PatchCoord> )
+        //
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
                            numPatchCoords,
-                           patchCoords,
-                           patchTable);
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
     }
 
     /// \brief Static limit eval function. It takes an array of PatchCoord
@@ -403,20 +389,23 @@ public:
     ///
     /// @param patchCoords      array of locations to be evaluated.
     ///
-    /// @param patchTable       Far::PatchTable on which primvars are evaluated
-    ///                         for the patchCoords
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
     ///
-    /// @param instance         not used in the cpu evaluator
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
     ///
-    /// @param deviceContext    not used in the cpu evaluator
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
     ///
-    static bool EvalPatches(const float *src,
-                            VertexBufferDescriptor const &srcDesc,
-                            float *dst,
-                            VertexBufferDescriptor const &dstDesc,
-                            int numPatchCoords,
-                            PatchCoord const *patchCoords,
-                            Far::PatchTable const *patchTable);
+    static bool EvalPatches(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        const PatchCoord *patchCoords,
+        const PatchArray *patchArrays,
+        const int *patchIndexBuffer,
+        const PatchParam *patchParamBuffer);
 
     /// \brief Static limit eval function. It takes an array of PatchCoord
     ///        and evaluate limit values on given PatchTable.
@@ -432,38 +421,45 @@ public:
     ///
     /// @param dstDesc          vertex buffer descriptor for the output buffer
     ///
-    /// @param dstDs            Output s-derivatives pointer. An offset of
-    ///                         dstDsDesc will be applied internally.
+    /// @param du               Output U-derivatives pointer. An offset of
+    ///                         duDesc will be applied internally.
     ///
-    /// @param dstDsDesc        vertex buffer descriptor for the dstDs buffer
+    /// @param duDesc           vertex buffer descriptor for the du buffer
     ///
-    /// @param dstDt            Output t-derivatives pointer. An offset of
-    ///                         dstDtDesc will be applied internally.
+    /// @param dv               Output V-derivatives pointer. An offset of
+    ///                         dvDesc will be applied internally.
     ///
-    /// @param dstDtDesc        vertex buffer descriptor for the dstDt buffer
+    /// @param dvDesc           vertex buffer descriptor for the dv buffer
     ///
     /// @param numPatchCoords   number of patchCoords.
     ///
     /// @param patchCoords      array of locations to be evaluated.
     ///
-    /// @param patchTable       Far::PatchTable on which primvars are evaluated
-    ///                         for the patchCoords
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
     ///
-    /// @param instance         not used in the cpu evaluator
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
     ///
-    /// @param deviceContext    not used in the cpu evaluator
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
     ///
-    static bool EvalPatches(const float *src,
-                            VertexBufferDescriptor const &srcDesc,
-                            float *dst,
-                            VertexBufferDescriptor const &dstDesc,
-                            float *dstDs,
-                            VertexBufferDescriptor const &dstDsDesc,
-                            float *dstDt,
-                            VertexBufferDescriptor const &dstDtDesc,
-                            int numPatchCoords,
-                            PatchCoord const *patchCoords,
-                            Far::PatchTable const *patchTable);
+    static bool EvalPatches(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        float *du,        VertexBufferDescriptor const &duDesc,
+        float *dv,        VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PatchCoord const *patchCoords,
+        PatchArray const *patchArrays,
+        const int *patchIndexBuffer,
+        PatchParam const *patchParamBuffer);
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Other methods
+    ///
+    /// ----------------------------------------------------------------------
 
     /// \brief synchronize all asynchronous computation invoked on this device.
     static void Synchronize(void * /*deviceContext = NULL*/) {
diff --git a/opensubdiv/osd/cpuPatchTable.cpp b/opensubdiv/osd/cpuPatchTable.cpp
new file mode 100644
index 00000000..a59dd8e8
--- /dev/null
+++ b/opensubdiv/osd/cpuPatchTable.cpp
@@ -0,0 +1,106 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#include "../osd/cpuPatchTable.h"
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+
+namespace Osd {
+
+CpuPatchTable::CpuPatchTable(const Far::PatchTable *farPatchTable) {
+    int nPatchArrays = farPatchTable->GetNumPatchArrays();
+
+    // count
+    int numPatches = 0;
+    int numIndices = 0;
+    for (int j = 0; j < nPatchArrays; ++j) {
+        int nPatch = farPatchTable->GetNumPatches(j);
+        int nCV = farPatchTable->GetPatchArrayDescriptor(j).GetNumControlVertices();
+        numPatches += nPatch;
+        numIndices += nPatch * nCV;
+    }
+    _patchArrays.reserve(nPatchArrays);
+    _indexBuffer.reserve(numIndices);
+    _patchParamBuffer.reserve(numPatches);
+
+    // for each patchArray
+    for (int j = 0; j < nPatchArrays; ++j) {
+        PatchArray patchArray(farPatchTable->GetPatchArrayDescriptor(j),
+                              farPatchTable->GetNumPatches(j),
+                              (int)_indexBuffer.size(),
+                              (int)_patchParamBuffer.size());
+        _patchArrays.push_back(patchArray);
+
+        // indices
+        Far::ConstIndexArray indices = farPatchTable->GetPatchArrayVertices(j);
+        for (int k = 0; k < indices.size(); ++k) {
+            _indexBuffer.push_back(indices[k]);
+        }
+
+        // patchParams bundling
+        // XXX: this process won't be needed if Far::PatchParam includes
+        // sharpness.
+#if 0
+        // XXX: we need sharpness interface for patcharray or put sharpness
+        //      into patchParam.
+        Far::ConstPatchParamArray patchParams =
+            farPatchTable->GetPatchParams(j);
+        for (int k = 0; k < patchParams.size(); ++k) {
+            float sharpness = 0.0;
+            _patchParamBuffer.push_back(patchParams[k].faceIndex);
+            _patchParamBuffer.push_back(patchParams[k].bitField.field);
+            _patchParamBuffer.push_back(*((unsigned int *)&sharpness));
+        }
+#else
+        // XXX: workaround. GetPatchParamTable() will be deprecated though.
+        Far::PatchParamTable const & patchParamTable =
+            farPatchTable->GetPatchParamTable();
+        std::vector<Far::Index> const &sharpnessIndexTable =
+            farPatchTable->GetSharpnessIndexTable();
+        int numPatches = farPatchTable->GetNumPatches(j);
+        for (int k = 0; k < numPatches; ++k) {
+            float sharpness = 0.0;
+            int patchIndex = (int)_patchParamBuffer.size();
+            if (patchIndex < (int)sharpnessIndexTable.size()) {
+                int sharpnessIndex = sharpnessIndexTable[patchIndex];
+                if (sharpnessIndex >= 0)
+                    sharpness = farPatchTable->GetSharpnessValues()[sharpnessIndex];
+            }
+            PatchParam param;
+            //param.patchParam = patchParamTable[patchIndex];
+            param.faceIndex = patchParamTable[patchIndex].faceIndex;
+            param.patchBits = patchParamTable[patchIndex].bitField.field;
+            param.sharpness = sharpness;
+            _patchParamBuffer.push_back(param);
+        }
+#endif
+    }
+}
+
+}  // end namespace Osd
+
+}  // end namespace OPENSUBDIV_VERSION
+}  // end namespace OpenSubdiv
+
diff --git a/opensubdiv/osd/cpuPatchTable.h b/opensubdiv/osd/cpuPatchTable.h
new file mode 100644
index 00000000..05151a30
--- /dev/null
+++ b/opensubdiv/osd/cpuPatchTable.h
@@ -0,0 +1,102 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#ifndef OPENSUBDIV3_OSD_CPU_PATCH_TABLE_H
+#define OPENSUBDIV3_OSD_CPU_PATCH_TABLE_H
+
+#include "../version.h"
+
+#include <vector>
+#include "../far/patchDescriptor.h"
+#include "../osd/nonCopyable.h"
+#include "../osd/opengl.h"
+#include "../osd/types.h"
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+
+namespace Far{
+    class PatchTable;
+};
+
+namespace Osd {
+
+/// \brief Cpu patch table
+///
+/// XXX: We can use just Far::PatchTable for typical CpuEval use cases.
+///
+///  Currently this class exists because of the template resolution
+///  for the CpuEvaluator's generic interface functions
+///    (glEvalLimit example uses), and
+///  device-specific patch tables such as GLPatchTables internally use
+///  as a staging buffer to splice patcharray and interleave sharpnesses.
+///
+///  Ideally Far::PatchTables should have the same data representation
+///  and accessors so that we don't have to copy data unnecessarily.
+///
+class CpuPatchTable {
+public:
+    static CpuPatchTable *Create(const Far::PatchTable *patchTable,
+                                 void *deviceContext = NULL) {
+        (void)deviceContext;  // unused
+        return new CpuPatchTable(patchTable);
+    }
+
+    explicit CpuPatchTable(const Far::PatchTable *patchTable);
+    ~CpuPatchTable() {}
+
+    const PatchArray *GetPatchArrayBuffer() const {
+        return &_patchArrays[0];
+    }
+    const int *GetPatchIndexBuffer() const {
+        return &_indexBuffer[0];
+    }
+    const PatchParam *GetPatchParamBuffer() const {
+        return &_patchParamBuffer[0];
+    }
+
+    size_t GetNumPatchArrays() const {
+        return _patchArrays.size();
+    }
+    size_t GetPatchIndexSize() const {
+        return _indexBuffer.size();
+    }
+    size_t GetPatchParamSize() const {
+        return _patchParamBuffer.size();
+    }
+
+protected:
+    PatchArrayVector _patchArrays;
+    std::vector<int> _indexBuffer;
+    PatchParamVector _patchParamBuffer;
+};
+
+}  // end namespace Osd
+
+}  // end namespace OPENSUBDIV_VERSION
+using namespace OPENSUBDIV_VERSION;
+
+}  // end namespace OpenSubdiv
+
+#endif  // OPENSUBDIV3_OSD_CPU_PATCH_TABLE_H
diff --git a/opensubdiv/osd/cpuVertexBuffer.cpp b/opensubdiv/osd/cpuVertexBuffer.cpp
index 714b783e..9f354da5 100644
--- a/opensubdiv/osd/cpuVertexBuffer.cpp
+++ b/opensubdiv/osd/cpuVertexBuffer.cpp
@@ -52,7 +52,8 @@ CpuVertexBuffer::Create(int numElements, int numVertices,
 }
 
 void
-CpuVertexBuffer::UpdateData(const float *src, int startVertex, int numVertices) {
+CpuVertexBuffer::UpdateData(const float *src, int startVertex, int numVertices,
+                            void * /*deviceContext*/) {
 
     memcpy(_cpuBuffer + startVertex * _numElements,
            src, GetNumElements() * numVertices * sizeof(float));
diff --git a/opensubdiv/osd/cpuVertexBuffer.h b/opensubdiv/osd/cpuVertexBuffer.h
index ff0302b4..da24680a 100644
--- a/opensubdiv/osd/cpuVertexBuffer.h
+++ b/opensubdiv/osd/cpuVertexBuffer.h
@@ -50,7 +50,8 @@ public:
 
     /// This method is meant to be used in client code in order to provide
     /// coarse vertices data to Osd.
-    void UpdateData(const float *src, int startVertex, int numVertices);
+    void UpdateData(const float *src, int startVertex, int numVertices,
+                    void *deviceContext = NULL);
 
     /// Returns how many elements defined in this vertex buffer.
     int GetNumElements() const;
diff --git a/opensubdiv/osd/cudaEvaluator.cpp b/opensubdiv/osd/cudaEvaluator.cpp
index 74907acd..3335fb88 100644
--- a/opensubdiv/osd/cudaEvaluator.cpp
+++ b/opensubdiv/osd/cudaEvaluator.cpp
@@ -28,6 +28,7 @@
 #include <vector>
 
 #include "../far/stencilTable.h"
+#include "../osd/types.h"
 
 extern "C" {
     void CudaEvalStencils(const float *src,
@@ -41,6 +42,25 @@ extern "C" {
                           const float * weights,
                           int start,
                           int end);
+
+    void CudaEvalPatches(
+        const float *src, float *dst,
+        int length, int srcStride, int dstStride,
+        int numPatchCoords,
+        const void *patchCoords,
+        const void *patchArrays,
+        const int *patchIndices,
+        const void *patchParams);
+
+    void CudaEvalPatchesWithDerivatives(
+        const float *src, float *dst, float *du, float *dv,
+        int length,
+        int srcStride, int dstStride, int dvStride, int duStride,
+        int numPatchCoords,
+        const void *patchCoords,
+        const void *patchArrays,
+        const int *patchIndices,
+        const void *patchParams);
 }
 
 namespace OpenSubdiv {
@@ -102,6 +122,8 @@ CudaEvaluator::EvalStencils(const float *src,
                             const float * weights,
                             int start,
                             int end) {
+    if (dst == NULL) return false;
+
     CudaEvalStencils(src + srcDesc.offset,
                      dst + dstDesc.offset,
                      srcDesc.length,
@@ -112,6 +134,105 @@ CudaEvaluator::EvalStencils(const float *src,
     return true;
 }
 
+/* static */
+bool
+CudaEvaluator::EvalStencils(const float *src,
+                            VertexBufferDescriptor const &srcDesc,
+                            float *dst,
+                            VertexBufferDescriptor const &dstDesc,
+                            float *dstDu,
+                            VertexBufferDescriptor const &dstDuDesc,
+                            float *dstDv,
+                            VertexBufferDescriptor const &dstDvDesc,
+                            const int * sizes,
+                            const int * offsets,
+                            const int * indices,
+                            const float * weights,
+                            const float * duWeights,
+                            const float * dvWeights,
+                            int start,
+                            int end) {
+    // PERFORMANCE: need to combine 3 launches together
+    if (dst) {
+        CudaEvalStencils(src + srcDesc.offset,
+                         dst + dstDesc.offset,
+                         srcDesc.length,
+                         srcDesc.stride,
+                         dstDesc.stride,
+                         sizes, offsets, indices, weights,
+                         start, end);
+    }
+    if (dstDu) {
+        CudaEvalStencils(src + srcDesc.offset,
+                         dstDu + dstDuDesc.offset,
+                         srcDesc.length,
+                         srcDesc.stride,
+                         dstDuDesc.stride,
+                         sizes, offsets, indices, duWeights,
+                         start, end);
+    }
+    if (dstDv) {
+        CudaEvalStencils(src + srcDesc.offset,
+                         dstDv + dstDvDesc.offset,
+                         srcDesc.length,
+                         srcDesc.stride,
+                         dstDvDesc.stride,
+                         sizes, offsets, indices, dvWeights,
+                         start, end);
+    }
+    return true;
+}
+
+/* static */
+bool
+CudaEvaluator::EvalPatches(const float *src,
+                           VertexBufferDescriptor const &srcDesc,
+                           float *dst,
+                           VertexBufferDescriptor const &dstDesc,
+                           int numPatchCoords,
+                           const PatchCoord *patchCoords,
+                           const PatchArray *patchArrays,
+                           const int *patchIndices,
+                           const PatchParam *patchParams) {
+    src += srcDesc.offset;
+    if (dst)
+        dst += dstDesc.offset;
+
+    CudaEvalPatches(src, dst,
+                    srcDesc.length, srcDesc.stride, dstDesc.stride,
+                    numPatchCoords, patchCoords, patchArrays, patchIndices, patchParams);
+
+    return true;
+}
+
+/* static */
+bool
+CudaEvaluator::EvalPatches(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    float *du,        VertexBufferDescriptor const &duDesc,
+    float *dv,        VertexBufferDescriptor const &dvDesc,
+    int numPatchCoords,
+    const PatchCoord *patchCoords,
+    const PatchArray *patchArrays,
+    const int *patchIndices,
+    const PatchParam *patchParams) {
+
+    if (src) src += srcDesc.offset;
+    if (dst) dst += dstDesc.offset;
+    if (du)  du  += duDesc.offset;
+    if (dv)  dv  += dvDesc.offset;
+
+    CudaEvalPatchesWithDerivatives(
+        src, dst, du, dv,
+        srcDesc.length, srcDesc.stride,
+        dstDesc.stride, duDesc.stride, dvDesc.stride,
+        numPatchCoords, patchCoords, patchArrays, patchIndices, patchParams);
+    return true;
+}
+
+
+
 /* static */
 void
 CudaEvaluator::Synchronize(void * /*deviceContext*/) {
diff --git a/opensubdiv/osd/cudaEvaluator.h b/opensubdiv/osd/cudaEvaluator.h
index ee36e0b6..c57f4a22 100644
--- a/opensubdiv/osd/cudaEvaluator.h
+++ b/opensubdiv/osd/cudaEvaluator.h
@@ -29,11 +29,13 @@
 
 #include <vector>
 #include "../osd/vertexDescriptor.h"
+#include "../osd/types.h"
 
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
 
 namespace Far {
+    class PatchTable;
     class StencilTable;
 }
 
@@ -43,7 +45,7 @@ namespace Osd {
 ///
 /// This class is a cuda buffer representation of Far::StencilTable.
 ///
-/// CudaComputeKernel consumes this table to apply stencils
+/// CudaEvaluator consumes this table to apply stencils
 ///
 ///
 class CudaStencilTable {
@@ -72,10 +74,14 @@ private:
     int _numStencils;
 };
 
-// ---------------------------------------------------------------------------
-
 class CudaEvaluator {
 public:
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Stencil evaluations with StencilTable
+    ///
+    /// ----------------------------------------------------------------------
+
     /// \brief Generic static compute function. This function has a same
     ///        signature as other device kernels have so that it can be called
     ///        transparently from OsdMesh template interface.
@@ -99,21 +105,18 @@ public:
     ///
     /// @param deviceContext  not used in the CudaEvaluator
     ///
-    template <typename VERTEX_BUFFER, typename STENCIL_TABLE>
-    static bool EvalStencils(VERTEX_BUFFER *srcVertexBuffer,
-                             VertexBufferDescriptor const &srcDesc,
-                             VERTEX_BUFFER *dstVertexBuffer,
-                             VertexBufferDescriptor const &dstDesc,
-                             STENCIL_TABLE const *stencilTable,
-                             const void *instance = NULL,
-                             void * deviceContext = NULL) {
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable,
+        const void *instance = NULL,
+        void * deviceContext = NULL) {
 
         (void)instance;  // unused
         (void)deviceContext;  // unused
-        return EvalStencils(srcVertexBuffer->BindCudaBuffer(),
-                            srcDesc,
-                            dstVertexBuffer->BindCudaBuffer(),
-                            dstDesc,
+        return EvalStencils(srcBuffer->BindCudaBuffer(), srcDesc,
+                            dstBuffer->BindCudaBuffer(), dstDesc,
                             (int const *)stencilTable->GetSizesBuffer(),
                             (int const *)stencilTable->GetOffsetsBuffer(),
                             (int const *)stencilTable->GetIndicesBuffer(),
@@ -122,17 +125,369 @@ public:
                             /*end   = */ stencilTable->GetNumStencils());
     }
 
-    static bool EvalStencils(const float *src,
-                             VertexBufferDescriptor const &srcDesc,
-                             float *dst,
-                             VertexBufferDescriptor const &dstDesc,
-                             const int * sizes,
-                             const int * offsets,
-                             const int * indices,
-                             const float * weights,
-                             int start,
-                             int end);
+    /// \brief Static eval stencils function which takes raw cuda buffers for
+    ///        input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        int start, int end);
 
+    /// \brief Generic static eval stencils function with derivatives.
+    ///        This function has a same signature as other device kernels
+    ///        have so that it can be called in the same way from OsdMesh
+    ///        template interface.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer       Output U-derivative buffer
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param dvBuffer       Output V-derivative buffer
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param stencilTable   stencil table to be applied.
+    ///
+    /// @param instance       not used in the cuda kernel
+    ///                       (declared as a typed pointer to prevent
+    ///                        undesirable template resolution)
+    ///
+    /// @param deviceContext  not used in the cuda kernel
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        STENCIL_TABLE const *stencilTable,
+        const CudaEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;   // unused
+        (void)deviceContext;   // unused
+
+        return EvalStencils(srcBuffer->BindCudaBuffer(), srcDesc,
+                            dstBuffer->BindCudaBuffer(), dstDesc,
+                            duBuffer->BindCudaBuffer(),  duDesc,
+                            dvBuffer->BindCudaBuffer(),  dvDesc,
+                            &stencilTable->GetSizes()[0],
+                            &stencilTable->GetOffsets()[0],
+                            &stencilTable->GetControlIndices()[0],
+                            &stencilTable->GetWeights()[0],
+                            &stencilTable->GetDuWeights()[0],
+                            &stencilTable->GetDvWeights()[0],
+                            /*start = */ 0,
+                            /*end   = */ stencilTable->GetNumStencils());
+    }
+
+    /// \brief Static eval stencils function with derivatives, which takes
+    ///        raw cuda pointers for input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param du             Output U-derivatives pointer. An offset of
+    ///                       duDesc will be applied internally.
+    ///
+    /// @param duDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param dv             Output V-derivatives pointer. An offset of
+    ///                       dvDesc will be applied internally.
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param duWeights      pointer to the du-weights buffer of the stencil table
+    ///
+    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        float *du,        VertexBufferDescriptor const &duDesc,
+        float *dv,        VertexBufferDescriptor const &dvDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        const float * duWeights,
+        const float * dvWeights,
+        int start, int end);
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Limit evaluations with PatchTable
+    ///
+    /// ----------------------------------------------------------------------
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///                         must have BindCudaBuffer() method returning an
+    ///                         array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable       CudaPatchTable or equivalent
+    ///
+    /// @param instance         not used in the cuda evaluator
+    ///
+    /// @param deviceContext    not used in the cuda evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CudaEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        (void)instance;   // unused
+        (void)deviceContext;   // unused
+
+        return EvalPatches(srcBuffer->BindCudaBuffer(), srcDesc,
+                           dstBuffer->BindCudaBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord *)patchCoords->BindCudaBuffer(),
+                           (const PatchArray *)patchTable->GetPatchArrayBuffer(),
+                           (const int *)patchTable->GetPatchIndexBuffer(),
+                           (const PatchParam *)patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function with derivatives. This function has
+    ///        a same signature as other device kernels have so that it can be
+    ///        called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output U-derivatives buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output V-derivatives buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CudaPatchTable or equivalent
+    ///
+    /// @param instance         not used in the cuda evaluator
+    ///
+    /// @param deviceContext    not used in the cuda evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        CudaEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        (void)instance;   // unused
+        (void)deviceContext;   // unused
+
+        return EvalPatches(srcBuffer->BindCudaBuffer(), srcDesc,
+                           dstBuffer->BindCudaBuffer(), dstDesc,
+                           duBuffer->BindCudaBuffer(),  duDesc,
+                           dvBuffer->BindCudaBuffer(),  dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord *)patchCoords->BindCudaBuffer(),
+                           (const PatchArray *)patchTable->GetPatchArrayBuffer(),
+                           (const int *)patchTable->GetPatchIndexBuffer(),
+                           (const PatchParam *)patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Static limit eval function. It takes an array of PatchCoord
+    ///        and evaluate limit values on given PatchTable.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndices     an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParams      an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        const PatchCoord *patchCoords,
+        const PatchArray *patchArrays,
+        const int *patchIndices,
+        const PatchParam *patchParams);
+
+    /// \brief Static limit eval function. It takes an array of PatchCoord
+    ///        and evaluate limit values on given PatchTable.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param du               Output U-derivatives pointer. An offset of
+    ///                         duDesc will be applied internally.
+    ///
+    /// @param duDesc           vertex buffer descriptor for the du buffer
+    ///
+    /// @param dv               Output V-derivatives pointer. An offset of
+    ///                         dvDesc will be applied internally.
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dv buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndices     an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParams      an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        float *du,        VertexBufferDescriptor const &duDesc,
+        float *dv,        VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        const PatchCoord *patchCoords,
+        const PatchArray *patchArrays,
+        const int *patchIndices,
+        const PatchParam *patchParams);
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Other methods
+    ///
+    /// ----------------------------------------------------------------------
     static void Synchronize(void *deviceContext = NULL);
 };
 
diff --git a/opensubdiv/osd/cudaKernel.cu b/opensubdiv/osd/cudaKernel.cu
index daa041e3..3ca69382 100644
--- a/opensubdiv/osd/cudaKernel.cu
+++ b/opensubdiv/osd/cudaKernel.cu
@@ -238,6 +238,204 @@ __global__ void computeStencilsNv_v4(float const *__restrict cvs,
 
 // -----------------------------------------------------------------------------
 
+// Osd::PatchCoord osd/types.h
+struct PatchCoord {
+    int arrayIndex;
+    int patchIndex;
+    int vertIndex;
+    float s;
+    float t;
+};
+struct PatchArray {
+    int patchType;        // Far::PatchDescriptor::Type
+    int numPatches;
+    int indexBase;        // offset in the index buffer
+    int primitiveIdBase;  // offset in the patch param buffer
+};
+struct PatchParam {
+    int faceIndex;
+    unsigned int bitField;
+    float sharpness;
+};
+
+__device__ void
+getBSplineWeights(float t, float point[4], float deriv[4]) {
+    // The four uniform cubic B-Spline basis functions evaluated at t:
+    float const one6th = 1.0f / 6.0f;
+
+    float t2 = t * t;
+    float t3 = t * t2;
+
+    point[0] = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
+    point[1] = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
+    point[2] = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
+    point[3] = one6th * (                                 t3);
+
+    // Derivatives of the above four basis functions at t:
+    if (deriv) {
+        deriv[0] = -0.5f*t2 +      t - 0.5f;
+        deriv[1] =  1.5f*t2 - 2.0f*t;
+        deriv[2] = -1.5f*t2 +      t + 0.5f;
+        deriv[3] =  0.5f*t2;
+    }
+}
+
+__device__ void
+adjustBoundaryWeights(unsigned int bits, float sWeights[4], float tWeights[4]) {
+    int boundary = ((bits >> 4) & 0xf);  // far/patchParam.h
+
+    if (boundary & 1) {
+        tWeights[2] -= tWeights[0];
+        tWeights[1] += 2*tWeights[0];
+        tWeights[0] = 0;
+    }
+    if (boundary & 2) {
+        sWeights[1] -= sWeights[3];
+        sWeights[2] += 2*sWeights[3];
+        sWeights[3] = 0;
+    }
+    if (boundary & 4) {
+        tWeights[1] -= tWeights[3];
+        tWeights[2] += 2*tWeights[3];
+        tWeights[3] = 0;
+    }
+    if (boundary & 8) {
+        sWeights[2] -= sWeights[0];
+        sWeights[1] += 2*sWeights[0];
+        sWeights[0] = 0;
+    }
+}
+
+__device__
+int getDepth(unsigned int patchBits) {
+    return (patchBits & 0x7);
+}
+
+__device__
+float getParamFraction(unsigned int patchBits) {
+    bool nonQuadRoot = (patchBits >> 3) & 0x1;
+    int depth = getDepth(patchBits);
+    if (nonQuadRoot) {
+        return 1.0f / float( 1 << (depth-1) );
+    } else {
+        return 1.0f / float( 1 << depth );
+    }
+}
+
+__device__
+void normalizePatchCoord(unsigned int patchBits, float *u, float *v) {
+    float frac = getParamFraction(patchBits);
+
+    int iu = (patchBits >> 22) & 0x3ff;
+    int iv = (patchBits >> 12) & 0x3ff;
+
+    // top left corner
+    float pu = (float)iu*frac;
+    float pv = (float)iv*frac;
+
+    // normalize u,v coordinates
+    *u = (*u - pu) / frac,
+    *v = (*v - pv) / frac;
+}
+
+// Far::PatchDescriptor::Type
+enum Type {
+    NON_PATCH = 0,     ///< undefined
+    POINTS,            ///< points (useful for cage drawing)
+    LINES,             ///< lines  (useful for cage drawing)
+    QUADS,             ///< bilinear quads-only patches
+    TRIANGLES,         ///< bilinear triangles-only mesh
+    LOOP,              ///< Loop patch
+    REGULAR,           ///< feature-adaptive bicubic patches
+    GREGORY,
+    GREGORY_BOUNDARY,
+    GREGORY_BASIS
+};
+
+__global__ void
+computePatches(const float *src, float *dst, float *dstDu, float *dstDv,
+               int length, int srcStride, int dstStride, int dstDuStride, int dstDvStride,
+               int numPatchCoords, const PatchCoord *patchCoords,
+               const PatchArray *patchArrayBuffer,
+               const int *patchIndexBuffer,
+               const PatchParam *patchParamBuffer) {
+
+    int first = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // PERFORMANCE: not yet optimized
+
+    float wP[20], wDs[20], wDt[20];
+
+    for (int i = first; i < numPatchCoords; i += blockDim.x * gridDim.x) {
+
+        PatchCoord const &coord = patchCoords[i];
+        PatchArray const &array = patchArrayBuffer[coord.arrayIndex];
+
+        int patchType = array.patchType;
+        int numControlVertices = 16;
+        // note: patchIndex is absolute.
+        unsigned int patchBits = patchParamBuffer[coord.patchIndex].bitField;
+
+        // normalize
+        float s = coord.s;
+        float t = coord.t;
+        normalizePatchCoord(patchBits, &s, &t);
+        float dScale = (float)(1 << getDepth(patchBits));
+
+        if (patchType == REGULAR) {
+            float sWeights[4], tWeights[4], dsWeights[4], dtWeights[4];
+            getBSplineWeights(s, sWeights, dsWeights);
+            getBSplineWeights(t, tWeights, dtWeights);
+
+            // Compute the tensor product weight of the (s,t) basis function
+            // corresponding to each control vertex:
+            adjustBoundaryWeights(patchBits, sWeights, tWeights);
+            adjustBoundaryWeights(patchBits, dsWeights, dtWeights);
+
+            for (int k = 0; k < 4; ++k) {
+                for (int l = 0; l < 4; ++l) {
+                    wP[4*k+l]  = sWeights[l]  * tWeights[k];
+                    wDs[4*k+l] = dsWeights[l] * tWeights[k]  * dScale;
+                    wDt[4*k+l] = sWeights[l]  * dtWeights[k] * dScale;
+                }
+            }
+        } else if (patchType == GREGORY_BASIS) {
+            // XXX: not yet implemented.
+            continue;
+        } else {
+            // unknown patchType
+            continue;
+        }
+        const int *cvs =
+            &patchIndexBuffer[array.indexBase + coord.vertIndex];
+
+        float * dstVert = dst + i * dstStride;
+        clear(dstVert, length);
+        for (int j = 0; j < numControlVertices; ++j) {
+            const float * srcVert = src + cvs[j] * srcStride;
+            addWithWeight(dstVert, srcVert, wP[j], length);
+        }
+        if (dstDu) {
+            float *d = dstDu + i * dstDuStride;
+            clear(d, length);
+            for (int j = 0; j < numControlVertices; ++j) {
+                const float * srcVert = src + cvs[j] * srcStride;
+                addWithWeight(d, srcVert, wDs[j], length);
+            }
+        }
+        if (dstDv) {
+            float *d = dstDv + i * dstDvStride;
+            clear(d, length);
+            for (int j = 0; j < numControlVertices; ++j) {
+                const float * srcVert = src + cvs[j] * srcStride;
+                addWithWeight(d, srcVert, wDt[j], length);
+            }
+        }
+    }
+}
+
+// -----------------------------------------------------------------------------
+
 #include "../version.h"
 
 #define OPT_KERNEL(NUM_ELEMENTS, KERNEL, X, Y, ARG) \
@@ -257,20 +455,12 @@ __global__ void computeStencilsNv_v4(float const *__restrict cvs,
 
 extern "C" {
 
-void CudaEvalStencils(const float *src,
-                      float *dst,
-                      int length,
-                      int srcStride,
-                      int dstStride,
-                      const int * sizes,
-                      const int * offsets,
-                      const int * indices,
-                      const float * weights,
-                      int start,
-                      int end)
-{
-//    assert(cvs and dst and sizes and offsets and indices and weights and (end>=start));
-
+void CudaEvalStencils(
+    const float *src, float *dst,
+    int length, int srcStride, int dstStride,
+    const int * sizes, const int * offsets, const int * indices,
+    const float * weights,
+    int start, int end) {
     if (length == 0 or srcStride == 0 or dstStride == 0 or (end <= start)) {
         return;
     }
@@ -301,4 +491,36 @@ void CudaEvalStencils(const float *src,
 
 // -----------------------------------------------------------------------------
 
+void CudaEvalPatches(
+    const float *src, float *dst,
+    int length, int srcStride, int dstStride,
+    int numPatchCoords, const PatchCoord *patchCoords,
+    const PatchArray *patchArrayBuffer,
+    const int *patchIndexBuffer,
+    const PatchParam *patchParamBuffer) {
+
+    // PERFORMANCE: not optimized at all
+
+    computePatches <<<512, 32>>>(
+        src, dst, NULL, NULL, length, srcStride, dstStride, 0, 0,
+        numPatchCoords, patchCoords,
+        patchArrayBuffer, patchIndexBuffer, patchParamBuffer);
+}
+
+void CudaEvalPatchesWithDerivatives(
+    const float *src, float *dst, float *dstDu, float *dstDv,
+    int length, int srcStride, int dstStride, int dstDuStride, int dstDvStride,
+    int numPatchCoords, const PatchCoord *patchCoords,
+    const PatchArray *patchArrayBuffer,
+    const int *patchIndexBuffer,
+    const PatchParam *patchParamBuffer) {
+
+    // PERFORMANCE: not optimized at all
+
+    computePatches <<<512, 32>>>(
+        src, dst, dstDu, dstDv, length, srcStride, dstStride, dstDuStride, dstDvStride,
+        numPatchCoords, patchCoords,
+        patchArrayBuffer, patchIndexBuffer, patchParamBuffer);
+}
+
 }  /* extern "C" */
diff --git a/opensubdiv/osd/cudaPatchTable.cpp b/opensubdiv/osd/cudaPatchTable.cpp
new file mode 100644
index 00000000..f90dcc1a
--- /dev/null
+++ b/opensubdiv/osd/cudaPatchTable.cpp
@@ -0,0 +1,103 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#include "../osd/cudaPatchTable.h"
+
+#include <cuda_runtime.h>
+
+#include "../far/patchTable.h"
+#include "../osd/cpuPatchTable.h"
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+
+namespace Osd {
+
+CudaPatchTable::CudaPatchTable() :
+    _patchArrays(NULL), _indexBuffer(NULL), _patchParamBuffer(NULL) {
+}
+
+CudaPatchTable::~CudaPatchTable() {
+    if (_patchArrays) cudaFree(_patchArrays);
+    if (_indexBuffer) cudaFree(_indexBuffer);
+    if (_patchParamBuffer) cudaFree(_patchParamBuffer);
+}
+
+CudaPatchTable *
+CudaPatchTable::Create(Far::PatchTable const *farPatchTable,
+                       void * /*deviceContext*/) {
+    CudaPatchTable *instance = new CudaPatchTable();
+    if (instance->allocate(farPatchTable)) return instance;
+    delete instance;
+    return 0;
+}
+
+bool
+CudaPatchTable::allocate(Far::PatchTable const *farPatchTable) {
+    CpuPatchTable patchTable(farPatchTable);
+
+    size_t numPatchArrays = patchTable.GetNumPatchArrays();
+    size_t indexSize = patchTable.GetPatchIndexSize();
+    size_t patchParamSize = patchTable.GetPatchParamSize();
+
+    cudaError_t err;
+    err = cudaMalloc(&_patchArrays, numPatchArrays * sizeof(Osd::PatchArray));
+    if (err != cudaSuccess) return false;
+
+    err = cudaMalloc(&_indexBuffer, indexSize * sizeof(int));
+    if (err != cudaSuccess) return false;
+
+    err = cudaMalloc(&_patchParamBuffer, patchParamSize * sizeof(Osd::PatchParam));
+    if (err != cudaSuccess) return false;
+
+    // copy patch array
+    err = cudaMemcpy(_patchArrays,
+                     patchTable.GetPatchArrayBuffer(),
+                     numPatchArrays * sizeof(Osd::PatchArray),
+                     cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) return false;
+
+    // copy index buffer
+    err = cudaMemcpy(_indexBuffer,
+                     patchTable.GetPatchIndexBuffer(),
+                     indexSize * sizeof(int),
+                     cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) return false;
+
+    // patch param buffer
+    err = cudaMemcpy(_patchParamBuffer,
+                     patchTable.GetPatchParamBuffer(),
+                     patchParamSize * sizeof(Osd::PatchParam),
+                     cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) return false;
+
+    return true;
+}
+
+
+}  // end namespace Osd
+
+}  // end namespace OPENSUBDIV_VERSION
+}  // end namespace OpenSubdiv
+
diff --git a/opensubdiv/osd/cudaPatchTable.h b/opensubdiv/osd/cudaPatchTable.h
new file mode 100644
index 00000000..50f2a8ca
--- /dev/null
+++ b/opensubdiv/osd/cudaPatchTable.h
@@ -0,0 +1,83 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#ifndef OPENSUBDIV3_OSD_CUDA_PATCH_TABLE_H
+#define OPENSUBDIV3_OSD_CUDA_PATCH_TABLE_H
+
+#include "../version.h"
+
+#include "../osd/nonCopyable.h"
+#include "../osd/types.h"
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+
+namespace Far{
+    class PatchTable;
+};
+
+namespace Osd {
+
+/// \brief CUDA patch table
+///
+/// This class is a cuda buffer representation of Far::PatchTable.
+///
+/// CudaEvaluator consumes this table to evaluate on the patches.
+///
+///
+class CudaPatchTable : private NonCopyable<CudaPatchTable> {
+public:
+    /// Creator. Returns NULL if error
+    static CudaPatchTable *Create(Far::PatchTable const *patchTable,
+                                  void *deviceContext = NULL);
+    /// Destructor
+    ~CudaPatchTable();
+
+    /// Returns the cuda memory of the array of Osd::PatchArray buffer
+    void *GetPatchArrayBuffer() const { return _patchArrays; }
+
+    /// Returns the cuda memory of the patch control vertices
+    void *GetPatchIndexBuffer() const { return _indexBuffer; }
+
+    /// Returns the cuda memory of the array of Osd::PatchParam buffer
+    void *GetPatchParamBuffer() const { return _patchParamBuffer; }
+
+protected:
+    CudaPatchTable();
+
+    bool allocate(Far::PatchTable const *patchTable);
+
+    void *_patchArrays;
+    void *_indexBuffer;
+    void *_patchParamBuffer;
+};
+
+}  // end namespace Osd
+
+}  // end namespace OPENSUBDIV_VERSION
+using namespace OPENSUBDIV_VERSION;
+
+}  // end namespace OpenSubdiv
+
+#endif  // OPENSUBDIV3_OSD_CUDA_PATCH_TABLE_H
diff --git a/opensubdiv/osd/cudaVertexBuffer.cpp b/opensubdiv/osd/cudaVertexBuffer.cpp
index fb3e72b7..38ac62db 100644
--- a/opensubdiv/osd/cudaVertexBuffer.cpp
+++ b/opensubdiv/osd/cudaVertexBuffer.cpp
@@ -43,7 +43,8 @@ CudaVertexBuffer::~CudaVertexBuffer() {
 }
 
 CudaVertexBuffer *
-CudaVertexBuffer::Create(int numElements, int numVertices) {
+CudaVertexBuffer::Create(int numElements, int numVertices,
+                         void * /*deviceContext */) {
     CudaVertexBuffer *instance =
         new CudaVertexBuffer(numElements, numVertices);
     if (instance->allocate()) return instance;
@@ -52,7 +53,8 @@ CudaVertexBuffer::Create(int numElements, int numVertices) {
 }
 
 void
-CudaVertexBuffer::UpdateData(const float *src, int startVertex, int numVertices) {
+CudaVertexBuffer::UpdateData(const float *src, int startVertex, int numVertices,
+                             void * /*deviceContext*/) {
 
     size_t size = _numElements * numVertices * sizeof(float);
 
diff --git a/opensubdiv/osd/cudaVertexBuffer.h b/opensubdiv/osd/cudaVertexBuffer.h
index 6a660142..df1d2dec 100644
--- a/opensubdiv/osd/cudaVertexBuffer.h
+++ b/opensubdiv/osd/cudaVertexBuffer.h
@@ -27,6 +27,8 @@
 
 #include "../version.h"
 
+#include <cstddef>
+
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
 
@@ -41,14 +43,16 @@ class CudaVertexBuffer {
 
 public:
     /// Creator. Returns NULL if error.
-    static CudaVertexBuffer * Create(int numElements, int numVertices);
+    static CudaVertexBuffer * Create(int numElements, int numVertices,
+                                     void *deviceContext = NULL);
 
     /// Destructor.
     ~CudaVertexBuffer();
 
     /// This method is meant to be used in client code in order to provide coarse
     /// vertices data to Osd.
-    void UpdateData(const float *src, int startVertex, int numVertices);
+    void UpdateData(const float *src, int startVertex, int numVertices,
+                    void *deviceContext=NULL);
 
     /// Returns how many elements defined in this vertex buffer.
     int GetNumElements() const;
diff --git a/opensubdiv/osd/d3d11PatchTable.h b/opensubdiv/osd/d3d11PatchTable.h
index 5fed93b7..69b3aa47 100644
--- a/opensubdiv/osd/d3d11PatchTable.h
+++ b/opensubdiv/osd/d3d11PatchTable.h
@@ -30,6 +30,7 @@
 #include <vector>
 #include "../far/patchDescriptor.h"
 #include "../osd/nonCopyable.h"
+#include "../osd/types.h"
 
 struct ID3D11Buffer;
 struct ID3D11ShaderResourceView;
@@ -49,33 +50,6 @@ class D3D11PatchTable : private NonCopyable<D3D11PatchTable> {
 public:
     typedef ID3D11Buffer * VertexBufferBinding;
 
-    // XXX: this struct will be further refactored.
-    class PatchArray {
-    public:
-        PatchArray(Far::PatchDescriptor desc, int numPatches,
-                   int indexBase, int primitiveIdBase) :
-            desc(desc), numPatches(numPatches), indexBase(indexBase),
-            primitiveIdBase(primitiveIdBase) {}
-        Far::PatchDescriptor const &GetDescriptor() const {
-            return desc;
-        }
-        int GetNumPatches() const {
-            return numPatches;
-        }
-        int GetIndexBase() const {
-            return indexBase;
-        }
-        int GetPrimitiveIdBase() const {
-            return primitiveIdBase;
-        }
-    private:
-        Far::PatchDescriptor desc;
-        int numPatches;
-        int indexBase;        // an offset within the index buffer
-        int primitiveIdBase;  // an offset within the patch param buffer
-    };
-    typedef std::vector<PatchArray> PatchArrayVector;
-
     D3D11PatchTable();
     ~D3D11PatchTable();
 
diff --git a/opensubdiv/osd/glComputeEvaluator.cpp b/opensubdiv/osd/glComputeEvaluator.cpp
index a43bd99b..800df867 100644
--- a/opensubdiv/osd/glComputeEvaluator.cpp
+++ b/opensubdiv/osd/glComputeEvaluator.cpp
@@ -88,16 +88,66 @@ GLStencilTableSSBO::~GLStencilTableSSBO() {
 // ---------------------------------------------------------------------------
 
 
-GLComputeEvaluator::GLComputeEvaluator() :
-    _program(0), _workGroupSize(64) {
+GLComputeEvaluator::GLComputeEvaluator() : _workGroupSize(64) {
+    memset (&_stencilKernel, 0, sizeof(_stencilKernel));
+    memset (&_patchKernel, 0, sizeof(_patchKernel));
 }
 
 GLComputeEvaluator::~GLComputeEvaluator() {
-    if (_program) {
-        glDeleteProgram(_program);
+    if (_stencilKernel.program) {
+        glDeleteProgram(_stencilKernel.program);
+    }
+    if (_patchKernel.program) {
+        glDeleteProgram(_patchKernel.program);
     }
 }
 
+static GLuint
+compileKernel(VertexBufferDescriptor const &srcDesc,
+              VertexBufferDescriptor const &dstDesc,
+              const char *kernelDefine,
+              int workGroupSize) {
+    GLuint program = glCreateProgram();
+
+    GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
+
+    std::ostringstream defines;
+    defines << "#define LENGTH "     << srcDesc.length << "\n"
+            << "#define SRC_STRIDE " << srcDesc.stride << "\n"
+            << "#define DST_STRIDE " << dstDesc.stride << "\n"
+            << "#define WORK_GROUP_SIZE " << workGroupSize << "\n"
+            << kernelDefine << "\n";
+    std::string defineStr = defines.str();
+
+    const char *shaderSources[3] = {"#version 430\n", 0, 0};
+    shaderSources[1] = defineStr.c_str();
+    shaderSources[2] = shaderSource;
+    glShaderSource(shader, 3, shaderSources, NULL);
+    glCompileShader(shader);
+    glAttachShader(program, shader);
+
+    GLint linked = 0;
+    glLinkProgram(program);
+    glGetProgramiv(program, GL_LINK_STATUS, &linked);
+
+    if (linked == GL_FALSE) {
+        char buffer[1024];
+        glGetShaderInfoLog(shader, 1024, NULL, buffer);
+        Far::Error(Far::FAR_RUNTIME_ERROR, buffer);
+
+        glGetProgramInfoLog(program, 1024, NULL, buffer);
+        Far::Error(Far::FAR_RUNTIME_ERROR, buffer);
+
+        glDeleteProgram(program);
+        return 0;
+    }
+
+    glDeleteShader(shader);
+
+    return program;
+}
+
+
 bool
 GLComputeEvaluator::Compile(VertexBufferDescriptor const &srcDesc,
                             VertexBufferDescriptor const &dstDesc) {
@@ -108,58 +158,55 @@ GLComputeEvaluator::Compile(VertexBufferDescriptor const &srcDesc,
         return false;
     }
 
-    if (_program) {
-        glDeleteProgram(_program);
-        _program = 0;
+    // create stencil kernel
+    if (_stencilKernel.program) {
+        glDeleteProgram(_stencilKernel.program);
     }
-    _program = glCreateProgram();
-
-    GLuint shader = glCreateShader(GL_COMPUTE_SHADER);
-
-    std::ostringstream defines;
-    defines << "#define LENGTH "     << srcDesc.length << "\n"
-            << "#define SRC_STRIDE " << srcDesc.stride << "\n"
-            << "#define DST_STRIDE " << dstDesc.stride << "\n"
-            << "#define WORK_GROUP_SIZE " << _workGroupSize << "\n";
-    std::string defineStr = defines.str();
-
-    const char *shaderSources[3] = {"#version 430\n", 0, 0};
-    shaderSources[1] = defineStr.c_str();
-    shaderSources[2] = shaderSource;
-    glShaderSource(shader, 3, shaderSources, NULL);
-    glCompileShader(shader);
-    glAttachShader(_program, shader);
-
-    GLint linked = 0;
-    glLinkProgram(_program);
-    glGetProgramiv(_program, GL_LINK_STATUS, &linked);
-
-    if (linked == GL_FALSE) {
-        char buffer[1024];
-        glGetShaderInfoLog(shader, 1024, NULL, buffer);
-        Far::Error(Far::FAR_RUNTIME_ERROR, buffer);
-
-        glGetProgramInfoLog(_program, 1024, NULL, buffer);
-        Far::Error(Far::FAR_RUNTIME_ERROR, buffer);
-
-        glDeleteProgram(_program);
-        _program = 0;
-        return false;
-    }
-
-    glDeleteShader(shader);
+    _stencilKernel.program = compileKernel(
+        srcDesc, dstDesc,
+        "#define OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS",
+        _workGroupSize);
+    if (_stencilKernel.program == 0) return false;
 
     // store uniform locations for the compute kernel program.
-    _uniformSizes   = glGetUniformLocation(_program, "stencilSizes");
-    _uniformOffsets = glGetUniformLocation(_program, "stencilOffsets");
-    _uniformIndices = glGetUniformLocation(_program, "stencilIndices");
-    _uniformWeights = glGetUniformLocation(_program, "stencilIWeights");
+    _stencilKernel.uniformSizes   =
+        glGetUniformLocation(_stencilKernel.program, "stencilSizes");
+    _stencilKernel.uniformOffsets =
+        glGetUniformLocation(_stencilKernel.program, "stencilOffsets");
+    _stencilKernel.uniformIndices =
+        glGetUniformLocation(_stencilKernel.program, "stencilIndices");
+    _stencilKernel.uniformWeights =
+        glGetUniformLocation(_stencilKernel.program, "stencilIWeights");
+    _stencilKernel.uniformStart   =
+        glGetUniformLocation(_stencilKernel.program, "batchStart");
+    _stencilKernel.uniformEnd     =
+        glGetUniformLocation(_stencilKernel.program, "batchEnd");
+    _stencilKernel.uniformSrcOffset =
+        glGetUniformLocation(_stencilKernel.program, "srcOffset");
+    _stencilKernel.uniformDstOffset =
+        glGetUniformLocation(_stencilKernel.program, "dstOffset");
 
-    _uniformStart   = glGetUniformLocation(_program, "batchStart");
-    _uniformEnd     = glGetUniformLocation(_program, "batchEnd");
+    // create patch kernel
+    if (_patchKernel.program) {
+        glDeleteProgram(_patchKernel.program);
+    }
+    _patchKernel.program = compileKernel(
+        srcDesc, dstDesc,
+        "#define OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_PATCHES",
+        _workGroupSize);
+    if (_patchKernel.program == 0) return false;
 
-    _uniformSrcOffset = glGetUniformLocation(_program, "srcOffset");
-    _uniformDstOffset = glGetUniformLocation(_program, "dstOffset");
+    // uniform locaitons
+    _patchKernel.uniformSrcOffset =
+        glGetUniformLocation(_patchKernel.program, "srcOffset");
+    _patchKernel.uniformDstOffset =
+        glGetUniformLocation(_patchKernel.program, "dstOffset");
+    _patchKernel.uniformPatchArray =
+        glGetUniformLocation(_patchKernel.program, "patchArray");
+    _patchKernel.uniformDuDesc =
+        glGetUniformLocation(_patchKernel.program, "dstDuDesc");
+    _patchKernel.uniformDvDesc =
+        glGetUniformLocation(_patchKernel.program, "dstDvDesc");
 
     return true;
 }
@@ -183,7 +230,7 @@ GLComputeEvaluator::EvalStencils(GLuint srcBuffer,
                                  GLuint weightsBuffer,
                                  int start,
                                  int end) const {
-    if (!_program) return false;
+    if (!_stencilKernel.program) return false;
     int count = end - start;
     if (count <= 0) {
         return true;
@@ -196,12 +243,12 @@ GLComputeEvaluator::EvalStencils(GLuint srcBuffer,
     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, indicesBuffer);
     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, weightsBuffer);
 
-    glUseProgram(_program);
+    glUseProgram(_stencilKernel.program);
 
-    glUniform1i(_uniformStart,     start);
-    glUniform1i(_uniformEnd,       end);
-    glUniform1i(_uniformSrcOffset, srcDesc.offset);
-    glUniform1i(_uniformDstOffset, dstDesc.offset);
+    glUniform1i(_stencilKernel.uniformStart,     start);
+    glUniform1i(_stencilKernel.uniformEnd,       end);
+    glUniform1i(_stencilKernel.uniformSrcOffset, srcDesc.offset);
+    glUniform1i(_stencilKernel.uniformDstOffset, dstDesc.offset);
 
     glDispatchCompute((count + _workGroupSize - 1) / _workGroupSize, 1, 1);
 
@@ -218,6 +265,52 @@ GLComputeEvaluator::EvalStencils(GLuint srcBuffer,
     return true;
 }
 
+bool
+GLComputeEvaluator::EvalPatches(
+    GLuint srcBuffer, VertexBufferDescriptor const &srcDesc,
+    GLuint dstBuffer, VertexBufferDescriptor const &dstDesc,
+    GLuint duBuffer, VertexBufferDescriptor const &duDesc,
+    GLuint dvBuffer, VertexBufferDescriptor const &dvDesc,
+    int numPatchCoords,
+    GLuint patchCoordsBuffer,
+    const PatchArrayVector &patchArrays,
+    GLuint patchIndexBuffer,
+    GLuint patchParamsBuffer) const {
+
+    if (!_patchKernel.program) return false;
+
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, srcBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, dstBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, duBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, dvBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, patchCoordsBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, patchIndexBuffer);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, patchParamsBuffer);
+
+    glUseProgram(_patchKernel.program);
+
+    glUniform1i(_patchKernel.uniformSrcOffset, srcDesc.offset);
+    glUniform1i(_patchKernel.uniformDstOffset, dstDesc.offset);
+    glUniform4iv(_patchKernel.uniformPatchArray, (int)patchArrays.size(),
+                 (const GLint*)&patchArrays[0]);
+    glUniform3i(_patchKernel.uniformDuDesc, duDesc.offset, duDesc.length, duDesc.stride);
+    glUniform3i(_patchKernel.uniformDvDesc, dvDesc.offset, dvDesc.length, dvDesc.stride);
+
+    glDispatchCompute((numPatchCoords + _workGroupSize - 1) / _workGroupSize, 1, 1);
+
+    glUseProgram(0);
+
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, 0);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, 0);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, 0);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, 0);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, 0);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, 0);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, 0);
+
+    return true;
+}
+
 }  // end namespace Osd
 
 }  // end namespace OPENSUBDIV_VERSION
diff --git a/opensubdiv/osd/glComputeEvaluator.h b/opensubdiv/osd/glComputeEvaluator.h
index 1e469f6f..8b7f41a5 100644
--- a/opensubdiv/osd/glComputeEvaluator.h
+++ b/opensubdiv/osd/glComputeEvaluator.h
@@ -28,6 +28,7 @@
 #include "../version.h"
 
 #include "../osd/opengl.h"
+#include "../osd/types.h"
 #include "../osd/vertexDescriptor.h"
 
 namespace OpenSubdiv {
@@ -92,26 +93,32 @@ public:
     /// Destructor. note that the GL context must be made current.
     ~GLComputeEvaluator();
 
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Stencil evaluations with StencilTable
+    ///
+    /// ----------------------------------------------------------------------
+
     /// \brief Generic static compute function. This function has a same
     ///        signature as other device kernels have so that it can be called
     ///        transparently from OsdMesh template interface.
     ///
     /// @param srcBuffer      Input primvar buffer.
     ///                       must have BindVBO() method returning a
-    ///                       const float pointer for read
+    ///                       GL buffer object of source data
     ///
     /// @param srcDesc        vertex buffer descriptor for the input buffer
     ///
     /// @param dstBuffer      Output primvar buffer
     ///                       must have BindVBO() method returning a
-    ///                       float pointer for write
+    ///                       GL buffer object of destination data
     ///
     /// @param dstDesc        vertex buffer descriptor for the output buffer
     ///
     /// @param stencilTable   stencil table to be applied. The table must have
     ///                       SSBO interfaces.
     ///
-    /// @param evaluator      cached compiled instance. Clients are supposed to
+    /// @param instance       cached compiled instance. Clients are supposed to
     ///                       pre-compile an instance of this class and provide
     ///                       to this function. If it's null the kernel still
     ///                       compute by instantiating on-demand kernel although
@@ -119,25 +126,25 @@ public:
     ///
     /// @param deviceContext  not used in the GLSL kernel
     ///
-    template <typename VERTEX_BUFFER, typename STENCIL_TABLE>
-    static bool EvalStencils(VERTEX_BUFFER *srcVertexBuffer,
-                             VertexBufferDescriptor const &srcDesc,
-                             VERTEX_BUFFER *dstVertexBuffer,
-                             VertexBufferDescriptor const &dstDesc,
-                             STENCIL_TABLE const *stencilTable,
-                             GLComputeEvaluator const *instance,
-                             void * deviceContext = NULL) {
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable,
+        GLComputeEvaluator const *instance,
+        void * deviceContext = NULL) {
+
         if (instance) {
-            return instance->EvalStencils(srcVertexBuffer, srcDesc,
-                                          dstVertexBuffer, dstDesc,
+            return instance->EvalStencils(srcBuffer, srcDesc,
+                                          dstBuffer, dstDesc,
                                           stencilTable);
         } else {
             // Create a kernel on demand (slow)
             (void)deviceContext;  // unused
             instance = Create(srcDesc, dstDesc);
             if (instance) {
-                bool r = instance->EvalStencils(srcVertexBuffer, srcDesc,
-                                                dstVertexBuffer, dstDesc,
+                bool r = instance->EvalStencils(srcBuffer, srcDesc,
+                                                dstBuffer, dstDesc,
                                                 stencilTable);
                 delete instance;
                 return r;
@@ -148,15 +155,14 @@ public:
 
     /// Dispatch the GLSL compute kernel on GPU asynchronously.
     /// returns false if the kernel hasn't been compiled yet.
-    template <typename VERTEX_BUFFER, typename STENCIL_TABLE>
-    bool EvalStencils(VERTEX_BUFFER *srcVertexBuffer,
-                      VertexBufferDescriptor const &srcDesc,
-                      VERTEX_BUFFER *dstVertexBuffer,
-                      VertexBufferDescriptor const &dstDesc,
-                      STENCIL_TABLE const *stencilTable) const {
-        return EvalStencils(srcVertexBuffer->BindVBO(),
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable) const {
+        return EvalStencils(srcBuffer->BindVBO(),
                             srcDesc,
-                            dstVertexBuffer->BindVBO(),
+                            dstBuffer->BindVBO(),
                             dstDesc,
                             stencilTable->GetSizesBuffer(),
                             stencilTable->GetOffsetsBuffer(),
@@ -168,10 +174,8 @@ public:
 
     /// Dispatch the GLSL compute kernel on GPU asynchronously.
     /// returns false if the kernel hasn't been compiled yet.
-    bool EvalStencils(GLuint srcBuffer,
-                      VertexBufferDescriptor const &srcDesc,
-                      GLuint dstBuffer,
-                      VertexBufferDescriptor const &dstDesc,
+    bool EvalStencils(GLuint srcBuffer, VertexBufferDescriptor const &srcDesc,
+                      GLuint dstBuffer, VertexBufferDescriptor const &dstDesc,
                       GLuint sizesBuffer,
                       GLuint offsetsBuffer,
                       GLuint indicesBuffer,
@@ -179,6 +183,271 @@ public:
                       int start,
                       int end) const;
 
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Limit evaluations with PatchTable
+    ///
+    /// ----------------------------------------------------------------------
+    ///
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  not used in the GLXFB evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        GLComputeEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        if (instance) {
+            return instance->EvalPatches(srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc);
+            if (instance) {
+                bool r = instance->EvalPatches(srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer
+    ///
+    /// @param duDesc
+    ///
+    /// @param dvBuffer
+    ///
+    /// @param dvDesc
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  not used in the GLXFB evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        GLComputeEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        if (instance) {
+            return instance->EvalPatches(srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         duBuffer, duDesc,
+                                         dvBuffer, dvDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc);
+            if (instance) {
+                bool r = instance->EvalPatches(srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               duBuffer, duDesc,
+                                               dvBuffer, dvDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBOBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable) const {
+
+        return EvalPatches(srcBuffer->BindVBO(), srcDesc,
+                           dstBuffer->BindVBO(), dstDesc,
+                           0, VertexBufferDescriptor(),
+                           0, VertexBufferDescriptor(),
+                           numPatchCoords,
+                           patchCoords->BindVBO(),
+                           patchTable->GetPatchArrays(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function with derivatives. This function has
+    ///        a same signature as other device kernels have so that it can be
+    ///        called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindVBO() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindVBO() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output U-derivatives buffer
+    ///                         must have BindVBO() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output V-derivatives buffer
+    ///                         must have BindVBO() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       GLPatchTable or equivalent
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable) const {
+
+        return EvalPatches(srcBuffer->BindVBO(), srcDesc,
+                           dstBuffer->BindVBO(), dstDesc,
+                           duBuffer->BindVBO(),  duDesc,
+                           dvBuffer->BindVBO(),  dvDesc,
+                           numPatchCoords,
+                           patchCoords->BindVBO(),
+                           patchTable->GetPatchArrays(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    bool EvalPatches(GLuint srcBuffer, VertexBufferDescriptor const &srcDesc,
+                     GLuint dstBuffer, VertexBufferDescriptor const &dstDesc,
+                     GLuint duBuffer, VertexBufferDescriptor const &duDesc,
+                     GLuint dvBuffer, VertexBufferDescriptor const &dvDesc,
+                     int numPatchCoords,
+                     GLuint patchCoordsBuffer,
+                     const PatchArrayVector &patchArrays,
+                     GLuint patchIndexBuffer,
+                     GLuint patchParamsBuffer) const;
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Other methods
+    ///
+    /// ----------------------------------------------------------------------
+
     /// Configure GLSL kernel. A valid GL context must be made current before
     /// calling this function. Returns false if it fails to compile the kernel.
     bool Compile(VertexBufferDescriptor const &srcDesc,
@@ -188,18 +457,27 @@ public:
     static void Synchronize(void *deviceContext);
 
 private:
-    GLuint _program;
+    struct _StencilKernel {
+        GLuint program;
+        GLuint uniformSizes;
+        GLuint uniformOffsets;
+        GLuint uniformIndices;
+        GLuint uniformWeights;
+        GLuint uniformStart;
+        GLuint uniformEnd;
+        GLuint uniformSrcOffset;
+        GLuint uniformDstOffset;
+    } _stencilKernel;
 
-    GLuint _uniformSizes,        // stencil table
-           _uniformOffsets,
-           _uniformIndices,
-           _uniformWeights,
+    struct _PatchKernel {
+        GLuint program;
+        GLuint uniformSrcOffset;
+        GLuint uniformDstOffset;
+        GLuint uniformPatchArray;
+        GLuint uniformDuDesc;
+        GLuint uniformDvDesc;
 
-           _uniformStart,        // range
-           _uniformEnd,
-
-           _uniformSrcOffset,    // src buffer offset (in elements)
-           _uniformDstOffset;    // dst buffer offset (in elements)
+    } _patchKernel;
 
     int _workGroupSize;
 };
diff --git a/opensubdiv/osd/glPatchTable.cpp b/opensubdiv/osd/glPatchTable.cpp
index 986133d3..aa9ba90d 100644
--- a/opensubdiv/osd/glPatchTable.cpp
+++ b/opensubdiv/osd/glPatchTable.cpp
@@ -26,6 +26,7 @@
 
 #include "../far/patchTable.h"
 #include "../osd/opengl.h"
+#include "../osd/cpuPatchTable.h"
 
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
@@ -33,11 +34,14 @@ namespace OPENSUBDIV_VERSION {
 namespace Osd {
 
 GLPatchTable::GLPatchTable() :
-    _indexBuffer(0), _patchParamTexture(0) {
+    _patchIndexBuffer(0), _patchParamBuffer(0),
+    _patchIndexTexture(0), _patchParamTexture(0) {
 }
 
 GLPatchTable::~GLPatchTable() {
-    if (_indexBuffer) glDeleteBuffers(1, &_indexBuffer);
+    if (_patchIndexBuffer) glDeleteBuffers(1, &_patchIndexBuffer);
+    if (_patchParamBuffer) glDeleteBuffers(1, &_patchParamBuffer);
+    if (_patchIndexTexture) glDeleteTextures(1, &_patchIndexTexture);
     if (_patchParamTexture) glDeleteTextures(1, &_patchParamTexture);
 }
 
@@ -52,84 +56,56 @@ GLPatchTable::Create(Far::PatchTable const *farPatchTable,
 
 bool
 GLPatchTable::allocate(Far::PatchTable const *farPatchTable) {
-    glGenBuffers(1, &_indexBuffer);
+    glGenBuffers(1, &_patchIndexBuffer);
+    glGenBuffers(1, &_patchParamBuffer);
 
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, _indexBuffer);
-    std::vector<int> buffer;
-    std::vector<unsigned int> ppBuffer;
+    CpuPatchTable patchTable(farPatchTable);
 
-    // needs reserve?
+    size_t numPatchArrays = patchTable.GetNumPatchArrays();
+    GLsizei indexSize = (GLsizei)patchTable.GetPatchIndexSize();
+    GLsizei patchParamSize = (GLsizei)patchTable.GetPatchParamSize();
 
-    int nPatchArrays = farPatchTable->GetNumPatchArrays();
-
-    // for each patchArray
-    for (int j = 0; j < nPatchArrays; ++j) {
-        PatchArray patchArray(farPatchTable->GetPatchArrayDescriptor(j),
-                              farPatchTable->GetNumPatches(j),
-                              (int)buffer.size(),
-                              (int)ppBuffer.size()/3);
-        _patchArrays.push_back(patchArray);
-
-        // indices
-        Far::ConstIndexArray indices = farPatchTable->GetPatchArrayVertices(j);
-        for (int k = 0; k < indices.size(); ++k) {
-            buffer.push_back(indices[k]);
-        }
-
-        // patchParams
-#if 0
-        // XXX: we need sharpness interface for patcharray or put sharpness
-        //      into patchParam.
-        Far::ConstPatchParamArray patchParams =
-            farPatchTable->GetPatchParams(j);
-        for (int k = 0; k < patchParams.size(); ++k) {
-            float sharpness = 0.0;
-            ppBuffer.push_back(patchParams[k].faceIndex);
-            ppBuffer.push_back(patchParams[k].bitField.field);
-            ppBuffer.push_back(*((unsigned int *)&sharpness));
-        }
-#else
-        // XXX: workaround. GetPatchParamTable() will be deprecated though.
-        Far::PatchParamTable const & patchParamTable =
-            farPatchTable->GetPatchParamTable();
-        std::vector<Far::Index> const &sharpnessIndexTable =
-            farPatchTable->GetSharpnessIndexTable();
-        int numPatches = farPatchTable->GetNumPatches(j);
-        for (int k = 0; k < numPatches; ++k) {
-            float sharpness = 0.0;
-            int patchIndex = (int)ppBuffer.size()/3;
-            if (patchIndex < (int)sharpnessIndexTable.size()) {
-                int sharpnessIndex = sharpnessIndexTable[patchIndex];
-                if (sharpnessIndex >= 0)
-                    sharpness = farPatchTable->GetSharpnessValues()[sharpnessIndex];
-            }
-            ppBuffer.push_back(patchParamTable[patchIndex].faceIndex);
-            ppBuffer.push_back(patchParamTable[patchIndex].bitField.field);
-            ppBuffer.push_back(*((unsigned int *)&sharpness));
-        }
-#endif
-    }
+    // copy patch array
+    _patchArrays.insert(_patchArrays.end(),
+                        patchTable.GetPatchArrayBuffer(),
+                        patchTable.GetPatchArrayBuffer() + numPatchArrays);
 
+    // copy index buffer
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, _patchIndexBuffer);
     glBufferData(GL_ELEMENT_ARRAY_BUFFER,
-                 (int)buffer.size()*sizeof(int), &buffer[0], GL_STATIC_DRAW);
-
-    // patchParam is currently expected to be texture (it can be SSBO)
-    GLuint texBuffer = 0;
-    glGenBuffers(1, &texBuffer);
-    glBindBuffer(GL_ARRAY_BUFFER, texBuffer);
-    glBufferData(GL_ARRAY_BUFFER, ppBuffer.size()*sizeof(unsigned int),
-                 &ppBuffer[0], GL_STATIC_DRAW);
-
-    glGenTextures(1, &_patchParamTexture);
-    glBindTexture(GL_TEXTURE_BUFFER, _patchParamTexture);
-    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGB32I, texBuffer);
-    glBindTexture(GL_TEXTURE_BUFFER, 0);
-
-    glDeleteBuffers(1, &texBuffer);
-
+                 indexSize * sizeof(GLint),
+                 patchTable.GetPatchIndexBuffer(),
+                 GL_STATIC_DRAW);
     glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+
+    // copy patchparam buffer
+    glBindBuffer(GL_ARRAY_BUFFER, _patchParamBuffer);
+    glBufferData(GL_ARRAY_BUFFER,
+                 patchParamSize * sizeof(PatchParam),
+                 patchTable.GetPatchParamBuffer(),
+                 GL_STATIC_DRAW);
     glBindBuffer(GL_ARRAY_BUFFER, 0);
 
+    // make both buffer as texture buffers too.
+    glGenTextures(1, &_patchIndexTexture);
+    glGenTextures(1, &_patchParamTexture);
+
+    GLuint buffer;
+    glGenBuffers(1, &buffer);
+    glBindBuffer(GL_ARRAY_BUFFER, buffer);
+    glBufferData(GL_ARRAY_BUFFER,
+                 indexSize * sizeof(GLint),
+                 patchTable.GetPatchIndexBuffer(),
+                 GL_STATIC_DRAW);
+
+    glBindTexture(GL_TEXTURE_BUFFER, _patchIndexTexture);
+//    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32I, _patchIndexBuffer);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32I, buffer);
+
+    glBindTexture(GL_TEXTURE_BUFFER, _patchParamTexture);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGB32I, _patchParamBuffer);
+    glBindTexture(GL_TEXTURE_BUFFER, 0);
+
     return true;
 }
 
diff --git a/opensubdiv/osd/glPatchTable.h b/opensubdiv/osd/glPatchTable.h
index 4fbf1b0b..e7e67861 100644
--- a/opensubdiv/osd/glPatchTable.h
+++ b/opensubdiv/osd/glPatchTable.h
@@ -27,10 +27,9 @@
 
 #include "../version.h"
 
-#include <vector>
-#include "../far/patchDescriptor.h"
 #include "../osd/nonCopyable.h"
 #include "../osd/opengl.h"
+#include "../osd/types.h"
 
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
@@ -45,34 +44,6 @@ class GLPatchTable : private NonCopyable<GLPatchTable> {
 public:
     typedef GLuint VertexBufferBinding;
 
-    // XXX: this struct will be further refactored.
-    class PatchArray {
-    public:
-        PatchArray(Far::PatchDescriptor desc, int numPatches,
-                   int indexBase, int primitiveIdBase) :
-            desc(desc), numPatches(numPatches), indexBase(indexBase),
-            primitiveIdBase(primitiveIdBase) {}
-        Far::PatchDescriptor const &GetDescriptor() const {
-            return desc;
-        }
-        int GetNumPatches() const {
-            return numPatches;
-        }
-        int GetIndexBase() const {
-            return indexBase;
-        }
-        int GetPrimitiveIdBase() const {
-            return primitiveIdBase;
-        }
-    private:
-        Far::PatchDescriptor desc;
-        int numPatches;
-        int indexBase;        // an offset within the index buffer
-        int primitiveIdBase;  // an offset within the patch param buffer
-    };
-    typedef std::vector<PatchArray> PatchArrayVector;
-
-    GLPatchTable();
     ~GLPatchTable();
 
     static GLPatchTable *Create(Far::PatchTable const *farPatchTable,
@@ -84,7 +55,17 @@ public:
 
     /// Returns the GL index buffer containing the patch control vertices
     GLuint GetPatchIndexBuffer() const {
-        return _indexBuffer;
+        return _patchIndexBuffer;
+    }
+
+    /// Returns the GL index buffer containing the patch parameter
+    GLuint GetPatchParamBuffer() const {
+        return _patchParamBuffer;
+    }
+
+    /// Returns the GL texture buffer containing the patch control vertices
+    GLuint GetPatchIndexTextureBuffer() const {
+        return _patchIndexTexture;
     }
 
     /// Returns the GL texture buffer containing the patch parameter
@@ -93,11 +74,17 @@ public:
     }
 
 protected:
+    GLPatchTable();
+
     // allocate buffers from patchTable
     bool allocate(Far::PatchTable const *farPatchTable);
 
     PatchArrayVector _patchArrays;
-    GLuint _indexBuffer;
+
+    GLuint _patchIndexBuffer;
+    GLuint _patchParamBuffer;
+
+    GLuint _patchIndexTexture;
     GLuint _patchParamTexture;
 };
 
diff --git a/opensubdiv/osd/glXFBEvaluator.cpp b/opensubdiv/osd/glXFBEvaluator.cpp
index 807a7ec6..6ff3c8d9 100644
--- a/opensubdiv/osd/glXFBEvaluator.cpp
+++ b/opensubdiv/osd/glXFBEvaluator.cpp
@@ -108,37 +108,36 @@ GLStencilTableTBO::~GLStencilTableTBO() {
 // ---------------------------------------------------------------------------
 
 
-GLXFBEvaluator::GLXFBEvaluator() :
-    _program(0), _srcBufferTexture(0),
-    _uniformSrcBufferTexture(0), _uniformSizesTexture(0),
-    _uniformOffsetsTexture(0), _uniformIndicesTexture(0),
-    _uniformWeightsTexture(0), _uniformStart(0), _uniformEnd(0),
-    _uniformSrcOffset(0) {
+GLXFBEvaluator::GLXFBEvaluator() : _srcBufferTexture(0) {
+    memset (&_stencilKernel, 0, sizeof(_stencilKernel));
+    memset (&_patchKernel, 0, sizeof(_patchKernel));
 }
 
 GLXFBEvaluator::~GLXFBEvaluator() {
-    if (_program) {
-        glDeleteProgram(_program);
+    if (_stencilKernel.program) {
+        glDeleteProgram(_stencilKernel.program);
+    }
+    if (_patchKernel.program) {
+        glDeleteProgram(_patchKernel.program);
     }
     if (_srcBufferTexture) {
         glDeleteTextures(1, &_srcBufferTexture);
     }
 }
 
-bool
-GLXFBEvaluator::Compile(VertexBufferDescriptor const &srcDesc,
-                        VertexBufferDescriptor const &dstDesc) {
-    if (_program) {
-        glDeleteProgram(_program);
-        _program = 0;
-    }
-    _program = glCreateProgram();
+static GLuint
+compileKernel(VertexBufferDescriptor const &srcDesc,
+              VertexBufferDescriptor const &dstDesc,
+              const char *kernelDefine) {
+
+    GLuint program = glCreateProgram();
 
     GLuint shader = glCreateShader(GL_VERTEX_SHADER);
 
     std::ostringstream defines;
     defines << "#define LENGTH " << srcDesc.length << "\n"
-            << "#define SRC_STRIDE " << srcDesc.stride << "\n";
+            << "#define SRC_STRIDE " << srcDesc.stride << "\n"
+            << kernelDefine << "\n";
     std::string defineStr = defines.str();
 
     const char *shaderSources[3] = {"#version 410\n", NULL, NULL};
@@ -147,7 +146,7 @@ GLXFBEvaluator::Compile(VertexBufferDescriptor const &srcDesc,
     shaderSources[2] = shaderSource;
     glShaderSource(shader, 3, shaderSources, NULL);
     glCompileShader(shader);
-    glAttachShader(_program, shader);
+    glAttachShader(program, shader);
 
     std::vector<std::string> outputs;
     std::vector<const char *> pOutputs;
@@ -183,40 +182,83 @@ GLXFBEvaluator::Compile(VertexBufferDescriptor const &srcDesc,
         }
     }
 
-    glTransformFeedbackVaryings(_program, (GLsizei)outputs.size(),
+    glTransformFeedbackVaryings(program, (GLsizei)outputs.size(),
                                 &pOutputs[0], GL_INTERLEAVED_ATTRIBS);
 
     GLint linked = 0;
-    glLinkProgram(_program);
-    glGetProgramiv(_program, GL_LINK_STATUS, &linked);
+    glLinkProgram(program);
+    glGetProgramiv(program, GL_LINK_STATUS, &linked);
 
     if (linked == GL_FALSE) {
         char buffer[1024];
         glGetShaderInfoLog(shader, 1024, NULL, buffer);
         Far::Error(Far::FAR_RUNTIME_ERROR, buffer);
 
-        glGetProgramInfoLog(_program, 1024, NULL, buffer);
+        glGetProgramInfoLog(program, 1024, NULL, buffer);
         Far::Error(Far::FAR_RUNTIME_ERROR, buffer);
 
-        glDeleteProgram(_program);
-        _program = 0;
-        return false;
+        glDeleteProgram(program);
+        program = 0;
     }
 
     glDeleteShader(shader);
 
-    // set uniform locations for compute kernels
-    _uniformSrcBufferTexture  = glGetUniformLocation(_program, "vertexBuffer");
+    return program;
+}
 
-    _uniformSizesTexture   = glGetUniformLocation(_program, "sizes");
-    _uniformOffsetsTexture = glGetUniformLocation(_program, "offsets");
-    _uniformIndicesTexture = glGetUniformLocation(_program, "indices");
-    _uniformWeightsTexture = glGetUniformLocation(_program, "weights");
+bool
+GLXFBEvaluator::Compile(VertexBufferDescriptor const &srcDesc,
+                        VertexBufferDescriptor const &dstDesc) {
 
-    _uniformStart   = glGetUniformLocation(_program, "batchStart");
-    _uniformEnd     = glGetUniformLocation(_program, "batchEnd");
+    // create stencil kernel
+    if (_stencilKernel.program) {
+        glDeleteProgram(_stencilKernel.program);
+    }
+    _stencilKernel.program = compileKernel(
+        srcDesc, dstDesc,
+        "#define OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_STENCILS");
+    if (_stencilKernel.program == 0) return false;
 
-    _uniformSrcOffset = glGetUniformLocation(_program, "srcOffset");
+    // cache uniform locations
+    _stencilKernel.uniformSrcBufferTexture
+        = glGetUniformLocation(_stencilKernel.program, "vertexBuffer");
+    _stencilKernel.uniformSrcOffset
+        = glGetUniformLocation(_stencilKernel.program, "srcOffset");
+
+    _stencilKernel.uniformSizesTexture
+        = glGetUniformLocation(_stencilKernel.program, "sizes");
+    _stencilKernel.uniformOffsetsTexture
+        = glGetUniformLocation(_stencilKernel.program, "offsets");
+    _stencilKernel.uniformIndicesTexture
+        = glGetUniformLocation(_stencilKernel.program, "indices");
+    _stencilKernel.uniformWeightsTexture
+        = glGetUniformLocation(_stencilKernel.program, "weights");
+    _stencilKernel.uniformStart
+        = glGetUniformLocation(_stencilKernel.program, "batchStart");
+    _stencilKernel.uniformEnd
+        = glGetUniformLocation(_stencilKernel.program, "batchEnd");
+
+    // create patch kernel
+    if (_patchKernel.program) {
+        glDeleteProgram(_patchKernel.program);
+    }
+    _patchKernel.program = compileKernel(
+        srcDesc, dstDesc,
+        "#define OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_PATCHES");
+    if (_patchKernel.program == 0) return false;
+
+    // cache uniform locations
+    _patchKernel.uniformSrcBufferTexture
+        = glGetUniformLocation(_patchKernel.program, "vertexBuffer");
+    _patchKernel.uniformSrcOffset
+        = glGetUniformLocation(_patchKernel.program, "srcOffset");
+
+    _patchKernel.uniformPatchArray
+        = glGetUniformLocation(_patchKernel.program, "patchArray");
+    _patchKernel.uniformPatchParamTexture
+        = glGetUniformLocation(_patchKernel.program, "patchParamBuffer");
+    _patchKernel.uniformPatchIndexTexture
+        = glGetUniformLocation(_patchKernel.program, "patchIndexBuffer");
 
     // create a texture for input buffer
     if (!_srcBufferTexture) {
@@ -255,7 +297,7 @@ GLXFBEvaluator::EvalStencils(GLuint srcBuffer,
                              GLuint weightsTexture,
                              int start,
                              int end) const {
-    if (!_program) return false;
+    if (!_stencilKernel.program) return false;
     int count = end - start;
     if (count <= 0) {
         return true;
@@ -268,25 +310,25 @@ GLXFBEvaluator::EvalStencils(GLuint srcBuffer,
     glBindVertexArray(vao);
 
     glEnable(GL_RASTERIZER_DISCARD);
-    glUseProgram(_program);
+    glUseProgram(_stencilKernel.program);
 
     // Set input VBO as a texture buffer.
     glBindTexture(GL_TEXTURE_BUFFER, _srcBufferTexture);
     glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, srcBuffer);
     glBindTexture(GL_TEXTURE_BUFFER, 0);
 
-    bindTexture(_uniformSrcBufferTexture, _srcBufferTexture, 0);
+    bindTexture(_stencilKernel.uniformSrcBufferTexture, _srcBufferTexture, 0);
 
     // bind stencil table textures.
-    bindTexture(_uniformSizesTexture,   sizesTexture, 1);
-    bindTexture(_uniformOffsetsTexture, offsetsTexture, 2);
-    bindTexture(_uniformIndicesTexture, indicesTexture, 3);
-    bindTexture(_uniformWeightsTexture, weightsTexture, 4);
+    bindTexture(_stencilKernel.uniformSizesTexture,   sizesTexture, 1);
+    bindTexture(_stencilKernel.uniformOffsetsTexture, offsetsTexture, 2);
+    bindTexture(_stencilKernel.uniformIndicesTexture, indicesTexture, 3);
+    bindTexture(_stencilKernel.uniformWeightsTexture, weightsTexture, 4);
 
     // set batch range
-    glUniform1i(_uniformStart,     start);
-    glUniform1i(_uniformEnd,       end);
-    glUniform1i(_uniformSrcOffset, srcDesc.offset);
+    glUniform1i(_stencilKernel.uniformStart,     start);
+    glUniform1i(_stencilKernel.uniformEnd,       end);
+    glUniform1i(_stencilKernel.uniformSrcOffset, srcDesc.offset);
 
     // The destination buffer is bound at vertex boundary.
     //
@@ -347,6 +389,94 @@ GLXFBEvaluator::EvalStencils(GLuint srcBuffer,
     return true;
 }
 
+bool
+GLXFBEvaluator::EvalPatches(
+    GLuint srcBuffer, VertexBufferDescriptor const &srcDesc,
+    GLuint dstBuffer, VertexBufferDescriptor const &dstDesc,
+    GLuint duBuffer, VertexBufferDescriptor const & /*duDesc*/,
+    GLuint dvBuffer, VertexBufferDescriptor const & /*dvDesc*/,
+    int numPatchCoords,
+    GLuint patchCoordsBuffer,
+    const PatchArrayVector &patchArrays,
+    GLuint patchIndexTexture,
+    GLuint patchParamTexture) const {
+    if (!_patchKernel.program) return false;
+
+    if (duBuffer != 0 || dvBuffer != 0) {
+        Far::Error(Far::FAR_RUNTIME_ERROR,
+                   "GLXFBEvaluator doesn't support derivative evaluation yet.\n");
+    }
+
+    // bind vertex array
+    // always create new one, to be safe with multiple contexts (slow though)
+    GLuint vao = 0;
+    glGenVertexArrays(1, &vao);
+    glBindVertexArray(vao);
+
+    glEnable(GL_RASTERIZER_DISCARD);
+    glUseProgram(_patchKernel.program);
+
+    // Set input VBO as a texture buffer.
+    glBindTexture(GL_TEXTURE_BUFFER, _srcBufferTexture);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, srcBuffer);
+    glBindTexture(GL_TEXTURE_BUFFER, 0);
+
+    bindTexture(_patchKernel.uniformSrcBufferTexture, _srcBufferTexture, 0);
+
+    // bind patch index and patch param textures.
+    bindTexture(_patchKernel.uniformPatchParamTexture, patchParamTexture, 1);
+    bindTexture(_patchKernel.uniformPatchIndexTexture, patchIndexTexture, 2);
+
+    // set other uniforms
+    glUniform4iv(_patchKernel.uniformPatchArray, (int)patchArrays.size(),
+                 (const GLint*)&patchArrays[0]);
+    glUniform1i(_patchKernel.uniformSrcOffset, srcDesc.offset);
+
+    // input patchcoords
+    glEnableVertexAttribArray(0);
+    glEnableVertexAttribArray(1);
+    int stride = sizeof(int) * 5; // patchcoord = int*5 struct
+    glBindBuffer(GL_ARRAY_BUFFER, patchCoordsBuffer);
+    glVertexAttribIPointer(0, 3, GL_UNSIGNED_INT, stride, (void*)0);
+    glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, stride, (void*)(sizeof(int)*3));
+
+    int dstBufferBindOffset =
+        dstDesc.offset - (dstDesc.offset % dstDesc.stride);
+
+    // bind destination buffer
+    glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
+                      0, dstBuffer,
+                      dstBufferBindOffset * sizeof(float),
+                      numPatchCoords * dstDesc.stride * sizeof(float));
+
+    glBeginTransformFeedback(GL_POINTS);
+    glDrawArrays(GL_POINTS, 0, numPatchCoords);
+    glEndTransformFeedback();
+
+    glBindBuffer(GL_TRANSFORM_FEEDBACK_BUFFER, 0);
+
+    // unbind textures
+    for (int i = 0; i < 3; ++i) {
+        glActiveTexture(GL_TEXTURE0 + i);
+        glBindTexture(GL_TEXTURE_BUFFER, 0);
+    }
+
+    glDisable(GL_RASTERIZER_DISCARD);
+    glUseProgram(0);
+    glActiveTexture(GL_TEXTURE0);
+
+    glDisableVertexAttribArray(0);
+    glDisableVertexAttribArray(1);
+
+    // revert vao
+    glBindVertexArray(0);
+    glDeleteVertexArrays(1, &vao);
+
+
+    return true;
+}
+
+
 }  // end namespace Osd
 
 }  // end namespace OPENSUBDIV_VERSION
diff --git a/opensubdiv/osd/glXFBEvaluator.h b/opensubdiv/osd/glXFBEvaluator.h
index 65881670..e04d3bb7 100644
--- a/opensubdiv/osd/glXFBEvaluator.h
+++ b/opensubdiv/osd/glXFBEvaluator.h
@@ -28,6 +28,7 @@
 #include "../version.h"
 
 #include "../osd/opengl.h"
+#include "../osd/types.h"
 #include "../osd/vertexDescriptor.h"
 
 namespace OpenSubdiv {
@@ -93,6 +94,12 @@ public:
     /// Destructor. note that the GL context must be made current.
     ~GLXFBEvaluator();
 
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Stencil evaluations with StencilTable
+    ///
+    /// ----------------------------------------------------------------------
+
     /// \brief Generic static stencil function. This function has a same
     ///        signature as other device kernels have so that it can be called
     ///        transparently from OsdMesh template interface.
@@ -120,25 +127,25 @@ public:
     ///
     /// @param deviceContext  not used in the GLSLTransformFeedback kernel
     ///
-    template <typename VERTEX_BUFFER, typename STENCIL_TABLE>
-    static bool EvalStencils(VERTEX_BUFFER *srcVertexBuffer,
-                             VertexBufferDescriptor const &srcDesc,
-                             VERTEX_BUFFER *dstVertexBuffer,
-                             VertexBufferDescriptor const &dstDesc,
-                             STENCIL_TABLE const *stencilTable,
-                             GLXFBEvaluator const *instance,
-                             void * deviceContext = NULL) {
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable,
+        GLXFBEvaluator const *instance,
+        void * deviceContext = NULL) {
+
         if (instance) {
-            return instance->EvalStencils(srcVertexBuffer, srcDesc,
-                                          dstVertexBuffer, dstDesc,
+            return instance->EvalStencils(srcBuffer, srcDesc,
+                                          dstBuffer, dstDesc,
                                           stencilTable);
         } else {
             // Create an instance on demand (slow)
             (void)deviceContext;  // unused
             instance = Create(srcDesc, dstDesc);
             if (instance) {
-                bool r = instance->EvalStencils(srcVertexBuffer, srcDesc,
-                                                dstVertexBuffer, dstDesc,
+                bool r = instance->EvalStencils(srcBuffer, srcDesc,
+                                                dstBuffer, dstDesc,
                                                 stencilTable);
                 delete instance;
                 return r;
@@ -147,18 +154,30 @@ public:
         }
     }
 
-    /// Dispatch the GLSL compute kernel on GPU asynchronously.
-    /// returns false if the kernel hasn't been compiled yet.
-    template <typename VERTEX_BUFFER, typename STENCIL_TABLE>
-    bool EvalStencils(VERTEX_BUFFER *srcVertexBuffer,
-                      VertexBufferDescriptor const &srcDesc,
-                      VERTEX_BUFFER *dstVertexBuffer,
-                      VertexBufferDescriptor const &dstDesc,
-                      STENCIL_TABLE const *stencilTable) const {
-        return EvalStencils(srcVertexBuffer->BindVBO(),
-                            srcDesc,
-                            dstVertexBuffer->BindVBO(),
-                            dstDesc,
+    /// \brief dispatch eval stencils function.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object for destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param stencilTable   stencil table to be applied.
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable) const {
+
+        return EvalStencils(srcBuffer->BindVBO(), srcDesc,
+                            dstBuffer->BindVBO(), dstDesc,
                             stencilTable->GetSizesTexture(),
                             stencilTable->GetOffsetsTexture(),
                             stencilTable->GetIndicesTexture(),
@@ -167,12 +186,31 @@ public:
                             /* end   = */ stencilTable->GetNumStencils());
     }
 
-    /// Dispatch the GLSL compute kernel on GPU asynchronously.
-    /// returns false if the kernel hasn't been compiled yet.
-    bool EvalStencils(GLuint srcBuffer,
-                      VertexBufferDescriptor const &srcDesc,
-                      GLuint dstBuffer,
-                      VertexBufferDescriptor const &dstDesc,
+    /// \brief Static eval stencils function, dispatch the GLSL XFB kernel on
+    ///        on GPU asynchronously.
+    ///
+    /// @param srcBuffer      GL buffer of input primvars.
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      GL buffer of output primvars.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param sizesBuffer    GL buffer of the sizes in the stencil table
+    ///
+    /// @param offsetsBuffer  GL buffer of the offsets in the stencil table
+    ///
+    /// @param indicesBuffer  GL buffer of the indices in the stencil table
+    ///
+    /// @param weightsBuffer  GL buffer of the weifgrs in the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    bool EvalStencils(GLuint srcBuffer, VertexBufferDescriptor const &srcDesc,
+                      GLuint dstBuffer, VertexBufferDescriptor const &dstDesc,
                       GLuint sizesBuffer,
                       GLuint offsetsBuffer,
                       GLuint indicesBuffer,
@@ -180,6 +218,270 @@ public:
                       int start,
                       int end) const;
 
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Limit evaluations with PatchTable
+    ///
+    /// ----------------------------------------------------------------------
+    ///
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  not used in the GLXFB evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        GLXFBEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        if (instance) {
+            return instance->EvalPatches(srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc);
+            if (instance) {
+                bool r = instance->EvalPatches(srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of source data
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindVBO() method returning a GL
+    ///                       buffer object of destination data
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer
+    ///
+    /// @param duDesc
+    ///
+    /// @param dvBuffer
+    ///
+    /// @param dvDesc
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindVBO() method returning an
+    ///                       array of PatchCoord struct in VBO.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    /// @param instance       cached compiled instance. Clients are supposed to
+    ///                       pre-compile an instance of this class and provide
+    ///                       to this function. If it's null the kernel still
+    ///                       compute by instantiating on-demand kernel although
+    ///                       it may cause a performance problem.
+    ///
+    /// @param deviceContext  not used in the GLXFB evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        GLXFBEvaluator const *instance,
+        void * deviceContext = NULL) {
+
+        if (instance) {
+            return instance->EvalPatches(srcBuffer, srcDesc,
+                                         dstBuffer, dstDesc,
+                                         duBuffer, duDesc,
+                                         dvBuffer, dvDesc,
+                                         numPatchCoords, patchCoords,
+                                         patchTable);
+        } else {
+            // Create an instance on demand (slow)
+            (void)deviceContext;  // unused
+            instance = Create(srcDesc, dstDesc);
+            if (instance) {
+                bool r = instance->EvalPatches(srcBuffer, srcDesc,
+                                               dstBuffer, dstDesc,
+                                               duBuffer, duDesc,
+                                               dvBuffer, dvDesc,
+                                               numPatchCoords, patchCoords,
+                                               patchTable);
+                delete instance;
+                return r;
+            }
+            return false;
+        }
+    }
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCudaBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords number of patchCoords.
+    ///
+    /// @param patchCoords    array of locations to be evaluated.
+    ///                       must have BindCudaBuffer() method returning an
+    ///                       array of PatchCoord struct in cuda memory.
+    ///
+    /// @param patchTable     GLPatchTable or equivalent
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable) const {
+
+        return EvalPatches(srcBuffer->BindVBO(), srcDesc,
+                           dstBuffer->BindVBO(), dstDesc,
+                           0, VertexBufferDescriptor(),
+                           0, VertexBufferDescriptor(),
+                           numPatchCoords,
+                           patchCoords->BindVBO(),
+                           patchTable->GetPatchArrays(),
+                           patchTable->GetPatchIndexTextureBuffer(),
+                           patchTable->GetPatchParamTextureBuffer());
+    }
+
+    /// \brief Generic limit eval function with derivatives. This function has
+    ///        a same signature as other device kernels have so that it can be
+    ///        called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output s-derivatives buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output t-derivatives buffer
+    ///                         must have BindCudaBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       GLPatchTable or equivalent
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable) const {
+
+        return EvalPatches(srcBuffer->BindVBO(), srcDesc,
+                           dstBuffer->BindVBO(), dstDesc,
+                           duBuffer->BindVBO(),  duDesc,
+                           dvBuffer->BindVBO(),  dvDesc,
+                           numPatchCoords,
+                           patchCoords->BindVBO(),
+                           patchTable->GetPatchArrays(),
+                           patchTable->GetPatchIndexTextureBuffer(),
+                           patchTable->GetPatchParamTextureBuffer());
+    }
+
+    bool EvalPatches(GLuint srcBuffer, VertexBufferDescriptor const &srcDesc,
+                     GLuint dstBuffer, VertexBufferDescriptor const &dstDesc,
+                     GLuint duBuffer, VertexBufferDescriptor const &duDesc,
+                     GLuint dvBuffer, VertexBufferDescriptor const &dvDesc,
+                     int numPatchCoords,
+                     GLuint patchCoordsBuffer,
+                     const PatchArrayVector &patchArrays,
+                     GLuint patchIndexBuffer,
+                     GLuint patchParamsBuffer) const;
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Other methods
+    ///
+    /// ----------------------------------------------------------------------
+
     /// Configure GLSL kernel. A valid GL context must be made current before
     /// calling this function. Returns false if it fails to compile the kernel.
     bool Compile(VertexBufferDescriptor const &srcDesc,
@@ -189,19 +491,31 @@ public:
     static void Synchronize(void *kernel);
 
 private:
-    GLuint _program;
-
     GLuint _srcBufferTexture;
 
-    GLuint _uniformSrcBufferTexture;
-    GLuint _uniformSizesTexture;
-    GLuint _uniformOffsetsTexture;
-    GLuint _uniformIndicesTexture;
-    GLuint _uniformWeightsTexture;
+    struct _StencilKernel {
+        GLuint program;
+        GLuint uniformSrcBufferTexture;
+        GLuint uniformSrcOffset;    // src buffer offset (in elements)
+
+        GLuint uniformSizesTexture;
+        GLuint uniformOffsetsTexture;
+        GLuint uniformIndicesTexture;
+        GLuint uniformWeightsTexture;
+        GLuint uniformStart;     // range
+        GLuint uniformEnd;
+    } _stencilKernel;
+
+    struct _PatchKernel {
+        GLuint program;
+        GLuint uniformSrcBufferTexture;
+        GLuint uniformSrcOffset;    // src buffer offset (in elements)
+
+        GLuint uniformPatchArray;
+        GLuint uniformPatchParamTexture;
+        GLuint uniformPatchIndexTexture;
+    } _patchKernel;
 
-    GLuint _uniformStart;     // range
-    GLuint _uniformEnd;
-    GLuint _uniformSrcOffset;    // src buffer offset (in elements)
 };
 
 }  // end namespace Osd
diff --git a/opensubdiv/osd/glslComputeKernel.glsl b/opensubdiv/osd/glslComputeKernel.glsl
index 946a2773..617d4351 100644
--- a/opensubdiv/osd/glslComputeKernel.glsl
+++ b/opensubdiv/osd/glslComputeKernel.glsl
@@ -24,18 +24,10 @@
 
 //------------------------------------------------------------------------------
 
-uniform int batchStart = 0;
-uniform int batchEnd = 0;
 uniform int srcOffset = 0;
 uniform int dstOffset = 0;
-
 layout(binding=0) buffer src_buffer      { float    srcVertexBuffer[]; };
 layout(binding=1) buffer dst_buffer      { float    dstVertexBuffer[]; };
-layout(binding=2) buffer stencilSizes    { int      _sizes[];   };
-layout(binding=3) buffer stencilOffsets  { int      _offsets[]; };
-layout(binding=4) buffer stencilIndices  { int      _indices[]; };
-layout(binding=5) buffer stencilWeights  { float    _weights[]; };
-
 layout(local_size_x=WORK_GROUP_SIZE, local_size_y=1, local_size_z=1) in;
 
 //------------------------------------------------------------------------------
@@ -73,6 +65,15 @@ void addWithWeight(inout Vertex v, const Vertex src, float weight) {
 }
 
 //------------------------------------------------------------------------------
+#if defined(OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_STENCILS)
+
+uniform int batchStart = 0;
+uniform int batchEnd = 0;
+layout(binding=2) buffer stencilSizes    { int      _sizes[];   };
+layout(binding=3) buffer stencilOffsets  { int      _offsets[]; };
+layout(binding=4) buffer stencilIndices  { int      _indices[]; };
+layout(binding=5) buffer stencilWeights  { float    _weights[]; };
+
 void main() {
 
     int current = int(gl_GlobalInvocationID.x) + batchStart;
@@ -94,4 +95,194 @@ void main() {
     writeVertex(current, dst);
 }
 
+#endif
+
 //------------------------------------------------------------------------------
+#if defined(OPENSUBDIV_GLSL_COMPUTE_KERNEL_EVAL_PATCHES)
+
+// PERFORMANCE: stride could be constant, but not as significant as length
+
+//struct PatchArray {
+//    int patchType;
+//    int numPatches;
+//    int indexBase;        // an offset within the index buffer
+//    int primitiveIdBase;  // an offset within the patch param buffer
+//};
+// # of patcharrays is 1 or 2.
+
+uniform ivec4 patchArray[2];
+uniform ivec3 dstDuDesc;
+uniform ivec3 dstDvDesc;
+layout(binding=2) buffer du_buffer   { float dstDuBuffer[]; };
+layout(binding=3) buffer dv_buffer   { float dstDvBuffer[]; };
+
+struct PatchCoord {
+   int arrayIndex;
+   int patchIndex;
+   int vertIndex;
+   float s;
+   float t;
+};
+
+struct PatchParam {
+    int faceIndex;
+    uint patchBits;
+    float sharpness;
+};
+
+layout(binding=4) buffer patchCoord_buffer { PatchCoord patchCoords[]; };
+layout(binding=5) buffer patchIndex_buffer { int patchIndexBuffer[]; };
+layout(binding=6) buffer patchParam_buffer { PatchParam patchParamBuffer[]; };
+
+void writeDu(int index, Vertex du) {
+    int duIndex = dstDuDesc.x + index * dstDuDesc.z;
+    for (int i = 0; i < LENGTH; ++i) {
+        dstDuBuffer[duIndex + i] = du.vertexData[i];
+    }
+}
+
+void writeDv(int index, Vertex dv) {
+    int dvIndex = dstDvDesc.x + index * dstDvDesc.z;
+    for (int i = 0; i < LENGTH; ++i) {
+        dstDvBuffer[dvIndex + i] = dv.vertexData[i];
+    }
+}
+
+void getBSplineWeights(float t, inout vec4 point, inout vec4 deriv) {
+    // The four uniform cubic B-Spline basis functions evaluated at t:
+    float one6th = 1.0f / 6.0f;
+
+    float t2 = t * t;
+    float t3 = t * t2;
+
+    point.x = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
+    point.y = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
+    point.z = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
+    point.w = one6th * (                                 t3);
+
+    // Derivatives of the above four basis functions at t:
+    deriv.x = -0.5f*t2 +      t - 0.5f;
+    deriv.y =  1.5f*t2 - 2.0f*t;
+    deriv.z = -1.5f*t2 +      t + 0.5f;
+    deriv.w =  0.5f*t2;
+}
+
+uint getDepth(uint patchBits) {
+    return (patchBits & 0x7);
+}
+
+float getParamFraction(uint patchBits) {
+    uint nonQuadRoot = (patchBits >> 3) & 0x1;
+    uint depth = getDepth(patchBits);
+    if (nonQuadRoot == 1) {
+        return 1.0f / float( 1 << (depth-1) );
+    } else {
+        return 1.0f / float( 1 << depth );
+    }
+}
+
+vec2 normalizePatchCoord(uint patchBits, vec2 uv) {
+    float frac = getParamFraction(patchBits);
+
+    uint iu = (patchBits >> 22) & 0x3ff;
+    uint iv = (patchBits >> 12) & 0x3ff;
+
+    // top left corner
+    float pu = float(iu*frac);
+    float pv = float(iv*frac);
+
+    // normalize u,v coordinates
+    return vec2((uv.x - pu) / frac, (uv.y - pv) / frac);
+}
+
+void adjustBoundaryWeights(uint bits, inout vec4 sWeights, inout vec4 tWeights) {
+    uint boundary = ((bits >> 4) & 0xf);
+
+    if ((boundary & 1) != 0) {
+        tWeights[2] -= tWeights[0];
+        tWeights[1] += 2*tWeights[0];
+        tWeights[0] = 0;
+    }
+    if ((boundary & 2) != 0) {
+        sWeights[1] -= sWeights[3];
+        sWeights[2] += 2*sWeights[3];
+        sWeights[3] = 0;
+    }
+    if ((boundary & 4) != 0) {
+        tWeights[1] -= tWeights[3];
+        tWeights[2] += 2*tWeights[3];
+        tWeights[3] = 0;
+    }
+    if ((boundary & 8) != 0) {
+        sWeights[2] -= sWeights[0];
+        sWeights[1] += 2*sWeights[0];
+        sWeights[0] = 0;
+    }
+}
+
+void main() {
+
+    int current = int(gl_GlobalInvocationID.x);
+
+    PatchCoord coord = patchCoords[current];
+    int patchIndex = coord.patchIndex;
+
+    ivec4 array = patchArray[coord.arrayIndex];
+    int patchType = 6; // array.x XXX: REGULAR only for now.
+    int numControlVertices = 16;
+
+    uint patchBits = patchParamBuffer[patchIndex].patchBits;
+    vec2 uv = normalizePatchCoord(patchBits, vec2(coord.s, coord.t));
+    float dScale = float(1 << getDepth(patchBits));
+
+    float wP[20], wDs[20], wDt[20];
+    if (patchType == 6) {  // REGULAR
+        vec4 sWeights, tWeights, dsWeights, dtWeights;
+        getBSplineWeights(uv.x, sWeights, dsWeights);
+        getBSplineWeights(uv.y, tWeights, dtWeights);
+
+        adjustBoundaryWeights(patchBits, sWeights, tWeights);
+        adjustBoundaryWeights(patchBits, dsWeights, dtWeights);
+
+        for (int k = 0; k < 4; ++k) {
+            for (int l = 0; l < 4; ++l) {
+                wP[4*k+l]  = sWeights[l]  * tWeights[k];
+                wDs[4*k+l] = dsWeights[l] * tWeights[k]  * dScale;
+                wDt[4*k+l] = sWeights[l]  * dtWeights[k] * dScale;
+            }
+        }
+    } else {
+        // TODO: GREGORY BASIS
+    }
+
+    Vertex dst;
+    clear(dst);
+
+    int indexBase = array.z + coord.vertIndex;
+    for (int i = 0; i < numControlVertices; ++i) {
+        int index = patchIndexBuffer[indexBase + i];
+        addWithWeight(dst, readVertex(index), wP[i]);
+    }
+    writeVertex(current, dst);
+
+    if (dstDuDesc.y > 0) { // length
+        Vertex du;
+        clear(du);
+        for (int i = 0; i < numControlVertices; ++i) {
+            int index = patchIndexBuffer[indexBase + i];
+            addWithWeight(du, readVertex(index), wDs[i]);
+        }
+        writeDu(current, du);
+    }
+    if (dstDvDesc.y > 0) {
+        Vertex dv;
+        clear(dv);
+        for (int i = 0; i < numControlVertices; ++i) {
+            int index = patchIndexBuffer[indexBase + i];
+            addWithWeight(dv, readVertex(index), wDt[i]);
+        }
+        writeDv(current, dv);
+    }
+}
+
+#endif
diff --git a/opensubdiv/osd/glslXFBKernel.glsl b/opensubdiv/osd/glslXFBKernel.glsl
index caba7e2a..24200a5e 100644
--- a/opensubdiv/osd/glslXFBKernel.glsl
+++ b/opensubdiv/osd/glslXFBKernel.glsl
@@ -25,18 +25,8 @@
 //------------------------------------------------------------------------------
 
 uniform samplerBuffer vertexBuffer;
-
-out float outVertexBuffer[LENGTH];
-
-uniform usamplerBuffer sizes;
-uniform isamplerBuffer offsets;
-uniform isamplerBuffer indices;
-uniform samplerBuffer  weights;
-
-uniform int batchStart = 0;
-uniform int batchEnd = 0;
-
 uniform int srcOffset = 0;
+out float outVertexBuffer[LENGTH];
 
 //------------------------------------------------------------------------------
 
@@ -72,6 +62,16 @@ void writeVertex(Vertex v) {
 }
 
 //------------------------------------------------------------------------------
+
+#if defined(OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_STENCILS)
+
+uniform usamplerBuffer sizes;
+uniform isamplerBuffer offsets;
+uniform isamplerBuffer indices;
+uniform samplerBuffer  weights;
+uniform int batchStart = 0;
+uniform int batchEnd = 0;
+
 void main() {
 
     int current = gl_VertexID + batchStart;
@@ -97,4 +97,146 @@ void main() {
     writeVertex(dst);
 }
 
+#endif
+
 //------------------------------------------------------------------------------
+
+#if defined(OPENSUBDIV_GLSL_XFB_KERNEL_EVAL_PATCHES)
+
+layout (location = 0) in ivec3 patchHandles;
+layout (location = 1) in vec2  patchCoords;
+
+//struct PatchArray {
+//    int patchType;
+//    int numPatches;
+//    int indexBase;        // an offset within the index buffer
+//    int primitiveIdBase;  // an offset within the patch param buffer
+//};
+// # of patcharrays is 1 or 2.
+
+uniform ivec4 patchArray[2];
+uniform isamplerBuffer patchParamBuffer;
+uniform isamplerBuffer patchIndexBuffer;
+
+void getBSplineWeights(float t, inout vec4 point, vec4 deriv) {
+    // The four uniform cubic B-Spline basis functions evaluated at t:
+    float one6th = 1.0f / 6.0f;
+
+    float t2 = t * t;
+    float t3 = t * t2;
+
+    point.x = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
+    point.y = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
+    point.z = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
+    point.w = one6th * (                                 t3);
+
+    // Derivatives of the above four basis functions at t:
+    /* if (deriv) { */
+    /*     deriv[0] = -0.5f*t2 +      t - 0.5f; */
+    /*     deriv[1] =  1.5f*t2 - 2.0f*t; */
+    /*     deriv[2] = -1.5f*t2 +      t + 0.5f; */
+    /*     deriv[3] =  0.5f*t2; */
+    /* } */
+}
+
+uint getDepth(uint patchBits) {
+    return (patchBits & 0x7);
+}
+
+float getParamFraction(uint patchBits) {
+    uint nonQuadRoot = (patchBits >> 3) & 0x1;
+    uint depth = getDepth(patchBits);
+    if (nonQuadRoot == 1) {
+        return 1.0f / float( 1 << (depth-1) );
+    } else {
+        return 1.0f / float( 1 << depth );
+    }
+}
+
+vec2 normalizePatchCoord(uint patchBits, vec2 uv) {
+    float frac = getParamFraction(patchBits);
+
+    uint iu = (patchBits >> 22) & 0x3ff;
+    uint iv = (patchBits >> 12) & 0x3ff;
+
+    // top left corner
+    float pu = float(iu*frac);
+    float pv = float(iv*frac);
+
+    // normalize u,v coordinates
+    return vec2((uv.x - pu) / frac, (uv.y - pv) / frac);
+}
+
+void adjustBoundaryWeights(uint bits, inout vec4 sWeights, inout vec4 tWeights) {
+    uint boundary = ((bits >> 4) & 0xf);
+
+    if ((boundary & 1) != 0) {
+        tWeights[2] -= tWeights[0];
+        tWeights[1] += 2*tWeights[0];
+        tWeights[0] = 0;
+    }
+    if ((boundary & 2) != 0) {
+        sWeights[1] -= sWeights[3];
+        sWeights[2] += 2*sWeights[3];
+        sWeights[3] = 0;
+    }
+    if ((boundary & 4) != 0) {
+        tWeights[1] -= tWeights[3];
+        tWeights[2] += 2*tWeights[3];
+        tWeights[3] = 0;
+    }
+    if ((boundary & 8) != 0) {
+        sWeights[2] -= sWeights[0];
+        sWeights[1] += 2*sWeights[0];
+        sWeights[0] = 0;
+    }
+}
+
+void main() {
+    int current = gl_VertexID;
+
+    ivec3 handle = patchHandles;
+    int patchIndex = handle.y;
+
+    vec2 coord = patchCoords;
+    ivec4 array = patchArray[handle.x];
+    int patchType = array.x;
+    int numControlVertices = 16;
+
+    uint patchBits = texelFetch(patchParamBuffer, patchIndex).y;
+
+    // normalize
+    coord = normalizePatchCoord(patchBits, coord);
+
+    // XXX: dScale for derivative
+
+    // if regular
+    float wP[20];
+    {
+        vec4 sWeights, tWeights, dsWeights, dtWeights;
+        getBSplineWeights(coord.s, sWeights, dsWeights);
+        getBSplineWeights(coord.t, tWeights, dtWeights);
+
+        adjustBoundaryWeights(patchBits, sWeights, tWeights);
+
+        for (int k = 0; k < 4; ++k) {
+            for (int l = 0; l < 4; ++l) {
+                wP[4*k+l]  = sWeights[l]  * tWeights[k];
+            }
+        }
+    }
+
+    Vertex dst;
+    clear(dst);
+
+    int indexBase = array.z + handle.z;
+    for (int i = 0; i < numControlVertices; ++i) {
+        int index = texelFetch(patchIndexBuffer, indexBase + i).x;
+        addWithWeight(dst, readVertex(index), wP[i]);
+    }
+
+    writeVertex(dst);
+}
+
+#endif
+
diff --git a/opensubdiv/osd/mesh.h b/opensubdiv/osd/mesh.h
index 00646512..d8253f69 100644
--- a/opensubdiv/osd/mesh.h
+++ b/opensubdiv/osd/mesh.h
@@ -171,7 +171,12 @@ public:
 
         for(typename Evaluators::iterator it = _evaluators.begin();
             it != _evaluators.end(); ++it) {
-            if (it->srcDesc.length == srcDesc.length and
+            // Note: XFB kernel needs to be configured with the local offset
+            // of the dstDesc to skip preceding primvars.
+            int dstOffset1 = it->dstDesc.offset % it->dstDesc.stride;
+            int dstOffset2 = dstDesc.offset % dstDesc.stride;
+            if (dstOffset1 == dstOffset2 and
+                it->srcDesc.length == srcDesc.length and
                 it->srcDesc.stride == srcDesc.stride and
                 it->dstDesc.length == dstDesc.length and
                 it->dstDesc.stride == dstDesc.stride) {
diff --git a/opensubdiv/osd/ompEvaluator.cpp b/opensubdiv/osd/ompEvaluator.cpp
index 13d07e19..8f117db3 100644
--- a/opensubdiv/osd/ompEvaluator.cpp
+++ b/opensubdiv/osd/ompEvaluator.cpp
@@ -24,6 +24,7 @@
 
 #include "../osd/ompEvaluator.h"
 #include "../osd/ompKernel.h"
+#include "../far/patchBasis.h"
 #include <omp.h>
 
 namespace OpenSubdiv {
@@ -33,24 +34,212 @@ namespace Osd {
 
 /* static */
 bool
-OmpEvaluator::EvalStencils(const float *src,
-                           VertexBufferDescriptor const &srcDesc,
-                           float *dst,
-                           VertexBufferDescriptor const &dstDesc,
-                           const int * sizes,
-                           const int * offsets,
-                           const int * indices,
-                           const float * weights,
-                           int start, int end) {
-    if (end <= start) return true;
+OmpEvaluator::EvalStencils(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    const int * sizes,
+    const int * offsets,
+    const int * indices,
+    const float * weights,
+    int start, int end) {
 
-    // we can probably expand cpuKernel.cpp to here.
+    if (end <= start) return true;
+    if (srcDesc.length != dstDesc.length) return false;
+
+    // XXX: we can probably expand cpuKernel.cpp to here.
     OmpEvalStencils(src, srcDesc, dst, dstDesc,
                     sizes, offsets, indices, weights, start, end);
 
     return true;
 }
 
+/* static */
+bool
+OmpEvaluator::EvalStencils(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    float *du,        VertexBufferDescriptor const &duDesc,
+    float *dv,        VertexBufferDescriptor const &dvDesc,
+    const int * sizes,
+    const int * offsets,
+    const int * indices,
+    const float * weights,
+    const float * duWeights,
+    const float * dvWeights,
+    int start, int end) {
+
+    if (end <= start) return true;
+    if (srcDesc.length != dstDesc.length) return false;
+    if (srcDesc.length != duDesc.length) return false;
+    if (srcDesc.length != dvDesc.length) return false;
+
+    OmpEvalStencils(src, srcDesc,
+                    dst, dstDesc,
+                    du,  duDesc,
+                    dv,  dvDesc,
+                    sizes, offsets, indices,
+                    weights, duWeights, dvWeights,
+                    start, end);
+
+    return true;
+}
+
+template <typename T>
+struct BufferAdapter {
+    BufferAdapter(T *p, int length, int stride) :
+        _p(p), _length(length), _stride(stride) { }
+    void Clear() {
+        for (int i = 0; i < _length; ++i) _p[i] = 0;
+    }
+    void AddWithWeight(T const *src, float w) {
+        if (_p) {
+            // TODO: derivatives.
+            for (int i = 0; i < _length; ++i) {
+                _p[i] += src[i] * w;
+            }
+        }
+    }
+    const T *operator[] (int index) const {
+        return _p + _stride * index;
+    }
+    BufferAdapter<T> & operator ++() {
+        if (_p) {
+            _p += _stride;
+        }
+        return *this;
+    }
+
+    T *_p;
+    int _length;
+    int _stride;
+};
+
+/* static */
+bool
+OmpEvaluator::EvalPatches(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    int numPatchCoords,
+    const PatchCoord *patchCoords,
+    const PatchArray *patchArrays,
+    const int *patchIndexBuffer,
+    const PatchParam *patchParamBuffer){
+
+    src += srcDesc.offset;
+    if (dst) dst += dstDesc.offset;
+    else return false;
+    BufferAdapter<const float> srcT(src, srcDesc.length, srcDesc.stride);
+
+#pragma omp parallel for
+    for (int i = 0; i < numPatchCoords; ++i) {
+        BufferAdapter<float> dstT(dst + dstDesc.stride*i, dstDesc.length, dstDesc.stride);
+
+        float wP[20], wDs[20], wDt[20];
+        PatchCoord const &coord = patchCoords[i];
+        PatchArray const &array = patchArrays[coord.handle.arrayIndex];
+
+        int patchType = array.GetPatchType();
+        // XXX: patchIndex is absolute. not sure it's consistent.
+        //      (should be offsetted by array.primitiveIdBase?)
+        //    patchParamBuffer[array.primitiveIdBase + coord.handle.patchIndex]
+        Far::PatchParam::BitField patchBits = *(Far::PatchParam::BitField*)
+            &patchParamBuffer[coord.handle.patchIndex].patchBits;
+
+        int numControlVertices = 0;
+        if (patchType == Far::PatchDescriptor::REGULAR) {
+            Far::internal::GetBSplineWeights(patchBits,
+                                             coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 16;
+        } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+            Far::internal::GetGregoryWeights(patchBits,
+                                             coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 20;
+        } else if (patchType == Far::PatchDescriptor::QUADS) {
+            Far::internal::GetBilinearWeights(patchBits,
+                                              coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 4;
+        } else {
+            continue;
+        }
+        const int *cvs =
+            &patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+
+        dstT.Clear();
+        for (int j = 0; j < numControlVertices; ++j) {
+            dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
+        }
+    }
+    return true;
+}
+
+/* static */
+bool
+OmpEvaluator::EvalPatches(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    float *du,        VertexBufferDescriptor const &duDesc,
+    float *dv,        VertexBufferDescriptor const &dvDesc,
+    int numPatchCoords,
+    PatchCoord const *patchCoords,
+    PatchArray const *patchArrays,
+    const int *patchIndexBuffer,
+    PatchParam const *patchParamBuffer) {
+
+    src += srcDesc.offset;
+    if (dst) dst += dstDesc.offset;
+    if (du)  du += duDesc.offset;
+    if (dv)  dv += dvDesc.offset;
+
+    BufferAdapter<const float> srcT(src, srcDesc.length, srcDesc.stride);
+
+#pragma omp parallel for
+    for (int i = 0; i < numPatchCoords; ++i) {
+        float wP[20], wDs[20], wDt[20];
+        BufferAdapter<float> dstT(dst + dstDesc.stride*i, dstDesc.length, dstDesc.stride);
+        BufferAdapter<float> duT(du   + duDesc.stride*i, duDesc.length, duDesc.stride);
+        BufferAdapter<float> dvT(dv   + dvDesc.stride*i, dvDesc.length, dvDesc.stride);
+
+        PatchCoord const &coord = patchCoords[i];
+        PatchArray const &array = patchArrays[coord.handle.arrayIndex];
+
+        int patchType = array.GetPatchType();
+        Far::PatchParam::BitField patchBits = *(Far::PatchParam::BitField*)
+            &patchParamBuffer[coord.handle.patchIndex].patchBits;
+
+        int numControlVertices = 0;
+        if (patchType == Far::PatchDescriptor::REGULAR) {
+            Far::internal::GetBSplineWeights(patchBits,
+                                             coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 16;
+        } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+            Far::internal::GetGregoryWeights(patchBits,
+                                             coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 20;
+        } else if (patchType == Far::PatchDescriptor::QUADS) {
+            Far::internal::GetBilinearWeights(patchBits,
+                                              coord.s, coord.t, wP, wDs, wDt);
+            numControlVertices = 4;
+        } else {
+            continue;
+        }
+        const int *cvs =
+            &patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+
+        dstT.Clear();
+        duT.Clear();
+        dvT.Clear();
+        for (int j = 0; j < numControlVertices; ++j) {
+            dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
+            duT.AddWithWeight(srcT[cvs[j]], wDs[j]);
+            dvT.AddWithWeight(srcT[cvs[j]], wDt[j]);
+        }
+        ++dstT;
+        ++duT;
+        ++dvT;
+    }
+    return true;
+}
+
 /* static */
 void
 OmpEvaluator::Synchronize(void * /*deviceContext*/) {
diff --git a/opensubdiv/osd/ompEvaluator.h b/opensubdiv/osd/ompEvaluator.h
index a329eed1..9b30e7a6 100644
--- a/opensubdiv/osd/ompEvaluator.h
+++ b/opensubdiv/osd/ompEvaluator.h
@@ -28,7 +28,7 @@
 #include "../version.h"
 
 #include <cstddef>
-
+#include "../osd/types.h"
 #include "../osd/vertexDescriptor.h"
 
 namespace OpenSubdiv {
@@ -38,9 +38,15 @@ namespace Osd {
 
 class OmpEvaluator {
 public:
-    /// \brief Generic static compute function. This function has a same
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Stencil evaluations with StencilTable
+    ///
+    /// ----------------------------------------------------------------------
+
+    /// \brief Generic static eval stencils function. This function has a same
     ///        signature as other device kernels have so that it can be called
-    ///        transparently from OsdMesh template interface.
+    ///        in the same way from OsdMesh template interface.
     ///
     /// @param srcBuffer      Input primvar buffer.
     ///                       must have BindCpuBuffer() method returning a
@@ -54,29 +60,27 @@ public:
     ///
     /// @param dstDesc        vertex buffer descriptor for the output buffer
     ///
-    /// @param stencilTable   stencil table to be applied.
+    /// @param stencilTable   Far::StencilTable or equivalent
     ///
-    /// @param instance        not used in the omp kernel
+    /// @param instance       not used in the omp kernel
     ///                       (declared as a typed pointer to prevent
     ///                        undesirable template resolution)
     ///
     /// @param deviceContext  not used in the omp kernel
     ///
-    template <typename VERTEX_BUFFER, typename STENCIL_TABLE>
-    static bool EvalStencils(VERTEX_BUFFER *srcVertexBuffer,
-                             VertexBufferDescriptor const &srcDesc,
-                             VERTEX_BUFFER *dstVertexBuffer,
-                             VertexBufferDescriptor const &dstDesc,
-                             STENCIL_TABLE const *stencilTable,
-                             OmpEvaluator const * instance = NULL,
-                             void * deviceContext = NULL) {
-        (void)instance;   // unused;
-        (void)deviceContext;   // unused;
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable,
+        const OmpEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
 
-        return EvalStencils(srcVertexBuffer->BindCpuBuffer(),
-                            srcDesc,
-                            dstVertexBuffer->BindCpuBuffer(),
-                            dstDesc,
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
                             &stencilTable->GetSizes()[0],
                             &stencilTable->GetOffsets()[0],
                             &stencilTable->GetControlIndices()[0],
@@ -85,17 +89,376 @@ public:
                             /*end   = */ stencilTable->GetNumStencils());
     }
 
-    /// stencil compute function.
-    static bool EvalStencils(const float *src,
-                             VertexBufferDescriptor const &srcDesc,
-                             float *dst,
-                             VertexBufferDescriptor const &dstDesc,
-                             const int * sizes,
-                             const int * offsets,
-                             const int * indices,
-                             const float * weights,
-                             int start,
-                             int end);
+    /// \brief Static eval stencils function which takes raw CPU pointers for
+    ///        input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///                       to apply for the range [start, end)
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src,  VertexBufferDescriptor const &srcDesc,
+        float *dst,        VertexBufferDescriptor const &dstDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        int start, int end);
+
+    /// \brief Generic static eval stencils function with derivatives.
+    ///        This function has a same signature as other device kernels
+    ///        have so that it can be called in the same way from OsdMesh
+    ///        template interface.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer       Output U-derivative buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param dvBuffer       Output V-derivative buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param stencilTable   Far::StencilTable or equivalent
+    ///
+    /// @param instance       not used in the omp kernel
+    ///                       (declared as a typed pointer to prevent
+    ///                        undesirable template resolution)
+    ///
+    /// @param deviceContext  not used in the omp kernel
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        STENCIL_TABLE const *stencilTable,
+        const OmpEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
+                            duBuffer->BindCpuBuffer(),  duDesc,
+                            dvBuffer->BindCpuBuffer(),  dvDesc,
+                            &stencilTable->GetSizes()[0],
+                            &stencilTable->GetOffsets()[0],
+                            &stencilTable->GetControlIndices()[0],
+                            &stencilTable->GetWeights()[0],
+                            &stencilTable->GetDuWeights()[0],
+                            &stencilTable->GetDvWeights()[0],
+                            /*start = */ 0,
+                            /*end   = */ stencilTable->GetNumStencils());
+    }
+
+    /// \brief Static eval stencils function with derivatives, which takes
+    ///        raw CPU pointers for input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param du             Output U-derivatives pointer. An offset of
+    ///                       duDesc will be applied internally.
+    ///
+    /// @param duDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param dv             Output V-derivatives pointer. An offset of
+    ///                       dvDesc will be applied internally.
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param duWeights      pointer to the du-weights buffer of the stencil table
+    ///
+    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        float *du,        VertexBufferDescriptor const &duDesc,
+        float *dv,        VertexBufferDescriptor const &dvDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        const float * duWeights,
+        const float * dvWeights,
+        int start, int end);
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Limit evaluations with PatchTable
+    ///
+    /// ----------------------------------------------------------------------
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the omp evaluator
+    ///
+    /// @param deviceContext    not used in the omp evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        OmpEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function with derivatives. This function has
+    ///        a same signature as other device kernels have so that it can be
+    ///        called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output U-derivatives buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output V-derivatives buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used in the omp evaluator
+    ///
+    /// @param deviceContext    not used in the omp evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        OmpEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        // XXX: PatchCoords is somewhat abusing vertex primvar buffer interop.
+        //      ideally all buffer classes should have templated by datatype
+        //      so that downcast isn't needed there.
+        //      (e.g. Osd::CpuBuffer<PatchCoord> )
+        //
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Static limit eval function. It takes an array of PatchCoord
+    ///        and evaluate limit values on given PatchTable.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        const PatchCoord *patchCoords,
+        const PatchArray *patchArrays,
+        const int *patchIndexBuffer,
+        const PatchParam *patchParamBuffer);
+
+    /// \brief Static limit eval function. It takes an array of PatchCoord
+    ///        and evaluate limit values on given PatchTable.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param du               Output U-derivatives pointer. An offset of
+    ///                         duDesc will be applied internally.
+    ///
+    /// @param duDesc           vertex buffer descriptor for the du buffer
+    ///
+    /// @param dv               Output V-derivatives pointer. An offset of
+    ///                         dvDesc will be applied internally.
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dv buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        float *du,        VertexBufferDescriptor const &duDesc,
+        float *dv,        VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PatchCoord const *patchCoords,
+        PatchArray const *patchArrays,
+        const int *patchIndexBuffer,
+        PatchParam const *patchParamBuffer);
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Other methods
+    ///
+    /// ----------------------------------------------------------------------
 
     static void Synchronize(void *deviceContext = NULL);
 
diff --git a/opensubdiv/osd/ompKernel.cpp b/opensubdiv/osd/ompKernel.cpp
index dc7ecbfe..2b727c4f 100644
--- a/opensubdiv/osd/ompKernel.cpp
+++ b/opensubdiv/osd/ompKernel.cpp
@@ -117,10 +117,82 @@ OmpEvalStencils(float const * src,
 
         copy(dst, i, threadResult, dstDesc);
     }
+}
+
+void
+OmpEvalStencils(float const * src,
+                VertexBufferDescriptor const &srcDesc,
+                float * dst,
+                VertexBufferDescriptor const &dstDesc,
+                float * dstDu,
+                VertexBufferDescriptor const &dstDuDesc,
+                float * dstDv,
+                VertexBufferDescriptor const &dstDvDesc,
+                int const * sizes,
+                int const * offsets,
+                int const * indices,
+                float const * weights,
+                float const * duWeights,
+                float const * dvWeights,
+                int start, int end) {
+    if (start > 0) {
+        sizes += start;
+        indices += offsets[start];
+        weights += offsets[start];
+        duWeights += offsets[start];
+        dvWeights += offsets[start];
+    }
+
+    src += srcDesc.offset;
+    dst += dstDesc.offset;
+    dstDu += dstDuDesc.offset;
+    dstDv += dstDvDesc.offset;
+
+    int numThreads = omp_get_max_threads();
+    int n = end - start;
+
+    float * result = (float*)alloca(srcDesc.length * numThreads * sizeof(float));
+    float * resultDu = (float*)alloca(srcDesc.length * numThreads * sizeof(float));
+    float * resultDv = (float*)alloca(srcDesc.length * numThreads * sizeof(float));
+
+#pragma omp parallel for
+    for (int i = 0; i < n; ++i) {
+
+        int index = i + (start > 0 ? start : 0); // Stencil index
+
+        // Get thread-local pointers
+        int const           * threadIndices = indices + offsets[index];
+        float const         * threadWeights = weights + offsets[index];
+        float const         * threadWeightsDu = duWeights + offsets[index];
+        float const         * threadWeightsDv = dvWeights + offsets[index];
+
+        int threadId = omp_get_thread_num();
+
+        float * threadResult = result + threadId*srcDesc.length;
+        float * threadResultDu = resultDu + threadId*srcDesc.length;
+        float * threadResultDv = resultDv + threadId*srcDesc.length;
+
+        clear(threadResult, dstDesc);
+        clear(threadResultDu, dstDuDesc);
+        clear(threadResultDv, dstDvDesc);
+
+        for (int j=0; j<(int)sizes[index]; ++j) {
+            addWithWeight(threadResult, src,
+                threadIndices[j], threadWeights[j], srcDesc);
+            addWithWeight(threadResultDu, src,
+                threadIndices[j], threadWeightsDu[j], srcDesc);
+            addWithWeight(threadResultDv, src,
+                threadIndices[j], threadWeightsDv[j], srcDesc);
+        }
+
+        copy(dst, i, threadResult, dstDesc);
+        copy(dstDu, i, threadResultDu, dstDuDesc);
+        copy(dstDv, i, threadResultDv, dstDvDesc);
+    }
 
 }
 
-} // end namespace Osd
+}  // end namespace Osd
 
 }  // end namespace OPENSUBDIV_VERSION
 }  // end namespace OpenSubdiv
diff --git a/opensubdiv/osd/ompKernel.h b/opensubdiv/osd/ompKernel.h
index 7a9770cb..f0d71aea 100644
--- a/opensubdiv/osd/ompKernel.h
+++ b/opensubdiv/osd/ompKernel.h
@@ -45,6 +45,23 @@ OmpEvalStencils(float const * src,
                 float const * weights,
                 int start, int end);
 
+void
+OmpEvalStencils(float const * src,
+                VertexBufferDescriptor const &srcDesc,
+                float * dst,
+                VertexBufferDescriptor const &dstDesc,
+                float * dstDu,
+                VertexBufferDescriptor const &dstDuDesc,
+                float * dstDv,
+                VertexBufferDescriptor const &dstDvDesc,
+                int const * sizes,
+                int const * offsets,
+                int const * indices,
+                float const * weights,
+                float const * duWeights,
+                float const * dvWeights,
+                int start, int end);
+
 } // end namespace Osd
 
 }  // end namespace OPENSUBDIV_VERSION
diff --git a/opensubdiv/osd/tbbEvaluator.cpp b/opensubdiv/osd/tbbEvaluator.cpp
index 2d49e258..5bfce4b9 100644
--- a/opensubdiv/osd/tbbEvaluator.cpp
+++ b/opensubdiv/osd/tbbEvaluator.cpp
@@ -34,15 +34,15 @@ namespace Osd {
 
 /* static */
 bool
-TbbEvaluator::EvalStencils(const float *src,
-                           VertexBufferDescriptor const &srcDesc,
-                           float *dst,
-                           VertexBufferDescriptor const &dstDesc,
-                           const int * sizes,
-                           const int * offsets,
-                           const int * indices,
-                           const float * weights,
-                           int start, int end) {
+TbbEvaluator::EvalStencils(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    const int * sizes,
+    const int * offsets,
+    const int * indices,
+    const float * weights,
+    int start, int end) {
+
     if (end <= start) return true;
 
     TbbEvalStencils(src, srcDesc, dst, dstDesc,
@@ -51,6 +51,82 @@ TbbEvaluator::EvalStencils(const float *src,
     return true;
 }
 
+/* static */
+bool
+TbbEvaluator::EvalStencils(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    float *du,        VertexBufferDescriptor const &duDesc,
+    float *dv,        VertexBufferDescriptor const &dvDesc,
+    const int * sizes,
+    const int * offsets,
+    const int * indices,
+    const float * weights,
+    const float * duWeights,
+    const float * dvWeights,
+    int start, int end) {
+
+    if (end <= start) return true;
+    if (srcDesc.length != dstDesc.length) return false;
+    if (srcDesc.length != duDesc.length) return false;
+    if (srcDesc.length != dvDesc.length) return false;
+
+    TbbEvalStencils(src, srcDesc,
+                    dst, dstDesc,
+                    du,  duDesc,
+                    dv,  dvDesc,
+                    sizes, offsets, indices,
+                    weights, duWeights, dvWeights,
+                    start, end);
+
+    return true;
+}
+
+/* static */
+bool
+TbbEvaluator::EvalPatches(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    int numPatchCoords,
+    const PatchCoord *patchCoords,
+    const PatchArray *patchArrayBuffer,
+    const int *patchIndexBuffer,
+    const PatchParam *patchParamBuffer) {
+
+    if (srcDesc.length != dstDesc.length) return false;
+
+    TbbEvalPatches(src, srcDesc, dst, dstDesc,
+                   NULL, VertexBufferDescriptor(),
+                   NULL, VertexBufferDescriptor(),
+                   numPatchCoords, patchCoords,
+                   patchArrayBuffer, patchIndexBuffer, patchParamBuffer);
+
+    return true;
+}
+
+/* static */
+bool
+TbbEvaluator::EvalPatches(
+    const float *src, VertexBufferDescriptor const &srcDesc,
+    float *dst,       VertexBufferDescriptor const &dstDesc,
+    float *du,        VertexBufferDescriptor const &duDesc,
+    float *dv,        VertexBufferDescriptor const &dvDesc,
+    int numPatchCoords,
+    const PatchCoord *patchCoords,
+    const PatchArray *patchArrayBuffer,
+    const int *patchIndexBuffer,
+    const PatchParam *patchParamBuffer) {
+
+    if (srcDesc.length != dstDesc.length) return false;
+
+    TbbEvalPatches(src, srcDesc, dst, dstDesc,
+                   du,  duDesc,  dv,  dvDesc,
+                   numPatchCoords, patchCoords,
+                   patchArrayBuffer, patchIndexBuffer, patchParamBuffer);
+
+    return true;
+}
+
 /* static */
 void
 TbbEvaluator::Synchronize(void *) {
diff --git a/opensubdiv/osd/tbbEvaluator.h b/opensubdiv/osd/tbbEvaluator.h
index 58cc02f6..37eaf9a2 100644
--- a/opensubdiv/osd/tbbEvaluator.h
+++ b/opensubdiv/osd/tbbEvaluator.h
@@ -26,7 +26,9 @@
 #define OPENSUBDIV3_OSD_TBB_EVALUATOR_H
 
 #include "../version.h"
+#include "../osd/types.h"
 #include "../osd/vertexDescriptor.h"
+#include "../far/patchTable.h"
 
 #include <cstddef>
 
@@ -37,9 +39,15 @@ namespace Osd {
 
 class TbbEvaluator {
 public:
-    /// \brief Generic static stencil eval function. This function has a same
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Stencil evaluations with StencilTable
+    ///
+    /// ----------------------------------------------------------------------
+
+    /// \brief Generic static eval stencils function. This function has a same
     ///        signature as other device kernels have so that it can be called
-    ///        transparently from OsdMesh template interface.
+    ///        in the same way from OsdMesh template interface.
     ///
     /// @param srcBuffer      Input primvar buffer.
     ///                       must have BindCpuBuffer() method returning a
@@ -55,27 +63,25 @@ public:
     ///
     /// @param stencilTable   stencil table to be applied.
     ///
-    /// @param instance        not used in the tbb kernel
+    /// @param instance       not used in the tbb kernel
     ///                       (declared as a typed pointer to prevent
     ///                        undesirable template resolution)
     ///
     /// @param deviceContext  not used in the tbb kernel
     ///
-    template <typename VERTEX_BUFFER, typename STENCIL_TABLE>
-    static bool EvalStencils(VERTEX_BUFFER *srcVertexBuffer,
-                        VertexBufferDescriptor const &srcDesc,
-                        VERTEX_BUFFER *dstVertexBuffer,
-                        VertexBufferDescriptor const &dstDesc,
-                        STENCIL_TABLE const *stencilTable,
-                        TbbEvaluator const *instance = NULL,
-                        void *deviceContext = NULL) {
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable,
+        TbbEvaluator const *instance = NULL,
+        void *deviceContext = NULL) {
+
         (void)instance;   // unused
         (void)deviceContext;  // unused
 
-        return EvalStencils(srcVertexBuffer->BindCpuBuffer(),
-                            srcDesc,
-                            dstVertexBuffer->BindCpuBuffer(),
-                            dstDesc,
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
                             &stencilTable->GetSizes()[0],
                             &stencilTable->GetOffsets()[0],
                             &stencilTable->GetControlIndices()[0],
@@ -84,19 +90,381 @@ public:
                             /*end   = */ stencilTable->GetNumStencils());
     }
 
-    static bool EvalStencils(const float *src,
-                             VertexBufferDescriptor const &srcDesc,
-                             float *dst,
-                             VertexBufferDescriptor const &dstDesc,
-                             const int *sizes,
-                             const int *offsets,
-                             const int *indices,
-                             const float *weights,
-                             int start,
-                             int end);
+    /// \brief Static eval stencils function which takes raw CPU pointers for
+    ///        input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///                       to apply for the range [start, end)
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        const int *sizes,
+        const int *offsets,
+        const int *indices,
+        const float *weights,
+        int start, int end);
 
+    /// \brief Generic static eval stencils function with derivatives.
+    ///        This function has a same signature as other device kernels
+    ///        have so that it can be called in the same way from OsdMesh
+    ///        template interface.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer       Output U-derivative buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param dvBuffer       Output V-derivative buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param stencilTable   stencil table to be applied.
+    ///
+    /// @param instance       not used in the tbb kernel
+    ///                       (declared as a typed pointer to prevent
+    ///                        undesirable template resolution)
+    ///
+    /// @param deviceContext  not used in the tbb kernel
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        STENCIL_TABLE const *stencilTable,
+        const TbbEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;   // unused
+        (void)deviceContext;   // unused
+
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
+                            duBuffer->BindCpuBuffer(),  duDesc,
+                            dvBuffer->BindCpuBuffer(),  dvDesc,
+                            &stencilTable->GetSizes()[0],
+                            &stencilTable->GetOffsets()[0],
+                            &stencilTable->GetControlIndices()[0],
+                            &stencilTable->GetWeights()[0],
+                            &stencilTable->GetDuWeights()[0],
+                            &stencilTable->GetDvWeights()[0],
+                            /*start = */ 0,
+                            /*end   = */ stencilTable->GetNumStencils());
+    }
+
+    /// \brief Static eval stencils function with derivatives, which takes
+    ///        raw CPU pointers for input and output.
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param du             Output s-derivatives pointer. An offset of
+    ///                       duDesc will be applied internally.
+    ///
+    /// @param duDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param dv             Output t-derivatives pointer. An offset of
+    ///                       dvDesc will be applied internally.
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the output buffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///                       to apply for the range [start, end)
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param duWeights      pointer to the u-weights buffer of the stencil table
+    ///
+    /// @param dvWeights      pointer to the v-weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    static bool EvalStencils(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        float *du,        VertexBufferDescriptor const &duDesc,
+        float *dv,        VertexBufferDescriptor const &dvDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        const float * duWeights,
+        const float * dvWeights,
+        int start, int end);
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Limit evaluations with PatchTable
+    ///
+    /// ----------------------------------------------------------------------
+
+    /// \brief Generic limit eval function. This function has a same
+    ///        signature as other device kernels have so that it can be called
+    ///        in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       Far::PatchTable
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        TbbEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(),
+                           srcDesc,
+                           dstBuffer->BindCpuBuffer(),
+                           dstDesc,
+                           numPatchCoords,
+                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit eval function with derivatives. This function has
+    ///        a same signature as other device kernels have so that it can be
+    ///        called in the same way.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output s-derivatives buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output t-derivatives buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchTable       Far::PatchTable
+    ///
+    /// @param instance         not used in the cpu evaluator
+    ///
+    /// @param deviceContext    not used in the cpu evaluator
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, VertexBufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, VertexBufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  VertexBufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        TbbEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        return EvalPatches(
+            srcBuffer->BindCpuBuffer(), srcDesc,
+            dstBuffer->BindCpuBuffer(), dstDesc,
+            duBuffer->BindCpuBuffer(),  duDesc,
+            dvBuffer->BindCpuBuffer(),  dvDesc,
+            numPatchCoords,
+            (const PatchCoord*)patchCoords->BindCpuBuffer(),
+            patchTable->GetPatchArrayBuffer(),
+            patchTable->GetPatchIndexBuffer(),
+            patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Static limit eval function. It takes an array of PatchCoord
+    ///        and evaluate limit values on given PatchTable.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        const PatchCoord *patchCoords,
+        const PatchArray *patchArrays,
+        const int *patchIndexBuffer,
+        const PatchParam *patchParamBuffer);
+
+    /// \brief Static limit eval function. It takes an array of PatchCoord
+    ///        and evaluate limit values on given PatchTable.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param du               Output s-derivatives pointer. An offset of
+    ///                         duDesc will be applied internally.
+    ///
+    /// @param duDesc           vertex buffer descriptor for the du buffer
+    ///
+    /// @param dv               Output t-derivatives pointer. An offset of
+    ///                         dvDesc will be applied internally.
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dv buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    static bool EvalPatches(
+        const float *src, VertexBufferDescriptor const &srcDesc,
+        float *dst,       VertexBufferDescriptor const &dstDesc,
+        float *du,        VertexBufferDescriptor const &duDesc,
+        float *dv,        VertexBufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        const PatchCoord *patchCoords,
+        const PatchArray *patchArrays,
+        const int *patchIndexBuffer,
+        const PatchParam *patchParamBuffer);
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Other methods
+    ///
+    /// ----------------------------------------------------------------------
+
+    /// \brief synchronize all asynchronous computation invoked on this device.
     static void Synchronize(void *deviceContext = NULL);
 
+    /// \brief initialize tbb task schedular
+    ///        (optional: client may use tbb::task_scheduler_init)
+    ///
+    /// @param numThreads      how many threads
+    ///
     static void SetNumThreads(int numThreads);
 };
 
diff --git a/opensubdiv/osd/tbbKernel.cpp b/opensubdiv/osd/tbbKernel.cpp
index a15f765c..6b45f078 100644
--- a/opensubdiv/osd/tbbKernel.cpp
+++ b/opensubdiv/osd/tbbKernel.cpp
@@ -24,7 +24,9 @@
 
 #include "../osd/cpuKernel.h"
 #include "../osd/tbbKernel.h"
+#include "../osd/types.h"
 #include "../osd/vertexDescriptor.h"
+#include "../far/patchBasis.h"
 
 #include <cassert>
 #include <cstdlib>
@@ -187,6 +189,274 @@ TbbEvalStencils(float const * src,
     tbb::parallel_for(range, kernel);
 }
 
+void
+TbbEvalStencils(float const * src, VertexBufferDescriptor const &srcDesc,
+                float * dst,       VertexBufferDescriptor const &dstDesc,
+                float * du,        VertexBufferDescriptor const &duDesc,
+                float * dv,        VertexBufferDescriptor const &dvDesc,
+                int const * sizes,
+                int const * offsets,
+                int const * indices,
+                float const * weights,
+                float const * duWeights,
+                float const * dvWeights,
+                int start, int end) {
+    if (start > 0) {
+        sizes += start;
+        indices += offsets[start];
+        weights += offsets[start];
+        duWeights += offsets[start];
+        dvWeights += offsets[start];
+    }
+
+    if (src) src += srcDesc.offset;
+    if (dst) dst += dstDesc.offset;
+    if (du)  du  += duDesc.offset;
+    if (dv)  dv  += dvDesc.offset;
+
+    // PERFORMANCE: need to combine 3 launches together
+    if (dst) {
+        TBBStencilKernel kernel(src, srcDesc, dst, dstDesc,
+                                sizes, offsets, indices, weights);
+        tbb::blocked_range<int> range(start, end, grain_size);
+        tbb::parallel_for(range, kernel);
+    }
+
+    if (du) {
+        TBBStencilKernel kernel(src, srcDesc, du, duDesc,
+                                sizes, offsets, indices, duWeights);
+        tbb::blocked_range<int> range(start, end, grain_size);
+        tbb::parallel_for(range, kernel);
+    }
+
+    if (dv) {
+        TBBStencilKernel kernel(src, srcDesc, dv, dvDesc,
+                                sizes, offsets, indices, dvWeights);
+        tbb::blocked_range<int> range(start, end, grain_size);
+        tbb::parallel_for(range, kernel);
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+template <typename T>
+struct BufferAdapter {
+    BufferAdapter(T *p, int length, int stride) :
+        _p(p), _length(length), _stride(stride) { }
+    void Clear() {
+        for (int i = 0; i < _length; ++i) _p[i] = 0;
+    }
+    void AddWithWeight(T const *src, float w) {
+        if (_p) {
+            for (int i = 0; i < _length; ++i) {
+                _p[i] += src[i] * w;
+            }
+        }
+    }
+    const T *operator[] (int index) const {
+        return _p + _stride * index;
+    }
+    BufferAdapter<T> & operator ++() {
+        if (_p) {
+            _p += _stride;
+        }
+        return *this;
+    }
+
+    T *_p;
+    int _length;
+    int _stride;
+};
+
+class TbbEvalPatchesKernel {
+    VertexBufferDescriptor _srcDesc;
+    VertexBufferDescriptor _dstDesc;
+    VertexBufferDescriptor _dstDuDesc;
+    VertexBufferDescriptor _dstDvDesc;
+    float const * _src;
+    float * _dst;
+    float * _dstDu;
+    float * _dstDv;
+    int _numPatchCoords;
+    const PatchCoord *_patchCoords;
+    const PatchArray *_patchArrayBuffer;
+    const int        *_patchIndexBuffer;
+    const PatchParam *_patchParamBuffer;
+
+public:
+    TbbEvalPatchesKernel(float const *src,
+                         VertexBufferDescriptor srcDesc,
+                         float *dst,
+                         VertexBufferDescriptor dstDesc,
+                         float *dstDu,
+                         VertexBufferDescriptor dstDuDesc,
+                         float *dstDv,
+                         VertexBufferDescriptor dstDvDesc,
+                         int numPatchCoords,
+                         const PatchCoord *patchCoords,
+                         const PatchArray *patchArrayBuffer,
+                         const int *patchIndexBuffer,
+                         const PatchParam *patchParamBuffer) :
+        _srcDesc(srcDesc), _dstDesc(dstDesc),
+        _dstDuDesc(dstDuDesc), _dstDvDesc(dstDvDesc),
+        _src(src), _dst(dst), _dstDu(dstDu), _dstDv(dstDv),
+        _numPatchCoords(numPatchCoords),
+        _patchCoords(patchCoords),
+        _patchArrayBuffer(patchArrayBuffer),
+        _patchIndexBuffer(patchIndexBuffer),
+        _patchParamBuffer(patchParamBuffer) {
+    }
+
+    void operator() (tbb::blocked_range<int> const &r) const {
+        if (_dstDu == NULL && _dstDv == NULL) {
+            compute(r);
+        } else {
+            computeWithDerivative(r);
+        }
+    }
+
+    void compute(tbb::blocked_range<int> const &r) const {
+        float wP[20], wDs[20], wDt[20];
+        BufferAdapter<const float> srcT(_src + _srcDesc.offset,
+                                        _srcDesc.length,
+                                        _srcDesc.stride);
+        BufferAdapter<float> dstT(_dst + _dstDesc.offset
+                                       + r.begin() * _dstDesc.stride,
+                                  _dstDesc.length,
+                                  _dstDesc.stride);
+
+        BufferAdapter<float> dstDuT(_dstDu,
+                                    _dstDuDesc.length,
+                                    _dstDuDesc.stride);
+        BufferAdapter<float> dstDvT(_dstDv,
+                                    _dstDvDesc.length,
+                                    _dstDvDesc.stride);
+
+        for (int i = r.begin(); i < r.end(); ++i) {
+            PatchCoord const &coord = _patchCoords[i];
+            PatchArray const &array = _patchArrayBuffer[coord.handle.arrayIndex];
+
+            int patchType = array.GetPatchType();
+            Far::PatchParam::BitField patchBits = *(Far::PatchParam::BitField*)
+                &_patchParamBuffer[coord.handle.patchIndex].patchBits;
+
+            int numControlVertices = 0;
+            if (patchType == Far::PatchDescriptor::REGULAR) {
+                Far::internal::GetBSplineWeights(patchBits,
+                                                 coord.s, coord.t, wP, wDs, wDt);
+                numControlVertices = 16;
+            } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+                Far::internal::GetGregoryWeights(patchBits,
+                                                 coord.s, coord.t, wP, wDs, wDt);
+                numControlVertices = 20;
+            } else if (patchType == Far::PatchDescriptor::QUADS) {
+                Far::internal::GetBilinearWeights(patchBits,
+                                                  coord.s, coord.t, wP, wDs, wDt);
+                numControlVertices = 4;
+            } else {
+                assert(0);
+            }
+
+            const int *cvs =
+                &_patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+
+            dstT.Clear();
+            for (int j = 0; j < numControlVertices; ++j) {
+                dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
+            }
+            ++dstT;
+        }
+    }
+
+    void computeWithDerivative(tbb::blocked_range<int> const &r) const {
+        float wP[20], wDs[20], wDt[20];
+        BufferAdapter<const float> srcT(_src + _srcDesc.offset,
+                                        _srcDesc.length,
+                                        _srcDesc.stride);
+        BufferAdapter<float> dstT(_dst + _dstDesc.offset
+                                       + r.begin() * _dstDesc.stride,
+                                  _dstDesc.length,
+                                  _dstDesc.stride);
+        BufferAdapter<float> dstDuT(_dstDu + _dstDuDesc.offset
+                                       + r.begin() * _dstDuDesc.stride,
+                                  _dstDuDesc.length,
+                                  _dstDuDesc.stride);
+        BufferAdapter<float> dstDvT(_dstDv + _dstDvDesc.offset
+                                       + r.begin() * _dstDvDesc.stride,
+                                  _dstDvDesc.length,
+                                  _dstDvDesc.stride);
+
+        for (int i = r.begin(); i < r.end(); ++i) {
+            PatchCoord const &coord = _patchCoords[i];
+            PatchArray const &array = _patchArrayBuffer[coord.handle.arrayIndex];
+
+            int patchType = array.GetPatchType();
+            Far::PatchParam::BitField patchBits = *(Far::PatchParam::BitField*)
+                &_patchParamBuffer[coord.handle.patchIndex].patchBits;
+
+            int numControlVertices = 0;
+            if (patchType == Far::PatchDescriptor::REGULAR) {
+                Far::internal::GetBSplineWeights(patchBits,
+                                                 coord.s, coord.t, wP, wDs, wDt);
+                numControlVertices = 16;
+            } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+                Far::internal::GetGregoryWeights(patchBits,
+                                                 coord.s, coord.t, wP, wDs, wDt);
+                numControlVertices = 20;
+            } else if (patchType == Far::PatchDescriptor::QUADS) {
+                Far::internal::GetBilinearWeights(patchBits,
+                                                  coord.s, coord.t, wP, wDs, wDt);
+                numControlVertices = 4;
+            } else {
+                assert(0);
+            }
+
+            const int *cvs =
+                &_patchIndexBuffer[array.indexBase + coord.handle.vertIndex];
+
+            dstT.Clear();
+            dstDuT.Clear();
+            dstDvT.Clear();
+            for (int j = 0; j < numControlVertices; ++j) {
+                dstT.AddWithWeight(srcT[cvs[j]], wP[j]);
+                dstDuT.AddWithWeight(srcT[cvs[j]], wDs[j]);
+                dstDvT.AddWithWeight(srcT[cvs[j]], wDt[j]);
+            }
+            ++dstT;
+            ++dstDuT;
+            ++dstDvT;
+        }
+    }
+};
+
+
+void
+TbbEvalPatches(float const *src,
+               VertexBufferDescriptor const &srcDesc,
+               float *dst,
+               VertexBufferDescriptor const &dstDesc,
+               float *dstDu,
+               VertexBufferDescriptor const &dstDuDesc,
+               float *dstDv,
+               VertexBufferDescriptor const &dstDvDesc,
+               int numPatchCoords,
+               const PatchCoord *patchCoords,
+               const PatchArray *patchArrayBuffer,
+               const int *patchIndexBuffer,
+               const PatchParam *patchParamBuffer) {
+
+    TbbEvalPatchesKernel kernel(src, srcDesc, dst, dstDesc,
+                                dstDu, dstDuDesc, dstDv, dstDvDesc,
+                                numPatchCoords, patchCoords,
+                                patchArrayBuffer,
+                                patchIndexBuffer,
+                                patchParamBuffer);
+
+    tbb::blocked_range<int> range(0, numPatchCoords, grain_size);
+    tbb::parallel_for(range, kernel);
+
+}
+
 }  // end namespace Osd
 
 }  // end namespace OPENSUBDIV_VERSION
diff --git a/opensubdiv/osd/tbbKernel.h b/opensubdiv/osd/tbbKernel.h
index c44dd732..01a6389d 100644
--- a/opensubdiv/osd/tbbKernel.h
+++ b/opensubdiv/osd/tbbKernel.h
@@ -32,6 +32,9 @@ namespace OPENSUBDIV_VERSION {
 
 namespace Osd {
 
+struct PatchArray;
+struct PatchCoord;
+struct PatchParam;
 struct VertexBufferDescriptor;
 
 void
@@ -45,6 +48,38 @@ TbbEvalStencils(float const * src,
                 float const * weights,
                 int start, int end);
 
+void
+TbbEvalStencils(float const * src,
+                VertexBufferDescriptor const &srcDesc,
+                float * dst,
+                VertexBufferDescriptor const &dstDesc,
+                float * dstDu,
+                VertexBufferDescriptor const &dstDuDesc,
+                float * dstDv,
+                VertexBufferDescriptor const &dstDvDesc,
+                int const * sizes,
+                int const * offsets,
+                int const * indices,
+                float const * weights,
+                float const * duWeights,
+                float const * dvWeights,
+                int start, int end);
+
+void
+TbbEvalPatches(float const *src,
+               VertexBufferDescriptor const &srcDesc,
+               float *dst,
+               VertexBufferDescriptor const &dstDesc,
+               float *dstDu,
+               VertexBufferDescriptor const &dstDuDesc,
+               float *dstDv,
+               VertexBufferDescriptor const &dstDvDesc,
+               int numPatchCoords,
+               const PatchCoord *patchCoords,
+               const PatchArray *patchArrayBuffer,
+               const int *patchIndexBuffer,
+               const PatchParam *patchParamBuffer);
+
 }  // end namespace Osd
 
 }  // end namespace OPENSUBDIV_VERSION
diff --git a/opensubdiv/osd/types.h b/opensubdiv/osd/types.h
new file mode 100644
index 00000000..b2279b3d
--- /dev/null
+++ b/opensubdiv/osd/types.h
@@ -0,0 +1,111 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#ifndef OPENSUBDIV3_OSD_TYPES_H
+#define OPENSUBDIV3_OSD_TYPES_H
+
+#include "../version.h"
+#include "../far/patchTable.h"
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+
+namespace Osd {
+
+/// \brief Coordinates set on a patch table
+///
+///  XXX: this class may be moved into Far
+///
+struct PatchCoord {
+    // 5-ints struct.
+
+    /// \brief Constructor
+    ///
+    /// @param handle    patch handle
+    ///
+    /// @param s         parametric location on the patch
+    ///
+    /// @param t         parametric location on the patch
+    ///
+    PatchCoord(Far::PatchTable::PatchHandle handle, float s, float t) :
+        handle(handle), s(s), t(t) { }
+
+    PatchCoord() : s(0), t(0) {
+        handle.arrayIndex = 0;
+        handle.patchIndex = 0;
+        handle.vertIndex = 0;
+    }
+
+    Far::PatchTable::PatchHandle handle; ///< patch handle
+    float s, t;              ///< parametric location on patch
+};
+
+struct PatchArray {
+    // 4-ints struct.
+    PatchArray(Far::PatchDescriptor desc, int numPatches,
+               int indexBase, int primitiveIdBase) :
+        desc(desc), numPatches(numPatches), indexBase(indexBase),
+        primitiveIdBase(primitiveIdBase) {}
+
+    Far::PatchDescriptor const &GetDescriptor() const {
+        return desc;
+    }
+
+    int GetPatchType() const {
+        return desc.GetType();
+    }
+    int GetNumPatches() const {
+        return numPatches;
+    }
+    int GetIndexBase() const {
+        return indexBase;
+    }
+    int GetPrimitiveIdBase() const {
+        return primitiveIdBase;
+    }
+    Far::PatchDescriptor desc;
+    int numPatches;
+    int indexBase;        // an offset within the index buffer
+    int primitiveIdBase;  // an offset within the patch param buffer
+};
+
+struct PatchParam {
+    // int3 struct.
+    int faceIndex;
+    unsigned int patchBits;
+    float sharpness;
+};
+
+typedef std::vector<PatchArray> PatchArrayVector;
+typedef std::vector<PatchParam> PatchParamVector;
+
+}  // end namespace Osd
+
+}  // end namespace OPENSUBDIV_VERSION
+using namespace OPENSUBDIV_VERSION;
+
+}  // end namespace OpenSubdiv
+
+
+#endif  // OPENSUBDIV3_OSD_TYPES_H