Merge pull request #407 from takahito-tejima/dev

Remove FarKernelBatch
2024-12-01 15:50:07 +00:00 · 2015-04-10 11:20:15 -07:00 · 2015-04-10 11:20:15 -07:00 · f5c1617b0f
commit f5c1617b0f
parent 50d30b101e 1ec0b2b1f2
40 changed files with 410 additions and 608 deletions
--- a/examples/glEvalLimit/glEvalLimit.cpp
+++ b/examples/glEvalLimit/glEvalLimit.cpp
@ -197,8 +197,6 @@ Osd::CpuComputeContext * g_computeCtx = 0;
 Osd::CpuComputeController g_computeCtrl;
 Far::KernelBatchVector  g_kernelBatches;
 Osd::CpuEvalLimitContext * g_evalCtx = 0;
 Osd::CpuEvalLimitController g_evalCtrl;
@ -242,7 +240,7 @@ updateGeom() {
    g_vertexData->UpdateData( &g_positions[0], 0, nverts);
-    g_computeCtrl.Compute(g_computeCtx, g_kernelBatches, g_vertexData, g_varyingData);
+    g_computeCtrl.Compute(g_computeCtx, g_vertexData, g_varyingData);
    s.Stop();
    g_computeTime = float(s.GetElapsed() * 1000.0f);
@ -417,9 +415,6 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
        delete g_evalCtx;
        g_evalCtx = Osd::CpuEvalLimitContext::Create(*patchTables);
        g_kernelBatches.clear();
        g_kernelBatches.push_back(Far::StencilTablesFactory::Create(*concatStencils));
    }
    {   // Create vertex primvar buffer for the CVs
--- a/examples/glShareTopology/glShareTopology.cpp
+++ b/examples/glShareTopology/glShareTopology.cpp
@ -295,8 +295,6 @@ public:
        _computeContext = ComputeContext::Create(vertexStencils, varyingStencils);
        _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
        _numVertices = vertexStencils->GetNumStencils() +
            vertexStencils->GetNumControlVertices();
    }
@ -328,7 +326,6 @@ public:
                globalVaryingDesc.stride);
            _computeController.Compute(_computeContext,
                                      _kernelBatches,
                                      typedInstance->GetVertexBuffer(),
                                      typedInstance->GetVaryingBuffer(),
                                      &vertexDesc,
@ -364,7 +361,6 @@ public:
 private:
    COMPUTE_CONTROLLER _computeController;
    ComputeContext *_computeContext;
    Far::KernelBatchVector _kernelBatches;
 };
 // ---------------------------------------------------------------------------
@ -394,8 +390,6 @@ Topology(Far::PatchTables const * patchTables,
    _computeContext = ComputeContext::Create(g_clContext, vertexStencils, varyingStencils);
    _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
    _numVertices = vertexStencils->GetNumStencils() +
        vertexStencils->GetNumControlVertices();
 }
--- a/opensubdiv/far/CMakeLists.txt
+++ b/opensubdiv/far/CMakeLists.txt
@ -45,8 +45,6 @@ set(PUBLIC_HEADER_FILES
    error.h
    gregoryBasis.h
    interpolate.h
    kernelBatch.h
    kernelBatchDispatcher.h
    patchDescriptor.h
    patchParam.h
    patchMap.h
--- a/opensubdiv/far/kernelBatch.h
+++ b/opensubdiv/far/kernelBatch.h
@ -1,82 +0,0 @@
 //
 //   Copyright 2013 Pixar
 //
 //   Licensed under the Apache License, Version 2.0 (the "Apache License")
 //   with the following modification; you may not use this file except in
 //   compliance with the Apache License and the following modification to it:
 //   Section 6. Trademarks. is deleted and replaced with:
 //
 //   6. Trademarks. This License does not grant permission to use the trade
 //      names, trademarks, service marks, or product names of the Licensor
 //      and its affiliates, except as required to comply with Section 4(c) of
 //      the License and to reproduce the content of the NOTICE file.
 //
 //   You may obtain a copy of the Apache License at
 //
 //       http://www.apache.org/licenses/LICENSE-2.0
 //
 //   Unless required by applicable law or agreed to in writing, software
 //   distributed under the Apache License with the above modification is
 //   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 //   KIND, either express or implied. See the Apache License for the specific
 //   language governing permissions and limitations under the Apache License.
 //
 #ifndef FAR_KERNEL_BATCH_H
 #define FAR_KERNEL_BATCH_H
 #include "../version.h"
 #include <vector>
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
 namespace Far {
 /// \brief A GP Compute Kernel descriptor.
 ///
 /// Vertex refinement through subdivision schemes requires the successive
 /// application of dedicated compute kernels. OpenSubdiv groups these vertices
 /// in batches based on their topology in order to minimize the number of kernel
 /// switches to process a given primitive.
 ///
 struct KernelBatch {
 public:
    enum KernelType {
        KERNEL_UNKNOWN=0,
        KERNEL_STENCIL_TABLE,
        KERNEL_USER_DEFINED
    };
    /// \brief Constructor.
    /// 
    /// @param _kernelType    The type of compute kernel kernel
    ///
    /// @param _level         The level of subdivision of the vertices in the batch
    ///
    /// @param _start         Index of the first vertex in the batch
    ///
    /// @param _end           Index of the last vertex in the batch
    ///
    KernelBatch( int _kernelType, int _level, int _start, int _end ) :
        kernelType(_kernelType), level(_level), start(_start), end(_end) { }
    int kernelType,   // the type of compute kernel kernel
        level,        // the level of subdivision of the vertices in the batch
        start,        // index of the first vertex in the batch
        end;          // index of the last vertex in the batch
 };
 typedef std::vector<KernelBatch> KernelBatchVector;
 } // end namespace Far
 } // end namespace OPENSUBDIV_VERSION
 using namespace OPENSUBDIV_VERSION;
 } // end namespace OpenSubdiv
 #endif  /* FAR_KERNEL_BATCH_H */
--- a/opensubdiv/far/kernelBatchDispatcher.h
+++ b/opensubdiv/far/kernelBatchDispatcher.h
@ -1,164 +0,0 @@
 //
 //   Copyright 2013 Pixar
 //
 //   Licensed under the Apache License, Version 2.0 (the "Apache License")
 //   with the following modification; you may not use this file except in
 //   compliance with the Apache License and the following modification to it:
 //   Section 6. Trademarks. is deleted and replaced with:
 //
 //   6. Trademarks. This License does not grant permission to use the trade
 //      names, trademarks, service marks, or product names of the Licensor
 //      and its affiliates, except as required to comply with Section 4(c) of
 //      the License and to reproduce the content of the NOTICE file.
 //
 //   You may obtain a copy of the Apache License at
 //
 //       http://www.apache.org/licenses/LICENSE-2.0
 //
 //   Unless required by applicable law or agreed to in writing, software
 //   distributed under the Apache License with the above modification is
 //   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 //   KIND, either express or implied. See the Apache License for the specific
 //   language governing permissions and limitations under the Apache License.
 //
 #ifndef FAR_KERNELBATCH_DISPATCHER_H
 #define FAR_KERNELBATCH_DISPATCHER_H
 #include "../version.h"
 #include "../far/kernelBatch.h"
 #include "../far/stencilTables.h"
 #include <cassert>
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
 namespace Far {
 /// \brief Subdivision refinement encapsulation layer.
 ///
 /// The kernel dispatcher allows client code to customize parts or the entire
 /// computation process. This pattern aims at hiding the logic specific to
 /// the subdivision algorithms and expose a simplified access to minimalistic
 /// compute kernels. By default, meshes revert to a default dispatcher that
 /// implements single-threaded CPU kernels.
 ///
 /// - derive a dispatcher class from this one
 /// - override the virtual functions
 /// - pass the derived dispatcher to the factory (one instance can be shared by many meshes)
 ///
 /// Note : the caller is responsible for deleting a custom dispatcher
 ///
 class KernelBatchDispatcher {
 public:
    /// \brief Launches the processing of a vector of kernel batches
    /// this is a convenient API for controllers which don't have any user defined kernels.
    ///
    /// @param controller  refinement controller implementation (vertex array)
    ///
    /// @param context     refinement context implementation (subdivision tables)
    ///                    passed to the controller.
    ///
    /// @param batches     batches of kernels that need to be processed
    ///
    /// @param maxlevel    process vertex batches up to this level
    ///
    template <class CONTROLLER, class CONTEXT> static void Apply(
        CONTROLLER *controller, CONTEXT *context, KernelBatchVector const & batches, int maxlevel);
 protected:
    /// \brief Launches the processing of a kernel batch
    /// returns true if the batch is handled, otherwise returns false (i.e. user defined kernel)
    ///
    /// @param controller  refinement controller implementation
    ///
    /// @param context     refinement context implementation
    ///
    /// @param batch       a batch of kernel that need to be processed
    ///
    template <class CONTROLLER, class CONTEXT> static bool ApplyKernel(
        CONTROLLER *controller, CONTEXT *context, KernelBatch const &batch);
 };
 ///
 /// \brief Far default controller implementation
 ///
 /// This is Far's default implementation of a kernal batch controller.
 ///
 class DefaultController {
 public:
    template <class CONTEXT> void ApplyStencilTableKernel(
        KernelBatch const &batch, CONTEXT *context) const;
 };
 // Launches the processing of a kernel batch
 template <class CONTROLLER, class CONTEXT> bool
 KernelBatchDispatcher::ApplyKernel(CONTROLLER *controller, CONTEXT *context,
    KernelBatch const &batch) {
    if (batch.end==0) {
        return true;
    }
    switch(batch.kernelType) {
        case KernelBatch::KERNEL_UNKNOWN:
            assert(0);
        case KernelBatch::KERNEL_STENCIL_TABLE:
            controller->ApplyStencilTableKernel(batch, context);
            break;
        default: // user defined kernel type
            return false;
    }
    return true;
 }
 // Launches the processing of a vector of kernel batches
 template <class CONTROLLER, class CONTEXT> void
 KernelBatchDispatcher::Apply(CONTROLLER *controller, CONTEXT *context,
    KernelBatchVector const & batches, int maxlevel) {
    for (int i = 0; i < (int)batches.size(); ++i) {
        const KernelBatch &batch = batches[i];
        if (maxlevel>=0 and batch.level>=maxlevel) {
            continue;
        }
        ApplyKernel(controller, context, batch);
    }
 }
 template <class CONTEXT> void
 DefaultController::ApplyStencilTableKernel(
    KernelBatch const &batch, CONTEXT *context) const {
    StencilTables const * stencilTables = context->GetStencilTables();
    assert(stencilTables);
    typename CONTEXT::VertexType *vsrc = &context->GetVertices().at(0),
                                 *vdst = vsrc + batch.start + stencilTables->GetNumControlVertices();
    stencilTables->UpdateValues(vsrc, vdst, batch.start, batch.end);
 }
 } // end namespace Far
 } // end namespace OPENSUBDIV_VERSION
 using namespace OPENSUBDIV_VERSION;
 } // end namespace OpenSubdiv
 #endif /* FAR_KERNELBATCH_DISPATCHER_H */
--- a/opensubdiv/far/stencilTables.h
+++ b/opensubdiv/far/stencilTables.h
@ -203,6 +203,7 @@ protected:
    void resize(int nstencils, int nelems);
 protected:
    StencilTables() : _numControlVertices(0) {}
    friend class StencilTablesFactory;
    friend class GregoryBasisFactory;
--- a/opensubdiv/far/stencilTablesFactory.cpp
+++ b/opensubdiv/far/stencilTablesFactory.cpp
@ -395,15 +395,6 @@ LimitStencilTablesFactory::Create(TopologyRefiner const & refiner,
    return result;
 }
 //------------------------------------------------------------------------------
 KernelBatch
 StencilTablesFactory::Create(StencilTables const &stencilTables) {
    return KernelBatch( KernelBatch::KERNEL_STENCIL_TABLE,
        -1, 0, stencilTables.GetNumStencils());
 }
 } // end namespace Far
 } // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/far/stencilTablesFactory.h
+++ b/opensubdiv/far/stencilTablesFactory.h
@ -27,7 +27,6 @@
 #include "../version.h"
 #include "../far/kernelBatch.h"
 #include "../far/patchTables.h"
 #include <vector>
@ -103,13 +102,6 @@ public:
    ///
    static StencilTables const * Create(int numTables, StencilTables const ** tables);
    /// \brief Returns a KernelBatch applying all the stencil in the tables
    ///        to primvar data.
    ///
    /// @param stencilTables The stencil tables to batch
    ///
    static KernelBatch Create(StencilTables const &stencilTables);
 private:
    // Generate stencils for the coarse control-vertices (single weight = 1.0f)
--- a/opensubdiv/osd/clComputeContext.cpp
+++ b/opensubdiv/osd/clComputeContext.cpp
@ -56,10 +56,15 @@ class CLComputeContext::CLStencilTables {
 public:
    CLStencilTables(Far::StencilTables const & stencilTables, cl_context clContext) {
-        _sizes = createCLBuffer(stencilTables.GetSizes(), clContext);
+        _numStencils = stencilTables.GetNumStencils();
-        _offsets = createCLBuffer(stencilTables.GetOffsets(), clContext);
+        if (_numStencils > 0) {
-        _indices = createCLBuffer(stencilTables.GetControlIndices(), clContext);
+            _sizes = createCLBuffer(stencilTables.GetSizes(), clContext);
-        _weights = createCLBuffer(stencilTables.GetWeights(), clContext);
+            _offsets = createCLBuffer(stencilTables.GetOffsets(), clContext);
            _indices = createCLBuffer(stencilTables.GetControlIndices(), clContext);
            _weights = createCLBuffer(stencilTables.GetWeights(), clContext);
        } else {
            _sizes = _offsets = _indices = _weights = NULL;
        }
    }
    ~CLStencilTables() {
@ -89,12 +94,17 @@ public:
        return _weights;
    }
    int GetNumStencils() const {
        return _numStencils;
    }
 private:
    cl_mem _sizes,
           _offsets,
           _indices,
           _weights;
    int _numStencils;
 };
 // -----------------------------------------------------------------------------
@ -104,7 +114,7 @@ CLComputeContext::CLComputeContext(
        Far::StencilTables const * varyingStencilTables,
            cl_context clContext) :
                _vertexStencilTables(0), _varyingStencilTables(0),
-                     _numControlVertices(0) {
+                _numControlVertices(0) {
    if (vertexStencilTables) {
        _vertexStencilTables = new CLStencilTables(*vertexStencilTables, clContext);
@ -139,6 +149,15 @@ CLComputeContext::HasVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
 int
 CLComputeContext::GetNumStencilsInVertexStencilTables() const {
    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : 0;
 }
 int
 CLComputeContext::GetNumStencilsInVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : 0;
 }
 // ----------------------------------------------------------------------------
 cl_mem
--- a/opensubdiv/osd/clComputeContext.h
+++ b/opensubdiv/osd/clComputeContext.h
@ -80,6 +80,12 @@ public:
        return _numControlVertices;
    }
    /// Returns the number of stencils in vertex stencil table
    int GetNumStencilsInVertexStencilTables() const;
    /// Returns the number of stencils in varying stencil table
    int GetNumStencilsInVaryingStencilTables() const;
    /// Returns the Cuda buffer containing vertex-stencil stencil sizes
    cl_mem GetVertexStencilTablesSizes() const;
--- a/opensubdiv/osd/clComputeController.cpp
+++ b/opensubdiv/osd/clComputeController.cpp
@ -32,6 +32,7 @@
 #include <algorithm>
 #include <string.h>
 #include <sstream>
 #include <cassert>
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
@ -138,18 +139,20 @@ private:
 // ----------------------------------------------------------------------------
 void
-CLComputeController::ApplyStencilTableKernel(
+CLComputeController::ApplyStencilTableKernel(ComputeContext const *context) {
    Far::KernelBatch const &batch, ComputeContext const *context) {
    assert(context);
    cl_int errNum;
-    size_t globalWorkSize[1] = { (size_t)(batch.end - batch.start) };
+    size_t globalWorkSize = 0;
    int ncvs = context->GetNumControlVertices();
    if (context->HasVertexStencilTables()) {
        int start = 0;
        int end = context->GetNumStencilsInVertexStencilTables();
        globalWorkSize = (size_t)(end - start);
        KernelBundle const * bundle = getKernel(_currentBindState.vertexDesc);
@ -167,14 +170,14 @@ CLComputeController::ApplyStencilTableKernel(
        clSetKernelArg(kernel, 3, sizeof(cl_mem), &indices);
        clSetKernelArg(kernel, 4, sizeof(cl_mem), &weights);
-        clSetKernelArg(kernel, 5, sizeof(int), &batch.start);
+        clSetKernelArg(kernel, 5, sizeof(int), &start);
-        clSetKernelArg(kernel, 6, sizeof(int), &batch.end);
+        clSetKernelArg(kernel, 6, sizeof(int), &end);
        clSetKernelArg(kernel, 7, sizeof(int), &_currentBindState.vertexDesc.offset);
        clSetKernelArg(kernel, 8, sizeof(int), &ncvs);
        errNum = clEnqueueNDRangeKernel(
-            _clQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+            _clQueue, kernel, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL);
        if (errNum!=CL_SUCCESS) {
            Far::Error(Far::FAR_RUNTIME_ERROR,
                "ApplyStencilTableKernel (%d) ", errNum);
@ -182,6 +185,9 @@ CLComputeController::ApplyStencilTableKernel(
    }
    if (context->HasVaryingStencilTables()) {
        int start = 0;
        int end = context->GetNumStencilsInVaryingStencilTables();
        globalWorkSize = (size_t)(end - start);
        KernelBundle const * bundle = getKernel(_currentBindState.varyingDesc);
@ -199,14 +205,14 @@ CLComputeController::ApplyStencilTableKernel(
        clSetKernelArg(kernel, 3, sizeof(cl_mem), &indices);
        clSetKernelArg(kernel, 4, sizeof(cl_mem), &weights);
-        clSetKernelArg(kernel, 5, sizeof(int), &batch.start);
+        clSetKernelArg(kernel, 5, sizeof(int), &start);
-        clSetKernelArg(kernel, 6, sizeof(int), &batch.end);
+        clSetKernelArg(kernel, 6, sizeof(int), &end);
        clSetKernelArg(kernel, 7, sizeof(int), &_currentBindState.varyingDesc.offset);
        clSetKernelArg(kernel, 8, sizeof(int), &ncvs);
        errNum = clEnqueueNDRangeKernel(
-            _clQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+            _clQueue, kernel, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL);
        if (errNum!=CL_SUCCESS) {
            Far::Error(Far::FAR_RUNTIME_ERROR,
                "ApplyStencilTableKernel (%d)", errNum);
--- a/opensubdiv/osd/clComputeController.h
+++ b/opensubdiv/osd/clComputeController.h
@ -27,7 +27,6 @@
 #include "../version.h"
 #include "../far/kernelBatchDispatcher.h"
 #include "../osd/clComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 #include "../osd/opencl.h"
@ -70,9 +69,6 @@ public:
    ///
    /// @param  context       The CLContext to apply refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@ -87,17 +83,14 @@ public:
    ///
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void Compute( CLComputeContext const * context,
                      Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER  * vertexBuffer,
                      VARYING_BUFFER * varyingBuffer,
                      VertexBufferDescriptor const * vertexDesc=NULL,
                      VertexBufferDescriptor const * varyingDesc=NULL ){
        if (batches.empty()) return;
        bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
        unbind();
    }
@ -113,10 +106,9 @@ public:
    ///
    template<class VERTEX_BUFFER>
        void Compute(CLComputeContext const * context,
                     Far::KernelBatchVector const & batches,
                     VERTEX_BUFFER *vertexBuffer) {
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
    }
    /// Waits until all running subdivision kernels finish.
@ -130,10 +122,7 @@ public:
 protected:
-    friend class Far::KernelBatchDispatcher;
+    void ApplyStencilTableKernel(ComputeContext const *context);
    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
        ComputeContext const *context);
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void bind( VERTEX_BUFFER * vertexBuffer,
--- a/opensubdiv/osd/cpuComputeContext.h
+++ b/opensubdiv/osd/cpuComputeContext.h
@ -61,7 +61,7 @@ public:
    ///                              interpolation
    ///
    static CpuComputeContext * Create(Far::StencilTables const * vertexStencilTables,
-                                         Far::StencilTables const * varyingStencilTables=0);
+                                      Far::StencilTables const * varyingStencilTables=0);
    /// Destructor
    virtual ~CpuComputeContext();
--- a/opensubdiv/osd/cpuComputeController.cpp
+++ b/opensubdiv/osd/cpuComputeController.cpp
@ -46,7 +46,7 @@ CpuComputeController::Synchronize() {
 void
 CpuComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context) const {
    assert(context);
@ -61,14 +61,19 @@ CpuComputeController::ApplyStencilTableKernel(
        float * destBuffer = _currentBindState.vertexBuffer + desc.offset +
            vertexStencils->GetNumControlVertices() * desc.stride;
-        CpuComputeStencils(_currentBindState.vertexDesc,
+        int start = 0;
-                              srcBuffer, destBuffer,
+        int end = vertexStencils->GetNumStencils();
-                              &vertexStencils->GetSizes().at(0),
+
-                              &vertexStencils->GetOffsets().at(0),
+        if (end > start) {
-                              &vertexStencils->GetControlIndices().at(0),
+            CpuComputeStencils(_currentBindState.vertexDesc,
-                              &vertexStencils->GetWeights().at(0),
+                               srcBuffer, destBuffer,
-                              batch.start,
+                               &vertexStencils->GetSizes().at(0),
-                              batch.end);
+                               &vertexStencils->GetOffsets().at(0),
                               &vertexStencils->GetControlIndices().at(0),
                               &vertexStencils->GetWeights().at(0),
                               start,
                               end);
        }
    }
    Far::StencilTables const * varyingStencils = context->GetVaryingStencilTables();
@ -82,14 +87,19 @@ CpuComputeController::ApplyStencilTableKernel(
        float * destBuffer = _currentBindState.varyingBuffer + desc.offset +
            varyingStencils->GetNumControlVertices() * desc.stride;
-        CpuComputeStencils(_currentBindState.varyingDesc,
+        int start = 0;
-                              srcBuffer, destBuffer,
+        int end = varyingStencils->GetNumStencils();
-                              &varyingStencils->GetSizes().at(0),
+
-                              &varyingStencils->GetOffsets().at(0),
+        if (end > start) {
-                              &varyingStencils->GetControlIndices().at(0),
+            CpuComputeStencils(_currentBindState.varyingDesc,
-                              &varyingStencils->GetWeights().at(0),
+                               srcBuffer, destBuffer,
-                              batch.start,
+                               &varyingStencils->GetSizes().at(0),
-                              batch.end);
+                               &varyingStencils->GetOffsets().at(0),
                               &varyingStencils->GetControlIndices().at(0),
                               &varyingStencils->GetWeights().at(0),
                               start,
                               end);
        }
    }
 }
--- a/opensubdiv/osd/cpuComputeController.h
+++ b/opensubdiv/osd/cpuComputeController.h
@ -27,7 +27,6 @@
 #include "../version.h"
 #include "../far/kernelBatchDispatcher.h"
 #include "../osd/cpuComputeContext.h"
 #include "../osd/vertexDescriptor.h"
@ -64,9 +63,6 @@ public:
    ///
    /// @param  context       The CpuContext to apply refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@ -81,17 +77,14 @@ public:
    ///
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void Compute( CpuComputeContext const * context,
                      Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER  * vertexBuffer,
                      VARYING_BUFFER * varyingBuffer,
                      VertexBufferDescriptor const * vertexDesc=NULL,
                      VertexBufferDescriptor const * varyingDesc=NULL ){
        if (batches.empty()) return;
        bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
        unbind();
    }
@ -100,17 +93,13 @@ public:
    ///
    /// @param  context       The CpuContext to apply refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    template<class VERTEX_BUFFER>
        void Compute(CpuComputeContext const * context,
                     Far::KernelBatchVector const & batches,
                     VERTEX_BUFFER *vertexBuffer) {
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
    }
    /// Waits until all running subdivision kernels finish.
@ -119,10 +108,7 @@ public:
 protected:
-    friend class Far::KernelBatchDispatcher;
+    void ApplyStencilTableKernel(ComputeContext const *context) const;
    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
        ComputeContext const *context) const;
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void bind( VERTEX_BUFFER * vertexBuffer,
--- a/opensubdiv/osd/cudaComputeContext.cpp
+++ b/opensubdiv/osd/cudaComputeContext.cpp
@ -63,10 +63,15 @@ class CudaComputeContext::CudaStencilTables {
 public:
    CudaStencilTables(Far::StencilTables const & stencilTables) {
-        _sizes = createCudaBuffer(stencilTables.GetSizes());
+        _numStencils = stencilTables.GetNumStencils();
-        _offsets = createCudaBuffer(stencilTables.GetOffsets());
+        if (_numStencils > 0) {
-        _indices = createCudaBuffer(stencilTables.GetControlIndices());
+            _sizes = createCudaBuffer(stencilTables.GetSizes());
-        _weights = createCudaBuffer(stencilTables.GetWeights());
+            _offsets = createCudaBuffer(stencilTables.GetOffsets());
            _indices = createCudaBuffer(stencilTables.GetControlIndices());
            _weights = createCudaBuffer(stencilTables.GetWeights());
        } else {
            _sizes = _offsets = _indices = _weights = NULL;
        }
    }
    ~CudaStencilTables() {
@ -96,11 +101,16 @@ public:
        return _weights;
    }
    int GetNumStencils() const {
        return _numStencils;
    }
 private:
    void * _sizes,
         * _offsets,
         * _indices,
         * _weights;
    int _numStencils;
 };
 // ----------------------------------------------------------------------------
@ -144,6 +154,16 @@ CudaComputeContext::HasVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
 int
 CudaComputeContext::GetNumStencilsInVertexStencilTables() const {
    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : 0;
 }
 int
 CudaComputeContext::GetNumStencilsInVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : 0;
 }
 // ----------------------------------------------------------------------------
 void *
--- a/opensubdiv/osd/cudaComputeContext.h
+++ b/opensubdiv/osd/cudaComputeContext.h
@ -77,6 +77,12 @@ public:
        return _numControlVertices;
    }
    /// Returns the number of stencils in vertex stencil tables
    int GetNumStencilsInVertexStencilTables() const;
    /// Returns the number of stencils in varying stencil tables
    int GetNumStencilsInVaryingStencilTables() const;
    /// Returns the Cuda buffer containing vertex-stencil stencil sizes
    void * GetVertexStencilTablesSizes() const;
--- a/opensubdiv/osd/cudaComputeController.cpp
+++ b/opensubdiv/osd/cudaComputeController.cpp
@ -26,6 +26,7 @@
 #include <cuda_runtime.h>
 #include <string.h>
 #include <cassert>
 extern "C" {
@ -46,7 +47,7 @@ namespace Osd {
 void
 CudaComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context) const {
    assert(context);
@ -55,18 +56,23 @@ CudaComputeController::ApplyStencilTableKernel(
        int length = _currentBindState.vertexDesc.length,
            stride = _currentBindState.vertexDesc.stride;
        int start = 0;
        int end = context->GetNumStencilsInVertexStencilTables();
        float const * src = _currentBindState.GetVertexBufferAtOffset();
        float * dst = const_cast<float *>(src) +
            context->GetNumControlVertices() * stride;
-        CudaComputeStencils(src, dst, length, stride,
+        if (end > start) {
-                               (unsigned char const *)context->GetVertexStencilTablesSizes(),
+            CudaComputeStencils(src, dst, length, stride,
-                               (int const *)context->GetVertexStencilTablesOffsets(),
+                                (unsigned char const *)context->GetVertexStencilTablesSizes(),
-                               (int const *)context->GetVertexStencilTablesIndices(),
+                                (int const *)context->GetVertexStencilTablesOffsets(),
-                               (float const *)context->GetVertexStencilTablesWeights(),
+                                (int const *)context->GetVertexStencilTablesIndices(),
-                               batch.start,
+                                (float const *)context->GetVertexStencilTablesWeights(),
-                               batch.end);
+                                start,
                                end);
        }
    }
    if (context->HasVaryingStencilTables()) {
@ -74,18 +80,23 @@ CudaComputeController::ApplyStencilTableKernel(
        int length = _currentBindState.varyingDesc.length,
            stride = _currentBindState.varyingDesc.stride;
        int start = 0;
        int end = context->GetNumStencilsInVaryingStencilTables();
        float const * src = _currentBindState.GetVaryingBufferAtOffset();
        float * dst = const_cast<float *>(src) +
            context->GetNumControlVertices() * stride;
-        CudaComputeStencils(src, dst, length, stride,
+        if (end > start) {
-                               (unsigned char const *)context->GetVaryingStencilTablesSizes(),
+            CudaComputeStencils(src, dst, length, stride,
-                               (int const *)context->GetVaryingStencilTablesOffsets(),
+                                (unsigned char const *)context->GetVaryingStencilTablesSizes(),
-                               (int const *)context->GetVaryingStencilTablesIndices(),
+                                (int const *)context->GetVaryingStencilTablesOffsets(),
-                               (float const *)context->GetVaryingStencilTablesWeights(),
+                                (int const *)context->GetVaryingStencilTablesIndices(),
-                               batch.start,
+                                (float const *)context->GetVaryingStencilTablesWeights(),
-                               batch.end);
+                                start,
                                end);
        }
    }
 }
--- a/opensubdiv/osd/cudaComputeController.h
+++ b/opensubdiv/osd/cudaComputeController.h
@ -27,7 +27,6 @@
 #include "../version.h"
 #include "../far/kernelBatchDispatcher.h"
 #include "../osd/cudaComputeContext.h"
 #include "../osd/vertexDescriptor.h"
@ -77,17 +76,14 @@ public:
    ///
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void Compute( CudaComputeContext const * context,
                      Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER  * vertexBuffer,
                      VARYING_BUFFER * varyingBuffer,
                      VertexBufferDescriptor const * vertexDesc=NULL,
                      VertexBufferDescriptor const * varyingDesc=NULL ){
        if (batches.empty()) return;
        bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
        unbind();
    }
@ -103,10 +99,9 @@ public:
    ///
    template<class VERTEX_BUFFER>
        void Compute(CudaComputeContext const * context,
                     Far::KernelBatchVector const & batches,
                     VERTEX_BUFFER *vertexBuffer) {
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
    }
    /// Waits until all running subdivision kernels finish.
@ -114,10 +109,7 @@ public:
 protected:
-    friend class Far::KernelBatchDispatcher;
+    void ApplyStencilTableKernel(ComputeContext const *context) const;
    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
        ComputeContext const *context) const;
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void bind( VERTEX_BUFFER * vertexBuffer,
--- a/opensubdiv/osd/d3d11ComputeContext.cpp
+++ b/opensubdiv/osd/d3d11ComputeContext.cpp
@ -114,17 +114,26 @@ public:
    D3D11StencilTables(Far::StencilTables const & stencilTables,
        ID3D11DeviceContext *deviceContext) {
-        // convert unsigned char sizes buffer to ints (HLSL does not have uint8 type)
+        _numStencils = stencilTables.GetNumStencils();
-        std::vector<int> const sizes(stencilTables.GetSizes().begin(),
+        if (_numStencils > 0) {
-            stencilTables.GetSizes().end());
+            // convert unsigned char sizes buffer to ints
            // (HLSL does not have uint8 type)
            std::vector<int> const sizes(stencilTables.GetSizes().begin(),
                                         stencilTables.GetSizes().end());
-        _sizes.initialize(sizes, DXGI_FORMAT_R32_SINT, deviceContext);
+            _sizes.initialize(sizes,
-
+                              DXGI_FORMAT_R32_SINT,
-        _offsets.initialize(stencilTables.GetOffsets(), DXGI_FORMAT_R32_SINT, deviceContext);
+                              deviceContext);
-
+            _offsets.initialize(stencilTables.GetOffsets(),
-        _indices.initialize(stencilTables.GetControlIndices(), DXGI_FORMAT_R32_SINT, deviceContext);
+                                DXGI_FORMAT_R32_SINT,
-
+                                deviceContext);
-        _weights.initialize(stencilTables.GetWeights(), DXGI_FORMAT_R32_FLOAT, deviceContext);
+            _indices.initialize(stencilTables.GetControlIndices(),
                                DXGI_FORMAT_R32_SINT,
                                deviceContext);
            _weights.initialize(stencilTables.GetWeights(),
                                DXGI_FORMAT_R32_FLOAT,
                                deviceContext);
        }
    }
    bool IsValid() const {
@ -148,6 +157,10 @@ public:
        return _weights;
    }
    int GetNumStencils() const {
        return _numStencils;
    }
    void Bind(ID3D11DeviceContext * deviceContext) const {
        ID3D11ShaderResourceView *SRViews[] = {
            _sizes.srv,
@ -170,6 +183,8 @@ private:
               _offsets,
               _indices,
               _weights;
    int _numStencils;
 };
 // ----------------------------------------------------------------------------
@ -217,6 +232,16 @@ D3D11ComputeContext::HasVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
 int
 D3D11ComputeContext::GetNumStencilsInVertexStencilTables() const {
    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : 0;
 }
 int
 D3D11ComputeContext::GetNumStencilsInVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : 0;
 }
 // ----------------------------------------------------------------------------
 void
--- a/opensubdiv/osd/d3d11ComputeContext.h
+++ b/opensubdiv/osd/d3d11ComputeContext.h
@ -78,6 +78,12 @@ public:
        return _numControlVertices;
    }
    /// Returns the number of stencils in vertex stencil table
    int GetNumStencilsInVertexStencilTables() const;
    /// Returns the number of stencils in varying stencil table
    int GetNumStencilsInVaryingStencilTables() const;
    /// Binds D3D11 buffers containing stencils for 'vertex' interpolation
    ///
    /// @param deviceContext         The D3D device
--- a/opensubdiv/osd/d3d11ComputeController.cpp
+++ b/opensubdiv/osd/d3d11ComputeController.cpp
@ -155,11 +155,11 @@ public:
    }
    void ApplyStencilTableKernel(ID3D11DeviceContext *deviceContext,
-        Far::KernelBatch const &batch, int offset, int numCVs) {
+                                 int offset, int numCVs, int start, int end) {
        KernelUniformArgs args;
-        args.uniformStart = batch.start;
+        args.uniformStart = start;
-        args.uniformEnd = batch.end;
+        args.uniformEnd = end;
        args.uniformOffset = offset;
        args.uniformNumCVs = numCVs;
@ -299,7 +299,7 @@ D3D11ComputeController::unbindBuffer() {
 void
 D3D11ComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, D3D11ComputeContext const *context) const {
+    D3D11ComputeContext const *context, int numStencils) const {
    assert(context);
@ -307,8 +307,12 @@ D3D11ComputeController::ApplyStencilTableKernel(
    D3D11ComputeController::KernelBundle * bundle =
        const_cast<D3D11ComputeController::KernelBundle *>(_currentBindState.kernelBundle);
-    bundle->ApplyStencilTableKernel(_deviceContext,
+    bundle->ApplyStencilTableKernel(
-        batch, _currentBindState.desc.offset, context->GetNumControlVertices());
+        _deviceContext,
        _currentBindState.desc.offset,
        context->GetNumControlVertices(),
        0,
        numStencils);
 }
--- a/opensubdiv/osd/d3d11ComputeController.h
+++ b/opensubdiv/osd/d3d11ComputeController.h
@ -27,7 +27,6 @@
 #include "../version.h"
 #include "../far/kernelBatchDispatcher.h"
 #include "../osd/d3d11ComputeContext.h"
 #include "../osd/vertexDescriptor.h"
@ -69,9 +68,6 @@ public:
    ///
    /// @param  context       The D3D11Context to apply refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@ -86,20 +82,18 @@ public:
    ///
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void Compute( D3D11ComputeContext const * context,
                      Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER  * vertexBuffer,
                      VARYING_BUFFER * varyingBuffer,
                      VertexBufferDescriptor const * vertexDesc=NULL,
                      VertexBufferDescriptor const * varyingDesc=NULL ){
        if (batches.empty()) return;
        if (vertexBuffer) {
            bind(vertexBuffer, vertexDesc);
            context->BindVertexStencilTables(_deviceContext);
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
                context, context->GetNumStencilsInVertexStencilTables());
        }
        if (varyingBuffer) {
@ -107,7 +101,8 @@ public:
            context->BindVaryingStencilTables(_deviceContext);
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
                context, context->GetNumStencilsInVaryingStencilTables());
        }
        context->UnbindStencilTables(_deviceContext);
@ -119,17 +114,13 @@ public:
    ///
    /// @param  context       The D3D11Context to apply refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    template<class VERTEX_BUFFER>
        void Compute(D3D11ComputeContext const * context,
                     Far::KernelBatchVector const & batches,
                     VERTEX_BUFFER *vertexBuffer) {
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
    }
    /// Waits until all running subdivision kernels finish.
@ -137,10 +128,8 @@ public:
 protected:
-    friend class Far::KernelBatchDispatcher;
+    void ApplyStencilTableKernel(ComputeContext const *context,
-
+                                 int numStencils) const;
    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
        ComputeContext const *context) const;
    template<class BUFFER>
        void bind( BUFFER * buffer,
--- a/opensubdiv/osd/d3d11Mesh.h
+++ b/opensubdiv/osd/d3d11Mesh.h
@ -78,7 +78,6 @@ public:
    Mesh(ComputeController * computeController,
            Far::TopologyRefiner * refiner,
            Far::PatchTables * patchTables,
            Far::KernelBatchVector const & kernelBatches,
            VertexBuffer * vertexBuffer,
            VertexBuffer * varyingBuffer,
            ComputeContext * computeContext,
@ -87,7 +86,6 @@ public:
            _refiner(refiner),
            _patchTables(patchTables),
            _kernelBatches(kernelBatches),
            _vertexBuffer(vertexBuffer),
            _varyingBuffer(varyingBuffer),
            _computeContext(computeContext),
@ -119,13 +117,14 @@ public:
        _varyingBuffer->UpdateData(varyingData, startVertex, numVerts, _d3d11DeviceContext);
    }
    virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
    }
    virtual void Refine(VertexBufferDescriptor const *vertexDesc,
                        VertexBufferDescriptor const *varyingDesc,
                        bool interleaved) {
-        _computeController->Compute(_computeContext, _kernelBatches,
+        _computeController->Compute(_computeContext,
-                                    _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer),
+                                    _vertexBuffer,
                                    (interleaved ? _vertexBuffer : _varyingBuffer),
                                    vertexDesc, varyingDesc);
    }
    virtual void Synchronize() {
@ -176,8 +175,6 @@ private:
        if (numVertexElements>0) {
            vertexStencils = Far::StencilTablesFactory::Create(*_refiner, options);
            _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
        }
        if (numVaryingElements>0) {
@ -235,7 +232,6 @@ private:
    Far::TopologyRefiner * _refiner;
    Far::PatchTables * _patchTables;
    Far::KernelBatchVector _kernelBatches;
    VertexBuffer *_vertexBuffer;
    VertexBuffer *_varyingBuffer;
@ -286,7 +282,6 @@ public:
    Mesh(ComputeController * computeController,
            Far::TopologyRefiner * refiner,
            Far::PatchTables * patchTables,
            Far::KernelBatchVector const & kernelBatches,
            VertexBuffer * vertexBuffer,
            VertexBuffer * varyingBuffer,
            ComputeContext * computeContext,
@ -295,7 +290,6 @@ public:
            _refiner(refiner),
            _patchTables(patchTables),
            _kernelBatches(kernelBatches),
            _vertexBuffer(vertexBuffer),
            _varyingBuffer(varyingBuffer),
            _computeContext(computeContext),
@ -324,12 +318,12 @@ public:
        _varyingBuffer->UpdateData(varyingData, startVertex, numVerts, _d3d11DeviceContext);
    }
    virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
    }
    virtual void Refine(VertexBufferDescriptor const *vertexDesc,
                        VertexBufferDescriptor const *varyingDesc,
                        bool interleaved) {
-        _computeController->Compute(_computeContext, _kernelBatches,
+        _computeController->Compute(_computeContext,
                                    _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer),
                                    vertexDesc, varyingDesc);
    }
@ -382,8 +376,6 @@ private:
        if (numVertexElements>0) {
            vertexStencils = Far::StencilTablesFactory::Create(*_refiner, options);
            _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
        }
        if (numVaryingElements>0) {
@ -441,7 +433,6 @@ private:
    Far::TopologyRefiner * _refiner;
    Far::PatchTables * _patchTables;
    Far::KernelBatchVector _kernelBatches;
    VertexBuffer *_vertexBuffer;
    VertexBuffer *_varyingBuffer;
--- a/opensubdiv/osd/glMesh.h
+++ b/opensubdiv/osd/glMesh.h
@ -98,7 +98,6 @@ public:
    Mesh(ComputeController * computeController,
            Far::TopologyRefiner * refiner,
            Far::PatchTables * patchTables,
            Far::KernelBatchVector const & kernelBatches,
            VertexBuffer * vertexBuffer,
            VertexBuffer * varyingBuffer,
            ComputeContext * computeContext,
@ -106,7 +105,6 @@ public:
            _refiner(refiner),
            _patchTables(patchTables),
            _kernelBatches(kernelBatches),
            _vertexBuffer(vertexBuffer),
            _varyingBuffer(varyingBuffer),
            _computeContext(computeContext),
@ -140,14 +138,15 @@ public:
    }
    virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
    }
    virtual void Refine(VertexBufferDescriptor const * vertexDesc,
                        VertexBufferDescriptor const * varyingDesc,
                        bool interleaved) {
-        _computeController->Compute(_computeContext, _kernelBatches,
+        _computeController->Compute(_computeContext,
-                                    _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer),
+                                    _vertexBuffer,
                                    (interleaved ? _vertexBuffer : _varyingBuffer),
                                    vertexDesc, varyingDesc);
    }
@ -255,8 +254,6 @@ private:
            varyingStencils = concatVaryingStencils;
        }
        _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
        _computeContext = ComputeContext::Create(vertexStencils,
                                                 varyingStencils);
@ -279,7 +276,6 @@ private:
    Far::TopologyRefiner * _refiner;
    Far::PatchTables * _patchTables;
    Far::KernelBatchVector _kernelBatches;
    VertexBuffer *_vertexBuffer;
    VertexBuffer *_varyingBuffer;
@ -353,7 +349,6 @@ public:
    Mesh(ComputeController * computeController,
            Far::TopologyRefiner * refiner,
            Far::PatchTables * patchTables,
            Far::KernelBatchVector const & kernelBatches,
            VertexBuffer * vertexBuffer,
            VertexBuffer * varyingBuffer,
            ComputeContext * computeContext,
@ -363,7 +358,6 @@ public:
            _refiner(refiner),
            _patchTables(patchTables),
            _kernelBatches(kernelBatches),
            _vertexBuffer(vertexBuffer),
            _varyingBuffer(varyingBuffer),
            _computeContext(computeContext),
@ -395,14 +389,15 @@ public:
    }
    virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
    }
    virtual void Refine(VertexBufferDescriptor const *vertexDesc,
                        VertexBufferDescriptor const *varyingDesc,
                        bool interleaved) {
-        _computeController->Compute(_computeContext, _kernelBatches,
+        _computeController->Compute(_computeContext,
-                                    _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer),
+                                    _vertexBuffer,
                                    (interleaved ? _vertexBuffer : _varyingBuffer),
                                    vertexDesc, varyingDesc);
    }
@ -499,8 +494,6 @@ private:
            Far::StencilTables const *concatStencils =
                Far::StencilTablesFactory::Create(2, inStencils);
            _kernelBatches.push_back(Far::StencilTablesFactory::Create(*concatStencils));
            Far::StencilTables const *inVaryingStencils[] = {
                varyingStencils, endCapVaryingStencils
            };
@ -513,7 +506,6 @@ private:
            delete varyingStencils;
            varyingStencils = concatVaryingStencils;
        }
        _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
        _computeContext = ComputeContext::Create(_clContext,
                                                 vertexStencils,
@ -538,7 +530,6 @@ private:
    Far::TopologyRefiner * _refiner;
    Far::PatchTables * _patchTables;
    Far::KernelBatchVector _kernelBatches;
    VertexBuffer *_vertexBuffer;
    VertexBuffer *_varyingBuffer;
--- a/opensubdiv/osd/glslComputeContext.cpp
+++ b/opensubdiv/osd/glslComputeContext.cpp
@ -67,17 +67,22 @@ class GLSLComputeContext::GLSLStencilTables {
 public:
    GLSLStencilTables(Far::StencilTables const & stencilTables) {
-        _sizes = createGLSLBuffer(stencilTables.GetSizes());
+        _numStencils = stencilTables.GetNumStencils();
-        _offsets = createGLSLBuffer(stencilTables.GetOffsets());
+        if (_numStencils > 0) {
-        _indices = createGLSLBuffer(stencilTables.GetControlIndices());
+            _sizes = createGLSLBuffer(stencilTables.GetSizes());
-        _weights = createGLSLBuffer(stencilTables.GetWeights());
+            _offsets = createGLSLBuffer(stencilTables.GetOffsets());
            _indices = createGLSLBuffer(stencilTables.GetControlIndices());
            _weights = createGLSLBuffer(stencilTables.GetWeights());
        } else {
            _sizes = _offsets = _indices = _weights = 0;
        }
    }
    ~GLSLStencilTables() {
-        glDeleteBuffers(1, &_sizes);
+        if (_sizes)   glDeleteBuffers(1, &_sizes);
-        glDeleteBuffers(1, &_offsets);
+        if (_offsets) glDeleteBuffers(1, &_offsets);
-        glDeleteBuffers(1, &_weights);
+        if (_weights) glDeleteBuffers(1, &_weights);
-        glDeleteBuffers(1, &_indices);
+        if (_indices) glDeleteBuffers(1, &_indices);
    }
    bool IsValid() const {
@ -100,6 +105,10 @@ public:
        return _weights;
    }
    int GetNumStencils() const {
        return _numStencils;
    }
    void Bind() const {
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, _sizes);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, _offsets);
@ -122,6 +131,7 @@ private:
           _offsets,
           _indices,
           _weights;
    int _numStencils;
 };
 // -----------------------------------------------------------------------------
@ -130,7 +140,8 @@ GLSLComputeContext::GLSLComputeContext(
    Far::StencilTables const * vertexStencilTables,
        Far::StencilTables const * varyingStencilTables) :
            _vertexStencilTables(0), _varyingStencilTables(0),
-                _numControlVertices(0) {
+    _numControlVertices(0),
    _numStencils(0) {
    if (vertexStencilTables) {
        _vertexStencilTables = new GLSLStencilTables(*vertexStencilTables);
@ -165,8 +176,17 @@ GLSLComputeContext::HasVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
-// ----------------------------------------------------------------------------
+int
 GLSLComputeContext::GetNumStencilsInVertexStencilTables() const {
    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : false;
 }
 int
 GLSLComputeContext::GetNumStencilsInVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : false;
 }
 // ----------------------------------------------------------------------------
 void
 GLSLComputeContext::BindVertexStencilTables() const {
--- a/opensubdiv/osd/glslComputeContext.h
+++ b/opensubdiv/osd/glslComputeContext.h
@ -76,10 +76,16 @@ public:
        return _numControlVertices;
    }
-    /// Returns the Cuda buffer containing vertex-stencil stencil sizes
+    /// Returns the number of stencils in vertex stencil table
    int GetNumStencilsInVertexStencilTables() const;
    /// Returns the number of stencils in varying stencil table
    int GetNumStencilsInVaryingStencilTables() const;
    /// Returns the GL buffer containing vertex-stencil stencil sizes
    GLuint GetVertexStencilTablesSizes() const;
-    /// Returns the Cuda buffer containing vertex-stencil stencil offsets
+    /// Returns the GL buffer containing vertex-stencil stencil offsets
    GLuint GetVertexStencilTablesOffsets() const;
    /// Binds GL buffers containing stencils for 'vertex' interpolation
@ -104,6 +110,7 @@ private:
                      * _varyingStencilTables;
    int _numControlVertices;
    int _numStencils;
 };
 }  // end namespace Osd
--- a/opensubdiv/osd/glslComputeController.cpp
+++ b/opensubdiv/osd/glslComputeController.cpp
@ -24,7 +24,6 @@
 #include "../osd/glslComputeController.h"
 #include "../osd/vertexDescriptor.h"
 //#include "../osd/debug.h"
 #include "../osd/opengl.h"
 #include "../far/error.h"
@ -138,12 +137,13 @@ public:
        return true;
    }
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch, int offset, int numCVs) const {
+    void ApplyStencilTableKernel(int offset, int numCVs,
                                 int start, int end) const {
        // select stencil GLSL subroutine
        glUniformSubroutinesuiv(GL_COMPUTE_SHADER, 1, &_subStencilKernel);
-        dispatchCompute(offset, numCVs, batch.start, batch.end);
+        dispatchCompute(offset, numCVs, start, end);
    }
    struct Match {
@ -214,12 +214,20 @@ private:
 void
 GLSLComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context, int numStencils) const {
    assert(context);
    // Note: GLSLComputeContext has a state, knowing whether vertex or
    // varying stencil tables are being bound. GetNumStencils() reflects it.
    // This structure will likely be revisited.
    int start = 0;
    int end = numStencils;
    _currentBindState.kernelBundle->ApplyStencilTableKernel(
-        batch, _currentBindState.desc.offset, context->GetNumControlVertices());
+        _currentBindState.desc.offset, context->GetNumControlVertices(),
        start, end);
 }
 // ----------------------------------------------------------------------------
--- a/opensubdiv/osd/glslComputeController.h
+++ b/opensubdiv/osd/glslComputeController.h
@ -27,11 +27,11 @@
 #include "../version.h"
 #include "../far/kernelBatchDispatcher.h"
 #include "../osd/glslComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 #include <vector>
 #include <cassert>
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
@ -79,20 +79,18 @@ public:
    ///
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void Compute( GLSLComputeContext const * context,
                      Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER  * vertexBuffer,
                      VARYING_BUFFER * varyingBuffer,
                      VertexBufferDescriptor const * vertexDesc=NULL,
                      VertexBufferDescriptor const * varyingDesc=NULL ){
        if (batches.empty()) return;
        if (vertexBuffer) {
            bind(vertexBuffer, vertexDesc);
            context->BindVertexStencilTables();
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
                context, context->GetNumStencilsInVertexStencilTables());
        }
        if (varyingBuffer) {
@ -100,7 +98,8 @@ public:
            context->BindVaryingStencilTables();
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
                context, context->GetNumStencilsInVaryingStencilTables());
        }
        context->UnbindStencilTables();
@ -119,10 +118,9 @@ public:
    ///
    template<class VERTEX_BUFFER>
        void Compute(GLSLComputeContext const * context,
                     Far::KernelBatchVector const & batches,
                     VERTEX_BUFFER *vertexBuffer) {
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
    }
    /// Waits until all running subdivision kernels finish.
@ -130,10 +128,8 @@ public:
 protected:
-    friend class Far::KernelBatchDispatcher;
+    void ApplyStencilTableKernel(ComputeContext const *context,
-
+                                 int numStencils) const;
    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
        ComputeContext const *context) const;
    template<class BUFFER>
        void bind( BUFFER * buffer,
--- a/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
+++ b/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
@ -82,17 +82,22 @@ class GLSLTransformFeedbackComputeContext::GLStencilTables {
 public:
    GLStencilTables(Far::StencilTables const & stencilTables) {
-        _sizes = createGLTextureBuffer(stencilTables.GetSizes(), GL_R8UI);
+        _numStencils = stencilTables.GetNumStencils();
-        _offsets = createGLTextureBuffer(stencilTables.GetOffsets(), GL_R32I);
+        if (_numStencils > 0) {
-        _indices = createGLTextureBuffer(stencilTables.GetControlIndices(), GL_R32I);
+            _sizes = createGLTextureBuffer(stencilTables.GetSizes(), GL_R8UI);
-        _weights = createGLTextureBuffer(stencilTables.GetWeights(), GL_R32F);
+            _offsets = createGLTextureBuffer(stencilTables.GetOffsets(), GL_R32I);
            _indices = createGLTextureBuffer(stencilTables.GetControlIndices(), GL_R32I);
            _weights = createGLTextureBuffer(stencilTables.GetWeights(), GL_R32F);
        } else {
            _sizes = _offsets = _indices = _weights = 0;
        }
    }
    ~GLStencilTables() {
-        glDeleteTextures(1, &_sizes);
+        if (_sizes) glDeleteTextures(1, &_sizes);
-        glDeleteTextures(1, &_offsets);
+        if (_offsets) glDeleteTextures(1, &_offsets);
-        glDeleteTextures(1, &_weights);
+        if (_weights) glDeleteTextures(1, &_weights);
-        glDeleteTextures(1, &_indices);
+        if (_indices) glDeleteTextures(1, &_indices);
    }
    bool IsValid() const {
@ -115,12 +120,18 @@ public:
        return _weights;
    }
    int GetNumStencils() const {
        return _numStencils;
    }
 private:
    GLuint _sizes,
           _offsets,
           _indices,
           _weights;
    int _numStencils;
 };
 // -----------------------------------------------------------------------------
@ -164,8 +175,17 @@ GLSLTransformFeedbackComputeContext::HasVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
-// ----------------------------------------------------------------------------
+int
 GLSLTransformFeedbackComputeContext::GetNumStencilsInVertexStencilTables() const {
    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : 0;
 }
 int
 GLSLTransformFeedbackComputeContext::GetNumStencilsInVaryingStencilTables() const {
    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : 0;
 }
 // ----------------------------------------------------------------------------
 GLuint
 GLSLTransformFeedbackComputeContext::GetVertexStencilTablesSizes() const {
    return _vertexStencilTables ? _vertexStencilTables->GetSizes() : 0;
--- a/opensubdiv/osd/glslTransformFeedbackComputeContext.h
+++ b/opensubdiv/osd/glslTransformFeedbackComputeContext.h
@ -59,7 +59,7 @@ public:
    ///                              interpolation
    ///
    static GLSLTransformFeedbackComputeContext * Create(Far::StencilTables const * vertexStencilTables,
-                                                           Far::StencilTables const * varyingStencilTables=0);
+                                                        Far::StencilTables const * varyingStencilTables=0);
    /// Destructor
    virtual ~GLSLTransformFeedbackComputeContext();
@ -75,6 +75,12 @@ public:
        return _numControlVertices;
    }
    /// Returns the number of stencils in vertex stencil table
    int GetNumStencilsInVertexStencilTables() const;
    /// Returns the number of stencils in varying stencil table
    int GetNumStencilsInVaryingStencilTables() const;
    /// Returns the GL texture buffer containing vertex-stencil stencil sizes
    GLuint GetVertexStencilTablesSizes() const;
--- a/opensubdiv/osd/glslTransformFeedbackComputeController.cpp
+++ b/opensubdiv/osd/glslTransformFeedbackComputeController.cpp
@ -87,7 +87,7 @@ public:
        _uniformWeights(0),
        _uniformStart(0),
        _uniformEnd(0),
-        _uniformOffset(0) { }
+        _uniformPrimvarOffset(0) { }
    ~KernelBundle() {
        if (_program) {
@ -95,9 +95,8 @@ public:
        }
    }
-    void UseProgram(int primvarOffset) const {
+    void UseProgram() const {
        glUseProgram(_program);
        glUniform1i(_uniformOffset, primvarOffset);
    }
    bool Compile(VertexBufferDescriptor const & desc) {
@ -138,15 +137,19 @@ public:
            // outVertexData[2]
            // (gl_SkipComponents1)
            //
            // note that "primvarOffset" in shader is still needed to read
            // interleaved components even if gl_SkipComponents is used.
            //
            char attrName[32];
-            for (int i = 0; i < desc.offset; ++i) {
+            int primvarOffset = (desc.offset % desc.stride);
            for (int i = 0; i < primvarOffset; ++i) {
                outputs.push_back("gl_SkipComponents1");
            }
            for (int i = 0; i < desc.length; ++i) {
                snprintf(attrName, 32, "outVertexBuffer[%d]", i);
                outputs.push_back(attrName);
            }
-            for (int i = desc.offset + desc.length; i < desc.stride; ++i) {
+            for (int i = primvarOffset + desc.length; i < desc.stride; ++i) {
                outputs.push_back("gl_SkipComponents1");
            }
@ -192,7 +195,7 @@ public:
        _uniformStart   = glGetUniformLocation(_program, "batchStart");
        _uniformEnd     = glGetUniformLocation(_program, "batchEnd");
-        _uniformOffset  = glGetUniformLocation(_program, "primvarOffset");
+        _uniformPrimvarOffset = glGetUniformLocation(_program, "primvarOffset");
        OSD_DEBUG_CHECK_GL_ERROR("KernelBundle::Compile");
@ -225,14 +228,16 @@ public:
        // set batch range
        glUniform1i(_uniformStart,  start);
        glUniform1i(_uniformEnd,    end);
-        glUniform1i(_uniformOffset, offset);
+        glUniform1i(_uniformPrimvarOffset, offset);
        int count = end - start,
            stride = _desc.stride*sizeof(float);
        // note: offset includes both "batching offset" and "primvar offset".
        // 
        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
                          0, primvarBuffer,
-                          (start + numCVs)*stride + offset*sizeof(float),
+                          (start + numCVs)*stride + (offset - offset%stride)*sizeof(float),
                          count*stride);
        glBeginTransformFeedback(GL_POINTS);
@ -246,13 +251,14 @@ public:
        //OSD_DEBUG_CHECK_GL_ERROR("TransformPrimvarBuffer\n");
    }
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
+    void ApplyStencilTableKernel(GLuint primvarBuffer,
-        GLuint primvarBuffer, int offset, int numCVs) const {
+                                 int offset, int numCVs,
                                 int start, int end) const {
        glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subStencilKernel);
        TransformPrimvarBuffer(primvarBuffer,
-            offset, numCVs, batch.start, batch.end);
+                               offset, numCVs, start, end);
    }
    struct Match {
@ -283,7 +289,7 @@ private:
          _uniformStart,     // batch
          _uniformEnd,
-          _uniformOffset;    // GL primvar buffer descriptor
+          _uniformPrimvarOffset;
    VertexBufferDescriptor _desc; // primvar buffer descriptor
 };
@ -294,7 +300,7 @@ GLSLTransformFeedbackComputeController::bindBufferAndProgram(
    GLuint & feedbackTexture) {
    glEnable(GL_RASTERIZER_DISCARD);
-    _currentBindState.kernelBundle->UseProgram(/*primvarOffset*/0);
+    _currentBindState.kernelBundle->UseProgram();
    if (not feedbackTexture) {
        glGenTextures(1, &feedbackTexture);
@ -385,14 +391,18 @@ GLSLTransformFeedbackComputeController::getKernel(
 void
 GLSLTransformFeedbackComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch,
+    GLSLTransformFeedbackComputeContext const *context, int numStencils) const {
        GLSLTransformFeedbackComputeContext const *context) const {
    assert(context);
-    _currentBindState.kernelBundle->ApplyStencilTableKernel(batch,
+    int start = 0;
    int end = numStencils;
    _currentBindState.kernelBundle->ApplyStencilTableKernel(
        _currentBindState.buffer, _currentBindState.desc.offset,
-            context->GetNumControlVertices());
+        context->GetNumControlVertices(),
        start,
        end);
 }
--- a/opensubdiv/osd/glslTransformFeedbackComputeController.h
+++ b/opensubdiv/osd/glslTransformFeedbackComputeController.h
@ -27,11 +27,11 @@
 #include "../version.h"
 #include "../far/kernelBatchDispatcher.h"
 #include "../osd/glslTransformFeedbackComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 #include <vector>
 #include <cassert>
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
@ -67,9 +67,6 @@ public:
    /// @param  context       The GLSLTransformFeedbackComputeContext to apply
    ///                       refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@ -84,21 +81,19 @@ public:
    ///
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void Compute( GLSLTransformFeedbackComputeContext const * context,
                      Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER  * vertexBuffer,
                      VARYING_BUFFER * varyingBuffer,
                      VertexBufferDescriptor const * vertexDesc=NULL,
                      VertexBufferDescriptor const * varyingDesc=NULL ){
        if (batches.empty()) return;
        if (vertexBuffer) {
            bind(vertexBuffer, vertexDesc, _vertexTexture);
            bindContextStencilTables(context, false);
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
                context, context->GetNumStencilsInVertexStencilTables());
        }
        if (varyingBuffer) {
@ -107,7 +102,8 @@ public:
            bindContextStencilTables(context, true);
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
                context, context->GetNumStencilsInVaryingStencilTables());
        }
        unbind();
    }
@ -117,17 +113,13 @@ public:
    /// @param  context       The GLSLTransformFeedbackComputeContext to apply
    ///                       refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    template<class VERTEX_BUFFER>
        void Compute(GLSLTransformFeedbackComputeContext const * context,
                     Far::KernelBatchVector const & batches,
                     VERTEX_BUFFER *vertexBuffer) {
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
    }
    /// Waits until all running subdivision kernels finish.
@ -135,10 +127,8 @@ public:
 protected:
-    friend class Far::KernelBatchDispatcher;
+    void ApplyStencilTableKernel(ComputeContext const *context,
-
+                                 int numStencils) const;
    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
        ComputeContext const *context) const;
    template<class BUFFER>
        void bind( BUFFER * buffer, VertexBufferDescriptor const * desc,
--- a/opensubdiv/osd/mesh.h
+++ b/opensubdiv/osd/mesh.h
@ -27,7 +27,6 @@
 #include "../version.h"
 #include "../far/kernelBatch.h"
 #include "../far/topologyRefiner.h"
 #include "../far/patchTablesFactory.h"
 #include "../far/stencilTables.h"
@ -154,7 +153,6 @@ public:
    Mesh(ComputeController * computeController,
            Far::TopologyRefiner * refiner,
            Far::PatchTables * patchTables,
            Far::KernelBatchVector const & kernelBatches,
            VertexBuffer * vertexBuffer,
            VertexBuffer * varyingBuffer,
            ComputeContext * computeContext,
@ -162,7 +160,6 @@ public:
            _refiner(refiner),
            _patchTables(patchTables),
            _kernelBatches(kernelBatches),
            _vertexBuffer(vertexBuffer),
            _varyingBuffer(varyingBuffer),
            _computeContext(computeContext),
@ -192,11 +189,11 @@ public:
    }
    virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
    }
    virtual void Refine(VertexBufferDescriptor const *vertexDesc, VertexBufferDescriptor const *varyingDesc) {
-        _computeController->Refine(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer, vertexDesc, varyingDesc);
+        _computeController->Refine(_computeContext, _vertexBuffer, _varyingBuffer, vertexDesc, varyingDesc);
    }
    virtual void Synchronize() {
@ -237,8 +234,6 @@ private:
        if (numVertexElements>0) {
            vertexStencils = Far::StencilTablesFactory::Create(*_refiner, options);
            _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
        }
        if (numVaryingElements>0) {
@ -291,7 +286,6 @@ private:
    Far::TopologyRefiner * _refiner;
    Far::PatchTables * _patchTables;
    Far::KernelBatchVector _kernelBatches;
    VertexBuffer * _vertexBuffer,
                 * _varyingBuffer;
--- a/opensubdiv/osd/ompComputeController.cpp
+++ b/opensubdiv/osd/ompComputeController.cpp
@ -40,7 +40,7 @@ OmpComputeController::OmpComputeController(int numThreads) {
 void
 OmpComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context) const {
    assert(context);
@ -48,6 +48,9 @@ OmpComputeController::ApplyStencilTableKernel(
    if (vertexStencils and _currentBindState.vertexBuffer) {
        int start = 0;
        int end = vertexStencils->GetNumStencils();
        VertexBufferDescriptor const & desc = _currentBindState.vertexDesc;
        float const * srcBuffer = _currentBindState.vertexBuffer + desc.offset;
@ -55,20 +58,25 @@ OmpComputeController::ApplyStencilTableKernel(
        float * destBuffer = _currentBindState.vertexBuffer + desc.offset +
            vertexStencils->GetNumControlVertices() * desc.stride;
-        OmpComputeStencils(_currentBindState.vertexDesc,
+        if (end > start) {
-                              srcBuffer, destBuffer,
+            OmpComputeStencils(_currentBindState.vertexDesc,
-                              &vertexStencils->GetSizes().at(0),
+                               srcBuffer, destBuffer,
-                              &vertexStencils->GetOffsets().at(0),
+                               &vertexStencils->GetSizes().at(0),
-                              &vertexStencils->GetControlIndices().at(0),
+                               &vertexStencils->GetOffsets().at(0),
-                              &vertexStencils->GetWeights().at(0),
+                               &vertexStencils->GetControlIndices().at(0),
-                              batch.start,
+                               &vertexStencils->GetWeights().at(0),
-                              batch.end);
+                               start,
                               end);
        }
    }
    Far::StencilTables const * varyingStencils = context->GetVaryingStencilTables();
    if (varyingStencils and _currentBindState.varyingBuffer) {
        int start = 0;
        int end = varyingStencils->GetNumStencils();
        VertexBufferDescriptor const & desc = _currentBindState.varyingDesc;
        float const * srcBuffer = _currentBindState.varyingBuffer + desc.offset;
@ -76,14 +84,16 @@ OmpComputeController::ApplyStencilTableKernel(
        float * destBuffer = _currentBindState.varyingBuffer + desc.offset +
            varyingStencils->GetNumControlVertices() * desc.stride;
-        OmpComputeStencils(_currentBindState.varyingDesc,
+        if (end > start) {
-                              srcBuffer, destBuffer,
+            OmpComputeStencils(_currentBindState.varyingDesc,
-                              &varyingStencils->GetSizes().at(0),
+                               srcBuffer, destBuffer,
-                              &varyingStencils->GetOffsets().at(0),
+                               &varyingStencils->GetSizes().at(0),
-                              &varyingStencils->GetControlIndices().at(0),
+                               &varyingStencils->GetOffsets().at(0),
-                              &varyingStencils->GetWeights().at(0),
+                               &varyingStencils->GetControlIndices().at(0),
-                              batch.start,
+                               &varyingStencils->GetWeights().at(0),
-                              batch.end);
+                               start,
                               end);
        }
    }
 }
--- a/opensubdiv/osd/ompComputeController.h
+++ b/opensubdiv/osd/ompComputeController.h
@ -27,7 +27,6 @@
 #include "../version.h"
 #include "../far/kernelBatchDispatcher.h"
 #include "../osd/cpuComputeContext.h"
 #include "../osd/vertexDescriptor.h"
@ -66,9 +65,6 @@ public:
    ///
    /// @param  context       The CpuContext to apply refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@ -83,19 +79,16 @@ public:
    ///
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void Compute( CpuComputeContext const * context,
                      Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER  * vertexBuffer,
                      VARYING_BUFFER * varyingBuffer,
                      VertexBufferDescriptor const * vertexDesc=NULL,
                      VertexBufferDescriptor const * varyingDesc=NULL ){
        if (batches.empty()) return;
        omp_set_num_threads(_numThreads);
        bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
        unbind();
    }
@ -104,17 +97,13 @@ public:
    ///
    /// @param  context       The CpuContext to apply refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    template<class VERTEX_BUFFER>
        void Compute(CpuComputeContext const * context,
                     Far::KernelBatchVector const & batches,
                     VERTEX_BUFFER *vertexBuffer) {
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
    }
    /// Waits until all running subdivision kernels finish.
@ -122,10 +111,7 @@ public:
 protected:
-    friend class Far::KernelBatchDispatcher;
+    void ApplyStencilTableKernel(ComputeContext const *context) const;
    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
        ComputeContext const *context) const;
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void bind( VERTEX_BUFFER * vertexBuffer,
--- a/opensubdiv/osd/tbbComputeController.cpp
+++ b/opensubdiv/osd/tbbComputeController.cpp
@ -49,7 +49,7 @@ TbbComputeController::TbbComputeController(int numThreads)
 void
 TbbComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context) const {
    assert(context);
@ -57,6 +57,9 @@ TbbComputeController::ApplyStencilTableKernel(
    if (vertexStencils and _currentBindState.vertexBuffer) {
        int start = 0;
        int end = vertexStencils->GetNumStencils();
        VertexBufferDescriptor const & desc = _currentBindState.vertexDesc;
        float const * srcBuffer = _currentBindState.vertexBuffer + desc.offset;
@ -64,20 +67,25 @@ TbbComputeController::ApplyStencilTableKernel(
        float * destBuffer = _currentBindState.vertexBuffer + desc.offset +
            vertexStencils->GetNumControlVertices() * desc.stride;
-        TbbComputeStencils(_currentBindState.vertexDesc,
+        if (end > start) {
-                              srcBuffer, destBuffer,
+            TbbComputeStencils(_currentBindState.vertexDesc,
-                              &vertexStencils->GetSizes().at(0),
+                               srcBuffer, destBuffer,
-                              &vertexStencils->GetOffsets().at(0),
+                               &vertexStencils->GetSizes().at(0),
-                              &vertexStencils->GetControlIndices().at(0),
+                               &vertexStencils->GetOffsets().at(0),
-                              &vertexStencils->GetWeights().at(0),
+                               &vertexStencils->GetControlIndices().at(0),
-                              batch.start,
+                               &vertexStencils->GetWeights().at(0),
-                              batch.end);
+                               start,
                               end);
        }
    }
    Far::StencilTables const * varyingStencils = context->GetVaryingStencilTables();
    if (varyingStencils and _currentBindState.varyingBuffer) {
        int start = 0;
        int end = varyingStencils->GetNumStencils();
        VertexBufferDescriptor const & desc = _currentBindState.varyingDesc;
        float const * srcBuffer = _currentBindState.varyingBuffer + desc.offset;
@ -85,14 +93,16 @@ TbbComputeController::ApplyStencilTableKernel(
        float * destBuffer = _currentBindState.varyingBuffer + desc.offset +
            varyingStencils->GetNumControlVertices() * desc.stride;
-        TbbComputeStencils(_currentBindState.varyingDesc,
+        if (end > start) {
-                              srcBuffer, destBuffer,
+            TbbComputeStencils(_currentBindState.varyingDesc,
-                              &varyingStencils->GetSizes().at(0),
+                               srcBuffer, destBuffer,
-                              &varyingStencils->GetOffsets().at(0),
+                               &varyingStencils->GetSizes().at(0),
-                              &varyingStencils->GetControlIndices().at(0),
+                               &varyingStencils->GetOffsets().at(0),
-                              &varyingStencils->GetWeights().at(0),
+                               &varyingStencils->GetControlIndices().at(0),
-                              batch.start,
+                               &varyingStencils->GetWeights().at(0),
-                              batch.end);
+                               start,
                               end);
        }
    }
 }
--- a/opensubdiv/osd/tbbComputeController.h
+++ b/opensubdiv/osd/tbbComputeController.h
@ -27,7 +27,6 @@
 #include "../version.h"
 #include "../far/kernelBatchDispatcher.h"
 #include "../osd/cpuComputeContext.h"
 #include "../osd/vertexDescriptor.h"
@ -62,9 +61,6 @@ public:
    ///
    /// @param  context       The CpuContext to apply refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@ -79,17 +75,14 @@ public:
    ///
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void Compute( CpuComputeContext const * context,
                      Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER  * vertexBuffer,
                      VARYING_BUFFER * varyingBuffer,
                      VertexBufferDescriptor const * vertexDesc=NULL,
                      VertexBufferDescriptor const * varyingDesc=NULL ){
        if (batches.empty()) return;
        bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
        unbind();
    }
@ -98,17 +91,13 @@ public:
    ///
    /// @param  context       The CpuContext to apply refinement operations to
    ///
    /// @param  batches       Vector of batches of vertices organized by operative
    ///                       kernel
    ///
    /// @param  vertexBuffer  Vertex-interpolated data buffer
    ///
    template<class VERTEX_BUFFER>
        void Compute(CpuComputeContext const * context,
                     Far::KernelBatchVector const & batches,
                     VERTEX_BUFFER *vertexBuffer) {
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
    }
    /// Waits until all running subdivision kernels finish.
@ -116,10 +105,7 @@ public:
 protected:
-    friend class Far::KernelBatchDispatcher;
+    void ApplyStencilTableKernel(ComputeContext const *context) const;
    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
        ComputeContext const *context) const;
    template<class VERTEX_BUFFER, class VARYING_BUFFER>
        void bind( VERTEX_BUFFER * vertexBuffer,
--- a/regression/osd_regression/main.cpp
+++ b/regression/osd_regression/main.cpp
@ -303,11 +303,7 @@ checkMeshCPU( FarTopologyRefiner *refiner,
    vb->UpdateData( coarseverts[0].GetPos(), 0, (int)coarseverts.size() );
-    Far::KernelBatchVector kernelBatches;
+    controller->Compute( context, vb );
    kernelBatches.push_back(
        Far::StencilTablesFactory::Create(*vertexStencils));
    controller->Compute( context, kernelBatches, vb );
    int result = checkVertexBuffer(*refiner, refmesh, vb->BindCpuBuffer(), 
        vb->GetNumElements());
@ -341,11 +337,7 @@ checkMeshCPUGL(FarTopologyRefiner *refiner,
    vb->UpdateData( coarseverts[0].GetPos(), 0, (int)coarseverts.size() );
-    Far::KernelBatchVector kernelBatches;
+    controller->Compute( context, vb );
    kernelBatches.push_back(
        Far::StencilTablesFactory::Create(*vertexStencils));
    controller->Compute( context, kernelBatches, vb );
    int result = checkVertexBuffer(*refiner, refmesh, 
        vb->BindCpuBuffer(), vb->GetNumElements());
@ -383,11 +375,7 @@ checkMeshCL( FarTopologyRefiner *refiner,
    vb->UpdateData( coarseverts[0].GetPos(), 0, (int)coarseverts.size(),
        g_clQueue );
-    Far::KernelBatchVector kernelBatches;
+    controller->Compute( context, vb );
    kernelBatches.push_back(
        Far::StencilTablesFactory::Create(*vertexStencils));
    controller->Compute( context, kernelBatches, vb );
    // read data back from CL buffer
    size_t dataSize = vb->GetNumVertices() * vb->GetNumElements();
--- a/tutorials/osd/tutorial_0/osd_tutorial_0.cpp
+++ b/tutorials/osd/tutorial_0/osd_tutorial_0.cpp
@ -75,8 +75,6 @@ int main(int, char **) {
    Osd::CpuComputeContext * context=0;
    Far::KernelBatchVector batches;
    //
    // Setup phase
    //
@ -92,9 +90,6 @@ int main(int, char **) {
        Far::StencilTables const * stencilTables =
            Far::StencilTablesFactory::Create(*refiner, options);
        // We need a kernel batch to dispatch Compute launches
        batches.push_back(Far::StencilTablesFactory::Create(*stencilTables));
        // Create an Osd Compute Context from the stencil tables
        context = Osd::CpuComputeContext::Create(stencilTables);
@ -122,7 +117,7 @@ int main(int, char **) {
        vbuffer->UpdateData(g_verts, 0, nCoarseVerts);
        // Launch the computation
-        controller.Compute(context, batches, vbuffer);
+        controller.Compute(context, vbuffer);
    }
    { // Visualization with Maya : print a MEL script that generates particles