From 8da827336d9d683ce6b1c907d3b1e5e35fe8ee17 Mon Sep 17 00:00:00 2001
From: Takahito Tejima <takahito@pixar.com>
Date: Thu, 9 Apr 2015 11:16:54 -0700
Subject: [PATCH 1/4] Removes FarKernelBatch. This is the first step to tease
 off Osd compute controller/contexts from Far API. Currently FarStencilTable
 only creates a kernelbatch for the entire range, so we can use [0,
 numStencils) for all cases instead of KernelBatch. This might not be true if
 we apply non-factorized level-wise stencils, then we'll add another modular
 utility to serve those cases.

---
 examples/glEvalLimit/glEvalLimit.cpp          |   7 +-
 examples/glShareTopology/glShareTopology.cpp  |   6 -
 opensubdiv/far/CMakeLists.txt                 |   2 -
 opensubdiv/far/kernelBatch.h                  |  82 ---------
 opensubdiv/far/kernelBatchDispatcher.h        | 164 ------------------
 opensubdiv/far/stencilTablesFactory.cpp       |   9 -
 opensubdiv/far/stencilTablesFactory.h         |   8 -
 opensubdiv/osd/clComputeContext.cpp           |  17 +-
 opensubdiv/osd/clComputeContext.h             |   6 +
 opensubdiv/osd/clComputeController.cpp        |  24 ++-
 opensubdiv/osd/clComputeController.h          |  17 +-
 opensubdiv/osd/cpuComputeContext.h            |   2 +-
 opensubdiv/osd/cpuComputeController.cpp       |  36 ++--
 opensubdiv/osd/cpuComputeController.h         |  20 +--
 opensubdiv/osd/cudaComputeContext.cpp         |  16 ++
 opensubdiv/osd/cudaComputeContext.h           |   6 +
 opensubdiv/osd/cudaComputeController.cpp      |  17 +-
 opensubdiv/osd/cudaComputeController.h        |  14 +-
 opensubdiv/osd/d3d11ComputeContext.cpp        |  18 ++
 opensubdiv/osd/d3d11ComputeContext.h          |   6 +
 opensubdiv/osd/d3d11ComputeController.cpp     |  16 +-
 opensubdiv/osd/d3d11ComputeController.h       |  25 +--
 opensubdiv/osd/d3d11Mesh.h                    |  21 +--
 opensubdiv/osd/glMesh.h                       |  25 +--
 opensubdiv/osd/glslComputeContext.cpp         |  20 ++-
 opensubdiv/osd/glslComputeContext.h           |  11 +-
 opensubdiv/osd/glslComputeController.cpp      |  18 +-
 opensubdiv/osd/glslComputeController.h        |  21 +--
 .../glslTransformFeedbackComputeContext.cpp   |  18 +-
 .../osd/glslTransformFeedbackComputeContext.h |   8 +-
 ...glslTransformFeedbackComputeController.cpp |  44 +++--
 .../glslTransformFeedbackComputeController.h  |  25 +--
 opensubdiv/osd/mesh.h                         |  10 +-
 opensubdiv/osd/ompComputeController.cpp       |  36 ++--
 opensubdiv/osd/ompComputeController.h         |  20 +--
 opensubdiv/osd/tbbComputeController.cpp       |  36 ++--
 opensubdiv/osd/tbbComputeController.h         |  20 +--
 regression/osd_regression/main.cpp            |  18 +-
 tutorials/osd/tutorial_0/osd_tutorial_0.cpp   |   7 +-
 39 files changed, 318 insertions(+), 558 deletions(-)
 delete mode 100644 opensubdiv/far/kernelBatch.h
 delete mode 100644 opensubdiv/far/kernelBatchDispatcher.h

diff --git a/examples/glEvalLimit/glEvalLimit.cpp b/examples/glEvalLimit/glEvalLimit.cpp
index d6ad53a0..d53cfb8b 100644
--- a/examples/glEvalLimit/glEvalLimit.cpp
+++ b/examples/glEvalLimit/glEvalLimit.cpp
@@ -197,8 +197,6 @@ Osd::CpuComputeContext * g_computeCtx = 0;
 
 Osd::CpuComputeController g_computeCtrl;
 
-Far::KernelBatchVector  g_kernelBatches;
-
 Osd::CpuEvalLimitContext * g_evalCtx = 0;
 
 Osd::CpuEvalLimitController g_evalCtrl;
@@ -242,7 +240,7 @@ updateGeom() {
 
     g_vertexData->UpdateData( &g_positions[0], 0, nverts);
 
-    g_computeCtrl.Compute(g_computeCtx, g_kernelBatches, g_vertexData, g_varyingData);
+    g_computeCtrl.Compute(g_computeCtx, g_vertexData, g_varyingData);
 
     s.Stop();
     g_computeTime = float(s.GetElapsed() * 1000.0f);
@@ -417,9 +415,6 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
         delete g_evalCtx;
         g_evalCtx = Osd::CpuEvalLimitContext::Create(*patchTables);
 
-        g_kernelBatches.clear();
-        g_kernelBatches.push_back(Far::StencilTablesFactory::Create(*concatStencils));
- 
     }
 
     {   // Create vertex primvar buffer for the CVs
diff --git a/examples/glShareTopology/glShareTopology.cpp b/examples/glShareTopology/glShareTopology.cpp
index aa751b90..ff2b5943 100644
--- a/examples/glShareTopology/glShareTopology.cpp
+++ b/examples/glShareTopology/glShareTopology.cpp
@@ -295,8 +295,6 @@ public:
 
         _computeContext = ComputeContext::Create(vertexStencils, varyingStencils);
 
-        _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
-
         _numVertices = vertexStencils->GetNumStencils() +
             vertexStencils->GetNumControlVertices();
     }
@@ -328,7 +326,6 @@ public:
                 globalVaryingDesc.stride);
 
             _computeController.Compute(_computeContext,
-                                      _kernelBatches,
                                       typedInstance->GetVertexBuffer(),
                                       typedInstance->GetVaryingBuffer(),
                                       &vertexDesc,
@@ -364,7 +361,6 @@ public:
 private:
     COMPUTE_CONTROLLER _computeController;
     ComputeContext *_computeContext;
-    Far::KernelBatchVector _kernelBatches;
 };
 
 // ---------------------------------------------------------------------------
@@ -394,8 +390,6 @@ Topology(Far::PatchTables const * patchTables,
 
     _computeContext = ComputeContext::Create(g_clContext, vertexStencils, varyingStencils);
 
-    _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
-
     _numVertices = vertexStencils->GetNumStencils() +
         vertexStencils->GetNumControlVertices();
 }
diff --git a/opensubdiv/far/CMakeLists.txt b/opensubdiv/far/CMakeLists.txt
index df902186..258ec261 100644
--- a/opensubdiv/far/CMakeLists.txt
+++ b/opensubdiv/far/CMakeLists.txt
@@ -45,8 +45,6 @@ set(PUBLIC_HEADER_FILES
     error.h
     gregoryBasis.h
     interpolate.h
-    kernelBatch.h
-    kernelBatchDispatcher.h
     patchDescriptor.h
     patchParam.h
     patchMap.h
diff --git a/opensubdiv/far/kernelBatch.h b/opensubdiv/far/kernelBatch.h
deleted file mode 100644
index 03724d15..00000000
--- a/opensubdiv/far/kernelBatch.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//
-//   Copyright 2013 Pixar
-//
-//   Licensed under the Apache License, Version 2.0 (the "Apache License")
-//   with the following modification; you may not use this file except in
-//   compliance with the Apache License and the following modification to it:
-//   Section 6. Trademarks. is deleted and replaced with:
-//
-//   6. Trademarks. This License does not grant permission to use the trade
-//      names, trademarks, service marks, or product names of the Licensor
-//      and its affiliates, except as required to comply with Section 4(c) of
-//      the License and to reproduce the content of the NOTICE file.
-//
-//   You may obtain a copy of the Apache License at
-//
-//       http://www.apache.org/licenses/LICENSE-2.0
-//
-//   Unless required by applicable law or agreed to in writing, software
-//   distributed under the Apache License with the above modification is
-//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-//   KIND, either express or implied. See the Apache License for the specific
-//   language governing permissions and limitations under the Apache License.
-//
-
-#ifndef FAR_KERNEL_BATCH_H
-#define FAR_KERNEL_BATCH_H
-
-#include "../version.h"
-
-#include <vector>
-
-namespace OpenSubdiv {
-namespace OPENSUBDIV_VERSION {
-
-namespace Far {
-
-/// \brief A GP Compute Kernel descriptor.
-///
-/// Vertex refinement through subdivision schemes requires the successive
-/// application of dedicated compute kernels. OpenSubdiv groups these vertices
-/// in batches based on their topology in order to minimize the number of kernel
-/// switches to process a given primitive.
-///
-struct KernelBatch {
-
-public:
-
-    enum KernelType {
-        KERNEL_UNKNOWN=0,
-        KERNEL_STENCIL_TABLE,
-        KERNEL_USER_DEFINED
-    };
-
-    /// \brief Constructor.
-    /// 
-    /// @param _kernelType    The type of compute kernel kernel
-    ///
-    /// @param _level         The level of subdivision of the vertices in the batch
-    ///
-    /// @param _start         Index of the first vertex in the batch
-    ///
-    /// @param _end           Index of the last vertex in the batch
-    ///
-    KernelBatch( int _kernelType, int _level, int _start, int _end ) :
-        kernelType(_kernelType), level(_level), start(_start), end(_end) { }
-
-    int kernelType,   // the type of compute kernel kernel
-        level,        // the level of subdivision of the vertices in the batch
-        start,        // index of the first vertex in the batch
-        end;          // index of the last vertex in the batch
-};
-
-typedef std::vector<KernelBatch> KernelBatchVector;
-
-} // end namespace Far
-
-} // end namespace OPENSUBDIV_VERSION
-using namespace OPENSUBDIV_VERSION;
-
-} // end namespace OpenSubdiv
-
-#endif  /* FAR_KERNEL_BATCH_H */
diff --git a/opensubdiv/far/kernelBatchDispatcher.h b/opensubdiv/far/kernelBatchDispatcher.h
deleted file mode 100644
index e46641b0..00000000
--- a/opensubdiv/far/kernelBatchDispatcher.h
+++ /dev/null
@@ -1,164 +0,0 @@
-//
-//   Copyright 2013 Pixar
-//
-//   Licensed under the Apache License, Version 2.0 (the "Apache License")
-//   with the following modification; you may not use this file except in
-//   compliance with the Apache License and the following modification to it:
-//   Section 6. Trademarks. is deleted and replaced with:
-//
-//   6. Trademarks. This License does not grant permission to use the trade
-//      names, trademarks, service marks, or product names of the Licensor
-//      and its affiliates, except as required to comply with Section 4(c) of
-//      the License and to reproduce the content of the NOTICE file.
-//
-//   You may obtain a copy of the Apache License at
-//
-//       http://www.apache.org/licenses/LICENSE-2.0
-//
-//   Unless required by applicable law or agreed to in writing, software
-//   distributed under the Apache License with the above modification is
-//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-//   KIND, either express or implied. See the Apache License for the specific
-//   language governing permissions and limitations under the Apache License.
-//
-
-#ifndef FAR_KERNELBATCH_DISPATCHER_H
-#define FAR_KERNELBATCH_DISPATCHER_H
-
-#include "../version.h"
-
-#include "../far/kernelBatch.h"
-#include "../far/stencilTables.h"
-
-#include <cassert>
-
-namespace OpenSubdiv {
-namespace OPENSUBDIV_VERSION {
-
-namespace Far {
-
-/// \brief Subdivision refinement encapsulation layer.
-///
-/// The kernel dispatcher allows client code to customize parts or the entire
-/// computation process. This pattern aims at hiding the logic specific to
-/// the subdivision algorithms and expose a simplified access to minimalistic
-/// compute kernels. By default, meshes revert to a default dispatcher that
-/// implements single-threaded CPU kernels.
-///
-/// - derive a dispatcher class from this one
-/// - override the virtual functions
-/// - pass the derived dispatcher to the factory (one instance can be shared by many meshes)
-///
-/// Note : the caller is responsible for deleting a custom dispatcher
-///
-class KernelBatchDispatcher {
-public:
-
-    /// \brief Launches the processing of a vector of kernel batches
-    /// this is a convenient API for controllers which don't have any user defined kernels.
-    ///
-    /// @param controller  refinement controller implementation (vertex array)
-    ///
-    /// @param context     refinement context implementation (subdivision tables)
-    ///                    passed to the controller.
-    ///
-    /// @param batches     batches of kernels that need to be processed
-    ///
-    /// @param maxlevel    process vertex batches up to this level
-    ///
-    template <class CONTROLLER, class CONTEXT> static void Apply(
-        CONTROLLER *controller, CONTEXT *context, KernelBatchVector const & batches, int maxlevel);
-
-protected:
-
-    /// \brief Launches the processing of a kernel batch
-    /// returns true if the batch is handled, otherwise returns false (i.e. user defined kernel)
-    ///
-    /// @param controller  refinement controller implementation
-    ///
-    /// @param context     refinement context implementation
-    ///
-    /// @param batch       a batch of kernel that need to be processed
-    ///
-    template <class CONTROLLER, class CONTEXT> static bool ApplyKernel(
-        CONTROLLER *controller, CONTEXT *context, KernelBatch const &batch);
-
-};
-
-///
-/// \brief Far default controller implementation
-///
-/// This is Far's default implementation of a kernal batch controller.
-///
-class DefaultController {
-public:
-
-    template <class CONTEXT> void ApplyStencilTableKernel(
-        KernelBatch const &batch, CONTEXT *context) const;
-
-};
-
-
-// Launches the processing of a kernel batch
-template <class CONTROLLER, class CONTEXT> bool
-KernelBatchDispatcher::ApplyKernel(CONTROLLER *controller, CONTEXT *context,
-    KernelBatch const &batch) {
-
-    if (batch.end==0) {
-        return true;
-    }
-
-    switch(batch.kernelType) {
-
-        case KernelBatch::KERNEL_UNKNOWN:
-            assert(0);
-
-        case KernelBatch::KERNEL_STENCIL_TABLE:
-            controller->ApplyStencilTableKernel(batch, context);
-            break;
-
-        default: // user defined kernel type
-            return false;
-    }
-
-    return true;
-}
-
-// Launches the processing of a vector of kernel batches
-template <class CONTROLLER, class CONTEXT> void
-KernelBatchDispatcher::Apply(CONTROLLER *controller, CONTEXT *context,
-    KernelBatchVector const & batches, int maxlevel) {
-
-    for (int i = 0; i < (int)batches.size(); ++i) {
-
-        const KernelBatch &batch = batches[i];
-
-        if (maxlevel>=0 and batch.level>=maxlevel) {
-            continue;
-        }
-
-        ApplyKernel(controller, context, batch);
-    }
-}
-
-template <class CONTEXT> void
-DefaultController::ApplyStencilTableKernel(
-    KernelBatch const &batch, CONTEXT *context) const {
-
-    StencilTables const * stencilTables = context->GetStencilTables();
-    assert(stencilTables);
-
-    typename CONTEXT::VertexType *vsrc = &context->GetVertices().at(0),
-                                 *vdst = vsrc + batch.start + stencilTables->GetNumControlVertices();
-
-    stencilTables->UpdateValues(vsrc, vdst, batch.start, batch.end);
-}
-
-} // end namespace Far
-
-} // end namespace OPENSUBDIV_VERSION
-using namespace OPENSUBDIV_VERSION;
-
-} // end namespace OpenSubdiv
-
-#endif /* FAR_KERNELBATCH_DISPATCHER_H */
diff --git a/opensubdiv/far/stencilTablesFactory.cpp b/opensubdiv/far/stencilTablesFactory.cpp
index 299bb3b8..47c38bd3 100644
--- a/opensubdiv/far/stencilTablesFactory.cpp
+++ b/opensubdiv/far/stencilTablesFactory.cpp
@@ -395,15 +395,6 @@ LimitStencilTablesFactory::Create(TopologyRefiner const & refiner,
     return result;
 }
 
-//------------------------------------------------------------------------------
-
-KernelBatch
-StencilTablesFactory::Create(StencilTables const &stencilTables) {
-
-    return KernelBatch( KernelBatch::KERNEL_STENCIL_TABLE,
-        -1, 0, stencilTables.GetNumStencils());
-}
-
 } // end namespace Far
 
 } // end namespace OPENSUBDIV_VERSION
diff --git a/opensubdiv/far/stencilTablesFactory.h b/opensubdiv/far/stencilTablesFactory.h
index a9a04bb0..8394d55f 100644
--- a/opensubdiv/far/stencilTablesFactory.h
+++ b/opensubdiv/far/stencilTablesFactory.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatch.h"
 #include "../far/patchTables.h"
 
 #include <vector>
@@ -103,13 +102,6 @@ public:
     ///
     static StencilTables const * Create(int numTables, StencilTables const ** tables);
 
-    /// \brief Returns a KernelBatch applying all the stencil in the tables
-    ///        to primvar data.
-    ///
-    /// @param stencilTables The stencil tables to batch
-    ///
-    static KernelBatch Create(StencilTables const &stencilTables);
-
 private:
 
     // Generate stencils for the coarse control-vertices (single weight = 1.0f)
diff --git a/opensubdiv/osd/clComputeContext.cpp b/opensubdiv/osd/clComputeContext.cpp
index 3f0731a4..5f02a700 100644
--- a/opensubdiv/osd/clComputeContext.cpp
+++ b/opensubdiv/osd/clComputeContext.cpp
@@ -60,6 +60,7 @@ public:
         _offsets = createCLBuffer(stencilTables.GetOffsets(), clContext);
         _indices = createCLBuffer(stencilTables.GetControlIndices(), clContext);
         _weights = createCLBuffer(stencilTables.GetWeights(), clContext);
+        _numStencils = stencilTables.GetNumStencils();
     }
 
     ~CLStencilTables() {
@@ -89,12 +90,17 @@ public:
         return _weights;
     }
 
+    int GetNumStencils() const {
+        return _numStencils;
+    }
+
 private:
 
     cl_mem _sizes,
            _offsets,
            _indices,
            _weights;
+    int _numStencils;
 };
 
 // -----------------------------------------------------------------------------
@@ -104,7 +110,7 @@ CLComputeContext::CLComputeContext(
         Far::StencilTables const * varyingStencilTables,
             cl_context clContext) :
                 _vertexStencilTables(0), _varyingStencilTables(0),
-                     _numControlVertices(0) {
+                _numControlVertices(0) {
 
     if (vertexStencilTables) {
         _vertexStencilTables = new CLStencilTables(*vertexStencilTables, clContext);
@@ -139,6 +145,15 @@ CLComputeContext::HasVaryingStencilTables() const {
     return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
 
+int
+CLComputeContext::GetNumStencilsInVertexStencilTables() const {
+    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : 0;
+}
+
+int
+CLComputeContext::GetNumStencilsInVaryingStencilTables() const {
+    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : 0;
+}
 // ----------------------------------------------------------------------------
 
 cl_mem
diff --git a/opensubdiv/osd/clComputeContext.h b/opensubdiv/osd/clComputeContext.h
index 2802361f..8fe718f5 100644
--- a/opensubdiv/osd/clComputeContext.h
+++ b/opensubdiv/osd/clComputeContext.h
@@ -80,6 +80,12 @@ public:
         return _numControlVertices;
     }
 
+    /// Returns the number of stencils in vertex stencil table
+    int GetNumStencilsInVertexStencilTables() const;
+
+    /// Returns the number of stencils in varying stencil table
+    int GetNumStencilsInVaryingStencilTables() const;
+
     /// Returns the Cuda buffer containing vertex-stencil stencil sizes
     cl_mem GetVertexStencilTablesSizes() const;
 
diff --git a/opensubdiv/osd/clComputeController.cpp b/opensubdiv/osd/clComputeController.cpp
index d87a4506..30b6b65f 100644
--- a/opensubdiv/osd/clComputeController.cpp
+++ b/opensubdiv/osd/clComputeController.cpp
@@ -32,6 +32,7 @@
 #include <algorithm>
 #include <string.h>
 #include <sstream>
+#include <cassert>
 
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
@@ -138,18 +139,20 @@ private:
 // ----------------------------------------------------------------------------
 
 void
-CLComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) {
+CLComputeController::ApplyStencilTableKernel(ComputeContext const *context) {
 
     assert(context);
 
     cl_int errNum;
 
-    size_t globalWorkSize[1] = { (size_t)(batch.end - batch.start) };
+    size_t globalWorkSize = 0;
 
     int ncvs = context->GetNumControlVertices();
 
     if (context->HasVertexStencilTables()) {
+        int start = 0;
+        int end = context->GetNumStencilsInVertexStencilTables();
+        globalWorkSize = (size_t)(end - start);
 
         KernelBundle const * bundle = getKernel(_currentBindState.vertexDesc);
 
@@ -167,14 +170,14 @@ CLComputeController::ApplyStencilTableKernel(
         clSetKernelArg(kernel, 3, sizeof(cl_mem), &indices);
         clSetKernelArg(kernel, 4, sizeof(cl_mem), &weights);
 
-        clSetKernelArg(kernel, 5, sizeof(int), &batch.start);
-        clSetKernelArg(kernel, 6, sizeof(int), &batch.end);
+        clSetKernelArg(kernel, 5, sizeof(int), &start);
+        clSetKernelArg(kernel, 6, sizeof(int), &end);
 
         clSetKernelArg(kernel, 7, sizeof(int), &_currentBindState.vertexDesc.offset);
         clSetKernelArg(kernel, 8, sizeof(int), &ncvs);
 
         errNum = clEnqueueNDRangeKernel(
-            _clQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+            _clQueue, kernel, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL);
         if (errNum!=CL_SUCCESS) {
             Far::Error(Far::FAR_RUNTIME_ERROR,
                 "ApplyStencilTableKernel (%d) ", errNum);
@@ -182,6 +185,9 @@ CLComputeController::ApplyStencilTableKernel(
     }
 
     if (context->HasVaryingStencilTables()) {
+        int start = 0;
+        int end = context->GetNumStencilsInVaryingStencilTables();
+        globalWorkSize = (size_t)(end - start);
 
         KernelBundle const * bundle = getKernel(_currentBindState.varyingDesc);
 
@@ -199,14 +205,14 @@ CLComputeController::ApplyStencilTableKernel(
         clSetKernelArg(kernel, 3, sizeof(cl_mem), &indices);
         clSetKernelArg(kernel, 4, sizeof(cl_mem), &weights);
 
-        clSetKernelArg(kernel, 5, sizeof(int), &batch.start);
-        clSetKernelArg(kernel, 6, sizeof(int), &batch.end);
+        clSetKernelArg(kernel, 5, sizeof(int), &start);
+        clSetKernelArg(kernel, 6, sizeof(int), &end);
 
         clSetKernelArg(kernel, 7, sizeof(int), &_currentBindState.varyingDesc.offset);
         clSetKernelArg(kernel, 8, sizeof(int), &ncvs);
 
         errNum = clEnqueueNDRangeKernel(
-            _clQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
+            _clQueue, kernel, 1, NULL, &globalWorkSize, NULL, 0, NULL, NULL);
         if (errNum!=CL_SUCCESS) {
             Far::Error(Far::FAR_RUNTIME_ERROR,
                 "ApplyStencilTableKernel (%d)", errNum);
diff --git a/opensubdiv/osd/clComputeController.h b/opensubdiv/osd/clComputeController.h
index e08340f2..16b59d11 100644
--- a/opensubdiv/osd/clComputeController.h
+++ b/opensubdiv/osd/clComputeController.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatchDispatcher.h"
 #include "../osd/clComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 #include "../osd/opencl.h"
@@ -70,9 +69,6 @@ public:
     ///
     /// @param  context       The CLContext to apply refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@@ -87,17 +83,14 @@ public:
     ///
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void Compute( CLComputeContext const * context,
-                      Far::KernelBatchVector const & batches,
                       VERTEX_BUFFER  * vertexBuffer,
                       VARYING_BUFFER * varyingBuffer,
                       VertexBufferDescriptor const * vertexDesc=NULL,
                       VertexBufferDescriptor const * varyingDesc=NULL ){
 
-        if (batches.empty()) return;
-
         bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
 
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
 
         unbind();
     }
@@ -113,10 +106,9 @@ public:
     ///
     template<class VERTEX_BUFFER>
         void Compute(CLComputeContext const * context,
-                     Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER *vertexBuffer) {
 
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
     }
 
     /// Waits until all running subdivision kernels finish.
@@ -130,10 +122,7 @@ public:
 
 protected:
 
-    friend class Far::KernelBatchDispatcher;
-
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
-        ComputeContext const *context);
+    void ApplyStencilTableKernel(ComputeContext const *context);
 
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void bind( VERTEX_BUFFER * vertexBuffer,
diff --git a/opensubdiv/osd/cpuComputeContext.h b/opensubdiv/osd/cpuComputeContext.h
index 3bb5bdff..26f720f5 100644
--- a/opensubdiv/osd/cpuComputeContext.h
+++ b/opensubdiv/osd/cpuComputeContext.h
@@ -61,7 +61,7 @@ public:
     ///                              interpolation
     ///
     static CpuComputeContext * Create(Far::StencilTables const * vertexStencilTables,
-                                         Far::StencilTables const * varyingStencilTables=0);
+                                      Far::StencilTables const * varyingStencilTables=0);
 
     /// Destructor
     virtual ~CpuComputeContext();
diff --git a/opensubdiv/osd/cpuComputeController.cpp b/opensubdiv/osd/cpuComputeController.cpp
index 9fa44173..4eca60cf 100644
--- a/opensubdiv/osd/cpuComputeController.cpp
+++ b/opensubdiv/osd/cpuComputeController.cpp
@@ -46,7 +46,7 @@ CpuComputeController::Synchronize() {
 
 void
 CpuComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context) const {
 
     assert(context);
 
@@ -61,14 +61,17 @@ CpuComputeController::ApplyStencilTableKernel(
         float * destBuffer = _currentBindState.vertexBuffer + desc.offset +
             vertexStencils->GetNumControlVertices() * desc.stride;
 
+        int start = 0;
+        int end = vertexStencils->GetNumStencils();
+
         CpuComputeStencils(_currentBindState.vertexDesc,
-                              srcBuffer, destBuffer,
-                              &vertexStencils->GetSizes().at(0),
-                              &vertexStencils->GetOffsets().at(0),
-                              &vertexStencils->GetControlIndices().at(0),
-                              &vertexStencils->GetWeights().at(0),
-                              batch.start,
-                              batch.end);
+                           srcBuffer, destBuffer,
+                           &vertexStencils->GetSizes().at(0),
+                           &vertexStencils->GetOffsets().at(0),
+                           &vertexStencils->GetControlIndices().at(0),
+                           &vertexStencils->GetWeights().at(0),
+                           start,
+                           end);
     }
 
     Far::StencilTables const * varyingStencils = context->GetVaryingStencilTables();
@@ -82,14 +85,17 @@ CpuComputeController::ApplyStencilTableKernel(
         float * destBuffer = _currentBindState.varyingBuffer + desc.offset +
             varyingStencils->GetNumControlVertices() * desc.stride;
 
+        int start = 0;
+        int end = varyingStencils->GetNumStencils();
+
         CpuComputeStencils(_currentBindState.varyingDesc,
-                              srcBuffer, destBuffer,
-                              &varyingStencils->GetSizes().at(0),
-                              &varyingStencils->GetOffsets().at(0),
-                              &varyingStencils->GetControlIndices().at(0),
-                              &varyingStencils->GetWeights().at(0),
-                              batch.start,
-                              batch.end);
+                           srcBuffer, destBuffer,
+                           &varyingStencils->GetSizes().at(0),
+                           &varyingStencils->GetOffsets().at(0),
+                           &varyingStencils->GetControlIndices().at(0),
+                           &varyingStencils->GetWeights().at(0),
+                           start,
+                           end);
     }
 }
 
diff --git a/opensubdiv/osd/cpuComputeController.h b/opensubdiv/osd/cpuComputeController.h
index 51c3f2bd..2a9edde7 100644
--- a/opensubdiv/osd/cpuComputeController.h
+++ b/opensubdiv/osd/cpuComputeController.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatchDispatcher.h"
 #include "../osd/cpuComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 
@@ -64,9 +63,6 @@ public:
     ///
     /// @param  context       The CpuContext to apply refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@@ -81,17 +77,14 @@ public:
     ///
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void Compute( CpuComputeContext const * context,
-                      Far::KernelBatchVector const & batches,
                       VERTEX_BUFFER  * vertexBuffer,
                       VARYING_BUFFER * varyingBuffer,
                       VertexBufferDescriptor const * vertexDesc=NULL,
                       VertexBufferDescriptor const * varyingDesc=NULL ){
 
-        if (batches.empty()) return;
-
         bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
 
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
 
         unbind();
     }
@@ -100,17 +93,13 @@ public:
     ///
     /// @param  context       The CpuContext to apply refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     template<class VERTEX_BUFFER>
         void Compute(CpuComputeContext const * context,
-                     Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER *vertexBuffer) {
 
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
     }
 
     /// Waits until all running subdivision kernels finish.
@@ -119,10 +108,7 @@ public:
 
 protected:
 
-    friend class Far::KernelBatchDispatcher;
-
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
-        ComputeContext const *context) const;
+    void ApplyStencilTableKernel(ComputeContext const *context) const;
 
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void bind( VERTEX_BUFFER * vertexBuffer,
diff --git a/opensubdiv/osd/cudaComputeContext.cpp b/opensubdiv/osd/cudaComputeContext.cpp
index 98ae4acf..cac7b5c6 100644
--- a/opensubdiv/osd/cudaComputeContext.cpp
+++ b/opensubdiv/osd/cudaComputeContext.cpp
@@ -67,6 +67,7 @@ public:
         _offsets = createCudaBuffer(stencilTables.GetOffsets());
         _indices = createCudaBuffer(stencilTables.GetControlIndices());
         _weights = createCudaBuffer(stencilTables.GetWeights());
+        _numStencils = stencilTables.GetNumStencils();
     }
 
     ~CudaStencilTables() {
@@ -96,11 +97,16 @@ public:
         return _weights;
     }
 
+    int GetNumStencils() const {
+        return _numStencils;
+    }
+
 private:
     void * _sizes,
          * _offsets,
          * _indices,
          * _weights;
+    int _numStencils;
 };
 
 // ----------------------------------------------------------------------------
@@ -144,6 +150,16 @@ CudaComputeContext::HasVaryingStencilTables() const {
     return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
 
+int
+CudaComputeContext::GetNumStencilsInVertexStencilTables() const {
+    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : 0;
+}
+
+int
+CudaComputeContext::GetNumStencilsInVaryingStencilTables() const {
+    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : 0;
+}
+
 // ----------------------------------------------------------------------------
 
 void *
diff --git a/opensubdiv/osd/cudaComputeContext.h b/opensubdiv/osd/cudaComputeContext.h
index 29e1a377..94bfaf2f 100644
--- a/opensubdiv/osd/cudaComputeContext.h
+++ b/opensubdiv/osd/cudaComputeContext.h
@@ -77,6 +77,12 @@ public:
         return _numControlVertices;
     }
 
+    /// Returns the number of stencils in vertex stencil tables
+    int GetNumStencilsInVertexStencilTables() const;
+
+    /// Returns the number of stencils in varying stencil tables
+    int GetNumStencilsInVaryingStencilTables() const;
+
     /// Returns the Cuda buffer containing vertex-stencil stencil sizes
     void * GetVertexStencilTablesSizes() const;
 
diff --git a/opensubdiv/osd/cudaComputeController.cpp b/opensubdiv/osd/cudaComputeController.cpp
index d38a8d73..53bb4c86 100644
--- a/opensubdiv/osd/cudaComputeController.cpp
+++ b/opensubdiv/osd/cudaComputeController.cpp
@@ -26,6 +26,7 @@
 
 #include <cuda_runtime.h>
 #include <string.h>
+#include <cassert>
 
 extern "C" {
 
@@ -46,7 +47,7 @@ namespace Osd {
 
 void
 CudaComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context) const {
 
     assert(context);
 
@@ -55,6 +56,9 @@ CudaComputeController::ApplyStencilTableKernel(
         int length = _currentBindState.vertexDesc.length,
             stride = _currentBindState.vertexDesc.stride;
 
+        int start = 0;
+        int end = context->GetNumStencilsInVertexStencilTables();
+
         float const * src = _currentBindState.GetVertexBufferAtOffset();
 
         float * dst = const_cast<float *>(src) +
@@ -65,8 +69,8 @@ CudaComputeController::ApplyStencilTableKernel(
                                (int const *)context->GetVertexStencilTablesOffsets(),
                                (int const *)context->GetVertexStencilTablesIndices(),
                                (float const *)context->GetVertexStencilTablesWeights(),
-                               batch.start,
-                               batch.end);
+                               start,
+                               end);
     }
 
     if (context->HasVaryingStencilTables()) {
@@ -74,6 +78,9 @@ CudaComputeController::ApplyStencilTableKernel(
         int length = _currentBindState.varyingDesc.length,
             stride = _currentBindState.varyingDesc.stride;
 
+        int start = 0;
+        int end = context->GetNumStencilsInVaryingStencilTables();
+
         float const * src = _currentBindState.GetVaryingBufferAtOffset();
 
         float * dst = const_cast<float *>(src) +
@@ -84,8 +91,8 @@ CudaComputeController::ApplyStencilTableKernel(
                                (int const *)context->GetVaryingStencilTablesOffsets(),
                                (int const *)context->GetVaryingStencilTablesIndices(),
                                (float const *)context->GetVaryingStencilTablesWeights(),
-                               batch.start,
-                               batch.end);
+                               start,
+                               end);
     }
 }
 
diff --git a/opensubdiv/osd/cudaComputeController.h b/opensubdiv/osd/cudaComputeController.h
index e1822234..3309760d 100644
--- a/opensubdiv/osd/cudaComputeController.h
+++ b/opensubdiv/osd/cudaComputeController.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatchDispatcher.h"
 #include "../osd/cudaComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 
@@ -77,17 +76,14 @@ public:
     ///
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void Compute( CudaComputeContext const * context,
-                      Far::KernelBatchVector const & batches,
                       VERTEX_BUFFER  * vertexBuffer,
                       VARYING_BUFFER * varyingBuffer,
                       VertexBufferDescriptor const * vertexDesc=NULL,
                       VertexBufferDescriptor const * varyingDesc=NULL ){
 
-        if (batches.empty()) return;
-
         bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
 
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
 
         unbind();
     }
@@ -103,10 +99,9 @@ public:
     ///
     template<class VERTEX_BUFFER>
         void Compute(CudaComputeContext const * context,
-                     Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER *vertexBuffer) {
 
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
     }
 
     /// Waits until all running subdivision kernels finish.
@@ -114,10 +109,7 @@ public:
 
 protected:
 
-    friend class Far::KernelBatchDispatcher;
-
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
-        ComputeContext const *context) const;
+    void ApplyStencilTableKernel(ComputeContext const *context) const;
 
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void bind( VERTEX_BUFFER * vertexBuffer,
diff --git a/opensubdiv/osd/d3d11ComputeContext.cpp b/opensubdiv/osd/d3d11ComputeContext.cpp
index 14a3f887..1899abc5 100644
--- a/opensubdiv/osd/d3d11ComputeContext.cpp
+++ b/opensubdiv/osd/d3d11ComputeContext.cpp
@@ -125,6 +125,8 @@ public:
         _indices.initialize(stencilTables.GetControlIndices(), DXGI_FORMAT_R32_SINT, deviceContext);
 
         _weights.initialize(stencilTables.GetWeights(), DXGI_FORMAT_R32_FLOAT, deviceContext);
+
+        _numStencils = stencilTables.GetNumStencils();
     }
 
     bool IsValid() const {
@@ -148,6 +150,10 @@ public:
         return _weights;
     }
 
+    int GetNumStencils() const {
+        return _numStencils;
+    }
+
     void Bind(ID3D11DeviceContext * deviceContext) const {
         ID3D11ShaderResourceView *SRViews[] = {
             _sizes.srv,
@@ -170,6 +176,8 @@ private:
                _offsets,
                _indices,
                _weights;
+
+    int _numStencils;
 };
 
 // ----------------------------------------------------------------------------
@@ -217,6 +225,16 @@ D3D11ComputeContext::HasVaryingStencilTables() const {
     return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
 
+int
+D3D11ComputeContext::GetNumStencilsInVertexStencilTables() const {
+    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : 0;
+}
+
+int
+D3D11ComputeContext::GetNumStencilsInVaryingStencilTables() const {
+    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : 0;
+}
+
 // ----------------------------------------------------------------------------
 
 void
diff --git a/opensubdiv/osd/d3d11ComputeContext.h b/opensubdiv/osd/d3d11ComputeContext.h
index 73457ecf..db5abfff 100644
--- a/opensubdiv/osd/d3d11ComputeContext.h
+++ b/opensubdiv/osd/d3d11ComputeContext.h
@@ -78,6 +78,12 @@ public:
         return _numControlVertices;
     }
 
+    /// Returns the number of stencils in vertex stencil table
+    int GetNumStencilsInVertexStencilTables() const;
+
+    /// Returns the number of stencils in varying stencil table
+    int GetNumStencilsInVaryingStencilTables() const;
+
     /// Binds D3D11 buffers containing stencils for 'vertex' interpolation
     ///
     /// @param deviceContext         The D3D device
diff --git a/opensubdiv/osd/d3d11ComputeController.cpp b/opensubdiv/osd/d3d11ComputeController.cpp
index b1cffbca..e74ab021 100644
--- a/opensubdiv/osd/d3d11ComputeController.cpp
+++ b/opensubdiv/osd/d3d11ComputeController.cpp
@@ -155,11 +155,11 @@ public:
     }
 
     void ApplyStencilTableKernel(ID3D11DeviceContext *deviceContext,
-        Far::KernelBatch const &batch, int offset, int numCVs) {
+                                 int offset, int numCVs, int start, int end) {
 
         KernelUniformArgs args;
-        args.uniformStart = batch.start;
-        args.uniformEnd = batch.end;
+        args.uniformStart = start;
+        args.uniformEnd = end;
         args.uniformOffset = offset;
         args.uniformNumCVs = numCVs;
 
@@ -299,7 +299,7 @@ D3D11ComputeController::unbindBuffer() {
 
 void
 D3D11ComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, D3D11ComputeContext const *context) const {
+    D3D11ComputeContext const *context, int numStencils) const {
 
     assert(context);
 
@@ -307,8 +307,12 @@ D3D11ComputeController::ApplyStencilTableKernel(
     D3D11ComputeController::KernelBundle * bundle =
         const_cast<D3D11ComputeController::KernelBundle *>(_currentBindState.kernelBundle);
 
-    bundle->ApplyStencilTableKernel(_deviceContext,
-        batch, _currentBindState.desc.offset, context->GetNumControlVertices());
+    bundle->ApplyStencilTableKernel(
+        _deviceContext,
+        _currentBindState.desc.offset,
+        context->GetNumControlVertices(),
+        0,
+        numStencils);
 }
 
 
diff --git a/opensubdiv/osd/d3d11ComputeController.h b/opensubdiv/osd/d3d11ComputeController.h
index 4ea8e321..c40c2e6a 100644
--- a/opensubdiv/osd/d3d11ComputeController.h
+++ b/opensubdiv/osd/d3d11ComputeController.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatchDispatcher.h"
 #include "../osd/d3d11ComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 
@@ -69,9 +68,6 @@ public:
     ///
     /// @param  context       The D3D11Context to apply refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@@ -86,20 +82,18 @@ public:
     ///
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void Compute( D3D11ComputeContext const * context,
-                      Far::KernelBatchVector const & batches,
                       VERTEX_BUFFER  * vertexBuffer,
                       VARYING_BUFFER * varyingBuffer,
                       VertexBufferDescriptor const * vertexDesc=NULL,
                       VertexBufferDescriptor const * varyingDesc=NULL ){
 
-        if (batches.empty()) return;
-
         if (vertexBuffer) {
             bind(vertexBuffer, vertexDesc);
 
             context->BindVertexStencilTables(_deviceContext);
 
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
+                context, context->GetNumStencilsInVertexStencilTables());
         }
 
         if (varyingBuffer) {
@@ -107,7 +101,8 @@ public:
 
             context->BindVaryingStencilTables(_deviceContext);
 
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
+                context, context->GetNumStencilsInVaryingStencilTables());
         }
 
         context->UnbindStencilTables(_deviceContext);
@@ -119,17 +114,13 @@ public:
     ///
     /// @param  context       The D3D11Context to apply refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     template<class VERTEX_BUFFER>
         void Compute(D3D11ComputeContext const * context,
-                     Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER *vertexBuffer) {
 
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
     }
 
     /// Waits until all running subdivision kernels finish.
@@ -137,10 +128,8 @@ public:
 
 protected:
 
-    friend class Far::KernelBatchDispatcher;
-
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
-        ComputeContext const *context) const;
+    void ApplyStencilTableKernel(ComputeContext const *context,
+                                 int numStencils) const;
 
     template<class BUFFER>
         void bind( BUFFER * buffer,
diff --git a/opensubdiv/osd/d3d11Mesh.h b/opensubdiv/osd/d3d11Mesh.h
index 5013bbaf..b50b77e3 100644
--- a/opensubdiv/osd/d3d11Mesh.h
+++ b/opensubdiv/osd/d3d11Mesh.h
@@ -78,7 +78,6 @@ public:
     Mesh(ComputeController * computeController,
             Far::TopologyRefiner * refiner,
             Far::PatchTables * patchTables,
-            Far::KernelBatchVector const & kernelBatches,
             VertexBuffer * vertexBuffer,
             VertexBuffer * varyingBuffer,
             ComputeContext * computeContext,
@@ -87,7 +86,6 @@ public:
 
             _refiner(refiner),
             _patchTables(patchTables),
-            _kernelBatches(kernelBatches),
             _vertexBuffer(vertexBuffer),
             _varyingBuffer(varyingBuffer),
             _computeContext(computeContext),
@@ -119,13 +117,14 @@ public:
         _varyingBuffer->UpdateData(varyingData, startVertex, numVerts, _d3d11DeviceContext);
     }
     virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
     }
     virtual void Refine(VertexBufferDescriptor const *vertexDesc,
                         VertexBufferDescriptor const *varyingDesc,
                         bool interleaved) {
-        _computeController->Compute(_computeContext, _kernelBatches,
-                                    _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer),
+        _computeController->Compute(_computeContext,
+                                    _vertexBuffer,
+                                    (interleaved ? _vertexBuffer : _varyingBuffer),
                                     vertexDesc, varyingDesc);
     }
     virtual void Synchronize() {
@@ -176,8 +175,6 @@ private:
         if (numVertexElements>0) {
 
             vertexStencils = Far::StencilTablesFactory::Create(*_refiner, options);
-
-            _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
         }
 
         if (numVaryingElements>0) {
@@ -235,7 +232,6 @@ private:
 
     Far::TopologyRefiner * _refiner;
     Far::PatchTables * _patchTables;
-    Far::KernelBatchVector _kernelBatches;
 
     VertexBuffer *_vertexBuffer;
     VertexBuffer *_varyingBuffer;
@@ -286,7 +282,6 @@ public:
     Mesh(ComputeController * computeController,
             Far::TopologyRefiner * refiner,
             Far::PatchTables * patchTables,
-            Far::KernelBatchVector const & kernelBatches,
             VertexBuffer * vertexBuffer,
             VertexBuffer * varyingBuffer,
             ComputeContext * computeContext,
@@ -295,7 +290,6 @@ public:
 
             _refiner(refiner),
             _patchTables(patchTables),
-            _kernelBatches(kernelBatches),
             _vertexBuffer(vertexBuffer),
             _varyingBuffer(varyingBuffer),
             _computeContext(computeContext),
@@ -324,12 +318,12 @@ public:
         _varyingBuffer->UpdateData(varyingData, startVertex, numVerts, _d3d11DeviceContext);
     }
     virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
     }
     virtual void Refine(VertexBufferDescriptor const *vertexDesc,
                         VertexBufferDescriptor const *varyingDesc,
                         bool interleaved) {
-        _computeController->Compute(_computeContext, _kernelBatches,
+        _computeController->Compute(_computeContext,
                                     _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer),
                                     vertexDesc, varyingDesc);
     }
@@ -382,8 +376,6 @@ private:
         if (numVertexElements>0) {
 
             vertexStencils = Far::StencilTablesFactory::Create(*_refiner, options);
-
-            _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
         }
 
         if (numVaryingElements>0) {
@@ -441,7 +433,6 @@ private:
 
     Far::TopologyRefiner * _refiner;
     Far::PatchTables * _patchTables;
-    Far::KernelBatchVector _kernelBatches;
 
     VertexBuffer *_vertexBuffer;
     VertexBuffer *_varyingBuffer;
diff --git a/opensubdiv/osd/glMesh.h b/opensubdiv/osd/glMesh.h
index 2f3f3934..85a7b865 100644
--- a/opensubdiv/osd/glMesh.h
+++ b/opensubdiv/osd/glMesh.h
@@ -98,7 +98,6 @@ public:
     Mesh(ComputeController * computeController,
             Far::TopologyRefiner * refiner,
             Far::PatchTables * patchTables,
-            Far::KernelBatchVector const & kernelBatches,
             VertexBuffer * vertexBuffer,
             VertexBuffer * varyingBuffer,
             ComputeContext * computeContext,
@@ -106,7 +105,6 @@ public:
 
             _refiner(refiner),
             _patchTables(patchTables),
-            _kernelBatches(kernelBatches),
             _vertexBuffer(vertexBuffer),
             _varyingBuffer(varyingBuffer),
             _computeContext(computeContext),
@@ -140,14 +138,15 @@ public:
     }
 
     virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
     }
 
     virtual void Refine(VertexBufferDescriptor const * vertexDesc,
                         VertexBufferDescriptor const * varyingDesc,
                         bool interleaved) {
-        _computeController->Compute(_computeContext, _kernelBatches,
-                                    _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer),
+        _computeController->Compute(_computeContext,
+                                    _vertexBuffer,
+                                    (interleaved ? _vertexBuffer : _varyingBuffer),
                                     vertexDesc, varyingDesc);
     }
 
@@ -255,8 +254,6 @@ private:
             varyingStencils = concatVaryingStencils;
         }
 
-        _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
-
         _computeContext = ComputeContext::Create(vertexStencils,
                                                  varyingStencils);
 
@@ -279,7 +276,6 @@ private:
 
     Far::TopologyRefiner * _refiner;
     Far::PatchTables * _patchTables;
-    Far::KernelBatchVector _kernelBatches;
 
     VertexBuffer *_vertexBuffer;
     VertexBuffer *_varyingBuffer;
@@ -353,7 +349,6 @@ public:
     Mesh(ComputeController * computeController,
             Far::TopologyRefiner * refiner,
             Far::PatchTables * patchTables,
-            Far::KernelBatchVector const & kernelBatches,
             VertexBuffer * vertexBuffer,
             VertexBuffer * varyingBuffer,
             ComputeContext * computeContext,
@@ -363,7 +358,6 @@ public:
 
             _refiner(refiner),
             _patchTables(patchTables),
-            _kernelBatches(kernelBatches),
             _vertexBuffer(vertexBuffer),
             _varyingBuffer(varyingBuffer),
             _computeContext(computeContext),
@@ -395,14 +389,15 @@ public:
     }
 
     virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
     }
 
     virtual void Refine(VertexBufferDescriptor const *vertexDesc,
                         VertexBufferDescriptor const *varyingDesc,
                         bool interleaved) {
-        _computeController->Compute(_computeContext, _kernelBatches,
-                                    _vertexBuffer, (interleaved ? _vertexBuffer : _varyingBuffer),
+        _computeController->Compute(_computeContext,
+                                    _vertexBuffer,
+                                    (interleaved ? _vertexBuffer : _varyingBuffer),
                                     vertexDesc, varyingDesc);
     }
 
@@ -499,8 +494,6 @@ private:
             Far::StencilTables const *concatStencils =
                 Far::StencilTablesFactory::Create(2, inStencils);
 
-            _kernelBatches.push_back(Far::StencilTablesFactory::Create(*concatStencils));
-
             Far::StencilTables const *inVaryingStencils[] = {
                 varyingStencils, endCapVaryingStencils
             };
@@ -513,7 +506,6 @@ private:
             delete varyingStencils;
             varyingStencils = concatVaryingStencils;
         }
-        _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
 
         _computeContext = ComputeContext::Create(_clContext,
                                                  vertexStencils,
@@ -538,7 +530,6 @@ private:
 
     Far::TopologyRefiner * _refiner;
     Far::PatchTables * _patchTables;
-    Far::KernelBatchVector _kernelBatches;
 
     VertexBuffer *_vertexBuffer;
     VertexBuffer *_varyingBuffer;
diff --git a/opensubdiv/osd/glslComputeContext.cpp b/opensubdiv/osd/glslComputeContext.cpp
index 5d59e8df..54d2ce21 100644
--- a/opensubdiv/osd/glslComputeContext.cpp
+++ b/opensubdiv/osd/glslComputeContext.cpp
@@ -71,6 +71,7 @@ public:
         _offsets = createGLSLBuffer(stencilTables.GetOffsets());
         _indices = createGLSLBuffer(stencilTables.GetControlIndices());
         _weights = createGLSLBuffer(stencilTables.GetWeights());
+        _numStencils = stencilTables.GetNumStencils();
     }
 
     ~GLSLStencilTables() {
@@ -100,6 +101,10 @@ public:
         return _weights;
     }
 
+    int GetNumStencils() const {
+        return _numStencils;
+    }
+
     void Bind() const {
         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, _sizes);
         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, _offsets);
@@ -122,6 +127,7 @@ private:
            _offsets,
            _indices,
            _weights;
+    int _numStencils;
 };
 
 // -----------------------------------------------------------------------------
@@ -130,7 +136,8 @@ GLSLComputeContext::GLSLComputeContext(
     Far::StencilTables const * vertexStencilTables,
         Far::StencilTables const * varyingStencilTables) :
             _vertexStencilTables(0), _varyingStencilTables(0),
-                _numControlVertices(0) {
+    _numControlVertices(0),
+    _numStencils(0) {
 
     if (vertexStencilTables) {
         _vertexStencilTables = new GLSLStencilTables(*vertexStencilTables);
@@ -165,8 +172,17 @@ GLSLComputeContext::HasVaryingStencilTables() const {
     return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
 
-// ----------------------------------------------------------------------------
+int
+GLSLComputeContext::GetNumStencilsInVertexStencilTables() const {
+    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : false;
+}
 
+int
+GLSLComputeContext::GetNumStencilsInVaryingStencilTables() const {
+    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : false;
+}
+
+// ----------------------------------------------------------------------------
 
 void
 GLSLComputeContext::BindVertexStencilTables() const {
diff --git a/opensubdiv/osd/glslComputeContext.h b/opensubdiv/osd/glslComputeContext.h
index ef96b24b..bb412ee3 100644
--- a/opensubdiv/osd/glslComputeContext.h
+++ b/opensubdiv/osd/glslComputeContext.h
@@ -76,10 +76,16 @@ public:
         return _numControlVertices;
     }
 
-    /// Returns the Cuda buffer containing vertex-stencil stencil sizes
+    /// Returns the number of stencils in vertex stencil table
+    int GetNumStencilsInVertexStencilTables() const;
+
+    /// Returns the number of stencils in varying stencil table
+    int GetNumStencilsInVaryingStencilTables() const;
+
+    /// Returns the GL buffer containing vertex-stencil stencil sizes
     GLuint GetVertexStencilTablesSizes() const;
 
-    /// Returns the Cuda buffer containing vertex-stencil stencil offsets
+    /// Returns the GL buffer containing vertex-stencil stencil offsets
     GLuint GetVertexStencilTablesOffsets() const;
 
     /// Binds GL buffers containing stencils for 'vertex' interpolation
@@ -104,6 +110,7 @@ private:
                       * _varyingStencilTables;
 
     int _numControlVertices;
+    int _numStencils;
 };
 
 }  // end namespace Osd
diff --git a/opensubdiv/osd/glslComputeController.cpp b/opensubdiv/osd/glslComputeController.cpp
index 5c5150d1..e9b0cfe4 100644
--- a/opensubdiv/osd/glslComputeController.cpp
+++ b/opensubdiv/osd/glslComputeController.cpp
@@ -24,7 +24,6 @@
 
 #include "../osd/glslComputeController.h"
 #include "../osd/vertexDescriptor.h"
-//#include "../osd/debug.h"
 #include "../osd/opengl.h"
 #include "../far/error.h"
 
@@ -138,12 +137,13 @@ public:
         return true;
     }
 
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch, int offset, int numCVs) const {
+    void ApplyStencilTableKernel(int offset, int numCVs,
+                                 int start, int end) const {
 
         // select stencil GLSL subroutine
         glUniformSubroutinesuiv(GL_COMPUTE_SHADER, 1, &_subStencilKernel);
 
-        dispatchCompute(offset, numCVs, batch.start, batch.end);
+        dispatchCompute(offset, numCVs, start, end);
     }
 
     struct Match {
@@ -214,12 +214,20 @@ private:
 
 void
 GLSLComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context, int numStencils) const {
 
     assert(context);
 
+    // Note: GLSLComputeContext has a state, knowing whether vertex or
+    // varying stencil tables are being bound. GetNumStencils() reflects it.
+    // This structure will likely be revisited.
+
+    int start = 0;
+    int end = numStencils;
+
     _currentBindState.kernelBundle->ApplyStencilTableKernel(
-        batch, _currentBindState.desc.offset, context->GetNumControlVertices());
+        _currentBindState.desc.offset, context->GetNumControlVertices(),
+        start, end);
 }
 
 // ----------------------------------------------------------------------------
diff --git a/opensubdiv/osd/glslComputeController.h b/opensubdiv/osd/glslComputeController.h
index 7eb08c95..4b3b7127 100644
--- a/opensubdiv/osd/glslComputeController.h
+++ b/opensubdiv/osd/glslComputeController.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatchDispatcher.h"
 #include "../osd/glslComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 
@@ -79,28 +78,27 @@ public:
     ///
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void Compute( GLSLComputeContext const * context,
-                      Far::KernelBatchVector const & batches,
                       VERTEX_BUFFER  * vertexBuffer,
                       VARYING_BUFFER * varyingBuffer,
                       VertexBufferDescriptor const * vertexDesc=NULL,
                       VertexBufferDescriptor const * varyingDesc=NULL ){
 
-        if (batches.empty()) return;
-
         if (vertexBuffer) {
             bind(vertexBuffer, vertexDesc);
 
             context->BindVertexStencilTables();
 
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
+                context, context->GetNumStencilsInVertexStencilTables());
         }
-        
+
         if (varyingBuffer) {
             bind(varyingBuffer, varyingDesc);
 
             context->BindVaryingStencilTables();
 
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
+                context, context->GetNumStencilsInVaryingStencilTables());
         }
 
         context->UnbindStencilTables();
@@ -119,10 +117,9 @@ public:
     ///
     template<class VERTEX_BUFFER>
         void Compute(GLSLComputeContext const * context,
-                     Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER *vertexBuffer) {
 
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
     }
 
     /// Waits until all running subdivision kernels finish.
@@ -130,10 +127,8 @@ public:
 
 protected:
 
-    friend class Far::KernelBatchDispatcher;
-
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
-        ComputeContext const *context) const;
+    void ApplyStencilTableKernel(ComputeContext const *context,
+                                 int numStencils) const;
 
     template<class BUFFER>
         void bind( BUFFER * buffer,
diff --git a/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp b/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
index 65cd2558..50c9fa3c 100644
--- a/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
+++ b/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
@@ -86,6 +86,7 @@ public:
         _offsets = createGLTextureBuffer(stencilTables.GetOffsets(), GL_R32I);
         _indices = createGLTextureBuffer(stencilTables.GetControlIndices(), GL_R32I);
         _weights = createGLTextureBuffer(stencilTables.GetWeights(), GL_R32F);
+        _numStencils = stencilTables.GetNumStencils();
     }
 
     ~GLStencilTables() {
@@ -115,12 +116,18 @@ public:
         return _weights;
     }
 
+    int GetNumStencils() const {
+        return _numStencils;
+    }
+
 private:
 
     GLuint _sizes,
            _offsets,
            _indices,
            _weights;
+
+    int _numStencils;
 };
 
 // -----------------------------------------------------------------------------
@@ -164,8 +171,17 @@ GLSLTransformFeedbackComputeContext::HasVaryingStencilTables() const {
     return _varyingStencilTables ? _varyingStencilTables->IsValid() : false;
 }
 
-// ----------------------------------------------------------------------------
+int
+GLSLTransformFeedbackComputeContext::GetNumStencilsInVertexStencilTables() const {
+    return _vertexStencilTables ? _vertexStencilTables->GetNumStencils() : 0;
+}
 
+int
+GLSLTransformFeedbackComputeContext::GetNumStencilsInVaryingStencilTables() const {
+    return _varyingStencilTables ? _varyingStencilTables->GetNumStencils() : 0;
+}
+
+// ----------------------------------------------------------------------------
 GLuint
 GLSLTransformFeedbackComputeContext::GetVertexStencilTablesSizes() const {
     return _vertexStencilTables ? _vertexStencilTables->GetSizes() : 0;
diff --git a/opensubdiv/osd/glslTransformFeedbackComputeContext.h b/opensubdiv/osd/glslTransformFeedbackComputeContext.h
index 2ec1de8a..48a76d65 100644
--- a/opensubdiv/osd/glslTransformFeedbackComputeContext.h
+++ b/opensubdiv/osd/glslTransformFeedbackComputeContext.h
@@ -59,7 +59,7 @@ public:
     ///                              interpolation
     ///
     static GLSLTransformFeedbackComputeContext * Create(Far::StencilTables const * vertexStencilTables,
-                                                           Far::StencilTables const * varyingStencilTables=0);
+                                                        Far::StencilTables const * varyingStencilTables=0);
 
     /// Destructor
     virtual ~GLSLTransformFeedbackComputeContext();
@@ -75,6 +75,12 @@ public:
         return _numControlVertices;
     }
 
+    /// Returns the number of stencils in vertex stencil table
+    int GetNumStencilsInVertexStencilTables() const;
+
+    /// Returns the number of stencils in varying stencil table
+    int GetNumStencilsInVaryingStencilTables() const;
+
     /// Returns the GL texture buffer containing vertex-stencil stencil sizes
     GLuint GetVertexStencilTablesSizes() const;
 
diff --git a/opensubdiv/osd/glslTransformFeedbackComputeController.cpp b/opensubdiv/osd/glslTransformFeedbackComputeController.cpp
index 391c1e2b..5bd12b01 100644
--- a/opensubdiv/osd/glslTransformFeedbackComputeController.cpp
+++ b/opensubdiv/osd/glslTransformFeedbackComputeController.cpp
@@ -87,7 +87,7 @@ public:
         _uniformWeights(0),
         _uniformStart(0),
         _uniformEnd(0),
-        _uniformOffset(0) { }
+        _uniformPrimvarOffset(0) { }
 
     ~KernelBundle() {
         if (_program) {
@@ -95,9 +95,8 @@ public:
         }
     }
 
-    void UseProgram(int primvarOffset) const {
+    void UseProgram() const {
         glUseProgram(_program);
-        glUniform1i(_uniformOffset, primvarOffset);
     }
 
     bool Compile(VertexBufferDescriptor const & desc) {
@@ -138,15 +137,19 @@ public:
             // outVertexData[2]
             // (gl_SkipComponents1)
             //
+            // note that "primvarOffset" in shader is still needed to read
+            // interleaved components even if gl_SkipComponents is used.
+            //
             char attrName[32];
-            for (int i = 0; i < desc.offset; ++i) {
+            int primvarOffset = (desc.offset % desc.stride);
+            for (int i = 0; i < primvarOffset; ++i) {
                 outputs.push_back("gl_SkipComponents1");
             }
             for (int i = 0; i < desc.length; ++i) {
                 snprintf(attrName, 32, "outVertexBuffer[%d]", i);
                 outputs.push_back(attrName);
             }
-            for (int i = desc.offset + desc.length; i < desc.stride; ++i) {
+            for (int i = primvarOffset + desc.length; i < desc.stride; ++i) {
                 outputs.push_back("gl_SkipComponents1");
             }
 
@@ -192,7 +195,7 @@ public:
         _uniformStart   = glGetUniformLocation(_program, "batchStart");
         _uniformEnd     = glGetUniformLocation(_program, "batchEnd");
 
-        _uniformOffset  = glGetUniformLocation(_program, "primvarOffset");
+        _uniformPrimvarOffset = glGetUniformLocation(_program, "primvarOffset");
 
         OSD_DEBUG_CHECK_GL_ERROR("KernelBundle::Compile");
 
@@ -225,14 +228,16 @@ public:
         // set batch range
         glUniform1i(_uniformStart,  start);
         glUniform1i(_uniformEnd,    end);
-        glUniform1i(_uniformOffset, offset);
+        glUniform1i(_uniformPrimvarOffset, offset);
 
         int count = end - start,
             stride = _desc.stride*sizeof(float);
 
+        // note: offset includes both "batching offset" and "primvar offset".
+        // 
         glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,
                           0, primvarBuffer,
-                          (start + numCVs)*stride + offset*sizeof(float),
+                          (start + numCVs)*stride + (offset - offset%stride)*sizeof(float),
                           count*stride);
 
         glBeginTransformFeedback(GL_POINTS);
@@ -246,13 +251,14 @@ public:
         //OSD_DEBUG_CHECK_GL_ERROR("TransformPrimvarBuffer\n");
     }
 
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
-        GLuint primvarBuffer, int offset, int numCVs) const {
+    void ApplyStencilTableKernel(GLuint primvarBuffer,
+                                 int offset, int numCVs,
+                                 int start, int end) const {
 
         glUniformSubroutinesuiv(GL_VERTEX_SHADER, 1, &_subStencilKernel);
 
         TransformPrimvarBuffer(primvarBuffer,
-            offset, numCVs, batch.start, batch.end);
+                               offset, numCVs, start, end);
     }
 
     struct Match {
@@ -283,7 +289,7 @@ private:
           _uniformStart,     // batch
           _uniformEnd,
 
-          _uniformOffset;    // GL primvar buffer descriptor
+          _uniformPrimvarOffset;
 
     VertexBufferDescriptor _desc; // primvar buffer descriptor
 };
@@ -294,7 +300,7 @@ GLSLTransformFeedbackComputeController::bindBufferAndProgram(
     GLuint & feedbackTexture) {
 
     glEnable(GL_RASTERIZER_DISCARD);
-    _currentBindState.kernelBundle->UseProgram(/*primvarOffset*/0);
+    _currentBindState.kernelBundle->UseProgram();
 
     if (not feedbackTexture) {
         glGenTextures(1, &feedbackTexture);
@@ -385,14 +391,18 @@ GLSLTransformFeedbackComputeController::getKernel(
 
 void
 GLSLTransformFeedbackComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch,
-        GLSLTransformFeedbackComputeContext const *context) const {
+    GLSLTransformFeedbackComputeContext const *context, int numStencils) const {
 
     assert(context);
 
-    _currentBindState.kernelBundle->ApplyStencilTableKernel(batch,
+    int start = 0;
+    int end = numStencils;
+
+    _currentBindState.kernelBundle->ApplyStencilTableKernel(
         _currentBindState.buffer, _currentBindState.desc.offset,
-            context->GetNumControlVertices());
+        context->GetNumControlVertices(),
+        start,
+        end);
 }
 
 
diff --git a/opensubdiv/osd/glslTransformFeedbackComputeController.h b/opensubdiv/osd/glslTransformFeedbackComputeController.h
index ad6af0f4..50b38f54 100644
--- a/opensubdiv/osd/glslTransformFeedbackComputeController.h
+++ b/opensubdiv/osd/glslTransformFeedbackComputeController.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatchDispatcher.h"
 #include "../osd/glslTransformFeedbackComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 
@@ -67,9 +66,6 @@ public:
     /// @param  context       The GLSLTransformFeedbackComputeContext to apply
     ///                       refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@@ -84,21 +80,19 @@ public:
     ///
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void Compute( GLSLTransformFeedbackComputeContext const * context,
-                      Far::KernelBatchVector const & batches,
                       VERTEX_BUFFER  * vertexBuffer,
                       VARYING_BUFFER * varyingBuffer,
                       VertexBufferDescriptor const * vertexDesc=NULL,
                       VertexBufferDescriptor const * varyingDesc=NULL ){
 
-        if (batches.empty()) return;
-
         if (vertexBuffer) {
 
             bind(vertexBuffer, vertexDesc, _vertexTexture);
 
             bindContextStencilTables(context, false);
 
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
+                context, context->GetNumStencilsInVertexStencilTables());
         }
 
         if (varyingBuffer) {
@@ -107,7 +101,8 @@ public:
 
             bindContextStencilTables(context, true);
 
-            Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+            ApplyStencilTableKernel(
+                context, context->GetNumStencilsInVaryingStencilTables());
         }
         unbind();
     }
@@ -117,17 +112,13 @@ public:
     /// @param  context       The GLSLTransformFeedbackComputeContext to apply
     ///                       refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     template<class VERTEX_BUFFER>
         void Compute(GLSLTransformFeedbackComputeContext const * context,
-                     Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER *vertexBuffer) {
 
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
     }
 
     /// Waits until all running subdivision kernels finish.
@@ -135,10 +126,8 @@ public:
 
 protected:
 
-    friend class Far::KernelBatchDispatcher;
-
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
-        ComputeContext const *context) const;
+    void ApplyStencilTableKernel(ComputeContext const *context,
+                                 int numStencils) const;
 
     template<class BUFFER>
         void bind( BUFFER * buffer, VertexBufferDescriptor const * desc,
diff --git a/opensubdiv/osd/mesh.h b/opensubdiv/osd/mesh.h
index 7ae49026..f28ecde4 100644
--- a/opensubdiv/osd/mesh.h
+++ b/opensubdiv/osd/mesh.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatch.h"
 #include "../far/topologyRefiner.h"
 #include "../far/patchTablesFactory.h"
 #include "../far/stencilTables.h"
@@ -154,7 +153,6 @@ public:
     Mesh(ComputeController * computeController,
             Far::TopologyRefiner * refiner,
             Far::PatchTables * patchTables,
-            Far::KernelBatchVector const & kernelBatches,
             VertexBuffer * vertexBuffer,
             VertexBuffer * varyingBuffer,
             ComputeContext * computeContext,
@@ -162,7 +160,6 @@ public:
 
             _refiner(refiner),
             _patchTables(patchTables),
-            _kernelBatches(kernelBatches),
             _vertexBuffer(vertexBuffer),
             _varyingBuffer(varyingBuffer),
             _computeContext(computeContext),
@@ -192,11 +189,11 @@ public:
     }
 
     virtual void Refine() {
-        _computeController->Compute(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer);
+        _computeController->Compute(_computeContext, _vertexBuffer, _varyingBuffer);
     }
 
     virtual void Refine(VertexBufferDescriptor const *vertexDesc, VertexBufferDescriptor const *varyingDesc) {
-        _computeController->Refine(_computeContext, _kernelBatches, _vertexBuffer, _varyingBuffer, vertexDesc, varyingDesc);
+        _computeController->Refine(_computeContext, _vertexBuffer, _varyingBuffer, vertexDesc, varyingDesc);
     }
 
     virtual void Synchronize() {
@@ -237,8 +234,6 @@ private:
         if (numVertexElements>0) {
 
             vertexStencils = Far::StencilTablesFactory::Create(*_refiner, options);
-
-            _kernelBatches.push_back(Far::StencilTablesFactory::Create(*vertexStencils));
         }
 
         if (numVaryingElements>0) {
@@ -291,7 +286,6 @@ private:
 
     Far::TopologyRefiner * _refiner;
     Far::PatchTables * _patchTables;
-    Far::KernelBatchVector _kernelBatches;
 
     VertexBuffer * _vertexBuffer,
                  * _varyingBuffer;
diff --git a/opensubdiv/osd/ompComputeController.cpp b/opensubdiv/osd/ompComputeController.cpp
index c948ee34..3b9f1eeb 100644
--- a/opensubdiv/osd/ompComputeController.cpp
+++ b/opensubdiv/osd/ompComputeController.cpp
@@ -40,7 +40,7 @@ OmpComputeController::OmpComputeController(int numThreads) {
 
 void
 OmpComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context) const {
 
     assert(context);
 
@@ -48,6 +48,9 @@ OmpComputeController::ApplyStencilTableKernel(
 
     if (vertexStencils and _currentBindState.vertexBuffer) {
 
+        int start = 0;
+        int end = vertexStencils->GetNumStencils();
+
         VertexBufferDescriptor const & desc = _currentBindState.vertexDesc;
 
         float const * srcBuffer = _currentBindState.vertexBuffer + desc.offset;
@@ -56,19 +59,22 @@ OmpComputeController::ApplyStencilTableKernel(
             vertexStencils->GetNumControlVertices() * desc.stride;
 
         OmpComputeStencils(_currentBindState.vertexDesc,
-                              srcBuffer, destBuffer,
-                              &vertexStencils->GetSizes().at(0),
-                              &vertexStencils->GetOffsets().at(0),
-                              &vertexStencils->GetControlIndices().at(0),
-                              &vertexStencils->GetWeights().at(0),
-                              batch.start,
-                              batch.end);
+                           srcBuffer, destBuffer,
+                           &vertexStencils->GetSizes().at(0),
+                           &vertexStencils->GetOffsets().at(0),
+                           &vertexStencils->GetControlIndices().at(0),
+                           &vertexStencils->GetWeights().at(0),
+                           start,
+                           end);
     }
 
     Far::StencilTables const * varyingStencils = context->GetVaryingStencilTables();
 
     if (varyingStencils and _currentBindState.varyingBuffer) {
 
+        int start = 0;
+        int end = varyingStencils->GetNumStencils();
+
         VertexBufferDescriptor const & desc = _currentBindState.varyingDesc;
 
         float const * srcBuffer = _currentBindState.varyingBuffer + desc.offset;
@@ -77,13 +83,13 @@ OmpComputeController::ApplyStencilTableKernel(
             varyingStencils->GetNumControlVertices() * desc.stride;
 
         OmpComputeStencils(_currentBindState.varyingDesc,
-                              srcBuffer, destBuffer,
-                              &varyingStencils->GetSizes().at(0),
-                              &varyingStencils->GetOffsets().at(0),
-                              &varyingStencils->GetControlIndices().at(0),
-                              &varyingStencils->GetWeights().at(0),
-                              batch.start,
-                              batch.end);
+                           srcBuffer, destBuffer,
+                           &varyingStencils->GetSizes().at(0),
+                           &varyingStencils->GetOffsets().at(0),
+                           &varyingStencils->GetControlIndices().at(0),
+                           &varyingStencils->GetWeights().at(0),
+                           start,
+                           end);
     }
 }
 
diff --git a/opensubdiv/osd/ompComputeController.h b/opensubdiv/osd/ompComputeController.h
index f43301f1..29831d4c 100644
--- a/opensubdiv/osd/ompComputeController.h
+++ b/opensubdiv/osd/ompComputeController.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatchDispatcher.h"
 #include "../osd/cpuComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 
@@ -66,9 +65,6 @@ public:
     ///
     /// @param  context       The CpuContext to apply refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@@ -83,19 +79,16 @@ public:
     ///
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void Compute( CpuComputeContext const * context,
-                      Far::KernelBatchVector const & batches,
                       VERTEX_BUFFER  * vertexBuffer,
                       VARYING_BUFFER * varyingBuffer,
                       VertexBufferDescriptor const * vertexDesc=NULL,
                       VertexBufferDescriptor const * varyingDesc=NULL ){
 
-        if (batches.empty()) return;
-
         omp_set_num_threads(_numThreads);
 
         bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
 
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
 
         unbind();
     }
@@ -104,17 +97,13 @@ public:
     ///
     /// @param  context       The CpuContext to apply refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     template<class VERTEX_BUFFER>
         void Compute(CpuComputeContext const * context,
-                     Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER *vertexBuffer) {
 
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
     }
 
     /// Waits until all running subdivision kernels finish.
@@ -122,10 +111,7 @@ public:
 
 protected:
 
-    friend class Far::KernelBatchDispatcher;
-
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
-        ComputeContext const *context) const;
+    void ApplyStencilTableKernel(ComputeContext const *context) const;
 
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void bind( VERTEX_BUFFER * vertexBuffer,
diff --git a/opensubdiv/osd/tbbComputeController.cpp b/opensubdiv/osd/tbbComputeController.cpp
index a87f2fa0..6ce55113 100644
--- a/opensubdiv/osd/tbbComputeController.cpp
+++ b/opensubdiv/osd/tbbComputeController.cpp
@@ -49,7 +49,7 @@ TbbComputeController::TbbComputeController(int numThreads)
 
 void
 TbbComputeController::ApplyStencilTableKernel(
-    Far::KernelBatch const &batch, ComputeContext const *context) const {
+    ComputeContext const *context) const {
 
     assert(context);
 
@@ -57,6 +57,9 @@ TbbComputeController::ApplyStencilTableKernel(
 
     if (vertexStencils and _currentBindState.vertexBuffer) {
 
+        int start = 0;
+        int end = vertexStencils->GetNumStencils();
+
         VertexBufferDescriptor const & desc = _currentBindState.vertexDesc;
 
         float const * srcBuffer = _currentBindState.vertexBuffer + desc.offset;
@@ -65,19 +68,22 @@ TbbComputeController::ApplyStencilTableKernel(
             vertexStencils->GetNumControlVertices() * desc.stride;
 
         TbbComputeStencils(_currentBindState.vertexDesc,
-                              srcBuffer, destBuffer,
-                              &vertexStencils->GetSizes().at(0),
-                              &vertexStencils->GetOffsets().at(0),
-                              &vertexStencils->GetControlIndices().at(0),
-                              &vertexStencils->GetWeights().at(0),
-                              batch.start,
-                              batch.end);
+                           srcBuffer, destBuffer,
+                           &vertexStencils->GetSizes().at(0),
+                           &vertexStencils->GetOffsets().at(0),
+                           &vertexStencils->GetControlIndices().at(0),
+                           &vertexStencils->GetWeights().at(0),
+                           start,
+                           end);
     }
 
     Far::StencilTables const * varyingStencils = context->GetVaryingStencilTables();
 
     if (varyingStencils and _currentBindState.varyingBuffer) {
 
+        int start = 0;
+        int end = varyingStencils->GetNumStencils();
+
         VertexBufferDescriptor const & desc = _currentBindState.varyingDesc;
 
         float const * srcBuffer = _currentBindState.varyingBuffer + desc.offset;
@@ -86,13 +92,13 @@ TbbComputeController::ApplyStencilTableKernel(
             varyingStencils->GetNumControlVertices() * desc.stride;
 
         TbbComputeStencils(_currentBindState.varyingDesc,
-                              srcBuffer, destBuffer,
-                              &varyingStencils->GetSizes().at(0),
-                              &varyingStencils->GetOffsets().at(0),
-                              &varyingStencils->GetControlIndices().at(0),
-                              &varyingStencils->GetWeights().at(0),
-                              batch.start,
-                              batch.end);
+                           srcBuffer, destBuffer,
+                           &varyingStencils->GetSizes().at(0),
+                           &varyingStencils->GetOffsets().at(0),
+                           &varyingStencils->GetControlIndices().at(0),
+                           &varyingStencils->GetWeights().at(0),
+                           start,
+                           end);
     }
 }
 
diff --git a/opensubdiv/osd/tbbComputeController.h b/opensubdiv/osd/tbbComputeController.h
index 7f1ac4c0..6bb13d87 100644
--- a/opensubdiv/osd/tbbComputeController.h
+++ b/opensubdiv/osd/tbbComputeController.h
@@ -27,7 +27,6 @@
 
 #include "../version.h"
 
-#include "../far/kernelBatchDispatcher.h"
 #include "../osd/cpuComputeContext.h"
 #include "../osd/vertexDescriptor.h"
 
@@ -62,9 +61,6 @@ public:
     ///
     /// @param  context       The CpuContext to apply refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     /// @param  vertexDesc    The descriptor of vertex elements to be refined.
@@ -79,17 +75,14 @@ public:
     ///
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void Compute( CpuComputeContext const * context,
-                      Far::KernelBatchVector const & batches,
                       VERTEX_BUFFER  * vertexBuffer,
                       VARYING_BUFFER * varyingBuffer,
                       VertexBufferDescriptor const * vertexDesc=NULL,
                       VertexBufferDescriptor const * varyingDesc=NULL ){
 
-        if (batches.empty()) return;
-
         bind(vertexBuffer, varyingBuffer, vertexDesc, varyingDesc);
 
-        Far::KernelBatchDispatcher::Apply(this, context, batches, /*maxlevel*/ -1);
+        ApplyStencilTableKernel(context);
 
         unbind();
     }
@@ -98,17 +91,13 @@ public:
     ///
     /// @param  context       The CpuContext to apply refinement operations to
     ///
-    /// @param  batches       Vector of batches of vertices organized by operative
-    ///                       kernel
-    ///
     /// @param  vertexBuffer  Vertex-interpolated data buffer
     ///
     template<class VERTEX_BUFFER>
         void Compute(CpuComputeContext const * context,
-                     Far::KernelBatchVector const & batches,
                      VERTEX_BUFFER *vertexBuffer) {
 
-        Compute<VERTEX_BUFFER>(context, batches, vertexBuffer, (VERTEX_BUFFER*)0);
+        Compute<VERTEX_BUFFER>(context, vertexBuffer, (VERTEX_BUFFER*)0);
     }
 
     /// Waits until all running subdivision kernels finish.
@@ -116,10 +105,7 @@ public:
 
 protected:
 
-    friend class Far::KernelBatchDispatcher;
-
-    void ApplyStencilTableKernel(Far::KernelBatch const &batch,
-        ComputeContext const *context) const;
+    void ApplyStencilTableKernel(ComputeContext const *context) const;
 
     template<class VERTEX_BUFFER, class VARYING_BUFFER>
         void bind( VERTEX_BUFFER * vertexBuffer,
diff --git a/regression/osd_regression/main.cpp b/regression/osd_regression/main.cpp
index ec499863..6caa4c36 100644
--- a/regression/osd_regression/main.cpp
+++ b/regression/osd_regression/main.cpp
@@ -303,11 +303,7 @@ checkMeshCPU( FarTopologyRefiner *refiner,
     
     vb->UpdateData( coarseverts[0].GetPos(), 0, (int)coarseverts.size() );
     
-    Far::KernelBatchVector kernelBatches;
-    kernelBatches.push_back(
-        Far::StencilTablesFactory::Create(*vertexStencils));
-    
-    controller->Compute( context, kernelBatches, vb );
+    controller->Compute( context, vb );
 
     int result = checkVertexBuffer(*refiner, refmesh, vb->BindCpuBuffer(), 
         vb->GetNumElements());
@@ -341,11 +337,7 @@ checkMeshCPUGL(FarTopologyRefiner *refiner,
     
     vb->UpdateData( coarseverts[0].GetPos(), 0, (int)coarseverts.size() );
 
-    Far::KernelBatchVector kernelBatches;
-    kernelBatches.push_back(
-        Far::StencilTablesFactory::Create(*vertexStencils));
-    
-    controller->Compute( context, kernelBatches, vb );
+    controller->Compute( context, vb );
     
     int result = checkVertexBuffer(*refiner, refmesh, 
         vb->BindCpuBuffer(), vb->GetNumElements());
@@ -383,11 +375,7 @@ checkMeshCL( FarTopologyRefiner *refiner,
     vb->UpdateData( coarseverts[0].GetPos(), 0, (int)coarseverts.size(),
         g_clQueue );
 
-    Far::KernelBatchVector kernelBatches;
-    kernelBatches.push_back(
-        Far::StencilTablesFactory::Create(*vertexStencils));
-    
-    controller->Compute( context, kernelBatches, vb );
+    controller->Compute( context, vb );
 
     // read data back from CL buffer
     size_t dataSize = vb->GetNumVertices() * vb->GetNumElements();
diff --git a/tutorials/osd/tutorial_0/osd_tutorial_0.cpp b/tutorials/osd/tutorial_0/osd_tutorial_0.cpp
index d36315c1..a9ea4f32 100644
--- a/tutorials/osd/tutorial_0/osd_tutorial_0.cpp
+++ b/tutorials/osd/tutorial_0/osd_tutorial_0.cpp
@@ -75,8 +75,6 @@ int main(int, char **) {
 
     Osd::CpuComputeContext * context=0;
 
-    Far::KernelBatchVector batches;
-
     //
     // Setup phase
     //
@@ -92,9 +90,6 @@ int main(int, char **) {
         Far::StencilTables const * stencilTables =
             Far::StencilTablesFactory::Create(*refiner, options);
 
-        // We need a kernel batch to dispatch Compute launches
-        batches.push_back(Far::StencilTablesFactory::Create(*stencilTables));
-
         // Create an Osd Compute Context from the stencil tables
         context = Osd::CpuComputeContext::Create(stencilTables);
 
@@ -122,7 +117,7 @@ int main(int, char **) {
         vbuffer->UpdateData(g_verts, 0, nCoarseVerts);
 
         // Launch the computation
-        controller.Compute(context, batches, vbuffer);
+        controller.Compute(context, vbuffer);
     }
 
     { // Visualization with Maya : print a MEL script that generates particles

From 214e62e67a10d16031642c1fbcc86f5f224731bb Mon Sep 17 00:00:00 2001
From: Takahito Tejima <takahito@pixar.com>
Date: Thu, 9 Apr 2015 13:58:47 -0700
Subject: [PATCH 2/4] add a constructor to FarStencilTables to initialze
 members.

---
 opensubdiv/far/stencilTables.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/opensubdiv/far/stencilTables.h b/opensubdiv/far/stencilTables.h
index 09cb0a95..1e5063b8 100644
--- a/opensubdiv/far/stencilTables.h
+++ b/opensubdiv/far/stencilTables.h
@@ -203,6 +203,7 @@ protected:
     void resize(int nstencils, int nelems);
 
 protected:
+    StencilTables() : _numControlVertices(0) {}
 
     friend class StencilTablesFactory;
     friend class GregoryBasisFactory;

From 24a435da013a129985a06739b6964b316b3e4ed6 Mon Sep 17 00:00:00 2001
From: Takahito Tejima <takahito@pixar.com>
Date: Thu, 9 Apr 2015 13:59:59 -0700
Subject: [PATCH 3/4] avoid lauching stencil kernel and not to allocate device
 tables if the stencil table is empty.

---
 opensubdiv/osd/clComputeContext.cpp           | 12 ++++---
 opensubdiv/osd/cpuComputeController.cpp       | 36 ++++++++++---------
 opensubdiv/osd/cudaComputeContext.cpp         | 12 ++++---
 opensubdiv/osd/cudaComputeController.cpp      | 32 +++++++++--------
 opensubdiv/osd/d3d11ComputeContext.cpp        | 31 +++++++++-------
 opensubdiv/osd/glslComputeContext.cpp         | 20 ++++++-----
 .../glslTransformFeedbackComputeContext.cpp   | 20 ++++++-----
 opensubdiv/osd/ompComputeController.cpp       | 36 ++++++++++---------
 opensubdiv/osd/tbbComputeController.cpp       | 36 ++++++++++---------
 9 files changed, 137 insertions(+), 98 deletions(-)

diff --git a/opensubdiv/osd/clComputeContext.cpp b/opensubdiv/osd/clComputeContext.cpp
index 5f02a700..0069aa11 100644
--- a/opensubdiv/osd/clComputeContext.cpp
+++ b/opensubdiv/osd/clComputeContext.cpp
@@ -56,11 +56,15 @@ class CLComputeContext::CLStencilTables {
 public:
 
     CLStencilTables(Far::StencilTables const & stencilTables, cl_context clContext) {
-        _sizes = createCLBuffer(stencilTables.GetSizes(), clContext);
-        _offsets = createCLBuffer(stencilTables.GetOffsets(), clContext);
-        _indices = createCLBuffer(stencilTables.GetControlIndices(), clContext);
-        _weights = createCLBuffer(stencilTables.GetWeights(), clContext);
         _numStencils = stencilTables.GetNumStencils();
+        if (_numStencils > 0) {
+            _sizes = createCLBuffer(stencilTables.GetSizes(), clContext);
+            _offsets = createCLBuffer(stencilTables.GetOffsets(), clContext);
+            _indices = createCLBuffer(stencilTables.GetControlIndices(), clContext);
+            _weights = createCLBuffer(stencilTables.GetWeights(), clContext);
+        } else {
+            _sizes = _offsets = _indices = _weights = NULL;
+        }
     }
 
     ~CLStencilTables() {
diff --git a/opensubdiv/osd/cpuComputeController.cpp b/opensubdiv/osd/cpuComputeController.cpp
index 4eca60cf..c23130d9 100644
--- a/opensubdiv/osd/cpuComputeController.cpp
+++ b/opensubdiv/osd/cpuComputeController.cpp
@@ -64,14 +64,16 @@ CpuComputeController::ApplyStencilTableKernel(
         int start = 0;
         int end = vertexStencils->GetNumStencils();
 
-        CpuComputeStencils(_currentBindState.vertexDesc,
-                           srcBuffer, destBuffer,
-                           &vertexStencils->GetSizes().at(0),
-                           &vertexStencils->GetOffsets().at(0),
-                           &vertexStencils->GetControlIndices().at(0),
-                           &vertexStencils->GetWeights().at(0),
-                           start,
-                           end);
+        if (end > start) {
+            CpuComputeStencils(_currentBindState.vertexDesc,
+                               srcBuffer, destBuffer,
+                               &vertexStencils->GetSizes().at(0),
+                               &vertexStencils->GetOffsets().at(0),
+                               &vertexStencils->GetControlIndices().at(0),
+                               &vertexStencils->GetWeights().at(0),
+                               start,
+                               end);
+        }
     }
 
     Far::StencilTables const * varyingStencils = context->GetVaryingStencilTables();
@@ -88,14 +90,16 @@ CpuComputeController::ApplyStencilTableKernel(
         int start = 0;
         int end = varyingStencils->GetNumStencils();
 
-        CpuComputeStencils(_currentBindState.varyingDesc,
-                           srcBuffer, destBuffer,
-                           &varyingStencils->GetSizes().at(0),
-                           &varyingStencils->GetOffsets().at(0),
-                           &varyingStencils->GetControlIndices().at(0),
-                           &varyingStencils->GetWeights().at(0),
-                           start,
-                           end);
+        if (end > start) {
+            CpuComputeStencils(_currentBindState.varyingDesc,
+                               srcBuffer, destBuffer,
+                               &varyingStencils->GetSizes().at(0),
+                               &varyingStencils->GetOffsets().at(0),
+                               &varyingStencils->GetControlIndices().at(0),
+                               &varyingStencils->GetWeights().at(0),
+                               start,
+                               end);
+        }
     }
 }
 
diff --git a/opensubdiv/osd/cudaComputeContext.cpp b/opensubdiv/osd/cudaComputeContext.cpp
index cac7b5c6..667056de 100644
--- a/opensubdiv/osd/cudaComputeContext.cpp
+++ b/opensubdiv/osd/cudaComputeContext.cpp
@@ -63,11 +63,15 @@ class CudaComputeContext::CudaStencilTables {
 public:
 
     CudaStencilTables(Far::StencilTables const & stencilTables) {
-        _sizes = createCudaBuffer(stencilTables.GetSizes());
-        _offsets = createCudaBuffer(stencilTables.GetOffsets());
-        _indices = createCudaBuffer(stencilTables.GetControlIndices());
-        _weights = createCudaBuffer(stencilTables.GetWeights());
         _numStencils = stencilTables.GetNumStencils();
+        if (_numStencils > 0) {
+            _sizes = createCudaBuffer(stencilTables.GetSizes());
+            _offsets = createCudaBuffer(stencilTables.GetOffsets());
+            _indices = createCudaBuffer(stencilTables.GetControlIndices());
+            _weights = createCudaBuffer(stencilTables.GetWeights());
+        } else {
+            _sizes = _offsets = _indices = _weights = NULL;
+        }
     }
 
     ~CudaStencilTables() {
diff --git a/opensubdiv/osd/cudaComputeController.cpp b/opensubdiv/osd/cudaComputeController.cpp
index 53bb4c86..b6a7760e 100644
--- a/opensubdiv/osd/cudaComputeController.cpp
+++ b/opensubdiv/osd/cudaComputeController.cpp
@@ -64,13 +64,15 @@ CudaComputeController::ApplyStencilTableKernel(
         float * dst = const_cast<float *>(src) +
             context->GetNumControlVertices() * stride;
 
-        CudaComputeStencils(src, dst, length, stride,
-                               (unsigned char const *)context->GetVertexStencilTablesSizes(),
-                               (int const *)context->GetVertexStencilTablesOffsets(),
-                               (int const *)context->GetVertexStencilTablesIndices(),
-                               (float const *)context->GetVertexStencilTablesWeights(),
-                               start,
-                               end);
+        if (end > start) {
+            CudaComputeStencils(src, dst, length, stride,
+                                (unsigned char const *)context->GetVertexStencilTablesSizes(),
+                                (int const *)context->GetVertexStencilTablesOffsets(),
+                                (int const *)context->GetVertexStencilTablesIndices(),
+                                (float const *)context->GetVertexStencilTablesWeights(),
+                                start,
+                                end);
+        }
     }
 
     if (context->HasVaryingStencilTables()) {
@@ -86,13 +88,15 @@ CudaComputeController::ApplyStencilTableKernel(
         float * dst = const_cast<float *>(src) +
             context->GetNumControlVertices() * stride;
 
-        CudaComputeStencils(src, dst, length, stride,
-                               (unsigned char const *)context->GetVaryingStencilTablesSizes(),
-                               (int const *)context->GetVaryingStencilTablesOffsets(),
-                               (int const *)context->GetVaryingStencilTablesIndices(),
-                               (float const *)context->GetVaryingStencilTablesWeights(),
-                               start,
-                               end);
+        if (end > start) {
+            CudaComputeStencils(src, dst, length, stride,
+                                (unsigned char const *)context->GetVaryingStencilTablesSizes(),
+                                (int const *)context->GetVaryingStencilTablesOffsets(),
+                                (int const *)context->GetVaryingStencilTablesIndices(),
+                                (float const *)context->GetVaryingStencilTablesWeights(),
+                                start,
+                                end);
+        }
     }
 }
 
diff --git a/opensubdiv/osd/d3d11ComputeContext.cpp b/opensubdiv/osd/d3d11ComputeContext.cpp
index 1899abc5..798d5eee 100644
--- a/opensubdiv/osd/d3d11ComputeContext.cpp
+++ b/opensubdiv/osd/d3d11ComputeContext.cpp
@@ -114,19 +114,26 @@ public:
     D3D11StencilTables(Far::StencilTables const & stencilTables,
         ID3D11DeviceContext *deviceContext) {
 
-        // convert unsigned char sizes buffer to ints (HLSL does not have uint8 type)
-        std::vector<int> const sizes(stencilTables.GetSizes().begin(),
-            stencilTables.GetSizes().end());
-
-        _sizes.initialize(sizes, DXGI_FORMAT_R32_SINT, deviceContext);
-
-        _offsets.initialize(stencilTables.GetOffsets(), DXGI_FORMAT_R32_SINT, deviceContext);
-
-        _indices.initialize(stencilTables.GetControlIndices(), DXGI_FORMAT_R32_SINT, deviceContext);
-
-        _weights.initialize(stencilTables.GetWeights(), DXGI_FORMAT_R32_FLOAT, deviceContext);
-
         _numStencils = stencilTables.GetNumStencils();
+        if (_numStencils > 0) {
+            // convert unsigned char sizes buffer to ints
+            // (HLSL does not have uint8 type)
+            std::vector<int> const sizes(stencilTables.GetSizes().begin(),
+                                         stencilTables.GetSizes().end());
+
+            _sizes.initialize(sizes,
+                              DXGI_FORMAT_R32_SINT,
+                              deviceContext);
+            _offsets.initialize(stencilTables.GetOffsets(),
+                                DXGI_FORMAT_R32_SINT,
+                                deviceContext);
+            _indices.initialize(stencilTables.GetControlIndices(),
+                                DXGI_FORMAT_R32_SINT,
+                                deviceContext);
+            _weights.initialize(stencilTables.GetWeights(),
+                                DXGI_FORMAT_R32_FLOAT,
+                                deviceContext);
+        }
     }
 
     bool IsValid() const {
diff --git a/opensubdiv/osd/glslComputeContext.cpp b/opensubdiv/osd/glslComputeContext.cpp
index 54d2ce21..32886d74 100644
--- a/opensubdiv/osd/glslComputeContext.cpp
+++ b/opensubdiv/osd/glslComputeContext.cpp
@@ -67,18 +67,22 @@ class GLSLComputeContext::GLSLStencilTables {
 public:
 
     GLSLStencilTables(Far::StencilTables const & stencilTables) {
-        _sizes = createGLSLBuffer(stencilTables.GetSizes());
-        _offsets = createGLSLBuffer(stencilTables.GetOffsets());
-        _indices = createGLSLBuffer(stencilTables.GetControlIndices());
-        _weights = createGLSLBuffer(stencilTables.GetWeights());
         _numStencils = stencilTables.GetNumStencils();
+        if (_numStencils > 0) {
+            _sizes = createGLSLBuffer(stencilTables.GetSizes());
+            _offsets = createGLSLBuffer(stencilTables.GetOffsets());
+            _indices = createGLSLBuffer(stencilTables.GetControlIndices());
+            _weights = createGLSLBuffer(stencilTables.GetWeights());
+        } else {
+            _sizes = _offsets = _indices = _weights = 0;
+        }
     }
 
     ~GLSLStencilTables() {
-        glDeleteBuffers(1, &_sizes);
-        glDeleteBuffers(1, &_offsets);
-        glDeleteBuffers(1, &_weights);
-        glDeleteBuffers(1, &_indices);
+        if (_sizes)   glDeleteBuffers(1, &_sizes);
+        if (_offsets) glDeleteBuffers(1, &_offsets);
+        if (_weights) glDeleteBuffers(1, &_weights);
+        if (_indices) glDeleteBuffers(1, &_indices);
     }
 
     bool IsValid() const {
diff --git a/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp b/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
index 50c9fa3c..ad44c462 100644
--- a/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
+++ b/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
@@ -82,18 +82,22 @@ class GLSLTransformFeedbackComputeContext::GLStencilTables {
 public:
 
     GLStencilTables(Far::StencilTables const & stencilTables) {
-        _sizes = createGLTextureBuffer(stencilTables.GetSizes(), GL_R8UI);
-        _offsets = createGLTextureBuffer(stencilTables.GetOffsets(), GL_R32I);
-        _indices = createGLTextureBuffer(stencilTables.GetControlIndices(), GL_R32I);
-        _weights = createGLTextureBuffer(stencilTables.GetWeights(), GL_R32F);
         _numStencils = stencilTables.GetNumStencils();
+        if (_numStencils > 0) {
+            _sizes = createGLTextureBuffer(stencilTables.GetSizes(), GL_R8UI);
+            _offsets = createGLTextureBuffer(stencilTables.GetOffsets(), GL_R32I);
+            _indices = createGLTextureBuffer(stencilTables.GetControlIndices(), GL_R32I);
+            _weights = createGLTextureBuffer(stencilTables.GetWeights(), GL_R32F);
+        } else {
+            _sizes = _offsets = _indices = _weights = 0;
+        }
     }
 
     ~GLStencilTables() {
-        glDeleteTextures(1, &_sizes);
-        glDeleteTextures(1, &_offsets);
-        glDeleteTextures(1, &_weights);
-        glDeleteTextures(1, &_indices);
+        if (_sizes) glDeleteTextures(1, &_sizes);
+        if (_offsets) glDeleteTextures(1, &_offsets);
+        if (_weights) glDeleteTextures(1, &_weights);
+        if (_indices) glDeleteTextures(1, &_indices);
     }
 
     bool IsValid() const {
diff --git a/opensubdiv/osd/ompComputeController.cpp b/opensubdiv/osd/ompComputeController.cpp
index 3b9f1eeb..9e94a994 100644
--- a/opensubdiv/osd/ompComputeController.cpp
+++ b/opensubdiv/osd/ompComputeController.cpp
@@ -58,14 +58,16 @@ OmpComputeController::ApplyStencilTableKernel(
         float * destBuffer = _currentBindState.vertexBuffer + desc.offset +
             vertexStencils->GetNumControlVertices() * desc.stride;
 
-        OmpComputeStencils(_currentBindState.vertexDesc,
-                           srcBuffer, destBuffer,
-                           &vertexStencils->GetSizes().at(0),
-                           &vertexStencils->GetOffsets().at(0),
-                           &vertexStencils->GetControlIndices().at(0),
-                           &vertexStencils->GetWeights().at(0),
-                           start,
-                           end);
+        if (end > start) {
+            OmpComputeStencils(_currentBindState.vertexDesc,
+                               srcBuffer, destBuffer,
+                               &vertexStencils->GetSizes().at(0),
+                               &vertexStencils->GetOffsets().at(0),
+                               &vertexStencils->GetControlIndices().at(0),
+                               &vertexStencils->GetWeights().at(0),
+                               start,
+                               end);
+        }
     }
 
     Far::StencilTables const * varyingStencils = context->GetVaryingStencilTables();
@@ -82,14 +84,16 @@ OmpComputeController::ApplyStencilTableKernel(
         float * destBuffer = _currentBindState.varyingBuffer + desc.offset +
             varyingStencils->GetNumControlVertices() * desc.stride;
 
-        OmpComputeStencils(_currentBindState.varyingDesc,
-                           srcBuffer, destBuffer,
-                           &varyingStencils->GetSizes().at(0),
-                           &varyingStencils->GetOffsets().at(0),
-                           &varyingStencils->GetControlIndices().at(0),
-                           &varyingStencils->GetWeights().at(0),
-                           start,
-                           end);
+        if (end > start) {
+            OmpComputeStencils(_currentBindState.varyingDesc,
+                               srcBuffer, destBuffer,
+                               &varyingStencils->GetSizes().at(0),
+                               &varyingStencils->GetOffsets().at(0),
+                               &varyingStencils->GetControlIndices().at(0),
+                               &varyingStencils->GetWeights().at(0),
+                               start,
+                               end);
+        }
     }
 }
 
diff --git a/opensubdiv/osd/tbbComputeController.cpp b/opensubdiv/osd/tbbComputeController.cpp
index 6ce55113..75ddcf05 100644
--- a/opensubdiv/osd/tbbComputeController.cpp
+++ b/opensubdiv/osd/tbbComputeController.cpp
@@ -67,14 +67,16 @@ TbbComputeController::ApplyStencilTableKernel(
         float * destBuffer = _currentBindState.vertexBuffer + desc.offset +
             vertexStencils->GetNumControlVertices() * desc.stride;
 
-        TbbComputeStencils(_currentBindState.vertexDesc,
-                           srcBuffer, destBuffer,
-                           &vertexStencils->GetSizes().at(0),
-                           &vertexStencils->GetOffsets().at(0),
-                           &vertexStencils->GetControlIndices().at(0),
-                           &vertexStencils->GetWeights().at(0),
-                           start,
-                           end);
+        if (end > start) {
+            TbbComputeStencils(_currentBindState.vertexDesc,
+                               srcBuffer, destBuffer,
+                               &vertexStencils->GetSizes().at(0),
+                               &vertexStencils->GetOffsets().at(0),
+                               &vertexStencils->GetControlIndices().at(0),
+                               &vertexStencils->GetWeights().at(0),
+                               start,
+                               end);
+        }
     }
 
     Far::StencilTables const * varyingStencils = context->GetVaryingStencilTables();
@@ -91,14 +93,16 @@ TbbComputeController::ApplyStencilTableKernel(
         float * destBuffer = _currentBindState.varyingBuffer + desc.offset +
             varyingStencils->GetNumControlVertices() * desc.stride;
 
-        TbbComputeStencils(_currentBindState.varyingDesc,
-                           srcBuffer, destBuffer,
-                           &varyingStencils->GetSizes().at(0),
-                           &varyingStencils->GetOffsets().at(0),
-                           &varyingStencils->GetControlIndices().at(0),
-                           &varyingStencils->GetWeights().at(0),
-                           start,
-                           end);
+        if (end > start) {
+            TbbComputeStencils(_currentBindState.varyingDesc,
+                               srcBuffer, destBuffer,
+                               &varyingStencils->GetSizes().at(0),
+                               &varyingStencils->GetOffsets().at(0),
+                               &varyingStencils->GetControlIndices().at(0),
+                               &varyingStencils->GetWeights().at(0),
+                               start,
+                               end);
+        }
     }
 }
 

From 1ec0b2b1f271e593a3e7a7053e33be12eea9e9c3 Mon Sep 17 00:00:00 2001
From: Takahito Tejima <takahito@pixar.com>
Date: Thu, 9 Apr 2015 17:07:52 -0700
Subject: [PATCH 4/4] add includes

---
 opensubdiv/osd/glslComputeController.h                  | 1 +
 opensubdiv/osd/glslTransformFeedbackComputeController.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/opensubdiv/osd/glslComputeController.h b/opensubdiv/osd/glslComputeController.h
index 4b3b7127..8c87b37d 100644
--- a/opensubdiv/osd/glslComputeController.h
+++ b/opensubdiv/osd/glslComputeController.h
@@ -31,6 +31,7 @@
 #include "../osd/vertexDescriptor.h"
 
 #include <vector>
+#include <cassert>
 
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
diff --git a/opensubdiv/osd/glslTransformFeedbackComputeController.h b/opensubdiv/osd/glslTransformFeedbackComputeController.h
index 50b38f54..3a9c92ce 100644
--- a/opensubdiv/osd/glslTransformFeedbackComputeController.h
+++ b/opensubdiv/osd/glslTransformFeedbackComputeController.h
@@ -31,6 +31,7 @@
 #include "../osd/vertexDescriptor.h"
 
 #include <vector>
+#include <cassert>
 
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {