Add ISPC limit surface evaluation

2024-11-09 22:00:06 +00:00 · 2015-07-20 14:12:11 -07:00 · 2015-07-20 14:12:11 -07:00 · d3f8725e79
commit d3f8725e79
parent bd7b017c02
13 changed files with 1953 additions and 30 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -197,6 +197,8 @@ if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_CLANGCC OR CMAKE_COMPILER_IS_IC
            endif()

        endforeach()
+
+        list(APPEND OSD_COMPILER_FLAGS -std=c++11)
    endif()

 elseif(MSVC)
@ -321,6 +323,9 @@ endif()
 if(NOT NO_TBB)
    find_package(TBB 4.0)
 endif()
+if(NOT NO_ISPC)
+    find_package(ISPC 1.6)
+endif()
 if (NOT NO_OPENGL)
    find_package(OpenGL)
 endif()
@ -539,6 +544,12 @@ if (NOT NO_MAYA)
    endif()
 endif()

+if(ISPC_FOUND)
+    add_definitions(
+        -DOPENSUBDIV_HAS_ISPC
+    )
+endif()
+
 # Link examples & regressions dynamically against Osd
 set( OSD_LINK_TARGET osd_dynamic_cpu osd_dynamic_gpu )

--- a/cmake/FindISPC.cmake
+++ b/cmake/FindISPC.cmake
@ -0,0 +1,94 @@
+#
+#   Copyright 2013 Pixar
+#
+#   Licensed under the Apache License, Version 2.0 (the "Apache License")
+#   with the following modification; you may not use this file except in
+#   compliance with the Apache License and the following modification to it:
+#   Section 6. Trademarks. is deleted and replaced with:
+#
+#   6. Trademarks. This License does not grant permission to use the trade
+#      names, trademarks, service marks, or product names of the Licensor
+#      and its affiliates, except as required to comply with Section 4(c) of
+#      the License and to reproduce the content of the NOTICE file.
+#
+#   You may obtain a copy of the Apache License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the Apache License with the above modification is
+#   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied. See the Apache License for the specific
+#   language governing permissions and limitations under the Apache License.
+#
+
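+# - Try to find Intel's ISPC (Intel SPMD Program Compiler)
+# Once done this will define
+#
+#  ISPC_FOUND   - System has ISPC
+#  ISPC_DIR     - The directory containing the ispc executable
+#  ISPC_VERSION - The version string extracted from "ispc --version"
+#
+# It also provides the ispc_compile() macro used to build .ispc sources.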
+
+# Locate the directory that holds the ispc executable.
+# Only the generic (non-Windows, non-Apple) case is implemented; on the other
+# platforms this module does not search at all. The lookup is restricted to
+# the user-supplied ISPC_LOCATION hint, system paths are never consulted.
+if (NOT WIN32 AND NOT APPLE)
+    find_path(ISPC_DIR
+        NAMES ispc
+        PATHS ${ISPC_LOCATION}
+        NO_DEFAULT_PATH
+        NO_SYSTEM_ENVIRONMENT_PATH
+        DOC "The directory where ISPC reside")
+endif ()
+
+# Ask the compiler for its version and keep only the dotted version number.
+# The dots are escaped and each component may have several digits, so that a
+# release such as 1.10.0 is recognised and unrelated text is not matched.
+# The captured output is quoted: if ispc prints nothing, string() still
+# receives its input argument and ISPC_VERSION simply ends up empty.
+if (ISPC_DIR)
+    execute_process(
+        COMMAND "${ISPC_DIR}/ispc" --version
+        OUTPUT_VARIABLE ISPC_VERSION
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+    string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" ISPC_VERSION "${ISPC_VERSION}")
+endif ()
+
+include(FindPackageHandleStandardArgs)
+
+# ISPC counts as found once ISPC_DIR is set; ISPC_VERSION is what gets
+# compared against the version requested in find_package().
+find_package_handle_standard_args(ISPC
+    REQUIRED_VARS ISPC_DIR
+    VERSION_VAR   ISPC_VERSION)
+
+# Keep the path out of the default cache view.
+mark_as_advanced(ISPC_DIR)
+
+# ispc_compile(<source>...)
+#
+# Registers one custom command per .ispc source (given relative to
+# CMAKE_CURRENT_SOURCE_DIR) that runs the ISPC compiler and produces:
+#   <ISPC_TARGET_DIR>/<name>.dev.o    - the object file
+#   <ISPC_TARGET_DIR>/<name>_ispc.h   - the generated C/C++ header
+#   <ISPC_TARGET_DIR>/<name>.dev.idep - the dependency listing (-MMM)
+#
+# This is deliberately a macro: it sets ISPC_TARGET_DIR and ISPC_OBJECTS
+# (the list of generated object files) in the caller's scope.
+macro(ispc_compile)
+
+    set(ISPC_TARGET_DIR "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/osd_ispc_obj.dir")
+
+    set(ISPC_OBJECTS "")
+
+    foreach(src ${ARGN})
+
+        get_filename_component(fname "${src}" NAME_WE)
+
+        set(results "${ISPC_TARGET_DIR}/${fname}.dev.o")
+
+        add_custom_command(
+            OUTPUT "${results}" "${ISPC_TARGET_DIR}/${fname}_ispc.h"
+            # Do not rely on the generator having created the directory.
+            COMMAND "${CMAKE_COMMAND}" -E make_directory "${ISPC_TARGET_DIR}"
+            COMMAND "${ISPC_DIR}/ispc"
+                --pic
+                -O1
+                --wno-perf
+                --woff
+                -h "${ISPC_TARGET_DIR}/${fname}_ispc.h"
+                -MMM "${ISPC_TARGET_DIR}/${fname}.dev.idep"
+                -o "${results}"
+                "${CMAKE_CURRENT_SOURCE_DIR}/${src}"
+            DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${src}"
+            COMMENT "Compiling ISPC source ${src}"
+            VERBATIM
+        )
+
+        list(APPEND ISPC_OBJECTS "${results}")
+
+    endforeach()
+
+endmacro()
+
--- a/examples/glEvalLimit/glEvalLimit.cpp
+++ b/examples/glEvalLimit/glEvalLimit.cpp
@ -34,6 +34,10 @@ GLFWmonitor* g_primary=0;
 #include <osd/cpuGLVertexBuffer.h>
 #include <osd/mesh.h>

+#ifdef OPENSUBDIV_HAS_ISPC
+    #include <osd/ispcEvaluator.h>
+#endif    
+    
 #ifdef OPENSUBDIV_HAS_TBB
    #include <osd/tbbEvaluator.h>
 #endif
@ -104,7 +108,8 @@ enum KernelType { kCPU = 0,
                  kCUDA = 3,
                  kCL = 4,
                  kGLXFB = 5,
-                  kGLCompute = 6 };
+                  kGLCompute = 6,
+                  kISPC = 7 };

 enum EndCap      { kEndCapBSplineBasis,
                   kEndCapGregoryBasis };
@ -169,10 +174,10 @@ float g_currentTime = 0;
 Stopwatch g_fpsTimer;

 //------------------------------------------------------------------------------
-int g_nParticles = 65536;
+int g_nParticles = 655360;

 bool g_randomStart = true;//false;
-bool g_animParticles = true;
+bool g_animParticles = false;

 GLuint g_samplesVAO=0;

@ -439,7 +444,9 @@ updateGeom() {
    assert(g_particles);

    float elapsed = g_currentTime - g_prevTime;
-    g_particles->Update(elapsed);
+    if(elapsed != 0.0f) {
+        g_particles->Update(elapsed);
+    }
    g_prevTime = g_currentTime;

    std::vector<OpenSubdiv::Osd::PatchCoord> const &patchCoords
@ -464,7 +471,7 @@ updateGeom() {
    }

    s.Stop();
-
+        
    g_evalTime = float(s.GetElapsed());
 }

@ -648,8 +655,20 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
            (vertexStencils, varyingStencils,
             nCoarseVertices, nverts, g_nParticles, g_patchTable,
             &glComputeEvaluatorCache);
+
+    }             
 #endif
-    }
+#if  defined(OPENSUBDIV_HAS_ISPC) && defined(OPENSUBDIV_HAS_TBB)
+    else if(g_kernel == kISPC) {
+        g_evalOutput = new EvalOutput<Osd::CpuGLVertexBuffer,
+                                      Osd::CpuGLVertexBuffer,
+                                      Far::StencilTable,
+                                      Osd::CpuPatchTable,
+                                      Osd::IspcEvaluator>
+            (vertexStencils, varyingStencils,
+             nCoarseVertices, nverts, g_nParticles, g_patchTable);    
+    }    
+#endif    

    // Create the 'uv particles' manager - this class manages the limit
    // location samples (ptex face index, (s,t) and updates them between frames.
@ -875,7 +894,7 @@ display() {
        }

        if (g_endCap != kEndCapBSplineBasis &&
-            (g_kernel != kCPU && g_kernel != kOPENMP && g_kernel != kTBB)) {
+            (g_kernel != kCPU && g_kernel != kOPENMP && g_kernel != kTBB && g_kernel != kISPC)) {
            static char msg[] =
                "ERROR: This kernel only supports BSpline basis patches.";
            g_hud.DrawString(g_width/4, g_height/4+20, 1, 0, 0, msg);
@ -1129,6 +1148,9 @@ initHUD() {
 #ifdef OPENSUBDIV_HAS_TBB
    g_hud.AddPullDownButton(compute_pulldown, "TBB", kTBB);
 #endif
+#if  defined(OPENSUBDIV_HAS_ISPC) && defined(OPENSUBDIV_HAS_TBB)
+    g_hud.AddPullDownButton(compute_pulldown, "ISPC", kISPC);
+#endif
 #ifdef OPENSUBDIV_HAS_CUDA
    g_hud.AddPullDownButton(compute_pulldown, "CUDA", kCUDA);
 #endif
--- a/examples/glEvalLimit/particles.cpp
+++ b/examples/glEvalLimit/particles.cpp
@ -32,17 +32,17 @@
 #ifdef OPENSUBDIV_HAS_TBB
 #include <tbb/parallel_for.h>
 #include <tbb/atomic.h>
-tbb::atomic<int> g_tbbCounter;
+
 class TbbUpdateKernel {
 public:
    TbbUpdateKernel(float speed,
                    STParticles::Position *positions,
                    float *velocities,
                    std::vector<STParticles::FaceInfo> const &adjacency,
-                    OpenSubdiv::Osd::PatchCoord *patchCoords,
+                    PatchHandleMap *patchHandleMap,
                    OpenSubdiv::Far::PatchMap const *patchMap) :
        _speed(speed), _positions(positions), _velocities(velocities),
-        _adjacency(adjacency), _patchCoords(patchCoords), _patchMap(patchMap) {
+        _adjacency(adjacency), _patchHandleMap(patchHandleMap), _patchMap(patchMap) {
    }

    void operator () (tbb::blocked_range<int> const &r) const {
@ -76,9 +76,13 @@ public:
            OpenSubdiv::Far::PatchTable::PatchHandle const *handle =
                _patchMap->FindPatch(p->ptexIndex, p->s, p->t);
            if (handle) {
-                int index = g_tbbCounter.fetch_and_add(1);
-                _patchCoords[index] =
-                    OpenSubdiv::Osd::PatchCoord(*handle, p->s, p->t);
+                PatchHandleMap::accessor a;
+                if( !_patchHandleMap->find(a, handle)) {  
+                    _patchHandleMap->insert(a, handle);               
+                }
+                std::vector<float> &st = a->second;
+                st.push_back(p->s);
+                st.push_back(p->t);  
            }
        }
    }
@ -87,7 +91,7 @@ private:
    STParticles::Position *_positions;
    float *_velocities;
    std::vector<STParticles::FaceInfo> const &_adjacency;
-    OpenSubdiv::Osd::PatchCoord *_patchCoords;
+    PatchHandleMap *_patchHandleMap;
    OpenSubdiv::Far::PatchMap const *_patchMap;
 };
 #endif
@ -276,18 +280,36 @@ STParticles::Update(float deltaTime) {
    if (deltaTime == 0) return;
    float speed = GetSpeed() * std::max(0.001f, std::min(deltaTime, 0.5f));

-    _patchCoords.clear();
-
    // XXX: this process should be parallelized.
 #ifdef OPENSUBDIV_HAS_TBB
-
-    _patchCoords.resize((int)GetNumParticles());
+    _patchHandleMap.clear();
+    
    TbbUpdateKernel kernel(speed, &_positions[0], &_velocities[0],
-                           _adjacency, &_patchCoords[0], _patchMap);;
-    g_tbbCounter = 0;
+                           _adjacency, &_patchHandleMap, _patchMap);;
    tbb::blocked_range<int> range(0, GetNumParticles(), 256);
    tbb::parallel_for(range, kernel);
-    _patchCoords.resize(g_tbbCounter);
+    
+
+    int nCoord = 0;
+    for(PatchHandleMap::iterator i  = _patchHandleMap.begin();
+                                 i != _patchHandleMap.end();
+                                 i ++) {
+        nCoord += (i->second.size() / 2);
+    }
+    
+    _patchCoords.resize(nCoord);
+    
+    int index = 0;
+    for(PatchHandleMap::iterator i  = _patchHandleMap.begin();
+                                 i != _patchHandleMap.end();
+                                 i ++) {
+        for(int j = 0; j < i->second.size(); j += 2) {
+            _patchCoords[index].handle = *(i->first);
+            _patchCoords[index].s      = i->second[j];
+            _patchCoords[index].t      = i->second[j+1];
+            index ++;
+        }
+    }     
 #else
    Position *  p = &_positions[0];
    float    * dp = &_velocities[0];
@ -323,7 +345,7 @@ STParticles::Update(float deltaTime) {
                OpenSubdiv::Osd::PatchCoord(*handle, p->s, p->t));
        }
    }
-#endif
+#endif   
 }

 // Dump adjacency info
--- a/examples/glEvalLimit/particles.h
+++ b/examples/glEvalLimit/particles.h
@ -30,6 +30,11 @@
 #include <osd/types.h>
 #include <iostream>

+#ifdef OPENSUBDIV_HAS_TBB
+#include <tbb/concurrent_hash_map.h>
+typedef tbb::concurrent_hash_map< OpenSubdiv::Far::PatchTable::PatchHandle const*, std::vector<float> > PatchHandleMap;
+#endif
+   
 //
 // In order to emphasize the dynamic nature of the EvalLimit API, where the
 // locations can be arbitrarily updated before each evaluation, the glEvalLimit
@ -142,7 +147,7 @@ public:
        return _velocities;
    }

-    std::vector<OpenSubdiv::Osd::PatchCoord> GetPatchCoords() const {
+    std::vector<OpenSubdiv::Osd::PatchCoord> const &GetPatchCoords() const {
        return _patchCoords;
    }

@ -159,6 +164,10 @@ private:
    std::vector<Position> _positions;

    std::vector<float> _velocities;
+    
+#ifdef OPENSUBDIV_HAS_TBB    
+    PatchHandleMap  _patchHandleMap;
+#endif

    std::vector<OpenSubdiv::Osd::PatchCoord> _patchCoords;

--- a/opensubdiv/CMakeLists.txt
+++ b/opensubdiv/CMakeLists.txt
@ -147,9 +147,16 @@ if (NOT NO_LIB)
    )
    set_target_properties(osd_static_cpu PROPERTIES OUTPUT_NAME osdCPU CLEAN_DIRECT_OUTPUT 1)

-    target_link_libraries(osd_static_cpu
-        ${PLATFORM_CPU_LIBRARIES}
-    )
+    # When ISPC support is enabled the pre-built kernel archive goes first on
+    # the link line, followed by the platform libraries in either case.
+    if(ISPC_FOUND)
+        target_link_libraries(osd_static_cpu osd_ispc_obj)
+    endif()
+    target_link_libraries(osd_static_cpu ${PLATFORM_CPU_LIBRARIES})

    install( TARGETS osd_static_cpu DESTINATION "${CMAKE_LIBDIR_BASE}" )

@ -200,9 +207,16 @@ if (NOT NO_LIB)
                )
        endif()

-        target_link_libraries(osd_dynamic_cpu
-            ${PLATFORM_CPU_LIBRARIES}
-        )
+        # When ISPC support is enabled the pre-built kernel archive goes first
+        # on the link line, followed by the platform libraries in either case.
+        if(ISPC_FOUND)
+            target_link_libraries(osd_dynamic_cpu osd_ispc_obj)
+        endif()
+        target_link_libraries(osd_dynamic_cpu ${PLATFORM_CPU_LIBRARIES})

        install( TARGETS osd_dynamic_cpu LIBRARY DESTINATION "${CMAKE_LIBDIR_BASE}" )

--- a/opensubdiv/far/patchParam.h
+++ b/opensubdiv/far/patchParam.h
@ -116,6 +116,15 @@ struct PatchParam {
    ///
    void Normalize( float & u, float & v ) const;

+    /// This function is the reverse operation of function Normalize()
+    /// The (u,v) pair is converted from patch sub-parametric space to control
+    /// face parametric space.
+    ///
+    /// @param u  u parameter
+    /// @param v  v parameter
+    ///        
+    void Denormalize( float & u, float & v) const;
+    
    unsigned int field0:32;
    unsigned int field1:32;
 };
@ -161,6 +170,20 @@ PatchParam::Normalize( float & u, float & v ) const {
    v = (v - pv) / frac;
 }

+// Inverse of Normalize(): rescales (u,v) by the patch's parametric fraction
+// and then offsets the result by the patch origin.
+inline void
+PatchParam::Denormalize( float & u, float & v ) const {
+
+    float const scale = GetParamFraction();
+
+    // Origin of this patch: GetU()/GetV() scaled by the parametric fraction.
+    float const originU = (float)GetU() * scale;
+    float const originV = (float)GetV() * scale;
+
+    // Shrink the patch-local coordinates, then translate them by the origin.
+    u = u * scale + originU;
+    v = v * scale + originV;
+}
+
 } // end namespace Far

 } // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/far/patchTable.h
+++ b/opensubdiv/far/patchTable.h
@ -68,6 +68,12 @@ public:
        Index arrayIndex, // Array index of the patch
              patchIndex, // Absolute Index of the patch
              vertIndex;  // Relative offset to the first CV of the patch in array
+              
+        /// Returns true when @p other carries the same three indices
+        /// (array index, patch index and vertex offset) as this handle.
+        /// The comparison does not modify the handle, so it is const and
+        /// can be used on const handles and through const pointers.
+        bool isEqual(const PatchHandle &other) const {
+            return other.arrayIndex == arrayIndex &&
+                   other.patchIndex == patchIndex &&
+                   other.vertIndex  == vertIndex;
+        }
    };

 public:
--- a/opensubdiv/osd/CMakeLists.txt
+++ b/opensubdiv/osd/CMakeLists.txt
@ -26,6 +26,7 @@

 #-------------------------------------------------------------------------------
 # source & headers
+
 set(CPU_SOURCE_FILES
    cpuEvaluator.cpp
    cpuKernel.cpp
@ -33,8 +34,12 @@ set(CPU_SOURCE_FILES
    cpuVertexBuffer.cpp
 )

-set(GPU_SOURCE_FILES )
+if( ISPC_FOUND) 
+    list(APPEND CPU_SOURCE_FILES ispcEvaluator.cpp)  
+endif()

+set(GPU_SOURCE_FILES )
+set(ISPC_SOURCE_FILES )
 set(INC_FILES )

 set(PRIVATE_HEADER_FILES
@ -296,6 +301,17 @@ if( CUDA_FOUND )
    endif()
 endif()

+if(ISPC_FOUND)
+    # ispc_compile() registers one custom command per kernel source and
+    # leaves the resulting object files in ISPC_OBJECTS.
+    list(APPEND ISPC_SOURCE_FILES ispcEvalLimitKernel.ispc)
+    ispc_compile(${ISPC_SOURCE_FILES})
+
+    # Archive the pre-built objects. No source of a CMake-known language is
+    # part of the target, so the linker language is stated explicitly.
+    add_library(osd_ispc_obj STATIC ${ISPC_OBJECTS})
+    set_target_properties(osd_ispc_obj PROPERTIES LINKER_LANGUAGE C)
+endif()
+
 list(APPEND DOXY_HEADER_FILES ${CUDA_PUBLIC_HEADERS})

 #-------------------------------------------------------------------------------
--- a/opensubdiv/osd/ispcEvalLimitKernel.ispc
+++ b/opensubdiv/osd/ispcEvalLimitKernel.ispc
@ -0,0 +1,880 @@
+//
+//   Copyright 2013 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+
+#define MAX_CHANNEL 4
+
+// Describes where one element's data sits inside a flat float buffer.
+// The kernels in this file add offset and stride directly to float pointers
+// and use length / 3 as the number of xyz channels per element.
+// NOTE(review): presumably has to match the layout of the host-side
+// descriptor passed by reference to the exported kernels -- confirm.
+struct BufferDescriptor {
+    int offset;  // offset to desired element data
+    int length;  // number or length of the data
+    int stride;  // stride to the next element    
+};
+
+// Three-component float value; the kernels in this file use one Point per
+// xyz channel of a vertex.
+struct Point {
+    float x;
+    float y;
+    float z;
+};
+
+// Component-wise sum of two varying Points.
+inline struct Point operator+(struct Point a, struct Point b) {
+    struct Point sum;
+    sum.x = a.x + b.x;
+    sum.y = a.y + b.y;
+    sum.z = a.z + b.z;
+    return sum;
+}
+
+// Component-wise sum of two uniform Points.
+inline uniform struct Point operator+(uniform struct Point a, uniform struct Point b) {
+    uniform struct Point sum;
+    sum.x = a.x + b.x;
+    sum.y = a.y + b.y;
+    sum.z = a.z + b.z;
+    return sum;
+}
+
+// Component-wise difference (a - b) of two varying Points.
+inline struct Point operator-(struct Point a, struct Point b) {
+    struct Point diff;
+    diff.x = a.x - b.x;
+    diff.y = a.y - b.y;
+    diff.z = a.z - b.z;
+    return diff;
+}
+
+// Component-wise difference (a - b) of two uniform Points.
+inline uniform struct Point operator-(uniform struct Point a, uniform struct Point b) {
+    uniform struct Point diff;
+    diff.x = a.x - b.x;
+    diff.y = a.y - b.y;
+    diff.z = a.z - b.z;
+    return diff;
+}
+
+// Varying Point scaled by a varying scalar (point on the left).
+inline struct Point operator*(struct Point a, float b) {
+    struct Point scaled;
+    scaled.x = a.x * b;
+    scaled.y = a.y * b;
+    scaled.z = a.z * b;
+    return scaled;
+}
+
+// Uniform Point scaled by a uniform scalar (point on the left).
+inline uniform struct Point operator*(uniform struct Point a, uniform float b) {
+    uniform struct Point scaled;
+    scaled.x = a.x * b;
+    scaled.y = a.y * b;
+    scaled.z = a.z * b;
+    return scaled;
+}
+
+// Varying Point scaled by a varying scalar (scalar on the left).
+inline struct Point operator*(float b, struct Point a) {
+    struct Point scaled;
+    scaled.x = b * a.x;
+    scaled.y = b * a.y;
+    scaled.z = b * a.z;
+    return scaled;
+}
+
+// Uniform Point scaled by a uniform scalar (scalar on the left).
+inline uniform struct Point operator*(uniform float b, uniform struct Point a) {
+    uniform struct Point scaled;
+    scaled.x = b * a.x;
+    scaled.y = b * a.y;
+    scaled.z = b * a.z;
+    return scaled;
+}
+
+// Varying Point with every component divided by a varying scalar.
+inline struct Point operator/(struct Point a, float b) {
+    struct Point quotient;
+    quotient.x = a.x / b;
+    quotient.y = a.y / b;
+    quotient.z = a.z / b;
+    return quotient;
+}
+
+// Uniform Point with every component divided by a uniform scalar.
+inline uniform struct Point operator/(uniform struct Point a, uniform float b) {
+    uniform struct Point quotient;
+    quotient.x = a.x / b;
+    quotient.y = a.y / b;
+    quotient.z = a.z / b;
+    return quotient;
+}
+
+// Writes the cross product a x b into c.
+// The components of c are stored one after another while a and b are still
+// being read, so c must not refer to the same object as a or b.
+inline void cross(struct Point &a, struct Point &b, struct Point &c)
+{
+    c.x = a.y*b.z - a.z*b.y;
+    c.y = a.z*b.x - a.x*b.z;
+    c.z = a.x*b.y - a.y*b.x;
+}
+
+// Extracts the non-quad-root flag from the packed patch parameter bits.
+// getDepth() reads bits 0..3 (mask 0xf) and getBoundary() starts at bit 8,
+// so the flag is the single bit directly above the depth field (bit 4).
+// Testing bit 3 would read the top bit of the depth value instead and
+// misreport every patch whose depth is 8 or more.
+inline uniform bool
+nonQuadRoot(uniform unsigned int bitField)
+{
+    return (bitField >> 4) & 0x1;
+}
+
+// Extracts the 10-bit field stored in bits 22..31 of the packed parameter.
+inline uniform unsigned int getU(uniform unsigned int bitField)
+{
+    uniform unsigned int shifted = bitField >> 22;
+    return shifted & 0x3ff;
+}
+
+// Extracts the 10-bit field stored in bits 12..21 of the packed parameter.
+inline uniform unsigned int getV(uniform unsigned int bitField)
+{
+    uniform unsigned int shifted = bitField >> 12;
+    return shifted & 0x3ff;
+}
+
+// Extracts the 4-bit boundary mask stored in bits 8..11 of the packed
+// parameter.
+inline uniform unsigned int getBoundary(uniform unsigned int bitField)
+{
+    uniform unsigned int shifted = bitField >> 8;
+    return shifted & 0xf;
+}
+     
+// Extracts the 4-bit depth value stored in the lowest bits of the packed
+// parameter.
+inline uniform unsigned int getDepth(uniform unsigned int bitField)
+{
+    uniform unsigned int depthBits = bitField & 0xf;
+    return depthBits;
+}
+
+// Returns 1 / 2^level, where level is the patch depth, reduced by one when
+// the non-quad-root flag is set.
+inline uniform float
+getParamFraction(uniform unsigned int bitField){
+    uniform unsigned int level = getDepth(bitField);
+    if (nonQuadRoot(bitField)) {
+        level = level - 1;
+    }
+    return 1.0f / (1 << level);
+}
+
+// Folds the weight of a missing outer row/column of control points back into
+// the two neighbouring weights, once per boundary bit set in the patch
+// parameter. Bits 0x1 and 0x4 modify tWeights, bits 0x2 and 0x8 modify
+// sWeights; the folded weight itself is zeroed afterwards.
+inline void
+adjustBoundaryWeights(uniform unsigned int bitField,
+                      float                sWeights[4],
+                      float                tWeights[4]) {
+
+    uniform int edgeMask = getBoundary(bitField);
+
+    if (edgeMask & 0x1) {
+        // fold tWeights[0] into entries 1 and 2
+        tWeights[2] -= tWeights[0];
+        tWeights[1] += 2*tWeights[0];
+        tWeights[0] = 0;
+    }
+    if (edgeMask & 0x2) {
+        // fold sWeights[3] into entries 1 and 2
+        sWeights[1] -= sWeights[3];
+        sWeights[2] += 2*sWeights[3];
+        sWeights[3] = 0;
+    }
+    if (edgeMask & 0x4) {
+        // fold tWeights[3] into entries 1 and 2
+        tWeights[1] -= tWeights[3];
+        tWeights[2] += 2*tWeights[3];
+        tWeights[3] = 0;
+    }
+    if (edgeMask & 0x8) {
+        // fold sWeights[0] into entries 1 and 2
+        sWeights[2] -= sWeights[0];
+        sWeights[1] += 2*sWeights[0];
+        sWeights[0] = 0;
+    }
+}
+
+// Fills point[] with the four uniform cubic B-Spline basis values at t and
+// deriv[] with their first derivatives.
+inline void
+getBSplineWeights(float t, float point[4], float deriv[4]) {
+    float const sixth = 1.0f / 6.0f;
+
+    float tSq   = t * t;
+    float tCube = t * tSq;
+
+    // Basis values.
+    point[0] = sixth * (1.0f - 3.0f*(t -      tSq) -      tCube);
+    point[1] = sixth * (4.0f           - 6.0f*tSq  + 3.0f*tCube);
+    point[2] = sixth * (1.0f + 3.0f*(t +      tSq  -      tCube));
+    point[3] = sixth * (                                  tCube);
+
+    // First derivatives of the basis values.
+    deriv[0] = -0.5f*tSq +      t - 0.5f;
+    deriv[1] =  1.5f*tSq - 2.0f*t;
+    deriv[2] = -1.5f*tSq +      t + 0.5f;
+    deriv[3] =  0.5f*tSq;
+}
+
+// Fills point[] with the four cubic Bezier basis values at t and deriv[]
+// with their first derivatives.
+inline void
+getBezierWeights(float t, float point[4], float deriv[4]) {
+    // Work in terms of t and its complement (1 - t).
+    float tSq   = t*t;
+    float comp  = 1.0f - t;
+    float compSq = comp * comp;
+
+    // Basis values.
+    point[0] = compSq * comp;
+    point[1] = compSq * t * 3.0f;
+    point[2] = tSq * comp * 3.0f;
+    point[3] = tSq * t;
+
+    // First derivatives of the basis values.
+    deriv[0] = -3.0f * compSq;
+    deriv[1] =  9.0f * tSq - 12.0f * t + 3.0f;
+    deriv[2] = -9.0f * tSq +  6.0f * t;
+    deriv[3] =  3.0f * tSq;
+}
+
+// Fills point[] with the four uniform cubic B-Spline basis values at t;
+// no derivatives are produced.
+inline void
+getBSplineWeightsNoDerivative(float t, float point[4]) {
+    float const sixth = 1.0f / 6.0f;
+
+    float tSq   = t * t;
+    float tCube = t * tSq;
+
+    point[0] = sixth * (1.0f - 3.0f*(t -      tSq) -      tCube);
+    point[1] = sixth * (4.0f           - 6.0f*tSq  + 3.0f*tCube);
+    point[2] = sixth * (1.0f + 3.0f*(t +      tSq  -      tCube));
+    point[3] = sixth * (                                  tCube);
+}
+
+// Fills point[] with the four cubic Bezier basis values at t; no
+// derivatives are produced.
+inline void
+getBezierWeightsNoDerivative(float t, float point[4]) {
+    // Work in terms of t and its complement (1 - t).
+    float tSq    = t*t;
+    float comp   = 1.0f - t;
+    float compSq = comp * comp;
+
+    point[0] = compSq * comp;
+    point[1] = compSq * t * 3.0f;
+    point[2] = tSq * comp * 3.0f;
+    point[3] = tSq * t;
+}
+
+// Evaluates positions and first derivatives for nPoint samples on one
+// bilinear (4-corner) patch.
+//
+//   bitField        packed patch parameter bits (not read by this kernel)
+//   nPoint          number of (u,v) samples
+//   u, v            sample coordinates, nPoint entries each
+//   vertexIndices   the 4 control-vertex indices of the patch
+//   inDesc, inQ     layout and data of the source vertex buffer
+//   outDesc, outQ   layout and data of the evaluated values
+//   duDesc, outDQU  layout and data of the first derivative output
+//   dvDesc, outDQV  layout and data of the second derivative output
+//
+// Every element is treated as inDesc.length / 3 consecutive xyz channels, at
+// most MAX_CHANNEL of them. Descriptor offsets and strides are added directly
+// to the float pointers; sample n is written at index n * stride.
+// The derivatives are computed once from the corner differences and the same
+// values are written for every sample.
+export void
+evalBilinear(uniform unsigned int                  bitField,
+             uniform int                           nPoint, 
+             uniform const float  * uniform        u, 
+             uniform const float  * uniform        v,             
+             uniform const int    * uniform        vertexIndices,
+             uniform const BufferDescriptor       &inDesc,
+             uniform const float * uniform         inQ,
+             uniform const BufferDescriptor       &outDesc,
+             uniform float *uniform                outQ,
+             uniform const BufferDescriptor       &duDesc,            
+             uniform float *uniform                outDQU,
+             uniform const BufferDescriptor       &dvDesc,            
+             uniform float *uniform                outDQV)
+{
+    uniform int nChannel = inDesc.length / 3;
+    // The scratch arrays below hold exactly MAX_CHANNEL channels, so that
+    // count itself is still valid.
+    assert(nChannel <= MAX_CHANNEL);
+    
+    // Gather the 4 corners once, stored channel-major: [channel * 4 + corner].
+    uniform Point controlVertices[MAX_CHANNEL*4];
+    for(uniform int i=0; i<4; i++) {
+        uniform unsigned int id = vertexIndices[i];
+        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
+        for(uniform int c=0; c<nChannel; c++) {
+            uniform int offset = c * 4 + i;
+            controlVertices[offset].x = pVertex[0];
+            controlVertices[offset].y = pVertex[1];
+            controlVertices[offset].z = pVertex[2];
+            pVertex += 3;
+        }
+    }        
+                 
+    // Blend the corners with the bilinear weights of each sample.
+    foreach( n = 0 ... nPoint) {        
+        float ou   = 1.0f - u[n];
+        float ov   = 1.0f - v[n];
+        float w[4] = { ov*ou, v[n]*ou, v[n]*u[n], ov*u[n] };
+                
+        float *pOutQ   = outQ   + outDesc.offset + n * outDesc.stride;        
+        for(uniform int c=0; c<nChannel; c++) { 
+            Point Q;
+            Q.x = Q.y = Q.z = 0.0;
+            for (uniform int i=0; i<4; ++i) {
+                Q = Q + w[i] * controlVertices[c * 4 + i];              
+            }    
+              
+            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
+        }        
+    }
+    
+    // Per-channel derivatives: half the sum of the two opposite edge vectors.
+    uniform Point dU[MAX_CHANNEL], dV[MAX_CHANNEL];
+    for(uniform int c=0; c<nChannel; c++) { 
+        dU[c] = 0.5 * (controlVertices[c * 4 + 3] - controlVertices[c * 4 + 0] +
+                       controlVertices[c * 4 + 2] - controlVertices[c * 4 + 1]  );
+                       
+        dV[c] = 0.5 * (controlVertices[c * 4 + 1] - controlVertices[c * 4 + 0] +
+                       controlVertices[c * 4 + 2] - controlVertices[c * 4 + 3]  );                       
+    }    
+    
+    // Write the same derivative values for every sample.
+    foreach( n = 0 ... nPoint) {
+        float *pOutDQU = outDQU +  duDesc.offset  + n *  duDesc.stride;     
+        float *pOutDQV = outDQV +  dvDesc.offset  + n *  dvDesc.stride;           
+        for(uniform int c=0; c<nChannel; c++) { 
+            *pOutDQU ++ = dU[c].x, *pOutDQU ++ = dU[c].y, *pOutDQU ++ = dU[c].z;
+            *pOutDQV ++ = dV[c].x, *pOutDQV ++ = dV[c].y, *pOutDQV ++ = dV[c].z;            
+        }
+    }    
+}   
+
+// Evaluates values for nPoint samples on one bilinear (4-corner) patch,
+// without derivatives.
+//
+//   bitField        packed patch parameter bits (not read by this kernel)
+//   nPoint          number of (u,v) samples
+//   u, v            sample coordinates, nPoint entries each
+//   vertexIndices   the 4 control-vertex indices of the patch
+//   inDesc, inQ     layout and data of the source vertex buffer
+//   outDesc, outQ   layout and data of the evaluated values
+//
+// Every element is treated as inDesc.length / 3 consecutive xyz channels, at
+// most MAX_CHANNEL of them. Descriptor offsets and strides are added directly
+// to the float pointers; sample n is written at index n * stride.
+export void
+evalBilinearNoDerivative(uniform unsigned int                  bitField,
+                         uniform int                           nPoint, 
+                         uniform const float  * uniform        u, 
+                         uniform const float  * uniform        v,             
+                         uniform const int    * uniform        vertexIndices,
+                         uniform const BufferDescriptor       &inDesc,
+                         uniform const float * uniform         inQ,
+                         uniform const BufferDescriptor       &outDesc,
+                         uniform float *uniform                outQ)
+{
+    uniform int nChannel = inDesc.length / 3;
+    // The scratch array below holds exactly MAX_CHANNEL channels, so that
+    // count itself is still valid.
+    assert(nChannel <= MAX_CHANNEL);
+    
+    // Gather the 4 corners once, stored channel-major: [channel * 4 + corner].
+    uniform Point controlVertices[MAX_CHANNEL*4];
+    for(uniform int i=0; i<4; i++) {
+        uniform unsigned int id = vertexIndices[i];
+        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
+        for(uniform int c=0; c<nChannel; c++) {
+            uniform int offset = c * 4 + i;
+            controlVertices[offset].x = pVertex[0];
+            controlVertices[offset].y = pVertex[1];
+            controlVertices[offset].z = pVertex[2];
+            pVertex += 3;
+        }
+    }        
+                 
+    // Blend the corners with the bilinear weights of each sample.
+    foreach( n = 0 ... nPoint) {        
+        float ou   = 1.0f - u[n];
+        float ov   = 1.0f - v[n];
+        float w[4] = { ov*ou, v[n]*ou, v[n]*u[n], ov*u[n] };
+                
+        float *pOutQ   = outQ   + outDesc.offset + n * outDesc.stride;        
+        for(uniform int c=0; c<nChannel; c++) { 
+            Point Q;
+            Q.x = Q.y = Q.z = 0.0;
+            for (uniform int i=0; i<4; ++i) {
+                Q = Q + w[i] * controlVertices[c * 4 + i];              
+            }    
+              
+            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
+        }        
+    }
+}   
+
+// Evaluates values and first derivatives for nPoint samples on one bicubic
+// B-Spline patch (16 control vertices).
+//
+//   bitField        packed patch parameter bits (depth, boundary mask, u/v
+//                   origin), decoded with the helpers above
+//   nPoint          number of (u,v) samples
+//   u, v            sample coordinates, nPoint entries each
+//   vertexIndices   the 16 control-vertex indices of the patch
+//   inDesc, inQ     layout and data of the source vertex buffer
+//   outDesc, outQ   layout and data of the evaluated values
+//   duDesc, outDQU  layout and data of the derivative along s
+//   dvDesc, outDQV  layout and data of the derivative along t
+//
+// Every element is treated as inDesc.length / 3 consecutive xyz channels, at
+// most MAX_CHANNEL of them. Descriptor offsets and strides are added directly
+// to the float pointers; sample n is written at index n * stride.
+export void
+evalBSpline(uniform unsigned int                  bitField,
+            uniform int                           nPoint, 
+            uniform const float  * uniform        u, 
+            uniform const float  * uniform        v,             
+            uniform const int    * uniform        vertexIndices,
+            uniform const BufferDescriptor       &inDesc,
+            uniform const float * uniform         inQ,
+            uniform const BufferDescriptor       &outDesc,
+            uniform float *uniform                outQ,
+            uniform const BufferDescriptor       &duDesc,            
+            uniform float *uniform                outDQU,
+            uniform const BufferDescriptor       &dvDesc,            
+            uniform float *uniform                outDQV)
+{
+    uniform int nChannel = inDesc.length / 3;
+    // The scratch array below holds exactly MAX_CHANNEL channels, so that
+    // count itself is still valid.
+    assert(nChannel <= MAX_CHANNEL);
+    
+    // Gather the 16 control vertices once, stored channel-major:
+    // [channel * 16 + vertex].
+    uniform Point controlVertices[MAX_CHANNEL*16];
+    for(uniform int i=0; i<16; i++) {
+        uniform unsigned int id = vertexIndices[i];
+        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
+        for(uniform int c=0; c<nChannel; c++) {
+            uniform int offset = c * 16 + i;
+            controlVertices[offset].x = pVertex[0];
+            controlVertices[offset].y = pVertex[1];
+            controlVertices[offset].z = pVertex[2];
+            pVertex += 3;
+        }
+    }
+
+    // Derivative weights are multiplied by 2^depth further down.
+    uniform float dScale = (uniform float)(1 << getDepth(bitField));
+    
+    uniform float frac = getParamFraction(bitField);
+
+    // Patch origin: the packed u/v values scaled by the parametric fraction.
+    uniform float pu = (uniform float)getU(bitField)*frac;
+    uniform float pv = (uniform float)getV(bitField)*frac;
+
+    foreach( n = 0 ... nPoint) {
+        // normalize u,v coordinates
+        float s = (u[n] - pu) / frac;
+        float t = (v[n] - pv) / frac;
+        
+        float sWeights[4], tWeights[4], dsWeights[4], dtWeights[4];
+       
+        getBSplineWeights(s, sWeights, dsWeights);
+        getBSplineWeights(t, tWeights, dtWeights);      
+        
+        // Apply the boundary adjustment to both the basis values and their
+        // derivatives.
+        adjustBoundaryWeights(bitField,  sWeights,  tWeights);
+        adjustBoundaryWeights(bitField, dsWeights, dtWeights);            
+        
+        // Tensor product of the two 1D weight sets: weight[4*row + column].
+        float weight[16];       
+        for (uniform int i = 0; i < 4; ++i) {
+            for (uniform int j = 0; j < 4; ++j) {
+                weight[4*i+j] = sWeights[j] * tWeights[i];
+            }
+        }
+                       
+        float *pOutQ = outQ + outDesc.offset + n * outDesc.stride;             
+        for(uniform int c=0; c<nChannel; c++) { 
+            uniform int offset = c * 16;
+            Point Q;
+            Q.x = Q.y = Q.z = 0.0;
+            for (uniform int i=0; i<16; ++i) {
+                Q = Q + weight[i] * controlVertices[offset + i];                            
+            }    
+              
+            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
+        }   
+        
+        // Derivative weights: differentiate along one direction, keep the
+        // plain basis values along the other, then apply dScale.
+        float derivS[16], derivT[16];       
+        for (uniform int i = 0; i < 4; ++i) {
+            for (uniform int j = 0; j < 4; ++j) {
+                derivS[4*i+j] = dsWeights[j] *  tWeights[i] * dScale;
+                derivT[4*i+j] =  sWeights[j] * dtWeights[i] * dScale;                
+            }
+        }
+                       
+        float *pOutDQU = outDQU + duDesc.offset + n * duDesc.stride;
+        float *pOutDQV = outDQV + dvDesc.offset + n * dvDesc.stride;                                  
+        for(uniform int c=0; c<nChannel; c++) { 
+            uniform int offset = c * 16;
+            Point DQU, DQV;
+            DQU.x = DQU.y = DQU.z = 0.0;
+            DQV.x = DQV.y = DQV.z = 0.0;            
+            for (uniform int i=0; i<16; ++i) {
+                DQU = DQU + derivS[i] * controlVertices[offset + i];
+                DQV = DQV + derivT[i] * controlVertices[offset + i];                                            
+            }    
+              
+            *pOutDQU ++ = DQU.x, *pOutDQU ++ = DQU.y, *pOutDQU ++ = DQU.z;
+            *pOutDQV ++ = DQV.x, *pOutDQV ++ = DQV.y, *pOutDQV ++ = DQV.z;            
+        }                   
+    }
+}  
+
+//
+// Evaluates limit positions (no derivatives) of one regular B-spline patch
+// at nPoint parametric locations.
+//
+//   bitField       packed patch parameterization bits (decoded by
+//                  getParamFraction / getU / getV / adjustBoundaryWeights)
+//   nPoint         number of (u,v) locations to evaluate
+//   u, v           parametric coordinates, nPoint entries each
+//   vertexIndices  indices of the 16 control vertices of the patch
+//   inDesc, inQ    layout and storage of the source primvar data
+//   outDesc, outQ  layout and storage of the evaluated positions
+//
+export void
+evalBSplineNoDerivative(uniform unsigned int                  bitField,
+                        uniform int                           nPoint,
+                        uniform const float  * uniform        u,
+                        uniform const float  * uniform        v,
+                        uniform const int    * uniform        vertexIndices,
+                        uniform const BufferDescriptor       &inDesc,
+                        uniform const float * uniform         inQ,
+                        uniform const BufferDescriptor       &outDesc,
+                        uniform float *uniform                outQ)
+{
+    // The primvar is processed as a sequence of xyz triples ("channels").
+    uniform int nChannel = inDesc.length / 3;
+    assert(nChannel < MAX_CHANNEL);
+
+    // Gather the 16 control vertices once, stored channel-major:
+    //   cvs[channel * 16 + vertex]
+    uniform Point cvs[MAX_CHANNEL*16];
+    for (uniform int cv = 0; cv < 16; ++cv) {
+        uniform unsigned int id = vertexIndices[cv];
+        uniform const float * uniform src = inQ + inDesc.offset + id * inDesc.stride;
+        for (uniform int ch = 0; ch < nChannel; ++ch) {
+            uniform int slot = ch * 16 + cv;
+            cvs[slot].x = src[3 * ch + 0];
+            cvs[slot].y = src[3 * ch + 1];
+            cvs[slot].z = src[3 * ch + 2];
+        }
+    }
+
+    uniform float frac = getParamFraction(bitField);
+
+    // top left corner
+    uniform float pu = (uniform float)getU(bitField)*frac;
+    uniform float pv = (uniform float)getV(bitField)*frac;
+
+    foreach (n = 0 ... nPoint) {
+        // Map (u,v) into the local parameter space of the patch.
+        float s = (u[n] - pu) / frac;
+        float t = (v[n] - pv) / frac;
+
+        // Per-direction basis weights, then boundary adjustment.
+        float sWeights[4], tWeights[4];
+        getBSplineWeightsNoDerivative(s, sWeights);
+        getBSplineWeightsNoDerivative(t, tWeights);
+        adjustBoundaryWeights(bitField, sWeights, tWeights);
+
+        // Tensor product: weight[4*row + col] = S[col] * T[row].
+        float weight[16];
+        for (uniform int row = 0; row < 4; ++row) {
+            for (uniform int col = 0; col < 4; ++col) {
+                weight[4*row+col] = sWeights[col] * tWeights[row];
+            }
+        }
+
+        // Accumulate and store one xyz triple per channel.
+        float *dst = outQ + outDesc.offset + n * outDesc.stride;
+        for (uniform int ch = 0; ch < nChannel; ++ch) {
+            uniform int base = ch * 16;
+
+            Point Q;
+            Q.x = 0.0;
+            Q.y = 0.0;
+            Q.z = 0.0;
+            for (uniform int k = 0; k < 16; ++k) {
+                Q = Q + weight[k] * cvs[base + k];
+            }
+
+            *dst++ = Q.x;
+            *dst++ = Q.y;
+            *dst++ = Q.z;
+        }
+    }
+}
+
+//
+// Computes the 20 Gregory-basis weights at (s,t) for the limit position
+// (point) and for the two first derivatives (deriv1 w.r.t. s, deriv2 w.r.t. t).
+//
+//   bitField  packed patch parameter bits; only getDepth(bitField) is read
+//             here, to scale the derivative weights by (1 << depth)
+//   s, t      local patch coordinates (varying, one per program instance)
+//   point     out: 20 position weights
+//   deriv1    out: 20 weights for the derivative in s
+//   deriv2    out: 20 weights for the derivative in t
+//
+void getGregoryWeights(uniform unsigned int bitField,
+                       float s, float t, float point[20], float deriv1[20], float deriv2[20]) {
+    //
+    //  P3         e3-      e2+         P2
+    //     15------17-------11--------10
+    //     |        |        |        |
+    //     |        |        |        |
+    //     |        | f3-    | f2+    |
+    //     |       19       13        |
+    // e3+ 16-----18           14-----12 e2-
+    //     |     f3+          f2-     |
+    //     |                          |
+    //     |                          |
+    //     |      f0-         f1+     |
+    // e0- 2------4            8------6 e1+
+    //     |        3        9        |
+    //     |        | f0+    | f1-    |
+    //     |        |        |        |
+    //     |        |        |        |
+    //     O--------1--------7--------5
+    //  P0         e0+      e1-         P1
+    //
+
+    //  Indices of boundary and interior points and their corresponding Bezier points
+    //  (this can be reduced with more direct indexing and unrolling of loops):
+    //
+    static uniform int const boundaryGregory[12] = { 0, 1, 7, 5, 2, 6, 16, 12, 15, 17, 11, 10 };
+    static uniform int const boundaryBezSCol[12] = { 0, 1, 2, 3, 0, 3,  0,  3,  0,  1,  2,  3 };
+    static uniform int const boundaryBezTRow[12] = { 0, 0, 0, 0, 1, 1,  2,  2,  3,  3,  3,  3 };
+
+    static uniform int const interiorGregory[8] = { 3, 4,  8, 9,  13, 14,  18, 19 };
+    static uniform int const interiorBezSCol[8] = { 1, 1,  2, 2,   2,  2,   1,  1 };
+    static uniform int const interiorBezTRow[8] = { 1, 1,  1, 1,   2,  2,   2,  2 };
+
+    //
+    //  Bezier basis functions are denoted with B while the rational multipliers for the
+    //  interior points will be denoted G -- so we have B(s), B(t) and G(s,t):
+    //
+    //  Directional Bezier basis functions B at s and t:
+    float Bs[4], Bds[4];
+    float Bt[4], Bdt[4];
+
+    getBezierWeights(s, Bs, Bds);
+    getBezierWeights(t, Bt, Bdt);
+
+    //  Rational multipliers G at s and t:
+    float sC = 1.0f - s;
+    float tC = 1.0f - t;
+
+    //  Each df below ends up holding the RECIPROCAL of its denominator (or 1 when the
+    //  denominator is not positive, which avoids a division by zero at the corners).
+    //  Use <= here to avoid compiler warnings -- the sums should always be non-negative:
+    float df0 = s  + t;   df0 = (df0 <= 0.0f) ? 1.0f : (1.0f / df0);
+    float df1 = sC + t;   df1 = (df1 <= 0.0f) ? 1.0f : (1.0f / df1);
+    float df2 = sC + tC;  df2 = (df2 <= 0.0f) ? 1.0f : (1.0f / df2);
+    float df3 = s  + tC;  df3 = (df3 <= 0.0f) ? 1.0f : (1.0f / df3);
+
+    float G[8] = { s*df0, t*df0,  t*df1, sC*df1,  sC*df2, tC*df2,  tC*df3, s*df3 };
+
+    //  Combined weights for boundary and interior points:
+    for (uniform int i = 0; i < 12; ++i) {
+        point[boundaryGregory[i]] = Bs[boundaryBezSCol[i]] * Bt[boundaryBezTRow[i]];
+    }
+    for (uniform int i = 0; i < 8; ++i) {
+        point[interiorGregory[i]] = Bs[interiorBezSCol[i]] * Bt[interiorBezTRow[i]] * G[i];
+    }
+
+    //
+    //  For derivatives, the basis functions for the interior points are rational and ideally
+    //  require appropriate differentiation, i.e. product rule for the combination of B and G
+    //  and the quotient rule for the rational G itself.  As initially proposed by Loop et al
+    //  though, the approximation using the 16 Bezier points arising from the G(s,t) has
+    //  proved adequate (and is what the GPU shaders use) so we continue to use that here.
+    //
+    //  An implementation of the true derivatives is provided for future reference -- it is
+    //  unclear if the approximations will hold up under surface analysis involving higher
+    //  order differentiation.
+    //
+
+    //  Remember to include derivative scaling in all assignments below:
+    uniform float dScale = (uniform float)(1 << getDepth(bitField));
+
+    //  Combined weights for boundary points -- simple (scaled) tensor products:
+    for (uniform int i = 0; i < 12; ++i) {
+        uniform int iDst = boundaryGregory[i];
+        uniform int tRow = boundaryBezTRow[i];
+        uniform int sCol = boundaryBezSCol[i];
+
+        deriv1[iDst] = Bds[sCol] * Bt[tRow] * dScale;
+        deriv2[iDst] = Bdt[tRow] * Bs[sCol] * dScale;
+    }
+
+#define _USE_BEZIER_PSEUDO_DERIVATIVES
+#ifdef _USE_BEZIER_PSEUDO_DERIVATIVES
+    //  Approximation to the true Gregory derivatives by differentiating the Bezier patch
+    //  unique to the given (s,t), i.e. having F = (g^+ * f^+) + (g^- * f^-) as its four
+    //  interior points:
+    //
+    //  Combined weights for interior points -- (scaled) tensor products with G+ or G-:
+    for (uniform int i = 0; i < 8; ++i) {
+        uniform int iDst = interiorGregory[i];
+        uniform int tRow = interiorBezTRow[i];
+        uniform int sCol = interiorBezSCol[i];
+        deriv1[iDst] = Bds[sCol] * Bt[tRow] * G[i] * dScale;
+        deriv2[iDst] = Bdt[tRow] * Bs[sCol] * G[i] * dScale;
+    }
+#else
+    //  True Gregory derivatives using appropriate differentiation of composite functions:
+    //
+    //  Note that for G(s,t) = N(s,t) / D(s,t), all N' and D' are trivial constants (which
+    //  simplifies things for higher order derivatives).  And while each pair of functions
+    //  G (i.e. the G+ and G- corresponding to points f+ and f-) must sum to 1 to ensure
+    //  Bezier equivalence (when f+ = f-), the pairs of G' must similarly sum to 0.  So we
+    //  can potentially compute only one of the pair and negate the result for the other
+    //  (and with 4 or 8 computations involving these constants, this is all very SIMD
+    //  friendly...) but for now we treat all 8 independently for simplicity.
+    //
+    //float N[8] = {   s,     t,      t,     sC,      sC,     tC,      tC,     s };
+    //  D must be varying: df0..df3 are derived from the varying s and t, and a varying
+    //  value cannot initialize a uniform array.
+    float D[8] = {   df0,   df0,    df1,    df1,     df2,    df2,     df3,   df3 };
+
+    static uniform float const Nds[8] = { 1.0f, 0.0f,  0.0f, -1.0f, -1.0f,  0.0f,  0.0f,  1.0f };
+    static uniform float const Ndt[8] = { 0.0f, 1.0f,  1.0f,  0.0f,  0.0f, -1.0f, -1.0f,  0.0f };
+
+    static uniform float const Dds[8] = { 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f,  1.0f,  1.0f };
+    static uniform float const Ddt[8] = { 1.0f, 1.0f,  1.0f,  1.0f, -1.0f, -1.0f, -1.0f, -1.0f };
+
+    //  Combined weights for interior points -- (scaled) combinations of B, B', G and G':
+    for (uniform int i = 0; i < 8; ++i) {
+        uniform int iDst = interiorGregory[i];
+        uniform int tRow = interiorBezTRow[i];
+        uniform int sCol = interiorBezSCol[i];
+
+        //  Quotient rule for G' (re-expressed in terms of G to simplify (and D = 1/D)):
+        float Gds = (Nds[i] - Dds[i] * G[i]) * D[i];
+        float Gdt = (Ndt[i] - Ddt[i] * G[i]) * D[i];
+
+        //  Product rule combining B and B' with G and G' (and scaled):
+        deriv1[iDst] = (Bds[sCol] * G[i] + Bs[sCol] * Gds) * Bt[tRow] * dScale;
+        deriv2[iDst] = (Bdt[tRow] * G[i] + Bt[tRow] * Gdt) * Bs[sCol] * dScale;
+    }
+#endif
+}
+
+//
+// Computes the 20 Gregory-basis position weights at (s,t); the derivative-free
+// counterpart of getGregoryWeights.  bitField is accepted for signature
+// symmetry but is not read here.
+//
+void getGregoryWeightsNoDerivative(uniform unsigned int bitField, float s, float t, float point[20]) {
+    //
+    //  Layout of the 20 Gregory points on the patch:
+    //
+    //  P3         e3-      e2+         P2
+    //     15------17-------11--------10
+    //     |        |        |        |
+    //     |        |        |        |
+    //     |        | f3-    | f2+    |
+    //     |       19       13        |
+    // e3+ 16-----18           14-----12 e2-
+    //     |     f3+          f2-     |
+    //     |                          |
+    //     |                          |
+    //     |      f0-         f1+     |
+    // e0- 2------4            8------6 e1+
+    //     |        3        9        |
+    //     |        | f0+    | f1-    |
+    //     |        |        |        |
+    //     |        |        |        |
+    //     O--------1--------7--------5
+    //  P0         e0+      e1-         P1
+    //
+
+    //  For each boundary / interior Gregory point: its index in point[] and the
+    //  column (s) and row (t) of the Bezier basis function it pairs with.
+    static uniform int const edgeDst[12] = { 0, 1, 7, 5, 2, 6, 16, 12, 15, 17, 11, 10 };
+    static uniform int const edgeCol[12] = { 0, 1, 2, 3, 0, 3,  0,  3,  0,  1,  2,  3 };
+    static uniform int const edgeRow[12] = { 0, 0, 0, 0, 1, 1,  2,  2,  3,  3,  3,  3 };
+
+    static uniform int const faceDst[8] = { 3, 4,  8, 9,  13, 14,  18, 19 };
+    static uniform int const faceCol[8] = { 1, 1,  2, 2,   2,  2,   1,  1 };
+    static uniform int const faceRow[8] = { 1, 1,  1, 1,   2,  2,   2,  2 };
+
+    //  Cubic Bezier basis values in each direction.
+    float Bs[4];
+    float Bt[4];
+    getBezierWeightsNoDerivative(s, Bs);
+    getBezierWeightsNoDerivative(t, Bt);
+
+    //  Complements of the parametric coordinates.
+    float sC = 1.0f - s;
+    float tC = 1.0f - t;
+
+    //  Denominators of the rational multipliers, one per patch corner, and
+    //  their reciprocals (1 is used when a denominator is not positive).
+    float sum0 = s  + t;
+    float sum1 = sC + t;
+    float sum2 = sC + tC;
+    float sum3 = s  + tC;
+
+    float inv0 = (sum0 <= 0.0f) ? 1.0f : (1.0f / sum0);
+    float inv1 = (sum1 <= 0.0f) ? 1.0f : (1.0f / sum1);
+    float inv2 = (sum2 <= 0.0f) ? 1.0f : (1.0f / sum2);
+    float inv3 = (sum3 <= 0.0f) ? 1.0f : (1.0f / sum3);
+
+    //  Rational multipliers G(s,t) for the eight interior (face) points.
+    float G[8] = { s*inv0, t*inv0,  t*inv1, sC*inv1,  sC*inv2, tC*inv2,  tC*inv3, s*inv3 };
+
+    //  Boundary points: plain tensor product of the Bezier bases.
+    for (uniform int k = 0; k < 12; ++k) {
+        point[edgeDst[k]] = Bs[edgeCol[k]] * Bt[edgeRow[k]];
+    }
+
+    //  Interior points: tensor product scaled by the rational multiplier.
+    for (uniform int k = 0; k < 8; ++k) {
+        point[faceDst[k]] = Bs[faceCol[k]] * Bt[faceRow[k]] * G[k];
+    }
+}
+
+//
+// Evaluates limit positions and first derivatives of one Gregory-basis patch
+// at nPoint parametric locations.
+//
+//   bitField       packed patch parameterization bits
+//   nPoint         number of (u,v) locations to evaluate
+//   u, v           parametric coordinates, nPoint entries each
+//   vertexIndices  indices of the 20 control vertices of the patch
+//   inDesc, inQ    layout and storage of the source primvar data
+//   outDesc, outQ  layout and storage of the evaluated positions
+//   duDesc, outDQU layout and storage of the derivatives in u
+//   dvDesc, outDQV layout and storage of the derivatives in v
+//
+export void
+evalGregory(uniform   unsigned int            bitField,
+            uniform   int                     nPoint,
+            uniform   float                   u[],
+            uniform   float                   v[],
+            uniform   const unsigned int      vertexIndices[],
+            uniform   const BufferDescriptor &inDesc,
+            uniform   const float             inQ[],
+            uniform   const BufferDescriptor &outDesc,
+            uniform   float                   outQ[],
+            uniform   const BufferDescriptor &duDesc,
+            uniform   float                   outDQU[],
+            uniform   const BufferDescriptor &dvDesc,
+            uniform   float                   outDQV[])
+{
+    // The primvar is processed as a sequence of xyz triples ("channels").
+    uniform int nChannel = inDesc.length / 3;
+    assert(nChannel < MAX_CHANNEL);
+
+    // Gather the 20 control vertices once, stored channel-major:
+    //   controlVertices[channel * 20 + vertex]
+    uniform Point controlVertices[MAX_CHANNEL*20];
+    for(uniform int i=0; i<20; i++) {
+        uniform unsigned int id = vertexIndices[i];
+        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
+        for(uniform int c=0; c<nChannel; c++) {
+            uniform int offset = c * 20 + i;
+            controlVertices[offset].x = pVertex[0];
+            controlVertices[offset].y = pVertex[1];
+            controlVertices[offset].z = pVertex[2];
+            pVertex += 3;
+        }
+    }
+
+    uniform float frac = getParamFraction(bitField);
+
+    // top left corner
+    uniform float pu = (uniform float)getU(bitField)*frac;
+    uniform float pv = (uniform float)getV(bitField)*frac;
+
+    foreach( n = 0 ... nPoint) {
+        // normalize u,v coordinates
+        float s = (u[n] - pu) / frac;
+        float t = (v[n] - pv) / frac;
+
+        float point[20], deriv1[20], deriv2[20];
+        getGregoryWeights(bitField, s, t, point, deriv1, deriv2);
+
+        // Positions.  All 20 Gregory points contribute, and each channel
+        // occupies 20 consecutive control vertices (the same layout the
+        // gather loop above and the derivative loop below use).
+        float *pOutQ = outQ + outDesc.offset + n * outDesc.stride;
+        for(uniform int c=0; c<nChannel; c++) {
+            uniform int offset = c * 20;
+            Point Q;
+            Q.x = Q.y = Q.z = 0.0;
+            for (uniform int i=0; i<20; ++i) {
+                Q = Q + point[i] * controlVertices[offset + i];
+            }
+
+            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
+        }
+
+        // First derivatives in u and v.
+        float *pOutDQU = outDQU + duDesc.offset + n * duDesc.stride;
+        float *pOutDQV = outDQV + dvDesc.offset + n * dvDesc.stride;
+        for(uniform int c=0; c<nChannel; c++) {
+            uniform int offset = c * 20;
+            Point DQU, DQV;
+            DQU.x = DQU.y = DQU.z = 0.0;
+            DQV.x = DQV.y = DQV.z = 0.0;
+            for (uniform int i=0; i<20; ++i) {
+                DQU = DQU + deriv1[i] * controlVertices[offset + i];
+                DQV = DQV + deriv2[i] * controlVertices[offset + i];
+            }
+
+            *pOutDQU ++ = DQU.x, *pOutDQU ++ = DQU.y, *pOutDQU ++ = DQU.z;
+            *pOutDQV ++ = DQV.x, *pOutDQV ++ = DQV.y, *pOutDQV ++ = DQV.z;
+        }
+    }
+}
+
+//
+// Evaluates limit positions (no derivatives) of one Gregory-basis patch at
+// nPoint parametric locations.
+//
+//   bitField       packed patch parameterization bits
+//   nPoint         number of (u,v) locations to evaluate
+//   u, v           parametric coordinates, nPoint entries each
+//   vertexIndices  indices of the 20 control vertices of the patch
+//   inDesc, inQ    layout and storage of the source primvar data
+//   outDesc, outQ  layout and storage of the evaluated positions
+//
+export void
+evalGregoryNoDerivative(uniform unsigned int            bitField,
+                        uniform int                     nPoint,
+                        uniform float                   u[],
+                        uniform float                   v[],
+                        uniform const unsigned int      vertexIndices[],
+                        uniform const BufferDescriptor &inDesc,
+                        uniform const float             inQ[],
+                        uniform const BufferDescriptor &outDesc,
+                        uniform float                   outQ[]
+                       )
+{
+    // The primvar is processed as a sequence of xyz triples ("channels").
+    uniform int nChannel = inDesc.length / 3;
+    assert(nChannel < MAX_CHANNEL);
+
+    // Gather the 20 control vertices once, stored channel-major:
+    //   cvs[channel * 20 + vertex]
+    uniform Point cvs[MAX_CHANNEL*20];
+    for (uniform int cv = 0; cv < 20; ++cv) {
+        uniform unsigned int id = vertexIndices[cv];
+        uniform const float * uniform src = inQ + inDesc.offset + id * inDesc.stride;
+        for (uniform int ch = 0; ch < nChannel; ++ch) {
+            uniform int slot = ch * 20 + cv;
+            cvs[slot].x = src[3 * ch + 0];
+            cvs[slot].y = src[3 * ch + 1];
+            cvs[slot].z = src[3 * ch + 2];
+        }
+    }
+
+    uniform float frac = getParamFraction(bitField);
+
+    // top left corner
+    uniform float pu = (uniform float)getU(bitField)*frac;
+    uniform float pv = (uniform float)getV(bitField)*frac;
+
+    foreach (n = 0 ... nPoint) {
+        // Map (u,v) into the local parameter space of the patch.
+        float s = (u[n] - pu) / frac;
+        float t = (v[n] - pv) / frac;
+
+        float point[20];
+        getGregoryWeightsNoDerivative(bitField, s, t, point);
+
+        // Accumulate and store one xyz triple per channel.
+        float *dst = outQ + outDesc.offset + n * outDesc.stride;
+        for (uniform int ch = 0; ch < nChannel; ++ch) {
+            uniform int base = ch * 20;
+
+            Point Q;
+            Q.x = 0.0;
+            Q.y = 0.0;
+            Q.z = 0.0;
+            for (uniform int k = 0; k < 20; ++k) {
+                Q = Q + point[k] * cvs[base + k];
+            }
+
+            *dst++ = Q.x;
+            *dst++ = Q.y;
+            *dst++ = Q.z;
+        }
+    }
+}
+
--- a/opensubdiv/osd/ispcEvalLimitKernel.isph
+++ b/opensubdiv/osd/ispcEvalLimitKernel.isph
@ -0,0 +1,55 @@
+//
+// ispcEvalLimitKernel.isph
+// (Header automatically generated by the ispc compiler.)
+// DO NOT EDIT THIS FILE.
+//
+
+#ifndef ISPC_ISPCEVALLIMITKERNEL_ISPH
+#define ISPC_ISPCEVALLIMITKERNEL_ISPH
+
+#include <stdint.h>
+
+
+
+#ifdef __cplusplus
+namespace ispc { /* namespace */
+#endif // __cplusplus
+#ifndef __ISPC_STRUCT_BufferDescriptor__
+#define __ISPC_STRUCT_BufferDescriptor__
+// C/C++ mirror of the ISPC-side BufferDescriptor passed by reference to the
+// exported kernels.  The kernels index float buffers with it as
+//   base + offset + index * stride
+// so offset and stride are counted in floats.
+struct BufferDescriptor {
+    int32_t offset;  // added to the base pointer before any element indexing
+    int32_t length;  // floats per element; the kernels use length / 3 xyz channels
+    int32_t stride;  // distance in floats between consecutive elements
+};
+#endif
+
+
+///////////////////////////////////////////////////////////////////////////
+// Functions exported from ispc code
+///////////////////////////////////////////////////////////////////////////
+#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
+extern "C" {
+#endif // __cplusplus
+    extern void evalBSpline(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ, const struct BufferDescriptor &duDesc, float * outDQU, const struct BufferDescriptor &dvDesc, float * outDQV);
+    
+    extern void evalBilinear(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ, const struct BufferDescriptor &duDesc, float * outDQU, const struct BufferDescriptor &dvDesc, float * outDQV);
+    
+    extern void evalGregory(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ, const struct BufferDescriptor &duDesc, float * outDQU, const struct BufferDescriptor &dvDesc, float * outDQV);
+    
+    extern void evalBSplineNoDerivative(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ);
+    
+    extern void evalBilinearNoDerivative(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ);
+    
+    extern void evalGregoryNoDerivative(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ);
+        
+    extern void getSIMDWidth(int32_t &simdWidth);
+#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
+} /* end extern C */
+#endif // __cplusplus
+
+
+#ifdef __cplusplus
+} /* namespace */
+#endif // __cplusplus
+
+#endif // ISPC_ISPCEVALLIMITKERNEL_ISPH
--- a/opensubdiv/osd/ispcEvaluator.cpp
+++ b/opensubdiv/osd/ispcEvaluator.cpp
@ -0,0 +1,289 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#include "ispcEvaluator.h"
+#include "cpuKernel.h"
+#include "../far/patchBasis.h"
+#include "ispcEvalLimitKernel.isph"
+
+#include <tbb/parallel_for.h>
+#include <cstdlib>
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+
+namespace Osd {
+
+#define grain_size  512
+
+/* static */
+bool
+IspcEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
+                           float *dst,       BufferDescriptor const &dstDesc,
+                           const int * sizes,
+                           const int * offsets,
+                           const int * indices,
+                           const float * weights,
+                           int start, int end) {
+
+    // An empty stencil range is trivially successful.
+    if (start >= end) {
+        return true;
+    }
+
+    // Source and destination must carry the same number of floats per element.
+    const bool sameLength = (srcDesc.length == dstDesc.length);
+    if (sameLength) {
+        // Stencil evaluation is delegated to the plain CPU kernel
+        // (XXX: cpuKernel.cpp could probably be expanded here).
+        CpuEvalStencils(src, srcDesc, dst, dstDesc,
+                        sizes, offsets, indices, weights, start, end);
+    }
+
+    return sameLength;
+}
+
+/* static */
+bool
+IspcEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
+                           float *dst,       BufferDescriptor const &dstDesc,
+                           float *du,        BufferDescriptor const &duDesc,
+                           float *dv,        BufferDescriptor const &dvDesc,
+                           const int * sizes,
+                           const int * offsets,
+                           const int * indices,
+                           const float * weights,
+                           const float * duWeights,
+                           const float * dvWeights,
+                           int start, int end) {
+    // An empty stencil range is trivially successful.
+    if (start >= end) {
+        return true;
+    }
+
+    // Every output (position, du, dv) must match the source element length.
+    const bool sameLength = (srcDesc.length == dstDesc.length) &&
+                            (srcDesc.length == duDesc.length)  &&
+                            (srcDesc.length == dvDesc.length);
+    if (!sameLength) {
+        return false;
+    }
+
+    // Stencil evaluation with derivatives is delegated to the plain CPU kernel.
+    CpuEvalStencils(src, srcDesc,
+                    dst, dstDesc,
+                    du,  duDesc,
+                    dv,  dvDesc,
+                    sizes, offsets, indices,
+                    weights, duWeights, dvWeights,
+                    start, end);
+
+    return true;
+}
+
+// Small cursor over a strided buffer of T.
+//
+//   _p       pointer to the current element (may be null, in which case all
+//            mutating operations are no-ops)
+//   _length  number of T values per element
+//   _stride  number of T values between consecutive elements
+template <typename T>
+struct BufferAdapter {
+    BufferAdapter(T *p, int length, int stride) :
+        _p(p), _length(length), _stride(stride) { }
+
+    // Zeroes the current element.  Guarded against a null buffer, like
+    // AddWithWeight() and operator++().
+    void Clear() {
+        if (_p) {
+            for (int i = 0; i < _length; ++i) _p[i] = 0;
+        }
+    }
+
+    // Accumulates src[0.._length) scaled by w into the current element.
+    void AddWithWeight(T const *src, float w) {
+        if (_p) {
+            for (int i = 0; i < _length; ++i) {
+                _p[i] += src[i] * w;
+            }
+        }
+    }
+
+    // Returns a pointer to the element `index` strides past the current one.
+    const T *operator[] (int index) const {
+        return _p + _stride * index;
+    }
+
+    // Advances the cursor by one element.
+    BufferAdapter<T> & operator ++() {
+        if (_p) {
+            _p += _stride;
+        }
+        return *this;
+    }
+
+    T *_p;
+    int _length;
+    int _stride;
+};
+
+/* static */
+// Evaluates limit positions at numPatchCoords patch coordinates.
+//
+// Consecutive coordinates that share a patch handle are batched and handed to
+// the matching ISPC kernel; batches are distributed over TBB ranges.  Result n
+// is written at dst + dstDesc.offset + n * dstDesc.stride.
+//
+// Returns false when the source and destination element lengths differ,
+// true otherwise.
+bool
+IspcEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
+                           float *dst,       BufferDescriptor const &dstDesc,
+                           int numPatchCoords,
+                           const PatchCoord *patchCoords,
+                           const PatchArray *patchArrays,
+                           const int *patchIndexBuffer,
+                           const PatchParam *patchParamBuffer) {
+    if (srcDesc.length != dstDesc.length) return false;
+
+    // Nothing to evaluate (also keeps an invalid range away from TBB).
+    if (numPatchCoords <= 0) return true;
+
+    // Copy BufferDescriptor to ispc version
+    // Since memory alignment in ISPC may be different from C++,
+    // we use the assignment for each field instead of the assignment for
+    // the whole struct
+    ispc::BufferDescriptor ispcSrcDesc;
+    ispcSrcDesc.offset = srcDesc.offset;
+    ispcSrcDesc.length = srcDesc.length;
+    ispcSrcDesc.stride = srcDesc.stride;
+
+    tbb::blocked_range<int> range = tbb::blocked_range<int>(0, numPatchCoords, grain_size);
+    tbb::parallel_for(range, [&](const tbb::blocked_range<int> &r)
+    {
+        int i = r.begin();
+
+        // The kernels add outDesc.offset themselves, so the per-batch offset
+        // is the caller's offset plus the position of the batch's first
+        // coordinate -- the base offset must be counted exactly once.
+        ispc::BufferDescriptor ispcDstDesc;
+        ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
+        ispcDstDesc.length = dstDesc.length;
+        ispcDstDesc.stride = dstDesc.stride;
+
+        while (i < r.end()) {
+            // the patch coordinates are sorted by patch handle
+            // the following code searches the coordinates that
+            // belong to the same patch so that they can be evaluated
+            // with ISPC
+            int nCoord = 1;
+            Far::PatchTable::PatchHandle handle = patchCoords[i].handle;
+            while (i + nCoord < r.end() &&
+                   handle.isEqual(patchCoords[i + nCoord].handle)) {
+                nCoord ++;
+            }
+
+            PatchArray const &array = patchArrays[handle.arrayIndex];
+            int patchType = array.GetPatchType();
+            Far::PatchParam const & param = patchParamBuffer[handle.patchIndex];
+
+            unsigned int bitField = param.field1;
+
+            const int *cvs = &patchIndexBuffer[array.indexBase + handle.vertIndex];
+
+            // Split the (s,t) pairs of the batch into separate u / v arrays,
+            // which is the layout the ISPC kernels expect.
+            __declspec( align(64) ) float u[nCoord];
+            __declspec( align(64) ) float v[nCoord];
+
+            for (int n = 0; n < nCoord; n++) {
+                u[n] = patchCoords[i + n].s;
+                v[n] = patchCoords[i + n].t;
+            }
+
+            if (patchType == Far::PatchDescriptor::REGULAR) {
+                ispc::evalBSplineNoDerivative(bitField, nCoord, u, v, cvs, ispcSrcDesc, src,
+                                              ispcDstDesc, dst);
+            } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+                ispc::evalGregoryNoDerivative(bitField, nCoord, u, v, cvs, ispcSrcDesc, src,
+                                              ispcDstDesc, dst);
+            } else if (patchType == Far::PatchDescriptor::QUADS) {
+                ispc::evalBilinearNoDerivative(bitField, nCoord, u, v, cvs, ispcSrcDesc, src,
+                                               ispcDstDesc, dst);
+            } else {
+                assert(0);
+            }
+
+            // Advance to the next batch and re-base the output offset.
+            i += nCoord;
+            ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
+        }
+    });
+
+    return true;
+}
+
+/* static */
+// Evaluates limit positions and first derivatives at numPatchCoords patch
+// coordinates.
+//
+// Consecutive coordinates that share a patch handle are batched and handed to
+// the matching ISPC kernel; batches are distributed over TBB ranges.  Result n
+// is written at <buffer> + <desc>.offset + n * <desc>.stride for each of
+// dst, du and dv.
+//
+// Returns false when the source and destination element lengths differ,
+// true otherwise.
+bool
+IspcEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
+                           float *dst,       BufferDescriptor const &dstDesc,
+                           float *du,        BufferDescriptor const &duDesc,
+                           float *dv,        BufferDescriptor const &dvDesc,
+                           int numPatchCoords,
+                           const PatchCoord *patchCoords,
+                           const PatchArray *patchArrays,
+                           const int *patchIndexBuffer,
+                           const PatchParam *patchParamBuffer) {
+    if (srcDesc.length != dstDesc.length) return false;
+
+    // Nothing to evaluate (also keeps an invalid range away from TBB).
+    if (numPatchCoords <= 0) return true;
+
+    // Copy BufferDescriptor to ispc version
+    // Since memory alignment in ISPC may be different from C++,
+    // we use the assignment for each field instead of the assignment for
+    // the whole struct
+    ispc::BufferDescriptor ispcSrcDesc;
+    ispcSrcDesc.offset = srcDesc.offset;
+    ispcSrcDesc.length = srcDesc.length;
+    ispcSrcDesc.stride = srcDesc.stride;
+
+    tbb::blocked_range<int> range = tbb::blocked_range<int>(0, numPatchCoords, grain_size);
+    tbb::parallel_for(range, [&](const tbb::blocked_range<int> &r)
+    {
+        int i = r.begin();
+
+        // The kernels add each descriptor's offset themselves, so the
+        // per-batch offset is the caller's offset plus the position of the
+        // batch's first coordinate -- the base offset must be counted exactly
+        // once for dst, du and dv alike.
+        ispc::BufferDescriptor ispcDstDesc, ispcDuDesc, ispcDvDesc;
+        ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
+        ispcDstDesc.length = dstDesc.length;
+        ispcDstDesc.stride = dstDesc.stride;
+
+        ispcDuDesc.offset  = duDesc.offset  + i * duDesc.stride;
+        ispcDuDesc.length  = duDesc.length;
+        ispcDuDesc.stride  = duDesc.stride;
+
+        ispcDvDesc.offset  = dvDesc.offset  + i * dvDesc.stride;
+        ispcDvDesc.length  = dvDesc.length;
+        ispcDvDesc.stride  = dvDesc.stride;
+
+        while (i < r.end()) {
+            // the patch coordinates are sorted by patch handle
+            // the following code searches the coordinates that
+            // belong to the same patch so that they can be evaluated
+            // with ISPC
+            int nCoord = 1;
+            Far::PatchTable::PatchHandle handle = patchCoords[i].handle;
+            while (i + nCoord < r.end() &&
+                   handle.isEqual(patchCoords[i + nCoord].handle)) {
+                nCoord ++;
+            }
+
+            PatchArray const &array = patchArrays[handle.arrayIndex];
+            int patchType = array.GetPatchType();
+            Far::PatchParam const & param = patchParamBuffer[handle.patchIndex];
+
+            unsigned int bitField = param.field1;
+
+            const int *cvs = &patchIndexBuffer[array.indexBase + handle.vertIndex];
+
+            // Split the (s,t) pairs of the batch into separate u / v arrays,
+            // which is the layout the ISPC kernels expect.
+            __declspec( align(64) ) float u[nCoord];
+            __declspec( align(64) ) float v[nCoord];
+
+            for (int n = 0; n < nCoord; n++) {
+                u[n] = patchCoords[i + n].s;
+                v[n] = patchCoords[i + n].t;
+            }
+
+            if (patchType == Far::PatchDescriptor::REGULAR) {
+                ispc::evalBSpline(bitField, nCoord, u, v, cvs, ispcSrcDesc, src,
+                                  ispcDstDesc, dst, ispcDuDesc, du, ispcDvDesc, dv);
+            } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
+                ispc::evalGregory(bitField, nCoord, u, v, cvs, ispcSrcDesc, src,
+                                  ispcDstDesc, dst, ispcDuDesc, du, ispcDvDesc, dv);
+            } else if (patchType == Far::PatchDescriptor::QUADS) {
+                ispc::evalBilinear(bitField, nCoord, u, v, cvs, ispcSrcDesc, src,
+                                   ispcDstDesc, dst, ispcDuDesc, du, ispcDvDesc, dv);
+            } else {
+                assert(0);
+            }
+
+            // Advance to the next batch and re-base all output offsets.
+            i += nCoord;
+            ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
+            ispcDuDesc.offset  = duDesc.offset  + i * duDesc.stride;
+            ispcDvDesc.offset  = dvDesc.offset  + i * dvDesc.stride;
+        }
+    });
+
+    return true;
+}
+
+
+}  // end namespace Osd
+
+}  // end namespace OPENSUBDIV_VERSION
+}  // end namespace OpenSubdiv
--- a/opensubdiv/osd/ispcEvaluator.h
+++ b/opensubdiv/osd/ispcEvaluator.h
@ -0,0 +1,482 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#ifndef OPENSUBDIV3_OSD_ISPC_EVALUATOR_H
+#define OPENSUBDIV3_OSD_ISPC_EVALUATOR_H
+
+#include "../version.h"
+
+#include <cstddef>
+#include <vector>
+#include "../osd/bufferDescriptor.h"
+#include "../osd/types.h"
+
+namespace OpenSubdiv {
+namespace OPENSUBDIV_VERSION {
+
+namespace Osd {
+
+class IspcEvaluator {
+public:
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Stencil evaluations with StencilTable
+    ///
+    /// ----------------------------------------------------------------------
+
+    /// \brief Generic stencil evaluation entry point. Its signature matches
+    ///        that of the other device evaluators so that it can be invoked
+    ///        in the same way from the OsdMesh template interface.
+    ///
+    ///        Returns false without evaluating anything when the stencil
+    ///        table reports zero stencils; otherwise the whole table, i.e.
+    ///        the range [0, GetNumStencils()), is forwarded to the
+    ///        raw-pointer overload and its result is returned.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param stencilTable   Far::StencilTable or equivalent; must provide
+    ///                       GetNumStencils(), GetSizes(), GetOffsets(),
+    ///                       GetControlIndices() and GetWeights()
+    ///
+    /// @param instance       not used by this function
+    ///                       (declared as a typed pointer to prevent
+    ///                        undesirable template resolution)
+    ///
+    /// @param deviceContext  not used by this function
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        STENCIL_TABLE const *stencilTable,
+        const IspcEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
+
+        // Present only for signature compatibility; neither is read.
+        (void)instance;
+        (void)deviceContext;
+
+        // Nothing to evaluate for an empty table.
+        if (stencilTable->GetNumStencils() == 0) {
+            return false;
+        }
+
+        // Raw views of the stencil table arrays.
+        const int   *sizes   = &stencilTable->GetSizes()[0];
+        const int   *offsets = &stencilTable->GetOffsets()[0];
+        const int   *indices = &stencilTable->GetControlIndices()[0];
+        const float *weights = &stencilTable->GetWeights()[0];
+
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
+                            sizes, offsets, indices, weights,
+                            /*start = */ 0,
+                            /*end   = */ stencilTable->GetNumStencils());
+    }
+
+    /// \brief Static eval stencils function which takes raw CPU pointers for
+    ///        input and output. The generic EvalStencils() template without
+    ///        derivatives forwards to this overload with start = 0 and
+    ///        end = GetNumStencils().
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///                       to apply for the range [start, end)
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    /// NOTE(review): the meaning of the bool result is set by the out-of-line
+    /// definition, which this header does not show -- confirm before relying
+    /// on it.
+    ///
+    static bool EvalStencils(
+        const float *src,  BufferDescriptor const &srcDesc,
+        float *dst,        BufferDescriptor const &dstDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        int start, int end);
+
+    /// \brief Generic static eval stencils function with derivatives.
+    ///        This function has the same signature as other device kernels
+    ///        have so that it can be called in the same way from OsdMesh
+    ///        template interface.
+    ///
+    ///        Returns false without evaluating anything when the stencil
+    ///        table reports zero stencils; otherwise the whole table, i.e.
+    ///        the range [0, GetNumStencils()), is forwarded to the
+    ///        raw-pointer overload and its result is returned.
+    ///
+    /// @param srcBuffer      Input primvar buffer.
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       const float pointer for read
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer      Output primvar buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer       Output U-derivative buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param duDesc         vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer       Output V-derivative buffer
+    ///                       must have BindCpuBuffer() method returning a
+    ///                       float pointer for write
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param stencilTable   Far::StencilTable or equivalent; in addition to
+    ///                       the plain stencil accessors it must provide
+    ///                       GetDuWeights() and GetDvWeights()
+    ///
+    /// @param instance       not used by this function
+    ///                       (declared as a typed pointer to prevent
+    ///                        undesirable template resolution)
+    ///
+    /// @param deviceContext  not used by this function
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
+    static bool EvalStencils(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        STENCIL_TABLE const *stencilTable,
+        const IspcEvaluator *instance = NULL,
+        void * deviceContext = NULL) {
+
+        (void)instance;       // unused
+        (void)deviceContext;  // unused
+
+        // Reject an empty table up front, matching the overload without
+        // derivatives: the &...()[0] expressions below index element 0 of
+        // each table array, which is presumably invalid for an empty table.
+        if (stencilTable->GetNumStencils() == 0)
+            return false;
+
+        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
+                            dstBuffer->BindCpuBuffer(), dstDesc,
+                            duBuffer->BindCpuBuffer(),  duDesc,
+                            dvBuffer->BindCpuBuffer(),  dvDesc,
+                            &stencilTable->GetSizes()[0],
+                            &stencilTable->GetOffsets()[0],
+                            &stencilTable->GetControlIndices()[0],
+                            &stencilTable->GetWeights()[0],
+                            &stencilTable->GetDuWeights()[0],
+                            &stencilTable->GetDvWeights()[0],
+                            /*start = */ 0,
+                            /*end   = */ stencilTable->GetNumStencils());
+    }
+
+    /// \brief Static eval stencils function with derivatives, which takes
+    ///        raw CPU pointers for input and output. The generic
+    ///        EvalStencils() template with derivatives forwards to this
+    ///        overload with start = 0 and end = GetNumStencils().
+    ///
+    /// @param src            Input primvar pointer. An offset of srcDesc
+    ///                       will be applied internally (i.e. the pointer
+    ///                       should not include the offset)
+    ///
+    /// @param srcDesc        vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst            Output primvar pointer. An offset of dstDesc
+    ///                       will be applied internally.
+    ///
+    /// @param dstDesc        vertex buffer descriptor for the output buffer
+    ///
+    /// @param du             Output U-derivatives pointer. An offset of
+    ///                       duDesc will be applied internally.
+    ///
+    /// @param duDesc         vertex buffer descriptor for the du buffer
+    ///
+    /// @param dv             Output V-derivatives pointer. An offset of
+    ///                       dvDesc will be applied internally.
+    ///
+    /// @param dvDesc         vertex buffer descriptor for the dv buffer
+    ///
+    /// @param sizes          pointer to the sizes buffer of the stencil table
+    ///
+    /// @param offsets        pointer to the offsets buffer of the stencil table
+    ///
+    /// @param indices        pointer to the indices buffer of the stencil table
+    ///
+    /// @param weights        pointer to the weights buffer of the stencil table
+    ///
+    /// @param duWeights      pointer to the du-weights buffer of the stencil table
+    ///
+    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
+    ///
+    /// @param start          start index of stencil table
+    ///
+    /// @param end            end index of stencil table
+    ///
+    /// NOTE(review): the meaning of the bool result is set by the out-of-line
+    /// definition, which this header does not show -- confirm before relying
+    /// on it.
+    ///
+    static bool EvalStencils(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        const int * sizes,
+        const int * offsets,
+        const int * indices,
+        const float * weights,
+        const float * duWeights,
+        const float * dvWeights,
+        int start, int end);
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Limit evaluations with PatchTable
+    ///
+    /// ----------------------------------------------------------------------
+
+    /// \brief Generic limit evaluation entry point. Its signature matches
+    ///        that of the other device evaluators so that it can be invoked
+    ///        in the same way.
+    ///
+    ///        Binds the CPU pointers of the given buffers and forwards them,
+    ///        together with the three arrays exposed by the patch table, to
+    ///        the raw-pointer EvalPatches() overload, returning its result.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///                         must have BindCpuBuffer() method; the pointer
+    ///                         it returns is cast to const PatchCoord*
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent; must provide
+    ///                         GetPatchArrayBuffer(), GetPatchIndexBuffer()
+    ///                         and GetPatchParamBuffer()
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used by this function
+    ///
+    /// @param deviceContext    not used by this function
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        IspcEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        // Present only for signature compatibility; neither is read.
+        (void)instance;
+        (void)deviceContext;
+
+        // View the bound coordinate buffer as PatchCoord records before
+        // handing it to the raw-pointer overload.
+        const PatchCoord *coords =
+            (const PatchCoord*)patchCoords->BindCpuBuffer();
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           numPatchCoords,
+                           coords,
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Generic limit evaluation entry point with derivatives. Its
+    ///        signature matches that of the other device evaluators so that
+    ///        it can be invoked in the same way.
+    ///
+    ///        Binds the CPU pointers of the given buffers and forwards them,
+    ///        together with the three arrays exposed by the patch table, to
+    ///        the raw-pointer EvalPatches() overload that also writes du/dv,
+    ///        returning its result.
+    ///
+    /// @param srcBuffer        Input primvar buffer.
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         const float pointer for read
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dstBuffer        Output primvar buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param duBuffer         Output U-derivatives buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param duDesc           vertex buffer descriptor for the duBuffer
+    ///
+    /// @param dvBuffer         Output V-derivatives buffer
+    ///                         must have BindCpuBuffer() method returning a
+    ///                         float pointer for write
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///                         must have BindCpuBuffer() method; the pointer
+    ///                         it returns is cast to const PatchCoord*
+    ///
+    /// @param patchTable       CpuPatchTable or equivalent; must provide
+    ///                         GetPatchArrayBuffer(), GetPatchIndexBuffer()
+    ///                         and GetPatchParamBuffer()
+    ///                         XXX: currently Far::PatchTable can't be used
+    ///                              due to interface mismatch
+    ///
+    /// @param instance         not used by this function
+    ///
+    /// @param deviceContext    not used by this function
+    ///
+    template <typename SRC_BUFFER, typename DST_BUFFER,
+              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
+    static bool EvalPatches(
+        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
+        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
+        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
+        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PATCHCOORD_BUFFER *patchCoords,
+        PATCH_TABLE *patchTable,
+        IspcEvaluator const *instance = NULL,
+        void * deviceContext = NULL) {
+
+        // Present only for signature compatibility; neither is read.
+        (void)instance;
+        (void)deviceContext;
+
+        // XXX: PatchCoords ride on the vertex primvar buffer interop, which
+        //      is somewhat of an abuse. Ideally every buffer class would be
+        //      templated on its datatype (e.g. Osd::CpuBuffer<PatchCoord>)
+        //      so that this downcast would not be needed.
+        const PatchCoord *coords =
+            (const PatchCoord*)patchCoords->BindCpuBuffer();
+
+        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
+                           dstBuffer->BindCpuBuffer(), dstDesc,
+                           duBuffer->BindCpuBuffer(),  duDesc,
+                           dvBuffer->BindCpuBuffer(),  dvDesc,
+                           numPatchCoords,
+                           coords,
+                           patchTable->GetPatchArrayBuffer(),
+                           patchTable->GetPatchIndexBuffer(),
+                           patchTable->GetPatchParamBuffer());
+    }
+
+    /// \brief Static limit eval function. It takes an array of PatchCoord
+    ///        and evaluates limit values on the given PatchTable. The
+    ///        generic EvalPatches() template without derivatives forwards
+    ///        to this overload.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    /// NOTE(review): the meaning of the bool result is set by the out-of-line
+    /// definition, which this header does not show -- confirm before relying
+    /// on it.
+    ///
+    static bool EvalPatches(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        int numPatchCoords,
+        const PatchCoord *patchCoords,
+        const PatchArray *patchArrays,
+        const int *patchIndexBuffer,
+        const PatchParam *patchParamBuffer);
+
+    /// \brief Static limit eval function with derivatives. It takes an array
+    ///        of PatchCoord and evaluates limit values together with their
+    ///        U and V derivatives on the given PatchTable. The generic
+    ///        EvalPatches() template with derivatives forwards to this
+    ///        overload.
+    ///
+    /// @param src              Input primvar pointer. An offset of srcDesc
+    ///                         will be applied internally (i.e. the pointer
+    ///                         should not include the offset)
+    ///
+    /// @param srcDesc          vertex buffer descriptor for the input buffer
+    ///
+    /// @param dst              Output primvar pointer. An offset of dstDesc
+    ///                         will be applied internally.
+    ///
+    /// @param dstDesc          vertex buffer descriptor for the output buffer
+    ///
+    /// @param du               Output U-derivatives pointer. An offset of
+    ///                         duDesc will be applied internally.
+    ///
+    /// @param duDesc           vertex buffer descriptor for the du buffer
+    ///
+    /// @param dv               Output V-derivatives pointer. An offset of
+    ///                         dvDesc will be applied internally.
+    ///
+    /// @param dvDesc           vertex buffer descriptor for the dv buffer
+    ///
+    /// @param numPatchCoords   number of patchCoords.
+    ///
+    /// @param patchCoords      array of locations to be evaluated.
+    ///
+    /// @param patchArrays      an array of Osd::PatchArray struct
+    ///                         indexed by PatchCoord::arrayIndex
+    ///
+    /// @param patchIndexBuffer an array of patch indices
+    ///                         indexed by PatchCoord::vertIndex
+    ///
+    /// @param patchParamBuffer an array of Osd::PatchParam struct
+    ///                         indexed by PatchCoord::patchIndex
+    ///
+    /// NOTE(review): the out-of-line definition appears to read the three
+    /// indices above through the coord's `handle` member, to batch runs of
+    /// consecutive coords that share a handle, and to support only REGULAR,
+    /// GREGORY_BASIS and QUADS patch arrays (any other type hits assert(0))
+    /// -- confirm against the .cpp before relying on this.
+    ///
+    static bool EvalPatches(
+        const float *src, BufferDescriptor const &srcDesc,
+        float *dst,       BufferDescriptor const &dstDesc,
+        float *du,        BufferDescriptor const &duDesc,
+        float *dv,        BufferDescriptor const &dvDesc,
+        int numPatchCoords,
+        PatchCoord const *patchCoords,
+        PatchArray const *patchArrays,
+        const int *patchIndexBuffer,
+        PatchParam const *patchParamBuffer);
+
+    /// ----------------------------------------------------------------------
+    ///
+    ///   Other methods
+    ///
+    /// ----------------------------------------------------------------------
+
+    /// \brief Synchronization hook for asynchronous computation invoked on
+    ///        this device. The body is empty, so calling it has no effect;
+    ///        the (unnamed) device context argument is ignored.
+    static void Synchronize(void * /*deviceContext = NULL*/) {
+        // nothing to do.
+    }
+};
+
+
+}  // end namespace Osd
+
+}  // end namespace OPENSUBDIV_VERSION
+using namespace OPENSUBDIV_VERSION;
+
+}  // end namespace OpenSubdiv
+
+
+#endif  // OPENSUBDIV3_OSD_ISPC_EVALUATOR_H