Merge pull request #705 from PixarAnimationStudios/revert-704-ispc

Revert "Add ISPC limit surface evaluation"
2025-01-09 16:20:10 +00:00 · 2015-07-20 17:28:02 -07:00 · 2015-07-20 17:28:02 -07:00 · 22a8c26048
commit 22a8c26048
parent b006dc328e 8a8771c97d
13 changed files with 30 additions and 1953 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -197,8 +197,6 @@ if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_CLANGCC OR CMAKE_COMPILER_IS_IC
            endif()

        endforeach()
-
-        list(APPEND OSD_COMPILER_FLAGS -std=c++11)
    endif()

 elseif(MSVC)
@ -323,9 +321,6 @@ endif()
 if(NOT NO_TBB)
    find_package(TBB 4.0)
 endif()
-if(NOT NO_ISPC)
-    find_package(ISPC 1.6)
-endif()
 if (NOT NO_OPENGL)
    find_package(OpenGL)
 endif()
@ -544,12 +539,6 @@ if (NOT NO_MAYA)
    endif()
 endif()

-if(ISPC_FOUND)
-    add_definitions(
-        -DOPENSUBDIV_HAS_ISPC
-    )
-endif()
-
 # Link examples & regressions dynamically against Osd
 set( OSD_LINK_TARGET osd_dynamic_cpu osd_dynamic_gpu )

--- a/cmake/FindISPC.cmake
+++ b/cmake/FindISPC.cmake
@ -1,94 +0,0 @@
-#
-#   Copyright 2013 Pixar
-#
-#   Licensed under the Apache License, Version 2.0 (the "Apache License")
-#   with the following modification; you may not use this file except in
-#   compliance with the Apache License and the following modification to it:
-#   Section 6. Trademarks. is deleted and replaced with:
-#
-#   6. Trademarks. This License does not grant permission to use the trade
-#      names, trademarks, service marks, or product names of the Licensor
-#      and its affiliates, except as required to comply with Section 4(c) of
-#      the License and to reproduce the content of the NOTICE file.
-#
-#   You may obtain a copy of the Apache License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#   Unless required by applicable law or agreed to in writing, software
-#   distributed under the Apache License with the above modification is
-#   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#   KIND, either express or implied. See the Apache License for the specific
-#   language governing permissions and limitations under the Apache License.
-#
-
-# - Try to find Intel's ISPC
-# Once done this will define
-#
-#  ISPC_FOUND - System has ISPC
-#  ISPC_DIR - The ISPC directory
-
-# Obtain ISPC directory
-if (WIN32)
-    #NOT IMPLEMENTED
-elseif (APPLE)
-    #NOT IMPLEMENTED
-else ()
-    find_path(ISPC_DIR
-        NAMES
-            ispc
-        PATHS
-            ${ISPC_LOCATION}  
-        NO_DEFAULT_PATH NO_SYSTEM_ENVIRONMENT_PATH
-        DOC "The directory where ISPC reside")
-endif ()
-
-if (ISPC_DIR)
-    execute_process(COMMAND ${ISPC_DIR}/ispc --version OUTPUT_VARIABLE ISPC_VERSION)
-    string(REGEX MATCH "[0-9].[0-9].[0-9]" ISPC_VERSION ${ISPC_VERSION})
-endif ()
-
-include(FindPackageHandleStandardArgs)
-
-find_package_handle_standard_args(ISPC
-    REQUIRED_VARS
-        ISPC_DIR
-    VERSION_VAR
-        ISPC_VERSION
-)
-
-mark_as_advanced( ISPC_DIR )
-
-MACRO (ispc_compile)
-  
-    SET(ISPC_TARGET_DIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/osd_ispc_obj.dir)
-
-    SET(ISPC_OBJECTS "")
-    
-    FOREACH(src ${ARGN})
-    
-        GET_FILENAME_COMPONENT(fname ${src} NAME_WE)
-        
-        SET(results "${ISPC_TARGET_DIR}/${fname}.dev.o")
-  
-        ADD_CUSTOM_COMMAND(
-            OUTPUT ${results} ${ISPC_TARGET_DIR}/${fname}_ispc.h
-            COMMAND  ${ISPC_DIR}/ispc  
-            --pic
-            -O1
-            --wno-perf
-            --woff
-            -h ${ISPC_TARGET_DIR}/${fname}_ispc.h
-            -MMM  ${ISPC_TARGET_DIR}/${fname}.dev.idep 
-            -o ${ISPC_TARGET_DIR}/${fname}.dev.o
-            ${CMAKE_CURRENT_SOURCE_DIR}/${src} 
-            \;
-            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${src} 
-        )
-
-        SET(ISPC_OBJECTS ${ISPC_OBJECTS} ${results})
-
-    ENDFOREACH()
-    
-ENDMACRO()
-
--- a/examples/glEvalLimit/glEvalLimit.cpp
+++ b/examples/glEvalLimit/glEvalLimit.cpp
@ -34,10 +34,6 @@ GLFWmonitor* g_primary=0;
 #include <osd/cpuGLVertexBuffer.h>
 #include <osd/mesh.h>

-#ifdef OPENSUBDIV_HAS_ISPC
-    #include <osd/ispcEvaluator.h>
-#endif    
-    
 #ifdef OPENSUBDIV_HAS_TBB
    #include <osd/tbbEvaluator.h>
 #endif
@ -108,8 +104,7 @@ enum KernelType { kCPU = 0,
                  kCUDA = 3,
                  kCL = 4,
                  kGLXFB = 5,
-                  kGLCompute = 6,
-                  kISPC = 7 };
+                  kGLCompute = 6 };

 enum EndCap      { kEndCapBSplineBasis,
                   kEndCapGregoryBasis };
@ -174,10 +169,10 @@ float g_currentTime = 0;
 Stopwatch g_fpsTimer;

 //------------------------------------------------------------------------------
-int g_nParticles = 655360;
+int g_nParticles = 65536;

 bool g_randomStart = true;//false;
-bool g_animParticles = false;
+bool g_animParticles = true;

 GLuint g_samplesVAO=0;

@ -444,9 +439,7 @@ updateGeom() {
    assert(g_particles);

    float elapsed = g_currentTime - g_prevTime;
-    if(elapsed != 0.0f) {
-        g_particles->Update(elapsed);
-    }
+    g_particles->Update(elapsed);
    g_prevTime = g_currentTime;

    std::vector<OpenSubdiv::Osd::PatchCoord> const &patchCoords
@ -471,7 +464,7 @@ updateGeom() {
    }

    s.Stop();
-        
+
    g_evalTime = float(s.GetElapsed());
 }

@ -655,20 +648,8 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
            (vertexStencils, varyingStencils,
             nCoarseVertices, nverts, g_nParticles, g_patchTable,
             &glComputeEvaluatorCache);
-
-    }             
 #endif
-#if  defined(OPENSUBDIV_HAS_ISPC) && defined(OPENSUBDIV_HAS_TBB)
-    else if(g_kernel == kISPC) {
-        g_evalOutput = new EvalOutput<Osd::CpuGLVertexBuffer,
-                                      Osd::CpuGLVertexBuffer,
-                                      Far::StencilTable,
-                                      Osd::CpuPatchTable,
-                                      Osd::IspcEvaluator>
-            (vertexStencils, varyingStencils,
-             nCoarseVertices, nverts, g_nParticles, g_patchTable);    
-    }    
-#endif    
+    }

    // Create the 'uv particles' manager - this class manages the limit
    // location samples (ptex face index, (s,t) and updates them between frames.
@ -894,7 +875,7 @@ display() {
        }

        if (g_endCap != kEndCapBSplineBasis &&
-            (g_kernel != kCPU && g_kernel != kOPENMP && g_kernel != kTBB && g_kernel != kISPC)) {
+            (g_kernel != kCPU && g_kernel != kOPENMP && g_kernel != kTBB)) {
            static char msg[] =
                "ERROR: This kernel only supports BSpline basis patches.";
            g_hud.DrawString(g_width/4, g_height/4+20, 1, 0, 0, msg);
@ -1148,9 +1129,6 @@ initHUD() {
 #ifdef OPENSUBDIV_HAS_TBB
    g_hud.AddPullDownButton(compute_pulldown, "TBB", kTBB);
 #endif
-#if  defined(OPENSUBDIV_HAS_ISPC) && defined(OPENSUBDIV_HAS_TBB)
-    g_hud.AddPullDownButton(compute_pulldown, "ISPC", kISPC);
-#endif
 #ifdef OPENSUBDIV_HAS_CUDA
    g_hud.AddPullDownButton(compute_pulldown, "CUDA", kCUDA);
 #endif
--- a/examples/glEvalLimit/particles.cpp
+++ b/examples/glEvalLimit/particles.cpp
@ -32,17 +32,17 @@
 #ifdef OPENSUBDIV_HAS_TBB
 #include <tbb/parallel_for.h>
 #include <tbb/atomic.h>
-
+tbb::atomic<int> g_tbbCounter;
 class TbbUpdateKernel {
 public:
    TbbUpdateKernel(float speed,
                    STParticles::Position *positions,
                    float *velocities,
                    std::vector<STParticles::FaceInfo> const &adjacency,
-                    PatchHandleMap *patchHandleMap,
+                    OpenSubdiv::Osd::PatchCoord *patchCoords,
                    OpenSubdiv::Far::PatchMap const *patchMap) :
        _speed(speed), _positions(positions), _velocities(velocities),
-        _adjacency(adjacency), _patchHandleMap(patchHandleMap), _patchMap(patchMap) {
+        _adjacency(adjacency), _patchCoords(patchCoords), _patchMap(patchMap) {
    }

    void operator () (tbb::blocked_range<int> const &r) const {
@ -76,13 +76,9 @@ public:
            OpenSubdiv::Far::PatchTable::PatchHandle const *handle =
                _patchMap->FindPatch(p->ptexIndex, p->s, p->t);
            if (handle) {
-                PatchHandleMap::accessor a;
-                if( !_patchHandleMap->find(a, handle)) {  
-                    _patchHandleMap->insert(a, handle);               
-                }
-                std::vector<float> &st = a->second;
-                st.push_back(p->s);
-                st.push_back(p->t);  
+                int index = g_tbbCounter.fetch_and_add(1);
+                _patchCoords[index] =
+                    OpenSubdiv::Osd::PatchCoord(*handle, p->s, p->t);
            }
        }
    }
@ -91,7 +87,7 @@ private:
    STParticles::Position *_positions;
    float *_velocities;
    std::vector<STParticles::FaceInfo> const &_adjacency;
-    PatchHandleMap *_patchHandleMap;
+    OpenSubdiv::Osd::PatchCoord *_patchCoords;
    OpenSubdiv::Far::PatchMap const *_patchMap;
 };
 #endif
@ -280,36 +276,18 @@ STParticles::Update(float deltaTime) {
    if (deltaTime == 0) return;
    float speed = GetSpeed() * std::max(0.001f, std::min(deltaTime, 0.5f));

+    _patchCoords.clear();
+
    // XXX: this process should be parallelized.
 #ifdef OPENSUBDIV_HAS_TBB
-    _patchHandleMap.clear();
-    
+
+    _patchCoords.resize((int)GetNumParticles());
    TbbUpdateKernel kernel(speed, &_positions[0], &_velocities[0],
-                           _adjacency, &_patchHandleMap, _patchMap);;
+                           _adjacency, &_patchCoords[0], _patchMap);;
+    g_tbbCounter = 0;
    tbb::blocked_range<int> range(0, GetNumParticles(), 256);
    tbb::parallel_for(range, kernel);
-    
-
-    int nCoord = 0;
-    for(PatchHandleMap::iterator i  = _patchHandleMap.begin();
-                                 i != _patchHandleMap.end();
-                                 i ++) {
-        nCoord += (i->second.size() / 2);
-    }
-    
-    _patchCoords.resize(nCoord);
-    
-    int index = 0;
-    for(PatchHandleMap::iterator i  = _patchHandleMap.begin();
-                                 i != _patchHandleMap.end();
-                                 i ++) {
-        for(int j = 0; j < i->second.size(); j += 2) {
-            _patchCoords[index].handle = *(i->first);
-            _patchCoords[index].s      = i->second[j];
-            _patchCoords[index].t      = i->second[j+1];
-            index ++;
-        }
-    }     
+    _patchCoords.resize(g_tbbCounter);
 #else
    Position *  p = &_positions[0];
    float    * dp = &_velocities[0];
@ -345,7 +323,7 @@ STParticles::Update(float deltaTime) {
                OpenSubdiv::Osd::PatchCoord(*handle, p->s, p->t));
        }
    }
-#endif   
+#endif
 }

 // Dump adjacency info
--- a/examples/glEvalLimit/particles.h
+++ b/examples/glEvalLimit/particles.h
@ -30,11 +30,6 @@
 #include <osd/types.h>
 #include <iostream>

-#ifdef OPENSUBDIV_HAS_TBB
-#include <tbb/concurrent_hash_map.h>
-typedef tbb::concurrent_hash_map< OpenSubdiv::Far::PatchTable::PatchHandle const*, std::vector<float> > PatchHandleMap;
-#endif
-   
 //
 // In order to emphasize the dynamic nature of the EvalLimit API, where the
 // locations can be arbitrarily updated before each evaluation, the glEvalLimit
@ -147,7 +142,7 @@ public:
        return _velocities;
    }

-    std::vector<OpenSubdiv::Osd::PatchCoord> const &GetPatchCoords() const {
+    std::vector<OpenSubdiv::Osd::PatchCoord> GetPatchCoords() const {
        return _patchCoords;
    }

@ -164,10 +159,6 @@ private:
    std::vector<Position> _positions;

    std::vector<float> _velocities;
-    
-#ifdef OPENSUBDIV_HAS_TBB    
-    PatchHandleMap  _patchHandleMap;
-#endif

    std::vector<OpenSubdiv::Osd::PatchCoord> _patchCoords;

--- a/opensubdiv/CMakeLists.txt
+++ b/opensubdiv/CMakeLists.txt
@ -147,16 +147,9 @@ if (NOT NO_LIB)
    )
    set_target_properties(osd_static_cpu PROPERTIES OUTPUT_NAME osdCPU CLEAN_DIRECT_OUTPUT 1)

-    if( ISPC_FOUND)
-        target_link_libraries(osd_static_cpu
-            osd_ispc_obj
-            ${PLATFORM_CPU_LIBRARIES}
-        )
-    else()
-        target_link_libraries(osd_static_cpu
-            ${PLATFORM_CPU_LIBRARIES}
-        )    
-    endif()
+    target_link_libraries(osd_static_cpu
+        ${PLATFORM_CPU_LIBRARIES}
+    )

    install( TARGETS osd_static_cpu DESTINATION "${CMAKE_LIBDIR_BASE}" )

@ -207,16 +200,9 @@ if (NOT NO_LIB)
                )
        endif()

-        if ( ISPC_FOUND)
-            target_link_libraries(osd_dynamic_cpu
-                osd_ispc_obj
-                ${PLATFORM_CPU_LIBRARIES}
-            )
-        else()
-            target_link_libraries(osd_dynamic_cpu
-                ${PLATFORM_CPU_LIBRARIES}
-            )        
-        endif()
+        target_link_libraries(osd_dynamic_cpu
+            ${PLATFORM_CPU_LIBRARIES}
+        )

        install( TARGETS osd_dynamic_cpu LIBRARY DESTINATION "${CMAKE_LIBDIR_BASE}" )

--- a/opensubdiv/far/patchParam.h
+++ b/opensubdiv/far/patchParam.h
@ -116,15 +116,6 @@ struct PatchParam {
    ///
    void Normalize( float & u, float & v ) const;

-    /// This function is the reverse operation of function Normalize()
-    /// The (u,v) pair is converted from patch sub-parametric space to control
-    /// face parametric space.
-    ///
-    /// @param u  u parameter
-    /// @param v  v parameter
-    ///        
-    void Denormalize( float & u, float & v) const;
-    
    unsigned int field0:32;
    unsigned int field1:32;
 };
@ -170,20 +161,6 @@ PatchParam::Normalize( float & u, float & v ) const {
    v = (v - pv) / frac;
 }

-inline void
-PatchParam::Denormalize( float & u, float & v ) const {
-
-    float frac = GetParamFraction();
-
-    // top left corner
-    float pu = (float)GetU()*frac;
-    float pv = (float)GetV()*frac;
-
-    // normalize u,v coordinates
-    u = u * frac + pu;
-    v = v * frac + pv;    
-}
-
 } // end namespace Far

 } // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/far/patchTable.h
+++ b/opensubdiv/far/patchTable.h
@ -68,12 +68,6 @@ public:
        Index arrayIndex, // Array index of the patch
              patchIndex, // Absolute Index of the patch
              vertIndex;  // Relative offset to the first CV of the patch in array
-              
-        bool isEqual(const PatchHandle &other) {
-            return other.arrayIndex == arrayIndex &&
-                   other.patchIndex == patchIndex &&
-                   other.vertIndex  == vertIndex;
-        }
    };

 public:
--- a/opensubdiv/osd/CMakeLists.txt
+++ b/opensubdiv/osd/CMakeLists.txt
@ -26,7 +26,6 @@

 #-------------------------------------------------------------------------------
 # source & headers
-
 set(CPU_SOURCE_FILES
    cpuEvaluator.cpp
    cpuKernel.cpp
@ -34,12 +33,8 @@ set(CPU_SOURCE_FILES
    cpuVertexBuffer.cpp
 )

-if( ISPC_FOUND) 
-    list(APPEND CPU_SOURCE_FILES ispcEvaluator.cpp)  
-endif()
-
 set(GPU_SOURCE_FILES )
-set(ISPC_SOURCE_FILES )
+
 set(INC_FILES )

 set(PRIVATE_HEADER_FILES
@ -301,17 +296,6 @@ if( CUDA_FOUND )
    endif()
 endif()

-if( ISPC_FOUND) 
-    list(APPEND ISPC_SOURCE_FILES
-         ispcEvalLimitKernel.ispc
-    ) 
-    
-    # Compile ISPC code to objs
-    ispc_compile(${ISPC_SOURCE_FILES})
-    ADD_LIBRARY(osd_ispc_obj STATIC ${ISPC_OBJECTS})   
-    SET_TARGET_PROPERTIES(osd_ispc_obj PROPERTIES LINKER_LANGUAGE C)    
-endif()
-
 list(APPEND DOXY_HEADER_FILES ${CUDA_PUBLIC_HEADERS})

 #-------------------------------------------------------------------------------
--- a/opensubdiv/osd/ispcEvalLimitKernel.ispc
+++ b/opensubdiv/osd/ispcEvalLimitKernel.ispc
@ -1,880 +0,0 @@
-//
-//   Copyright 2013 Pixar
-//
-//   Licensed under the Apache License, Version 2.0 (the "Apache License")
-//   with the following modification; you may not use this file except in
-//   compliance with the Apache License and the following modification to it:
-//   Section 6. Trademarks. is deleted and replaced with:
-//
-//   6. Trademarks. This License does not grant permission to use the trade
-//      names, trademarks, service marks, or product names of the Licensor
-//      and its affiliates, except as required to comply with Section 4(c) of
-//      the License and to reproduce the content of the NOTICE file.
-//
-//   You may obtain a copy of the Apache License at
-//
-//       http://www.apache.org/licenses/LICENSE-2.0
-//
-//   Unless required by applicable law or agreed to in writing, software
-//   distributed under the Apache License with the above modification is
-//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-//   KIND, either express or implied. See the Apache License for the specific
-//   language governing permissions and limitations under the Apache License.
-//
-
-
-#define MAX_CHANNEL 4
-
-struct BufferDescriptor {
-    int offset;  // offset to desired element data
-    int length;  // number or length of the data
-    int stride;  // stride to the next element    
-};
-
-struct Point {
-    float x;
-    float y;
-    float z;
-};
-
-inline struct Point operator+(struct Point a, struct Point b) {
-    struct Point result;
-    result.x = a.x + b.x;
-    result.y = a.y + b.y;
-    result.z = a.z + b.z;        
-
-    return result;
-}
-
-inline uniform struct Point operator+(uniform struct Point a, uniform struct Point b) {
-    uniform struct Point result;
-    result.x = a.x + b.x;
-    result.y = a.y + b.y;
-    result.z = a.z + b.z;        
-
-    return result;
-}
-
-inline struct Point operator-(struct Point a, struct Point b) {
-    struct Point result;
-    result.x = a.x - b.x;
-    result.y = a.y - b.y;
-    result.z = a.z - b.z;        
-
-    return result;
-}
-
-inline uniform struct Point operator-(uniform struct Point a, uniform struct Point b) {
-    uniform struct Point result;
-    result.x = a.x - b.x;
-    result.y = a.y - b.y;
-    result.z = a.z - b.z;        
-
-    return result;
-}
-
-inline struct Point operator*(struct Point a, float b) {
-    struct Point result;
-    result.x = a.x * b;
-    result.y = a.y * b;
-    result.z = a.z * b;        
-
-    return result;
-}
-
-inline uniform struct Point operator*(uniform struct Point a, uniform float b) {
-    uniform struct Point result;
-    result.x = a.x * b;
-    result.y = a.y * b;
-    result.z = a.z * b;        
-
-    return result;
-} 
-
-inline struct Point operator*(float b, struct Point a) {
-    struct Point result;
-    result.x = b * a.x;
-    result.y = b * a.y;
-    result.z = b * a.z;        
-
-    return result;
-}
-
-inline uniform struct Point operator*(uniform float b, uniform struct Point a) {
-    uniform struct Point result;
-    result.x = b * a.x;
-    result.y = b * a.y;
-    result.z = b * a.z;        
-
-    return result;
-}
-
-inline struct Point operator/(struct Point a, float b) {
-    struct Point result;
-    result.x = a.x / b;
-    result.y = a.y / b;
-    result.z = a.z / b;        
-
-    return result;
-}
-
-inline uniform struct Point operator/(uniform struct Point a, uniform float b) {
-    uniform struct Point result;
-    result.x = a.x / b;
-    result.y = a.y / b;
-    result.z = a.z / b;        
-
-    return result;
-}
-
-inline void cross(struct Point &a, struct Point &b, struct Point &c)
-{
-    c.x = a.y*b.z - a.z*b.y;
-    c.y = a.z*b.x - a.x*b.z;
-    c.z = a.x*b.y - a.y*b.x;
-}
-
-inline uniform bool
-nonQuadRoot(uniform unsigned int bitField) 
-{
-    return (bitField >> 3) & 0x1;
-}
-
-inline uniform unsigned int getU(uniform unsigned int bitField) 
-{ 
-    return (uniform unsigned int)((bitField >> 22) & 0x3ff); 
-}
-
-inline uniform unsigned int getV(uniform unsigned int bitField) 
-{ 
-    return (uniform unsigned int)((bitField >> 12) & 0x3ff); 
-}
-
-inline uniform unsigned int getBoundary(uniform unsigned int bitField)
-{ 
-    return (uniform unsigned int)((bitField >> 8) & 0xf); 
-}
-     
-inline uniform unsigned int getDepth(uniform unsigned int bitField)
-{ 
-    return  (uniform unsigned int)(bitField & 0xf); 
-}
-
-inline uniform float
-getParamFraction(uniform unsigned int bitField){
-    if (nonQuadRoot(bitField)) {
-        return 1.0f / (1 << (getDepth(bitField)-1));
-    } else {
-        return 1.0f / (1 << getDepth(bitField));
-    }
-}
-
-inline void 
-adjustBoundaryWeights(uniform unsigned int bitField,
-                      float                sWeights[4], 
-                      float                tWeights[4]) {
-
-    uniform int boundary = getBoundary(bitField);
-
-    if (boundary & 1) {
-        tWeights[2] -= tWeights[0];
-        tWeights[1] += 2*tWeights[0];
-        tWeights[0] = 0;
-    }
-    if (boundary & 2) {
-        sWeights[1] -= sWeights[3];
-        sWeights[2] += 2*sWeights[3];
-        sWeights[3] = 0;
-    }
-    if (boundary & 4) {
-        tWeights[1] -= tWeights[3];
-        tWeights[2] += 2*tWeights[3];
-        tWeights[3] = 0;
-    }
-    if (boundary & 8) {
-        sWeights[2] -= sWeights[0];
-        sWeights[1] += 2*sWeights[0];
-        sWeights[0] = 0;
-    }
-}
-
-inline void
-getBSplineWeights(float t, float point[4], float deriv[4]) {
-    // The four uniform cubic B-Spline basis functions evaluated at t:
-    float const one6th = 1.0f / 6.0f;
-
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    point[0] = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
-    point[1] = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
-    point[2] = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
-    point[3] = one6th * (                                 t3);
-
-    // Derivatives of the above four basis functions at t:
-    deriv[0] = -0.5f*t2 +      t - 0.5f;
-    deriv[1] =  1.5f*t2 - 2.0f*t;
-    deriv[2] = -1.5f*t2 +      t + 0.5f;
-    deriv[3] =  0.5f*t2;
-}
-
-inline void
-getBezierWeights(float t, float point[4], float deriv[4]) {
-    // The four uniform cubic Bezier basis functions (in terms of t and its
-    // complement tC) evaluated at t:
-    float t2 = t*t;
-    float tC = 1.0f - t;
-    float tC2 = tC * tC;
-
-    point[0] = tC2 * tC;
-    point[1] = tC2 * t * 3.0f;
-    point[2] = t2 * tC * 3.0f;
-    point[3] = t2 * t;
-
-    // Derivatives of the above four basis functions at t:
-    deriv[0] = -3.0f * tC2;
-    deriv[1] =  9.0f * t2 - 12.0f * t + 3.0f;
-    deriv[2] = -9.0f * t2 +  6.0f * t;
-    deriv[3] =  3.0f * t2;
-}
-
-inline void
-getBSplineWeightsNoDerivative(float t, float point[4]) {
-    // The four uniform cubic B-Spline basis functions evaluated at t:
-    float const one6th = 1.0f / 6.0f;
-
-    float t2 = t * t;
-    float t3 = t * t2;
-
-    point[0] = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
-    point[1] = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
-    point[2] = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
-    point[3] = one6th * (                                 t3);
-}
-
-inline void
-getBezierWeightsNoDerivative(float t, float point[4]) {
-    // The four uniform cubic Bezier basis functions (in terms of t and its
-    // complement tC) evaluated at t:
-    float t2 = t*t;
-    float tC = 1.0f - t;
-    float tC2 = tC * tC;
-
-    point[0] = tC2 * tC;
-    point[1] = tC2 * t * 3.0f;
-    point[2] = t2 * tC * 3.0f;
-    point[3] = t2 * t;
-}
-
-export void
-evalBilinear(uniform unsigned int                  bitField,
-             uniform int                           nPoint, 
-             uniform const float  * uniform        u, 
-             uniform const float  * uniform        v,             
-             uniform const int    * uniform        vertexIndices,
-             uniform const BufferDescriptor       &inDesc,
-             uniform const float * uniform         inQ,
-             uniform const BufferDescriptor       &outDesc,
-             uniform float *uniform                outQ,
-             uniform const BufferDescriptor       &duDesc,            
-             uniform float *uniform                outDQU,
-             uniform const BufferDescriptor       &dvDesc,            
-             uniform float *uniform                outDQV)
-{
-    uniform int nChannel = inDesc.length / 3;
-    assert(nChannel < MAX_CHANNEL);
-    
-    uniform Point controlVertices[MAX_CHANNEL*4];
-    for(uniform int i=0; i<4; i++) {
-        uniform unsigned int id = vertexIndices[i];
-        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
-        for(uniform int c=0; c<nChannel; c++) {
-            uniform int offset = c * 4 + i;
-            controlVertices[offset].x = pVertex[0];
-            controlVertices[offset].y = pVertex[1];
-            controlVertices[offset].z = pVertex[2];
-            pVertex += 3;
-        }
-    }        
-                 
-    foreach( n = 0 ... nPoint) {        
-        float ou   = 1.0f - u[n];
-        float ov   = 1.0f - v[n];
-        float w[4] = { ov*ou, v[n]*ou, v[n]*u[n], ov*u[n] };
-                
-        float *pOutQ   = outQ   + outDesc.offset + n * outDesc.stride;        
-        for(uniform int c=0; c<nChannel; c++) { 
-            Point Q;
-            Q.x = Q.y = Q.z = 0.0;
-            for (uniform int i=0; i<4; ++i) {
-                Q = Q + w[i] * controlVertices[c * 4 + i];              
-            }    
-              
-            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
-        }        
-    }
-    
-    uniform Point dU[MAX_CHANNEL], dV[MAX_CHANNEL];
-    for(uniform int c=0; c<nChannel; c++) { 
-        dU[c] = 0.5 * (controlVertices[c * 4 + 3] - controlVertices[c * 4 + 0] +
-                       controlVertices[c * 4 + 2] - controlVertices[c * 4 + 1]  );
-                       
-        dV[c] = 0.5 * (controlVertices[c * 4 + 1] - controlVertices[c * 4 + 0] +
-                       controlVertices[c * 4 + 2] - controlVertices[c * 4 + 3]  );                       
-    }    
-    
-    foreach( n = 0 ... nPoint) {
-        float *pOutDQU = outDQU +  duDesc.offset  + n *  duDesc.stride;     
-        float *pOutDQV = outDQV +  dvDesc.offset  + n *  dvDesc.stride;           
-        for(uniform int c=0; c<nChannel; c++) { 
-            *pOutDQU ++ = dU[c].x, *pOutDQU ++ = dU[c].y, *pOutDQU ++ = dU[c].z;
-            *pOutDQV ++ = dV[c].x, *pOutDQV ++ = dV[c].y, *pOutDQV ++ = dV[c].z;            
-        }
-    }    
-}   
-
-export void
-evalBilinearNoDerivative(uniform unsigned int                  bitField,
-                         uniform int                           nPoint, 
-                         uniform const float  * uniform        u, 
-                         uniform const float  * uniform        v,             
-                         uniform const int    * uniform        vertexIndices,
-                         uniform const BufferDescriptor       &inDesc,
-                         uniform const float * uniform         inQ,
-                         uniform const BufferDescriptor       &outDesc,
-                         uniform float *uniform                outQ)
-{
-    uniform int nChannel = inDesc.length / 3;
-    assert(nChannel < MAX_CHANNEL);
-    
-    uniform Point controlVertices[MAX_CHANNEL*4];
-    for(uniform int i=0; i<4; i++) {
-        uniform unsigned int id = vertexIndices[i];
-        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
-        for(uniform int c=0; c<nChannel; c++) {
-            uniform int offset = c * 4 + i;
-            controlVertices[offset].x = pVertex[0];
-            controlVertices[offset].y = pVertex[1];
-            controlVertices[offset].z = pVertex[2];
-            pVertex += 3;
-        }
-    }        
-                 
-    foreach( n = 0 ... nPoint) {        
-        float ou   = 1.0f - u[n];
-        float ov   = 1.0f - v[n];
-        float w[4] = { ov*ou, v[n]*ou, v[n]*u[n], ov*u[n] };
-                
-        float *pOutQ   = outQ   + outDesc.offset + n * outDesc.stride;        
-        for(uniform int c=0; c<nChannel; c++) { 
-            Point Q;
-            Q.x = Q.y = Q.z = 0.0;
-            for (uniform int i=0; i<4; ++i) {
-                Q = Q + w[i] * controlVertices[c * 4 + i];              
-            }    
-              
-            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
-        }        
-    }
-}   
-
-export void
-evalBSpline(uniform unsigned int                  bitField,
-            uniform int                           nPoint, 
-            uniform const float  * uniform        u, 
-            uniform const float  * uniform        v,             
-            uniform const int    * uniform        vertexIndices,
-            uniform const BufferDescriptor       &inDesc,
-            uniform const float * uniform         inQ,
-            uniform const BufferDescriptor       &outDesc,
-            uniform float *uniform                outQ,
-            uniform const BufferDescriptor       &duDesc,            
-            uniform float *uniform                outDQU,
-            uniform const BufferDescriptor       &dvDesc,            
-            uniform float *uniform                outDQV)
-{
-    uniform int nChannel = inDesc.length / 3;
-    assert(nChannel < MAX_CHANNEL);
-    
-    uniform Point controlVertices[MAX_CHANNEL*16];
-    for(uniform int i=0; i<16; i++) {
-        uniform unsigned int id = vertexIndices[i];
-        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
-        for(uniform int c=0; c<nChannel; c++) {
-            uniform int offset = c * 16 + i;
-            controlVertices[offset].x = pVertex[0];
-            controlVertices[offset].y = pVertex[1];
-            controlVertices[offset].z = pVertex[2];
-            pVertex += 3;
-        }
-    }
-
-    uniform float dScale = (uniform float)(1 << getDepth(bitField));
-    
-    uniform float frac = getParamFraction(bitField);
-
-    // top left corner
-    uniform float pu = (uniform float)getU(bitField)*frac;
-    uniform float pv = (uniform float)getV(bitField)*frac;
-
-    foreach( n = 0 ... nPoint) {
-        // normalize u,v coordinates
-        float s = (u[n] - pu) / frac;
-        float t = (v[n] - pv) / frac;
-        
-        float sWeights[4], tWeights[4], dsWeights[4], dtWeights[4];
-       
-        getBSplineWeights(s, sWeights, dsWeights);
-        getBSplineWeights(t, tWeights, dtWeights);      
-        
-        adjustBoundaryWeights(bitField,  sWeights,  tWeights);
-        adjustBoundaryWeights(bitField, dsWeights, dtWeights);            
-        
-        float weight[16];       
-        for (uniform int i = 0; i < 4; ++i) {
-            for (uniform int j = 0; j < 4; ++j) {
-                weight[4*i+j] = sWeights[j] * tWeights[i];
-            }
-        }
-                       
-        float *pOutQ = outQ + outDesc.offset + n * outDesc.stride;             
-        for(uniform int c=0; c<nChannel; c++) { 
-            uniform int offset = c * 16;
-            Point Q;
-            Q.x = Q.y = Q.z = 0.0;
-            for (uniform int i=0; i<16; ++i) {
-                Q = Q + weight[i] * controlVertices[offset + i];                            
-            }    
-              
-            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
-        }   
-        
-        float derivS[16], derivT[16];       
-        for (uniform int i = 0; i < 4; ++i) {
-            for (uniform int j = 0; j < 4; ++j) {
-                derivS[4*i+j] = dsWeights[j] *  tWeights[i] * dScale;
-                derivT[4*i+j] =  sWeights[j] * dtWeights[i] * dScale;                
-            }
-        }
-                       
-        float *pOutDQU = outDQU + duDesc.offset + n * duDesc.stride;
-        float *pOutDQV = outDQV + dvDesc.offset + n * dvDesc.stride;                                  
-        for(uniform int c=0; c<nChannel; c++) { 
-            uniform int offset = c * 16;
-            Point DQU, DQV;
-            DQU.x = DQU.y = DQU.z = 0.0;
-            DQV.x = DQV.y = DQV.z = 0.0;            
-            for (uniform int i=0; i<16; ++i) {
-                DQU = DQU + derivS[i] * controlVertices[offset + i];
-                DQV = DQV + derivT[i] * controlVertices[offset + i];                                            
-            }    
-              
-            *pOutDQU ++ = DQU.x, *pOutDQU ++ = DQU.y, *pOutDQU ++ = DQU.z;
-            *pOutDQV ++ = DQV.x, *pOutDQV ++ = DQV.y, *pOutDQV ++ = DQV.z;            
-        }                   
-    }
-}  
-
-export void
-evalBSplineNoDerivative(uniform unsigned int                  bitField,
-                        uniform int                           nPoint, 
-                        uniform const float  * uniform        u, 
-                        uniform const float  * uniform        v,             
-                        uniform const int    * uniform        vertexIndices,
-                        uniform const BufferDescriptor       &inDesc,
-                        uniform const float * uniform         inQ,
-                        uniform const BufferDescriptor       &outDesc,
-                        uniform float *uniform                outQ)
-{
-    // Evaluates positions only (no derivatives) for nPoint (u,v) samples that
-    // all lie on the same 16-point patch described by bitField/vertexIndices.
-    // The primvar is processed as nChannel consecutive xyz triples.
-    uniform int nChannel = inDesc.length / 3;
-    assert(nChannel < MAX_CHANNEL);
-
-    // Gather the patch control points once, channel-major
-    // (slot = channel * 16 + point), so that all samples share them.
-    uniform Point patchPoints[MAX_CHANNEL*16];
-    for (uniform int cv = 0; cv < 16; cv++) {
-        uniform unsigned int vertexId = vertexIndices[cv];
-        uniform const float * uniform srcPtr = inQ + inDesc.offset + vertexId * inDesc.stride;
-        for (uniform int ch = 0; ch < nChannel; ch++) {
-            uniform int slot = ch * 16 + cv;
-            patchPoints[slot].x = srcPtr[3*ch + 0];
-            patchPoints[slot].y = srcPtr[3*ch + 1];
-            patchPoints[slot].z = srcPtr[3*ch + 2];
-        }
-    }
-
-    // Scale and offset used to map the incoming (u,v) to patch-local (s,t).
-    uniform float frac = getParamFraction(bitField);
-    uniform float uBase = (uniform float)getU(bitField)*frac;
-    uniform float vBase = (uniform float)getV(bitField)*frac;
-
-    foreach (n = 0 ... nPoint) {
-        float s = (u[n] - uBase) / frac;
-        float t = (v[n] - vBase) / frac;
-
-        float sBasis[4], tBasis[4];
-        getBSplineWeightsNoDerivative(s, sBasis);
-        getBSplineWeightsNoDerivative(t, tBasis);
-        adjustBoundaryWeights(bitField, sBasis, tBasis);
-
-        // Tensor-product weights in row-major order:
-        // entry k combines s-column (k % 4) with t-row (k / 4).
-        float weight[16];
-        for (uniform int k = 0; k < 16; ++k) {
-            weight[k] = sBasis[k % 4] * tBasis[k / 4];
-        }
-
-        // Sample n writes its channels back to back starting here.
-        float *dstPtr = outQ + outDesc.offset + n * outDesc.stride;
-        for (uniform int ch = 0; ch < nChannel; ch++) {
-            uniform int base = ch * 16;
-            Point Q;
-            Q.x = 0.0;
-            Q.y = 0.0;
-            Q.z = 0.0;
-            for (uniform int k = 0; k < 16; ++k) {
-                Q = Q + weight[k] * patchPoints[base + k];
-            }
-
-            *dstPtr++ = Q.x;
-            *dstPtr++ = Q.y;
-            *dstPtr++ = Q.z;
-        }
-    }
-}
-
-// Computes the 20 Gregory-basis weights at (s,t) for the position (point)
-// and for the s- and t-derivatives (deriv1, deriv2).  bitField is only used
-// to obtain the patch depth, which scales the derivative weights.
-void getGregoryWeights(uniform unsigned int bitField, 
-                       float s, float t, float point[20], float deriv1[20], float deriv2[20]) {
-    //
-    //  P3         e3-      e2+         P2
-    //     15------17-------11--------10
-    //     |        |        |        |
-    //     |        |        |        |
-    //     |        | f3-    | f2+    |
-    //     |       19       13        |
-    // e3+ 16-----18           14-----12 e2-
-    //     |     f3+          f2-     |
-    //     |                          |
-    //     |                          |
-    //     |      f0-         f1+     |
-    // e0- 2------4            8------6 e1+
-    //     |        3        9        |
-    //     |        | f0+    | f1-    |
-    //     |        |        |        |
-    //     |        |        |        |
-    //     O--------1--------7--------5
-    //  P0         e0+      e1-         P1
-    //
-
-    //  Indices of boundary and interior points and their corresponding Bezier points
-    //  (this can be reduced with more direct indexing and unrolling of loops):
-    //
-    static uniform int const boundaryGregory[12] = { 0, 1, 7, 5, 2, 6, 16, 12, 15, 17, 11, 10 };
-    static uniform int const boundaryBezSCol[12] = { 0, 1, 2, 3, 0, 3,  0,  3,  0,  1,  2,  3 };
-    static uniform int const boundaryBezTRow[12] = { 0, 0, 0, 0, 1, 1,  2,  2,  3,  3,  3,  3 };
-
-    static uniform int const interiorGregory[8] = { 3, 4,  8, 9,  13, 14,  18, 19 };
-    static uniform int const interiorBezSCol[8] = { 1, 1,  2, 2,   2,  2,   1,  1 };
-    static uniform int const interiorBezTRow[8] = { 1, 1,  1, 1,   2,  2,   2,  2 };
-
-    //
-    //  Bezier basis functions are denoted with B while the rational multipliers for the
-    //  interior points will be denoted G -- so we have B(s), B(t) and G(s,t):
-    //
-    //  Directional Bezier basis functions B at s and t:
-    float Bs[4], Bds[4];
-    float Bt[4], Bdt[4];
-
-    getBezierWeights(s, Bs, Bds);
-    getBezierWeights(t, Bt, Bdt);
-
-    //  Rational multipliers G at s and t:
-    float sC = 1.0f - s;
-    float tC = 1.0f - t;
-
-    //  Reciprocals of the four denominators; a non-positive sum falls back to 1 so
-    //  that no division by zero occurs at the patch corners.
-    //  Use <= here to avoid compiler warnings -- the sums should always be non-negative:
-    float df0 = s  + t;   df0 = (df0 <= 0.0f) ? 1.0f : (1.0f / df0);
-    float df1 = sC + t;   df1 = (df1 <= 0.0f) ? 1.0f : (1.0f / df1);
-    float df2 = sC + tC;  df2 = (df2 <= 0.0f) ? 1.0f : (1.0f / df2);
-    float df3 = s  + tC;  df3 = (df3 <= 0.0f) ? 1.0f : (1.0f / df3);
-
-    float G[8] = { s*df0, t*df0,  t*df1, sC*df1,  sC*df2, tC*df2,  tC*df3, s*df3 };
-
-    //  Combined weights for boundary and interior points:
-    for (uniform int i = 0; i < 12; ++i) {
-        point[boundaryGregory[i]] = Bs[boundaryBezSCol[i]] * Bt[boundaryBezTRow[i]];
-    }
-    for (uniform int i = 0; i < 8; ++i) {
-        point[interiorGregory[i]] = Bs[interiorBezSCol[i]] * Bt[interiorBezTRow[i]] * G[i];
-    }
-
-    //
-    //  For derivatives, the basis functions for the interior points are rational and ideally
-    //  require appropriate differentiation, i.e. product rule for the combination of B and G
-    //  and the quotient rule for the rational G itself.  As initially proposed by Loop et al
-    //  though, the approximation using the 16 Bezier points arising from the G(s,t) has
-    //  proved adequate (and is what the GPU shaders use) so we continue to use that here.
-    //
-    //  An implementation of the true derivatives is provided for future reference -- it is
-    //  unclear if the approximations will hold up under surface analysis involving higher
-    //  order differentiation.
-    //
-
-    //  Remember to include derivative scaling in all assignments below:
-    uniform float dScale = (uniform float)(1 << getDepth(bitField));
-
-    //  Combined weights for boundary points -- simple (scaled) tensor products:
-    for (uniform int i = 0; i < 12; ++i) {
-        uniform int iDst = boundaryGregory[i];
-        uniform int tRow = boundaryBezTRow[i];
-        uniform int sCol = boundaryBezSCol[i];
-
-        deriv1[iDst] = Bds[sCol] * Bt[tRow] * dScale;
-        deriv2[iDst] = Bdt[tRow] * Bs[sCol] * dScale;
-    }
-
-#define _USE_BEZIER_PSEUDO_DERIVATIVES
-#ifdef _USE_BEZIER_PSEUDO_DERIVATIVES
-    //  Approximation to the true Gregory derivatives by differentiating the Bezier patch
-    //  unique to the given (s,t), i.e. having F = (g^+ * f^+) + (g^- * f^-) as its four
-    //  interior points:
-    //
-    //  Combined weights for interior points -- (scaled) tensor products with G+ or G-:
-    for (uniform int i = 0; i < 8; ++i) {
-        uniform int iDst = interiorGregory[i];
-        uniform int tRow = interiorBezTRow[i];
-        uniform int sCol = interiorBezSCol[i];
-        deriv1[iDst] = Bds[sCol] * Bt[tRow] * G[i] * dScale;
-        deriv2[iDst] = Bdt[tRow] * Bs[sCol] * G[i] * dScale;
-    }
-#else
-    //  True Gregory derivatives using appropriate differentiation of composite functions:
-    //
-    //  Note that for G(s,t) = N(s,t) / D(s,t), all N' and D' are trivial constants (which
-    //  simplifies things for higher order derivatives).  And while each pair of functions
-    //  G (i.e. the G+ and G- corresponding to points f+ and f-) must sum to 1 to ensure
-    //  Bezier equivalence (when f+ = f-), the pairs of G' must similarly sum to 0.  So we
-    //  can potentially compute only one of the pair and negate the result for the other
-    //  (and with 4 or 8 computations involving these constants, this is all very SIMD
-    //  friendly...) but for now we treat all 8 independently for simplicity.
-    //
-    //float N[8] = {   s,     t,      t,     sC,      sC,     tC,      tC,     s };
-    //  D holds the reciprocal denominators computed above.  They depend on the varying
-    //  (s,t), so the array must be varying too -- a uniform array cannot be initialized
-    //  from varying values:
-    float D[8] = {   df0,   df0,    df1,    df1,     df2,    df2,     df3,   df3 };
-
-    static uniform float const Nds[8] = { 1.0f, 0.0f,  0.0f, -1.0f, -1.0f,  0.0f,  0.0f,  1.0f };
-    static uniform float const Ndt[8] = { 0.0f, 1.0f,  1.0f,  0.0f,  0.0f, -1.0f, -1.0f,  0.0f };
-
-    static uniform float const Dds[8] = { 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f,  1.0f,  1.0f };
-    static uniform float const Ddt[8] = { 1.0f, 1.0f,  1.0f,  1.0f, -1.0f, -1.0f, -1.0f, -1.0f };
-
-    //  Combined weights for interior points -- (scaled) combinations of B, B', G and G':
-    for (uniform int i = 0; i < 8; ++i) {
-        uniform int iDst = interiorGregory[i];
-        uniform int tRow = interiorBezTRow[i];
-        uniform int sCol = interiorBezSCol[i];
-
-        //  Quotient rule for G' (re-expressed in terms of G to simplify; D[i] is already
-        //  the reciprocal of the denominator, hence the multiplication):
-        float Gds = (Nds[i] - Dds[i] * G[i]) * D[i];
-        float Gdt = (Ndt[i] - Ddt[i] * G[i]) * D[i];
-
-        //  Product rule combining B and B' with G and G' (and scaled):
-        deriv1[iDst] = (Bds[sCol] * G[i] + Bs[sCol] * Gds) * Bt[tRow] * dScale;
-        deriv2[iDst] = (Bdt[tRow] * G[i] + Bt[tRow] * Gdt) * Bs[sCol] * dScale;
-    }
-#endif
-}
-
-// Position-only variant of the Gregory weight evaluation: fills point[20]
-// with the basis weights at (s,t).  bitField is accepted for signature
-// symmetry with the derivative version but is not read here.
-void getGregoryWeightsNoDerivative(uniform unsigned int bitField, float s, float t, float point[20]) {
-    //
-    //  Gregory point numbering (boundary points on the outer ring, the
-    //  paired interior face points f+/f- inside):
-    //
-    //  P3         e3-      e2+         P2
-    //     15------17-------11--------10
-    //     |        |        |        |
-    //     |        |        |        |
-    //     |        | f3-    | f2+    |
-    //     |       19       13        |
-    // e3+ 16-----18           14-----12 e2-
-    //     |     f3+          f2-     |
-    //     |                          |
-    //     |                          |
-    //     |      f0-         f1+     |
-    // e0- 2------4            8------6 e1+
-    //     |        3        9        |
-    //     |        | f0+    | f1-    |
-    //     |        |        |        |
-    //     |        |        |        |
-    //     O--------1--------7--------5
-    //  P0         e0+      e1-         P1
-    //
-
-    //  For every Gregory point: its index, and the column/row of the Bezier
-    //  basis function pair that weights it.
-    static uniform int const boundaryGregory[12] = { 0, 1, 7, 5, 2, 6, 16, 12, 15, 17, 11, 10 };
-    static uniform int const boundaryBezSCol[12] = { 0, 1, 2, 3, 0, 3,  0,  3,  0,  1,  2,  3 };
-    static uniform int const boundaryBezTRow[12] = { 0, 0, 0, 0, 1, 1,  2,  2,  3,  3,  3,  3 };
-
-    static uniform int const interiorGregory[8] = { 3, 4,  8, 9,  13, 14,  18, 19 };
-    static uniform int const interiorBezSCol[8] = { 1, 1,  2, 2,   2,  2,   1,  1 };
-    static uniform int const interiorBezTRow[8] = { 1, 1,  1, 1,   2,  2,   2,  2 };
-
-    //  Cubic Bezier basis values in each parametric direction.
-    float Bs[4];
-    float Bt[4];
-    getBezierWeightsNoDerivative(s, Bs);
-    getBezierWeightsNoDerivative(t, Bt);
-
-    //  Complements of the parameters, used by the rational multipliers.
-    float sC = 1.0f - s;
-    float tC = 1.0f - t;
-
-    //  Denominators of the four rational multiplier pairs, one per corner ...
-    float sum0 = s  + t;
-    float sum1 = sC + t;
-    float sum2 = sC + tC;
-    float sum3 = s  + tC;
-
-    //  ... and their reciprocals.  A non-positive sum maps to 1 instead of
-    //  dividing (the sums are expected to be non-negative).
-    float inv0 = (sum0 <= 0.0f) ? 1.0f : (1.0f / sum0);
-    float inv1 = (sum1 <= 0.0f) ? 1.0f : (1.0f / sum1);
-    float inv2 = (sum2 <= 0.0f) ? 1.0f : (1.0f / sum2);
-    float inv3 = (sum3 <= 0.0f) ? 1.0f : (1.0f / sum3);
-
-    //  Rational multipliers G(s,t) for the eight interior points.
-    float G[8] = { s*inv0, t*inv0,  t*inv1, sC*inv1,  sC*inv2, tC*inv2,  tC*inv3, s*inv3 };
-
-    //  Boundary points: plain tensor product of the Bezier basis functions.
-    for (uniform int k = 0; k < 12; ++k) {
-        uniform int dst = boundaryGregory[k];
-        uniform int col = boundaryBezSCol[k];
-        uniform int row = boundaryBezTRow[k];
-        point[dst] = Bs[col] * Bt[row];
-    }
-
-    //  Interior points: tensor product further scaled by the multiplier G.
-    for (uniform int k = 0; k < 8; ++k) {
-        uniform int dst = interiorGregory[k];
-        uniform int col = interiorBezSCol[k];
-        uniform int row = interiorBezTRow[k];
-        point[dst] = Bs[col] * Bt[row] * G[k];
-    }
-}
-
-export void
-evalGregory(uniform   unsigned int            bitField,
-            uniform   int                     nPoint, 
-            uniform   float                   u[], 
-            uniform   float                   v[],                    
-            uniform   const unsigned int      vertexIndices[],
-            uniform   const BufferDescriptor &inDesc,
-            uniform   const float             inQ[], 
-            uniform   const BufferDescriptor &outDesc,
-            uniform   float                   outQ[], 
-            uniform   const BufferDescriptor &duDesc,
-            uniform   float                   outDQU[],
-            uniform   const BufferDescriptor &dvDesc,            
-            uniform   float                   outDQV[])
-{
-    // Evaluates position and both first derivatives for nPoint (u,v) samples
-    // that all lie on the same 20-point Gregory-basis patch.  The primvar is
-    // processed as nChannel consecutive xyz triples.
-    uniform int nChannel = inDesc.length / 3;
-    assert(nChannel < MAX_CHANNEL);
-    
-    // Gather the 20 control points once, channel-major
-    // (slot = channel * 20 + point).
-    uniform Point controlVertices[MAX_CHANNEL*20];
-    for(uniform int i=0; i<20; i++) {
-        uniform unsigned int id = vertexIndices[i];
-        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
-        for(uniform int c=0; c<nChannel; c++) {
-            uniform int offset = c * 20 + i;
-            controlVertices[offset].x = pVertex[0];
-            controlVertices[offset].y = pVertex[1];
-            controlVertices[offset].z = pVertex[2];
-            pVertex += 3;
-        }
-    }
-
-    uniform float frac = getParamFraction(bitField);
-
-    // offset subtracted from (u,v) before normalizing to patch-local (s,t)
-    uniform float pu = (uniform float)getU(bitField)*frac;
-    uniform float pv = (uniform float)getV(bitField)*frac;
-
-    foreach( n = 0 ... nPoint) {
-        // normalize u,v coordinates
-        float s = (u[n] - pu) / frac;
-        float t = (v[n] - pv) / frac;
-        
-        float point[20], deriv1[20], deriv2[20];
-        getGregoryWeights(bitField, s, t, point, deriv1, deriv2);
-        
-        // Position: all 20 Gregory points contribute, and each channel owns
-        // 20 consecutive slots of controlVertices.  (Using 16 here, as the
-        // B-spline kernels do, drops points 16..19 and reads the wrong slots
-        // for every channel after the first.)
-        float *pOutQ = outQ + outDesc.offset + n * outDesc.stride;             
-        for(uniform int c=0; c<nChannel; c++) { 
-            uniform int offset = c * 20;
-            Point Q;
-            Q.x = Q.y = Q.z = 0.0;
-            for (uniform int i=0; i<20; ++i) {
-                Q = Q + point[i] * controlVertices[offset + i];                            
-            }    
-              
-            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
-        }   
-                              
-        // Derivatives with respect to s (deriv1) and t (deriv2).
-        float *pOutDQU = outDQU + duDesc.offset + n * duDesc.stride;
-        float *pOutDQV = outDQV + dvDesc.offset + n * dvDesc.stride;                                  
-        for(uniform int c=0; c<nChannel; c++) { 
-            uniform int offset = c * 20;
-            Point DQU, DQV;
-            DQU.x = DQU.y = DQU.z = 0.0;
-            DQV.x = DQV.y = DQV.z = 0.0;            
-            for (uniform int i=0; i<20; ++i) {
-                DQU = DQU + deriv1[i] * controlVertices[offset + i];
-                DQV = DQV + deriv2[i] * controlVertices[offset + i];                                            
-            }    
-              
-            *pOutDQU ++ = DQU.x, *pOutDQU ++ = DQU.y, *pOutDQU ++ = DQU.z;
-            *pOutDQV ++ = DQV.x, *pOutDQV ++ = DQV.y, *pOutDQV ++ = DQV.z;            
-        }                           
-    }
-}   
-
-export void
-evalGregoryNoDerivative(uniform unsigned int            bitField,
-                        uniform int                     nPoint, 
-                        uniform float                   u[], 
-                        uniform float                   v[],                    
-                        uniform const unsigned int      vertexIndices[],
-                        uniform const BufferDescriptor &inDesc,
-                        uniform const float             inQ[], 
-                        uniform const BufferDescriptor &outDesc,
-                        uniform float                   outQ[]
-                       )
-{
-    // Evaluates positions only for nPoint (u,v) samples that all lie on the
-    // same 20-point Gregory-basis patch.  The primvar is processed as
-    // nChannel consecutive xyz triples.
-    uniform int nChannel = inDesc.length / 3;
-    assert(nChannel < MAX_CHANNEL);
-
-    // Gather the patch control points once, channel-major
-    // (slot = channel * 20 + point), so that all samples share them.
-    uniform Point patchPoints[MAX_CHANNEL*20];
-    for (uniform int cv = 0; cv < 20; cv++) {
-        uniform unsigned int vertexId = vertexIndices[cv];
-        uniform const float * uniform srcPtr = inQ + inDesc.offset + vertexId * inDesc.stride;
-        for (uniform int ch = 0; ch < nChannel; ch++) {
-            uniform int slot = ch * 20 + cv;
-            patchPoints[slot].x = srcPtr[3*ch + 0];
-            patchPoints[slot].y = srcPtr[3*ch + 1];
-            patchPoints[slot].z = srcPtr[3*ch + 2];
-        }
-    }
-
-    // Scale and offset used to map the incoming (u,v) to patch-local (s,t).
-    uniform float frac = getParamFraction(bitField);
-    uniform float uBase = (uniform float)getU(bitField)*frac;
-    uniform float vBase = (uniform float)getV(bitField)*frac;
-
-    foreach (n = 0 ... nPoint) {
-        float s = (u[n] - uBase) / frac;
-        float t = (v[n] - vBase) / frac;
-
-        float basis[20];
-        getGregoryWeightsNoDerivative(bitField, s, t, basis);
-
-        // Sample n writes its channels back to back starting here.
-        float *dstPtr = outQ + outDesc.offset + n * outDesc.stride;
-        for (uniform int ch = 0; ch < nChannel; ch++) {
-            uniform int base = ch * 20;
-            Point Q;
-            Q.x = 0.0;
-            Q.y = 0.0;
-            Q.z = 0.0;
-            for (uniform int k = 0; k < 20; ++k) {
-                Q = Q + basis[k] * patchPoints[base + k];
-            }
-
-            *dstPtr++ = Q.x;
-            *dstPtr++ = Q.y;
-            *dstPtr++ = Q.z;
-        }
-    }
-}    
-
--- a/opensubdiv/osd/ispcEvalLimitKernel.isph
+++ b/opensubdiv/osd/ispcEvalLimitKernel.isph
@ -1,55 +0,0 @@
-//
-// ispcEvalLimitKernel.isph
-// (Header automatically generated by the ispc compiler.)
-// DO NOT EDIT THIS FILE.
-//
-
-#ifndef ISPC_ISPCEVALLIMITKERNEL_ISPH
-#define ISPC_ISPCEVALLIMITKERNEL_ISPH
-
-#include <stdint.h>
-
-
-
-#ifdef __cplusplus
-namespace ispc { /* namespace */
-#endif // __cplusplus
-// C/C++ view of the ISPC-side BufferDescriptor passed by reference to the
-// exported kernels below.  All three fields are counted in floats.
-#ifndef __ISPC_STRUCT_BufferDescriptor__
-#define __ISPC_STRUCT_BufferDescriptor__
-struct BufferDescriptor {
-    int32_t offset;  // added to the buffer base pointer before any element is addressed
-    int32_t length;  // floats per element; the kernels divide it by 3 to get the xyz channel count
-    int32_t stride;  // multiplied by the element index to step from one element to the next
-};
-#endif
-
-
-///////////////////////////////////////////////////////////////////////////
-// Functions exported from ispc code
-///////////////////////////////////////////////////////////////////////////
-#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
-extern "C" {
-#endif // __cplusplus
-    extern void evalBSpline(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ, const struct BufferDescriptor &duDesc, float * outDQU, const struct BufferDescriptor &dvDesc, float * outDQV);
-    
-    extern void evalBilinear(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ, const struct BufferDescriptor &duDesc, float * outDQU, const struct BufferDescriptor &dvDesc, float * outDQV);
-    
-    extern void evalGregory(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ, const struct BufferDescriptor &duDesc, float * outDQU, const struct BufferDescriptor &dvDesc, float * outDQV);
-    
-    extern void evalBSplineNoDerivative(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ);
-    
-    extern void evalBilinearNoDerivative(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ);
-    
-    extern void evalGregoryNoDerivative(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ);
-        
-    extern void getSIMDWidth(int32_t &simdWidth);
-#if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
-} /* end extern C */
-#endif // __cplusplus
-
-
-#ifdef __cplusplus
-} /* namespace */
-#endif // __cplusplus
-
-#endif // ISPC_ISPCEVALLIMITKERNEL_ISPH
--- a/opensubdiv/osd/ispcEvaluator.cpp
+++ b/opensubdiv/osd/ispcEvaluator.cpp
@ -1,289 +0,0 @@
-//
-//   Copyright 2015 Pixar
-//
-//   Licensed under the Apache License, Version 2.0 (the "Apache License")
-//   with the following modification; you may not use this file except in
-//   compliance with the Apache License and the following modification to it:
-//   Section 6. Trademarks. is deleted and replaced with:
-//
-//   6. Trademarks. This License does not grant permission to use the trade
-//      names, trademarks, service marks, or product names of the Licensor
-//      and its affiliates, except as required to comply with Section 4(c) of
-//      the License and to reproduce the content of the NOTICE file.
-//
-//   You may obtain a copy of the Apache License at
-//
-//       http://www.apache.org/licenses/LICENSE-2.0
-//
-//   Unless required by applicable law or agreed to in writing, software
-//   distributed under the Apache License with the above modification is
-//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-//   KIND, either express or implied. See the Apache License for the specific
-//   language governing permissions and limitations under the Apache License.
-//
-
-#include "ispcEvaluator.h"
-#include "cpuKernel.h"
-#include "../far/patchBasis.h"
-#include "ispcEvalLimitKernel.isph"
-
-#include <tbb/parallel_for.h>
-#include <cstdlib>
-
-namespace OpenSubdiv {
-namespace OPENSUBDIV_VERSION {
-
-namespace Osd {
-
-#define grain_size  512
-
-/* static */
-bool
-IspcEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
-                           float *dst,       BufferDescriptor const &dstDesc,
-                           const int * sizes,
-                           const int * offsets,
-                           const int * indices,
-                           const float * weights,
-                           int start, int end) {
-
-    // An empty (or inverted) stencil range is trivially successful.
-    if (start >= end) {
-        return true;
-    }
-
-    // Source and destination must describe elements of the same length.
-    const bool lengthsMatch = (srcDesc.length == dstDesc.length);
-    if (lengthsMatch) {
-        // Stencil evaluation is delegated to the shared CPU kernel.
-        // XXX: we can probably expand cpuKernel.cpp to here.
-        CpuEvalStencils(src, srcDesc, dst, dstDesc,
-                        sizes, offsets, indices, weights, start, end);
-    }
-
-    return lengthsMatch;
-}
-
-/* static */
-bool
-IspcEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
-                           float *dst,       BufferDescriptor const &dstDesc,
-                           float *du,        BufferDescriptor const &duDesc,
-                           float *dv,        BufferDescriptor const &dvDesc,
-                           const int * sizes,
-                           const int * offsets,
-                           const int * indices,
-                           const float * weights,
-                           const float * duWeights,
-                           const float * dvWeights,
-                           int start, int end) {
-    // An empty (or inverted) stencil range is trivially successful.
-    if (start >= end) {
-        return true;
-    }
-
-    // The value and both derivative outputs must all match the source
-    // element length.
-    if (srcDesc.length != dstDesc.length ||
-        srcDesc.length != duDesc.length  ||
-        srcDesc.length != dvDesc.length) {
-        return false;
-    }
-
-    // Stencil evaluation (with derivatives) is delegated to the shared CPU
-    // kernel.
-    CpuEvalStencils(src, srcDesc,
-                    dst, dstDesc,
-                    du,  duDesc,
-                    dv,  dvDesc,
-                    sizes, offsets, indices,
-                    weights, duWeights, dvWeights,
-                    start, end);
-
-    return true;
-}
-
-/// \brief Thin cursor over a strided buffer of T.
-///
-/// _p points at the current element, _length is the number of T values in
-/// one element and _stride is the distance (in T) between consecutive
-/// elements.  A null _p denotes an absent buffer: every mutating operation
-/// is then a no-op.
-template <typename T>
-struct BufferAdapter {
-    BufferAdapter(T *p, int length, int stride) :
-        _p(p), _length(length), _stride(stride) { }
-
-    /// Zeroes the current element (no-op when there is no buffer, matching
-    /// AddWithWeight and operator++).
-    void Clear() {
-        if (_p) {
-            for (int i = 0; i < _length; ++i) _p[i] = 0;
-        }
-    }
-
-    /// Accumulates src[0.._length) scaled by w into the current element.
-    void AddWithWeight(T const *src, float w) {
-        if (_p) {
-            for (int i = 0; i < _length; ++i) {
-                _p[i] += src[i] * w;
-            }
-        }
-    }
-
-    /// Returns the address of the element 'index' strides past the current one.
-    const T *operator[] (int index) const {
-        return _p + _stride * index;
-    }
-
-    /// Advances the cursor to the next element.
-    BufferAdapter<T> & operator ++() {
-        if (_p) {
-            _p += _stride;
-        }
-        return *this;
-    }
-
-    T *_p;
-    int _length;
-    int _stride;
-};
-
-/* static */
-bool
-IspcEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
-                           float *dst,       BufferDescriptor const &dstDesc,
-                           int numPatchCoords,
-                           const PatchCoord *patchCoords,
-                           const PatchArray *patchArrays,
-                           const int *patchIndexBuffer,
-                           const PatchParam *patchParamBuffer) { 
-    // Evaluates limit positions (no derivatives) at numPatchCoords patch
-    // coordinates.  Returns false only when the source and destination
-    // element lengths differ.
-    if (srcDesc.length != dstDesc.length) return false;
-        
-    // Copy BufferDescriptor to ispc version
-    // Since memory alignment in ISPC may be different from C++,
-    // we use the assignment for each field instead of the assignment for 
-    // the whole struct
-    ispc::BufferDescriptor ispcSrcDesc;
-    ispcSrcDesc.offset = srcDesc.offset;
-    ispcSrcDesc.length = srcDesc.length;
-    ispcSrcDesc.stride = srcDesc.stride;                                           
-                          
-    // The coordinates are split into sub-ranges of about grain_size entries,
-    // each processed by one TBB task.
-    tbb::blocked_range<int> range = tbb::blocked_range<int>(0, numPatchCoords, grain_size);
-    tbb::parallel_for(range, [&](const tbb::blocked_range<int> &r)
-    {    
-    // Signed index: it is compared against r.end() (an int) below.
-    int i = r.begin();
-        
-    // The kernels address output sample n at offset + n * stride, so the
-    // descriptor offset is advanced to the first coordinate of each batch.
-    // The base offset must be counted exactly once here, as in the update at
-    // the bottom of the loop.
-    ispc::BufferDescriptor ispcDstDesc;
-    ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
-    ispcDstDesc.length = dstDesc.length;
-    ispcDstDesc.stride = dstDesc.stride;
-    
-    while (i < r.end()) {
-        // the patch coordinates are sorted by patch handle
-        // the following code searches the coordinates that
-        // belong to the same patch so that they can be evaluated 
-        // together with ISPC
-        int nCoord = 1;
-        Far::PatchTable::PatchHandle handle = patchCoords[i].handle;
-        while(i + nCoord < r.end() && 
-              handle.isEqual(patchCoords[i + nCoord].handle) )
-              nCoord ++;
-              
-        PatchArray const &array = patchArrays[handle.arrayIndex];
-        int patchType = array.GetPatchType();
-        Far::PatchParam const & param = patchParamBuffer[handle.patchIndex];
-
-        unsigned int bitField = param.field1;
-
-        const int *cvs = &patchIndexBuffer[array.indexBase + handle.vertIndex];
-
-        // Contiguous (u,v) arrays for the batch, as expected by the kernels.
-        // NOTE(review): __declspec(align) and the variable-length arrays are
-        // compiler extensions -- confirm the supported toolchains.
-        __declspec( align(64) ) float u[nCoord];
-        __declspec( align(64) ) float v[nCoord];        
-        
-        for(int n=0; n<nCoord; n++) {
-            u[n] = patchCoords[i + n].s;
-            v[n] = patchCoords[i + n].t;            
-        }
-        
-        if (patchType == Far::PatchDescriptor::REGULAR) {
-            ispc::evalBSplineNoDerivative(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
-                              ispcDstDesc, dst);
-        } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
-            ispc::evalGregoryNoDerivative(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
-                              ispcDstDesc, dst);        
-        } else if (patchType == Far::PatchDescriptor::QUADS) {
-            ispc::evalBilinearNoDerivative(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
-                               ispcDstDesc, dst);           
-        } else {
-            assert(0);
-        }
-        
-        i += nCoord;
-        ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;                                                  
-    }
-    });
-    
-    return true;
-}
-
-/* static */
-bool
-IspcEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
-                           float *dst,       BufferDescriptor const &dstDesc,
-                           float *du,        BufferDescriptor const &duDesc,
-                           float *dv,        BufferDescriptor const &dvDesc,
-                           int numPatchCoords,
-                           const PatchCoord *patchCoords,
-                           const PatchArray *patchArrays,
-                           const int *patchIndexBuffer,
-                           const PatchParam *patchParamBuffer) {
-    // Evaluates limit positions and both first derivatives at numPatchCoords
-    // patch coordinates.  Returns false only when the source and destination
-    // element lengths differ.
-    // NOTE(review): unlike EvalStencils, duDesc/dvDesc lengths are not
-    // validated here although the kernels write srcDesc.length floats per
-    // sample into du and dv -- confirm whether callers rely on that.
-    if (srcDesc.length != dstDesc.length) return false;
-        
-    // Copy BufferDescriptor to ispc version
-    // Since memory alignment in ISPC may be different from C++,
-    // we use the assignment for each field instead of the assignment for 
-    // the whole struct
-    ispc::BufferDescriptor ispcSrcDesc;
-    ispcSrcDesc.offset = srcDesc.offset;
-    ispcSrcDesc.length = srcDesc.length;
-    ispcSrcDesc.stride = srcDesc.stride;                      
-                      
-    // The coordinates are split into sub-ranges of about grain_size entries,
-    // each processed by one TBB task.
-    tbb::blocked_range<int> range = tbb::blocked_range<int>(0, numPatchCoords, grain_size);
-    tbb::parallel_for(range, [&](const tbb::blocked_range<int> &r)
-    {    
-    // Signed index: it is compared against r.end() (an int) below.
-    int i = r.begin();
-        
-    // The kernels address output sample n at offset + n * stride, so each
-    // descriptor offset is advanced to the first coordinate of the batch.
-    // The base offset must be counted exactly once here, as in the updates
-    // at the bottom of the loop.
-    ispc::BufferDescriptor ispcDstDesc, ispcDuDesc, ispcDvDesc;                               
-    ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
-    ispcDstDesc.length = dstDesc.length;
-    ispcDstDesc.stride = dstDesc.stride;
-    
-    ispcDuDesc.offset  = duDesc.offset  + i * duDesc.stride;
-    ispcDuDesc.length  = duDesc.length;
-    ispcDuDesc.stride  = duDesc.stride;
-    
-    ispcDvDesc.offset  = dvDesc.offset  + i * dvDesc.stride;
-    ispcDvDesc.length  = dvDesc.length;
-    ispcDvDesc.stride  = dvDesc.stride;
-    while (i < r.end()) {
-        // the patch coordinates are sorted by patch handle
-        // the following code searches the coordinates that
-        // belong to the same patch so that they can be evaluated 
-        // together with ISPC
-        int nCoord = 1;
-        Far::PatchTable::PatchHandle handle = patchCoords[i].handle;
-        while(i + nCoord < r.end() && 
-              handle.isEqual(patchCoords[i + nCoord].handle) )
-              nCoord ++;
-              
-        PatchArray const &array = patchArrays[handle.arrayIndex];
-        int patchType = array.GetPatchType();
-        Far::PatchParam const & param = patchParamBuffer[handle.patchIndex];
-
-        unsigned int bitField = param.field1;
-
-        const int *cvs = &patchIndexBuffer[array.indexBase + handle.vertIndex];
-
-        // Contiguous (u,v) arrays for the batch, as expected by the kernels.
-        // NOTE(review): __declspec(align) and the variable-length arrays are
-        // compiler extensions -- confirm the supported toolchains.
-        __declspec( align(64) ) float u[nCoord];
-        __declspec( align(64) ) float v[nCoord];        
-        
-        for(int n=0; n<nCoord; n++) {
-            u[n] = patchCoords[i + n].s;
-            v[n] = patchCoords[i + n].t;            
-        }
-        
-        if (patchType == Far::PatchDescriptor::REGULAR) {
-            ispc::evalBSpline(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
-                              ispcDstDesc, dst, ispcDuDesc, du, ispcDvDesc, dv);
-        } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
-            ispc::evalGregory(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
-                              ispcDstDesc, dst, ispcDuDesc, du, ispcDvDesc, dv);        
-        } else if (patchType == Far::PatchDescriptor::QUADS) {
-            ispc::evalBilinear(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
-                               ispcDstDesc, dst, ispcDuDesc, du, ispcDvDesc, dv);           
-        } else {
-            assert(0);
-        }
-        
-        i += nCoord;
-        ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
-        ispcDuDesc.offset  = duDesc.offset  + i * duDesc.stride;
-        ispcDvDesc.offset  = dvDesc.offset  + i * dvDesc.stride;                                                        
-    }
-    });
-    
-    return true;
-}
-
-
-}  // end namespace Osd
-
-}  // end namespace OPENSUBDIV_VERSION
-}  // end namespace OpenSubdiv
--- a/opensubdiv/osd/ispcEvaluator.h
+++ b/opensubdiv/osd/ispcEvaluator.h
@ -1,482 +0,0 @@
-//
-//   Copyright 2015 Pixar
-//
-//   Licensed under the Apache License, Version 2.0 (the "Apache License")
-//   with the following modification; you may not use this file except in
-//   compliance with the Apache License and the following modification to it:
-//   Section 6. Trademarks. is deleted and replaced with:
-//
-//   6. Trademarks. This License does not grant permission to use the trade
-//      names, trademarks, service marks, or product names of the Licensor
-//      and its affiliates, except as required to comply with Section 4(c) of
-//      the License and to reproduce the content of the NOTICE file.
-//
-//   You may obtain a copy of the Apache License at
-//
-//       http://www.apache.org/licenses/LICENSE-2.0
-//
-//   Unless required by applicable law or agreed to in writing, software
-//   distributed under the Apache License with the above modification is
-//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-//   KIND, either express or implied. See the Apache License for the specific
-//   language governing permissions and limitations under the Apache License.
-//
-
-#ifndef OPENSUBDIV3_OSD_ISPC_EVALUATOR_H
-#define OPENSUBDIV3_OSD_ISPC_EVALUATOR_H
-
-#include "../version.h"
-
-#include <cstddef>
-#include <vector>
-#include "../osd/bufferDescriptor.h"
-#include "../osd/types.h"
-
-namespace OpenSubdiv {
-namespace OPENSUBDIV_VERSION {
-
-namespace Osd {
-
-/// \brief Collection of static evaluation entry points (stencil and limit
-///        evaluation) operating on CPU-addressable buffers. All methods are
-///        static; the class carries no state.
-///
-class IspcEvaluator {
-public:
-    /// ----------------------------------------------------------------------
-    ///
-    ///   Stencil evaluations with StencilTable
-    ///
-    /// ----------------------------------------------------------------------
-
-    /// \brief Generic static eval stencils function. This function has the
-    ///        same signature as the other device kernels so that it can be
-    ///        called in the same way from the OsdMesh template interface.
-    ///
-    ///        Returns false without touching any buffer when the stencil
-    ///        table is empty.
-    ///
-    /// @param srcBuffer      Input primvar buffer.
-    ///                       must have BindCpuBuffer() method returning a
-    ///                       const float pointer for read
-    ///
-    /// @param srcDesc        vertex buffer descriptor for the input buffer
-    ///
-    /// @param dstBuffer      Output primvar buffer
-    ///                       must have BindCpuBuffer() method returning a
-    ///                       float pointer for write
-    ///
-    /// @param dstDesc        vertex buffer descriptor for the output buffer
-    ///
-    /// @param stencilTable   Far::StencilTable or equivalent
-    ///
-    /// @param instance       not used in the cpu kernel
-    ///                       (declared as a typed pointer to prevent
-    ///                        undesirable template resolution)
-    ///
-    /// @param deviceContext  not used in the cpu kernel
-    ///
-    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
-    static bool EvalStencils(
-        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
-        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
-        STENCIL_TABLE const *stencilTable,
-        const IspcEvaluator *instance = NULL,
-        void * deviceContext = NULL) {
-
-        (void)instance;       // unused
-        (void)deviceContext;  // unused
-
-        // An empty table has no element 0 to take the address of below.
-        if (stencilTable->GetNumStencils() == 0)
-            return false;
-
-        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
-                            dstBuffer->BindCpuBuffer(), dstDesc,
-                            &stencilTable->GetSizes()[0],
-                            &stencilTable->GetOffsets()[0],
-                            &stencilTable->GetControlIndices()[0],
-                            &stencilTable->GetWeights()[0],
-                            /*start = */ 0,
-                            /*end   = */ stencilTable->GetNumStencils());
-    }
-
-    /// \brief Static eval stencils function which takes raw CPU pointers for
-    ///        input and output.
-    ///
-    /// @param src            Input primvar pointer. An offset of srcDesc
-    ///                       will be applied internally (i.e. the pointer
-    ///                       should not include the offset)
-    ///
-    /// @param srcDesc        vertex buffer descriptor for the input buffer
-    ///
-    /// @param dst            Output primvar pointer. An offset of dstDesc
-    ///                       will be applied internally.
-    ///
-    /// @param dstDesc        vertex buffer descriptor for the output buffer
-    ///
-    /// @param sizes          pointer to the sizes buffer of the stencil table
-    ///                       to apply for the range [start, end)
-    ///
-    /// @param offsets        pointer to the offsets buffer of the stencil table
-    ///
-    /// @param indices        pointer to the indices buffer of the stencil table
-    ///
-    /// @param weights        pointer to the weights buffer of the stencil table
-    ///
-    /// @param start          start index of stencil table
-    ///
-    /// @param end            end index of stencil table
-    ///
-    static bool EvalStencils(
-        const float *src,  BufferDescriptor const &srcDesc,
-        float *dst,        BufferDescriptor const &dstDesc,
-        const int * sizes,
-        const int * offsets,
-        const int * indices,
-        const float * weights,
-        int start, int end);
-
-    /// \brief Generic static eval stencils function with derivatives.
-    ///        This function has the same signature as the other device
-    ///        kernels so that it can be called in the same way from the
-    ///        OsdMesh template interface.
-    ///
-    ///        Returns false without touching any buffer when the stencil
-    ///        table is empty, matching the non-derivative overload.
-    ///
-    /// @param srcBuffer      Input primvar buffer.
-    ///                       must have BindCpuBuffer() method returning a
-    ///                       const float pointer for read
-    ///
-    /// @param srcDesc        vertex buffer descriptor for the input buffer
-    ///
-    /// @param dstBuffer      Output primvar buffer
-    ///                       must have BindCpuBuffer() method returning a
-    ///                       float pointer for write
-    ///
-    /// @param dstDesc        vertex buffer descriptor for the output buffer
-    ///
-    /// @param duBuffer       Output U-derivative buffer
-    ///                       must have BindCpuBuffer() method returning a
-    ///                       float pointer for write
-    ///
-    /// @param duDesc         vertex buffer descriptor for the duBuffer
-    ///
-    /// @param dvBuffer       Output V-derivative buffer
-    ///                       must have BindCpuBuffer() method returning a
-    ///                       float pointer for write
-    ///
-    /// @param dvDesc         vertex buffer descriptor for the dvBuffer
-    ///
-    /// @param stencilTable   Far::StencilTable or equivalent
-    ///
-    /// @param instance       not used in the cpu kernel
-    ///                       (declared as a typed pointer to prevent
-    ///                        undesirable template resolution)
-    ///
-    /// @param deviceContext  not used in the cpu kernel
-    ///
-    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
-    static bool EvalStencils(
-        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
-        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
-        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
-        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
-        STENCIL_TABLE const *stencilTable,
-        const IspcEvaluator *instance = NULL,
-        void * deviceContext = NULL) {
-
-        (void)instance;       // unused
-        (void)deviceContext;  // unused
-
-        // Same guard as the non-derivative overload: with zero stencils the
-        // table buffers have no element 0, so taking &...[0] below would be
-        // undefined behaviour.
-        if (stencilTable->GetNumStencils() == 0)
-            return false;
-
-        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
-                            dstBuffer->BindCpuBuffer(), dstDesc,
-                            duBuffer->BindCpuBuffer(),  duDesc,
-                            dvBuffer->BindCpuBuffer(),  dvDesc,
-                            &stencilTable->GetSizes()[0],
-                            &stencilTable->GetOffsets()[0],
-                            &stencilTable->GetControlIndices()[0],
-                            &stencilTable->GetWeights()[0],
-                            &stencilTable->GetDuWeights()[0],
-                            &stencilTable->GetDvWeights()[0],
-                            /*start = */ 0,
-                            /*end   = */ stencilTable->GetNumStencils());
-    }
-
-    /// \brief Static eval stencils function with derivatives, which takes
-    ///        raw CPU pointers for input and output.
-    ///
-    /// @param src            Input primvar pointer. An offset of srcDesc
-    ///                       will be applied internally (i.e. the pointer
-    ///                       should not include the offset)
-    ///
-    /// @param srcDesc        vertex buffer descriptor for the input buffer
-    ///
-    /// @param dst            Output primvar pointer. An offset of dstDesc
-    ///                       will be applied internally.
-    ///
-    /// @param dstDesc        vertex buffer descriptor for the output buffer
-    ///
-    /// @param du             Output U-derivatives pointer. An offset of
-    ///                       duDesc will be applied internally.
-    ///
-    /// @param duDesc         vertex buffer descriptor for the du buffer
-    ///
-    /// @param dv             Output V-derivatives pointer. An offset of
-    ///                       dvDesc will be applied internally.
-    ///
-    /// @param dvDesc         vertex buffer descriptor for the dv buffer
-    ///
-    /// @param sizes          pointer to the sizes buffer of the stencil table
-    ///
-    /// @param offsets        pointer to the offsets buffer of the stencil table
-    ///
-    /// @param indices        pointer to the indices buffer of the stencil table
-    ///
-    /// @param weights        pointer to the weights buffer of the stencil table
-    ///
-    /// @param duWeights      pointer to the du-weights buffer of the stencil table
-    ///
-    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
-    ///
-    /// @param start          start index of stencil table
-    ///
-    /// @param end            end index of stencil table
-    ///
-    static bool EvalStencils(
-        const float *src, BufferDescriptor const &srcDesc,
-        float *dst,       BufferDescriptor const &dstDesc,
-        float *du,        BufferDescriptor const &duDesc,
-        float *dv,        BufferDescriptor const &dvDesc,
-        const int * sizes,
-        const int * offsets,
-        const int * indices,
-        const float * weights,
-        const float * duWeights,
-        const float * dvWeights,
-        int start, int end);
-
-    /// ----------------------------------------------------------------------
-    ///
-    ///   Limit evaluations with PatchTable
-    ///
-    /// ----------------------------------------------------------------------
-
-    /// \brief Generic limit eval function. This function has the same
-    ///        signature as the other device kernels so that it can be called
-    ///        in the same way.
-    ///
-    /// @param srcBuffer        Input primvar buffer.
-    ///                         must have BindCpuBuffer() method returning a
-    ///                         const float pointer for read
-    ///
-    /// @param srcDesc          vertex buffer descriptor for the input buffer
-    ///
-    /// @param dstBuffer        Output primvar buffer
-    ///                         must have BindCpuBuffer() method returning a
-    ///                         float pointer for write
-    ///
-    /// @param dstDesc          vertex buffer descriptor for the output buffer
-    ///
-    /// @param numPatchCoords   number of patchCoords.
-    ///
-    /// @param patchCoords      array of locations to be evaluated.
-    ///                         must have BindCpuBuffer() method; the returned
-    ///                         pointer is reinterpreted as PatchCoord const *
-    ///
-    /// @param patchTable       CpuPatchTable or equivalent
-    ///                         XXX: currently Far::PatchTable can't be used
-    ///                              due to interface mismatch
-    ///
-    /// @param instance         not used in the cpu evaluator
-    ///
-    /// @param deviceContext    not used in the cpu evaluator
-    ///
-    template <typename SRC_BUFFER, typename DST_BUFFER,
-              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
-    static bool EvalPatches(
-        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
-        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
-        int numPatchCoords,
-        PATCHCOORD_BUFFER *patchCoords,
-        PATCH_TABLE *patchTable,
-        IspcEvaluator const *instance = NULL,
-        void * deviceContext = NULL) {
-
-        (void)instance;       // unused
-        (void)deviceContext;  // unused
-
-        // XXX: PatchCoords is somewhat abusing vertex primvar buffer interop;
-        //      the buffer is bound as raw memory and downcast to PatchCoord.
-        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
-                           dstBuffer->BindCpuBuffer(), dstDesc,
-                           numPatchCoords,
-                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
-                           patchTable->GetPatchArrayBuffer(),
-                           patchTable->GetPatchIndexBuffer(),
-                           patchTable->GetPatchParamBuffer());
-    }
-
-    /// \brief Generic limit eval function with derivatives. This function has
-    ///        the same signature as the other device kernels so that it can
-    ///        be called in the same way.
-    ///
-    /// @param srcBuffer        Input primvar buffer.
-    ///                         must have BindCpuBuffer() method returning a
-    ///                         const float pointer for read
-    ///
-    /// @param srcDesc          vertex buffer descriptor for the input buffer
-    ///
-    /// @param dstBuffer        Output primvar buffer
-    ///                         must have BindCpuBuffer() method returning a
-    ///                         float pointer for write
-    ///
-    /// @param dstDesc          vertex buffer descriptor for the output buffer
-    ///
-    /// @param duBuffer         Output U-derivatives buffer
-    ///                         must have BindCpuBuffer() method returning a
-    ///                         float pointer for write
-    ///
-    /// @param duDesc           vertex buffer descriptor for the duBuffer
-    ///
-    /// @param dvBuffer         Output V-derivatives buffer
-    ///                         must have BindCpuBuffer() method returning a
-    ///                         float pointer for write
-    ///
-    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
-    ///
-    /// @param numPatchCoords   number of patchCoords.
-    ///
-    /// @param patchCoords      array of locations to be evaluated.
-    ///                         must have BindCpuBuffer() method; the returned
-    ///                         pointer is reinterpreted as PatchCoord const *
-    ///
-    /// @param patchTable       CpuPatchTable or equivalent
-    ///                         XXX: currently Far::PatchTable can't be used
-    ///                              due to interface mismatch
-    ///
-    /// @param instance         not used in the cpu evaluator
-    ///
-    /// @param deviceContext    not used in the cpu evaluator
-    ///
-    template <typename SRC_BUFFER, typename DST_BUFFER,
-              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
-    static bool EvalPatches(
-        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
-        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
-        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
-        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
-        int numPatchCoords,
-        PATCHCOORD_BUFFER *patchCoords,
-        PATCH_TABLE *patchTable,
-        IspcEvaluator const *instance = NULL,
-        void * deviceContext = NULL) {
-
-        (void)instance;       // unused
-        (void)deviceContext;  // unused
-
-        // XXX: PatchCoords is somewhat abusing vertex primvar buffer interop.
-        //      ideally all buffer classes should have templated by datatype
-        //      so that downcast isn't needed there.
-        //      (e.g. Osd::CpuBuffer<PatchCoord> )
-        //
-        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
-                           dstBuffer->BindCpuBuffer(), dstDesc,
-                           duBuffer->BindCpuBuffer(),  duDesc,
-                           dvBuffer->BindCpuBuffer(),  dvDesc,
-                           numPatchCoords,
-                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
-                           patchTable->GetPatchArrayBuffer(),
-                           patchTable->GetPatchIndexBuffer(),
-                           patchTable->GetPatchParamBuffer());
-    }
-
-    /// \brief Static limit eval function. It takes an array of PatchCoord
-    ///        and evaluates limit values on the given PatchTable.
-    ///
-    /// @param src              Input primvar pointer. An offset of srcDesc
-    ///                         will be applied internally (i.e. the pointer
-    ///                         should not include the offset)
-    ///
-    /// @param srcDesc          vertex buffer descriptor for the input buffer
-    ///
-    /// @param dst              Output primvar pointer. An offset of dstDesc
-    ///                         will be applied internally.
-    ///
-    /// @param dstDesc          vertex buffer descriptor for the output buffer
-    ///
-    /// @param numPatchCoords   number of patchCoords.
-    ///
-    /// @param patchCoords      array of locations to be evaluated.
-    ///
-    /// @param patchArrays      an array of Osd::PatchArray struct
-    ///                         indexed by PatchCoord::arrayIndex
-    ///
-    /// @param patchIndexBuffer an array of patch indices
-    ///                         indexed by PatchCoord::vertIndex
-    ///
-    /// @param patchParamBuffer an array of Osd::PatchParam struct
-    ///                         indexed by PatchCoord::patchIndex
-    ///
-    static bool EvalPatches(
-        const float *src, BufferDescriptor const &srcDesc,
-        float *dst,       BufferDescriptor const &dstDesc,
-        int numPatchCoords,
-        const PatchCoord *patchCoords,
-        const PatchArray *patchArrays,
-        const int *patchIndexBuffer,
-        const PatchParam *patchParamBuffer);
-
-    /// \brief Static limit eval function with derivatives. It takes an array
-    ///        of PatchCoord and evaluates limit values and their U/V
-    ///        derivatives on the given PatchTable.
-    ///
-    /// @param src              Input primvar pointer. An offset of srcDesc
-    ///                         will be applied internally (i.e. the pointer
-    ///                         should not include the offset)
-    ///
-    /// @param srcDesc          vertex buffer descriptor for the input buffer
-    ///
-    /// @param dst              Output primvar pointer. An offset of dstDesc
-    ///                         will be applied internally.
-    ///
-    /// @param dstDesc          vertex buffer descriptor for the output buffer
-    ///
-    /// @param du               Output U-derivatives pointer. An offset of
-    ///                         duDesc will be applied internally.
-    ///
-    /// @param duDesc           vertex buffer descriptor for the du buffer
-    ///
-    /// @param dv               Output V-derivatives pointer. An offset of
-    ///                         dvDesc will be applied internally.
-    ///
-    /// @param dvDesc           vertex buffer descriptor for the dv buffer
-    ///
-    /// @param numPatchCoords   number of patchCoords.
-    ///
-    /// @param patchCoords      array of locations to be evaluated.
-    ///
-    /// @param patchArrays      an array of Osd::PatchArray struct
-    ///                         indexed by PatchCoord::arrayIndex
-    ///
-    /// @param patchIndexBuffer an array of patch indices
-    ///                         indexed by PatchCoord::vertIndex
-    ///
-    /// @param patchParamBuffer an array of Osd::PatchParam struct
-    ///                         indexed by PatchCoord::patchIndex
-    ///
-    static bool EvalPatches(
-        const float *src, BufferDescriptor const &srcDesc,
-        float *dst,       BufferDescriptor const &dstDesc,
-        float *du,        BufferDescriptor const &duDesc,
-        float *dv,        BufferDescriptor const &dvDesc,
-        int numPatchCoords,
-        PatchCoord const *patchCoords,
-        PatchArray const *patchArrays,
-        const int *patchIndexBuffer,
-        PatchParam const *patchParamBuffer);
-
-    /// ----------------------------------------------------------------------
-    ///
-    ///   Other methods
-    ///
-    /// ----------------------------------------------------------------------
-
-    /// \brief synchronize all asynchronous computation invoked on this device.
-    ///        This implementation is a no-op; the argument is ignored.
-    static void Synchronize(void * /*deviceContext = NULL*/) {
-        // nothing.
-    }
-};
-
-
-}  // end namespace Osd
-
-}  // end namespace OPENSUBDIV_VERSION
-using namespace OPENSUBDIV_VERSION;
-
-}  // end namespace OpenSubdiv
-
-
-#endif  // OPENSUBDIV3_OSD_ISPC_EVALUATOR_H