Revert "Add ISPC limit surface evaluation"

2025-01-03 13:41:06 +00:00 · 2015-07-20 17:13:51 -07:00 · 2015-07-20 17:13:51 -07:00 · 8a8771c97d
commit 8a8771c97d
parent b006dc328e
13 changed files with 30 additions and 1953 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -197,8 +197,6 @@ if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_CLANGCC OR CMAKE_COMPILER_IS_IC
            endif()
        endforeach()
        list(APPEND OSD_COMPILER_FLAGS -std=c++11)
    endif()
 elseif(MSVC)
@ -323,9 +321,6 @@ endif()
 if(NOT NO_TBB)
    find_package(TBB 4.0)
 endif()
 if(NOT NO_ISPC)
    find_package(ISPC 1.6)
 endif()
 if (NOT NO_OPENGL)
    find_package(OpenGL)
 endif()
@ -544,12 +539,6 @@ if (NOT NO_MAYA)
    endif()
 endif()
 if(ISPC_FOUND)
    add_definitions(
        -DOPENSUBDIV_HAS_ISPC
    )
 endif()
 # Link examples & regressions dynamically against Osd
 set( OSD_LINK_TARGET osd_dynamic_cpu osd_dynamic_gpu )
--- a/cmake/FindISPC.cmake
+++ b/cmake/FindISPC.cmake
@ -1,94 +0,0 @@
 #
 #   Copyright 2013 Pixar
 #
 #   Licensed under the Apache License, Version 2.0 (the "Apache License")
 #   with the following modification; you may not use this file except in
 #   compliance with the Apache License and the following modification to it:
 #   Section 6. Trademarks. is deleted and replaced with:
 #
 #   6. Trademarks. This License does not grant permission to use the trade
 #      names, trademarks, service marks, or product names of the Licensor
 #      and its affiliates, except as required to comply with Section 4(c) of
 #      the License and to reproduce the content of the NOTICE file.
 #
 #   You may obtain a copy of the Apache License at
 #
 #       http://www.apache.org/licenses/LICENSE-2.0
 #
 #   Unless required by applicable law or agreed to in writing, software
 #   distributed under the Apache License with the above modification is
 #   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #   KIND, either express or implied. See the Apache License for the specific
 #   language governing permissions and limitations under the Apache License.
 #
 # - Try to find Intel's ISPC
 # Once done this will define
 #
 #  ISPC_FOUND - System has ISPC
 #  ISPC_DIR - The ISPC directory
 # Obtain ISPC directory
 # Locate the directory that contains the `ispc` executable and store it in
 # ISPC_DIR. Only the generic (non-Windows, non-Apple) branch performs a
 # search; on the other platforms ISPC_DIR is left untouched here.
 if (WIN32)
    # Not implemented: no search is performed on Windows.
 elseif (APPLE)
    # Not implemented: no search is performed on Apple platforms.
 else ()
    # NO_DEFAULT_PATH / NO_SYSTEM_ENVIRONMENT_PATH restrict the lookup to the
    # user-supplied ISPC_LOCATION; system locations are not consulted.
    find_path(ISPC_DIR
        NAMES
            ispc
        PATHS
            ${ISPC_LOCATION}  
        NO_DEFAULT_PATH NO_SYSTEM_ENVIRONMENT_PATH
        DOC "The directory where ISPC reside")
 endif ()
 # Derive ISPC_VERSION (major.minor.patch) from the banner printed by
 # `ispc --version`. Only attempted once the executable directory is known.
 if (ISPC_DIR)
    execute_process(COMMAND "${ISPC_DIR}/ispc" --version
                    OUTPUT_VARIABLE ISPC_VERSION
                    OUTPUT_STRIP_TRAILING_WHITESPACE)
    # The dots are escaped so they only match literal '.' separators, and every
    # component accepts several digits (e.g. 1.10.0). The input is quoted so
    # that an empty banner produces an empty ISPC_VERSION instead of a
    # "string needs at least 5 arguments" configure error.
    string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" ISPC_VERSION "${ISPC_VERSION}")
 endif ()
 # Hand the results to the standard find-module helper: it sets ISPC_FOUND when
 # ISPC_DIR is set and, if a version was requested by the caller of
 # find_package(ISPC <version>), checks it against ISPC_VERSION.
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(ISPC
    REQUIRED_VARS
        ISPC_DIR
    VERSION_VAR
        ISPC_VERSION
 )
 # Hide the cache entry from the default (non-advanced) cache views.
 mark_as_advanced( ISPC_DIR )
 # ispc_compile(<source>...)
 #
 # For every ISPC source file given (relative to CMAKE_CURRENT_SOURCE_DIR),
 # declares a custom command that runs the ispc compiler and produces
 #   ${ISPC_TARGET_DIR}/<name>.dev.o    - the object file
 #   ${ISPC_TARGET_DIR}/<name>_ispc.h   - the generated C/C++ header
 #   ${ISPC_TARGET_DIR}/<name>.dev.idep - the dependency listing (-MMM)
 #
 # This stays a macro on purpose: it publishes ISPC_TARGET_DIR and
 # ISPC_OBJECTS (the list of generated object files) in the caller's scope.
 macro(ispc_compile)
    set(ISPC_TARGET_DIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/osd_ispc_obj.dir)
    set(ISPC_OBJECTS "")
    foreach(src ${ARGN})
        get_filename_component(fname ${src} NAME_WE)
        set(results "${ISPC_TARGET_DIR}/${fname}.dev.o")
        add_custom_command(
            OUTPUT ${results} ${ISPC_TARGET_DIR}/${fname}_ispc.h
            # Make sure the output directory exists before ispc writes to it.
            COMMAND ${CMAKE_COMMAND} -E make_directory ${ISPC_TARGET_DIR}
            COMMAND ${ISPC_DIR}/ispc
                --pic
                -O1
                --wno-perf
                --woff
                -h ${ISPC_TARGET_DIR}/${fname}_ispc.h
                -MMM ${ISPC_TARGET_DIR}/${fname}.dev.idep
                -o ${results}
                ${CMAKE_CURRENT_SOURCE_DIR}/${src}
            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${src}
            COMMENT "Compiling ISPC source ${src}"
            # VERBATIM makes argument escaping identical on every platform; the
            # stray "\;" token formerly on the command line has been dropped.
            VERBATIM
        )
        list(APPEND ISPC_OBJECTS ${results})
    endforeach()
 endmacro()
--- a/examples/glEvalLimit/glEvalLimit.cpp
+++ b/examples/glEvalLimit/glEvalLimit.cpp
@ -34,10 +34,6 @@ GLFWmonitor* g_primary=0;
 #include <osd/cpuGLVertexBuffer.h>
 #include <osd/mesh.h>
 #ifdef OPENSUBDIV_HAS_ISPC
    #include <osd/ispcEvaluator.h>
 #endif    
 #ifdef OPENSUBDIV_HAS_TBB
    #include <osd/tbbEvaluator.h>
 #endif
@ -108,8 +104,7 @@ enum KernelType { kCPU = 0,
                  kCUDA = 3,
                  kCL = 4,
                  kGLXFB = 5,
-                  kGLCompute = 6,
+                  kGLCompute = 6 };
                  kISPC = 7 };
 enum EndCap      { kEndCapBSplineBasis,
                   kEndCapGregoryBasis };
@ -174,10 +169,10 @@ float g_currentTime = 0;
 Stopwatch g_fpsTimer;
 //------------------------------------------------------------------------------
-int g_nParticles = 655360;
+int g_nParticles = 65536;
 bool g_randomStart = true;//false;
-bool g_animParticles = false;
+bool g_animParticles = true;
 GLuint g_samplesVAO=0;
@ -444,9 +439,7 @@ updateGeom() {
    assert(g_particles);
    float elapsed = g_currentTime - g_prevTime;
-    if(elapsed != 0.0f) {
+    g_particles->Update(elapsed);
        g_particles->Update(elapsed);
    }
    g_prevTime = g_currentTime;
    std::vector<OpenSubdiv::Osd::PatchCoord> const &patchCoords
@ -655,20 +648,8 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
            (vertexStencils, varyingStencils,
             nCoarseVertices, nverts, g_nParticles, g_patchTable,
             &glComputeEvaluatorCache);
    }             
 #endif
 #if  defined(OPENSUBDIV_HAS_ISPC) && defined(OPENSUBDIV_HAS_TBB)
    else if(g_kernel == kISPC) {
        g_evalOutput = new EvalOutput<Osd::CpuGLVertexBuffer,
                                      Osd::CpuGLVertexBuffer,
                                      Far::StencilTable,
                                      Osd::CpuPatchTable,
                                      Osd::IspcEvaluator>
            (vertexStencils, varyingStencils,
             nCoarseVertices, nverts, g_nParticles, g_patchTable);    
    }
 #endif    
    // Create the 'uv particles' manager - this class manages the limit
    // location samples (ptex face index, (s,t) and updates them between frames.
@ -894,7 +875,7 @@ display() {
        }
        if (g_endCap != kEndCapBSplineBasis &&
-            (g_kernel != kCPU && g_kernel != kOPENMP && g_kernel != kTBB && g_kernel != kISPC)) {
+            (g_kernel != kCPU && g_kernel != kOPENMP && g_kernel != kTBB)) {
            static char msg[] =
                "ERROR: This kernel only supports BSpline basis patches.";
            g_hud.DrawString(g_width/4, g_height/4+20, 1, 0, 0, msg);
@ -1148,9 +1129,6 @@ initHUD() {
 #ifdef OPENSUBDIV_HAS_TBB
    g_hud.AddPullDownButton(compute_pulldown, "TBB", kTBB);
 #endif
 #if  defined(OPENSUBDIV_HAS_ISPC) && defined(OPENSUBDIV_HAS_TBB)
    g_hud.AddPullDownButton(compute_pulldown, "ISPC", kISPC);
 #endif
 #ifdef OPENSUBDIV_HAS_CUDA
    g_hud.AddPullDownButton(compute_pulldown, "CUDA", kCUDA);
 #endif
--- a/examples/glEvalLimit/particles.cpp
+++ b/examples/glEvalLimit/particles.cpp
@ -32,17 +32,17 @@
 #ifdef OPENSUBDIV_HAS_TBB
 #include <tbb/parallel_for.h>
 #include <tbb/atomic.h>
-
+tbb::atomic<int> g_tbbCounter;
 class TbbUpdateKernel {
 public:
    TbbUpdateKernel(float speed,
                    STParticles::Position *positions,
                    float *velocities,
                    std::vector<STParticles::FaceInfo> const &adjacency,
-                    PatchHandleMap *patchHandleMap,
+                    OpenSubdiv::Osd::PatchCoord *patchCoords,
                    OpenSubdiv::Far::PatchMap const *patchMap) :
        _speed(speed), _positions(positions), _velocities(velocities),
-        _adjacency(adjacency), _patchHandleMap(patchHandleMap), _patchMap(patchMap) {
+        _adjacency(adjacency), _patchCoords(patchCoords), _patchMap(patchMap) {
    }
    void operator () (tbb::blocked_range<int> const &r) const {
@ -76,13 +76,9 @@ public:
            OpenSubdiv::Far::PatchTable::PatchHandle const *handle =
                _patchMap->FindPatch(p->ptexIndex, p->s, p->t);
            if (handle) {
-                PatchHandleMap::accessor a;
+                int index = g_tbbCounter.fetch_and_add(1);
-                if( !_patchHandleMap->find(a, handle)) {  
+                _patchCoords[index] =
-                    _patchHandleMap->insert(a, handle);               
+                    OpenSubdiv::Osd::PatchCoord(*handle, p->s, p->t);
                }
                std::vector<float> &st = a->second;
                st.push_back(p->s);
                st.push_back(p->t);  
            }
        }
    }
@ -91,7 +87,7 @@ private:
    STParticles::Position *_positions;
    float *_velocities;
    std::vector<STParticles::FaceInfo> const &_adjacency;
-    PatchHandleMap *_patchHandleMap;
+    OpenSubdiv::Osd::PatchCoord *_patchCoords;
    OpenSubdiv::Far::PatchMap const *_patchMap;
 };
 #endif
@ -280,36 +276,18 @@ STParticles::Update(float deltaTime) {
    if (deltaTime == 0) return;
    float speed = GetSpeed() * std::max(0.001f, std::min(deltaTime, 0.5f));
    _patchCoords.clear();
    // XXX: this process should be parallelized.
 #ifdef OPENSUBDIV_HAS_TBB
    _patchHandleMap.clear();
    _patchCoords.resize((int)GetNumParticles());
    TbbUpdateKernel kernel(speed, &_positions[0], &_velocities[0],
-                           _adjacency, &_patchHandleMap, _patchMap);;
+                           _adjacency, &_patchCoords[0], _patchMap);;
    g_tbbCounter = 0;
    tbb::blocked_range<int> range(0, GetNumParticles(), 256);
    tbb::parallel_for(range, kernel);
-    
+    _patchCoords.resize(g_tbbCounter);
    int nCoord = 0;
    for(PatchHandleMap::iterator i  = _patchHandleMap.begin();
                                 i != _patchHandleMap.end();
                                 i ++) {
        nCoord += (i->second.size() / 2);
    }
    _patchCoords.resize(nCoord);
    int index = 0;
    for(PatchHandleMap::iterator i  = _patchHandleMap.begin();
                                 i != _patchHandleMap.end();
                                 i ++) {
        for(int j = 0; j < i->second.size(); j += 2) {
            _patchCoords[index].handle = *(i->first);
            _patchCoords[index].s      = i->second[j];
            _patchCoords[index].t      = i->second[j+1];
            index ++;
        }
    }     
 #else
    Position *  p = &_positions[0];
    float    * dp = &_velocities[0];
--- a/examples/glEvalLimit/particles.h
+++ b/examples/glEvalLimit/particles.h
@ -30,11 +30,6 @@
 #include <osd/types.h>
 #include <iostream>
 #ifdef OPENSUBDIV_HAS_TBB
 #include <tbb/concurrent_hash_map.h>
 typedef tbb::concurrent_hash_map< OpenSubdiv::Far::PatchTable::PatchHandle const*, std::vector<float> > PatchHandleMap;
 #endif
 //
 // In order to emphasize the dynamic nature of the EvalLimit API, where the
 // locations can be arbitrarily updated before each evaluation, the glEvalLimit
@ -147,7 +142,7 @@ public:
        return _velocities;
    }
-    std::vector<OpenSubdiv::Osd::PatchCoord> const &GetPatchCoords() const {
+    std::vector<OpenSubdiv::Osd::PatchCoord> GetPatchCoords() const {
        return _patchCoords;
    }
@ -165,10 +160,6 @@ private:
    std::vector<float> _velocities;
 #ifdef OPENSUBDIV_HAS_TBB    
    PatchHandleMap  _patchHandleMap;
 #endif
    std::vector<OpenSubdiv::Osd::PatchCoord> _patchCoords;
    float _speed;  // velocity multiplier
--- a/opensubdiv/CMakeLists.txt
+++ b/opensubdiv/CMakeLists.txt
@ -147,16 +147,9 @@ if (NOT NO_LIB)
    )
    set_target_properties(osd_static_cpu PROPERTIES OUTPUT_NAME osdCPU CLEAN_DIRECT_OUTPUT 1)
-    if( ISPC_FOUND)
+    target_link_libraries(osd_static_cpu
-        target_link_libraries(osd_static_cpu
+        ${PLATFORM_CPU_LIBRARIES}
-            osd_ispc_obj
+    )
            ${PLATFORM_CPU_LIBRARIES}
        )
    else()
        target_link_libraries(osd_static_cpu
            ${PLATFORM_CPU_LIBRARIES}
        )    
    endif()
    install( TARGETS osd_static_cpu DESTINATION "${CMAKE_LIBDIR_BASE}" )
@ -207,16 +200,9 @@ if (NOT NO_LIB)
                )
        endif()
-        if ( ISPC_FOUND)
+        target_link_libraries(osd_dynamic_cpu
-            target_link_libraries(osd_dynamic_cpu
+            ${PLATFORM_CPU_LIBRARIES}
-                osd_ispc_obj
+        )
                ${PLATFORM_CPU_LIBRARIES}
            )
        else()
            target_link_libraries(osd_dynamic_cpu
                ${PLATFORM_CPU_LIBRARIES}
            )        
        endif()
        install( TARGETS osd_dynamic_cpu LIBRARY DESTINATION "${CMAKE_LIBDIR_BASE}" )
--- a/opensubdiv/far/patchParam.h
+++ b/opensubdiv/far/patchParam.h
@ -116,15 +116,6 @@ struct PatchParam {
    ///
    void Normalize( float & u, float & v ) const;
    /// This function is the reverse operation of function Normalize()
    /// The (u,v) pair is converted from patch sub-parametric space to control
    /// face parametric space.
    ///
    /// @param u  u parameter
    /// @param v  v parameter
    ///        
    void Denormalize( float & u, float & v) const;
    unsigned int field0:32;
    unsigned int field1:32;
 };
@ -170,20 +161,6 @@ PatchParam::Normalize( float & u, float & v ) const {
    v = (v - pv) / frac;
 }
 // Inverse of Normalize(): maps (u,v) from the patch's sub-parametric space
 // back to the parametric space of the control face, in place.
 inline void
 PatchParam::Denormalize( float & u, float & v ) const {
    // Fraction of the control face's parametric range covered by this patch.
    float frac = GetParamFraction();
    // Origin of the sub-patch within the control face, in face coordinates.
    float pu = (float)GetU()*frac;
    float pv = (float)GetV()*frac;
    // Denormalize u,v: scale by the fraction, then offset by the origin.
    u = u * frac + pu;
    v = v * frac + pv;    
 }
 } // end namespace Far
 } // end namespace OPENSUBDIV_VERSION
--- a/opensubdiv/far/patchTable.h
+++ b/opensubdiv/far/patchTable.h
@ -68,12 +68,6 @@ public:
        Index arrayIndex, // Array index of the patch
              patchIndex, // Absolute Index of the patch
              vertIndex;  // Relative offset to the first CV of the patch in array
        bool isEqual(const PatchHandle &other) {
            return other.arrayIndex == arrayIndex &&
                   other.patchIndex == patchIndex &&
                   other.vertIndex  == vertIndex;
        }
    };
 public:
--- a/opensubdiv/osd/CMakeLists.txt
+++ b/opensubdiv/osd/CMakeLists.txt
@ -26,7 +26,6 @@
 #-------------------------------------------------------------------------------
 # source & headers
 set(CPU_SOURCE_FILES
    cpuEvaluator.cpp
    cpuKernel.cpp
@ -34,12 +33,8 @@ set(CPU_SOURCE_FILES
    cpuVertexBuffer.cpp
 )
 if( ISPC_FOUND) 
    list(APPEND CPU_SOURCE_FILES ispcEvaluator.cpp)  
 endif()
 set(GPU_SOURCE_FILES )
-set(ISPC_SOURCE_FILES )
+
 set(INC_FILES )
 set(PRIVATE_HEADER_FILES
@ -301,17 +296,6 @@ if( CUDA_FOUND )
    endif()
 endif()
 if( ISPC_FOUND) 
    list(APPEND ISPC_SOURCE_FILES
         ispcEvalLimitKernel.ispc
    ) 
    # Compile ISPC code to objs
    ispc_compile(${ISPC_SOURCE_FILES})
    ADD_LIBRARY(osd_ispc_obj STATIC ${ISPC_OBJECTS})   
    SET_TARGET_PROPERTIES(osd_ispc_obj PROPERTIES LINKER_LANGUAGE C)    
 endif()
 list(APPEND DOXY_HEADER_FILES ${CUDA_PUBLIC_HEADERS})
 #-------------------------------------------------------------------------------
--- a/opensubdiv/osd/ispcEvalLimitKernel.ispc
+++ b/opensubdiv/osd/ispcEvalLimitKernel.ispc
@ -1,880 +0,0 @@
 //
 //   Copyright 2013 Pixar
 //
 //   Licensed under the Apache License, Version 2.0 (the "Apache License")
 //   with the following modification; you may not use this file except in
 //   compliance with the Apache License and the following modification to it:
 //   Section 6. Trademarks. is deleted and replaced with:
 //
 //   6. Trademarks. This License does not grant permission to use the trade
 //      names, trademarks, service marks, or product names of the Licensor
 //      and its affiliates, except as required to comply with Section 4(c) of
 //      the License and to reproduce the content of the NOTICE file.
 //
 //   You may obtain a copy of the Apache License at
 //
 //       http://www.apache.org/licenses/LICENSE-2.0
 //
 //   Unless required by applicable law or agreed to in writing, software
 //   distributed under the Apache License with the above modification is
 //   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 //   KIND, either express or implied. See the Apache License for the specific
 //   language governing permissions and limitations under the Apache License.
 //
 // Upper bound on the number of xyz channels per vertex; the eval kernels in
 // this file size their control-vertex scratch arrays with it.
 #define MAX_CHANNEL 4
 // Describes where the elements of interest live inside a flat float buffer.
 struct BufferDescriptor {
    int offset;  // offset to desired element data
    int length;  // number or length of the data
    int stride;  // stride to the next element    
 };
 // A 3-component float vector (position or derivative).
 struct Point {
    float x;
    float y;
    float z;
 };
 // Component-wise arithmetic on Point. Every operator is provided twice: once
 // for unqualified (varying) operands and once for uniform operands.

 // Component-wise sum a + b.
 inline struct Point operator+(struct Point a, struct Point b) {
    struct Point result;
    result.x = a.x + b.x;
    result.y = a.y + b.y;
    result.z = a.z + b.z;        
    return result;
 }
 // Component-wise sum a + b (uniform operands).
 inline uniform struct Point operator+(uniform struct Point a, uniform struct Point b) {
    uniform struct Point result;
    result.x = a.x + b.x;
    result.y = a.y + b.y;
    result.z = a.z + b.z;        
    return result;
 }
 // Component-wise difference a - b.
 inline struct Point operator-(struct Point a, struct Point b) {
    struct Point result;
    result.x = a.x - b.x;
    result.y = a.y - b.y;
    result.z = a.z - b.z;        
    return result;
 }
 // Component-wise difference a - b (uniform operands).
 inline uniform struct Point operator-(uniform struct Point a, uniform struct Point b) {
    uniform struct Point result;
    result.x = a.x - b.x;
    result.y = a.y - b.y;
    result.z = a.z - b.z;        
    return result;
 }
 // Scales every component of a by the scalar b (point * scalar).
 inline struct Point operator*(struct Point a, float b) {
    struct Point result;
    result.x = a.x * b;
    result.y = a.y * b;
    result.z = a.z * b;        
    return result;
 }
 // Scales every component of a by the scalar b (uniform operands).
 inline uniform struct Point operator*(uniform struct Point a, uniform float b) {
    uniform struct Point result;
    result.x = a.x * b;
    result.y = a.y * b;
    result.z = a.z * b;        
    return result;
 } 
 // Scales every component of a by the scalar b (scalar * point).
 inline struct Point operator*(float b, struct Point a) {
    struct Point result;
    result.x = b * a.x;
    result.y = b * a.y;
    result.z = b * a.z;        
    return result;
 }
 // Scales every component of a by the scalar b (scalar * point, uniform operands).
 inline uniform struct Point operator*(uniform float b, uniform struct Point a) {
    uniform struct Point result;
    result.x = b * a.x;
    result.y = b * a.y;
    result.z = b * a.z;        
    return result;
 }
 // Divides every component of a by the scalar b; no check for b == 0.
 inline struct Point operator/(struct Point a, float b) {
    struct Point result;
    result.x = a.x / b;
    result.y = a.y / b;
    result.z = a.z / b;        
    return result;
 }
 // Divides every component of a by the scalar b (uniform operands).
 inline uniform struct Point operator/(uniform struct Point a, uniform float b) {
    uniform struct Point result;
    result.x = a.x / b;
    result.y = a.y / b;
    result.z = a.z / b;        
    return result;
 }
 // Writes the cross product a x b into c. c is written component by
 // component, so it must not refer to the same object as a or b.
 inline void cross(struct Point &a, struct Point &b, struct Point &c)
 {
    c.x = a.y*b.z - a.z*b.y;
    c.y = a.z*b.x - a.x*b.z;
    c.z = a.x*b.y - a.y*b.x;
 }
 // Decoders for the packed 32-bit patch-parameter word ("bitField").
 // NOTE(review): the bit layout presumably mirrors Far::PatchParam; confirm
 // against that class if its packing changes.

 // Returns bit 3 of the word: the "non-quad root" flag.
 inline uniform bool
 nonQuadRoot(uniform unsigned int bitField) 
 {
    return (bitField >> 3) & 0x1;
 }
 // Returns the 10-bit U field stored in bits 22..31.
 inline uniform unsigned int getU(uniform unsigned int bitField) 
 { 
    return (uniform unsigned int)((bitField >> 22) & 0x3ff); 
 }
 // Returns the 10-bit V field stored in bits 12..21.
 inline uniform unsigned int getV(uniform unsigned int bitField) 
 { 
    return (uniform unsigned int)((bitField >> 12) & 0x3ff); 
 }
 // Returns the 4-bit boundary mask stored in bits 8..11.
 inline uniform unsigned int getBoundary(uniform unsigned int bitField)
 { 
    return (uniform unsigned int)((bitField >> 8) & 0xf); 
 }
 // Returns the 4-bit depth field stored in bits 0..3.
 inline uniform unsigned int getDepth(uniform unsigned int bitField)
 { 
    return  (uniform unsigned int)(bitField & 0xf); 
 }
 // Returns 1 / 2^depth, or 1 / 2^(depth-1) when the non-quad-root flag is
 // set. With the flag set the depth is assumed to be at least 1; a depth of 0
 // would produce a negative shift count.
 inline uniform float
 getParamFraction(uniform unsigned int bitField){
    if (nonQuadRoot(bitField)) {
        return 1.0f / (1 << (getDepth(bitField)-1));
    } else {
        return 1.0f / (1 << getDepth(bitField));
    }
 }
 // Folds the weight of the outermost row/column of a 4x4 weight set into its
 // two neighbours for every boundary bit that is set, then zeroes it. For a
 // weight w0 on a missing point P0 this is algebraically the same as
 // substituting P0 = 2*P1 - P2 (w1 += 2*w0, w2 -= w0, w0 = 0).
 //
 //   bit 1 -> first  t weight   bit 2 -> last  s weight
 //   bit 4 -> last   t weight   bit 8 -> first s weight
 //
 // sWeights and tWeights are modified in place.
 inline void 
 adjustBoundaryWeights(uniform unsigned int bitField,
                      float                sWeights[4], 
                      float                tWeights[4]) {
    uniform int boundary = getBoundary(bitField);
    if (boundary & 1) {
        tWeights[2] -= tWeights[0];
        tWeights[1] += 2*tWeights[0];
        tWeights[0] = 0;
    }
    if (boundary & 2) {
        sWeights[1] -= sWeights[3];
        sWeights[2] += 2*sWeights[3];
        sWeights[3] = 0;
    }
    if (boundary & 4) {
        tWeights[1] -= tWeights[3];
        tWeights[2] += 2*tWeights[3];
        tWeights[3] = 0;
    }
    if (boundary & 8) {
        sWeights[2] -= sWeights[0];
        sWeights[1] += 2*sWeights[0];
        sWeights[0] = 0;
    }
 }
 // Fills point[] with the four cubic B-Spline basis values at t and deriv[]
 // with their first derivatives with respect to t.
 inline void
 getBSplineWeights(float t, float point[4], float deriv[4]) {
    // The four uniform cubic B-Spline basis functions evaluated at t:
    float const one6th = 1.0f / 6.0f;
    float t2 = t * t;
    float t3 = t * t2;
    point[0] = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
    point[1] = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
    point[2] = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
    point[3] = one6th * (                                 t3);
    // Derivatives of the above four basis functions at t:
    deriv[0] = -0.5f*t2 +      t - 0.5f;
    deriv[1] =  1.5f*t2 - 2.0f*t;
    deriv[2] = -1.5f*t2 +      t + 0.5f;
    deriv[3] =  0.5f*t2;
 }
 // Fills point[] with the four cubic Bezier (Bernstein) basis values at t and
 // deriv[] with their first derivatives with respect to t.
 inline void
 getBezierWeights(float t, float point[4], float deriv[4]) {
    // The four uniform cubic Bezier basis functions (in terms of t and its
    // complement tC) evaluated at t:
    float t2 = t*t;
    float tC = 1.0f - t;
    float tC2 = tC * tC;
    point[0] = tC2 * tC;
    point[1] = tC2 * t * 3.0f;
    point[2] = t2 * tC * 3.0f;
    point[3] = t2 * t;
    // Derivatives of the above four basis functions at t:
    deriv[0] = -3.0f * tC2;
    deriv[1] =  9.0f * t2 - 12.0f * t + 3.0f;
    deriv[2] = -9.0f * t2 +  6.0f * t;
    deriv[3] =  3.0f * t2;
 }
 // Same basis values as getBSplineWeights(), without the derivative outputs.
 inline void
 getBSplineWeightsNoDerivative(float t, float point[4]) {
    // The four uniform cubic B-Spline basis functions evaluated at t:
    float const one6th = 1.0f / 6.0f;
    float t2 = t * t;
    float t3 = t * t2;
    point[0] = one6th * (1.0f - 3.0f*(t -      t2) -      t3);
    point[1] = one6th * (4.0f           - 6.0f*t2  + 3.0f*t3);
    point[2] = one6th * (1.0f + 3.0f*(t +      t2  -      t3));
    point[3] = one6th * (                                 t3);
 }
 // Same basis values as getBezierWeights(), without the derivative outputs.
 inline void
 getBezierWeightsNoDerivative(float t, float point[4]) {
    // The four uniform cubic Bezier basis functions (in terms of t and its
    // complement tC) evaluated at t:
    float t2 = t*t;
    float tC = 1.0f - t;
    float tC2 = tC * tC;
    point[0] = tC2 * tC;
    point[1] = tC2 * t * 3.0f;
    point[2] = t2 * tC * 3.0f;
    point[3] = t2 * t;
 }
 // Evaluates nPoint samples on a single bilinear patch (4 control vertices)
 // and writes positions plus u/v derivatives.
 //
 //   bitField       packed patch parameter; not used by this kernel
 //   nPoint         number of (u,v) samples
 //   u, v           sample coordinates, nPoint entries each
 //   vertexIndices  indices of the 4 control vertices
 //   inDesc/inQ     source buffer; vertex id starts at
 //                  inQ + inDesc.offset + id * inDesc.stride and holds
 //                  inDesc.length / 3 consecutive xyz channels
 //   outDesc/outQ   destination for positions, sample n starts at
 //                  outQ + outDesc.offset + n * outDesc.stride
 //   duDesc/outDQU  destination for the u derivative, same addressing scheme
 //   dvDesc/outDQV  destination for the v derivative, same addressing scheme
 //
 // The derivative written is the same for every sample: half the sum of the
 // two opposite edge vectors of the patch, independent of (u,v).
 export void
 evalBilinear(uniform unsigned int                  bitField,
             uniform int                           nPoint, 
             uniform const float  * uniform        u, 
             uniform const float  * uniform        v,             
             uniform const int    * uniform        vertexIndices,
             uniform const BufferDescriptor       &inDesc,
             uniform const float * uniform         inQ,
             uniform const BufferDescriptor       &outDesc,
             uniform float *uniform                outQ,
             uniform const BufferDescriptor       &duDesc,            
             uniform float *uniform                outDQU,
             uniform const BufferDescriptor       &dvDesc,            
             uniform float *uniform                outDQV)
 {
    uniform int nChannel = inDesc.length / 3;
    // The scratch arrays below hold MAX_CHANNEL channels, so exactly
    // MAX_CHANNEL channels is still within bounds.
    assert(nChannel <= MAX_CHANNEL);
    // Gather the control vertices, laid out channel-major: [c * 4 + i].
    uniform Point controlVertices[MAX_CHANNEL*4];
    for(uniform int i=0; i<4; i++) {
        uniform unsigned int id = vertexIndices[i];
        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
        for(uniform int c=0; c<nChannel; c++) {
            uniform int offset = c * 4 + i;
            controlVertices[offset].x = pVertex[0];
            controlVertices[offset].y = pVertex[1];
            controlVertices[offset].z = pVertex[2];
            pVertex += 3;
        }
    }
    // Positions: blend the four corners with the bilinear weights.
    foreach( n = 0 ... nPoint) {
        float ou   = 1.0f - u[n];
        float ov   = 1.0f - v[n];
        float w[4] = { ov*ou, v[n]*ou, v[n]*u[n], ov*u[n] };
        float *pOutQ   = outQ   + outDesc.offset + n * outDesc.stride;
        for(uniform int c=0; c<nChannel; c++) { 
            Point Q;
            Q.x = Q.y = Q.z = 0.0;
            for (uniform int i=0; i<4; ++i) {
                Q = Q + w[i] * controlVertices[c * 4 + i];
            }
            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
        }
    }
    // Derivatives: one constant vector per channel, computed once.
    uniform Point dU[MAX_CHANNEL], dV[MAX_CHANNEL];
    for(uniform int c=0; c<nChannel; c++) { 
        dU[c] = 0.5 * (controlVertices[c * 4 + 3] - controlVertices[c * 4 + 0] +
                       controlVertices[c * 4 + 2] - controlVertices[c * 4 + 1]  );
        dV[c] = 0.5 * (controlVertices[c * 4 + 1] - controlVertices[c * 4 + 0] +
                       controlVertices[c * 4 + 2] - controlVertices[c * 4 + 3]  );
    }
    foreach( n = 0 ... nPoint) {
        float *pOutDQU = outDQU +  duDesc.offset  + n *  duDesc.stride;
        float *pOutDQV = outDQV +  dvDesc.offset  + n *  dvDesc.stride;
        for(uniform int c=0; c<nChannel; c++) { 
            *pOutDQU ++ = dU[c].x, *pOutDQU ++ = dU[c].y, *pOutDQU ++ = dU[c].z;
            *pOutDQV ++ = dV[c].x, *pOutDQV ++ = dV[c].y, *pOutDQV ++ = dV[c].z;
        }
    }
 }
 // Evaluates nPoint samples on a single bilinear patch (4 control vertices)
 // and writes positions only.
 //
 //   bitField       packed patch parameter; not used by this kernel
 //   nPoint         number of (u,v) samples
 //   u, v           sample coordinates, nPoint entries each
 //   vertexIndices  indices of the 4 control vertices
 //   inDesc/inQ     source buffer; vertex id starts at
 //                  inQ + inDesc.offset + id * inDesc.stride and holds
 //                  inDesc.length / 3 consecutive xyz channels
 //   outDesc/outQ   destination for positions, sample n starts at
 //                  outQ + outDesc.offset + n * outDesc.stride
 export void
 evalBilinearNoDerivative(uniform unsigned int                  bitField,
                         uniform int                           nPoint, 
                         uniform const float  * uniform        u, 
                         uniform const float  * uniform        v,             
                         uniform const int    * uniform        vertexIndices,
                         uniform const BufferDescriptor       &inDesc,
                         uniform const float * uniform         inQ,
                         uniform const BufferDescriptor       &outDesc,
                         uniform float *uniform                outQ)
 {
    uniform int nChannel = inDesc.length / 3;
    // The scratch array below holds MAX_CHANNEL channels, so exactly
    // MAX_CHANNEL channels is still within bounds.
    assert(nChannel <= MAX_CHANNEL);
    // Gather the control vertices, laid out channel-major: [c * 4 + i].
    uniform Point controlVertices[MAX_CHANNEL*4];
    for(uniform int i=0; i<4; i++) {
        uniform unsigned int id = vertexIndices[i];
        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
        for(uniform int c=0; c<nChannel; c++) {
            uniform int offset = c * 4 + i;
            controlVertices[offset].x = pVertex[0];
            controlVertices[offset].y = pVertex[1];
            controlVertices[offset].z = pVertex[2];
            pVertex += 3;
        }
    }
    // Positions: blend the four corners with the bilinear weights.
    foreach( n = 0 ... nPoint) {
        float ou   = 1.0f - u[n];
        float ov   = 1.0f - v[n];
        float w[4] = { ov*ou, v[n]*ou, v[n]*u[n], ov*u[n] };
        float *pOutQ   = outQ   + outDesc.offset + n * outDesc.stride;
        for(uniform int c=0; c<nChannel; c++) { 
            Point Q;
            Q.x = Q.y = Q.z = 0.0;
            for (uniform int i=0; i<4; ++i) {
                Q = Q + w[i] * controlVertices[c * 4 + i];
            }
            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
        }
    }
 }
 // Evaluates nPoint samples on a single bicubic B-Spline patch (16 control
 // vertices) and writes positions plus u/v derivatives.
 //
 //   bitField       packed patch parameter (depth, U/V origin, boundary mask)
 //   nPoint         number of (u,v) samples
 //   u, v           sample coordinates, nPoint entries each; they are mapped
 //                  into the patch's local range using the U/V origin and the
 //                  parametric fraction decoded from bitField
 //   vertexIndices  indices of the 16 control vertices
 //   inDesc/inQ     source buffer; vertex id starts at
 //                  inQ + inDesc.offset + id * inDesc.stride and holds
 //                  inDesc.length / 3 consecutive xyz channels
 //   outDesc/outQ   destination for positions, sample n starts at
 //                  outQ + outDesc.offset + n * outDesc.stride
 //   duDesc/outDQU  destination for the u derivative, same addressing scheme
 //   dvDesc/outDQV  destination for the v derivative, same addressing scheme
 export void
 evalBSpline(uniform unsigned int                  bitField,
            uniform int                           nPoint, 
            uniform const float  * uniform        u, 
            uniform const float  * uniform        v,             
            uniform const int    * uniform        vertexIndices,
            uniform const BufferDescriptor       &inDesc,
            uniform const float * uniform         inQ,
            uniform const BufferDescriptor       &outDesc,
            uniform float *uniform                outQ,
            uniform const BufferDescriptor       &duDesc,            
            uniform float *uniform                outDQU,
            uniform const BufferDescriptor       &dvDesc,            
            uniform float *uniform                outDQV)
 {
    uniform int nChannel = inDesc.length / 3;
    // The scratch array below holds MAX_CHANNEL channels, so exactly
    // MAX_CHANNEL channels is still within bounds.
    assert(nChannel <= MAX_CHANNEL);
    // Gather the control vertices, laid out channel-major: [c * 16 + i].
    uniform Point controlVertices[MAX_CHANNEL*16];
    for(uniform int i=0; i<16; i++) {
        uniform unsigned int id = vertexIndices[i];
        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
        for(uniform int c=0; c<nChannel; c++) {
            uniform int offset = c * 16 + i;
            controlVertices[offset].x = pVertex[0];
            controlVertices[offset].y = pVertex[1];
            controlVertices[offset].z = pVertex[2];
            pVertex += 3;
        }
    }
    // Derivatives are scaled by 2^depth.
    uniform float dScale = (uniform float)(1 << getDepth(bitField));
    uniform float frac = getParamFraction(bitField);
    // top left corner
    uniform float pu = (uniform float)getU(bitField)*frac;
    uniform float pv = (uniform float)getV(bitField)*frac;
    foreach( n = 0 ... nPoint) {
        // normalize u,v coordinates
        float s = (u[n] - pu) / frac;
        float t = (v[n] - pv) / frac;
        float sWeights[4], tWeights[4], dsWeights[4], dtWeights[4];
        getBSplineWeights(s, sWeights, dsWeights);
        getBSplineWeights(t, tWeights, dtWeights);
        // The boundary adjustment is applied to the value weights and to the
        // derivative weights alike.
        adjustBoundaryWeights(bitField,  sWeights,  tWeights);
        adjustBoundaryWeights(bitField, dsWeights, dtWeights);
        // Tensor-product weights: s varies along j, t along i.
        float weight[16];
        for (uniform int i = 0; i < 4; ++i) {
            for (uniform int j = 0; j < 4; ++j) {
                weight[4*i+j] = sWeights[j] * tWeights[i];
            }
        }
        float *pOutQ = outQ + outDesc.offset + n * outDesc.stride;
        for(uniform int c=0; c<nChannel; c++) { 
            uniform int offset = c * 16;
            Point Q;
            Q.x = Q.y = Q.z = 0.0;
            for (uniform int i=0; i<16; ++i) {
                Q = Q + weight[i] * controlVertices[offset + i];
            }
            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
        }
        float derivS[16], derivT[16];
        for (uniform int i = 0; i < 4; ++i) {
            for (uniform int j = 0; j < 4; ++j) {
                derivS[4*i+j] = dsWeights[j] *  tWeights[i] * dScale;
                derivT[4*i+j] =  sWeights[j] * dtWeights[i] * dScale;
            }
        }
        float *pOutDQU = outDQU + duDesc.offset + n * duDesc.stride;
        float *pOutDQV = outDQV + dvDesc.offset + n * dvDesc.stride;
        for(uniform int c=0; c<nChannel; c++) { 
            uniform int offset = c * 16;
            Point DQU, DQV;
            DQU.x = DQU.y = DQU.z = 0.0;
            DQV.x = DQV.y = DQV.z = 0.0;
            for (uniform int i=0; i<16; ++i) {
                DQU = DQU + derivS[i] * controlVertices[offset + i];
                DQV = DQV + derivT[i] * controlVertices[offset + i];
            }
            *pOutDQU ++ = DQU.x, *pOutDQU ++ = DQU.y, *pOutDQU ++ = DQU.z;
            *pOutDQV ++ = DQV.x, *pOutDQV ++ = DQV.y, *pOutDQV ++ = DQV.z;
        }
    }
 }
 // Evaluates nPoint samples on a single bicubic B-Spline patch (16 control
 // vertices) and writes positions only.
 //
 //   bitField       packed patch parameter (depth, U/V origin, boundary mask)
 //   nPoint         number of (u,v) samples
 //   u, v           sample coordinates, nPoint entries each; they are mapped
 //                  into the patch's local range using the U/V origin and the
 //                  parametric fraction decoded from bitField
 //   vertexIndices  indices of the 16 control vertices
 //   inDesc/inQ     source buffer; vertex id starts at
 //                  inQ + inDesc.offset + id * inDesc.stride and holds
 //                  inDesc.length / 3 consecutive xyz channels
 //   outDesc/outQ   destination for positions, sample n starts at
 //                  outQ + outDesc.offset + n * outDesc.stride
 export void
 evalBSplineNoDerivative(uniform unsigned int                  bitField,
                        uniform int                           nPoint, 
                        uniform const float  * uniform        u, 
                        uniform const float  * uniform        v,             
                        uniform const int    * uniform        vertexIndices,
                        uniform const BufferDescriptor       &inDesc,
                        uniform const float * uniform         inQ,
                        uniform const BufferDescriptor       &outDesc,
                        uniform float *uniform                outQ)
 {
    uniform int nChannel = inDesc.length / 3;
    // The scratch array below holds MAX_CHANNEL channels, so exactly
    // MAX_CHANNEL channels is still within bounds.
    assert(nChannel <= MAX_CHANNEL);
    // Gather the control vertices, laid out channel-major: [c * 16 + i].
    uniform Point controlVertices[MAX_CHANNEL*16];
    for(uniform int i=0; i<16; i++) {
        uniform unsigned int id = vertexIndices[i];
        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
        for(uniform int c=0; c<nChannel; c++) {
            uniform int offset = c * 16 + i;
            controlVertices[offset].x = pVertex[0];
            controlVertices[offset].y = pVertex[1];
            controlVertices[offset].z = pVertex[2];
            pVertex += 3;
        }
    }
    uniform float frac = getParamFraction(bitField);
    // top left corner
    uniform float pu = (uniform float)getU(bitField)*frac;
    uniform float pv = (uniform float)getV(bitField)*frac;
    foreach( n = 0 ... nPoint) {
        // normalize u,v coordinates
        float s = (u[n] - pu) / frac;
        float t = (v[n] - pv) / frac;
        float sWeights[4], tWeights[4];
        getBSplineWeightsNoDerivative(s, sWeights);
        getBSplineWeightsNoDerivative(t, tWeights);
        adjustBoundaryWeights(bitField, sWeights, tWeights);
        // Tensor-product weights: s varies along j, t along i.
        float weight[16];
        for (uniform int i = 0; i < 4; ++i) {
            for (uniform int j = 0; j < 4; ++j) {
                weight[4*i+j] = sWeights[j] * tWeights[i];
            }
        }
        float *pOutQ = outQ + outDesc.offset + n * outDesc.stride;
        for(uniform int c=0; c<nChannel; c++) { 
            uniform int offset = c * 16;
            Point Q;
            Q.x = Q.y = Q.z = 0.0;
            for (uniform int i=0; i<16; ++i) {
                Q = Q + weight[i] * controlVertices[offset + i];
            }
            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
        }
    }
 }
 //
 //  Computes the 20 Gregory basis weights at (s,t) together with the weights
 //  for the two first derivatives.
 //
 //    bitField  packed patch parameter bits; only the depth is read here, to
 //              scale the derivative weights by (1 << depth)
 //    s, t      location within the patch
 //    point     [out] position weights for the 20 Gregory control points
 //    deriv1    [out] weights for the derivative with respect to s (scaled)
 //    deriv2    [out] weights for the derivative with respect to t (scaled)
 //
 void getGregoryWeights(uniform unsigned int bitField, 
                       float s, float t, float point[20], float deriv1[20], float deriv2[20]) {
    //
    //  P3         e3-      e2+         P2
    //     15------17-------11--------10
    //     |        |        |        |
    //     |        |        |        |
    //     |        | f3-    | f2+    |
    //     |       19       13        |
    // e3+ 16-----18           14-----12 e2-
    //     |     f3+          f2-     |
    //     |                          |
    //     |                          |
    //     |      f0-         f1+     |
    // e0- 2------4            8------6 e1+
    //     |        3        9        |
    //     |        | f0+    | f1-    |
    //     |        |        |        |
    //     |        |        |        |
    //     O--------1--------7--------5
    //  P0         e0+      e1-         P1
    //
    //  Indices of boundary and interior points and their corresponding Bezier points
    //  (this can be reduced with more direct indexing and unrolling of loops):
    //
    static uniform int const boundaryGregory[12] = { 0, 1, 7, 5, 2, 6, 16, 12, 15, 17, 11, 10 };
    static uniform int const boundaryBezSCol[12] = { 0, 1, 2, 3, 0, 3,  0,  3,  0,  1,  2,  3 };
    static uniform int const boundaryBezTRow[12] = { 0, 0, 0, 0, 1, 1,  2,  2,  3,  3,  3,  3 };
    static uniform int const interiorGregory[8] = { 3, 4,  8, 9,  13, 14,  18, 19 };
    static uniform int const interiorBezSCol[8] = { 1, 1,  2, 2,   2,  2,   1,  1 };
    static uniform int const interiorBezTRow[8] = { 1, 1,  1, 1,   2,  2,   2,  2 };
    //
    //  Bezier basis functions are denoted with B while the rational multipliers for the
    //  interior points will be denoted G -- so we have B(s), B(t) and G(s,t):
    //
    //  Directional Bezier basis functions B at s and t:
    float Bs[4], Bds[4];
    float Bt[4], Bdt[4];
    getBezierWeights(s, Bs, Bds);
    getBezierWeights(t, Bt, Bdt);
    //  Rational multipliers G at s and t:
    float sC = 1.0f - s;
    float tC = 1.0f - t;
    //  Each df* ends up holding the RECIPROCAL of its denominator; a zero (or
    //  negative) denominator is replaced by 1 so that no division by zero occurs.
    //  Use <= here to avoid compiler warnings -- the sums should always be non-negative:
    float df0 = s  + t;   df0 = (df0 <= 0.0f) ? 1.0f : (1.0f / df0);
    float df1 = sC + t;   df1 = (df1 <= 0.0f) ? 1.0f : (1.0f / df1);
    float df2 = sC + tC;  df2 = (df2 <= 0.0f) ? 1.0f : (1.0f / df2);
    float df3 = s  + tC;  df3 = (df3 <= 0.0f) ? 1.0f : (1.0f / df3);
    float G[8] = { s*df0, t*df0,  t*df1, sC*df1,  sC*df2, tC*df2,  tC*df3, s*df3 };
    //  Combined weights for boundary and interior points:
    for (uniform int i = 0; i < 12; ++i) {
        point[boundaryGregory[i]] = Bs[boundaryBezSCol[i]] * Bt[boundaryBezTRow[i]];
    }
    for (uniform int i = 0; i < 8; ++i) {
        point[interiorGregory[i]] = Bs[interiorBezSCol[i]] * Bt[interiorBezTRow[i]] * G[i];
    }
    //
    //  For derivatives, the basis functions for the interior points are rational and ideally
    //  require appropriate differentiation, i.e. product rule for the combination of B and G
    //  and the quotient rule for the rational G itself.  As initially proposed by Loop et al
    //  though, the approximation using the 16 Bezier points arising from the G(s,t) has
    //  proved adequate (and is what the GPU shaders use) so we continue to use that here.
    //
    //  An implementation of the true derivatives is provided for future reference -- it is
    //  unclear if the approximations will hold up under surface analysis involving higher
    //  order differentiation.
    //
    //  Remember to include derivative scaling in all assignments below:
    uniform float dScale = (uniform float)(1 << getDepth(bitField));
    //  Combined weights for boundary points -- simple (scaled) tensor products:
    for (uniform int i = 0; i < 12; ++i) {
        uniform int iDst = boundaryGregory[i];
        uniform int tRow = boundaryBezTRow[i];
        uniform int sCol = boundaryBezSCol[i];
        deriv1[iDst] = Bds[sCol] * Bt[tRow] * dScale;
        deriv2[iDst] = Bdt[tRow] * Bs[sCol] * dScale;
    }
 #define _USE_BEZIER_PSEUDO_DERIVATIVES
 #ifdef _USE_BEZIER_PSEUDO_DERIVATIVES
    //  Approximation to the true Gregory derivatives by differentiating the Bezier patch
    //  unique to the given (s,t), i.e. having F = (g^+ * f^+) + (g^- * f^-) as its four
    //  interior points:
    //
    //  Combined weights for interior points -- (scaled) tensor products with G+ or G-:
    for (uniform int i = 0; i < 8; ++i) {
        uniform int iDst = interiorGregory[i];
        uniform int tRow = interiorBezTRow[i];
        uniform int sCol = interiorBezSCol[i];
        deriv1[iDst] = Bds[sCol] * Bt[tRow] * G[i] * dScale;
        deriv2[iDst] = Bdt[tRow] * Bs[sCol] * G[i] * dScale;
    }
 #else
    //  True Gregory derivatives using appropriate differentiation of composite functions:
    //
    //  Note that for G(s,t) = N(s,t) / D(s,t), all N' and D' are trivial constants (which
    //  simplifies things for higher order derivatives).  And while each pair of functions
    //  G (i.e. the G+ and G- corresponding to points f+ and f-) must sum to 1 to ensure
    //  Bezier equivalence (when f+ = f-), the pairs of G' must similarly sum to 0.  So we
    //  can potentially compute only one of the pair and negate the result for the other
    //  (and with 4 or 8 computations involving these constants, this is all very SIMD
    //  friendly...) but for now we treat all 8 independently for simplicity.
    //
    //float N[8] = {   s,     t,      t,     sC,      sC,     tC,      tC,     s };
    //  D is built from the per-lane (varying) reciprocals df0..df3, so it must be
    //  varying as well -- a uniform array cannot be initialized from varying values:
    float D[8] = {   df0,   df0,    df1,    df1,     df2,    df2,     df3,   df3 };
    static uniform float const Nds[8] = { 1.0f, 0.0f,  0.0f, -1.0f, -1.0f,  0.0f,  0.0f,  1.0f };
    static uniform float const Ndt[8] = { 0.0f, 1.0f,  1.0f,  0.0f,  0.0f, -1.0f, -1.0f,  0.0f };
    static uniform float const Dds[8] = { 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -1.0f,  1.0f,  1.0f };
    static uniform float const Ddt[8] = { 1.0f, 1.0f,  1.0f,  1.0f, -1.0f, -1.0f, -1.0f, -1.0f };
    //  Combined weights for interior points -- (scaled) combinations of B, B', G and G':
    for (uniform int i = 0; i < 8; ++i) {
        uniform int iDst = interiorGregory[i];
        uniform int tRow = interiorBezTRow[i];
        uniform int sCol = interiorBezSCol[i];
        //  Quotient rule for G' (re-expressed in terms of G to simplify (and D = 1/D)):
        float Gds = (Nds[i] - Dds[i] * G[i]) * D[i];
        float Gdt = (Ndt[i] - Ddt[i] * G[i]) * D[i];
        //  Product rule combining B and B' with G and G' (and scaled):
        deriv1[iDst] = (Bds[sCol] * G[i] + Bs[sCol] * Gds) * Bt[tRow] * dScale;
        deriv2[iDst] = (Bdt[tRow] * G[i] + Bt[tRow] * Gdt) * Bs[sCol] * dScale;
    }
 #endif
 }
 //
 //  Computes the 20 Gregory basis position weights at (s,t); no derivative
 //  weights are produced.  bitField is accepted for signature symmetry with
 //  getGregoryWeights() but is not read here.
 //
 void getGregoryWeightsNoDerivative(uniform unsigned int bitField, float s, float t, float point[20]) {
    //
    //  Control point numbering of the Gregory patch:
    //
    //  P3         e3-      e2+         P2
    //     15------17-------11--------10
    //     |        |        |        |
    //     |        |        |        |
    //     |        | f3-    | f2+    |
    //     |       19       13        |
    // e3+ 16-----18           14-----12 e2-
    //     |     f3+          f2-     |
    //     |                          |
    //     |                          |
    //     |      f0-         f1+     |
    // e0- 2------4            8------6 e1+
    //     |        3        9        |
    //     |        | f0+    | f1-    |
    //     |        |        |        |
    //     |        |        |        |
    //     O--------1--------7--------5
    //  P0         e0+      e1-         P1
    //
    //  For every Gregory point: its index, plus the column (s) and row (t) of
    //  the Bezier basis function that weights it.
    //
    static uniform int const outerPoint[12] = { 0, 1, 7, 5, 2, 6, 16, 12, 15, 17, 11, 10 };
    static uniform int const outerSCol[12]  = { 0, 1, 2, 3, 0, 3,  0,  3,  0,  1,  2,  3 };
    static uniform int const outerTRow[12]  = { 0, 0, 0, 0, 1, 1,  2,  2,  3,  3,  3,  3 };
    static uniform int const innerPoint[8]  = { 3, 4,  8, 9,  13, 14,  18, 19 };
    static uniform int const innerSCol[8]   = { 1, 1,  2, 2,   2,  2,   1,  1 };
    static uniform int const innerTRow[8]   = { 1, 1,  1, 1,   2,  2,   2,  2 };

    //  Directional Bezier basis functions B(s) and B(t):
    float Bs[4];
    float Bt[4];
    getBezierWeightsNoDerivative(s, Bs);
    getBezierWeightsNoDerivative(t, Bt);

    //  Rational multipliers G(s,t) for the interior (face) points.  Each
    //  denominator is inverted once; a non-positive denominator (the sums
    //  should never be negative) is replaced by 1 to avoid dividing by zero.
    float sC = 1.0f - s;
    float tC = 1.0f - t;

    float den0 = s  + t;
    float den1 = sC + t;
    float den2 = sC + tC;
    float den3 = s  + tC;

    float inv0 = (den0 <= 0.0f) ? 1.0f : (1.0f / den0);
    float inv1 = (den1 <= 0.0f) ? 1.0f : (1.0f / den1);
    float inv2 = (den2 <= 0.0f) ? 1.0f : (1.0f / den2);
    float inv3 = (den3 <= 0.0f) ? 1.0f : (1.0f / den3);

    float G[8];
    G[0] = s  * inv0;
    G[1] = t  * inv0;
    G[2] = t  * inv1;
    G[3] = sC * inv1;
    G[4] = sC * inv2;
    G[5] = tC * inv2;
    G[6] = tC * inv3;
    G[7] = s  * inv3;

    //  Boundary points: plain tensor product of the Bezier bases.
    for (uniform int k = 0; k < 12; ++k) {
        uniform int dst = outerPoint[k];
        uniform int col = outerSCol[k];
        uniform int row = outerTRow[k];
        point[dst] = Bs[col] * Bt[row];
    }

    //  Interior points: tensor product additionally scaled by G.
    for (uniform int k = 0; k < 8; ++k) {
        uniform int dst = innerPoint[k];
        uniform int col = innerSCol[k];
        uniform int row = innerTRow[k];
        point[dst] = Bs[col] * Bt[row] * G[k];
    }
 }
 //
 //  Evaluates limit positions and first derivatives for nPoint parametric
 //  locations that all lie on the same Gregory basis patch.
 //
 //    bitField         packed patch parameter bits, forwarded to the helper routines
 //    nPoint           number of (u,v) locations to evaluate
 //    u, v             parametric coordinates, nPoint entries each
 //    vertexIndices    the 20 control vertex indices of the patch
 //    inDesc, inQ      layout and storage of the source primvar data
 //    outDesc, outQ    layout and storage of the evaluated positions
 //    duDesc, outDQU   layout and storage of the derivatives produced from deriv1
 //    dvDesc, outDQV   layout and storage of the derivatives produced from deriv2
 //
 //  The primvar data is processed as inDesc.length / 3 channels of 3 floats each.
 //
 export void
 evalGregory(uniform   unsigned int            bitField,
            uniform   int                     nPoint, 
            uniform   float                   u[], 
            uniform   float                   v[],                    
            uniform   const unsigned int      vertexIndices[],
            uniform   const BufferDescriptor &inDesc,
            uniform   const float             inQ[], 
            uniform   const BufferDescriptor &outDesc,
            uniform   float                   outQ[], 
            uniform   const BufferDescriptor &duDesc,
            uniform   float                   outDQU[],
            uniform   const BufferDescriptor &dvDesc,            
            uniform   float                   outDQV[])
 {
    uniform int nChannel = inDesc.length / 3;
    assert(nChannel < MAX_CHANNEL);
    //  Gather the 20 control points once, stored channel-major: entry
    //  (c * 20 + i) holds channel c of control vertex i.
    uniform Point controlVertices[MAX_CHANNEL*20];
    for(uniform int i=0; i<20; i++) {
        uniform unsigned int id = vertexIndices[i];
        uniform const float * uniform pVertex = inQ + inDesc.offset + id * inDesc.stride;
        for(uniform int c=0; c<nChannel; c++) {
            uniform int offset = c * 20 + i;
            controlVertices[offset].x = pVertex[0];
            controlVertices[offset].y = pVertex[1];
            controlVertices[offset].z = pVertex[2];
            pVertex += 3;
        }
    }
    uniform float frac = getParamFraction(bitField);
    // top left corner
    uniform float pu = (uniform float)getU(bitField)*frac;
    uniform float pv = (uniform float)getV(bitField)*frac;
    foreach( n = 0 ... nPoint) {
        // normalize u,v coordinates
        float s = (u[n] - pu) / frac;
        float t = (v[n] - pv) / frac;
        float point[20], deriv1[20], deriv2[20];
        getGregoryWeights(bitField, s, t, point, deriv1, deriv2);
        //  Positions: all 20 weights must be applied and the per-channel
        //  stride must match the layout used when gathering above (20, not
        //  the 16 of a B-spline patch).
        float *pOutQ = outQ + outDesc.offset + n * outDesc.stride;
        for(uniform int c=0; c<nChannel; c++) {
            uniform int offset = c * 20;
            Point Q;
            Q.x = Q.y = Q.z = 0.0;
            for (uniform int i=0; i<20; ++i) {
                Q = Q + point[i] * controlVertices[offset + i];
            }
            *pOutQ ++ = Q.x, *pOutQ ++ = Q.y, *pOutQ ++ = Q.z;
        }
        //  Derivatives: same blend using the two derivative weight sets.
        float *pOutDQU = outDQU + duDesc.offset + n * duDesc.stride;
        float *pOutDQV = outDQV + dvDesc.offset + n * dvDesc.stride;
        for(uniform int c=0; c<nChannel; c++) {
            uniform int offset = c * 20;
            Point DQU, DQV;
            DQU.x = DQU.y = DQU.z = 0.0;
            DQV.x = DQV.y = DQV.z = 0.0;
            for (uniform int i=0; i<20; ++i) {
                DQU = DQU + deriv1[i] * controlVertices[offset + i];
                DQV = DQV + deriv2[i] * controlVertices[offset + i];
            }
            *pOutDQU ++ = DQU.x, *pOutDQU ++ = DQU.y, *pOutDQU ++ = DQU.z;
            *pOutDQV ++ = DQV.x, *pOutDQV ++ = DQV.y, *pOutDQV ++ = DQV.z;
        }
    }
 }
 //
 //  Evaluates limit positions (no derivatives) for nPoint parametric locations
 //  that all lie on the same Gregory basis patch.
 //
 //    bitField       packed patch parameter bits, forwarded to the helper routines
 //    nPoint         number of (u,v) locations to evaluate
 //    u, v           parametric coordinates, nPoint entries each
 //    vertexIndices  the 20 control vertex indices of the patch
 //    inDesc, inQ    layout and storage of the source primvar data
 //    outDesc, outQ  layout and storage of the evaluated results
 //
 //  The primvar data is processed as inDesc.length / 3 channels of 3 floats each.
 //
 export void
 evalGregoryNoDerivative(uniform unsigned int            bitField,
                        uniform int                     nPoint, 
                        uniform float                   u[], 
                        uniform float                   v[],                    
                        uniform const unsigned int      vertexIndices[],
                        uniform const BufferDescriptor &inDesc,
                        uniform const float             inQ[], 
                        uniform const BufferDescriptor &outDesc,
                        uniform float                   outQ[]
                       )
 {
    uniform int nChannel = inDesc.length / 3;
    assert(nChannel < MAX_CHANNEL);

    //  Gather the 20 control points once, stored channel-major: entry
    //  (ch * 20 + cv) holds channel ch of control vertex cv.
    uniform Point controlVertices[MAX_CHANNEL*20];
    for (uniform int cv = 0; cv < 20; ++cv) {
        uniform unsigned int vertexId = vertexIndices[cv];
        uniform const float * uniform srcVertex = inQ + inDesc.offset + vertexId * inDesc.stride;
        for (uniform int ch = 0; ch < nChannel; ++ch) {
            uniform int slot = ch * 20 + cv;
            controlVertices[slot].x = srcVertex[3*ch + 0];
            controlVertices[slot].y = srcVertex[3*ch + 1];
            controlVertices[slot].z = srcVertex[3*ch + 2];
        }
    }

    //  Patch origin ("top left corner") and extent in the coarse face's domain.
    uniform float frac = getParamFraction(bitField);
    uniform float pu = (uniform float)getU(bitField)*frac;
    uniform float pv = (uniform float)getV(bitField)*frac;

    foreach (n = 0 ... nPoint) {
        //  Map (u,v) into the local parameterization of this patch.
        float s = (u[n] - pu) / frac;
        float t = (v[n] - pv) / frac;

        float point[20];
        getGregoryWeightsNoDerivative(bitField, s, t, point);

        //  Blend every channel and store it contiguously, 3 floats per channel.
        float *dstQ = outQ + outDesc.offset + n * outDesc.stride;
        for (uniform int ch = 0; ch < nChannel; ++ch) {
            uniform int base = ch * 20;
            Point Q;
            Q.x = 0.0;
            Q.y = 0.0;
            Q.z = 0.0;
            for (uniform int k = 0; k < 20; ++k) {
                Q = Q + point[k] * controlVertices[base + k];
            }
            dstQ[0] = Q.x;
            dstQ[1] = Q.y;
            dstQ[2] = Q.z;
            dstQ += 3;
        }
    }
 }
--- a/opensubdiv/osd/ispcEvalLimitKernel.isph
+++ b/opensubdiv/osd/ispcEvalLimitKernel.isph
@ -1,55 +0,0 @@
 //
 // ispcEvalLimitKernel.isph
 // (Header automatically generated by the ispc compiler.)
 // DO NOT EDIT THIS FILE.
 //
 // Include guard for the C/C++ view of the kernels exported by the ISPC source.
 #ifndef ISPC_ISPCEVALLIMITKERNEL_ISPH
 #define ISPC_ISPCEVALLIMITKERNEL_ISPH
 #include <stdint.h>
 #ifdef __cplusplus
 namespace ispc { /* namespace */
 #endif // __cplusplus
 // Buffer layout descriptor passed by reference to every evaluation kernel:
 // three 32-bit integers (offset, length, stride).  The kernels in this patch
 // use offset/stride to locate an element and length / 3 as the channel count.
 #ifndef __ISPC_STRUCT_BufferDescriptor__
 #define __ISPC_STRUCT_BufferDescriptor__
 struct BufferDescriptor {
    int32_t offset;
    int32_t length;
    int32_t stride;
 };
 #endif
 ///////////////////////////////////////////////////////////////////////////
 // Functions exported from ispc code
 ///////////////////////////////////////////////////////////////////////////
 // NOTE(review): the kernel definitions visible in this patch declare bitField
 // as `unsigned int` and, for the Gregory kernels, vertexIndices as
 // `const unsigned int[]` and u/v as non-const `float[]`, whereas the
 // prototypes below use int32_t / const float *.  This checked-in header looks
 // out of sync with the .ispc source -- confirm by regenerating it.
 #if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
 extern "C" {
 #endif // __cplusplus
    // Position + first-derivative evaluation, one entry point per patch basis.
    extern void evalBSpline(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ, const struct BufferDescriptor &duDesc, float * outDQU, const struct BufferDescriptor &dvDesc, float * outDQV);
    extern void evalBilinear(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ, const struct BufferDescriptor &duDesc, float * outDQU, const struct BufferDescriptor &dvDesc, float * outDQV);
    extern void evalGregory(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ, const struct BufferDescriptor &duDesc, float * outDQU, const struct BufferDescriptor &dvDesc, float * outDQV);
    // Position-only variants (no du/dv outputs).
    extern void evalBSplineNoDerivative(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ);
    extern void evalBilinearNoDerivative(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ);
    extern void evalGregoryNoDerivative(int32_t bitField, int32_t nPoint, const float * u, const float * v, const int32_t * vertexIndices, const struct BufferDescriptor &inDesc, const float * inQ, const struct BufferDescriptor &outDesc, float * outQ);
    // Presumably reports the SIMD width the kernels were compiled for via the
    // output reference -- its definition is not visible here; TODO confirm.
    extern void getSIMDWidth(int32_t &simdWidth);
 #if defined(__cplusplus) && !defined(__ISPC_NO_EXTERN_C)
 } /* end extern C */
 #endif // __cplusplus
 #ifdef __cplusplus
 } /* namespace */
 #endif // __cplusplus
 #endif // ISPC_ISPCEVALLIMITKERNEL_ISPH
--- a/opensubdiv/osd/ispcEvaluator.cpp
+++ b/opensubdiv/osd/ispcEvaluator.cpp
@ -1,289 +0,0 @@
 //
 //   Copyright 2015 Pixar
 //
 //   Licensed under the Apache License, Version 2.0 (the "Apache License")
 //   with the following modification; you may not use this file except in
 //   compliance with the Apache License and the following modification to it:
 //   Section 6. Trademarks. is deleted and replaced with:
 //
 //   6. Trademarks. This License does not grant permission to use the trade
 //      names, trademarks, service marks, or product names of the Licensor
 //      and its affiliates, except as required to comply with Section 4(c) of
 //      the License and to reproduce the content of the NOTICE file.
 //
 //   You may obtain a copy of the Apache License at
 //
 //       http://www.apache.org/licenses/LICENSE-2.0
 //
 //   Unless required by applicable law or agreed to in writing, software
 //   distributed under the Apache License with the above modification is
 //   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 //   KIND, either express or implied. See the Apache License for the specific
 //   language governing permissions and limitations under the Apache License.
 //
 #include "ispcEvaluator.h"
 #include "cpuKernel.h"
 #include "../far/patchBasis.h"
 #include "ispcEvalLimitKernel.isph"
 #include <tbb/parallel_for.h>
 #include <cstdlib>
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
 namespace Osd {
 #define grain_size  512
 /* static */
 // Applies the stencils in [start, end) to src and writes the results to dst.
 // An empty range is a successful no-op; a source/destination primvar length
 // mismatch is reported by returning false.
 bool
 IspcEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
                           float *dst,       BufferDescriptor const &dstDesc,
                           const int * sizes,
                           const int * offsets,
                           const int * indices,
                           const float * weights,
                           int start, int end) {
    const bool hasWork = start < end;
    if (hasWork) {
        if (srcDesc.length != dstDesc.length) {
            return false;
        }
        // XXX: we can probably expand cpuKernel.cpp to here.
        // Stencil evaluation is delegated to the plain CPU kernel.
        CpuEvalStencils(src, srcDesc,
                        dst, dstDesc,
                        sizes, offsets, indices, weights,
                        start, end);
    }
    return true;
 }
 /* static */
 // Applies the stencils in [start, end) to src, writing positions to dst and
 // the two derivatives to du and dv.  An empty range is a successful no-op;
 // false is returned when any destination length differs from the source's.
 bool
 IspcEvaluator::EvalStencils(const float *src, BufferDescriptor const &srcDesc,
                           float *dst,       BufferDescriptor const &dstDesc,
                           float *du,        BufferDescriptor const &duDesc,
                           float *dv,        BufferDescriptor const &dvDesc,
                           const int * sizes,
                           const int * offsets,
                           const int * indices,
                           const float * weights,
                           const float * duWeights,
                           const float * dvWeights,
                           int start, int end) {
    if (!(start < end)) {
        return true;
    }

    const bool lengthsAgree = srcDesc.length == dstDesc.length &&
                              srcDesc.length == duDesc.length &&
                              srcDesc.length == dvDesc.length;
    if (!lengthsAgree) {
        return false;
    }

    // Stencil evaluation is delegated to the plain CPU kernel.
    CpuEvalStencils(src, srcDesc, dst, dstDesc, du, duDesc, dv, dvDesc,
                    sizes, offsets, indices,
                    weights, duWeights, dvWeights,
                    start, end);
    return true;
 }
 // Thin cursor over a strided primvar buffer: _p points at the current
 // element, which holds _length values; consecutive elements are _stride
 // values apart.  A null _p denotes "no buffer" and every mutating operation
 // is then a no-op.
 template <typename T>
 struct BufferAdapter {
    BufferAdapter(T *p, int length, int stride) :
        _p(p), _length(length), _stride(stride) { }
    // Zeroes the current element (skipped when there is no buffer, matching
    // the null handling of AddWithWeight and operator++).
    void Clear() {
        if (_p) {
            for (int i = 0; i < _length; ++i) _p[i] = 0;
        }
    }
    // Accumulates src * w into the current element.
    void AddWithWeight(T const *src, float w) {
        if (_p) {
            for (int i = 0; i < _length; ++i) {
                _p[i] += src[i] * w;
            }
        }
    }
    // Address of the element `index` strides away from the current one.
    const T *operator[] (int index) const {
        return _p + _stride * index;
    }
    // Advances the cursor to the next element.
    BufferAdapter<T> & operator ++() {
        if (_p) {
            _p += _stride;
        }
        return *this;
    }
    T *_p;
    int _length;
    int _stride;
 };
 /* static */
 // Evaluates limit positions (no derivatives) at numPatchCoords patch
 // coordinates using the ISPC kernels, distributing the work with TBB.
 //
 //   src, srcDesc       source primvar buffer and its layout
 //   dst, dstDesc       destination buffer and its layout; the result for
 //                      patchCoords[k] is written to element k
 //   patchCoords        coordinates to evaluate, expected to be sorted by
 //                      patch handle so that runs on the same patch can be
 //                      handed to ISPC in one call
 //   patchArrays, patchIndexBuffer, patchParamBuffer
 //                      patch table data used to look up each patch's type,
 //                      control vertex indices and parameter bits
 //
 // Returns false when the source and destination primvar lengths differ.
 bool
 IspcEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
                           float *dst,       BufferDescriptor const &dstDesc,
                           int numPatchCoords,
                           const PatchCoord *patchCoords,
                           const PatchArray *patchArrays,
                           const int *patchIndexBuffer,
                           const PatchParam *patchParamBuffer) { 
    if (srcDesc.length != dstDesc.length) return false;
    // Nothing to evaluate (also keeps an invalid range away from TBB).
    if (numPatchCoords <= 0) return true;
    // Copy BufferDescriptor to ispc version
    // Since memory alignment in ISPC may be different from C++,
    // we use the assignment for each field instead of the assignment for 
    // the whole struct
    ispc::BufferDescriptor ispcSrcDesc;
    ispcSrcDesc.offset = srcDesc.offset;
    ispcSrcDesc.length = srcDesc.length;
    ispcSrcDesc.stride = srcDesc.stride;
    tbb::blocked_range<int> range = tbb::blocked_range<int>(0, numPatchCoords, grain_size);
    tbb::parallel_for(range, [&](const tbb::blocked_range<int> &r)
    {
        int i = r.begin();
        // The kernels write point n of a run to offset + n * stride, so the
        // descriptor's offset is advanced to the first coordinate of the run.
        // dstDesc.offset must be counted exactly once.
        ispc::BufferDescriptor ispcDstDesc;
        ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
        ispcDstDesc.length = dstDesc.length;
        ispcDstDesc.stride = dstDesc.stride;
        while (i < r.end()) {
            // the patch coordinates are sorted by patch handle
            // the following code searches the coordinates that
            // belong to the same patch so that they can be evaluated
            // with ISPC
            int nCoord = 1;
            Far::PatchTable::PatchHandle handle = patchCoords[i].handle;
            while(i + nCoord < r.end() && 
                  handle.isEqual(patchCoords[i + nCoord].handle) )
                  nCoord ++;
            PatchArray const &array = patchArrays[handle.arrayIndex];
            int patchType = array.GetPatchType();
            Far::PatchParam const & param = patchParamBuffer[handle.patchIndex];
            unsigned int bitField = param.field1;
            const int *cvs = &patchIndexBuffer[array.indexBase + handle.vertIndex];
            // Pack the run's (s,t) values into contiguous arrays for the kernel.
            __declspec( align(64) ) float u[nCoord];
            __declspec( align(64) ) float v[nCoord];        
            for(int n=0; n<nCoord; n++) {
                u[n] = patchCoords[i + n].s;
                v[n] = patchCoords[i + n].t;
            }
            if (patchType == Far::PatchDescriptor::REGULAR) {
                ispc::evalBSplineNoDerivative(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
                                  ispcDstDesc, dst);
            } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
                ispc::evalGregoryNoDerivative(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
                                  ispcDstDesc, dst);
            } else if (patchType == Far::PatchDescriptor::QUADS) {
                ispc::evalBilinearNoDerivative(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
                                   ispcDstDesc, dst);
            } else {
                assert(0);
            }
            i += nCoord;
            ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
        }
    });
    return true;
 }
 /* static */
 // Evaluates limit positions and first derivatives at numPatchCoords patch
 // coordinates using the ISPC kernels, distributing the work with TBB.
 //
 //   src, srcDesc       source primvar buffer and its layout
 //   dst, dstDesc       destination buffer for positions and its layout
 //   du, duDesc         destination buffer for the first derivative output
 //   dv, dvDesc         destination buffer for the second derivative output
 //   patchCoords        coordinates to evaluate, expected to be sorted by
 //                      patch handle so that runs on the same patch can be
 //                      handed to ISPC in one call; results for
 //                      patchCoords[k] are written to element k of each output
 //   patchArrays, patchIndexBuffer, patchParamBuffer
 //                      patch table data used to look up each patch's type,
 //                      control vertex indices and parameter bits
 //
 // Returns false when the source and destination primvar lengths differ.
 bool
 IspcEvaluator::EvalPatches(const float *src, BufferDescriptor const &srcDesc,
                           float *dst,       BufferDescriptor const &dstDesc,
                           float *du,        BufferDescriptor const &duDesc,
                           float *dv,        BufferDescriptor const &dvDesc,
                           int numPatchCoords,
                           const PatchCoord *patchCoords,
                           const PatchArray *patchArrays,
                           const int *patchIndexBuffer,
                           const PatchParam *patchParamBuffer) {
    if (srcDesc.length != dstDesc.length) return false;
    // Nothing to evaluate (also keeps an invalid range away from TBB).
    if (numPatchCoords <= 0) return true;
    // Copy BufferDescriptor to ispc version
    // Since memory alignment in ISPC may be different from C++,
    // we use the assignment for each field instead of the assignment for 
    // the whole struct
    ispc::BufferDescriptor ispcSrcDesc;
    ispcSrcDesc.offset = srcDesc.offset;
    ispcSrcDesc.length = srcDesc.length;
    ispcSrcDesc.stride = srcDesc.stride;
    tbb::blocked_range<int> range = tbb::blocked_range<int>(0, numPatchCoords, grain_size);
    tbb::parallel_for(range, [&](const tbb::blocked_range<int> &r)
    {
        int i = r.begin();
        // The kernels write point n of a run to offset + n * stride, so each
        // descriptor's offset is advanced to the first coordinate of the run.
        // Every base offset must be counted exactly once.
        ispc::BufferDescriptor ispcDstDesc, ispcDuDesc, ispcDvDesc;
        ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
        ispcDstDesc.length = dstDesc.length;
        ispcDstDesc.stride = dstDesc.stride;
        ispcDuDesc.offset  = duDesc.offset  + i * duDesc.stride;
        ispcDuDesc.length  = duDesc.length;
        ispcDuDesc.stride  = duDesc.stride;
        ispcDvDesc.offset  = dvDesc.offset  + i * dvDesc.stride;
        ispcDvDesc.length  = dvDesc.length;
        ispcDvDesc.stride  = dvDesc.stride;
        while (i < r.end()) {
            // the patch coordinates are sorted by patch handle
            // the following code searches the coordinates that
            // belong to the same patch so that they can be evaluated
            // with ISPC
            int nCoord = 1;
            Far::PatchTable::PatchHandle handle = patchCoords[i].handle;
            while(i + nCoord < r.end() && 
                  handle.isEqual(patchCoords[i + nCoord].handle) )
                  nCoord ++;
            PatchArray const &array = patchArrays[handle.arrayIndex];
            int patchType = array.GetPatchType();
            Far::PatchParam const & param = patchParamBuffer[handle.patchIndex];
            unsigned int bitField = param.field1;
            const int *cvs = &patchIndexBuffer[array.indexBase + handle.vertIndex];
            // Pack the run's (s,t) values into contiguous arrays for the kernel.
            __declspec( align(64) ) float u[nCoord];
            __declspec( align(64) ) float v[nCoord];        
            for(int n=0; n<nCoord; n++) {
                u[n] = patchCoords[i + n].s;
                v[n] = patchCoords[i + n].t;
            }
            if (patchType == Far::PatchDescriptor::REGULAR) {
                ispc::evalBSpline(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
                                  ispcDstDesc, dst, ispcDuDesc, du, ispcDvDesc, dv);
            } else if (patchType == Far::PatchDescriptor::GREGORY_BASIS) {
                ispc::evalGregory(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
                                  ispcDstDesc, dst, ispcDuDesc, du, ispcDvDesc, dv);
            } else if (patchType == Far::PatchDescriptor::QUADS) {
                ispc::evalBilinear(bitField, nCoord, u, v, cvs, ispcSrcDesc, src, 
                                   ispcDstDesc, dst, ispcDuDesc, du, ispcDvDesc, dv);
            } else {
                assert(0);
            }
            i += nCoord;
            ispcDstDesc.offset = dstDesc.offset + i * dstDesc.stride;
            ispcDuDesc.offset  = duDesc.offset  + i * duDesc.stride;
            ispcDvDesc.offset  = dvDesc.offset  + i * dvDesc.stride;
        }
    });
    return true;
 }
 }  // end namespace Osd
 }  // end namespace OPENSUBDIV_VERSION
 }  // end namespace OpenSubdiv
--- a/opensubdiv/osd/ispcEvaluator.h
+++ b/opensubdiv/osd/ispcEvaluator.h
@ -1,482 +0,0 @@
 //
 //   Copyright 2015 Pixar
 //
 //   Licensed under the Apache License, Version 2.0 (the "Apache License")
 //   with the following modification; you may not use this file except in
 //   compliance with the Apache License and the following modification to it:
 //   Section 6. Trademarks. is deleted and replaced with:
 //
 //   6. Trademarks. This License does not grant permission to use the trade
 //      names, trademarks, service marks, or product names of the Licensor
 //      and its affiliates, except as required to comply with Section 4(c) of
 //      the License and to reproduce the content of the NOTICE file.
 //
 //   You may obtain a copy of the Apache License at
 //
 //       http://www.apache.org/licenses/LICENSE-2.0
 //
 //   Unless required by applicable law or agreed to in writing, software
 //   distributed under the Apache License with the above modification is
 //   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 //   KIND, either express or implied. See the Apache License for the specific
 //   language governing permissions and limitations under the Apache License.
 //
 #ifndef OPENSUBDIV3_OSD_ISPC_EVALUATOR_H
 #define OPENSUBDIV3_OSD_ISPC_EVALUATOR_H
 #include "../version.h"
 #include <cstddef>
 #include <vector>
 #include "../osd/bufferDescriptor.h"
 #include "../osd/types.h"
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
 namespace Osd {
 class IspcEvaluator {
 public:
    /// ----------------------------------------------------------------------
    ///
    ///   Stencil evaluations with StencilTable
    ///
    /// ----------------------------------------------------------------------
    /// \brief Generic static eval stencils function. This function has the
    ///        same signature as the other device kernels so that it can be
    ///        called in the same way from the OsdMesh template interface.
    ///
    /// @param srcBuffer      Input primvar buffer.
    ///                       must have BindCpuBuffer() method returning a
    ///                       const float pointer for read
    ///
    /// @param srcDesc        vertex buffer descriptor for the input buffer
    ///
    /// @param dstBuffer      Output primvar buffer
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
    /// @param stencilTable   Far::StencilTable or equivalent
    ///
    /// @param instance       not used in the cpu kernel
    ///                       (declared as a typed pointer to prevent
    ///                        undesirable template resolution)
    ///
    /// @param deviceContext  not used in the cpu kernel
    ///
    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
    static bool EvalStencils(
        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
        STENCIL_TABLE const *stencilTable,
        const IspcEvaluator *instance = NULL,
        void * deviceContext = NULL) {

        // Neither trailing argument is read by this evaluator; the casts only
        // silence unused-parameter warnings.
        (void)instance;
        (void)deviceContext;

        // Nothing to evaluate for an empty table. Returning here also keeps
        // the code below from taking the address of element [0] of empty
        // arrays.
        if (stencilTable->GetNumStencils() == 0) {
            return false;
        }

        // Hand the bound CPU pointers and the raw stencil arrays to the
        // pointer-based overload, covering the full range of stencils.
        return EvalStencils(
            srcBuffer->BindCpuBuffer(), srcDesc,
            dstBuffer->BindCpuBuffer(), dstDesc,
            &stencilTable->GetSizes()[0],
            &stencilTable->GetOffsets()[0],
            &stencilTable->GetControlIndices()[0],
            &stencilTable->GetWeights()[0],
            /*start = */ 0,
            /*end   = */ stencilTable->GetNumStencils());
    }
    /// \brief Static eval stencils function which takes raw CPU pointers for
    ///        input and output.
    ///
    /// @param src            Input primvar pointer. An offset of srcDesc
    ///                       will be applied internally (i.e. the pointer
    ///                       should not include the offset)
    ///
    /// @param srcDesc        vertex buffer descriptor for the input buffer
    ///
    /// @param dst            Output primvar pointer. An offset of dstDesc
    ///                       will be applied internally.
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
    /// @param sizes          pointer to the sizes buffer of the stencil table
    ///                       to apply for the range [start, end)
    ///
    /// @param offsets        pointer to the offsets buffer of the stencil table
    ///
    /// @param indices        pointer to the indices buffer of the stencil table
    ///
    /// @param weights        pointer to the weights buffer of the stencil table
    ///
    /// @param start          start index of stencil table
    ///
    /// @param end            end index of stencil table
    ///
    static bool EvalStencils(
        float const *src,
        BufferDescriptor const &srcDesc,
        float *dst,
        BufferDescriptor const &dstDesc,
        int const *sizes,
        int const *offsets,
        int const *indices,
        float const *weights,
        int start,
        int end);
    /// \brief Generic static eval stencils function with derivatives.
    ///        This function has the same signature as the other device
    ///        kernels so that it can be called in the same way from the
    ///        OsdMesh template interface.
    ///
    /// @param srcBuffer      Input primvar buffer.
    ///                       must have BindCpuBuffer() method returning a
    ///                       const float pointer for read
    ///
    /// @param srcDesc        vertex buffer descriptor for the input buffer
    ///
    /// @param dstBuffer      Output primvar buffer
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
    /// @param duBuffer       Output U-derivative buffer
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
    /// @param duDesc         vertex buffer descriptor for the output buffer
    ///
    /// @param dvBuffer       Output V-derivative buffer
    ///                       must have BindCpuBuffer() method returning a
    ///                       float pointer for write
    ///
    /// @param dvDesc         vertex buffer descriptor for the output buffer
    ///
    /// @param stencilTable   Far::StencilTable or equivalent
    ///
    /// @param instance       not used in the cpu kernel
    ///                       (declared as a typed pointer to prevent
    ///                        undesirable template resolution)
    ///
    /// @param deviceContext  not used in the cpu kernel
    ///
    template <typename SRC_BUFFER, typename DST_BUFFER, typename STENCIL_TABLE>
    static bool EvalStencils(
        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
        STENCIL_TABLE const *stencilTable,
        const IspcEvaluator *instance = NULL,
        void * deviceContext = NULL) {
        (void)instance;       // unused
        (void)deviceContext;  // unused
        // Match the non-derivative overload: an empty table is reported as
        // "nothing evaluated" (false). Without this guard the expressions
        // below would take the address of element [0] of empty arrays.
        if (stencilTable->GetNumStencils() == 0)
            return false;
        // Forward the bound CPU pointers and the raw stencil arrays
        // (including the du/dv weights) to the pointer-based overload,
        // covering every stencil in the table.
        return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
                            dstBuffer->BindCpuBuffer(), dstDesc,
                            duBuffer->BindCpuBuffer(),  duDesc,
                            dvBuffer->BindCpuBuffer(),  dvDesc,
                            &stencilTable->GetSizes()[0],
                            &stencilTable->GetOffsets()[0],
                            &stencilTable->GetControlIndices()[0],
                            &stencilTable->GetWeights()[0],
                            &stencilTable->GetDuWeights()[0],
                            &stencilTable->GetDvWeights()[0],
                            /*start = */ 0,
                            /*end   = */ stencilTable->GetNumStencils());
    }
    /// \brief Static eval stencils function with derivatives, which takes
    ///        raw CPU pointers for input and output.
    ///
    /// @param src            Input primvar pointer. An offset of srcDesc
    ///                       will be applied internally (i.e. the pointer
    ///                       should not include the offset)
    ///
    /// @param srcDesc        vertex buffer descriptor for the input buffer
    ///
    /// @param dst            Output primvar pointer. An offset of dstDesc
    ///                       will be applied internally.
    ///
    /// @param dstDesc        vertex buffer descriptor for the output buffer
    ///
    /// @param du             Output U-derivatives pointer. An offset of
    ///                       duDesc will be applied internally.
    ///
    /// @param duDesc         vertex buffer descriptor for the output buffer
    ///
    /// @param dv             Output V-derivatives pointer. An offset of
    ///                       dvDesc will be applied internally.
    ///
    /// @param dvDesc         vertex buffer descriptor for the output buffer
    ///
    /// @param sizes          pointer to the sizes buffer of the stencil table
    ///
    /// @param offsets        pointer to the offsets buffer of the stencil table
    ///
    /// @param indices        pointer to the indices buffer of the stencil table
    ///
    /// @param weights        pointer to the weights buffer of the stencil table
    ///
    /// @param duWeights      pointer to the du-weights buffer of the stencil table
    ///
    /// @param dvWeights      pointer to the dv-weights buffer of the stencil table
    ///
    /// @param start          start index of stencil table
    ///
    /// @param end            end index of stencil table
    ///
    static bool EvalStencils(
        float const *src,
        BufferDescriptor const &srcDesc,
        float *dst,
        BufferDescriptor const &dstDesc,
        float *du,
        BufferDescriptor const &duDesc,
        float *dv,
        BufferDescriptor const &dvDesc,
        int const *sizes,
        int const *offsets,
        int const *indices,
        float const *weights,
        float const *duWeights,
        float const *dvWeights,
        int start,
        int end);
    /// ----------------------------------------------------------------------
    ///
    ///   Limit evaluations with PatchTable
    ///
    /// ----------------------------------------------------------------------
    /// \brief Generic limit eval function. This function has the same
    ///        signature as the other device kernels so that it can be called
    ///        in the same way.
    ///
    /// @param srcBuffer        Input primvar buffer.
    ///                         must have BindCpuBuffer() method returning a
    ///                         const float pointer for read
    ///
    /// @param srcDesc          vertex buffer descriptor for the input buffer
    ///
    /// @param dstBuffer        Output primvar buffer
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
    /// @param numPatchCoords   number of patchCoords.
    ///
    /// @param patchCoords      array of locations to be evaluated.
    ///
    /// @param patchTable       CpuPatchTable or equivalent
    ///                         XXX: currently Far::PatchTable can't be used
    ///                              due to interface mismatch
    ///
    /// @param instance         not used in the cpu evaluator
    ///
    /// @param deviceContext    not used in the cpu evaluator
    ///
    template <typename SRC_BUFFER, typename DST_BUFFER,
              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
    static bool EvalPatches(
        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
        int numPatchCoords,
        PATCHCOORD_BUFFER *patchCoords,
        PATCH_TABLE *patchTable,
        IspcEvaluator const *instance = NULL,
        void * deviceContext = NULL) {
        (void)instance;       // unused
        (void)deviceContext;  // unused
        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
                           dstBuffer->BindCpuBuffer(), dstDesc,
                           numPatchCoords,
                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
                           patchTable->GetPatchArrayBuffer(),
                           patchTable->GetPatchIndexBuffer(),
                           patchTable->GetPatchParamBuffer());
    }
    /// \brief Generic limit eval function with derivatives. This function has
    ///        the same signature as the other device kernels so that it can be
    ///        called in the same way.
    ///
    /// @param srcBuffer        Input primvar buffer.
    ///                         must have BindCpuBuffer() method returning a
    ///                         const float pointer for read
    ///
    /// @param srcDesc          vertex buffer descriptor for the input buffer
    ///
    /// @param dstBuffer        Output primvar buffer
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
    /// @param duBuffer         Output U-derivatives buffer
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
    /// @param duDesc           vertex buffer descriptor for the duBuffer
    ///
    /// @param dvBuffer         Output V-derivatives buffer
    ///                         must have BindCpuBuffer() method returning a
    ///                         float pointer for write
    ///
    /// @param dvDesc           vertex buffer descriptor for the dvBuffer
    ///
    /// @param numPatchCoords   number of patchCoords.
    ///
    /// @param patchCoords      array of locations to be evaluated.
    ///
    /// @param patchTable       CpuPatchTable or equivalent
    ///                         XXX: currently Far::PatchTable can't be used
    ///                              due to interface mismatch
    ///
    /// @param instance         not used in the cpu evaluator
    ///
    /// @param deviceContext    not used in the cpu evaluator
    ///
    template <typename SRC_BUFFER, typename DST_BUFFER,
              typename PATCHCOORD_BUFFER, typename PATCH_TABLE>
    static bool EvalPatches(
        SRC_BUFFER *srcBuffer, BufferDescriptor const &srcDesc,
        DST_BUFFER *dstBuffer, BufferDescriptor const &dstDesc,
        DST_BUFFER *duBuffer,  BufferDescriptor const &duDesc,
        DST_BUFFER *dvBuffer,  BufferDescriptor const &dvDesc,
        int numPatchCoords,
        PATCHCOORD_BUFFER *patchCoords,
        PATCH_TABLE *patchTable,
        IspcEvaluator const *instance = NULL,
        void * deviceContext = NULL) {
        (void)instance;       // unused
        (void)deviceContext;  // unused
        // XXX: PatchCoords is somewhat abusing vertex primvar buffer interop.
        //      ideally all buffer classes should have templated by datatype
        //      so that downcast isn't needed there.
        //      (e.g. Osd::CpuBuffer<PatchCoord> )
        //
        return EvalPatches(srcBuffer->BindCpuBuffer(), srcDesc,
                           dstBuffer->BindCpuBuffer(), dstDesc,
                           duBuffer->BindCpuBuffer(),  duDesc,
                           dvBuffer->BindCpuBuffer(),  dvDesc,
                           numPatchCoords,
                           (const PatchCoord*)patchCoords->BindCpuBuffer(),
                           patchTable->GetPatchArrayBuffer(),
                           patchTable->GetPatchIndexBuffer(),
                           patchTable->GetPatchParamBuffer());
    }
    /// \brief Static limit eval function. It takes an array of PatchCoord
    ///        and evaluates limit values on the given PatchTable.
    ///
    /// @param src              Input primvar pointer. An offset of srcDesc
    ///                         will be applied internally (i.e. the pointer
    ///                         should not include the offset)
    ///
    /// @param srcDesc          vertex buffer descriptor for the input buffer
    ///
    /// @param dst              Output primvar pointer. An offset of dstDesc
    ///                         will be applied internally.
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
    /// @param numPatchCoords   number of patchCoords.
    ///
    /// @param patchCoords      array of locations to be evaluated.
    ///
    /// @param patchArrays      an array of Osd::PatchArray struct
    ///                         indexed by PatchCoord::arrayIndex
    ///
    /// @param patchIndexBuffer an array of patch indices
    ///                         indexed by PatchCoord::vertIndex
    ///
    /// @param patchParamBuffer an array of Osd::PatchParam struct
    ///                         indexed by PatchCoord::patchIndex
    ///
    static bool EvalPatches(
        const float *src, BufferDescriptor const &srcDesc,
        float *dst,       BufferDescriptor const &dstDesc,
        int numPatchCoords,
        const PatchCoord *patchCoords,
        const PatchArray *patchArrays,
        const int *patchIndexBuffer,
        const PatchParam *patchParamBuffer);
    /// \brief Static limit eval function with derivatives. It takes an array
    ///        of PatchCoord and evaluates limit values, together with their
    ///        U- and V-derivatives, on the given PatchTable.
    ///
    /// @param src              Input primvar pointer. An offset of srcDesc
    ///                         will be applied internally (i.e. the pointer
    ///                         should not include the offset)
    ///
    /// @param srcDesc          vertex buffer descriptor for the input buffer
    ///
    /// @param dst              Output primvar pointer. An offset of dstDesc
    ///                         will be applied internally.
    ///
    /// @param dstDesc          vertex buffer descriptor for the output buffer
    ///
    /// @param du               Output U-derivatives pointer. An offset of
    ///                         duDesc will be applied internally.
    ///
    /// @param duDesc           vertex buffer descriptor for the du buffer
    ///
    /// @param dv               Output V-derivatives pointer. An offset of
    ///                         dvDesc will be applied internally.
    ///
    /// @param dvDesc           vertex buffer descriptor for the dv buffer
    ///
    /// @param numPatchCoords   number of patchCoords.
    ///
    /// @param patchCoords      array of locations to be evaluated.
    ///
    /// @param patchArrays      an array of Osd::PatchArray struct
    ///                         indexed by PatchCoord::arrayIndex
    ///
    /// @param patchIndexBuffer an array of patch indices
    ///                         indexed by PatchCoord::vertIndex
    ///
    /// @param patchParamBuffer an array of Osd::PatchParam struct
    ///                         indexed by PatchCoord::patchIndex
    ///
    static bool EvalPatches(
        float const *src,
        BufferDescriptor const &srcDesc,
        float *dst,
        BufferDescriptor const &dstDesc,
        float *du,
        BufferDescriptor const &duDesc,
        float *dv,
        BufferDescriptor const &dvDesc,
        int numPatchCoords,
        PatchCoord const *patchCoords,
        PatchArray const *patchArrays,
        int const *patchIndexBuffer,
        PatchParam const *patchParamBuffer);
    /// ----------------------------------------------------------------------
    ///
    ///   Other methods
    ///
    /// ----------------------------------------------------------------------
    /// \brief synchronize all asynchronous computation invoked on this device.
    static void Synchronize(void * deviceContext /* = NULL */) {
        // Intentionally a no-op: the body performs no work.
        (void)deviceContext;  // unused
    }
 };
 }  // end namespace Osd
 }  // end namespace OPENSUBDIV_VERSION
 using namespace OPENSUBDIV_VERSION;
 }  // end namespace OpenSubdiv
 #endif  // OPENSUBDIV3_OSD_ISPC_EVALUATOR_H