Fix tangents in Osd::EvalLimitController

- don't rotate (s,t) coordinates but rotate the patch instead! - refactor osd/cpuEvalLimitKernels to share Far::PatchTables cubic spline interpolation functions: this replaces the tensor product formulation with weight matrices, which does not really impact performance here, but would have to be replaced when implementing regular gridding functions. - fix OsdCpuEvalLimitController to not rotate coordinates and pass the rotation bitfields - expose Far::PatchTables spline interpolation API (protected -> public) - fix glEvalLimit tangent buffers (remove empty padding - see below) - change policy for tangent buffers: the output buffer descriptor is **NO LONGER APPLIED** to tangent output buffers. Tangent primvar data buffers no longer apply the offset and stride from the descriptor (because it doesn't make sense to share it). If more flexibility is required, we will consider adding independent descriptors for the tangent buffers. This change will impact existing code that generates tangents with the EvalLimit controller. fixes #370
2024-11-27 05:50:05 +00:00 · 2014-12-25 13:03:53 -08:00 · 2014-12-25 13:03:53 -08:00 · 7954fbab37
commit 7954fbab37
parent 5944ada0f9
7 changed files with 277 additions and 405 deletions
--- a/examples/glEvalLimit/glEvalLimit.cpp
+++ b/examples/glEvalLimit/glEvalLimit.cpp
@ -415,12 +415,12 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
        }

        delete g_dQs;
-        g_dQs = Osd::CpuGLVertexBuffer::Create(6,g_nparticles);
-        memset( g_dQs->BindCpuBuffer(), 0, g_nparticles*6*sizeof(float));
+        g_dQs = Osd::CpuGLVertexBuffer::Create(3,g_nparticles);
+        memset( g_dQs->BindCpuBuffer(), 0, g_nparticles*3*sizeof(float));

        delete g_dQt;
-        g_dQt = Osd::CpuGLVertexBuffer::Create(6,g_nparticles);
-        memset( g_dQt->BindCpuBuffer(), 0, g_nparticles*6*sizeof(float));
+        g_dQt = Osd::CpuGLVertexBuffer::Create(3,g_nparticles);
+        memset( g_dQt->BindCpuBuffer(), 0, g_nparticles*3*sizeof(float));
    }

    updateGeom();
@ -439,8 +439,7 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level) {
 }

 //------------------------------------------------------------------------------
-struct Program
-{
+struct Program {
    GLuint program;
    GLuint uniformModelViewProjectionMatrix;
    GLuint attrPosition;
@ -449,8 +448,7 @@ struct Program

 //------------------------------------------------------------------------------
 static void
-checkGLErrors(std::string const & where = "")
-{
+checkGLErrors(std::string const & where = "") {
    GLuint err;
    while ((err = glGetError()) != GL_NO_ERROR) {

@ -462,8 +460,7 @@ checkGLErrors(std::string const & where = "")

 //------------------------------------------------------------------------------
 static GLuint
-compileShader(GLenum shaderType, const char *source)
-{
+compileShader(GLenum shaderType, const char *source) {
    GLuint shader = glCreateShader(shaderType);
    glShaderSource(shader, 1, &source, NULL);
    glCompileShader(shader);
@ -473,8 +470,8 @@ compileShader(GLenum shaderType, const char *source)

 //------------------------------------------------------------------------------
 static bool
-linkDefaultProgram()
-{
+linkDefaultProgram() {
+
 #if defined(GL_ARB_tessellation_shader) || defined(GL_VERSION_4_0)
    #define GLSL_VERSION_DEFINE "#version 400\n"
 #else
@ -537,8 +534,7 @@ linkDefaultProgram()

 //------------------------------------------------------------------------------
 static inline void
-setSharpnessColor(float s, float *r, float *g, float *b)
-{
+setSharpnessColor(float s, float *r, float *g, float *b) {
    //  0.0       2.0       4.0
    // green --- yellow --- red
    *r = std::min(1.0f, s * 0.5f);
--- a/opensubdiv/far/patchTables.cpp
+++ b/opensubdiv/far/patchTables.cpp
@ -144,10 +144,10 @@ getBoxSplineWeights(float v, float w, float B[12]) {
 }

 void
-PatchTables::getBasisWeights(TensorBasis basis, PatchParam::BitField bits,
+PatchTables::GetBasisWeights(TensorBasis basis, PatchParam::BitField bits,
    float s, float t, float point[16], float deriv1[16], float deriv2[16]) {

-    int const rots[4][16] =
+    static int const rots[4][16] =
        { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
          { 12, 8, 4, 0, 13, 9, 5, 1, 14, 10, 6, 2, 15, 11, 7, 3 },
          { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 },
--- a/opensubdiv/far/patchTables.h
+++ b/opensubdiv/far/patchTables.h
@ -412,6 +412,16 @@ public:
    template <class T, class U> void Limit(PatchHandle const & handle,
        float s, float t, T const & src, U & dst) const;

+    enum TensorBasis {
+        BASIS_BEZIER,    ///< Bi-cubic bezier patch basis
+        BASIS_BSPLINE    ///< Bi-cubic bspline patch basis
+    };
+
+    /// \brief Returns bi-cubic weights matrix for a given (s,t) location
+    /// on the patch
+    static void GetBasisWeights(TensorBasis basis, PatchParam::BitField bits,
+        float s, float t, float point[16], float deriv1[16], float deriv2[16]);
+
 protected:

    friend class PatchTablesFactory;
@ -419,18 +429,6 @@ protected:
    // Factory constructor
    PatchTables(int maxvalence);

-    enum TensorBasis {
-        BASIS_BEZIER,
-        BASIS_BSPLINE
-    };
-
-    // Returns bi-cubic interpolation coefficients for a given (s,t) location
-    // on a b-spline patch
-    static void getBasisWeights(TensorBasis basis, PatchParam::BitField bits,
-        float s, float t, float point[16], float deriv1[16], float deriv2[16]);
-
-protected:
-
    void reservePatchArrays(int numPatchArrays);

    void pushPatchArray(PatchDescriptor desc,
@ -752,7 +750,7 @@ PatchTables::Limit(PatchHandle const & handle, float s, float t,

    if (ptype>=PatchDescriptor::REGULAR and ptype<=PatchDescriptor::CORNER) {

-        getBasisWeights(BASIS_BSPLINE, bits, s, t, Q, Qd1, Qd2);
+        GetBasisWeights(BASIS_BSPLINE, bits, s, t, Q, Qd1, Qd2);

        ConstIndexArray cvs = GetPatchVertices(handle);

@ -781,7 +779,7 @@ PatchTables::Limit(PatchHandle const & handle, float s, float t,

        assert(_endcapStencilTables);

-        getBasisWeights(BASIS_BEZIER, bits, s, t, Q, Qd1, Qd2);
+        GetBasisWeights(BASIS_BEZIER, bits, s, t, Q, Qd1, Qd2);

        InterpolateGregoryPatch(_endcapStencilTables, handle.vertIndex,
            s, t, Q, Qd1, Qd2, src, dst);
--- a/opensubdiv/osd/cpuEvalLimitController.cpp
+++ b/opensubdiv/osd/cpuEvalLimitController.cpp
@ -67,68 +67,62 @@ CpuEvalLimitController::EvalLimitSample( LimitLocation const & coord,

    if (vertexData.in) {

-        float * out   = outQ ? outQ + outDesc.offset : 0,
-              * outDu = outDQU ? outDQU + outDesc.offset : 0,
-              * outDv = outDQV ? outDQV + outDesc.offset : 0;
-
        Far::PatchTables const & ptables = context->GetPatchTables();

-        computeSubPatchCoords(ptables.GetPatchParam(*handle), s, t);
+        Far::PatchParam pparam = ptables.GetPatchParam(*handle);
+        pparam.bitField.Normalize(s, t);

        Far::ConstIndexArray cvs = ptables.GetPatchVertices(*handle);

        Far::PatchDescriptor desc = ptables.GetPatchDescriptor(*handle);
-        switch( desc.GetType() ) {
-            case Desc::REGULAR  : evalBSpline( t, s, cvs.begin(),
+        switch (desc.GetType()) {
+            case Desc::REGULAR  : evalBSpline( pparam.bitField, s, t, cvs.begin(),
                                               vertexData.inDesc,
                                               vertexData.in,
                                               outDesc,
-                                               out, outDu, outDv );
+                                               outQ, outDQU, outDQV );
                                  break;
-
-            case Desc::BOUNDARY : evalBoundary( t, s, cvs.begin(),
+            case Desc::BOUNDARY : evalBoundary( pparam.bitField, s, t, cvs.begin(),
                                                vertexData.inDesc,
                                                vertexData.in,
                                                outDesc,
-                                                out, outDu, outDv );
+                                                outQ, outDQU, outDQV );
                                  break;
-
-            case Desc::CORNER   : evalCorner( t, s, cvs.begin(),
+            case Desc::CORNER   : evalCorner( pparam.bitField, s, t, cvs.begin(),
                                              vertexData.inDesc,
                                              vertexData.in,
                                              outDesc,
-                                              out, outDu, outDv );
+                                              outQ, outDQU, outDQV );
                                  break;
-            case Desc::GREGORY  : evalGregory( t, s, cvs.begin(),
+            case Desc::GREGORY  : evalGregory( pparam.bitField, t, s, cvs.begin(),
                                               &ptables.GetVertexValenceTable()[0],
                                               ptables.GetPatchQuadOffsets(*handle).begin(),
                                               ptables.GetMaxValence(),
                                               vertexData.inDesc,
                                               vertexData.in,
                                               outDesc,
-                                               out, outDu, outDv );
+                                               outQ, outDQU, outDQV );
                                  break;
-
-            case Desc::GREGORY_BOUNDARY : evalGregoryBoundary( t, s, cvs.begin(),
+            case Desc::GREGORY_BOUNDARY : evalGregoryBoundary( pparam.bitField, t, s, cvs.begin(),
                                                               &ptables.GetVertexValenceTable()[0],
                                                               ptables.GetPatchQuadOffsets(*handle).begin(),
                                                               ptables.GetMaxValence(),
                                                               vertexData.inDesc,
                                                               vertexData.in,
                                                               outDesc,
-                                                               out, outDu, outDv );
+                                                               outQ, outDQU, outDQV );
                                          break;
            case Desc::GREGORY_BASIS : {
                                           Far::StencilTables const * stencils =
                                               ptables.GetEndCapStencilTables();
                                           assert(stencils and stencils->GetNumStencils()>0);
-                                           evalGregoryBasis( t, s,
+                                           evalGregoryBasis( pparam.bitField, s, t,
                                                             *stencils,
                                                             ptables.GetEndCapStencilIndex(*handle),
                                                             vertexData.inDesc,
                                                             vertexData.in,
                                                             vertexData.outDesc,
-                                                             out, outDu, outDv );
+                                                             outQ, outDQU, outDQV );
                                       } break;
            default:
                assert(0);
@ -157,44 +151,46 @@ CpuEvalLimitController::_EvalLimitSample( LimitLocation const & coords,

    Far::PatchTables const & ptables = context->GetPatchTables();

+    Far::PatchParam pparam = ptables.GetPatchParam(*handle);
+    pparam.bitField.Normalize(s, t);
+
    Far::PatchDescriptor desc = ptables.GetPatchDescriptor(*handle);

    Far::ConstIndexArray cvs = ptables.GetPatchVertices(*handle);

    if (vertexData.in) {

-        int offset = vertexData.outDesc.stride * index;
+        int offset = vertexData.outDesc.stride * index,
+            doffset = vertexData.outDesc.length * index;

        if (vertexData.out) {

+            // note : don't apply outDesc.offset here, it's done inside patch
+            // evaluation
            float * out   = vertexData.out+offset,
-                  * outDu = vertexData.outDu ? vertexData.outDu+offset : 0,
-                  * outDv = vertexData.outDv ? vertexData.outDv+offset : 0;
+                  * outDu = vertexData.outDu ? vertexData.outDu+doffset : 0,
+                  * outDv = vertexData.outDv ? vertexData.outDv+doffset : 0;

-            computeSubPatchCoords(ptables.GetPatchParam(*handle), s, t);
-
-            switch(desc.GetType()) {
-                case Desc::REGULAR  : evalBSpline( t, s, cvs.begin(),
+            switch (desc.GetType()) {
+                case Desc::REGULAR  : evalBSpline( pparam.bitField, s, t, cvs.begin(),
                                                   vertexData.inDesc,
                                                   vertexData.in,
                                                   vertexData.outDesc,
                                                   out, outDu, outDv );
                                      break;
-
-                case Desc::BOUNDARY : evalBoundary( t, s, cvs.begin(),
+                case Desc::BOUNDARY : evalBoundary( pparam.bitField, s, t, cvs.begin(),
                                                    vertexData.inDesc,
                                                    vertexData.in,
                                                    vertexData.outDesc,
                                                    out, outDu, outDv );
                                      break;
-
-                case Desc::CORNER   : evalCorner( t, s, cvs.begin(),
+                case Desc::CORNER   : evalCorner( pparam.bitField, s, t, cvs.begin(),
                                                  vertexData.inDesc,
                                                  vertexData.in,
                                                  vertexData.outDesc,
                                                  out, outDu, outDv );
                                      break;
-                case Desc::GREGORY  : evalGregory( t, s, cvs.begin(),
+                case Desc::GREGORY  : evalGregory( pparam.bitField, t, s, cvs.begin(),
                                                   &ptables.GetVertexValenceTable()[0],
                                                   ptables.GetPatchQuadOffsets(*handle).begin(),
                                                   ptables.GetMaxValence(),
@ -203,8 +199,7 @@ CpuEvalLimitController::_EvalLimitSample( LimitLocation const & coords,
                                                   vertexData.outDesc,
                                                   out, outDu, outDv );
                                      break;
-
-                case Desc::GREGORY_BOUNDARY : evalGregoryBoundary( t, s, cvs.begin(),
+                case Desc::GREGORY_BOUNDARY : evalGregoryBoundary( pparam.bitField, t, s, cvs.begin(),
                                                                   &ptables.GetVertexValenceTable()[0],
                                                                   ptables.GetPatchQuadOffsets(*handle).begin(),
                                                                   ptables.GetMaxValence(),
@ -217,7 +212,7 @@ CpuEvalLimitController::_EvalLimitSample( LimitLocation const & coords,
                                               Far::StencilTables const * stencils =
                                                   ptables.GetEndCapStencilTables();
                                               assert(stencils and stencils->GetNumStencils()>0);
-                                               evalGregoryBasis( s, t,
+                                               evalGregoryBasis( pparam.bitField, s, t,
                                                                 *stencils,
                                                                 ptables.GetEndCapStencilIndex(*handle),
                                                                 vertexData.inDesc,
@ -231,6 +226,8 @@ CpuEvalLimitController::_EvalLimitSample( LimitLocation const & coords,
        }
    }

+    pparam.bitField.Rotate(s, t);
+
    VaryingData const & varyingData = _currentBindState.varyingData;

    if (varyingData.in and varyingData.out) {
@ -282,7 +279,7 @@ CpuEvalLimitController::_EvalLimitSample( LimitLocation const & coords,
            // XXXX manuelk this assumes FVar data is ordered with 4 CVs / patch :
            //              bi-cubic FVar interpolation will require proper topology
            //              accessors in Far::PatchTables and this code will change
-            evalBilinear( t, s, zeroRing,
+            evalBilinear( s, t, zeroRing,
                          facevaryingData.inDesc,
                          &facevaryingData.in[handle->patchIndex*4*facevaryingData.outDesc.stride],
                          facevaryingData.outDesc,
--- a/opensubdiv/osd/cpuEvalLimitController.h
+++ b/opensubdiv/osd/cpuEvalLimitController.h
@ -74,7 +74,9 @@ public:
    ///
    /// @param inQ     input vertex data
    ///
-    /// @param oDesc   data descriptor shared by all output data buffers
+    /// @param oDesc   data descriptor for the outQ data buffer
+    ///                -- derivative buffers do not have a descriptor and
+    ///                cannot be offset or padded with a stride (yet ?)
    ///
    /// @param outQ    output vertex data
    ///
@ -102,7 +104,7 @@ public:
    ///
    /// @param inQ    input varying data
    ///
-    /// @param oDesc  data descriptor shared by all output data buffers
+    /// @param oDesc  data descriptor for the outQ data buffer
    ///
    /// @param outQ   output varying data
    ///
@ -127,7 +129,7 @@ public:
    ///
    /// @param inQ    input face-varying data
    ///
-    /// @param oDesc  data descriptor shared by all output data buffers
+    /// @param oDesc  data descriptor for the outQ data buffer
    ///
    /// @param outQ   output face-varying data
    ///
@ -153,7 +155,9 @@ public:
    ///
    /// @param context  the EvalLimitContext that the controller will evaluate
    ///
-    /// @param outDesc  data descriptor (offset, length, stride)
+    /// @param outDesc  data descriptor for the outQ data buffer
+    ///                 -- derivative buffers do not have a descriptor and
+    ///                 cannot be offset or padded with a stride (yet ?)
    ///
    /// @param outQ    output vertex data
    ///
--- a/opensubdiv/osd/cpuEvalLimitKernel.cpp
+++ b/opensubdiv/osd/cpuEvalLimitKernel.cpp
@ -23,6 +23,7 @@
 //

 #include "../osd/cpuEvalLimitKernel.h"
+#include "../far/patchTables.h"
 #include "../far/stencilTables.h"

 #include <math.h>
@ -68,6 +69,30 @@ evalBilinear(float u, float v,
    }
 }

+#ifdef TENSOR_PRODUCT_CUBIC_SPLINES
+
+// manuelk code was refactored to use the matrix formulation of cubic splines
+// exposed in Far::PatchTables for consistency. I am keeping these temporarily
+// for reference.
+
+inline void
+evalCubicBezier(float u, float B[4], float BU[3]) {
+    float u2 = u*u,
+          w0 = 1.0f - u,
+          w2 = w0 * w0;
+
+    B[0] = w0*w2;
+    B[1] = 3.0f * u * w2;
+    B[2] = 3.0f * u2 * w0;
+    B[3] = u*u2;
+
+    if (BU) {
+        BU[0] = w2;
+        BU[1] = 2.0f * u * w0;
+        BU[2] = u2;
+    }
+}
+
 inline void
 evalCubicBSpline(float u, float B[4], float BU[4]) {
    float t = u;
@ -90,101 +115,107 @@ evalCubicBSpline(float u, float B[4], float BU[4]) {
    }
 }

+inline void
+univar4x4(float u, float B[4], float D[4]) {

+    float t = u;
+    float s = 1.0f - u;
+
+    float A0 = s * s;
+    float A1 = 2 * s * t;
+    float A2 = t * t;
+
+    B[0] = s * A0;
+    B[1] = t * A0 + s * A1;
+    B[2] = t * A1 + s * A2;
+    B[3] = t * A2;
+
+    if (D) {
+        D[0] =    - A0;
+        D[1] = A0 - A1;
+        D[2] = A1 - A2;
+        D[3] = A2;
+    }
+}
+
+#endif

 void
-evalBSpline(float u, float v,
+evalBSpline(Far::PatchParam::BitField bits,
+            float s, float t,
            Far::Index const * vertexIndices,
            VertexBufferDescriptor const & inDesc,
            float const * inQ,
            VertexBufferDescriptor const & outDesc,
            float * outQ,
-            float * outDQU,
-            float * outDQV ) {
+            float * outDQ1,
+            float * outDQ2 ) {

    // make sure that we have enough space to store results
    assert( outQ and inDesc.length <= (outDesc.stride-outDesc.offset) );

-    bool evalDeriv = (outDQU or outDQV);
-
-    float B[4], D[4],
-          *BU=(float*)alloca(inDesc.length*4*sizeof(float)),
-          *DU=(float*)alloca(inDesc.length*4*sizeof(float));
-
-    memset(BU, 0, inDesc.length*4*sizeof(float));
-    memset(DU, 0, inDesc.length*4*sizeof(float));
-
-    evalCubicBSpline(u, B, evalDeriv ? D : 0);
+    float Q[16], dQ1[16], dQ2[16];
+    Far::PatchTables::GetBasisWeights(Far::PatchTables::BASIS_BSPLINE, bits, s, t,
+        outQ ? Q : 0, outDQ1 ? dQ1 : 0, outDQ2 ? dQ2 : 0);

    float const * inOffset = inQ + inDesc.offset;

-    for (int i=0; i<4; ++i) {
-        for (int j=0; j<4; ++j) {
+    outQ += outDesc.offset;

-            float const * in = inOffset + vertexIndices[i+j*4]*inDesc.stride;
-
-            for (int k=0; k<inDesc.length; ++k) {
-
-                BU[i*inDesc.length+k] += in[k] * B[j];
-
-                if (evalDeriv)
-                    DU[i*inDesc.length+k] += in[k] * D[j];
-            }
-        }
+    memset(outQ, 0, inDesc.length*sizeof(float));
+    if (outDQ1) {
+        memset(outDQ1, 0, inDesc.length*sizeof(float));
+    }
+    if (outDQ2) {
+        memset(outDQ2, 0, inDesc.length*sizeof(float));
    }

-    evalCubicBSpline(v, B, evalDeriv ? D : 0);

-    float * Q = outQ + outDesc.offset,
-          * dQU = outDQU + outDesc.offset,
-          * dQV = outDQV + outDesc.offset;
+    for (int i=0; i<16; ++i) {

-    // clear result
-    memset(Q, 0, inDesc.length*sizeof(float));
-    if (evalDeriv) {
-        memset(dQU, 0, inDesc.length*sizeof(float));
-        memset(dQV, 0, inDesc.length*sizeof(float));
-    }
+        float const * in = inOffset + vertexIndices[i]*inDesc.stride;

-    for (int i=0; i<4; ++i) {
        for (int k=0; k<inDesc.length; ++k) {
-            Q[k] += BU[inDesc.length*i+k] * B[i];
-
-            if (evalDeriv) {
-                dQU[k] += DU[inDesc.length*i+k] * B[i];
-                dQV[k] += BU[inDesc.length*i+k] * D[i];
+            outQ[k] += Q[i] * in[k];
+            if (outDQ1) {
+                outDQ1[k] += dQ1[i] * in[k];
+            }
+            if (outDQ2) {
+                outDQ2[k] += dQ2[i] * in[k];
            }
        }
    }
 }

-
-
 void
-evalBoundary(float u, float v,
+evalBoundary(Far::PatchParam::BitField bits,
+             float s, float t,
             Far::Index const * vertexIndices,
             VertexBufferDescriptor const & inDesc,
             float const * inQ,
             VertexBufferDescriptor const & outDesc,
             float * outQ,
-             float * outDQU,
-             float * outDQV ) {
+             float * outDQ1,
+             float * outDQ2 ) {

+    // make sure that we have enough space to store results
    assert( outQ and inDesc.length <= (outDesc.stride-outDesc.offset) );

-    bool evalDeriv = (outDQU or outDQV);
-
-    float B[4], D[4],
-          *BU=(float*)alloca(inDesc.length*4*sizeof(float)),
-          *DU=(float*)alloca(inDesc.length*4*sizeof(float));
-
-    memset(BU, 0, inDesc.length*4*sizeof(float));
-    memset(DU, 0, inDesc.length*4*sizeof(float));
-
-    evalCubicBSpline(u, B, evalDeriv ? D : 0);
+    float Q[16], dQ1[16], dQ2[16];
+    Far::PatchTables::GetBasisWeights(Far::PatchTables::BASIS_BSPLINE, bits, s, t,
+        outQ ? Q : 0, outDQ1 ? dQ1 : 0, outDQ2 ? dQ2 : 0);

    float const * inOffset = inQ + inDesc.offset;

+    outQ += outDesc.offset;
+
+    memset(outQ, 0, inDesc.length*sizeof(float));
+    if (outDQ1) {
+        memset(outDQ1, 0, inDesc.length*sizeof(float));
+    }
+    if (outDQ2) {
+        memset(outDQ2, 0, inDesc.length*sizeof(float));
+    }

    // mirror the missing vertices (M)
    //
@ -217,77 +248,52 @@ evalBoundary(float u, float v,
        M[3*inDesc.length+k] = 2.0f*v3[k] - v7[k];  // M4 = 2*v2 - v1
    }

-    for (int i=0; i<4; ++i) {
-        for (int j=0; j<4; ++j) {
+    for (int i=0; i<16; ++i) {

-            // swap the missing row of verts with our mirrored ones
-            float const * in = j==0 ? &M[i*inDesc.length] :
-                inOffset + vertexIndices[i+(j-1)*4]*inDesc.stride;
+        float const * in = i < 4 ?
+            M + i*inDesc.length : inOffset + vertexIndices[i-4]*inDesc.stride;

-            for (int k=0; k<inDesc.length; ++k) {
-
-                BU[i*inDesc.length+k] += in[k] * B[j];
-
-                if (evalDeriv)
-                    DU[i*inDesc.length+k] += in[k] * D[j];
-            }
-        }
-    }
-
-    evalCubicBSpline(v, B, evalDeriv ? D : 0);
-
-    float * Q = outQ + outDesc.offset,
-          * dQU = outDQU + outDesc.offset,
-          * dQV = outDQV + outDesc.offset;
-
-    // clear result
-    memset(Q, 0, inDesc.length*sizeof(float));
-    if (evalDeriv) {
-        memset(dQU, 0, inDesc.length*sizeof(float));
-        memset(dQV, 0, inDesc.length*sizeof(float));
-    }
-
-    for (int i=0; i<4; ++i) {
        for (int k=0; k<inDesc.length; ++k) {
-            Q[k] += BU[inDesc.length*i+k] * B[i];
-
-            if (evalDeriv) {
-                dQU[k] += DU[inDesc.length*i+k] * B[i];
-                dQV[k] += BU[inDesc.length*i+k] * D[i];
+            outQ[k] += Q[i] * in[k];
+            if (outDQ1) {
+                outDQ1[k] += dQ1[i] * in[k];
+            }
+            if (outDQ2) {
+                outDQ2[k] += dQ2[i] * in[k];
            }
        }
    }
 }

-
-
 void
-evalCorner(float u, float v,
+evalCorner(Far::PatchParam::BitField bits,
+           float s, float t,
           Far::Index const * vertexIndices,
           VertexBufferDescriptor const & inDesc,
           float const * inQ,
           VertexBufferDescriptor const & outDesc,
           float * outQ,
-           float * outDQU,
-           float * outDQV ) {
+           float * outDQ1,
+           float * outDQ2 ) {

+    // make sure that we have enough space to store results
    assert( outQ and inDesc.length <= (outDesc.stride-outDesc.offset) );

-    int length = inDesc.length;
+    float Q[16], dQ1[16], dQ2[16];
+    Far::PatchTables::GetBasisWeights(Far::PatchTables::BASIS_BSPLINE, bits, s, t,
+        outQ ? Q : 0, outDQ1 ? dQ1 : 0, outDQ2 ? dQ2 : 0);

-    bool evalDeriv = (outDQU or outDQV);
+    float const * inOffset = inQ + inDesc.offset;

-    float B[4], D[4],
-          *BU=(float*)alloca(length*4*sizeof(float)),
-          *DU=(float*)alloca(length*4*sizeof(float));
+    outQ += outDesc.offset;

-    memset(BU, 0, length*4*sizeof(float));
-    memset(DU, 0, length*4*sizeof(float));
-
-
-    evalCubicBSpline(u, B, evalDeriv ? D : 0);
-
-    float const *inOffset = inQ + inDesc.offset;
+    memset(outQ, 0, inDesc.length*sizeof(float));
+    if (outDQ1) {
+        memset(outDQ1, 0, inDesc.length*sizeof(float));
+    }
+    if (outDQ2) {
+        memset(outDQ2, 0, inDesc.length*sizeof(float));
+    }

    // mirror the missing vertices (M)
    //
@ -302,7 +308,7 @@ evalCorner(float u, float v,
    //   |.....|.....|     |
    //  v6 -- v7 -- v8 -- M6

-    float *M = (float*)alloca(length*7*sizeof(float));
+    float *M = (float*)alloca(inDesc.length*7*sizeof(float));

    float const *v0 = inOffset + vertexIndices[0]*inDesc.stride,
                *v1 = inOffset + vertexIndices[1]*inDesc.stride,
@ -314,88 +320,47 @@ evalCorner(float u, float v,
                *v8 = inOffset + vertexIndices[8]*inDesc.stride;

    for (int k=0; k<inDesc.length; ++k) {
-        M[0*length+k] = 2.0f*v0[k] - v3[k];  // M0 = 2*v0 - v3
-        M[1*length+k] = 2.0f*v1[k] - v4[k];  // M0 = 2*v1 - v4
-        M[2*length+k] = 2.0f*v2[k] - v5[k];  // M1 = 2*v2 - v5
+        M[0*inDesc.length+k] = 2.0f*v0[k] - v3[k];  // M0 = 2*v0 - v3
+        M[1*inDesc.length+k] = 2.0f*v1[k] - v4[k];  // M0 = 2*v1 - v4
+        M[2*inDesc.length+k] = 2.0f*v2[k] - v5[k];  // M1 = 2*v2 - v5

-        M[4*length+k] = 2.0f*v2[k] - v1[k];  // M4 = 2*v2 - v1
-        M[5*length+k] = 2.0f*v5[k] - v4[k];  // M5 = 2*v5 - v4
-        M[6*length+k] = 2.0f*v8[k] - v7[k];  // M6 = 2*v8 - v7
+        M[4*inDesc.length+k] = 2.0f*v2[k] - v1[k];  // M4 = 2*v2 - v1
+        M[5*inDesc.length+k] = 2.0f*v5[k] - v4[k];  // M5 = 2*v5 - v4
+        M[6*inDesc.length+k] = 2.0f*v8[k] - v7[k];  // M6 = 2*v8 - v7

        // M3 = 2*M2 - M1
-        M[3*length+k] = 2.0f*M[2*length+k] - M[1*length+k];
+        M[3*inDesc.length+k] = 2.0f*M[2*inDesc.length+k] - M[1*inDesc.length+k];
    }

    for (int i=0; i<4; ++i) {
        for (int j=0; j<4; ++j) {

-            float const * in = NULL;
-
+            float const * in = 0;
            if (j==0) { // (2)
-                in = &M[i*inDesc.length];
+                in = M + i*inDesc.length;
            } else if (i==3) {
-                in = &M[(j+3)*inDesc.length];
+                in = M + (j+3)*inDesc.length;
            } else {
                in = inOffset + vertexIndices[i+(j-1)*3]*inDesc.stride;
            }
-
            assert(in);

-            for (int k=0; k<length; ++k) {
-
-                BU[i*length+k] += in[k] * B[j];
-
-                if (evalDeriv)
-                    DU[i*length+k] += in[k] * D[j];
+            int idx = j*4+i;
+            for (int k=0; k<inDesc.length; ++k) {
+                outQ[k] += Q[idx] * in[k];
+                if (outDQ1) {
+                    outDQ1[k] += dQ1[idx] * in[k];
+                }
+                if (outDQ2) {
+                    outDQ2[k] += dQ2[idx] * in[k];
+                }
            }
        }
    }
-
-    evalCubicBSpline(v, B, evalDeriv ? D : 0);
-
-    float * Q = outQ + outDesc.offset,
-          * dQU = outDQU + outDesc.offset,
-          * dQV = outDQV + outDesc.offset;
-
-    // clear result
-    memset(Q, 0, length*sizeof(float));
-    if (evalDeriv) {
-        memset(dQU, 0, length*sizeof(float));
-        memset(dQV, 0, length*sizeof(float));
-    }
-
-    for (int i=0; i<4; ++i) {
-        for (int k=0; k<length; ++k) {
-            Q[k] += BU[length*i+k] * B[i];
-
-            if (evalDeriv) {
-                dQU[k] += DU[length*i+k] * B[i];
-                dQV[k] += BU[length*i+k] * D[i];
-            }
-        }
-    }
-}
-
-inline void
-evalCubicBezier(float u, float B[4], float BU[3]) {
-    float u2 = u*u,
-          w0 = 1.0f - u,
-          w2 = w0 * w0;
-
-    B[0] = w0*w2;
-    B[1] = 3.0f * u * w2;
-    B[2] = 3.0f * u2 * w0;
-    B[3] = u*u2;
-
-    if (BU) {
-        BU[0] = w2;
-        BU[1] = 2.0f * u * w0;
-        BU[2] = u2;
-    }
 }

 void
-evalGregoryBasis(float u, float v,
+evalGregoryBasis(Far::PatchParam::BitField bits, float u, float v,
                 Far::StencilTables const & basisStencils,
                 int stencilIndex,
                 VertexBufferDescriptor const & inDesc,
@ -409,54 +374,21 @@ evalGregoryBasis(float u, float v,

    int length = inDesc.length;

-    bool evalDeriv = (outDQU or outDQV);
-
-    float S[4], T[4], DS[3], DT[3];
-    evalCubicBezier(u, S, evalDeriv ? DS : 0);
-    evalCubicBezier(v, T, evalDeriv ? DT : 0);
-
    float BU[16], DU[16], DV[16];
-    memset(BU, 0, 16*sizeof(float));
-    for (int i=0; i<4; ++i) {
-        for (int j=0; j<4; ++j) {
-            BU[4*i+j] += S[j] * T[i];
-        }
-    }
-
-    if (evalDeriv) {
-        memset(DU, 0, 16*sizeof(float));
-        for (int i=0; i<4; ++i) {
-            float pw = 0.0f;
-            for (int j=0; j<3; ++j) {
-                float w = DS[j] * T[i];
-                DU[4*i+j] += pw - w;
-                pw = w;
-            }
-            DU[4*i+3]+=pw;
-        }
-        memset(DV, 0, 16*sizeof(float));
-        for (int j=0; j<4; ++j) {
-            float pw = 0.0f;
-            for (int i=0; i<3; ++i) {
-                float w = S[j] * DT[i];
-                DV[4*i+j] += pw - w;
-                pw = w;
-            }
-            DV[12+j]+=pw;
-        }
-    }
+    Far::PatchTables::GetBasisWeights(Far::PatchTables::BASIS_BEZIER, bits, u, v,
+        outQ ? BU : 0, outDQU ? DU : 0, outDQV ? DV : 0);

    float const *inOffset = inQ + inDesc.offset;

-    float * Q = outQ + outDesc.offset,
-          * dQU = outDQU + outDesc.offset,
-          * dQV = outDQV + outDesc.offset;
+    float * Q = outQ + outDesc.offset;

    // clear result
    memset(Q, 0, length*sizeof(float));
-    if (evalDeriv) {
-        memset(dQU, 0, length*sizeof(float));
-        memset(dQV, 0, length*sizeof(float));
+    if (outDQU) {
+        memset(outDQU, 0, length*sizeof(float));
+    }
+    if (outDQV) {
+        memset(outDQV, 0, length*sizeof(float));
    }

    float uu = 1-u,
@ -531,12 +463,15 @@ evalGregoryBasis(float u, float v,
                    float const * in = inOffset + srcIndices[j]*inDesc.stride;
                    float w = BU[i] * w0 * srcWeights[j],
                          dw1 = DU[i] * w0 * srcWeights[j],
-                          dw2 = DV[i] * w0 * srcWeights[j];
+                          dw2 = DV[i] * w0 * srcWeights[j];
                    for (int k=0; k<length; ++k) {
                        Q[k] += in[k] * w;
-                        if (evalDeriv) {
-                            dQU[k] += in[k] * dw1;
-                            dQV[k] += in[k] * dw2;
+                        if (outDQU) {
+                            outDQU[k] += in[k] * dw1;
+                        }
+                        if (outDQV) {
+                            outDQV[k] += in[k] * dw2;
                        }
                    }
                }
@ -551,9 +486,11 @@ evalGregoryBasis(float u, float v,
                          dw2 = DV[i] * w1 * srcWeights[j];
                    for (int k=0; k<length; ++k) {
                        Q[k] += in[k] * w;
-                        if (evalDeriv) {
-                            dQU[k] += in[k] * dw1;
-                            dQV[k] += in[k] * dw2;
+                        if (outDQU) {
+                            outDQU[k] += in[k] * dw1;
+                        }
+                        if (outDQV) {
+                            outDQV[k] += in[k] * dw2;
                        }
                    }
                }
@ -570,9 +507,11 @@ evalGregoryBasis(float u, float v,
                      dw2 = DV[i] * srcWeights[j];
                for (int k=0; k<length; ++k) {
                    Q[k] += in[k] * w;
-                    if (evalDeriv) {
-                        dQU[k] += in[k] * dw1;
-                        dQV[k] += in[k] * dw2;
+                    if (outDQU) {
+                        outDQU[k] += in[k] * dw1;
+                    }
+                    if (outDQV) {
+                        outDQV[k] += in[k] * dw2;
                    }
                }
            }
@ -580,7 +519,6 @@ evalGregoryBasis(float u, float v,
    }
 }

-
 /*
 static float ef[7] = {
    0.813008f, 0.500000f, 0.363636f, 0.287505f,
@ -597,29 +535,6 @@ static float ef[27] = {
    0.0569311f, 0.0548745f, 0.0529621f
 };

-inline void
-univar4x4(float u, float B[4], float D[4]) {
-
-    float t = u;
-    float s = 1.0f - u;
-
-    float A0 = s * s;
-    float A1 = 2 * s * t;
-    float A2 = t * t;
-
-    B[0] = s * A0;
-    B[1] = t * A0 + s * A1;
-    B[2] = t * A1 + s * A2;
-    B[3] = t * A2;
-
-    if (D) {
-        D[0] =    - A0;
-        D[1] = A0 - A1;
-        D[2] = A1 - A2;
-        D[3] = A2;
-    }
-}
-
 inline float
 csf(Far::Index n, Far::Index j) {
    if (j%2 == 0) {
@ -631,7 +546,7 @@ csf(Far::Index n, Far::Index j) {


 void
-evalGregory(float u, float v,
+evalGregory(Far::PatchParam::BitField bits, float u, float v,
            Far::Index const * vertexIndices,
            Far::Index const * vertexValenceBuffer,
            unsigned int const * quadOffsetBuffer,
@ -640,16 +555,12 @@ evalGregory(float u, float v,
            float const * inQ,
            VertexBufferDescriptor const & outDesc,
            float * outQ,
-            float * outDQU,
-            float * outDQV ) {
-
-    // vertex
+            float * outDQ1,
+            float * outDQ2 ) {

    // make sure that we have enough space to store results
    assert( outQ and inDesc.length <= (outDesc.stride-outDesc.offset) );

-    bool evalDeriv = (outDQU or outDQV);
-
    int valences[4], length=inDesc.length;

    float const * inOffset = inQ + inDesc.offset;
@ -725,8 +636,6 @@ evalGregory(float u, float v,
        }
    }

-    // tess control
-
    // Control Vertices based on :
    // "Approximating Subdivision Surfaces with Gregory Patches for Hardware Tessellation"
    // Loop, Schaefer, Ni, Castaño (ACM ToG Siggraph Asia 2009)
@ -839,49 +748,32 @@ evalGregory(float u, float v,
    memcpy(q+14*length, p[11], length*sizeof(float));
    memcpy(q+15*length, p[10], length*sizeof(float));

-    float B[4], D[4],
-          *BU=(float*)alloca(inDesc.length*4*sizeof(float)),
-          *DU=(float*)alloca(inDesc.length*4*sizeof(float));
-    memset(BU, 0, inDesc.length*4*sizeof(float));
-    memset(DU, 0, inDesc.length*4*sizeof(float));
+    float Q[16], dQ1[16], dQ2[16];
+    Far::PatchTables::GetBasisWeights(Far::PatchTables::BASIS_BEZIER, bits, u, v,
+        outQ ? Q : 0, outDQ1 ? dQ1 : 0, outDQ2 ? dQ2 : 0);

-    univar4x4(u, B, evalDeriv ? D : 0);
+    outQ += outDesc.offset;

-    for (int i=0; i<4; ++i) {
-        for (int j=0; j<4; ++j) {
-
-            float const * in = q + (i+j*4)*length;
-
-            for (int k=0; k<inDesc.length; ++k) {
-
-                BU[i*inDesc.length+k] += in[k] * B[j];
-
-                if (evalDeriv)
-                    DU[i*inDesc.length+k] += in[k] * D[j];
-            }
-        }
+    memset(outQ, 0, inDesc.length*sizeof(float));
+    if (outDQ1) {
+        memset(outDQ1, 0, inDesc.length*sizeof(float));
+    }
+    if (outDQ2) {
+        memset(outDQ2, 0, inDesc.length*sizeof(float));
    }

-    univar4x4(v, B, evalDeriv ? D : 0);

-    float * Q = outQ + outDesc.offset;
-    float * dQU = outDQU + outDesc.offset;
-    float * dQV = outDQV + outDesc.offset;
+    for (int i=0; i<16; ++i) {

-    // clear result
-    memset(Q, 0, outDesc.length*sizeof(float));
-    if (evalDeriv) {
-        memset(dQU, 0, outDesc.length*sizeof(float));
-        memset(dQV, 0, outDesc.length*sizeof(float));
-    }
+        float const * in = q + i*length;

-    for (int i=0; i<4; ++i) {
        for (int k=0; k<inDesc.length; ++k) {
-            Q[k] += BU[inDesc.length*i+k] * B[i];
-
-            if (evalDeriv) {
-                dQU[k] += DU[inDesc.length*i+k] * B[i];
-                dQV[k] += BU[inDesc.length*i+k] * D[i];
+            outQ[k] += Q[i] * in[k];
+            if (outDQ1) {
+                outDQ1[k] += dQ1[i] * in[k];
+            }
+            if (outDQ2) {
+                outDQ2[k] += dQ2[i] * in[k];
            }
        }
    }
@ -889,7 +781,7 @@ evalGregory(float u, float v,


 void
-evalGregoryBoundary(float u, float v,
+evalGregoryBoundary(Far::PatchParam::BitField bits, float u, float v,
                    Far::Index const * vertexIndices,
                    Far::Index const * vertexValenceBuffer,
                    unsigned int const * quadOffsetBuffer,
@ -898,16 +790,14 @@ evalGregoryBoundary(float u, float v,
                    float const * inQ,
                    VertexBufferDescriptor const & outDesc,
                    float * outQ,
-                    float * outDQU,
-                    float * outDQV ) {
+                    float * outDQ1,
+                    float * outDQ2 ) {

    // vertex

    // make sure that we have enough space to store results
    assert( outQ and inDesc.length <= (outDesc.stride-outDesc.offset) );

-    bool evalDeriv = (outDQU or outDQV);
-
    int valences[4], zerothNeighbors[4], length=inDesc.length;

    float const * inOffset = inQ + inDesc.offset;
@ -1245,49 +1135,32 @@ evalGregoryBoundary(float u, float v,
    memcpy(q+14*length, p[11], length*sizeof(float));
    memcpy(q+15*length, p[10], length*sizeof(float));

-    float B[4], D[4],
-          *BU=(float*)alloca(inDesc.length*4*sizeof(float)),
-          *DU=(float*)alloca(inDesc.length*4*sizeof(float));
-    memset(BU, 0, inDesc.length*4*sizeof(float));
-    memset(DU, 0, inDesc.length*4*sizeof(float));
+    float Q[16], dQ1[16], dQ2[16];
+    Far::PatchTables::GetBasisWeights(Far::PatchTables::BASIS_BEZIER, bits, u, v,
+        outQ ? Q : 0, outDQ1 ? dQ1 : 0, outDQ2 ? dQ2 : 0);

-    univar4x4(u, B, evalDeriv ? D : 0);
+    outQ += outDesc.offset;

-    for (int i=0; i<4; ++i) {
-        for (int j=0; j<4; ++j) {
-
-            float const * in = q + (i+j*4)*length;
-
-            for (int k=0; k<inDesc.length; ++k) {
-
-                BU[i*inDesc.length+k] += in[k] * B[j];
-
-                if (evalDeriv)
-                    DU[i*inDesc.length+k] += in[k] * D[j];
-            }
-        }
+    memset(outQ, 0, inDesc.length*sizeof(float));
+    if (outDQ1) {
+        memset(outDQ1, 0, inDesc.length*sizeof(float));
+    }
+    if (outDQ2) {
+        memset(outDQ2, 0, inDesc.length*sizeof(float));
    }

-    univar4x4(v, B, evalDeriv ? D : 0);

-    float * Q = outQ + outDesc.offset;
-    float * dQU = outDQU + outDesc.offset;
-    float * dQV = outDQV + outDesc.offset;
+    for (int i=0; i<16; ++i) {

-    // clear result
-    memset(Q, 0, outDesc.length*sizeof(float));
-    if (evalDeriv) {
-        memset(dQU, 0, outDesc.length*sizeof(float));
-        memset(dQV, 0, outDesc.length*sizeof(float));
-    }
+        float const * in = q + i*length;

-    for (int i=0; i<4; ++i) {
        for (int k=0; k<inDesc.length; ++k) {
-            Q[k] += BU[inDesc.length*i+k] * B[i];
-
-            if (evalDeriv) {
-                dQU[k] += DU[inDesc.length*i+k] * B[i];
-                dQV[k] += BU[inDesc.length*i+k] * D[i];
+            outQ[k] += Q[i] * in[k];
+            if (outDQ1) {
+                outDQ1[k] += dQ1[i] * in[k];
+            }
+            if (outDQ2) {
+                outDQ2[k] += dQ2[i] * in[k];
            }
        }
    }
--- a/opensubdiv/osd/cpuEvalLimitKernel.h
+++ b/opensubdiv/osd/cpuEvalLimitKernel.h
@ -28,6 +28,7 @@
 #include "../version.h"

 #include "../osd/vertexDescriptor.h"
+#include "../far/patchParam.h"

 #include "../far/types.h"

@ -49,7 +50,8 @@ evalBilinear(float u, float v,
             float * outQ);

 void
-evalBSpline(float u, float v,
+evalBSpline(Far::PatchParam::BitField bits,
+            float u, float v,
            Far::Index const * vertexIndices,
            VertexBufferDescriptor const & inDesc,
            float const * inQ,
@ -59,7 +61,8 @@ evalBSpline(float u, float v,
            float * outDQV );

 void
-evalBoundary(float u, float v,
+evalBoundary(Far::PatchParam::BitField bits,
+             float u, float v,
             Far::Index const * vertexIndices,
             VertexBufferDescriptor const & inDesc,
             float const * inQ,
@ -69,7 +72,8 @@ evalBoundary(float u, float v,
             float * outDQV );

 void
-evalCorner(float u, float v,
+evalCorner(Far::PatchParam::BitField bits,
+           float u, float v,
           Far::Index const * vertexIndices,
           VertexBufferDescriptor const & inDesc,
           float const * inQ,
@ -79,7 +83,7 @@ evalCorner(float u, float v,
           float * outDQV );

 void
-evalGregoryBasis(float u, float v,
+evalGregoryBasis(Far::PatchParam::BitField bits, float u, float v,
                 Far::StencilTables const & basisStencils,
                 int stencilIndex,
                 VertexBufferDescriptor const & inDesc,
@ -90,7 +94,7 @@ evalGregoryBasis(float u, float v,
                 float * outDQV );

 void
-evalGregory(float u, float v,
+evalGregory(Far::PatchParam::BitField bits, float u, float v,
            Far::Index const * vertexIndices,
            Far::Index const * vertexValenceBuffer,
            unsigned int const * quadOffsetBuffer,
@ -103,7 +107,7 @@ evalGregory(float u, float v,
            float * outDQV );

 void
-evalGregoryBoundary(float u, float v,
+evalGregoryBoundary(Far::PatchParam::BitField bits, float u, float v,
                    Far::Index const * vertexIndices,
                    Far::Index const * vertexValenceBuffer,
                    unsigned int const * quadOffsetBuffer,