diff --git a/documentation/release_notes.rst b/documentation/release_notes.rst
index 291558d6..901d975f 100644
--- a/documentation/release_notes.rst
+++ b/documentation/release_notes.rst
@@ -31,6 +31,27 @@
 
 ----
 
+Release 3.0.3
+=============
+
+Release 3.0.3 is a minor stability release which includes important performance
+and bug fixes.
+
+**New Features**
+    - Smooth normal generation tutorial, far_tutorial_8
+
+**Changes**
+    - Major performance improvement in PatchTable construction
+    - Improved patch approximations for non-manifold features
+
+**Bug Fixes**
+    - Fixed double delete in GLSL Compute controller
+    - Fixed buffer layout for GLSL Compute kernel
+    - Fixed GL buffer leak in Osd::GLPatchTable
+    - Fixed out-of-bounds data access for TBB and OMP stencil evaluation
+    - Fixed WIN32_LEAN_AND_MEAN typo
+    - Fixed Loop-related shader issues in glFVarViewer
+
 Release 3.0.2
 =============
 
diff --git a/examples/common/clDeviceContext.cpp b/examples/common/clDeviceContext.cpp
index 5cbd016b..2b0f6936 100644
--- a/examples/common/clDeviceContext.cpp
+++ b/examples/common/clDeviceContext.cpp
@@ -27,6 +27,7 @@
 #include "clDeviceContext.h"
 
 #if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
     #include <windows.h>
 #elif defined(__APPLE__)
     #include <OpenGL/OpenGL.h>
diff --git a/examples/common/cudaDeviceContext.cpp b/examples/common/cudaDeviceContext.cpp
index ee0be938..c71f89d7 100644
--- a/examples/common/cudaDeviceContext.cpp
+++ b/examples/common/cudaDeviceContext.cpp
@@ -25,6 +25,7 @@
 #include "cudaDeviceContext.h"
 
 #if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
     #include <windows.h>
 #elif defined(__APPLE__)
     #include <OpenGL/OpenGL.h>
diff --git a/examples/common/stopwatch.h b/examples/common/stopwatch.h
index 32d0b153..c39dc258 100644
--- a/examples/common/stopwatch.h
+++ b/examples/common/stopwatch.h
@@ -25,12 +25,12 @@
 #ifndef STOPWATCH_H
 #define STOPWATCH_H
 
-#if not (_WIN32 or _WIN64)
+#if (_WIN32 or _WIN64)
+    #include <windows.h>
+#else
     #include <sys/types.h>
     #include <sys/time.h>
     #include <sys/resource.h>
-#else
-
 #endif
 
 class Stopwatch {
diff --git a/examples/glFVarViewer/glFVarViewer.cpp b/examples/glFVarViewer/glFVarViewer.cpp
index 1d01cc6f..4ea9e78e 100644
--- a/examples/glFVarViewer/glFVarViewer.cpp
+++ b/examples/glFVarViewer/glFVarViewer.cpp
@@ -501,6 +501,9 @@ public:
 
         if (type == Far::PatchDescriptor::QUADS) {
             ss << "#define PRIM_QUAD\n";
+        } else if (type == Far::PatchDescriptor::TRIANGLES) {
+            ss << "#define PRIM_TRI\n";
+            ss << "#define LOOP\n";
         } else {
             ss << "#define PRIM_TRI\n";
         }
diff --git a/opensubdiv/far/endCapBSplineBasisPatchFactory.cpp b/opensubdiv/far/endCapBSplineBasisPatchFactory.cpp
index 72103194..31f72243 100644
--- a/opensubdiv/far/endCapBSplineBasisPatchFactory.cpp
+++ b/opensubdiv/far/endCapBSplineBasisPatchFactory.cpp
@@ -51,14 +51,61 @@ namespace {
 EndCapBSplineBasisPatchFactory::EndCapBSplineBasisPatchFactory(
     TopologyRefiner const & refiner) :
     _refiner(&refiner), _numVertices(0), _numPatches(0) {
+
+    // Sanity check: the mesh must be adaptively refined
+    assert(not refiner.IsUniform());
+
+    // Reserve the patch point stencils. Ideally topology refiner
+    // would have an API to return how many endcap patches will be required.
+    // Instead we conservatively estimate by the number of patches at the
+    // finest level.
+    int numMaxLevelFaces = refiner.GetLevel(refiner.GetMaxLevel()).GetNumFaces();
+
+    _vertexStencils.reserve(numMaxLevelFaces*16);
+    _varyingStencils.reserve(numMaxLevelFaces*16);
 }
 
 ConstIndexArray
 EndCapBSplineBasisPatchFactory::GetPatchPoints(
-    Vtr::internal::Level const * level, Index faceIndex,
-    PatchTableFactory::PatchFaceTag const * /*levelPatchTags*/,
+    Vtr::internal::Level const * level, Index thisFace,
+    PatchTableFactory::PatchFaceTag const *levelPatchTags,
     int levelVertOffset) {
 
+    Vtr::ConstIndexArray facePoints = level->getFaceVertices(thisFace);
+    PatchTableFactory::PatchFaceTag patchTag = levelPatchTags[thisFace];
+    // if it's on a boundary, fall back to GregoryBasis
+    if (patchTag._boundaryCount > 0) {
+        return getPatchPointsFromGregoryBasis(
+            level, thisFace, facePoints, levelVertOffset);
+    }
+
+    // there's a short-cut when the face contains only 1 extraordinary vertex.
+    // (we can achieve this by isolating 2 levels)
+    // look for the extraordinary vertex
+    int irregular = -1;
+    for (int i = 0; i < 4; ++i) {
+        int valence = level->getVertexFaces(facePoints[i]).size();
+        if (valence != 4) {
+            if (irregular != -1) {
+                // more than one extraordinary vertex:
+                // fall back to GregoryBasis
+                return getPatchPointsFromGregoryBasis(
+                    level, thisFace, facePoints, levelVertOffset);
+            }
+            irregular = i;
+        }
+    }
+
+    // faster B-spline endcap generation
+    return getPatchPoints(level, thisFace, irregular, facePoints,
+                          levelVertOffset);
+}
+
+ConstIndexArray
+EndCapBSplineBasisPatchFactory::getPatchPointsFromGregoryBasis(
+    Vtr::internal::Level const * level, Index thisFace,
+    ConstIndexArray facePoints, int levelVertOffset) {
+
     // XXX: For now, always create new 16 indices for each patch.
     // we'll optimize later to share all regular control points with
     // other patches as well as to try to make extra ordinary verts watertight.
@@ -68,66 +115,386 @@ EndCapBSplineBasisPatchFactory::GetPatchPoints(
         _patchPoints.push_back(_numVertices + offset);
         ++_numVertices;
     }
-
+    GregoryBasis::ProtoBasis basis(*level, thisFace, levelVertOffset, -1);
     // XXX: temporary hack. we should traverse topology and find existing
     //      vertices if available
     //
     // Reorder gregory basis stencils into regular bezier
-    GregoryBasis::ProtoBasis basis(*level, faceIndex, levelVertOffset, -1);
-    std::vector<GregoryBasis::Point> bezierCP;
-    bezierCP.reserve(16);
+    GregoryBasis::Point const *bezierCP[16];
 
-    bezierCP.push_back(basis.P[0]);
-    bezierCP.push_back(basis.Ep[0]);
-    bezierCP.push_back(basis.Em[1]);
-    bezierCP.push_back(basis.P[1]);
+    bezierCP[0] = &basis.P[0];
+    bezierCP[1] = &basis.Ep[0];
+    bezierCP[2] = &basis.Em[1];
+    bezierCP[3] = &basis.P[1];
 
-    bezierCP.push_back(basis.Em[0]);
-    bezierCP.push_back(basis.Fp[0]); // arbitrary
-    bezierCP.push_back(basis.Fp[1]); // arbitrary
-    bezierCP.push_back(basis.Ep[1]);
+    bezierCP[4] = &basis.Em[0];
+    bezierCP[5] = &basis.Fp[0]; // arbitrary
+    bezierCP[6] = &basis.Fp[1]; // arbitrary
+    bezierCP[7] = &basis.Ep[1];
 
-    bezierCP.push_back(basis.Ep[3]);
-    bezierCP.push_back(basis.Fp[3]); // arbitrary
-    bezierCP.push_back(basis.Fp[2]); // arbitrary
-    bezierCP.push_back(basis.Em[2]);
+    bezierCP[8]  = &basis.Ep[3];
+    bezierCP[9]  = &basis.Fp[3]; // arbitrary
+    bezierCP[10] = &basis.Fp[2]; // arbitrary
+    bezierCP[11] = &basis.Em[2];
 
-    bezierCP.push_back(basis.P[3]);
-    bezierCP.push_back(basis.Em[3]);
-    bezierCP.push_back(basis.Ep[2]);
-    bezierCP.push_back(basis.P[2]);
+    bezierCP[12] = &basis.P[3];
+    bezierCP[13] = &basis.Em[3];
+    bezierCP[14] = &basis.Ep[2];
+    bezierCP[15] = &basis.P[2];
+
+    // all stencils should have the same capacity.
+    int stencilCapacity = basis.P[0].GetCapacity();
 
     // Apply basis conversion from bezier to b-spline
     float Q[4][4] = {{ 6, -7,  2, 0},
                      { 0,  2, -1, 0},
                      { 0, -1,  2, 0},
                      { 0,  2, -7, 6} };
-    std::vector<GregoryBasis::Point> H(16);
+    Vtr::internal::StackBuffer<GregoryBasis::Point, 16> H(16);
     for (int i = 0; i < 4; ++i) {
         for (int j = 0; j < 4; ++j) {
-            for (int k = 0; k < 4; ++k) {            
-                if (isWeightNonZero(Q[i][k])) H[i*4+j] += bezierCP[j+k*4] * Q[i][k];
+            H[i*4+j].Clear(stencilCapacity);
+            for (int k = 0; k < 4; ++k) {
+                if (isWeightNonZero(Q[i][k])) {
+                    H[i*4+j].AddWithWeight(*bezierCP[j+k*4], Q[i][k]);
+                }
             }
         }
     }
     for (int i = 0; i < 4; ++i) {
         for (int j = 0; j < 4; ++j) {
-            GregoryBasis::Point p;
+            GregoryBasis::Point p(stencilCapacity);
             for (int k = 0; k < 4; ++k) {
-                if (isWeightNonZero(Q[j][k])) p += H[i*4+k] * Q[j][k];
+                if (isWeightNonZero(Q[j][k])) {
+                    p.AddWithWeight(H[i*4+k], Q[j][k]);
+                }
             }
             _vertexStencils.push_back(p);
         }
     }
-    
     int varyingIndices[] = { 0, 0, 1, 1,
                              0, 0, 1, 1,
                              3, 3, 2, 2,
                              3, 3, 2, 2,};
     for (int i = 0; i < 16; ++i) {
-        _varyingStencils.push_back(basis.V[varyingIndices[i]]);
+        GregoryBasis::Point p(1);
+        p.AddWithWeight(facePoints[varyingIndices[i]] + levelVertOffset, 1.0f);
+        _varyingStencils.push_back(p);
     }
 
+
+    ++_numPatches;
+    return ConstIndexArray(&_patchPoints[(_numPatches-1)*16], 16);
+}
+
+void
+EndCapBSplineBasisPatchFactory::computeLimitStencils(
+    Vtr::internal::Level const *level,
+    ConstIndexArray facePoints, int vid,
+    GregoryBasis::Point *P, GregoryBasis::Point *Ep, GregoryBasis::Point *Em)
+{
+    int maxvalence = level->getMaxValence();
+
+    Vtr::internal::StackBuffer<Index, 40> manifoldRing;
+    manifoldRing.SetSize(maxvalence*2);
+
+    int ringSize =
+        level->gatherQuadRegularRingAroundVertex(
+            facePoints[vid], manifoldRing, /*fvarChannel*/-1);
+
+    // note: this function does not yet support boundaries.
+    assert((ringSize & 1) == 0);
+    int valence = ringSize/2;
+    int stencilCapacity = ringSize + 1;
+
+    Index start = -1, prev = -1;
+    {
+        int ip = (vid+1)%4, im = (vid+3)%4;
+        for (int i = 0; i < valence; ++i) {
+            if (manifoldRing[i*2] == facePoints[ip])
+                start = i;
+            if (manifoldRing[i*2] == facePoints[im])
+                prev = i;
+        }
+    }
+    assert(start > -1 && prev > -1);
+
+    GregoryBasis::Point e0, e1;
+    e0.Clear(stencilCapacity);
+    e1.Clear(stencilCapacity);
+
+    float t = 2.0f * float(M_PI) / float(valence);
+    float ef = 1.0f / (valence * (cosf(t) + 5.0f +
+                                  sqrtf((cosf(t) + 9) * (cosf(t) + 1)))/16.0f);
+
+    for (int i = 0; i < valence; ++i) {
+        Index ip = (i+1)%valence;
+        Index idx_neighbor   = (manifoldRing[2*i  + 0]),
+              idx_diagonal   = (manifoldRing[2*i  + 1]),
+              idx_neighbor_p = (manifoldRing[2*ip + 0]);
+
+        float d = float(valence)+5.0f;
+
+        GregoryBasis::Point f(4);
+        f.AddWithWeight(facePoints[vid], float(valence)/d);
+        f.AddWithWeight(idx_neighbor_p,  2.0f/d);
+        f.AddWithWeight(idx_neighbor,    2.0f/d);
+        f.AddWithWeight(idx_diagonal,    1.0f/d);
+
+        P->AddWithWeight(f, 1.0f/float(valence));
+
+        float c0 = 0.5f*cosf((float(2*M_PI) * float(i)/float(valence)))
+                 + 0.5f*cosf((float(2*M_PI) * float(ip)/float(valence)));
+        float c1 = 0.5f*sinf((float(2*M_PI) * float(i)/float(valence)))
+                 + 0.5f*sinf((float(2*M_PI) * float(ip)/float(valence)));
+        e0.AddWithWeight(f, c0*ef);
+        e1.AddWithWeight(f, c1*ef);
+    }
+
+    *Ep = *P;
+    Ep->AddWithWeight(e0, cosf((float(2*M_PI) * float(start)/float(valence))));
+    Ep->AddWithWeight(e1, sinf((float(2*M_PI) * float(start)/float(valence))));
+
+    *Em = *P;
+    Em->AddWithWeight(e0, cosf((float(2*M_PI) * float(prev)/float(valence))));
+    Em->AddWithWeight(e1, sinf((float(2*M_PI) * float(prev)/float(valence))));
+}
+
+ConstIndexArray
+EndCapBSplineBasisPatchFactory::getPatchPoints(
+    Vtr::internal::Level const *level, Index thisFace,
+    Index extraOrdinaryIndex, ConstIndexArray facePoints,
+    int levelVertOffset) {
+
+    //  Fast B-spline endcap construction.
+    //
+    //  This function assumes the patch is not on boundary
+    //  and it contains only 1 extraordinary vertex.
+    //  The extraordinary vertex can be located at any one of the
+    //  0-ring quad corners.
+    //
+    //  B-Spline control point gathering indices
+    //
+    //     [5]   (4)---(15)--(14)    0 : extraordinary vertex
+    //            |     |     |
+    //            |     |     |      1,2,3,9,10,11,12,13 :
+    //     (6)----0-----3-----13       B-Spline control points, gathered by
+    //      |     |     |     |         traversing topology
+    //      |     |     |     |
+    //     (7)----1-----2-----12     (5) :
+    //      |     |     |     |        Fitted patch point (from limit position)
+    //      |     |     |     |
+    //     (8)----9-----10----11     (4),(6),(7),(8),(14),(15) :
+    //                                 Fitted patch points
+    //                                   (from limit tangents and bezier CP)
+    //
+    static int const rotation[4][16] = {
+        /*= 0 ring =*/ /* ================ 1 ring ================== */
+        { 0, 1, 2, 3,    4,  5,  6,  7,  8,  9, 10, 11, 12, 13 ,14, 15},
+        { 1, 2, 3, 0,    7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  6},
+        { 2, 3, 0, 1,   10, 11, 12, 13, 14, 15,  4,  5,  6,  7,  8,  9},
+        { 3, 0, 1, 2,   13, 14, 15,  4,  5,  6,  7,  8,  9, 10, 11, 12}};
+
+    int maxvalence = level->getMaxValence();
+    int stencilCapacity = 2*maxvalence + 16;
+    GregoryBasis::Point P(stencilCapacity), Em(stencilCapacity), Ep(stencilCapacity);
+
+    computeLimitStencils(level, facePoints, extraOrdinaryIndex, &P, &Em, &Ep);
+    P.OffsetIndices(levelVertOffset);
+    Em.OffsetIndices(levelVertOffset);
+    Ep.OffsetIndices(levelVertOffset);
+
+    // returning patch indices (a mix of cage vertices and patch points)
+    int patchPoints[16];
+
+    // first, we traverse the topology to gather 15 vertices. This process is
+    // similar to Vtr::Level::gatherQuadRegularInteriorPatchPoints
+    int pointIndex = 0;
+    int vid = extraOrdinaryIndex;
+
+    // 0-ring
+    patchPoints[pointIndex++] = facePoints[0] + levelVertOffset;
+    patchPoints[pointIndex++] = facePoints[1] + levelVertOffset;
+    patchPoints[pointIndex++] = facePoints[2] + levelVertOffset;
+    patchPoints[pointIndex++] = facePoints[3] + levelVertOffset;
+
+    // 1-ring
+    ConstIndexArray thisFaceVerts = level->getFaceVertices(thisFace);
+    for (int i = 0; i < 4; ++i) {
+        Index v = thisFaceVerts[i];
+        ConstIndexArray      vFaces   = level->getVertexFaces(v);
+        ConstLocalIndexArray vInFaces = level->getVertexFaceLocalIndices(v);
+
+        if (i != vid) {
+            // regular corner
+            int thisFaceInVFaces = vFaces.FindIndexIn4Tuple(thisFace);
+
+            int intFaceInVFaces  = (thisFaceInVFaces + 2) & 0x3;
+            Index intFace    = vFaces[intFaceInVFaces];
+            int   vInIntFace = vInFaces[intFaceInVFaces];
+            ConstIndexArray facePoints = level->getFaceVertices(intFace);
+
+            patchPoints[pointIndex++] =
+                facePoints[(vInIntFace + 1)&3] + levelVertOffset;
+            patchPoints[pointIndex++] =
+                facePoints[(vInIntFace + 2)&3] + levelVertOffset;
+            patchPoints[pointIndex++] =
+                facePoints[(vInIntFace + 3)&3] + levelVertOffset;
+        } else {
+            // irregular corner
+            int thisFaceInVFaces = vFaces.FindIndex(thisFace);
+            int valence = vFaces.size();
+            {
+                // first
+                int intFaceInVFaces  = (thisFaceInVFaces + 1) % valence;
+                Index intFace    = vFaces[intFaceInVFaces];
+                int   vInIntFace = vInFaces[intFaceInVFaces];
+                ConstIndexArray facePoints = level->getFaceVertices(intFace);
+                patchPoints[pointIndex++] =
+                    facePoints[(vInIntFace+3)&3] + levelVertOffset;
+            }
+            {
+                // middle: (n-vertices) needs a limit stencil. skip for now
+                pointIndex++;
+            }
+            {
+                // end
+                int intFaceInVFaces  = (thisFaceInVFaces + (valence-1)) %valence;
+                Index intFace    = vFaces[intFaceInVFaces];
+                int   vInIntFace = vInFaces[intFaceInVFaces];
+                ConstIndexArray facePoints = level->getFaceVertices(intFace);
+                patchPoints[pointIndex++] =
+                    facePoints[(vInIntFace+1)&3] + levelVertOffset;
+
+            }
+        }
+    }
+
+    // stencils for patch points
+    GregoryBasis::Point X5(stencilCapacity),
+        X6(stencilCapacity),
+        X7(stencilCapacity),
+        X8(stencilCapacity),
+        X4(stencilCapacity),
+        X15(stencilCapacity),
+        X14(stencilCapacity);
+
+    // limit tangent : Em
+    // X6 = 1/3 * ( 36Em - 16P0 - 8P1 - 2P2 - 4P3 -  P6 - 2P7)
+    // X7 = 1/3 * (-18Em +  8P0 + 4P1 +  P2 + 2P3 + 2P6 + 4P7)
+    // X8 = X6 + (P8-P6)
+    X6.AddWithWeight(Em,                             36.0f/3.0f);
+    X6.AddWithWeight(patchPoints[rotation[vid][0]], -16.0f/3.0f);
+    X6.AddWithWeight(patchPoints[rotation[vid][1]],  -8.0f/3.0f);
+    X6.AddWithWeight(patchPoints[rotation[vid][2]],  -2.0f/3.0f);
+    X6.AddWithWeight(patchPoints[rotation[vid][3]],  -4.0f/3.0f);
+    X6.AddWithWeight(patchPoints[rotation[vid][6]],  -1.0f/3.0f);
+    X6.AddWithWeight(patchPoints[rotation[vid][7]],  -2.0f/3.0f);
+
+    X7.AddWithWeight(Em,                            -18.0f/3.0f);
+    X7.AddWithWeight(patchPoints[rotation[vid][0]],   8.0f/3.0f);
+    X7.AddWithWeight(patchPoints[rotation[vid][1]],   4.0f/3.0f);
+    X7.AddWithWeight(patchPoints[rotation[vid][2]],   1.0f/3.0f);
+    X7.AddWithWeight(patchPoints[rotation[vid][3]],   2.0f/3.0f);
+    X7.AddWithWeight(patchPoints[rotation[vid][6]],   2.0f/3.0f);
+    X7.AddWithWeight(patchPoints[rotation[vid][7]],   4.0f/3.0f);
+
+    X8 = X6;
+    X8.AddWithWeight(patchPoints[rotation[vid][8]], 1.0f);
+    X8.AddWithWeight(patchPoints[rotation[vid][6]], -1.0f);
+
+    // limit tangent : Ep
+    // X4  = 1/3 * ( 36EP - 16P0 - 4P1 - 2P15 - 2P2 - 8P3 -  P4)
+    // X15 = 1/3 * (-18EP +  8P0 + 2P1 + 4P15 +  P2 + 4P3 + 2P4)
+    // X14 = X4  + (P14 - P4)
+    X4.AddWithWeight(Ep,                             36.0f/3.0f);
+    X4.AddWithWeight(patchPoints[rotation[vid][0]], -16.0f/3.0f);
+    X4.AddWithWeight(patchPoints[rotation[vid][1]],  -4.0f/3.0f);
+    X4.AddWithWeight(patchPoints[rotation[vid][2]],  -2.0f/3.0f);
+    X4.AddWithWeight(patchPoints[rotation[vid][3]],  -8.0f/3.0f);
+    X4.AddWithWeight(patchPoints[rotation[vid][4]],  -1.0f/3.0f);
+    X4.AddWithWeight(patchPoints[rotation[vid][15]], -2.0f/3.0f);
+
+    X15.AddWithWeight(Ep,                            -18.0f/3.0f);
+    X15.AddWithWeight(patchPoints[rotation[vid][0]],   8.0f/3.0f);
+    X15.AddWithWeight(patchPoints[rotation[vid][1]],   2.0f/3.0f);
+    X15.AddWithWeight(patchPoints[rotation[vid][2]],   1.0f/3.0f);
+    X15.AddWithWeight(patchPoints[rotation[vid][3]],   4.0f/3.0f);
+    X15.AddWithWeight(patchPoints[rotation[vid][4]],   2.0f/3.0f);
+    X15.AddWithWeight(patchPoints[rotation[vid][15]],  4.0f/3.0f);
+
+    X14 = X4;
+    X14.AddWithWeight(patchPoints[rotation[vid][14]],  1.0f);
+    X14.AddWithWeight(patchPoints[rotation[vid][4]],  -1.0f);
+
+    // limit corner (16th free vert)
+    // X5 = 36LP - 16P0 - 4(P1 + P3 + P4 + P6) - (P2 + P7 + P15)
+    X5.AddWithWeight(P,                              36.0f);
+    X5.AddWithWeight(patchPoints[rotation[vid][0]], -16.0f);
+    X5.AddWithWeight(patchPoints[rotation[vid][1]],  -4.0f);
+    X5.AddWithWeight(patchPoints[rotation[vid][3]],  -4.0f);
+    X5.AddWithWeight(X4,                             -4.0f);
+    X5.AddWithWeight(X6,                             -4.0f);
+    X5.AddWithWeight(patchPoints[rotation[vid][2]],  -1.0f);
+    X5.AddWithWeight(X7,                             -1.0f);
+    X5.AddWithWeight(X15,                            -1.0f);
+
+    //     [5]   (4)---(15)--(14)    0 : extraordinary vertex
+    //            |     |     |
+    //            |     |     |      1,2,3,9,10,11,12,13 :
+    //     (6)----0-----3-----13       B-Spline control points, gathered by
+    //      |     |     |     |         traversing topology
+    //      |     |     |     |
+    //     (7)----1-----2-----12     (5) :
+    //      |     |     |     |        Fitted patch point (from limit position)
+    //      |     |     |     |
+    //     (8)----9-----10----11     (4),(6),(7),(8),(14),(15) :
+    //
+    // patch point stencils will be stored in this order
+    // (Em) 6, 7, 8, (Ep) 4, 15, 14, (P) 5
+
+    int offset = _refiner->GetNumVerticesTotal();
+
+    GregoryBasis::Point V0, V1, V3;
+    V0.AddWithWeight(facePoints[vid] + levelVertOffset, 1.0f);
+    V1.AddWithWeight(facePoints[(vid+1)&3] + levelVertOffset, 1.0f);
+    V3.AddWithWeight(facePoints[(vid+3)&3] + levelVertOffset, 1.0f);
+
+    // push back to stencils;
+    patchPoints[3* vid + 6]        = (_numVertices++) + offset;
+    _vertexStencils.push_back(X6);
+    _varyingStencils.push_back(V0);
+
+    patchPoints[3*((vid+1)%4) + 4] = (_numVertices++) + offset;
+    _vertexStencils.push_back(X7);
+    _varyingStencils.push_back(V1);
+
+    patchPoints[3*((vid+1)%4) + 5] = (_numVertices++) + offset;
+    _vertexStencils.push_back(X8);
+    _varyingStencils.push_back(V1);
+
+    patchPoints[3* vid + 4]        = (_numVertices++) + offset;
+    _vertexStencils.push_back(X4);
+    _varyingStencils.push_back(V0);
+
+    patchPoints[3*((vid+3)%4) + 6] = (_numVertices++) + offset;
+    _vertexStencils.push_back(X15);
+    _varyingStencils.push_back(V3);
+
+    patchPoints[3*((vid+3)%4) + 5] = (_numVertices++) + offset;
+    _vertexStencils.push_back(X14);
+    _varyingStencils.push_back(V3);
+
+    patchPoints[3*vid + 5]         = (_numVertices++) + offset;
+    _vertexStencils.push_back(X5);
+    _varyingStencils.push_back(V0);
+
+    // reorder into UV row-column
+    static int const permuteRegular[16] =
+        { 5, 6, 7, 8, 4, 0, 1, 9, 15, 3, 2, 10, 14, 13, 12, 11 };
+    for (int i = 0; i < 16; ++i) {
+        _patchPoints.push_back(patchPoints[permuteRegular[i]]);
+    }
     ++_numPatches;
     return ConstIndexArray(&_patchPoints[(_numPatches-1)*16], 16);
 }
diff --git a/opensubdiv/far/endCapBSplineBasisPatchFactory.h b/opensubdiv/far/endCapBSplineBasisPatchFactory.h
index 9f54a190..28c27160 100644
--- a/opensubdiv/far/endCapBSplineBasisPatchFactory.h
+++ b/opensubdiv/far/endCapBSplineBasisPatchFactory.h
@@ -91,6 +91,22 @@ public:
     }
 
 private:
+    ConstIndexArray getPatchPointsFromGregoryBasis(
+        Vtr::internal::Level const * level, Index thisFace,
+        ConstIndexArray facePoints,
+        int levelVertOffset);
+
+    ConstIndexArray getPatchPoints(
+        Vtr::internal::Level const *level, Index thisFace,
+        Index extraOrdinaryIndex, ConstIndexArray facePoints,
+        int levelVertOffset);
+
+    void computeLimitStencils(
+        Vtr::internal::Level const *level,
+        ConstIndexArray facePoints, int vid,
+        GregoryBasis::Point *P, GregoryBasis::Point *Ep, GregoryBasis::Point *Em);
+
+
     TopologyRefiner const *_refiner;
     GregoryBasis::PointsVector _vertexStencils;
     GregoryBasis::PointsVector _varyingStencils;
diff --git a/opensubdiv/far/endCapGregoryBasisPatchFactory.cpp b/opensubdiv/far/endCapGregoryBasisPatchFactory.cpp
index a884350a..1cae6856 100644
--- a/opensubdiv/far/endCapGregoryBasisPatchFactory.cpp
+++ b/opensubdiv/far/endCapGregoryBasisPatchFactory.cpp
@@ -47,6 +47,15 @@ EndCapGregoryBasisPatchFactory::EndCapGregoryBasisPatchFactory(
 
     // Sanity check: the mesh must be adaptively refined
     assert(not refiner.IsUniform());
+
+    // Reserve the patch point stencils. Ideally topology refiner
+    // would have an API to return how many endcap patches will be required.
+    // Instead we conservatively estimate by the number of patches at the
+    // finest level.
+    int numMaxLevelFaces = refiner.GetLevel(refiner.GetMaxLevel()).GetNumFaces();
+
+    _vertexStencils.reserve(numMaxLevelFaces*20);
+    _varyingStencils.reserve(numMaxLevelFaces*20);
 }
 
 //
diff --git a/opensubdiv/far/gregoryBasis.cpp b/opensubdiv/far/gregoryBasis.cpp
index 6ba50fc3..a80dbfc5 100644
--- a/opensubdiv/far/gregoryBasis.cpp
+++ b/opensubdiv/far/gregoryBasis.cpp
@@ -36,57 +36,6 @@ namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
 
 namespace Far {
-// Builds a table of local indices pairs for each vertex of the patch.
-//
-//            o
-//         N0 |
-//            |              ....
-//            |              .... : Gregory patch
-//   o ------ o ------ o     ....
-// N1       V | .... M3
-//            | .......
-//            | .......
-//            o .......
-//          N2
-//
-// [...] [N2 - N3] [...]
-//
-// Each value pair is composed of 2 index values in range [0-4[ pointing
-// to the 2 neighbor vertices of the vertex 'V' belonging to the Gregory patch.
-// Neighbor ordering is valence CCW and must match the winding of the 1-ring
-// vertices.
-//
-static void
-getQuadOffsets(Vtr::internal::Level const & level, Vtr::Index fIndex,
-    Vtr::Index offsets[], int fvarChannel=-1) {
-
-    Far::ConstIndexArray fPoints = (fvarChannel<0) ?
-        level.getFaceVertices(fIndex) :
-            level.getFaceFVarValues(fIndex, fvarChannel);
-    assert(fPoints.size()==4);
-
-    for (int i = 0; i < 4; ++i) {
-
-        Vtr::Index      vIndex = fPoints[i];
-        Vtr::ConstIndexArray vFaces = level.getVertexFaces(vIndex),
-                             vEdges = level.getVertexEdges(vIndex);
-
-        int thisFaceInVFaces = -1;
-        for (int j = 0; j < vFaces.size(); ++j) {
-            if (fIndex == vFaces[j]) {
-                thisFaceInVFaces = j;
-                break;
-            }
-        }
-        assert(thisFaceInVFaces != -1);
-
-        // we have to use the number of incident edges to modulo the local index
-        // because there could be 2 consecutive edges in the face belonging to
-        // the Gregory patch.
-        offsets[i*2+0] = thisFaceInVFaces;
-        offsets[i*2+1] = (thisFaceInVFaces + 1)%vEdges.size();
-    }
-}
 
 int
 GregoryBasis::ProtoBasis::GetNumElements() const {
@@ -153,6 +102,8 @@ GregoryBasis::ProtoBasis::ProtoBasis(
     Vtr::internal::Level const & level, Index faceIndex,
     int levelVertOffset, int fvarChannel) {
 
+    // XXX: This function is subject to refactor in 3.1
+
     Vtr::ConstIndexArray facePoints = (fvarChannel<0) ?
         level.getFaceVertices(faceIndex) :
             level.getFaceFVarValues(faceIndex, fvarChannel);
@@ -162,27 +113,45 @@ GregoryBasis::ProtoBasis::ProtoBasis(
         valences[4],
         zerothNeighbors[4];
 
-    Vtr::internal::StackBuffer<Index,40> manifoldRing((maxvalence+2)*2);
+    // XXX: a temporary hack for the performance issue
+    // ensure Point has a capacity for the neighborhood of
+    // 2 extraordinary verts + 2 regular verts
+    // worst case: n-valence verts at a corner of n-gon.
+    int stencilCapacity =
+        4/*0-ring*/ + 2*(2*(maxvalence-2)/*1-ring around extraordinaries*/
+                         + 2/*1-ring around regulars, excluding shared ones*/);
 
-    Vtr::internal::StackBuffer<Point,16> f(maxvalence);
-    Vtr::internal::StackBuffer<Point,64> r(maxvalence*4);
+    Point e0[4], e1[4];
+    for (int i = 0; i < 4; ++i) {
+        P[i].Clear(stencilCapacity);
+        e0[i].Clear(stencilCapacity);
+        e1[i].Clear(stencilCapacity);
+        V[i].Clear(1);
+    }
 
-    Point e0[4], e1[4], org[4];
+    Vtr::internal::StackBuffer<Index, 40> manifoldRings[4];
+    manifoldRings[0].SetSize(maxvalence*2);
+    manifoldRings[1].SetSize(maxvalence*2);
+    manifoldRings[2].SetSize(maxvalence*2);
+    manifoldRings[3].SetSize(maxvalence*2);
+
+    Vtr::internal::StackBuffer<Point, 10> f(maxvalence);
+    Vtr::internal::StackBuffer<Point, 40> r(maxvalence*4);
+
+    // the first phase
 
     for (int vid=0; vid<4; ++vid) {
-
-        org[vid] = facePoints[vid];
         // save for varying stencils
-        V[vid] = facePoints[vid];
+        V[vid].AddWithWeight(facePoints[vid], 1.0f);
 
         int ringSize =
             level.gatherQuadRegularRingAroundVertex(
-                facePoints[vid], manifoldRing, fvarChannel);
+                facePoints[vid], manifoldRings[vid], fvarChannel);
 
         int valence;
         if (ringSize & 1) {
             // boundary vertex
-            manifoldRing[ringSize] = manifoldRing[ringSize-1];
+            manifoldRings[vid][ringSize] = manifoldRings[vid][ringSize-1];
             ++ringSize;
             valence = -ringSize/2;
         } else {
@@ -196,21 +165,19 @@ GregoryBasis::ProtoBasis::ProtoBasis(
               zerothNeighbor=0,
               ibefore=0;
 
-        Point pos(facePoints[vid]);
-
         for (int i=0; i<ivalence; ++i) {
 
             Index im = (i+ivalence-1)%ivalence,
                   ip = (i+1)%ivalence;
 
-            Index idx_neighbor = (manifoldRing[2*i + 0]),
-                  idx_diagonal = (manifoldRing[2*i + 1]),
-                  idx_neighbor_p = (manifoldRing[2*ip + 0]),
-                  idx_neighbor_m = (manifoldRing[2*im + 0]),
-                  idx_diagonal_m = (manifoldRing[2*im + 1]);
+            Index idx_neighbor = (manifoldRings[vid][2*i + 0]),
+                  idx_diagonal = (manifoldRings[vid][2*i + 1]),
+                  idx_neighbor_p = (manifoldRings[vid][2*ip + 0]),
+                  idx_neighbor_m = (manifoldRings[vid][2*im + 0]),
+                  idx_diagonal_m = (manifoldRings[vid][2*im + 1]);
 
             bool boundaryNeighbor = (level.getVertexEdges(idx_neighbor).size() >
-                level.getVertexFaces(idx_neighbor).size());
+                                     level.getVertexFaces(idx_neighbor).size());
 
             if (fvarChannel>=0) {
                 // XXXX manuelk need logic to check for boundary in fvar
@@ -232,21 +199,22 @@ GregoryBasis::ProtoBasis::ProtoBasis(
                 }
             }
 
-            Point neighbor(idx_neighbor),
-                  diagonal(idx_diagonal),
-                  neighbor_p(idx_neighbor_p),
-                  neighbor_m(idx_neighbor_m),
-                  diagonal_m(idx_diagonal_m);
+            float d = float(ivalence)+5.0f;
+            f[i].Clear(4);
+            f[i].AddWithWeight(facePoints[vid], float(ivalence)/d);
+            f[i].AddWithWeight(idx_neighbor_p,  2.0f/d);
+            f[i].AddWithWeight(idx_neighbor,    2.0f/d);
+            f[i].AddWithWeight(idx_diagonal,    1.0f/d);
+            P[vid].AddWithWeight(f[i], 1.0f/float(ivalence));
 
-            f[i] = (pos*float(ivalence) + (neighbor_p+neighbor)*2.0f + diagonal) / (float(ivalence)+5.0f);
-
-            P[vid] += f[i];
-
-            r[vid*maxvalence+i] = (neighbor_p-neighbor_m)/3.0f + (diagonal-diagonal_m)/6.0f;
+            int rid = vid * maxvalence + i;
+            r[rid].Clear(4);
+            r[rid].AddWithWeight(idx_neighbor_p,  1.0f/3.0f);
+            r[rid].AddWithWeight(idx_neighbor_m, -1.0f/3.0f);
+            r[rid].AddWithWeight(idx_diagonal,    1.0f/6.0f);
+            r[rid].AddWithWeight(idx_diagonal_m, -1.0f/6.0f);
         }
 
-        P[vid] /= float(ivalence);
-
         zerothNeighbors[vid] = zerothNeighbor;
         if (currentNeighbor == 1) {
             boundaryEdgeNeighbors[1] = boundaryEdgeNeighbors[0];
@@ -254,24 +222,27 @@ GregoryBasis::ProtoBasis::ProtoBasis(
 
         for (int i=0; i<ivalence; ++i) {
             int im = (i+ivalence-1)%ivalence;
-            Point e = (f[i]+f[im])*0.5f;
-            e0[vid] += e * csf(ivalence-3, 2*i);
-            e1[vid] += e * csf(ivalence-3, 2*i+1);
+            float c0 = 0.5f * csf(ivalence-3, 2*i);
+            float c1 = 0.5f * csf(ivalence-3, 2*i+1);
+            e0[vid].AddWithWeight(f[i ], c0);
+            e0[vid].AddWithWeight(f[im], c0);
+            e1[vid].AddWithWeight(f[i ], c1);
+            e1[vid].AddWithWeight(f[im], c1);
         }
 
         float ef = computeCoefficient(ivalence);
         e0[vid] *= ef;
         e1[vid] *= ef;
 
-        if (valence<0) {
-
-            Point b0(boundaryEdgeNeighbors[0]),
-                  b1(boundaryEdgeNeighbors[1]);
-
+        // Boundary gregory case:
+        if (valence < 0) {
+            P[vid].Clear(stencilCapacity);
             if (ivalence>2) {
-                P[vid] = (b0 + b1 + pos*4.0f)/6.0f;
+                P[vid].AddWithWeight(boundaryEdgeNeighbors[0], 1.0f/6.0f);
+                P[vid].AddWithWeight(boundaryEdgeNeighbors[1], 1.0f/6.0f);
+                P[vid].AddWithWeight(facePoints[vid], 4.0f/6.0f);
             } else {
-                P[vid] = pos;
+                P[vid].AddWithWeight(facePoints[vid], 1.0f);
             }
             float k = float(float(ivalence) - 1.0f);    //k is the number of faces
             float c = cosf(float(M_PI)/k);
@@ -280,10 +251,17 @@ GregoryBasis::ProtoBasis::ProtoBasis(
             float alpha_0k = -((1.0f+2.0f*c)*sqrtf(1.0f+c))/((3.0f*k+c)*sqrtf(1.0f-c));
             float beta_0 = s/(3.0f*k + c);
 
-            Point diagonal(manifoldRing[2*zerothNeighbor + 1]);
+            int idx_diagonal = manifoldRings[vid][2*zerothNeighbor + 1];
 
-            e0[vid] = (b0 - b1)/6.0f;
-            e1[vid] = pos*gamma + diagonal*beta_0 + (b0 + b1)*alpha_0k;
+            e0[vid].Clear(stencilCapacity);
+            e0[vid].AddWithWeight(boundaryEdgeNeighbors[0],  1.0f/6.0f);
+            e0[vid].AddWithWeight(boundaryEdgeNeighbors[1], -1.0f/6.0f);
+
+            e1[vid].Clear(stencilCapacity);
+            e1[vid].AddWithWeight(facePoints[vid],           gamma);
+            e1[vid].AddWithWeight(idx_diagonal,              beta_0);
+            e1[vid].AddWithWeight(boundaryEdgeNeighbors[0],  alpha_0k);
+            e1[vid].AddWithWeight(boundaryEdgeNeighbors[1],  alpha_0k);
 
             for (int x=1; x<ivalence-1; ++x) {
 
@@ -292,50 +270,68 @@ GregoryBasis::ProtoBasis::ProtoBasis(
                 float alpha = (4.0f*sinf((float(M_PI) * float(x))/k))/(3.0f*k+c),
                       beta = (sinf((float(M_PI) * float(x))/k) + sinf((float(M_PI) * float(x+1))/k))/(3.0f*k+c);
 
-                Index idx_neighbor = manifoldRing[2*curri + 0],
-                      idx_diagonal = manifoldRing[2*curri + 1];
+                Index idx_neighbor = manifoldRings[vid][2*curri + 0],
+                      idx_diagonal = manifoldRings[vid][2*curri + 1];
 
-                Point p_neighbor(idx_neighbor),
-                      p_diagonal(idx_diagonal);
-
-                e1[vid] += p_neighbor*alpha + p_diagonal*beta;
+                e1[vid].AddWithWeight(idx_neighbor, alpha);
+                e1[vid].AddWithWeight(idx_diagonal, beta);
             }
-            e1[vid] /= 3.0f;
+            e1[vid] *= 1.0f/3.0f;
         }
     }
 
-    Index quadOffsets[8];
-    getQuadOffsets(level, faceIndex, quadOffsets, fvarChannel);
+    // Second phase: compute Ep, Em, Fp and Fm for each of the four corners
 
     for (int vid=0; vid<4; ++vid) {
 
-        int n = abs(valences[vid]),
-            ivalence = n;
+        int n = abs(valences[vid]);
+        int ivalence = n;
 
         int ip = (vid+1)%4,
             im = (vid+3)%4,
             np = abs(valences[ip]),
             nm = abs(valences[im]);
 
-        Index start = quadOffsets[vid*2+0],
-              prev = quadOffsets[vid*2+1],
-              start_m = quadOffsets[im*2],
-              prev_p = quadOffsets[ip*2+1];
+        Index start = -1, prev = -1, start_m = -1, prev_p = -1;
+        for (int i = 0; i < n; ++i) {
+            if (manifoldRings[vid][i*2] == facePoints[ip])
+                start = i;
+            if (manifoldRings[vid][i*2] == facePoints[im])
+                prev = i;
+        }
+        for (int i = 0; i < np; ++i) {
+            if (manifoldRings[ip][i*2] == facePoints[vid]) {
+                prev_p = i;
+                break;
+            }
+        }
+        for (int i = 0; i < nm; ++i) {
+            if (manifoldRings[im][i*2] == facePoints[vid]) {
+                start_m = i;
+                break;
+            }
+        }
+        assert(start != -1 && prev != -1 && start_m != -1 && prev_p != -1);
 
-        Point Em_ip, Ep_im;
+        Point Em_ip = P[ip];
+        Point Ep_im = P[im];
 
         if (valences[ip]<-2) {
             Index j = (np + prev_p - zerothNeighbors[ip]) % np;
-            Em_ip = P[ip] + e0[ip]*cosf((float(M_PI)*j)/float(np-1)) + e1[ip]*sinf((float(M_PI)*j)/float(np-1));
+            Em_ip.AddWithWeight(e0[ip], cosf((float(M_PI)*j)/float(np-1)));
+            Em_ip.AddWithWeight(e1[ip], sinf((float(M_PI)*j)/float(np-1)));
         } else {
-            Em_ip = P[ip] + e0[ip]*csf(np-3,2*prev_p) + e1[ip]*csf(np-3,2*prev_p+1);
+            Em_ip.AddWithWeight(e0[ip], csf(np-3, 2*prev_p));
+            Em_ip.AddWithWeight(e1[ip], csf(np-3, 2*prev_p+1));
         }
 
         if (valences[im]<-2) {
             Index j = (nm + start_m - zerothNeighbors[im]) % nm;
-            Ep_im = P[im] + e0[im]*cosf((float(M_PI)*j)/float(nm-1)) + e1[im]*sinf((float(M_PI)*j)/float(nm-1));
+            Ep_im.AddWithWeight(e0[im], cosf((float(M_PI)*j)/float(nm-1)));
+            Ep_im.AddWithWeight(e1[im], sinf((float(M_PI)*j)/float(nm-1)));
         } else {
-            Ep_im = P[im] + e0[im]*csf(nm-3,2*start_m) + e1[im]*csf(nm-3,2*start_m+1);
+            Ep_im.AddWithWeight(e0[im], csf(nm-3, 2*start_m));
+            Ep_im.AddWithWeight(e1[im], csf(nm-3, 2*start_m+1));
         }
 
         if (valences[vid] < 0) {
@@ -355,12 +351,25 @@ GregoryBasis::ProtoBasis::ProtoBasis(
             float s1 = 3.0f - 2.0f*csf(n-3,2)-csf(np-3,2),
                   s2 = 2.0f*csf(n-3,2),
                   s3 = 3.0f -2.0f*cosf(2.0f*float(M_PI)/float(n)) - cosf(2.0f*float(M_PI)/float(nm));
+            Ep[vid] = P[vid];
+            Ep[vid].AddWithWeight(e0[vid], csf(n-3, 2*start));
+            Ep[vid].AddWithWeight(e1[vid], csf(n-3, 2*start +1));
 
-            Ep[vid] = P[vid] + e0[vid]*csf(n-3, 2*start) + e1[vid]*csf(n-3, 2*start +1);
-            Em[vid] = P[vid] + e0[vid]*csf(n-3, 2*prev ) + e1[vid]*csf(n-3, 2*prev + 1);
-            Fp[vid] = (P[vid]*csf(np-3,2) + Ep[vid]*s1 + Em_ip*s2 + rp[start])/3.0f;
-            Fm[vid] = (P[vid]*csf(nm-3,2) + Em[vid]*s3 + Ep_im*s2 - rp[prev])/3.0f;
+            Em[vid] = P[vid];
+            Em[vid].AddWithWeight(e0[vid], csf(n-3, 2*prev ));
+            Em[vid].AddWithWeight(e1[vid], csf(n-3, 2*prev + 1));
 
+            Fp[vid].Clear(stencilCapacity);
+            Fp[vid].AddWithWeight(P[vid],    csf(np-3, 2)/3.0f);
+            Fp[vid].AddWithWeight(Ep[vid],   s1/3.0f);
+            Fp[vid].AddWithWeight(Em_ip,     s2/3.0f);
+            Fp[vid].AddWithWeight(rp[start], 1.0f/3.0f);
+
+            Fm[vid].Clear(stencilCapacity);
+            Fm[vid].AddWithWeight(P[vid],   csf(nm-3, 2)/3.0f);
+            Fm[vid].AddWithWeight(Em[vid],  s3/3.0f);
+            Fm[vid].AddWithWeight(Ep_im,    s2/3.0f);
+            Fm[vid].AddWithWeight(rp[prev], -1.0f/3.0f);
         } else if (valences[vid] < -2) {
 
             Index jp = (ivalence + start - zerothNeighbors[vid]) % ivalence,
@@ -370,24 +379,59 @@ GregoryBasis::ProtoBasis::ProtoBasis(
                   s2 = 2*csf(n-3,2),
                   s3 = 3.0f-2.0f*cosf(2.0f*float(M_PI)/n)-cosf(2.0f*float(M_PI)/nm);
 
-            Ep[vid] = P[vid] + e0[vid]*cosf((float(M_PI)*jp)/float(ivalence-1)) + e1[vid]*sinf((float(M_PI)*jp)/float(ivalence-1));
-            Em[vid] = P[vid] + e0[vid]*cosf((float(M_PI)*jm)/float(ivalence-1)) + e1[vid]*sinf((float(M_PI)*jm)/float(ivalence-1));
-            Fp[vid] = (P[vid]*csf(np-3,2) + Ep[vid]*s1 + Em_ip*s2 + rp[start])/3.0f;
-            Fm[vid] = (P[vid]*csf(nm-3,2) + Em[vid]*s3 + Ep_im*s2 - rp[prev])/3.0f;
+            Ep[vid] = P[vid];
+            Ep[vid].AddWithWeight(e0[vid], cosf((float(M_PI)*jp)/float(ivalence-1)));
+            Ep[vid].AddWithWeight(e1[vid], sinf((float(M_PI)*jp)/float(ivalence-1)));
+
+            Em[vid] = P[vid];
+            Em[vid].AddWithWeight(e0[vid], cosf((float(M_PI)*jm)/float(ivalence-1)));
+            Em[vid].AddWithWeight(e1[vid], sinf((float(M_PI)*jm)/float(ivalence-1)));
+
+            Fp[vid].Clear(stencilCapacity);
+            Fp[vid].AddWithWeight(P[vid],    csf(np-3,2)/3.0f);
+            Fp[vid].AddWithWeight(Ep[vid],   s1/3.0f);
+            Fp[vid].AddWithWeight(Em_ip,     s2/3.0f);
+            Fp[vid].AddWithWeight(rp[start], 1.0f/3.0f);
+
+            Fm[vid].Clear(stencilCapacity);
+            Fm[vid].AddWithWeight(P[vid],   csf(nm-3,2)/3.0f);
+            Fm[vid].AddWithWeight(Em[vid],  s3/3.0f);
+            Fm[vid].AddWithWeight(Ep_im,    s2/3.0f);
+            Fm[vid].AddWithWeight(rp[prev], -1.0f/3.0f);
 
             if (valences[im]<0) {
                 s1=3-2*csf(n-3,2)-csf(np-3,2);
-                Fp[vid] = Fm[vid] = (P[vid]*csf(np-3,2) + Ep[vid]*s1 + Em_ip*s2 + rp[start])/3.0f;
+                Fp[vid].Clear(stencilCapacity);
+                Fp[vid].AddWithWeight(P[vid],    csf(np-3,2)/3.0f);
+                Fp[vid].AddWithWeight(Ep[vid],   s1/3.0f);
+                Fp[vid].AddWithWeight(Em_ip,     s2/3.0f);
+                Fp[vid].AddWithWeight(rp[start], 1.0f/3.0f);
+                Fm[vid] = Fp[vid];
             } else if (valences[ip]<0) {
                 s1 = 3.0f-2.0f*cosf(2.0f*float(M_PI)/n)-cosf(2.0f*float(M_PI)/nm);
-                Fm[vid] = Fp[vid] = (P[vid]*csf(nm-3,2) + Em[vid]*s1 + Ep_im*s2 - rp[prev])/3.0f;
+                Fm[vid].Clear(stencilCapacity);
+                Fm[vid].AddWithWeight(P[vid],   csf(nm-3,2)/3.0f);
+                Fm[vid].AddWithWeight(Em[vid],  s1/3.0f);
+                Fm[vid].AddWithWeight(Ep_im,    s2/3.0f);
+                Fm[vid].AddWithWeight(rp[prev], -1.0f/3.0f);
+                Fp[vid] = Fm[vid];
             }
 
         } else if (valences[vid]==-2) {
+            Ep[vid].Clear(stencilCapacity);
+            Ep[vid].AddWithWeight(facePoints[vid], 2.0f/3.0f);
+            Ep[vid].AddWithWeight(facePoints[ip],  1.0f/3.0f);
 
-            Ep[vid] = (org[vid]*2.0f + org[ip])/3.0f;
-            Em[vid] = (org[vid]*2.0f + org[im])/3.0f;
-            Fp[vid] = Fm[vid] = (org[vid]*4.0f + org[((vid+2)%n)] + org[ip]*2.0f + org[im]*2.0f)/9.0f;
+            Em[vid].Clear(stencilCapacity);
+            Em[vid].AddWithWeight(facePoints[vid], 2.0f/3.0f);
+            Em[vid].AddWithWeight(facePoints[im],  1.0f/3.0f);
+
+            Fp[vid].Clear(stencilCapacity);
+            Fp[vid].AddWithWeight(facePoints[vid],         4.0f/9.0f);
+            Fp[vid].AddWithWeight(facePoints[((vid+2)%4)], 1.0f/9.0f);  // opposite quad corner (n == 2 in this branch, so %n was wrong)
+            Fp[vid].AddWithWeight(facePoints[ip],          2.0f/9.0f);
+            Fp[vid].AddWithWeight(facePoints[im],          2.0f/9.0f);
+            Fm[vid] = Fp[vid];
         }
     }
 
@@ -429,16 +473,7 @@ GregoryBasis::CreateStencilTable(PointsVector const &stencils) {
     float * weights = &stencilTable->_weights[0];
 
     for (int i = 0; i < nStencils; ++i) {
-        GregoryBasis::Point const &src = stencils[i];
-
-        int size = src.GetSize();
-        memcpy(indices, src.GetIndices(), size*sizeof(Index));
-        memcpy(weights, src.GetWeights(), size*sizeof(float));
-        *sizes = size;
-
-        indices += size;
-        weights += size;
-        ++sizes;
+        stencils[i].Copy(&sizes, &indices, &weights);
     }
     stencilTable->generateOffsets();
 
diff --git a/opensubdiv/far/gregoryBasis.h b/opensubdiv/far/gregoryBasis.h
index deff764c..0a622488 100644
--- a/opensubdiv/far/gregoryBasis.h
+++ b/opensubdiv/far/gregoryBasis.h
@@ -26,6 +26,7 @@
 #define OPENSUBDIV3_FAR_GREGORY_BASIS_H
 
 #include "../vtr/level.h"
+#include "../vtr/stackBuffer.h"
 #include "../far/types.h"
 #include "../far/stencilTable.h"
 #include <cstring>
@@ -79,22 +80,15 @@ public:
     //
     class Point {
     public:
-        static const int RESERVED_ENTRY_SIZE = 64;
+        // a capacity of 40 keeps stencils for vertices up to valence 10 on the stack
+        static const int RESERVED_STENCIL_SIZE = 40;
 
-        Point() : _size(0) {
-            _indices.reserve(RESERVED_ENTRY_SIZE);
-            _weights.reserve(RESERVED_ENTRY_SIZE);
-        }
-
-        Point(Vtr::Index idx, float weight = 1.0f) {
-            _indices.reserve(RESERVED_ENTRY_SIZE);
-            _weights.reserve(RESERVED_ENTRY_SIZE);
-            _size = 1;
-            _indices.push_back(idx);
-            _weights.push_back(weight);
+        Point(int stencilCapacity=RESERVED_STENCIL_SIZE) : _size(0) {
+            _stencils.SetSize(stencilCapacity);
         }
 
         Point(Point const & other) {
+            _stencils.SetSize(other._stencils.GetSize());
             *this = other;
         }
 
@@ -102,96 +96,81 @@ public:
             return _size;
         }
 
-        Vtr::Index const * GetIndices() const {
-            return &_indices[0];
+        int GetCapacity() const {
+            return _stencils.GetSize();
         }
 
-        float const * GetWeights() const {
-            return &_weights[0];
+        void Clear(int capacity) {
+            _size = 0;
+            if ((int)_stencils.GetSize() < capacity) {
+                _stencils.SetSize(capacity);
+            }
+        }
+
+        void AddWithWeight(Vtr::Index idx, float weight) {
+            for (int i = 0; i < _size; ++i) {
+                if (_stencils[i].index == idx) {
+                    _stencils[i].weight += weight;
+                    return;
+                }
+            }
+            assert(_size < (int)_stencils.GetSize());
+            _stencils[_size].index = idx;
+            _stencils[_size].weight = weight;
+            ++_size;
+        }
+
+        void AddWithWeight(Point const &src, float weight) {
+            for (int i = 0; i < src._size; ++i) {
+                AddWithWeight(src._stencils[i].index,
+                              src._stencils[i].weight * weight);
+            }
         }
 
         Point & operator = (Point const & other) {
+            Clear(other.GetCapacity());
             _size = other._size;
-            _indices = other._indices;
-            _weights = other._weights;
-            return *this;
-        }
-
-        Point & operator += (Point const & other) {
-            for (int i=0; i<other._size; ++i) {
-                Vtr::Index idx = findIndex(other._indices[i]);
-                _weights[idx] += other._weights[i];
-            }
-            return *this;
-        }
-
-        Point & operator -= (Point const & other) {
-            for (int i=0; i<other._size; ++i) {
-                Vtr::Index idx = findIndex(other._indices[i]);
-                _weights[idx] -= other._weights[i];
+            assert(_size <= (int)_stencils.GetSize());
+            for (int i = 0; i < _size; ++i) {
+                _stencils[i] = other._stencils[i];
             }
             return *this;
         }
 
         Point & operator *= (float f) {
             for (int i=0; i<_size; ++i) {
-                _weights[i] *= f;
+                _stencils[i].weight *= f;
             }
             return *this;
         }
 
-        Point & operator /= (float f) {
-            return (*this)*=(1.0f/f);
-        }
-
-        friend Point operator * (Point const & src, float f) {
-            Point p( src ); return p*=f;
-        }
-
-        friend Point operator / (Point const & src, float f) {
-            Point p( src ); return p*= (1.0f/f);
-        }
-
-        Point operator + (Point const & other) {
-            Point p(*this); return p+=other;
-        }
-
-        Point operator - (Point const & other) {
-            Point p(*this); return p-=other;
-        }
-
         void OffsetIndices(Vtr::Index offset) {
             for (int i=0; i<_size; ++i) {
-                _indices[i] += offset;
+                _stencils[i].index += offset;
             }
         }
 
         void Copy(int ** size, Vtr::Index ** indices, float ** weights) const {
-            memcpy(*indices, &_indices[0], _size*sizeof(Vtr::Index));
-            memcpy(*weights, &_weights[0], _size*sizeof(float));
+            for (int i = 0; i < _size; ++i) {
+                **indices = _stencils[i].index;
+                **weights = _stencils[i].weight;
+                ++(*indices);
+                ++(*weights);
+            }
             **size = _size;
-            *indices += _size;
-            *weights += _size;
             ++(*size);
         }
 
     private:
 
-        int findIndex(Vtr::Index idx) {
-            for (int i=0; i<_size; ++i) {
-                if (_indices[i]==idx) {
-                    return i;
-                }
-            }
-            _indices.push_back(idx);
-            _weights.push_back(0.0f);
-            ++_size;
-            return _size-1;
-        }
-
         int _size;
-        std::vector<Vtr::Index> _indices;
-        std::vector<float> _weights;
+
+        struct Stencil {
+            Vtr::Index index;
+            float weight;
+        };
+
+        Vtr::internal::StackBuffer<Stencil, RESERVED_STENCIL_SIZE> _stencils;
     };
 
     //
diff --git a/opensubdiv/far/patchTableFactory.cpp b/opensubdiv/far/patchTableFactory.cpp
index 32203d94..8d64e6f8 100644
--- a/opensubdiv/far/patchTableFactory.cpp
+++ b/opensubdiv/far/patchTableFactory.cpp
@@ -898,7 +898,10 @@ PatchTableFactory::identifyAdaptivePatches(AdaptiveContext & context) {
             }
 
             //  Identify boundaries for both regular and xordinary patches -- non-manifold
-            //  edges and vertices are interpreted as boundaries for regular patches
+            //  (infinitely sharp) edges and vertices are currently interpreted as boundaries
+            //  for regular patches, though some non-manifold cases will eventually require
+            //  an irregular patch or an extrapolated boundary patch instead.
+            //
             if (hasBoundaryVertex or hasNonManifoldVertex) {
                 Vtr::ConstIndexArray fEdges = level->getFaceEdges(faceIndex);
 
@@ -911,6 +914,27 @@ PatchTableFactory::identifyAdaptivePatches(AdaptiveContext & context) {
                                          ((level->getEdgeTag(fEdges[1])._nonManifold) << 1) |
                                          ((level->getEdgeTag(fEdges[2])._nonManifold) << 2) |
                                          ((level->getEdgeTag(fEdges[3])._nonManifold) << 3);
+
+                    //  Other than non-manifold edges, non-manifold vertices that were made
+                    //  sharp should also trigger new "boundary" edges for the sharp corner
+                    //  patches introduced in these cases.
+                    //
+                    if (level->getVertexTag(fVerts[0])._nonManifold &&
+                        level->getVertexTag(fVerts[0])._infSharp) {
+                        nonManEdgeMask |= (1 << 0) | (1 << 3);
+                    }
+                    if (level->getVertexTag(fVerts[1])._nonManifold &&
+                        level->getVertexTag(fVerts[1])._infSharp) {
+                        nonManEdgeMask |= (1 << 1) | (1 << 0);
+                    }
+                    if (level->getVertexTag(fVerts[2])._nonManifold &&
+                        level->getVertexTag(fVerts[2])._infSharp) {
+                        nonManEdgeMask |= (1 << 2) | (1 << 1);
+                    }
+                    if (level->getVertexTag(fVerts[3])._nonManifold &&
+                        level->getVertexTag(fVerts[3])._infSharp) {
+                        nonManEdgeMask |= (1 << 3) | (1 << 2);
+                    }
                     boundaryEdgeMask |= nonManEdgeMask;
                 }
 
diff --git a/opensubdiv/osd/glComputeEvaluator.cpp b/opensubdiv/osd/glComputeEvaluator.cpp
index eb3c4ca2..ace3acbc 100644
--- a/opensubdiv/osd/glComputeEvaluator.cpp
+++ b/opensubdiv/osd/glComputeEvaluator.cpp
@@ -114,12 +114,6 @@ GLComputeEvaluator::GLComputeEvaluator() : _workGroupSize(64) {
 }
 
 GLComputeEvaluator::~GLComputeEvaluator() {
-    if (_stencilKernel.program) {
-        glDeleteProgram(_stencilKernel.program);
-    }
-    if (_patchKernel.program) {
-        glDeleteProgram(_patchKernel.program);
-    }
 }
 
 static GLuint
diff --git a/opensubdiv/osd/glPatchTable.cpp b/opensubdiv/osd/glPatchTable.cpp
index 72a57dd0..fba630f2 100644
--- a/opensubdiv/osd/glPatchTable.cpp
+++ b/opensubdiv/osd/glPatchTable.cpp
@@ -70,12 +70,11 @@ GLPatchTable::allocate(Far::PatchTable const *farPatchTable) {
                         patchTable.GetPatchArrayBuffer() + numPatchArrays);
 
     // copy index buffer
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, _patchIndexBuffer);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER,
+    glBindBuffer(GL_ARRAY_BUFFER, _patchIndexBuffer);
+    glBufferData(GL_ARRAY_BUFFER,
                  indexSize * sizeof(GLint),
                  patchTable.GetPatchIndexBuffer(),
                  GL_STATIC_DRAW);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
 
     // copy patchparam buffer
     glBindBuffer(GL_ARRAY_BUFFER, _patchParamBuffer);
@@ -89,17 +88,8 @@ GLPatchTable::allocate(Far::PatchTable const *farPatchTable) {
     glGenTextures(1, &_patchIndexTexture);
     glGenTextures(1, &_patchParamTexture);
 
-    GLuint buffer;
-    glGenBuffers(1, &buffer);
-    glBindBuffer(GL_ARRAY_BUFFER, buffer);
-    glBufferData(GL_ARRAY_BUFFER,
-                 indexSize * sizeof(GLint),
-                 patchTable.GetPatchIndexBuffer(),
-                 GL_STATIC_DRAW);
-
     glBindTexture(GL_TEXTURE_BUFFER, _patchIndexTexture);
-//    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32I, _patchIndexBuffer);
-    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32I, buffer);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32I, _patchIndexBuffer);
 
     glBindTexture(GL_TEXTURE_BUFFER, _patchParamTexture);
     glTexBuffer(GL_TEXTURE_BUFFER, GL_RGB32I, _patchParamBuffer);
diff --git a/opensubdiv/osd/glslComputeKernel.glsl b/opensubdiv/osd/glslComputeKernel.glsl
index 1ee45761..451d46d3 100644
--- a/opensubdiv/osd/glslComputeKernel.glsl
+++ b/opensubdiv/osd/glslComputeKernel.glsl
@@ -26,6 +26,7 @@
 
 
 layout(local_size_x=WORK_GROUP_SIZE, local_size_y=1, local_size_z=1) in;
+layout(std430) buffer;
 
 // source and destination buffers
 
diff --git a/opensubdiv/osd/ompEvaluator.h b/opensubdiv/osd/ompEvaluator.h
index 9000f326..625dd8b1 100644
--- a/opensubdiv/osd/ompEvaluator.h
+++ b/opensubdiv/osd/ompEvaluator.h
@@ -79,6 +79,9 @@ public:
         (void)instance;       // unused
         (void)deviceContext;  // unused
 
+        if (stencilTable->GetNumStencils() == 0)
+            return false;
+
         return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
                             dstBuffer->BindCpuBuffer(), dstDesc,
                             &stencilTable->GetSizes()[0],
diff --git a/opensubdiv/osd/opengl.h b/opensubdiv/osd/opengl.h
index 16693c6b..c9b73bde 100644
--- a/opensubdiv/osd/opengl.h
+++ b/opensubdiv/osd/opengl.h
@@ -40,7 +40,7 @@
     #include <GLES2/gl2.h>
 #else
     #if defined(_WIN32)
-        #define W32_LEAN_AND_MEAN
+        #define WIN32_LEAN_AND_MEAN
         #include <windows.h>
     #endif
     #if defined(OSD_USES_GLEW)
diff --git a/opensubdiv/osd/tbbEvaluator.h b/opensubdiv/osd/tbbEvaluator.h
index 4c8d1d14..e2e0ffff 100644
--- a/opensubdiv/osd/tbbEvaluator.h
+++ b/opensubdiv/osd/tbbEvaluator.h
@@ -80,6 +80,9 @@ public:
         (void)instance;   // unused
         (void)deviceContext;  // unused
 
+        if (stencilTable->GetNumStencils() == 0)
+            return false;
+
         return EvalStencils(srcBuffer->BindCpuBuffer(), srcDesc,
                             dstBuffer->BindCpuBuffer(), dstDesc,
                             &stencilTable->GetSizes()[0],
diff --git a/opensubdiv/sdc/loopScheme.h b/opensubdiv/sdc/loopScheme.h
index ecc16c19..f8e9aee7 100644
--- a/opensubdiv/sdc/loopScheme.h
+++ b/opensubdiv/sdc/loopScheme.h
@@ -489,8 +489,8 @@ Scheme<SCHEME_LOOP>::assignCreaseLimitTangentMasks(VERTEX const& vertex,
 
         double theta = M_PI / (interiorEdgeCount + 1);
 
-        Weight cWeight      = -3.0f * std::sin(theta);
-        Weight eWeightCoeff = -3.0f * (2.0f * std::cos(theta) - 2.0f);
+        Weight cWeight      = -3.0f * (Weight) std::sin(theta);
+        Weight eWeightCoeff = -3.0f * (2.0f * (Weight) std::cos(theta) - 2.0f);
 
         tan2Mask.VertexWeight(0) = 0.0f;
 
@@ -498,7 +498,7 @@ Scheme<SCHEME_LOOP>::assignCreaseLimitTangentMasks(VERTEX const& vertex,
         tan2Mask.EdgeWeight(creaseEnds[1]) = cWeight;
 
         for (int i = 1; i <= interiorEdgeCount; ++i) {
-            tan2Mask.EdgeWeight(creaseEnds[0] + i) = eWeightCoeff * std::sin(i * theta);
+            tan2Mask.EdgeWeight(creaseEnds[0] + i) = eWeightCoeff * (Weight) std::sin(i * theta);
         }
     } else if (interiorEdgeCount == 1) {
         //  See notes above regarding scale factor of 3.0:
@@ -566,8 +566,8 @@ Scheme<SCHEME_LOOP>::assignSmoothLimitTangentMasks(VERTEX const& vertex,
         Weight alpha = (Weight) (2.0f * M_PI / valence);
         for (int i = 0; i < valence; ++i) {
             double alphaI = alpha * i;
-            tan1Mask.EdgeWeight(i) = std::cos(alphaI);
-            tan2Mask.EdgeWeight(i) = std::sin(alphaI);
+            tan1Mask.EdgeWeight(i) = (Weight) std::cos(alphaI);
+            tan2Mask.EdgeWeight(i) = (Weight) std::sin(alphaI);
         }
     }
 }
diff --git a/opensubdiv/version.h b/opensubdiv/version.h
index f3e14bda..644414a0 100644
--- a/opensubdiv/version.h
+++ b/opensubdiv/version.h
@@ -25,7 +25,7 @@
 #ifndef OPENSUBDIV3_VERSION_H
 #define OPENSUBDIV3_VERSION_H
 
-#define OPENSUBDIV_VERSION v3_0_2
+#define OPENSUBDIV_VERSION v3_0_3
 
 namespace OpenSubdiv {
 namespace OPENSUBDIV_VERSION {
diff --git a/regression/CMakeLists.txt b/regression/CMakeLists.txt
index 26149b87..0c1fb9d5 100644
--- a/regression/CMakeLists.txt
+++ b/regression/CMakeLists.txt
@@ -30,6 +30,8 @@ if (NOT NO_REGRESSION)
 
     add_subdirectory(far_regression)
 
+    add_subdirectory(far_perf)
+
     if(OPENGL_FOUND AND (GLEW_FOUND OR APPLE) AND GLFW_FOUND)
         add_subdirectory(osd_regression)
     else()
diff --git a/regression/far_perf/CMakeLists.txt b/regression/far_perf/CMakeLists.txt
new file mode 100644
index 00000000..521192fc
--- /dev/null
+++ b/regression/far_perf/CMakeLists.txt
@@ -0,0 +1,49 @@
+#
+#   Copyright 2015 Pixar
+#
+#   Licensed under the Apache License, Version 2.0 (the "Apache License")
+#   with the following modification; you may not use this file except in
+#   compliance with the Apache License and the following modification to it:
+#   Section 6. Trademarks. is deleted and replaced with:
+#
+#   6. Trademarks. This License does not grant permission to use the trade
+#      names, trademarks, service marks, or product names of the Licensor
+#      and its affiliates, except as required to comply with Section 4(c) of
+#      the License and to reproduce the content of the NOTICE file.
+#
+#   You may obtain a copy of the Apache License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the Apache License with the above modification is
+#   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied. See the Apache License for the specific
+#   language governing permissions and limitations under the Apache License.
+#
+
+include_directories(
+    "${OPENSUBDIV_INCLUDE_DIR}/"
+    "${PROJECT_SOURCE_DIR}/"
+)
+
+set(SOURCE_FILES
+    far_perf.cpp
+)
+
+set(PLATFORM_LIBRARIES
+    "${OSD_LINK_TARGET}"
+)
+
+_add_executable(far_perf
+    ${SOURCE_FILES}
+    $<TARGET_OBJECTS:sdc_obj>
+    $<TARGET_OBJECTS:vtr_obj>
+    $<TARGET_OBJECTS:far_obj>
+    $<TARGET_OBJECTS:regression_common_obj>
+)
+
+install(TARGETS far_perf DESTINATION "${CMAKE_BINDIR_BASE}")
+
+add_test(far_perf ${EXECUTABLE_OUTPUT_PATH}/far_perf)
+
diff --git a/regression/far_perf/far_perf.cpp b/regression/far_perf/far_perf.cpp
new file mode 100644
index 00000000..2371fe0a
--- /dev/null
+++ b/regression/far_perf/far_perf.cpp
@@ -0,0 +1,171 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#include <cassert>
+#include <cstdio>
+#include <fstream>
+#include <sstream>
+
+#include <opensubdiv/far/primvarRefiner.h>
+#include <opensubdiv/far/stencilTableFactory.h>
+#include <opensubdiv/far/patchTableFactory.h>
+#include "../../regression/common/far_utils.h"
+// XXX: revisit the directory structure for examples/tests
+#include "../../examples/common/stopwatch.h"
+
+#include "init_shapes.h"
+
+//------------------------------------------------------------------------------
+// Builds the Far tables for 'shape' at adaptive level 'maxlevel', timing each
+// stage with a Stopwatch, and prints the per-stage times and percentages.
+static void
+doPerf(const Shape *shape, int maxlevel, int endCapType)
+{
+    using namespace OpenSubdiv;
+    Sdc::SchemeType type = OpenSubdiv::Sdc::SCHEME_CATMARK;
+    Sdc::Options sdcOptions;
+    sdcOptions.SetVtxBoundaryInterpolation(Sdc::Options::VTX_BOUNDARY_EDGE_ONLY);
+    Stopwatch s;
+
+    // ----------------------------------------------------------------------
+    // Instantiate a FarTopologyRefiner from the descriptor and refine
+    s.Start();
+    Far::TopologyRefiner * refiner = Far::TopologyRefinerFactory<Shape>::Create(
+        *shape, Far::TopologyRefinerFactory<Shape>::Options(type, sdcOptions));
+    {
+        Far::TopologyRefiner::AdaptiveOptions options(maxlevel);
+        refiner->RefineAdaptive(options);
+    }
+    s.Stop();
+    double timeRefine = s.GetElapsed();
+
+    // ----------------------------------------------------------------------
+    // Create stencil table
+    s.Start();
+    Far::StencilTable const * vertexStencils = NULL;
+    {
+        Far::StencilTableFactory::Options options;
+        vertexStencils = Far::StencilTableFactory::Create(*refiner, options);
+    }
+    s.Stop();
+    double timeCreateStencil = s.GetElapsed();
+
+    // ----------------------------------------------------------------------
+    // Create patch table
+    s.Start();
+    Far::PatchTable const * patchTable = NULL;
+    {
+        Far::PatchTableFactory::Options poptions(maxlevel);
+        poptions.SetEndCapType((Far::PatchTableFactory::Options::EndCapType)endCapType);
+        patchTable = Far::PatchTableFactory::Create(*refiner, poptions);
+    }
+    s.Stop();
+    double timeCreatePatch = s.GetElapsed();
+
+    // ----------------------------------------------------------------------
+    // append local points to stencils
+    s.Start();
+    {
+        if (Far::StencilTable const *vertexStencilsWithLocalPoints =
+            Far::StencilTableFactory::AppendLocalPointStencilTable(
+                *refiner, vertexStencils,
+                patchTable->GetLocalPointStencilTable())) {
+            delete vertexStencils;
+            vertexStencils = vertexStencilsWithLocalPoints;
+        }
+    }
+    s.Stop();
+    double timeAppendStencil = s.GetElapsed();
+
+    // ---------------------------------------------------------------------
+    double timeTotal = s.GetTotalElapsed();
+    printf("TopologyRefiner::Refine     %f %5.2f%%\n",
+           timeRefine, timeRefine/timeTotal*100);
+    printf("StencilTableFactory::Create %f %5.2f%%\n",
+           timeCreateStencil, timeCreateStencil/timeTotal*100);
+    printf("PatchTableFactory::Create   %f %5.2f%%\n",
+           timeCreatePatch, timeCreatePatch/timeTotal*100);
+    printf("StencilTableFactory::Append %f %5.2f%%\n",
+           timeAppendStencil, timeAppendStencil/timeTotal*100);
+    printf("Total                       %f\n", timeTotal);
+    // doPerf runs once per shape and level: free what was built (untimed)
+    delete vertexStencils;
+    delete patchTable;
+    delete refiner;
+}
+
+//------------------------------------------------------------------------------
+// Parses the command line (.obj files, -l <level>, -e <bspline|gregory>), then
+// runs doPerf on every shape for each level from 1 up to maxlevel.
+int main(int argc, char **argv)
+{
+    using namespace OpenSubdiv;
+    int maxlevel = 8;
+    int endCapType = Far::PatchTableFactory::Options::ENDCAP_GREGORY_BASIS;
+
+    for (int i = 1; i < argc; ++i) {
+        if (strstr(argv[i], ".obj")) {
+            std::ifstream ifs(argv[i]);
+            if (ifs) {
+                std::stringstream ss;
+                ss << ifs.rdbuf();
+                g_shapes.push_back(ShapeDesc(argv[i], ss.str().c_str(), kCatmark));
+            }
+        }
+        else if (!strcmp(argv[i], "-l") || !strcmp(argv[i], "-e")) {
+            // Both options consume the next argument; argv[argc] is NULL, so
+            // check that a value exists before handing it to atoi()/strcmp().
+            const char *option = argv[i];
+            if (++i >= argc) {
+                printf("Missing value for option %s\n", option);
+                return 1;
+            }
+            const char *value = argv[i];
+            if (!strcmp(option, "-l")) {
+                maxlevel = atoi(value);
+            } else if (!strcmp(value, "bspline")) {
+                endCapType = Far::PatchTableFactory::Options::ENDCAP_BSPLINE_BASIS;
+            } else if (!strcmp(value, "gregory")) {
+                endCapType = Far::PatchTableFactory::Options::ENDCAP_GREGORY_BASIS;
+            } else {
+                printf("Unknown endcap type %s\n", value);
+                return 1;
+            }
+        }
+    }
+
+    if (g_shapes.empty()) initShapes();  // no .obj loaded: use built-in shapes
+
+    for (int i = 0; i < (int)g_shapes.size(); ++i) {
+        Shape const * shape = Shape::parseObj(g_shapes[i].data.c_str(),
+            g_shapes[i].scheme, g_shapes[i].isLeftHanded);
+        for (int lv = 1; lv <= maxlevel; ++lv) {
+            printf("---- %s, level %d ----\n", g_shapes[i].name.c_str(), lv);
+            doPerf(shape, lv, endCapType);
+        }
+        delete shape;  // release the Shape returned by parseObj for this entry
+    }
+}
+
+//------------------------------------------------------------------------------
diff --git a/regression/far_perf/init_shapes.h b/regression/far_perf/init_shapes.h
new file mode 100644
index 00000000..40d37225
--- /dev/null
+++ b/regression/far_perf/init_shapes.h
@@ -0,0 +1,48 @@
+//
+//   Copyright 2013 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#include "../common/shape_utils.h"
+
+// One benchmark entry: a display name, the shape text, its subdivision
+// scheme and whether the shape is left-handed.
+struct ShapeDesc {
+    ShapeDesc(char const * iname, std::string const & idata, Scheme ischeme,
+              bool iisLeftHanded=false)
+        : name(iname), data(idata), scheme(ischeme), isLeftHanded(iisLeftHanded) { }
+    std::string name;
+    std::string data;
+    Scheme      scheme;
+    bool        isLeftHanded;
+};
+
+static std::vector<ShapeDesc> g_shapes;  // shape list that initShapes() appends to
+
+#include "../shapes/all.h"
+
+//------------------------------------------------------------------------------
+static void initShapes() {  // appends the two built-in Catmark shapes to g_shapes
+    g_shapes.push_back(ShapeDesc("catmark_car", catmark_car, kCatmark, false));
+    g_shapes.push_back(ShapeDesc("catmark_pole64", catmark_pole64, kCatmark, false));
+}
+//------------------------------------------------------------------------------
diff --git a/tutorials/far/CMakeLists.txt b/tutorials/far/CMakeLists.txt
index 02f685d4..0d30c983 100644
--- a/tutorials/far/CMakeLists.txt
+++ b/tutorials/far/CMakeLists.txt
@@ -31,6 +31,7 @@ set(TUTORIALS
     tutorial_5
     tutorial_6
     tutorial_7
+    tutorial_8
 )
 
 foreach(tutorial ${TUTORIALS})
diff --git a/tutorials/far/tutorial_8/CMakeLists.txt b/tutorials/far/tutorial_8/CMakeLists.txt
new file mode 100644
index 00000000..c52f1622
--- /dev/null
+++ b/tutorials/far/tutorial_8/CMakeLists.txt
@@ -0,0 +1,37 @@
+#
+#   Copyright 2013 Pixar
+#
+#   Licensed under the Apache License, Version 2.0 (the "Apache License")
+#   with the following modification; you may not use this file except in
+#   compliance with the Apache License and the following modification to it:
+#   Section 6. Trademarks. is deleted and replaced with:
+#
+#   6. Trademarks. This License does not grant permission to use the trade
+#      names, trademarks, service marks, or product names of the Licensor
+#      and its affiliates, except as required to comply with Section 4(c) of
+#      the License and to reproduce the content of the NOTICE file.
+#
+#   You may obtain a copy of the Apache License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the Apache License with the above modification is
+#   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied. See the Apache License for the specific
+#   language governing permissions and limitations under the Apache License.
+#
+
+# Single translation unit of the tutorial.
+set(SOURCE_FILES far_tutorial_8.cpp)
+
+# Build the tutorial together with the sdc, vtr and far object libraries.
+_add_executable(far_tutorial_8
+    ${SOURCE_FILES}
+    $<TARGET_OBJECTS:sdc_obj>
+    $<TARGET_OBJECTS:vtr_obj>
+    $<TARGET_OBJECTS:far_obj>
+)
+# Install into the 'tutorials' sub-directory of the binary destination.
+install(TARGETS far_tutorial_8 DESTINATION "${CMAKE_BINDIR_BASE}/tutorials")
+
diff --git a/tutorials/far/tutorial_8/far_tutorial_8.cpp b/tutorials/far/tutorial_8/far_tutorial_8.cpp
new file mode 100644
index 00000000..6f80c76e
--- /dev/null
+++ b/tutorials/far/tutorial_8/far_tutorial_8.cpp
@@ -0,0 +1,527 @@
+//
+//   Copyright 2013 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+
+//------------------------------------------------------------------------------
+// Tutorial description:
+//
+// NOTE: The following approaches are approximations to compute smooth normals;
+//       for highest fidelity, patches should be used for positions and normals,
+//       which form the true limit surface.
+//
+// Building on tutorial 3, this example shows how to instantiate a simple mesh,
+// refine it uniformly, interpolate both 'vertex' and 'face-varying'
+// primvar data, and finally calculate approximated smooth normals. 
+// The resulting interpolated data is output in 'obj' format.
+//
+// Currently, this tutorial supports 3 methods to approximate smooth normals:
+// 
+//     CrossTriangle : Calculates smooth normals (accumulating per vertex) using
+//                     3 verts to generate 2 vectors. This approximation has
+//                     trouble when working with quads (which can be non-planar)
+//                     since it only takes into account half of each face. 
+//
+//     CrossQuad     : Calculates smooth normals (accumulating per vertex) 
+//                     but this time, instead of taking into account only 3 verts
+//                     it creates 2 vectors crossing the quad.
+//                     This approximation builds upon CrossTriangle but takes
+//                     into account the 4 verts of the face.
+//
+//     Limit         : Calculates the normals at the limit for each vert
+//                     at the last level of subdivision.
+//                     These are the true limit normals, however, in this example
+//                     they are used with verts that are not at the limit. 
+//                     This can lead to new visual artifacts since the normals
+//                     and the positions don't match. Additionally, this approach
+//                     requires extra computation to calculate the limit normals.
+//                     For this reason, we strongly suggest using  
+//                     limit positions with limit normals.
+//
+
+#include <opensubdiv/far/topologyDescriptor.h>
+#include <opensubdiv/far/primvarRefiner.h>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+//------------------------------------------------------------------------------
+// Math helpers.
+//
+//
+
+// Normalizes the 3-component vector \p n in place. A zero-length vector is
+// left untouched instead of being filled with NaN by a division by zero.
+inline void normalize(float *n) {
+    float const lengthSq = n[0]*n[0] + n[1]*n[1] + n[2]*n[2];
+    if (lengthSq <= 0.0f) return;
+    float const rn = 1.0f/sqrtf(lengthSq);
+    for (int i = 0; i < 3; ++i) n[i] *= rn;
+}
+
+// Stores the cross product of \p v1 and \p v2 in \p vOut (3 floats each).
+void cross(float const *v1, float const *v2, float* vOut) {
+    for (int axis = 0; axis < 3; ++axis) {
+        int const next = (axis + 1) % 3, prev = (axis + 2) % 3;
+        vOut[axis] = v1[next] * v2[prev] - v1[prev] * v2[next];
+    }
+}
+
+//------------------------------------------------------------------------------
+// Vertex container implementation.
+//
+//
+// Position-only primvar container; Clear() and AddWithWeight() make up the
+// interpolation interface, the remaining members are plain accessors.
+struct Vertex {
+
+    // Minimal required interface ----------------------
+    Vertex() { Clear(); }
+
+    Vertex(Vertex const & src) {
+        for (int axis = 0; axis < 3; ++axis) {
+            position[axis] = src.position[axis];
+        }
+    }
+
+    // Resets the position to the origin
+    void Clear() {
+        SetPosition(0.0f, 0.0f, 0.0f);
+    }
+
+    // Accumulates 'src' scaled by 'weight' into this position
+    void AddWithWeight(Vertex const & src, float weight) {
+        for (int axis = 0; axis < 3; ++axis) {
+            position[axis] += weight * src.position[axis];
+        }
+    }
+
+    // Public interface ------------------------------------
+    void SetPosition(float x, float y, float z) {
+        position[0] = x;
+        position[1] = y;
+        position[2] = z;
+    }
+
+    const float * GetPosition() const { return position; }
+
+    float position[3];
+};
+
+//------------------------------------------------------------------------------
+// Face-varying container implementation.
+//
+// We are using a uv texture layout as a 'face-varying' primitive variable
+// attribute. Because face-varying data is specified 'per-face-per-vertex',
+// we cannot use the same container that we use for 'vertex' or 'varying'
+// data. We specify a new container, which only carries (u,v) coordinates.
+// Similarly to our 'Vertex' container, we add a minimalistic interpolation
+// interface with 'Clear()' and 'AddWithWeight()' methods.
+//
+struct FVarVertexUV {
+
+    // Minimal required interface ----------------------
+    // Resets both coordinates to zero
+    void Clear() { u = 0.0f; v = 0.0f; }
+
+    // Accumulates 'src' scaled by 'weight' into this value
+    void AddWithWeight(FVarVertexUV const & src, float weight) {
+        u = u + weight * src.u;
+        v = v + weight * src.v;
+    }
+
+    // Basic 'uv' layout channel
+    float u, v;
+};
+
+struct FVarVertexColor {
+
+    // Minimal required interface ----------------------
+    // Resets all four channels to zero
+    void Clear() { r = 0.0f; g = 0.0f; b = 0.0f; a = 0.0f; }
+
+    // Accumulates 'src' scaled by 'weight', channel by channel
+    void AddWithWeight(FVarVertexColor const & src, float weight) {
+        r = r + weight * src.r;
+        g = g + weight * src.g;
+        b = b + weight * src.b;
+        a = a + weight * src.a;
+    }
+
+    // Basic 'color' layout channel
+    float r, g, b, a;
+};
+
+//------------------------------------------------------------------------------
+// Cube geometry from catmark_cube.h
+
+// 'vertex' primitive variable data & topology: 8 cube corners at +/-0.5
+static float g_verts[8][3] = {{ -0.5f, -0.5f,  0.5f },
+                              {  0.5f, -0.5f,  0.5f },
+                              { -0.5f,  0.5f,  0.5f },
+                              {  0.5f,  0.5f,  0.5f },
+                              { -0.5f,  0.5f, -0.5f },
+                              {  0.5f,  0.5f, -0.5f },
+                              { -0.5f, -0.5f, -0.5f },
+                              {  0.5f, -0.5f, -0.5f }};
+static int g_nverts = 8,
+           g_nfaces = 6;
+
+static int g_vertsperface[6] = { 4, 4, 4, 4, 4, 4 };  // every face is a quad
+
+static int g_vertIndices[24] = { 0, 1, 3, 2,   // 4 indices into g_verts per face
+                                 2, 3, 5, 4,
+                                 4, 5, 7, 6,
+                                 6, 7, 1, 0,
+                                 1, 7, 5, 3,
+                                 6, 0, 2, 4  };
+
+// 'face-varying' primitive variable data & topology for UVs (14 (u,v) values)
+static float g_uvs[14][2] = {{ 0.375, 0.00 },
+                             { 0.625, 0.00 },
+                             { 0.375, 0.25 },
+                             { 0.625, 0.25 },
+                             { 0.375, 0.50 },
+                             { 0.625, 0.50 },
+                             { 0.375, 0.75 },
+                             { 0.625, 0.75 },
+                             { 0.375, 1.00 },
+                             { 0.625, 1.00 },
+                             { 0.875, 0.00 },
+                             { 0.875, 0.25 },
+                             { 0.125, 0.00 },
+                             { 0.125, 0.25 }};
+
+static int g_nuvs = 14;
+
+static int g_uvIndices[24] = {  0,  1,  3,  2,   // 4 indices into g_uvs per face
+                                2,  3,  5,  4,
+                                4,  5,  7,  6,
+                                6,  7,  9,  8,
+                                1, 10, 11,  3,
+                               12,  0,  2, 13  };
+
+// 'face-varying' primitive variable data & topology for color (24 RGBA values)
+static float g_colors[24][4] = {{1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 0.0, 0.0, 1.0},   // values 9-11 are red,
+                                {1.0, 0.0, 0.0, 1.0},   // every other value
+                                {1.0, 0.0, 0.0, 1.0},   // is opaque white
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0, 1.0}};
+
+static int g_ncolors = 24;
+
+static int g_colorIndices[24] = { 0,  3,  9,  6,   // 4 indices into g_colors per face
+                                  7, 10, 15, 12, 
+                                 13, 16, 21, 18,
+                                 19, 22,  4,  1,
+                                  5, 23, 17, 11,
+                                 20,  2,  8, 14 };
+
+using namespace OpenSubdiv;
+
+// Approximation methods for smooth normal computations; the command-line
+// flags -crosstriangle, -crossquad and -limit select between them.
+enum NormalApproximation {
+    CrossTriangle = 0,   // cross product of two edges built from 3 face verts
+    CrossQuad     = 1,   // cross product of the two quad diagonals
+    Limit         = 2    // cross product of the limit tangents
+};
+
+//------------------------------------------------------------------------------
+int main(int argc, char ** argv) {
+    // Refines the cube, approximates smooth normals and prints the mesh as OBJ
+    const int maxlevel = 2;
+    enum NormalApproximation normalApproximation = CrossTriangle;
+
+    // Parsing command line parameters to see if the user wants to use a
+    // specific method to calculate normals
+    for (int i = 1; i < argc; ++i) {
+        // Options must match exactly; anything else prints the usage and exits
+        if (!strcmp(argv[i], "-limit")) {
+            normalApproximation = Limit;
+        } else if (!strcmp(argv[i], "-crossquad")) {
+            normalApproximation = CrossQuad;
+        } else if (!strcmp(argv[i], "-crosstriangle")) {
+            normalApproximation = CrossTriangle;
+        } else {
+            printf("Parameters : \n");
+            printf("  -crosstriangle : use the cross product of vectors\n");
+            printf("                   generated from 3 verts (default).\n");
+            printf("  -crossquad     : use the cross product of vectors\n");
+            printf("                   generated from 4 verts.\n");
+            printf("  -limit         : use normals calculated from the limit.\n");
+            return 0;
+        }
+    }
+
+    typedef Far::TopologyDescriptor Descriptor;
+    Sdc::SchemeType type = OpenSubdiv::Sdc::SCHEME_CATMARK;
+    Sdc::Options options;
+    options.SetVtxBoundaryInterpolation(Sdc::Options::VTX_BOUNDARY_EDGE_ONLY);
+    options.SetFVarLinearInterpolation(Sdc::Options::FVAR_LINEAR_NONE);
+
+    // Populate a topology descriptor with our raw data
+    Descriptor desc;
+    desc.numVertices  = g_nverts;
+    desc.numFaces     = g_nfaces;
+    desc.numVertsPerFace = g_vertsperface;
+    desc.vertIndicesPerFace  = g_vertIndices;
+
+    // Create a face-varying channel descriptor
+    const int numChannels  = 2;
+    const int channelUV    = 0;
+    const int channelColor = 1;
+    Descriptor::FVarChannel channels[numChannels];
+    channels[channelUV].numValues = g_nuvs;
+    channels[channelUV].valueIndices = g_uvIndices;
+    channels[channelColor].numValues = g_ncolors;
+    channels[channelColor].valueIndices = g_colorIndices;
+
+    // Add the channel topology to the main descriptor
+    desc.numFVarChannels = numChannels;
+    desc.fvarChannels = channels;
+
+    // Instantiate a FarTopologyRefiner from the descriptor
+    Far::TopologyRefiner * refiner =
+        Far::TopologyRefinerFactory<Descriptor>::Create(desc,
+            Far::TopologyRefinerFactory<Descriptor>::Options(type, options));
+
+    // Uniformly refine the topology up to 'maxlevel'
+    // note: fullTopologyInLastLevel must be true to work with face-varying data
+    {
+        Far::TopologyRefiner::UniformOptions refineOptions(maxlevel);
+        refineOptions.fullTopologyInLastLevel = true;
+        refiner->RefineUniform(refineOptions);
+    }
+
+    // Allocate and initialize the 'vertex' primvar data (see tutorial 2 for
+    // more details).
+    std::vector<Vertex> vbuffer(refiner->GetNumVerticesTotal());
+    Vertex * verts = &vbuffer[0];
+    for (int i=0; i<g_nverts; ++i) {
+        verts[i].SetPosition(g_verts[i][0], g_verts[i][1], g_verts[i][2]);
+    }
+
+    // Allocate & initialize the first channel of 'face-varying' primvars (UVs)
+    std::vector<FVarVertexUV> fvBufferUV(refiner->GetNumFVarValuesTotal(channelUV));
+    FVarVertexUV * fvVertsUV = &fvBufferUV[0];
+    for (int i=0; i<g_nuvs; ++i) {
+        fvVertsUV[i].u = g_uvs[i][0];
+        fvVertsUV[i].v = g_uvs[i][1];
+    }
+
+    // Allocate & initialize the second channel of 'face-varying' primvars (colors)
+    std::vector<FVarVertexColor> fvBufferColor(refiner->GetNumFVarValuesTotal(channelColor));
+    FVarVertexColor * fvVertsColor = &fvBufferColor[0];
+    for (int i=0; i<g_ncolors; ++i) {
+        fvVertsColor[i].r = g_colors[i][0];
+        fvVertsColor[i].g = g_colors[i][1];
+        fvVertsColor[i].b = g_colors[i][2];
+        fvVertsColor[i].a = g_colors[i][3];
+    }
+
+    // Interpolate both vertex and face-varying primvar data
+    Far::PrimvarRefiner primvarRefiner(*refiner);
+    Vertex *          srcVert = verts;
+    FVarVertexUV *    srcFVarUV = fvVertsUV;
+    FVarVertexColor * srcFVarColor = fvVertsColor;
+
+    for (int level = 1; level <= maxlevel; ++level) {
+        Vertex *     dstVert = srcVert + refiner->GetLevel(level-1).GetNumVertices();
+        FVarVertexUV * dstFVarUV = srcFVarUV + refiner->GetLevel(level-1).GetNumFVarValues(channelUV);
+        FVarVertexColor * dstFVarColor = srcFVarColor + refiner->GetLevel(level-1).GetNumFVarValues(channelColor);
+
+        primvarRefiner.Interpolate(level, srcVert, dstVert);
+        primvarRefiner.InterpolateFaceVarying(level, srcFVarUV, dstFVarUV, channelUV);
+        primvarRefiner.InterpolateFaceVarying(level, srcFVarColor, dstFVarColor, channelColor);
+
+        srcVert = dstVert;
+        srcFVarUV = dstFVarUV;
+        srcFVarColor = dstFVarColor;
+    }
+
+    // Approximate normals
+    Far::TopologyLevel const & refLastLevel = refiner->GetLevel(maxlevel);
+    int nverts = refLastLevel.GetNumVertices();
+    int nfaces = refLastLevel.GetNumFaces();
+    int firstOfLastVerts = refiner->GetNumVerticesTotal() - nverts;
+
+    std::vector<Vertex> normals(nverts);
+
+    // Different ways to approximate smooth normals
+    //
+    // For details check the description at the beginning of the file
+    if (normalApproximation == Limit) {
+
+        // Approximation using the normal at the limit with verts that are
+        // not at the limit
+        //
+        // For details check the description at the beginning of the file
+
+        std::vector<Vertex> fineLimitPos(nverts);
+        std::vector<Vertex> fineDu(nverts);
+        std::vector<Vertex> fineDv(nverts);
+
+        primvarRefiner.Limit(&verts[firstOfLastVerts], fineLimitPos, fineDu, fineDv);
+
+        for (int vert = 0; vert < nverts; ++vert) {
+            float const * du = fineDu[vert].GetPosition();
+            float const * dv = fineDv[vert].GetPosition();
+            float norm[3];
+            cross(du, dv, norm);
+            normals[vert].SetPosition(norm[0], norm[1], norm[2]);
+        }
+
+    } else if (normalApproximation == CrossQuad) {
+
+        // Approximate smooth normals by accumulating normal vectors computed as
+        // the cross product of two vectors generated by the 4 verts that
+        // form each quad
+        //
+        // For details check the description at the beginning of the file
+
+        for (int f = 0; f < nfaces; f++) {
+            Far::ConstIndexArray faceVertices = refLastLevel.GetFaceVertices(f);
+
+            // We will use all four verts of the quad to calculate a normal
+            const float * v0 = verts[ firstOfLastVerts + faceVertices[0] ].GetPosition();
+            const float * v1 = verts[ firstOfLastVerts + faceVertices[1] ].GetPosition();
+            const float * v2 = verts[ firstOfLastVerts + faceVertices[2] ].GetPosition();
+            const float * v3 = verts[ firstOfLastVerts + faceVertices[3] ].GetPosition();
+
+            // Calculate the cross product between the two diagonals of the quad,
+            // v2-v0 and v3-v1, and then normalize the result
+            float normalCalculated [] = {0.0,0.0,0.0};
+            float a[3] = { v2[0]-v0[0], v2[1]-v0[1], v2[2]-v0[2] };
+            float b[3] = { v3[0]-v1[0], v3[1]-v1[1], v3[2]-v1[2] };
+            cross(a, b, normalCalculated);
+            normalize(normalCalculated);
+
+            // Accumulate that normal on all verts that are part of that face
+            for(int vInFace = 0; vInFace < faceVertices.size() ; vInFace++ ) {
+
+                int vertexIndex = faceVertices[vInFace];
+                normals[vertexIndex].position[0] += normalCalculated[0];
+                normals[vertexIndex].position[1] += normalCalculated[1];
+                normals[vertexIndex].position[2] += normalCalculated[2];
+            }
+        }
+
+    } else if (normalApproximation == CrossTriangle) {
+
+        // Approximate smooth normals by accumulating normal vectors computed as
+        // the cross product of two vectors generated by 3 verts of the quad
+        //
+        // For details check the description at the beginning of the file
+
+        for (int f = 0; f < nfaces; f++) {
+            Far::ConstIndexArray faceVertices = refLastLevel.GetFaceVertices(f);
+
+            // We will use the first three verts to calculate a normal
+            const float * v0 = verts[ firstOfLastVerts + faceVertices[0] ].GetPosition();
+            const float * v1 = verts[ firstOfLastVerts + faceVertices[1] ].GetPosition();
+            const float * v2 = verts[ firstOfLastVerts + faceVertices[2] ].GetPosition();
+
+            // Calculate the cross product between the vectors formed by v1-v0 and
+            // v2-v0, and then normalize the result
+            float normalCalculated [] = {0.0,0.0,0.0};
+            float a[3] = { v1[0]-v0[0], v1[1]-v0[1], v1[2]-v0[2] };
+            float b[3] = { v2[0]-v0[0], v2[1]-v0[1], v2[2]-v0[2] };
+            cross(a, b, normalCalculated);
+            normalize(normalCalculated);
+
+            // Accumulate that normal on all verts that are part of that face
+            for(int vInFace = 0; vInFace < faceVertices.size() ; vInFace++ ) {
+
+                int vertexIndex = faceVertices[vInFace];
+                normals[vertexIndex].position[0] += normalCalculated[0];
+                normals[vertexIndex].position[1] += normalCalculated[1];
+                normals[vertexIndex].position[2] += normalCalculated[2];
+            }
+        }
+    }
+
+    // Finally we just need to normalize the accumulated normals
+    for (int vert = 0; vert < nverts; ++vert) {
+        normalize(&normals[vert].position[0]);
+    }
+
+    { // Output OBJ of the highest level refined -----------
+
+        // Print vertex positions
+        for (int vert = 0; vert < nverts; ++vert) {
+            float const * pos = verts[firstOfLastVerts + vert].GetPosition();
+            printf("v %f %f %f\n", pos[0], pos[1], pos[2]);
+        }
+
+        // Print vertex normals
+        for (int vert = 0; vert < nverts; ++vert) {
+            float const * pos = normals[vert].GetPosition();
+            printf("vn %f %f %f\n", pos[0], pos[1], pos[2]);
+        }
+
+        // Print uvs
+        int nuvs   = refLastLevel.GetNumFVarValues(channelUV);
+        int firstOfLastUvs = refiner->GetNumFVarValuesTotal(channelUV) - nuvs;
+        for (int fvvert = 0; fvvert < nuvs; ++fvvert) {
+            FVarVertexUV const & uv = fvVertsUV[firstOfLastUvs + fvvert];
+            printf("vt %f %f\n", uv.u, uv.v);
+        }
+
+        // Print faces
+        for (int face = 0; face < nfaces; ++face) {
+            Far::ConstIndexArray fverts = refLastLevel.GetFaceVertices(face);
+            Far::ConstIndexArray fuvs   = refLastLevel.GetFaceFVarValues(face, channelUV);
+
+            // all refined Catmark faces should be quads
+            assert(fverts.size()==4 and fuvs.size()==4);
+
+            printf("f ");
+            for (int vert=0; vert<fverts.size(); ++vert) {
+                // OBJ uses 1-based arrays...
+                printf("%d/%d/%d ", fverts[vert]+1, fuvs[vert]+1, fverts[vert]+1);
+            }
+            printf("\n");
+        }
+    }
+    delete refiner;  // the refiner created by the factory above is ours to free
+}
+//------------------------------------------------------------------------------
\ No newline at end of file