Fix OpenCL and CUDA EvalPatches kernels.

2025-01-17 03:30:06 +00:00 · 2015-05-26 11:13:30 -07:00 · 2015-05-26 11:13:30 -07:00 · 749bbf4271
commit 749bbf4271
parent 541aeddd3a
5 changed files with 112 additions and 71 deletions
--- a/opensubdiv/osd/clEvaluator.cpp
+++ b/opensubdiv/osd/clEvaluator.cpp
@ -219,17 +219,16 @@ CLEvaluator::EvalPatches(cl_mem src, VertexBufferDescriptor const &srcDesc,
    clSetKernelArg(_patchKernel,  1, sizeof(int),    &srcDesc.offset);
    clSetKernelArg(_patchKernel,  2, sizeof(cl_mem), &dst);
    clSetKernelArg(_patchKernel,  3, sizeof(int),    &dstDesc.offset);
-    clSetKernelArg(_patchKernel,  4, sizeof(int),    &numPatchCoords);
-    // clSetKernelArg(_patchKernel,  4, sizeof(cl_mem), &du);
-    // clSetKernelArg(_patchKernel,  5, sizeof(int),    &duDesc.offset);
-    // clSetKernelArg(_patchKernel,  6, sizeof(int),    &duDesc.stride);
-    // clSetKernelArg(_patchKernel,  7, sizeof(cl_mem), &dv);
-    // clSetKernelArg(_patchKernel,  8, sizeof(int),    &dvDesc.offset);
-    // clSetKernelArg(_patchKernel,  9, sizeof(int),    &dvDesc.stride);
-    clSetKernelArg(_patchKernel, 5, sizeof(cl_mem), &patchCoordsBuffer);
-    clSetKernelArg(_patchKernel, 6, sizeof(cl_mem), &patchArrayBuffer);
-    clSetKernelArg(_patchKernel, 7, sizeof(cl_mem), &patchIndexBuffer);
-    clSetKernelArg(_patchKernel, 8, sizeof(cl_mem), &patchParamBuffer);
+    clSetKernelArg(_patchKernel,  4, sizeof(cl_mem), &du);
+    clSetKernelArg(_patchKernel,  5, sizeof(int),    &duDesc.offset);
+    clSetKernelArg(_patchKernel,  6, sizeof(int),    &duDesc.stride);
+    clSetKernelArg(_patchKernel,  7, sizeof(cl_mem), &dv);
+    clSetKernelArg(_patchKernel,  8, sizeof(int),    &dvDesc.offset);
+    clSetKernelArg(_patchKernel,  9, sizeof(int),    &dvDesc.stride);
+    clSetKernelArg(_patchKernel, 10, sizeof(cl_mem), &patchCoordsBuffer);
+    clSetKernelArg(_patchKernel, 11, sizeof(cl_mem), &patchArrayBuffer);
+    clSetKernelArg(_patchKernel, 12, sizeof(cl_mem), &patchIndexBuffer);
+    clSetKernelArg(_patchKernel, 13, sizeof(cl_mem), &patchParamBuffer);

    cl_int errNum = clEnqueueNDRangeKernel(
        _clCommandQueue, _patchKernel, 1, NULL,
--- a/opensubdiv/osd/clKernel.cl
+++ b/opensubdiv/osd/clKernel.cl
@ -128,37 +128,85 @@ static void getBSplineWeights(float t, float *point, float *deriv) {
    deriv[3] =  0.5f*t2;
 }

+static void adjustBoundaryWeights(uint bits, float *sWeights, float *tWeights) {
+    int boundary = ((bits >> 4) & 0xf);
+
+    if (boundary & 1) {
+        tWeights[2] -= tWeights[0];
+        tWeights[1] += 2*tWeights[0];
+        tWeights[0] = 0;
+    }
+    if (boundary & 2) {
+        sWeights[1] -= sWeights[3];
+        sWeights[2] += 2*sWeights[3];
+        sWeights[3] = 0;
+    }
+    if (boundary & 4) {
+        tWeights[1] -= tWeights[3];
+        tWeights[2] += 2*tWeights[3];
+        tWeights[3] = 0;
+    }
+    if (boundary & 8) {
+        sWeights[2] -= sWeights[0];
+        sWeights[1] += 2*sWeights[0];
+        sWeights[0] = 0;
+    }
+}
+
+static int getDepth(uint patchBits) {
+    return (patchBits & 0x7);
+}
+
+static float getParamFraction(uint patchBits) {
+    bool nonQuadRoot = (patchBits >> 3) & 0x1;
+    int depth = getDepth(patchBits);
+    if (nonQuadRoot) {
+        return 1.0f / (float)( 1 << (depth-1) );
+    } else {
+        return 1.0f / (float)( 1 << depth );
+    }
+}
+
+static void normalizePatchCoord(uint patchBits, float *uv) {
+    float frac = getParamFraction(patchBits);
+
+    int iu = (patchBits >> 22) & 0x3ff;
+    int iv = (patchBits >> 12) & 0x3ff;
+
+    // top left corner
+    float pu = (float)iu*frac;
+    float pv = (float)iv*frac;
+
+    // normalize u,v coordinates
+    uv[0] = (uv[0] - pu) / frac;
+    uv[1] = (uv[1] - pv) / frac;
+}
+
 __kernel void computePatches(__global float *src, int srcOffset,
                             __global float *dst, int dstOffset,
-//                             __global float *du,  int duOffset, int duStride,
-//                             __global float *dv,  int dvOffset, int dvStride,
-                             int numPatchCoords,
+                             __global float *du,  int duOffset, int duStride,
+                             __global float *dv,  int dvOffset, int dvStride,
                             __global struct PatchCoord *patchCoords,
                             __global struct PatchArray *patchArrayBuffer,
                             __global int *patchIndexBuffer,
                             __global struct PatchParam *patchParamBuffer) {
    int current = get_global_id(0);

-    if (current > numPatchCoords) return;
-
-    src += srcOffset;
-    dst += dstOffset;
-    // du += duOffset;
-    // dv += dvOffset;
+    if (src) src += srcOffset;
+    if (dst) dst += dstOffset;
+    if (du)  du += duOffset;
+    if (dv)  dv += dvOffset;

    struct PatchCoord coord = patchCoords[current];
-    int patchIndex = coord.patchIndex;
-//    struct PatchArray array = patchArrayBuffer[coord.arrayIndex];
-    struct PatchArray array = patchArrayBuffer[0];
+    struct PatchArray array = patchArrayBuffer[coord.arrayIndex];

-    int patchType = 6; // array.x XXX: REGULAR only for now.
+    int patchType = 6; // array.patchType XXX: REGULAR only for now.
    int numControlVertices = 16;
-
-    uint patchBits = patchParamBuffer[patchIndex].patchBits;
-//    vec2 uv = normalizePatchCoord(patchBits, vec2(coord.s, coord.t));
-    float dScale = 1.0f;//float(1 << getDepth(patchBits));
+    uint patchBits = patchParamBuffer[coord.patchIndex].patchBits;

    float uv[2] = {coord.s, coord.t};
+    normalizePatchCoord(patchBits, uv);
+    float dScale = (float)(1 << getDepth(patchBits));

    float wP[20], wDs[20], wDt[20];
    if (patchType == 6) {  // REGULAR
@ -166,8 +214,8 @@ __kernel void computePatches(__global float *src, int srcOffset,
        getBSplineWeights(uv[0], sWeights, dsWeights);
        getBSplineWeights(uv[1], tWeights, dtWeights);

-//        adjustBoundaryWeights(patchBits, sWeights, tWeights);
-//        adjustBoundaryWeights(patchBits, dsWeights, dtWeights);
+        adjustBoundaryWeights(patchBits, sWeights, tWeights);
+        adjustBoundaryWeights(patchBits, dsWeights, dtWeights);

        for (int k = 0; k < 4; ++k) {
            for (int l = 0; l < 4; ++l) {
@ -180,23 +228,33 @@ __kernel void computePatches(__global float *src, int srcOffset,
        // TODO: GREGORY BASIS
    }

+    int indexBase = array.indexBase + coord.vertIndex;
+
    struct Vertex v;
    clear(&v);
-
-#if 1
-    // debug
-    v.v[0] = uv[0];
-    v.v[1] = uv[1];
-    v.v[2] = patchIndexBuffer[current] * 0.1;
-    writeVertex(dst, current, &v);
-    return;
-#endif
-
-    int indexBase = array.indexBase + coord.vertIndex;
    for (int i = 0; i < numControlVertices; ++i) {
        int index = patchIndexBuffer[indexBase + i];
-        if (index < 0) index = 0;
        addWithWeight(&v, src, index, wP[i]);
    }
    writeVertex(dst, current, &v);
+
+    if (du) {
+        struct Vertex vdu;
+        clear(&vdu);
+        for (int i = 0; i < numControlVertices; ++i) {
+            int index = patchIndexBuffer[indexBase + i];
+            addWithWeight(&vdu, src, index, wDs[i]);
+        }
+        writeVertex(du, current, &vdu);
+    }
+    if (dv) {
+        struct Vertex vdv;
+        clear(&vdv);
+        for (int i = 0; i < numControlVertices; ++i) {
+            int index = patchIndexBuffer[indexBase + i];
+            addWithWeight(&vdv, src, index, wDt[i]);
+        }
+        writeVertex(dv, current, &vdv);
+    }
+
 }
--- a/opensubdiv/osd/clPatchTable.cpp
+++ b/opensubdiv/osd/clPatchTable.cpp
@ -62,7 +62,8 @@ CLPatchTable::allocate(Far::PatchTable const *farPatchTable, cl_context clContex
    size_t patchParamSize = patchTable.GetPatchParamSize();

    cl_int err = 0;
-    _patchArrays = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+    _patchArrays = clCreateBuffer(clContext,
+                                  CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
                                  numPatchArrays * sizeof(Osd::PatchArray),
                                  (void*)patchTable.GetPatchArrayBuffer(),
                                  &err);
@ -71,7 +72,8 @@ CLPatchTable::allocate(Far::PatchTable const *farPatchTable, cl_context clContex
        return false;
    }

-    _indexBuffer = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+    _indexBuffer = clCreateBuffer(clContext,
+                                  CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
                                  indexSize * sizeof(int),
                                  (void*)patchTable.GetPatchIndexBuffer(),
                                  &err);
@ -80,7 +82,8 @@ CLPatchTable::allocate(Far::PatchTable const *farPatchTable, cl_context clContex
        return false;
    }

-    _patchParamBuffer = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+    _patchParamBuffer = clCreateBuffer(clContext,
+                                       CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
                                       patchParamSize * sizeof(Osd::PatchParam),
                                       (void*)patchTable.GetPatchParamBuffer(),
                                       &err);
--- a/opensubdiv/osd/cudaEvaluator.cpp
+++ b/opensubdiv/osd/cudaEvaluator.cpp
@ -194,9 +194,8 @@ CudaEvaluator::EvalPatches(const float *src,
                           const PatchArray *patchArrays,
                           const int *patchIndices,
                           const PatchParam *patchParams) {
-    src += srcDesc.offset;
-    if (dst)
-        dst += dstDesc.offset;
+    if (src) src += srcDesc.offset;
+    if (dst) dst += dstDesc.offset;

    CudaEvalPatches(src, dst,
                    srcDesc.length, srcDesc.stride, dstDesc.stride,
--- a/opensubdiv/osd/cudaKernel.cu
+++ b/opensubdiv/osd/cudaKernel.cu
@ -334,24 +334,10 @@ void normalizePatchCoord(unsigned int patchBits, float *u, float *v) {
    float pv = (float)iv*frac;

    // normalize u,v coordinates
-    *u = (*u - pu) / frac,
+    *u = (*u - pu) / frac;
    *v = (*v - pv) / frac;
 }

-// Far::PatchDescriptor::Type
-enum Type {
-    NON_PATCH = 0,     ///< undefined
-    POINTS,            ///< points (useful for cage drawing)
-    LINES,             ///< lines  (useful for cage drawing)
-    QUADS,             ///< bilinear quads-only patches
-    TRIANGLES,         ///< bilinear triangles-only mesh
-    LOOP,              ///< Loop patch
-    REGULAR,           ///< feature-adaptive bicubic patches
-    GREGORY,
-    GREGORY_BOUNDARY,
-    GREGORY_BASIS
-};
-
 __global__ void
 computePatches(const float *src, float *dst, float *dstDu, float *dstDv,
               int length, int srcStride, int dstStride, int dstDuStride, int dstDvStride,
@ -371,7 +357,7 @@ computePatches(const float *src, float *dst, float *dstDu, float *dstDv,
        PatchCoord const &coord = patchCoords[i];
        PatchArray const &array = patchArrayBuffer[coord.arrayIndex];

-        int patchType = array.patchType;
+        int patchType = 6; // array.patchType XXX: REGULAR only for now.
        int numControlVertices = 16;
        // note: patchIndex is absolute.
        unsigned int patchBits = patchParamBuffer[coord.patchIndex].bitField;
@ -382,7 +368,7 @@ computePatches(const float *src, float *dst, float *dstDu, float *dstDv,
        normalizePatchCoord(patchBits, &s, &t);
        float dScale = (float)(1 << getDepth(patchBits));

-        if (patchType == REGULAR) {
+        if (patchType == 6) {
            float sWeights[4], tWeights[4], dsWeights[4], dtWeights[4];
            getBSplineWeights(s, sWeights, dsWeights);
            getBSplineWeights(t, tWeights, dtWeights);
@ -399,15 +385,11 @@ computePatches(const float *src, float *dst, float *dstDu, float *dstDv,
                    wDt[4*k+l] = sWeights[l]  * dtWeights[k] * dScale;
                }
            }
-        } else if (patchType == GREGORY_BASIS) {
-            // XXX: not yet implemented.
-            continue;
        } else {
-            // unknown patchType
+            // TODO: Gregory Basis.
            continue;
        }
-        const int *cvs =
-            &patchIndexBuffer[array.indexBase + coord.vertIndex];
+        const int *cvs = patchIndexBuffer + array.indexBase + coord.vertIndex;

        float * dstVert = dst + i * dstStride;
        clear(dstVert, length);