// // Copyright (C) Pixar. All rights reserved. // // This license governs use of the accompanying software. If you // use the software, you accept this license. If you do not accept // the license, do not use the software. // // 1. Definitions // The terms "reproduce," "reproduction," "derivative works," and // "distribution" have the same meaning here as under U.S. // copyright law. A "contribution" is the original software, or // any additions or changes to the software. // A "contributor" is any person or entity that distributes its // contribution under this license. // "Licensed patents" are a contributor's patent claims that read // directly on its contribution. // // 2. Grant of Rights // (A) Copyright Grant- Subject to the terms of this license, // including the license conditions and limitations in section 3, // each contributor grants you a non-exclusive, worldwide, // royalty-free copyright license to reproduce its contribution, // prepare derivative works of its contribution, and distribute // its contribution or any derivative works that you create. // (B) Patent Grant- Subject to the terms of this license, // including the license conditions and limitations in section 3, // each contributor grants you a non-exclusive, worldwide, // royalty-free license under its licensed patents to make, have // made, use, sell, offer for sale, import, and/or otherwise // dispose of its contribution in the software or derivative works // of the contribution in the software. // // 3. Conditions and Limitations // (A) No Trademark License- This license does not grant you // rights to use any contributor's name, logo, or trademarks. // (B) If you bring a patent claim against any contributor over // patents that you claim are infringed by the software, your // patent license from such contributor to the software ends // automatically. // (C) If you distribute any portion of the software, you must // retain all copyright, patent, trademark, and attribution // notices that are present in the software. // (D) If you distribute any portion of the software in source // code form, you may do so only under this license by including a // complete copy of this license with your distribution. If you // distribute any portion of the software in compiled or object // code form, you may only do so under a license that complies // with this license. // (E) The software is licensed "as-is." You bear the risk of // using it. The contributors give no express warranties, // guarantees or conditions. You may have additional consumer // rights under your local laws which this license cannot change. // To the extent permitted under your local laws, the contributors // exclude the implied warranties of merchantability, fitness for // a particular purpose and non-infringement. // #include template struct DeviceVertex { float pos[3]; float userVertexData[N]; __device__ void addWithWeight(const DeviceVertex *src, float weight) { pos[0] += src->pos[0] * weight; pos[1] += src->pos[1] * weight; pos[2] += src->pos[2] * weight; for(int i = 0; i < N; ++i){ userVertexData[i] += src->userVertexData[i] * weight; } } __device__ void clear() { pos[0] = pos[1] = pos[2] = 0.0f; for(int i = 0; i < N; ++i){ userVertexData[i] = 0.0f; } } }; template struct DeviceVarying { float v[N]; __device__ void addVaryingWithWeight(const DeviceVarying *src, float weight) { for(int i = 0; i < N; ++i){ v[i] += src->v[i] * weight; } } __device__ void clear() { for(int i = 0; i < N; ++i){ v[i] = 0.0f; } } }; // Specialize DeviceVarying for N=0 to avoid compile error: // "flexible array member in otherwise empty struct" template<> struct DeviceVarying<0> { __device__ void addVaryingWithWeight(const DeviceVarying<0> *src, float weight) { } __device__ void clear() { } }; struct DeviceTable { void **tables; int *F0_IT; int *F0_ITa; int *E0_IT; int *V0_IT; int *V0_ITa; float *E0_S; float *V0_S; }; __device__ void clear(float *dst, int count) { for(int i = 0; i < count; ++i) dst[i] = 0; } __device__ void addWithWeight(float *dst, float *src, float weight, int count) { for(int i = 0; i < count; ++i) dst[i] += src[i] * weight; } __device__ void addVaryingWithWeight(float *dst, float *src, float weight, int count) { for(int i = 0; i < count; ++i) dst[i] += src[i] * weight; } template __global__ void computeFace(float *fVertex, float *fVaryings, int *F0_IT, int *F0_ITa, int offset, int tableOffset, int start, int end) { DeviceVertex *vertex = (DeviceVertex*)fVertex; DeviceVarying *varyings = (DeviceVarying*)fVaryings; for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ int h = F0_ITa[2*i]; int n = F0_ITa[2*i+1]; float weight = 1.0f/n; DeviceVertex dst; dst.clear(); if(NUM_VARYING_ELEMENTS > 0){ DeviceVarying dstVarying; dstVarying.clear(); for(int j=0; j __global__ void computeEdge(float *fVertex, float *fVaryings, int *E0_IT, float *E0_S, int offset, int tableOffset, int start, int end) { DeviceVertex *vertex = (DeviceVertex*)fVertex; DeviceVarying *varyings = (DeviceVarying*)fVaryings; for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i+= blockDim.x * gridDim.x){ int eidx0 = E0_IT[4*i+0]; int eidx1 = E0_IT[4*i+1]; int eidx2 = E0_IT[4*i+2]; int eidx3 = E0_IT[4*i+3]; float vertWeight = E0_S[i*2+0]; // Fully sharp edge : vertWeight = 0.5f; DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[eidx0], vertWeight); dst.addWithWeight(&vertex[eidx1], vertWeight); if(eidx2 > -1){ float faceWeight = E0_S[i*2+1]; dst.addWithWeight(&vertex[eidx2], faceWeight); dst.addWithWeight(&vertex[eidx3], faceWeight); } vertex[offset+i-tableOffset] = dst; if(NUM_VARYING_ELEMENTS > 0){ DeviceVarying dstVarying; dstVarying.clear(); dstVarying.addVaryingWithWeight(&varyings[eidx0], 0.5f); dstVarying.addVaryingWithWeight(&varyings[eidx1], 0.5f); varyings[offset+i-tableOffset] = dstVarying; } } } __global__ void computeEdge(float *fVertex, int numVertexElements, float *fVarying, int numVaryingElements, int *E0_IT, float *E0_S, int offset, int tableOffset, int start, int end) { for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i+= blockDim.x * gridDim.x){ int eidx0 = E0_IT[4*i+0]; int eidx1 = E0_IT[4*i+1]; int eidx2 = E0_IT[4*i+2]; int eidx3 = E0_IT[4*i+3]; float vertWeight = E0_S[i*2+0]; // Fully sharp edge : vertWeight = 0.5f; float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; clear(dstVertex, numVertexElements); addWithWeight(dstVertex, fVertex + eidx0*numVertexElements, vertWeight, numVertexElements); addWithWeight(dstVertex, fVertex + eidx1*numVertexElements, vertWeight, numVertexElements); if(eidx2 > -1){ float faceWeight = E0_S[i*2+1]; addWithWeight(dstVertex, fVertex + eidx2*numVertexElements, faceWeight, numVertexElements); addWithWeight(dstVertex, fVertex + eidx3*numVertexElements, faceWeight, numVertexElements); } if(numVaryingElements > 0){ float *dstVarying = fVarying + (i+offset-tableOffset)*numVaryingElements; clear(dstVarying, numVaryingElements); addVaryingWithWeight(dstVarying, fVarying + eidx0*numVaryingElements, 0.5f, numVaryingElements); addVaryingWithWeight(dstVarying, fVarying + eidx1*numVaryingElements, 0.5f, numVaryingElements); } } } template __global__ void computeVertexA(float *fVertex, float *fVaryings, int *V0_ITa, float *V0_S, int offset, int tableOffset, int start, int end, int pass) { DeviceVertex *vertex = (DeviceVertex*)fVertex; DeviceVarying *varyings = (DeviceVarying*)fVaryings; for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end+tableOffset; i += blockDim.x * gridDim.x){ int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; int eidx0 = V0_ITa[5*i+3]; int eidx1 = V0_ITa[5*i+4]; float weight = (pass==1) ? V0_S[i] : 1.0f - V0_S[i]; // In the case of fractional weight, the weight must be inverted since // the value is shared with the k_Smooth kernel (statistically the // k_Smooth kernel runs much more often than this one) if (weight>0.0f && weight<1.0f && n > 0) weight=1.0f-weight; DeviceVertex dst; if (not pass) { dst.clear(); } else { dst = vertex[i+offset-tableOffset]; } if (eidx0==-1 || (pass==0 && (n==-1)) ) { dst.addWithWeight(&vertex[p], weight); } else { dst.addWithWeight(&vertex[p], weight * 0.75f); dst.addWithWeight(&vertex[eidx0], weight * 0.125f); dst.addWithWeight(&vertex[eidx1], weight * 0.125f); } vertex[i+offset-tableOffset] = dst; if(NUM_VARYING_ELEMENTS > 0){ if(not pass){ DeviceVarying dstVarying; dstVarying.clear(); dstVarying.addVaryingWithWeight(&varyings[p], 1.0f); varyings[i+offset-tableOffset] = dstVarying; } } } } __global__ void computeVertexA(float *fVertex, int numVertexElements, float *fVaryings, int numVaryingElements, int *V0_ITa, float *V0_S, int offset, int tableOffset, int start, int end, int pass) { for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; int eidx0 = V0_ITa[5*i+3]; int eidx1 = V0_ITa[5*i+4]; float weight = (pass==1) ? V0_S[i] : 1.0f - V0_S[i]; // In the case of fractional weight, the weight must be inverted since // the value is shared with the k_Smooth kernel (statistically the // k_Smooth kernel runs much more often than this one) if (weight>0.0f && weight<1.0f && n > 0) weight=1.0f-weight; float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; if (not pass) { clear(dstVertex, numVertexElements); } if (eidx0==-1 || (pass==0 && (n==-1)) ) { addWithWeight(dstVertex, fVertex + p*numVertexElements, weight, numVertexElements); } else { addWithWeight(dstVertex, fVertex + p*numVertexElements, weight*0.75f, numVertexElements); addWithWeight(dstVertex, fVertex + eidx0*numVertexElements, weight*0.125f, numVertexElements); addWithWeight(dstVertex, fVertex + eidx1*numVertexElements, weight*0.125f, numVertexElements); } if(numVaryingElements > 0){ if(not pass){ float *dstVarying = fVaryings + (i+offset-tableOffset)*numVaryingElements; clear(dstVarying, numVaryingElements); addVaryingWithWeight(dstVarying, fVaryings + p*numVaryingElements, 1.0f, numVaryingElements); } } } } //texture texV0_IT; template __global__ void computeVertexB(float *fVertex, float *fVaryings, const int *V0_ITa, const int *V0_IT, const float *V0_S, int offset, int tableOffset, int start, int end) { DeviceVertex *vertex = (DeviceVertex*)fVertex; DeviceVarying *varyings = (DeviceVarying*)fVaryings; for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ int h = V0_ITa[5*i]; int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; float weight = V0_S[i]; float wp = 1.0f/float(n*n); float wv = (n-2.0f) * n * wp; DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[p], weight * wv); for(int j = 0; j < n; ++j){ dst.addWithWeight(&vertex[V0_IT[h+j*2]], weight * wp); dst.addWithWeight(&vertex[V0_IT[h+j*2+1]], weight * wp); // int idx0 = tex1Dfetch(texV0_IT, h+j*2); // int idx1 = tex1Dfetch(texV0_IT, h+j*2+1); // dst.addWithWeight(&vertex[idx0], weight * wp); // dst.addWithWeight(&vertex[idx1], weight * wp); } vertex[i+offset-tableOffset] = dst; if(NUM_VARYING_ELEMENTS > 0){ DeviceVarying dstVarying; dstVarying.clear(); dstVarying.addVaryingWithWeight(&varyings[p], 1.0f); varyings[i+offset-tableOffset] = dstVarying; } } } __global__ void computeVertexB(float *fVertex, int numVertexElements, float *fVaryings, int numVaryingElements, const int *V0_ITa, const int *V0_IT, const float *V0_S, int offset, int tableOffset, int start, int end) { for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ int h = V0_ITa[5*i]; int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; float weight = V0_S[i]; float wp = 1.0f/float(n*n); float wv = (n-2.0f) * n * wp; float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; clear(dstVertex, numVertexElements); addWithWeight(dstVertex, fVertex + p*numVertexElements, weight*wv, numVertexElements); for(int j = 0; j < n; ++j){ addWithWeight(dstVertex, fVertex + V0_IT[h+j*2]*numVertexElements, weight*wp, numVertexElements); addWithWeight(dstVertex, fVertex + V0_IT[h+j*2+1]*numVertexElements, weight*wp, numVertexElements); } if(numVaryingElements > 0){ float *dstVarying = fVaryings + (i+offset-tableOffset)*numVaryingElements; clear(dstVarying, numVaryingElements); addVaryingWithWeight(dstVarying, fVaryings + p*numVaryingElements, 1.0f, numVaryingElements); } } } // -------------------------------------------------------------------------------------------- template __global__ void computeLoopVertexB(float *fVertex, float *fVaryings, int *V0_ITa, int *V0_IT, float *V0_S, int offset, int tableOffset, int start, int end) { DeviceVertex *vertex = (DeviceVertex*)fVertex; DeviceVarying *varyings = (DeviceVarying*)fVaryings; for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ int h = V0_ITa[5*i]; int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; float weight = V0_S[i]; float wp = 1.0f/float(n); float beta = 0.25f * __cosf(float(M_PI) * 2.0f * wp) + 0.375f; beta = beta * beta; beta = (0.625f - beta) * wp; DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[p], weight * (1.0f - (beta * n))); for(int j = 0; j < n; ++j){ dst.addWithWeight(&vertex[V0_IT[h+j]], weight * beta); } vertex[i+offset-tableOffset] = dst; if(NUM_VARYING_ELEMENTS > 0){ DeviceVarying dstVarying; dstVarying.clear(); dstVarying.addVaryingWithWeight(&varyings[p], 1.0f); varyings[i+offset-tableOffset] = dstVarying; } } } __global__ void computeLoopVertexB(float *fVertex, int numVertexElements, float *fVaryings, int numVaryingElements, const int *V0_ITa, const int *V0_IT, const float *V0_S, int offset, int tableOffset, int start, int end) { for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ int h = V0_ITa[5*i]; int n = V0_ITa[5*i+1]; int p = V0_ITa[5*i+2]; float weight = V0_S[i]; float wp = 1.0f/float(n); float beta = 0.25f * __cosf(float(M_PI) * 2.0f * wp) + 0.375f; beta = beta * beta; beta = (0.625f - beta) * wp; float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; clear(dstVertex, numVertexElements); addWithWeight(dstVertex, fVertex + p*numVertexElements, weight*(1.0f-(beta*n)), numVertexElements); for(int j = 0; j < n; ++j){ addWithWeight(dstVertex, fVertex + V0_IT[h+j]*numVertexElements, weight*beta, numVertexElements); } if(numVaryingElements > 0){ float *dstVarying = fVaryings + (i+offset-tableOffset)*numVaryingElements; clear(dstVarying, numVaryingElements); addVaryingWithWeight(dstVarying, fVaryings + p*numVaryingElements, 1.0f, numVaryingElements); } } } // -------------------------------------------------------------------------------------------- template __global__ void computeBilinearEdge(float *fVertex, float *fVaryings, int *E0_IT, int offset, int tableOffset, int start, int end) { DeviceVertex *vertex = (DeviceVertex*)fVertex; DeviceVarying *varyings = (DeviceVarying*)fVaryings; for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i+= blockDim.x * gridDim.x){ int eidx0 = E0_IT[2*i+0]; int eidx1 = E0_IT[2*i+1]; DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[eidx0], 0.5f); dst.addWithWeight(&vertex[eidx1], 0.5f); vertex[offset+i-tableOffset] = dst; if(NUM_VARYING_ELEMENTS > 0){ DeviceVarying dstVarying; dstVarying.clear(); dstVarying.addVaryingWithWeight(&varyings[eidx0], 0.5f); dstVarying.addVaryingWithWeight(&varyings[eidx1], 0.5f); varyings[offset+i-tableOffset] = dstVarying; } } } __global__ void computeBilinearEdge(float *fVertex, int numVertexElements, float *fVarying, int numVaryingElements, int *E0_IT, int offset, int tableOffset, int start, int end) { for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i+= blockDim.x * gridDim.x){ int eidx0 = E0_IT[2*i+0]; int eidx1 = E0_IT[2*i+1]; float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; clear(dstVertex, numVertexElements); addWithWeight(dstVertex, fVertex + eidx0*numVertexElements, 0.5f, numVertexElements); addWithWeight(dstVertex, fVertex + eidx1*numVertexElements, 0.5f, numVertexElements); if(numVaryingElements > 0){ float *dstVarying = fVarying + (i+offset-tableOffset)*numVaryingElements; clear(dstVarying, numVaryingElements); addVaryingWithWeight(dstVarying, fVarying + eidx0*numVaryingElements, 0.5f, numVaryingElements); addVaryingWithWeight(dstVarying, fVarying + eidx1*numVaryingElements, 0.5f, numVaryingElements); } } } template __global__ void computeBilinearVertex(float *fVertex, float *fVaryings, int *V0_ITa, int offset, int tableOffset, int start, int end) { DeviceVertex *vertex = (DeviceVertex*)fVertex; DeviceVarying *varyings = (DeviceVarying*)fVaryings; for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ int p = V0_ITa[i]; DeviceVertex dst; dst.clear(); dst.addWithWeight(&vertex[p], 1.0f); vertex[i+offset-tableOffset] = dst; if(NUM_VARYING_ELEMENTS > 0){ DeviceVarying dstVarying; dstVarying.clear(); dstVarying.addVaryingWithWeight(&varyings[p], 1.0f); varyings[i+offset-tableOffset] = dstVarying; } } } __global__ void computeBilinearVertex(float *fVertex, int numVertexElements, float *fVaryings, int numVaryingElements, const int *V0_ITa, int offset, int tableOffset, int start, int end) { for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x){ int p = V0_ITa[i]; float *dstVertex = fVertex + (i+offset-tableOffset)*numVertexElements; clear(dstVertex, numVertexElements); addWithWeight(dstVertex, fVertex + p*numVertexElements, 1.0f, numVertexElements); if(numVaryingElements > 0){ float *dstVarying = fVaryings + (i+offset-tableOffset)*numVaryingElements; clear(dstVarying, numVaryingElements); addVaryingWithWeight(dstVarying, fVaryings + p*numVaryingElements, 1.0f, numVaryingElements); } } } // -------------------------------------------------------------------------------------------- __global__ void editVertexAdd(float *fVertex, int numVertexElements, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, const int *editIndices, const float *editValues) { for(int i = start + tableOffset + threadIdx.x + blockIdx.x*blockDim.x; i < end + tableOffset; i += blockDim.x * gridDim.x) { float *dstVertex = fVertex + (editIndices[i] + vertexOffset) * numVertexElements + primVarOffset; for(int j = 0; j < primVarWidth; j++) { // XXX: wrong? maybe editValues[i*primVarWidth + j]... *dstVertex++ += editValues[j]; } } } // -------------------------------------------------------------------------------------------- #include "../version.h" // XXX: this macro usage is tentative. Since cuda kernel can't be dynamically configured, // still trying to find better way to have optimized kernel.. #define OPT_KERNEL(NUM_USER_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, ARG) \ if(numUserVertexElements == NUM_USER_VERTEX_ELEMENTS && \ numVaryingElements == NUM_VARYING_ELEMENTS) \ { KERNEL<<>>ARG; \ return; } extern "C" { void OsdCudaComputeFace(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end) { //computeFace<3, 0><<<512,32>>>(vertex, varying, F_IT, F_ITa, offset, start, end); OPT_KERNEL(0, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); OPT_KERNEL(0, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); OPT_KERNEL(3, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); // fallback kernel (slow) computeFace<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, F_IT, F_ITa, offset, tableOffset, start, end); } void OsdCudaComputeEdge(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *E_IT, float *E_W, int offset, int tableOffset, int start, int end) { //computeEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, E_W, offset, start, end); OPT_KERNEL(0, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); OPT_KERNEL(0, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); OPT_KERNEL(3, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); computeEdge<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, E_IT, E_W, offset, tableOffset, start, end); } void OsdCudaComputeVertexA(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *V_ITa, float *V_W, int offset, int tableOffset, int start, int end, int pass) { // computeVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass); OPT_KERNEL(0, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); OPT_KERNEL(0, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); OPT_KERNEL(3, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); OPT_KERNEL(3, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); computeVertexA<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, V_ITa, V_W, offset, tableOffset, start, end, pass); } void OsdCudaComputeVertexB(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end) { // computeVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end); OPT_KERNEL(0, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); OPT_KERNEL(0, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); OPT_KERNEL(3, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); computeVertexB<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, V_ITa, V_IT, V_W, offset, tableOffset, start, end); } void OsdCudaComputeLoopVertexB(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end) { // computeLoopVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end); OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); computeLoopVertexB<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, V_ITa, V_IT, V_W, offset, tableOffset, start, end); } void OsdCudaComputeBilinearEdge(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *E_IT, int offset, int tableOffset, int start, int end) { //computeBilinearEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end); OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); computeBilinearEdge<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, E_IT, offset, tableOffset, start, end); } void OsdCudaComputeBilinearVertex(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *V_ITa, int offset, int tableOffset, int start, int end) { // computeBilinearVertex<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end); OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); computeBilinearVertex<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements, V_ITa, offset, tableOffset, start, end); } void OsdCudaEditVertexAdd(float *vertex, int numUserVertexElements, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, int *editIndices, float *editValues) { editVertexAdd<<<512, 32>>>(vertex, 3+numUserVertexElements, primVarOffset, primVarWidth, vertexOffset, tableOffset, start, end, editIndices, editValues); } } /* extern "C" */