diff --git a/opensubdiv/osd/cudaComputeContext.cpp b/opensubdiv/osd/cudaComputeContext.cpp index fb32f690..797a8960 100644 --- a/opensubdiv/osd/cudaComputeContext.cpp +++ b/opensubdiv/osd/cudaComputeContext.cpp @@ -30,14 +30,17 @@ namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { bool -OsdCudaTable::createCudaBuffer(size_t size, const void *ptr) { +OsdCudaTable::createCudaBuffer(cudaStream_t stream, size_t size, const void *ptr) { + + cudaHostRegister((void**)&ptr, size); + /* The above command is slow. Try to use cudaMallocHost during the allocation of ptr to speedup */ cudaError_t err = cudaMalloc(&_devicePtr, size); if (err != cudaSuccess) { return false; } - err = cudaMemcpy(_devicePtr, ptr, size, cudaMemcpyHostToDevice); + err = cudaMemcpyAsync(_devicePtr, ptr, size, cudaMemcpyHostToDevice, stream); if (err != cudaSuccess) { cudaFree(_devicePtr); _devicePtr = NULL; @@ -140,15 +143,15 @@ OsdCudaComputeContext::initialize(FarSubdivisionTables const *subdivisionTables, // allocate 5 or 7 tables _tables.resize(subdivisionTables->GetNumTables(), 0); - _tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(subdivisionTables->Get_E_IT()); - _tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(subdivisionTables->Get_V_IT()); - _tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(subdivisionTables->Get_V_ITa()); - _tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(subdivisionTables->Get_E_W()); - _tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(subdivisionTables->Get_V_W()); + _tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_IT()); + _tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_IT()); + _tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_ITa()); + _tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_W()); + _tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_W()); if (subdivisionTables->GetNumTables() > 5) { - _tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(subdivisionTables->Get_F_IT()); - _tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(subdivisionTables->Get_F_ITa()); + _tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_IT()); + _tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_ITa()); } // error check @@ -202,6 +205,10 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables, OsdCudaComputeContext *result = new OsdCudaComputeContext(); + cudaStream_t stream; + cudaStreamCreate(&stream); + _stream = &stream; + if (result->initialize(subdivisionTables, vertexEditTables) == false) { delete result; return NULL; @@ -209,5 +216,10 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables, return result; } +cudaStream_t +OsdComputeContext::GetStream(){ + return *_stream +} + } // end namespace OPENSUBDIV_VERSION } // end namespace OpenSubdiv diff --git a/opensubdiv/osd/cudaComputeContext.h b/opensubdiv/osd/cudaComputeContext.h index 270d114f..68918071 100644 --- a/opensubdiv/osd/cudaComputeContext.h +++ b/opensubdiv/osd/cudaComputeContext.h @@ -42,9 +42,9 @@ namespace OPENSUBDIV_VERSION { class OsdCudaTable : OsdNonCopyable { public: template - static OsdCudaTable * Create(const std::vector &table) { + static OsdCudaTable * Create(cudaStream_t stream, const std::vector &table) { OsdCudaTable *result = new OsdCudaTable(); - if (not result->createCudaBuffer(table.size() * sizeof(T), table.empty() ? NULL : &table[0])) { + if (not result->createCudaBuffer(stream, table.size() * sizeof(T), table.empty() ? NULL : &table[0])) { delete result; return NULL; } @@ -55,12 +55,16 @@ public: void * GetCudaMemory() const; + cudaStream_t GetStream(); + private: OsdCudaTable() : _devicePtr(NULL) {} bool createCudaBuffer(size_t size, const void *ptr); void *_devicePtr; + + cudaStream_t *_stream; }; class OsdCudaHEditTable : OsdNonCopyable { diff --git a/opensubdiv/osd/cudaComputeController.cpp b/opensubdiv/osd/cudaComputeController.cpp index d93f7a62..7997e996 100644 --- a/opensubdiv/osd/cudaComputeController.cpp +++ b/opensubdiv/osd/cudaComputeController.cpp @@ -30,78 +30,92 @@ extern "C" { -void OsdCudaComputeFace(float *vertex, float *varying, +void OsdCudaComputeFace(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end); -void OsdCudaComputeQuadFace(float *vertex, float *varying, +void OsdCudaComputeQuadFace(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *F_IT, int offset, int tableOffset, int start, int end); -void OsdCudaComputeTriQuadFace(float *vertex, float *varying, +void OsdCudaComputeTriQuadFace(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *F_IT, int offset, int tableOffset, int start, int end); -void OsdCudaComputeEdge(float *vertex, float *varying, +void OsdCudaComputeEdge(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *E_IT, float *E_W, int offset, int tableOffset, int start, int end); -void OsdCudaComputeRestrictedEdge(float *vertex, float *varying, +void OsdCudaComputeRestrictedEdge(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *E_IT, int offset, int tableOffset, int start, int end); -void OsdCudaComputeVertexA(float *vertex, float *varying, +void OsdCudaComputeVertexA(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, float *V_W, int offset, int tableOffset, int start, int end, int pass); -void OsdCudaComputeVertexB(float *vertex, float *varying, +void OsdCudaComputeVertexB(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end); -void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying, +void OsdCudaComputeRestrictedVertexA(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int offset, int tableOffset, int start, int end); -void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying, +void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end); -void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying, +void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end); -void OsdCudaComputeLoopVertexB(float *vertex, float *varying, +void OsdCudaComputeLoopVertexB(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end); -void OsdCudaComputeBilinearEdge(float *vertex, float *varying, +void OsdCudaComputeBilinearEdge(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *E_IT, int offset, int tableOffset, int start, int end); -void OsdCudaComputeBilinearVertex(float *vertex, float *varying, +void OsdCudaComputeBilinearVertex(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int offset, int tableOffset, int start, int end); -void OsdCudaEditVertexAdd(float *vertex, +void OsdCudaEditVertexAdd(cudaStream_t stream, + float *vertex, int vertexLength, int vertexStride, int primVarOffset, int primVarWidth, int offset, int tableOffset, @@ -133,6 +147,7 @@ OsdCudaComputeController::ApplyBilinearFaceVerticesKernel( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeFace( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -154,6 +169,7 @@ OsdCudaComputeController::ApplyBilinearEdgeVerticesKernel( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeBilinearEdge( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -174,6 +190,7 @@ OsdCudaComputeController::ApplyBilinearVertexVerticesKernel( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeBilinearVertex( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -196,6 +213,7 @@ OsdCudaComputeController::ApplyCatmarkFaceVerticesKernel( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeFace( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -217,6 +235,7 @@ OsdCudaComputeController::ApplyCatmarkQuadFaceVerticesKernel( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeQuadFace( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -237,6 +256,7 @@ OsdCudaComputeController::ApplyCatmarkTriQuadFaceVerticesKernel( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeTriQuadFace( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -259,6 +279,7 @@ OsdCudaComputeController::ApplyCatmarkEdgeVerticesKernel( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeEdge( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -280,6 +301,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedEdgeVerticesKernel( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeRestrictedEdge( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -304,6 +326,7 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelB( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeVertexB( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -328,6 +351,7 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA1( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeVertexA( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -351,6 +375,7 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA2( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeVertexA( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -374,6 +399,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeRestrictedVertexB1( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -397,6 +423,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeRestrictedVertexB2( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -418,6 +445,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeRestrictedVertexA( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -440,6 +468,7 @@ OsdCudaComputeController::ApplyLoopEdgeVerticesKernel( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeEdge( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -465,6 +494,7 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelB( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeLoopVertexB( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -489,6 +519,7 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA1( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeVertexA( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -512,6 +543,7 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA2( float *varying = _currentBindState.GetOffsettedVaryingBuffer(); OsdCudaComputeVertexA( + context->GetStream(), vertex, varying, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, _currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride, @@ -536,6 +568,7 @@ OsdCudaComputeController::ApplyVertexEdits( if (edit->GetOperation() == FarVertexEdit::Add) { OsdCudaEditVertexAdd( + context->GetStream(), vertex, _currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride, edit->GetPrimvarOffset(), diff --git a/opensubdiv/osd/cudaKernel.cu b/opensubdiv/osd/cudaKernel.cu index 11324bb1..5d25db66 100644 --- a/opensubdiv/osd/cudaKernel.cu +++ b/opensubdiv/osd/cudaKernel.cu @@ -1020,234 +1020,258 @@ editVertexAdd(float *fVertex, int vertexLength, int vertexStride, // XXX: this macro usage is tentative. Since cuda kernel can't be dynamically configured, // still trying to find better way to have optimized kernel.. -#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, ARG) \ +#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, STREAM, ARG) \ if(vertexLength == NUM_VERTEX_ELEMENTS && \ varyingLength == NUM_VARYING_ELEMENTS && \ vertexStride == vertexLength && \ varyingStride == varyingLength) \ - { KERNEL<<>>ARG; \ + { KERNEL<<>>ARG; \ return; } extern "C" { -void OsdCudaComputeFace(float *vertex, float *varying, +void OsdCudaComputeFace(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end) { - //computeFace<3, 0><<<512,32>>>(vertex, varying, F_IT, F_ITa, offset, start, end); - OPT_KERNEL(0, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); + //computeFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, F_ITa, offset, start, end); + OPT_KERNEL(0, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end)); // fallback kernel (slow) - computeFace<<<512, 32>>>(vertex, varying, + computeFace<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, F_IT, F_ITa, offset, tableOffset, start, end); } -void OsdCudaComputeQuadFace(float *vertex, float *varying, +void OsdCudaComputeQuadFace(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *F_IT, int offset, int tableOffset, int start, int end) { - //computeQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end); - OPT_KERNEL(0, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end)); + //computeQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end); + OPT_KERNEL(0, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end)); // fallback kernel (slow) - computeQuadFace<<<512, 32>>>(vertex, varying, + computeQuadFace<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, F_IT, offset, tableOffset, start, end); } -void OsdCudaComputeTriQuadFace(float *vertex, float *varying, +void OsdCudaComputeTriQuadFace(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *F_IT, int offset, int tableOffset, int start, int end) { - //computeTriQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end); - OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end)); + //computeTriQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end); + OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end)); // fallback kernel (slow) - computeTriQuadFace<<<512, 32>>>(vertex, varying, + computeTriQuadFace<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, F_IT, offset, tableOffset, start, end); } -void OsdCudaComputeEdge(float *vertex, float *varying, +void OsdCudaComputeEdge(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *E_IT, float *E_W, int offset, int tableOffset, int start, int end) { - //computeEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, E_W, offset, start, end); - OPT_KERNEL(0, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); + //computeEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, E_W, offset, start, end); + OPT_KERNEL(0, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end)); - computeEdge<<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeEdge<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, E_IT, E_W, offset, tableOffset, start, end); } -void OsdCudaComputeRestrictedEdge(float *vertex, float *varying, +void OsdCudaComputeRestrictedEdge(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *E_IT, int offset, int tableOffset, int start, int end) { - //computeRestrictedEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end); - OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); + //computeRestrictedEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end); + OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end)); - computeRestrictedEdge<<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeRestrictedEdge<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, E_IT, offset, tableOffset, start, end); } -void OsdCudaComputeVertexA(float *vertex, float *varying, +void OsdCudaComputeVertexA(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, float *V_W, int offset, int tableOffset, int start, int end, int pass) { -// computeVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass); - OPT_KERNEL(0, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); - OPT_KERNEL(0, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); - OPT_KERNEL(3, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); - OPT_KERNEL(3, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); +// computeVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass); + OPT_KERNEL(0, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); + OPT_KERNEL(0, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); + OPT_KERNEL(3, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); + OPT_KERNEL(3, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass)); - computeVertexA<<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeVertexA<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, V_W, offset, tableOffset, start, end, pass); } -void OsdCudaComputeVertexB(float *vertex, float *varying, +void OsdCudaComputeVertexB(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end) { -// computeVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end); - OPT_KERNEL(0, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); +// computeVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end); + OPT_KERNEL(0, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - computeVertexB<<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeVertexB<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, V_IT, V_W, offset, tableOffset, start, end); } -void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying, +void OsdCudaComputeRestrictedVertexA(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int offset, int tableOffset, int start, int end) { -// computeRestrictedVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end); - OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); +// computeRestrictedVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end); + OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end)); - computeRestrictedVertexA<<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeRestrictedVertexA<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, offset, tableOffset, start, end); } -void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying, +void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end) { -// computeRestrictedVertexB1<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end); - OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); +// computeRestrictedVertexB1<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end); + OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); - computeRestrictedVertexB1 <<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeRestrictedVertexB1<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, V_IT, offset, tableOffset, start, end); } -void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying, +void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end) { -// computeRestrictedVertexB2<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end); - OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); +// computeRestrictedVertexB2<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end); + OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end)); - computeRestrictedVertexB2 <<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeRestrictedVertexB2<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, V_IT, offset, tableOffset, start, end); } -void OsdCudaComputeLoopVertexB(float *vertex, float *varying, +void OsdCudaComputeLoopVertexB(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end) { -// computeLoopVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end); - OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); +// computeLoopVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end); + OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end)); - computeLoopVertexB<<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeLoopVertexB<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, V_IT, V_W, offset, tableOffset, start, end); } -void OsdCudaComputeBilinearEdge(float *vertex, float *varying, +void OsdCudaComputeBilinearEdge(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *E_IT, int offset, int tableOffset, int start, int end) { - //computeBilinearEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end); - OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end)); + //computeBilinearEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end); + OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end)); - computeBilinearEdge<<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeBilinearEdge<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, E_IT, offset, tableOffset, start, end); } -void OsdCudaComputeBilinearVertex(float *vertex, float *varying, +void OsdCudaComputeBilinearVertex(cudaStream_t stream, + float *vertex, float *varying, int vertexLength, int vertexStride, int varyingLength, int varyingStride, int *V_ITa, int offset, int tableOffset, int start, int end) { -// computeBilinearVertex<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end); - OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); - OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); - OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); - OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end)); +// computeBilinearVertex<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end); + OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end)); + OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end)); - computeBilinearVertex<<<512, 32>>>(vertex, varying, + // fallback kernel (slow) + computeBilinearVertex<<<512, 32, 0, stream>>>(vertex, varying, vertexLength, vertexStride, varyingLength, varyingStride, V_ITa, offset, tableOffset, start, end); } -void OsdCudaEditVertexAdd(float *vertex, int vertexLength, int vertexStride, +void OsdCudaEditVertexAdd(cudaStream_t stream, + float *vertex, int vertexLength, int vertexStride, int primVarOffset, int primVarWidth, int vertexOffset, int tableOffset, int start, int end, int *editIndices, float *editValues) { - editVertexAdd<<<512, 32>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth, + editVertexAdd<<<512, 32, 0, stream>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth, vertexOffset, tableOffset, start, end, editIndices, editValues); }