Merge pull request #340 from adityaatluri/dev

added memcpyasync and streams to cuda backend
This commit is contained in:
Manuel Kraemer 2014-07-24 17:42:41 -07:00
commit ae4ecf5bf9
4 changed files with 193 additions and 120 deletions

View File

@ -30,14 +30,17 @@ namespace OpenSubdiv {
namespace OPENSUBDIV_VERSION {
bool
OsdCudaTable::createCudaBuffer(size_t size, const void *ptr) {
OsdCudaTable::createCudaBuffer(cudaStream_t stream, size_t size, const void *ptr) {
cudaHostRegister((void**)&ptr, size);
/* The above command is slow. Try to use cudaMallocHost during the allocation of ptr to speedup */
cudaError_t err = cudaMalloc(&_devicePtr, size);
if (err != cudaSuccess) {
return false;
}
err = cudaMemcpy(_devicePtr, ptr, size, cudaMemcpyHostToDevice);
err = cudaMemcpyAsync(_devicePtr, ptr, size, cudaMemcpyHostToDevice, stream);
if (err != cudaSuccess) {
cudaFree(_devicePtr);
_devicePtr = NULL;
@ -140,15 +143,15 @@ OsdCudaComputeContext::initialize(FarSubdivisionTables const *subdivisionTables,
// allocate 5 or 7 tables
_tables.resize(subdivisionTables->GetNumTables(), 0);
_tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(subdivisionTables->Get_E_IT());
_tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(subdivisionTables->Get_V_IT());
_tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(subdivisionTables->Get_V_ITa());
_tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(subdivisionTables->Get_E_W());
_tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(subdivisionTables->Get_V_W());
_tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_IT());
_tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_IT());
_tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_ITa());
_tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_W());
_tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_W());
if (subdivisionTables->GetNumTables() > 5) {
_tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(subdivisionTables->Get_F_IT());
_tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(subdivisionTables->Get_F_ITa());
_tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_IT());
_tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_ITa());
}
// error check
@ -202,6 +205,10 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables,
OsdCudaComputeContext *result = new OsdCudaComputeContext();
cudaStream_t stream;
cudaStreamCreate(&stream);
_stream = &stream;
if (result->initialize(subdivisionTables, vertexEditTables) == false) {
delete result;
return NULL;
@ -209,5 +216,10 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables,
return result;
}
cudaStream_t
OsdComputeContext::GetStream(){
return *_stream
}
} // end namespace OPENSUBDIV_VERSION
} // end namespace OpenSubdiv

View File

@ -42,9 +42,9 @@ namespace OPENSUBDIV_VERSION {
class OsdCudaTable : OsdNonCopyable<OsdCudaTable> {
public:
template<typename T>
static OsdCudaTable * Create(const std::vector<T> &table) {
static OsdCudaTable * Create(cudaStream_t stream, const std::vector<T> &table) {
OsdCudaTable *result = new OsdCudaTable();
if (not result->createCudaBuffer(table.size() * sizeof(T), table.empty() ? NULL : &table[0])) {
if (not result->createCudaBuffer(stream, table.size() * sizeof(T), table.empty() ? NULL : &table[0])) {
delete result;
return NULL;
}
@ -55,12 +55,16 @@ public:
void * GetCudaMemory() const;
cudaStream_t GetStream();
private:
OsdCudaTable() : _devicePtr(NULL) {}
bool createCudaBuffer(size_t size, const void *ptr);
void *_devicePtr;
cudaStream_t *_stream;
};
class OsdCudaHEditTable : OsdNonCopyable<OsdCudaHEditTable> {

View File

@ -30,78 +30,92 @@
extern "C" {
void OsdCudaComputeFace(float *vertex, float *varying,
void OsdCudaComputeFace(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end);
void OsdCudaComputeQuadFace(float *vertex, float *varying,
void OsdCudaComputeQuadFace(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int offset, int tableOffset, int start, int end);
void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
void OsdCudaComputeTriQuadFace(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int offset, int tableOffset, int start, int end);
void OsdCudaComputeEdge(float *vertex, float *varying,
void OsdCudaComputeEdge(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, float *E_W, int offset, int tableOffset, int start, int end);
void OsdCudaComputeRestrictedEdge(float *vertex, float *varying,
void OsdCudaComputeRestrictedEdge(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, int offset, int tableOffset, int start, int end);
void OsdCudaComputeVertexA(float *vertex, float *varying,
void OsdCudaComputeVertexA(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, float *V_W, int offset, int tableOffset,
int start, int end, int pass);
void OsdCudaComputeVertexB(float *vertex, float *varying,
void OsdCudaComputeVertexB(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexA(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeLoopVertexB(float *vertex, float *varying,
void OsdCudaComputeLoopVertexB(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeBilinearEdge(float *vertex, float *varying,
void OsdCudaComputeBilinearEdge(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, int offset, int tableOffset, int start, int end);
void OsdCudaComputeBilinearVertex(float *vertex, float *varying,
void OsdCudaComputeBilinearVertex(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int offset, int tableOffset, int start, int end);
void OsdCudaEditVertexAdd(float *vertex,
void OsdCudaEditVertexAdd(cudaStream_t stream,
float *vertex,
int vertexLength, int vertexStride,
int primVarOffset, int primVarWidth,
int offset, int tableOffset,
@ -133,6 +147,7 @@ OsdCudaComputeController::ApplyBilinearFaceVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeFace(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -154,6 +169,7 @@ OsdCudaComputeController::ApplyBilinearEdgeVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeBilinearEdge(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -174,6 +190,7 @@ OsdCudaComputeController::ApplyBilinearVertexVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeBilinearVertex(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -196,6 +213,7 @@ OsdCudaComputeController::ApplyCatmarkFaceVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeFace(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -217,6 +235,7 @@ OsdCudaComputeController::ApplyCatmarkQuadFaceVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeQuadFace(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -237,6 +256,7 @@ OsdCudaComputeController::ApplyCatmarkTriQuadFaceVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeTriQuadFace(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -259,6 +279,7 @@ OsdCudaComputeController::ApplyCatmarkEdgeVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeEdge(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -280,6 +301,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedEdgeVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeRestrictedEdge(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -304,6 +326,7 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelB(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexB(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -328,6 +351,7 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA1(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -351,6 +375,7 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA2(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -374,6 +399,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeRestrictedVertexB1(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -397,6 +423,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeRestrictedVertexB2(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -418,6 +445,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeRestrictedVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -440,6 +468,7 @@ OsdCudaComputeController::ApplyLoopEdgeVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeEdge(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -465,6 +494,7 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelB(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeLoopVertexB(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -489,6 +519,7 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA1(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -512,6 +543,7 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA2(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -536,6 +568,7 @@ OsdCudaComputeController::ApplyVertexEdits(
if (edit->GetOperation() == FarVertexEdit::Add) {
OsdCudaEditVertexAdd(
context->GetStream(),
vertex,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
edit->GetPrimvarOffset(),

View File

@ -1020,234 +1020,258 @@ editVertexAdd(float *fVertex, int vertexLength, int vertexStride,
// XXX: this macro usage is tentative. Since cuda kernel can't be dynamically configured,
// still trying to find better way to have optimized kernel..
#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, ARG) \
#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, STREAM, ARG) \
if(vertexLength == NUM_VERTEX_ELEMENTS && \
varyingLength == NUM_VARYING_ELEMENTS && \
vertexStride == vertexLength && \
varyingStride == varyingLength) \
{ KERNEL<NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS><<<X,Y>>>ARG; \
{ KERNEL<NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS><<<X,Y,0,STREAM>>>ARG; \
return; }
extern "C" {
void OsdCudaComputeFace(float *vertex, float *varying,
void OsdCudaComputeFace(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end)
{
//computeFace<3, 0><<<512,32>>>(vertex, varying, F_IT, F_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
//computeFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, F_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
// fallback kernel (slow)
computeFace<<<512, 32>>>(vertex, varying,
computeFace<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
F_IT, F_ITa, offset, tableOffset, start, end);
}
void OsdCudaComputeQuadFace(float *vertex, float *varying,
void OsdCudaComputeQuadFace(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int offset, int tableOffset, int start, int end)
{
//computeQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
OPT_KERNEL(0, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
//computeQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end);
OPT_KERNEL(0, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
// fallback kernel (slow)
computeQuadFace<<<512, 32>>>(vertex, varying,
computeQuadFace<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
F_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
void OsdCudaComputeTriQuadFace(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int offset, int tableOffset, int start, int end)
{
//computeTriQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
//computeTriQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end);
OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
// fallback kernel (slow)
computeTriQuadFace<<<512, 32>>>(vertex, varying,
computeTriQuadFace<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
F_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeEdge(float *vertex, float *varying,
void OsdCudaComputeEdge(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, float *E_W, int offset, int tableOffset, int start, int end)
{
//computeEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, E_W, offset, start, end);
OPT_KERNEL(0, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
//computeEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, E_W, offset, start, end);
OPT_KERNEL(0, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
computeEdge<<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeEdge<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
E_IT, E_W, offset, tableOffset, start, end);
}
void OsdCudaComputeRestrictedEdge(float *vertex, float *varying,
void OsdCudaComputeRestrictedEdge(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, int offset, int tableOffset, int start, int end)
{
//computeRestrictedEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
//computeRestrictedEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
computeRestrictedEdge<<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeRestrictedEdge<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
E_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeVertexA(float *vertex, float *varying,
void OsdCudaComputeVertexA(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, float *V_W, int offset, int tableOffset, int start, int end, int pass)
{
// computeVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass);
OPT_KERNEL(0, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(0, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(3, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(3, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
// computeVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass);
OPT_KERNEL(0, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(0, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(3, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(3, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
computeVertexA<<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeVertexA<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_W, offset, tableOffset, start, end, pass);
}
void OsdCudaComputeVertexB(float *vertex, float *varying,
void OsdCudaComputeVertexB(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end)
{
// computeVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
OPT_KERNEL(0, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
// computeVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
OPT_KERNEL(0, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
computeVertexB<<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeVertexB<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_IT, V_W, offset, tableOffset, start, end);
}
void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexA(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int offset, int tableOffset, int start, int end)
{
// computeRestrictedVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
// computeRestrictedVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
computeRestrictedVertexA<<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeRestrictedVertexA<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, offset, tableOffset, start, end);
}
void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end)
{
// computeRestrictedVertexB1<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
// computeRestrictedVertexB1<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
computeRestrictedVertexB1 <<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeRestrictedVertexB1<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end)
{
// computeRestrictedVertexB2<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
// computeRestrictedVertexB2<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
computeRestrictedVertexB2 <<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeRestrictedVertexB2<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeLoopVertexB(float *vertex, float *varying,
void OsdCudaComputeLoopVertexB(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end)
{
// computeLoopVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
// computeLoopVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
computeLoopVertexB<<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeLoopVertexB<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_IT, V_W, offset, tableOffset, start, end);
}
void OsdCudaComputeBilinearEdge(float *vertex, float *varying,
void OsdCudaComputeBilinearEdge(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, int offset, int tableOffset, int start, int end)
{
//computeBilinearEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
//computeBilinearEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
computeBilinearEdge<<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeBilinearEdge<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
E_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeBilinearVertex(float *vertex, float *varying,
void OsdCudaComputeBilinearVertex(cudaStream_t stream,
float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int offset, int tableOffset, int start, int end)
{
// computeBilinearVertex<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
// computeBilinearVertex<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
computeBilinearVertex<<<512, 32>>>(vertex, varying,
// fallback kernel (slow)
computeBilinearVertex<<<512, 32, 0, stream>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, offset, tableOffset, start, end);
}
void OsdCudaEditVertexAdd(float *vertex, int vertexLength, int vertexStride,
void OsdCudaEditVertexAdd(cudaStream_t stream,
float *vertex, int vertexLength, int vertexStride,
int primVarOffset, int primVarWidth,
int vertexOffset, int tableOffset,
int start, int end, int *editIndices, float *editValues)
{
editVertexAdd<<<512, 32>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth,
editVertexAdd<<<512, 32, 0, stream>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth,
vertexOffset, tableOffset, start, end,
editIndices, editValues);
}