Revert "added memcpyasync and streams to cuda backend"

This reverts commit 84212cd725.
This commit is contained in:
Manuel Kraemer 2014-08-02 12:40:08 -07:00
parent 8c9096cdc8
commit df719eb9ed
4 changed files with 120 additions and 193 deletions

View File

@ -30,17 +30,14 @@ namespace OpenSubdiv {
namespace OPENSUBDIV_VERSION {
bool
OsdCudaTable::createCudaBuffer(cudaStream_t stream, size_t size, const void *ptr) {
cudaHostRegister((void**)&ptr, size);
/* The above command is slow. Try to use cudaMallocHost during the allocation of ptr to speedup */
OsdCudaTable::createCudaBuffer(size_t size, const void *ptr) {
cudaError_t err = cudaMalloc(&_devicePtr, size);
if (err != cudaSuccess) {
return false;
}
err = cudaMemcpyAsync(_devicePtr, ptr, size, cudaMemcpyHostToDevice, stream);
err = cudaMemcpy(_devicePtr, ptr, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
cudaFree(_devicePtr);
_devicePtr = NULL;
@ -143,15 +140,15 @@ OsdCudaComputeContext::initialize(FarSubdivisionTables const *subdivisionTables,
// allocate 5 or 7 tables
_tables.resize(subdivisionTables->GetNumTables(), 0);
_tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_IT());
_tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_IT());
_tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_ITa());
_tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_W());
_tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_W());
_tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(subdivisionTables->Get_E_IT());
_tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(subdivisionTables->Get_V_IT());
_tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(subdivisionTables->Get_V_ITa());
_tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(subdivisionTables->Get_E_W());
_tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(subdivisionTables->Get_V_W());
if (subdivisionTables->GetNumTables() > 5) {
_tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_IT());
_tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_ITa());
_tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(subdivisionTables->Get_F_IT());
_tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(subdivisionTables->Get_F_ITa());
}
// error check
@ -205,10 +202,6 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables,
OsdCudaComputeContext *result = new OsdCudaComputeContext();
cudaStream_t stream;
cudaStreamCreate(&stream);
_stream = &stream;
if (result->initialize(subdivisionTables, vertexEditTables) == false) {
delete result;
return NULL;
@ -216,10 +209,5 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables,
return result;
}
cudaStream_t
OsdComputeContext::GetStream(){
return *_stream
}
} // end namespace OPENSUBDIV_VERSION
} // end namespace OpenSubdiv

View File

@ -42,9 +42,9 @@ namespace OPENSUBDIV_VERSION {
class OsdCudaTable : OsdNonCopyable<OsdCudaTable> {
public:
template<typename T>
static OsdCudaTable * Create(cudaStream_t stream, const std::vector<T> &table) {
static OsdCudaTable * Create(const std::vector<T> &table) {
OsdCudaTable *result = new OsdCudaTable();
if (not result->createCudaBuffer(stream, table.size() * sizeof(T), table.empty() ? NULL : &table[0])) {
if (not result->createCudaBuffer(table.size() * sizeof(T), table.empty() ? NULL : &table[0])) {
delete result;
return NULL;
}
@ -55,16 +55,12 @@ public:
void * GetCudaMemory() const;
cudaStream_t GetStream();
private:
OsdCudaTable() : _devicePtr(NULL) {}
bool createCudaBuffer(size_t size, const void *ptr);
void *_devicePtr;
cudaStream_t *_stream;
};
class OsdCudaHEditTable : OsdNonCopyable<OsdCudaHEditTable> {

View File

@ -30,92 +30,78 @@
extern "C" {
void OsdCudaComputeFace(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeFace(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end);
void OsdCudaComputeQuadFace(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeQuadFace(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int offset, int tableOffset, int start, int end);
void OsdCudaComputeTriQuadFace(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int offset, int tableOffset, int start, int end);
void OsdCudaComputeEdge(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeEdge(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, float *E_W, int offset, int tableOffset, int start, int end);
void OsdCudaComputeRestrictedEdge(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeRestrictedEdge(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, int offset, int tableOffset, int start, int end);
void OsdCudaComputeVertexA(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeVertexA(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, float *V_W, int offset, int tableOffset,
int start, int end, int pass);
void OsdCudaComputeVertexB(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeVertexB(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeRestrictedVertexA(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeLoopVertexB(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeLoopVertexB(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset,
int start, int end);
void OsdCudaComputeBilinearEdge(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeBilinearEdge(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, int offset, int tableOffset, int start, int end);
void OsdCudaComputeBilinearVertex(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeBilinearVertex(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int offset, int tableOffset, int start, int end);
void OsdCudaEditVertexAdd(cudaStream_t stream,
float *vertex,
void OsdCudaEditVertexAdd(float *vertex,
int vertexLength, int vertexStride,
int primVarOffset, int primVarWidth,
int offset, int tableOffset,
@ -147,7 +133,6 @@ OsdCudaComputeController::ApplyBilinearFaceVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeFace(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -169,7 +154,6 @@ OsdCudaComputeController::ApplyBilinearEdgeVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeBilinearEdge(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -190,7 +174,6 @@ OsdCudaComputeController::ApplyBilinearVertexVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeBilinearVertex(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -213,7 +196,6 @@ OsdCudaComputeController::ApplyCatmarkFaceVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeFace(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -235,7 +217,6 @@ OsdCudaComputeController::ApplyCatmarkQuadFaceVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeQuadFace(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -256,7 +237,6 @@ OsdCudaComputeController::ApplyCatmarkTriQuadFaceVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeTriQuadFace(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -279,7 +259,6 @@ OsdCudaComputeController::ApplyCatmarkEdgeVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeEdge(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -301,7 +280,6 @@ OsdCudaComputeController::ApplyCatmarkRestrictedEdgeVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeRestrictedEdge(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -326,7 +304,6 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelB(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexB(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -351,7 +328,6 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA1(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -375,7 +351,6 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA2(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -399,7 +374,6 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeRestrictedVertexB1(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -423,7 +397,6 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeRestrictedVertexB2(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -445,7 +418,6 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeRestrictedVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -468,7 +440,6 @@ OsdCudaComputeController::ApplyLoopEdgeVerticesKernel(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeEdge(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -494,7 +465,6 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelB(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeLoopVertexB(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -519,7 +489,6 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA1(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -543,7 +512,6 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA2(
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
OsdCudaComputeVertexA(
context->GetStream(),
vertex, varying,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
@ -568,7 +536,6 @@ OsdCudaComputeController::ApplyVertexEdits(
if (edit->GetOperation() == FarVertexEdit::Add) {
OsdCudaEditVertexAdd(
context->GetStream(),
vertex,
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
edit->GetPrimvarOffset(),

View File

@ -1020,258 +1020,234 @@ editVertexAdd(float *fVertex, int vertexLength, int vertexStride,
// XXX: this macro usage is tentative. Since cuda kernel can't be dynamically configured,
// still trying to find better way to have optimized kernel..
#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, STREAM, ARG) \
#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, ARG) \
if(vertexLength == NUM_VERTEX_ELEMENTS && \
varyingLength == NUM_VARYING_ELEMENTS && \
vertexStride == vertexLength && \
varyingStride == varyingLength) \
{ KERNEL<NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS><<<X,Y,0,STREAM>>>ARG; \
{ KERNEL<NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS><<<X,Y>>>ARG; \
return; }
extern "C" {
void OsdCudaComputeFace(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeFace(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end)
{
//computeFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, F_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
//computeFace<3, 0><<<512,32>>>(vertex, varying, F_IT, F_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
// fallback kernel (slow)
computeFace<<<512, 32, 0, stream>>>(vertex, varying,
computeFace<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
F_IT, F_ITa, offset, tableOffset, start, end);
}
void OsdCudaComputeQuadFace(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeQuadFace(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int offset, int tableOffset, int start, int end)
{
//computeQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end);
OPT_KERNEL(0, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
//computeQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
OPT_KERNEL(0, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
// fallback kernel (slow)
computeQuadFace<<<512, 32, 0, stream>>>(vertex, varying,
computeQuadFace<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
F_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeTriQuadFace(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *F_IT, int offset, int tableOffset, int start, int end)
{
//computeTriQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end);
OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
//computeTriQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
// fallback kernel (slow)
computeTriQuadFace<<<512, 32, 0, stream>>>(vertex, varying,
computeTriQuadFace<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
F_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeEdge(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeEdge(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, float *E_W, int offset, int tableOffset, int start, int end)
{
//computeEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, E_W, offset, start, end);
OPT_KERNEL(0, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
//computeEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, E_W, offset, start, end);
OPT_KERNEL(0, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
// fallback kernel (slow)
computeEdge<<<512, 32, 0, stream>>>(vertex, varying,
computeEdge<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
E_IT, E_W, offset, tableOffset, start, end);
}
void OsdCudaComputeRestrictedEdge(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeRestrictedEdge(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, int offset, int tableOffset, int start, int end)
{
//computeRestrictedEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
//computeRestrictedEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
// fallback kernel (slow)
computeRestrictedEdge<<<512, 32, 0, stream>>>(vertex, varying,
computeRestrictedEdge<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
E_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeVertexA(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeVertexA(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, float *V_W, int offset, int tableOffset, int start, int end, int pass)
{
// computeVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass);
OPT_KERNEL(0, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(0, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(3, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(3, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
// computeVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass);
OPT_KERNEL(0, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(0, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(3, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
OPT_KERNEL(3, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
// fallback kernel (slow)
computeVertexA<<<512, 32, 0, stream>>>(vertex, varying,
computeVertexA<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_W, offset, tableOffset, start, end, pass);
}
void OsdCudaComputeVertexB(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeVertexB(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end)
{
// computeVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
OPT_KERNEL(0, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
// computeVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
OPT_KERNEL(0, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
// fallback kernel (slow)
computeVertexB<<<512, 32, 0, stream>>>(vertex, varying,
computeVertexB<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_IT, V_W, offset, tableOffset, start, end);
}
void OsdCudaComputeRestrictedVertexA(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int offset, int tableOffset, int start, int end)
{
// computeRestrictedVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
// computeRestrictedVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
// fallback kernel (slow)
computeRestrictedVertexA<<<512, 32, 0, stream>>>(vertex, varying,
computeRestrictedVertexA<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, offset, tableOffset, start, end);
}
void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end)
{
// computeRestrictedVertexB1<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
// computeRestrictedVertexB1<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
// fallback kernel (slow)
computeRestrictedVertexB1<<<512, 32, 0, stream>>>(vertex, varying,
computeRestrictedVertexB1 <<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end)
{
// computeRestrictedVertexB2<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
// computeRestrictedVertexB2<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
// fallback kernel (slow)
computeRestrictedVertexB2<<<512, 32, 0, stream>>>(vertex, varying,
computeRestrictedVertexB2 <<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeLoopVertexB(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeLoopVertexB(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end)
{
// computeLoopVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
// computeLoopVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
// fallback kernel (slow)
computeLoopVertexB<<<512, 32, 0, stream>>>(vertex, varying,
computeLoopVertexB<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, V_IT, V_W, offset, tableOffset, start, end);
}
void OsdCudaComputeBilinearEdge(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeBilinearEdge(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *E_IT, int offset, int tableOffset, int start, int end)
{
//computeBilinearEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
//computeBilinearEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
// fallback kernel (slow)
computeBilinearEdge<<<512, 32, 0, stream>>>(vertex, varying,
computeBilinearEdge<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
E_IT, offset, tableOffset, start, end);
}
void OsdCudaComputeBilinearVertex(cudaStream_t stream,
float *vertex, float *varying,
void OsdCudaComputeBilinearVertex(float *vertex, float *varying,
int vertexLength, int vertexStride,
int varyingLength, int varyingStride,
int *V_ITa, int offset, int tableOffset, int start, int end)
{
// computeBilinearVertex<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
// computeBilinearVertex<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
// fallback kernel (slow)
computeBilinearVertex<<<512, 32, 0, stream>>>(vertex, varying,
computeBilinearVertex<<<512, 32>>>(vertex, varying,
vertexLength, vertexStride, varyingLength, varyingStride,
V_ITa, offset, tableOffset, start, end);
}
void OsdCudaEditVertexAdd(cudaStream_t stream,
float *vertex, int vertexLength, int vertexStride,
void OsdCudaEditVertexAdd(float *vertex, int vertexLength, int vertexStride,
int primVarOffset, int primVarWidth,
int vertexOffset, int tableOffset,
int start, int end, int *editIndices, float *editValues)
{
editVertexAdd<<<512, 32, 0, stream>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth,
editVertexAdd<<<512, 32>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth,
vertexOffset, tableOffset, start, end,
editIndices, editValues);
}