mirror of
https://github.com/PixarAnimationStudios/OpenSubdiv
synced 2025-01-05 06:21:07 +00:00
Merge pull request #340 from adityaatluri/dev
added memcpyasync and streams to cuda backend
This commit is contained in:
commit
ae4ecf5bf9
@ -30,14 +30,17 @@ namespace OpenSubdiv {
|
||||
namespace OPENSUBDIV_VERSION {
|
||||
|
||||
bool
|
||||
OsdCudaTable::createCudaBuffer(size_t size, const void *ptr) {
|
||||
OsdCudaTable::createCudaBuffer(cudaStream_t stream, size_t size, const void *ptr) {
|
||||
|
||||
cudaHostRegister((void**)&ptr, size);
|
||||
/* The above command is slow. Try to use cudaMallocHost during the allocation of ptr to speedup */
|
||||
|
||||
cudaError_t err = cudaMalloc(&_devicePtr, size);
|
||||
if (err != cudaSuccess) {
|
||||
return false;
|
||||
}
|
||||
|
||||
err = cudaMemcpy(_devicePtr, ptr, size, cudaMemcpyHostToDevice);
|
||||
err = cudaMemcpyAsync(_devicePtr, ptr, size, cudaMemcpyHostToDevice, stream);
|
||||
if (err != cudaSuccess) {
|
||||
cudaFree(_devicePtr);
|
||||
_devicePtr = NULL;
|
||||
@ -140,15 +143,15 @@ OsdCudaComputeContext::initialize(FarSubdivisionTables const *subdivisionTables,
|
||||
// allocate 5 or 7 tables
|
||||
_tables.resize(subdivisionTables->GetNumTables(), 0);
|
||||
|
||||
_tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(subdivisionTables->Get_E_IT());
|
||||
_tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(subdivisionTables->Get_V_IT());
|
||||
_tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(subdivisionTables->Get_V_ITa());
|
||||
_tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(subdivisionTables->Get_E_W());
|
||||
_tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(subdivisionTables->Get_V_W());
|
||||
_tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_IT());
|
||||
_tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_IT());
|
||||
_tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_ITa());
|
||||
_tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_W());
|
||||
_tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_W());
|
||||
|
||||
if (subdivisionTables->GetNumTables() > 5) {
|
||||
_tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(subdivisionTables->Get_F_IT());
|
||||
_tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(subdivisionTables->Get_F_ITa());
|
||||
_tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_IT());
|
||||
_tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_ITa());
|
||||
}
|
||||
|
||||
// error check
|
||||
@ -202,6 +205,10 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables,
|
||||
|
||||
OsdCudaComputeContext *result = new OsdCudaComputeContext();
|
||||
|
||||
cudaStream_t stream;
|
||||
cudaStreamCreate(&stream);
|
||||
_stream = &stream;
|
||||
|
||||
if (result->initialize(subdivisionTables, vertexEditTables) == false) {
|
||||
delete result;
|
||||
return NULL;
|
||||
@ -209,5 +216,10 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables,
|
||||
return result;
|
||||
}
|
||||
|
||||
cudaStream_t
|
||||
OsdComputeContext::GetStream(){
|
||||
return *_stream
|
||||
}
|
||||
|
||||
} // end namespace OPENSUBDIV_VERSION
|
||||
} // end namespace OpenSubdiv
|
||||
|
@ -42,9 +42,9 @@ namespace OPENSUBDIV_VERSION {
|
||||
class OsdCudaTable : OsdNonCopyable<OsdCudaTable> {
|
||||
public:
|
||||
template<typename T>
|
||||
static OsdCudaTable * Create(const std::vector<T> &table) {
|
||||
static OsdCudaTable * Create(cudaStream_t stream, const std::vector<T> &table) {
|
||||
OsdCudaTable *result = new OsdCudaTable();
|
||||
if (not result->createCudaBuffer(table.size() * sizeof(T), table.empty() ? NULL : &table[0])) {
|
||||
if (not result->createCudaBuffer(stream, table.size() * sizeof(T), table.empty() ? NULL : &table[0])) {
|
||||
delete result;
|
||||
return NULL;
|
||||
}
|
||||
@ -55,12 +55,16 @@ public:
|
||||
|
||||
void * GetCudaMemory() const;
|
||||
|
||||
cudaStream_t GetStream();
|
||||
|
||||
private:
|
||||
OsdCudaTable() : _devicePtr(NULL) {}
|
||||
|
||||
bool createCudaBuffer(size_t size, const void *ptr);
|
||||
|
||||
void *_devicePtr;
|
||||
|
||||
cudaStream_t *_stream;
|
||||
};
|
||||
|
||||
class OsdCudaHEditTable : OsdNonCopyable<OsdCudaHEditTable> {
|
||||
|
@ -30,78 +30,92 @@
|
||||
|
||||
extern "C" {
|
||||
|
||||
void OsdCudaComputeFace(float *vertex, float *varying,
|
||||
void OsdCudaComputeFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeQuadFace(float *vertex, float *varying,
|
||||
void OsdCudaComputeQuadFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
|
||||
void OsdCudaComputeTriQuadFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeEdge(float *vertex, float *varying,
|
||||
void OsdCudaComputeEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, float *E_W, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeRestrictedEdge(float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeVertexA(float *vertex, float *varying,
|
||||
void OsdCudaComputeVertexA(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, float *V_W, int offset, int tableOffset,
|
||||
int start, int end, int pass);
|
||||
|
||||
void OsdCudaComputeVertexB(float *vertex, float *varying,
|
||||
void OsdCudaComputeVertexB(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexA(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeLoopVertexB(float *vertex, float *varying,
|
||||
void OsdCudaComputeLoopVertexB(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeBilinearEdge(float *vertex, float *varying,
|
||||
void OsdCudaComputeBilinearEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeBilinearVertex(float *vertex, float *varying,
|
||||
void OsdCudaComputeBilinearVertex(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaEditVertexAdd(float *vertex,
|
||||
void OsdCudaEditVertexAdd(cudaStream_t stream,
|
||||
float *vertex,
|
||||
int vertexLength, int vertexStride,
|
||||
int primVarOffset, int primVarWidth,
|
||||
int offset, int tableOffset,
|
||||
@ -133,6 +147,7 @@ OsdCudaComputeController::ApplyBilinearFaceVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeFace(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -154,6 +169,7 @@ OsdCudaComputeController::ApplyBilinearEdgeVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeBilinearEdge(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -174,6 +190,7 @@ OsdCudaComputeController::ApplyBilinearVertexVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeBilinearVertex(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -196,6 +213,7 @@ OsdCudaComputeController::ApplyCatmarkFaceVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeFace(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -217,6 +235,7 @@ OsdCudaComputeController::ApplyCatmarkQuadFaceVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeQuadFace(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -237,6 +256,7 @@ OsdCudaComputeController::ApplyCatmarkTriQuadFaceVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeTriQuadFace(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -259,6 +279,7 @@ OsdCudaComputeController::ApplyCatmarkEdgeVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeEdge(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -280,6 +301,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedEdgeVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeRestrictedEdge(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -304,6 +326,7 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelB(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexB(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -328,6 +351,7 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA1(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -351,6 +375,7 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA2(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -374,6 +399,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeRestrictedVertexB1(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -397,6 +423,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeRestrictedVertexB2(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -418,6 +445,7 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeRestrictedVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -440,6 +468,7 @@ OsdCudaComputeController::ApplyLoopEdgeVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeEdge(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -465,6 +494,7 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelB(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeLoopVertexB(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -489,6 +519,7 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA1(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -512,6 +543,7 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA2(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -536,6 +568,7 @@ OsdCudaComputeController::ApplyVertexEdits(
|
||||
|
||||
if (edit->GetOperation() == FarVertexEdit::Add) {
|
||||
OsdCudaEditVertexAdd(
|
||||
context->GetStream(),
|
||||
vertex,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
edit->GetPrimvarOffset(),
|
||||
|
@ -1020,234 +1020,258 @@ editVertexAdd(float *fVertex, int vertexLength, int vertexStride,
|
||||
// XXX: this macro usage is tentative. Since cuda kernel can't be dynamically configured,
|
||||
// still trying to find better way to have optimized kernel..
|
||||
|
||||
#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, ARG) \
|
||||
#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, STREAM, ARG) \
|
||||
if(vertexLength == NUM_VERTEX_ELEMENTS && \
|
||||
varyingLength == NUM_VARYING_ELEMENTS && \
|
||||
vertexStride == vertexLength && \
|
||||
varyingStride == varyingLength) \
|
||||
{ KERNEL<NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS><<<X,Y>>>ARG; \
|
||||
{ KERNEL<NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS><<<X,Y,0,STREAM>>>ARG; \
|
||||
return; }
|
||||
|
||||
extern "C" {
|
||||
|
||||
void OsdCudaComputeFace(float *vertex, float *varying,
|
||||
void OsdCudaComputeFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeFace<3, 0><<<512,32>>>(vertex, varying, F_IT, F_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
//computeFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, F_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeFace<<<512, 32>>>(vertex, varying,
|
||||
computeFace<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
F_IT, F_ITa, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeQuadFace(float *vertex, float *varying,
|
||||
void OsdCudaComputeQuadFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
//computeQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeQuadFace<<<512, 32>>>(vertex, varying,
|
||||
computeQuadFace<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
F_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
|
||||
void OsdCudaComputeTriQuadFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeTriQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
//computeTriQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeTriQuadFace<<<512, 32>>>(vertex, varying,
|
||||
computeTriQuadFace<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
F_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
|
||||
void OsdCudaComputeEdge(float *vertex, float *varying,
|
||||
void OsdCudaComputeEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, float *E_W, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, E_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
//computeEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, E_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
|
||||
computeEdge<<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeEdge<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
E_IT, E_W, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeRestrictedEdge(float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeRestrictedEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
//computeRestrictedEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
|
||||
computeRestrictedEdge<<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeRestrictedEdge<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
E_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeVertexA(float *vertex, float *varying,
|
||||
void OsdCudaComputeVertexA(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, float *V_W, int offset, int tableOffset, int start, int end, int pass)
|
||||
{
|
||||
// computeVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass);
|
||||
OPT_KERNEL(0, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(0, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(3, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(3, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
// computeVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass);
|
||||
OPT_KERNEL(0, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(0, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(3, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(3, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
|
||||
computeVertexA<<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeVertexA<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_W, offset, tableOffset, start, end, pass);
|
||||
}
|
||||
|
||||
void OsdCudaComputeVertexB(float *vertex, float *varying,
|
||||
void OsdCudaComputeVertexB(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
// computeVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
|
||||
computeVertexB<<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeVertexB<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_IT, V_W, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexA(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeRestrictedVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
// computeRestrictedVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
|
||||
computeRestrictedVertexA<<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeRestrictedVertexA<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeRestrictedVertexB1<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
// computeRestrictedVertexB1<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
|
||||
computeRestrictedVertexB1 <<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeRestrictedVertexB1<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeRestrictedVertexB2<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
// computeRestrictedVertexB2<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
|
||||
computeRestrictedVertexB2 <<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeRestrictedVertexB2<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeLoopVertexB(float *vertex, float *varying,
|
||||
void OsdCudaComputeLoopVertexB(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeLoopVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
// computeLoopVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
|
||||
computeLoopVertexB<<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeLoopVertexB<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_IT, V_W, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeBilinearEdge(float *vertex, float *varying,
|
||||
void OsdCudaComputeBilinearEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeBilinearEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
//computeBilinearEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
|
||||
computeBilinearEdge<<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeBilinearEdge<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
E_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeBilinearVertex(float *vertex, float *varying,
|
||||
void OsdCudaComputeBilinearVertex(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeBilinearVertex<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
// computeBilinearVertex<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
|
||||
computeBilinearVertex<<<512, 32>>>(vertex, varying,
|
||||
// fallback kernel (slow)
|
||||
computeBilinearVertex<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaEditVertexAdd(float *vertex, int vertexLength, int vertexStride,
|
||||
void OsdCudaEditVertexAdd(cudaStream_t stream,
|
||||
float *vertex, int vertexLength, int vertexStride,
|
||||
int primVarOffset, int primVarWidth,
|
||||
int vertexOffset, int tableOffset,
|
||||
int start, int end, int *editIndices, float *editValues)
|
||||
{
|
||||
editVertexAdd<<<512, 32>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth,
|
||||
editVertexAdd<<<512, 32, 0, stream>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth,
|
||||
vertexOffset, tableOffset, start, end,
|
||||
editIndices, editValues);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user