mirror of
https://github.com/PixarAnimationStudios/OpenSubdiv
synced 2024-11-23 12:10:08 +00:00
Revert "added memcpyasync and streams to cuda backend"
This reverts commit 84212cd725
.
This commit is contained in:
parent
8c9096cdc8
commit
df719eb9ed
@ -30,17 +30,14 @@ namespace OpenSubdiv {
|
||||
namespace OPENSUBDIV_VERSION {
|
||||
|
||||
bool
|
||||
OsdCudaTable::createCudaBuffer(cudaStream_t stream, size_t size, const void *ptr) {
|
||||
|
||||
cudaHostRegister((void**)&ptr, size);
|
||||
/* The above command is slow. Try to use cudaMallocHost during the allocation of ptr to speedup */
|
||||
OsdCudaTable::createCudaBuffer(size_t size, const void *ptr) {
|
||||
|
||||
cudaError_t err = cudaMalloc(&_devicePtr, size);
|
||||
if (err != cudaSuccess) {
|
||||
return false;
|
||||
}
|
||||
|
||||
err = cudaMemcpyAsync(_devicePtr, ptr, size, cudaMemcpyHostToDevice, stream);
|
||||
err = cudaMemcpy(_devicePtr, ptr, size, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) {
|
||||
cudaFree(_devicePtr);
|
||||
_devicePtr = NULL;
|
||||
@ -143,15 +140,15 @@ OsdCudaComputeContext::initialize(FarSubdivisionTables const *subdivisionTables,
|
||||
// allocate 5 or 7 tables
|
||||
_tables.resize(subdivisionTables->GetNumTables(), 0);
|
||||
|
||||
_tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_IT());
|
||||
_tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_IT());
|
||||
_tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_ITa());
|
||||
_tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_E_W());
|
||||
_tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_V_W());
|
||||
_tables[FarSubdivisionTables::E_IT] = OsdCudaTable::Create(subdivisionTables->Get_E_IT());
|
||||
_tables[FarSubdivisionTables::V_IT] = OsdCudaTable::Create(subdivisionTables->Get_V_IT());
|
||||
_tables[FarSubdivisionTables::V_ITa] = OsdCudaTable::Create(subdivisionTables->Get_V_ITa());
|
||||
_tables[FarSubdivisionTables::E_W] = OsdCudaTable::Create(subdivisionTables->Get_E_W());
|
||||
_tables[FarSubdivisionTables::V_W] = OsdCudaTable::Create(subdivisionTables->Get_V_W());
|
||||
|
||||
if (subdivisionTables->GetNumTables() > 5) {
|
||||
_tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_IT());
|
||||
_tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(GetStream(), subdivisionTables->Get_F_ITa());
|
||||
_tables[FarSubdivisionTables::F_IT] = OsdCudaTable::Create(subdivisionTables->Get_F_IT());
|
||||
_tables[FarSubdivisionTables::F_ITa] = OsdCudaTable::Create(subdivisionTables->Get_F_ITa());
|
||||
}
|
||||
|
||||
// error check
|
||||
@ -205,10 +202,6 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables,
|
||||
|
||||
OsdCudaComputeContext *result = new OsdCudaComputeContext();
|
||||
|
||||
cudaStream_t stream;
|
||||
cudaStreamCreate(&stream);
|
||||
_stream = &stream;
|
||||
|
||||
if (result->initialize(subdivisionTables, vertexEditTables) == false) {
|
||||
delete result;
|
||||
return NULL;
|
||||
@ -216,10 +209,5 @@ OsdCudaComputeContext::Create(FarSubdivisionTables const *subdivisionTables,
|
||||
return result;
|
||||
}
|
||||
|
||||
cudaStream_t
|
||||
OsdComputeContext::GetStream(){
|
||||
return *_stream
|
||||
}
|
||||
|
||||
} // end namespace OPENSUBDIV_VERSION
|
||||
} // end namespace OpenSubdiv
|
||||
|
@ -42,9 +42,9 @@ namespace OPENSUBDIV_VERSION {
|
||||
class OsdCudaTable : OsdNonCopyable<OsdCudaTable> {
|
||||
public:
|
||||
template<typename T>
|
||||
static OsdCudaTable * Create(cudaStream_t stream, const std::vector<T> &table) {
|
||||
static OsdCudaTable * Create(const std::vector<T> &table) {
|
||||
OsdCudaTable *result = new OsdCudaTable();
|
||||
if (not result->createCudaBuffer(stream, table.size() * sizeof(T), table.empty() ? NULL : &table[0])) {
|
||||
if (not result->createCudaBuffer(table.size() * sizeof(T), table.empty() ? NULL : &table[0])) {
|
||||
delete result;
|
||||
return NULL;
|
||||
}
|
||||
@ -55,16 +55,12 @@ public:
|
||||
|
||||
void * GetCudaMemory() const;
|
||||
|
||||
cudaStream_t GetStream();
|
||||
|
||||
private:
|
||||
OsdCudaTable() : _devicePtr(NULL) {}
|
||||
|
||||
bool createCudaBuffer(size_t size, const void *ptr);
|
||||
|
||||
void *_devicePtr;
|
||||
|
||||
cudaStream_t *_stream;
|
||||
};
|
||||
|
||||
class OsdCudaHEditTable : OsdNonCopyable<OsdCudaHEditTable> {
|
||||
|
@ -30,92 +30,78 @@
|
||||
|
||||
extern "C" {
|
||||
|
||||
void OsdCudaComputeFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeFace(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeQuadFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeQuadFace(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeTriQuadFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeEdge(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, float *E_W, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeRestrictedEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedEdge(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeVertexA(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeVertexA(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, float *V_W, int offset, int tableOffset,
|
||||
int start, int end, int pass);
|
||||
|
||||
void OsdCudaComputeVertexB(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeVertexB(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeRestrictedVertexA(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeLoopVertexB(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeLoopVertexB(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset,
|
||||
int start, int end);
|
||||
|
||||
void OsdCudaComputeBilinearEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeBilinearEdge(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaComputeBilinearVertex(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeBilinearVertex(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int offset, int tableOffset, int start, int end);
|
||||
|
||||
void OsdCudaEditVertexAdd(cudaStream_t stream,
|
||||
float *vertex,
|
||||
void OsdCudaEditVertexAdd(float *vertex,
|
||||
int vertexLength, int vertexStride,
|
||||
int primVarOffset, int primVarWidth,
|
||||
int offset, int tableOffset,
|
||||
@ -147,7 +133,6 @@ OsdCudaComputeController::ApplyBilinearFaceVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeFace(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -169,7 +154,6 @@ OsdCudaComputeController::ApplyBilinearEdgeVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeBilinearEdge(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -190,7 +174,6 @@ OsdCudaComputeController::ApplyBilinearVertexVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeBilinearVertex(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -213,7 +196,6 @@ OsdCudaComputeController::ApplyCatmarkFaceVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeFace(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -235,7 +217,6 @@ OsdCudaComputeController::ApplyCatmarkQuadFaceVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeQuadFace(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -256,7 +237,6 @@ OsdCudaComputeController::ApplyCatmarkTriQuadFaceVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeTriQuadFace(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -279,7 +259,6 @@ OsdCudaComputeController::ApplyCatmarkEdgeVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeEdge(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -301,7 +280,6 @@ OsdCudaComputeController::ApplyCatmarkRestrictedEdgeVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeRestrictedEdge(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -326,7 +304,6 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelB(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexB(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -351,7 +328,6 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA1(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -375,7 +351,6 @@ OsdCudaComputeController::ApplyCatmarkVertexVerticesKernelA2(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -399,7 +374,6 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB1(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeRestrictedVertexB1(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -423,7 +397,6 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelB2(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeRestrictedVertexB2(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -445,7 +418,6 @@ OsdCudaComputeController::ApplyCatmarkRestrictedVertexVerticesKernelA(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeRestrictedVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -468,7 +440,6 @@ OsdCudaComputeController::ApplyLoopEdgeVerticesKernel(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeEdge(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -494,7 +465,6 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelB(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeLoopVertexB(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -519,7 +489,6 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA1(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -543,7 +512,6 @@ OsdCudaComputeController::ApplyLoopVertexVerticesKernelA2(
|
||||
float *varying = _currentBindState.GetOffsettedVaryingBuffer();
|
||||
|
||||
OsdCudaComputeVertexA(
|
||||
context->GetStream(),
|
||||
vertex, varying,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
_currentBindState.varyingDesc.length, _currentBindState.varyingDesc.stride,
|
||||
@ -568,7 +536,6 @@ OsdCudaComputeController::ApplyVertexEdits(
|
||||
|
||||
if (edit->GetOperation() == FarVertexEdit::Add) {
|
||||
OsdCudaEditVertexAdd(
|
||||
context->GetStream(),
|
||||
vertex,
|
||||
_currentBindState.vertexDesc.length, _currentBindState.vertexDesc.stride,
|
||||
edit->GetPrimvarOffset(),
|
||||
|
@ -1020,258 +1020,234 @@ editVertexAdd(float *fVertex, int vertexLength, int vertexStride,
|
||||
// XXX: this macro usage is tentative. Since cuda kernel can't be dynamically configured,
|
||||
// still trying to find better way to have optimized kernel..
|
||||
|
||||
#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, STREAM, ARG) \
|
||||
#define OPT_KERNEL(NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, ARG) \
|
||||
if(vertexLength == NUM_VERTEX_ELEMENTS && \
|
||||
varyingLength == NUM_VARYING_ELEMENTS && \
|
||||
vertexStride == vertexLength && \
|
||||
varyingStride == varyingLength) \
|
||||
{ KERNEL<NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS><<<X,Y,0,STREAM>>>ARG; \
|
||||
{ KERNEL<NUM_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS><<<X,Y>>>ARG; \
|
||||
return; }
|
||||
|
||||
extern "C" {
|
||||
|
||||
void OsdCudaComputeFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeFace(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int *F_ITa, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, F_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeFace, 512, 32, stream, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
//computeFace<3, 0><<<512,32>>>(vertex, varying, F_IT, F_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeFace, 512, 32, (vertex, varying, F_IT, F_ITa, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeFace<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeFace<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
F_IT, F_ITa, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeQuadFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeQuadFace(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
//computeQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeQuadFace<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeQuadFace<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
F_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeTriQuadFace(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeTriQuadFace(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *F_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeTriQuadFace<3, 0><<<512,32,0,stream>>>(vertex, varying, F_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, stream, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
//computeTriQuadFace<3, 0><<<512,32>>>(vertex, varying, F_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeTriQuadFace, 512, 32, (vertex, varying, F_IT, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeTriQuadFace<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeTriQuadFace<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
F_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
|
||||
void OsdCudaComputeEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeEdge(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, float *E_W, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, E_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeEdge, 512, 32, stream, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
//computeEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, E_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeEdge, 512, 32, (vertex, varying, E_IT, E_W, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeEdge<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeEdge<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
E_IT, E_W, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeRestrictedEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedEdge(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeRestrictedEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
//computeRestrictedEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeRestrictedEdge<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeRestrictedEdge<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
E_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeVertexA(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeVertexA(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, float *V_W, int offset, int tableOffset, int start, int end, int pass)
|
||||
{
|
||||
// computeVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass);
|
||||
OPT_KERNEL(0, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(0, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(3, 0, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(3, 3, computeVertexA, 512, 32, stream, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
// computeVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_W, offset, start, end, pass);
|
||||
OPT_KERNEL(0, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(0, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(3, 0, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
OPT_KERNEL(3, 3, computeVertexA, 512, 32, (vertex, varying, V_ITa, V_W, offset, tableOffset, start, end, pass));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeVertexA<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeVertexA<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_W, offset, tableOffset, start, end, pass);
|
||||
}
|
||||
|
||||
void OsdCudaComputeVertexB(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeVertexB(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
// computeVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeVertexB<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeVertexB<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_IT, V_W, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeRestrictedVertexA(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexA(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeRestrictedVertexA<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
// computeRestrictedVertexA<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexA, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeRestrictedVertexA<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeRestrictedVertexA<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeRestrictedVertexB1(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexB1(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeRestrictedVertexB1<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
// computeRestrictedVertexB1<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexB1, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeRestrictedVertexB1<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeRestrictedVertexB1 <<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeRestrictedVertexB2(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeRestrictedVertexB2(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeRestrictedVertexB2<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, stream, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
// computeRestrictedVertexB2<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeRestrictedVertexB2, 512, 32, (vertex, varying, V_ITa, V_IT, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeRestrictedVertexB2<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeRestrictedVertexB2 <<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeLoopVertexB(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeLoopVertexB(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int *V_IT, float *V_W, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeLoopVertexB<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, stream, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
// computeLoopVertexB<0, 3><<<512,32>>>(vertex, varying, V_ITa, V_IT, V_W, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeLoopVertexB, 512, 32, (vertex, varying, V_ITa, V_IT, V_W, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeLoopVertexB<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeLoopVertexB<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, V_IT, V_W, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeBilinearEdge(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeBilinearEdge(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *E_IT, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
//computeBilinearEdge<0, 3><<<512,32,0,stream>>>(vertex, varying, E_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, stream, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
//computeBilinearEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeBilinearEdge<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeBilinearEdge<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
E_IT, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaComputeBilinearVertex(cudaStream_t stream,
|
||||
float *vertex, float *varying,
|
||||
void OsdCudaComputeBilinearVertex(float *vertex, float *varying,
|
||||
int vertexLength, int vertexStride,
|
||||
int varyingLength, int varyingStride,
|
||||
int *V_ITa, int offset, int tableOffset, int start, int end)
|
||||
{
|
||||
// computeBilinearVertex<0, 3><<<512,32,0,stream>>>(vertex, varying, V_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, stream, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
// computeBilinearVertex<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end);
|
||||
OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, tableOffset, start, end));
|
||||
|
||||
// fallback kernel (slow)
|
||||
computeBilinearVertex<<<512, 32, 0, stream>>>(vertex, varying,
|
||||
computeBilinearVertex<<<512, 32>>>(vertex, varying,
|
||||
vertexLength, vertexStride, varyingLength, varyingStride,
|
||||
V_ITa, offset, tableOffset, start, end);
|
||||
}
|
||||
|
||||
void OsdCudaEditVertexAdd(cudaStream_t stream,
|
||||
float *vertex, int vertexLength, int vertexStride,
|
||||
void OsdCudaEditVertexAdd(float *vertex, int vertexLength, int vertexStride,
|
||||
int primVarOffset, int primVarWidth,
|
||||
int vertexOffset, int tableOffset,
|
||||
int start, int end, int *editIndices, float *editValues)
|
||||
{
|
||||
editVertexAdd<<<512, 32, 0, stream>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth,
|
||||
editVertexAdd<<<512, 32>>>(vertex, vertexLength, vertexStride, primVarOffset, primVarWidth,
|
||||
vertexOffset, tableOffset, start, end,
|
||||
editIndices, editValues);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user