add bilinear computation for cuda kernel (not yet for CL, GLSL kernels)

This commit is contained in:
Takahito Tejima 2012-06-11 18:09:23 -07:00
parent ee40f8d259
commit 6cae48665d
3 changed files with 165 additions and 1 deletions

View File

@ -72,6 +72,10 @@ void OsdCudaComputeVertexB(float *vertex, float *varying, int numUserVertexEleme
// Host-side launchers implemented in the CUDA source file. Each one takes the
// interleaved vertex buffer, the varying buffer, the number of user vertex
// elements (the vertex count excluding the 3 built-in elements), the number of
// varying elements, one or more index/weight tables, the destination offset and
// the [start, end) range of items to process.
void OsdCudaComputeLoopVertexB(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *V_ITa, int *V_IT, float *V_W, int offset, int start, int end);
// Bilinear edge vertices: E_IT is read as two source vertex indices per item.
void OsdCudaComputeBilinearEdge(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *E_IT, int offset, int start, int end);
// Bilinear vertex vertices: V_ITa is read as one source vertex index per item.
void OsdCudaComputeBilinearVertex(float *vertex, float *varying, int numUserVertexElements, int numVaryingElements, int *V_ITa, int offset, int start, int end);
}
namespace OpenSubdiv {
@ -178,9 +182,35 @@ OsdCudaKernelDispatcher::Synchronize() {
cudaThreadSynchronize();
}
void
OsdCudaKernelDispatcher::ApplyBilinearFaceVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const {

    // Bilinear face vertices are dispatched to the same OsdCudaComputeFace
    // launcher that the Catmark face kernel uses. 'mesh' and 'data' are not
    // used by this implementation.

    // Tables are addressed by (level - 1).
    const int tableLevel = level - 1;

    int * const faceIT  = (int*)_tables[F_IT].devicePtr  + _tableOffsets[F_IT][tableLevel];
    int * const faceITa = (int*)_tables[F_ITa].devicePtr + _tableOffsets[F_ITa][tableLevel];

    // The launcher is given the vertex element count minus 3.
    const int numUserVertexElements = _numVertexElements - 3;

    OsdCudaComputeFace(_deviceVertices, _deviceVaryings,
                       numUserVertexElements, _numVaryingElements,
                       faceIT, faceITa,
                       offset, start, end);
}
void
OsdCudaKernelDispatcher::ApplyBilinearEdgeVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const {

    // Forwards to the CUDA bilinear edge launcher. 'mesh' and 'data' are not
    // used by this implementation.

    // Tables are addressed by (level - 1).
    const int tableLevel = level - 1;

    int * const edgeIT = (int*)_tables[E_IT].devicePtr + _tableOffsets[E_IT][tableLevel];

    // The launcher takes the *user* vertex element count, i.e. the total
    // minus 3; it adds the 3 back itself for its generic kernel.
    const int numUserVertexElements = _numVertexElements - 3;

    OsdCudaComputeBilinearEdge(_deviceVertices, _deviceVaryings,
                               numUserVertexElements, _numVaryingElements,
                               edgeIT,
                               offset, start, end);
}
void
OsdCudaKernelDispatcher::ApplyBilinearVertexVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const {

    // Forwards to the CUDA bilinear vertex launcher. 'mesh' and 'data' are not
    // used by this implementation.

    // Tables are addressed by (level - 1).
    const int tableLevel = level - 1;

    int * const vertexITa = (int*)_tables[V_ITa].devicePtr + _tableOffsets[V_ITa][tableLevel];

    // The launcher takes the *user* vertex element count, i.e. the total
    // minus 3; it adds the 3 back itself for its generic kernel.
    const int numUserVertexElements = _numVertexElements - 3;

    OsdCudaComputeBilinearVertex(_deviceVertices, _deviceVaryings,
                                 numUserVertexElements, _numVaryingElements,
                                 vertexITa,
                                 offset, start, end);
}
void
OsdCudaKernelDispatcher::ApplyCatmarkFaceVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const {
// XXX: use static bridge function to avoid nvcc includes many amber headers...
OsdCudaComputeFace(_deviceVertices, _deviceVaryings,
_numVertexElements-3, _numVaryingElements,
(int*)_tables[F_IT].devicePtr + _tableOffsets[F_IT][level-1],

View File

@ -79,6 +79,14 @@ public:
virtual ~OsdCudaKernelDispatcher();
// Bilinear kernel entry points. The CUDA implementations forward
// offset/start/end to the corresponding OsdCuda* launcher and select the
// index tables for (level - 1); 'mesh' and 'data' are not used by them.
virtual void ApplyBilinearFaceVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const;
virtual void ApplyBilinearEdgeVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const;
virtual void ApplyBilinearVertexVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const;
virtual void ApplyCatmarkFaceVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const;
virtual void ApplyCatmarkEdgeVerticesKernel(FarMesh<OsdVertex> * mesh, int offset, int level, int start, int end, void * data) const;

View File

@ -491,12 +491,110 @@ computeLoopVertexB(float *fVertex, int numVertexElements, float *fVaryings, int
}
}
// --------------------------------------------------------------------------------------------
// Bilinear edge-vertex kernel, specialized at compile time on the number of
// user vertex elements and varying elements.
//
// For every item i in [start, end) the two source vertex indices are read
// from E0_IT[2*i] and E0_IT[2*i+1]; the destination vertex (and varying, when
// NUM_VARYING_ELEMENTS > 0) at index offset+i receives both sources combined
// with weight 0.5 each.
//
// Each thread starts at start + its global thread index and advances by the
// total number of threads in the grid, so any 1D launch configuration covers
// the whole range.
template <int NUM_USER_VERTEX_ELEMENTS, int NUM_VARYING_ELEMENTS> __global__ void
computeBilinearEdge(float *fVertex, float *fVaryings, int *E0_IT, int offset, int start, int end)
{
    typedef DeviceVertex<NUM_USER_VERTEX_ELEMENTS> VertexT;
    typedef DeviceVarying<NUM_VARYING_ELEMENTS>    VaryingT;

    // Reinterpret the flat float buffers as arrays of fixed-size records.
    VertexT  *vertexBuffer  = (VertexT*)fVertex;
    VaryingT *varyingBuffer = (VaryingT*)fVaryings;

    const unsigned int stride = blockDim.x * gridDim.x;

    int i = start + threadIdx.x + blockIdx.x*blockDim.x;
    while (i < end) {
        const int v0 = E0_IT[2*i];
        const int v1 = E0_IT[2*i + 1];

        // Accumulate into a local record, then store it once.
        VertexT midpoint;
        midpoint.clear();
        midpoint.addWithWeight(&vertexBuffer[v0], 0.5f);
        midpoint.addWithWeight(&vertexBuffer[v1], 0.5f);
        vertexBuffer[offset + i] = midpoint;

        if (NUM_VARYING_ELEMENTS > 0) {
            VaryingT midVarying;
            midVarying.clear();
            midVarying.addVaryingWithWeight(&varyingBuffer[v0], 0.5f);
            midVarying.addVaryingWithWeight(&varyingBuffer[v1], 0.5f);
            varyingBuffer[offset + i] = midVarying;
        }

        i += stride;
    }
}
// Bilinear edge-vertex kernel for element counts only known at run time
// (fallback for the compile-time specialized template of the same name).
//
//   fVertex             interleaved vertex buffer, numVertexElements floats per vertex
//   numVertexElements   floats per vertex record
//   fVarying            interleaved varying buffer, numVaryingElements floats per vertex
//   numVaryingElements  floats per varying record (0 disables varying processing)
//   E0_IT               two source vertex indices per item, at [2*i] and [2*i+1]
//   offset              added to i to obtain the destination vertex index
//   start, end          half-open range of items to process
//
// Each thread starts at start + its global thread index and advances by the
// total number of threads in the grid, so any 1D launch configuration covers
// the whole range.
__global__ void
computeBilinearEdge(float *fVertex, int numVertexElements, float *fVarying, int numVaryingElements,
                    int *E0_IT, int offset, int start, int end)
{
    for (int i = start + threadIdx.x + blockIdx.x*blockDim.x; i < end; i += blockDim.x * gridDim.x) {
        int eidx0 = E0_IT[2*i+0];
        int eidx1 = E0_IT[2*i+1];

        // Destination record of the new edge vertex.
        float *dstVertex = fVertex + (i+offset)*numVertexElements;
        clear(dstVertex, numVertexElements);

        addWithWeight(dstVertex, fVertex + eidx0*numVertexElements, 0.5f, numVertexElements);
        addWithWeight(dstVertex, fVertex + eidx1*numVertexElements, 0.5f, numVertexElements);

        if (numVaryingElements > 0) {
            // The varying record lives at the same destination index as the
            // vertex record (i+offset), exactly as in the templated kernel.
            // Indexing by i alone would overwrite the varying data of vertex i.
            float *dstVarying = fVarying + (i+offset)*numVaryingElements;
            clear(dstVarying, numVaryingElements);

            addVaryingWithWeight(dstVarying, fVarying + eidx0*numVaryingElements, 0.5f, numVaryingElements);
            addVaryingWithWeight(dstVarying, fVarying + eidx1*numVaryingElements, 0.5f, numVaryingElements);
        }
    }
}
// Bilinear vertex-vertex kernel, specialized at compile time on the number of
// user vertex elements and varying elements.
//
// For every item i in [start, end) the source vertex index is read from
// V0_ITa[i]; the destination vertex (and varying, when
// NUM_VARYING_ELEMENTS > 0) at index i+offset is cleared and then receives
// the source with weight 1.0.
//
// Each thread starts at start + its global thread index and advances by the
// total number of threads in the grid, so any 1D launch configuration covers
// the whole range.
template <int NUM_USER_VERTEX_ELEMENTS, int NUM_VARYING_ELEMENTS> __global__ void
computeBilinearVertex(float *fVertex, float *fVaryings, int *V0_ITa, int offset, int start, int end)
{
    typedef DeviceVertex<NUM_USER_VERTEX_ELEMENTS> VertexT;
    typedef DeviceVarying<NUM_VARYING_ELEMENTS>    VaryingT;

    // Reinterpret the flat float buffers as arrays of fixed-size records.
    VertexT  *vertexBuffer  = (VertexT*)fVertex;
    VaryingT *varyingBuffer = (VaryingT*)fVaryings;

    const unsigned int stride = blockDim.x * gridDim.x;

    int i = start + threadIdx.x + blockIdx.x*blockDim.x;
    while (i < end) {
        const int src = V0_ITa[i];

        // Accumulate into a local record, then store it once.
        VertexT result;
        result.clear();
        result.addWithWeight(&vertexBuffer[src], 1.0f);
        vertexBuffer[i + offset] = result;

        if (NUM_VARYING_ELEMENTS > 0) {
            VaryingT resultVarying;
            resultVarying.clear();
            resultVarying.addVaryingWithWeight(&varyingBuffer[src], 1.0f);
            varyingBuffer[i + offset] = resultVarying;
        }

        i += stride;
    }
}
// Bilinear vertex-vertex kernel for element counts only known at run time
// (fallback for the compile-time specialized template of the same name).
//
//   fVertex             interleaved vertex buffer, numVertexElements floats per vertex
//   numVertexElements   floats per vertex record
//   fVaryings           interleaved varying buffer, numVaryingElements floats per vertex
//   numVaryingElements  floats per varying record (0 disables varying processing)
//   V0_ITa              one source vertex index per item, at [i]
//   offset              added to i to obtain the destination vertex index
//   start, end          half-open range of items to process
//
// Each thread starts at start + its global thread index and advances by the
// total number of threads in the grid, so any 1D launch configuration covers
// the whole range.
__global__ void
computeBilinearVertex(float *fVertex, int numVertexElements, float *fVaryings, int numVaryingElements,
                      const int *V0_ITa, int offset, int start, int end)
{
    for (int i = start + threadIdx.x + blockIdx.x*blockDim.x; i < end; i += blockDim.x * gridDim.x) {
        int p = V0_ITa[i];

        // Destination record of the new vertex; cleared, then the source is
        // added with weight 1.0.
        float *dstVertex = fVertex + (i+offset)*numVertexElements;
        clear(dstVertex, numVertexElements);
        addWithWeight(dstVertex, fVertex + p*numVertexElements, 1.0f, numVertexElements);

        if (numVaryingElements > 0) {
            // The varying record lives at the same destination index as the
            // vertex record (i+offset), exactly as in the templated kernel.
            // Indexing by i alone would overwrite the varying data of vertex i.
            float *dstVarying = fVaryings + (i+offset)*numVaryingElements;
            clear(dstVarying, numVaryingElements);
            addVaryingWithWeight(dstVarying, fVaryings + p*numVaryingElements, 1.0f, numVaryingElements);
        }
    }
}
// --------------------------------------------------------------------------------------------
#include "../version.h"
// XXX: this macro usage is tentative. Since cuda kernel can't be dynamically configured,
// still trying to find better way to have optimized kernel..
#define OPT_KERNEL(NUM_USER_VERTEX_ELEMENTS, NUM_VARYING_ELEMENTS, KERNEL, X, Y, ARG) \
if(numUserVertexElements == NUM_USER_VERTEX_ELEMENTS && \
numVaryingElements == NUM_VARYING_ELEMENTS) \
@ -576,4 +674,32 @@ void OsdCudaComputeLoopVertexB(float *vertex, float *varying,
V_ITa, V_IT, V_W, offset, start, end);
}
// Host-side launcher for the bilinear edge-vertex kernels.
//
//   vertex, varying         buffers handed straight to the kernels
//   numUserVertexElements   vertex element count excluding the 3 built-in elements
//   numVaryingElements      varying element count
//   E_IT                    edge index table (two source indices per item)
//   offset, start, end      destination offset and half-open item range
//
// The OPT_KERNEL lines select a compile-time specialized kernel for the
// (user, varying) element counts (0,0), (0,3), (3,0) and (3,3).
// NOTE(review): the body of OPT_KERNEL is not fully visible here; presumably it
// launches the specialization and returns on a match, which would make the
// final launch a fallback for every other combination -- confirm.
void OsdCudaComputeBilinearEdge(float *vertex, float *varying,
int numUserVertexElements, int numVaryingElements,
int *E_IT, int offset, int start, int end)
{
// Disabled hard-coded launch, kept for reference:
//computeBilinearEdge<0, 3><<<512,32>>>(vertex, varying, E_IT, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, start, end));
OPT_KERNEL(0, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, start, end));
OPT_KERNEL(3, 0, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, start, end));
OPT_KERNEL(3, 3, computeBilinearEdge, 512, 32, (vertex, varying, E_IT, offset, start, end));
// Generic kernel: 512 blocks of 32 threads. It takes the full per-vertex
// element count, hence 3 + numUserVertexElements.
computeBilinearEdge<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements,
E_IT, offset, start, end);
}
// Host-side launcher for the bilinear vertex-vertex kernels.
//
//   vertex, varying         buffers handed straight to the kernels
//   numUserVertexElements   vertex element count excluding the 3 built-in elements
//   numVaryingElements      varying element count
//   V_ITa                   vertex index table (one source index per item)
//   offset, start, end      destination offset and half-open item range
//
// The OPT_KERNEL lines select a compile-time specialized kernel for the
// (user, varying) element counts (0,0), (0,3), (3,0) and (3,3).
// NOTE(review): the body of OPT_KERNEL is not fully visible here; presumably it
// launches the specialization and returns on a match, which would make the
// final launch a fallback for every other combination -- confirm.
void OsdCudaComputeBilinearVertex(float *vertex, float *varying,
int numUserVertexElements, int numVaryingElements,
int *V_ITa, int offset, int start, int end)
{
// Disabled hard-coded launch, kept for reference:
// computeBilinearVertex<0, 3><<<512,32>>>(vertex, varying, V_ITa, offset, start, end);
OPT_KERNEL(0, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, start, end));
OPT_KERNEL(0, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, start, end));
OPT_KERNEL(3, 0, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, start, end));
OPT_KERNEL(3, 3, computeBilinearVertex, 512, 32, (vertex, varying, V_ITa, offset, start, end));
// Generic kernel: 512 blocks of 32 threads. It takes the full per-vertex
// element count, hence 3 + numUserVertexElements.
computeBilinearVertex<<<512, 32>>>(vertex, 3+numUserVertexElements, varying, numVaryingElements,
V_ITa, offset, start, end);
}
}