// // Copyright 2013 Pixar // // Licensed under the Apache License, Version 2.0 (the "Apache License") // with the following modification; you may not use this file except in // compliance with the Apache License and the following modification to it: // Section 6. Trademarks. is deleted and replaced with: // // 6. Trademarks. This License does not grant permission to use the trade // names, trademarks, service marks, or product names of the Licensor // and its affiliates, except as required to comply with Section 4(c) of // the License and to reproduce the content of the NOTICE file. // // You may obtain a copy of the Apache License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the Apache License with the above modification is // distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the Apache License for the specific // language governing permissions and limitations under the Apache License. // #include "../osd/cpuKernel.h" #include "../osd/tbbKernel.h" #include "../osd/types.h" #include "../osd/bufferDescriptor.h" #include "../osd/patchBasis.h" #include #include #include namespace OpenSubdiv { namespace OPENSUBDIV_VERSION { namespace Osd { #define grain_size 200 template T * elementAtIndex(T * src, int index, BufferDescriptor const &desc) { return src + index * desc.stride; } static inline void clear(float *dst, BufferDescriptor const &desc) { assert(dst); memset(dst, 0, desc.length*sizeof(float)); } static inline void addWithWeight(float *dst, const float *src, int srcIndex, float weight, BufferDescriptor const &desc) { assert(src && dst); src = elementAtIndex(src, srcIndex, desc); for (int k = 0; k < desc.length; ++k) { dst[k] += src[k] * weight; } } static inline void copy(float *dst, int dstIndex, const float *src, BufferDescriptor const &desc) { assert(src && dst); dst = elementAtIndex(dst, dstIndex, desc); memcpy(dst, src, desc.length*sizeof(float)); } class TBBStencilKernel { BufferDescriptor _srcDesc; BufferDescriptor _dstDesc; float const * _vertexSrc; float * _vertexDst; int const * _sizes; int const * _offsets, * _indices; float const * _weights; public: TBBStencilKernel(float const *src, BufferDescriptor srcDesc, float *dst, BufferDescriptor dstDesc, int const * sizes, int const * offsets, int const * indices, float const * weights) : _srcDesc(srcDesc), _dstDesc(dstDesc), _vertexSrc(src), _vertexDst(dst), _sizes(sizes), _offsets(offsets), _indices(indices), _weights(weights) { } TBBStencilKernel(TBBStencilKernel const & other) { _srcDesc = other._srcDesc; _dstDesc = other._dstDesc; _sizes = other._sizes; _offsets = other._offsets; _indices = other._indices; _weights = other._weights; _vertexSrc = other._vertexSrc; _vertexDst = other._vertexDst; } void operator() (tbb::blocked_range const &r) const { #define USE_SIMD #ifdef USE_SIMD if (_srcDesc.length==4 && _srcDesc.stride==4 && _dstDesc.stride==4) { // SIMD fast path for aligned primvar data (4 floats) int offset = _offsets[r.begin()]; ComputeStencilKernel<4>(_vertexSrc, _vertexDst, _sizes, _indices+offset, _weights+offset, r.begin(), r.end()); } else if (_srcDesc.length==8 && _srcDesc.stride==4 && _dstDesc.stride==4) { // SIMD fast path for aligned primvar data (8 floats) int offset = _offsets[r.begin()]; ComputeStencilKernel<8>(_vertexSrc, _vertexDst, _sizes, _indices+offset, _weights+offset, r.begin(), r.end()); } else { #else { #endif int const * sizes = _sizes; int const * indices = _indices; float const * weights = _weights; if (r.begin()>0) { sizes += r.begin(); indices += _offsets[r.begin()]; weights += _offsets[r.begin()]; } // Slow path for non-aligned data float * result = (float*)alloca(_srcDesc.length * sizeof(float)); for (int i=r.begin(); i range(start, end, grain_size); tbb::parallel_for(range, kernel); } void TbbEvalStencils(float const * src, BufferDescriptor const &srcDesc, float * dst, BufferDescriptor const &dstDesc, float * du, BufferDescriptor const &duDesc, float * dv, BufferDescriptor const &dvDesc, int const * sizes, int const * offsets, int const * indices, float const * weights, float const * duWeights, float const * dvWeights, int start, int end) { if (src) src += srcDesc.offset; if (dst) dst += dstDesc.offset; if (du) du += duDesc.offset; if (dv) dv += dvDesc.offset; // PERFORMANCE: need to combine 3 launches together if (dst) { TBBStencilKernel kernel(src, srcDesc, dst, dstDesc, sizes, offsets, indices, weights); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } if (du) { TBBStencilKernel kernel(src, srcDesc, du, duDesc, sizes, offsets, indices, duWeights); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } if (dv) { TBBStencilKernel kernel(src, srcDesc, dv, dvDesc, sizes, offsets, indices, dvWeights); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } } void TbbEvalStencils(float const * src, BufferDescriptor const &srcDesc, float * dst, BufferDescriptor const &dstDesc, float * du, BufferDescriptor const &duDesc, float * dv, BufferDescriptor const &dvDesc, float * duu, BufferDescriptor const &duuDesc, float * duv, BufferDescriptor const &duvDesc, float * dvv, BufferDescriptor const &dvvDesc, int const * sizes, int const * offsets, int const * indices, float const * weights, float const * duWeights, float const * dvWeights, float const * duuWeights, float const * duvWeights, float const * dvvWeights, int start, int end) { if (src) src += srcDesc.offset; if (dst) dst += dstDesc.offset; if (du) du += duDesc.offset; if (dv) dv += dvDesc.offset; if (duu) duu += duuDesc.offset; if (duv) duv += duvDesc.offset; if (dvv) dvv += dvvDesc.offset; // PERFORMANCE: need to combine 3 launches together if (dst) { TBBStencilKernel kernel(src, srcDesc, dst, dstDesc, sizes, offsets, indices, weights); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } if (du) { TBBStencilKernel kernel(src, srcDesc, du, duDesc, sizes, offsets, indices, duWeights); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } if (dv) { TBBStencilKernel kernel(src, srcDesc, dv, dvDesc, sizes, offsets, indices, dvWeights); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } if (duu) { TBBStencilKernel kernel(src, srcDesc, duu, duuDesc, sizes, offsets, indices, duuWeights); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } if (duv) { TBBStencilKernel kernel(src, srcDesc, duv, duvDesc, sizes, offsets, indices, duvWeights); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } if (dvv) { TBBStencilKernel kernel(src, srcDesc, dvv, dvvDesc, sizes, offsets, indices, dvvWeights); tbb::blocked_range range(start, end, grain_size); tbb::parallel_for(range, kernel); } } // --------------------------------------------------------------------------- template struct BufferAdapter { BufferAdapter(T *p, int length, int stride) : _p(p), _length(length), _stride(stride) { } void Clear() { for (int i = 0; i < _length; ++i) _p[i] = 0; } void AddWithWeight(T const *src, float w) { if (_p) { for (int i = 0; i < _length; ++i) { _p[i] += src[i] * w; } } } const T *operator[] (int index) const { return _p + _stride * index; } BufferAdapter & operator ++() { if (_p) { _p += _stride; } return *this; } T *_p; int _length; int _stride; }; class TbbEvalPatchesKernel { BufferDescriptor _srcDesc; BufferDescriptor _dstDesc; BufferDescriptor _dstDuDesc; BufferDescriptor _dstDvDesc; BufferDescriptor _dstDuuDesc; BufferDescriptor _dstDuvDesc; BufferDescriptor _dstDvvDesc; float const * _src; float * _dst; float * _dstDu; float * _dstDv; float * _dstDuu; float * _dstDuv; float * _dstDvv; int _numPatchCoords; const PatchCoord *_patchCoords; const PatchArray *_patchArrayBuffer; const int *_patchIndexBuffer; const PatchParam *_patchParamBuffer; public: TbbEvalPatchesKernel(float const *src, BufferDescriptor srcDesc, float *dst, BufferDescriptor dstDesc, float *dstDu, BufferDescriptor dstDuDesc, float *dstDv, BufferDescriptor dstDvDesc, float *dstDuu, BufferDescriptor dstDuuDesc, float *dstDuv, BufferDescriptor dstDuvDesc, float *dstDvv, BufferDescriptor dstDvvDesc, int numPatchCoords, const PatchCoord *patchCoords, const PatchArray *patchArrayBuffer, const int *patchIndexBuffer, const PatchParam *patchParamBuffer) : _srcDesc(srcDesc), _dstDesc(dstDesc), _dstDuDesc(dstDuDesc), _dstDvDesc(dstDvDesc), _dstDuuDesc(dstDuuDesc), _dstDuvDesc(dstDuvDesc), _dstDvvDesc(dstDvvDesc), _src(src), _dst(dst), _dstDu(dstDu), _dstDv(dstDv), _dstDuu(dstDuu), _dstDuv(dstDuv), _dstDvv(dstDvv), _numPatchCoords(numPatchCoords), _patchCoords(patchCoords), _patchArrayBuffer(patchArrayBuffer), _patchIndexBuffer(patchIndexBuffer), _patchParamBuffer(patchParamBuffer) { } void operator() (tbb::blocked_range const &r) const { if (_dstDu == NULL && _dstDv == NULL) { compute(r); } else if (_dstDuu == NULL && _dstDuv == NULL && _dstDvv == NULL) { computeWith1stDerivative(r); } else { computeWith2ndDerivative(r); } } void compute(tbb::blocked_range const &r) const { float wP[20]; BufferAdapter srcT(_src + _srcDesc.offset, _srcDesc.length, _srcDesc.stride); BufferAdapter dstT(_dst + _dstDesc.offset + r.begin() * _dstDesc.stride, _dstDesc.length, _dstDesc.stride); for (int i = r.begin(); i < r.end(); ++i) { PatchCoord const &coord = _patchCoords[i]; PatchArray const &array = _patchArrayBuffer[coord.handle.arrayIndex]; Osd::PatchParam const & paramStruct = _patchParamBuffer[coord.handle.patchIndex]; OsdPatchParam param = OsdPatchParamInit( paramStruct.field0, paramStruct.field1, paramStruct.sharpness); int patchType = OsdPatchParamIsRegular(param) ? array.GetPatchTypeRegular() : array.GetPatchTypeIrregular(); int nPoints = OsdEvaluatePatchBasis(patchType, param, coord.s, coord.t, wP, 0, 0, 0, 0, 0); int indexBase = array.GetIndexBase() + array.GetStride() * (coord.handle.patchIndex - array.GetPrimitiveIdBase()); const int *cvs = &_patchIndexBuffer[indexBase]; dstT.Clear(); for (int j = 0; j < nPoints; ++j) { dstT.AddWithWeight(srcT[cvs[j]], wP[j]); } ++dstT; } } void computeWith1stDerivative(tbb::blocked_range const &r) const { float wP[20], wDu[20], wDv[20]; BufferAdapter srcT(_src + _srcDesc.offset, _srcDesc.length, _srcDesc.stride); BufferAdapter dstT(_dst + _dstDesc.offset + r.begin() * _dstDesc.stride, _dstDesc.length, _dstDesc.stride); BufferAdapter dstDuT(_dstDu + _dstDuDesc.offset + r.begin() * _dstDuDesc.stride, _dstDuDesc.length, _dstDuDesc.stride); BufferAdapter dstDvT(_dstDv + _dstDvDesc.offset + r.begin() * _dstDvDesc.stride, _dstDvDesc.length, _dstDvDesc.stride); for (int i = r.begin(); i < r.end(); ++i) { PatchCoord const &coord = _patchCoords[i]; PatchArray const &array = _patchArrayBuffer[coord.handle.arrayIndex]; Osd::PatchParam const & paramStruct = _patchParamBuffer[coord.handle.patchIndex]; OsdPatchParam param = OsdPatchParamInit( paramStruct.field0, paramStruct.field1, paramStruct.sharpness); int patchType = OsdPatchParamIsRegular(param) ? array.GetPatchTypeRegular() : array.GetPatchTypeIrregular(); int nPoints = OsdEvaluatePatchBasis(patchType, param, coord.s, coord.t, wP, wDu, wDv, 0, 0, 0); int indexBase = array.GetIndexBase() + array.GetStride() * (coord.handle.patchIndex - array.GetPrimitiveIdBase()); const int *cvs = &_patchIndexBuffer[indexBase]; dstT.Clear(); dstDuT.Clear(); dstDvT.Clear(); for (int j = 0; j < nPoints; ++j) { dstT.AddWithWeight(srcT[cvs[j]], wP[j]); dstDuT.AddWithWeight(srcT[cvs[j]], wDu[j]); dstDvT.AddWithWeight(srcT[cvs[j]], wDv[j]); } ++dstT; ++dstDuT; ++dstDvT; } } void computeWith2ndDerivative(tbb::blocked_range const &r) const { float wP[20], wDu[20], wDv[20], wDuu[20], wDuv[20], wDvv[20]; BufferAdapter srcT(_src + _srcDesc.offset, _srcDesc.length, _srcDesc.stride); BufferAdapter dstT(_dst + _dstDesc.offset + r.begin() * _dstDesc.stride, _dstDesc.length, _dstDesc.stride); BufferAdapter dstDuT(_dstDu + _dstDuDesc.offset + r.begin() * _dstDuDesc.stride, _dstDuDesc.length, _dstDuDesc.stride); BufferAdapter dstDvT(_dstDv + _dstDvDesc.offset + r.begin() * _dstDvDesc.stride, _dstDvDesc.length, _dstDvDesc.stride); BufferAdapter dstDuuT(_dstDuu + _dstDuuDesc.offset + r.begin() * _dstDuuDesc.stride, _dstDuuDesc.length, _dstDuuDesc.stride); BufferAdapter dstDuvT(_dstDuv + _dstDuvDesc.offset + r.begin() * _dstDuvDesc.stride, _dstDuvDesc.length, _dstDuvDesc.stride); BufferAdapter dstDvvT(_dstDvv + _dstDvvDesc.offset + r.begin() * _dstDvvDesc.stride, _dstDvvDesc.length, _dstDvvDesc.stride); for (int i = r.begin(); i < r.end(); ++i) { PatchCoord const &coord = _patchCoords[i]; PatchArray const &array = _patchArrayBuffer[coord.handle.arrayIndex]; Osd::PatchParam const & paramStruct = _patchParamBuffer[coord.handle.patchIndex]; OsdPatchParam param = OsdPatchParamInit( paramStruct.field0, paramStruct.field1, paramStruct.sharpness); int patchType = OsdPatchParamIsRegular(param) ? array.GetPatchTypeRegular() : array.GetPatchTypeIrregular(); int nPoints = OsdEvaluatePatchBasis(patchType, param, coord.s, coord.t, wP, wDu, wDv, wDuu, wDuv, wDvv); int indexBase = array.GetIndexBase() + array.GetStride() * (coord.handle.patchIndex - array.GetPrimitiveIdBase()); const int *cvs = &_patchIndexBuffer[indexBase]; dstT.Clear(); dstDuT.Clear(); dstDvT.Clear(); dstDuuT.Clear(); dstDuvT.Clear(); dstDvvT.Clear(); for (int j = 0; j < nPoints; ++j) { dstT.AddWithWeight(srcT[cvs[j]], wP[j]); dstDuT.AddWithWeight(srcT[cvs[j]], wDu[j]); dstDvT.AddWithWeight(srcT[cvs[j]], wDv[j]); dstDuuT.AddWithWeight(srcT[cvs[j]], wDuu[j]); dstDuvT.AddWithWeight(srcT[cvs[j]], wDuv[j]); dstDvvT.AddWithWeight(srcT[cvs[j]], wDvv[j]); } ++dstT; ++dstDuT; ++dstDvT; ++dstDuuT; ++dstDuvT; ++dstDvvT; } } }; void TbbEvalPatches(float const *src, BufferDescriptor const &srcDesc, float *dst, BufferDescriptor const &dstDesc, float *dstDu, BufferDescriptor const &dstDuDesc, float *dstDv, BufferDescriptor const &dstDvDesc, int numPatchCoords, const PatchCoord *patchCoords, const PatchArray *patchArrayBuffer, const int *patchIndexBuffer, const PatchParam *patchParamBuffer) { TbbEvalPatchesKernel kernel(src, srcDesc, dst, dstDesc, dstDu, dstDuDesc, dstDv, dstDvDesc, NULL, BufferDescriptor(), NULL, BufferDescriptor(), NULL, BufferDescriptor(), numPatchCoords, patchCoords, patchArrayBuffer, patchIndexBuffer, patchParamBuffer); tbb::blocked_range range(0, numPatchCoords, grain_size); tbb::parallel_for(range, kernel); } void TbbEvalPatches(float const *src, BufferDescriptor const &srcDesc, float *dst, BufferDescriptor const &dstDesc, float *dstDu, BufferDescriptor const &dstDuDesc, float *dstDv, BufferDescriptor const &dstDvDesc, float *dstDuu, BufferDescriptor const &dstDuuDesc, float *dstDuv, BufferDescriptor const &dstDuvDesc, float *dstDvv, BufferDescriptor const &dstDvvDesc, int numPatchCoords, const PatchCoord *patchCoords, const PatchArray *patchArrayBuffer, const int *patchIndexBuffer, const PatchParam *patchParamBuffer) { TbbEvalPatchesKernel kernel(src, srcDesc, dst, dstDesc, dstDu, dstDuDesc, dstDv, dstDvDesc, dstDuu, dstDuuDesc, dstDuv, dstDuvDesc, dstDvv, dstDvvDesc, numPatchCoords, patchCoords, patchArrayBuffer, patchIndexBuffer, patchParamBuffer); tbb::blocked_range range(0, numPatchCoords, grain_size); tbb::parallel_for(range, kernel); } } // end namespace Osd } // end namespace OPENSUBDIV_VERSION } // end namespace OpenSubdiv