gcd: convert face loop to work in batches of 32 items.

Timings on MBP6,2 (2 cores, 4 threads), catmark_car Lv4: 31.2 ms CPU, 15.8 ms GCD (was 22 ms).
This commit is contained in:
Aras Pranckevicius 2013-02-03 10:05:58 +02:00
parent c4b463668a
commit e8ceaba176

View File

@@ -72,24 +72,18 @@ void OsdGcdComputeFace(
const int *F_IT, const int *F_ITa, int offset, int start, int end,
dispatch_queue_t gcdq) {
dispatch_apply(end-start, gcdq, ^(size_t blockIdx){
int i = start+blockIdx;
int h = F_ITa[2*i];
int n = F_ITa[2*i+1];
float weight = 1.0f/n;
// XXX: should use local vertex struct variable instead of
// accumulating directly into global memory.
int dstIndex = offset + i;
vdesc->Clear(vertex, varying, dstIndex);
for (int j = 0; j < n; ++j) {
int index = F_IT[h+j];
vdesc->AddWithWeight(vertex, dstIndex, index, weight);
vdesc->AddVaryingWithWeight(varying, dstIndex, index, weight);
}
const int workSize = end-start;
dispatch_apply(workSize/GCD_WORK_STRIDE, gcdq, ^(size_t blockIdx){
const int start_i = start + blockIdx*GCD_WORK_STRIDE;
const int end_i = start_i + GCD_WORK_STRIDE;
OsdCpuComputeFace(vdesc, vertex, varying, F_IT, F_ITa, offset,
start_i, end_i);
});
const int start_e = end - workSize%GCD_WORK_STRIDE;
const int end_e = end;
if (start_e < end_e)
OsdCpuComputeFace(vdesc, vertex, varying, F_IT, F_ITa, offset,
start_e, end_e);
}
void OsdGcdComputeEdge(