Replace interp() with clut_{3,4}D stages.

I tried to follow exactly the same strategy as a start.
(Though I did fix the off-by-one dimensions.)

It does rather look like we only need 3D and 4D now
that I've looked at the call sites.

Looks like about a 20% speedup.

Change-Id: I8b1af64750ad1750716ee1ab0767e64591c7206a
Reviewed-on: https://skia-review.googlesource.com/32842
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
This commit is contained in:
Mike Klein 2017-08-09 18:23:25 -04:00 committed by Skia Commit-Bot
parent f4874bc5c1
commit c2f876bb8d
9 changed files with 24856 additions and 4633 deletions

View File

@ -18,59 +18,3 @@ SkColorLookUpTable::SkColorLookUpTable(uint8_t inputChannels, const uint8_t limi
SkASSERT(fLimits[i] > 1);
}
}
// Multilinear interpolation through the table, handling one input dimension per level of
// recursion.  index accumulates the table entry to sample, and stride is the step (in entries)
// between neighboring grid points of the current dimension.
template <int dim>
static Sk4f interp_dimension(const float* table, const uint8_t* limits,
                             const float* src, int index, int stride) {
    // Scale this dimension's input by (grid points - 1) to get a fractional grid coordinate.
    const int   gridPoints = limits[dim];
    const float pos        = src[dim] * (gridPoints - 1);

    // A float can't index the table, so bracket pos with the integer coordinates around it.
    const int below = (int)(pos          );
    const int above = (int)(pos + 0.9999f);

    // Sample the remaining dimensions at both bracketing coordinates.
    const int nextStride = stride * gridPoints;
    const Sk4f atBelow = interp_dimension<dim-1>(table, limits, src,
                                                 stride*below + index, nextStride);
    const Sk4f atAbove = interp_dimension<dim-1>(table, limits, src,
                                                 stride*above + index, nextStride);

    // Blend the two samples by how far pos sits past the lower coordinate.
    const float weight = pos - below;
    return (1 - weight)*atBelow + weight*atAbove;
}
// Recursion base case: no dimensions are left to interpolate, so index names a single table
// entry.  Entries are three consecutive floats; the fourth lane is filled with zero.
template <>
Sk4f interp_dimension<-1>(const float* table, const uint8_t* limits,
                          const float* src, int index, int stride) {
    const float* entry = table + 3*index;
    return { entry[0], entry[1], entry[2], 0.0f };
}
// Entry point for the recursion: interpolates dimensions dim down through 0.
template <int dim>
static Sk4f interp_dimension(const float* table, const uint8_t* limits, const float* src) {
    // Nothing has been accumulated yet, so the index starts at 0 and the stride at 1.
    const int startIndex  = 0;
    const int startStride = 1;
    return interp_dimension<dim>(table, limits, src, startIndex, startStride);
}
void SkColorLookUpTable::interp(float* dst, const float* src) const {
Sk4f rgb;
switch (fInputChannels-1) {
case 0: rgb = interp_dimension<0>(this->table(), fLimits, src); break;
case 1: rgb = interp_dimension<1>(this->table(), fLimits, src); break;
case 2: rgb = interp_dimension<2>(this->table(), fLimits, src); break;
case 3: rgb = interp_dimension<3>(this->table(), fLimits, src); break;
default: SkDEBUGFAIL("oops"); return;
}
rgb = Sk4f::Max(0, Sk4f::Min(rgb, 1));
dst[0] = rgb[0];
dst[1] = rgb[1];
dst[2] = rgb[2];
}

View File

@ -21,11 +21,6 @@ public:
SkColorLookUpTable(uint8_t inputChannels, const uint8_t limits[]);
// This always does the appropriate multilinear interpolation.
// We used to do tetrahedral for 3D tables, but found that was slower!
// src must point to fInputChannels values, one per channel.
void interp(float dst[3], const float src[]) const;
int inputChannels() const { return fInputChannels; }
int outputChannels() const { return kOutputChannels; }
@ -41,11 +36,11 @@ public:
void* operator new(size_t, void* p) { return p; }
void operator delete(void* p) { sk_free(p); }
private:
const float* table() const {
return SkTAddOffset<const float>(this, sizeof(SkColorLookUpTable));
}
private:
uint8_t fInputChannels;
uint8_t fLimits[kMaxColorChannels];
};

View File

@ -189,27 +189,24 @@ SkColorSpaceXform_A2B::SkColorSpaceXform_A2B(SkColorSpace_A2B* srcSpace,
case SkColorSpace_A2B::Element::Type::kCLUT: {
SkCSXformPrintf("CLUT (%d -> %d) stage added\n", e.colorLUT().inputChannels(),
e.colorLUT().outputChannels());
struct CallbackCtx : SkJumper_CallbackCtx {
sk_sp<const SkColorLookUpTable> clut;
// clut->interp() can't always safely alias its arguments,
// so we allocate a second buffer to hold our results.
float results[4*SkJumper_kMaxStride];
};
auto cb = fAlloc.make<CallbackCtx>();
cb->clut = sk_ref_sp(&e.colorLUT());
cb->read_from = cb->results;
cb->fn = [](SkJumper_CallbackCtx* ctx, int active_pixels) {
auto c = (CallbackCtx*)ctx;
for (int i = 0; i < active_pixels; i++) {
// Look up red, green, and blue for this pixel using 3-4 values from rgba.
c->clut->interp(c->results+4*i, c->rgba+4*i);
// If we used 3 inputs (rgb) preserve the fourth as alpha.
// If we used 4 inputs (cmyk) force alpha to 1.
c->results[4*i+3] = (3 == c->clut->inputChannels()) ? c->rgba[4*i+3] : 1.0f;
}
struct Ctx : SkJumper_ColorLookupTableCtx {
sk_sp<const SkColorLookUpTable> clut;
};
fElementsPipeline.append(SkRasterPipeline::callback, cb);
auto ctx = fAlloc.make<Ctx>();
ctx->clut = sk_ref_sp(&e.colorLUT());
ctx->table = ctx->clut->table();
for (int i = 0; i < ctx->clut->inputChannels(); i++) {
ctx->limits[i] = ctx->clut->gridPoints(i);
}
switch (e.colorLUT().inputChannels()) {
case 3: fElementsPipeline.append(SkRasterPipeline::clut_3D, ctx); break;
case 4: fElementsPipeline.append(SkRasterPipeline::clut_4D, ctx); break;
default: SkDEBUGFAIL("need to handle 1 or 2 channel color lookup tables.");
}
fElementsPipeline.append(SkRasterPipeline::clamp_0);
fElementsPipeline.append(SkRasterPipeline::clamp_1);
break;
}
case SkColorSpace_A2B::Element::Type::kMatrix:

View File

@ -91,7 +91,8 @@ struct SkJumper_Engine;
M(xy_to_2pt_conical_linear) \
M(mask_2pt_conical_degenerates) M(apply_vector_mask) \
M(byte_tables) M(byte_tables_rgb) \
M(rgb_to_hsl) M(hsl_to_rgb)
M(rgb_to_hsl) M(hsl_to_rgb) \
M(clut_3D) M(clut_4D)
class SkRasterPipeline {
public:

View File

@ -121,4 +121,9 @@ struct SkJumper_UniformColorCtx {
uint32_t rgba;
};
// Context for the clut_3D and clut_4D stages.
struct SkJumper_ColorLookupTableCtx {
// Table of colors; the stages read each entry as three consecutive floats (r,g,b).
const float* table;
// Number of grid points along each input dimension.  clut_3D reads only the first three.
int limits[4];
};
#endif//SkJumper_DEFINED

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1495,3 +1495,55 @@ STAGE(callback) {
c->fn(c, tail ? tail : kStride);
load4(c->read_from,0, &r,&g,&b,&a);
}
// Multilinear interpolation through ctx->table, handling one input dimension per level of
// recursion.  index accumulates the table entry to sample, and stride is the step (in entries)
// between neighboring grid points of the current dimension.  On return r,g,b hold the
// interpolated color; a is only ever read.
template <int dim>
SI void color_lookup_table(const SkJumper_ColorLookupTableCtx* ctx,
                           F& r, F& g, F& b, F a, U32 index, U32 stride) {
    // Dimension dim is driven by the dim-th input channel: r, g, b, then a.
    F src;
    switch(dim) {
        case 1: src = r; break;
        case 2: src = g; break;
        case 3: src = b; break;
        case 4: src = a; break;
    }

    // Scale the input by (grid points - 1) to get a fractional grid coordinate.
    int gridPoints = ctx->limits[dim-1];
    F   pos        = src * (gridPoints - 1);

    // A float can't index the table, so bracket pos with the integer coordinates around it.
    U32 below = trunc_(pos          ),
        above = trunc_(pos + 0.9999f);

    // How far pos sits past the lower coordinate; used to blend the two samples.
    F weight = pos - cast(below);

    // The remaining dimensions still read their inputs from r,g,b, so each recursive call
    // gets its own copy of them to overwrite with the color it samples.
    F belowR = r, belowG = g, belowB = b,
      aboveR = r, aboveG = g, aboveB = b;
    color_lookup_table<dim-1>(ctx, belowR,belowG,belowB,a,
                              stride*below + index, stride*gridPoints);
    color_lookup_table<dim-1>(ctx, aboveR,aboveG,aboveB,a,
                              stride*above + index, stride*gridPoints);

    r = lerp(belowR, aboveR, weight);
    g = lerp(belowG, aboveG, weight);
    b = lerp(belowB, aboveB, weight);
}
// Recursion base case: no dimensions are left to interpolate, so index names a single table
// entry.  Entries are three consecutive floats, loaded into r, g and b.
template<>
inline void color_lookup_table<0>(const SkJumper_ColorLookupTableCtx* ctx,
                                  F& r, F& g, F& b, F a, U32 index, U32 stride) {
    U32 entry = 3*index;
    r = gather(ctx->table, entry+0);
    g = gather(ctx->table, entry+1);
    b = gather(ctx->table, entry+2);
}
// Stage: replace r,g,b with the color interpolated from a 3-input lookup table (see ctx).
STAGE(clut_3D) {
// Start the accumulated index and stride at their identity values, 0 and 1.
color_lookup_table<3>(ctx, r,g,b,a, 0,1);
// This 3D color lookup table leaves alpha alone.
}
// Stage: replace r,g,b with the color interpolated from a 4-input lookup table (see ctx),
// using r,g,b,a as the four inputs.
STAGE(clut_4D) {
// Start the accumulated index and stride at their identity values, 0 and 1.
color_lookup_table<4>(ctx, r,g,b,a, 0,1);
// "a" was really CMYK's K, so we just set alpha opaque.
a = 1.0f;
}

View File

@ -10,14 +10,16 @@
#include "SkBitmap.h"
#include "SkCanvas.h"
#include "SkCodec.h"
#include "SkColorSpacePriv.h"
#include "SkColorSpace_A2B.h"
#include "SkColorSpace_XYZ.h"
#include "SkColorSpacePriv.h"
#include "SkCommandLineFlags.h"
#include "SkICCPriv.h"
#include "SkImageEncoder.h"
#include "SkMatrix44.h"
#include "SkOSFile.h"
#include "SkRasterPipeline.h"
#include "../src/jumper/SkJumper.h"
#include "sk_tool_utils.h"
@ -264,6 +266,27 @@ static int cut_size(const SkColorLookUpTable& clut, int dimOrder[4]) {
return cutWidth < cutHeight ? cutWidth : cutHeight;
}
// Interpolates clut at the single input color `in` by running a one-pixel raster pipeline:
// constant color -> clut_3D/clut_4D -> clamp to [0,1] -> store.
//   clut: the lookup table to sample.
//   out:  receives the interpolated r,g,b (3 floats).
//   in:   the input color (4 floats).
static void clut_interp(const SkColorLookUpTable& clut, float out[3], const float in[4]) {
    // This is kind of a toy implementation.
    // You generally wouldn't want to do this 1 pixel at a time.
    SkJumper_ColorLookupTableCtx ctx;
    ctx.table = clut.table();
    for (int i = 0; i < clut.inputChannels(); i++) {
        ctx.limits[i] = clut.gridPoints(i);
    }

    // store_f32 writes a whole r,g,b,a pixel (4 floats), one more than out[] has room for.
    // Store into a 4-float scratch pixel and copy back only r,g,b so we never write past out.
    float  rgba[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    float* dst     = rgba;

    SkSTArenaAlloc<256> alloc;
    SkRasterPipeline p(&alloc);
    p.append_constant_color(&alloc, in);
    p.append(clut.inputChannels() == 3 ? SkRasterPipeline::clut_3D
                                       : SkRasterPipeline::clut_4D, &ctx);
    p.append(SkRasterPipeline::clamp_0);
    p.append(SkRasterPipeline::clamp_1);
    p.append(SkRasterPipeline::store_f32, &dst);
    p.run(0,0, 1,1);

    out[0] = rgba[0];
    out[1] = rgba[1];
    out[2] = rgba[2];
}
static void draw_clut(SkCanvas* canvas, const SkColorLookUpTable& clut, int dimOrder[4]) {
dump_clut(clut);
@ -291,7 +314,7 @@ static void draw_clut(SkCanvas* canvas, const SkColorLookUpTable& clut, int dimO
const float w = row / (rows - 1.0f);
const float input[4] = {x, y, z, w};
float output[3];
clut.interp(output, input);
clut_interp(clut, output, input);
paint.setColor(SkColorSetRGB(255*output[0], 255*output[1], 255*output[2]));
canvas->drawRect(SkRect::MakeLTRB(ox + cutSize * x, oy + cutSize * y,
ox + cutSize * (x + xStep),