Reland "lowp bilerp_clamp_8888"
This is a reland of ef96fa9e83
Addition: add the flag SK_SUPPORT_LEGACY_BILERP_HIGHP to
enable fixing chrome unittests and blink tests
Original change's description:
> lowp bilerp_clamp_8888
>
> Use scaling multipy, _mm_mulhrs_epi16(Intel) and vqrdmulhq_s16(ARM),
> to implement bilerp_8888_clamp.
>
> This CL results in 756usec to 590usec improvement for
> samplingoptions_filter_1_mipmap_0 on Intel. For ARM, this improvement
> is 1180usec -> 897usec.
>
> This CL introduces scaled_mult which takes fixed-point numbers on the
> interval [-1, 1) and returns a result which is rescaled to the
> interval [-1, 1).
>
> It also introduces the notion of constrained_add(I16, U16) where the
> result is guaranteed to be U16. This avoids moving to a 32-bit integer
> for during the computation.
>
> Change-Id: I410e494364039df63e5976f433f7e68355e9cfbf
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/443896
> Reviewed-by: Brian Osman <brianosman@google.com>
> Commit-Queue: Herb Derby <herb@google.com>
Change-Id: I43e572226e11a75a6e7de5b124f2b1ae3cc37c87
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/453556
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Herb Derby <herb@google.com>
This commit is contained in:
parent
a20c60d9e5
commit
86eb628f56
@ -2908,6 +2908,8 @@ namespace lowp {
|
||||
using I16 = int16_t __attribute__((ext_vector_type(16)));
|
||||
using I32 = int32_t __attribute__((ext_vector_type(16)));
|
||||
using U32 = uint32_t __attribute__((ext_vector_type(16)));
|
||||
using I64 = int64_t __attribute__((ext_vector_type(16)));
|
||||
using U64 = uint64_t __attribute__((ext_vector_type(16)));
|
||||
using F = float __attribute__((ext_vector_type(16)));
|
||||
#else
|
||||
using U8 = uint8_t __attribute__((ext_vector_type(8)));
|
||||
@ -2915,6 +2917,8 @@ namespace lowp {
|
||||
using I16 = int16_t __attribute__((ext_vector_type(8)));
|
||||
using I32 = int32_t __attribute__((ext_vector_type(8)));
|
||||
using U32 = uint32_t __attribute__((ext_vector_type(8)));
|
||||
using I64 = int64_t __attribute__((ext_vector_type(8)));
|
||||
using U64 = uint64_t __attribute__((ext_vector_type(8)));
|
||||
using F = float __attribute__((ext_vector_type(8)));
|
||||
#endif
|
||||
|
||||
@ -3122,6 +3126,12 @@ SI F if_then_else(I32 c, F t, F e) {
|
||||
SI F max(F x, F y) { return if_then_else(x < y, y, x); }
|
||||
SI F min(F x, F y) { return if_then_else(x < y, x, y); }
|
||||
|
||||
SI I32 if_then_else(I32 c, I32 t, I32 e) {
|
||||
return (t & c) | (e & ~c);
|
||||
}
|
||||
SI I32 max(I32 x, I32 y) { return if_then_else(x < y, y, x); }
|
||||
SI I32 min(I32 x, I32 y) { return if_then_else(x < y, x, y); }
|
||||
|
||||
SI F mad(F f, F m, F a) { return f*m+a; }
|
||||
SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
|
||||
|
||||
@ -3192,6 +3202,39 @@ SI F floor_(F x) {
|
||||
return roundtrip - if_then_else(roundtrip > x, F(1), F(0));
|
||||
#endif
|
||||
}
|
||||
|
||||
// scaled_mult interprets a and b as number on [-1, 1) which are numbers in Q15 format. Functionally
|
||||
// this multiply is:
|
||||
// (2 * a * b + (1 << 15)) >> 16
|
||||
// The result is a number on [-1, 1).
|
||||
// Note: on neon this is a saturating multiply while the others are not.
|
||||
SI I16 scaled_mult(I16 a, I16 b) {
|
||||
#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX)
|
||||
return _mm256_mulhrs_epi16(a, b);
|
||||
#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
|
||||
return _mm_mulhrs_epi16(a, b);
|
||||
#elif defined(SK_CPU_ARM64)
|
||||
return vqrdmulhq_s16(a, b);
|
||||
#elif defined(JUMPER_IS_NEON)
|
||||
return vqrdmulhq_s16(a, b);
|
||||
#else
|
||||
const I32 roundingTerm = 1 << 14;
|
||||
return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> 15);
|
||||
#endif
|
||||
}
|
||||
|
||||
// This sum is to support lerp where the result will always be a positive number. In general,
|
||||
// a sum like this would require an additional bit, but because we know the range of the result
|
||||
// we know that the extra bit will always be zero.
|
||||
SI I16 constrained_add(I16 a, U16 b) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
// Ensure that a + b is on the interval [0, UINT16_MAX]
|
||||
SkASSERT(-b[i] <= a[i] && a[i] <= UINT16_MAX - b[i]);
|
||||
}
|
||||
U16 answer = (U16)a + b;
|
||||
return (I16)answer;
|
||||
}
|
||||
|
||||
SI F fract(F x) { return x - floor_(x); }
|
||||
SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); }
|
||||
|
||||
@ -3385,6 +3428,19 @@ SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
|
||||
return trunc_(y)*ctx->stride + trunc_(x);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, I32 x, I32 y) {
|
||||
// Exclusive -> inclusive.
|
||||
const I32 w = ctx->width - 1,
|
||||
h = ctx->height - 1;
|
||||
|
||||
U32 ax = cast<U32>(min(max(0, x), w)),
|
||||
ay = cast<U32>(min(max(0, y), h));
|
||||
|
||||
*ptr = (const T*)ctx->pixels;
|
||||
return ay * ctx->stride + ax;
|
||||
}
|
||||
|
||||
template <typename V, typename T>
|
||||
SI V load(const T* ptr, size_t tail) {
|
||||
V v = 0;
|
||||
@ -3496,6 +3552,10 @@ SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
|
||||
*a = cast_U16(rgba >> 16) >> 8;
|
||||
}
|
||||
|
||||
SI void from_8888(U32 rgba, I16* r, I16* g, I16* b, I16* a) {
|
||||
from_8888(rgba, (U16*)r, (U16*)g, (U16*)b, (U16*)a);
|
||||
}
|
||||
|
||||
SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
|
||||
#if 1 && defined(JUMPER_IS_NEON)
|
||||
uint8x8x4_t rgba;
|
||||
@ -3991,6 +4051,111 @@ STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2Stop
|
||||
&r,&g,&b,&a);
|
||||
}
|
||||
|
||||
SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
|
||||
SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
|
||||
*r = cast((_8888 ) & 0xff) * (1/255.0f);
|
||||
*g = cast((_8888 >> 8) & 0xff) * (1/255.0f);
|
||||
*b = cast((_8888 >> 16) & 0xff) * (1/255.0f);
|
||||
*a = cast((_8888 >> 24) ) * (1/255.0f);
|
||||
}
|
||||
|
||||
#if !defined(SK_SUPPORT_LEGACY_BILERP_HIGHP)
|
||||
STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
|
||||
// Quantize sample point and transform into lerp coordinates converting them to 16.16 fixed
|
||||
// point number.
|
||||
I32 qx = cast<I32>(floor_(65536.0f * x + 0.5f)) - 32768,
|
||||
qy = cast<I32>(floor_(65536.0f * y + 0.5f)) - 32768;
|
||||
|
||||
// Calculate screen coordinates sx & sy by flooring qx and qy.
|
||||
I32 sx = qx >> 16,
|
||||
sy = qy >> 16;
|
||||
|
||||
// We are going to perform a change of parameters for qx on [0, 1) to tx on [-1, 1).
|
||||
// This will put tx in Q15 format for use with q_mult.
|
||||
// Calculate tx and ty on the interval of [-1, 1). Give {qx} and {qy} are on the interval
|
||||
// [0, 1), where {v} is fract(v), we can transform to tx in the following manner ty follows
|
||||
// the same math:
|
||||
// tx = 2 * {qx} - 1, so
|
||||
// {qx} = (tx + 1) / 2.
|
||||
// Calculate {qx} - 1 and {qy} - 1 where the {} operation is handled by the cast, and the - 1
|
||||
// is handled by the ^ 0x8000, dividing by 2 is deferred and handled in lerpX and lerpY in
|
||||
// order to use the full 16-bit resolution.
|
||||
I16 tx = cast<I16>(qx ^ 0x8000),
|
||||
ty = cast<I16>(qy ^ 0x8000);
|
||||
|
||||
// Substituting the {qx} by the equation for tx from above into the lerp equation where v is
|
||||
// the lerped value:
|
||||
// v = {qx}*(R - L) + L,
|
||||
// v = 1/2*(tx + 1)*(R - L) + L
|
||||
// 2 * v = (tx + 1)*(R - L) + 2*L
|
||||
// = tx*R - tx*L + R - L + 2*L
|
||||
// = tx*(R - L) + (R + L).
|
||||
// Since R and L are on [0, 255] we need them on the interval [0, 1/2] to get them into form
|
||||
// for Q15_mult. If L and R where in 16.16 format, this would be done by dividing by 2^9. In
|
||||
// code, we can multiply by 2^7 to get the value directly.
|
||||
// 2 * v = tx*(R - L) + (R + L)
|
||||
// 2^-9 * 2 * v = tx*(R - L)*2^-9 + (R + L)*2^-9
|
||||
// 2^-8 * v = 2^-9 * (tx*(R - L) + (R + L))
|
||||
// v = 1/2 * (tx*(R - L) + (R + L))
|
||||
auto lerpX = [&](I16 left, I16 right) -> I16 {
|
||||
U16 middle = (U16)(right + left) << 7;
|
||||
I16 width = (right - left) << 7,
|
||||
// The constrained_add is the most subtle part of lerp. The first term is on the interval
|
||||
// [-1, 1), and the second term is on the interval is on the interval [0, 1) because
|
||||
// both terms are too high by a factor of 2 which will be handled below. (Both R and L are
|
||||
// on [0, 1/2), but the sum R + L is on the interval [0, 1).) Generally, the sum below
|
||||
// should overflow, but because we know that sum produces an output on the
|
||||
// interval [0, 1) we know that the extra bit that would be needed will always be 0. So
|
||||
// we need to be careful to treat this sum as an unsigned positive number in the divide
|
||||
// by 2 below.
|
||||
v2 = constrained_add(scaled_mult(tx, width), middle);
|
||||
// Divide by 2 to calculate v and at the same time bring the intermediate value onto the
|
||||
// interval [0, 1/2] to set up for the lerpY.
|
||||
return (I16)(((U16)(v2 + 1)) >> 1);
|
||||
};
|
||||
|
||||
const uint32_t* ptr;
|
||||
U32 ix = ix_and_ptr(&ptr, ctx, sx, sy);
|
||||
I16 leftR, leftG, leftB, leftA;
|
||||
from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
|
||||
|
||||
ix = ix_and_ptr(&ptr, ctx, sx+1, sy);
|
||||
I16 rightR, rightG, rightB, rightA;
|
||||
from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
|
||||
|
||||
I16 topR = lerpX(leftR, rightR),
|
||||
topG = lerpX(leftG, rightG),
|
||||
topB = lerpX(leftB, rightB),
|
||||
topA = lerpX(leftA, rightA);
|
||||
|
||||
ix = ix_and_ptr(&ptr, ctx, sx, sy+1);
|
||||
from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
|
||||
|
||||
ix = ix_and_ptr(&ptr, ctx, sx+1, sy+1);
|
||||
from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
|
||||
|
||||
I16 bottomR = lerpX(leftR, rightR),
|
||||
bottomG = lerpX(leftG, rightG),
|
||||
bottomB = lerpX(leftB, rightB),
|
||||
bottomA = lerpX(leftA, rightA);
|
||||
|
||||
// lerpY plays the same mathematical tricks as lerpX, but the final divide is by 256 resulting
|
||||
// in a value on [0, 255].
|
||||
auto lerpY = [&](I16 top, I16 bottom) -> U16 {
|
||||
I16 width = bottom - top,
|
||||
middle = bottom + top,
|
||||
blend = scaled_mult(ty, width) + middle;
|
||||
|
||||
return ((U16)(blend + 0x80)) >> 8;
|
||||
};
|
||||
|
||||
r = lerpY(topR, bottomR);
|
||||
g = lerpY(topG, bottomG);
|
||||
b = lerpY(topB, bottomB);
|
||||
a = lerpY(topA, bottomA);
|
||||
}
|
||||
#endif // SK_SUPPORT_LEGACY_BILERP_HIGHP
|
||||
|
||||
STAGE_GG(xy_to_unit_angle, Ctx::None) {
|
||||
F xabs = abs_(x),
|
||||
yabs = abs_(y);
|
||||
@ -4121,7 +4286,9 @@ STAGE_PP(swizzle, void* ctx) {
|
||||
NOT_IMPLEMENTED(repeat_y)
|
||||
NOT_IMPLEMENTED(negate_x)
|
||||
NOT_IMPLEMENTED(bilinear)
|
||||
#if defined(SK_SUPPORT_LEGACY_BILERP_HIGHP)
|
||||
NOT_IMPLEMENTED(bilerp_clamp_8888)
|
||||
#endif
|
||||
NOT_IMPLEMENTED(bicubic)
|
||||
NOT_IMPLEMENTED(bicubic_clamp_8888)
|
||||
NOT_IMPLEMENTED(bilinear_nx)
|
||||
|
Loading…
Reference in New Issue
Block a user