add constrained_add

This adds a check to make sure that the result of the last
add in the lerp is in range. Also, smooth out the types.

Change-Id: I853835e530f6b6790e16464db12964d68ab9ef8d
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/453718
Bot-Commit: Rubber Stamper <rubber-stamper@appspot.gserviceaccount.com>
Commit-Queue: Herb Derby <herb@google.com>
Herb Derby 2021-09-28 17:24:00 -04:00 committed by SkCQ
parent afa657d6ab
commit 83e99569bd
2 changed files with 26 additions and 23 deletions
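
The range the new assert enforces is easiest to see in scalar form. Below is a minimal sketch, assuming plain int16_t/uint16_t values instead of the V<8, ...> lane types the patch uses; constrained_add_scalar and the sample numbers are hypothetical and only illustrate the [0, UINT16_MAX] precondition on the signed-plus-unsigned add.

#include <cassert>
#include <cstdint>

// Scalar version of the check: the signed term a must not drive the sum
// b + a below 0 or above UINT16_MAX, so the result fits in uint16_t
// without wrapping.
static inline uint16_t constrained_add_scalar(int16_t a, uint16_t b) {
    assert(-b <= a && a <= UINT16_MAX - b);
    return static_cast<uint16_t>(b + a);
}

int main() {
    // A mulhrs-style lane of -1200 against a midpoint sum of 50000 stays in
    // range; a lane of +16000 against 50000 would trip the assert (66000 > UINT16_MAX).
    return constrained_add_scalar(-1200, 50000) == 48800 ? 0 : 1;
}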

@@ -23,11 +23,26 @@ static_assert(false, "This only works on clang.");
 #include <arm_neon.h>
 #endif
 
+#include <cassert>
+#include <cstdint>
+
 using Q15 = V<8, uint16_t>;
+using I16 = V<8, int16_t>;
+using U16 = V<8, uint16_t>;
+
+static inline U16 constrained_add(I16 a, U16 b) {
+    for (size_t i = 0; i < 8; i++) {
+        // Ensure that a + b is on the interval [0, UINT16_MAX]
+        assert(-b[i] <= a[i] && a[i] <= UINT16_MAX - b[i]);
+    }
+    U16 answer = b + a;
+    return answer;
+}
 
 // A pure C version of the ssse3 intrinsic mm_mulhrs_epi16;
-static inline Q15 simulate_ssse3_mm_mulhrs_epi16(Q15 a, Q15 b) {
-    Q15 result;
+static inline I16 simulate_ssse3_mm_mulhrs_epi16(I16 a, I16 b) {
+    I16 result;
     auto m = [](int16_t r, int16_t s) {
         const int32_t rounding = 1 << 14;
         int32_t temp = (int32_t)r * (int32_t)s + rounding;

@@ -65,37 +65,25 @@ static int16_t full_res_bilerp(
     return rounded >> 32;
 }
 
-// Change of parameters on t from [0, 1) to [-1, 1). This cuts the number if differences in half.
-static int16_t lerp(float t, int16_t a, int16_t b) {
-    const int logPixelScale = 7;
-    const uint16_t half = 1 << logPixelScale;
-    // t on [-1, 1).
-    Q15 qt (floor(t * 65536.0f - 32768.0f + 0.5f));
-
-    // need to pick logPixelScale to scale by addition 1/2.
-    Q15 qw ((b - a) << logPixelScale);
-    Q15 qm ((a + b) << logPixelScale);
-    Q15 answer = simulate_ssse3_mm_mulhrs_epi16(qt, qw) + qm;
-
-    // Extra shift to divide by 2.
-    return (answer[0] + half) >> (logPixelScale + 1);
-}
-
 static int16_t bilerp_1(float tx, float ty, int16_t p00, int16_t p10, int16_t p01, int16_t p11) {
     const int logPixelScale = 7;
     const int16_t half = 1 << logPixelScale;
-    Q15 qtx = floor(tx * 65536.0f - 32768.0f + 0.5f);
-    Q15 qw = (p10 - p00) << logPixelScale;
-    Q15 qm = (p10 + p00) << logPixelScale;
-    Q15 top = (simulate_ssse3_mm_mulhrs_epi16(qtx, qw) + qm + 1) >> 1;
+    I16 qtx = floor(tx * 65536.0f - 32768.0f + 0.5f);
+    I16 qw = (p10 - p00) << logPixelScale;
+    U16 qm = (p10 + p00) << logPixelScale;
+    I16 top = (I16)((U16)(constrained_add(simulate_ssse3_mm_mulhrs_epi16(qtx, qw), qm) + 1) >> 1);
 
     qw = (p11 - p01) << logPixelScale;
     qm = (p11 + p01) << logPixelScale;
-    Q15 bottom = (simulate_ssse3_mm_mulhrs_epi16(qtx, qw) + qm + 1) >> 1;
+    I16 bottom =
+            (I16)((U16)(constrained_add(simulate_ssse3_mm_mulhrs_epi16(qtx, qw), qm) + 1) >> 1);
 
-    Q15 qty = floor(ty * 65536.0f - 32768.0f + 0.5f);
+    I16 qty = floor(ty * 65536.0f - 32768.0f + 0.5f);
     qw = bottom - top;
-    qm = bottom + top;
-    Q15 scaledAnswer = simulate_ssse3_mm_mulhrs_epi16(qty, qw) + qm;
+    qm = (U16)bottom + (U16)top;
+    U16 scaledAnswer = constrained_add(simulate_ssse3_mm_mulhrs_epi16(qty, qw), qm);
 
     return (scaledAnswer[0] + half) >> (logPixelScale + 1);
 }
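
For reference, the identity the fixed-point arithmetic leans on (and the reason the final add sits near the top of the uint16 range) can be written out in floats. This is a minimal sketch, assuming scalar values; lerp_reference is a hypothetical name, not part of the patch.

// With t remapped from [0, 1) to [-1, 1) as tPrime = 2*t - 1, the
// mulhrs-style product plus the midpoint sum is exactly twice the lerp:
//   tPrime*(b - a) + (a + b) == 2*(a + t*(b - a))
// which is why the code adds 'half' and shifts by logPixelScale + 1 at the end.
static float lerp_reference(float t, float a, float b) {
    float tPrime = 2.0f * t - 1.0f;              // same remapping the qt/qtx values encode in Q15
    return (tPrime * (b - a) + (a + b)) * 0.5f;  // equals a + t*(b - a)
}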