forked from AuroraMiddleware/gtk
gdk: Remove pixel format conversion ARM intrinsics
This was only useful when building for AArch32 without -mfpu=neon, on
AArch64 or with -mfpu=neon gcc is smart enough to do the auto-
vectorisation, leading to code almost as good as what I wrote in
1fdf5b7cf8
.
This commit is contained in:
parent
f0e8ee3c29
commit
08099ed17d
@ -25,10 +25,6 @@
|
||||
|
||||
#include <epoxy/gl.h>
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
typedef struct _GdkMemoryFormatDescription GdkMemoryFormatDescription;
|
||||
|
||||
#define TYPED_FUNCS(name, T, R, G, B, A, bpp, scale) \
|
||||
@ -178,52 +174,6 @@ r8g8b8a8_to_b8g8r8a8_premultiplied (guchar *dest,
|
||||
const guchar *src,
|
||||
gsize n)
|
||||
{
|
||||
#ifdef __ARM_NEON
|
||||
uint16x8_t one = vdupq_n_u16 (1);
|
||||
uint16x8_t half = vdupq_n_u16 (127);
|
||||
|
||||
for (gsize i = n / 8; i > 0; i--)
|
||||
{
|
||||
// Work on “just” 8 pixels at once, since we need the full 16-bytes of
|
||||
// the q registers for the multiplication.
|
||||
uint8x8x4_t rgba = vld4_u8 (src);
|
||||
uint8x8_t r8 = rgba.val[0];
|
||||
uint8x8_t g8 = rgba.val[1];
|
||||
uint8x8_t b8 = rgba.val[2];
|
||||
uint8x8_t a8 = rgba.val[3];
|
||||
|
||||
// This is the same algorithm as premultiply(), but on packed 16-bit
|
||||
// instead of float.
|
||||
uint16x8_t r16 = vmull_u8 (r8, a8);
|
||||
uint16x8_t g16 = vmull_u8 (g8, a8);
|
||||
uint16x8_t b16 = vmull_u8 (b8, a8);
|
||||
|
||||
r16 = vaddq_u16 (r16, half);
|
||||
g16 = vaddq_u16 (g16, half);
|
||||
b16 = vaddq_u16 (b16, half);
|
||||
|
||||
r16 = vsraq_n_u16 (r16, r16, 8);
|
||||
g16 = vsraq_n_u16 (g16, g16, 8);
|
||||
b16 = vsraq_n_u16 (b16, b16, 8);
|
||||
|
||||
r16 = vaddq_u16 (r16, one);
|
||||
g16 = vaddq_u16 (g16, one);
|
||||
b16 = vaddq_u16 (b16, one);
|
||||
|
||||
// Just like the other one, here we use BGRA instead of RGBA!
|
||||
rgba.val[0] = vshrn_n_u16 (b16, 8);
|
||||
rgba.val[1] = vshrn_n_u16 (g16, 8);
|
||||
rgba.val[2] = vshrn_n_u16 (r16, 8);
|
||||
|
||||
vst4_u8 (dest, rgba);
|
||||
src += 32;
|
||||
dest += 32;
|
||||
}
|
||||
|
||||
// We want the fallthrough here for the last (up to) seven bytes of the row.
|
||||
n = n % 8;
|
||||
#endif // __ARM_NEON
|
||||
|
||||
for (; n > 0; n--)
|
||||
{
|
||||
guchar a = src[3];
|
||||
|
Loading…
Reference in New Issue
Block a user