gtk/gdk/gdkmemoryformat.c
Emmanuel Gil Peyrot 1fdf5b7cf8 gdk: Optimise RGBA8 → premultiplied BGRA8 for ARM
This more than halves the total runtime of this function since the
previous commit, from 8.36% to 4.02%, and is most likely memory
bandwidth limited on this specific board now.

I tried to do a SSE2 version as well, but couldn’t find any equivalent
of the LD4/ST4 ARM instruction.
2022-02-16 16:36:33 +01:00

582 lines
15 KiB
C

/*
* Copyright © 2021 Benjamin Otte
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Authors: Benjamin Otte <otte@gnome.org>
*/
#include "config.h"
#include "gdkmemoryformatprivate.h"
#include "gsk/gl/fp16private.h"
#include <epoxy/gl.h>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
typedef struct _GdkMemoryFormatDescription GdkMemoryFormatDescription;
#define TYPED_FUNCS(name, T, R, G, B, A, bpp, scale) \
static void \
name ## _to_float (float *dest, \
const guchar *src_data, \
gsize n) \
{ \
for (gsize i = 0; i < n; i++) \
{ \
T *src = (T *) (src_data + i * bpp); \
dest[0] = (float) src[R] / scale; \
dest[1] = (float) src[G] / scale; \
dest[2] = (float) src[B] / scale; \
if (A >= 0) dest[3] = (float) src[A] / scale; else dest[3] = 1.0; \
dest += 4; \
} \
} \
\
static void \
name ## _from_float (guchar *dest_data, \
const float *src, \
gsize n) \
{ \
for (gsize i = 0; i < n; i++) \
{ \
T *dest = (T *) (dest_data + i * bpp); \
dest[R] = CLAMP (src[0] * scale + 0.5, 0, scale); \
dest[G] = CLAMP (src[1] * scale + 0.5, 0, scale); \
dest[B] = CLAMP (src[2] * scale + 0.5, 0, scale); \
if (A >= 0) dest[A] = CLAMP (src[3] * scale + 0.5, 0, scale); \
src += 4; \
} \
}
TYPED_FUNCS (b8g8r8a8_premultiplied, guchar, 2, 1, 0, 3, 4, 255)
TYPED_FUNCS (a8r8g8b8_premultiplied, guchar, 1, 2, 3, 0, 4, 255)
TYPED_FUNCS (r8g8b8a8_premultiplied, guchar, 0, 1, 2, 3, 4, 255)
TYPED_FUNCS (b8g8r8a8, guchar, 2, 1, 0, 3, 4, 255)
TYPED_FUNCS (a8r8g8b8, guchar, 1, 2, 3, 0, 4, 255)
TYPED_FUNCS (r8g8b8a8, guchar, 0, 1, 2, 3, 4, 255)
TYPED_FUNCS (a8b8g8r8, guchar, 3, 2, 1, 0, 4, 255)
TYPED_FUNCS (r8g8b8, guchar, 0, 1, 2, -1, 3, 255)
TYPED_FUNCS (b8g8r8, guchar, 2, 1, 0, -1, 3, 255)
TYPED_FUNCS (r16g16b16, guint16, 0, 1, 2, -1, 6, 65535)
TYPED_FUNCS (r16g16b16a16, guint16, 0, 1, 2, 3, 8, 65535)
static void
r16g16b16_float_to_float (float *dest,
const guchar *src_data,
gsize n)
{
guint16 *src = (guint16 *) src_data;
for (gsize i = 0; i < n; i++)
{
half_to_float (src, dest, 3);
dest[3] = 1.0;
dest += 4;
src += 3;
}
}
static void
r16g16b16_float_from_float (guchar *dest_data,
const float *src,
gsize n)
{
guint16 *dest = (guint16 *) dest_data;
for (gsize i = 0; i < n; i++)
{
float_to_half (src, dest, 3);
dest += 3;
src += 4;
}
}
static void
r16g16b16a16_float_to_float (float *dest,
const guchar *src,
gsize n)
{
half_to_float ((const guint16 *) src, dest, 4 * n);
}
static void
r16g16b16a16_float_from_float (guchar *dest,
const float *src,
gsize n)
{
float_to_half (src, (guint16 *) dest, 4 * n);
}
static void
r32g32b32_float_to_float (float *dest,
const guchar *src_data,
gsize n)
{
float *src = (float *) src_data;
for (gsize i = 0; i < n; i++)
{
dest[0] = src[0];
dest[1] = src[1];
dest[2] = src[2];
dest[3] = 1.0;
dest += 4;
src += 3;
}
}
static void
r32g32b32_float_from_float (guchar *dest_data,
const float *src,
gsize n)
{
float *dest = (float *) dest_data;
for (gsize i = 0; i < n; i++)
{
dest[0] = src[0];
dest[1] = src[1];
dest[2] = src[2];
dest += 3;
src += 4;
}
}
static void
r32g32b32a32_float_to_float (float *dest,
const guchar *src,
gsize n)
{
memcpy (dest, src, sizeof (float) * n * 4);
}
static void
r32g32b32a32_float_from_float (guchar *dest,
const float *src,
gsize n)
{
memcpy (dest, src, sizeof (float) * n * 4);
}
// This one conversion is quite important, it converts from RGBA with straight
// alpha (as found in PNG for instance) to BGRA with premultiplied alpha (the
// sole cairo format available).
static void
r8g8b8a8_to_b8g8r8a8_premultiplied (guchar *dest,
const guchar *src,
gsize n)
{
#ifdef __ARM_NEON
uint16x8_t one = vdupq_n_u16 (1);
uint16x8_t half = vdupq_n_u16 (127);
for (gsize i = n / 8; i > 0; i--)
{
// Work on “just” 8 pixels at once, since we need the full 16-bytes of
// the q registers for the multiplication.
uint8x8x4_t rgba = vld4_u8 (src);
uint8x8_t r8 = rgba.val[0];
uint8x8_t g8 = rgba.val[1];
uint8x8_t b8 = rgba.val[2];
uint8x8_t a8 = rgba.val[3];
// This is the same algorithm as premultiply(), but on packed 16-bit
// instead of float.
uint16x8_t r16 = vmull_u8 (r8, a8);
uint16x8_t g16 = vmull_u8 (g8, a8);
uint16x8_t b16 = vmull_u8 (b8, a8);
r16 = vaddq_u16 (r16, half);
g16 = vaddq_u16 (g16, half);
b16 = vaddq_u16 (b16, half);
r16 = vsraq_n_u16 (r16, r16, 8);
g16 = vsraq_n_u16 (g16, g16, 8);
b16 = vsraq_n_u16 (b16, b16, 8);
r16 = vaddq_u16 (r16, one);
g16 = vaddq_u16 (g16, one);
b16 = vaddq_u16 (b16, one);
// Just like the other one, here we use BGRA instead of RGBA!
rgba.val[0] = vshrn_n_u16 (b16, 8);
rgba.val[1] = vshrn_n_u16 (g16, 8);
rgba.val[2] = vshrn_n_u16 (r16, 8);
vst4_u8 (dest, rgba);
src += 32;
dest += 32;
}
// We want the fallthrough here for the last (up to) seven bytes of the row.
n = n % 8;
#endif // __ARM_NEON
for (; n > 0; n--)
{
guchar a = src[3];
guint16 r = (guint16)src[0] * a + 127;
guint16 g = (guint16)src[1] * a + 127;
guint16 b = (guint16)src[2] * a + 127;
dest[0] = (b + (b >> 8) + 1) >> 8;
dest[1] = (g + (g >> 8) + 1) >> 8;
dest[2] = (r + (r >> 8) + 1) >> 8;
dest[3] = a;
dest += 4;
src += 4;
}
}
struct _GdkMemoryFormatDescription
{
GdkMemoryAlpha alpha;
gsize bytes_per_pixel;
gsize alignment;
gboolean prefers_high_depth;
gboolean supports_gles;
struct {
guint internal_format;
guint format;
guint type;
} gl;
/* no premultiplication going on here */
void (* to_float) (float *, const guchar*, gsize);
void (* from_float) (guchar *, const float *, gsize);
};
#if G_BYTE_ORDER == G_LITTLE_ENDIAN
# define GDK_GL_UNSIGNED_BYTE_FLIPPED GL_UNSIGNED_INT_8_8_8_8
#elif G_BYTE_ORDER == G_BIG_ENDIAN
# define GDK_GL_UNSIGNED_BYTE_FLIPPED GL_UNSIGNED_INT_8_8_8_8_REV
#else
# error "Define the right GL flags here"
#endif
static const GdkMemoryFormatDescription memory_formats[GDK_MEMORY_N_FORMATS] = {
[GDK_MEMORY_B8G8R8A8_PREMULTIPLIED] = {
GDK_MEMORY_ALPHA_PREMULTIPLIED,
4,
G_ALIGNOF (guchar),
FALSE,
FALSE,
{ GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE },
b8g8r8a8_premultiplied_to_float,
b8g8r8a8_premultiplied_from_float,
},
[GDK_MEMORY_A8R8G8B8_PREMULTIPLIED] = {
GDK_MEMORY_ALPHA_PREMULTIPLIED,
4,
G_ALIGNOF (guchar),
FALSE,
FALSE,
{ GL_RGBA8, GL_BGRA, GDK_GL_UNSIGNED_BYTE_FLIPPED },
a8r8g8b8_premultiplied_to_float,
a8r8g8b8_premultiplied_from_float,
},
[GDK_MEMORY_R8G8B8A8_PREMULTIPLIED] = {
GDK_MEMORY_ALPHA_PREMULTIPLIED,
4,
G_ALIGNOF (guchar),
FALSE,
TRUE,
{ GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE },
r8g8b8a8_premultiplied_to_float,
r8g8b8a8_premultiplied_from_float,
},
[GDK_MEMORY_B8G8R8A8] = {
GDK_MEMORY_ALPHA_STRAIGHT,
4,
G_ALIGNOF (guchar),
FALSE,
FALSE,
{ GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE },
b8g8r8a8_to_float,
b8g8r8a8_from_float,
},
[GDK_MEMORY_A8R8G8B8] = {
GDK_MEMORY_ALPHA_STRAIGHT,
4,
G_ALIGNOF (guchar),
FALSE,
FALSE,
{ GL_RGBA8, GL_RGBA, GDK_GL_UNSIGNED_BYTE_FLIPPED },
a8r8g8b8_to_float,
a8r8g8b8_from_float,
},
[GDK_MEMORY_R8G8B8A8] = {
GDK_MEMORY_ALPHA_STRAIGHT,
4,
G_ALIGNOF (guchar),
FALSE,
TRUE,
{ GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE },
r8g8b8a8_to_float,
r8g8b8a8_from_float,
},
[GDK_MEMORY_A8B8G8R8] = {
GDK_MEMORY_ALPHA_STRAIGHT,
4,
G_ALIGNOF (guchar),
FALSE,
FALSE,
{ GL_RGBA8, GL_BGRA, GDK_GL_UNSIGNED_BYTE_FLIPPED },
a8b8g8r8_to_float,
a8b8g8r8_from_float,
},
[GDK_MEMORY_R8G8B8] = {
GDK_MEMORY_ALPHA_OPAQUE,
3,
G_ALIGNOF (guchar),
FALSE,
TRUE,
{ GL_RGB8, GL_RGB, GL_UNSIGNED_BYTE },
r8g8b8_to_float,
r8g8b8_from_float,
},
[GDK_MEMORY_B8G8R8] = {
GDK_MEMORY_ALPHA_OPAQUE,
3,
G_ALIGNOF (guchar),
FALSE,
FALSE,
{ GL_RGB8, GL_BGR, GL_UNSIGNED_BYTE },
b8g8r8_to_float,
b8g8r8_from_float,
},
[GDK_MEMORY_R16G16B16] = {
GDK_MEMORY_ALPHA_OPAQUE,
6,
G_ALIGNOF (guint16),
TRUE,
TRUE,
{ GL_RGB16, GL_RGB, GL_UNSIGNED_SHORT },
r16g16b16_to_float,
r16g16b16_from_float,
},
[GDK_MEMORY_R16G16B16A16_PREMULTIPLIED] = {
GDK_MEMORY_ALPHA_PREMULTIPLIED,
8,
G_ALIGNOF (guint16),
TRUE,
TRUE,
{ GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT },
r16g16b16a16_to_float,
r16g16b16a16_from_float,
},
[GDK_MEMORY_R16G16B16A16] = {
GDK_MEMORY_ALPHA_STRAIGHT,
8,
G_ALIGNOF (guint16),
TRUE,
TRUE,
{ GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT },
r16g16b16a16_to_float,
r16g16b16a16_from_float,
},
[GDK_MEMORY_R16G16B16_FLOAT] = {
GDK_MEMORY_ALPHA_OPAQUE,
6,
G_ALIGNOF (guint16),
TRUE,
TRUE,
{ GL_RGB16F, GL_RGB, GL_HALF_FLOAT },
r16g16b16_float_to_float,
r16g16b16_float_from_float,
},
[GDK_MEMORY_R16G16B16A16_FLOAT_PREMULTIPLIED] = {
GDK_MEMORY_ALPHA_PREMULTIPLIED,
8,
G_ALIGNOF (guint16),
TRUE,
TRUE,
{ GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT },
r16g16b16a16_float_to_float,
r16g16b16a16_float_from_float,
},
[GDK_MEMORY_R16G16B16A16_FLOAT] = {
GDK_MEMORY_ALPHA_STRAIGHT,
8,
G_ALIGNOF (guint16),
TRUE,
TRUE,
{ GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT },
r16g16b16a16_float_to_float,
r16g16b16a16_float_from_float,
},
[GDK_MEMORY_R32G32B32_FLOAT] = {
GDK_MEMORY_ALPHA_OPAQUE,
12,
G_ALIGNOF (float),
TRUE,
TRUE,
{ GL_RGB32F, GL_RGB, GL_FLOAT },
r32g32b32_float_to_float,
r32g32b32_float_from_float,
},
[GDK_MEMORY_R32G32B32A32_FLOAT_PREMULTIPLIED] = {
GDK_MEMORY_ALPHA_PREMULTIPLIED,
16,
G_ALIGNOF (float),
TRUE,
TRUE,
{ GL_RGBA32F, GL_RGBA, GL_FLOAT },
r32g32b32a32_float_to_float,
r32g32b32a32_float_from_float,
},
[GDK_MEMORY_R32G32B32A32_FLOAT] = {
GDK_MEMORY_ALPHA_STRAIGHT,
16,
G_ALIGNOF (float),
TRUE,
TRUE,
{ GL_RGBA32F, GL_RGBA, GL_FLOAT },
r32g32b32a32_float_to_float,
r32g32b32a32_float_from_float,
}
};
gsize
gdk_memory_format_bytes_per_pixel (GdkMemoryFormat format)
{
return memory_formats[format].bytes_per_pixel;
}
GdkMemoryAlpha
gdk_memory_format_alpha (GdkMemoryFormat format)
{
return memory_formats[format].alpha;
}
gsize
gdk_memory_format_alignment (GdkMemoryFormat format)
{
return memory_formats[format].alignment;
}
/*<private>
* gdk_memory_format_prefers_high_depth:
* @format: a memory format
*
* Checks if the given format benefits from being rendered
* in bit depths higher than 8bits per pixel. See
* gsk_render_node_prefers_high_depth() for more information
* on this.
* Usually this is the case when
* gdk_memory_format_bytes_per_pixel() is larger than 4.
*
* Returns: %TRUE if the format benefits from being
* composited in hgiher bit depths.
**/
gboolean
gdk_memory_format_prefers_high_depth (GdkMemoryFormat format)
{
return memory_formats[format].prefers_high_depth;
}
gboolean
gdk_memory_format_gl_format (GdkMemoryFormat format,
gboolean gles,
guint *out_internal_format,
guint *out_format,
guint *out_type)
{
*out_internal_format = memory_formats[format].gl.internal_format;
*out_format = memory_formats[format].gl.format;
*out_type = memory_formats[format].gl.type;
if (memory_formats[format].alpha == GDK_MEMORY_ALPHA_STRAIGHT)
return FALSE;
if (gles && !memory_formats[format].supports_gles)
return FALSE;
return TRUE;
}
static void
premultiply (float *rgba,
gsize n)
{
for (gsize i = 0; i < n; i++)
{
rgba[0] *= rgba[3];
rgba[1] *= rgba[3];
rgba[2] *= rgba[3];
rgba += 4;
}
}
static void
unpremultiply (float *rgba,
gsize n)
{
for (gsize i = 0; i < n; i++)
{
if (rgba[3] > 1/255.0)
{
rgba[0] /= rgba[3];
rgba[1] /= rgba[3];
rgba[2] /= rgba[3];
}
rgba += 4;
}
}
void
gdk_memory_convert (guchar *dest_data,
gsize dest_stride,
GdkMemoryFormat dest_format,
const guchar *src_data,
gsize src_stride,
GdkMemoryFormat src_format,
gsize width,
gsize height)
{
const GdkMemoryFormatDescription *dest_desc = &memory_formats[dest_format];
const GdkMemoryFormatDescription *src_desc = &memory_formats[src_format];
float *tmp;
gsize y;
g_assert (dest_format < GDK_MEMORY_N_FORMATS);
g_assert (src_format < GDK_MEMORY_N_FORMATS);
if (src_format == GDK_MEMORY_R8G8B8A8 && dest_format == GDK_MEMORY_B8G8R8A8_PREMULTIPLIED)
{
for (y = 0; y < height; y++)
{
r8g8b8a8_to_b8g8r8a8_premultiplied (dest_data, src_data, width);
src_data += src_stride;
dest_data += dest_stride;
}
return;
}
tmp = g_new (float, width * 4);
for (y = 0; y < height; y++)
{
src_desc->to_float (tmp, src_data, width);
if (src_desc->alpha == GDK_MEMORY_ALPHA_PREMULTIPLIED && dest_desc->alpha == GDK_MEMORY_ALPHA_STRAIGHT)
unpremultiply (tmp, width);
else if (src_desc->alpha == GDK_MEMORY_ALPHA_STRAIGHT && dest_desc->alpha != GDK_MEMORY_ALPHA_STRAIGHT)
premultiply (tmp, width);
dest_desc->from_float (dest_data, tmp, width);
src_data += src_stride;
dest_data += dest_stride;
}
g_free (tmp);
}