Implement Unicode compatibility decompositions
Based on patch from Philip Withnall. https://bugs.freedesktop.org/show_bug.cgi?id=41095
This commit is contained in:
parent
321ec29cc2
commit
378d279bbf
@ -336,6 +336,36 @@ hb_glib_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static unsigned int
|
||||
hb_glib_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs,
|
||||
hb_codepoint_t u,
|
||||
hb_codepoint_t *decomposed,
|
||||
void *user_data HB_UNUSED)
|
||||
{
|
||||
#if GLIB_CHECK_VERSION(2,29,12)
|
||||
return g_unichar_fully_decompose (u, TRUE, decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN);
|
||||
#endif
|
||||
|
||||
/* If the user doesn't have GLib >= 2.29.12 we have to perform
|
||||
* a round trip to UTF-8 and the associated memory management dance. */
|
||||
gchar utf8[6];
|
||||
gchar *utf8_decomposed, *c;
|
||||
gsize utf8_len, utf8_decomposed_len, i;
|
||||
|
||||
/* Convert @u to UTF-8 and normalise it in NFKD mode. This performs the compatibility decomposition. */
|
||||
utf8_len = g_unichar_to_utf8 (u, utf8);
|
||||
utf8_decomposed = g_utf8_normalize (utf8, utf8_len, G_NORMALIZE_NFKD);
|
||||
utf8_decomposed_len = g_utf8_strlen (utf8_decomposed, -1);
|
||||
|
||||
assert (utf8_decomposed_len <= HB_UNICODE_MAX_DECOMPOSITION_LEN);
|
||||
|
||||
for (i = 0, c = utf8_decomposed; i < utf8_decomposed_len; i++, c = g_utf8_next_char (c))
|
||||
*decomposed++ = g_utf8_get_char (c);
|
||||
|
||||
g_free (utf8_decomposed);
|
||||
|
||||
return utf8_decomposed_len;
|
||||
}
|
||||
|
||||
extern HB_INTERNAL const hb_unicode_funcs_t _hb_glib_unicode_funcs;
|
||||
const hb_unicode_funcs_t _hb_glib_unicode_funcs = {
|
||||
|
@ -207,7 +207,7 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
||||
hb_codepoint_t *b,
|
||||
void *user_data HB_UNUSED)
|
||||
{
|
||||
UChar utf16[2], normalized[20];
|
||||
UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
|
||||
int len;
|
||||
hb_bool_t ret, err;
|
||||
UErrorCode icu_err;
|
||||
@ -271,6 +271,40 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static unsigned int
|
||||
hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
||||
hb_codepoint_t u,
|
||||
hb_codepoint_t *decomposed,
|
||||
void *user_data HB_UNUSED)
|
||||
{
|
||||
UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1];
|
||||
gint len;
|
||||
int32_t utf32_len;
|
||||
hb_bool_t err;
|
||||
UErrorCode icu_err;
|
||||
|
||||
/* Copy @u into a UTF-16 array to be passed to ICU. */
|
||||
len = 0;
|
||||
err = FALSE;
|
||||
U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err);
|
||||
if (err)
|
||||
return 0;
|
||||
|
||||
/* Normalise the codepoint using NFKD mode. */
|
||||
icu_err = U_ZERO_ERROR;
|
||||
len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err);
|
||||
if (icu_err)
|
||||
return 0;
|
||||
|
||||
/* Convert the decomposed form from UTF-16 to UTF-32. */
|
||||
icu_err = U_ZERO_ERROR;
|
||||
u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err);
|
||||
if (icu_err)
|
||||
return 0;
|
||||
|
||||
return utf32_len;
|
||||
}
|
||||
|
||||
|
||||
extern HB_INTERNAL const hb_unicode_funcs_t _hb_icu_unicode_funcs;
|
||||
const hb_unicode_funcs_t _hb_icu_unicode_funcs = {
|
||||
|
@ -62,7 +62,8 @@
|
||||
* knowledge too. We need to provide assistance to the itemizer.
|
||||
*
|
||||
* - When a font does not support a character but supports its decomposition,
|
||||
* well, use the decomposition.
|
||||
* well, use the decomposition (preferring the canonical decomposition, but
|
||||
* falling back to the compatibility decomposition if necessary).
|
||||
*
|
||||
* - The Indic shaper requests decomposed output. This will handle splitting
|
||||
* matra for the Indic shaper.
|
||||
@ -111,29 +112,45 @@ decompose (hb_font_t *font, hb_buffer_t *buffer,
|
||||
return false;
|
||||
}
|
||||
|
||||
static void
|
||||
decompose_current_glyph (hb_font_t *font, hb_buffer_t *buffer,
|
||||
bool shortest)
|
||||
static bool
|
||||
decompose_compatibility (hb_font_t *font, hb_buffer_t *buffer,
|
||||
hb_codepoint_t u)
|
||||
{
|
||||
if (decompose (font, buffer, shortest, buffer->cur().codepoint))
|
||||
buffer->skip_glyph ();
|
||||
else
|
||||
buffer->next_glyph ();
|
||||
unsigned int len, i;
|
||||
hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN];
|
||||
|
||||
len = hb_unicode_decompose_compatibility (buffer->unicode, u, decomposed);
|
||||
if (!len)
|
||||
return false;
|
||||
|
||||
hb_codepoint_t glyph;
|
||||
for (i = 0; i < len; i++)
|
||||
if (!hb_font_get_glyph (font, decomposed[i], 0, &glyph))
|
||||
return false;
|
||||
|
||||
for (i = 0; i < len; i++)
|
||||
output_glyph (buffer, decomposed[i]);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
decompose_single_char_cluster (hb_font_t *font, hb_buffer_t *buffer,
|
||||
bool will_recompose)
|
||||
decompose_current_character (hb_font_t *font, hb_buffer_t *buffer,
|
||||
bool shortest)
|
||||
{
|
||||
hb_codepoint_t glyph;
|
||||
|
||||
/* If recomposing and font supports this, we're good to go */
|
||||
if (will_recompose && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph)) {
|
||||
/* Kind of a cute waterfall here... */
|
||||
if (shortest && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph))
|
||||
buffer->next_glyph ();
|
||||
else if (decompose (font, buffer, shortest, buffer->cur().codepoint))
|
||||
buffer->skip_glyph ();
|
||||
else if (!shortest && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph))
|
||||
buffer->next_glyph ();
|
||||
else if (decompose_compatibility (font, buffer, buffer->cur().codepoint))
|
||||
buffer->skip_glyph ();
|
||||
else
|
||||
buffer->next_glyph ();
|
||||
return;
|
||||
}
|
||||
|
||||
decompose_current_glyph (font, buffer, will_recompose);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -149,7 +166,7 @@ decompose_multi_char_cluster (hb_font_t *font, hb_buffer_t *buffer,
|
||||
}
|
||||
|
||||
while (buffer->idx < end)
|
||||
decompose_current_glyph (font, buffer, false);
|
||||
decompose_current_character (font, buffer, false);
|
||||
}
|
||||
|
||||
static int
|
||||
@ -188,7 +205,7 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer,
|
||||
break;
|
||||
|
||||
if (buffer->idx + 1 == end)
|
||||
decompose_single_char_cluster (font, buffer, recompose);
|
||||
decompose_current_character (font, buffer, recompose);
|
||||
else {
|
||||
decompose_multi_char_cluster (font, buffer, end);
|
||||
has_multichar_clusters = true;
|
||||
|
@ -50,6 +50,7 @@
|
||||
HB_UNICODE_FUNC_IMPLEMENT (script) \
|
||||
HB_UNICODE_FUNC_IMPLEMENT (compose) \
|
||||
HB_UNICODE_FUNC_IMPLEMENT (decompose) \
|
||||
HB_UNICODE_FUNC_IMPLEMENT (decompose_compatibility) \
|
||||
/* ^--- Add new callbacks here */
|
||||
|
||||
/* Simple callbacks are those taking a hb_codepoint_t and returning a hb_codepoint_t */
|
||||
|
@ -99,6 +99,15 @@ hb_unicode_decompose_nil (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
||||
}
|
||||
|
||||
|
||||
static unsigned int
|
||||
hb_unicode_decompose_compatibility_nil (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
||||
hb_codepoint_t u HB_UNUSED,
|
||||
hb_codepoint_t *decomposed HB_UNUSED,
|
||||
void *user_data HB_UNUSED)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
hb_unicode_funcs_t *
|
||||
hb_unicode_funcs_get_default (void)
|
||||
@ -312,6 +321,23 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs,
|
||||
return ufuncs->func.decompose (ufuncs, ab, a, b, ufuncs->user_data.decompose);
|
||||
}
|
||||
|
||||
unsigned int
|
||||
hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs,
|
||||
hb_codepoint_t u,
|
||||
hb_codepoint_t *decomposed)
|
||||
{
|
||||
unsigned int ret = ufuncs->func.decompose_compatibility (ufuncs, u,
|
||||
decomposed,
|
||||
ufuncs->user_data.decompose_compatibility);
|
||||
if (ret == 1 && u == decomposed[0]) {
|
||||
decomposed[0] = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
decomposed[ret] = 0;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
unsigned int
|
||||
@ -380,4 +406,3 @@ _hb_unicode_modified_combining_class (hb_unicode_funcs_t *ufuncs,
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Copyright © 2009 Red Hat, Inc.
|
||||
* Copyright © 2011 Codethink Limited
|
||||
* Copyright © 2011 Google, Inc.
|
||||
* Copyright © 2011,2012 Google, Inc.
|
||||
*
|
||||
* This is part of HarfBuzz, a text shaping library.
|
||||
*
|
||||
@ -122,6 +122,32 @@ typedef hb_bool_t (*hb_unicode_decompose_func_t) (hb_unicode_funcs_t *ufuncs,
|
||||
hb_codepoint_t *b,
|
||||
void *user_data);
|
||||
|
||||
/**
|
||||
* hb_unicode_decompose_compatibility_func_t:
|
||||
* @ufuncs: Unicode function structure
|
||||
* @u: codepoint to decompose
|
||||
* @decomposed: address of codepoint array (of length %HB_UNICODE_MAX_DECOMPOSITION_LEN) to write decomposition into
|
||||
* @user_data: user data pointer as passed to hb_unicode_funcs_set_decompose_compatibility_func()
|
||||
*
|
||||
* Fully decompose @u to its Unicode compatibility decomposition. The codepoints of the decomposition will be written to @decomposed.
|
||||
* The complete length of the decomposition will be returned.
|
||||
*
|
||||
* If @u has no compatibility decomposition, zero should be returned.
|
||||
*
|
||||
* The Unicode standard guarantees that a buffer of length %HB_UNICODE_MAX_DECOMPOSITION_LEN codepoints will always be sufficient for any
|
||||
* compatibility decomposition plus a terminating value of 0. Consequently, @decomposed must be allocated by the caller to be at least this length. Implementations
|
||||
* of this function type must ensure that they do not write past the provided array.
|
||||
*
|
||||
* Return value: number of codepoints in the full compatibility decomposition of @u, or 0 if no decomposition available.
|
||||
*/
|
||||
typedef unsigned int (*hb_unicode_decompose_compatibility_func_t) (hb_unicode_funcs_t *ufuncs,
|
||||
hb_codepoint_t u,
|
||||
hb_codepoint_t *decomposed,
|
||||
void *user_data);
|
||||
|
||||
/* See Unicode 6.1 for details on the maximum decomposition length. */
|
||||
#define HB_UNICODE_MAX_DECOMPOSITION_LEN (18+1) /* codepoints */
|
||||
|
||||
/* setters */
|
||||
|
||||
void
|
||||
@ -159,6 +185,10 @@ hb_unicode_funcs_set_decompose_func (hb_unicode_funcs_t *ufuncs,
|
||||
hb_unicode_decompose_func_t decompose_func,
|
||||
void *user_data, hb_destroy_func_t destroy);
|
||||
|
||||
void
|
||||
hb_unicode_funcs_set_decompose_compatibility_func (hb_unicode_funcs_t *ufuncs,
|
||||
hb_unicode_decompose_compatibility_func_t decompose_compatibility_func,
|
||||
void *user_data, hb_destroy_func_t destroy);
|
||||
|
||||
/* accessors */
|
||||
|
||||
@ -193,6 +223,11 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs,
|
||||
hb_codepoint_t *a,
|
||||
hb_codepoint_t *b);
|
||||
|
||||
unsigned int
|
||||
hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs,
|
||||
hb_codepoint_t u,
|
||||
hb_codepoint_t *decomposed);
|
||||
|
||||
HB_END_DECLS
|
||||
|
||||
#endif /* HB_UNICODE_H */
|
||||
|
@ -33,6 +33,7 @@
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
HB_BEGIN_DECLS
|
||||
|
||||
|
@ -786,6 +786,7 @@ test_unicode_normalization (gconstpointer user_data)
|
||||
{
|
||||
hb_unicode_funcs_t *uf = (hb_unicode_funcs_t *) user_data;
|
||||
gunichar a, b, ab;
|
||||
hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN];
|
||||
|
||||
|
||||
/* Test compose() */
|
||||
@ -849,6 +850,55 @@ test_unicode_normalization (gconstpointer user_data)
|
||||
g_assert (hb_unicode_decompose (uf, 0xCE31, &a, &b) && a == 0xCE20 && b == 0x11B8);
|
||||
g_assert (hb_unicode_decompose (uf, 0xCE20, &a, &b) && a == 0x110E && b == 0x1173);
|
||||
|
||||
|
||||
/* Test decompose_compatibility() */
|
||||
|
||||
/* Not decomposable */
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x0041, decomposed) == 0);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x1F632, decomposed) == 0);
|
||||
|
||||
/* Singletons */
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x00B5, decomposed) == 1 && decomposed[0] == 0x03BC);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x03D6, decomposed) == 1 && decomposed[0] == 0x03C0);
|
||||
|
||||
/* Arabic compatibility */
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0xFB54, decomposed) == 1 && decomposed[0] == 0x067B);
|
||||
|
||||
/* Longest decomposition ever */
|
||||
g_assert (18 <= HB_UNICODE_MAX_DECOMPOSITION_LEN);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0xFDFA, decomposed) == 18 && decomposed[17] == 0x0645);
|
||||
|
||||
/* Note: we deliberately don't test characters that have canonical decompositions but no
|
||||
* compatibility decomposition against the decompose_compatibility() function as that we
|
||||
* leave up to implementations (for now). */
|
||||
|
||||
/* Spaces */
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2002, decomposed) == 1 && decomposed[0] == 0x0020);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2003, decomposed) == 1 && decomposed[0] == 0x0020);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2004, decomposed) == 1 && decomposed[0] == 0x0020);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2005, decomposed) == 1 && decomposed[0] == 0x0020);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2006, decomposed) == 1 && decomposed[0] == 0x0020);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2008, decomposed) == 1 && decomposed[0] == 0x0020);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2009, decomposed) == 1 && decomposed[0] == 0x0020);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x200A, decomposed) == 1 && decomposed[0] == 0x0020);
|
||||
|
||||
/* Pairs */
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x0587, decomposed) == 2 &&
|
||||
decomposed[0] == 0x0565 && decomposed[1] == 0x0582);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2017, decomposed) == 2 &&
|
||||
decomposed[0] == 0x0020 && decomposed[1] == 0x0333);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2025, decomposed) == 2 &&
|
||||
decomposed[0] == 0x002E && decomposed[1] == 0x002E);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2033, decomposed) == 2 &&
|
||||
decomposed[0] == 0x2032 && decomposed[1] == 0x2032);
|
||||
|
||||
/* Triples */
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2026, decomposed) == 3 &&
|
||||
decomposed[0] == 0x002E && decomposed[1] == 0x002E && decomposed[2] == 0x002E);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x2034, decomposed) == 3 &&
|
||||
decomposed[0] == 0x2032 && decomposed[1] == 0x2032 && decomposed[2] == 0x2032);
|
||||
g_assert (hb_unicode_decompose_compatibility (uf, 0x213B, decomposed) == 3 &&
|
||||
decomposed[0] == 0x0046 && decomposed[1] == 0x0041 && decomposed[2] == 0x0058);
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user