[khmer] Rewrite most of shaper to better follow spec

The Khmer spec has only one reordering phase, and only simple pre-base matra and Coeng-Ro reordering. Implement that. Specifically, this was done so that different orders of the matra and Coeng-Ro sequence are recognized.

That said, some combinations are now reordered differently from Uniscribe. It is not clear whether that is intended or a bug in Uniscribe. The following two sequences render the same in Uniscribe, whereas we reorder them differently:

  U+17A0,U+17D2,U+179A,U+17C2
  U+17A0,U+17C2,U+17D2,U+179A

For that reason, our test suite numbers regressed slightly. We used to be at 34 failures; now we are at:

  KHMER: 299080 out of 299124 tests passed. 44 failed (0.0147096%)

But this is generally a good change, and it removed lots of code.

Fixes https://github.com/harfbuzz/harfbuzz/issues/1026
2018-07-31 11:45:32 -07:00 · 2018-07-31 11:45:32 -07:00 · 1a96cc825d
commit 1a96cc825d
parent f5152cea42
3 changed files with 64 additions and 413 deletions
--- a/src/hb-ot-shape-complex-indic-private.hh
+++ b/src/hb-ot-shape-complex-indic-private.hh
@ -300,7 +300,9 @@ static const hb_codepoint_t ra_chars[] = {
  0x0CB0u, /* Kannada */
  0x0D30u, /* Malayalam */	/* No Reph, Logical Repha */

-  0x0DBBu, /* Sinhala */		/* Reph formed only with ZWJ */
+  0x0DBBu, /* Sinhala */	/* Reph formed only with ZWJ */
+
+  0x179Au, /* Khmer */
 };

 static inline bool
--- a/src/hb-ot-shape-complex-khmer.cc
+++ b/src/hb-ot-shape-complex-khmer.cc
@ -42,7 +42,7 @@ khmer_features[] =
 {
  /*
   * Basic features.
-   * These features are applied in order, one at a time, after initial_reordering.
+   * These features are applied in order, one at a time, after reordering.
   */
  {HB_TAG('p','r','e','f'), F_NONE},
  {HB_TAG('b','l','w','f'), F_NONE},
@ -51,9 +51,7 @@ khmer_features[] =
  {HB_TAG('c','f','a','r'), F_NONE},
  /*
   * Other features.
-   * These features are applied all at once, after final_reordering.
-   * Default Bengali font in Windows for example has intermixed
-   * lookups for init,pres,abvs,blws features.
+   * These features are applied all at once.
   */
  {HB_TAG('p','r','e','s'), F_GLOBAL},
  {HB_TAG('a','b','v','s'), F_GLOBAL},
@ -92,13 +90,9 @@ setup_syllables (const hb_ot_shape_plan_t *plan,
 		 hb_font_t *font,
 		 hb_buffer_t *buffer);
 static void
-initial_reordering (const hb_ot_shape_plan_t *plan,
-		    hb_font_t *font,
-		    hb_buffer_t *buffer);
-static void
-final_reordering (const hb_ot_shape_plan_t *plan,
-		  hb_font_t *font,
-		  hb_buffer_t *buffer);
+reorder (const hb_ot_shape_plan_t *plan,
+	 hb_font_t *font,
+	 hb_buffer_t *buffer);
 static void
 clear_syllables (const hb_ot_shape_plan_t *plan,
 		 hb_font_t *font,
@ -119,12 +113,11 @@ collect_features_khmer (hb_ot_shape_planner_t *plan)


  unsigned int i = 0;
-  map->add_gsub_pause (initial_reordering);
+  map->add_gsub_pause (reorder);
  for (; i < KHMER_BASIC_FEATURES; i++) {
    map->add_feature (khmer_features[i].tag, 1, khmer_features[i].flags | F_MANUAL_ZWJ | F_MANUAL_ZWNJ);
    map->add_gsub_pause (nullptr);
  }
-  map->add_gsub_pause (final_reordering);
  for (; i < KHMER_NUM_FEATURES; i++) {
    map->add_feature (khmer_features[i].tag, 1, khmer_features[i].flags | F_MANUAL_ZWJ | F_MANUAL_ZWNJ);
  }
@ -264,162 +257,58 @@ setup_syllables (const hb_ot_shape_plan_t *plan HB_UNUSED,
    buffer->unsafe_to_break (start, end);
 }

-static int
-compare_khmer_order (const hb_glyph_info_t *pa, const hb_glyph_info_t *pb)
-{
-  int a = pa->khmer_position();
-  int b = pb->khmer_position();
-
-  return a < b ? -1 : a == b ? 0 : +1;
-}
-

 /* Rules from:
 * https://docs.microsoft.com/en-us/typography/script-development/devanagari */

 static void
-initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan,
-				       hb_face_t *face,
-				       hb_buffer_t *buffer,
-				       unsigned int start, unsigned int end)
+reorder_consonant_syllable (const hb_ot_shape_plan_t *plan,
+			    hb_face_t *face,
+			    hb_buffer_t *buffer,
+			    unsigned int start, unsigned int end)
 {
  const khmer_shape_plan_t *khmer_plan = (const khmer_shape_plan_t *) plan->data;
  hb_glyph_info_t *info = buffer->info;

-  /* 1. Khmer shaping assumes that a syllable will begin with a Cons, IndV, or Number. */
-
-  /* The first consonant is always the base. */
-  unsigned int base = start;
-  info[base].khmer_position() = POS_BASE_C;
-
-  /* Mark all subsequent consonants as below. */
-  for (unsigned int i = base + 1; i < end; i++)
-    if (is_consonant_or_vowel (info[i]))
-      info[i].khmer_position() = POS_BELOW_C;
-
-  /* Mark final consonants.  A final consonant is one appearing after a matra,
-   * like in Khmer. */
-  for (unsigned int i = base + 1; i < end; i++)
-    if (info[i].khmer_category() == OT_M) {
-      for (unsigned int j = i + 1; j < end; j++)
-        if (is_consonant_or_vowel (info[j])) {
-	  info[j].khmer_position() = POS_FINAL_C;
-	  break;
-	}
-      break;
-    }
-
-  /* Attach misc marks to previous char to move with them. */
+  /* Setup masks. */
  {
-    khmer_position_t last_pos = POS_START;
-    for (unsigned int i = start; i < end; i++)
-    {
-      if ((FLAG_UNSAFE (info[i].khmer_category()) & (JOINER_FLAGS | FLAG (OT_N) | FLAG (OT_RS) | MEDIAL_FLAGS | FLAG (OT_Coeng))))
-      {
-	info[i].khmer_position() = last_pos;
-	if (unlikely (info[i].khmer_category() == OT_Coeng &&
-		      info[i].khmer_position() == POS_PRE_M))
-	{
-	  /*
-	   * Uniscribe doesn't move the Halant with Left Matra.
-	   * TEST: U+092B,U+093F,U+094DE
-	   * We follow.  This is important for the Sinhala
-	   * U+0DDA split matra since it decomposes to U+0DD9,U+0DCA
-	   * where U+0DD9 is a left matra and U+0DCA is the virama.
-	   * We don't want to move the virama with the left matra.
-	   * TEST: U+0D9A,U+0DDA
-	   */
-	  for (unsigned int j = i; j > start; j--)
-	    if (info[j - 1].khmer_position() != POS_PRE_M) {
-	      info[i].khmer_position() = info[j - 1].khmer_position();
-	      break;
-	    }
-	}
-      } else if (info[i].khmer_position() != POS_SMVD) {
-        last_pos = (khmer_position_t) info[i].khmer_position();
-      }
-    }
-  }
-  /* For post-base consonants let them own anything before them
-   * since the last consonant or matra. */
-  {
-    unsigned int last = base;
-    for (unsigned int i = base + 1; i < end; i++)
-      if (is_consonant_or_vowel (info[i]))
-      {
-	for (unsigned int j = last + 1; j < i; j++)
-	  if (info[j].khmer_position() < POS_SMVD)
-	    info[j].khmer_position() = info[i].khmer_position();
-	last = i;
-      } else if (info[i].khmer_category() == OT_M)
-        last = i;
-  }
-
-  {
-    /* Use syllable() for sort accounting temporarily. */
-    unsigned int syllable = info[start].syllable();
-    for (unsigned int i = start; i < end; i++)
-      info[i].syllable() = i - start;
-
-    /* Sit tight, rock 'n roll! */
-    hb_stable_sort (info + start, end - start, compare_khmer_order);
-    /* Find base again */
-    base = end;
-    for (unsigned int i = start; i < end; i++)
-      if (info[i].khmer_position() == POS_BASE_C)
-      {
-	base = i;
-	break;
-      }
-
-    if (unlikely (end - start >= 127))
-      buffer->merge_clusters (start, end);
-    else
-      /* Note!  syllable() is a one-byte field. */
-      for (unsigned int i = base; i < end; i++)
-	if (info[i].syllable() != 255)
-	{
-	  unsigned int max = i;
-	  unsigned int j = start + info[i].syllable();
-	  while (j != i)
-	  {
-	    max = MAX (max, j);
-	    unsigned int next = start + info[j].syllable();
-	    info[j].syllable() = 255; /* So we don't process j later again. */
-	    j = next;
-	  }
-	  if (i != max)
-	    buffer->merge_clusters (i, max + 1);
-	}
-
-    /* Put syllable back in. */
-    for (unsigned int i = start; i < end; i++)
-      info[i].syllable() = syllable;
-  }
-
-  /* Setup masks now */
-
-  {
-    hb_mask_t mask;
-
    /* Post-base */
-    mask = khmer_plan->mask_array[BLWF] | khmer_plan->mask_array[ABVF] | khmer_plan->mask_array[PSTF];
-    for (unsigned int i = base + 1; i < end; i++)
+    hb_mask_t mask = khmer_plan->mask_array[BLWF] | khmer_plan->mask_array[ABVF] | khmer_plan->mask_array[PSTF];
+    for (unsigned int i = start + 1; i < end; i++)
      info[i].mask  |= mask;
  }

-  unsigned int pref_len = 2;
-  if (khmer_plan->mask_array[PREF] && base + pref_len < end)
+  unsigned int num_coengs = 0;
+  for (unsigned int i = start + 1; i < end; i++)
  {
-    /* Find a Halant,Ra sequence and mark it for pre-base-reordering processing. */
-    for (unsigned int i = base + 1; i + pref_len - 1 < end; i++) {
-      hb_codepoint_t glyphs[2];
-      for (unsigned int j = 0; j < pref_len; j++)
-        glyphs[j] = info[i + j].codepoint;
-      if (khmer_plan->pref.would_substitute (glyphs, pref_len, face))
+    /* """
+     * When a COENG + (Cons | IndV) combination are found (and subscript count
+     * is less than two) the character combination is handled according to the
+     * subscript type of the character following the COENG.
+     *
+     * ...
+     *
+     * Subscript Type 2 - The COENG + RO characters are reordered to immediately
+     * before the base glyph. Then the COENG + RO characters are assigned to have
+     * the 'pref' OpenType feature applied to them.
+     * """
+     */
+    if (info[i].khmer_category() == OT_Coeng && num_coengs <= 2 && i + 1 < end)
+    {
+      num_coengs++;
+
+      if (info[i + 1].khmer_category() == OT_Ra)
      {
-	for (unsigned int j = 0; j < pref_len; j++)
-	  info[i++].mask |= khmer_plan->mask_array[PREF];
+	for (unsigned int j = 0; j < 2; j++)
+	  info[i + j].mask |= khmer_plan->mask_array[PREF];
+
+	/* Move the Coeng,Ro sequence to the start. */
+	buffer->merge_clusters (start, i + 2);
+	hb_glyph_info_t t0 = info[i];
+	hb_glyph_info_t t1 = info[i + 1];
+	memmove (&info[start + 2], &info[start], (i - start) * sizeof (info[0]));
+	info[start] = t0;
+	info[start + 1] = t1;

 	/* Mark the subsequent stuff with 'cfar'.  Used in Khmer.
 	 * Read the feature spec.
@ -428,12 +317,22 @@ initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan,
 	 * U+1784,U+17D2,U+1782,U+17D2,U+179A
 	 */
 	if (khmer_plan->mask_array[CFAR])
-	  for (; i < end; i++)
-	    info[i].mask |= khmer_plan->mask_array[CFAR];
+	  for (unsigned int j = i + 2; j < end; j++)
+	    info[j].mask |= khmer_plan->mask_array[CFAR];

-	break;
+	num_coengs = 2; /* Done. */
      }
    }
+
+    /* Reorder left matra piece. */
+    else if (info[i].khmer_position() == POS_PRE_M)
+    {
+      /* Move to the start. */
+      buffer->merge_clusters (start, i + 1);
+      hb_glyph_info_t t = info[i];
+      memmove (&info[start + 1], &info[start], (i - start) * sizeof (info[0]));
+      info[start] = t;
+    }
  }
 }

@ -448,7 +347,7 @@ initial_reordering_syllable (const hb_ot_shape_plan_t *plan,
  {
    case broken_cluster: /* We already inserted dotted-circles, so just call the consonant_syllable. */
    case consonant_syllable:
-     initial_reordering_consonant_syllable (plan, face, buffer, start, end);
+     reorder_consonant_syllable (plan, face, buffer, start, end);
     break;

    case non_khmer_cluster:
@ -518,263 +417,26 @@ insert_dotted_circles (const hb_ot_shape_plan_t *plan HB_UNUSED,
 }

 static void
-initial_reordering (const hb_ot_shape_plan_t *plan,
-		    hb_font_t *font,
-		    hb_buffer_t *buffer)
+reorder (const hb_ot_shape_plan_t *plan,
+	 hb_font_t *font,
+	 hb_buffer_t *buffer)
 {
  insert_dotted_circles (plan, font, buffer);

  foreach_syllable (buffer, start, end)
    initial_reordering_syllable (plan, font->face, buffer, start, end);
-}
-
-static void
-final_reordering_syllable (const hb_ot_shape_plan_t *plan,
-			   hb_buffer_t *buffer,
-			   unsigned int start, unsigned int end)
-{
-  const khmer_shape_plan_t *khmer_plan = (const khmer_shape_plan_t *) plan->data;
-  hb_glyph_info_t *info = buffer->info;
-
-
-  /* This function relies heavily on halant glyphs.  Lots of ligation
-   * and possibly multiple substitutions happened prior to this
-   * phase, and that might have messed up our properties.  Recover
-   * from a particular case of that where we're fairly sure that a
-   * class of OT_Coeng is desired but has been lost. */
-  if (khmer_plan->virama_glyph)
-  {
-    unsigned int virama_glyph = khmer_plan->virama_glyph;
-    for (unsigned int i = start; i < end; i++)
-      if (info[i].codepoint == virama_glyph &&
-	  _hb_glyph_info_ligated (&info[i]) &&
-	  _hb_glyph_info_multiplied (&info[i]))
-      {
-        /* This will make sure that this glyph passes is_coeng() test. */
-	info[i].khmer_category() = OT_Coeng;
-	_hb_glyph_info_clear_ligated_and_multiplied (&info[i]);
-      }
-  }
-
-
-  /* 4. Final reordering:
-   *
-   * After the localized forms and basic shaping forms GSUB features have been
-   * applied (see below), the shaping engine performs some final glyph
-   * reordering before applying all the remaining font features to the entire
-   * syllable.
-   */
-
-  bool try_pref = !!khmer_plan->mask_array[PREF];
-
-  /* Find base again */
-  unsigned int base;
-  for (base = start; base < end; base++)
-    if (info[base].khmer_position() >= POS_BASE_C)
-    {
-      if (try_pref && base + 1 < end)
-      {
-	for (unsigned int i = base + 1; i < end; i++)
-	  if ((info[i].mask & khmer_plan->mask_array[PREF]) != 0)
-	  {
-	    if (!(_hb_glyph_info_substituted (&info[i]) &&
-		  _hb_glyph_info_ligated_and_didnt_multiply (&info[i])))
-	    {
-	      /* Ok, this was a 'pref' candidate but didn't form any.
-	       * Base is around here... */
-	      base = i;
-	      while (base < end && is_coeng (info[base]))
-		base++;
-	      info[base].khmer_position() = POS_BASE_C;
-
-	      try_pref = false;
-	    }
-	    break;
-	  }
-      }
-
-      if (start < base && info[base].khmer_position() > POS_BASE_C)
-        base--;
-      break;
-    }
-  if (base == end && start < base &&
-      is_one_of (info[base - 1], FLAG (OT_ZWJ)))
-    base--;
-  if (base < end)
-    while (start < base &&
-	   is_one_of (info[base], (FLAG (OT_N) | FLAG (OT_Coeng))))
-      base--;
-
-
-  /*   o Reorder matras:
-   *
-   *     If a pre-base matra character had been reordered before applying basic
-   *     features, the glyph can be moved closer to the main consonant based on
-   *     whether half-forms had been formed. Actual position for the matra is
-   *     defined as “after last standalone halant glyph, after initial matra
-   *     position and before the main consonant”. If ZWJ or ZWNJ follow this
-   *     halant, position is moved after it.
-   */
-
-  if (start + 1 < end && start < base) /* Otherwise there can't be any pre-base matra characters. */
-  {
-    /* If we lost track of base, alas, position before last thingy. */
-    unsigned int new_pos = base == end ? base - 2 : base - 1;
-
-    while (new_pos > start &&
-	   !(is_one_of (info[new_pos], (FLAG (OT_M) | FLAG (OT_Coeng)))))
-      new_pos--;
-
-    /* If we found no Halant we are done.
-     * Otherwise only proceed if the Halant does
-     * not belong to the Matra itself! */
-    if (is_coeng (info[new_pos]) &&
-	info[new_pos].khmer_position() != POS_PRE_M)
-    {
-      /* -> If ZWJ or ZWNJ follow this halant, position is moved after it. */
-      if (new_pos + 1 < end && is_joiner (info[new_pos + 1]))
-	new_pos++;
-    }
-    else
-      new_pos = start; /* No move. */
-
-    if (start < new_pos && info[new_pos].khmer_position () != POS_PRE_M)
-    {
-      /* Now go see if there's actually any matras... */
-      for (unsigned int i = new_pos; i > start; i--)
-	if (info[i - 1].khmer_position () == POS_PRE_M)
-	{
-	  unsigned int old_pos = i - 1;
-	  if (old_pos < base && base <= new_pos) /* Shouldn't actually happen. */
-	    base--;
-
-	  hb_glyph_info_t tmp = info[old_pos];
-	  memmove (&info[old_pos], &info[old_pos + 1], (new_pos - old_pos) * sizeof (info[0]));
-	  info[new_pos] = tmp;
-
-	  /* Note: this merge_clusters() is intentionally *after* the reordering.
-	   * Indic matra reordering is special and tricky... */
-	  buffer->merge_clusters (new_pos, MIN (end, base + 1));
-
-	  new_pos--;
-	}
-    } else {
-      for (unsigned int i = start; i < base; i++)
-	if (info[i].khmer_position () == POS_PRE_M) {
-	  buffer->merge_clusters (i, MIN (end, base + 1));
-	  break;
-	}
-    }
-  }
-
-
-  /*   o Reorder pre-base-reordering consonants:
-   *
-   *     If a pre-base-reordering consonant is found, reorder it according to
-   *     the following rules:
-   */
-
-  if (try_pref && base + 1 < end) /* Otherwise there can't be any pre-base-reordering Ra. */
-  {
-    for (unsigned int i = base + 1; i < end; i++)
-      if ((info[i].mask & khmer_plan->mask_array[PREF]) != 0)
-      {
-	/*       1. Only reorder a glyph produced by substitution during application
-	 *          of the <pref> feature. (Note that a font may shape a Ra consonant with
-	 *          the feature generally but block it in certain contexts.)
-	 */
-        /* Note: We just check that something got substituted.  We don't check that
-	 * the <pref> feature actually did it...
-	 *
-	 * Reorder pref only if it ligated. */
-	if (_hb_glyph_info_ligated_and_didnt_multiply (&info[i]))
-	{
-	  /*
-	   *       2. Try to find a target position the same way as for pre-base matra.
-	   *          If it is found, reorder pre-base consonant glyph.
-	   *
-	   *       3. If position is not found, reorder immediately before main
-	   *          consonant.
-	   */
-
-	  unsigned int new_pos = base;
-	  while (new_pos > start &&
-		 !(is_one_of (info[new_pos - 1], FLAG(OT_M) | FLAG (OT_Coeng))))
-	    new_pos--;
-
-	  /* In Khmer coeng model, a H,Ra can go *after* matras.  If it goes after a
-	   * split matra, it should be reordered to *before* the left part of such matra. */
-	  if (new_pos > start && info[new_pos - 1].khmer_category() == OT_M)
-	  {
-	    unsigned int old_pos = i;
-	    for (unsigned int j = base + 1; j < old_pos; j++)
-	      if (info[j].khmer_category() == OT_M)
-	      {
-		new_pos--;
-		break;
-	      }
-	  }
-
-	  if (new_pos > start && is_coeng (info[new_pos - 1]))
-	  {
-	    /* -> If ZWJ or ZWNJ follow this halant, position is moved after it. */
-	    if (new_pos < end && is_joiner (info[new_pos]))
-	      new_pos++;
-	  }
-
-	  {
-	    unsigned int old_pos = i;
-
-	    buffer->merge_clusters (new_pos, old_pos + 1);
-	    hb_glyph_info_t tmp = info[old_pos];
-	    memmove (&info[new_pos + 1], &info[new_pos], (old_pos - new_pos) * sizeof (info[0]));
-	    info[new_pos] = tmp;
-
-	    if (new_pos <= base && base < old_pos)
-	      base++;
-	  }
-	}
-
-        break;
-      }
-  }
-
-
-  /*
-   * Finish off the clusters and go home!
-   */
-  if (hb_options ().uniscribe_bug_compatible)
-  {
-    /* Uniscribe merges the entire syllable into a single cluster... Except for Tamil & Sinhala.
-     * This means, half forms are submerged into the main consonant's cluster.
-     * This is unnecessary, and makes cursor positioning harder, but that's what
-     * Uniscribe does. */
-    buffer->merge_clusters (start, end);
-  }
-}
-
-
-static void
-final_reordering (const hb_ot_shape_plan_t *plan,
-		  hb_font_t *font HB_UNUSED,
-		  hb_buffer_t *buffer)
-{
-  unsigned int count = buffer->len;
-  if (unlikely (!count)) return;
-
-  foreach_syllable (buffer, start, end)
-    final_reordering_syllable (plan, buffer, start, end);

  HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_category);
  HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_position);
 }

-
 static void
 clear_syllables (const hb_ot_shape_plan_t *plan HB_UNUSED,
 		 hb_font_t *font HB_UNUSED,
 		 hb_buffer_t *buffer)
 {
+  /* TODO: In USE, we clear syllables right after reorder.  Figure out
+   * what Uniscribe does. */
  hb_glyph_info_t *info = buffer->info;
  unsigned int count = buffer->len;
  for (unsigned int i = 0; i < count; i++)
--- a/src/hb-ot-shape-complex-private.hh
+++ b/src/hb-ot-shape-complex-private.hh
@ -279,20 +279,7 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner)
 	return &_hb_ot_complex_shaper_indic;

    case HB_SCRIPT_KHMER:
-      /* A number of Khmer fonts in the wild don't have a 'pref' feature,
-       * and as such won't shape properly via the Indic shaper;
-       * however, they typically have 'liga' / 'clig' features that implement
-       * the necessary "reordering" by means of ligature substitutions.
-       * So we send such pref-less fonts through the generic shaper instead. */
-      if (planner->map.found_script[0] &&
-	  hb_ot_layout_language_find_feature (planner->face, HB_OT_TAG_GSUB,
-					      planner->map.script_index[0],
-					      planner->map.language_index[0],
-					      HB_TAG ('p','r','e','f'),
-					      nullptr))
 	return &_hb_ot_complex_shaper_khmer;
-      else
-	return &_hb_ot_complex_shaper_default;

    case HB_SCRIPT_MYANMAR:
      if (planner->map.chosen_script[0] == HB_TAG ('m','y','m','2'))