[Indic] Add per-script configuration tables

This concludes the Indic shape_plan work. May do for Arabic also...
2012-08-02 14:21:40 -04:00 · 2012-08-02 14:21:40 -04:00 · 11b0e20ba4
commit 11b0e20ba4
parent 85fc6c483f
2 changed files with 164 additions and 171 deletions
--- a/src/hb-ot-shape-complex-indic-private.hh
+++ b/src/hb-ot-shape-complex-indic-private.hh
@ -157,8 +157,8 @@ enum indic_matra_category_t {

 #define IS_DEVA(u) (IN_HALF_BLOCK (u, 0x0900))
 #define IS_BENG(u) (IN_HALF_BLOCK (u, 0x0980))
-#define IS_GURM(u) (IN_HALF_BLOCK (u, 0x0A00))
-#define IS_GUJA(u) (IN_HALF_BLOCK (u, 0x0A80))
+#define IS_GURU(u) (IN_HALF_BLOCK (u, 0x0A00))
+#define IS_GUJR(u) (IN_HALF_BLOCK (u, 0x0A80))
 #define IS_ORYA(u) (IN_HALF_BLOCK (u, 0x0B00))
 #define IS_TAML(u) (IN_HALF_BLOCK (u, 0x0B80))
 #define IS_TELU(u) (IN_HALF_BLOCK (u, 0x0C00))
@ -172,8 +172,8 @@ enum indic_matra_category_t {
 #define MATRA_POS_RIGHT(u)	( \
 				  IS_DEVA(u) ? POS_AFTER_SUB  : \
 				  IS_BENG(u) ? POS_AFTER_POST : \
-				  IS_GURM(u) ? POS_AFTER_POST : \
-				  IS_GUJA(u) ? POS_AFTER_POST : \
+				  IS_GURU(u) ? POS_AFTER_POST : \
+				  IS_GUJR(u) ? POS_AFTER_POST : \
 				  IS_ORYA(u) ? POS_AFTER_POST : \
 				  IS_TAML(u) ? POS_AFTER_POST : \
 				  IS_TELU(u) ? (u <= 0x0C42 ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
@ -185,8 +185,8 @@ enum indic_matra_category_t {
 				)
 #define MATRA_POS_TOP(u)	( /* BENG and MLYM don't have top matras. */ \
 				  IS_DEVA(u) ? POS_AFTER_SUB  : \
-				  IS_GURM(u) ? POS_AFTER_POST : /* Deviate from spec */ \
-				  IS_GUJA(u) ? POS_AFTER_SUB  : \
+				  IS_GURU(u) ? POS_AFTER_POST : /* Deviate from spec */ \
+				  IS_GUJR(u) ? POS_AFTER_SUB  : \
 				  IS_ORYA(u) ? POS_AFTER_MAIN : \
 				  IS_TAML(u) ? POS_AFTER_SUB  : \
 				  IS_TELU(u) ? POS_BEFORE_SUB : \
@ -198,8 +198,8 @@ enum indic_matra_category_t {
 #define MATRA_POS_BOTTOM(u)	( \
 				  IS_DEVA(u) ? POS_AFTER_SUB  : \
 				  IS_BENG(u) ? POS_AFTER_SUB  : \
-				  IS_GURM(u) ? POS_AFTER_POST : \
-				  IS_GUJA(u) ? POS_AFTER_POST : \
+				  IS_GURU(u) ? POS_AFTER_POST : \
+				  IS_GUJR(u) ? POS_AFTER_POST : \
 				  IS_ORYA(u) ? POS_AFTER_SUB  : \
 				  IS_TAML(u) ? POS_AFTER_POST : \
 				  IS_TELU(u) ? POS_BEFORE_SUB : \
--- a/src/hb-ot-shape-complex-indic.cc
+++ b/src/hb-ot-shape-complex-indic.cc
@ -25,23 +25,12 @@
 */

 #include "hb-ot-shape-complex-indic-private.hh"
-#include "hb-ot-shape-private.hh"
 #include "hb-ot-layout-private.hh"


-#define OLD_INDIC_TAG(script) (((hb_tag_t) script) | 0x20000000)
-#define IS_OLD_INDIC_TAG(tag) ( \
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_BENGALI)	|| \
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_DEVANAGARI)	|| \
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_GUJARATI)	|| \
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_GURMUKHI)	|| \
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_KANNADA)	|| \
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_MALAYALAM)	|| \
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_ORIYA)	|| \
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_TAMIL)	|| \
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_TELUGU)	|| \
-			      0)
-
+/*
+ * Global Indic shaper options.
+ */

 struct indic_options_t
 {
@ -82,6 +71,65 @@ indic_options (void)
 }


+/*
+ * Indic configurations.  Note that we do not want to keep every single script-specific
+ * behavior in these tables necessarily.  This should mainly be used for per-script
+ * properties that are cheaper keeping here, than in the code.  Ie. if, say, one and
+ * only one script has an exception, that one script can be if'ed directly in the code,
+ * instead of adding a new flag in these structs.
+ */
+
+enum base_position_t {
+  BASE_POS_FIRST,
+  BASE_POS_LAST
+};
+enum reph_position_t {
+  REPH_POS_DEFAULT     = POS_BEFORE_POST,
+
+  REPH_POS_AFTER_MAIN  = POS_AFTER_MAIN,
+  REPH_POS_BEFORE_SUB  = POS_BEFORE_SUB,
+  REPH_POS_AFTER_SUB   = POS_AFTER_SUB,
+  REPH_POS_BEFORE_POST = POS_BEFORE_POST,
+  REPH_POS_AFTER_POST  = POS_AFTER_POST
+};
+enum reph_mode_t {
+  REPH_MODE_IMPLICIT,  /* Reph formed out of initial Ra,H sequence. */
+  REPH_MODE_EXPLICIT,  /* Reph formed out of initial Ra,H,ZWJ sequence. */
+  REPH_MODE_VIS_REPHA, /* Encoded Repha character, no reordering needed. */
+  REPH_MODE_LOG_REPHA  /* Encoded Repha character, needs reordering. */
+};
+struct indic_config_t
+{
+  hb_script_t     script;
+  bool            has_old_spec;
+  hb_codepoint_t  virama;
+  base_position_t base_pos;
+  reph_position_t reph_pos;
+  reph_mode_t     reph_mode;
+};
+
+static const indic_config_t indic_configs[] =
+{
+  /* Default.  Should be first. */
+  {HB_SCRIPT_INVALID,	false,     0,BASE_POS_LAST, REPH_POS_DEFAULT,    REPH_MODE_IMPLICIT},
+  {HB_SCRIPT_DEVANAGARI,true, 0x094D,BASE_POS_LAST, REPH_POS_BEFORE_POST,REPH_MODE_IMPLICIT},
+  {HB_SCRIPT_BENGALI,	true, 0x09CD,BASE_POS_LAST, REPH_POS_AFTER_SUB,  REPH_MODE_IMPLICIT},
+  {HB_SCRIPT_GURMUKHI,	true, 0x0A4D,BASE_POS_LAST, REPH_POS_BEFORE_SUB, REPH_MODE_IMPLICIT},
+  {HB_SCRIPT_GUJARATI,	true, 0x0ACD,BASE_POS_LAST, REPH_POS_BEFORE_POST,REPH_MODE_IMPLICIT},
+  {HB_SCRIPT_ORIYA,	true, 0x0B4D,BASE_POS_LAST, REPH_POS_AFTER_MAIN, REPH_MODE_IMPLICIT},
+  {HB_SCRIPT_TAMIL,	true, 0x0BCD,BASE_POS_LAST, REPH_POS_AFTER_POST, REPH_MODE_IMPLICIT},
+  {HB_SCRIPT_TELUGU,	true, 0x0C4D,BASE_POS_LAST, REPH_POS_AFTER_POST, REPH_MODE_EXPLICIT},
+  {HB_SCRIPT_KANNADA,	true, 0x0CCD,BASE_POS_LAST, REPH_POS_AFTER_POST, REPH_MODE_IMPLICIT},
+  {HB_SCRIPT_MALAYALAM,	true, 0x0D4D,BASE_POS_LAST, REPH_POS_AFTER_MAIN, REPH_MODE_LOG_REPHA},
+  {HB_SCRIPT_SINHALA,	false,0x0DCA,BASE_POS_FIRST,REPH_POS_AFTER_MAIN, REPH_MODE_EXPLICIT},
+  {HB_SCRIPT_KHMER,	false,0x17D2,BASE_POS_FIRST,REPH_POS_DEFAULT,    REPH_MODE_VIS_REPHA},
+};
+
+
+
+/*
+ * Indic shaper.
+ */

 struct feature_list_t {
  hb_tag_t tag;
@ -228,7 +276,7 @@ struct indic_shape_plan_t
    hb_codepoint_t glyph = virama_glyph;
    if (unlikely (virama_glyph == (hb_codepoint_t) -1))
    {
-      if (!font->get_glyph (virama, 0, &glyph))
+      if (!config->virama || !font->get_glyph (config->virama, 0, &glyph))
 	glyph = 0;
      /* Technically speaking, the spec says we should apply 'locl' to virama too.
       * Maybe one day... */
@ -242,10 +290,9 @@ struct indic_shape_plan_t
    return glyph != 0;
  }

+  const indic_config_t *config;

  bool is_old_spec;
-
-  hb_codepoint_t virama;
  hb_codepoint_t virama_glyph;

  would_substitute_feature_t pref;
@ -262,26 +309,15 @@ data_create_indic (const hb_ot_shape_plan_t *plan)
  if (unlikely (!indic_plan))
    return NULL;

-  indic_plan->is_old_spec = IS_OLD_INDIC_TAG (plan->map.get_chosen_script (0));
-  {
-    hb_codepoint_t virama;
-    switch ((int) plan->props.script) {
-      case HB_SCRIPT_DEVANAGARI:virama = 0x094D; break;
-      case HB_SCRIPT_BENGALI:	virama = 0x09CD; break;
-      case HB_SCRIPT_GURMUKHI:	virama = 0x0A4D; break;
-      case HB_SCRIPT_GUJARATI:	virama = 0x0ACD; break;
-      case HB_SCRIPT_ORIYA:	virama = 0x0B4D; break;
-      case HB_SCRIPT_TAMIL:	virama = 0x0BCD; break;
-      case HB_SCRIPT_TELUGU:	virama = 0x0C4D; break;
-      case HB_SCRIPT_KANNADA:	virama = 0x0CCD; break;
-      case HB_SCRIPT_MALAYALAM:	virama = 0x0D4D; break;
-      case HB_SCRIPT_SINHALA:	virama = 0x0DCA; break;
-      case HB_SCRIPT_KHMER:	virama = 0x17D2; break;
-      default:			virama = 0;      break;
+  indic_plan->config = &indic_configs[0];
+  for (unsigned int i = 1; i < ARRAY_LENGTH (indic_configs); i++)
+    if (plan->props.script == indic_configs[i].script) {
+      indic_plan->config = &indic_configs[i];
+      break;
    }
-    indic_plan->virama = virama;
-  }
-  indic_plan->virama_glyph = indic_plan->virama ? (hb_codepoint_t) -1 : 0;
+
+  indic_plan->is_old_spec = indic_plan->config->has_old_spec && ((plan->map.get_chosen_script (0) & 0x000000FF) != '2');
+  indic_plan->virama_glyph = (hb_codepoint_t) -1;

  indic_plan->pref.init (&plan->map, HB_TAG('p','r','e','f'));
  indic_plan->blwf.init (&plan->map, HB_TAG('b','l','w','f'));
@ -397,9 +433,9 @@ initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan, hb_buffer
 	start + 3 <= end &&
 	info[start].indic_category() == OT_Ra &&
 	info[start + 1].indic_category() == OT_H &&
-	(unlikely (buffer->props.script == HB_SCRIPT_SINHALA || buffer->props.script == HB_SCRIPT_TELUGU) ?
-	 info[start + 2].indic_category() == OT_ZWJ /* In Sinhala & Telugu, form Reph only if ZWJ is present */:
-	 !is_joiner (info[start + 2] /* In other scripts, any joiner blocks Reph formation */ )
+	(/* TODO Handle other Reph modes. */
+	 (indic_plan->config->reph_mode == REPH_MODE_IMPLICIT && !is_joiner (info[start + 2])) ||
+	 (indic_plan->config->reph_mode == REPH_MODE_EXPLICIT && info[start + 2].indic_category() == OT_ZWJ)
 	))
    {
      limit += 2;
@ -409,92 +445,84 @@ initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan, hb_buffer
      has_reph = true;
    };

-     enum base_position_t {
-       BASE_FIRST,
-       BASE_LAST
-     } base_pos;
-
-    switch ((hb_tag_t) buffer->props.script)
+    switch (indic_plan->config->base_pos == BASE_POS_LAST)
    {
-      case HB_SCRIPT_SINHALA:
-      case HB_SCRIPT_KHMER:
-	base_pos = BASE_FIRST;
-	break;
+      case BASE_POS_LAST:
+      {
+	/* -> starting from the end of the syllable, move backwards */
+	unsigned int i = end;
+	bool seen_below = false;
+	do {
+	  i--;
+	  /* -> until a consonant is found */
+	  if (is_consonant (info[i]))
+	  {
+	    /* -> that does not have a below-base or post-base form
+	     * (post-base forms have to follow below-base forms), */
+	    if (info[i].indic_position() != POS_BELOW_C &&
+		(info[i].indic_position() != POS_POST_C || seen_below))
+	    {
+	      base = i;
+	      break;
+	    }
+	    if (info[i].indic_position() == POS_BELOW_C)
+	      seen_below = true;
+
+	    /* -> or that is not a pre-base reordering Ra,
+	     *
+	     * IMPLEMENTATION NOTES:
+	     *
+	     * Our pre-base reordering Ra's are marked POS_BELOW, so will be skipped
+	     * by the logic above already.
+	     */
+
+	    /* -> or arrive at the first consonant. The consonant stopped at will
+	     * be the base. */
+	    base = i;
+	  }
+	  else
+	  {
+	    /* A ZWJ after a Halant stops the base search, and requests an explicit
+	     * half form.
+	     * A ZWJ before a Halant, requests a subjoined form instead, and hence
+	     * search continues.  This is particularly important for Bengali
+	     * sequence Ra,H,Ya that shouls form Ya-Phalaa by subjoining Ya. */
+	    if (start < i &&
+		info[i].indic_category() == OT_ZWJ &&
+		info[i - 1].indic_category() == OT_H)
+	      break;
+	  }
+	} while (i > limit);
+      }
+      break;
+
+      case BASE_POS_FIRST:
+      {
+	/* In scripts without half forms (eg. Khmer), the first consonant is always the base. */
+
+	if (!has_reph)
+	  base = limit;
+
+	/* Find the last base consonant that is not blocked by ZWJ.  If there is
+	 * a ZWJ right before a base consonant, that would request a subjoined form. */
+	for (unsigned int i = limit; i < end; i++)
+	  if (is_consonant (info[i]) && info[i].indic_position() == POS_BASE_C)
+	  {
+	    if (limit < i && info[i - 1].indic_category() == OT_ZWJ)
+	      break;
+	    else
+	      base = i;
+	  }
+
+	/* Mark all subsequent consonants as below. */
+	for (unsigned int i = base + 1; i < end; i++)
+	  if (is_consonant (info[i]) && info[i].indic_position() == POS_BASE_C)
+	    info[i].indic_position() = POS_BELOW_C;
+      }
+      break;

      default:
-	base_pos = BASE_LAST;
-	break;
-    }
-
-    if (base_pos == BASE_LAST)
-    {
-      /* -> starting from the end of the syllable, move backwards */
-      unsigned int i = end;
-      bool seen_below = false;
-      do {
-	i--;
-	/* -> until a consonant is found */
-	if (is_consonant (info[i]))
-	{
-	  /* -> that does not have a below-base or post-base form
-	   * (post-base forms have to follow below-base forms), */
-	  if (info[i].indic_position() != POS_BELOW_C &&
-	      (info[i].indic_position() != POS_POST_C || seen_below))
-	  {
-	    base = i;
-	    break;
-	  }
-	  if (info[i].indic_position() == POS_BELOW_C)
-	    seen_below = true;
-
-	  /* -> or that is not a pre-base reordering Ra,
-	   *
-	   * IMPLEMENTATION NOTES:
-	   *
-	   * Our pre-base reordering Ra's are marked POS_BELOW, so will be skipped
-	   * by the logic above already.
-	   */
-
-	  /* -> or arrive at the first consonant. The consonant stopped at will
-	   * be the base. */
-	  base = i;
-	}
-	else
-	{
-	  /* A ZWJ after a Halant stops the base search, and requests an explicit
-	   * half form.
-	   * A ZWJ before a Halant, requests a subjoined form instead, and hence
-	   * search continues.  This is particularly important for Bengali
-	   * sequence Ra,H,Ya that shouls form Ya-Phalaa by subjoining Ya. */
-	  if (start < i &&
-	      info[i].indic_category() == OT_ZWJ &&
-	      info[i - 1].indic_category() == OT_H)
-	    break;
-	}
-      } while (i > limit);
-    }
-    else
-    {
-      /* In scripts without half forms (eg. Khmer), the first consonant is always the base. */
-
-      if (!has_reph)
-	base = limit;
-
-      /* Find the last base consonant that is not blocked by ZWJ.  If there is
-       * a ZWJ right before a base consonant, that would request a subjoined form. */
-      for (unsigned int i = limit; i < end; i++)
-        if (is_consonant (info[i]) && info[i].indic_position() == POS_BASE_C)
-	{
-	  if (limit < i && info[i - 1].indic_category() == OT_ZWJ)
-	    break;
-          else
-	    base = i;
-	}
-
-      /* Mark all subsequent consonants as below. */
-      for (unsigned int i = base + 1; i < end; i++)
-        if (is_consonant (info[i]) && info[i].indic_position() == POS_BASE_C)
-	  info[i].indic_position() = POS_BELOW_C;
+      abort ();
    }

    /* -> If the syllable starts with Ra + Halant (in a script that has Reph)
@ -864,50 +892,15 @@ final_reordering_syllable (const hb_ot_shape_plan_t *plan,
      info[start].indic_position() == POS_RA_TO_BECOME_REPH &&
      info[start + 1].indic_position() != POS_RA_TO_BECOME_REPH)
  {
-      unsigned int new_reph_pos;
+    unsigned int new_reph_pos;
+    reph_position_t reph_pos = indic_plan->config->reph_pos;

-     enum reph_position_t {
-       REPH_AFTER_MAIN,
-       REPH_BEFORE_SUBSCRIPT,
-       REPH_AFTER_SUBSCRIPT,
-       REPH_BEFORE_POSTSCRIPT,
-       REPH_AFTER_POSTSCRIPT
-     } reph_pos;
-
-     /* XXX Figure out old behavior too */
-     switch ((hb_tag_t) buffer->props.script)
-     {
-       case HB_SCRIPT_MALAYALAM:
-       case HB_SCRIPT_ORIYA:
-       case HB_SCRIPT_SINHALA:
-	 reph_pos = REPH_AFTER_MAIN;
-	 break;
-
-       case HB_SCRIPT_GURMUKHI:
-	 reph_pos = REPH_BEFORE_SUBSCRIPT;
-	 break;
-
-       case HB_SCRIPT_BENGALI:
-	 reph_pos = REPH_AFTER_SUBSCRIPT;
-	 break;
-
-       default:
-       case HB_SCRIPT_DEVANAGARI:
-       case HB_SCRIPT_GUJARATI:
-	 reph_pos = REPH_BEFORE_POSTSCRIPT;
-	 break;
-
-       case HB_SCRIPT_KANNADA:
-       case HB_SCRIPT_TAMIL:
-       case HB_SCRIPT_TELUGU:
-	 reph_pos = REPH_AFTER_POSTSCRIPT;
-	 break;
-     }
+    /* XXX Figure out old behavior too */

    /*       1. If reph should be positioned after post-base consonant forms,
     *          proceed to step 5.
     */
-    if (reph_pos == REPH_AFTER_POSTSCRIPT)
+    if (reph_pos == REPH_POS_AFTER_POST)
    {
      goto reph_step_5;
    }
@ -940,7 +933,7 @@ final_reordering_syllable (const hb_ot_shape_plan_t *plan,
     *          first consonant not ligated with main, or find the first
     *          consonant that is not a potential pre-base reordering Ra.
     */
-    if (reph_pos == REPH_AFTER_MAIN)
+    if (reph_pos == REPH_POS_AFTER_MAIN)
    {
      new_reph_pos = base;
      /* XXX Skip potential pre-base reordering Ra. */
@ -956,7 +949,7 @@ final_reordering_syllable (const hb_ot_shape_plan_t *plan,
     *          first matra, syllable modifier sign or vedic sign.
     */
    /* This is our take on what step 4 is trying to say (and failing, BADLY). */
-    if (reph_pos == REPH_AFTER_SUBSCRIPT)
+    if (reph_pos == REPH_POS_AFTER_SUB)
    {
      new_reph_pos = base;
      while (new_reph_pos < end &&