ICU-5431 Modifications to allow pre-base consonant reordering for Malayalam

X-SVN-Rev: 26090
2009-06-11 18:34:01 +00:00 · 2009-06-11 18:34:01 +00:00 · 76edc36b6e
commit 76edc36b6e
parent 9100ba632e
3 changed files with 66 additions and 8 deletions
--- a/icu4c/source/layout/IndicClassTables.cpp
+++ b/icu4c/source/layout/IndicClassTables.cpp
@ -66,6 +66,7 @@ U_NAMESPACE_BEGIN
 // special forms... (Bengali RA?)
 #define _bb  (_ct | CF_BELOW_BASE)
 #define _pb  (_ct | CF_POST_BASE)
+#define _fb  (_ct | CF_PRE_BASE)
 #define _vt  (_bb | CF_VATTU)
 #define _rv  (_vt | CF_REPH)
 #define _rp  (_pb | CF_REPH)
@ -195,7 +196,7 @@ static const IndicClassTable::CharClass mlymCharClasses[] =
    _xx, _xx, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, // 0D00 - 0D0F
    _iv, _xx, _iv, _iv, _iv, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, // 0D10 - 0D1F
    _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _pb, // 0D20 - 0D2F
-    _pb, _cn, _bb, _ct, _ct, _pb, _ct, _ct, _ct, _ct, _xx, _xx, _xx, _xx, _r2, _dr, // 0D30 - 0D3F
+    _fb, _fb, _bb, _ct, _ct, _pb, _ct, _ct, _ct, _ct, _xx, _xx, _xx, _xx, _r2, _dr, // 0D30 - 0D3F
    _dr, _dr, _dr, _dr, _xx, _xx, _l1, _l1, _dl, _xx, _s1, _s2, _s3, _vr, _xx, _xx, // 0D40 - 0D4F
    _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0D50 - 0D5F
    _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx  // 0D60 - 0D6F
--- a/icu4c/source/layout/IndicReordering.cpp
+++ b/icu4c/source/layout/IndicReordering.cpp
@ -125,6 +125,10 @@ private:
    le_int32    fSMIndex;
    FeatureMask fSMFeatures;

+    LEUnicode   fPreBaseConsonant;
+    LEUnicode   fPreBaseVirama;
+    le_int32    fPBCIndex;
+    FeatureMask fPBCFeatures;

    void saveMatra(LEUnicode matra, le_int32 matraIndex, IndicClassTable::CharClass matraClass)
    {
@ -171,7 +175,8 @@ public:
          fMpost(0), fMpostIndex(0), fLengthMark(0), fLengthMarkIndex(0), fAlLakuna(0), fAlLakunaIndex(0),
          fMatraFeatures(0), fMPreOutIndex(-1), fMPreFixups(mpreFixups),
          fVMabove(0), fVMpost(0), fVMIndex(0), fVMFeatures(0),
-          fSMabove(0), fSMbelow(0), fSMIndex(0), fSMFeatures(0)
+          fSMabove(0), fSMbelow(0), fSMIndex(0), fSMFeatures(0),
+          fPreBaseConsonant(0), fPreBaseVirama(0), fPBCIndex(0), fPBCFeatures(0)
    {
        // nothing else to do...
    }
@ -190,6 +195,8 @@ public:
        
        fVMabove = fVMpost  = 0;
        fSMabove = fSMbelow = 0;
+
+        fPreBaseConsonant = fPreBaseVirama = 0;
    }

    void writeChar(LEUnicode ch, le_uint32 charIndex, FeatureMask charFeatures)
@ -385,6 +392,14 @@ public:
        }
    }

+    // Records a pre-base consonant together with its virama so that
+    // writePreBaseConsonant() can emit the pair later.
+    //
+    //   index       - stored in fPBCIndex; used as the character index of the consonant on output
+    //   PBConsonant - the pre-base consonant itself
+    //   PBVirama    - the virama that accompanies the consonant
+    //   features    - feature mask applied to both characters when they are written
+    void notePreBaseConsonant(le_uint32 index,LEUnicode PBConsonant, LEUnicode PBVirama, FeatureMask features)
+    {
+        fPreBaseConsonant = PBConsonant;
+        fPreBaseVirama    = PBVirama;
+        fPBCFeatures      = features;
+        fPBCIndex         = index;
+    }
+
    void noteBaseConsonant()
    {
        if (fMPreFixups != NULL && fMPreOutIndex >= 0) {
@ -464,6 +479,22 @@ public:
        }
    }
    
+    // Emits the pre-base consonant recorded by notePreBaseConsonant(), followed by its virama.
+    // Nothing is written when no pre-base consonant is recorded (fPreBaseConsonant == 0).
+    void writePreBaseConsonant()
+    {
+        // Per the TDIL spec, consonant + virama + RRA should produce a rakar in Malayalam, but
+        // almost no Malayalam fonts are set up to handle that. Force the issue by substituting
+        // RA, which is the letter most fonts define the rakar with.
+        const LEUnicode mlymRRA = 0x0d31;
+        const LEUnicode mlymRA  = 0x0d30;
+
+        if (fPreBaseConsonant == mlymRRA) {
+            fPreBaseConsonant = mlymRA;
+        }
+
+        if (fPreBaseConsonant == 0) {
+            return;
+        }
+
+        // Consonant first, then the virama, which carries the index just before the consonant's.
+        writeChar(fPreBaseConsonant, fPBCIndex, fPBCFeatures);
+        writeChar(fPreBaseVirama, fPBCIndex - 1, fPBCFeatures);
+    }
+
    le_int32 getOutputIndex()
    {
        return fOutIndex;
@ -722,6 +753,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
                lastConsonant -= 1;
            }

+            
            IndicClassTable::CharClass charClass = CC_RESERVED;
            IndicClassTable::CharClass nextClass = CC_RESERVED;
            le_int32 baseConsonant = lastConsonant;
@ -729,9 +761,11 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
            le_int32 postBaseLimit = classTable->scriptFlags & SF_POST_BASE_LIMIT_MASK;
            le_bool  seenVattu = FALSE;
            le_bool  seenBelowBaseForm = FALSE;
+            le_bool  seenPreBaseForm = FALSE;
            le_bool  hasNukta = FALSE;
            le_bool  hasBelowBaseForm = FALSE;
            le_bool  hasPostBaseForm = FALSE;
+            le_bool  hasPreBaseForm = FALSE;

            if (postBase < markStart && classTable->isNukta(chars[postBase])) {
                charClass = CC_NUKTA;
@ -745,14 +779,22 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le

                hasBelowBaseForm = IndicClassTable::hasBelowBaseForm(charClass) && !hasNukta;
                hasPostBaseForm  = IndicClassTable::hasPostBaseForm(charClass)  && !hasNukta;
+                hasPreBaseForm = IndicClassTable::hasPreBaseForm(charClass) && !hasNukta;

                if (IndicClassTable::isConsonant(charClass)) {
                    if (postBaseLimit == 0 || seenVattu ||
                        (baseConsonant > baseLimit && !classTable->isVirama(chars[baseConsonant - 1])) ||
-                        !(hasBelowBaseForm || hasPostBaseForm)) {
+                        !(hasBelowBaseForm || hasPostBaseForm || hasPreBaseForm)) {
                        break;
                    }

+                    // Note any pre-base consonants
+                    if ( baseConsonant == lastConsonant && lastConsonant > 0 && 
+                         hasPreBaseForm && classTable->isVirama(chars[baseConsonant - 1])) {
+                        output.notePreBaseConsonant(lastConsonant,chars[lastConsonant],chars[lastConsonant-1],tagArray2);
+                        seenPreBaseForm = TRUE;
+   
+                    }
                    // consonants with nuktas are never vattus
                    seenVattu = IndicClassTable::isVattu(charClass) && !hasNukta;

@ -785,12 +827,14 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
            }

            // write any pre-base consonants
+            output.writePreBaseConsonant();
+
            le_bool supressVattu = TRUE;

            for (i = baseLimit; i < baseConsonant; i += 1) {
                LEUnicode ch = chars[i];
-                // Don't put 'blwf' on first consonant.
-                FeatureMask features = (i == baseLimit? tagArray2 : tagArray1);
+                // Don't put 'pstf' or 'blwf' on anything before the base consonant.
+                FeatureMask features = tagArray1 & ~( pstfFeatureMask | blwfFeatureMask );

                charClass = classTable->getCharClass(ch);
                nextClass = classTable->getCharClass(chars[i + 1]);
@ -841,7 +885,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
            }

            // write below-base consonants
-            if (baseConsonant != lastConsonant) {
+            if (baseConsonant != lastConsonant && !seenPreBaseForm) {
                for (i = bcSpan + 1; i < postBase; i += 1) {
                    output.writeChar(chars[i], i, tagArray1);
                }
@ -871,7 +915,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le

            // write post-base consonants
            // FIXME: does this put the right tags on post-base consonants?
-            if (baseConsonant != lastConsonant) {
+            if (baseConsonant != lastConsonant && !seenPreBaseForm) {
                if (postBase <= lastConsonant) {
                    for (i = postBase; i <= lastConsonant; i += 1) {
                        output.writeChar(chars[i], i, tagArray3);
@ -1139,7 +1183,7 @@ le_int32 IndicReordering::v2process(const LEUnicode *chars, le_int32 charCount,
 }


-void IndicReordering::getDynamicProperties( DynamicProperties */*dProps*/, const IndicClassTable *classTable ) {
+void IndicReordering::getDynamicProperties( DynamicProperties *, const IndicClassTable *classTable ) {


    LEUnicode currentChar;
--- a/icu4c/source/layout/IndicReordering.h
+++ b/icu4c/source/layout/IndicReordering.h
@ -50,6 +50,7 @@ U_NAMESPACE_BEGIN
 #define CF_BELOW_BASE    0x10000000U
 #define CF_POST_BASE     0x08000000U
 #define CF_LENGTH_MARK   0x04000000U
+#define CF_PRE_BASE      0x02000000U

 #define CF_POS_BEFORE    0x00300000U
 #define CF_POS_BELOW     0x00200000U
@ -118,6 +119,7 @@ struct IndicClassTable
    inline le_bool hasPostBaseForm(LEUnicode ch) const;
    inline le_bool hasBelowBaseForm(LEUnicode ch) const;
    inline le_bool hasAboveBaseForm(LEUnicode ch) const;
+    inline le_bool hasPreBaseForm(LEUnicode ch) const;

    inline static le_bool isVowelModifier(CharClass charClass);
    inline static le_bool isStressMark(CharClass charClass);
@ -134,6 +136,7 @@ struct IndicClassTable
    inline static le_bool hasPostBaseForm(CharClass charClass);
    inline static le_bool hasBelowBaseForm(CharClass charClass);
    inline static le_bool hasAboveBaseForm(CharClass charClass);
+    inline static le_bool hasPreBaseForm(CharClass charClass);

    static const IndicClassTable *getScriptClassTable(le_int32 scriptCode);
 };
@ -255,6 +258,11 @@ inline le_bool IndicClassTable::hasPostBaseForm(CharClass charClass)
    return (charClass & CF_POST_BASE) != 0;
 }

+// Reports whether the CF_PRE_BASE flag is set in the given character class.
+inline le_bool IndicClassTable::hasPreBaseForm(CharClass charClass)
+{
+    const CharClass preBaseBits = charClass & CF_PRE_BASE;
+
+    return preBaseBits != 0;
+}
+
 inline le_bool IndicClassTable::hasBelowBaseForm(CharClass charClass)
 {
    return (charClass & CF_BELOW_BASE) != 0;
@ -335,6 +343,11 @@ inline le_bool IndicClassTable::hasBelowBaseForm(LEUnicode ch) const
    return hasBelowBaseForm(getCharClass(ch));
 }

+// Looks up the character class of ch in this table and reports whether it has the
+// CF_PRE_BASE flag set.
+inline le_bool IndicClassTable::hasPreBaseForm(LEUnicode ch) const
+{
+    const CharClass charClass = getCharClass(ch);
+
+    return hasPreBaseForm(charClass);
+}
+
 inline le_bool IndicClassTable::hasAboveBaseForm(LEUnicode ch) const
 {
    return hasAboveBaseForm(getCharClass(ch));