From 76edc36b6e85ae080deb63f251bdbef90891d590 Mon Sep 17 00:00:00 2001 From: John Emmons Date: Thu, 11 Jun 2009 18:34:01 +0000 Subject: [PATCH] ICU-5431 Modifications to allow pre-base consonant reordering for Malayalam X-SVN-Rev: 26090 --- icu4c/source/layout/IndicClassTables.cpp | 3 +- icu4c/source/layout/IndicReordering.cpp | 58 +++++++++++++++++++++--- icu4c/source/layout/IndicReordering.h | 13 ++++++ 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/icu4c/source/layout/IndicClassTables.cpp b/icu4c/source/layout/IndicClassTables.cpp index 2ec6ef0de8..9275058116 100644 --- a/icu4c/source/layout/IndicClassTables.cpp +++ b/icu4c/source/layout/IndicClassTables.cpp @@ -66,6 +66,7 @@ U_NAMESPACE_BEGIN // special forms... (Bengali RA?) #define _bb (_ct | CF_BELOW_BASE) #define _pb (_ct | CF_POST_BASE) +#define _fb (_ct | CF_PRE_BASE) #define _vt (_bb | CF_VATTU) #define _rv (_vt | CF_REPH) #define _rp (_pb | CF_REPH) @@ -195,7 +196,7 @@ static const IndicClassTable::CharClass mlymCharClasses[] = _xx, _xx, _mp, _mp, _xx, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _iv, _iv, // 0D00 - 0D0F _iv, _xx, _iv, _iv, _iv, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, // 0D10 - 0D1F _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _pb, // 0D20 - 0D2F - _pb, _cn, _bb, _ct, _ct, _pb, _ct, _ct, _ct, _ct, _xx, _xx, _xx, _xx, _r2, _dr, // 0D30 - 0D3F + _fb, _fb, _bb, _ct, _ct, _pb, _ct, _ct, _ct, _ct, _xx, _xx, _xx, _xx, _r2, _dr, // 0D30 - 0D3F _dr, _dr, _dr, _dr, _xx, _xx, _l1, _l1, _dl, _xx, _s1, _s2, _s3, _vr, _xx, _xx, // 0D40 - 0D4F _xx, _xx, _xx, _xx, _xx, _xx, _xx, _m2, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0D50 - 0D5F _iv, _iv, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx // 0D60 - 0D6F diff --git a/icu4c/source/layout/IndicReordering.cpp b/icu4c/source/layout/IndicReordering.cpp index d1c1814b1b..54f4861ce0 100644 --- a/icu4c/source/layout/IndicReordering.cpp +++ b/icu4c/source/layout/IndicReordering.cpp @@ -125,6 +125,10 @@ private: le_int32 fSMIndex; FeatureMask fSMFeatures; + LEUnicode fPreBaseConsonant; + LEUnicode fPreBaseVirama; + le_int32 fPBCIndex; + FeatureMask fPBCFeatures; void saveMatra(LEUnicode matra, le_int32 matraIndex, IndicClassTable::CharClass matraClass) { @@ -171,7 +175,8 @@ public: fMpost(0), fMpostIndex(0), fLengthMark(0), fLengthMarkIndex(0), fAlLakuna(0), fAlLakunaIndex(0), fMatraFeatures(0), fMPreOutIndex(-1), fMPreFixups(mpreFixups), fVMabove(0), fVMpost(0), fVMIndex(0), fVMFeatures(0), - fSMabove(0), fSMbelow(0), fSMIndex(0), fSMFeatures(0) + fSMabove(0), fSMbelow(0), fSMIndex(0), fSMFeatures(0), + fPreBaseConsonant(0), fPreBaseVirama(0), fPBCIndex(0), fPBCFeatures(0) { // nothing else to do... } @@ -190,6 +195,8 @@ public: fVMabove = fVMpost = 0; fSMabove = fSMbelow = 0; + + fPreBaseConsonant = fPreBaseVirama = 0; } void writeChar(LEUnicode ch, le_uint32 charIndex, FeatureMask charFeatures) @@ -385,6 +392,14 @@ public: } } + void notePreBaseConsonant(le_uint32 index,LEUnicode PBConsonant, LEUnicode PBVirama, FeatureMask features) + { + fPBCIndex = index; + fPreBaseConsonant = PBConsonant; + fPreBaseVirama = PBVirama; + fPBCFeatures = features; + } + void noteBaseConsonant() { if (fMPreFixups != NULL && fMPreOutIndex >= 0) { @@ -464,6 +479,22 @@ public: } } + void writePreBaseConsonant() + { + // The TDIL spec says that consonant + virama + RRA should produce a rakar in Malayalam. However, + // it seems that almost none of the fonts for Malayalam are set up to handle this. + // So, we're going to force the issue here by using the rakar as defined with RA in most fonts. + + if (fPreBaseConsonant == 0x0d31) { // RRA + fPreBaseConsonant = 0x0d30; // RA + } + + if (fPreBaseConsonant != 0) { + writeChar(fPreBaseConsonant, fPBCIndex, fPBCFeatures); + writeChar(fPreBaseVirama,fPBCIndex-1,fPBCFeatures); + } + } + le_int32 getOutputIndex() { return fOutIndex; @@ -722,6 +753,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le lastConsonant -= 1; } + IndicClassTable::CharClass charClass = CC_RESERVED; IndicClassTable::CharClass nextClass = CC_RESERVED; le_int32 baseConsonant = lastConsonant; @@ -729,9 +761,11 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le le_int32 postBaseLimit = classTable->scriptFlags & SF_POST_BASE_LIMIT_MASK; le_bool seenVattu = FALSE; le_bool seenBelowBaseForm = FALSE; + le_bool seenPreBaseForm = FALSE; le_bool hasNukta = FALSE; le_bool hasBelowBaseForm = FALSE; le_bool hasPostBaseForm = FALSE; + le_bool hasPreBaseForm = FALSE; if (postBase < markStart && classTable->isNukta(chars[postBase])) { charClass = CC_NUKTA; @@ -745,14 +779,22 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le hasBelowBaseForm = IndicClassTable::hasBelowBaseForm(charClass) && !hasNukta; hasPostBaseForm = IndicClassTable::hasPostBaseForm(charClass) && !hasNukta; + hasPreBaseForm = IndicClassTable::hasPreBaseForm(charClass) && !hasNukta; if (IndicClassTable::isConsonant(charClass)) { if (postBaseLimit == 0 || seenVattu || (baseConsonant > baseLimit && !classTable->isVirama(chars[baseConsonant - 1])) || - !(hasBelowBaseForm || hasPostBaseForm)) { + !(hasBelowBaseForm || hasPostBaseForm || hasPreBaseForm)) { break; } + // Note any pre-base consonants + if ( baseConsonant == lastConsonant && lastConsonant > 0 && + hasPreBaseForm && classTable->isVirama(chars[baseConsonant - 1])) { + output.notePreBaseConsonant(lastConsonant,chars[lastConsonant],chars[lastConsonant-1],tagArray2); + seenPreBaseForm = TRUE; + + } // consonants with nuktas are never vattus seenVattu = IndicClassTable::isVattu(charClass) && !hasNukta; @@ -785,12 +827,14 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le } // write any pre-base consonants + output.writePreBaseConsonant(); + le_bool supressVattu = TRUE; for (i = baseLimit; i < baseConsonant; i += 1) { LEUnicode ch = chars[i]; - // Don't put 'blwf' on first consonant. - FeatureMask features = (i == baseLimit? tagArray2 : tagArray1); + // Don't put 'pstf' or 'blwf' on anything before the base consonant. + FeatureMask features = tagArray1 & ~( pstfFeatureMask | blwfFeatureMask ); charClass = classTable->getCharClass(ch); nextClass = classTable->getCharClass(chars[i + 1]); @@ -841,7 +885,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le } // write below-base consonants - if (baseConsonant != lastConsonant) { + if (baseConsonant != lastConsonant && !seenPreBaseForm) { for (i = bcSpan + 1; i < postBase; i += 1) { output.writeChar(chars[i], i, tagArray1); } @@ -871,7 +915,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le // write post-base consonants // FIXME: does this put the right tags on post-base consonants? - if (baseConsonant != lastConsonant) { + if (baseConsonant != lastConsonant && !seenPreBaseForm) { if (postBase <= lastConsonant) { for (i = postBase; i <= lastConsonant; i += 1) { output.writeChar(chars[i], i, tagArray3); @@ -1139,7 +1183,7 @@ le_int32 IndicReordering::v2process(const LEUnicode *chars, le_int32 charCount, } -void IndicReordering::getDynamicProperties( DynamicProperties */*dProps*/, const IndicClassTable *classTable ) { +void IndicReordering::getDynamicProperties( DynamicProperties *, const IndicClassTable *classTable ) { LEUnicode currentChar; diff --git a/icu4c/source/layout/IndicReordering.h b/icu4c/source/layout/IndicReordering.h index 4d08504acd..c3452faad9 100644 --- a/icu4c/source/layout/IndicReordering.h +++ b/icu4c/source/layout/IndicReordering.h @@ -50,6 +50,7 @@ U_NAMESPACE_BEGIN #define CF_BELOW_BASE 0x10000000U #define CF_POST_BASE 0x08000000U #define CF_LENGTH_MARK 0x04000000U +#define CF_PRE_BASE 0x02000000U #define CF_POS_BEFORE 0x00300000U #define CF_POS_BELOW 0x00200000U @@ -118,6 +119,7 @@ struct IndicClassTable inline le_bool hasPostBaseForm(LEUnicode ch) const; inline le_bool hasBelowBaseForm(LEUnicode ch) const; inline le_bool hasAboveBaseForm(LEUnicode ch) const; + inline le_bool hasPreBaseForm(LEUnicode ch) const; inline static le_bool isVowelModifier(CharClass charClass); inline static le_bool isStressMark(CharClass charClass); @@ -134,6 +136,7 @@ struct IndicClassTable inline static le_bool hasPostBaseForm(CharClass charClass); inline static le_bool hasBelowBaseForm(CharClass charClass); inline static le_bool hasAboveBaseForm(CharClass charClass); + inline static le_bool hasPreBaseForm(CharClass charClass); static const IndicClassTable *getScriptClassTable(le_int32 scriptCode); }; @@ -255,6 +258,11 @@ inline le_bool IndicClassTable::hasPostBaseForm(CharClass charClass) return (charClass & CF_POST_BASE) != 0; } +inline le_bool IndicClassTable::hasPreBaseForm(CharClass charClass) +{ + return (charClass & CF_PRE_BASE) != 0; +} + inline le_bool IndicClassTable::hasBelowBaseForm(CharClass charClass) { return (charClass & CF_BELOW_BASE) != 0; @@ -335,6 +343,11 @@ inline le_bool IndicClassTable::hasBelowBaseForm(LEUnicode ch) const return hasBelowBaseForm(getCharClass(ch)); } +inline le_bool IndicClassTable::hasPreBaseForm(LEUnicode ch) const +{ + return hasPreBaseForm(getCharClass(ch)); +} + inline le_bool IndicClassTable::hasAboveBaseForm(LEUnicode ch) const { return hasAboveBaseForm(getCharClass(ch));