ICU-3409 Add support for stress marks; only permit one vowel modifier per syllable.

X-SVN-Rev: 14517
This commit is contained in:
Eric Mader 2004-02-13 19:34:17 +00:00
parent ac3df02904
commit c6a8dd30ca
3 changed files with 235 additions and 85 deletions

View File

@ -30,17 +30,19 @@ enum
// simple classes
_xx = IndicClassTable::CC_RESERVED,
_ma = IndicClassTable::CC_MODIFYING_MARK_ABOVE,
_mp = IndicClassTable::CC_MODIFYING_MARK_POST,
_ma = IndicClassTable::CC_VOWEL_MODIFIER | IndicClassTable::CF_POS_ABOVE,
_mp = IndicClassTable::CC_VOWEL_MODIFIER | IndicClassTable::CF_POS_AFTER,
_sa = IndicClassTable::CC_STRESS_MARK | IndicClassTable::CF_POS_ABOVE,
_sb = IndicClassTable::CC_STRESS_MARK | IndicClassTable::CF_POS_BELOW,
_iv = IndicClassTable::CC_INDEPENDENT_VOWEL,
_ct = IndicClassTable::CC_CONSONANT | IndicClassTable::CF_CONSONANT,
_cn = IndicClassTable::CC_CONSONANT_WITH_NUKTA | IndicClassTable::CF_CONSONANT,
_nu = IndicClassTable::CC_NUKTA,
_dv = IndicClassTable::CC_DEPENDENT_VOWEL,
_dl = _dv | IndicClassTable::CF_MATRA_PRE,
_db = _dv | IndicClassTable::CF_MATRA_BELOW,
_da = _dv | IndicClassTable::CF_MATRA_ABOVE,
_dr = _dv | IndicClassTable::CF_MATRA_POST,
_dl = _dv | IndicClassTable::CF_POS_BEFORE,
_db = _dv | IndicClassTable::CF_POS_BELOW,
_da = _dv | IndicClassTable::CF_POS_ABOVE,
_dr = _dv | IndicClassTable::CF_POS_AFTER,
_lm = _dv | IndicClassTable::CF_LENGTH_MARK,
_vr = IndicClassTable::CC_VIRAMA,
@ -76,7 +78,7 @@ static const IndicClassTable::CharClass devaCharClasses[] =
_ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _cn, _ct, _ct, _ct, _ct, _ct, _ct, // 0920 - 092F
_rv, _cn, _ct, _ct, _cn, _ct, _ct, _ct, _ct, _ct, _xx, _xx, _nu, _xx, _dr, _dl, // 0930 - 093F
_dr, _db, _db, _db, _db, _da, _da, _da, _da, _dr, _dr, _dr, _dr, _vr, _xx, _xx, // 0940 - 094F
_xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _cn, _cn, _cn, _cn, _cn, _cn, _cn, _cn, // 0950 - 095F
_xx, _sa, _sb, _sa, _sa, _xx, _xx, _xx, _cn, _cn, _cn, _cn, _cn, _cn, _cn, _cn, // 0950 - 095F
_iv, _iv, _db, _db, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0960 - 096F
_xx // 0970
};

View File

@ -2,8 +2,8 @@
* (C) Copyright IBM Corp. 1998-2003 - All Rights Reserved
*
* $Source: /xsrl/Nsvn/icu/icu/source/layout/IndicReordering.cpp,v $
* $Date: 2003/12/08 22:41:38 $
* $Revision: 1.14 $
* $Date: 2004/02/13 19:34:17 $
* $Revision: 1.15 $
*
*/
@ -30,31 +30,57 @@ private:
LEUnicode fLengthMark;
le_int32 fMatraIndex;
const LETag *fMatraTags;
le_int32 fMPreOutIndex;
le_int32 fMPreOutIndex;
MPreFixups *fMPreFixups;
LEUnicode fVMabove;
LEUnicode fVMpost;
le_int32 fVMIndex;
const LETag *fVMTags;
LEUnicode fSMabove;
LEUnicode fSMbelow;
le_int32 fSMIndex;
const LETag *fSMTags;
void saveMatra(LEUnicode matra, IndicClassTable::CharClass matraClass)
{
// FIXME: check if already set, or if not a matra...
if (IndicClassTable::isMpre(matraClass)) {
fMpre = matra;
} else if (IndicClassTable::isMbelow(matraClass)) {
fMbelow = matra;
} else if (IndicClassTable::isMabove(matraClass)) {
fMabove = matra;
} else if (IndicClassTable::isMpost(matraClass)) {
fMpost = matra;
} else if (IndicClassTable::isLengthMark(matraClass)) {
if (IndicClassTable::isLengthMark(matraClass)) {
fLengthMark = matra;
} else {
switch (matraClass & IndicClassTable::CF_POS_MASK) {
case IndicClassTable::CF_POS_BEFORE:
fMpre = matra;
break;
case IndicClassTable::CF_POS_BELOW:
fMbelow = matra;
break;
case IndicClassTable::CF_POS_ABOVE:
fMabove = matra;
break;
case IndicClassTable::CF_POS_AFTER:
fMpost = matra;
break;
default:
// can't get here...
break;
}
}
}
public:
ReorderingOutput(LEUnicode *outChars, le_int32 *charIndices, const LETag **charTags, MPreFixups *mpreFixups)
: fOutIndex(0), fOutChars(outChars), fCharIndices(charIndices), fCharTags(charTags),
fMpre(0), fMbelow(0), fMabove(0), fMpost(0), fLengthMark(0),
fMatraIndex(0), fMatraTags(NULL), fMPreOutIndex(-1), fMPreFixups(mpreFixups)
fMpre(0), fMbelow(0), fMabove(0), fMpost(0), fLengthMark(0), fMatraIndex(0), fMatraTags(NULL),
fMPreOutIndex(-1), fMPreFixups(mpreFixups),
fVMabove(0), fVMpost(0), fVMIndex(0), fVMTags(NULL),
fSMabove(0), fSMbelow(0), fSMIndex(0), fSMTags(NULL)
{
// nothing else to do...
}
@ -64,12 +90,19 @@ public:
// nothing to do here...
}
// Clears the per-syllable character slots: the four matra positions, the
// length mark, both vowel-modifier slots and both stress-mark slots are
// zeroed, and the pre-base matra output index goes back to -1.
// The stored indices and tag pointers (fMatraIndex/fMatraTags,
// fVMIndex/fVMTags, fSMIndex/fSMTags) are left untouched.
void reset()
{
    // matras and length mark
    fMpre = 0;
    fMbelow = 0;
    fMabove = 0;
    fMpost = 0;
    fLengthMark = 0;

    // a negative value means no pre-base matra output position is recorded
    fMPreOutIndex = -1;

    // vowel modifiers
    fVMabove = 0;
    fVMpost = 0;

    // stress marks
    fSMabove = 0;
    fSMbelow = 0;
}
void noteMatra(const IndicClassTable *classTable, LEUnicode matra, le_uint32 matraIndex, const LETag *matraTags)
{
IndicClassTable::CharClass matraClass = classTable->getCharClass(matra);
fMpre = fMbelow = fMabove = fMpost = fLengthMark = 0;
fMPreOutIndex = -1;
fMatraIndex = matraIndex;
fMatraTags = matraTags;
@ -90,6 +123,54 @@ public:
}
}
void noteVowelModifier(const IndicClassTable *classTable, LEUnicode vowelModifier, le_uint32 vowelModifierIndex, const LETag *vowelModifierTags)
{
IndicClassTable::CharClass vmClass = classTable->getCharClass(vowelModifier);
fVMIndex = vowelModifierIndex;
fVMTags = vowelModifierTags;
if (IndicClassTable::isVowelModifier(vmClass)) {
switch (vmClass & IndicClassTable::CF_POS_MASK) {
case IndicClassTable::CF_POS_ABOVE:
fVMabove = vowelModifier;
break;
case IndicClassTable::CF_POS_AFTER:
fVMpost = vowelModifier;
break;
default:
// FIXME: this is an error...
break;
}
}
}
void noteStressMark(const IndicClassTable *classTable, LEUnicode stressMark, le_uint32 stressMarkIndex, const LETag *stressMarkTags)
{
IndicClassTable::CharClass smClass = classTable->getCharClass(stressMark);
fSMIndex = stressMarkIndex;
fSMTags = stressMarkTags;
if (IndicClassTable::isStressMark(smClass)) {
switch (smClass & IndicClassTable::CF_POS_MASK) {
case IndicClassTable::CF_POS_ABOVE:
fSMabove = stressMark;
break;
case IndicClassTable::CF_POS_BELOW:
fSMbelow = stressMark;
break;
default:
// FIXME: this is an error...
break;
}
}
}
void noteBaseConsonant()
{
if (fMPreFixups != NULL && fMPreOutIndex >= 0) {
@ -133,6 +214,34 @@ public:
}
}
// Emits the above-position vowel modifier, if one is stored, using the
// index and tags currently held in fVMIndex / fVMTags.
void writeVMabove()
{
    if (fVMabove == 0) {
        // nothing stored
        return;
    }

    writeChar(fVMabove, fVMIndex, fVMTags);
}
// Emits the after-position vowel modifier, if one is stored, using the
// index and tags currently held in fVMIndex / fVMTags.
void writeVMpost()
{
    if (fVMpost == 0) {
        // nothing stored
        return;
    }

    writeChar(fVMpost, fVMIndex, fVMTags);
}
// Emits the above-position stress mark, if one is stored, using the
// index and tags currently held in fSMIndex / fSMTags.
void writeSMabove()
{
    if (fSMabove == 0) {
        // nothing stored
        return;
    }

    writeChar(fSMabove, fSMIndex, fSMTags);
}
// Emits the below-position stress mark, if one is stored, using the
// index and tags currently held in fSMIndex / fSMTags.
void writeSMbelow()
{
    if (fSMbelow == 0) {
        // nothing stored
        return;
    }

    writeChar(fSMbelow, fSMIndex, fSMTags);
}
void writeChar(LEUnicode ch, le_uint32 charIndex, const LETag *charTags)
{
fOutChars[fOutIndex] = ch;
@ -195,16 +304,15 @@ const LETag tagArray[] =
const le_int8 stateTable[][IndicClassTable::CC_COUNT] =
{
// xx ma mp iv ct cn nu dv vr zw
{ 1, 1, 1, 5, 3, 2, 1, 1, 1, 1},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
{-1, 6, 1, -1, -1, -1, -1, 5, 4, -1},
{-1, 6, 1, -1, -1, -1, 2, 5, 4, -1},
{-1, -1, -1, -1, 3, 2, -1, -1, -1, 8},
{-1, 6, 1, -1, -1, -1, -1, -1, -1, -1},
{-1, 7, 1, -1, -1, -1, -1, -1, -1, -1},
{-1, -1, 1, -1, -1, -1, -1, -1, -1, -1},
{-1, -1, -1, -1, 3, 2, -1, -1, -1, -1}
// xx vm sm iv ct cn nu dv vr zw
{ 1, 1, 1, 5, 3, 2, 1, 1, 1, 1}, // 0
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 1
{-1, 6, 1, -1, -1, -1, -1, 5, 4, -1}, // 2
{-1, 6, 1, -1, -1, -1, 2, 5, 4, -1}, // 3
{-1, -1, -1, -1, 3, 2, -1, -1, -1, 7}, // 4
{-1, 6, 1, -1, -1, -1, -1, -1, -1, -1}, // 5
{-1, -1, 1, -1, -1, -1, -1, -1, -1, -1}, // 6
{-1, -1, -1, -1, 3, 2, -1, -1, -1, -1} // 7
};
@ -249,18 +357,21 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
while (prev < charCount) {
le_int32 syllable = findSyllable(classTable, chars, prev, charCount);
le_int32 matra, vmabove, vmpost = syllable;
le_int32 matra, markStart = syllable;
while (vmpost > prev && classTable->isVMpost(chars[vmpost - 1])) {
vmpost -= 1;
output.reset();
if (classTable->isStressMark(chars[markStart - 1])) {
markStart -= 1;
output.noteStressMark(classTable, chars[markStart], markStart, &tagArray[1]);
}
vmabove = vmpost;
while (vmabove > prev && classTable->isVMabove(chars[vmabove - 1])) {
vmabove -= 1;
if (classTable->isVowelModifier(chars[markStart - 1])) {
markStart -= 1;
output.noteVowelModifier(classTable, chars[markStart], markStart, &tagArray[1]);
}
matra = vmabove - 1;
matra = markStart - 1;
output.noteMatra(classTable, chars[matra], matra, &tagArray[1]);
switch (classTable->getCharClass(chars[prev]) & IndicClassTable::CF_CLASS_MASK) {
@ -273,8 +384,8 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
break;
case IndicClassTable::CC_MODIFYING_MARK_ABOVE:
case IndicClassTable::CC_MODIFYING_MARK_POST:
case IndicClassTable::CC_VOWEL_MODIFIER:
case IndicClassTable::CC_STRESS_MARK:
case IndicClassTable::CC_NUKTA:
case IndicClassTable::CC_VIRAMA:
output.writeChar(C_DOTTED_CIRCLE, prev, &tagArray[1]);
@ -293,8 +404,8 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
case IndicClassTable::CC_CONSONANT:
case IndicClassTable::CC_CONSONANT_WITH_NUKTA:
{
le_uint32 length = vmabove - prev;
le_int32 lastConsonant = vmabove - 1;
le_uint32 length = markStart - prev;
le_int32 lastConsonant = markStart - 1;
le_int32 baseLimit = prev;
// Check for REPH at front of syllable
@ -385,14 +496,14 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
le_int32 bcSpan = baseConsonant + 1;
if (bcSpan < vmabove && classTable->isNukta(chars[bcSpan])) {
if (bcSpan < markStart && classTable->isNukta(chars[bcSpan])) {
bcSpan += 1;
}
if (baseConsonant == lastConsonant && bcSpan < vmabove && classTable->isVirama(chars[bcSpan])) {
if (baseConsonant == lastConsonant && bcSpan < markStart && classTable->isVirama(chars[bcSpan])) {
bcSpan += 1;
if (bcSpan < vmabove && chars[bcSpan] == C_SIGN_ZWNJ) {
if (bcSpan < markStart && chars[bcSpan] == C_SIGN_ZWNJ) {
bcSpan += 1;
}
}
@ -407,6 +518,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
if ((classTable->scriptFlags & IndicClassTable::SF_MATRAS_AFTER_BASE) != 0) {
output.writeMbelow();
output.writeSMbelow(); // FIXME: there are no SMs in these scripts...
output.writeMabove();
output.writeMpost();
}
@ -423,9 +535,10 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
}
}
// write Mbelow, Mabove
// write Mbelow, SMbelow, Mabove
if ((classTable->scriptFlags & IndicClassTable::SF_MATRAS_AFTER_BASE) == 0) {
output.writeMbelow();
output.writeSMbelow();
output.writeMabove();
}
@ -435,10 +548,8 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
output.writeChar(chars[prev + 1], prev + 1, &tagArray[0]);
}
// write VMabove
for (i = vmabove; i < vmpost; i += 1) {
output.writeChar(chars[i], i, &tagArray[1]);
}
output.writeVMabove();
output.writeSMabove(); // FIXME: there are no SM's in these scripts...
}
// write post-base consonants
@ -473,16 +584,11 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le
output.writeChar(chars[prev + 1], prev + 1, &tagArray[0]);
}
// write VMabove
for (i = vmabove; i < vmpost; i += 1) {
output.writeChar(chars[i], i, &tagArray[1]);
}
output.writeVMabove();
output.writeSMabove();
}
// write VMpost
for (i = vmpost; i < syllable; i += 1) {
output.writeChar(chars[i], i, &tagArray[1]);
}
output.writeVMpost();
break;
}

View File

@ -2,8 +2,8 @@
* (C) Copyright IBM Corp. 1998-2003 - All Rights Reserved
*
* $Source: /xsrl/Nsvn/icu/icu/source/layout/IndicReordering.h,v $
* $Date: 2003/11/25 23:41:24 $
* $Revision: 1.10 $
* $Date: 2004/02/13 19:34:17 $
* $Revision: 1.11 $
*
*/
@ -36,8 +36,8 @@ struct IndicClassTable
enum CharClassValues
{
CC_RESERVED = 0,
CC_MODIFYING_MARK_ABOVE = 1,
CC_MODIFYING_MARK_POST = 2,
CC_VOWEL_MODIFIER = 1,
CC_STRESS_MARK = 2,
CC_INDEPENDENT_VOWEL = 3,
CC_CONSONANT = 4,
CC_CONSONANT_WITH_NUKTA = 5,
@ -58,12 +58,14 @@ struct IndicClassTable
CF_VATTU = 0x20000000,
CF_BELOW_BASE = 0x10000000,
CF_POST_BASE = 0x08000000,
CF_LENGTH_MARK = 0x04000000,
CF_POS_BEFORE = 0x00300000,
CF_POS_BELOW = 0x00200000,
CF_POS_ABOVE = 0x00100000,
CF_POS_AFTER = 0x00000000,
CF_POS_MASK = 0x00300000,
CF_MATRA_PRE = 0x04000000,
CF_MATRA_BELOW = 0x02000000,
CF_MATRA_ABOVE = 0x01000000,
CF_MATRA_POST = 0x00800000,
CF_LENGTH_MARK = 0x00400000,
CF_INDEX_MASK = 0x000F0000,
CF_INDEX_SHIFT = 16
};
@ -95,8 +97,8 @@ struct IndicClassTable
CharClass getCharClass(LEUnicode ch) const;
const SplitMatra *getSplitMatra(CharClass charClass) const;
le_bool isVMabove(LEUnicode ch) const;
le_bool isVMpost(LEUnicode ch) const;
le_bool isVowelModifier(LEUnicode ch) const;
le_bool isStressMark(LEUnicode ch) const;
le_bool isConsonant(LEUnicode ch) const;
le_bool isReph(LEUnicode ch) const;
le_bool isVirama(LEUnicode ch) const;
@ -104,17 +106,21 @@ struct IndicClassTable
le_bool isVattu(LEUnicode ch) const;
le_bool isMatra(LEUnicode ch) const;
le_bool isSplitMatra(LEUnicode ch) const;
#if 0
le_bool isMpre(LEUnicode ch) const;
le_bool isMbelow(LEUnicode ch) const;
le_bool isMabove(LEUnicode ch) const;
le_bool isMpost(LEUnicode ch) const;
#endif
le_bool isLengthMark(LEUnicode ch) const;
le_bool hasPostOrBelowBaseForm(LEUnicode ch) const;
le_bool hasPostBaseForm(LEUnicode ch) const;
le_bool hasBelowBaseForm(LEUnicode ch) const;
static le_bool isVMabove(CharClass charClass);
static le_bool isVMpost(CharClass charClass);
static le_bool isVowelModifier(CharClass charClass);
static le_bool isStressMark(CharClass charClass);
static le_bool isConsonant(CharClass charClass);
static le_bool isReph(CharClass charClass);
static le_bool isVirama(CharClass charClass);
@ -122,11 +128,21 @@ struct IndicClassTable
static le_bool isVattu(CharClass charClass);
static le_bool isMatra(CharClass charClass);
static le_bool isSplitMatra(CharClass charClass);
#if 0
static le_bool isMpre(CharClass charClass);
static le_bool isMbelow(CharClass charClass);
static le_bool isMabove(CharClass charClass);
static le_bool isMpost(CharClass charClass);
#endif
static le_bool isLengthMark(CharClass charClass);
#if 0
static le_bool isBefore(CharClass charClass);
static le_bool isBelow(CharClass charClass);
static le_bool isAbove(CharClass charClass);
static le_bool isAfter(CharClass charClass);
#endif
static le_bool hasPostOrBelowBaseForm(CharClass charClass);
static le_bool hasPostBaseForm(CharClass charClass);
static le_bool hasBelowBaseForm(CharClass charClass);
@ -166,14 +182,14 @@ inline const SplitMatra *IndicClassTable::getSplitMatra(CharClass charClass) con
return &splitMatraTable[index - 1];
}
inline le_bool IndicClassTable::isVMabove(CharClass charClass)
inline le_bool IndicClassTable::isVowelModifier(CharClass charClass)
{
return (charClass & CF_CLASS_MASK) == CC_MODIFYING_MARK_ABOVE;
return (charClass & CF_CLASS_MASK) == CC_VOWEL_MODIFIER;
}
inline le_bool IndicClassTable::isVMpost(CharClass charClass)
inline le_bool IndicClassTable::isStressMark(CharClass charClass)
{
return (charClass & CF_CLASS_MASK) == CC_MODIFYING_MARK_POST;
return (charClass & CF_CLASS_MASK) == CC_STRESS_MARK;
}
inline le_bool IndicClassTable::isConsonant(CharClass charClass)
@ -211,6 +227,7 @@ inline le_bool IndicClassTable::isSplitMatra(CharClass charClass)
return (charClass & CF_INDEX_MASK) != 0;
}
#if 0
inline le_bool IndicClassTable::isMpre(CharClass charClass)
{
return (charClass & CF_MATRA_PRE) != 0;
@ -230,12 +247,35 @@ inline le_bool IndicClassTable::isMpost(CharClass charClass)
{
return (charClass & CF_MATRA_POST) != 0;
}
#endif
// Reports whether the CF_LENGTH_MARK flag bit is set in charClass.
inline le_bool IndicClassTable::isLengthMark(CharClass charClass)
{
    const le_bool lengthMarkFlagSet = (charClass & CF_LENGTH_MARK) != 0;

    return lengthMarkFlagSet;
}
// The four position predicates below are compiled out by this #if 0.
// Each one compares the position field of a character class (the bits
// selected by CF_POS_MASK) against a single CF_POS_* value.
// NOTE(review): presumably kept for future use - confirm before deleting.
#if 0
// True when the position field of charClass equals CF_POS_BEFORE.
inline le_bool IndicClassTable::isBefore(CharClass charClass)
{
return (charClass & CF_POS_MASK) == CF_POS_BEFORE;
}
// True when the position field of charClass equals CF_POS_ABOVE.
inline le_bool IndicClassTable::isAbove(CharClass charClass)
{
return (charClass & CF_POS_MASK) == CF_POS_ABOVE;
}
// True when the position field of charClass equals CF_POS_BELOW.
inline le_bool IndicClassTable::isBelow(CharClass charClass)
{
return (charClass & CF_POS_MASK) == CF_POS_BELOW;
}
// True when the position field of charClass equals CF_POS_AFTER.
// CF_POS_AFTER is the all-zero position value, so this also holds for any
// class that has no position bits set at all.
inline le_bool IndicClassTable::isAfter(CharClass charClass)
{
return (charClass & CF_POS_MASK) == CF_POS_AFTER;
}
#endif
inline le_bool IndicClassTable::hasPostOrBelowBaseForm(CharClass charClass)
{
return (charClass & (CF_POST_BASE | CF_BELOW_BASE)) != 0;
@ -251,14 +291,14 @@ inline le_bool IndicClassTable::hasBelowBaseForm(CharClass charClass)
return (charClass & CF_BELOW_BASE) != 0;
}
inline le_bool IndicClassTable::isVMabove(LEUnicode ch) const
inline le_bool IndicClassTable::isVowelModifier(LEUnicode ch) const
{
return isVMabove(getCharClass(ch));
return isVowelModifier(getCharClass(ch));
}
inline le_bool IndicClassTable::isVMpost(LEUnicode ch) const
inline le_bool IndicClassTable::isStressMark(LEUnicode ch) const
{
return isVMpost(getCharClass(ch));
return isStressMark(getCharClass(ch));
}
inline le_bool IndicClassTable::isConsonant(LEUnicode ch) const
@ -296,6 +336,7 @@ inline le_bool IndicClassTable::isSplitMatra(LEUnicode ch) const
return isSplitMatra(getCharClass(ch));
}
#if 0
inline le_bool IndicClassTable::isMpre(LEUnicode ch) const
{
return isMpre(getCharClass(ch));
@ -315,6 +356,7 @@ inline le_bool IndicClassTable::isMpost(LEUnicode ch) const
{
return isMpost(getCharClass(ch));
}
#endif
inline le_bool IndicClassTable::isLengthMark(LEUnicode ch) const
{