ea063658b4
X-SVN-Rev: 2547
416 lines
10 KiB
C++
416 lines
10 KiB
C++
/*******************************************************************************
|
|
* Copyright (C) 1996-1999, International Business Machines Corporation and *
|
|
* others. All Rights Reserved. *
|
|
*******************************************************************************
|
|
*/
|
|
//=============================================================================
|
|
//
|
|
// File ptnentry.cpp
|
|
//
|
|
// Contains PatternEntry, an internal class used by MergeCollation to store
|
|
// one collation element from a pattern.
|
|
//
|
|
// Created by: Helena Shih
|
|
//
|
|
// Modification History:
|
|
//
|
|
// Date Name Description
|
|
// 04/23/99 stephen Removed EDecompositionMode, merged with
|
|
// Normalizer::EMode
|
|
// Removed character literals.
|
|
//=============================================================================
|
|
|
|
#include "ptnentry.h"
|
|
|
|
#include "unicode/unicode.h"
|
|
#include "unicode/coll.h"
|
|
#include "unicode/normlzr.h"
|
|
|
|
|
|
// static member initialization
|
|
// Sentinel "strength" values that are not real collation strengths:
//   RESET is the strength Parser::next() records when it reads a '&' rule;
//   UNSET is the strength of a default-constructed entry and the parser's
//         "no relation operator seen yet" state.
const int32_t PatternEntry::RESET = -2;
|

const int32_t PatternEntry::UNSET = -1;
|
|
|
|
// ===== privates =====
|
|
|
|
// Default constructor: the strength starts out as UNSET; chars and extension
// are default-constructed.
PatternEntry::PatternEntry()
    : strength(UNSET)
{
}
|
|
|
|
// Copy constructor: duplicates the strength, the characters and the
// extension of the source entry.
PatternEntry::PatternEntry(const PatternEntry& other)
    : strength(other.strength),
      chars(other.chars),
      extension(other.extension)
{
}
|
|
|
|
// Builds an entry from its parsed parts.
//
//   newStrength   - strength (or RESET/UNSET sentinel) of the entry
//   newChars      - the entry's characters, before normalization
//   newExtensions - the entry's extension, stored unchanged
//   decompMode    - normalization mode applied to newChars
//
// The characters are run through Normalizer::normalize() with the requested
// mode, so that precomposed characters are stored as a base character
// followed by its combining characters in canonical order.  If normalization
// reports a failure, the characters are stored exactly as they were given.
PatternEntry::PatternEntry(int32_t newStrength,
                           const UnicodeString& newChars,
                           const UnicodeString& newExtensions,
                           Normalizer::EMode decompMode)
    : strength(newStrength),
      extension(newExtensions)
{
    UErrorCode normalizeStatus = U_ZERO_ERROR;

    Normalizer::normalize(newChars, decompMode, 0, chars, normalizeStatus);

    if (U_FAILURE(normalizeStatus)) {
        // Fall back to the un-normalized text rather than a partial result.
        chars = newChars;
    }
}
|
|
|
|
// Destructor: nothing to do explicitly here.
PatternEntry::~PatternEntry()
{
}
|
|
|
|
// Assignment: copies strength, chars and extension from `other`.
// Self-assignment is a no-op.  Returns *this.
const PatternEntry&
PatternEntry::operator=(const PatternEntry& other)
{
    if (this == &other) {
        return *this;
    }

    strength  = other.strength;
    chars     = other.chars;
    extension = other.extension;

    return *this;
}
|
|
|
|
/**
|
|
* Gets the current extension, quoted
|
|
* This is useful when constructing a user-readable string representing
|
|
* a pattern.
|
|
*/
|
|
void PatternEntry::appendQuotedExtension(UnicodeString& toAddTo) const {
|
|
appendQuoted(extension,toAddTo);
|
|
}
|
|
|
|
/**
|
|
* Gets the current chars, quoted
|
|
* This is useful when constructing a user-readable string representing
|
|
* a pattern.
|
|
*/
|
|
void PatternEntry::appendQuotedChars(UnicodeString& toAddTo) const {
|
|
appendQuoted(chars,toAddTo);
|
|
}
|
|
|
|
// Returns TRUE when both entries have the same strength, the same characters
// and the same extension; FALSE otherwise.
UBool PatternEntry::equals(const PatternEntry& other) const
{
    // Cheap integer comparison first, then the two string comparisons.
    if (strength != other.strength) {
        return FALSE;
    }

    return (UBool)((chars == other.chars) && (extension == other.extension));
}
|
|
|
|
/**
|
|
* For debugging.
|
|
*/
|
|
UnicodeString&
|
|
PatternEntry::toString(UnicodeString& result) const
|
|
{
|
|
addToBuffer(result, TRUE, FALSE, NULL);
|
|
return result;
|
|
}
|
|
|
|
// Returns the strength stored in this entry (a collation strength, or one of
// the RESET / UNSET sentinels).
int32_t PatternEntry::getStrength() const
{
    return this->strength;
}
|
|
|
|
// Copies this entry's extension into `ext` and returns a reference to `ext`.
const UnicodeString& PatternEntry::getExtension(UnicodeString& ext) const
{
    return (ext = extension);
}
|
|
|
|
// Copies this entry's characters into `result` and returns a reference to
// `result`.
const UnicodeString& PatternEntry::getChars(UnicodeString& result) const
{
    return (result = chars);
}
|
|
|
|
/*
|
|
Add the entry in textual form into the toAddTo buffer.
|
|
*/
|
|
void PatternEntry::addToBuffer(UnicodeString& toAddTo,
|
|
UBool showExtension,
|
|
UBool showWhiteSpace,
|
|
const PatternEntry* lastEntry) const
|
|
{
|
|
if (showWhiteSpace && toAddTo.length() > 0)
|
|
// Adds new line before each primary strength entry.
|
|
if (strength == Collator::PRIMARY || lastEntry != NULL)
|
|
toAddTo += (UChar)0x000A/*'\n'*/;
|
|
else
|
|
toAddTo += (UChar)0x0020/*' '*/;
|
|
if (lastEntry != NULL) {
|
|
toAddTo += (UChar)0x0026/*'&'*/;
|
|
if (showWhiteSpace)
|
|
toAddTo += (UChar)0x0020/*' '*/;
|
|
lastEntry->appendQuotedChars(toAddTo);
|
|
appendQuotedExtension(toAddTo);
|
|
if (showWhiteSpace)
|
|
toAddTo += (UChar)0x0020/*' '*/;
|
|
}
|
|
// Check the strength for the correct symbol to append
|
|
switch (strength) {
|
|
case Collator::IDENTICAL: toAddTo += (UChar)0x003D/*'='*/; break;
|
|
case Collator::TERTIARY: toAddTo += (UChar)0x002C/*','*/; break;
|
|
case Collator::SECONDARY: toAddTo += (UChar)0x003B/*';'*/; break;
|
|
case Collator::PRIMARY: toAddTo += (UChar)0x003C/*'<'*/; break;
|
|
case PatternEntry::RESET: toAddTo += (UChar)0x0026/*'&'*/; break;
|
|
case PatternEntry::UNSET: toAddTo += (UChar)0x003F/*'?'*/; break;
|
|
}
|
|
if (showWhiteSpace)
|
|
toAddTo += (UChar)0x0020/*' '*/;
|
|
appendQuoted(chars,toAddTo);
|
|
// If there's an expending char and needs to be shown,
|
|
// append that after the entry
|
|
if (showExtension && extension.length() != 0) {
|
|
toAddTo += (UChar)0x002F/*'/'*/;
|
|
appendQuoted(extension,toAddTo);
|
|
}
|
|
}
|
|
|
|
// Append a string to a pattern buffer, adding quotes if necessary
|
|
void PatternEntry::appendQuoted(const UnicodeString& chars, UnicodeString& toAddTo) {
|
|
UBool inQuote = FALSE;
|
|
UChar ch = chars[0];
|
|
if (Unicode::isSpaceChar(ch)) {
|
|
inQuote = TRUE;
|
|
toAddTo += (UChar)0x0027/*'\''*/;
|
|
} else if (isSpecialChar(ch)) {
|
|
inQuote = TRUE;
|
|
toAddTo += (UChar)0x0027/*'\''*/;
|
|
} else {
|
|
switch (ch) {
|
|
case 0x0010: case 0x000C/*'\f'*/:
|
|
case 0x000D/*'\r'*/: case 0x0009/*'\t'*/:
|
|
case 0x000A/*'\n'*/: case 0x0040/*'@'*/:
|
|
inQuote = TRUE;
|
|
toAddTo += (UChar)0x0027/*'\''*/;
|
|
break;
|
|
case 0x0027/*'\''*/:
|
|
inQuote = TRUE;
|
|
toAddTo += (UChar)0x0027/*'\''*/;
|
|
break;
|
|
default:
|
|
if (inQuote) {
|
|
inQuote = FALSE; toAddTo += (UChar)0x0027/*'\''*/;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
toAddTo += chars;
|
|
if (inQuote)
|
|
toAddTo += (UChar)0x0027/*'\''*/;
|
|
}
|
|
|
|
// Creates a parser over a copy of patternStr, positioned at its first
// character.  decompMode is the normalization mode handed to every
// PatternEntry the parser creates.
PatternEntry::Parser::Parser(const UnicodeString &patternStr,
                             Normalizer::EMode decompMode)
    : pattern(patternStr),
      index(0),
      fDecompMode(decompMode),
      newChars(),
      newExtensions()
{
}
|
|
|
|
// Copy constructor: duplicates the pattern, the current position, the
// decomposition mode and both scratch buffers.
PatternEntry::Parser::Parser(const Parser &that)
    : pattern(that.pattern),
      index(that.index),
      fDecompMode(that.fDecompMode),
      newChars(that.newChars),
      newExtensions(that.newExtensions)
{
}
|
|
|
|
// Destructor: nothing to do explicitly here.
PatternEntry::Parser::~Parser() {
}
|
|
|
|
// Assignment: copies the pattern, the current position, the decomposition
// mode and both scratch buffers from `that`.  Self-assignment is a no-op.
// Returns *this.
PatternEntry::Parser &PatternEntry::Parser::operator=(const Parser &that)
{
    if (this == &that) {
        return *this;
    }

    pattern       = that.pattern;
    index         = that.index;
    fDecompMode   = that.fDecompMode;
    newChars      = that.newChars;
    newExtensions = that.newExtensions;

    return *this;
}
|
|
|
|
// Parses the next entry out of the pattern, starting at the parser's current
// position, and advances the position to the relation operator that begins
// the following entry (or to the end of the pattern).
//
// Syntax handled here:
//   '='  ','  ';'  '<'  '&'   relation operators; the first one seen sets the
//                             strength, a second one ends the current entry
//   '/'                       switches from the entry's chars to its extension
//   '\''                      starts/ends a quoted run; the character right
//                             after an opening quote is always taken literally
//   tab, FF, CR, LF, space    ignored outside quotes
//
// Returns a newly allocated PatternEntry (the parser keeps no reference to
// it), or NULL.  NULL with `status` untouched means no further entry was
// found.  NULL with `status` set means an error:
//   U_INVALID_FORMAT_ERROR    - text before any relation operator, an unquoted
//                               special character, an opening quote with
//                               nothing after it, or an entry with no chars
//   U_MEMORY_ALLOCATION_ERROR - a scratch buffer went bogus, or the entry
//                               itself could not be allocated
PatternEntry *PatternEntry::Parser::next(UErrorCode &status)
{
    int32_t newStrength = PatternEntry::UNSET;
    UBool inChars = TRUE;
    UBool inQuote = FALSE;

    newChars.remove();
    newExtensions.remove();

    while (index < pattern.length())
    {
        UChar ch = pattern[index];

        if (inQuote)
        {
            if (ch == 0x0027/*'\''*/)
            {
                inQuote = FALSE;
            }
            else if ((newChars.length() == 0) || inChars)
            {
                newChars += ch;
            }
            else
            {
                newExtensions += ch;
            }
        }
        else
        {
            // Sets the strength for this entry.  A relation operator seen
            // after the strength is already set belongs to the NEXT entry:
            // leave `index` pointing at it and stop.
            switch (ch)
            {
            case 0x003D/*'='*/:
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = Collator::IDENTICAL;
                break;

            case 0x002C/*','*/:
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = Collator::TERTIARY;
                break;

            case 0x003B/*';'*/:
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = Collator::SECONDARY;
                break;

            case 0x003C/*'<'*/:
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = Collator::PRIMARY;
                break;

            case 0x0026/*'&'*/:
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = PatternEntry::RESET;
                break;

            // Ignore the white spaces
            case 0x0009/*'\t'*/:
            case 0x000C/*'\f'*/:
            case 0x000D/*'\r'*/:
            case 0x000A/*'\n'*/:
            case 0x0020/*' '*/:
                break; // skip whitespace TODO use Unicode

            case 0x002F/*'/'*/:
                // This entry has an extension.
                inChars = FALSE;
                break;

            case 0x0027/*'\''*/:
                // The character after an opening quote is consumed
                // unconditionally.  If the quote is the last character of
                // the pattern there is nothing to consume: report a format
                // error instead of reading past the end of the pattern.
                if (index + 1 >= pattern.length())
                {
                    status = U_INVALID_FORMAT_ERROR;
                    return NULL;
                }

                inQuote = TRUE;
                ch = pattern[++index];

                if ((newChars.length() == 0) || inChars)
                {
                    newChars += ch;
                }
                else
                {
                    newExtensions += ch;
                }
                break;

            default:
                // Plain text is only legal once a relation operator has
                // been seen.
                if (newStrength == PatternEntry::UNSET)
                {
                    status = U_INVALID_FORMAT_ERROR;
                    return NULL;
                }

                // Special characters must be quoted (we are outside a
                // quote in this branch).
                if (isSpecialChar(ch) && (inQuote == FALSE))
                {
                    status = U_INVALID_FORMAT_ERROR;
                    return NULL;
                }

                if (inChars)
                {
                    newChars += ch;
                }
                else
                {
                    newExtensions += ch;
                }
                break;
            }
        }

        // An append that failed to allocate leaves the string bogus.
        if (newChars.isBogus() || newExtensions.isBogus())
        {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }

        index += 1;
    }

EndOfLoop:
    // Nothing but white space (or nothing at all) was left in the pattern.
    if (newStrength == PatternEntry::UNSET)
    {
        return NULL;
    }

    // A relation operator with no characters after it is malformed.
    if (newChars.length() == 0)
    {
        status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }

    PatternEntry *entry = new PatternEntry(newStrength, newChars, newExtensions, fDecompMode);
    if (entry == NULL)
    {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return entry;
}
|
|
|
|
// Check if the character is a special character. A special character
|
|
// would be meaningful in the rule only if quoted, otherwise it's used
|
|
// as a denotation for strength or merging symbols.
|
|
UBool PatternEntry::isSpecialChar(UChar ch)
|
|
{
|
|
return (((ch <= 0x002F) && (ch >= 0x0020)) ||
|
|
((ch <= 0x003F) && (ch >= 0x003A)) ||
|
|
((ch <= 0x0060) && (ch >= 0x005B)) ||
|
|
((ch <= 0x007E) && (ch >= 0x007B)));
|
|
}
|