// scuffed-code/icu4c/source/i18n/ptnentry.cpp

/******************************************************************************
 * COPYRIGHT:
 *  (C) Copyright Taligent, Inc., 1996
 *  (C) Copyright IBM Corp. 1996-1998
 *  Licensed Material - Program-Property of IBM - All Rights Reserved.
 *  US Government Users Restricted Rights - Use, duplication, or disclosure
 *  restricted by GSA ADP Schedule Contact with IBM Corp.
 *
 ******************************************************************************
 */
//=============================================================================
//
// File ptnentry.cpp
//
// Contains PatternEntry, an internal class used by MergeCollation to store
// one collation element from a pattern.
//
// Created by: Helena Shih
//
// Modification History:
//
//  Date         Name          Description
// 04/23/99      stephen       Removed EDecompositionMode, merged with
//                             Normalizer::EMode
//                             Removed character literals.
//=============================================================================

#include "ptnentry.h"

#include "unicode.h"
#include "coll.h"
#include "normlzr.h"


// static member initialization
//
// Sentinel strength values used alongside the Collator strength constants:
//   RESET - the strength Parser::next() assigns when it reads '&';
//           addToBuffer() writes it back out as '&'.
//   UNSET - the strength of a default-constructed entry, and the initial
//           "no strength symbol seen yet" state inside Parser::next();
//           addToBuffer() writes it out as '?'.
const int32_t PatternEntry::RESET = -2;
const int32_t PatternEntry::UNSET = -1;

// ===== privates =====

// Default constructor: the strength starts out as UNSET and both strings
// are default-constructed.
PatternEntry::PatternEntry()
  : strength(PatternEntry::UNSET),
    chars(),
    extension()
{
}

// Copy constructor: takes over the strength, characters and extension of
// `other` member by member.
PatternEntry::PatternEntry(const PatternEntry& other)
  : strength(other.strength),
    chars(other.chars),
    extension(other.extension)
{
}

// Builds an entry from a strength, its characters and its extension.
//
//   newStrength   - strength stored as-is.
//   newChars      - characters of the entry; stored in normalized form.
//   newExtensions - extension text; stored as-is (it is NOT normalized).
//   decompMode    - mode handed to Normalizer::normalize() for newChars.
//
// Normalizing is meant to turn precomposed Unicode characters into a base
// character followed by its combining characters (such as accents), with
// multiple combining characters placed in their canonical order.  What
// exactly happens depends on decompMode.
PatternEntry::PatternEntry(int32_t newStrength,
               const UnicodeString& newChars,
               const UnicodeString& newExtensions,
               Normalizer::EMode decompMode)
  : strength(newStrength),
    chars(),
    extension(newExtensions)
{
  UErrorCode normalizeStatus = U_ZERO_ERROR;

  // The normalized text is written straight into the `chars` member.
  Normalizer::normalize(newChars, decompMode, 0, chars, normalizeStatus);

  // The constructor has no way to report an error, so when normalization
  // fails the characters are kept exactly as they were passed in.
  if (FAILURE(normalizeStatus))
    {
      chars = newChars;
    }
}

// Destructor.  The body is empty: nothing is released explicitly here.
PatternEntry::~PatternEntry()
{
}

// Assignment: copies strength, characters and extension from `other`.
// Assigning an entry to itself is a no-op.  Returns *this.
const PatternEntry&
PatternEntry::operator=(const PatternEntry& other)
{
  if (this == &other)
    {
      return *this;
    }

  strength  = other.strength;
  chars     = other.chars;
  extension = other.extension;

  return *this;
}

/**
 * Appends this entry's extension to toAddTo, quoted where appendQuoted()
 * decides quoting is needed.
 * Useful when building a user-readable string that represents a pattern.
 */
void PatternEntry::appendQuotedExtension(UnicodeString& toAddTo) const
{
  appendQuoted(this->extension, toAddTo);
}

/**
 * Appends this entry's characters to toAddTo, quoted where appendQuoted()
 * decides quoting is needed.
 * Useful when building a user-readable string that represents a pattern.
 */
void PatternEntry::appendQuotedChars(UnicodeString& toAddTo) const
{
  appendQuoted(this->chars, toAddTo);
}

// Two entries are equal when their strength, characters and extension all
// match.
bool_t PatternEntry::equals(const PatternEntry& other) const
{
  return ((strength == other.strength) &&
          (chars == other.chars) &&
          (extension == other.extension));
}

/**
 * For debugging.
 * Appends the textual form of this entry to result (the extension is shown,
 * no white space is inserted, and there is no reset prefix) and returns
 * result.  Existing content of result is kept; the text is added at the end.
 */
UnicodeString&
PatternEntry::toString(UnicodeString& result) const
{
  const bool_t showExtension  = TRUE;
  const bool_t showWhiteSpace = FALSE;

  addToBuffer(result, showExtension, showWhiteSpace, NULL);
  return result;
}

// Returns the strength of this entry: one of the Collator strength values,
// or PatternEntry::RESET / PatternEntry::UNSET.
int32_t
PatternEntry::getStrength() const
{
  return this->strength;
}

// Copies this entry's extension into `ext` (replacing its content) and
// returns a reference to `ext`.
const UnicodeString&
PatternEntry::getExtension(UnicodeString& ext) const
{
  ext = this->extension;
  return ext;
}

// Copies this entry's characters into `result` (replacing its content) and
// returns a reference to `result`.
const UnicodeString&
PatternEntry::getChars(UnicodeString& result) const
{
  result = this->chars;
  return result;
}

/*
 Appends the textual form of this entry to the toAddTo buffer.

   toAddTo        - buffer that receives the text; existing content is kept.
   showExtension  - when TRUE and the extension is not empty, "/" and the
                    quoted extension are written after the characters.
   showWhiteSpace - when TRUE, separators (new line or space) are inserted
                    between the pieces.
   lastEntry      - when not NULL, a reset prefix is written first: '&',
                    lastEntry's quoted characters, then THIS entry's quoted
                    extension.
 */
void PatternEntry::addToBuffer(UnicodeString& toAddTo,
                   bool_t showExtension,
                   bool_t showWhiteSpace,
                   const PatternEntry* lastEntry) const
{
  const bool_t hasReset = (lastEntry != NULL);

  // Separator in front of the entry, only when the buffer already holds
  // text: a new line before primary entries and before resets, otherwise a
  // single space.
  if (showWhiteSpace && toAddTo.size() > 0)
    {
      if (strength == Collator::PRIMARY || hasReset)
        {
          toAddTo += 0x000A/*'\n'*/;
        }
      else
        {
          toAddTo += 0x0020/*' '*/;
        }
    }

  // Optional reset prefix.
  if (hasReset)
    {
      toAddTo += 0x0026/*'&'*/;
      if (showWhiteSpace)
        {
          toAddTo += 0x0020/*' '*/;
        }
      lastEntry->appendQuotedChars(toAddTo);
      appendQuotedExtension(toAddTo);
      if (showWhiteSpace)
        {
          toAddTo += 0x0020/*' '*/;
        }
    }

  // Symbol that stands for the strength; any other strength value adds
  // nothing.
  switch (strength)
    {
    case Collator::IDENTICAL:
      toAddTo += 0x003D/*'='*/;
      break;
    case Collator::TERTIARY:
      toAddTo += 0x002C/*','*/;
      break;
    case Collator::SECONDARY:
      toAddTo += 0x003B/*';'*/;
      break;
    case Collator::PRIMARY:
      toAddTo += 0x003C/*'<'*/;
      break;
    case PatternEntry::RESET:
      toAddTo += 0x0026/*'&'*/;
      break;
    case PatternEntry::UNSET:
      toAddTo += 0x003F/*'?'*/;
      break;
    }

  if (showWhiteSpace)
    {
      toAddTo += 0x0020/*' '*/;
    }

  appendQuoted(chars, toAddTo);

  // If there is an expanding part and it should be shown, append it after
  // the entry, separated by '/'.
  if (showExtension && extension.size() != 0)
    {
      toAddTo += 0x002F/*'/'*/;
      appendQuoted(extension, toAddTo);
    }
}

// Append a string to a pattern buffer, adding quotes if necessary
void PatternEntry::appendQuoted(const UnicodeString& chars, UnicodeString& toAddTo) {
  bool_t inQuote = FALSE;
  UChar ch = chars[T_INT32(0)];
  if (Unicode::isSpaceChar(ch)) {
    inQuote = TRUE;
    toAddTo += 0x0027/*'\''*/;
  } else if (isSpecialChar(ch)) {
    inQuote = TRUE;
    toAddTo += 0x0027/*'\''*/;
  } else {
    switch (ch) {
    case 0x0010: case 0x000C/*'\f'*/:
    case 0x000D/*'\r'*/: case 0x0009/*'\t'*/:
    case 0x000A/*'\n'*/: case 0x0040/*'@'*/:
      inQuote = TRUE;
      toAddTo += 0x0027/*'\''*/;
      break;
    case 0x0027/*'\''*/:
      inQuote = TRUE;
      toAddTo += 0x0027/*'\''*/;
      break;
    default:
      if (inQuote) {
    inQuote = FALSE; toAddTo += 0x0027/*'\''*/;
      }
      break;
    }
  }
  toAddTo += chars;
  if (inQuote)
    toAddTo += 0x0027/*'\''*/;
}

// Creates a parser over a copy of `pattern`, positioned at its first
// character.  `decompMode` is handed to every PatternEntry that next()
// creates.
PatternEntry::Parser::Parser(const UnicodeString &pattern,
                             Normalizer::EMode decompMode)
  : pattern(pattern),
    index(0),
    fDecompMode(decompMode),
    newChars(),
    newExtensions()
{
}

// Copy constructor: the new parser gets the same pattern, the same position
// in it, the same decomposition mode and the same scratch buffers as `that`.
PatternEntry::Parser::Parser(const Parser &that)
  : pattern(that.pattern),
    index(that.index),
    fDecompMode(that.fDecompMode),
    newChars(that.newChars),
    newExtensions(that.newExtensions)
{
}

// Destructor.  The body is empty: nothing is released explicitly here.
PatternEntry::Parser::~Parser() {
}

// Assignment: copies the pattern, the current position, the decomposition
// mode and the scratch buffers from `that`.  Self-assignment is a no-op.
// Returns *this.
PatternEntry::Parser &PatternEntry::Parser::operator=(const Parser &that)
{
  if (this == &that)
    {
      return *this;
    }

  pattern       = that.pattern;
  index         = that.index;
  fDecompMode   = that.fDecompMode;
  newChars      = that.newChars;
  newExtensions = that.newExtensions;

  return *this;
}

// Parses the next entry out of the pattern, starting at the current
// position, and returns it as a newly allocated PatternEntry.
//
// An entry starts with a strength symbol ('=', ',', ';', '<' or '&'),
// followed by its characters and, after a '/', its extension.  Parsing of
// one entry stops in front of the next unquoted strength symbol (which is
// left in place for the following call) or at the end of the pattern.
// Text between apostrophes is taken literally.
//
// Returns:
//   - a PatternEntry allocated with new (presumably owned by the caller -
//     confirm against the call sites), or
//   - NULL with `status` untouched when no strength symbol was found in the
//     rest of the pattern, or
//   - NULL with `status` set:
//       U_INVALID_FORMAT_ERROR     for text before any strength symbol, an
//                                  unquoted special character, an
//                                  apostrophe as the very last character of
//                                  the pattern, or an entry without
//                                  characters;
//       U_MEMORY_ALLOCATION_ERROR  when a buffer became bogus or the entry
//                                  could not be allocated.
PatternEntry *PatternEntry::Parser::next(UErrorCode &status)
{
  int32_t newStrength = PatternEntry::UNSET;
  bool_t inChars = TRUE;    // FALSE once '/' was seen: text goes to the extension
  bool_t inQuote = FALSE;

  newChars.remove();
  newExtensions.remove();

  while (index < pattern.size())
    {
      UChar ch = pattern[index];

      if (inQuote)
        {
          if (ch == 0x0027/*'\''*/)
            {
              // Closing apostrophe.
              inQuote = FALSE;
            }
          else if ((newChars.size() == 0) || inChars)
            {
              newChars += ch;
            }
          else
            {
              newExtensions += ch;
            }
        }
      else
        {
          // Sets the strength for this entry.  A second strength symbol
          // belongs to the next entry: stop without consuming it.
          switch (ch)
            {
            case 0x003D/*'='*/:
              if (newStrength != PatternEntry::UNSET)
                {
                  goto EndOfLoop;
                }
              newStrength = Collator::IDENTICAL;
              break;

            case 0x002C/*','*/:
              if (newStrength != PatternEntry::UNSET)
                {
                  goto EndOfLoop;
                }
              newStrength = Collator::TERTIARY;
              break;

            case 0x003B/*';'*/:
              if (newStrength != PatternEntry::UNSET)
                {
                  goto EndOfLoop;
                }
              newStrength = Collator::SECONDARY;
              break;

            case 0x003C/*'<'*/:
              if (newStrength != PatternEntry::UNSET)
                {
                  goto EndOfLoop;
                }
              newStrength = Collator::PRIMARY;
              break;

            case 0x0026/*'&'*/:
              if (newStrength != PatternEntry::UNSET)
                {
                  goto EndOfLoop;
                }
              newStrength = PatternEntry::RESET;
              break;

              // Ignore the white spaces
            case 0x0009/*'\t'*/:
            case 0x000C/*'\f'*/:
            case 0x000D/*'\r'*/:
            case 0x000A/*'\n'*/:
            case 0x0020/*' '*/:
              break; // skip whitespace TODO use Unicode

            case 0x002F/*'/'*/:
              // This entry has an extension.
              inChars = FALSE;
              break;

            case 0x0027/*'\''*/:
              // The character right after an opening apostrophe is always
              // taken literally, even when it is an apostrophe itself.  An
              // apostrophe that is the last character of the pattern has
              // nothing to quote: report it instead of reading past the
              // end of the pattern.
              if (index + 1 >= pattern.size())
                {
                  status = U_INVALID_FORMAT_ERROR;
                  return NULL;
                }

              inQuote = TRUE;
              ch = pattern[++index];

              if ((newChars.size() == 0) || inChars)
                {
                  newChars += ch;
                }
              else
                {
                  newExtensions += ch;
                }
              break;

            default:
              // Text is only allowed after a strength symbol.
              if (newStrength == PatternEntry::UNSET)
                {
                  status = U_INVALID_FORMAT_ERROR;
                  return NULL;
                }

              // Special characters must be quoted (this branch is only
              // reached outside of quotes).
              if (isSpecialChar(ch))
                {
                  status = U_INVALID_FORMAT_ERROR;
                  return NULL;
                }

              if (inChars)
                {
                  newChars += ch;
                }
              else
                {
                  newExtensions += ch;
                }
              break;
            }
        }

      if (newChars.isBogus() || newExtensions.isBogus())
        {
          status = U_MEMORY_ALLOCATION_ERROR;
          return NULL;
        }

      index += 1;
    }

 EndOfLoop:
  if (newStrength == PatternEntry::UNSET)
    {
      // No strength symbol in the rest of the pattern: nothing more to
      // return.
      return NULL;
    }

  if (newChars.size() == 0)
    {
      status = U_INVALID_FORMAT_ERROR;
      return NULL;
    }

  PatternEntry *entry =
    new PatternEntry(newStrength, newChars, newExtensions, fDecompMode);

  // Where `new` reports failure by returning NULL, flag it so the caller
  // can tell an allocation failure from the normal end of the pattern.
  if (entry == NULL)
    {
      status = U_MEMORY_ALLOCATION_ERROR;
    }

  return entry;
}

// Checks whether the character is a special character.  A special character
// is meaningful in a rule only when quoted; unquoted it is reserved for
// strength or merging symbols.
//
// The special characters are these four code point ranges (inclusive):
//   0x0020-0x002F, 0x003A-0x003F, 0x005B-0x0060 and 0x007B-0x007E.
bool_t PatternEntry::isSpecialChar(UChar ch)
{
  const bool_t inRange20To2F = ((ch >= 0x0020) && (ch <= 0x002F));
  const bool_t inRange3ATo3F = ((ch >= 0x003A) && (ch <= 0x003F));
  const bool_t inRange5BTo60 = ((ch >= 0x005B) && (ch <= 0x0060));
  const bool_t inRange7BTo7E = ((ch >= 0x007B) && (ch <= 0x007E));

  return (inRange20To2F || inRange3ATo3F || inRange5BTo60 || inRange7BTo7E);
}