scuffed-code/icu4c/source/common/normlzr.cpp
2001-08-31 21:37:39 +00:00

531 lines
16 KiB
C++

/*
*************************************************************************
* COPYRIGHT:
* Copyright (c) 1996-2001, International Business Machines Corporation and
* others. All Rights Reserved.
*************************************************************************
*/
/*
* Modification history
*
* Date Name Description
* 02/02/01 synwee Added converters from EMode to UNormalizationMode,
* getUNormalizationMode and getNormalizerEMode,
* useful in tbcoll and unorm.
* Added quickcheck method and incorporated it into
* normalize()
* 06/20/01+ Markus Scherer
* total rewrite, implement all normalization in unorm.cpp
* and turn Normalizer into a wrapper;
* fix the very broken iteration API
*/
#include "unicode/normlzr.h"
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/schriter.h"
#include "unicode/uchriter.h"
#include "unormimp.h"
//-------------------------------------------------------------------------
// Constructors and other boilerplate
//-------------------------------------------------------------------------
Normalizer::Normalizer(const UnicodeString& str,
EMode mode) :
fMode(mode), fOptions(0),
text(new StringCharacterIterator(str)), nextIndex(-1),
buffer(), bufferPos(0)
{
checkData();
}
Normalizer::Normalizer(const UnicodeString& str,
EMode mode,
int32_t options) :
fMode(mode), fOptions(options),
text(new StringCharacterIterator(str)), nextIndex(-1),
buffer(), bufferPos(0)
{
checkData();
}
Normalizer::Normalizer(const UChar *str, int32_t length, EMode mode) :
fMode(mode), fOptions(0),
text(new UCharCharacterIterator(str, length)), nextIndex(-1),
buffer(), bufferPos(0)
{
checkData();
}
Normalizer::Normalizer(const CharacterIterator& iter,
EMode mode) :
fMode(mode), fOptions(0),
text(iter.clone()), nextIndex(-1),
buffer(), bufferPos(0)
{
checkData();
}
Normalizer::Normalizer(const CharacterIterator& iter,
EMode mode,
int32_t options) :
fMode(mode), fOptions(options),
text(iter.clone()), nextIndex(-1),
buffer(), bufferPos(0)
{
checkData();
}
Normalizer::Normalizer(const Normalizer &copy) :
fMode(copy.fMode), fOptions(copy.fOptions),
text(copy.text->clone()), nextIndex(copy.nextIndex),
buffer(copy.buffer), bufferPos(copy.bufferPos)
{
checkData();
}
static const UChar _NUL=0;
void
Normalizer::checkData() {
UErrorCode errorCode=U_ZERO_ERROR;
if(!unorm_haveData(&errorCode)) {
delete text;
text=new UCharCharacterIterator(&_NUL, 0);
}
}
Normalizer::~Normalizer()
{
delete text;
}
Normalizer*
Normalizer::clone() const
{
if(this!=0) {
return new Normalizer(*this);
} else {
return 0;
}
}
/**
* Generates a hash code for this iterator.
*/
int32_t Normalizer::hashCode() const
{
return text->hashCode() + fMode + fOptions + buffer.hashCode() + bufferPos + nextIndex;
}
UBool Normalizer::operator==(const Normalizer& that) const
{
return
this==&that ||
fMode==that.fMode &&
fOptions==that.fOptions &&
*text==*(that.text) &&
buffer==that.buffer &&
bufferPos==that.bufferPos &&
nextIndex==that.nextIndex;
}
//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------
void
Normalizer::normalize(const UnicodeString& source,
EMode mode,
int32_t options,
UnicodeString& result,
UErrorCode &status) {
if(source.isBogus() || U_FAILURE(status)) {
result.setToBogus();
} else {
/* make sure that we do not operate on the same buffer in source and result */
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
result.fLength=unorm_internalNormalize(result.fArray, result.fCapacity,
source.fArray, source.fLength,
getUNormalizationMode(mode, status), (options&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &result,
&status);
if(U_FAILURE(status)) {
result.setToBogus();
}
}
}
UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
Normalizer::EMode mode,
UErrorCode &status) {
if(U_FAILURE(status)) {
return UNORM_MAYBE;
}
return unorm_quickCheck(source.fArray, source.length(),
getUNormalizationMode(mode, status), &status);
}
void
Normalizer::compose(const UnicodeString& source,
UBool compat,
int32_t options,
UnicodeString& result,
UErrorCode &status) {
if(source.isBogus() || U_FAILURE(status)) {
result.setToBogus();
} else {
/* make sure that we do not operate on the same buffer in source and result */
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
result.fLength=unorm_compose(result.fArray, result.fCapacity,
source.fArray, source.fLength,
compat, (options&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &result,
&status);
if(U_FAILURE(status)) {
result.setToBogus();
}
}
}
void
Normalizer::decompose(const UnicodeString& source,
UBool compat,
int32_t options,
UnicodeString& result,
UErrorCode &status) {
if(source.isBogus() || U_FAILURE(status)) {
result.setToBogus();
} else {
/* make sure that we do not operate on the same buffer in source and result */
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
result.fLength=unorm_decompose(result.fArray, result.fCapacity,
source.fArray, source.fLength,
compat, (options&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &result,
&status);
if(U_FAILURE(status)) {
result.setToBogus();
}
}
}
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
/**
* Return the current character in the normalized text.
*/
UChar32 Normalizer::current() {
if(bufferPos<buffer.length()) {
return buffer.char32At(bufferPos);
} else {
/*
* Normalize from the current index,
* return the first character from there, and
* reset the character iterator to the original index.
* Set nextIndex to where the iterator stopped so
* that next() can later continue from there.
*/
UTextOffset currentIndex=text->getIndex();
UChar32 c;
if(nextNormalize()) {
c=buffer.char32At(bufferPos);
nextIndex=text->getIndex();
} else {
c=DONE;
}
text->setIndex(currentIndex);
return c;
}
}
/**
* Return the next character in the normalized text and advance
* the iteration position by one. If the end
* of the text has already been reached, {@link #DONE} is returned.
*/
UChar32 Normalizer::next() {
UChar32 c;
if(bufferPos<buffer.length()) {
c=buffer.char32At(bufferPos);
bufferPos+=UTF_CHAR_LENGTH(c);
return c;
} else {
/*
* If the buffer (which is now exhausted) was normalized
* during current() or setIndex() then the character iterator
* must be set to behind what was normalized then
* in order to continue with the following text.
* That "position behind what was normalized" is nextIndex.
*/
if(nextIndex>=0) {
text->setIndex(nextIndex);
}
if(text->hasNext() && nextNormalize()) {
c=buffer.char32At(bufferPos);
bufferPos+=UTF_CHAR_LENGTH(c);
return c;
} else {
return DONE;
}
}
}
/**
* Return the previous character in the normalized text and decrement
* the iteration position by one. If the beginning
* of the text has already been reached, {@link #DONE} is returned.
*/
UChar32 Normalizer::previous() {
UChar32 c;
if(bufferPos>0 || text->hasPrevious() && previousNormalize()) {
c=buffer.char32At(bufferPos-1);
bufferPos-=UTF_CHAR_LENGTH(c);
return c;
} else {
return DONE;
}
}
void Normalizer::reset() {
text->setToStart();
clearBuffer();
}
void
Normalizer::setIndexOnly(UTextOffset index) {
text->setIndex(index);
clearBuffer();
}
/**
* Set the iteration position in the input text that is being normalized
* and return the first normalized character at that position.
* <p>
* <b>Note:</b> This method sets the position in the <em>input</em> text,
* while {@link #next} and {@link #previous} iterate through characters
* in the normalized <em>output</em>. This means that there is not
* necessarily a one-to-one correspondence between characters returned
* by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
* returned from <tt>setIndex</tt> and {@link #getIndex}.
* <p>
* @param index the desired index in the input text.
*
* @return the first normalized character that is the result of iterating
* forward starting at the given index.
*
* @throws IllegalArgumentException if the given index is less than
* {@link #getBeginIndex} or greater than {@link #getEndIndex}.
*/
UChar32 Normalizer::setIndex(UTextOffset index) {
setIndexOnly(index);
return current();
}
/**
* Return the first character in the normalized text. This resets
* the <tt>Normalizer's</tt> position to the beginning of the text.
*/
UChar32 Normalizer::first() {
text->setToStart();
clearBuffer();
return next();
}
/**
* Return the last character in the normalized text. This resets
* the <tt>Normalizer's</tt> position to be just before the
* the input text corresponding to that normalized character.
*/
UChar32 Normalizer::last() {
text->setToEnd();
clearBuffer();
return previous();
}
/**
* Retrieve the current iteration position in the input text that is
* being normalized. This method is useful in applications such as
* searching, where you need to be able to determine the position in
* the input text that corresponds to a given normalized output character.
* <p>
* <b>Note:</b> This method sets the position in the <em>input</em>, while
* {@link #next} and {@link #previous} iterate through characters in the
* <em>output</em>. This means that there is not necessarily a one-to-one
* correspondence between characters returned by <tt>next</tt> and
* <tt>previous</tt> and the indices passed to and returned from
* <tt>setIndex</tt> and {@link #getIndex}.
*
*/
UTextOffset Normalizer::getIndex() const {
return text->getIndex();
}
/**
* Retrieve the index of the start of the input text. This is the begin index
* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
* over which this <tt>Normalizer</tt> is iterating
*/
UTextOffset Normalizer::startIndex() const {
return text->startIndex();
}
/**
* Retrieve the index of the end of the input text. This is the end index
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
* over which this <tt>Normalizer</tt> is iterating
*/
UTextOffset Normalizer::endIndex() const {
return text->endIndex();
}
//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------
void
Normalizer::setMode(EMode newMode)
{
fMode = newMode;
}
Normalizer::EMode
Normalizer::getMode() const
{
return fMode;
}
void
Normalizer::setOption(int32_t option,
UBool value)
{
if (value) {
fOptions |= option;
} else {
fOptions &= (~option);
}
}
UBool
Normalizer::getOption(int32_t option) const
{
return (fOptions & option) != 0;
}
/**
* Set the input text over which this <tt>Normalizer</tt> will iterate.
* The iteration position is set to the beginning of the input text.
*/
void
Normalizer::setText(const UnicodeString& newText,
UErrorCode &status)
{
if (U_FAILURE(status)) {
return;
}
CharacterIterator *newIter = new StringCharacterIterator(newText);
if (newIter == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
delete text;
text = newIter;
reset();
}
/**
* Set the input text over which this <tt>Normalizer</tt> will iterate.
* The iteration position is set to the beginning of the string.
*/
void
Normalizer::setText(const CharacterIterator& newText,
UErrorCode &status)
{
if (U_FAILURE(status)) {
return;
}
CharacterIterator *newIter = newText.clone();
if (newIter == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
delete text;
text = newIter;
reset();
}
void
Normalizer::setText(const UChar* newText,
int32_t length,
UErrorCode &status)
{
if (U_FAILURE(status)) {
return;
}
CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
if (newIter == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
delete text;
text = newIter;
reset();
}
/**
* Copies the text under iteration into the UnicodeString referred to by "result".
* @param result Receives a copy of the text under iteration.
*/
void
Normalizer::getText(UnicodeString& result)
{
text->getText(result);
}
//-------------------------------------------------------------------------
// Private utility methods
//-------------------------------------------------------------------------
void Normalizer::clearBuffer() {
nextIndex=-1;
buffer.remove();
bufferPos=0;
}
UBool
Normalizer::nextNormalize() {
UErrorCode errorCode=U_ZERO_ERROR;
clearBuffer();
buffer.fLength=unorm_nextNormalize(buffer.fArray, buffer.fCapacity, *text,
getUNormalizationMode(fMode, errorCode),
(fOptions&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &buffer,
&errorCode);
return U_SUCCESS(errorCode) && buffer.length()>0;
}
UBool
Normalizer::previousNormalize() {
UErrorCode errorCode=U_ZERO_ERROR;
clearBuffer();
buffer.fLength=unorm_previousNormalize(buffer.fArray, buffer.fCapacity, *text,
getUNormalizationMode(fMode, errorCode),
(fOptions&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &buffer,
&errorCode);
bufferPos=buffer.length();
return U_SUCCESS(errorCode) && buffer.length()>0;
}