ICU-45 Initial check-in of RuleBasedBreakIterator and DictionaryBasedBreakIterator.
X-SVN-Rev: 502
This commit is contained in:
parent
bbccafffa4
commit
016aa963f6
@ -190,6 +190,15 @@ StringCharacterIterator::getIndex() const
|
||||
return pos;
|
||||
}
|
||||
|
||||
void
|
||||
StringCharacterIterator::setText(const UnicodeString& newText)
|
||||
{
|
||||
text = newText;
|
||||
begin = 0;
|
||||
end = newText.length();
|
||||
pos = begin;
|
||||
}
|
||||
|
||||
void
|
||||
StringCharacterIterator::getText(UnicodeString& result)
|
||||
{
|
||||
|
@ -142,6 +142,15 @@ UCharCharacterIterator::getIndex() const
|
||||
return pos;
|
||||
}
|
||||
|
||||
void UCharCharacterIterator::setText(const UChar* newText,
|
||||
int32_t newTextLength)
|
||||
{
|
||||
text = newText;
|
||||
begin = 0;
|
||||
end = newTextLength;
|
||||
pos = begin;
|
||||
}
|
||||
|
||||
void
|
||||
UCharCharacterIterator::getText(UnicodeString& result)
|
||||
{
|
||||
|
@ -139,6 +139,11 @@ public:
|
||||
* returned by current()). */
|
||||
virtual UTextOffset getIndex(void) const;
|
||||
|
||||
/**
|
||||
* Sets the iterator to iterate over the provided string.
* The iteration range is reset to cover the whole of newText and the
* current position is moved to the beginning of that range.
* @param newText The string to iterate over.
|
||||
*/
|
||||
virtual void setText(const UnicodeString& newText);
|
||||
|
||||
/**
|
||||
* Copies the UnicodeString under iteration into the UnicodeString
|
||||
* referred to by "result". Even if this iterator iterates across
|
||||
|
@ -108,6 +108,12 @@ public:
|
||||
* returned by current()). */
|
||||
virtual UTextOffset getIndex(void) const;
|
||||
|
||||
/**
|
||||
* Sets the iterator to iterate over a new range of text
* The iteration range is reset to [0, newTextLength) and the current
* position is moved to the beginning of that range.
* NOTE(review): presumably only the pointer is stored, so the caller
* must keep the buffer alive while iterating - confirm.
* @param newText Pointer to the first UChar of the new text.
* @param newTextLength Number of UChars in newText.
|
||||
*/
|
||||
virtual void setText(const UChar* newText,
|
||||
int32_t newTextLength);
|
||||
|
||||
/**
|
||||
* Copies the UnicodeString under iteration into the UnicodeString
|
||||
* referred to by "result". Even if this iterator iterates across
|
||||
|
@ -49,7 +49,7 @@ UVector::~UVector() {
|
||||
}
|
||||
|
||||
void UVector::addElement(void* obj) {
|
||||
if (ensureCapacity(count+1)) {
|
||||
if (ensureCapacity(count + 1)) {
|
||||
elements[count++] = obj;
|
||||
}
|
||||
}
|
||||
@ -66,7 +66,7 @@ void UVector::setElementAt(void* obj, int32_t index) {
|
||||
|
||||
void UVector::insertElementAt(void* obj, int32_t index) {
|
||||
// must have 0 <= index <= count
|
||||
if (0 <= index && index <= count && ensureCapacity(count)) {
|
||||
if (0 <= index && index <= count && ensureCapacity(count + 1)) {
|
||||
for (int32_t i=count; i>index; --i) {
|
||||
elements[i] = elements[i-1];
|
||||
}
|
||||
|
@ -227,15 +227,6 @@ inline void* UVector::operator[](int32_t index) const {
|
||||
return elementAt(index);
|
||||
}
|
||||
|
||||
// Dummy implementation - disallowed method
|
||||
inline UVector::UVector(const UVector&) {}
|
||||
|
||||
// Dummy implementation - disallowed method
|
||||
inline UVector& UVector::operator=(const UVector&) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
// UStack inlines
|
||||
|
||||
inline bool_t UStack::empty(void) const {
|
||||
@ -251,12 +242,4 @@ inline void* UStack::push(void* obj) {
|
||||
return obj;
|
||||
}
|
||||
|
||||
// Dummy implementation - disallowed method
|
||||
inline UStack::UStack(const UStack&) {}
|
||||
|
||||
// Dummy implementation - disallowed method
|
||||
inline UStack& UStack::operator=(const UStack&) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -17,9 +17,10 @@
|
||||
// This file was generated from the java source file BreakIterator.java
|
||||
// *****************************************************************************
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "dbbi.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "simtxbd.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "resbund.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
@ -38,7 +39,41 @@ const UTextOffset BreakIterator::DONE = (int32_t)-1;
|
||||
/**
 * Creates a word-break iterator for the given locale.
 *
 * The break rules are loaded through udata_open() from the "word" data
 * item ("word_th" when the locale's language is Thai). For Thai a
 * DictionaryBasedBreakIterator is built, which is additionally handed the
 * path <data directory> + "thaidict.brk"; every other language gets a
 * plain RuleBasedBreakIterator.
 *
 * @param key  The locale the iterator is requested for.
 * @return     A newly allocated iterator, or NULL if the rule data could
 *             not be opened.
 */
BreakIterator*
BreakIterator::createWordInstance(const Locale& key)
{
    // WARNING: This routine is currently written specifically to handle only the
    // default rules files and the alternate rules files for Thai.  This function
    // will have to be made fully general at some time in the future!
    BreakIterator* result = NULL;

    // Thai is the only language with its own rules and a dictionary; work that
    // out once and reuse the answer below
    UnicodeString temp;
    const bool_t isThai = (key.getLanguage(temp) == UnicodeString("th", (char*)0));
    const char* filename = isThai ? "word_th" : "word";

    UErrorCode err = U_ZERO_ERROR;
    // NOTE(review): "file" is never closed here; presumably the iterator keeps
    // referring to the loaded image for its whole lifetime - confirm ownership.
    UDataMemory* file = udata_open(NULL, "brk", filename, &err);

    if (!U_FAILURE(err)) {
        const void* image = udata_getMemory(file);

        if (image != NULL) {
            if (isThai) {
                const char* dataDir = u_getDataDirectory();
                const char* dictionaryName = "thaidict.brk";
                char* fullPath = new char[strlen(dataDir) + strlen(dictionaryName) + 1];
                strcpy(fullPath, dataDir);
                // append (not overwrite): the path is data directory + file name
                strcat(fullPath, dictionaryName);

                result = new DictionaryBasedBreakIterator(image, fullPath);
                delete [] fullPath;
            }
            else {
                result = new RuleBasedBreakIterator(image);
            }
        }
    }

    return result;
}
|
||||
|
||||
// -------------------------------------
|
||||
@ -47,7 +82,41 @@ BreakIterator::createWordInstance(const Locale& key)
|
||||
/**
 * Creates a line-break iterator for the given locale.
 *
 * The break rules are loaded through udata_open() from the "line" data
 * item ("line_th" when the locale's language is Thai). For Thai a
 * DictionaryBasedBreakIterator is built, which is additionally handed the
 * path <data directory> + "thaidict.brk"; every other language gets a
 * plain RuleBasedBreakIterator.
 *
 * @param key  The locale the iterator is requested for.
 * @return     A newly allocated iterator, or NULL if the rule data could
 *             not be opened.
 */
BreakIterator*
BreakIterator::createLineInstance(const Locale& key)
{
    // WARNING: This routine is currently written specifically to handle only the
    // default rules files and the alternate rules files for Thai.  This function
    // will have to be made fully general at some time in the future!
    BreakIterator* result = NULL;

    // Thai is the only language with its own rules and a dictionary; work that
    // out once and reuse the answer below
    UnicodeString temp;
    const bool_t isThai = (key.getLanguage(temp) == UnicodeString("th", (char*)0));
    const char* filename = isThai ? "line_th" : "line";

    UErrorCode err = U_ZERO_ERROR;
    // NOTE(review): "file" is never closed here; presumably the iterator keeps
    // referring to the loaded image for its whole lifetime - confirm ownership.
    UDataMemory* file = udata_open(NULL, "brk", filename, &err);

    if (!U_FAILURE(err)) {
        const void* image = udata_getMemory(file);

        if (image != NULL) {
            if (isThai) {
                const char* dataDir = u_getDataDirectory();
                const char* dictionaryName = "thaidict.brk";
                char* fullPath = new char[strlen(dataDir) + strlen(dictionaryName) + 1];
                strcpy(fullPath, dataDir);
                strcat(fullPath, dictionaryName);

                result = new DictionaryBasedBreakIterator(image, fullPath);
                delete [] fullPath;
            }
            else {
                result = new RuleBasedBreakIterator(image);
            }
        }
    }

    return result;
}
|
||||
|
||||
// -------------------------------------
|
||||
@ -56,7 +125,24 @@ BreakIterator::createLineInstance(const Locale& key)
|
||||
/**
 * Creates a character-break iterator.
 *
 * The break rules are loaded through udata_open() from the "char" data
 * item and wrapped in a RuleBasedBreakIterator. The locale is currently
 * not consulted.
 *
 * @param key  The locale the iterator is requested for (unused).
 * @return     A newly allocated iterator, or NULL if the rule data could
 *             not be opened.
 */
BreakIterator*
BreakIterator::createCharacterInstance(const Locale& key)
{
    // WARNING: This routine is currently written specifically to handle only the
    // default rules files and the alternate rules files for Thai.  This function
    // will have to be made fully general at some time in the future!
    UErrorCode err = U_ZERO_ERROR;
    // NOTE(review): "file" is never closed here; presumably the iterator keeps
    // referring to the loaded image for its whole lifetime - confirm ownership.
    UDataMemory* file = udata_open(NULL, "brk", "char", &err);
    if (U_FAILURE(err)) {
        return NULL;
    }

    const void* image = udata_getMemory(file);
    if (image == NULL) {
        return NULL;
    }

    return new RuleBasedBreakIterator(image);
}
|
||||
|
||||
// -------------------------------------
|
||||
@ -65,7 +151,24 @@ BreakIterator::createCharacterInstance(const Locale& key)
|
||||
/**
 * Creates a sentence-break iterator.
 *
 * The break rules are loaded through udata_open() from the "sent" data
 * item and wrapped in a RuleBasedBreakIterator. The locale is currently
 * not consulted.
 *
 * @param key  The locale the iterator is requested for (unused).
 * @return     A newly allocated iterator, or NULL if the rule data could
 *             not be opened.
 */
BreakIterator*
BreakIterator::createSentenceInstance(const Locale& key)
{
    // WARNING: This routine is currently written specifically to handle only the
    // default rules files and the alternate rules files for Thai.  This function
    // will have to be made fully general at some time in the future!
    UErrorCode err = U_ZERO_ERROR;
    // NOTE(review): "file" is never closed here; presumably the iterator keeps
    // referring to the loaded image for its whole lifetime - confirm ownership.
    UDataMemory* file = udata_open(NULL, "brk", "sent", &err);
    if (U_FAILURE(err)) {
        return NULL;
    }

    const void* image = udata_getMemory(file);
    if (image == NULL) {
        return NULL;
    }

    return new RuleBasedBreakIterator(image);
}
|
||||
|
||||
// -------------------------------------
|
||||
|
439
icu4c/source/i18n/dbbi.cpp
Normal file
439
icu4c/source/i18n/dbbi.cpp
Normal file
@ -0,0 +1,439 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "dbbi.h"
|
||||
#include "dbbi_tbl.h"
|
||||
#include "uvector.h"
|
||||
|
||||
char DictionaryBasedBreakIterator::fgClassID = 0;
|
||||
|
||||
//=======================================================================
|
||||
// constructors
|
||||
//=======================================================================
|
||||
|
||||
/**
 * Builds an iterator from a break-rule image and a dictionary file.
 * The base class is constructed with a NULL image; this constructor then
 * installs its own DictionaryBasedBreakIteratorTables (built from both
 * arguments) and takes a reference on them. The break-position cache
 * starts out empty.
 *
 * @param tablesImage         The break-rule data image.
 * @param dictionaryFilename  Name of the dictionary file handed to the tables.
 */
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const void* tablesImage,
                                                           char* dictionaryFilename)
    : RuleBasedBreakIterator((const void*)NULL),
      dictionaryCharCount(0),
      cachedBreakPositions(NULL),
      numCachedBreakPositions(0),
      positionInCache(0)
{
    tables = new DictionaryBasedBreakIteratorTables(tablesImage, dictionaryFilename);
    tables->addReference();
}
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
|
||||
{
|
||||
delete [] cachedBreakPositions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assignment operator. Sets this iterator to have the same behavior,
|
||||
* and iterate over the same text, as the one passed in.
|
||||
*/
|
||||
DictionaryBasedBreakIterator&
|
||||
DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
|
||||
reset();
|
||||
RuleBasedBreakIterator::operator=(that);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior, and iterating over the same text, as this one.
|
||||
*/
|
||||
BreakIterator*
|
||||
DictionaryBasedBreakIterator::clone() const {
|
||||
return new DictionaryBasedBreakIterator(*this);
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// BreakIterator overrides
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Advances the iterator one step backwards.
|
||||
* @return The position of the last boundary position before the
|
||||
* current iteration position
|
||||
*/
|
||||
int32_t
|
||||
DictionaryBasedBreakIterator::previous()
|
||||
{
|
||||
// if we have cached break positions and we're still in the range
|
||||
// covered by them, just move one step backward in the cache
|
||||
if (cachedBreakPositions != NULL && positionInCache > 0) {
|
||||
--positionInCache;
|
||||
text->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return cachedBreakPositions[positionInCache];
|
||||
}
|
||||
|
||||
// otherwise, dump the cache and use the inherited previous() method to move
|
||||
// backward. This may fill up the cache with new break positions, in which
|
||||
// case we have to mark our position in the cache
|
||||
else {
|
||||
reset();
|
||||
int32_t result = RuleBasedBreakIterator::previous();
|
||||
if (cachedBreakPositions != NULL) {
|
||||
positionInCache = numCachedBreakPositions - 2;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the last boundary position
|
||||
* before the specified position.
|
||||
* @param offset The position to begin searching from
|
||||
* @return The position of the last boundary before "offset"
|
||||
*/
|
||||
int32_t
|
||||
DictionaryBasedBreakIterator::preceding(int32_t offset)
|
||||
{
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (text == NULL || offset > text->endIndex()) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
else if (offset < text->startIndex()) {
|
||||
return text->startIndex();
|
||||
}
|
||||
|
||||
// if we have no cached break positions, or "offset" is outside the
|
||||
// range covered by the cache, we can just call the inherited routine
|
||||
// (which will eventually call other routines in this class that may
|
||||
// refresh the cache)
|
||||
if (cachedBreakPositions == NULL || offset <= cachedBreakPositions[0] ||
|
||||
offset > cachedBreakPositions[numCachedBreakPositions - 1]) {
|
||||
reset();
|
||||
return RuleBasedBreakIterator::preceding(offset);
|
||||
}
|
||||
|
||||
// on the other hand, if "offset" is within the range covered by the cache,
|
||||
// then all we have to do is search the cache for the last break position
|
||||
// before "offset"
|
||||
else {
|
||||
positionInCache = 0;
|
||||
while (positionInCache < numCachedBreakPositions
|
||||
&& offset > cachedBreakPositions[positionInCache])
|
||||
++positionInCache;
|
||||
--positionInCache;
|
||||
text->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return text->getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the first boundary position after
|
||||
* the specified position.
|
||||
* @param offset The position to begin searching forward from
|
||||
* @return The position of the first boundary after "offset"
|
||||
*/
|
||||
int32_t
|
||||
DictionaryBasedBreakIterator::following(int32_t offset)
|
||||
{
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (text == NULL || offset > text->endIndex()) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
else if (offset < text->startIndex()) {
|
||||
return text->startIndex();
|
||||
}
|
||||
|
||||
// if we have no cached break positions, or if "offset" is outside the
|
||||
// range covered by the cache, then dump the cache and call our
|
||||
// inherited following() method. This will call other methods in this
|
||||
// class that may refresh the cache.
|
||||
if (cachedBreakPositions == NULL || offset < cachedBreakPositions[0] ||
|
||||
offset >= cachedBreakPositions[numCachedBreakPositions - 1]) {
|
||||
reset();
|
||||
return RuleBasedBreakIterator::following(offset);
|
||||
}
|
||||
|
||||
// on the other hand, if "offset" is within the range covered by the
|
||||
// cache, then just search the cache for the first break position
|
||||
// after "offset"
|
||||
else {
|
||||
positionInCache = 0;
|
||||
while (positionInCache < numCachedBreakPositions
|
||||
&& offset >= cachedBreakPositions[positionInCache])
|
||||
++positionInCache;
|
||||
text->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return text->getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This is the implementation function for next().
|
||||
*/
|
||||
int32_t
|
||||
DictionaryBasedBreakIterator::handleNext()
|
||||
{
|
||||
// if there are no cached break positions, or if we've just moved
|
||||
// off the end of the range covered by the cache, we have to dump
|
||||
// and possibly regenerate the cache
|
||||
if (cachedBreakPositions == NULL || positionInCache == numCachedBreakPositions - 1) {
|
||||
|
||||
// start by using the inherited handleNext() to find a tentative return
|
||||
// value. dictionaryCharCount tells us how many dictionary characters
|
||||
// we passed over on our way to the tentative return value
|
||||
int32_t startPos = text->getIndex();
|
||||
dictionaryCharCount = 0;
|
||||
int32_t result = RuleBasedBreakIterator::handleNext();
|
||||
|
||||
// if we passed over more than one dictionary character, then we use
|
||||
// divideUpDictionaryRange() to regenerate the cached break positions
|
||||
// for the new range
|
||||
if (dictionaryCharCount > 1 && result - startPos > 1) {
|
||||
divideUpDictionaryRange(startPos, result);
|
||||
}
|
||||
|
||||
// otherwise, the value we got back from the inherited fuction
|
||||
// is our return value, and we can dump the cache
|
||||
else {
|
||||
reset();
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// if the cache of break positions has been regenerated (or existed all
|
||||
// along), then just advance to the next break position in the cache
|
||||
// and return it
|
||||
if (cachedBreakPositions != NULL) {
|
||||
++positionInCache;
|
||||
text->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return cachedBreakPositions[positionInCache];
|
||||
}
|
||||
return -9999; // SHOULD NEVER GET HERE!
|
||||
}
|
||||
|
||||
void
|
||||
DictionaryBasedBreakIterator::reset()
|
||||
{
|
||||
delete [] cachedBreakPositions;
|
||||
cachedBreakPositions = NULL;
|
||||
numCachedBreakPositions = 0;
|
||||
dictionaryCharCount = 0;
|
||||
positionInCache = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* This is the function that actually implements the dictionary-based
|
||||
* algorithm. Given the endpoints of a range of text, it uses the
|
||||
* dictionary to determine the positions of any boundaries in this
|
||||
* range. It stores all the boundary positions it discovers in
|
||||
* cachedBreakPositions so that we only have to do this work once
|
||||
* for each time we enter the range.
|
||||
*/
|
||||
void
|
||||
DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos)
|
||||
{
|
||||
// to avoid casts throughout the rest of this function
|
||||
DictionaryBasedBreakIteratorTables* tables
|
||||
= (DictionaryBasedBreakIteratorTables*)(this->tables);
|
||||
|
||||
// the range we're dividing may begin or end with non-dictionary characters
|
||||
// (i.e., for line breaking, we may have leading or trailing punctuation
|
||||
// that needs to be kept with the word). Seek from the beginning of the
|
||||
// range to the first dictionary character
|
||||
text->setIndex(startPos);
|
||||
UChar c = text->current();
|
||||
int category = tables->lookupCategory(c, this);
|
||||
while (category == IGNORE || !tables->categoryFlags[category]) {
|
||||
c = text->next();
|
||||
category = tables->lookupCategory(c, this);
|
||||
}
|
||||
|
||||
|
||||
// initialize. We maintain two stacks: currentBreakPositions contains
|
||||
// the list of break positions that will be returned if we successfully
|
||||
// finish traversing the whole range now. possibleBreakPositions lists
|
||||
// all other possible word ends we've passed along the way. (Whenever
|
||||
// we reach an error [a sequence of characters that can't begin any word
|
||||
// in the dictionary], we back up, possibly delete some breaks from
|
||||
// currentBreakPositions, move a break from possibleBreakPositions
|
||||
// to currentBreakPositions, and start over from there. This process
|
||||
// continues in this way until we either successfully make it all the way
|
||||
// across the range, or exhaust all of our combinations of break
|
||||
// positions.) wrongBreakPositions is used to keep track of paths we've
|
||||
// tried on previous iterations. As the iterator backs up further and
|
||||
// further, this saves us from having to follow each possible path
|
||||
// through the text all the way to the error (hopefully avoiding many
|
||||
// future recursive calls as well).
|
||||
UStack currentBreakPositions;
|
||||
UStack possibleBreakPositions;
|
||||
UVector wrongBreakPositions;
|
||||
|
||||
// the dictionary is implemented as a trie, which is treated as a state
|
||||
// machine. -1 represents the end of a legal word. Every word in the
|
||||
// dictionary is represented by a path from the root node to -1. A path
|
||||
// that ends in state 0 is an illegal combination of characters.
|
||||
int16_t state = 0;
|
||||
|
||||
// these two variables are used for error handling. We keep track of the
|
||||
// farthest we've gotten through the range being divided, and the combination
|
||||
// of breaks that got us that far. If we use up all possible break
|
||||
// combinations, the text contains an error or a word that's not in the
|
||||
// dictionary. In this case, we "bless" the break positions that got us the
|
||||
// farthest as real break positions, and then start over from scratch with
|
||||
// the character where the error occurred.
|
||||
int32_t farthestEndPoint = text->getIndex();
|
||||
UStack bestBreakPositions;
|
||||
bool_t bestBreakPositionsInitialized = FALSE;
|
||||
|
||||
// initialize (we always exit the loop with a break statement)
|
||||
c = text->current();
|
||||
while (true) {
|
||||
|
||||
// if we can transition to state "-1" from our current state, we're
|
||||
// on the last character of a legal word. Push that position onto
|
||||
// the possible-break-positions stack
|
||||
if (tables->dictionary.at(state, (int32_t)0) == -1) {
|
||||
possibleBreakPositions.push((void*)text->getIndex());
|
||||
}
|
||||
|
||||
// look up the new state to transition to in the dictionary
|
||||
state = tables->dictionary.at(state, c);
|
||||
|
||||
// if the character we're sitting on causes us to transition to
|
||||
// the "end of word" state, then it was a non-dictionary character
|
||||
// and we've successfully traversed the whole range. Drop out
|
||||
// of the loop.
|
||||
if (state == -1) {
|
||||
currentBreakPositions.push((void*)text->getIndex());
|
||||
break;
|
||||
}
|
||||
|
||||
// if the character we're sitting on causes us to transition to
|
||||
// the error state, or if we've gone off the end of the range
|
||||
// without transitioning to the "end of word" state, we've hit
|
||||
// an error...
|
||||
else if (state == 0 || text->getIndex() >= endPos) {
|
||||
|
||||
// if this is the farthest we've gotten, take note of it in
|
||||
// case there's an error in the text
|
||||
if (text->getIndex() > farthestEndPoint) {
|
||||
farthestEndPoint = text->getIndex();
|
||||
bestBreakPositions.removeAllElements();
|
||||
bestBreakPositionsInitialized = TRUE;
|
||||
for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
|
||||
bestBreakPositions.push(currentBreakPositions.elementAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
// wrongBreakPositions is a list of all break positions we've tried starting
|
||||
// that didn't allow us to traverse all the way through the text. Every time
|
||||
// we pop a break position off of currentBreakPositions, we put it into
|
||||
// wrongBreakPositions to avoid trying it again later. If we make it to this
|
||||
// spot, we're either going to back up to a break in possibleBreakPositions
|
||||
// and try starting over from there, or we've exhausted all possible break
|
||||
// positions and are going to do the fallback procedure. This loop prevents
|
||||
// us from messing with anything in possibleBreakPositions that didn't work as
|
||||
// a starting point the last time we tried it (this is to prevent a bunch of
|
||||
// repetitive checks from slowing down some extreme cases)
|
||||
while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
|
||||
possibleBreakPositions.peek())) {
|
||||
possibleBreakPositions.pop();
|
||||
}
|
||||
|
||||
// if we've used up all possible break-position combinations, there's
|
||||
// an error or an unknown word in the text. In this case, we start
|
||||
// over, treating the farthest character we've reached as the beginning
|
||||
// of the range, and "blessing" the break positions that got us that
|
||||
// far as real break positions
|
||||
if (possibleBreakPositions.isEmpty()) {
|
||||
if (bestBreakPositionsInitialized) {
|
||||
currentBreakPositions.removeAllElements();
|
||||
for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
|
||||
currentBreakPositions.push(bestBreakPositions.elementAt(i));
|
||||
}
|
||||
bestBreakPositions.removeAllElements();
|
||||
if (farthestEndPoint < endPos) {
|
||||
text->setIndex(farthestEndPoint + 1);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ((currentBreakPositions.isEmpty()
|
||||
|| (int32_t)currentBreakPositions.peek() != text->getIndex())
|
||||
&& text->getIndex() != startPos) {
|
||||
currentBreakPositions.push((void*)text->getIndex());
|
||||
}
|
||||
text->next();
|
||||
currentBreakPositions.push((void*)text->getIndex());
|
||||
}
|
||||
}
|
||||
|
||||
// if we still have more break positions we can try, then promote the
|
||||
// last break in possibleBreakPositions into currentBreakPositions,
|
||||
// and get rid of all entries in currentBreakPositions that come after
|
||||
// it. Then back up to that position and start over from there (i.e.,
|
||||
// treat that position as the beginning of a new word)
|
||||
else {
|
||||
int32_t temp = (int32_t)possibleBreakPositions.pop();
|
||||
void* temp2 = NULL;
|
||||
while (!currentBreakPositions.isEmpty() && temp <
|
||||
(int32_t)currentBreakPositions.peek()) {
|
||||
temp2 = currentBreakPositions.pop();
|
||||
wrongBreakPositions.addElement(temp2);
|
||||
}
|
||||
currentBreakPositions.push((void*)temp);
|
||||
text->setIndex((int32_t)currentBreakPositions.peek());
|
||||
}
|
||||
|
||||
// re-sync "c" for the next go-round, and drop out of the loop if
|
||||
// we've made it off the end of the range
|
||||
c = text->current();
|
||||
if (text->getIndex() >= endPos) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// if we didn't hit any exceptional conditions on this last iteration,
|
||||
// just advance to the next character and loop
|
||||
else {
|
||||
c = text->next();
|
||||
}
|
||||
}
|
||||
|
||||
// dump the last break position in the list, and replace it with the actual
|
||||
// end of the range (which may be the same character, or may be further on
|
||||
// because the range actually ended with non-dictionary characters we want to
|
||||
// keep with the word)
|
||||
if (!currentBreakPositions.isEmpty()) {
|
||||
currentBreakPositions.pop();
|
||||
}
|
||||
currentBreakPositions.push((void*)endPos);
|
||||
|
||||
// create a regular array to hold the break positions and copy
|
||||
// the break positions from the stack to the array (in addition,
|
||||
// our starting position goes into this array as a break position).
|
||||
// This array becomes the cache of break positions used by next()
|
||||
// and previous(), so this is where we actually refresh the cache.
|
||||
cachedBreakPositions = new int32_t[currentBreakPositions.size() + 1];
|
||||
numCachedBreakPositions = currentBreakPositions.size() + 1;
|
||||
cachedBreakPositions[0] = startPos;
|
||||
|
||||
for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
|
||||
cachedBreakPositions[i + 1] = (int32_t)currentBreakPositions.elementAt(i);
|
||||
}
|
||||
positionInCache = 0;
|
||||
}
|
201
icu4c/source/i18n/dbbi.h
Normal file
201
icu4c/source/i18n/dbbi.h
Normal file
@ -0,0 +1,201 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef DBBI_H
|
||||
#define DBBI_H
|
||||
|
||||
#include "rbbi.h"
|
||||
|
||||
/**
|
||||
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
|
||||
* to further subdivide ranges of text beyond what is possible using just the
|
||||
* state-table-based algorithm. This is necessary, for example, to handle
|
||||
* word and line breaking in Thai, which doesn't use spaces between words. The
|
||||
* state-table-based algorithm used by RuleBasedBreakIterator is used to divide
|
||||
* up text as far as possible, and then contiguous ranges of letters are
|
||||
* repeatedly compared against a list of known words (i.e., the dictionary)
|
||||
* to divide them up into words.
|
||||
*
|
||||
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
|
||||
* but adds one more special substitution name: <dictionary>. This substitution
|
||||
* name is used to identify characters in words in the dictionary. The idea is that
|
||||
* if the iterator passes over a chunk of text that includes two or more characters
|
||||
* in a row that are included in <dictionary>, it goes back through that range and
|
||||
* derives additional break positions (if possible) using the dictionary.
|
||||
*
|
||||
* DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
|
||||
* file. It follows a prescribed search path to locate the dictionary (right now,
|
||||
* it looks for it in /com/ibm/text/resources in each directory in the classpath,
|
||||
* and won't find it in JAR files, but this location is likely to change). The
|
||||
* dictionary file is in a serialized binary format. We have a very primitive (and
|
||||
* slow) BuildDictionaryFile utility for creating dictionary files, but aren't
|
||||
* currently making it public. Contact us for help.
|
||||
*/
|
||||
class U_I18N_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {
|
||||
|
||||
private:
|
||||
/**
|
||||
* a temporary hiding place for the number of dictionary characters in the
|
||||
* last range passed over by next()
|
||||
*/
|
||||
int32_t dictionaryCharCount;
|
||||
|
||||
/**
|
||||
* when a range of characters is divided up using the dictionary, the break
|
||||
* positions that are discovered are stored here, preventing us from having
|
||||
* to use either the dictionary or the state table again until the iterator
|
||||
* leaves this range of text
|
||||
*/
|
||||
int32_t* cachedBreakPositions;
|
||||
|
||||
/**
|
||||
* The number of elements in cachedBreakPositions
|
||||
*/
|
||||
int32_t numCachedBreakPositions;
|
||||
|
||||
/**
|
||||
* if cachedBreakPositions is not null, this indicates which item in the
|
||||
* cache the current iteration position refers to
|
||||
*/
|
||||
int32_t positionInCache;
|
||||
|
||||
/**
|
||||
* Class ID
|
||||
*/
|
||||
static char fgClassID;
|
||||
|
||||
public:
|
||||
//=======================================================================
|
||||
// constructors
|
||||
//=======================================================================
|
||||
|
||||
DictionaryBasedBreakIterator(const void* tablesImage, char* dictionaryFilename);
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~DictionaryBasedBreakIterator();
|
||||
|
||||
/**
|
||||
* Assignment operator. Sets this iterator to have the same behavior,
|
||||
* and iterate over the same text, as the one passed in.
|
||||
*/
|
||||
DictionaryBasedBreakIterator& operator=(const DictionaryBasedBreakIterator& that);
|
||||
|
||||
/**
|
||||
* Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior, and iterating over the same text, as this one.
|
||||
*/
|
||||
virtual BreakIterator* clone() const;
|
||||
|
||||
//=======================================================================
|
||||
// BreakIterator overrides
|
||||
//=======================================================================
|
||||
/**
|
||||
* Advances the iterator backwards, to the last boundary preceding this one.
|
||||
* @return The position of the last boundary position preceding this one.
|
||||
*/
|
||||
virtual int32_t previous();
|
||||
|
||||
/**
|
||||
* Sets the iterator to refer to the first boundary position following
|
||||
* the specified position.
|
||||
* @param offset The position from which to begin searching for a break position.
|
||||
* @return The position of the first break after the current position.
|
||||
*/
|
||||
virtual int32_t following(int32_t offset);
|
||||
|
||||
/**
|
||||
* Sets the iterator to refer to the last boundary position before the
|
||||
* specified position.
|
||||
* @param offset The position to begin searching for a break from.
|
||||
* @return The position of the last boundary before the starting position.
|
||||
*/
|
||||
virtual int32_t preceding(int32_t offset);
|
||||
|
||||
/**
|
||||
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
|
||||
* This method is to implement a simple version of RTTI, since not all
|
||||
* C++ compilers support genuine RTTI. Polymorphic operator==() and
|
||||
* clone() methods call this method.
|
||||
*
|
||||
* @return The class ID for this object. All objects of a
|
||||
* given class have the same class ID. Objects of
|
||||
* other classes have different class IDs.
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
|
||||
/**
|
||||
* Returns the class ID for this class. This is useful only for
|
||||
* comparing to a return value from getDynamicClassID(). For example:
|
||||
*
|
||||
* Base* polymorphic_pointer = createPolymorphicObject();
|
||||
* if (polymorphic_pointer->getDynamicClassID() ==
|
||||
* Derived::getStaticClassID()) ...
|
||||
*
|
||||
* @return The class ID for all objects of this class.
|
||||
*/
|
||||
static UClassID getStaticClassID();
|
||||
|
||||
protected:
|
||||
//=======================================================================
|
||||
// implementation
|
||||
//=======================================================================
|
||||
/**
|
||||
* This method is the actual implementation of the next() method. All iteration
|
||||
* vectors through here. This method initializes the state machine to state 1
|
||||
* and advances through the text character by character until we reach the end
|
||||
* of the text or the state machine transitions to state 0. We update our return
|
||||
* value every time the state machine passes through a possible end state.
|
||||
*/
|
||||
virtual int32_t handleNext();
|
||||
|
||||
/**
|
||||
* dumps the cache of break positions (usually in response to a change in
|
||||
* position of some sort)
|
||||
*/
|
||||
virtual void reset();
|
||||
|
||||
private:
|
||||
/**
|
||||
* This is the function that actually implements the dictionary-based
|
||||
* algorithm. Given the endpoints of a range of text, it uses the
|
||||
* dictionary to determine the positions of any boundaries in this
|
||||
* range. It stores all the boundary positions it discovers in
|
||||
* cachedBreakPositions so that we only have to do this work once
|
||||
* for each time we enter the range.
|
||||
*/
|
||||
void divideUpDictionaryRange(int32_t startPos, int32_t endPos);
|
||||
|
||||
/**
|
||||
* Used by the tables object to increment the count of dictionary characters
|
||||
* during iteration
|
||||
*/
|
||||
void bumpDictionaryCharCount();
|
||||
|
||||
friend class DictionaryBasedBreakIteratorTables;
|
||||
};
|
||||
|
||||
/**
 * Returns the class ID of this object's class (poor man's RTTI).
 *
 * @return The same value as DictionaryBasedBreakIterator::getStaticClassID().
 */
inline UClassID DictionaryBasedBreakIterator::getDynamicClassID() const {
    // Report this class's own ID, not the base class's.  Objects of
    // different classes are documented to have different class IDs, and
    // callers that compare getDynamicClassID() values rely on that to tell
    // a dictionary-based iterator apart from a plain rule-based one.
    return DictionaryBasedBreakIterator::getStaticClassID();
}
|
||||
|
||||
inline UClassID DictionaryBasedBreakIterator::getStaticClassID() {
    // The ID is the address of the class-wide fgClassID member.
    char* const classIDAddress = &fgClassID;
    return (UClassID)classIDAddress;
}
|
||||
|
||||
inline void DictionaryBasedBreakIterator::bumpDictionaryCharCount() {
    // one more dictionary character has been passed over
    dictionaryCharCount += 1;
}
|
||||
|
||||
#endif
|
64
icu4c/source/i18n/dbbi_bld.cpp
Normal file
64
icu4c/source/i18n/dbbi_bld.cpp
Normal file
@ -0,0 +1,64 @@
|
||||
/**
|
||||
* The Builder class for DictionaryBasedBreakIterator inherits almost all of
|
||||
* its functionality from the Builder class for RuleBasedBreakIterator, but
|
||||
* extends it with extra logic to handle the "<dictionary>" token
|
||||
*/
|
||||
protected class Builder extends RuleBasedBreakIterator.Builder {
|
||||
|
||||
/**
|
||||
* A CharSet that contains all the characters represented in the dictionary
|
||||
*/
|
||||
private CharSet dictionaryChars = new CharSet();
|
||||
private String dictionaryExpression = "";
|
||||
|
||||
/**
|
||||
* No special initialization
|
||||
*/
|
||||
public Builder() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We override handleSpecialSubstitution() to add logic to handle
|
||||
* the <dictionary> tag. If we see a substitution named "<dictionary>",
|
||||
* parse the substitution expression and store the result in
|
||||
* dictionaryChars.
|
||||
*/
|
||||
protected void handleSpecialSubstitution(String replace, String replaceWith,
|
||||
int startPos, String description) {
|
||||
super.handleSpecialSubstitution(replace, replaceWith, startPos, description);
|
||||
|
||||
if (replace.equals("<dictionary>")) {
|
||||
if (replaceWith.charAt(0) == '(') {
|
||||
error("Dictionary group can't be enclosed in (", startPos, description);
|
||||
}
|
||||
dictionaryExpression = replaceWith;
|
||||
dictionaryChars = CharSet.parseString(replaceWith);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The other half of the logic to handle the dictionary characters happens here.
|
||||
* After the inherited builder has derived the real character categories, we
|
||||
* set up the categoryFlags array in the iterator. This array contains "true"
|
||||
* for every character category that includes a dictionary character.
|
||||
*/
|
||||
protected void buildCharCategories(Vector tempRuleList) {
|
||||
super.buildCharCategories(tempRuleList);
|
||||
|
||||
categoryFlags = new boolean[categories.size()];
|
||||
for (int i = 0; i < categories.size(); i++) {
|
||||
CharSet cs = (CharSet)categories.elementAt(i);
|
||||
if (!(cs.intersection(dictionaryChars).empty())) {
|
||||
categoryFlags[i] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This function is actually called by RuleBasedBreakIterator.buildCharCategories(),
|
||||
// which is called by the function above. This gives us a way to create a separate
|
||||
// character category for the dictionary characters even when RuleBasedBreakIterator
|
||||
// isn't making a distinction
|
||||
protected void mungeExpressionList(Hashtable expressions) {
|
||||
expressions.put(dictionaryExpression, dictionaryChars);
|
||||
}
|
||||
}
|
64
icu4c/source/i18n/dbbi_bld.h
Normal file
64
icu4c/source/i18n/dbbi_bld.h
Normal file
@ -0,0 +1,64 @@
|
||||
/**
|
||||
* The Builder class for DictionaryBasedBreakIterator inherits almost all of
|
||||
* its functionality from the Builder class for RuleBasedBreakIterator, but
|
||||
* extends it with extra logic to handle the "<dictionary>" token
|
||||
*/
|
||||
protected class Builder extends RuleBasedBreakIterator.Builder {
|
||||
|
||||
/**
|
||||
* A CharSet that contains all the characters represented in the dictionary
|
||||
*/
|
||||
private CharSet dictionaryChars = new CharSet();
|
||||
private String dictionaryExpression = "";
|
||||
|
||||
/**
|
||||
* No special initialization
|
||||
*/
|
||||
public Builder() {
|
||||
}
|
||||
|
||||
/**
|
||||
* We override handleSpecialSubstitution() to add logic to handle
|
||||
* the <dictionary> tag. If we see a substitution named "<dictionary>",
|
||||
* parse the substitution expression and store the result in
|
||||
* dictionaryChars.
|
||||
*/
|
||||
protected void handleSpecialSubstitution(String replace, String replaceWith,
|
||||
int startPos, String description) {
|
||||
super.handleSpecialSubstitution(replace, replaceWith, startPos, description);
|
||||
|
||||
if (replace.equals("<dictionary>")) {
|
||||
if (replaceWith.charAt(0) == '(') {
|
||||
error("Dictionary group can't be enclosed in (", startPos, description);
|
||||
}
|
||||
dictionaryExpression = replaceWith;
|
||||
dictionaryChars = CharSet.parseString(replaceWith);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The other half of the logic to handle the dictionary characters happens here.
|
||||
* After the inherited builder has derived the real character categories, we
|
||||
* set up the categoryFlags array in the iterator. This array contains "true"
|
||||
* for every character category that includes a dictionary character.
|
||||
*/
|
||||
protected void buildCharCategories(Vector tempRuleList) {
|
||||
super.buildCharCategories(tempRuleList);
|
||||
|
||||
categoryFlags = new boolean[categories.size()];
|
||||
for (int i = 0; i < categories.size(); i++) {
|
||||
CharSet cs = (CharSet)categories.elementAt(i);
|
||||
if (!(cs.intersection(dictionaryChars).empty())) {
|
||||
categoryFlags[i] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This function is actually called by RuleBasedBreakIterator.buildCharCategories(),
|
||||
// which is called by the function above. This gives us a way to create a separate
|
||||
// character category for the dictionary characters even when RuleBasedBreakIterator
|
||||
// isn't making a distinction
|
||||
protected void mungeExpressionList(Hashtable expressions) {
|
||||
expressions.put(dictionaryExpression, dictionaryChars);
|
||||
}
|
||||
}
|
59
icu4c/source/i18n/dbbi_tbl.cpp
Normal file
59
icu4c/source/i18n/dbbi_tbl.cpp
Normal file
@ -0,0 +1,59 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "dbbi_tbl.h"
|
||||
#include "dbbi.h"
|
||||
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
|
||||
/**
 * Constructs the tables from a tables image and a dictionary file name.
 * The base class is initialized from the image, the dictionary from the
 * file name, and categoryFlags is pointed at the flag array located inside
 * the image.
 *
 * @param tablesImage        Pointer to the beginning of the tables image.
 * @param dictionaryFilename Passed straight to the BreakDictionary member.
 */
DictionaryBasedBreakIteratorTables::DictionaryBasedBreakIteratorTables(
        const void* tablesImage,
        char* dictionaryFilename)
: RuleBasedBreakIteratorTables(tablesImage),
  dictionary(dictionaryFilename)
{
    // The index entries are read as int32_t values rather than being pulled
    // out of a void* array and cast down: a pointer-to-int32_t cast loses
    // information (and is rejected by many compilers) wherever pointers are
    // wider than 32 bits, and a void* array also walks the image with the
    // wrong stride there.
    // NOTE(review): assumes the image stores its index as 32-bit offsets,
    // as the original (int32_t) casts imply -- confirm against the tool
    // that writes the image.
    const int8_t* imageBytes = (const int8_t*)tablesImage;
    const int32_t* tablesIdx = (const int32_t*)tablesImage;

    // We know the offset into the memory image where the DBBI stuff starts
    // is stored in element 8 of the index.  There should be a way for the
    // RBBI constructor to give us this, but there isn't a good one.
    const int8_t* dbbiImage = imageBytes + tablesIdx[8];
    const int32_t* dbbiIdx = (const int32_t*)dbbiImage;

    // element 0 of the DBBI index is the offset (from the start of the DBBI
    // section) of the category flags
    categoryFlags = (int8_t*)(dbbiImage + dbbiIdx[0]);
}
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
DictionaryBasedBreakIteratorTables::~DictionaryBasedBreakIteratorTables() {
    // the flag array is released only when ownTables is set
    if (!ownTables) {
        return;
    }
    delete [] categoryFlags;
}
|
||||
|
||||
int32_t
DictionaryBasedBreakIteratorTables::lookupCategory(UChar c,
                                                   BreakIterator* bi) const {
    // The inherited lookupCategory() does the real work.  This override
    // exists only to keep track of whether we've passed over any dictionary
    // characters: when the category that comes back is one of those flagged
    // in categoryFlags, the iterator's dictionary-character count is bumped.
    const int32_t category = RuleBasedBreakIteratorTables::lookupCategory(c, bi);

    // ignore characters never count as dictionary characters
    if (category == RuleBasedBreakIterator::IGNORE) {
        return category;
    }

    if (categoryFlags[category]) {
        ((DictionaryBasedBreakIterator*)bi)->bumpDictionaryCharCount();
    }
    return category;
}
|
79
icu4c/source/i18n/dbbi_tbl.h
Normal file
79
icu4c/source/i18n/dbbi_tbl.h
Normal file
@ -0,0 +1,79 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef DBBI_TBL_H
|
||||
#define DBBI_TBL_H
|
||||
|
||||
#include "rbbi_tbl.h"
|
||||
#include "brkdict.h"
|
||||
|
||||
/**
|
||||
* This subclass of RuleBasedBreakIteratorTables contains the additional
|
||||
* static data that is used by DictionaryBasedBreakIterator. This comprises
|
||||
* the dictionary itself and an array of flags that indicate which characters
|
||||
* are in the dictionary.
|
||||
*
|
||||
* @author Richard Gillam
|
||||
*/
|
||||
class DictionaryBasedBreakIteratorTables : public RuleBasedBreakIteratorTables {
|
||||
|
||||
private:
|
||||
/**
|
||||
* a list of known words that is used to divide up contiguous ranges of letters,
|
||||
* stored in a compressed, indexed, format that offers fast access
|
||||
*/
|
||||
BreakDictionary dictionary;
|
||||
|
||||
/**
|
||||
* a list of flags indicating which character categories are contained in
|
||||
* the dictionary file (this is used to determine which ranges of characters
|
||||
* to apply the dictionary to)
|
||||
*/
|
||||
int8_t* categoryFlags;
|
||||
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
|
||||
DictionaryBasedBreakIteratorTables(const void* tablesImage,
|
||||
char* dictionaryFilename);
|
||||
|
||||
/**
|
||||
* The copy constructor is declared private and not implemented.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
*/
|
||||
DictionaryBasedBreakIteratorTables(const DictionaryBasedBreakIteratorTables& that);
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~DictionaryBasedBreakIteratorTables();
|
||||
|
||||
/**
|
||||
* The assignment operator is declared private and not implemented.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
*/
|
||||
DictionaryBasedBreakIteratorTables& operator=(
|
||||
const DictionaryBasedBreakIteratorTables& that);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Looks up a character's category (i.e., its category for breaking purposes,
|
||||
* not its Unicode category)
|
||||
*/
|
||||
virtual int32_t lookupCategory(UChar c, BreakIterator* bi) const;
|
||||
|
||||
friend class DictionaryBasedBreakIterator;
|
||||
};
|
||||
|
||||
#endif
|
@ -69,7 +69,7 @@ LINK32=link.exe
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /YX /FD /GZ /c
|
||||
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /FR /YX /FD /GZ /c
|
||||
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "_WINDOWS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "UDATA_MAP" /FR /YX /FD /GZ /c
|
||||
# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
|
||||
# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
|
||||
# ADD BASE RSC /l 0x409 /d "_DEBUG"
|
||||
@ -92,6 +92,10 @@ LINK32=link.exe
|
||||
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\brkdict.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\brkiter.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -100,10 +104,6 @@ SOURCE=.\calendar.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\chbkdat.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\choicfmt.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -132,6 +132,14 @@ SOURCE=.\datefmt.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\dbbi.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\dbbi_tbl.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\dcfmtsym.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -161,10 +169,6 @@ SOURCE=.\hextouni.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\lnbkdat.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\mergecol.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -181,6 +185,14 @@ SOURCE=.\ptnentry.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbi.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbi_tbl.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -205,18 +217,10 @@ SOURCE=.\simpletz.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\simtxbd.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\smpdtfmt.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\snbkdat.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\sortkey.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -241,10 +245,6 @@ SOURCE=.\translit.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\txtbdat.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\txtbdry.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -269,10 +269,6 @@ SOURCE=.\umsg.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicdcm.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unifltlg.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -291,20 +287,16 @@ SOURCE=.\unitohex.cpp
|
||||
|
||||
SOURCE=.\unum.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\wdbkdat.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\wdbktbl.cpp
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Header Files"
|
||||
|
||||
# PROP Default_Filter "h;hpp;hxx;hm;inl"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\brkdict.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\brkiter.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
@ -502,6 +494,14 @@ InputPath=.\unicode\datefmt.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\dbbi.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\dbbi_tbl.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\dcfmtsym.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
@ -811,7 +811,7 @@ SOURCE=.\rbbi.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbi_bld.h
|
||||
SOURCE=.\rbbi_tbl.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
@ -885,10 +885,6 @@ InputPath=.\unicode\simpletz.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\simtxbd.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\smpdtfmt.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
@ -943,10 +939,6 @@ InputPath=.\unicode\sortkey.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\spclmap.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\tables.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -1036,10 +1028,6 @@ InputPath=.\unicode\translit.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\txtbdat.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\txtbdry.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
@ -1179,10 +1167,6 @@ InputPath=.\unicode\umsg.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicdcm.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\unifilt.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
@ -1319,10 +1303,6 @@ InputPath=.\unicode\unum.h
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\wdbktbl.h
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Resource Files"
|
||||
|
@ -4,98 +4,237 @@
|
||||
* and others. All rights reserved. *
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 10/22/99 alan Creation.
|
||||
* 11/11/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "rbbi.h"
|
||||
#include "rbbi_bld.h"
|
||||
#include "schriter.h"
|
||||
|
||||
/**
|
||||
* A token used as a character-category value to identify ignore characters
|
||||
*/
|
||||
int8_t RuleBasedBreakIterator::IGNORE = -1;
|
||||
int8_t
|
||||
RuleBasedBreakIterator::IGNORE = -1;
|
||||
|
||||
/**
|
||||
* The state number of the starting state
|
||||
*/
|
||||
int16_t RuleBasedBreakIterator::START_STATE = 1;
|
||||
int16_t
|
||||
RuleBasedBreakIterator::START_STATE = 1;
|
||||
|
||||
/**
|
||||
* The state-transition value indicating "stop"
|
||||
*/
|
||||
int16_t RuleBasedBreakIterator::STOP_STATE = 0;
|
||||
int16_t
|
||||
RuleBasedBreakIterator::STOP_STATE = 0;
|
||||
|
||||
/**
|
||||
* Class ID. (value is irrelevant; address is important)
|
||||
*/
|
||||
char
|
||||
RuleBasedBreakIterator::fgClassID = 0;
|
||||
|
||||
//=======================================================================
|
||||
// constructors
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Constructs a RuleBasedBreakIterator according to the description
|
||||
* provided. If the description is malformed, throws an
|
||||
* IllegalArgumentException. Normally, instead of constructing a
|
||||
* RuleBasedBreakIterator directory, you'll use the factory methods
|
||||
* on BreakIterator to create one indirectly from a description
|
||||
* in the framework's resource files. You'd use this when you want
|
||||
* special behavior not provided by the built-in iterators.
|
||||
* Constructs a RuleBasedBreakIterator that uses the already-created
|
||||
* tables object that is passed in as a parameter.
|
||||
*/
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator(const UnicodeString& description) {
|
||||
this.description = description;
|
||||
|
||||
// the actual work is done by the Builder class
|
||||
Builder builder;
|
||||
builder.buildBreakIterator(*this, description);
|
||||
// Builds an iterator around an already-created tables object.  The iterator
// starts out with no text, and (unlike the image-based constructor) this
// constructor does not itself call addReference() on the tables.
RuleBasedBreakIterator::RuleBasedBreakIterator(RuleBasedBreakIteratorTables* tables)
    : tables(tables)
    , text(NULL)
{
}
|
||||
|
||||
// This constructor uses the udata interface to create a BreakIterator whose
|
||||
// internal tables live in a memory-mapped file. "image" is a pointer to the
|
||||
// beginning of that file.
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator(const void* image)
|
||||
: tables(image != NULL ? new RuleBasedBreakIteratorTables(image) : NULL),
|
||||
text(NULL)
|
||||
{
|
||||
if (tables != NULL)
|
||||
tables->addReference();
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor. Will produce a collator with the same behavior,
|
||||
* and which iterates over the same text, as the one passed in.
|
||||
*/
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& that)
|
||||
: tables(that.tables),
|
||||
text(that.text->clone())
|
||||
{
|
||||
tables->addReference();
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
/**
|
||||
* Clones this iterator.
|
||||
* @return A newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior as this one.
|
||||
* Destructor
|
||||
*/
|
||||
RuleBasedBreakIterator* RuleBasedBreakIterator::clone(void) const {
|
||||
RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    // the text iterator is owned by this object
    delete text;

    // tables is NULL when the iterator was constructed from a NULL image,
    // in which case there is no reference to give back
    if (tables != NULL) {
        tables->removeReference();
    }
}
|
||||
|
||||
/**
|
||||
* Assignment operator. Sets this iterator to have the same behavior,
|
||||
* and iterate over the same text, as the one passed in.
|
||||
*/
|
||||
RuleBasedBreakIterator&
|
||||
RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
||||
delete text;
|
||||
text = that.text->clone();
|
||||
|
||||
tables->removeReference();
|
||||
tables = that.tables;
|
||||
tables->addReference();
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior, and iterating over the same text, as this one.
|
||||
*/
|
||||
BreakIterator*
|
||||
RuleBasedBreakIterator::clone(void) const {
|
||||
return new RuleBasedBreakIterator(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if both BreakIterators are of the same class, have the same
|
||||
* rules, and iterate over the same text.
|
||||
* Equality operator. Returns TRUE if both BreakIterators are of the
|
||||
* same class, have the same behavior, and iterate over the same text.
|
||||
*/
|
||||
bool_t RuleBasedBreakIterator::operator==(const RuleBasedBreakIterator& that) {
|
||||
return description.equals(((RuleBasedBreakIterator)that).description)
|
||||
&& text.equals(((RuleBasedBreakIterator)that).text);
|
||||
bool_t
RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    // objects of different classes are never equal
    if (that.getDynamicClassID() != getDynamicClassID())
        return FALSE;

    const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&)that;

    // The text matches when both sides share the same iterator (this also
    // covers "both NULL") or when both have an iterator and the two compare
    // equal.  An iterator with text never equals one without; the NULL
    // checks also keep us from dereferencing a missing iterator.
    if (that2.text != text
        && (that2.text == NULL || text == NULL || !(*that2.text == *text)))
        return FALSE;

    // same rule for the tables
    return that2.tables == tables
        || (that2.tables != NULL && tables != NULL && *that2.tables == *tables);
}
|
||||
|
||||
/**
|
||||
* Compute a hash code for this BreakIterator
|
||||
* @return A hash code
|
||||
*/
|
||||
int32_t
|
||||
RuleBasedBreakIterator::hashCode(void) const {
|
||||
return tables->hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the description used to create this iterator
|
||||
*/
|
||||
UnicodeString RuleBasedBreakIterator::toString(void) {
|
||||
return description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute a hashcode for this BreakIterator
|
||||
* @return A hash code
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::hashCode(void) {
|
||||
return description.hashCode();
|
||||
const UnicodeString&
|
||||
RuleBasedBreakIterator::getRules() const {
|
||||
return tables->getRules();
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// BreakIterator overrides
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Return a CharacterIterator over the text being analyzed. This version
|
||||
* of this method returns the actual CharacterIterator we're using internally.
|
||||
* Changing the state of this iterator can have undefined consequences. If
|
||||
* you need to change it, clone it first.
|
||||
* @return An iterator over the text being analyzed.
|
||||
*/
|
||||
const CharacterIterator&
|
||||
RuleBasedBreakIterator::getText() const {
|
||||
RuleBasedBreakIterator* nonConstThis = (RuleBasedBreakIterator*)this;
|
||||
|
||||
// The iterator is initialized pointing to no text at all, so if this
|
||||
// function is called while we're in that state, we have to fudge an
|
||||
// an iterator to return.
|
||||
if (nonConstThis->text == NULL)
|
||||
nonConstThis->text = new StringCharacterIterator("");
|
||||
return *nonConstThis->text;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a newly-created CharacterIterator that the caller is to take
|
||||
* ownership of.
|
||||
* THIS FUNCTION SHOULD NOT BE HERE. IT'S HERE BECAUSE BreakIterator DEFINES
|
||||
* IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT. IT SHOULD BE REMOVED
|
||||
* FROM *BOTH* CLASSES.
|
||||
*/
|
||||
CharacterIterator*
|
||||
RuleBasedBreakIterator::createText() const {
|
||||
if (text == NULL)
|
||||
return new StringCharacterIterator("");
|
||||
else
|
||||
return text->clone();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText An iterator over the text to analyze.
|
||||
*/
|
||||
void
|
||||
RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
|
||||
reset();
|
||||
delete text;
|
||||
text = newText;
|
||||
text->first();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText An iterator over the text to analyze.
|
||||
*/
|
||||
void
|
||||
RuleBasedBreakIterator::setText(const UnicodeString& newText) {
|
||||
reset();
|
||||
if (text != NULL && text->getDynamicClassID()
|
||||
== StringCharacterIterator::getStaticClassID()) {
|
||||
((StringCharacterIterator*)text)->setText(newText);
|
||||
}
|
||||
else {
|
||||
delete text;
|
||||
text = new StringCharacterIterator(newText);
|
||||
text->first();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText The text to analyze.
|
||||
* THIS FUNCTION SHOULD NOT BE HERE. IT'S HERE BECAUSE BreakIterator DEFINES
|
||||
* IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT. IT SHOULD BE REMOVED
|
||||
* FROM *BOTH* CLASSES.
|
||||
*/
|
||||
void
|
||||
RuleBasedBreakIterator::setText(const UnicodeString* newText) {
|
||||
setText(*newText);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the beginning of the text.
|
||||
* (i.e., the CharacterIterator's starting offset).
|
||||
* @return The offset of the beginning of the text.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::first(void) {
|
||||
CharacterIterator t = getText();
|
||||
reset();
|
||||
if (text == NULL)
|
||||
return BreakIterator::DONE;
|
||||
|
||||
t.first();
|
||||
return t.getIndex();
|
||||
text->first();
|
||||
return text->getIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -104,12 +243,16 @@ int32_t RuleBasedBreakIterator::first(void) {
|
||||
* @return The text's past-the-end offset.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::last(void) {
|
||||
CharacterIterator t = getText();
|
||||
|
||||
reset();
|
||||
if (text == NULL)
|
||||
return BreakIterator::DONE;
|
||||
|
||||
// I'm not sure why, but t.last() returns the offset of the last character,
|
||||
// rather than the past-the-end offset
|
||||
t.setIndex(t.getEndIndex());
|
||||
return t.getIndex();
|
||||
|
||||
int32_t pos = text->endIndex();
|
||||
text->setIndex(pos);
|
||||
return pos;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -148,9 +291,8 @@ int32_t RuleBasedBreakIterator::next(void) {
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::previous(void) {
|
||||
// if we're already sitting at the beginning of the text, return DONE
|
||||
CharacterIterator text = getText();
|
||||
if (current() == text.getBeginIndex())
|
||||
return BreakIterator.DONE;
|
||||
if (text == NULL || current() == text->startIndex())
|
||||
return BreakIterator::DONE;
|
||||
|
||||
// set things up. handlePrevious() will back us up to some valid
|
||||
// break position before the current position (we back our internal
|
||||
@ -158,21 +300,21 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
||||
// the current position), but not necessarily the last one before
|
||||
// where we started
|
||||
int32_t start = current();
|
||||
text.previous();
|
||||
text->previous();
|
||||
int32_t lastResult = handlePrevious();
|
||||
int32_t result = lastResult;
|
||||
|
||||
// iterate forward from the known break position until we pass our
|
||||
// starting point. The last break position before the starting
|
||||
// point is our return value
|
||||
while (result != BreakIterator.DONE && result < start) {
|
||||
while (result != BreakIterator::DONE && result < start) {
|
||||
lastResult = result;
|
||||
result = handleNext();
|
||||
}
|
||||
|
||||
// set the current iteration position to be the last break position
|
||||
// before where we started, and then return that value
|
||||
text.setIndex(lastResult);
|
||||
text->setIndex(lastResult);
|
||||
return lastResult;
|
||||
}
|
||||
|
||||
@ -184,16 +326,20 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE
|
||||
CharacterIterator text = getText();
|
||||
if (offset == text.getEndIndex())
|
||||
return BreakIterator.DONE;
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (text == NULL || offset >= text->endIndex()) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
else if (offset < text->startIndex()) {
|
||||
return text->startIndex();
|
||||
}
|
||||
|
||||
// otherwise, set our internal iteration position (temporarily)
|
||||
// to the position passed in. If this is the _beginning_ position,
|
||||
// then we can just use next() to get our return value
|
||||
text.setIndex(offset);
|
||||
if (offset == text.getBeginIndex())
|
||||
text->setIndex(offset);
|
||||
if (offset == text->startIndex())
|
||||
return handleNext();
|
||||
|
||||
// otherwise, we have to sync up first. Use handlePrevious() to back
|
||||
@ -204,7 +350,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
||||
// from here until we've passed the starting position. The position
|
||||
// we stop on will be the first break position after the specified one.
|
||||
int32_t result = handlePrevious();
|
||||
while (result != BreakIterator.DONE && result <= offset)
|
||||
while (result != BreakIterator::DONE && result <= offset)
|
||||
result = handleNext();
|
||||
return result;
|
||||
}
|
||||
@ -216,11 +362,20 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
||||
* @return The position of the last boundary before the starting position.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (text == NULL || offset > text->endIndex()) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
else if (offset < text->startIndex()) {
|
||||
return text->startIndex();
|
||||
}
|
||||
|
||||
// if we start by updating the current iteration position to the
|
||||
// position specified by the caller, we can just use previous()
|
||||
// to carry out this operation
|
||||
CharacterIterator text = getText();
|
||||
text.setIndex(offset);
|
||||
text->setIndex(offset);
|
||||
return previous();
|
||||
}
|
||||
|
||||
@ -232,10 +387,15 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
|
||||
* @return True if "offset" is a boundary position.
|
||||
*/
|
||||
bool_t RuleBasedBreakIterator::isBoundary(int32_t offset) {
|
||||
// 0 is always a boundary position (I suspect this code is wrong; I think
|
||||
// we're supposed to be comparing "offset" against text.getBeginIndex(). )
|
||||
if (offset == 0)
|
||||
// the beginning index of the iterator is always a boundary position by definition
|
||||
if (text == NULL || offset == text->startIndex()) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// out-of-range indexes are never boundary positions
|
||||
else if (offset < text->startIndex() || offset > text->endIndex()) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// otherwise, we can use following() on the position before the specified
|
||||
// one and return true if the position we get back is the one the user
|
||||
@ -248,38 +408,14 @@ bool_t RuleBasedBreakIterator::isBoundary(int32_t offset) {
|
||||
* Returns the current iteration position.
|
||||
* @return The current iteration position.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::current(void) {
|
||||
return getText().getIndex();
|
||||
int32_t RuleBasedBreakIterator::current(void) const {
|
||||
return (text != NULL) ? text->getIndex() : BreakIterator::DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a CharacterIterator over the text being analyzed. This version
|
||||
* of this method returns the actual CharacterIterator we're using internally.
|
||||
* Changing the state of this iterator can have undefined consequences. If
|
||||
* you need to change it, clone it first.
|
||||
* @return An iterator over the text being analyzed.
|
||||
*/
|
||||
CharacterIterator RuleBasedBreakIterator::getText(void) {
|
||||
// The iterator is initialized pointing to no text at all, so if this
|
||||
// function is called while we're in that state, we have to fudge an
|
||||
// iterator to return.
|
||||
if (text == 0)
|
||||
text = new StringCharacterIterator("");
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText An iterator over the text to analyze.
|
||||
*/
|
||||
void RuleBasedBreakIterator::setText(CharacterIterator newText) {
|
||||
text = newText;
|
||||
text.first();
|
||||
}
|
||||
//=======================================================================
|
||||
// implementation
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* This method is the actual implementation of the next() method. All iteration
|
||||
* vectors through here. This method initializes the state machine to state 1
|
||||
@ -289,38 +425,82 @@ void RuleBasedBreakIterator::setText(CharacterIterator newText) {
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
// if we're already at the end of the text, return DONE.
|
||||
CharacterIterator text = getText();
|
||||
if (text.getIndex() == text.getEndIndex())
|
||||
return BreakIterator.DONE;
|
||||
if (text == NULL || tables == NULL || text->getIndex() == text->endIndex())
|
||||
return BreakIterator::DONE;
|
||||
|
||||
// no matter what, we always advance at least one character forward
|
||||
int32_t result = text.getIndex() + 1;
|
||||
int32_t result = text->getIndex() + 1;
|
||||
int32_t lookaheadResult = 0;
|
||||
|
||||
// begin in state 1
|
||||
int32_t state = START_STATE;
|
||||
int32_t category;
|
||||
UChar c = text.current();
|
||||
UChar c = text->current();
|
||||
UChar lastC = c;
|
||||
int32_t lastCPos = 0;
|
||||
|
||||
|
||||
// loop until we reach the end of the text or transition to state 0
|
||||
while (c != CharacterIterator.DONE && state != STOP_STATE) {
|
||||
while (c != CharacterIterator::DONE && state != STOP_STATE) {
|
||||
|
||||
// look up the current character's character category (which tells us
|
||||
// which column in the state table to look at)
|
||||
category = lookupCategory(c);
|
||||
category = tables->lookupCategory(c, this);
|
||||
|
||||
// if the character isn't an ignore character, look up a state
|
||||
// transition in the state table
|
||||
if (category != IGNORE) {
|
||||
state = lookupState(state, category);
|
||||
state = tables->lookupState(state, category);
|
||||
}
|
||||
|
||||
// if the state we've just transitioned to is an accepting state,
|
||||
// if the state we've just transitioned to is a lookahead state,
|
||||
// (but not also an end state), save its position. If it's
|
||||
// both a lookahead state and an end state, update the break position
|
||||
// to the last saved lookahead-state position
|
||||
if (tables->isLookaheadState(state)) {
|
||||
if (tables->isEndState(state)) {
|
||||
result = lookaheadResult;
|
||||
}
|
||||
else {
|
||||
lookaheadResult = text->getIndex() + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise, if the state we've just transitioned to is an accepting state,
|
||||
// update our return value to be the current iteration position
|
||||
if (endStates[state])
|
||||
result = text.getIndex() + 1;
|
||||
c = text.next();
|
||||
else {
|
||||
if (tables->isEndState(state)) {
|
||||
result = text->getIndex() + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// keep track of the last "real" character we saw. If this character isn't an
|
||||
// ignore character, take note of it and its position in the text
|
||||
if (category != IGNORE && state != STOP_STATE) {
|
||||
lastC = c;
|
||||
lastCPos = text->getIndex();
|
||||
}
|
||||
c = text->next();
|
||||
}
|
||||
text.setIndex(result);
|
||||
|
||||
// if we've run off the end of the text, and the very last character took us into
|
||||
// a lookahead state, advance the break position to the lookahead position
|
||||
// (the theory here is that if there are no characters at all after the lookahead
|
||||
// position, that always matches the lookahead criteria)
|
||||
if (c == CharacterIterator::DONE && lookaheadResult == text->endIndex()) {
|
||||
result = lookaheadResult;
|
||||
}
|
||||
|
||||
// if the last character we saw before the one that took us into the stop state
|
||||
// was a mandatory breaking character, then the break position goes right after it
|
||||
// (this is here so that breaks come before, rather than after, a string of
|
||||
// ignore characters when they follow a mandatory break character)
|
||||
else if (lastC == 0x0a || lastC == 0x0d || lastC == 0x0c || lastC == 0x2028
|
||||
|| lastC == 0x2029) {
|
||||
result = lastCPos + 1;
|
||||
}
|
||||
|
||||
text->setIndex(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -332,27 +512,29 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
CharacterIterator text = getText();
|
||||
if (text == NULL || tables == NULL)
|
||||
return 0;
|
||||
|
||||
int32_t state = START_STATE;
|
||||
int32_t category = 0;
|
||||
int32_t lastCategory = 0;
|
||||
UChar c = text.current();
|
||||
UChar c = text->current();
|
||||
|
||||
// loop until we reach the beginning of the text or transition to state 0
|
||||
while (c != CharacterIterator.DONE && state != STOP_STATE) {
|
||||
while (c != CharacterIterator::DONE && state != STOP_STATE) {
|
||||
|
||||
// save the last character's category and look up the current
|
||||
// character's category
|
||||
lastCategory = category;
|
||||
category = lookupCategory(c);
|
||||
category = tables->lookupCategory(c, this);
|
||||
|
||||
// if the current character isn't an ignore character, look up a
|
||||
// state transition in the backwards state table
|
||||
if (category != IGNORE)
|
||||
state = lookupBackwardState(state, category);
|
||||
state = tables->lookupBackwardState(state, category);
|
||||
|
||||
// then advance one character backwards
|
||||
c = text.previous();
|
||||
c = text->previous();
|
||||
}
|
||||
|
||||
// if we didn't march off the beginning of the text, we're either one or two
|
||||
@ -360,35 +542,19 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
// previous() at the end of the loop above, and another because the character
|
||||
// that takes us into the stop state will always be the character BEFORE
|
||||
// the break position.)
|
||||
if (c != CharacterIterator.DONE) {
|
||||
if (c != CharacterIterator::DONE) {
|
||||
if (lastCategory != IGNORE)
|
||||
text.setIndex(text.getIndex() + 2);
|
||||
text->setIndex(text->getIndex() + 2);
|
||||
else
|
||||
text.next();
|
||||
text->next();
|
||||
}
|
||||
return text.getIndex();
|
||||
|
||||
return text->getIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up a character's category (i.e., its category for breaking purposes,
|
||||
* not its Unicode category)
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::lookupCategory(UChar c) {
|
||||
return UCharCategoryTable.elementAt(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the state table.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::lookupState(int32_t state, int32_t category) {
|
||||
return stateTable[state * numCategories + category];
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the backwards state table.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::lookupBackwardState(int32_t state, int32_t category) {
|
||||
return backwardsStateTable[state * numCategories + category];
|
||||
void
|
||||
RuleBasedBreakIterator::reset()
|
||||
{
|
||||
// Base-class version of this function is a no-op.
|
||||
// Subclasses may override with their own reset behavior.
|
||||
}
|
||||
|
@ -3,12 +3,18 @@
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 10/22/99 alan Creation.
|
||||
* 11/11/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef RBBI_H
|
||||
#define RBBI_H
|
||||
|
||||
#include "utypes.h"
|
||||
#include "rbbi_tbl.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "filestrm.h"
|
||||
|
||||
/**
|
||||
* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
|
||||
*
|
||||
@ -173,17 +179,15 @@
|
||||
*
|
||||
* @author Richard Gillam
|
||||
*/
|
||||
class RuleBasedBreakIterator {
|
||||
|
||||
protected:
|
||||
class U_I18N_API RuleBasedBreakIterator : public BreakIterator {
|
||||
|
||||
public:
|
||||
/**
|
||||
* A token used as a character-category value to identify ignore characters
|
||||
*/
|
||||
static int8_t IGNORE;
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* The state number of the starting state
|
||||
*/
|
||||
@ -194,92 +198,130 @@ private:
|
||||
*/
|
||||
static int16_t STOP_STATE;
|
||||
|
||||
/**
|
||||
* The textual description this iterator was created from
|
||||
*/
|
||||
UnicodeString description;
|
||||
|
||||
/**
|
||||
* A table that indexes from character values to character category numbers
|
||||
*/
|
||||
CompactByteArray charCategoryTable;
|
||||
|
||||
/**
|
||||
* The table of state transitions used for forward iteration
|
||||
*/
|
||||
int16_t* stateTable;
|
||||
|
||||
/**
|
||||
* The table of state transitions used to sync up the iterator with the
|
||||
* text in backwards and random-access iteration
|
||||
*/
|
||||
int16_t* backwardsStateTable;
|
||||
|
||||
/**
|
||||
* A list of flags indicating which states in the state table are accepting
|
||||
* ("end") states
|
||||
*/
|
||||
bool_t* endStates;
|
||||
|
||||
/**
|
||||
* The number of character categories (and, thus, the number of columns in
|
||||
* the state tables)
|
||||
*/
|
||||
int32_t numCategories;
|
||||
|
||||
protected:
|
||||
/**
|
||||
* The character iterator through which this BreakIterator accesses the text
|
||||
*/
|
||||
CharacterIterator text;
|
||||
CharacterIterator* text;
|
||||
|
||||
/**
|
||||
* The data tables this iterator uses to determine the break positions
|
||||
*/
|
||||
RuleBasedBreakIteratorTables* tables;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Class ID
|
||||
*/
|
||||
static char fgClassID;
|
||||
|
||||
public:
|
||||
//=======================================================================
|
||||
// constructors
|
||||
//=======================================================================
|
||||
|
||||
public:
|
||||
|
||||
// This constructor uses the udata interface to create a BreakIterator whose
|
||||
// internal tables live in a memory-mapped file. "image" is a pointer to the
|
||||
// beginning of that file.
|
||||
RuleBasedBreakIterator(const void* image);
|
||||
|
||||
/**
|
||||
* Constructs a RuleBasedBreakIterator according to the description
|
||||
* provided. If the description is malformed, throws an
|
||||
* IllegalArgumentException. Normally, instead of constructing a
|
||||
* RuleBasedBreakIterator directly, you'll use the factory methods
|
||||
* on BreakIterator to create one indirectly from a description
|
||||
* in the framework's resource files. You'd use this when you want
|
||||
* special behavior not provided by the built-in iterators.
|
||||
* Copy constructor. Will produce a break iterator with the same behavior,
|
||||
* and which iterates over the same text, as the one passed in.
|
||||
*/
|
||||
RuleBasedBreakIterator(UnicodeString description);
|
||||
RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
public:
|
||||
|
||||
/**
|
||||
* Clones this iterator.
|
||||
* @return A newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior as this one.
|
||||
* Destructor
|
||||
*/
|
||||
virtual Object clone(void);
|
||||
virtual ~RuleBasedBreakIterator();
|
||||
|
||||
/**
|
||||
* Returns true if both BreakIterators are of the same class, have the same
|
||||
* rules, and iterate over the same text.
|
||||
* Assignment operator. Sets this iterator to have the same behavior,
|
||||
* and iterate over the same text, as the one passed in.
|
||||
*/
|
||||
virtual bool_t equals(Object that);
|
||||
RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
|
||||
|
||||
/**
|
||||
* Equality operator. Returns TRUE if both BreakIterators are of the
|
||||
* same class, have the same behavior, and iterate over the same text.
|
||||
*/
|
||||
virtual bool_t operator==(const BreakIterator& that) const;
|
||||
|
||||
/**
|
||||
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
|
||||
* and vice versa.
|
||||
*/
|
||||
bool_t operator!=(const BreakIterator& that) const;
|
||||
|
||||
/**
|
||||
* Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior, and iterating over the same text, as this one.
|
||||
*/
|
||||
virtual BreakIterator* clone(void) const;
|
||||
|
||||
/**
|
||||
* Compute a hash code for this BreakIterator
|
||||
* @return A hash code
|
||||
*/
|
||||
virtual int32_t hashCode() const;
|
||||
|
||||
/**
|
||||
* Returns the description used to create this iterator
|
||||
*/
|
||||
virtual UnicodeString toString(void);
|
||||
virtual const UnicodeString& getRules() const;
|
||||
|
||||
/**
|
||||
* Compute a hashcode for this BreakIterator
|
||||
* @return A hash code
|
||||
*/
|
||||
virtual int32_t hashCode(void);
|
||||
//=======================================================================
|
||||
// BreakIterator overrides
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Return a CharacterIterator over the text being analyzed. This version
|
||||
* of this method returns the actual CharacterIterator we're using internally.
|
||||
* Changing the state of this iterator can have undefined consequences. If
|
||||
* you need to change it, clone it first.
|
||||
* @return An iterator over the text being analyzed.
|
||||
*/
|
||||
virtual const CharacterIterator& getText() const;
|
||||
|
||||
/**
|
||||
* Returns a newly-created CharacterIterator that the caller is to take
|
||||
* ownership of.
|
||||
* THIS FUNCTION SHOULD NOT BE HERE. IT'S HERE BECAUSE BreakIterator DEFINES
|
||||
* IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT. IT SHOULD BE REMOVED
|
||||
* FROM *BOTH* CLASSES.
|
||||
*/
|
||||
virtual CharacterIterator* createText() const;
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText An iterator over the text to analyze. The BreakIterator
|
||||
* takes ownership of the character iterator. The caller MUST NOT delete it!
|
||||
*/
|
||||
virtual void adoptText(CharacterIterator* newText);
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText The text to analyze.
|
||||
*/
|
||||
virtual void setText(const UnicodeString& newText);
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText The text to analyze.
|
||||
* THIS FUNCTION SHOULD NOT BE HERE. IT'S HERE BECAUSE BreakIterator DEFINES
|
||||
* IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT. IT SHOULD BE REMOVED
|
||||
* FROM *BOTH* CLASSES.
|
||||
*/
|
||||
virtual void setText(const UnicodeString* newText);
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the beginning of the text.
|
||||
* (i.e., the CharacterIterator's starting offset).
|
||||
@ -346,28 +388,36 @@ public:
|
||||
* Returns the current iteration position.
|
||||
* @return The current iteration position.
|
||||
*/
|
||||
virtual int32_t current(void);
|
||||
virtual int32_t current(void) const;
|
||||
|
||||
/**
|
||||
* Return a CharacterIterator over the text being analyzed. This version
|
||||
* of this method returns the actual CharacterIterator we're using internally.
|
||||
* Changing the state of this iterator can have undefined consequences. If
|
||||
* you need to change it, clone it first.
|
||||
* @return An iterator over the text being analyzed.
|
||||
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
|
||||
* This method is to implement a simple version of RTTI, since not all
|
||||
* C++ compilers support genuine RTTI. Polymorphic operator==() and
|
||||
* clone() methods call this method.
|
||||
*
|
||||
* @return The class ID for this object. All objects of a
|
||||
* given class have the same class ID. Objects of
|
||||
* other classes have different class IDs.
|
||||
*/
|
||||
virtual CharacterIterator getText(void);
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText An iterator over the text to analyze.
|
||||
* Returns the class ID for this class. This is useful only for
|
||||
* comparing to a return value from getDynamicClassID(). For example:
|
||||
*
|
||||
* Base* polymorphic_pointer = createPolymorphicObject();
|
||||
* if (polymorphic_pointer->getDynamicClassID() ==
|
||||
* Derived::getStaticClassID()) ...
|
||||
*
|
||||
* @return The class ID for all objects of this class.
|
||||
*/
|
||||
virtual void setText(CharacterIterator newText);
|
||||
static UClassID getStaticClassID();
|
||||
|
||||
protected:
|
||||
//=======================================================================
|
||||
// implementation
|
||||
//=======================================================================
|
||||
protected:
|
||||
|
||||
/**
|
||||
* This method is the actual implementation of the next() method. All iteration
|
||||
* vectors through here. This method initializes the state machine to state 1
|
||||
@ -387,22 +437,33 @@ protected:
|
||||
virtual int32_t handlePrevious(void);
|
||||
|
||||
/**
|
||||
* Looks up a character's category (i.e., its category for breaking purposes,
|
||||
* not its Unicode category)
|
||||
* Dumps caches and performs other actions associated with a complete change
|
||||
* in text or iteration position. This function is a no-op in RuleBasedBreakIterator,
|
||||
* but subclasses can and do override it.
|
||||
*/
|
||||
virtual int32_t lookupCategory(UChar c);
|
||||
virtual void reset();
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the state table.
|
||||
* Constructs a RuleBasedBreakIterator that uses the already-created
|
||||
* tables object that is passed in as a parameter.
|
||||
*/
|
||||
virtual int32_t lookupState(int32_t state, int32_t category);
|
||||
RuleBasedBreakIterator(RuleBasedBreakIteratorTables* tables);
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the backwards state table.
|
||||
*/
|
||||
virtual int32_t lookupBackwardState(int32_t state, int32_t category);
|
||||
friend class BreakIterator;
|
||||
};
|
||||
|
||||
// Inequality is defined strictly as the negation of operator==, so the two
// comparisons can never disagree.
inline bool_t RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
    const bool_t isEqual = operator==(that);
    return !isEqual;
}
|
||||
|
||||
// Polymorphic class-ID query: reports the same ID that
// RuleBasedBreakIterator::getStaticClassID() hands out.
inline UClassID RuleBasedBreakIterator::getDynamicClassID() const {
    const UClassID classId = RuleBasedBreakIterator::getStaticClassID();
    return classId;
}
|
||||
|
||||
// The class ID is the address of the class-wide static fgClassID member;
// only the address is used here, never the char's value.
inline UClassID RuleBasedBreakIterator::getStaticClassID() {
    char* const idAddress = &fgClassID;
    return (UClassID)idAddress;
}
|
||||
|
||||
#endif
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -2,8 +2,7 @@
|
||||
* Copyright © {1999}, International Business Machines Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 10/22/99 alan Creation. This is an internal header; it
|
||||
* shall not be exported.
|
||||
* 12/15/99 rgillam Port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
@ -11,9 +10,12 @@
|
||||
#define RBBI_BLD_H
|
||||
|
||||
#include "rbbi.h"
|
||||
#include "rbbi_tbl.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "uvector.h"
|
||||
|
||||
class ExpressionList;
|
||||
|
||||
//=======================================================================
|
||||
// RuleBasedBreakIterator.Builder
|
||||
//=======================================================================
|
||||
@ -42,18 +44,37 @@
|
||||
class RuleBasedBreakIteratorBuilder {
|
||||
|
||||
protected:
|
||||
/**
|
||||
* The iterator we're constructing.
|
||||
*/
|
||||
RuleBasedBreakIterator& iterator;
|
||||
|
||||
/**
|
||||
* The tables object for the iterator we're constructing.
|
||||
*/
|
||||
RuleBasedBreakIteratorTables* tables;
|
||||
|
||||
/**
|
||||
* A temporary place to hold the rules as they're being processed.
|
||||
*/
|
||||
UVector tempRuleList;
|
||||
|
||||
/**
|
||||
* A temporary holding place used for calculating the character categories.
|
||||
* This object contains UnicodeSet objects.
|
||||
*/
|
||||
UVector categories;
|
||||
|
||||
/**
|
||||
* The number of categories (and thus the number of columns in the finished state tables)
|
||||
*/
|
||||
int32_t numCategories;
|
||||
|
||||
/**
|
||||
* A table used to map parts of regexp text to lists of character categories,
|
||||
* rather than having to figure them out from scratch each time
|
||||
*/
|
||||
Hashtable expressions;
|
||||
ExpressionList* expressions;
|
||||
|
||||
/**
|
||||
* A temporary holding place for the list of ignore characters
|
||||
@ -104,18 +125,56 @@ protected:
|
||||
*/
|
||||
bool_t clearLoopingStates;
|
||||
|
||||
/**
|
||||
* A place where an error message can be stored if we get a parse error.
|
||||
* The error message is never displayed anywhere, so this is useful pretty
|
||||
* much only in conjunction with a debugger.
|
||||
*/
|
||||
UnicodeString errorMessage;
|
||||
|
||||
/**
|
||||
* A bit mask used to indicate a bit in the table's flags column that marks a
|
||||
* state as an accepting state.
|
||||
*/
|
||||
static const int32_t END_STATE_FLAG /*= 0x8000*/;
|
||||
|
||||
/**
|
||||
* A bit mask used to indicate a bit in the table's flags column that marks a
|
||||
* state as one the builder shouldn't loop to any looping states
|
||||
*/
|
||||
static const int32_t DONT_LOOP_FLAG /*= 0x4000*/;
|
||||
|
||||
/**
|
||||
* A bit mask used to indicate a bit in the table's flags column that marks a
|
||||
* state as a lookahead state.
|
||||
*/
|
||||
static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/;
|
||||
|
||||
/**
|
||||
* A bit mask representing the union of the mask values listed above.
|
||||
* Used for clearing or masking off the flag bits.
|
||||
*/
|
||||
static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
|
||||
| DONT_LOOP_FLAG*/;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* No special construction is required for the Builder.
|
||||
* The Builder class contains a reference to the iterator it's supposed to build.
|
||||
*/
|
||||
RuleBasedBreakIteratorBuilder();
|
||||
RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~RuleBasedBreakIteratorBuilder();
|
||||
|
||||
/**
|
||||
* This is the main function for setting up the BreakIterator's tables. It
|
||||
* just UVectors different parts of the job off to other functions.
|
||||
* just vectors different parts of the job off to other functions.
|
||||
*/
|
||||
virtual void buildBreakIterator(void);
|
||||
virtual void buildBreakIterator(const UnicodeString& description,
|
||||
UErrorCode& err);
|
||||
|
||||
private:
|
||||
|
||||
@ -127,7 +186,8 @@ private:
|
||||
* <li>Perform variable-name substitutions (so that no one else sees variable names)
|
||||
* </ul>
|
||||
*/
|
||||
virtual UVector buildRuleList(UnicodeString description);
|
||||
virtual void buildRuleList(UnicodeString& description,
|
||||
UErrorCode& err);
|
||||
|
||||
protected:
|
||||
|
||||
@ -138,8 +198,11 @@ protected:
|
||||
* find-and-replace of the variable name with its text. (The variable text
|
||||
* must be enclosed in either [] or () for this to work.)
|
||||
*/
|
||||
virtual UnicodeString processSubstitution(UnicodeString substitutionRule, UnicodeString description,
|
||||
int32_t startPos);
|
||||
virtual void processSubstitution(UnicodeString& description,
|
||||
UTextOffset ruleStart,
|
||||
UTextOffset ruleEnd,
|
||||
UTextOffset startPos,
|
||||
UErrorCode& err);
|
||||
|
||||
/**
|
||||
* This function defines a protocol for handling substitution names that
|
||||
@ -150,8 +213,17 @@ protected:
|
||||
* that which is done by the normal substitution-processing code is done
|
||||
* here.
|
||||
*/
|
||||
virtual void handleSpecialSubstitution(UnicodeString replace, UnicodeString replaceWith,
|
||||
int32_t startPos, UnicodeString description);
|
||||
virtual void handleSpecialSubstitution(const UnicodeString& replace,
|
||||
const UnicodeString& replaceWith,
|
||||
int32_t startPos,
|
||||
const UnicodeString& description,
|
||||
UErrorCode& err);
|
||||
|
||||
/**
|
||||
* This function provides a hook for subclasses to mess with the character
|
||||
* category table.
|
||||
*/
|
||||
virtual void mungeExpressionList();
|
||||
|
||||
/**
|
||||
* This function builds the character category table. On entry,
|
||||
@ -161,7 +233,7 @@ protected:
|
||||
* character category numbers everywhere a literal character or a [] expression
|
||||
* originally occurred.
|
||||
*/
|
||||
virtual void buildCharCategories(UVector tempRuleList);
|
||||
virtual void buildCharCategories(UErrorCode& err);
|
||||
|
||||
private:
|
||||
|
||||
@ -170,7 +242,7 @@ private:
|
||||
* work is done in parseRule(), which is called once for each rule in the
|
||||
* description.
|
||||
*/
|
||||
virtual void buildStateTable(UVector tempRuleList);
|
||||
virtual void buildStateTable(UErrorCode& err);
|
||||
|
||||
/**
|
||||
* This is where most of the work really happens. This routine parses a single
|
||||
@ -179,7 +251,8 @@ private:
|
||||
* throughout the whole operation, although some ugly postprocessing is needed
|
||||
* to handle the *? token.
|
||||
*/
|
||||
virtual void parseRule(UnicodeString rule, bool_t forward);
|
||||
virtual void parseRule(const UnicodeString& rule,
|
||||
bool_t forward);
|
||||
|
||||
/**
|
||||
* Update entries in the state table, and merge states when necessary to keep
|
||||
@ -189,9 +262,9 @@ private:
|
||||
* list of the columns that need updating.
|
||||
* @param newValue Update the cells specified above to contain this value
|
||||
*/
|
||||
virtual void updateStateTable(UVector rows,
|
||||
UnicodeString pendingChars,
|
||||
int16_t newValue);
|
||||
virtual void updateStateTable(const UVector& rows,
|
||||
const UnicodeString& pendingChars,
|
||||
int16_t newValue);
|
||||
|
||||
/**
|
||||
* The real work of making the state table deterministic happens here. This function
|
||||
@ -213,9 +286,9 @@ private:
|
||||
* (itself a copy of the decision point list from parseRule()). Newly-created
|
||||
* states get added to the decision point list if their "parents" were on it.
|
||||
*/
|
||||
virtual void mergeStates(int32_t rowNum,
|
||||
virtual void mergeStates(int32_t rowNum,
|
||||
int16_t* newValues,
|
||||
UVector rowsBeingUpdated);
|
||||
const UVector& rowsBeingUpdated);
|
||||
|
||||
/**
|
||||
* The merge list is a list of pairs of rows that have been merged somewhere in
|
||||
@ -236,7 +309,8 @@ private:
|
||||
* @param endStates The list of states to treat as end states (states that
|
||||
* can exit the loop).
|
||||
*/
|
||||
virtual void setLoopingStates(UVector newLoopingStates, UVector endStates);
|
||||
virtual void setLoopingStates(const UVector* newLoopingStates,
|
||||
const UVector& endStates);
|
||||
|
||||
/**
|
||||
* This removes "ending states" and states reachable from them from the
|
||||
@ -264,7 +338,7 @@ private:
|
||||
* table and any additional rules (identified by the ! on the front)
|
||||
* supplied in the description
|
||||
*/
|
||||
virtual void buildBackwardsStateTable(UVector tempRuleList);
|
||||
virtual void buildBackwardsStateTable(UErrorCode& err);
|
||||
|
||||
protected:
|
||||
|
||||
@ -276,7 +350,9 @@ protected:
|
||||
* discovered
|
||||
* @param context The string containing the error
|
||||
*/
|
||||
virtual void error(UnicodeString message, int32_t position, UnicodeString context);
|
||||
virtual void setUpErrorMessage(const UnicodeString& message,
|
||||
int32_t position,
|
||||
const UnicodeString& context);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
128
icu4c/source/i18n/rbbi_tbl.cpp
Normal file
128
icu4c/source/i18n/rbbi_tbl.cpp
Normal file
@ -0,0 +1,128 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/11/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "rbbi_tbl.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
|
||||
/**
 * Creates a tables object whose tables all point into a caller-supplied
 * memory image. The object does not take ownership of the image
 * (ownTables is FALSE), so the image must outlive this object.
 *
 * @param image Start of the memory image. It begins with an index:
 *              entry 0 is the number of character categories, and
 *              entries 1 through 7 are byte offsets, relative to the
 *              start of the image, of the other fields.
 */
RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables(const void* image)
: refCount(0),
  ownTables(FALSE)
{
    // Read the index as 32-bit integers. Reading it as an array of void*
    // and truncating each entry to int32_t only works where pointers are
    // exactly 32 bits wide; on wider platforms the entries would be read
    // with the wrong stride.
    // NOTE(review): assumes the image stores its index as 32-bit values,
    // as the int32_t casts in the previous code imply -- confirm against
    // the tool that writes the image.
    const int32_t* index = (const int32_t*)image;
    const int8_t* base = (const int8_t*)image;

    // the memory image begins with an index that gives the offsets into the
    // image for each of the fields in the BreakIteratorTables object--
    // use those to initialize the tables object (it will end up pointing
    // into the memory image for everything)
    numCategories = index[0];
    description = UnicodeString(TRUE, (UChar*)(base + index[1]), -1);
    charCategoryTable = ucmp8_openAdopt((uint16_t*)(base + index[2]),
                                        (int8_t*)(base + index[3]), 0);
    stateTable = (int16_t*)(base + index[4]);
    backwardsStateTable = (int16_t*)(base + index[5]);
    endStates = (int8_t*)(base + index[6]);
    lookaheadStates = (int8_t*)(base + index[7]);
}
|
||||
|
||||
/**
 * Creates an empty tables object that owns its tables (ownTables is TRUE).
 * A RuleBasedBreakIteratorBuilder is expected to fill in all the members
 * afterwards.
 */
RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables()
: refCount(0),
  ownTables(TRUE),
  charCategoryTable(0),
  stateTable(0),
  backwardsStateTable(0),
  endStates(0),
  lookaheadStates(0),
  numCategories(0)
{
    // Every pointer member is explicitly nulled above: built-in members are
    // not zeroed automatically, and the destructor deletes these tables when
    // ownTables is TRUE, so they must never hold indeterminate values if the
    // builder stops before filling them in.
}
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
RuleBasedBreakIteratorTables::~RuleBasedBreakIteratorTables() {
|
||||
if (ownTables) {
|
||||
delete [] stateTable;
|
||||
delete [] backwardsStateTable;
|
||||
delete [] endStates;
|
||||
delete [] lookaheadStates;
|
||||
ucmp8_close(charCategoryTable);
|
||||
}
|
||||
else {
|
||||
uprv_free(charCategoryTable);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Equality operator. Returns TRUE if both tables objects are of the
|
||||
* same class, have the same behavior, and iterate over the same text.
|
||||
*/
|
||||
bool_t
|
||||
RuleBasedBreakIteratorTables::operator==(const RuleBasedBreakIteratorTables& that) const {
|
||||
return this->description == that.description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute a hash code for these tables
|
||||
* @return A hash code
|
||||
*/
|
||||
int32_t
|
||||
RuleBasedBreakIteratorTables::hashCode() const {
|
||||
return description.hashCode();
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// implementation
|
||||
//=======================================================================
|
||||
/**
|
||||
* Looks up a character's category (i.e., its category for breaking purposes,
|
||||
* not its Unicode category)
|
||||
*/
|
||||
int32_t
|
||||
RuleBasedBreakIteratorTables::lookupCategory(UChar c, BreakIterator* ignored) const {
|
||||
return ucmp8_get(charCategoryTable, c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the state table.
|
||||
*/
|
||||
int32_t
|
||||
RuleBasedBreakIteratorTables::lookupState(int32_t state, int32_t category) const {
|
||||
return stateTable[state * numCategories + category];
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the backwards state table.
|
||||
*/
|
||||
int32_t
|
||||
RuleBasedBreakIteratorTables::lookupBackwardState(int32_t state, int32_t category) const {
|
||||
return backwardsStateTable[state * numCategories + category];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the specified state is an accepting state.
|
||||
*/
|
||||
bool_t
|
||||
RuleBasedBreakIteratorTables::isEndState(int32_t state) const {
|
||||
return endStates[state];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the specified state is a lookahead state.
|
||||
*/
|
||||
bool_t
|
||||
RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const {
|
||||
return lookaheadStates[state];
|
||||
}
|
213
icu4c/source/i18n/rbbi_tbl.h
Normal file
213
icu4c/source/i18n/rbbi_tbl.h
Normal file
@ -0,0 +1,213 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/11/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef RBBI_TBL_H
|
||||
#define RBBI_TBL_H
|
||||
|
||||
#include "ucmp8.h"
|
||||
#include "utypes.h"
|
||||
#include "unistr.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "filestrm.h"
|
||||
|
||||
/**
|
||||
* This class contains the internal static tables that are used by the
|
||||
* RuleBasedBreakIterator. Once created, these tables are immutable,
|
||||
* so they can be shared among all break iterators using a particular
|
||||
* set of rules. This class uses a reference-counting scheme to
|
||||
* manage the sharing.
|
||||
*
|
||||
* @author Richard Gillam
|
||||
*/
|
||||
class RuleBasedBreakIteratorTables {
|
||||
|
||||
private:
|
||||
/**
|
||||
* The number of RuleBasedBreakIterators using this object.
|
||||
*/
|
||||
int16_t refCount;
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Whether or not we own the storage for the tables (the tables may be
|
||||
* stored in a memory-mapped file)
|
||||
*/
|
||||
bool_t ownTables;
|
||||
|
||||
private:
|
||||
/**
|
||||
* The textual description that was used to create these tables
|
||||
*/
|
||||
UnicodeString description;
|
||||
|
||||
/**
|
||||
* A table that indexes from character values to character category numbers
|
||||
*/
|
||||
CompactByteArray* charCategoryTable;
|
||||
|
||||
/**
|
||||
* The table of state transitions used for forward iteration
|
||||
*/
|
||||
int16_t* stateTable;
|
||||
|
||||
/**
|
||||
* The table of state transitions used to sync up the iterator with the
|
||||
* text in backwards and random-access iteration
|
||||
*/
|
||||
int16_t* backwardsStateTable;
|
||||
|
||||
/**
|
||||
* A list of flags indicating which states in the state table are accepting
|
||||
* ("end") states
|
||||
*/
|
||||
int8_t* endStates;
|
||||
|
||||
/**
|
||||
* A list of flags indicating which states in the state table are
|
||||
* lookahead states (states which turn lookahead on and off)
|
||||
*/
|
||||
int8_t* lookaheadStates;
|
||||
|
||||
/**
|
||||
* The number of character categories (and, thus, the number of columns in
|
||||
* the state tables)
|
||||
*/
|
||||
int32_t numCategories;
|
||||
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
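* Constructors. The default constructor creates a tables object that owns its tables and relies on a RuleBasedBreakIteratorBuilder to fill them in; the other constructor points the tables into a memory image that this object does not own.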
|
||||
*/
|
||||
protected:
|
||||
RuleBasedBreakIteratorTables();
|
||||
|
||||
RuleBasedBreakIteratorTables(const void* image);
|
||||
|
||||
private:
|
||||
/**
|
||||
* The copy constructor is declared private and is a no-op.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
*/
|
||||
RuleBasedBreakIteratorTables(const RuleBasedBreakIteratorTables& that);
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~RuleBasedBreakIteratorTables();
|
||||
|
||||
private:
|
||||
/**
|
||||
* The assignment operator is declared private and is a no-op.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
*/
|
||||
RuleBasedBreakIteratorTables& operator=(const RuleBasedBreakIteratorTables& that);
|
||||
|
||||
/**
|
||||
* Equality operator. Returns TRUE if both tables objects were created
|
||||
* from the same textual description.
|
||||
*/
|
||||
virtual bool_t operator==(const RuleBasedBreakIteratorTables& that) const;
|
||||
|
||||
/**
|
||||
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
|
||||
* and vice versa.
|
||||
*/
|
||||
bool_t operator!=(const RuleBasedBreakIteratorTables& that) const;
|
||||
|
||||
/**
|
||||
* Compute a hash code for these tables
|
||||
* @return A hash code
|
||||
*/
|
||||
virtual int32_t hashCode() const;
|
||||
|
||||
/**
|
||||
* Returns the description used to create these tables
|
||||
*/
|
||||
const UnicodeString& getRules() const;
|
||||
|
||||
//=======================================================================
|
||||
// reference counting
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* increments the reference count.
|
||||
*/
|
||||
void addReference();
|
||||
|
||||
/**
|
||||
* decrements the reference count and deletes the object if it reaches zero
|
||||
*/
|
||||
void removeReference();
|
||||
|
||||
protected:
|
||||
//=======================================================================
|
||||
// implementation
|
||||
//=======================================================================
|
||||
/**
|
||||
* Looks up a character's category (i.e., its category for breaking purposes,
|
||||
* not its Unicode category)
|
||||
*/
|
||||
virtual int32_t lookupCategory(UChar c, BreakIterator* bi) const;
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the state table.
|
||||
*/
|
||||
virtual int32_t lookupState(int32_t state, int32_t category) const;
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the backwards state table.
|
||||
*/
|
||||
virtual int32_t lookupBackwardState(int32_t state, int32_t category) const;
|
||||
|
||||
/**
|
||||
* Returns true if the specified state is an accepting state.
|
||||
*/
|
||||
virtual bool_t isEndState(int32_t state) const;
|
||||
|
||||
/**
|
||||
* Returns true if the specified state is a lookahead state.
|
||||
*/
|
||||
virtual bool_t isLookaheadState(int32_t state) const;
|
||||
|
||||
friend class RuleBasedBreakIterator;
|
||||
friend class DictionaryBasedBreakIterator;
|
||||
};
|
||||
|
||||
// Inequality is defined as the negation of operator==.
inline bool_t
RuleBasedBreakIteratorTables::operator!=(const RuleBasedBreakIteratorTables& that) const {
    return !(*this == that);
}
|
||||
|
||||
// Returns a reference to the stored description; no copy is made.
inline const UnicodeString&
RuleBasedBreakIteratorTables::getRules() const {
    return this->description;
}
|
||||
|
||||
inline void
|
||||
RuleBasedBreakIteratorTables::addReference() {
|
||||
++refCount;
|
||||
}
|
||||
|
||||
inline void
|
||||
RuleBasedBreakIteratorTables::removeReference() {
|
||||
if (--refCount <= 0)
|
||||
delete this;
|
||||
}
|
||||
|
||||
#endif
|
@ -79,6 +79,31 @@ ubrk_close(UBreakIterator *bi)
|
||||
delete (BreakIterator*) bi;
|
||||
}
|
||||
|
||||
// Points an existing break iterator at a new piece of text.
// Does nothing if *status already indicates a failure. A textLength of -1
// means the length is computed with u_strlen(). If the iterator currently
// holds a UCharCharacterIterator it is re-targeted in place; otherwise a new
// UCharCharacterIterator is allocated and adopted by the break iterator
// (*status is set to U_MEMORY_ALLOCATION_ERROR if that allocation fails).
U_CAPI void
ubrk_setText(UBreakIterator* bi,
             const UChar*    text,
             int32_t         textLength,
             UErrorCode*     status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    BreakIterator* breakIter = (BreakIterator*)bi;
    const CharacterIterator& currentText = breakIter->getText();
    const int32_t length = (textLength == -1) ? u_strlen(text) : textLength;

    if (currentText.getDynamicClassID() != UCharCharacterIterator::getStaticClassID()) {
        // The current text iterator is of some other class: replace it.
        UCharCharacterIterator* replacement = new UCharCharacterIterator(text, length);
        if (replacement == 0) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        breakIter->adoptText(replacement);
        return;
    }

    // Reuse the existing iterator object instead of allocating a new one.
    ((UCharCharacterIterator&)currentText).setText(text, length);
}
|
||||
|
||||
U_CAPI UTextOffset
|
||||
ubrk_current(const UBreakIterator *bi)
|
||||
{
|
||||
|
@ -177,53 +177,73 @@ public:
|
||||
* BreakIterator, as the argument. Text is considered the same if
|
||||
* it contains the same characters, it need not be the same
|
||||
* object, and styles are not considered.
|
||||
* @stable
|
||||
*/
|
||||
virtual bool_t operator==(const BreakIterator&) const = 0;
|
||||
|
||||
/**
|
||||
* Returns the complement of the result of operator==
|
||||
* @stable
|
||||
*/
|
||||
bool_t operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }
|
||||
|
||||
/**
|
||||
* Return a polymorphic copy of this object. This is an abstract
|
||||
* method which subclasses implement.
|
||||
* @stable
|
||||
*/
|
||||
virtual BreakIterator* clone(void) const = 0;
|
||||
|
||||
/**
|
||||
* Return a polymorphic class ID for this object. Different subclasses
|
||||
* will return distinct unequal values.
|
||||
* @stable
|
||||
*/
|
||||
virtual UClassID getDynamicClassID(void) const = 0;
|
||||
|
||||
/**
|
||||
* Return a CharacterIterator over the text being analyzed.
|
||||
* @draft
|
||||
*/
|
||||
virtual const CharacterIterator& getText() const = 0;
|
||||
|
||||
/**
|
||||
* Get the text for which this object is finding the boundaries.
|
||||
* @draft
|
||||
*/
|
||||
virtual CharacterIterator* createText(void) const = 0;
|
||||
|
||||
/**
|
||||
* Change the text over which this operates. The text boundary is
|
||||
* reset to the start.
|
||||
* [This function should be modified to take a const UnicodeString& argument.]
|
||||
* @deprecated
|
||||
*/
|
||||
virtual void setText(const UnicodeString* it) = 0;
|
||||
|
||||
/**
|
||||
* Change the text over which this operates. The text boundary is
|
||||
* reset to the start.
|
||||
* @stable
|
||||
*/
|
||||
virtual void adoptText(CharacterIterator* it) = 0;
|
||||
|
||||
/**
|
||||
* DONE is returned by previous() and next() after all valid
|
||||
* boundaries have been returned.
|
||||
* @stable
|
||||
*/
|
||||
static const UTextOffset DONE;
|
||||
|
||||
/**
|
||||
* Return the index of the first character in the text being scanned.
|
||||
* @stable
|
||||
*/
|
||||
virtual UTextOffset first(void) = 0;
|
||||
|
||||
/**
|
||||
* Return the index immediately BEYOND the last character in the text being scanned.
|
||||
* @stable
|
||||
*/
|
||||
virtual UTextOffset last(void) = 0;
|
||||
|
||||
@ -231,6 +251,7 @@ public:
|
||||
* Return the boundary preceding the current boundary.
|
||||
* @return The character index of the previous text boundary or DONE if all
|
||||
* boundaries have been returned.
|
||||
* @stable
|
||||
*/
|
||||
virtual UTextOffset previous(void) = 0;
|
||||
|
||||
@ -238,6 +259,7 @@ public:
|
||||
* Return the boundary following the current boundary.
|
||||
* @return The character index of the next text boundary or DONE if all
|
||||
* boundaries have been returned.
|
||||
* @stable
|
||||
*/
|
||||
virtual UTextOffset next(void) = 0;
|
||||
|
||||
@ -245,6 +267,7 @@ public:
|
||||
* Return character index of the text boundary that was most recently
|
||||
* returned by next(), previous(), first(), or last()
|
||||
* @return The boundary most recently returned.
|
||||
* @stable
|
||||
*/
|
||||
virtual UTextOffset current(void) const = 0;
|
||||
|
||||
@ -254,6 +277,7 @@ public:
|
||||
* the value BreakIterator.DONE
|
||||
* @param offset the offset to begin scanning.
|
||||
* @return The first boundary after the specified offset.
|
||||
* @stable
|
||||
*/
|
||||
virtual UTextOffset following(UTextOffset offset) = 0;
|
||||
|
||||
@ -263,6 +287,7 @@ public:
|
||||
* the value BreakIterator.DONE
|
||||
* @param offset the offset to begin scanning.
|
||||
* @return The first boundary before the specified offset.
|
||||
* @stable
|
||||
*/
|
||||
virtual UTextOffset preceding(UTextOffset offset) = 0;
|
||||
|
||||
@ -270,6 +295,7 @@ public:
|
||||
* Return true if the specified position is a boundary position.
|
||||
* @param offset the offset to check.
|
||||
* @return True if "offset" is a boundary position.
|
||||
* @stable
|
||||
*/
|
||||
virtual bool_t isBoundary(UTextOffset offset) = 0;
|
||||
|
||||
@ -280,6 +306,7 @@ public:
|
||||
* and positive values move to later boundaries.
|
||||
* @return The index of the nth boundary from the current position, or
|
||||
* DONE if there are fewer than |n| boundaries in the specified direction.
|
||||
* @stable
|
||||
*/
|
||||
virtual UTextOffset next(int32_t n) = 0;
|
||||
|
||||
@ -290,6 +317,7 @@ public:
|
||||
* @param where the locale. If a specific WordBreak is not
|
||||
* available for the specified locale, a default WordBreak is returned.
|
||||
* @return A BreakIterator for word-breaks
|
||||
* @stable
|
||||
*/
|
||||
static BreakIterator* createWordInstance(const Locale& where = Locale::getDefault());
|
||||
|
||||
@ -302,6 +330,7 @@ public:
|
||||
* @param where the locale. If a specific LineBreak is not
|
||||
* available for the specified locale, a default LineBreak is returned.
|
||||
* @return A BreakIterator for line-breaks
|
||||
* @stable
|
||||
*/
|
||||
static BreakIterator* createLineInstance(const Locale& where = Locale::getDefault());
|
||||
|
||||
@ -312,6 +341,7 @@ public:
|
||||
* @param where the locale. If a specific character break is not
|
||||
* available for the specified locale, a default character break is returned.
|
||||
* @return A BreakIterator for character-breaks
|
||||
* @stable
|
||||
*/
|
||||
static BreakIterator* createCharacterInstance(const Locale& where = Locale::getDefault());
|
||||
|
||||
@ -321,6 +351,7 @@ public:
|
||||
* @param where the locale. If a specific SentenceBreak is not
|
||||
* available for the specified locale, a default SentenceBreak is returned.
|
||||
* @return A BreakIterator for sentence-breaks
|
||||
* @stable
|
||||
*/
|
||||
static BreakIterator* createSentenceInstance(const Locale& where = Locale::getDefault());
|
||||
|
||||
@ -328,6 +359,7 @@ public:
|
||||
* Get the set of Locales for which TextBoundaries are installed
|
||||
* @param count the output parameter of number of elements in the locale list
|
||||
* @return available locales
|
||||
* @stable
|
||||
*/
|
||||
static const Locale* getAvailableLocales(int32_t& count);
|
||||
|
||||
@ -338,6 +370,7 @@ public:
|
||||
* @param name the fill-in parameter of the return value
|
||||
* Uses best match.
|
||||
* @return user-displayable name
|
||||
* @stable
|
||||
*/
|
||||
static UnicodeString& getDisplayName(const Locale& objectLocale,
|
||||
const Locale& displayLocale,
|
||||
@ -349,6 +382,7 @@ public:
|
||||
* @param objectLocale must be from getMatchingLocales
|
||||
* @param name the fill-in parameter of the return value
|
||||
* @return user-displayable name
|
||||
* @stable
|
||||
*/
|
||||
static UnicodeString& getDisplayName(const Locale& objectLocale,
|
||||
UnicodeString& name);
|
||||
|
@ -178,6 +178,7 @@ typedef enum UBreakIteratorType UBreakIteratorType;
|
||||
* @param status A UErrorCode to receive any errors.
|
||||
* @return A UBreakIterator for the specified locale.
|
||||
* @see ubrk_openRules
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UBreakIterator*
|
||||
ubrk_open(UBreakIteratorType type,
|
||||
@ -196,6 +197,7 @@ ubrk_open(UBreakIteratorType type,
|
||||
* @param status A UErrorCode to receive any errors.
|
||||
* @return A UBreakIterator for the specified rules.
|
||||
* @see ubrk_open
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UBreakIterator*
|
||||
ubrk_openRules(const UChar *rules,
|
||||
@ -208,16 +210,28 @@ ubrk_openRules(const UChar *rules,
|
||||
* Close a UBreakIterator.
|
||||
* Once closed, a UBreakIterator may no longer be used.
|
||||
* @param bi The break iterator to close.
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI void
|
||||
ubrk_close(UBreakIterator *bi);
|
||||
|
||||
/**
|
||||
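* Sets an existing iterator to point to a new piece of text.
* The text is not copied, so it must remain valid for as long as the
* iterator refers to it.
* @param bi The break iterator to use.
* @param text The text to be scanned.
* @param textLength The number of characters in text, or -1 if text is null-terminated.
* @param status A UErrorCode to receive any errors.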
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI void
|
||||
ubrk_setText(UBreakIterator* bi,
|
||||
const UChar* text,
|
||||
int32_t textLength,
|
||||
UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Determine the most recently-returned text boundary.
|
||||
*
|
||||
* @param bi The break iterator to use.
|
||||
* @return The character index most recently returned by \Ref{ubrk_next}, \Ref{ubrk_previous},
|
||||
* \Ref{ubrk_first}, or \Ref{ubrk_last}.
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UTextOffset
|
||||
ubrk_current(const UBreakIterator *bi);
|
||||
@ -229,6 +243,7 @@ ubrk_current(const UBreakIterator *bi);
|
||||
* @return The character index of the next text boundary, or UBRK_DONE
|
||||
* if all text boundaries have been returned.
|
||||
* @see ubrk_previous
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UTextOffset
|
||||
ubrk_next(UBreakIterator *bi);
|
||||
@ -240,6 +255,7 @@ ubrk_next(UBreakIterator *bi);
|
||||
* @return The character index of the preceding text boundary, or UBRK_DONE
|
||||
* if all text boundaries have been returned.
|
||||
* @see ubrk_next
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UTextOffset
|
||||
ubrk_previous(UBreakIterator *bi);
|
||||
@ -250,6 +266,7 @@ ubrk_previous(UBreakIterator *bi);
|
||||
* @param bi The break iterator to use.
|
||||
* @return The character index of the first character in the text being scanned.
|
||||
* @see ubrk_last
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UTextOffset
|
||||
ubrk_first(UBreakIterator *bi);
|
||||
@ -262,6 +279,7 @@ ubrk_first(UBreakIterator *bi);
|
||||
* @return The character offset immediately <EM>beyond</EM> the last character in the
|
||||
* text being scanned.
|
||||
* @see ubrk_first
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UTextOffset
|
||||
ubrk_last(UBreakIterator *bi);
|
||||
@ -273,6 +291,7 @@ ubrk_last(UBreakIterator *bi);
|
||||
* @param offset The offset to begin scanning.
|
||||
* @return The text boundary preceding offset, or UBRK_DONE.
|
||||
* @see ubrk_following
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UTextOffset
|
||||
ubrk_preceding(UBreakIterator *bi,
|
||||
@ -285,6 +304,7 @@ ubrk_preceding(UBreakIterator *bi,
|
||||
* @param offset The offset to begin scanning.
|
||||
* @return The text boundary following offset, or UBRK_DONE.
|
||||
* @see ubrk_preceding
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UTextOffset
|
||||
ubrk_following(UBreakIterator *bi,
|
||||
@ -297,6 +317,7 @@ ubrk_following(UBreakIterator *bi,
|
||||
* @param index The index of the desired locale.
|
||||
* @return A locale for which number text breaking information is available, or 0 if none.
|
||||
* @see ubrk_countAvailable
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI const char*
|
||||
ubrk_getAvailable(int32_t index);
|
||||
@ -307,6 +328,7 @@ ubrk_getAvailable(int32_t index);
|
||||
* calls to \Ref{ubrk_getAvailable}.
|
||||
* @return The number of locales for which text breaking information is available.
|
||||
* @see ubrk_getAvailable
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI int32_t
|
||||
ubrk_countAvailable(void);
|
||||
|
@ -1023,6 +1023,7 @@ AllocateTextBoundary();
|
||||
|
||||
/* in addition to the other invariants, a line-break iterator should make sure that:
|
||||
it doesn't break around the non-breaking characters */
|
||||
e = ubrk_open(UBRK_LINE, "en_US", work, u_strlen(work), &status);
|
||||
errorCount=0;
|
||||
status=U_ZERO_ERROR;
|
||||
u_strcpy(noBreak, CharsToUCharArray("\\u00a0\\u2007\\u2011\\ufeff"));
|
||||
@ -1035,9 +1036,8 @@ AllocateTextBoundary();
|
||||
for (j = 0; j < u_strlen(noBreak); j++) {
|
||||
work[1] = noBreak[j];
|
||||
for (k = 0; k < u_strlen(s); k++) {
|
||||
work[2] = s[k];
|
||||
|
||||
e = ubrk_open(UBRK_LINE, "en_US", work, u_strlen(work), &status);
|
||||
work[2] = s[k];
|
||||
ubrk_setText(e, work, u_strlen(work), &status);
|
||||
if(U_FAILURE(status)){
|
||||
log_err("FAIL: Error in opening the word break Iterator in testLineInvaiants:\n %s\n", myErrorName(status));
|
||||
return;
|
||||
@ -1530,7 +1530,8 @@ void doBreakInvariantTest(UBreakIteratorType type, UChar* testChars)
|
||||
|
||||
u_strcpy(breaks, CharsToUCharArray("\r\n\\u2029\\u2028"));
|
||||
|
||||
|
||||
tb = ubrk_open(type, "en_US", work, u_strlen(work), &status);
|
||||
|
||||
for (i = 0; i < u_strlen(breaks); i++) {
|
||||
work[1] = breaks[i];
|
||||
for (j = 0; j < u_strlen(testChars); j++) {
|
||||
@ -1545,7 +1546,7 @@ void doBreakInvariantTest(UBreakIteratorType type, UChar* testChars)
|
||||
continue;
|
||||
|
||||
work[2] = testChars[k];
|
||||
tb=ubrk_open(type, "en_US", work, u_strlen(work), &status);
|
||||
ubrk_setText(tb, work, u_strlen(work), &status);
|
||||
if(U_FAILURE(status)){
|
||||
log_err("ERROR in opening the breakIterator in doVariant Function: %s\n", myErrorName(status));
|
||||
}
|
||||
@ -1582,12 +1583,14 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
|
||||
|
||||
log_verbose("doOtherInvariantTest text of length: %d\n", u_strlen(testChars));
|
||||
|
||||
tb = ubrk_open(type, "en_us", work, u_strlen(work), &status);
|
||||
|
||||
/* a break should never occur between CR and LF */
|
||||
for (i = 0; i < u_strlen(testChars); i++) {
|
||||
work[0] = testChars[i];
|
||||
for (j = 0; j < u_strlen(testChars); j++) {
|
||||
work[3] = testChars[j];
|
||||
tb=ubrk_open(type, "en_US", work, u_strlen(work), &status);
|
||||
ubrk_setText(tb, work, u_strlen(work), &status);
|
||||
if(U_FAILURE(status)){
|
||||
log_err("ERROR in opening the breakIterator in doVariant Function: %s\n", myErrorName(status));
|
||||
}
|
||||
@ -1601,7 +1604,7 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
|
||||
}
|
||||
}
|
||||
}
|
||||
ubrk_close(tb);
|
||||
|
||||
/* a break should never occur before a non-spacing mark, unless the preceding
|
||||
character is CR, LF, PS, or LS */
|
||||
u_uastrcpy(work,"aaaa");
|
||||
@ -1616,7 +1619,7 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
|
||||
(u_charType(c) != U_ENCLOSING_MARK))
|
||||
continue;
|
||||
work[2] = c;
|
||||
tb=ubrk_open(type, "en_US", work, u_strlen(work), &status);
|
||||
ubrk_setText(tb, work, u_strlen(work), &status);
|
||||
if(U_FAILURE(status)){
|
||||
log_err("ERROR in opening the breakIterator in doOtherVariant Function %s\n", myErrorName(status));
|
||||
}
|
||||
@ -1630,6 +1633,7 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
|
||||
}
|
||||
}
|
||||
}
|
||||
ubrk_close(tb);
|
||||
}
|
||||
|
||||
void sample(UBreakIterator* tb, UChar* text)
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -32,121 +32,59 @@ public:
|
||||
~IntlTestTextBoundary();
|
||||
|
||||
void runIndexedTest( int32_t index, bool_t exec, char* &name, char* par = NULL );
|
||||
|
||||
/**
|
||||
* Test sentence break using doForwardSelectionTest
|
||||
* Test sentence break using generalIteratorTest()
|
||||
**/
|
||||
void TestForwardSentenceSelection(void);
|
||||
void TestSentenceIteration(void);
|
||||
/**
|
||||
* Test sentence break using doFirstSelectionTest
|
||||
* Test word break using generalIteratorTest()
|
||||
**/
|
||||
void TestFirstSentenceSelection(void);
|
||||
void TestWordIteration(void);
|
||||
/**
|
||||
* Test sentence break using doLastSelectionTest
|
||||
* Test line break using generalIteratorTest()
|
||||
**/
|
||||
void TestLineIteration(void);
|
||||
/**
|
||||
* Test character break using generalIteratorTest()
|
||||
**/
|
||||
void TestLastSentenceSelection(void);
|
||||
void TestCharacterIteration(void);
|
||||
/**
|
||||
* Test sentence break using doBackwardSelectionTest
|
||||
* Test sentence break using ()
|
||||
**/
|
||||
void TestBackwardSentenceSelection(void);
|
||||
/**
|
||||
* Test sentence break using doForwardIndexSelectionTest
|
||||
void TestSentenceInvariants(void);
|
||||
/**
|
||||
* Test sentence break Invariants using generalIteratorTest()
|
||||
**/
|
||||
void TestWordInvariants(void);
|
||||
/**
|
||||
* Test sentence break Invariants using generalIteratorTest()
|
||||
**/
|
||||
void TestForwardSentenceIndexSelection(void);
|
||||
/**
|
||||
* Test sentence break using doBackwardIndexSelectionTest
|
||||
void TestLineInvariants(void);
|
||||
/**
|
||||
* Test sentence break Invariants using generalIteratorTest()
|
||||
**/
|
||||
void TestBackwardSentenceIndexSelection(void);
|
||||
/**
|
||||
* Test sentence break using doMultipleSelectionTest
|
||||
void TestCharacterInvariants(void);
|
||||
/**
|
||||
* Test Japanese line break Invariants using generalIteratorTest()
|
||||
**/
|
||||
void TestSentenceMultipleSelection(void);
|
||||
/**
|
||||
* Test word break using doForwardSelectionTest
|
||||
void TestJapaneseLineBreak(void);
|
||||
/**
|
||||
* Test Thai line break using generalIteratorTest()
|
||||
**/
|
||||
void TestForwardWordSelection(void);
|
||||
/**
|
||||
* Test word break using doFirstSelectionTest
|
||||
void TestThaiLineBreak(void);
|
||||
/**
|
||||
* Test mixed Thai (Thai with other languages such as English) line break using generalIteratorTest()
|
||||
**/
|
||||
void TestFirstWordSelection(void);
|
||||
void TestMixedThaiLineBreak(void);
|
||||
/**
|
||||
* Test word break using doLastSelectionTest
|
||||
* Test Thai Line break with Maiyamok using generalIteratorTest()
|
||||
* The Thai maiyamok character is a shorthand symbol that means "repeat the previous
|
||||
* word". Instead of appearing as a word unto itself, however, it's kept together
|
||||
* with the word before it
|
||||
**/
|
||||
void TestLastWordSelection(void);
|
||||
void TestMaiyamok(void);
|
||||
/**
|
||||
* Test word break using doBackwardSelectionTest
|
||||
**/
|
||||
void TestBackwardWordSelection(void);
|
||||
/**
|
||||
* Test word break using doForwardIndexSelectionTest
|
||||
**/
|
||||
void TestForwardWordIndexSelection(void);
|
||||
/**
|
||||
* Test word break using doBackwardIndexSelectionTest
|
||||
**/
|
||||
void TestBackwardWordIndexSelection(void);
|
||||
/**
|
||||
* Test word break using doMultipleSelectionTest
|
||||
**/
|
||||
void TestWordMultipleSelection(void);
|
||||
/**
|
||||
* Test line break using doLastSelectionTest
|
||||
**/
|
||||
void TestForwardLineSelection(void);
|
||||
/**
|
||||
* Test line break using doFirstSelectionTest
|
||||
**/
|
||||
void TestFirstLineSelection(void);
|
||||
/**
|
||||
* Test line break using doLastSelectionTest
|
||||
**/
|
||||
void TestLastLineSelection(void);
|
||||
/**
|
||||
* Test line break using doBackwardSelectionTest
|
||||
**/
|
||||
void TestBackwardLineSelection(void);
|
||||
/**
|
||||
* Test line break using doForwardIndexSelectionTest
|
||||
**/
|
||||
void TestForwardLineIndexSelection(void);
|
||||
/**
|
||||
* Test line break using doBackwardIndexSelectionTest
|
||||
**/
|
||||
void TestBackwardLineIndexSelection(void);
|
||||
/**
|
||||
* Test line break using doMultipleSelectionTest
|
||||
**/
|
||||
void TestLineMultipleSelection(void);
|
||||
/**
|
||||
* Test word break using doForwardIndexSelectionTest
|
||||
**/
|
||||
void TestForwardCharacterSelection(void);
|
||||
/**
|
||||
* Test character break using doFirstSelectionTest
|
||||
**/
|
||||
void TestFirstCharacterSelection(void);
|
||||
/**
|
||||
* Test character break using doLastSelectionTest
|
||||
**/
|
||||
void TestLastCharacterSelection(void);
|
||||
/**
|
||||
* Test character break using doBackwardSelectionTest
|
||||
**/
|
||||
void TestBackwardCharacterSelection(void);
|
||||
/**
|
||||
* Test character break using doForwardIndexSelectionTest
|
||||
**/
|
||||
void TestForwardCharacterIndexSelection(void);
|
||||
/**
|
||||
* Test character break using doBackwardIndexSelectionTest
|
||||
**/
|
||||
void TestBackwardCharacterIndexSelection(void);
|
||||
/**
|
||||
* Test character break using doMultipleSelectionTest
|
||||
**/
|
||||
void TestCharacterMultipleSelection(void);
|
||||
/**
|
||||
* test behaviour of BrakIteraor on an empty string
|
||||
* test behaviour of BreakIterator on an empty string
|
||||
**/
|
||||
void TestEmptyString(void);
|
||||
/**
|
||||
@ -162,20 +100,14 @@ public:
|
||||
**/
|
||||
void TestPreceding(void);
|
||||
|
||||
void TestJapaneseLineBreak(void);
|
||||
|
||||
void TestBug4153072(void);
|
||||
|
||||
void TestEndBehavior(void);
|
||||
|
||||
void TestSentenceInvariants(void);
|
||||
|
||||
void TestWordInvariants(void);
|
||||
/**
|
||||
* Test End Behaviour
|
||||
* @bug 4068137
|
||||
**/
|
||||
void TestEndBehaviour(void);
|
||||
|
||||
void TestLineInvariants(void);
|
||||
|
||||
void TestCharacterInvariants(void);
|
||||
|
||||
/***********************/
|
||||
private:
|
||||
/**
|
||||
* internal methods to prepare test data
|
||||
@ -184,62 +116,68 @@ private:
|
||||
void addTestSentenceData(void);
|
||||
void addTestLineData(void);
|
||||
void addTestCharacterData(void);
|
||||
|
||||
UnicodeString createTestData(Enumeration* e);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Perform tests of BreakIterator forward functionality
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
* Perform tests of BreakIterator forward and backward functionality
|
||||
* on different kinds of iterators (word, sentence, line and character).
|
||||
* It tests the methods first(), next(), current(), preceding(), following()
|
||||
* previous() and isBoundary().
|
||||
* It makes use of internal functions to achieve this.
|
||||
**/
|
||||
void doForwardSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
|
||||
void generalIteratorTest(BreakIterator& bi, Vector* expectedResult);
|
||||
/**
|
||||
* Perform tests of BreakIterator backward functionality
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void doBackwardSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
|
||||
* Internal method to perform iteration and test the first() and next() functions
|
||||
**/
|
||||
Vector* testFirstAndNext(BreakIterator& bi, UnicodeString& text);
|
||||
/**
|
||||
* Perform tests of BreakIterator first selection functionality
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void doFirstSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
|
||||
* Internal method to perform iteration and test the last() and previous() functions
|
||||
**/
|
||||
Vector* testLastAndPrevious(BreakIterator& bi, UnicodeString& text);
|
||||
/**
|
||||
* Internal method to perform iteration and test the following() function
|
||||
**/
|
||||
void testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
|
||||
/**
|
||||
* Perform tests of BreakIterator last selection functionality
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void doLastSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
|
||||
* Internal method to perform iteration and test the preceding() function
|
||||
**/
|
||||
void testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
|
||||
/**
|
||||
* Internal method to perform iteration and test the isBoundary() function
|
||||
**/
|
||||
void testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
|
||||
/**
|
||||
* Internal method which does the comparison of expected and actual results.
|
||||
**/
|
||||
void compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2);
|
||||
/**
|
||||
* Perform tests of BreakIterator forward index functionality
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void doForwardIndexSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
|
||||
/**
|
||||
* Perform tests of BreakIterator backward index functionality
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void doBackwardIndexSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
|
||||
/**
|
||||
* Perform tests of BreakIterator multiple selection functionality
|
||||
* Internal method to perform tests of BreakIterator multiple selection functionality
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void doMultipleSelectionTest(BreakIterator& iterator, UnicodeString& testText);
|
||||
/**
|
||||
* Perform tests with short sample code
|
||||
* Internal method to perform tests of BreakIterator break Invariants
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void sample(BreakIterator& tb, UnicodeString& text, UnicodeString& title);
|
||||
|
||||
void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars);
|
||||
|
||||
/**
|
||||
* Internal method to perform tests of BreakIterator other invariants
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars);
|
||||
|
||||
/**
|
||||
* Perform tests with short sample code
|
||||
**/
|
||||
void sample(BreakIterator& tb, UnicodeString& text, UnicodeString& title);
|
||||
/**
|
||||
* The vectors holding test data for testing
|
||||
* different kinds of iterators( word, sentence, line and character)
|
||||
**/
|
||||
Vector* lineSelectionData;
|
||||
UnicodeString testLineText;
|
||||
Vector* sentenceSelectionData;
|
||||
UnicodeString testSentenceText;
|
||||
Vector* wordSelectionData;
|
||||
UnicodeString testWordText;
|
||||
Vector* characterSelectionData;
|
||||
UnicodeString testCharacterText;
|
||||
|
||||
static const UChar cannedTestArray[];
|
||||
static UnicodeString *cannedTestChars;
|
||||
};
|
||||
|
@ -11,7 +11,7 @@
|
||||
!IF "$(CFG)" == ""
|
||||
CFG=Debug
|
||||
!MESSAGE No configuration specified. Defaulting to common - Win32 Debug.
|
||||
!ENDIF
|
||||
!ENDIF
|
||||
|
||||
!IF [cl.exe]
|
||||
!MESSAGE Could not find build tools!
|
||||
@ -24,7 +24,7 @@ CFG=Debug
|
||||
#Let's see if user has given us a path to ICU
|
||||
#This could be found according to the path to makefile, but for now it is this way
|
||||
!MESSAGE ICUP=$(ICUP)
|
||||
!IF "$(ICUP)"==""
|
||||
!IF "$(ICUP)"==""
|
||||
!ERROR Can't find path!
|
||||
!ELSE
|
||||
ICUDATA=$(ICUP)\icu\data
|
||||
@ -38,47 +38,47 @@ LINK32 = link.exe
|
||||
LINK32_FLAGS = /out:"$(ICUDATA)/icudata.dll" /DLL /NOENTRY /base:"0x4ad00000" /comment:" Copyright (C) 1999 International Business Machines Corporation and others. All Rights Reserved. "
|
||||
CPP_FLAGS = /I$(ICUP)\icu\include /GD /c
|
||||
|
||||
#Here we test if configuration is given
|
||||
#Here we test if configuration is given
|
||||
!IF "$(CFG)" != "Release" && "$(CFG)" != "release" && "$(CFG)" != "Debug" && "$(CFG)" != "debug"
|
||||
!MESSAGE Invalid configuration "$(CFG)" specified.
|
||||
!MESSAGE You can specify a configuration when running NMAKE
|
||||
!MESSAGE by defining the macro CFG on the command line. For example:
|
||||
!MESSAGE
|
||||
!MESSAGE
|
||||
!MESSAGE NMAKE /f "makedata.mak" CFG="Debug"
|
||||
!MESSAGE
|
||||
!MESSAGE
|
||||
!MESSAGE Possible choices for configuration are:
|
||||
!MESSAGE
|
||||
!MESSAGE
|
||||
!MESSAGE "Release"
|
||||
!MESSAGE "Debug"
|
||||
!MESSAGE
|
||||
!MESSAGE
|
||||
!ERROR An invalid configuration is specified.
|
||||
!ENDIF
|
||||
!ENDIF
|
||||
|
||||
# This appears in the original Microsoft makefiles
|
||||
!IF "$(OS)" == "Windows_NT"
|
||||
NULL=
|
||||
!ELSE
|
||||
!ELSE
|
||||
NULL=nul
|
||||
!ENDIF
|
||||
!ENDIF
|
||||
|
||||
PATH = $(PATH);$(ICUP)\icu\bin\$(CFG)
|
||||
|
||||
# Suffixes for data files
|
||||
.SUFFIXES : .ucm .cnv .dll .dat .col .res .txt .c
|
||||
|
||||
# We're including a list of ucm files. There are two lists, one is essential 'ucmfiles.mk' and
|
||||
# We're including a list of ucm files. There are two lists, one is essential 'ucmfiles.mk' and
|
||||
# the other is optional 'ucmlocal.mk'
|
||||
!IF EXISTS("$(ICUTOOLS)\makeconv\ucmfiles.mk")
|
||||
!INCLUDE "$(ICUTOOLS)\makeconv\ucmfiles.mk"
|
||||
!IF EXISTS("$(ICUTOOLS)\makeconv\ucmlocal.mk")
|
||||
!INCLUDE "$(ICUTOOLS)\makeconv\ucmlocal.mk"
|
||||
$(UCM_SOURCE)=$(UCM_SOURCE) $(UCM_SOURCE_LOCAL)
|
||||
!ELSE
|
||||
!ELSE
|
||||
#!MESSAGE Warning: cannot find "ucmlocal.mk"
|
||||
!ENDIF
|
||||
!ELSE
|
||||
!ELSE
|
||||
!ERROR ERROR: cannot find "ucmfiles.mk"
|
||||
!ENDIF
|
||||
!ENDIF
|
||||
|
||||
# According to the read files, we will generate CNV and C files
|
||||
CNV_FILES=$(UCM_SOURCE:.ucm=.cnv)
|
||||
@ -91,12 +91,12 @@ OBJ_CNV_FILES = $(C_CNV_FILES:.c=.obj)
|
||||
!IF EXISTS("$(ICUTOOLS)\genrb\genrblocal.mk")
|
||||
!INCLUDE "$(ICUTOOLS)\genrb\genrblocal.mk"
|
||||
GENRB_SOURCE=$(GENRB_SOURCE) $(GENRB_SOURCE_LOCAL)
|
||||
!ELSE
|
||||
!ELSE
|
||||
#!MESSAGE Warning: cannot find "genrblocal.mk"
|
||||
!ENDIF
|
||||
!ELSE
|
||||
!ELSE
|
||||
!ERROR ERROR: cannot find "genrbfiles.mk"
|
||||
!ENDIF
|
||||
!ENDIF
|
||||
RB_FILES = $(GENRB_SOURCE:.txt=.res)
|
||||
|
||||
# Read list of resource bundle files for collation
|
||||
@ -105,48 +105,81 @@ RB_FILES = $(GENRB_SOURCE:.txt=.res)
|
||||
!IF EXISTS("$(ICUTOOLS)\gencol\gencollocal.mk")
|
||||
!INCLUDE "$(ICUTOOLS)\gencol\gencollocal.mk"
|
||||
GENCOL_SOURCE=$(GENCOL_SOURCE) $(GENCOL_SOURCE_LOCAL)
|
||||
!ELSE
|
||||
!ELSE
|
||||
#!MESSAGE Warning: cannot find "gencollocal.mk"
|
||||
!ENDIF
|
||||
!ELSE
|
||||
!ELSE
|
||||
!ERROR ERROR: cannot find "gencolfiles.mk"
|
||||
!ENDIF
|
||||
!ENDIF
|
||||
COL_FILES = $(GENCOL_SOURCE:.txt=.col)
|
||||
|
||||
|
||||
# This target should build all the data files
|
||||
ALL : GODATA $(RB_FILES) $(CNV_FILES) $(COL_FILES) icudata.dll icudata.dat GOBACK
|
||||
@echo All targets are up to date
|
||||
|
||||
CPP_SOURCES = $(C_CNV_FILES) unames_dat.c cnvalias_dat.c tz_dat.c
|
||||
|
||||
BRK_FILES = sent.brk char.brk line.brk word.brk line_th.brk word_th.brk
|
||||
BRK_CSOURCES = $(BRK_FILES:.brk=_brk.c)
|
||||
|
||||
CPP_SOURCES = $(C_CNV_FILES) unames_dat.c cnvalias_dat.c tz_dat.c $(BRK_CSOURCES)
|
||||
LINK32_OBJS = $(CPP_SOURCES:.c=.obj)
|
||||
|
||||
# target for DLL
|
||||
icudata.dll : $(LINK32_OBJS) $(CNV_FILES)
|
||||
@echo Creating DLL file
|
||||
@echo Creating DLL file
|
||||
@cd $(ICUDATA)
|
||||
@$(LINK32) @<<
|
||||
$(LINK32_FLAGS) $(LINK32_OBJS)
|
||||
<<
|
||||
|
||||
$(ICUDATA)\sent.brk : $(ICUDATA)\sentLE.brk
|
||||
copy $(ICUDATA)\sentLE.brk $(ICUDATA)\sent.brk
|
||||
|
||||
$(ICUDATA)\char.brk : $(ICUDATA)\charLE.brk
|
||||
copy $(ICUDATA)\charLE.brk $(ICUDATA)\char.brk
|
||||
|
||||
$(ICUDATA)\line.brk : $(ICUDATA)\lineLE.brk
|
||||
copy $(ICUDATA)\lineLE.brk $(ICUDATA)\line.brk
|
||||
|
||||
$(ICUDATA)\word.brk : $(ICUDATA)\wordLE.brk
|
||||
copy $(ICUDATA)\wordLE.brk $(ICUDATA)\word.brk
|
||||
|
||||
$(ICUDATA)\line_th.brk : $(ICUDATA)\line_thLE.brk
|
||||
copy $(ICUDATA)\line_thLE.brk $(ICUDATA)\line_th.brk
|
||||
|
||||
$(ICUDATA)\word_th.brk : $(ICUDATA)\word_thLE.brk
|
||||
copy $(ICUDATA)\word_thLE.brk $(ICUDATA)\word_th.brk
|
||||
|
||||
# target for memory mapped file
|
||||
icudata.dat : $(CNV_FILES) unames.dat cnvalias.dat tz.dat
|
||||
icudata.dat : $(CNV_FILES) unames.dat cnvalias.dat tz.dat
|
||||
@echo Creating memory-mapped file
|
||||
@cd $(ICUDATA)
|
||||
@$(ICUTOOLS)\gencmn\$(CFG)\gencmn 1000000 <<
|
||||
$(ICUDATA)\unames.dat
|
||||
$(ICUDATA)\cnvalias.dat
|
||||
$(ICUDATA)\tz.dat
|
||||
$(ICUDATA)\sent.brk
|
||||
$(ICUDATA)\char.brk
|
||||
$(ICUDATA)\line.brk
|
||||
$(ICUDATA)\word.brk
|
||||
$(ICUDATA)\line_th.brk
|
||||
$(ICUDATA)\word_th.brk
|
||||
$(CNV_FILES:.cnv =.cnv
|
||||
)
|
||||
<<
|
||||
|
||||
# nothing works without this target, but we're making
|
||||
# nothing works without this target, but we're making
|
||||
# these files while creating converters
|
||||
$(C_CNV_FILES) : $(CNV_FILES)
|
||||
@$(ICUTOOLS)\genccode\$(CFG)\genccode $(CNV_FILES)
|
||||
|
||||
# nothing works without this target, but we're making
|
||||
# these files while creating converters
|
||||
$(BRK_CSOURCES) : $(BRK_FILES)
|
||||
@$(ICUTOOLS)\genccode\$(CFG)\genccode $(BRK_FILES)
|
||||
|
||||
# utility to send us to the right dir
|
||||
GODATA :
|
||||
GODATA :
|
||||
@cd $(ICUDATA)
|
||||
|
||||
# utility to get us back to the right dir
|
||||
@ -164,8 +197,15 @@ CLEAN :
|
||||
-@erase "cnvalias*.*"
|
||||
-@erase "tz*.*"
|
||||
-@erase "ibm*_cnv.c"
|
||||
-@erase "*_brk.c"
|
||||
-@erase "icudata.*"
|
||||
-@erase "*.obj"
|
||||
-@erase "sent.brk"
|
||||
-@erase "char.brk"
|
||||
-@erase "line.brk"
|
||||
-@erase "word.brk"
|
||||
-@erase "line_th.brk"
|
||||
-@erase "word_th.brk"
|
||||
@cd $(TEST)
|
||||
-@erase "*.res"
|
||||
@cd $(ICUTOOLS)
|
||||
@ -184,7 +224,7 @@ CLEAN :
|
||||
@$(ICUTOOLS)\makeconv\$(CFG)\makeconv $<
|
||||
# @$(ICUTOOLS)\genccode\$(CFG)\genccode $(CNV_FILES)
|
||||
|
||||
# Inference rule for creating collation files -
|
||||
# Inference rule for creating collation files -
|
||||
# this should be integrated in genrb
|
||||
.txt.col::
|
||||
@echo Making Collation files
|
||||
@ -203,7 +243,7 @@ unames.dat : UnicodeData-3.0.0.txt
|
||||
@echo Creating data file for Unicode Names
|
||||
@$(ICUTOOLS)\gennames\$(CFG)\gennames -v- -c- UnicodeData-3.0.0.txt
|
||||
|
||||
unames_dat.c : unames.dat
|
||||
unames_dat.c : unames.dat
|
||||
@echo Creating C source file for Unicode Names
|
||||
@$(ICUTOOLS)\genccode\$(CFG)\genccode $(ICUDATA)\$?
|
||||
|
||||
@ -211,8 +251,8 @@ unames_dat.c : unames.dat
|
||||
cnvalias.dat : convrtrs.txt
|
||||
@echo Creating data file for Converter Aliases
|
||||
@$(ICUTOOLS)\gencnval\$(CFG)\gencnval -c-
|
||||
|
||||
cnvalias_dat.c : cnvalias.dat
|
||||
|
||||
cnvalias_dat.c : cnvalias.dat
|
||||
@echo Creating C source file for Converter Aliases
|
||||
@$(ICUTOOLS)\genccode\$(CFG)\genccode $(ICUDATA)\$?
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user