ICU-45 Initial check-in of RuleBasedBreakIterator and DictionaryBasedBreakIterator.

X-SVN-Rev: 502
This commit is contained in:
Richard Gillam 2000-01-08 02:05:05 +00:00
parent bbccafffa4
commit 016aa963f6
27 changed files with 3823 additions and 1795 deletions

View File

@ -190,6 +190,15 @@ StringCharacterIterator::getIndex() const
return pos;
}
/**
 * Re-targets this iterator at newText: the string is assigned to the
 * iterator's text member, the iteration range is set to [0, length of
 * newText), and the current position is moved to the start of that range.
 */
void
StringCharacterIterator::setText(const UnicodeString& newText)
{
    text = newText;

    // the range always covers the complete new string
    end = newText.length();
    begin = 0;

    // iteration restarts from the first character
    pos = begin;
}
void
StringCharacterIterator::getText(UnicodeString& result)
{

View File

@ -142,6 +142,15 @@ UCharCharacterIterator::getIndex() const
return pos;
}
/**
 * Re-targets this iterator at a new UChar buffer. Only the pointer is
 * stored here; the iteration range is set to [0, newTextLength) and the
 * current position is moved to the start of that range.
 */
void UCharCharacterIterator::setText(const UChar* newText,
                                     int32_t newTextLength)
{
    text = newText;

    // the range always covers the complete new buffer
    end = newTextLength;
    begin = 0;

    // iteration restarts from the first character
    pos = begin;
}
void
UCharCharacterIterator::getText(UnicodeString& result)
{

View File

@ -139,6 +139,11 @@ public:
* returned by current()). */
virtual UTextOffset getIndex(void) const;
/**
* Sets the iterator to iterate over the provided string.
*/
virtual void setText(const UnicodeString& newText);
/**
* Copies the UnicodeString under iteration into the UnicodeString
* referred to by "result". Even if this iterator iterates across

View File

@ -108,6 +108,12 @@ public:
* returned by current()). */
virtual UTextOffset getIndex(void) const;
/**
* Sets the iterator to iterate over a new range of text
*/
virtual void setText(const UChar* newText,
int32_t newTextLength);
/**
* Copies the UnicodeString under iteration into the UnicodeString
* referred to by "result". Even if this iterator iterates across

View File

@ -49,7 +49,7 @@ UVector::~UVector() {
}
void UVector::addElement(void* obj) {
if (ensureCapacity(count+1)) {
if (ensureCapacity(count + 1)) {
elements[count++] = obj;
}
}
@ -66,7 +66,7 @@ void UVector::setElementAt(void* obj, int32_t index) {
void UVector::insertElementAt(void* obj, int32_t index) {
// must have 0 <= index <= count
if (0 <= index && index <= count && ensureCapacity(count)) {
if (0 <= index && index <= count && ensureCapacity(count + 1)) {
for (int32_t i=count; i>index; --i) {
elements[i] = elements[i-1];
}

View File

@ -227,15 +227,6 @@ inline void* UVector::operator[](int32_t index) const {
return elementAt(index);
}
// Dummy implementation - disallowed method
inline UVector::UVector(const UVector&) {}
// Dummy implementation - disallowed method
inline UVector& UVector::operator=(const UVector&) {
return *this;
}
// UStack inlines
inline bool_t UStack::empty(void) const {
@ -251,12 +242,4 @@ inline void* UStack::push(void* obj) {
return obj;
}
// Dummy implementation - disallowed method
inline UStack::UStack(const UStack&) {}
// Dummy implementation - disallowed method
inline UStack& UStack::operator=(const UStack&) {
return *this;
}
#endif

View File

@ -17,9 +17,10 @@
// This file was generated from the java source file BreakIterator.java
// *****************************************************************************
#include "unicode/utypes.h"
#include "dbbi.h"
#include "unicode/brkiter.h"
#include "simtxbd.h"
#include "unicode/udata.h"
#include "resbund.h"
#include <string.h>
@ -38,7 +39,41 @@ const UTextOffset BreakIterator::DONE = (int32_t)-1;
/**
 * Creates a word-break iterator for the given locale.
 *
 * @param key  The locale to create the iterator for. Only its language is
 *             examined: "th" selects the Thai rules plus the Thai dictionary,
 *             everything else gets the default word rules.
 * @return A newly allocated iterator, or NULL if the rules data could not be
 *         opened.
 */
BreakIterator*
BreakIterator::createWordInstance(const Locale& key)
{
    // WARNING: This routine is currently written specifically to handle only the
    // default rules files and the alternate rules files for Thai.  This function
    // will have to be made fully general at some time in the future!
    const char* filename = "word";

    // work out once whether we are building the Thai iterator
    UnicodeString temp;
    bool_t isThai = (key.getLanguage(temp) == UnicodeString("th", (char*)0));
    if (isThai) {
        filename = "word_th";
    }

    UErrorCode err = U_ZERO_ERROR;
    UDataMemory* file = udata_open(NULL, "brk", filename, &err);
    if (U_FAILURE(err)) {
        return NULL;
    }

    const void* image = udata_getMemory(file);
    if (image == NULL) {
        // nothing will ever refer to this data, so don't leak the handle
        udata_close(file);
        return NULL;
    }

    // NOTE(review): from here on "file" is deliberately left open, because the
    // iterator is handed a pointer into its memory -- confirm who closes it.
    BreakIterator* result = NULL;
    if (isThai) {
        // the dictionary lives next to the other data files: build
        // <data directory> + "thaidict.brk"
        const char* dataDir = u_getDataDirectory();
        const char* dictionaryName = "thaidict.brk";
        char* fullPath = new char[strlen(dataDir) + strlen(dictionaryName) + 1];
        strcpy(fullPath, dataDir);
        strcat(fullPath, dictionaryName);   // append -- strcpy here would discard dataDir
        result = new DictionaryBasedBreakIterator(image, fullPath);
        delete [] fullPath;
    }
    else {
        result = new RuleBasedBreakIterator(image);
    }
    return result;
}
// -------------------------------------
@ -47,7 +82,41 @@ BreakIterator::createWordInstance(const Locale& key)
/**
 * Creates a line-break iterator for the given locale.
 *
 * @param key  The locale to create the iterator for. Only its language is
 *             examined: "th" selects the Thai rules plus the Thai dictionary,
 *             everything else gets the default line rules.
 * @return A newly allocated iterator, or NULL if the rules data could not be
 *         opened.
 */
BreakIterator*
BreakIterator::createLineInstance(const Locale& key)
{
    // WARNING: This routine is currently written specifically to handle only the
    // default rules files and the alternate rules files for Thai.  This function
    // will have to be made fully general at some time in the future!
    const char* filename = "line";

    // work out once whether we are building the Thai iterator
    UnicodeString temp;
    bool_t isThai = (key.getLanguage(temp) == UnicodeString("th", (char*)0));
    if (isThai) {
        filename = "line_th";
    }

    UErrorCode err = U_ZERO_ERROR;
    UDataMemory* file = udata_open(NULL, "brk", filename, &err);
    if (U_FAILURE(err)) {
        return NULL;
    }

    const void* image = udata_getMemory(file);
    if (image == NULL) {
        // nothing will ever refer to this data, so don't leak the handle
        udata_close(file);
        return NULL;
    }

    // NOTE(review): from here on "file" is deliberately left open, because the
    // iterator is handed a pointer into its memory -- confirm who closes it.
    BreakIterator* result = NULL;
    if (isThai) {
        // the dictionary lives next to the other data files: build
        // <data directory> + "thaidict.brk"
        const char* dataDir = u_getDataDirectory();
        const char* dictionaryName = "thaidict.brk";
        char* fullPath = new char[strlen(dataDir) + strlen(dictionaryName) + 1];
        strcpy(fullPath, dataDir);
        strcat(fullPath, dictionaryName);
        result = new DictionaryBasedBreakIterator(image, fullPath);
        delete [] fullPath;
    }
    else {
        result = new RuleBasedBreakIterator(image);
    }
    return result;
}
// -------------------------------------
@ -56,7 +125,24 @@ BreakIterator::createLineInstance(const Locale& key)
/**
 * Creates a character-break iterator.
 *
 * @param key  The requested locale. It is currently not consulted: the same
 *             "char" rules are used for every locale.
 * @return A newly allocated iterator, or NULL if the rules data could not be
 *         opened.
 */
BreakIterator*
BreakIterator::createCharacterInstance(const Locale& key)
{
    // WARNING: This routine currently always loads the default rules file.
    // This function will have to be made fully general at some time in the
    // future!
    const char* filename = "char";

    UErrorCode err = U_ZERO_ERROR;
    UDataMemory* file = udata_open(NULL, "brk", filename, &err);
    if (U_FAILURE(err)) {
        return NULL;
    }

    const void* image = udata_getMemory(file);
    if (image == NULL) {
        // nothing will ever refer to this data, so don't leak the handle
        udata_close(file);
        return NULL;
    }

    // NOTE(review): "file" is deliberately left open, because the iterator is
    // handed a pointer into its memory -- confirm who closes it.
    return new RuleBasedBreakIterator(image);
}
// -------------------------------------
@ -65,7 +151,24 @@ BreakIterator::createCharacterInstance(const Locale& key)
/**
 * Creates a sentence-break iterator.
 *
 * @param key  The requested locale. It is currently not consulted: the same
 *             "sent" rules are used for every locale.
 * @return A newly allocated iterator, or NULL if the rules data could not be
 *         opened.
 */
BreakIterator*
BreakIterator::createSentenceInstance(const Locale& key)
{
    // WARNING: This routine currently always loads the default rules file.
    // This function will have to be made fully general at some time in the
    // future!
    const char* filename = "sent";

    UErrorCode err = U_ZERO_ERROR;
    UDataMemory* file = udata_open(NULL, "brk", filename, &err);
    if (U_FAILURE(err)) {
        return NULL;
    }

    const void* image = udata_getMemory(file);
    if (image == NULL) {
        // nothing will ever refer to this data, so don't leak the handle
        udata_close(file);
        return NULL;
    }

    // NOTE(review): "file" is deliberately left open, because the iterator is
    // handed a pointer into its memory -- confirm who closes it.
    return new RuleBasedBreakIterator(image);
}
// -------------------------------------

439
icu4c/source/i18n/dbbi.cpp Normal file
View File

@ -0,0 +1,439 @@
/*
**********************************************************************
* Copyright (C) 1999 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
**********************************************************************
*/
#include "dbbi.h"
#include "dbbi_tbl.h"
#include "uvector.h"
char DictionaryBasedBreakIterator::fgClassID = 0;
//=======================================================================
// constructors
//=======================================================================
/**
 * Builds an iterator from a rules image and the name of a dictionary file.
 * The base class is constructed with a NULL image; the tables object that
 * is actually used is created here from both arguments and then has a
 * reference added on behalf of this iterator. The iterator starts out with
 * no cached break positions.
 *
 * @param tablesImage         The break-rules image, passed through to the
 *                            tables object.
 * @param dictionaryFilename  The dictionary file name, passed through to the
 *                            tables object.
 */
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(
        const void* tablesImage,
        char* dictionaryFilename)
    : RuleBasedBreakIterator((const void*)NULL),
      dictionaryCharCount(0),
      cachedBreakPositions(NULL),
      numCachedBreakPositions(0),
      positionInCache(0)
{
    // install the dictionary-aware tables and register our reference to them
    tables = new DictionaryBasedBreakIteratorTables(tablesImage, dictionaryFilename);
    tables->addReference();
}
//=======================================================================
// boilerplate
//=======================================================================
/**
 * Destructor. Releases the array of cached break positions (if any).
 */
DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
{
    // deleting a NULL pointer is a no-op, so no check is needed
    delete [] cachedBreakPositions;
}
/**
 * Assignment operator.  Discards this iterator's cached break positions and
 * then delegates to the RuleBasedBreakIterator assignment operator. The
 * cache of "that" is not copied.
 *
 * @param that  The iterator to copy from.
 * @return A reference to this iterator.
 */
DictionaryBasedBreakIterator&
DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that)
{
    // the cache belongs to the state we are about to replace
    reset();

    RuleBasedBreakIterator::operator=(that);
    return *this;
}
/**
 * Returns a newly-constructed DictionaryBasedBreakIterator with the same
 * behavior, and iterating over the same text, as this one.
 *
 * @return The copy. Its cache of break positions (if any) is an independent
 *         array holding the same values as this iterator's cache.
 */
BreakIterator*
DictionaryBasedBreakIterator::clone() const {
    DictionaryBasedBreakIterator* copy = new DictionaryBasedBreakIterator(*this);

    // The class declares no copy constructor, so the copy made above shares
    // our cachedBreakPositions pointer.  Both objects delete [] that array
    // (in reset() and in the destructor), so give the copy its own array.
    if (cachedBreakPositions != NULL) {
        copy->cachedBreakPositions = new int32_t[numCachedBreakPositions];
        for (int32_t i = 0; i < numCachedBreakPositions; i++) {
            copy->cachedBreakPositions[i] = cachedBreakPositions[i];
        }
    }
    return copy;
}
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
 * Moves the iterator back by one boundary.
 * @return The position of the last boundary position before the
 * current iteration position
 */
int32_t
DictionaryBasedBreakIterator::previous()
{
    // no usable cache entry behind us: dump the cache and let the inherited
    // previous() do the work.  That call may refill the cache, in which case
    // our position in the new cache has to be recorded.
    if (cachedBreakPositions == NULL || positionInCache <= 0) {
        reset();
        int32_t inherited = RuleBasedBreakIterator::previous();
        if (cachedBreakPositions != NULL) {
            positionInCache = numCachedBreakPositions - 2;
        }
        return inherited;
    }

    // still inside the cached range: step one entry backward in the cache
    --positionInCache;
    int32_t boundary = cachedBreakPositions[positionInCache];
    text->setIndex(boundary);
    return boundary;
}
/**
 * Moves the iterator to the last boundary position before the specified
 * position.
 * @param offset The position to begin searching from
 * @return The position of the last boundary before "offset"
 */
int32_t
DictionaryBasedBreakIterator::preceding(int32_t offset)
{
    // no text, or an offset past the end of the text: DONE
    if (text == NULL || offset > text->endIndex()) {
        return BreakIterator::DONE;
    }
    // an offset before the beginning: the text's starting offset
    if (offset < text->startIndex()) {
        return text->startIndex();
    }

    // "offset" lies inside the range covered by the cache: the answer is
    // the cache entry just before the first entry that is >= offset
    if (cachedBreakPositions != NULL
            && offset > cachedBreakPositions[0]
            && offset <= cachedBreakPositions[numCachedBreakPositions - 1]) {
        int32_t idx = 0;
        while (idx < numCachedBreakPositions
                && cachedBreakPositions[idx] < offset) {
            ++idx;
        }
        positionInCache = idx - 1;
        text->setIndex(cachedBreakPositions[positionInCache]);
        return text->getIndex();
    }

    // otherwise dump the cache and call the inherited routine (which will
    // eventually call other routines in this class that may refresh the
    // cache)
    reset();
    return RuleBasedBreakIterator::preceding(offset);
}
/**
 * Moves the iterator to the first boundary position after the specified
 * position.
 * @param offset The position to begin searching forward from
 * @return The position of the first boundary after "offset"
 */
int32_t
DictionaryBasedBreakIterator::following(int32_t offset)
{
    // no text, or an offset past the end of the text: DONE
    if (text == NULL || offset > text->endIndex()) {
        return BreakIterator::DONE;
    }
    // an offset before the beginning: the text's starting offset
    if (offset < text->startIndex()) {
        return text->startIndex();
    }

    // "offset" lies inside the range covered by the cache: the answer is
    // the first cache entry that is greater than offset
    if (cachedBreakPositions != NULL
            && offset >= cachedBreakPositions[0]
            && offset < cachedBreakPositions[numCachedBreakPositions - 1]) {
        int32_t idx = 0;
        while (idx < numCachedBreakPositions
                && cachedBreakPositions[idx] <= offset) {
            ++idx;
        }
        positionInCache = idx;
        text->setIndex(cachedBreakPositions[positionInCache]);
        return text->getIndex();
    }

    // otherwise dump the cache and call the inherited following() method.
    // This will call other methods in this class that may refresh the cache.
    reset();
    return RuleBasedBreakIterator::following(offset);
}
/**
 * This is the implementation function for next().
 * @return The position of the next boundary.
 */
int32_t
DictionaryBasedBreakIterator::handleNext()
{
    // with no cached break positions, or with the iterator sitting on the
    // last entry of the cache, a new range has to be scanned
    if (cachedBreakPositions == NULL || positionInCache == numCachedBreakPositions - 1) {
        // the inherited handleNext() finds a tentative return value;
        // dictionaryCharCount tells us how many dictionary characters were
        // passed over on the way there
        int32_t rangeStart = text->getIndex();
        dictionaryCharCount = 0;
        int32_t rangeEnd = RuleBasedBreakIterator::handleNext();

        // at most one dictionary character, or a range of at most one
        // character: the value from the inherited function is our return
        // value, and the cache can be dumped
        if (dictionaryCharCount <= 1 || rangeEnd - rangeStart <= 1) {
            reset();
            return rangeEnd;
        }

        // more than one dictionary character was passed over, so regenerate
        // the cached break positions for the new range
        divideUpDictionaryRange(rangeStart, rangeEnd);
    }

    if (cachedBreakPositions == NULL) {
        return -9999;   // SHOULD NEVER GET HERE!
    }

    // the cache has been regenerated (or existed all along): advance to the
    // next break position in the cache and return it
    ++positionInCache;
    int32_t boundary = cachedBreakPositions[positionInCache];
    text->setIndex(boundary);
    return boundary;
}
/**
 * Dumps the cache of break positions and returns all of the cache
 * bookkeeping to its initial state.
 */
void
DictionaryBasedBreakIterator::reset()
{
    // release the array first, then forget about it
    delete [] cachedBreakPositions;
    cachedBreakPositions = NULL;

    positionInCache = 0;
    numCachedBreakPositions = 0;
    dictionaryCharCount = 0;
}
/**
 * This is the function that actually implements the dictionary-based
 * algorithm.  Given the endpoints of a range of text, it uses the
 * dictionary to determine the positions of any boundaries in this
 * range.  It stores all the boundary positions it discovers in
 * cachedBreakPositions so that we only have to do this work once
 * for each time we enter the range.
 *
 * On return cachedBreakPositions[0] is startPos, the last entry is endPos,
 * and positionInCache is 0.  Any previously cached array is released.
 *
 * @param startPos  The beginning of the range to divide up.
 * @param endPos    The end of the range to divide up.
 */
void
DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos)
{
    // to avoid casts throughout the rest of this function
    DictionaryBasedBreakIteratorTables* tables
            = (DictionaryBasedBreakIteratorTables*)(this->tables);

    // the range we're dividing may begin or end with non-dictionary characters
    // (i.e., for line breaking, we may have leading or trailing punctuation
    // that needs to be kept with the word).  Seek from the beginning of the
    // range to the first dictionary character
    // NOTE(review): this loop has no bound of its own; it relies on the range
    // containing a dictionary character (handleNext() only calls this function
    // when more than one was counted).
    text->setIndex(startPos);
    UChar c = text->current();
    int category = tables->lookupCategory(c, this);
    while (category == IGNORE || !tables->categoryFlags[category]) {
        c = text->next();
        category = tables->lookupCategory(c, this);
    }

    // initialize.  We maintain two stacks: currentBreakPositions contains
    // the list of break positions that will be returned if we successfully
    // finish traversing the whole range now.  possibleBreakPositions lists
    // all other possible word ends we've passed along the way.  (Whenever
    // we reach an error [a sequence of characters that can't begin any word
    // in the dictionary], we back up, possibly delete some breaks from
    // currentBreakPositions, move a break from possibleBreakPositions
    // to currentBreakPositions, and start over from there.  This process
    // continues in this way until we either successfully make it all the way
    // across the range, or exhaust all of our combinations of break
    // positions.)  wrongBreakPositions is used to keep track of paths we've
    // tried on previous iterations.  As the iterator backs up further and
    // further, this saves us from having to follow each possible path
    // through the text all the way to the error (hopefully avoiding many
    // future recursive calls as well).
    // (The stacks hold text offsets stored in their void* element slots.)
    UStack currentBreakPositions;
    UStack possibleBreakPositions;
    UVector wrongBreakPositions;

    // the dictionary is implemented as a trie, which is treated as a state
    // machine.  -1 represents the end of a legal word.  Every word in the
    // dictionary is represented by a path from the root node to -1.  A path
    // that ends in state 0 is an illegal combination of characters.
    int16_t state = 0;

    // these two variables are used for error handling.  We keep track of the
    // farthest we've gotten through the range being divided, and the combination
    // of breaks that got us that far.  If we use up all possible break
    // combinations, the text contains an error or a word that's not in the
    // dictionary.  In this case, we "bless" the break positions that got us the
    // farthest as real break positions, and then start over from scratch with
    // the character where the error occurred.
    int32_t farthestEndPoint = text->getIndex();
    UStack bestBreakPositions;
    bool_t bestBreakPositionsInitialized = FALSE;

    // initialize (we always exit the loop with a break statement)
    c = text->current();
    while (true) {

        // if we can transition to state "-1" from our current state, we're
        // on the last character of a legal word.  Push that position onto
        // the possible-break-positions stack
        if (tables->dictionary.at(state, (int32_t)0) == -1) {
            possibleBreakPositions.push((void*)text->getIndex());
        }

        // look up the new state to transition to in the dictionary
        state = tables->dictionary.at(state, c);

        // if the character we're sitting on causes us to transition to
        // the "end of word" state, then it was a non-dictionary character
        // and we've successfully traversed the whole range.  Drop out
        // of the loop.
        if (state == -1) {
            currentBreakPositions.push((void*)text->getIndex());
            break;
        }

        // if the character we're sitting on causes us to transition to
        // the error state, or if we've gone off the end of the range
        // without transitioning to the "end of word" state, we've hit
        // an error...
        else if (state == 0 || text->getIndex() >= endPos) {

            // if this is the farthest we've gotten, take note of it in
            // case there's an error in the text
            if (text->getIndex() > farthestEndPoint) {
                farthestEndPoint = text->getIndex();
                bestBreakPositions.removeAllElements();
                bestBreakPositionsInitialized = TRUE;
                for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
                    bestBreakPositions.push(currentBreakPositions.elementAt(i));
                }
            }

            // wrongBreakPositions is a list of all break positions we've tried starting
            // that didn't allow us to traverse all the way through the text.  Every time
            // we pop a break position off of currentBreakPositions, we put it into
            // wrongBreakPositions to avoid trying it again later.  If we make it to this
            // spot, we're either going to back up to a break in possibleBreakPositions
            // and try starting over from there, or we've exhausted all possible break
            // positions and are going to do the fallback procedure.  This loop prevents
            // us from messing with anything in possibleBreakPositions that didn't work as
            // a starting point the last time we tried it (this is to prevent a bunch of
            // repetitive checks from slowing down some extreme cases)
            while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
                        possibleBreakPositions.peek())) {
                possibleBreakPositions.pop();
            }

            // if we've used up all possible break-position combinations, there's
            // an error or an unknown word in the text.  In this case, we start
            // over, treating the farthest character we've reached as the beginning
            // of the range, and "blessing" the break positions that got us that
            // far as real break positions
            if (possibleBreakPositions.isEmpty()) {
                if (bestBreakPositionsInitialized) {
                    currentBreakPositions.removeAllElements();
                    for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
                        currentBreakPositions.push(bestBreakPositions.elementAt(i));
                    }
                    bestBreakPositions.removeAllElements();
                    if (farthestEndPoint < endPos) {
                        text->setIndex(farthestEndPoint + 1);
                    }
                    else {
                        break;
                    }
                }
                else {
                    if ((currentBreakPositions.isEmpty()
                            || (int32_t)currentBreakPositions.peek() != text->getIndex())
                            && text->getIndex() != startPos) {
                        currentBreakPositions.push((void*)text->getIndex());
                    }
                    text->next();
                    currentBreakPositions.push((void*)text->getIndex());
                }
            }

            // if we still have more break positions we can try, then promote the
            // last break in possibleBreakPositions into currentBreakPositions,
            // and get rid of all entries in currentBreakPositions that come after
            // it.  Then back up to that position and start over from there (i.e.,
            // treat that position as the beginning of a new word)
            else {
                int32_t temp = (int32_t)possibleBreakPositions.pop();
                void* temp2 = NULL;
                while (!currentBreakPositions.isEmpty() && temp <
                            (int32_t)currentBreakPositions.peek()) {
                    temp2 = currentBreakPositions.pop();
                    wrongBreakPositions.addElement(temp2);
                }
                currentBreakPositions.push((void*)temp);
                text->setIndex((int32_t)currentBreakPositions.peek());
            }

            // re-sync "c" for the next go-round, and drop out of the loop if
            // we've made it off the end of the range
            c = text->current();
            if (text->getIndex() >= endPos) {
                break;
            }
        }

        // if we didn't hit any exceptional conditions on this last iteration,
        // just advance to the next character and loop
        else {
            c = text->next();
        }
    }

    // dump the last break position in the list, and replace it with the actual
    // end of the range (which may be the same character, or may be further on
    // because the range actually ended with non-dictionary characters we want to
    // keep with the word)
    if (!currentBreakPositions.isEmpty()) {
        currentBreakPositions.pop();
    }
    currentBreakPositions.push((void*)endPos);

    // create a regular array to hold the break positions and copy
    // the break positions from the stack to the array (in addition,
    // our starting position goes into this array as a break position).
    // This array becomes the cache of break positions used by next()
    // and previous(), so this is where we actually refresh the cache.
    // handleNext() can get here while an older cache is still allocated
    // (when the iterator has just stepped off its last entry), so release
    // that array before replacing the pointer.
    delete [] cachedBreakPositions;
    cachedBreakPositions = new int32_t[currentBreakPositions.size() + 1];
    numCachedBreakPositions = currentBreakPositions.size() + 1;
    cachedBreakPositions[0] = startPos;

    for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
        cachedBreakPositions[i + 1] = (int32_t)currentBreakPositions.elementAt(i);
    }
    positionInCache = 0;
}

201
icu4c/source/i18n/dbbi.h Normal file
View File

@ -0,0 +1,201 @@
/*
**********************************************************************
* Copyright (C) 1999 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
**********************************************************************
*/
#ifndef DBBI_H
#define DBBI_H
#include "rbbi.h"
/**
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
* to further subdivide ranges of text beyond what is possible using just the
* state-table-based algorithm. This is necessary, for example, to handle
* word and line breaking in Thai, which doesn't use spaces between words. The
* state-table-based algorithm used by RuleBasedBreakIterator is used to divide
* up text as far as possible, and then contiguous ranges of letters are
* repeatedly compared against a list of known words (i.e., the dictionary)
* to divide them up into words.
*
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
* but adds one more special substitution name: &lt;dictionary&gt;. This substitution
* name is used to identify characters in words in the dictionary. The idea is that
* if the iterator passes over a chunk of text that includes two or more characters
* in a row that are included in &lt;dictionary&gt;, it goes back through that range and
* derives additional break positions (if possible) using the dictionary.
*
* DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
* file. The filename is handed, together with the rules image, to the
* DictionaryBasedBreakIteratorTables object created by the constructor.
* NOTE(review): the Java original located the dictionary through a classpath
* search; how the filename is resolved in this port is decided by the tables
* class -- confirm there. The dictionary file is in a serialized binary format.
* We have a very primitive (and slow) BuildDictionaryFile utility for creating
* dictionary files, but aren't currently making it public. Contact us for help.
*/
class U_I18N_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {
private:
/**
* a temporary hiding place for the number of dictionary characters in the
* last range passed over by next()
*/
int32_t dictionaryCharCount;
/**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
* leaves this range of text. NULL when nothing is cached. The array is
* released by reset() and by the destructor.
*/
int32_t* cachedBreakPositions;
/**
* The number of elements in cachedBreakPositions
*/
int32_t numCachedBreakPositions;
/**
* if cachedBreakPositions is not null, this indicates which item in the
* cache the current iteration position refers to
*/
int32_t positionInCache;
/**
* Class ID. Only the address of this member is used (see getStaticClassID()).
*/
static char fgClassID;
public:
//=======================================================================
// constructors
//=======================================================================
/**
* Constructs an iterator from a rules image and a dictionary file name.
* @param tablesImage The break-rules image to build the tables from.
* @param dictionaryFilename The name of the dictionary file to use.
*/
DictionaryBasedBreakIterator(const void* tablesImage, char* dictionaryFilename);
//=======================================================================
// boilerplate
//=======================================================================
/**
* Destructor
*/
virtual ~DictionaryBasedBreakIterator();
/**
* Assignment operator. Sets this iterator to have the same behavior,
* and iterate over the same text, as the one passed in. Any break
* positions cached by this iterator are discarded.
* @param that The iterator to copy from.
* @return A reference to this iterator.
*/
DictionaryBasedBreakIterator& operator=(const DictionaryBasedBreakIterator& that);
/**
* Returns a newly-constructed DictionaryBasedBreakIterator with the same
* behavior, and iterating over the same text, as this one.
* @return The new iterator.
*/
virtual BreakIterator* clone() const;
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
* Advances the iterator backwards, to the last boundary preceding this one.
* @return The position of the last boundary position preceding this one.
*/
virtual int32_t previous();
/**
* Sets the iterator to refer to the first boundary position following
* the specified position.
* @param offset The position from which to begin searching for a break position.
* @return The position of the first break after "offset", or
* BreakIterator::DONE if "offset" is past the end of the text.
*/
virtual int32_t following(int32_t offset);
/**
* Sets the iterator to refer to the last boundary position before the
* specified position.
* @param offset The position to begin searching for a break from.
* @return The position of the last boundary before "offset", or
* BreakIterator::DONE if "offset" is past the end of the text.
*/
virtual int32_t preceding(int32_t offset);
/**
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
* This method is to implement a simple version of RTTI, since not all
* C++ compilers support genuine RTTI. Polymorphic operator==() and
* clone() methods call this method.
*
* @return The class ID for this object. All objects of a
* given class have the same class ID. Objects of
* other classes have different class IDs.
*/
virtual UClassID getDynamicClassID() const;
/**
* Returns the class ID for this class. This is useful only for
* comparing to a return value from getDynamicClassID(). For example:
*
* Base* polymorphic_pointer = createPolymorphicObject();
* if (polymorphic_pointer->getDynamicClassID() ==
* Derived::getStaticClassID()) ...
*
* @return The class ID for all objects of this class.
*/
static UClassID getStaticClassID();
protected:
//=======================================================================
// implementation
//=======================================================================
/**
* This method is the actual implementation of the next() method. While the
* iterator is inside a range whose break positions have been cached, it simply
* steps to the next cached position. Otherwise it calls the inherited
* handleNext() to find a tentative boundary and, if more than one dictionary
* character was passed over on the way, divides that range up with the
* dictionary and caches the resulting break positions.
* @return The position of the next boundary.
*/
virtual int32_t handleNext();
/**
* dumps the cache of break positions (usually in response to a change in
* position of some sort)
*/
virtual void reset();
private:
/**
* This is the function that actually implements the dictionary-based
* algorithm. Given the endpoints of a range of text, it uses the
* dictionary to determine the positions of any boundaries in this
* range. It stores all the boundary positions it discovers in
* cachedBreakPositions so that we only have to do this work once
* for each time we enter the range.
* @param startPos The beginning of the range to divide up.
* @param endPos The end of the range to divide up.
*/
void divideUpDictionaryRange(int32_t startPos, int32_t endPos);
/**
* Used by the tables object to increment the count of dictionary characters
* during iteration
*/
void bumpDictionaryCharCount();
// the tables object calls bumpDictionaryCharCount(), which is private
friend class DictionaryBasedBreakIteratorTables;
};
/**
 * Returns the class ID of this object's class.  This has to be the ID of
 * DictionaryBasedBreakIterator itself (not the one belonging to its base
 * class), so that it compares equal to
 * DictionaryBasedBreakIterator::getStaticClassID() and different from the
 * IDs of all other classes.
 */
inline UClassID DictionaryBasedBreakIterator::getDynamicClassID() const {
    return DictionaryBasedBreakIterator::getStaticClassID();
}
/**
 * Returns the class ID for this class: the address of the static
 * fgClassID member, converted to UClassID.
 */
inline UClassID DictionaryBasedBreakIterator::getStaticClassID()
{
    return (UClassID)&fgClassID;
}
/**
 * Adds one to the count of dictionary characters seen in the range that is
 * currently being passed over.
 */
inline void DictionaryBasedBreakIterator::bumpDictionaryCharCount()
{
    dictionaryCharCount += 1;
}
#endif

View File

@ -0,0 +1,64 @@
/**
 * Builder for DictionaryBasedBreakIterator.  Nearly everything is inherited
 * from RuleBasedBreakIterator.Builder; this subclass only adds the handling
 * of the "<dictionary>" substitution.
 */
protected class Builder extends RuleBasedBreakIterator.Builder {
    /**
     * The set of characters named by the "<dictionary>" substitution
     * (empty until such a substitution has been seen)
     */
    private CharSet dictionaryChars = new CharSet();

    /**
     * The expression text the "<dictionary>" substitution was defined with
     */
    private String dictionaryExpression = "";

    /**
     * Nothing to set up beyond the field initializers
     */
    public Builder() {
    }

    /**
     * Lets the inherited builder process the substitution first and then,
     * if the substitution is named "<dictionary>", remembers its expression
     * and the character set that expression parses to.  An expression that
     * starts with "(" is reported through error().
     */
    protected void handleSpecialSubstitution(String replace, String replaceWith,
            int startPos, String description) {
        super.handleSpecialSubstitution(replace, replaceWith, startPos, description);

        // every other substitution is fully handled by the superclass
        if (!replace.equals("<dictionary>")) {
            return;
        }

        if (replaceWith.charAt(0) == '(') {
            error("Dictionary group can't be enclosed in (", startPos, description);
        }
        dictionaryExpression = replaceWith;
        dictionaryChars = CharSet.parseString(replaceWith);
    }

    /**
     * Once the inherited builder has derived the real character categories,
     * fills in the categoryFlags array: the entry for a category is true
     * exactly when that category shares at least one character with
     * dictionaryChars.
     */
    protected void buildCharCategories(Vector tempRuleList) {
        super.buildCharCategories(tempRuleList);

        final int categoryCount = categories.size();
        categoryFlags = new boolean[categoryCount];
        for (int i = 0; i < categoryCount; i++) {
            CharSet category = (CharSet)categories.elementAt(i);
            CharSet shared = category.intersection(dictionaryChars);
            categoryFlags[i] = !shared.empty();
        }
    }

    // Called by RuleBasedBreakIterator.buildCharCategories(), which the method
    // above invokes through super.  Registering the dictionary expression here
    // yields a separate character category for the dictionary characters even
    // when RuleBasedBreakIterator isn't making a distinction
    protected void mungeExpressionList(Hashtable expressions) {
        expressions.put(dictionaryExpression, dictionaryChars);
    }
}

View File

@ -0,0 +1,64 @@
/**
 * Builder for DictionaryBasedBreakIterator.  Nearly all of the work is
 * inherited from RuleBasedBreakIterator.Builder; this subclass only adds
 * the handling needed for the "<dictionary>" substitution.
 */
protected class Builder extends RuleBasedBreakIterator.Builder {

    /**
     * The set of characters described by the "<dictionary>" substitution
     * (an empty CharSet until such a substitution has been processed).
     */
    private CharSet dictionaryChars = new CharSet();

    /**
     * The expression text the "<dictionary>" substitution was defined with
     * (the empty string until such a substitution has been processed).
     */
    private String dictionaryExpression = "";

    /**
     * Nothing to set up beyond the field initializers.
     */
    public Builder() {
    }

    /**
     * Lets the inherited builder process the substitution first.  Then, if
     * the substitution being defined is "<dictionary>", records its
     * expression and parses that expression into dictionaryChars.  An
     * expression starting with '(' is reported through error().
     */
    protected void handleSpecialSubstitution(String replace, String replaceWith,
                                             int startPos, String description) {
        super.handleSpecialSubstitution(replace, replaceWith, startPos, description);

        if (!replace.equals("<dictionary>")) {
            return;
        }

        if (replaceWith.charAt(0) == '(') {
            error("Dictionary group can't be enclosed in (", startPos, description);
        }
        dictionaryExpression = replaceWith;
        dictionaryChars = CharSet.parseString(replaceWith);
    }

    /**
     * Second half of the dictionary handling.  Once the inherited builder
     * has worked out the character categories, fill in categoryFlags with
     * one entry per category: true when the category shares at least one
     * character with dictionaryChars, false otherwise.
     */
    protected void buildCharCategories(Vector tempRuleList) {
        super.buildCharCategories(tempRuleList);

        categoryFlags = new boolean[categories.size()];
        int index = 0;
        while (index < categories.size()) {
            CharSet category = (CharSet)categories.elementAt(index);
            categoryFlags[index] = !category.intersection(dictionaryChars).empty();
            index++;
        }
    }

    // Invoked from RuleBasedBreakIterator's buildCharCategories(), i.e. during
    // the super call in the method above.  Registering the dictionary
    // expression here yields a separate character category for the
    // dictionary characters even when RuleBasedBreakIterator itself makes no
    // such distinction.
    protected void mungeExpressionList(Hashtable expressions) {
        expressions.put(dictionaryExpression, dictionaryChars);
    }
}

View File

@ -0,0 +1,59 @@
/*
**********************************************************************
* Copyright (C) 1999 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
**********************************************************************
*/
#include "dbbi_tbl.h"
#include "dbbi.h"
//=======================================================================
// constructor
//=======================================================================
/**
 * Builds the dictionary-based tables on top of a break-iterator memory image.
 *
 * @param tablesImage        Start of the memory image; it is also handed to
 *                           the RuleBasedBreakIteratorTables constructor.
 * @param dictionaryFilename Passed to the BreakDictionary constructor.
 */
DictionaryBasedBreakIteratorTables::DictionaryBasedBreakIteratorTables(
        const void* tablesImage,
        char* dictionaryFilename)
: RuleBasedBreakIteratorTables(tablesImage),
  dictionary(dictionaryFilename)
{
    // The image index is read as an array of 32-bit byte offsets, not as an
    // array of pointers.  Indexing it through "const void**" only works when
    // pointers happen to be 32 bits wide: with 64-bit pointers the element
    // stride is wrong and the pointer-to-int32_t cast truncates.
    // NOTE(review): assumes the image stores its offsets as int32_t, which
    // is what the previous (int32_t) casts implied -- confirm against the
    // tool that writes the image.
    const int8_t* imageBytes = (const int8_t*)tablesImage;
    const int32_t* tablesIdx = (const int32_t*)tablesImage;

    // We know the offset into the memory image where the DBBI data starts
    // is stored in element 8 of the index.  There should be a way for the
    // RBBI constructor to give us this, but there isn't a good one.
    const int8_t* dbbiImage = imageBytes + tablesIdx[8];

    // The DBBI section begins with its own index; element 0 is the offset,
    // relative to the start of the section, of the category-flags array.
    const int32_t* dbbiIdx = (const int32_t*)dbbiImage;
    categoryFlags = (int8_t*)(dbbiImage + dbbiIdx[0]);
}
//=======================================================================
// boilerplate
//=======================================================================
/**
 * Destructor.  The category-flags array is released only when the
 * ownTables flag is set.
 */
DictionaryBasedBreakIteratorTables::~DictionaryBasedBreakIteratorTables() {
    if (!ownTables) {
        return;
    }
    delete [] categoryFlags;
}
/**
 * Looks up the break category of c by delegating to the inherited
 * lookupCategory(), and returns that value unchanged.  The only reason this
 * override exists is bookkeeping: whenever the category is not IGNORE and is
 * flagged in categoryFlags, the dictionary-character count of the iterator
 * passed in as bi is bumped.
 */
int32_t
DictionaryBasedBreakIteratorTables::lookupCategory(UChar c,
                                                   BreakIterator* bi) const {
    const int32_t category = RuleBasedBreakIteratorTables::lookupCategory(c, bi);

    // ignore characters never count as dictionary characters
    if (category == RuleBasedBreakIterator::IGNORE) {
        return category;
    }

    if (categoryFlags[category]) {
        // bi is treated as a DictionaryBasedBreakIterator here
        ((DictionaryBasedBreakIterator*)bi)->bumpDictionaryCharCount();
    }
    return category;
}

View File

@ -0,0 +1,79 @@
/*
**********************************************************************
* Copyright (C) 1999 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
**********************************************************************
*/
#ifndef DBBI_TBL_H
#define DBBI_TBL_H
#include "rbbi_tbl.h"
#include "brkdict.h"
/**
* This subclass of RuleBasedBreakIteratorTables contains the additional
* static data that is used by DictionaryBasedBreakIterator. This comprises
* the dictionary itself and an array of flags that indicate which characters
* are in the dictionary.
*
* @author Richard Gillam
*/
class DictionaryBasedBreakIteratorTables : public RuleBasedBreakIteratorTables {
// Everything down to "protected:" is private.  That includes the
// constructor and the destructor; the friend declaration at the end of the
// class is what gives DictionaryBasedBreakIterator access to them.
private:
/**
* a list of known words that is used to divide up contiguous ranges of letters,
* stored in a compressed, indexed, format that offers fast access
*/
BreakDictionary dictionary;
/**
* a list of flags indicating which character categories are contained in
* the dictionary file (this is used to determine which ranges of characters
* to apply the dictionary to).  Indexed by character category number.
*/
int8_t* categoryFlags;
//=======================================================================
// constructor
//=======================================================================
/**
* Constructs the tables from a memory image.
* @param tablesImage Pointer to the start of the memory image; it is also
* passed to the RuleBasedBreakIteratorTables constructor.
* @param dictionaryFilename Passed to the BreakDictionary constructor to
* initialize "dictionary".
*/
DictionaryBasedBreakIteratorTables(const void* tablesImage,
char* dictionaryFilename);
/**
* The copy constructor is declared private and not implemented.
* THIS CLASS MAY NOT BE COPIED.
*/
DictionaryBasedBreakIteratorTables(const DictionaryBasedBreakIteratorTables& that);
//=======================================================================
// boilerplate
//=======================================================================
/**
* Destructor.  Deletes categoryFlags when the ownTables flag is set.
*/
virtual ~DictionaryBasedBreakIteratorTables();
/**
* The assignment operator is declared private and not implemented.
* THIS CLASS MAY NOT BE COPIED.
*/
DictionaryBasedBreakIteratorTables& operator=(
const DictionaryBasedBreakIteratorTables& that);
protected:
/**
* Looks up a character's category (i.e., its category for breaking purposes,
* not its Unicode category).  This override returns the inherited result and
* additionally bumps the dictionary-character count of the iterator passed
* in as "bi" when that category is flagged in categoryFlags.
*/
virtual int32_t lookupCategory(UChar c, BreakIterator* bi) const;
// Grants DictionaryBasedBreakIterator access to the private members above.
friend class DictionaryBasedBreakIterator;
};
#endif

View File

@ -69,7 +69,7 @@ LINK32=link.exe
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /YX /FD /GZ /c
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /FR /YX /FD /GZ /c
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "_WINDOWS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "UDATA_MAP" /FR /YX /FD /GZ /c
# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
# ADD BASE RSC /l 0x409 /d "_DEBUG"
@ -92,6 +92,10 @@ LINK32=link.exe
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
# Begin Source File
SOURCE=.\brkdict.cpp
# End Source File
# Begin Source File
SOURCE=.\brkiter.cpp
# End Source File
# Begin Source File
@ -100,10 +104,6 @@ SOURCE=.\calendar.cpp
# End Source File
# Begin Source File
SOURCE=.\chbkdat.cpp
# End Source File
# Begin Source File
SOURCE=.\choicfmt.cpp
# End Source File
# Begin Source File
@ -132,6 +132,14 @@ SOURCE=.\datefmt.cpp
# End Source File
# Begin Source File
SOURCE=.\dbbi.cpp
# End Source File
# Begin Source File
SOURCE=.\dbbi_tbl.cpp
# End Source File
# Begin Source File
SOURCE=.\dcfmtsym.cpp
# End Source File
# Begin Source File
@ -161,10 +169,6 @@ SOURCE=.\hextouni.cpp
# End Source File
# Begin Source File
SOURCE=.\lnbkdat.cpp
# End Source File
# Begin Source File
SOURCE=.\mergecol.cpp
# End Source File
# Begin Source File
@ -181,6 +185,14 @@ SOURCE=.\ptnentry.cpp
# End Source File
# Begin Source File
SOURCE=.\rbbi.cpp
# End Source File
# Begin Source File
SOURCE=.\rbbi_tbl.cpp
# End Source File
# Begin Source File
SOURCE=.\rbt.cpp
# End Source File
# Begin Source File
@ -205,18 +217,10 @@ SOURCE=.\simpletz.cpp
# End Source File
# Begin Source File
SOURCE=.\simtxbd.cpp
# End Source File
# Begin Source File
SOURCE=.\smpdtfmt.cpp
# End Source File
# Begin Source File
SOURCE=.\snbkdat.cpp
# End Source File
# Begin Source File
SOURCE=.\sortkey.cpp
# End Source File
# Begin Source File
@ -241,10 +245,6 @@ SOURCE=.\translit.cpp
# End Source File
# Begin Source File
SOURCE=.\txtbdat.cpp
# End Source File
# Begin Source File
SOURCE=.\txtbdry.cpp
# End Source File
# Begin Source File
@ -269,10 +269,6 @@ SOURCE=.\umsg.cpp
# End Source File
# Begin Source File
SOURCE=.\unicdcm.cpp
# End Source File
# Begin Source File
SOURCE=.\unifltlg.cpp
# End Source File
# Begin Source File
@ -291,20 +287,16 @@ SOURCE=.\unitohex.cpp
SOURCE=.\unum.cpp
# End Source File
# Begin Source File
SOURCE=.\wdbkdat.cpp
# End Source File
# Begin Source File
SOURCE=.\wdbktbl.cpp
# End Source File
# End Group
# Begin Group "Header Files"
# PROP Default_Filter "h;hpp;hxx;hm;inl"
# Begin Source File
SOURCE=.\brkdict.h
# End Source File
# Begin Source File
SOURCE=.\unicode\brkiter.h
!IF "$(CFG)" == "i18n - Win32 Release"
@ -502,6 +494,14 @@ InputPath=.\unicode\datefmt.h
# End Source File
# Begin Source File
SOURCE=.\dbbi.h
# End Source File
# Begin Source File
SOURCE=.\dbbi_tbl.h
# End Source File
# Begin Source File
SOURCE=.\unicode\dcfmtsym.h
!IF "$(CFG)" == "i18n - Win32 Release"
@ -811,7 +811,7 @@ SOURCE=.\rbbi.h
# End Source File
# Begin Source File
SOURCE=.\rbbi_bld.h
SOURCE=.\rbbi_tbl.h
# End Source File
# Begin Source File
@ -885,10 +885,6 @@ InputPath=.\unicode\simpletz.h
# End Source File
# Begin Source File
SOURCE=.\simtxbd.h
# End Source File
# Begin Source File
SOURCE=.\unicode\smpdtfmt.h
!IF "$(CFG)" == "i18n - Win32 Release"
@ -943,10 +939,6 @@ InputPath=.\unicode\sortkey.h
# End Source File
# Begin Source File
SOURCE=.\spclmap.h
# End Source File
# Begin Source File
SOURCE=.\tables.h
# End Source File
# Begin Source File
@ -1036,10 +1028,6 @@ InputPath=.\unicode\translit.h
# End Source File
# Begin Source File
SOURCE=.\txtbdat.h
# End Source File
# Begin Source File
SOURCE=.\txtbdry.h
# End Source File
# Begin Source File
@ -1179,10 +1167,6 @@ InputPath=.\unicode\umsg.h
# End Source File
# Begin Source File
SOURCE=.\unicdcm.h
# End Source File
# Begin Source File
SOURCE=.\unicode\unifilt.h
!IF "$(CFG)" == "i18n - Win32 Release"
@ -1319,10 +1303,6 @@ InputPath=.\unicode\unum.h
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\wdbktbl.h
# End Source File
# End Group
# Begin Group "Resource Files"

View File

@ -4,98 +4,237 @@
* and others. All rights reserved. *
**********************************************************************
* Date Name Description
* 10/22/99 alan Creation.
* 11/11/99 rgillam Complete port from Java.
**********************************************************************
*/
#include "rbbi.h"
#include "rbbi_bld.h"
#include "schriter.h"
/**
* A token used as a character-category value to identify ignore characters
*/
int8_t RuleBasedBreakIterator::IGNORE = -1;
int8_t
RuleBasedBreakIterator::IGNORE = -1;
/**
* The state number of the starting state
*/
int16_t RuleBasedBreakIterator::START_STATE = 1;
int16_t
RuleBasedBreakIterator::START_STATE = 1;
/**
* The state-transition value indicating "stop"
*/
int16_t RuleBasedBreakIterator::STOP_STATE = 0;
int16_t
RuleBasedBreakIterator::STOP_STATE = 0;
/**
* Class ID. (value is irrelevant; address is important)
*/
char
RuleBasedBreakIterator::fgClassID = 0;
//=======================================================================
// constructors
//=======================================================================
/**
* Constructs a RuleBasedBreakIterator according to the description
* provided. If the description is malformed, throws an
* IllegalArgumentException. Normally, instead of constructing a
* RuleBasedBreakIterator directory, you'll use the factory methods
* on BreakIterator to create one indirectly from a description
* in the framework's resource files. You'd use this when you want
* special behavior not provided by the built-in iterators.
* Constructs a RuleBasedBreakIterator that uses the already-created
* tables object that is passed in as a parameter.
*/
RuleBasedBreakIterator::RuleBasedBreakIterator(const UnicodeString& description) {
this.description = description;
// the actual work is done by the Builder class
Builder builder;
builder.buildBreakIterator(*this, description);
RuleBasedBreakIterator::RuleBasedBreakIterator(RuleBasedBreakIteratorTables* tables)
: text(NULL),
  tables(tables)
{
    // The iterator starts out with no text to analyze; text is supplied
    // later through adoptText() or setText().
    // NOTE(review): unlike the image-based constructor, this one does not
    // call addReference() on the tables -- presumably the caller accounts
    // for that reference; confirm.
}
// Builds a BreakIterator through the udata interface: the internal tables
// live in a memory-mapped file, and "image" points to the beginning of that
// file.  When image is NULL no tables object is created and the iterator is
// left without tables.
RuleBasedBreakIterator::RuleBasedBreakIterator(const void* image)
: text(NULL),
  tables(image != NULL ? new RuleBasedBreakIteratorTables(image) : NULL)
{
    // register this iterator as a user of the freshly created tables
    if (tables != NULL) {
        tables->addReference();
    }
}
/**
 * Copy constructor.  Will produce a break iterator with the same behavior,
 * and which iterates over the same text, as the one passed in.  The tables
 * object is shared with "that" (its reference count is bumped); the text
 * iterator is cloned.  Either may be NULL in "that", in which case it is
 * NULL in the copy as well.
 */
RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& that)
: tables(that.tables),
  text(that.text != NULL ? that.text->clone() : NULL)
{
    // an iterator created from a NULL image has no tables to reference
    if (tables != NULL)
        tables->addReference();
}
//=======================================================================
// boilerplate
//=======================================================================
/**
* Clones this iterator.
* @return A newly-constructed RuleBasedBreakIterator with the same
* behavior as this one.
* Destructor
*/
RuleBasedBreakIterator* RuleBasedBreakIterator::clone(void) const {
RuleBasedBreakIterator::~RuleBasedBreakIterator() {
    // the iterator owns its text iterator (deleting NULL is harmless)
    delete text;

    // give up our reference to the shared tables; an iterator built from a
    // NULL image has no tables at all, so guard against that
    if (tables != NULL)
        tables->removeReference();
}
/**
 * Assignment operator.  Sets this iterator to have the same behavior,
 * and iterate over the same text, as the one passed in.  Self-assignment
 * is a no-op.  The text iterator of "that" is cloned and its tables object
 * is shared; either may be NULL.
 */
RuleBasedBreakIterator&
RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    // without this check we would delete our own text and then clone it
    if (this == &that)
        return *this;

    // clone the new text before letting go of the old one
    CharacterIterator* newText = (that.text != NULL) ? that.text->clone() : NULL;
    delete text;
    text = newText;

    // take the new reference before dropping the old one, so that tables
    // shared by both iterators never see their count dip in between
    if (that.tables != NULL)
        that.tables->addReference();
    if (tables != NULL)
        tables->removeReference();
    tables = that.tables;

    return *this;
}
/**
 * Creates a copy of this iterator with the copy constructor and returns it.
 * The copy is allocated with new.
 */
BreakIterator*
RuleBasedBreakIterator::clone(void) const {
    RuleBasedBreakIterator* copy = new RuleBasedBreakIterator(*this);
    return copy;
}
/**
* Returns true if both BreakIterators are of the same class, have the same
* rules, and iterate over the same text.
* Equality operator. Returns TRUE if both BreakIterators are of the
* same class, have the same behavior, and iterate over the same text.
*/
bool_t RuleBasedBreakIterator::operator==(const RuleBasedBreakIterator& that) {
return description.equals(((RuleBasedBreakIterator)that).description)
&& text.equals(((RuleBasedBreakIterator)that).text);
bool_t
RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
    // iterators of different classes are never equal
    if (that.getDynamicClassID() != getDynamicClassID())
        return FALSE;

    const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&)that;

    // The texts match when they are the same object (this covers the case
    // where both are NULL) or when both exist and compare equal.  A NULL
    // on only one side means "not equal" and must not be dereferenced.
    if (that2.text != text) {
        if (that2.text == NULL || text == NULL || !(*that2.text == *text))
            return FALSE;
    }

    // same rule for the tables
    if (that2.tables != tables) {
        if (that2.tables == NULL || tables == NULL || !(*that2.tables == *tables))
            return FALSE;
    }

    return TRUE;
}
/**
 * Compute a hash code for this BreakIterator.  The hash code is that of
 * the tables object; an iterator without tables hashes to 0.
 * @return A hash code
 */
int32_t
RuleBasedBreakIterator::hashCode(void) const {
    // tables is NULL for an iterator constructed from a NULL image
    return (tables != NULL) ? tables->hashCode() : 0;
}
/**
* Returns the description used to create this iterator
*/
UnicodeString RuleBasedBreakIterator::toString(void) {
return description;
}
/**
* Compute a hashcode for this BreakIterator
* @return A hash code
*/
int32_t RuleBasedBreakIterator::hashCode(void) {
return description.hashCode();
const UnicodeString&
RuleBasedBreakIterator::getRules() const {
    // Simply forwards to the tables object.
    // NOTE(review): tables is dereferenced unconditionally, although other
    // methods in this file allow it to be NULL -- confirm callers never
    // reach this on a table-less iterator.
    const UnicodeString& rules = tables->getRules();
    return rules;
}
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
 * Return a CharacterIterator over the text being analyzed.  What comes
 * back is the very CharacterIterator this object uses internally, not a
 * copy, so changing its state can have undefined consequences.  Clone it
 * first if you need to change it.
 * @return An iterator over the text being analyzed.
 */
const CharacterIterator&
RuleBasedBreakIterator::getText() const {
    // This function is const but may have to create the text iterator on
    // demand, hence the cast.
    RuleBasedBreakIterator* self = const_cast<RuleBasedBreakIterator*>(this);

    if (self->text != NULL)
        return *self->text;

    // The iterator starts out pointing to no text at all; when asked for
    // the text in that state, fudge an iterator over an empty string.
    self->text = new StringCharacterIterator("");
    return *self->text;
}
/**
 * Returns a newly-created CharacterIterator that the caller takes
 * ownership of: a clone of the current text iterator, or an iterator over
 * an empty string when no text has been set.
 * THIS FUNCTION SHOULD NOT BE HERE.  IT'S HERE BECAUSE BreakIterator DEFINES
 * IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT.  IT SHOULD BE REMOVED
 * FROM *BOTH* CLASSES.
 */
CharacterIterator*
RuleBasedBreakIterator::createText() const {
    if (text != NULL) {
        return text->clone();
    }
    return new StringCharacterIterator("");
}
/**
 * Set the iterator to analyze a new piece of text.  This function resets
 * the current iteration position to the beginning of the text.
 * @param newText An iterator over the text to analyze.  This object takes
 * ownership of it and deletes the iterator it held before.  Passing NULL
 * leaves the break iterator without text; passing the iterator that is
 * already adopted just rewinds it.
 */
void
RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    reset();

    // re-adopting our own iterator must not delete it out from under us
    if (newText != text) {
        delete text;
        text = newText;
    }

    // nothing to rewind when the caller handed us NULL
    if (text != NULL)
        text->first();
}
/**
 * Set the iterator to analyze a new piece of text.  This function resets
 * the current iteration position to the beginning of the text.
 * @param newText The string to analyze.
 */
void
RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    reset();

    // No iterator yet, or one that is not a StringCharacterIterator:
    // throw it away and build a new one over the string.
    if (text == NULL || text->getDynamicClassID()
                        != StringCharacterIterator::getStaticClassID()) {
        delete text;
        text = new StringCharacterIterator(newText);
        text->first();
        return;
    }

    // Otherwise the existing StringCharacterIterator can simply be pointed
    // at the new string.
    ((StringCharacterIterator*)text)->setText(newText);
}
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText The text to analyze.
* THIS FUNCTION SHOULD NOT BE HERE. IT'S HERE BECAUSE BreakIterator DEFINES
* IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT. IT SHOULD BE REMOVED
* FROM *BOTH* CLASSES.
*/
void
RuleBasedBreakIterator::setText(const UnicodeString* newText) {
setText(*newText);
}
/**
* Sets the current iteration position to the beginning of the text.
* (i.e., the CharacterIterator's starting offset).
* @return The offset of the beginning of the text.
*/
int32_t RuleBasedBreakIterator::first(void) {
CharacterIterator t = getText();
reset();
if (text == NULL)
return BreakIterator::DONE;
t.first();
return t.getIndex();
text->first();
return text->getIndex();
}
/**
@ -104,12 +243,16 @@ int32_t RuleBasedBreakIterator::first(void) {
* @return The text's past-the-end offset.
*/
int32_t RuleBasedBreakIterator::last(void) {
CharacterIterator t = getText();
reset();
if (text == NULL)
return BreakIterator::DONE;
// I'm not sure why, but t.last() returns the offset of the last character,
// rather than the past-the-end offset
t.setIndex(t.getEndIndex());
return t.getIndex();
int32_t pos = text->endIndex();
text->setIndex(pos);
return pos;
}
/**
@ -148,9 +291,8 @@ int32_t RuleBasedBreakIterator::next(void) {
*/
int32_t RuleBasedBreakIterator::previous(void) {
// if we're already sitting at the beginning of the text, return DONE
CharacterIterator text = getText();
if (current() == text.getBeginIndex())
return BreakIterator.DONE;
if (text == NULL || current() == text->startIndex())
return BreakIterator::DONE;
// set things up. handlePrevious() will back us up to some valid
// break position before the current position (we back our internal
@ -158,21 +300,21 @@ int32_t RuleBasedBreakIterator::previous(void) {
// the current position), but not necessarily the last one before
// where we started
int32_t start = current();
text.previous();
text->previous();
int32_t lastResult = handlePrevious();
int32_t result = lastResult;
// iterate forward from the known break position until we pass our
// starting point. The last break position before the starting
// point is our return value
while (result != BreakIterator.DONE && result < start) {
while (result != BreakIterator::DONE && result < start) {
lastResult = result;
result = handleNext();
}
// set the current iteration position to be the last break position
// before where we started, and then return that value
text.setIndex(lastResult);
text->setIndex(lastResult);
return lastResult;
}
@ -184,16 +326,20 @@ int32_t RuleBasedBreakIterator::previous(void) {
*/
int32_t RuleBasedBreakIterator::following(int32_t offset) {
// if the offset passed in is already past the end of the text,
// just return DONE
CharacterIterator text = getText();
if (offset == text.getEndIndex())
return BreakIterator.DONE;
// just return DONE; if it's before the beginning, return the
// text's starting offset
if (text == NULL || offset >= text->endIndex()) {
return BreakIterator::DONE;
}
else if (offset < text->startIndex()) {
return text->startIndex();
}
// otherwise, set our internal iteration position (temporarily)
// to the position passed in. If this is the _beginning_ position,
// then we can just use next() to get our return value
text.setIndex(offset);
if (offset == text.getBeginIndex())
text->setIndex(offset);
if (offset == text->startIndex())
return handleNext();
// otherwise, we have to sync up first. Use handlePrevious() to back
@ -204,7 +350,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
// from here until we've passed the starting position. The position
// we stop on will be the first break position after the specified one.
int32_t result = handlePrevious();
while (result != BreakIterator.DONE && result <= offset)
while (result != BreakIterator::DONE && result <= offset)
result = handleNext();
return result;
}
@ -216,11 +362,20 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
* @return The position of the last boundary before the starting position.
*/
int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
if (text == NULL || offset > text->endIndex()) {
return BreakIterator::DONE;
}
else if (offset < text->startIndex()) {
return text->startIndex();
}
// if we start by updating the current iteration position to the
// position specified by the caller, we can just use previous()
// to carry out this operation
CharacterIterator text = getText();
text.setIndex(offset);
text->setIndex(offset);
return previous();
}
@ -232,10 +387,15 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
* @return True if "offset" is a boundary position.
*/
bool_t RuleBasedBreakIterator::isBoundary(int32_t offset) {
// 0 is always a boundary position (I suspect this code is wrong; I think
// we're supposed to be comparing "offset" against text.getBeginIndex(). )
if (offset == 0)
// the beginning index of the iterator is always a boundary position by definition
if (text == NULL || offset == text->startIndex()) {
return TRUE;
}
// out-of-range indexes are never boundary positions
else if (offset < text->startIndex() || offset > text->endIndex()) {
return FALSE;
}
// otherwise, we can use following() on the position before the specified
// one and return true if the position we get back is the one the user
@ -248,38 +408,14 @@ bool_t RuleBasedBreakIterator::isBoundary(int32_t offset) {
* Returns the current iteration position.
* @return The current iteration position.
*/
int32_t RuleBasedBreakIterator::current(void) {
return getText().getIndex();
int32_t RuleBasedBreakIterator::current(void) const {
return (text != NULL) ? text->getIndex() : BreakIterator::DONE;
}
/**
* Return a CharacterIterator over the text being analyzed. This version
* of this method returns the actual CharacterIterator we're using internally.
* Changing the state of this iterator can have undefined consequences. If
* you need to change it, clone it first.
* @return An iterator over the text being analyzed.
*/
CharacterIterator RuleBasedBreakIterator::getText(void) {
// The iterator is initialized pointing to no text at all, so if this
// function is called while we're in that state, we have to fudge an
// an iterator to return.
if (text == 0)
text = new StringCharacterIterator("");
return text;
}
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText An iterator over the text to analyze.
*/
void RuleBasedBreakIterator::setText(CharacterIterator newText) {
text = newText;
text.first();
}
//=======================================================================
// implementation
//=======================================================================
/**
* This method is the actual implementation of the next() method. All iteration
* vectors through here. This method initializes the state machine to state 1
@ -289,38 +425,82 @@ void RuleBasedBreakIterator::setText(CharacterIterator newText) {
*/
int32_t RuleBasedBreakIterator::handleNext(void) {
// if we're already at the end of the text, return DONE.
CharacterIterator text = getText();
if (text.getIndex() == text.getEndIndex())
return BreakIterator.DONE;
if (text == NULL || tables == NULL || text->getIndex() == text->endIndex())
return BreakIterator::DONE;
// no matter what, we always advance at least one character forward
int32_t result = text.getIndex() + 1;
int32_t result = text->getIndex() + 1;
int32_t lookaheadResult = 0;
// begin in state 1
int32_t state = START_STATE;
int32_t category;
UChar c = text.current();
UChar c = text->current();
UChar lastC = c;
int32_t lastCPos = 0;
// loop until we reach the end of the text or transition to state 0
while (c != CharacterIterator.DONE && state != STOP_STATE) {
while (c != CharacterIterator::DONE && state != STOP_STATE) {
// look up the current character's character category (which tells us
// which column in the state table to look at)
category = lookupCategory(c);
category = tables->lookupCategory(c, this);
// if the character isn't an ignore character, look up a state
// transition in the state table
if (category != IGNORE) {
state = lookupState(state, category);
state = tables->lookupState(state, category);
}
// if the state we've just transitioned to is an accepting state,
// if the state we've just transitioned to is a lookahead state,
// (but not also an end state), save its position. If it's
// both a lookahead state and an end state, update the break position
// to the last saved lookup-state position
if (tables->isLookaheadState(state)) {
if (tables->isEndState(state)) {
result = lookaheadResult;
}
else {
lookaheadResult = text->getIndex() + 1;
}
}
// otherwise, if the state we've just transitioned to is an accepting state,
// update our return value to be the current iteration position
if (endStates[state])
result = text.getIndex() + 1;
c = text.next();
else {
if (tables->isEndState(state)) {
result = text->getIndex() + 1;
}
}
// keep track of the last "real" character we saw. If this character isn't an
// ignore character, take note of it and its position in the text
if (category != IGNORE && state != STOP_STATE) {
lastC = c;
lastCPos = text->getIndex();
}
c = text->next();
}
text.setIndex(result);
// if we've run off the end of the text, and the very last character took us into
// a lookahead state, advance the break position to the lookahead position
// (the theory here is that if there are no characters at all after the lookahead
// position, that always matches the lookahead criteria)
if (c == CharacterIterator::DONE && lookaheadResult == text->endIndex()) {
result = lookaheadResult;
}
// if the last character we saw before the one that took us into the stop state
// was a mandatory breaking character, then the break position goes right after it
// (this is here so that breaks come before, rather than after, a string of
// ignore characters when they follow a mandatory break character)
else if (lastC == 0x0a || lastC == 0x0d || lastC == 0x0c || lastC == 0x2028
|| lastC == 0x2029) {
result = lastCPos + 1;
}
text->setIndex(result);
return result;
}
@ -332,27 +512,29 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
*/
int32_t RuleBasedBreakIterator::handlePrevious(void) {
CharacterIterator text = getText();
if (text == NULL || tables == NULL)
return 0;
int32_t state = START_STATE;
int32_t category = 0;
int32_t lastCategory = 0;
UChar c = text.current();
UChar c = text->current();
// loop until we reach the beginning of the text or transition to state 0
while (c != CharacterIterator.DONE && state != STOP_STATE) {
while (c != CharacterIterator::DONE && state != STOP_STATE) {
// save the last character's category and look up the current
// character's category
lastCategory = category;
category = lookupCategory(c);
category = tables->lookupCategory(c, this);
// if the current character isn't an ignore character, look up a
// state transition in the backwards state table
if (category != IGNORE)
state = lookupBackwardState(state, category);
state = tables->lookupBackwardState(state, category);
// then advance one character backwards
c = text.previous();
c = text->previous();
}
// if we didn't march off the beginning of the text, we're either one or two
@ -360,35 +542,19 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
// previous() at the end of the loop above, and another because the character
// that takes us into the stop state will always be the character BEFORE
// the break position.)
if (c != CharacterIterator.DONE) {
if (c != CharacterIterator::DONE) {
if (lastCategory != IGNORE)
text.setIndex(text.getIndex() + 2);
text->setIndex(text->getIndex() + 2);
else
text.next();
text->next();
}
return text.getIndex();
return text->getIndex();
}
/**
* Looks up a character's category (i.e., its category for breaking purposes,
* not its Unicode category)
*/
int32_t RuleBasedBreakIterator::lookupCategory(UChar c) {
return UCharCategoryTable.elementAt(c);
}
/**
* Given a current state and a character category, looks up the
* next state to transition to in the state table.
*/
int32_t RuleBasedBreakIterator::lookupState(int32_t state, int32_t category) {
return stateTable[state * numCategories + category];
}
/**
* Given a current state and a character category, looks up the
* next state to transition to in the backwards state table.
*/
int32_t RuleBasedBreakIterator::lookupBackwardState(int32_t state, int32_t category) {
return backwardsStateTable[state * numCategories + category];
// Hook that adoptText(), setText(), first() and last() call before they
// reposition the iterator.
void
RuleBasedBreakIterator::reset()
{
// Base-class version of this function is a no-op.
// Subclasses may override with their own reset behavior.
}

View File

@ -3,12 +3,18 @@
**********************************************************************
* Date Name Description
* 10/22/99 alan Creation.
* 11/11/99 rgillam Complete port from Java.
**********************************************************************
*/
#ifndef RBBI_H
#define RBBI_H
#include "utypes.h"
#include "rbbi_tbl.h"
#include "unicode/brkiter.h"
#include "filestrm.h"
/**
* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
*
@ -173,17 +179,15 @@
*
* @author Richard Gillam
*/
class RuleBasedBreakIterator {
protected:
class U_I18N_API RuleBasedBreakIterator : public BreakIterator {
public:
/**
* A token used as a character-category value to identify ignore characters
*/
static int8_t IGNORE;
private:
/**
* The state number of the starting state
*/
@ -194,92 +198,130 @@ private:
*/
static int16_t STOP_STATE;
/**
* The textual description this iterator was created from
*/
UnicodeString description;
/**
* A table that indexes from character values to character category numbers
*/
CompactByteArray charCategoryTable;
/**
* The table of state transitions used for forward iteration
*/
int16_t* stateTable;
/**
* The table of state transitions used to sync up the iterator with the
* text in backwards and random-access iteration
*/
int16_t* backwardsStateTable;
/**
* A list of flags indicating which states in the state table are accepting
* ("end") states
*/
bool_t* endStates;
/**
* The number of character categories (and, thus, the number of columns in
* the state tables)
*/
int32_t numCategories;
protected:
/**
* The character iterator through which this BreakIterator accesses the text
*/
CharacterIterator text;
CharacterIterator* text;
/**
* The data tables this iterator uses to determine the break positions
*/
RuleBasedBreakIteratorTables* tables;
private:
/**
* Class ID
*/
static char fgClassID;
public:
//=======================================================================
// constructors
//=======================================================================
public:
// This constructor uses the udata interface to create a BreakIterator whose
// internal tables live in a memory-mapped file. "image" is a pointer to the
// beginning of that file.
RuleBasedBreakIterator(const void* image);
/**
* Constructs a RuleBasedBreakIterator according to the description
* provided. If the description is malformed, throws an
* IllegalArgumentException. Normally, instead of constructing a
* RuleBasedBreakIterator directory, you'll use the factory methods
* on BreakIterator to create one indirectly from a description
* in the framework's resource files. You'd use this when you want
* special behavior not provided by the built-in iterators.
* Copy constructor. Will produce a break iterator with the same behavior,
* and which iterates over the same text, as the one passed in.
*/
RuleBasedBreakIterator(UnicodeString description);
RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
//=======================================================================
// boilerplate
//=======================================================================
public:
/**
* Clones this iterator.
* @return A newly-constructed RuleBasedBreakIterator with the same
* behavior as this one.
* Destructor
*/
virtual Object clone(void);
virtual ~RuleBasedBreakIterator();
/**
* Returns true if both BreakIterators are of the same class, have the same
* rules, and iterate over the same text.
* Assignment operator. Sets this iterator to have the same behavior,
* and iterate over the same text, as the one passed in.
*/
virtual bool_t equals(Object that);
RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
/**
* Equality operator. Returns TRUE if both BreakIterators are of the
* same class, have the same behavior, and iterate over the same text.
*/
virtual bool_t operator==(const BreakIterator& that) const;
/**
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
* and vice versa.
*/
bool_t operator!=(const BreakIterator& that) const;
/**
* Returns a newly-constructed RuleBasedBreakIterator with the same
* behavior, and iterating over the same text, as this one.
*/
virtual BreakIterator* clone(void) const;
/**
* Compute a hash code for this BreakIterator
* @return A hash code
*/
virtual int32_t hashCode() const;
/**
* Returns the description used to create this iterator
*/
virtual UnicodeString toString(void);
virtual const UnicodeString& getRules() const;
/**
* Compute a hashcode for this BreakIterator
* @return A hash code
*/
virtual int32_t hashCode(void);
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
* Return a CharacterIterator over the text being analyzed. This version
* of this method returns the actual CharacterIterator we're using internally.
* Changing the state of this iterator can have undefined consequences. If
* you need to change it, clone it first.
* @return An iterator over the text being analyzed.
*/
virtual const CharacterIterator& getText() const;
/**
* Returns a newly-created CharacterIterator that the caller is to take
* ownership of.
* THIS FUNCTION SHOULD NOT BE HERE. IT'S HERE BECAUSE BreakIterator DEFINES
* IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT. IT SHOULD BE REMOVED
* FROM *BOTH* CLASSES.
*/
virtual CharacterIterator* createText() const;
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText An iterator over the text to analyze. The BreakIterator
* takes ownership of the character iterator. The caller MUST NOT delete it!
*/
virtual void adoptText(CharacterIterator* newText);
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText The text to analyze.
*/
virtual void setText(const UnicodeString& newText);
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText The text to analyze.
* THIS FUNCTION SHOULD NOT BE HERE. IT'S HERE BECAUSE BreakIterator DEFINES
* IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT. IT SHOULD BE REMOVED
* FROM *BOTH* CLASSES.
*/
virtual void setText(const UnicodeString* newText);
/**
* Sets the current iteration position to the beginning of the text.
* (i.e., the CharacterIterator's starting offset).
@ -346,28 +388,36 @@ public:
* Returns the current iteration position.
* @return The current iteration position.
*/
virtual int32_t current(void);
virtual int32_t current(void) const;
/**
* Return a CharacterIterator over the text being analyzed. This version
* of this method returns the actual CharacterIterator we're using internally.
* Changing the state of this iterator can have undefined consequences. If
* you need to change it, clone it first.
* @return An iterator over the text being analyzed.
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
* This method is to implement a simple version of RTTI, since not all
* C++ compilers support genuine RTTI. Polymorphic operator==() and
* clone() methods call this method.
*
* @return The class ID for this object. All objects of a
* given class have the same class ID. Objects of
* other classes have different class IDs.
*/
virtual CharacterIterator getText(void);
virtual UClassID getDynamicClassID() const;
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText An iterator over the text to analyze.
* Returns the class ID for this class. This is useful only for
* comparing to a return value from getDynamicClassID(). For example:
*
* Base* polymorphic_pointer = createPolymorphicObject();
* if (polymorphic_pointer->getDynamicClassID() ==
* Derived::getStaticClassID()) ...
*
* @return The class ID for all objects of this class.
*/
virtual void setText(CharacterIterator newText);
static UClassID getStaticClassID();
protected:
//=======================================================================
// implementation
//=======================================================================
protected:
/**
* This method is the actual implementation of the next() method. All iteration
* vectors through here. This method initializes the state machine to state 1
@ -387,22 +437,33 @@ protected:
virtual int32_t handlePrevious(void);
/**
* Looks up a character's category (i.e., its category for breaking purposes,
* not its Unicode category)
* Dumps caches and performs other actions associated with a complete change
* in text or iteration position. This function is a no-op in RuleBasedBreakIterator,
* but subclasses can and do override it.
*/
virtual int32_t lookupCategory(UChar c);
virtual void reset();
private:
/**
* Given a current state and a character category, looks up the
* next state to transition to in the state table.
* Constructs a RuleBasedBreakIterator that uses the already-created
* tables object that is passed in as a parameter.
*/
virtual int32_t lookupState(int32_t state, int32_t category);
RuleBasedBreakIterator(RuleBasedBreakIteratorTables* tables);
/**
* Given a current state and a character category, looks up the
* next state to transition to in the backwards state table.
*/
virtual int32_t lookupBackwardState(int32_t state, int32_t category);
friend class BreakIterator;
};
/**
 * Not-equal operator: the logical negation of operator==() for the same
 * argument.
 */
inline bool_t RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
    const bool_t isEqual = operator==(that);
    return !isEqual;
}
/**
 * Returns the class ID of this object. For this class that is the same
 * value getStaticClassID() returns.
 */
inline UClassID RuleBasedBreakIterator::getDynamicClassID() const {
    const UClassID classID = RuleBasedBreakIterator::getStaticClassID();
    return classID;
}
/**
 * Returns the class ID for RuleBasedBreakIterator: the address of the
 * class's static fgClassID member, converted to UClassID.
 */
inline UClassID RuleBasedBreakIterator::getStaticClassID() {
    char* const idAddress = &fgClassID;
    return (UClassID)(idAddress);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -2,8 +2,7 @@
* Copyright © {1999}, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 10/22/99 alan Creation. This is an internal header; it
* shall not be exported.
* 12/15/99 rgillam Port from Java.
**********************************************************************
*/
@ -11,9 +10,12 @@
#define RBBI_BLD_H
#include "rbbi.h"
#include "rbbi_tbl.h"
#include "unicode/uniset.h"
#include "uvector.h"
class ExpressionList;
//=======================================================================
// RuleBasedBreakIterator.Builder
//=======================================================================
@ -42,18 +44,37 @@
class RuleBasedBreakIteratorBuilder {
protected:
/**
* The iterator we're constructing.
*/
RuleBasedBreakIterator& iterator;
/**
* The tables object for the iterator we're constructing.
*/
RuleBasedBreakIteratorTables* tables;
/**
* A temporary place to hold the rules as they're being processed.
*/
UVector tempRuleList;
/**
* A temporary holding place used for calculating the character categories.
* This object contains UnicodeSet objects.
*/
UVector categories;
/**
* The number of categories (and thus the number of columns in the finished state tables)
*/
int32_t numCategories;
/**
* A table used to map parts of regexp text to lists of character categories,
* rather than having to figure them out from scratch each time
*/
Hashtable expressions;
ExpressionList* expressions;
/**
* A temporary holding place for the list of ignore characters
@ -104,18 +125,56 @@ protected:
*/
bool_t clearLoopingStates;
/**
* A place where an error message can be stored if we get a parse error.
* The error message is never displayed anywhere, so this is useful pretty
* much only in conjunction with a debugger.
*/
UnicodeString errorMessage;
/**
* A bit mask used to indicate a bit in the table's flags column that marks a
* state as an accepting state.
*/
static const int32_t END_STATE_FLAG /*= 0x8000*/;
/**
* A bit mask used to indicate a bit in the table's flags column that marks a
* state as one the builder shouldn't loop to any looping states
*/
static const int32_t DONT_LOOP_FLAG /*= 0x4000*/;
/**
* A bit mask used to indicate a bit in the table's flags column that marks a
* state as a lookahead state.
*/
static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/;
/**
* A bit mask representing the union of the mask values listed above.
* Used for clearing or masking off the flag bits.
*/
static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
| DONT_LOOP_FLAG*/;
public:
/**
* No special construction is required for the Builder.
* The Builder class contains a reference to the iterator it's supposed to build.
*/
RuleBasedBreakIteratorBuilder();
RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild);
/**
* Destructor.
*/
~RuleBasedBreakIteratorBuilder();
/**
* This is the main function for setting up the BreakIterator's tables. It
* just UVectors different parts of the job off to other functions.
* just vectors different parts of the job off to other functions.
*/
virtual void buildBreakIterator(void);
virtual void buildBreakIterator(const UnicodeString& description,
UErrorCode& err);
private:
@ -127,7 +186,8 @@ private:
* <li>Perform variable-name substitutions (so that no one else sees variable names)
* </ul>
*/
virtual UVector buildRuleList(UnicodeString description);
virtual void buildRuleList(UnicodeString& description,
UErrorCode& err);
protected:
@ -138,8 +198,11 @@ protected:
* find-and-replace of the variable name with its text. (The variable text
* must be enclosed in either [] or () for this to work.)
*/
virtual UnicodeString processSubstitution(UnicodeString substitutionRule, UnicodeString description,
int32_t startPos);
virtual void processSubstitution(UnicodeString& description,
UTextOffset ruleStart,
UTextOffset ruleEnd,
UTextOffset startPos,
UErrorCode& err);
/**
* This function defines a protocol for handling substitution names that
@ -150,8 +213,17 @@ protected:
* that which is done by the normal substitution-processing code is done
* here.
*/
virtual void handleSpecialSubstitution(UnicodeString replace, UnicodeString replaceWith,
int32_t startPos, UnicodeString description);
virtual void handleSpecialSubstitution(const UnicodeString& replace,
const UnicodeString& replaceWith,
int32_t startPos,
const UnicodeString& description,
UErrorCode& err);
/**
* This function provides a hook for subclasses to mess with the character
* category table.
*/
virtual void mungeExpressionList();
/**
* This function builds the character category table. On entry,
@ -161,7 +233,7 @@ protected:
* character category numbers everywhere a literal character or a [] expression
* originally occurred.
*/
virtual void buildCharCategories(UVector tempRuleList);
virtual void buildCharCategories(UErrorCode& err);
private:
@ -170,7 +242,7 @@ private:
* work is done in parseRule(), which is called once for each rule in the
* description.
*/
virtual void buildStateTable(UVector tempRuleList);
virtual void buildStateTable(UErrorCode& err);
/**
* This is where most of the work really happens. This routine parses a single
@ -179,7 +251,8 @@ private:
* throughout the whole operation, although some ugly postprocessing is needed
* to handle the *? token.
*/
virtual void parseRule(UnicodeString rule, bool_t forward);
virtual void parseRule(const UnicodeString& rule,
bool_t forward);
/**
* Update entries in the state table, and merge states when necessary to keep
@ -189,9 +262,9 @@ private:
* list of the columns that need updating.
* @param newValue Update the cells specfied above to contain this value
*/
virtual void updateStateTable(UVector rows,
UnicodeString pendingChars,
int16_t newValue);
virtual void updateStateTable(const UVector& rows,
const UnicodeString& pendingChars,
int16_t newValue);
/**
* The real work of making the state table deterministic happens here. This function
@ -213,9 +286,9 @@ private:
* (itself a copy of the decision point list from parseRule()). Newly-created
* states get added to the decision point list if their "parents" were on it.
*/
virtual void mergeStates(int32_t rowNum,
virtual void mergeStates(int32_t rowNum,
int16_t* newValues,
UVector rowsBeingUpdated);
const UVector& rowsBeingUpdated);
/**
* The merge list is a list of pairs of rows that have been merged somewhere in
@ -236,7 +309,8 @@ private:
* @param endStates The list of states to treat as end states (states that
* can exit the loop).
*/
virtual void setLoopingStates(UVector newLoopingStates, UVector endStates);
virtual void setLoopingStates(const UVector* newLoopingStates,
const UVector& endStates);
/**
* This removes "ending states" and states reachable from them from the
@ -264,7 +338,7 @@ private:
* table and any additional rules (identified by the ! on the front)
* supplied in the description
*/
virtual void buildBackwardsStateTable(UVector tempRuleList);
virtual void buildBackwardsStateTable(UErrorCode& err);
protected:
@ -276,7 +350,9 @@ protected:
* discovered
* @param context The string containing the error
*/
virtual void error(UnicodeString message, int32_t position, UnicodeString context);
virtual void setUpErrorMessage(const UnicodeString& message,
int32_t position,
const UnicodeString& context);
};
#endif

View File

@ -0,0 +1,128 @@
/*
**********************************************************************
* Copyright (C) 1999 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 11/11/99 rgillam Complete port from Java.
**********************************************************************
*/
#include "rbbi_tbl.h"
#include "cmemory.h"
//=======================================================================
// constructor
//=======================================================================
/**
 * Constructs a tables object on top of a memory image. The object does not
 * own the table storage (ownTables is FALSE); every table pointer ends up
 * pointing into the image itself.
 *
 * @param image Pointer to the beginning of the memory image. The image
 *              starts with an index of eight offsets, each measured in bytes
 *              from the start of the image.
 */
RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables(const void* image)
: refCount(0),
  ownTables(FALSE)
{
    // Read the index as 32-bit offsets instead of reinterpreting it as an
    // array of pointers and casting each pointer to int32_t: that cast
    // truncates (or is rejected by the compiler) wherever pointers are wider
    // than 32 bits.
    // NOTE(review): assumes the tool that writes the image stores the index
    // as 32-bit values -- confirm against the image generator.
    const int32_t* index = (const int32_t*)(image);
    const int8_t* base = (const int8_t*)(image);

    // the memory image begins with an index that gives the offsets into the
    // image for each of the fields in the BreakIteratorTables object--
    // use those to initialize the tables object (it will end up pointing
    // into the memory image for everything)
    numCategories = index[0];
    description = UnicodeString(TRUE, (UChar*)(base + index[1]), -1);
    charCategoryTable = ucmp8_openAdopt((uint16_t*)(base + index[2]),
                                        (int8_t*)(base + index[3]), 0);
    stateTable = (int16_t*)(base + index[4]);
    backwardsStateTable = (int16_t*)(base + index[5]);
    endStates = (int8_t*)(base + index[6]);
    lookaheadStates = (int8_t*)(base + index[7]);
}
/**
 * Constructs an empty tables object that owns its storage (ownTables is
 * TRUE). All table pointers start out null and numCategories starts at zero;
 * a RuleBasedBreakIteratorBuilder is expected to fill in the members.
 */
RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables()
: refCount(0),
  ownTables(TRUE),
  charCategoryTable(0),
  stateTable(0),
  backwardsStateTable(0),
  endStates(0),
  lookaheadStates(0),
  numCategories(0)
{
    // Every pointer member is explicitly nulled above so that the destructor
    // never sees an indeterminate pointer if the object is destroyed before
    // (or part-way through) being filled in. "description" is default-
    // constructed.
}
//=======================================================================
// boilerplate
//=======================================================================
/**
 * Destructor. What gets released depends on ownTables:
 * when it is FALSE only the CompactByteArray object itself is freed;
 * when it is TRUE the four table arrays are deleted and the category table
 * is closed with ucmp8_close().
 */
RuleBasedBreakIteratorTables::~RuleBasedBreakIteratorTables() {
    if (!ownTables) {
        // NOTE(review): presumably the arrays adopted by the category table
        // belong to the memory image in this case, which is why only the
        // wrapper is freed -- confirm against ucmp8_openAdopt().
        uprv_free(charCategoryTable);
        return;
    }

    delete [] stateTable;
    delete [] backwardsStateTable;
    delete [] endStates;
    delete [] lookaheadStates;
    ucmp8_close(charCategoryTable);
}
/**
 * Equality operator. Two tables objects compare equal when the textual
 * descriptions they were created from compare equal; no other member is
 * examined.
 */
bool_t
RuleBasedBreakIteratorTables::operator==(const RuleBasedBreakIteratorTables& that) const {
    const UnicodeString& mine = description;
    const UnicodeString& theirs = that.description;
    return mine == theirs;
}
/**
 * Computes a hash code for these tables. The value is the hash code of the
 * description string.
 * @return A hash code
 */
int32_t
RuleBasedBreakIteratorTables::hashCode() const {
    const int32_t hash = description.hashCode();
    return hash;
}
//=======================================================================
// implementation
//=======================================================================
/**
 * Maps a character to its category for breaking purposes (not its Unicode
 * category) by looking it up in charCategoryTable.
 * @param c The character to classify.
 * @param ignored Not used by this implementation.
 * @return The category number stored for c.
 */
int32_t
RuleBasedBreakIteratorTables::lookupCategory(UChar c, BreakIterator* ignored) const {
    const int32_t category = ucmp8_get(charCategoryTable, c);
    return category;
}
/**
 * Forward state-table lookup. The table is laid out as rows of
 * numCategories cells, one row per state.
 * @param state The current state (row).
 * @param category The character category (column).
 * @return The state to transition to.
 */
int32_t
RuleBasedBreakIteratorTables::lookupState(int32_t state, int32_t category) const {
    const int32_t cell = state * numCategories + category;
    return stateTable[cell];
}
/**
 * Backward state-table lookup. The table is laid out as rows of
 * numCategories cells, one row per state.
 * @param state The current state (row).
 * @param category The character category (column).
 * @return The state to transition to in the backwards state table.
 */
int32_t
RuleBasedBreakIteratorTables::lookupBackwardState(int32_t state, int32_t category) const {
    const int32_t cell = state * numCategories + category;
    return backwardsStateTable[cell];
}
/**
 * Reports whether the specified state is an accepting ("end") state by
 * returning the flag stored for it in endStates.
 */
bool_t
RuleBasedBreakIteratorTables::isEndState(int32_t state) const {
    const int8_t flag = endStates[state];
    return flag;
}
/**
 * Reports whether the specified state is a lookahead state by returning
 * the flag stored for it in lookaheadStates.
 */
bool_t
RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const {
    const int8_t flag = lookaheadStates[state];
    return flag;
}

View File

@ -0,0 +1,213 @@
/*
**********************************************************************
* Copyright (C) 1999 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 11/11/99 rgillam Complete port from Java.
**********************************************************************
*/
#ifndef RBBI_TBL_H
#define RBBI_TBL_H
#include "ucmp8.h"
#include "utypes.h"
#include "unistr.h"
#include "unicode/brkiter.h"
#include "filestrm.h"
/**
* This class contains the internal static tables that are used by the
* RuleBasedBreakIterator. Once created, these tables are immutable,
* so they can be shared among all break iterators using a particular
* set of rules. This class uses a reference-counting scheme to
* manage the sharing: clients call addReference() when they start using
* the object and removeReference() when they are done, and the object
* deletes itself when the count drops to zero.
*
* Nearly everything here is private or protected; access is granted to
* RuleBasedBreakIterator and DictionaryBasedBreakIterator through the
* friend declarations at the end of the class.
*
* @author Richard Gillam
*/
class RuleBasedBreakIteratorTables {
private:
/**
* The number of RuleBasedBreakIterators using this object.
* Starts at zero in both constructors.
*/
int16_t refCount;
protected:
/**
* Whether or not we own the storage for the tables (the tables may be
* stored in a memory-mapped file). The destructor only deletes the
* table arrays when this is TRUE.
*/
bool_t ownTables;
private:
/**
* The textual description that was used to create these tables.
* Equality and hashing of tables objects are based on this string.
*/
UnicodeString description;
/**
* A table that indexes from character values to character category numbers
*/
CompactByteArray* charCategoryTable;
/**
* The table of state transitions used for forward iteration.
* Indexed as [state * numCategories + category].
*/
int16_t* stateTable;
/**
* The table of state transitions used to sync up the iterator with the
* text in backwards and random-access iteration.
* Indexed as [state * numCategories + category].
*/
int16_t* backwardsStateTable;
/**
* A list of flags indicating which states in the state table are accepting
* ("end") states. Indexed by state number.
*/
int8_t* endStates;
/**
* A list of flags indicating which states in the state table are
* lookahead states (states which turn lookahead on and off).
* Indexed by state number.
*/
int8_t* lookaheadStates;
/**
* The number of character categories (and, thus, the number of columns in
* the state tables)
*/
int32_t numCategories;
//=======================================================================
// constructor
//=======================================================================
/**
* Constructors. The no-argument form creates a tables object that owns
* its storage and relies on a RuleBasedBreakIteratorBuilder to fill in
* the members. The "image" form creates a tables object whose tables
* point into the supplied memory image, which it does not own.
*/
protected:
RuleBasedBreakIteratorTables();
RuleBasedBreakIteratorTables(const void* image);
private:
/**
* The copy constructor is declared private and is a no-op.
* THIS CLASS MAY NOT BE COPIED.
*/
RuleBasedBreakIteratorTables(const RuleBasedBreakIteratorTables& that);
//=======================================================================
// boilerplate
//=======================================================================
protected:
/**
* Destructor. Protected: the object is destroyed through
* removeReference() rather than deleted directly by clients.
*/
virtual ~RuleBasedBreakIteratorTables();
private:
/**
* The assignment operator is declared private and is a no-op.
* THIS CLASS MAY NOT BE COPIED.
*/
RuleBasedBreakIteratorTables& operator=(const RuleBasedBreakIteratorTables& that);
/**
* Equality operator. Returns TRUE if both tables objects were created
* from equal descriptions.
*/
virtual bool_t operator==(const RuleBasedBreakIteratorTables& that) const;
/**
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
* and vice versa.
*/
bool_t operator!=(const RuleBasedBreakIteratorTables& that) const;
/**
* Compute a hash code for these tables
* @return A hash code
*/
virtual int32_t hashCode() const;
/**
* Returns the description used to create these tables
*/
const UnicodeString& getRules() const;
//=======================================================================
// reference counting
//=======================================================================
/**
* increments the reference count.
*/
void addReference();
/**
* decrements the reference count and deletes the object if it reaches zero
*/
void removeReference();
protected:
//=======================================================================
// implementation
//=======================================================================
/**
* Looks up a character's category (i.e., its category for breaking purposes,
* not its Unicode category)
* @param c The character to look up.
* @param bi A break iterator; the implementation in this class ignores it.
*/
virtual int32_t lookupCategory(UChar c, BreakIterator* bi) const;
/**
* Given a current state and a character category, looks up the
* next state to transition to in the state table.
*/
virtual int32_t lookupState(int32_t state, int32_t category) const;
/**
* Given a current state and a character category, looks up the
* next state to transition to in the backwards state table.
*/
virtual int32_t lookupBackwardState(int32_t state, int32_t category) const;
/**
* Returns true if the specified state is an accepting state.
*/
virtual bool_t isEndState(int32_t state) const;
/**
* Returns true if the specified state is a lookahead state.
*/
virtual bool_t isLookaheadState(int32_t state) const;
friend class RuleBasedBreakIterator;
friend class DictionaryBasedBreakIterator;
};
/**
 * Not-equal operator: the logical negation of operator==() for the same
 * argument.
 */
inline bool_t
RuleBasedBreakIteratorTables::operator!=(const RuleBasedBreakIteratorTables& that) const {
    const bool_t isEqual = operator==(that);
    return !isEqual;
}
/**
 * Returns a reference to the description string held by this object.
 */
inline const UnicodeString&
RuleBasedBreakIteratorTables::getRules() const {
    const UnicodeString& rules = description;
    return rules;
}
/**
 * Records one more user of this tables object by bumping refCount.
 */
inline void
RuleBasedBreakIteratorTables::addReference() {
    refCount++;
}
/**
 * Records that one user has finished with this tables object. When the
 * count drops to zero (or below) the object deletes itself, so the caller
 * must not touch it afterwards.
 */
inline void
RuleBasedBreakIteratorTables::removeReference() {
    --refCount;
    if (refCount > 0) {
        return;
    }
    delete this;
}
#endif

View File

@ -79,6 +79,31 @@ ubrk_close(UBreakIterator *bi)
delete (BreakIterator*) bi;
}
/**
 * Points an existing break iterator at a new piece of text.
 *
 * The text is always handed to the iterator through adoptText() wrapped in
 * a freshly-created UCharCharacterIterator. The iterator returned by
 * getText() is deliberately NOT modified in place: it is handed out as
 * const, so casting that away and changing it would bypass adoptText().
 *
 * @param bi         The break iterator to update. Must not be null.
 * @param text       The new text. Not copied; it must stay valid for as long
 *                   as the break iterator uses it. May be null only when
 *                   textLength is 0.
 * @param textLength The number of UChars in text, or -1 if text is
 *                   NUL-terminated.
 * @param status     Receives U_ILLEGAL_ARGUMENT_ERROR for bad arguments and
 *                   U_MEMORY_ALLOCATION_ERROR if the character iterator
 *                   cannot be allocated. Nothing is done if it is null or
 *                   already indicates a failure.
 */
U_CAPI void
ubrk_setText(UBreakIterator* bi,
             const UChar* text,
             int32_t textLength,
             UErrorCode* status)
{
    if (status == 0 || U_FAILURE(*status)) return;

    // Reject arguments that would otherwise be dereferenced blindly below
    // (a null iterator, or null text that we would have to measure or read).
    if (bi == 0 || textLength < -1 || (text == 0 && textLength != 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    int32_t textLen = (textLength == -1 ? u_strlen(text) : textLength);

    UCharCharacterIterator* iter = new UCharCharacterIterator(text, textLen);
    if (iter == 0) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // The break iterator takes ownership of iter.
    ((BreakIterator*)bi)->adoptText(iter);
}
U_CAPI UTextOffset
ubrk_current(const UBreakIterator *bi)
{

View File

@ -177,53 +177,73 @@ public:
* BreakIterator, as the argument. Text is considered the same if
* it contains the same characters, it need not be the same
* object, and styles are not considered.
* @stable
*/
virtual bool_t operator==(const BreakIterator&) const = 0;
/**
* Returns the complement of the result of operator==
* @stable
*/
bool_t operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }
/**
* Return a polymorphic copy of this object. This is an abstract
* method which subclasses implement.
* @stable
*/
virtual BreakIterator* clone(void) const = 0;
/**
* Return a polymorphic class ID for this object. Different subclasses
* will return distinct unequal values.
* @stable
*/
virtual UClassID getDynamicClassID(void) const = 0;
/**
* Return a CharacterIterator over the text being analyzed.
* @draft
*/
virtual const CharacterIterator& getText() const = 0;
/**
* Get the text for which this object is finding the boundaries.
* @draft
*/
virtual CharacterIterator* createText(void) const = 0;
/**
* Change the text over which this operates. The text boundary is
* reset to the start.
* [This function should be modified to take a const UnicodeString& argument.]
* @deprecated
*/
virtual void setText(const UnicodeString* it) = 0;
/**
* Change the text over which this operates. The text boundary is
* reset to the start.
* @stable
*/
virtual void adoptText(CharacterIterator* it) = 0;
/**
* DONE is returned by previous() and next() after all valid
* boundaries have been returned.
* @stable
*/
static const UTextOffset DONE;
/**
* Return the index of the first character in the text being scanned.
* @stable
*/
virtual UTextOffset first(void) = 0;
/**
* Return the index immediately BEYOND the last character in the text being scanned.
* @stable
*/
virtual UTextOffset last(void) = 0;
@ -231,6 +251,7 @@ public:
* Return the boundary preceding the current boundary.
* @return The character index of the previous text boundary or DONE if all
* boundaries have been returned.
* @stable
*/
virtual UTextOffset previous(void) = 0;
@ -238,6 +259,7 @@ public:
* Return the boundary following the current boundary.
* @return The character index of the next text boundary or DONE if all
* boundaries have been returned.
* @stable
*/
virtual UTextOffset next(void) = 0;
@ -245,6 +267,7 @@ public:
* Return character index of the text boundary that was most recently
* returned by next(), previous(), first(), or last()
* @return The boundary most recently returned.
* @stable
*/
virtual UTextOffset current(void) const = 0;
@ -254,6 +277,7 @@ public:
* the value BreakIterator.DONE
* @param offset the offset to begin scanning.
* @return The first boundary after the specified offset.
* @stable
*/
virtual UTextOffset following(UTextOffset offset) = 0;
@ -263,6 +287,7 @@ public:
* the value BreakIterator.DONE
* @param offset the offset to begin scanning.
* @return The first boundary before the specified offset.
* @stable
*/
virtual UTextOffset preceding(UTextOffset offset) = 0;
@ -270,6 +295,7 @@ public:
* Return true if the specified position is a boundary position.
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable
*/
virtual bool_t isBoundary(UTextOffset offset) = 0;
@ -280,6 +306,7 @@ public:
* and positive values move to later boundaries.
* @return The index of the nth boundary from the current position, or
* DONE if there are fewer than |n| boundaries in the specified direction.
* @stable
*/
virtual UTextOffset next(int32_t n) = 0;
@ -290,6 +317,7 @@ public:
* @param where the locale. If a specific WordBreak is not
* available for the specified locale, a default WordBreak is returned.
* @return A BreakIterator for word-breaks
* @stable
*/
static BreakIterator* createWordInstance(const Locale& where = Locale::getDefault());
@ -302,6 +330,7 @@ public:
* @param where the locale. If a specific LineBreak is not
* available for the specified locale, a default LineBreak is returned.
* @return A BreakIterator for line-breaks
* @stable
*/
static BreakIterator* createLineInstance(const Locale& where = Locale::getDefault());
@ -312,6 +341,7 @@ public:
* @param where the locale. If a specific character break is not
* available for the specified locale, a default character break is returned.
* @return A BreakIterator for character-breaks
* @stable
*/
static BreakIterator* createCharacterInstance(const Locale& where = Locale::getDefault());
@ -321,6 +351,7 @@ public:
* @param where the locale. If a specific SentenceBreak is not
* available for the specified locale, a default SentenceBreak is returned.
* @return A BreakIterator for sentence-breaks
* @stable
*/
static BreakIterator* createSentenceInstance(const Locale& where = Locale::getDefault());
@ -328,6 +359,7 @@ public:
* Get the set of Locales for which TextBoundaries are installed
* @param count the output parameter of number of elements in the locale list
* @return available locales
* @stable
*/
static const Locale* getAvailableLocales(int32_t& count);
@ -338,6 +370,7 @@ public:
* @param name the fill-in parameter of the return value
* Uses best match.
* @return user-displayable name
* @stable
*/
static UnicodeString& getDisplayName(const Locale& objectLocale,
const Locale& displayLocale,
@ -349,6 +382,7 @@ public:
* @param objectLocale must be from getMatchingLocales
* @param name the fill-in parameter of the return value
* @return user-displayable name
* @stable
*/
static UnicodeString& getDisplayName(const Locale& objectLocale,
UnicodeString& name);

View File

@ -178,6 +178,7 @@ typedef enum UBreakIteratorType UBreakIteratorType;
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified locale.
* @see ubrk_openRules
* @stable
*/
U_CAPI UBreakIterator*
ubrk_open(UBreakIteratorType type,
@ -196,6 +197,7 @@ ubrk_open(UBreakIteratorType type,
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @stable
*/
U_CAPI UBreakIterator*
ubrk_openRules(const UChar *rules,
@ -208,16 +210,28 @@ ubrk_openRules(const UChar *rules,
* Close a UBreakIterator.
* Once closed, a UBreakIterator may no longer be used.
* @param bi The break iterator to close.
* @stable
*/
U_CAPI void
ubrk_close(UBreakIterator *bi);
/**
* Sets an existing iterator to point to a new piece of text
* @param bi The break iterator to use.
* @param text The text the iterator should now scan.
* @param textLength The number of UChars in text.
* @param status A UErrorCode to receive any errors.
* @stable
*/
U_CAPI void
ubrk_setText(UBreakIterator* bi,
const UChar* text,
int32_t textLength,
UErrorCode* status);
/**
* Determine the most recently-returned text boundary.
*
* @param bi The break iterator to use.
* @return The character index most recently returned by \Ref{ubrk_next}, \Ref{ubrk_previous},
* \Ref{ubrk_first}, or \Ref{ubrk_last}.
* @stable
*/
U_CAPI UTextOffset
ubrk_current(const UBreakIterator *bi);
@ -229,6 +243,7 @@ ubrk_current(const UBreakIterator *bi);
* @return The character index of the next text boundary, or UBRK_DONE
* if all text boundaries have been returned.
* @see ubrk_previous
* @stable
*/
U_CAPI UTextOffset
ubrk_next(UBreakIterator *bi);
@ -240,6 +255,7 @@ ubrk_next(UBreakIterator *bi);
* @return The character index of the preceding text boundary, or UBRK_DONE
* if all text boundaries have been returned.
* @see ubrk_next
* @stable
*/
U_CAPI UTextOffset
ubrk_previous(UBreakIterator *bi);
@ -250,6 +266,7 @@ ubrk_previous(UBreakIterator *bi);
* @param bi The break iterator to use.
* @return The character index of the first character in the text being scanned.
* @see ubrk_last
* @stable
*/
U_CAPI UTextOffset
ubrk_first(UBreakIterator *bi);
@ -262,6 +279,7 @@ ubrk_first(UBreakIterator *bi);
* @return The character offset immediately <EM>beyond</EM> the last character in the
* text being scanned.
* @see ubrk_first
* @stable
*/
U_CAPI UTextOffset
ubrk_last(UBreakIterator *bi);
@ -273,6 +291,7 @@ ubrk_last(UBreakIterator *bi);
* @param offset The offset to begin scanning.
* @return The text boundary preceding offset, or UBRK_DONE.
* @see ubrk_following
* @stable
*/
U_CAPI UTextOffset
ubrk_preceding(UBreakIterator *bi,
@ -285,6 +304,7 @@ ubrk_preceding(UBreakIterator *bi,
* @param offset The offset to begin scanning.
* @return The text boundary following offset, or UBRK_DONE.
* @see ubrk_preceding
* @stable
*/
U_CAPI UTextOffset
ubrk_following(UBreakIterator *bi,
@ -297,6 +317,7 @@ ubrk_following(UBreakIterator *bi,
* @param index The index of the desired locale.
* @return A locale for which text breaking information is available, or 0 if none.
* @see ubrk_countAvailable
* @stable
*/
U_CAPI const char*
ubrk_getAvailable(int32_t index);
@ -307,6 +328,7 @@ ubrk_getAvailable(int32_t index);
* calls to \Ref{ubrk_getAvailable}.
* @return The number of locales for which text breaking information is available.
* @see ubrk_getAvailable
* @stable
*/
U_CAPI int32_t
ubrk_countAvailable(void);

View File

@ -1023,6 +1023,7 @@ AllocateTextBoundary();
/* in addition to the other invariants, a line-break iterator should make sure that:
it doesn't break around the non-breaking characters */
e = ubrk_open(UBRK_LINE, "en_US", work, u_strlen(work), &status);
errorCount=0;
status=U_ZERO_ERROR;
u_strcpy(noBreak, CharsToUCharArray("\\u00a0\\u2007\\u2011\\ufeff"));
@ -1035,9 +1036,8 @@ AllocateTextBoundary();
for (j = 0; j < u_strlen(noBreak); j++) {
work[1] = noBreak[j];
for (k = 0; k < u_strlen(s); k++) {
work[2] = s[k];
e = ubrk_open(UBRK_LINE, "en_US", work, u_strlen(work), &status);
work[2] = s[k];
ubrk_setText(e, work, u_strlen(work), &status);
if(U_FAILURE(status)){
log_err("FAIL: Error in opening the word break Iterator in testLineInvaiants:\n %s\n", myErrorName(status));
return;
@ -1530,7 +1530,8 @@ void doBreakInvariantTest(UBreakIteratorType type, UChar* testChars)
u_strcpy(breaks, CharsToUCharArray("\r\n\\u2029\\u2028"));
tb = ubrk_open(type, "en_US", work, u_strlen(work), &status);
for (i = 0; i < u_strlen(breaks); i++) {
work[1] = breaks[i];
for (j = 0; j < u_strlen(testChars); j++) {
@ -1545,7 +1546,7 @@ void doBreakInvariantTest(UBreakIteratorType type, UChar* testChars)
continue;
work[2] = testChars[k];
tb=ubrk_open(type, "en_US", work, u_strlen(work), &status);
ubrk_setText(tb, work, u_strlen(work), &status);
if(U_FAILURE(status)){
log_err("ERROR in opening the breakIterator in doVariant Function: %s\n", myErrorName(status));
}
@ -1582,12 +1583,14 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
log_verbose("doOtherInvariantTest text of length: %d\n", u_strlen(testChars));
tb = ubrk_open(type, "en_us", work, u_strlen(work), &status);
/* a break should never occur between CR and LF */
for (i = 0; i < u_strlen(testChars); i++) {
work[0] = testChars[i];
for (j = 0; j < u_strlen(testChars); j++) {
work[3] = testChars[j];
tb=ubrk_open(type, "en_US", work, u_strlen(work), &status);
ubrk_setText(tb, work, u_strlen(work), &status);
if(U_FAILURE(status)){
log_err("ERROR in opening the breakIterator in doVariant Function: %s\n", myErrorName(status));
}
@ -1601,7 +1604,7 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
}
}
}
ubrk_close(tb);
/* a break should never occur before a non-spacing mark, unless the preceding
character is CR, LF, PS, or LS */
u_uastrcpy(work,"aaaa");
@ -1616,7 +1619,7 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
(u_charType(c) != U_ENCLOSING_MARK))
continue;
work[2] = c;
tb=ubrk_open(type, "en_US", work, u_strlen(work), &status);
ubrk_setText(tb, work, u_strlen(work), &status);
if(U_FAILURE(status)){
log_err("ERROR in opening the breakIterator in doOtherVariant Function %s\n", myErrorName(status));
}
@ -1630,6 +1633,7 @@ void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars)
}
}
}
ubrk_close(tb);
}
void sample(UBreakIterator* tb, UChar* text)

File diff suppressed because it is too large Load Diff

View File

@ -32,121 +32,59 @@ public:
~IntlTestTextBoundary();
void runIndexedTest( int32_t index, bool_t exec, char* &name, char* par = NULL );
/**
* Test sentence break using doForwardSelectionTest
* Test sentence break using generalIteratorTest()
**/
void TestForwardSentenceSelection(void);
void TestSentenceIteration(void);
/**
* Test sentence break using doFirstSelectionTest
* Test word break using generalIteratorTest()
**/
void TestFirstSentenceSelection(void);
void TestWordIteration(void);
/**
* Test sentence break using doLastSelectionTest
* Test line break using generalIteratorTest()
**/
void TestLineIteration(void);
/**
* Test character break using generalIteratorTest()
**/
void TestLastSentenceSelection(void);
void TestCharacterIteration(void);
/**
* Test sentence break using doBackwardSelectionTest
* Test sentence break invariants
**/
void TestBackwardSentenceSelection(void);
/**
* Test sentence break using doForwardIndexSelectionTest
void TestSentenceInvariants(void);
/**
* Test word break Invariants using generalIteratorTest()
**/
void TestWordInvariants(void);
/**
* Test line break Invariants using generalIteratorTest()
**/
void TestForwardSentenceIndexSelection(void);
/**
* Test sentence break using doBackwardIndexSelectionTest
void TestLineInvariants(void);
/**
* Test character break Invariants using generalIteratorTest()
**/
void TestBackwardSentenceIndexSelection(void);
/**
* Test sentence break using doMultipleSelectionTest
void TestCharacterInvariants(void);
/**
* Test Japanese line break Invariants using generalIteratorTest()
**/
void TestSentenceMultipleSelection(void);
/**
* Test word break using doForwardSelectionTest
void TestJapaneseLineBreak(void);
/**
* Test Thai line break using generalIteratorTest()
**/
void TestForwardWordSelection(void);
/**
* Test word break using doFirstSelectionTest
void TestThaiLineBreak(void);
/**
* Test mixed Thai (Thai with other languages like English) line break using generalIteratorTest()
**/
void TestFirstWordSelection(void);
void TestMixedThaiLineBreak(void);
/**
* Test word break using doLastSelectionTest
* Test Thai Line break with Maiyamok using generalIteratorTest()
* The Thai maiyamok character is a shorthand symbol that means "repeat the previous
* word". Instead of appearing as a word unto itself, however, it's kept together
* with the word before it
**/
void TestLastWordSelection(void);
void TestMaiyamok(void);
/**
* Test word break using doBackwardSelectionTest
**/
void TestBackwardWordSelection(void);
/**
* Test word break using doForwardIndexSelectionTest
**/
void TestForwardWordIndexSelection(void);
/**
* Test word break using doBackwardIndexSelectionTest
**/
void TestBackwardWordIndexSelection(void);
/**
* Test word break using doMultipleSelectionTest
**/
void TestWordMultipleSelection(void);
/**
* Test line break using doLastSelectionTest
**/
void TestForwardLineSelection(void);
/**
* Test line break using doFirstSelectionTest
**/
void TestFirstLineSelection(void);
/**
* Test line break using doLastSelectionTest
**/
void TestLastLineSelection(void);
/**
* Test line break using doBackwardSelectionTest
**/
void TestBackwardLineSelection(void);
/**
* Test line break using doForwardIndexSelectionTest
**/
void TestForwardLineIndexSelection(void);
/**
* Test line break using doBackwardIndexSelectionTest
**/
void TestBackwardLineIndexSelection(void);
/**
* Test line break using doMultipleSelectionTest
**/
void TestLineMultipleSelection(void);
/**
* Test word break using doForwardIndexSelectionTest
**/
void TestForwardCharacterSelection(void);
/**
* Test character break using doFirstSelectionTest
**/
void TestFirstCharacterSelection(void);
/**
* Test character break using doLastSelectionTest
**/
void TestLastCharacterSelection(void);
/**
* Test character break using doBackwardSelectionTest
**/
void TestBackwardCharacterSelection(void);
/**
* Test character break using doForwardIndexSelectionTest
**/
void TestForwardCharacterIndexSelection(void);
/**
* Test character break using doBackwardIndexSelectionTest
**/
void TestBackwardCharacterIndexSelection(void);
/**
* Test character break using doMultipleSelectionTest
**/
void TestCharacterMultipleSelection(void);
/**
* test behaviour of BrakIteraor on an empty string
* test behaviour of BreakIterator on an empty string
**/
void TestEmptyString(void);
/**
@ -162,20 +100,14 @@ public:
**/
void TestPreceding(void);
void TestJapaneseLineBreak(void);
void TestBug4153072(void);
void TestEndBehavior(void);
void TestSentenceInvariants(void);
void TestWordInvariants(void);
/**
* Test End Behaviour
* @bug 4068137
**/
void TestEndBehaviour(void);
void TestLineInvariants(void);
void TestCharacterInvariants(void);
/***********************/
private:
/**
* internal methods to prepare test data
@ -184,62 +116,68 @@ private:
void addTestSentenceData(void);
void addTestLineData(void);
void addTestCharacterData(void);
UnicodeString createTestData(Enumeration* e);
/**
* Perform tests of BreakIterator forward functionality
* on different kinds of iterators (word, sentence, line and character)
* Perform tests of BreakIterator forward and backward functionality
* on different kinds of iterators (word, sentence, line and character).
* It tests the methods first(), next(), current(), preceding(), following(),
* previous() and isBoundary().
* It makes use of internal functions to achieve this.
**/
void doForwardSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
void generalIteratorTest(BreakIterator& bi, Vector* expectedResult);
/**
* Perform tests of BreakIterator backward functionality
* on different kinds of iterators (word, sentence, line and character)
**/
void doBackwardSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
* Internal method to perform iteration and test the first() and next() functions
**/
Vector* testFirstAndNext(BreakIterator& bi, UnicodeString& text);
/**
* Perform tests of BreakIterator first selection functionality
* on different kinds of iterators (word, sentence, line and character)
**/
void doFirstSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
* Internal method to perform iteration and test the last() and previous() functions
**/
Vector* testLastAndPrevious(BreakIterator& bi, UnicodeString& text);
/**
* Internal method to perform iteration and test the following() function
**/
void testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
/**
* Perform tests of BreakIterator last selection functionality
* on different kinds of iterators (word, sentence, line and character)
**/
void doLastSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
* Internal method to perform iteration and test the preceding() function
**/
void testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
/**
* Internal method to perform iteration and test the isBoundary() function
**/
void testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries);
/**
* Internal method which does the comparison of expected and got results.
**/
void compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2);
/**
* Perform tests of BreakIterator forward index functionality
* on different kinds of iterators (word, sentence, line and character)
**/
void doForwardIndexSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
/**
* Perform tests of BreakIterator backward index functionality
* on different kinds of iterators (word, sentence, line and character)
**/
void doBackwardIndexSelectionTest(BreakIterator& iterator, UnicodeString& testText, Vector* result);
/**
* Perform tests of BreakIterator multiple selection functionality
* Internal method to perform tests of BreakIterator multiple selection functionality
* on different kinds of iterators (word, sentence, line and character)
**/
void doMultipleSelectionTest(BreakIterator& iterator, UnicodeString& testText);
/**
* Perform tests with short sample code
* Internal method to perform tests of BreakIterator break Invariants
* on different kinds of iterators (word, sentence, line and character)
**/
void sample(BreakIterator& tb, UnicodeString& text, UnicodeString& title);
void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars);
/**
* Internal method to perform tests of BreakIterator other invariants
* on different kinds of iterators (word, sentence, line and character)
**/
void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars);
/**
* Perform tests with short sample code
**/
void sample(BreakIterator& tb, UnicodeString& text, UnicodeString& title);
/**
* The vectors holding test data for testing
* different kinds of iterators( word, sentence, line and character)
**/
Vector* lineSelectionData;
UnicodeString testLineText;
Vector* sentenceSelectionData;
UnicodeString testSentenceText;
Vector* wordSelectionData;
UnicodeString testWordText;
Vector* characterSelectionData;
UnicodeString testCharacterText;
static const UChar cannedTestArray[];
static UnicodeString *cannedTestChars;
};

View File

@ -11,7 +11,7 @@
!IF "$(CFG)" == ""
CFG=Debug
!MESSAGE No configuration specified. Defaulting to common - Win32 Debug.
!ENDIF
!ENDIF
!IF [cl.exe]
!MESSAGE Could not find build tools!
@ -24,7 +24,7 @@ CFG=Debug
#Let's see if user has given us a path to ICU
#This could be found according to the path to makefile, but for now it is this way
!MESSAGE ICUP=$(ICUP)
!IF "$(ICUP)"==""
!IF "$(ICUP)"==""
!ERROR Can't find path!
!ELSE
ICUDATA=$(ICUP)\icu\data
@ -38,47 +38,47 @@ LINK32 = link.exe
LINK32_FLAGS = /out:"$(ICUDATA)/icudata.dll" /DLL /NOENTRY /base:"0x4ad00000" /comment:" Copyright (C) 1999 International Business Machines Corporation and others. All Rights Reserved. "
CPP_FLAGS = /I$(ICUP)\icu\include /GD /c
#Here we test if configuration is given
#Here we test if configuration is given
!IF "$(CFG)" != "Release" && "$(CFG)" != "release" && "$(CFG)" != "Debug" && "$(CFG)" != "debug"
!MESSAGE Invalid configuration "$(CFG)" specified.
!MESSAGE You can specify a configuration when running NMAKE
!MESSAGE by defining the macro CFG on the command line. For example:
!MESSAGE
!MESSAGE
!MESSAGE NMAKE /f "makedata.mak" CFG="Debug"
!MESSAGE
!MESSAGE
!MESSAGE Possible choices for configuration are:
!MESSAGE
!MESSAGE
!MESSAGE "Release"
!MESSAGE "Debug"
!MESSAGE
!MESSAGE
!ERROR An invalid configuration is specified.
!ENDIF
!ENDIF
# This appears in original Microsofts makefiles
!IF "$(OS)" == "Windows_NT"
NULL=
!ELSE
!ELSE
NULL=nul
!ENDIF
!ENDIF
PATH = $(PATH);$(ICUP)\icu\bin\$(CFG)
# Suffixes for data files
.SUFFIXES : .ucm .cnv .dll .dat .col .res .txt .c
# We're including a list of ucm files. There are two lists, one is essential 'ucmfiles.mk' and
# We're including a list of ucm files. There are two lists, one is essential 'ucmfiles.mk' and
# the other is optional 'ucmlocal.mk'
!IF EXISTS("$(ICUTOOLS)\makeconv\ucmfiles.mk")
!INCLUDE "$(ICUTOOLS)\makeconv\ucmfiles.mk"
!IF EXISTS("$(ICUTOOLS)\makeconv\ucmlocal.mk")
!INCLUDE "$(ICUTOOLS)\makeconv\ucmlocal.mk"
# Append the optional local converter list to the essential one. The macro
# being defined must be named bare (as GENRB_SOURCE and GENCOL_SOURCE are);
# writing $(UCM_SOURCE) on the left expands it instead of assigning to it.
UCM_SOURCE=$(UCM_SOURCE) $(UCM_SOURCE_LOCAL)
!ELSE
!ELSE
#!MESSAGE Warning: cannot find "ucmlocal.mk"
!ENDIF
!ELSE
!ELSE
!ERROR ERROR: cannot find "ucmfiles.mk"
!ENDIF
!ENDIF
# According to the read files, we will generate CNV and C files
CNV_FILES=$(UCM_SOURCE:.ucm=.cnv)
@ -91,12 +91,12 @@ OBJ_CNV_FILES = $(C_CNV_FILES:.c=.obj)
!IF EXISTS("$(ICUTOOLS)\genrb\genrblocal.mk")
!INCLUDE "$(ICUTOOLS)\genrb\genrblocal.mk"
GENRB_SOURCE=$(GENRB_SOURCE) $(GENRB_SOURCE_LOCAL)
!ELSE
!ELSE
#!MESSAGE Warning: cannot find "genrblocal.mk"
!ENDIF
!ELSE
!ELSE
!ERROR ERROR: cannot find "genrbfiles.mk"
!ENDIF
!ENDIF
RB_FILES = $(GENRB_SOURCE:.txt=.res)
# Read list of resource bundle files for collation
@ -105,48 +105,81 @@ RB_FILES = $(GENRB_SOURCE:.txt=.res)
!IF EXISTS("$(ICUTOOLS)\gencol\gencollocal.mk")
!INCLUDE "$(ICUTOOLS)\gencol\gencollocal.mk"
GENCOL_SOURCE=$(GENCOL_SOURCE) $(GENCOL_SOURCE_LOCAL)
!ELSE
!ELSE
#!MESSAGE Warning: cannot find "gencollocal.mk"
!ENDIF
!ELSE
!ELSE
!ERROR ERROR: cannot find "gencolfiles.mk"
!ENDIF
!ENDIF
COL_FILES = $(GENCOL_SOURCE:.txt=.col)
# This target should build all the data files
ALL : GODATA $(RB_FILES) $(CNV_FILES) $(COL_FILES) icudata.dll icudata.dat GOBACK
@echo All targets are up to date
CPP_SOURCES = $(C_CNV_FILES) unames_dat.c cnvalias_dat.c tz_dat.c
BRK_FILES = sent.brk char.brk line.brk word.brk line_th.brk word_th.brk
BRK_CSOURCES = $(BRK_FILES:.brk=_brk.c)
CPP_SOURCES = $(C_CNV_FILES) unames_dat.c cnvalias_dat.c tz_dat.c $(BRK_CSOURCES)
LINK32_OBJS = $(CPP_SOURCES:.c=.obj)
# target for DLL
icudata.dll : $(LINK32_OBJS) $(CNV_FILES)
@echo Creating DLL file
@echo Creating DLL file
@cd $(ICUDATA)
@$(LINK32) @<<
$(LINK32_FLAGS) $(LINK32_OBJS)
<<
# Produce each break-iterator data file (sent, char, line, word, line_th,
# word_th) by copying its "*LE.brk" counterpart in $(ICUDATA) to the
# un-suffixed name that the icudata.dat file list refers to.
# NOTE(review): "LE" presumably means the little-endian build of the data --
# confirm. Also confirm that these $(ICUDATA)\-qualified targets satisfy the
# unqualified names in BRK_FILES.
$(ICUDATA)\sent.brk : $(ICUDATA)\sentLE.brk
copy $(ICUDATA)\sentLE.brk $(ICUDATA)\sent.brk
$(ICUDATA)\char.brk : $(ICUDATA)\charLE.brk
copy $(ICUDATA)\charLE.brk $(ICUDATA)\char.brk
$(ICUDATA)\line.brk : $(ICUDATA)\lineLE.brk
copy $(ICUDATA)\lineLE.brk $(ICUDATA)\line.brk
$(ICUDATA)\word.brk : $(ICUDATA)\wordLE.brk
copy $(ICUDATA)\wordLE.brk $(ICUDATA)\word.brk
$(ICUDATA)\line_th.brk : $(ICUDATA)\line_thLE.brk
copy $(ICUDATA)\line_thLE.brk $(ICUDATA)\line_th.brk
$(ICUDATA)\word_th.brk : $(ICUDATA)\word_thLE.brk
copy $(ICUDATA)\word_thLE.brk $(ICUDATA)\word_th.brk
# target for memory mapped file
icudata.dat : $(CNV_FILES) unames.dat cnvalias.dat tz.dat
icudata.dat : $(CNV_FILES) unames.dat cnvalias.dat tz.dat
@echo Creating memory-mapped file
@cd $(ICUDATA)
@$(ICUTOOLS)\gencmn\$(CFG)\gencmn 1000000 <<
$(ICUDATA)\unames.dat
$(ICUDATA)\cnvalias.dat
$(ICUDATA)\tz.dat
$(ICUDATA)\sent.brk
$(ICUDATA)\char.brk
$(ICUDATA)\line.brk
$(ICUDATA)\word.brk
$(ICUDATA)\line_th.brk
$(ICUDATA)\word_th.brk
$(CNV_FILES:.cnv =.cnv
)
<<
# nothing works without this target, but we're making
# nothing works without this target, but we're making
# these files while creating converters
$(C_CNV_FILES) : $(CNV_FILES)
@$(ICUTOOLS)\genccode\$(CFG)\genccode $(CNV_FILES)
# nothing works without this target: it runs genccode on the break-iterator
# data files to produce the C source files listed in BRK_CSOURCES
$(BRK_CSOURCES) : $(BRK_FILES)
@$(ICUTOOLS)\genccode\$(CFG)\genccode $(BRK_FILES)
# utility to send us to the right dir
GODATA :
GODATA :
@cd $(ICUDATA)
# utility to get us back to the right dir
@ -164,8 +197,15 @@ CLEAN :
-@erase "cnvalias*.*"
-@erase "tz*.*"
-@erase "ibm*_cnv.c"
-@erase "*_brk.c"
-@erase "icudata.*"
-@erase "*.obj"
-@erase "sent.brk"
-@erase "char.brk"
-@erase "line.brk"
-@erase "word.brk"
-@erase "line_th.brk"
-@erase "word_th.brk"
@cd $(TEST)
-@erase "*.res"
@cd $(ICUTOOLS)
@ -184,7 +224,7 @@ CLEAN :
@$(ICUTOOLS)\makeconv\$(CFG)\makeconv $<
# @$(ICUTOOLS)\genccode\$(CFG)\genccode $(CNV_FILES)
# Inference rule for creating collation files -
# Inference rule for creating collation files -
# this should be integrated in genrb
.txt.col::
@echo Making Collation files
@ -203,7 +243,7 @@ unames.dat : UnicodeData-3.0.0.txt
@echo Creating data file for Unicode Names
@$(ICUTOOLS)\gennames\$(CFG)\gennames -v- -c- UnicodeData-3.0.0.txt
unames_dat.c : unames.dat
unames_dat.c : unames.dat
@echo Creating C source file for Unicode Names
@$(ICUTOOLS)\genccode\$(CFG)\genccode $(ICUDATA)\$?
@ -211,8 +251,8 @@ unames_dat.c : unames.dat
cnvalias.dat : convrtrs.txt
@echo Creating data file for Converter Aliases
@$(ICUTOOLS)\gencnval\$(CFG)\gencnval -c-
cnvalias_dat.c : cnvalias.dat
cnvalias_dat.c : cnvalias.dat
@echo Creating C source file for Converter Aliases
@$(ICUTOOLS)\genccode\$(CFG)\genccode $(ICUDATA)\$?