ICU-45 RBBI copyright notices, AIX portability, comments

X-SVN-Rev: 8963
This commit is contained in:
Andy Heninger 2002-06-27 21:14:47 +00:00
parent 59029844b7
commit e56b99a590
9 changed files with 171 additions and 213 deletions

View File

@ -1,3 +1,8 @@
//
// file: rbbi.c Contains the implementation of the rule based break iterator
// runtime engine and the API implementation for
// class RuleBasedBreakIterator
//
/*
**********************************************************************
* Copyright (C) 1999-2002 International Business Machines Corporation *
@ -5,6 +10,7 @@
**********************************************************************
*/
#include "unicode/rbbi.h"
#include "unicode/schriter.h"
#include "unicode/udata.h"
@ -151,12 +157,12 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
UBool RuleBasedBreakIterator::fTrace = FALSE;
void RuleBasedBreakIterator::init() {
static UBool debugInitDone = FALSE;
fText = NULL;
fData = NULL;
fCharMappings = NULL;
fLastBreakTag = 0;
fDictionaryCharCount = 0;
fDictionaryCharCount = 0;
if (debugInitDone == FALSE) {
char *debugEnv = getenv("U_RBBIDEBUG");
@ -190,7 +196,7 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
if (that.getDynamicClassID() != getDynamicClassID())
return FALSE;
const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&)that;
UBool r = (that2.fText == fText);
r |= (*that2.fText == *fText);
@ -229,7 +235,7 @@ RuleBasedBreakIterator::getRules() const {
const CharacterIterator&
RuleBasedBreakIterator::getText() const {
RuleBasedBreakIterator* nonConstThis = (RuleBasedBreakIterator*)this;
// The iterator is initialized pointing to no text at all, so if this
// function is called while we're in that state, we have to fudge an
// iterator to return.
@ -252,7 +258,7 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
}
/**
* Set the iterator to analyze a new piece of text. This function resets
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText An iterator over the text to analyze.
*/
@ -295,7 +301,7 @@ int32_t RuleBasedBreakIterator::last(void) {
reset();
if (fText == NULL)
return BreakIterator::DONE;
// I'm not sure why, but t.last() returns the offset of the last character,
// rather than the past-the-end offset
@ -352,7 +358,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
fText->previous32();
int32_t lastResult = handlePrevious();
int32_t result = lastResult;
// iterate forward from the known break position until we pass our
// starting point. The last break position before the starting
// point is our return value
@ -360,7 +366,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
lastResult = result;
result = handleNext();
}
// set the current iteration position to be the last break position
// before where we started, and then return that value
fText->setIndex(lastResult);
@ -420,7 +426,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
else if (offset < fText->startIndex()) {
return fText->startIndex();
}
// if we start by updating the current iteration position to the
// position specified by the caller, we can just use previous()
// to carry out this operation
@ -445,7 +451,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
else if (offset < fText->startIndex() || offset > fText->endIndex()) {
return FALSE;
}
// otherwise, we can use following() on the position before the specified
// one and return true if the position we get back is the one the user
// specified
@ -462,7 +468,7 @@ int32_t RuleBasedBreakIterator::current(void) const {
}
//=======================================================================
// implementation
// implementation
//=======================================================================
@ -487,11 +493,11 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
// no matter what, we always advance at least one character forward
int32_t result = fText->getIndex() + 1;
int32_t lookaheadResult = 0;
// begin in state 1
int32_t state = START_STATE;
int16_t category;
UChar32 c = fText->current32();
UChar32 c = fText->current32();
RBBIStateTableRow *row;
int32_t lookaheadStatus = 0;
int32_t lookaheadTag = 0;
@ -505,7 +511,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
fDictionaryCharCount++;
category &= ~0x4000;
}
// loop until we reach the end of the text or transition to state 0
for (;;) {
if (c == CharacterIterator::DONE ) {
@ -542,17 +548,17 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
state = row->fNextState[category];
row = (RBBIStateTableRow *)
(fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
// Get the next character. Doing it here positions the iterator
// to the correct position for recording matches in the code that
// follows.
c = fText->next32();
if (row->fAccepting == 0 && row->fLookAhead == 0) {
// No match, nothing of interest happening, common case.
goto continueOn;
}
if (row->fAccepting == -1) {
// Match found, common case, no lookahead involved.
// (It's possible that some lookahead rule matched here also,
@ -562,7 +568,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
fLastBreakTag = row->fTag; // Remember the break status (tag) value.
goto continueOn;
}
if (row->fAccepting == 0 && row->fLookAhead != 0) {
// Lookahead match point. Remember it, but only if no other rule has
// unconditionally matched up to this point.
@ -594,7 +600,7 @@ continueOn:
if (state == STOP_STATE) {
break;
}
// c = fText->next32();
}
@ -605,7 +611,7 @@ continueOn:
if (c == CharacterIterator::DONE && lookaheadResult == fText->endIndex()) {
result = lookaheadResult;
}
fText->setIndex(result);
if (fTrace) {
@ -626,13 +632,13 @@ continueOn:
//
//-----------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::handlePrevious(void) {
if (fText == NULL || fData == NULL) {
if (fText == NULL || fData == NULL) {
return 0;
}
if (fData->fReverseTable == NULL) {
return fText->setToStart();
}
int32_t state = START_STATE;
int32_t category;
int32_t lastCategory = 0;
@ -650,11 +656,11 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
fDictionaryCharCount++;
category &= ~0x4000;
}
if (fTrace) {
printf("Handle Prev pos char state category \n");
}
// loop until we reach the beginning of the text or transition to state 0
for (;;) {
if (c == CharacterIterator::DONE) {
@ -693,14 +699,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
// No match, nothing of interest happening, common case.
goto continueOn;
}
if (row->fAccepting == -1) {
// Match found, common case, no lookahead involved.
result = fText->getIndex();
lookaheadStatus = 0; // clear out any pending look-ahead matches.
goto continueOn;
}
if (row->fAccepting == 0 && row->fLookAhead != 0) {
// Lookahead match point. Remember it, but only if no other rule
// has unconditionally matched to this point.
@ -714,7 +720,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
}
goto continueOn;
}
if (row->fAccepting != 0 && row->fLookAhead != 0) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
@ -732,18 +738,18 @@ continueOn:
if (state == STOP_STATE) {
break;
}
// then advance one character backwards
c = fText->previous32();
}
// Note: the result position isn't what is returned to the user by previous(),
// but where the implementation of previous() turns around and
// Note: the result position isn't what is returned to the user by previous(),
// but where the implementation of previous() turns around and
// starts iterating forward again.
if (c == CharacterIterator::DONE) {
result = fText->startIndex();
}
fText->setIndex(result);
}
fText->setIndex(result);
return result;
}
@ -808,7 +814,7 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
}
//
// If user buffer size is zero this is a preflight operation to
// If user buffer size is zero this is a preflight operation to
// obtain the needed buffer size, allowing for worst case misalignment.
//
if (bufferSize == 0) {
@ -859,7 +865,7 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
clone->fBufferClone = TRUE;
}
return clone;
return clone;
}

View File

@ -1,6 +1,21 @@
#
# rbbicst Compile the RBBI rule parser state table data into initialized C data.
# Usage:
# cd icu/source/common
# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
#
# The output file, rbbirpt.h, is included by some of the .cpp rbbi
# implementation files. This perl script is NOT run as part
# of a normal ICU build. It is run by hand when needed, and the
# rbbirpt.h generated file is put back into cvs.
#
# See rbbirpt.h for a description of the input format for this script.
#
#*********************************************************************
# Copyright (C) 2002 International Business Machines Corporation *
# and others. All rights reserved. *
#*********************************************************************
$num_states = 1; # Always the state number for the line being compiled.
$line_num = 0; # The line number in the input file.
@ -180,10 +195,14 @@ die if ($errors>0);
print "//---------------------------------------------------------------------------------\n";
print "//\n";
print "// Generated Header File. Do not edit by hand.\n";
print "// This file contains the state table for RBBI rule parser.\n";
print "// This file contains the state table for the ICU Rule Based Break Iterator\n";
print "// rule parser.\n";
print "// It is generated by the Perl script \"rbbicst.pl\" from\n";
print "// the rule parser state definitions file \"rbbirpt.txt\".\n";
print "//\n";
print "// Copyright (C) 2002 International Business Machines Corporation \n";
print "// and others. All rights reserved. \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";
print "#ifndef RBBIRPT_H\n";
print "#define RBBIRPT_H\n";
@ -257,7 +276,7 @@ for ($state=1; $state < $num_states; $state++) {
print " , {$state_func_name[$state],";
if ($state_literal_chars[$state] ne "") {
$c = $state_literal_chars[$state];
printf(" %d /*$c*/,", ord($c)); #TODO: use numeric value, so EBCDIC machines are ok.
printf(" %d /* $c */,", ord($c)); # use numeric value, so EBCDIC machines are ok.
}else {
print " $charClasses{$state_char_class[$state]},";
}

View File

@ -201,8 +201,8 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
//
// Generate the DFA state transition table.
//
builder.fForwardTables = new RBBITableBuilder(&builder, builder.fForwardTree);
builder.fReverseTables = new RBBITableBuilder(&builder, builder.fReverseTree);
builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
builder.fForwardTables->build();
builder.fReverseTables->build();
if (U_FAILURE(status)) {

View File

@ -1,10 +1,14 @@
//---------------------------------------------------------------------------------
//
// Generated Header File. Do not edit by hand.
// This file contains the state table for RBBI rule parser.
// This file contains the state table for the ICU Rule Based Break Iterator
// rule parser.
// It is generated by the Perl script "rbbicst.pl" from
// the rule parser state definitions file "rbbirpt.txt".
//
// Copyright (C) 2002 International Business Machines Corporation
// and others. All rights reserved.
//
//---------------------------------------------------------------------------------
#ifndef RBBIRPT_H
#define RBBIRPT_H
@ -71,87 +75,87 @@ struct RBBIRuleTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doExprStart, 254, 12, 8, FALSE} // 1 start
, {doNOP, 130, 1,0, TRUE} // 2
, {doExprStart, 36 /*$*/, 70, 80, FALSE} // 3
, {doReverseDir, 33 /*!*/, 11,0, TRUE} // 4
, {doNOP, 59 /*;*/, 1,0, TRUE} // 5
, {doExprStart, 36 /* $ */, 70, 80, FALSE} // 3
, {doReverseDir, 33 /* ! */, 11,0, TRUE} // 4
, {doNOP, 59 /* ; */, 1,0, TRUE} // 5
, {doNOP, 252, 0,0, FALSE} // 6
, {doExprStart, 255, 12, 8, FALSE} // 7
, {doEndOfRule, 59 /*;*/, 1,0, TRUE} // 8 break-rule-end
, {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end
, {doNOP, 130, 8,0, TRUE} // 9
, {doRuleError, 255, 85,0, FALSE} // 10
, {doExprStart, 255, 12, 8, FALSE} // 11 reverse-rule
, {doRuleChar, 254, 21,0, TRUE} // 12 term
, {doNOP, 130, 12,0, TRUE} // 13
, {doRuleChar, 129, 21,0, TRUE} // 14
, {doNOP, 91 /*[*/, 76, 21, FALSE} // 15
, {doLParen, 40 /*(*/, 12, 21, TRUE} // 16
, {doNOP, 36 /*$*/, 70, 20, FALSE} // 17
, {doDotAny, 46 /*.*/, 21,0, TRUE} // 18
, {doNOP, 91 /* [ */, 76, 21, FALSE} // 15
, {doLParen, 40 /* ( */, 12, 21, TRUE} // 16
, {doNOP, 36 /* $ */, 70, 20, FALSE} // 17
, {doDotAny, 46 /* . */, 21,0, TRUE} // 18
, {doRuleError, 255, 85,0, FALSE} // 19
, {doCheckVarDef, 255, 21,0, FALSE} // 20 term-var-ref
, {doUnaryOpStar, 42 /***/, 25,0, TRUE} // 21 expr-mod
, {doUnaryOpPlus, 43 /*+*/, 25,0, TRUE} // 22
, {doUnaryOpQuestion, 63 /*?*/, 25,0, TRUE} // 23
, {doUnaryOpStar, 42 /* * */, 25,0, TRUE} // 21 expr-mod
, {doUnaryOpPlus, 43 /* + */, 25,0, TRUE} // 22
, {doUnaryOpQuestion, 63 /* ? */, 25,0, TRUE} // 23
, {doNOP, 255, 25,0, FALSE} // 24
, {doExprCatOperator, 254, 12,0, FALSE} // 25 expr-cont
, {doNOP, 130, 25,0, TRUE} // 26
, {doExprCatOperator, 129, 12,0, FALSE} // 27
, {doExprCatOperator, 91 /*[*/, 12,0, FALSE} // 28
, {doExprCatOperator, 40 /*(*/, 12,0, FALSE} // 29
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 30
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 31
, {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 32
, {doExprCatOperator, 123 /*{*/, 49,0, TRUE} // 33
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 34
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 35
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 28
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 29
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 30
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 31
, {doExprCatOperator, 47 /* / */, 37,0, FALSE} // 32
, {doExprCatOperator, 123 /* { */, 49,0, TRUE} // 33
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 34
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 35
, {doExprFinished, 255, 255,0, FALSE} // 36
, {doSlash, 47 /*/*/, 39,0, TRUE} // 37 look-ahead
, {doSlash, 47 /* / */, 39,0, TRUE} // 37 look-ahead
, {doNOP, 255, 85,0, FALSE} // 38
, {doExprCatOperator, 254, 12,0, FALSE} // 39 expr-cont-no-slash
, {doNOP, 130, 25,0, TRUE} // 40
, {doExprCatOperator, 129, 12,0, FALSE} // 41
, {doExprCatOperator, 91 /*[*/, 12,0, FALSE} // 42
, {doExprCatOperator, 40 /*(*/, 12,0, FALSE} // 43
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 44
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 45
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 46
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 47
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 42
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 43
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 44
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 45
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 46
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 47
, {doExprFinished, 255, 255,0, FALSE} // 48
, {doNOP, 130, 49,0, TRUE} // 49 tag-open
, {doStartTagValue, 128, 52,0, FALSE} // 50
, {doTagExpectedError, 255, 85,0, FALSE} // 51
, {doNOP, 130, 56,0, TRUE} // 52 tag-value
, {doNOP, 125 /*}*/, 56,0, FALSE} // 53
, {doNOP, 125 /* } */, 56,0, FALSE} // 53
, {doTagDigit, 128, 52,0, TRUE} // 54
, {doTagExpectedError, 255, 85,0, FALSE} // 55
, {doNOP, 130, 56,0, TRUE} // 56 tag-close
, {doTagValue, 125 /*}*/, 59,0, TRUE} // 57
, {doTagValue, 125 /* } */, 59,0, TRUE} // 57
, {doTagExpectedError, 255, 85,0, FALSE} // 58
, {doExprCatOperator, 254, 12,0, FALSE} // 59 expr-cont-no-tag
, {doNOP, 130, 59,0, TRUE} // 60
, {doExprCatOperator, 129, 12,0, FALSE} // 61
, {doExprCatOperator, 91 /*[*/, 12,0, FALSE} // 62
, {doExprCatOperator, 40 /*(*/, 12,0, FALSE} // 63
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 64
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 65
, {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 66
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 67
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 68
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 62
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 63
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 64
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 65
, {doExprCatOperator, 47 /* / */, 37,0, FALSE} // 66
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 67
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 68
, {doExprFinished, 255, 255,0, FALSE} // 69
, {doStartVariableName, 36 /*$*/, 72,0, TRUE} // 70 scan-var-name
, {doStartVariableName, 36 /* $ */, 72,0, TRUE} // 70 scan-var-name
, {doNOP, 255, 85,0, FALSE} // 71
, {doNOP, 132, 74,0, TRUE} // 72 scan-var-start
, {doVariableNameExpectedErr, 255, 85,0, FALSE} // 73
, {doNOP, 131, 74,0, TRUE} // 74 scan-var-body
, {doEndVariableName, 255, 255,0, FALSE} // 75
, {doScanUnicodeSet, 91 /*[*/, 255,0, TRUE} // 76 scan-unicode-set
, {doScanUnicodeSet, 112 /*p*/, 255,0, TRUE} // 77
, {doScanUnicodeSet, 80 /*P*/, 255,0, TRUE} // 78
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 76 scan-unicode-set
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 77
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 78
, {doNOP, 255, 85,0, FALSE} // 79
, {doNOP, 130, 80,0, TRUE} // 80 assign-or-rule
, {doStartAssign, 61 /*=*/, 12, 83, TRUE} // 81
, {doStartAssign, 61 /* = */, 12, 83, TRUE} // 81
, {doNOP, 255, 20, 8, FALSE} // 82
, {doEndAssign, 59 /*;*/, 1,0, TRUE} // 83 assign-end
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 83 assign-end
, {doRuleErrorAssignExpr, 255, 85,0, FALSE} // 84
, {doExit, 255, 85,0, TRUE} // 85 errorDeath
};

View File

@ -1,18 +1,20 @@
//
// rbbisetb.cpp
//
/*
**********************************************************************
* Copyright (c) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
* Copyright (C) 2002 International Business Machines Corporation *
* and others. All rights reserved. *
**********************************************************************
*/
//
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
// (part of the rule building process.)
//
// Starting with the rules parse tree from the scanner,
//
// - Enumerate the set of UnicodeSets that are referenced
// by the RBBI rules.
// by the RBBI rules.
// - compute a set of non-overlapping character ranges
// with all characters within a range belonging to the same
// set of input Unicode sets.
@ -62,10 +64,10 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
// Destructor
//
//------------------------------------------------------------------------
RBBISetBuilder::~RBBISetBuilder()
RBBISetBuilder::~RBBISetBuilder()
{
RangeDescriptor *nextRangeDesc;
// Walk through & delete the linked list of RangeDescriptors
for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
RangeDescriptor *r = nextRangeDesc;
@ -227,7 +229,7 @@ void RBBISetBuilder::build() {
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "esets")) {printSets();}
//
// Build the Trie table for mapping UChar32 values to the corresponding
// range group number
@ -278,7 +280,7 @@ utrie_serialize(fTrie,
TRUE, // Reduce to 16 bits
fStatus);
}
//------------------------------------------------------------------------
//
// addValToSets Add a runtime-mapped input value to each uset from a
@ -291,7 +293,7 @@ utrie_serialize(fTrie,
//
// The "logically equivalent expression" is the tree for an
// or-ing together of all of the symbols that go into the set.
//
//
//------------------------------------------------------------------------
void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
int32_t ix;
@ -354,7 +356,7 @@ void RBBISetBuilder::printRanges() {
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
setName = varRef->fText;
}
}
}
RBBINode::printUnicodeString(setName); printf(" ");
}
printf("\n");
@ -373,7 +375,7 @@ void RBBISetBuilder::printRangeGroups() {
RangeDescriptor *tRange;
int i;
int lastPrintedGroupNum = 0;
printf("\nRanges grouped by Unicode Set Membership...\n");
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
int groupNum = rlRange->fNum & 0xbfff;
@ -382,7 +384,7 @@ void RBBISetBuilder::printRangeGroups() {
printf("%2i ", groupNum);
if (rlRange->fNum & 0x4000) { printf(" <DICT> ");};
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
UnicodeString setName = "anon";
@ -392,8 +394,8 @@ void RBBISetBuilder::printRangeGroups() {
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
setName = varRef->fText;
}
}
RBBINode::printUnicodeString(setName); printf(" ");
}
RBBINode::printUnicodeString(setName); printf(" ");
}
i = 0;
@ -410,7 +412,7 @@ void RBBISetBuilder::printRangeGroups() {
}
printf("\n");
}
//------------------------------------------------------------------------
@ -440,7 +442,7 @@ void RBBISetBuilder::printSets() {
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
setName = varRef->fText;
}
}
}
RBBINode::printUnicodeString(setName);
printf(" ");
RBBINode::printUnicodeString(usetNode->fText);

View File

@ -1,12 +1,12 @@
//
// file: rbbistbl.cpp Implementation of the ICU RBBISymbolTable class
//
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2001, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/*
**********************************************************************
* Copyright (C) 2002 International Business Machines Corporation *
* and others. All rights reserved. *
**********************************************************************
*/
#include "unicode/unistr.h"
#include "unicode/uniset.h"

View File

@ -4,7 +4,7 @@
/*
**********************************************************************
* Copyright (c) 2001, International Business Machines
* Copyright (c) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -20,8 +20,8 @@
#include <assert.h>
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode) :
fTree(rootNode) {
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
fTree(*rootNode) {
fRB = rb;
fStatus = fRB->fStatus;
fDStates = new UVector(*fStatus);

View File

@ -4,7 +4,7 @@
/*
**********************************************************************
* Copyright (c) 2001, International Business Machines
* Copyright (c) 2002, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -21,6 +21,7 @@
U_NAMESPACE_BEGIN
class RBBIRuleScanner;
class RBBIRuleBuilder;
//
// class RBBITableBuilder is part of the RBBI rule compiler.
@ -33,9 +34,7 @@ class RBBIRuleScanner;
class RBBITableBuilder : public UObject {
public:
// TODO: add a root node param to the constructor. We're going to have two
// builders, one for the forward table, and one for the reverse table.
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode);
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
~RBBITableBuilder();
void build();
@ -46,7 +45,7 @@ public:
// the specified location.
// TODO: add getter function(s) for the built table.
private:
void calcNullable(RBBINode *n);
void calcFirstPos(RBBINode *n);
@ -71,7 +70,7 @@ private:
private:
RBBIRuleBuilder *fRB;
RBBINode *&fTree; // The root node of the parse tree to build a
// table for.
// table for.
UErrorCode *fStatus;
UVector *fDStates; // D states (Aho's terminology)

View File

@ -1,5 +1,9 @@
/*
* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
***************************************************************************
* Copyright (C) 1999-2002 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
**********************************************************************
* Date Name Description
* 10/22/99 alan Creation.
@ -28,26 +32,18 @@ class BreakIterator;
/**
* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
*
* <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
* <p>There are two kinds of rules, which are separated by semicolons: <i>variable definitions</i>
* and <i>regular expressions.</i></p>
*
* <p>A substitution rule defines a name that can be used in place of an expression. It
* consists of a name, which is a string of characters contained in angle brackets, an equals
* sign, and an expression. (There can be no whitespace on either side of the equals sign.)
* To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
* square brackets. A substitution is visible after its definition, and is filled in using
* simple textual substitution. Substitution definitions can contain other substitutions, as
* long as those substitutions have been defined first. Substitutions are generally used to
* make the regular expressions (which can get quite complex) shorted and easier to read.
* <p>A variable definition defines a variable name that can be used in subsequent expressions.
* It consists of a name preceded by a dollar sign, an equals
* sign, and an expression.
* A $variable is visible after its definition.
* Variable definitions can contain other variables, as
* long as those variables have been defined first. Variables are generally used to
* make the regular expressions (which can get quite complex) shorter and easier to read.
* They typically define either character categories or commonly-used subexpressions.</p>
*
* <p>There is one special substitution.&nbsp; If the description defines a substitution
* called &quot;&lt;ignore&gt;&quot;, the expression must be a [] expression, and the
* expression defines a set of characters (the &quot;<em>ignore characters</em>&quot;) that
* will be transparent to the BreakIterator.&nbsp; A sequence of characters will break the
* same way it would if any ignore characters it contains are taken out.&nbsp; Break
* positions never occur befoer ignore characters.</p>
*
* <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
* defines a sequence of characters to be kept together. With one significant exception, the
* iterator uses a longest-possible-match algorithm when matching text to regular
@ -64,10 +60,6 @@ class BreakIterator;
* of times (including not at all).</td>
* </tr>
* <tr>
* <td width="6%">{}</td>
* <td width="94%">Encloses a sequence of characters that is optional.</td>
* </tr>
* <tr>
* <td width="6%">()</td>
* <td width="94%">Encloses a sequence of characters.&nbsp; If followed by *, the sequence
* repeats.&nbsp; Otherwise, the parentheses are just a grouping device and a way to delimit
@ -76,29 +68,17 @@ class BreakIterator;
* <tr>
* <td width="6%">|</td>
* <td width="94%">Separates two alternative sequences of characters.&nbsp; Either one
* sequence or the other, but not both, matches this expression.&nbsp; The | character can
* only occur inside ().</td>
* sequence or the other, but not both, matches this expression.</td>
* </tr>
* <tr>
* <td width="6%">.</td>
* <td width="94%">Matches any character.</td>
* </tr>
* <tr>
* <td width="6%">*?</td>
* <td width="94%">Specifies a non-greedy asterisk.&nbsp; *? works the same way as *, except
* when there is overlap between the last group of characters in the expression preceding the
* * and the first group of characters following the *.&nbsp; When there is this kind of
* overlap, * will match the longest sequence of characters that match the expression before
* the *, and *? will match the shortest sequence of characters matching the expression
* before the *?.&nbsp; For example, if you have &quot;xxyxyyyxyxyxxyxyxyy&quot; in the text,
* &quot;x[xy]*x&quot; will match through to the last x (i.e., &quot;<strong>xxyxyyyxyxyxxyxyx</strong>yy&quot;,
* but &quot;x[xy]*?x&quot; will only match the first two xes (&quot;<strong>xx</strong>yxyyyxyxyxxyxyxyy&quot;).</td>
* </tr>
* <tr>
* <td width="6%">[]</td>
* <td width="94%">Specifies a group of alternative characters.&nbsp; A [] expression will
* <td width="94%">Specify a set of characters.&nbsp; A [] expression will
* match any single character that is specified in the [] expression.&nbsp; For more on the
* syntax of [] expressions, see below.</td>
* syntax of [] expressions, see the ICU User Guide description of UnicodeSet.</td>
* </tr>
* <tr>
* <td width="6%">/</td>
@ -111,24 +91,16 @@ class BreakIterator;
* <tr>
* <td width="6%">\</td>
* <td width="94%">Escape character.&nbsp; The \ itself is ignored, but causes the next
* character to be treated as literal character.&nbsp; This has no effect for many
* characters, but for the characters listed above, this deprives them of their special
* meaning.&nbsp; (There are no special escape sequences for Unicode characters, or tabs and
* newlines; these are all handled by a higher-level protocol.&nbsp; In a Java string,
* &quot;\n&quot; will be converted to a literal newline character by the time the
* regular-expression parser sees it.&nbsp; Of course, this means that \ sequences that are
* visible to the regexp parser must be written as \\ when inside a Java string.)&nbsp; All
* characters in the ASCII range except for letters, digits, and control characters are
* reserved characters to the parser and must be preceded by \ even if they currently don't
* mean anything.</td>
* character to be treated as literal character.&nbsp; Except for letters and numbers,
* characters in the ASCII range must be escaped to be considered as literals.</td>
* </tr>
* <tr>
* <td width="6%">!</td>
* <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
* parser that this expression specifies the backwards-iteration behavior of the iterator,
* and not its normal iteration behavior.&nbsp; This is generally only used in situations
* where the automatically-generated backwards-iteration brhavior doesn't produce
* satisfactory results and must be supplemented with extra client-specified rules.</td>
* and not its normal iteration behavior.&nbsp; The backwards rules must move the
* iterator to a safe position at or before the previous break position; forwards rules
* will then be used to find the exact previous position.</td>
* </tr>
* <tr>
* <td width="6%"><em>(all others)</em></td>
@ -137,52 +109,6 @@ class BreakIterator;
* </tr>
* </table>
* </blockquote>
*
* <p>Within a [] expression, a number of other special characters can be used to specify
* groups of characters:</p>
*
* <blockquote>
* <table border="1" width="100%">
* <tr>
* <td width="6%">-</td>
* <td width="94%">Specifies a range of matching characters.&nbsp; For example
* &quot;[a-p]&quot; matches all lowercase Latin letters from a to p (inclusive).&nbsp; The -
* sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
* language's alphabetical order: &quot;[a-z]&quot; doesn't include capital letters, nor does
* it include accented letters such as a-umlaut.</td>
* </tr>
* <tr>
* <td width="6%">::</td>
* <td width="94%">A pair of colons containing a one- or two-letter code matches all
* characters in the corresponding Unicode category.&nbsp; The two-letter codes are the same
* as the two-letter codes in the Unicode database (for example, &quot;[:Sc::Sm:]&quot;
* matches all currency symbols and all math symbols).&nbsp; Specifying a one-letter code is
* the same as specifying all two-letter codes that begin with that letter (for example,
* &quot;[:L:]&quot; matches all letters, and is equivalent to
* &quot;[:Lu::Ll::Lo::Lm::Lt:]&quot;).&nbsp; Anything other than a valid two-letter Unicode
* category code or a single letter that begins a Unicode category code is illegal within
* colons.</td>
* </tr>
* <tr>
* <td width="6%">[]</td>
* <td width="94%">[] expressions can nest.&nbsp; This has no effect, except when used in
* conjunction with the ^ token.</td>
* </tr>
* <tr>
* <td width="6%">^</td>
* <td width="94%">Excludes the character (or the characters in the [] expression) following
* it from the group of characters.&nbsp; For example, &quot;[a-z^p]&quot; matches all Latin
* lowercase letters except p.&nbsp; &quot;[:L:^[\u4e00-\u9fff]]&quot; matches all letters
* except the Han ideographs.</td>
* </tr>
* <tr>
* <td width="6%"><em>(all others)</em></td>
* <td width="94%">All other characters are treated as literal characters.&nbsp; (For
* example, &quot;[aeiou]&quot; specifies just the letters a, e, i, o, and u.)</td>
* </tr>
* </table>
* </blockquote>
*
*/
@ -201,7 +127,9 @@ protected:
//
RBBIDataWrapper *fData;
UTrie *fCharMappings;
int32_t fLastBreakTag; // Rule {tag} value for the most recent match.
// Rule {tag} value for the most recent match.
int32_t fLastBreakTag;
//
// Counter for the number of characters encountered with the "dictionary"
@ -215,7 +143,7 @@ protected:
// Debugging flag.
//
static UBool fTrace;
private:
@ -228,7 +156,7 @@ protected:
//=======================================================================
// constructors
//=======================================================================
// This constructor uses the udata interface to create a BreakIterator whose
// internal tables live in a memory-mapped file. "image" is a pointer to the
// beginning of that file.
@ -248,7 +176,7 @@ protected:
friend class BreakIterator;
public:
/** Default constructor. Creates an empty shell of an iterator, with no
@ -500,7 +428,7 @@ protected:
* Return true if the category lookup for this char
* indicates that it is in the set of dictionary lookup chars.
* This function is intended for use by dictionary based break iterators.
*/
*/
virtual UBool isDictionaryChar(UChar32);
/**
@ -513,7 +441,7 @@ protected:
//----------------------------------------------------------------------------------
//
// Inline Functions Definitions ...