ICU-45 RBBI copyright notices, AIX portability, comments
X-SVN-Rev: 8963
This commit is contained in:
parent
59029844b7
commit
e56b99a590
@ -1,3 +1,8 @@
|
||||
//
|
||||
// file: rbbi.c Contains the implementation of the rule based break iterator
|
||||
// runtime engine and the API implementation for
|
||||
// class RuleBasedBreakIterator
|
||||
//
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2002 International Business Machines Corporation *
|
||||
@ -5,6 +10,7 @@
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "unicode/udata.h"
|
||||
@ -151,12 +157,12 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
||||
UBool RuleBasedBreakIterator::fTrace = FALSE;
|
||||
void RuleBasedBreakIterator::init() {
|
||||
static UBool debugInitDone = FALSE;
|
||||
|
||||
|
||||
fText = NULL;
|
||||
fData = NULL;
|
||||
fCharMappings = NULL;
|
||||
fLastBreakTag = 0;
|
||||
fDictionaryCharCount = 0;
|
||||
fDictionaryCharCount = 0;
|
||||
|
||||
if (debugInitDone == FALSE) {
|
||||
char *debugEnv = getenv("U_RBBIDEBUG");
|
||||
@ -190,7 +196,7 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
|
||||
if (that.getDynamicClassID() != getDynamicClassID())
|
||||
return FALSE;
|
||||
|
||||
|
||||
|
||||
const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&)that;
|
||||
UBool r = (that2.fText == fText);
|
||||
r |= (*that2.fText == *fText);
|
||||
@ -229,7 +235,7 @@ RuleBasedBreakIterator::getRules() const {
|
||||
const CharacterIterator&
|
||||
RuleBasedBreakIterator::getText() const {
|
||||
RuleBasedBreakIterator* nonConstThis = (RuleBasedBreakIterator*)this;
|
||||
|
||||
|
||||
// The iterator is initialized pointing to no text at all, so if this
|
||||
// function is called while we're in that state, we have to fudge an
|
||||
// iterator to return.
|
||||
@ -252,7 +258,7 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText An iterator over the text to analyze.
|
||||
*/
|
||||
@ -295,7 +301,7 @@ int32_t RuleBasedBreakIterator::last(void) {
|
||||
reset();
|
||||
if (fText == NULL)
|
||||
return BreakIterator::DONE;
|
||||
|
||||
|
||||
// I'm not sure why, but t.last() returns the offset of the last character,
|
||||
// rather than the past-the-end offset
|
||||
|
||||
@ -352,7 +358,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
||||
fText->previous32();
|
||||
int32_t lastResult = handlePrevious();
|
||||
int32_t result = lastResult;
|
||||
|
||||
|
||||
// iterate forward from the known break position until we pass our
|
||||
// starting point. The last break position before the starting
|
||||
// point is our return value
|
||||
@ -360,7 +366,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
||||
lastResult = result;
|
||||
result = handleNext();
|
||||
}
|
||||
|
||||
|
||||
// set the current iteration position to be the last break position
|
||||
// before where we started, and then return that value
|
||||
fText->setIndex(lastResult);
|
||||
@ -420,7 +426,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
|
||||
else if (offset < fText->startIndex()) {
|
||||
return fText->startIndex();
|
||||
}
|
||||
|
||||
|
||||
// if we start by updating the current iteration position to the
|
||||
// position specified by the caller, we can just use previous()
|
||||
// to carry out this operation
|
||||
@ -445,7 +451,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
|
||||
else if (offset < fText->startIndex() || offset > fText->endIndex()) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
// otherwise, we can use following() on the position before the specified
|
||||
// one and return true if the position we get back is the one the user
|
||||
// specified
|
||||
@ -462,7 +468,7 @@ int32_t RuleBasedBreakIterator::current(void) const {
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// implementation
|
||||
// implementation
|
||||
//=======================================================================
|
||||
|
||||
|
||||
@ -487,11 +493,11 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
// no matter what, we always advance at least one character forward
|
||||
int32_t result = fText->getIndex() + 1;
|
||||
int32_t lookaheadResult = 0;
|
||||
|
||||
|
||||
// begin in state 1
|
||||
int32_t state = START_STATE;
|
||||
int16_t category;
|
||||
UChar32 c = fText->current32();
|
||||
UChar32 c = fText->current32();
|
||||
RBBIStateTableRow *row;
|
||||
int32_t lookaheadStatus = 0;
|
||||
int32_t lookaheadTag = 0;
|
||||
@ -505,7 +511,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
}
|
||||
|
||||
|
||||
// loop until we reach the end of the text or transition to state 0
|
||||
for (;;) {
|
||||
if (c == CharacterIterator::DONE ) {
|
||||
@ -542,17 +548,17 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
state = row->fNextState[category];
|
||||
row = (RBBIStateTableRow *)
|
||||
(fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
|
||||
|
||||
|
||||
// Get the next character. Doing it here positions the iterator
|
||||
// to the correct position for recording matches in the code that
|
||||
// follows.
|
||||
c = fText->next32();
|
||||
|
||||
|
||||
if (row->fAccepting == 0 && row->fLookAhead == 0) {
|
||||
// No match, nothing of interest happening, common case.
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
|
||||
if (row->fAccepting == -1) {
|
||||
// Match found, common case, no lookahead involved.
|
||||
// (It's possible that some lookahead rule matched here also,
|
||||
@ -562,7 +568,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
fLastBreakTag = row->fTag; // Remember the break status (tag) value.
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
|
||||
if (row->fAccepting == 0 && row->fLookAhead != 0) {
|
||||
// Lookahead match point. Remember it, but only if no other rule has
|
||||
// unconditionally matched up to this point.
|
||||
@ -594,7 +600,7 @@ continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
// c = fText->next32();
|
||||
}
|
||||
|
||||
@ -605,7 +611,7 @@ continueOn:
|
||||
if (c == CharacterIterator::DONE && lookaheadResult == fText->endIndex()) {
|
||||
result = lookaheadResult;
|
||||
}
|
||||
|
||||
|
||||
|
||||
fText->setIndex(result);
|
||||
if (fTrace) {
|
||||
@ -626,13 +632,13 @@ continueOn:
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
if (fText == NULL || fData == NULL) {
|
||||
if (fText == NULL || fData == NULL) {
|
||||
return 0;
|
||||
}
|
||||
if (fData->fReverseTable == NULL) {
|
||||
return fText->setToStart();
|
||||
}
|
||||
|
||||
|
||||
int32_t state = START_STATE;
|
||||
int32_t category;
|
||||
int32_t lastCategory = 0;
|
||||
@ -650,11 +656,11 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
}
|
||||
|
||||
|
||||
if (fTrace) {
|
||||
printf("Handle Prev pos char state category \n");
|
||||
}
|
||||
|
||||
|
||||
// loop until we reach the beginning of the text or transition to state 0
|
||||
for (;;) {
|
||||
if (c == CharacterIterator::DONE) {
|
||||
@ -693,14 +699,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
// No match, nothing of interest happening, common case.
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
|
||||
if (row->fAccepting == -1) {
|
||||
// Match found, common case, no lookahead involved.
|
||||
result = fText->getIndex();
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
|
||||
if (row->fAccepting == 0 && row->fLookAhead != 0) {
|
||||
// Lookahead match point. Remember it, but only if no other rule
|
||||
// has unconditionally matched to this point.
|
||||
@ -714,7 +720,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
||||
}
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
|
||||
if (row->fAccepting != 0 && row->fLookAhead != 0) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
@ -732,18 +738,18 @@ continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
// then advance one character backwards
|
||||
c = fText->previous32();
|
||||
}
|
||||
|
||||
// Note: the result position isn't what is returned to the user by previous(),
|
||||
// but where the implementation of previous() turns around and
|
||||
|
||||
// Note: the result position isn't what is returned to the user by previous(),
|
||||
// but where the implementation of previous() turns around and
|
||||
// starts iterating forward again.
|
||||
if (c == CharacterIterator::DONE) {
|
||||
result = fText->startIndex();
|
||||
}
|
||||
fText->setIndex(result);
|
||||
}
|
||||
fText->setIndex(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -808,7 +814,7 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
|
||||
}
|
||||
|
||||
//
|
||||
// If user buffer size is zero this is a preflight operation to
|
||||
// If user buffer size is zero this is a preflight operation to
|
||||
// obtain the needed buffer size, allowing for worst case misalignment.
|
||||
//
|
||||
if (bufferSize == 0) {
|
||||
@ -859,7 +865,7 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
|
||||
clone->fBufferClone = TRUE;
|
||||
}
|
||||
|
||||
return clone;
|
||||
return clone;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,6 +1,21 @@
|
||||
#
|
||||
# rbbicst Compile the RBBI rule parser state table data into initialized C data.
|
||||
# Usage:
|
||||
# cd icu/source/common
|
||||
# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
|
||||
#
|
||||
# The output file, rbbirpt.h, is included by some of the .cpp rbbi
|
||||
# implementation files. This perl script is NOT run as part
|
||||
# of a normal ICU build. It is run by hand when needed, and the
|
||||
# rbbirpt.h generated file is put back into cvs.
|
||||
#
|
||||
# See rbbirpt.h for a description of the input format for this script.
|
||||
#
|
||||
#*********************************************************************
|
||||
# Copyright (C) 2002 International Business Machines Corporation *
|
||||
# and others. All rights reserved. *
|
||||
#*********************************************************************
|
||||
|
||||
|
||||
$num_states = 1; # Always the state number for the line being compiled.
|
||||
$line_num = 0; # The line number in the input file.
|
||||
@ -180,10 +195,14 @@ die if ($errors>0);
|
||||
print "//---------------------------------------------------------------------------------\n";
|
||||
print "//\n";
|
||||
print "// Generated Header File. Do not edit by hand.\n";
|
||||
print "// This file contains the state table for RBBI rule parser.\n";
|
||||
print "// This file contains the state table for the ICU Rule Based Break Iterator\n";
|
||||
print "// rule parser.\n";
|
||||
print "// It is generated by the Perl script \"rbbicst.pl\" from\n";
|
||||
print "// the rule parser state definitions file \"rbbirpt.txt\".\n";
|
||||
print "//\n";
|
||||
print "// Copyright (C) 2002 International Business Machines Corporation \n";
|
||||
print "// and others. All rights reserved. \n";
|
||||
print "//\n";
|
||||
print "//---------------------------------------------------------------------------------\n";
|
||||
print "#ifndef RBBIRPT_H\n";
|
||||
print "#define RBBIRPT_H\n";
|
||||
@ -257,7 +276,7 @@ for ($state=1; $state < $num_states; $state++) {
|
||||
print " , {$state_func_name[$state],";
|
||||
if ($state_literal_chars[$state] ne "") {
|
||||
$c = $state_literal_chars[$state];
|
||||
printf(" %d /*$c*/,", ord($c)); #TODO: use numeric value, so EBCDIC machines are ok.
|
||||
printf(" %d /* $c */,", ord($c)); # use numeric value, so EBCDIC machines are ok.
|
||||
}else {
|
||||
print " $charClasses{$state_char_class[$state]},";
|
||||
}
|
||||
|
@ -201,8 +201,8 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
||||
//
|
||||
// Generate the DFA state transition table.
|
||||
//
|
||||
builder.fForwardTables = new RBBITableBuilder(&builder, builder.fForwardTree);
|
||||
builder.fReverseTables = new RBBITableBuilder(&builder, builder.fReverseTree);
|
||||
builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
|
||||
builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
|
||||
builder.fForwardTables->build();
|
||||
builder.fReverseTables->build();
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -1,10 +1,14 @@
|
||||
//---------------------------------------------------------------------------------
|
||||
//
|
||||
// Generated Header File. Do not edit by hand.
|
||||
// This file contains the state table for RBBI rule parser.
|
||||
// This file contains the state table for the ICU Rule Based Break Iterator
|
||||
// rule parser.
|
||||
// It is generated by the Perl script "rbbicst.pl" from
|
||||
// the rule parser state definitions file "rbbirpt.txt".
|
||||
//
|
||||
// Copyright (C) 2002 International Business Machines Corporation
|
||||
// and others. All rights reserved.
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
#ifndef RBBIRPT_H
|
||||
#define RBBIRPT_H
|
||||
@ -71,87 +75,87 @@ struct RBBIRuleTableEl gRuleParseStateTable[] = {
|
||||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doExprStart, 254, 12, 8, FALSE} // 1 start
|
||||
, {doNOP, 130, 1,0, TRUE} // 2
|
||||
, {doExprStart, 36 /*$*/, 70, 80, FALSE} // 3
|
||||
, {doReverseDir, 33 /*!*/, 11,0, TRUE} // 4
|
||||
, {doNOP, 59 /*;*/, 1,0, TRUE} // 5
|
||||
, {doExprStart, 36 /* $ */, 70, 80, FALSE} // 3
|
||||
, {doReverseDir, 33 /* ! */, 11,0, TRUE} // 4
|
||||
, {doNOP, 59 /* ; */, 1,0, TRUE} // 5
|
||||
, {doNOP, 252, 0,0, FALSE} // 6
|
||||
, {doExprStart, 255, 12, 8, FALSE} // 7
|
||||
, {doEndOfRule, 59 /*;*/, 1,0, TRUE} // 8 break-rule-end
|
||||
, {doEndOfRule, 59 /* ; */, 1,0, TRUE} // 8 break-rule-end
|
||||
, {doNOP, 130, 8,0, TRUE} // 9
|
||||
, {doRuleError, 255, 85,0, FALSE} // 10
|
||||
, {doExprStart, 255, 12, 8, FALSE} // 11 reverse-rule
|
||||
, {doRuleChar, 254, 21,0, TRUE} // 12 term
|
||||
, {doNOP, 130, 12,0, TRUE} // 13
|
||||
, {doRuleChar, 129, 21,0, TRUE} // 14
|
||||
, {doNOP, 91 /*[*/, 76, 21, FALSE} // 15
|
||||
, {doLParen, 40 /*(*/, 12, 21, TRUE} // 16
|
||||
, {doNOP, 36 /*$*/, 70, 20, FALSE} // 17
|
||||
, {doDotAny, 46 /*.*/, 21,0, TRUE} // 18
|
||||
, {doNOP, 91 /* [ */, 76, 21, FALSE} // 15
|
||||
, {doLParen, 40 /* ( */, 12, 21, TRUE} // 16
|
||||
, {doNOP, 36 /* $ */, 70, 20, FALSE} // 17
|
||||
, {doDotAny, 46 /* . */, 21,0, TRUE} // 18
|
||||
, {doRuleError, 255, 85,0, FALSE} // 19
|
||||
, {doCheckVarDef, 255, 21,0, FALSE} // 20 term-var-ref
|
||||
, {doUnaryOpStar, 42 /***/, 25,0, TRUE} // 21 expr-mod
|
||||
, {doUnaryOpPlus, 43 /*+*/, 25,0, TRUE} // 22
|
||||
, {doUnaryOpQuestion, 63 /*?*/, 25,0, TRUE} // 23
|
||||
, {doUnaryOpStar, 42 /* * */, 25,0, TRUE} // 21 expr-mod
|
||||
, {doUnaryOpPlus, 43 /* + */, 25,0, TRUE} // 22
|
||||
, {doUnaryOpQuestion, 63 /* ? */, 25,0, TRUE} // 23
|
||||
, {doNOP, 255, 25,0, FALSE} // 24
|
||||
, {doExprCatOperator, 254, 12,0, FALSE} // 25 expr-cont
|
||||
, {doNOP, 130, 25,0, TRUE} // 26
|
||||
, {doExprCatOperator, 129, 12,0, FALSE} // 27
|
||||
, {doExprCatOperator, 91 /*[*/, 12,0, FALSE} // 28
|
||||
, {doExprCatOperator, 40 /*(*/, 12,0, FALSE} // 29
|
||||
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 30
|
||||
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 31
|
||||
, {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 32
|
||||
, {doExprCatOperator, 123 /*{*/, 49,0, TRUE} // 33
|
||||
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 34
|
||||
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 35
|
||||
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 28
|
||||
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 29
|
||||
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 30
|
||||
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 31
|
||||
, {doExprCatOperator, 47 /* / */, 37,0, FALSE} // 32
|
||||
, {doExprCatOperator, 123 /* { */, 49,0, TRUE} // 33
|
||||
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 34
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 35
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 36
|
||||
, {doSlash, 47 /*/*/, 39,0, TRUE} // 37 look-ahead
|
||||
, {doSlash, 47 /* / */, 39,0, TRUE} // 37 look-ahead
|
||||
, {doNOP, 255, 85,0, FALSE} // 38
|
||||
, {doExprCatOperator, 254, 12,0, FALSE} // 39 expr-cont-no-slash
|
||||
, {doNOP, 130, 25,0, TRUE} // 40
|
||||
, {doExprCatOperator, 129, 12,0, FALSE} // 41
|
||||
, {doExprCatOperator, 91 /*[*/, 12,0, FALSE} // 42
|
||||
, {doExprCatOperator, 40 /*(*/, 12,0, FALSE} // 43
|
||||
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 44
|
||||
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 45
|
||||
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 46
|
||||
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 47
|
||||
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 42
|
||||
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 43
|
||||
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 44
|
||||
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 45
|
||||
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 46
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 47
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 48
|
||||
, {doNOP, 130, 49,0, TRUE} // 49 tag-open
|
||||
, {doStartTagValue, 128, 52,0, FALSE} // 50
|
||||
, {doTagExpectedError, 255, 85,0, FALSE} // 51
|
||||
, {doNOP, 130, 56,0, TRUE} // 52 tag-value
|
||||
, {doNOP, 125 /*}*/, 56,0, FALSE} // 53
|
||||
, {doNOP, 125 /* } */, 56,0, FALSE} // 53
|
||||
, {doTagDigit, 128, 52,0, TRUE} // 54
|
||||
, {doTagExpectedError, 255, 85,0, FALSE} // 55
|
||||
, {doNOP, 130, 56,0, TRUE} // 56 tag-close
|
||||
, {doTagValue, 125 /*}*/, 59,0, TRUE} // 57
|
||||
, {doTagValue, 125 /* } */, 59,0, TRUE} // 57
|
||||
, {doTagExpectedError, 255, 85,0, FALSE} // 58
|
||||
, {doExprCatOperator, 254, 12,0, FALSE} // 59 expr-cont-no-tag
|
||||
, {doNOP, 130, 59,0, TRUE} // 60
|
||||
, {doExprCatOperator, 129, 12,0, FALSE} // 61
|
||||
, {doExprCatOperator, 91 /*[*/, 12,0, FALSE} // 62
|
||||
, {doExprCatOperator, 40 /*(*/, 12,0, FALSE} // 63
|
||||
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 64
|
||||
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 65
|
||||
, {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 66
|
||||
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 67
|
||||
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 68
|
||||
, {doExprCatOperator, 91 /* [ */, 12,0, FALSE} // 62
|
||||
, {doExprCatOperator, 40 /* ( */, 12,0, FALSE} // 63
|
||||
, {doExprCatOperator, 36 /* $ */, 12,0, FALSE} // 64
|
||||
, {doExprCatOperator, 46 /* . */, 12,0, FALSE} // 65
|
||||
, {doExprCatOperator, 47 /* / */, 37,0, FALSE} // 66
|
||||
, {doExprOrOperator, 124 /* | */, 12,0, TRUE} // 67
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 68
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 69
|
||||
, {doStartVariableName, 36 /*$*/, 72,0, TRUE} // 70 scan-var-name
|
||||
, {doStartVariableName, 36 /* $ */, 72,0, TRUE} // 70 scan-var-name
|
||||
, {doNOP, 255, 85,0, FALSE} // 71
|
||||
, {doNOP, 132, 74,0, TRUE} // 72 scan-var-start
|
||||
, {doVariableNameExpectedErr, 255, 85,0, FALSE} // 73
|
||||
, {doNOP, 131, 74,0, TRUE} // 74 scan-var-body
|
||||
, {doEndVariableName, 255, 255,0, FALSE} // 75
|
||||
, {doScanUnicodeSet, 91 /*[*/, 255,0, TRUE} // 76 scan-unicode-set
|
||||
, {doScanUnicodeSet, 112 /*p*/, 255,0, TRUE} // 77
|
||||
, {doScanUnicodeSet, 80 /*P*/, 255,0, TRUE} // 78
|
||||
, {doScanUnicodeSet, 91 /* [ */, 255,0, TRUE} // 76 scan-unicode-set
|
||||
, {doScanUnicodeSet, 112 /* p */, 255,0, TRUE} // 77
|
||||
, {doScanUnicodeSet, 80 /* P */, 255,0, TRUE} // 78
|
||||
, {doNOP, 255, 85,0, FALSE} // 79
|
||||
, {doNOP, 130, 80,0, TRUE} // 80 assign-or-rule
|
||||
, {doStartAssign, 61 /*=*/, 12, 83, TRUE} // 81
|
||||
, {doStartAssign, 61 /* = */, 12, 83, TRUE} // 81
|
||||
, {doNOP, 255, 20, 8, FALSE} // 82
|
||||
, {doEndAssign, 59 /*;*/, 1,0, TRUE} // 83 assign-end
|
||||
, {doEndAssign, 59 /* ; */, 1,0, TRUE} // 83 assign-end
|
||||
, {doRuleErrorAssignExpr, 255, 85,0, FALSE} // 84
|
||||
, {doExit, 255, 85,0, TRUE} // 85 errorDeath
|
||||
};
|
||||
|
@ -1,18 +1,20 @@
|
||||
//
|
||||
// rbbisetb.cpp
|
||||
//
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
* Copyright (C) 2002 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
**********************************************************************
|
||||
*/
|
||||
//
|
||||
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
|
||||
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
|
||||
// (part of the rule building process.)
|
||||
//
|
||||
// Starting with the rules parse tree from the scanner,
|
||||
//
|
||||
// - Enumerate the set of UnicodeSets that are referenced
|
||||
// by the RBBI rules.
|
||||
// by the RBBI rules.
|
||||
// - compute a set of non-overlapping character ranges
|
||||
// with all characters within a range belonging to the same
|
||||
// set of input Unicode sets.
|
||||
@ -62,10 +64,10 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
|
||||
// Destructor
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
RBBISetBuilder::~RBBISetBuilder()
|
||||
RBBISetBuilder::~RBBISetBuilder()
|
||||
{
|
||||
RangeDescriptor *nextRangeDesc;
|
||||
|
||||
|
||||
// Walk through & delete the linked list of RangeDescriptors
|
||||
for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
|
||||
RangeDescriptor *r = nextRangeDesc;
|
||||
@ -227,7 +229,7 @@ void RBBISetBuilder::build() {
|
||||
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "esets")) {printSets();}
|
||||
|
||||
|
||||
//
|
||||
// Build the Trie table for mapping UChar32 values to the corresponding
|
||||
// range group number
|
||||
@ -278,7 +280,7 @@ utrie_serialize(fTrie,
|
||||
TRUE, // Reduce to 16 bits
|
||||
fStatus);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// addValToSets Add a runtime-mapped input value to each uset from a
|
||||
@ -291,7 +293,7 @@ utrie_serialize(fTrie,
|
||||
//
|
||||
// The "logically equivalent expression" is the tree for an
|
||||
// or-ing together of all of the symbols that go into the set.
|
||||
//
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
|
||||
int32_t ix;
|
||||
@ -354,7 +356,7 @@ void RBBISetBuilder::printRanges() {
|
||||
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
|
||||
setName = varRef->fText;
|
||||
}
|
||||
}
|
||||
}
|
||||
RBBINode::printUnicodeString(setName); printf(" ");
|
||||
}
|
||||
printf("\n");
|
||||
@ -373,7 +375,7 @@ void RBBISetBuilder::printRangeGroups() {
|
||||
RangeDescriptor *tRange;
|
||||
int i;
|
||||
int lastPrintedGroupNum = 0;
|
||||
|
||||
|
||||
printf("\nRanges grouped by Unicode Set Membership...\n");
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
int groupNum = rlRange->fNum & 0xbfff;
|
||||
@ -382,7 +384,7 @@ void RBBISetBuilder::printRangeGroups() {
|
||||
printf("%2i ", groupNum);
|
||||
|
||||
if (rlRange->fNum & 0x4000) { printf(" <DICT> ");};
|
||||
|
||||
|
||||
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
|
||||
UnicodeString setName = "anon";
|
||||
@ -392,8 +394,8 @@ void RBBISetBuilder::printRangeGroups() {
|
||||
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
|
||||
setName = varRef->fText;
|
||||
}
|
||||
}
|
||||
RBBINode::printUnicodeString(setName); printf(" ");
|
||||
}
|
||||
RBBINode::printUnicodeString(setName); printf(" ");
|
||||
}
|
||||
|
||||
i = 0;
|
||||
@ -410,7 +412,7 @@ void RBBISetBuilder::printRangeGroups() {
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
@ -440,7 +442,7 @@ void RBBISetBuilder::printSets() {
|
||||
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
|
||||
setName = varRef->fText;
|
||||
}
|
||||
}
|
||||
}
|
||||
RBBINode::printUnicodeString(setName);
|
||||
printf(" ");
|
||||
RBBINode::printUnicodeString(usetNode->fText);
|
||||
|
@ -1,12 +1,12 @@
|
||||
//
|
||||
// file: rbbistbl.cpp Implementation of the ICU RBBISymbolTable class
|
||||
//
|
||||
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2001, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Copyright (c) 2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
@ -20,8 +20,8 @@
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode) :
|
||||
fTree(rootNode) {
|
||||
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
|
||||
fTree(*rootNode) {
|
||||
fRB = rb;
|
||||
fStatus = fRB->fStatus;
|
||||
fDStates = new UVector(*fStatus);
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Copyright (c) 2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
@ -21,6 +21,7 @@
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class RBBIRuleScanner;
|
||||
class RBBIRuleBuilder;
|
||||
|
||||
//
|
||||
// class RBBITableBuilder is part of the RBBI rule compiler.
|
||||
@ -33,9 +34,7 @@ class RBBIRuleScanner;
|
||||
|
||||
class RBBITableBuilder : public UObject {
|
||||
public:
|
||||
// TODO: add a root node param to the constructor. We're going to have two
|
||||
// builders, one for the forward table, and one for the reverse table.
|
||||
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode);
|
||||
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
|
||||
~RBBITableBuilder();
|
||||
|
||||
void build();
|
||||
@ -46,7 +45,7 @@ public:
|
||||
// the specified location.
|
||||
|
||||
// TODO: add getter function(s) for the built table.
|
||||
|
||||
|
||||
private:
|
||||
void calcNullable(RBBINode *n);
|
||||
void calcFirstPos(RBBINode *n);
|
||||
@ -71,7 +70,7 @@ private:
|
||||
private:
|
||||
RBBIRuleBuilder *fRB;
|
||||
RBBINode *&fTree; // The root node of the parse tree to build a
|
||||
// table for.
|
||||
// table for.
|
||||
UErrorCode *fStatus;
|
||||
|
||||
UVector *fDStates; // D states (Aho's terminology)
|
||||
|
@ -1,5 +1,9 @@
|
||||
/*
|
||||
* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
|
||||
***************************************************************************
|
||||
* Copyright (C) 1999-2002 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
***************************************************************************
|
||||
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 10/22/99 alan Creation.
|
||||
@ -28,26 +32,18 @@ class BreakIterator;
|
||||
/**
|
||||
* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
|
||||
*
|
||||
* <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
|
||||
* <p>There are two kinds of rules, which are separated by semicolons: <i>variable definitions</i>
|
||||
* and <i>regular expressions.</i></p>
|
||||
*
|
||||
* <p>A substitution rule defines a name that can be used in place of an expression. It
|
||||
* consists of a name, which is a string of characters contained in angle brackets, an equals
|
||||
* sign, and an expression. (There can be no whitespace on either side of the equals sign.)
|
||||
* To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
|
||||
* square brackets. A substitution is visible after its definition, and is filled in using
|
||||
* simple textual substitution. Substitution definitions can contain other substitutions, as
|
||||
* long as those substitutions have been defined first. Substitutions are generally used to
|
||||
* make the regular expressions (which can get quite complex) shorted and easier to read.
|
||||
* <p>A variable definition defines a variable name that can be used in subsequent expressions.
|
||||
* It consists of a name preceded by a dollar sign, an equals
|
||||
* sign, and an expression.
|
||||
* A $variable is visible after its definition.
|
||||
* Variable definitions can contain other variables, as
|
||||
* long as those variables have been defined first. Variables are generally used to
|
||||
* make the regular expressions (which can get quite complex) shorter and easier to read.
|
||||
* They typically define either character categories or commonly-used subexpressions.</p>
|
||||
*
|
||||
* <p>There is one special substitution. If the description defines a substitution
|
||||
* called "<ignore>", the expression must be a [] expression, and the
|
||||
* expression defines a set of characters (the "<em>ignore characters</em>") that
|
||||
* will be transparent to the BreakIterator. A sequence of characters will break the
|
||||
* same way it would if any ignore characters it contains are taken out. Break
|
||||
* positions never occur befoer ignore characters.</p>
|
||||
*
|
||||
* <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
|
||||
* defines a sequence of characters to be kept together. With one significant exception, the
|
||||
* iterator uses a longest-possible-match algorithm when matching text to regular
|
||||
@ -64,10 +60,6 @@ class BreakIterator;
|
||||
* of times (including not at all).</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">{}</td>
|
||||
* <td width="94%">Encloses a sequence of characters that is optional.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">()</td>
|
||||
* <td width="94%">Encloses a sequence of characters. If followed by *, the sequence
|
||||
* repeats. Otherwise, the parentheses are just a grouping device and a way to delimit
|
||||
@ -76,29 +68,17 @@ class BreakIterator;
|
||||
* <tr>
|
||||
* <td width="6%">|</td>
|
||||
* <td width="94%">Separates two alternative sequences of characters. Either one
|
||||
* sequence or the other, but not both, matches this expression. The | character can
|
||||
* only occur inside ().</td>
|
||||
* sequence or the other, but not both, matches this expression.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">.</td>
|
||||
* <td width="94%">Matches any character.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">*?</td>
|
||||
* <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except
|
||||
* when there is overlap between the last group of characters in the expression preceding the
|
||||
* * and the first group of characters following the *. When there is this kind of
|
||||
* overlap, * will match the longest sequence of characters that match the expression before
|
||||
* the *, and *? will match the shortest sequence of characters matching the expression
|
||||
* before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,
|
||||
* "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy"),
|
||||
* but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">[]</td>
|
||||
* <td width="94%">Specifies a group of alternative characters. A [] expression will
|
||||
* <td width="94%">Specifies a set of characters. A [] expression will
|
||||
* match any single character that is specified in the [] expression. For more on the
|
||||
* syntax of [] expressions, see below.</td>
|
||||
* syntax of [] expressions, see the ICU User Guide description of UnicodeSet.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">/</td>
|
||||
@ -111,24 +91,16 @@ class BreakIterator;
|
||||
* <tr>
|
||||
* <td width="6%">\</td>
|
||||
* <td width="94%">Escape character. The \ itself is ignored, but causes the next
|
||||
* character to be treated as literal character. This has no effect for many
|
||||
* characters, but for the characters listed above, this deprives them of their special
|
||||
* meaning. (There are no special escape sequences for Unicode characters, or tabs and
|
||||
* newlines; these are all handled by a higher-level protocol. In a Java string,
|
||||
* "\n" will be converted to a literal newline character by the time the
|
||||
* regular-expression parser sees it. Of course, this means that \ sequences that are
|
||||
* visible to the regexp parser must be written as \\ when inside a Java string.) All
|
||||
* characters in the ASCII range except for letters, digits, and control characters are
|
||||
* reserved characters to the parser and must be preceded by \ even if they currently don't
|
||||
* mean anything.</td>
|
||||
* character to be treated as a literal character. Except for letters and numbers,
|
||||
* characters in the ASCII range must be escaped to be considered as literals.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">!</td>
|
||||
* <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
|
||||
* parser that this expression specifies the backwards-iteration behavior of the iterator,
|
||||
* and not its normal iteration behavior. This is generally only used in situations
|
||||
* where the automatically-generated backwards-iteration behavior doesn't produce
|
||||
* satisfactory results and must be supplemented with extra client-specified rules.</td>
|
||||
* and not its normal iteration behavior. The backwards rules must move the
|
||||
* iterator to a safe position at or before the previous break position; forwards rules
|
||||
* will then be used to find the exact previous position.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%"><em>(all others)</em></td>
|
||||
@ -137,52 +109,6 @@ class BreakIterator;
|
||||
* </tr>
|
||||
* </table>
|
||||
* </blockquote>
|
||||
*
|
||||
* <p>Within a [] expression, a number of other special characters can be used to specify
|
||||
* groups of characters:</p>
|
||||
*
|
||||
* <blockquote>
|
||||
* <table border="1" width="100%">
|
||||
* <tr>
|
||||
* <td width="6%">-</td>
|
||||
* <td width="94%">Specifies a range of matching characters. For example
|
||||
* "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The -
|
||||
* sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
|
||||
* language's alphabetical order: "[a-z]" doesn't include capital letters, nor does
|
||||
* it include accented letters such as a-umlaut.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">::</td>
|
||||
* <td width="94%">A pair of colons containing a one- or two-letter code matches all
|
||||
* characters in the corresponding Unicode category. The two-letter codes are the same
|
||||
* as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"
|
||||
* matches all currency symbols and all math symbols). Specifying a one-letter code is
|
||||
* the same as specifying all two-letter codes that begin with that letter (for example,
|
||||
* "[:L:]" matches all letters, and is equivalent to
|
||||
* "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode
|
||||
* category code or a single letter that begins a Unicode category code is illegal within
|
||||
* colons.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">[]</td>
|
||||
* <td width="94%">[] expressions can nest. This has no effect, except when used in
|
||||
* conjunction with the ^ token.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%">^</td>
|
||||
* <td width="94%">Excludes the character (or the characters in the [] expression) following
|
||||
* it from the group of characters. For example, "[a-z^p]" matches all Latin
|
||||
* lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters
|
||||
* except the Han ideographs.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td width="6%"><em>(all others)</em></td>
|
||||
* <td width="94%">All other characters are treated as literal characters. (For
|
||||
* example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
* </blockquote>
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
@ -201,7 +127,9 @@ protected:
|
||||
//
|
||||
RBBIDataWrapper *fData;
|
||||
UTrie *fCharMappings;
|
||||
int32_t fLastBreakTag; // Rule {tag} value for the most recent match.
|
||||
|
||||
// Rule {tag} value for the most recent match.
|
||||
int32_t fLastBreakTag;
|
||||
|
||||
//
|
||||
// Counter for the number of characters encountered with the "dictionary"
|
||||
@ -215,7 +143,7 @@ protected:
|
||||
// Debugging flag.
|
||||
//
|
||||
static UBool fTrace;
|
||||
|
||||
|
||||
|
||||
|
||||
private:
|
||||
@ -228,7 +156,7 @@ protected:
|
||||
//=======================================================================
|
||||
// constructors
|
||||
//=======================================================================
|
||||
|
||||
|
||||
// This constructor uses the udata interface to create a BreakIterator whose
|
||||
// internal tables live in a memory-mapped file. "image" is a pointer to the
|
||||
// beginning of that file.
|
||||
@ -248,7 +176,7 @@ protected:
|
||||
friend class BreakIterator;
|
||||
|
||||
|
||||
|
||||
|
||||
public:
|
||||
|
||||
/** Default constructor. Creates an empty shell of an iterator, with no
|
||||
@ -500,7 +428,7 @@ protected:
|
||||
* Return true if the category lookup for this char
|
||||
* indicates that it is in the set of dictionary lookup chars.
|
||||
* This function is intended for use by dictionary based break iterators.
|
||||
*/
|
||||
*/
|
||||
virtual UBool isDictionaryChar(UChar32);
|
||||
|
||||
/**
|
||||
@ -513,7 +441,7 @@ protected:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------
|
||||
//
|
||||
// Inline Functions Definitions ...
|
||||
|
Loading…
Reference in New Issue
Block a user