ICU-45 RBBI copyright notices, AIX portability, comments

X-SVN-Rev: 8963
2002-06-27 21:14:47 +00:00 · 2002-06-27 21:14:47 +00:00 · e56b99a590
commit e56b99a590
parent 59029844b7
9 changed files with 171 additions and 213 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -1,3 +1,8 @@
+//
+//  file:  rbbi.cpp  Contains the implementation of the rule based break iterator
+//                   runtime engine and the API implementation for
+//                   class RuleBasedBreakIterator
+//
 /*
 **********************************************************************
 *   Copyright (C) 1999-2002 International Business Machines Corporation   *
@ -5,6 +10,7 @@
 **********************************************************************
 */

+
 #include "unicode/rbbi.h"
 #include "unicode/schriter.h"
 #include "unicode/udata.h"
--- a/icu4c/source/common/rbbicst.pl
+++ b/icu4c/source/common/rbbicst.pl
@ -1,6 +1,21 @@
 #
 #  rbbicst   Compile the RBBI rule paser state table data into initialized C data.
+#            Usage:
+#                   cd icu/source/common
+#                   perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
 #
+#             The output file, rbbirpt.h, is included by some of the .cpp rbbi
+#             implementation files.   This perl script is NOT run as part
+#             of a normal ICU build.  It is run by hand when needed, and the
+#             rbbirpt.h generated file is put back into cvs.
+#
+#             See rbbirpt.txt for a description of the input format for this script.
+#
+#*********************************************************************
+#   Copyright (C) 2002 International Business Machines Corporation   *
+#   and others. All rights reserved.                                 *
+#*********************************************************************
+

 $num_states = 1;     # Always the state number for the line being compiled.
 $line_num  = 0;      # The line number in the input file.
@ -180,10 +195,14 @@ die if ($errors>0);
 print "//---------------------------------------------------------------------------------\n";
 print "//\n";
 print "// Generated Header File.  Do not edit by hand.\n";
-print "//    This file contains the state table for RBBI rule parser.\n";
+print "//    This file contains the state table for the ICU Rule Based Break Iterator\n";
+print "//    rule parser.\n";
 print "//    It is generated by the Perl script \"rbbicst.pl\" from\n";
 print "//    the rule parser state definitions file \"rbbirpt.txt\".\n";
 print "//\n";
+print "//   Copyright (C) 2002 International Business Machines Corporation \n";
+print "//   and others. All rights reserved.  \n";
+print "//\n";
 print "//---------------------------------------------------------------------------------\n";
 print "#ifndef RBBIRPT_H\n";
 print "#define RBBIRPT_H\n";
@ -257,7 +276,7 @@ for ($state=1; $state < $num_states; $state++) {
    print "    , {$state_func_name[$state],";
    if ($state_literal_chars[$state] ne "") {
        $c = $state_literal_chars[$state];
-        printf(" %d /*$c*/,", ord($c));   #TODO:  use numeric value, so EBCDIC machines are ok.
+        printf(" %d /* $c */,", ord($c));   #  use numeric value, so EBCDIC machines are ok.
    }else {
        print " $charClasses{$state_char_class[$state]},";
    }
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@ -201,8 +201,8 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
    //
    //   Generate the DFA state transition table.
    //
-    builder.fForwardTables = new RBBITableBuilder(&builder, builder.fForwardTree);
-    builder.fReverseTables = new RBBITableBuilder(&builder, builder.fReverseTree);
+    builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
+    builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
    builder.fForwardTables->build();
    builder.fReverseTables->build();
    if (U_FAILURE(status)) {
--- a/icu4c/source/common/rbbirpt.h
+++ b/icu4c/source/common/rbbirpt.h
@ -1,10 +1,14 @@
 //---------------------------------------------------------------------------------
 //
 // Generated Header File.  Do not edit by hand.
-//    This file contains the state table for RBBI rule parser.
+//    This file contains the state table for the ICU Rule Based Break Iterator
+//    rule parser.
 //    It is generated by the Perl script "rbbicst.pl" from
 //    the rule parser state definitions file "rbbirpt.txt".
 //
+//   Copyright (C) 2002 International Business Machines Corporation 
+//   and others. All rights reserved.  
+//
 //---------------------------------------------------------------------------------
 #ifndef RBBIRPT_H
 #define RBBIRPT_H
@ -71,87 +75,87 @@ struct RBBIRuleTableEl gRuleParseStateTable[] = {
    {doNOP, 0, 0, 0, TRUE}
    , {doExprStart, 254, 12, 8, FALSE}     //  1      start
    , {doNOP, 130, 1,0,  TRUE}     //  2 
-    , {doExprStart, 36 /*$*/, 70, 80, FALSE}     //  3 
-    , {doReverseDir, 33 /*!*/, 11,0,  TRUE}     //  4 
-    , {doNOP, 59 /*;*/, 1,0,  TRUE}     //  5 
+    , {doExprStart, 36 /* $ */, 70, 80, FALSE}     //  3 
+    , {doReverseDir, 33 /* ! */, 11,0,  TRUE}     //  4 
+    , {doNOP, 59 /* ; */, 1,0,  TRUE}     //  5 
    , {doNOP, 252, 0,0,  FALSE}     //  6 
    , {doExprStart, 255, 12, 8, FALSE}     //  7 
-    , {doEndOfRule, 59 /*;*/, 1,0,  TRUE}     //  8      break-rule-end
+    , {doEndOfRule, 59 /* ; */, 1,0,  TRUE}     //  8      break-rule-end
    , {doNOP, 130, 8,0,  TRUE}     //  9 
    , {doRuleError, 255, 85,0,  FALSE}     //  10 
    , {doExprStart, 255, 12, 8, FALSE}     //  11      reverse-rule
    , {doRuleChar, 254, 21,0,  TRUE}     //  12      term
    , {doNOP, 130, 12,0,  TRUE}     //  13 
    , {doRuleChar, 129, 21,0,  TRUE}     //  14 
-    , {doNOP, 91 /*[*/, 76, 21, FALSE}     //  15 
-    , {doLParen, 40 /*(*/, 12, 21, TRUE}     //  16 
-    , {doNOP, 36 /*$*/, 70, 20, FALSE}     //  17 
-    , {doDotAny, 46 /*.*/, 21,0,  TRUE}     //  18 
+    , {doNOP, 91 /* [ */, 76, 21, FALSE}     //  15 
+    , {doLParen, 40 /* ( */, 12, 21, TRUE}     //  16 
+    , {doNOP, 36 /* $ */, 70, 20, FALSE}     //  17 
+    , {doDotAny, 46 /* . */, 21,0,  TRUE}     //  18 
    , {doRuleError, 255, 85,0,  FALSE}     //  19 
    , {doCheckVarDef, 255, 21,0,  FALSE}     //  20      term-var-ref
-    , {doUnaryOpStar, 42 /***/, 25,0,  TRUE}     //  21      expr-mod
-    , {doUnaryOpPlus, 43 /*+*/, 25,0,  TRUE}     //  22 
-    , {doUnaryOpQuestion, 63 /*?*/, 25,0,  TRUE}     //  23 
+    , {doUnaryOpStar, 42 /* * */, 25,0,  TRUE}     //  21      expr-mod
+    , {doUnaryOpPlus, 43 /* + */, 25,0,  TRUE}     //  22 
+    , {doUnaryOpQuestion, 63 /* ? */, 25,0,  TRUE}     //  23 
    , {doNOP, 255, 25,0,  FALSE}     //  24 
    , {doExprCatOperator, 254, 12,0,  FALSE}     //  25      expr-cont
    , {doNOP, 130, 25,0,  TRUE}     //  26 
    , {doExprCatOperator, 129, 12,0,  FALSE}     //  27 
-    , {doExprCatOperator, 91 /*[*/, 12,0,  FALSE}     //  28 
-    , {doExprCatOperator, 40 /*(*/, 12,0,  FALSE}     //  29 
-    , {doExprCatOperator, 36 /*$*/, 12,0,  FALSE}     //  30 
-    , {doExprCatOperator, 46 /*.*/, 12,0,  FALSE}     //  31 
-    , {doExprCatOperator, 47 /*/*/, 37,0,  FALSE}     //  32 
-    , {doExprCatOperator, 123 /*{*/, 49,0,  TRUE}     //  33 
-    , {doExprOrOperator, 124 /*|*/, 12,0,  TRUE}     //  34 
-    , {doExprRParen, 41 /*)*/, 255,0,  TRUE}     //  35 
+    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  28 
+    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  29 
+    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  30 
+    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  31 
+    , {doExprCatOperator, 47 /* / */, 37,0,  FALSE}     //  32 
+    , {doExprCatOperator, 123 /* { */, 49,0,  TRUE}     //  33 
+    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  34 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  35 
    , {doExprFinished, 255, 255,0,  FALSE}     //  36 
-    , {doSlash, 47 /*/*/, 39,0,  TRUE}     //  37      look-ahead
+    , {doSlash, 47 /* / */, 39,0,  TRUE}     //  37      look-ahead
    , {doNOP, 255, 85,0,  FALSE}     //  38 
    , {doExprCatOperator, 254, 12,0,  FALSE}     //  39      expr-cont-no-slash
    , {doNOP, 130, 25,0,  TRUE}     //  40 
    , {doExprCatOperator, 129, 12,0,  FALSE}     //  41 
-    , {doExprCatOperator, 91 /*[*/, 12,0,  FALSE}     //  42 
-    , {doExprCatOperator, 40 /*(*/, 12,0,  FALSE}     //  43 
-    , {doExprCatOperator, 36 /*$*/, 12,0,  FALSE}     //  44 
-    , {doExprCatOperator, 46 /*.*/, 12,0,  FALSE}     //  45 
-    , {doExprOrOperator, 124 /*|*/, 12,0,  TRUE}     //  46 
-    , {doExprRParen, 41 /*)*/, 255,0,  TRUE}     //  47 
+    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  42 
+    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  43 
+    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  44 
+    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  45 
+    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  46 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  47 
    , {doExprFinished, 255, 255,0,  FALSE}     //  48 
    , {doNOP, 130, 49,0,  TRUE}     //  49      tag-open
    , {doStartTagValue, 128, 52,0,  FALSE}     //  50 
    , {doTagExpectedError, 255, 85,0,  FALSE}     //  51 
    , {doNOP, 130, 56,0,  TRUE}     //  52      tag-value
-    , {doNOP, 125 /*}*/, 56,0,  FALSE}     //  53 
+    , {doNOP, 125 /* } */, 56,0,  FALSE}     //  53 
    , {doTagDigit, 128, 52,0,  TRUE}     //  54 
    , {doTagExpectedError, 255, 85,0,  FALSE}     //  55 
    , {doNOP, 130, 56,0,  TRUE}     //  56      tag-close
-    , {doTagValue, 125 /*}*/, 59,0,  TRUE}     //  57 
+    , {doTagValue, 125 /* } */, 59,0,  TRUE}     //  57 
    , {doTagExpectedError, 255, 85,0,  FALSE}     //  58 
    , {doExprCatOperator, 254, 12,0,  FALSE}     //  59      expr-cont-no-tag
    , {doNOP, 130, 59,0,  TRUE}     //  60 
    , {doExprCatOperator, 129, 12,0,  FALSE}     //  61 
-    , {doExprCatOperator, 91 /*[*/, 12,0,  FALSE}     //  62 
-    , {doExprCatOperator, 40 /*(*/, 12,0,  FALSE}     //  63 
-    , {doExprCatOperator, 36 /*$*/, 12,0,  FALSE}     //  64 
-    , {doExprCatOperator, 46 /*.*/, 12,0,  FALSE}     //  65 
-    , {doExprCatOperator, 47 /*/*/, 37,0,  FALSE}     //  66 
-    , {doExprOrOperator, 124 /*|*/, 12,0,  TRUE}     //  67 
-    , {doExprRParen, 41 /*)*/, 255,0,  TRUE}     //  68 
+    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  62 
+    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  63 
+    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  64 
+    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  65 
+    , {doExprCatOperator, 47 /* / */, 37,0,  FALSE}     //  66 
+    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  67 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  68 
    , {doExprFinished, 255, 255,0,  FALSE}     //  69 
-    , {doStartVariableName, 36 /*$*/, 72,0,  TRUE}     //  70      scan-var-name
+    , {doStartVariableName, 36 /* $ */, 72,0,  TRUE}     //  70      scan-var-name
    , {doNOP, 255, 85,0,  FALSE}     //  71 
    , {doNOP, 132, 74,0,  TRUE}     //  72      scan-var-start
    , {doVariableNameExpectedErr, 255, 85,0,  FALSE}     //  73 
    , {doNOP, 131, 74,0,  TRUE}     //  74      scan-var-body
    , {doEndVariableName, 255, 255,0,  FALSE}     //  75 
-    , {doScanUnicodeSet, 91 /*[*/, 255,0,  TRUE}     //  76      scan-unicode-set
-    , {doScanUnicodeSet, 112 /*p*/, 255,0,  TRUE}     //  77 
-    , {doScanUnicodeSet, 80 /*P*/, 255,0,  TRUE}     //  78 
+    , {doScanUnicodeSet, 91 /* [ */, 255,0,  TRUE}     //  76      scan-unicode-set
+    , {doScanUnicodeSet, 112 /* p */, 255,0,  TRUE}     //  77 
+    , {doScanUnicodeSet, 80 /* P */, 255,0,  TRUE}     //  78 
    , {doNOP, 255, 85,0,  FALSE}     //  79 
    , {doNOP, 130, 80,0,  TRUE}     //  80      assign-or-rule
-    , {doStartAssign, 61 /*=*/, 12, 83, TRUE}     //  81 
+    , {doStartAssign, 61 /* = */, 12, 83, TRUE}     //  81 
    , {doNOP, 255, 20, 8, FALSE}     //  82 
-    , {doEndAssign, 59 /*;*/, 1,0,  TRUE}     //  83      assign-end
+    , {doEndAssign, 59 /* ; */, 1,0,  TRUE}     //  83      assign-end
    , {doRuleErrorAssignExpr, 255, 85,0,  FALSE}     //  84 
    , {doExit, 255, 85,0,  TRUE}     //  85      errorDeath
 };
--- a/icu4c/source/common/rbbisetb.cpp
+++ b/icu4c/source/common/rbbisetb.cpp
@ -1,13 +1,15 @@
 //
 //  rbbisetb.cpp
+//
 /*
 **********************************************************************
-*   Copyright (c) 2001, International Business Machines
-*   Corporation and others.  All Rights Reserved.
+*   Copyright (C) 2002 International Business Machines Corporation   *
+*   and others. All rights reserved.                                 *
 **********************************************************************
 */
 //
-//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
+//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
+//                   (part of the rule building process.)
 //
 //      Starting with the rules parse tree from the scanner,
 //
--- a/icu4c/source/common/rbbistbl.cpp
+++ b/icu4c/source/common/rbbistbl.cpp
@ -1,12 +1,12 @@
 //
 //  file:  rbbistbl.cpp    Implementation of the ICU RBBISymbolTable class
 //
-
-/********************************************************************
- * COPYRIGHT:
- * Copyright (c) 1997-2001, International Business Machines Corporation and
- * others. All Rights Reserved.
- ********************************************************************/
+/*
+**********************************************************************
+*   Copyright (C) 2002 International Business Machines Corporation   *
+*   and others. All rights reserved.                                 *
+**********************************************************************
+*/

 #include "unicode/unistr.h"
 #include "unicode/uniset.h"
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@ -4,7 +4,7 @@

 /*
 **********************************************************************
-*   Copyright (c) 2001, International Business Machines
+*   Copyright (c) 2002, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@ -20,8 +20,8 @@
 #include <assert.h>


-RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode) :
- fTree(rootNode) {
+RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
+ fTree(*rootNode) {
    fRB             = rb;
    fStatus         = fRB->fStatus;
    fDStates        = new UVector(*fStatus);
--- a/icu4c/source/common/rbbitblb.h
+++ b/icu4c/source/common/rbbitblb.h
@ -4,7 +4,7 @@

 /*
 **********************************************************************
-*   Copyright (c) 2001, International Business Machines
+*   Copyright (c) 2002, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@ -21,6 +21,7 @@
 U_NAMESPACE_BEGIN

 class RBBIRuleScanner;
+class RBBIRuleBuilder;

 //
 //  class RBBITableBuilder is part of the RBBI rule compiler.
@ -33,9 +34,7 @@ class RBBIRuleScanner;

 class RBBITableBuilder : public UObject {
 public:
-    // TODO:  add a root node param to the constructor.  We're going to have two
-    //        builders, one for the forward table, and one for the reverse table.
-    RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode);
+    RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
    ~RBBITableBuilder();

    void     build();
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@ -1,5 +1,9 @@
 /*
-* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
+***************************************************************************
+*   Copyright (C) 1999-2002 International Business Machines Corporation   *
+*   and others. All rights reserved.                                      *
+***************************************************************************
+
 **********************************************************************
 *   Date        Name        Description
 *   10/22/99    alan        Creation.
@ -28,26 +32,18 @@ class BreakIterator;
 /**
 * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
 *
- * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
+ * <p>There are two kinds of rules, which are separated by semicolons: <i>variable definitions</i>
 * and <i>regular expressions.</i></p>
 *
- * <p>A substitution rule defines a name that can be used in place of an expression. It
- * consists of a name, which is a string of characters contained in angle brackets, an equals
- * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
- * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
- * square brackets. A substitution is visible after its definition, and is filled in using
- * simple textual substitution. Substitution definitions can contain other substitutions, as
- * long as those substitutions have been defined first. Substitutions are generally used to
- * make the regular expressions (which can get quite complex) shorted and easier to read.
+ * <p>A variable definition defines a variable name that can be used in subsequent expressions.
+ * It consists of a name preceded by a dollar sign, an equals
+ * sign, and an expression.
+ * A $variable is visible after its definition.
+ * Variable definitions can contain other variables, as
+ * long as those variables have been defined first. Variables are generally used to
+ * make the regular expressions (which can get quite complex) shorter and easier to read.
 * They typically define either character categories or commonly-used subexpressions.</p>
 *
- * <p>There is one special substitution.&nbsp; If the description defines a substitution
- * called &quot;&lt;ignore&gt;&quot;, the expression must be a [] expression, and the
- * expression defines a set of characters (the &quot;<em>ignore characters</em>&quot;) that
- * will be transparent to the BreakIterator.&nbsp; A sequence of characters will break the
- * same way it would if any ignore characters it contains are taken out.&nbsp; Break
- * positions never occur befoer ignore characters.</p>
- *
 * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
 * defines a sequence of characters to be kept together. With one significant exception, the
 * iterator uses a longest-possible-match algorithm when matching text to regular
@ -64,10 +60,6 @@ class BreakIterator;
 *       of times (including not at all).</td>
 *     </tr>
 *     <tr>
- *       <td width="6%">{}</td>
- *       <td width="94%">Encloses a sequence of characters that is optional.</td>
- *     </tr>
- *     <tr>
 *       <td width="6%">()</td>
 *       <td width="94%">Encloses a sequence of characters.&nbsp; If followed by *, the sequence
 *       repeats.&nbsp; Otherwise, the parentheses are just a grouping device and a way to delimit
@ -76,29 +68,17 @@ class BreakIterator;
 *     <tr>
 *       <td width="6%">|</td>
 *       <td width="94%">Separates two alternative sequences of characters.&nbsp; Either one
- *       sequence or the other, but not both, matches this expression.&nbsp; The | character can
- *       only occur inside ().</td>
+ *       sequence or the other, but not both, matches this expression.</td>
 *     </tr>
 *     <tr>
 *       <td width="6%">.</td>
 *       <td width="94%">Matches any character.</td>
 *     </tr>
 *     <tr>
- *       <td width="6%">*?</td>
- *       <td width="94%">Specifies a non-greedy asterisk.&nbsp; *? works the same way as *, except
- *       when there is overlap between the last group of characters in the expression preceding the
- *       * and the first group of characters following the *.&nbsp; When there is this kind of
- *       overlap, * will match the longest sequence of characters that match the expression before
- *       the *, and *? will match the shortest sequence of characters matching the expression
- *       before the *?.&nbsp; For example, if you have &quot;xxyxyyyxyxyxxyxyxyy&quot; in the text,
- *       &quot;x[xy]*x&quot; will match through to the last x (i.e., &quot;<strong>xxyxyyyxyxyxxyxyx</strong>yy&quot;,
- *       but &quot;x[xy]*?x&quot; will only match the first two xes (&quot;<strong>xx</strong>yxyyyxyxyxxyxyxyy&quot;).</td>
- *     </tr>
- *     <tr>
 *       <td width="6%">[]</td>
- *       <td width="94%">Specifies a group of alternative characters.&nbsp; A [] expression will
+ *       <td width="94%">Specifies a set of characters.&nbsp; A [] expression will
 *       match any single character that is specified in the [] expression.&nbsp; For more on the
- *       syntax of [] expressions, see below.</td>
+ *       syntax of [] expressions, see the ICU User Guide description of UnicodeSet.</td>
 *     </tr>
 *     <tr>
 *       <td width="6%">/</td>
@ -111,24 +91,16 @@ class BreakIterator;
 *     <tr>
 *       <td width="6%">\</td>
 *       <td width="94%">Escape character.&nbsp; The \ itself is ignored, but causes the next
- *       character to be treated as literal character.&nbsp; This has no effect for many
- *       characters, but for the characters listed above, this deprives them of their special
- *       meaning.&nbsp; (There are no special escape sequences for Unicode characters, or tabs and
- *       newlines; these are all handled by a higher-level protocol.&nbsp; In a Java string,
- *       &quot;\n&quot; will be converted to a literal newline character by the time the
- *       regular-expression parser sees it.&nbsp; Of course, this means that \ sequences that are
- *       visible to the regexp parser must be written as \\ when inside a Java string.)&nbsp; All
- *       characters in the ASCII range except for letters, digits, and control characters are
- *       reserved characters to the parser and must be preceded by \ even if they currently don't
- *       mean anything.</td>
+ *       character to be treated as literal character.&nbsp;  Except for letters and numbers,
+ *       characters in the ASCII range must be escaped to be considered as literals.</td>
 *     </tr>
 *     <tr>
 *       <td width="6%">!</td>
 *       <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
 *       parser that this expression specifies the backwards-iteration behavior of the iterator,
- *       and not its normal iteration behavior.&nbsp; This is generally only used in situations
- *       where the automatically-generated backwards-iteration brhavior doesn't produce
- *       satisfactory results and must be supplemented with extra client-specified rules.</td>
+ *       and not its normal iteration behavior.&nbsp;  The backwards rules must move the
+ *       iterator to a safe position at or before the previous break position; forwards rules
+ *       will then be used to find the exact previous position.</td>
 *     </tr>
 *     <tr>
 *       <td width="6%"><em>(all others)</em></td>
@ -137,52 +109,6 @@ class BreakIterator;
 *     </tr>
 *   </table>
 * </blockquote>
- *
- * <p>Within a [] expression, a number of other special characters can be used to specify
- * groups of characters:</p>
- *
- * <blockquote>
- *   <table border="1" width="100%">
- *     <tr>
- *       <td width="6%">-</td>
- *       <td width="94%">Specifies a range of matching characters.&nbsp; For example
- *       &quot;[a-p]&quot; matches all lowercase Latin letters from a to p (inclusive).&nbsp; The -
- *       sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
- *       language's alphabetical order: &quot;[a-z]&quot; doesn't include capital letters, nor does
- *       it include accented letters such as a-umlaut.</td>
- *     </tr>
- *     <tr>
- *       <td width="6%">::</td>
- *       <td width="94%">A pair of colons containing a one- or two-letter code matches all
- *       characters in the corresponding Unicode category.&nbsp; The two-letter codes are the same
- *       as the two-letter codes in the Unicode database (for example, &quot;[:Sc::Sm:]&quot;
- *       matches all currency symbols and all math symbols).&nbsp; Specifying a one-letter code is
- *       the same as specifying all two-letter codes that begin with that letter (for example,
- *       &quot;[:L:]&quot; matches all letters, and is equivalent to
- *       &quot;[:Lu::Ll::Lo::Lm::Lt:]&quot;).&nbsp; Anything other than a valid two-letter Unicode
- *       category code or a single letter that begins a Unicode category code is illegal within
- *       colons.</td>
- *     </tr>
- *     <tr>
- *       <td width="6%">[]</td>
- *       <td width="94%">[] expressions can nest.&nbsp; This has no effect, except when used in
- *       conjunction with the ^ token.</td>
- *     </tr>
- *     <tr>
- *       <td width="6%">^</td>
- *       <td width="94%">Excludes the character (or the characters in the [] expression) following
- *       it from the group of characters.&nbsp; For example, &quot;[a-z^p]&quot; matches all Latin
- *       lowercase letters except p.&nbsp; &quot;[:L:^[\u4e00-\u9fff]]&quot; matches all letters
- *       except the Han ideographs.</td>
- *     </tr>
- *     <tr>
- *       <td width="6%"><em>(all others)</em></td>
- *       <td width="94%">All other characters are treated as literal characters.&nbsp; (For
- *       example, &quot;[aeiou]&quot; specifies just the letters a, e, i, o, and u.)</td>
- *     </tr>
- *   </table>
- * </blockquote>
- *
 */


@ -201,7 +127,9 @@ protected:
    //
    RBBIDataWrapper    *fData;
    UTrie              *fCharMappings;
-    int32_t             fLastBreakTag;      // Rule {tag} value for the most recent match.
+
+    // Rule {tag} value for the most recent match.
+    int32_t             fLastBreakTag;

    //
    // Counter for the number of characters encountered with the "dictionary"