2002-06-25 17:23:07 +00:00
|
|
|
//
|
|
|
|
// rbbirb.h
|
|
|
|
//
|
2004-05-18 22:01:41 +00:00
|
|
|
// Copyright (C) 2002-2004, International Business Machines Corporation and others.
|
2002-06-25 17:23:07 +00:00
|
|
|
// All Rights Reserved.
|
|
|
|
//
|
2003-10-09 01:13:08 +00:00
|
|
|
// This file contains declarations for several classes from the
|
|
|
|
// Rule Based Break Iterator rule builder.
|
2002-06-25 17:23:07 +00:00
|
|
|
//
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef RBBIRB_H
|
|
|
|
#define RBBIRB_H
|
|
|
|
|
2002-06-27 01:19:20 +00:00
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/uobject.h"
|
2002-06-25 17:23:07 +00:00
|
|
|
#include "unicode/rbbi.h"
|
|
|
|
#include "unicode/uniset.h"
|
|
|
|
#include "unicode/parseerr.h"
|
|
|
|
#include "uhash.h"
|
|
|
|
#include "uvector.h"
|
2003-10-14 21:47:59 +00:00
|
|
|
#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
|
2002-06-25 17:23:07 +00:00
|
|
|
// looks up references to $variables within a set.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
// Forward declarations of rule-builder types.  The classes declared in this
// header refer to these types through pointers only, so their full
// definitions are not needed here.
class  RBBIRuleScanner;
struct RBBIRuleTableEl;
class  RBBISetBuilder;
class  RBBINode;
class  RBBITableBuilder;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//--------------------------------------------------------------------------------
//
//   RBBISymbolTable.    Implements the SymbolTable interface that is used by the
//                       UnicodeSet parser to resolve references to $variables.
//
//--------------------------------------------------------------------------------
|
2002-10-04 01:23:34 +00:00
|
|
|
class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
|
2002-06-25 17:23:07 +00:00
|
|
|
public: // of these structs for each entry.
|
2002-10-04 01:23:34 +00:00
|
|
|
RBBISymbolTableEntry();
|
2002-06-25 17:23:07 +00:00
|
|
|
UnicodeString key;
|
|
|
|
RBBINode *val;
|
|
|
|
~RBBISymbolTableEntry();
|
2002-06-29 00:04:16 +00:00
|
|
|
|
|
|
|
private:
|
2002-10-04 01:23:34 +00:00
|
|
|
RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class
|
|
|
|
RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class
|
2002-06-25 17:23:07 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2002-10-04 17:30:36 +00:00
|
|
|
class RBBISymbolTable : public UMemory, public SymbolTable {
|
2002-06-25 17:23:07 +00:00
|
|
|
private:
|
|
|
|
const UnicodeString &fRules;
|
|
|
|
UHashtable *fHashTable;
|
|
|
|
RBBIRuleScanner *fRuleScanner;
|
|
|
|
|
|
|
|
// These next two fields are part of the mechanism for passing references to
|
|
|
|
// already-constructed UnicodeSets back to the UnicodeSet constructor
|
|
|
|
// when the pattern includes $variable references.
|
|
|
|
const UnicodeString ffffString; // = "/uffff"
|
|
|
|
UnicodeSet *fCachedSetLookup;
|
|
|
|
|
|
|
|
public:
|
|
|
|
// API inherited from class SymbolTable
|
|
|
|
virtual const UnicodeString* lookup(const UnicodeString& s) const;
|
|
|
|
virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
|
|
|
|
virtual UnicodeString parseReference(const UnicodeString& text,
|
|
|
|
ParsePosition& pos, int32_t limit) const;
|
|
|
|
|
|
|
|
// Additional Functions
|
|
|
|
RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
|
|
|
|
virtual ~RBBISymbolTable();
|
|
|
|
|
|
|
|
virtual RBBINode *lookupNode(const UnicodeString &key) const;
|
|
|
|
virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err);
|
|
|
|
|
2003-12-04 22:44:05 +00:00
|
|
|
#ifdef RBBI_DEBUG
|
|
|
|
virtual void rbbiSymtablePrint() const;
|
|
|
|
#else
|
|
|
|
// A do-nothing inline function for non-debug builds. Member funcs can't be empty
|
|
|
|
// or the call sites won't compile.
|
|
|
|
int fFakeField;
|
|
|
|
#define rbbiSymtablePrint() fFakeField=0;
|
|
|
|
#endif
|
2002-06-29 00:04:16 +00:00
|
|
|
|
|
|
|
private:
|
2002-10-04 17:30:36 +00:00
|
|
|
RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
|
|
|
|
RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
|
2002-06-25 17:23:07 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
//--------------------------------------------------------------------------------
//
//  class RBBIRuleBuilder       The top-level class handling RBBI rule compiling.
//
//--------------------------------------------------------------------------------
|
2002-10-04 01:23:34 +00:00
|
|
|
class RBBIRuleBuilder : public UMemory {
|
2002-06-25 17:23:07 +00:00
|
|
|
public:
|
|
|
|
|
|
|
|
// Create a rule based break iterator from a set of rules.
|
|
|
|
// This function is the main entry point into the rule builder. The
|
|
|
|
// public ICU API for creating RBBIs uses this function to do the actual work.
|
|
|
|
//
|
|
|
|
static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules,
|
|
|
|
UParseError &parseError,
|
|
|
|
UErrorCode &status);
|
|
|
|
|
|
|
|
public:
|
|
|
|
// The "public" functions and data members that appear below are accessed
|
|
|
|
// (and shared) by the various parts that make up the rule builder. They
|
|
|
|
// are NOT intended to be accessed by anything outside of the
|
|
|
|
// rule builder implementation.
|
|
|
|
RBBIRuleBuilder(const UnicodeString &rules,
|
|
|
|
UParseError &parseErr,
|
|
|
|
UErrorCode &status
|
|
|
|
);
|
|
|
|
|
|
|
|
virtual ~RBBIRuleBuilder();
|
|
|
|
char *fDebugEnv; // controls debug trace output
|
|
|
|
UErrorCode *fStatus; // Error reporting. Keeping status
|
|
|
|
UParseError *fParseError; // here avoids passing it everywhere.
|
|
|
|
const UnicodeString &fRules; // The rule string that we are compiling
|
|
|
|
|
|
|
|
RBBIRuleScanner *fScanner; // The scanner.
|
|
|
|
RBBINode *fForwardTree; // The parse trees, generated by the scanner,
|
|
|
|
RBBINode *fReverseTree; // then manipulated by subsequent steps.
|
2003-11-05 02:03:44 +00:00
|
|
|
RBBINode *fSafeFwdTree;
|
|
|
|
RBBINode *fSafeRevTree;
|
|
|
|
|
|
|
|
RBBINode **fDefaultTree; // For rules not qualified with a !
|
|
|
|
// the tree to which they belong to.
|
2002-06-25 17:23:07 +00:00
|
|
|
|
2003-10-09 01:13:08 +00:00
|
|
|
UBool fChainRules; // True for chained Unicode TR style rules.
|
|
|
|
// False for traditional regexp rules.
|
|
|
|
|
2003-10-17 23:30:02 +00:00
|
|
|
UBool fLBCMNoChain; // True: suppress chaining of rules on
|
|
|
|
// chars with LineBreak property == CM.
|
|
|
|
|
2003-12-04 02:12:42 +00:00
|
|
|
UBool fLookAheadHardBreak; // True: Look ahead matches cause an
|
|
|
|
// immediate break, no continuing for the
|
|
|
|
// longest match.
|
|
|
|
|
2002-06-25 17:23:07 +00:00
|
|
|
RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
|
2002-08-28 22:24:17 +00:00
|
|
|
UVector *fUSetNodes; // Vector of all uset nodes.
|
2002-06-25 17:23:07 +00:00
|
|
|
|
|
|
|
RBBITableBuilder *fForwardTables; // State transition tables
|
|
|
|
RBBITableBuilder *fReverseTables;
|
2003-11-05 02:03:44 +00:00
|
|
|
RBBITableBuilder *fSafeFwdTables;
|
|
|
|
RBBITableBuilder *fSafeRevTables;
|
2002-06-25 17:23:07 +00:00
|
|
|
|
2004-03-05 05:04:10 +00:00
|
|
|
UVector *fRuleStatusVals; // The values that can be returned
|
|
|
|
// from getRuleStatus().
|
|
|
|
|
2002-06-25 17:23:07 +00:00
|
|
|
RBBIDataHeader *flattenData(); // Create the flattened (runtime format)
|
|
|
|
// data tables..
|
|
|
|
private:
|
2002-10-04 01:23:34 +00:00
|
|
|
RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class
|
|
|
|
RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class
|
2002-06-25 17:23:07 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//----------------------------------------------------------------------------
//
//   RBBISetTableEl   is an entry in the hash table of UnicodeSets that have
//                    been encountered.  The val Node will be of nodetype uset
//                    and contain pointers to the actual UnicodeSets.
//                    The Key is the source string for initializing the set.
//
//                    The hash table is used to avoid creating duplicate
//                    unnamed (not $var references) UnicodeSets.
//
//                    Memory Management:
//                       The Hash Table owns these RBBISetTableEl structs and
//                            the key strings.  It does NOT own the val nodes.
//
//----------------------------------------------------------------------------
|
|
|
|
struct RBBISetTableEl {
|
|
|
|
UnicodeString *key;
|
|
|
|
RBBINode *val;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2002-08-01 16:17:41 +00:00
|
|
|
//----------------------------------------------------------------------------
//
//   RBBIDebugPrintf    Printf equivalent, for debugging output.
//                      Conditional compilation of the implementation lets us
//                      get rid of the stdio dependency in environments where it
//                      is unavailable.
//
//----------------------------------------------------------------------------
|
2002-08-22 22:36:47 +00:00
|
|
|
#ifdef RBBI_DEBUG
// Debug builds: the debug output helpers are plain aliases for C stdio.
//
// This point of the header lies between U_NAMESPACE_BEGIN and U_NAMESPACE_END.
// A system header must not be pulled in while a namespace is open (a first
// inclusion would declare the stdio names inside it), so the namespace is
// closed around the #include and reopened afterwards.
U_NAMESPACE_END
#include <stdio.h>
U_NAMESPACE_BEGIN
#define RBBIDebugPrintf printf
#define RBBIDebugPuts puts
#else
// Non-debug builds: no stdio dependency.
//   RBBIDebugPrintf is an empty variadic inline function; its arguments are
//   still evaluated at the call site but nothing is printed.
//   RBBIDebugPuts(arg) is a macro that expands to nothing, so its argument
//   is not evaluated at all.
inline void RBBIDebugPrintf(...) {}
#define RBBIDebugPuts(arg)
#endif
|
2002-08-01 16:17:41 +00:00
|
|
|
|
2002-06-25 17:23:07 +00:00
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|