//
//  rbbisetb.h
/*
**********************************************************************
*   Copyright (c) 2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/
|
|
|
|
|
|
|
|
#ifndef RBBISETB_H
|
|
|
|
#define RBBISETB_H
|
|
|
|
|
2002-06-27 01:19:20 +00:00
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/uobject.h"
|
2002-06-25 17:23:07 +00:00
|
|
|
#include "rbbirb.h"
|
|
|
|
#include "uvector.h"
|
|
|
|
#include "uhash.h"
|
|
|
|
|
2002-07-22 22:02:08 +00:00
|
|
|
struct UNewTrie;
|
|
|
|
|
2002-06-25 17:23:07 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
//
//  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
//                   from the Unicode Sets appearing in the source RBBI rules, and
//                   creates the TRIE table used to map from Unicode to the
//                   character categories.
//
|
|
|
|
|
|
|
|
|
|
|
|
//
//  RangeDescriptor
//
//     Each of the non-overlapping character ranges gets one of these descriptors.
//     All of them are strung together in a linked list, which is kept in order
//     (by character).
//
|
2002-07-16 01:55:55 +00:00
|
|
|
struct RangeDescriptor : public UObject {
|
2002-06-25 17:23:07 +00:00
|
|
|
UChar32 fStartChar; // Start of range, unicode 32 bit value.
|
|
|
|
UChar32 fEndChar; // End of range, unicode 32 bit value.
|
|
|
|
int32_t fNum; // runtime-mapped input value for this range.
|
|
|
|
UVector *fIncludesSets; // vector of the the original
|
|
|
|
// Unicode sets that include this range.
|
|
|
|
// (Contains ptrs to uset nodes)
|
|
|
|
RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
|
|
|
|
|
|
|
|
RangeDescriptor(UErrorCode &status);
|
|
|
|
RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
|
|
|
|
~RangeDescriptor();
|
|
|
|
void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with
|
|
|
|
// where appearing in the second (higher) part.
|
|
|
|
void setDictionaryFlag(); // Check whether this range appears as part of
|
|
|
|
// the Unicode set named "dictionary"
|
2002-07-16 01:55:55 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
|
|
|
*
|
|
|
|
* @draft ICU 2.2
|
|
|
|
*/
|
|
|
|
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ICU "poor man's RTTI", returns a UClassID for this class.
|
|
|
|
*
|
|
|
|
* @draft ICU 2.2
|
|
|
|
*/
|
|
|
|
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
/**
|
|
|
|
* The address of this static class variable serves as this class's ID
|
|
|
|
* for ICU "poor man's RTTI".
|
|
|
|
*/
|
|
|
|
static const char fgClassID;
|
2002-06-25 17:23:07 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
//
//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
//
//      Starting with the rules parse tree from the scanner,
//
//                   -  Enumerate the set of UnicodeSets that are referenced
//                      by the RBBI rules.
//                   -  Compute a derived set of non-overlapping UnicodeSets
//                      that will correspond to columns in the state table for
//                      the RBBI execution engine.
//                   -  Construct the trie table that maps input characters
//                      to set numbers in the non-overlapping set of sets.
//
|
|
|
|
|
|
|
|
|
2002-06-27 01:19:20 +00:00
|
|
|
class RBBISetBuilder : public UObject {
|
2002-06-25 17:23:07 +00:00
|
|
|
public:
|
|
|
|
RBBISetBuilder(RBBIRuleBuilder *rb);
|
|
|
|
~RBBISetBuilder();
|
|
|
|
|
2002-08-01 16:17:41 +00:00
|
|
|
void build();
|
2002-06-25 17:23:07 +00:00
|
|
|
void addValToSets(UVector *sets, uint32_t val);
|
|
|
|
int32_t getNumCharCategories(); // CharCategories are the same as input symbol set to the
|
|
|
|
// runtime state machine, which are the same as
|
|
|
|
// columns in the DFA state table
|
|
|
|
int32_t getTrieSize(); // Size in bytes of the serialized Trie.
|
|
|
|
void serializeTrie(uint8_t *where); // write out the serialized Trie.
|
|
|
|
void printSets();
|
|
|
|
void printRanges();
|
|
|
|
void printRangeGroups();
|
|
|
|
|
2002-06-29 00:04:16 +00:00
|
|
|
/**
|
|
|
|
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
|
|
|
*
|
|
|
|
* @draft ICU 2.2
|
|
|
|
*/
|
|
|
|
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ICU "poor man's RTTI", returns a UClassID for this class.
|
|
|
|
*
|
|
|
|
* @draft ICU 2.2
|
|
|
|
*/
|
|
|
|
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
2002-06-25 17:23:07 +00:00
|
|
|
|
|
|
|
private:
|
2002-06-29 00:04:16 +00:00
|
|
|
void numberSets();
|
|
|
|
|
2002-06-25 17:23:07 +00:00
|
|
|
RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
|
|
|
|
UErrorCode *fStatus;
|
|
|
|
|
|
|
|
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
|
|
|
|
|
2002-07-22 22:02:08 +00:00
|
|
|
UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
|
2002-06-25 17:23:07 +00:00
|
|
|
uint32_t fTrieSize; // the Unicode Sets.
|
|
|
|
|
|
|
|
// Groups correspond to character categories -
|
|
|
|
// groups of ranges that are in the same original UnicodeSets.
|
|
|
|
// fGroupCount is the index of the last used group.
|
|
|
|
// The value is also the number of columns in the RBBI state table being compiled.
|
|
|
|
// Index 0 is not used. Funny counting.
|
|
|
|
int32_t fGroupCount;
|
|
|
|
|
2002-06-29 00:04:16 +00:00
|
|
|
/**
|
|
|
|
* The address of this static class variable serves as this class's ID
|
|
|
|
* for ICU "poor man's RTTI".
|
|
|
|
*/
|
|
|
|
static const char fgClassID;
|
2002-06-25 17:23:07 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif
|