scuffed-code/icu4c/source/common/rbbirb.cpp
Andy Heninger 1eef362329 ICU-13565 Break Iteration, remove the dictionary bit from the implementation.
For identifying text that needs to be handled by a word dictionary for Break Iteration,
change from using a bit in the character category to sorting all dictionary categories
together, and recording the boundary between the non-dictionary and dictionary ranges.

This is internal to the implementaion. It does not affect behavior.
It does increase the number of character categories that can be handled using a
compact 8 bit Trie, from 127 to 255.
2020-06-17 12:00:14 -07:00

362 lines
11 KiB
C++

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// file: rbbirb.cpp
//
// Copyright (C) 2002-2011, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the RBBIRuleBuilder class implementation. This is the main class for
// building (compiling) break rules into the tables required by the runtime
// RBBI engine.
//
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/brkiter.h"
#include "unicode/rbbi.h"
#include "unicode/ubrk.h"
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/uchriter.h"
#include "unicode/ustring.h"
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "cmemory.h"
#include "cstring.h"
#include "rbbirb.h"
#include "rbbinode.h"
#include "rbbiscan.h"
#include "rbbisetb.h"
#include "rbbitblb.h"
#include "rbbidata.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
//----------------------------------------------------------------------------------------
//
// Constructor.
//
//----------------------------------------------------------------------------------------
RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
UParseError *parseErr,
UErrorCode &status)
: fRules(rules), fStrippedRules(rules)
{
fStatus = &status; // status is checked below
fParseError = parseErr;
fDebugEnv = NULL;
#ifdef RBBI_DEBUG
fDebugEnv = getenv("U_RBBIDEBUG");
#endif
fForwardTree = NULL;
fReverseTree = NULL;
fSafeFwdTree = NULL;
fSafeRevTree = NULL;
fDefaultTree = &fForwardTree;
fForwardTable = NULL;
fRuleStatusVals = NULL;
fChainRules = FALSE;
fLBCMNoChain = FALSE;
fLookAheadHardBreak = FALSE;
fUSetNodes = NULL;
fRuleStatusVals = NULL;
fScanner = NULL;
fSetBuilder = NULL;
if (parseErr) {
uprv_memset(parseErr, 0, sizeof(UParseError));
}
if (U_FAILURE(status)) {
return;
}
fUSetNodes = new UVector(status); // bcos status gets overwritten here
fRuleStatusVals = new UVector(status);
fScanner = new RBBIRuleScanner(this);
fSetBuilder = new RBBISetBuilder(this);
if (U_FAILURE(status)) {
return;
}
if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
}
}
//----------------------------------------------------------------------------------------
//
// Destructor
//
//----------------------------------------------------------------------------------------
RBBIRuleBuilder::~RBBIRuleBuilder() {
int i;
for (i=0; ; i++) {
RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
if (n==NULL) {
break;
}
delete n;
}
delete fUSetNodes;
delete fSetBuilder;
delete fForwardTable;
delete fForwardTree;
delete fReverseTree;
delete fSafeFwdTree;
delete fSafeRevTree;
delete fScanner;
delete fRuleStatusVals;
}
//----------------------------------------------------------------------------------------
//
// flattenData() - Collect up the compiled RBBI rule data and put it into
// the format for saving in ICU data files,
// which is also the format needed by the RBBI runtime engine.
//
//----------------------------------------------------------------------------------------
static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
RBBIDataHeader *RBBIRuleBuilder::flattenData() {
int32_t i;
if (U_FAILURE(*fStatus)) {
return NULL;
}
// Remove whitespace from the rules to make it smaller.
// The rule parser has already removed comments.
fStrippedRules = fScanner->stripRules(fStrippedRules);
// Calculate the size of each section in the data.
// Sizes here are padded up to a multiple of 8 for better memory alignment.
// Sections sizes actually stored in the header are for the actual data
// without the padding.
//
int32_t headerSize = align8(sizeof(RBBIDataHeader));
int32_t forwardTableSize = align8(fForwardTable->getTableSize());
int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesLengthInUTF8 = 0;
u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
fStrippedRules.getBuffer(), fStrippedRules.length(),
0xfffd, nullptr, fStatus);
*fStatus = U_ZERO_ERROR;
int32_t rulesSize = align8((rulesLengthInUTF8+1));
int32_t totalSize = headerSize
+ forwardTableSize
+ reverseTableSize
+ statusTableSize + trieSize + rulesSize;
#ifdef RBBI_DEBUG
if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
RBBIDebugPrintf("Header Size: %8d\n", headerSize);
RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
RBBIDebugPrintf("Trie Size: %8d\n", trieSize);
RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize);
RBBIDebugPrintf("Rules Size: %8d\n", rulesSize);
RBBIDebugPrintf("-----------------------------\n");
RBBIDebugPrintf("Total Size: %8d\n", totalSize);
}
#endif
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
if (data == NULL) {
*fStatus = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
uprv_memset(data, 0, totalSize);
data->fMagic = 0xb1a0;
data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
data->fLength = totalSize;
data->fCatCount = fSetBuilder->getNumCharCategories();
data->fFTable = headerSize;
data->fFTableLen = forwardTableSize;
data->fRTable = data->fFTable + data->fFTableLen;
data->fRTableLen = reverseTableSize;
data->fTrie = data->fRTable + data->fRTableLen;
data->fTrieLen = trieSize;
data->fStatusTable = data->fTrie + data->fTrieLen;
data->fStatusTableLen= statusTableSize;
data->fRuleSource = data->fStatusTable + statusTableSize;
data->fRuleSourceLen = rulesLengthInUTF8;
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
fForwardTable->exportTable((uint8_t *)data + data->fFTable);
fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
for (i=0; i<fRuleStatusVals->size(); i++) {
ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
}
u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
fStrippedRules.getBuffer(), fStrippedRules.length(),
0xfffd, nullptr, fStatus);
if (U_FAILURE(*fStatus)) {
return NULL;
}
return data;
}
//----------------------------------------------------------------------------------------
//
// createRuleBasedBreakIterator construct from source rules that are passed in
// in a UnicodeString
//
//----------------------------------------------------------------------------------------
BreakIterator *
RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
UParseError *parseError,
UErrorCode &status)
{
//
// Read the input rules, generate a parse tree, symbol table,
// and list of all Unicode Sets referenced by the rules.
//
RBBIRuleBuilder builder(rules, parseError, status);
if (U_FAILURE(status)) { // status checked here bcos build below doesn't
return NULL;
}
RBBIDataHeader *data = builder.build(status);
if (U_FAILURE(status)) {
return nullptr;
}
//
// Create a break iterator from the compiled rules.
// (Identical to creation from stored pre-compiled rules)
//
// status is checked after init in construction.
RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
if (U_FAILURE(status)) {
delete This;
This = NULL;
}
else if(This == NULL) { // test for NULL
status = U_MEMORY_ALLOCATION_ERROR;
}
return This;
}
RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
if (U_FAILURE(status)) {
return nullptr;
}
fScanner->parse();
if (U_FAILURE(status)) {
return nullptr;
}
//
// UnicodeSet processing.
// Munge the Unicode Sets to create an initial set of character categories.
//
fSetBuilder->buildRanges();
//
// Generate the DFA state transition table.
//
fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
if (fForwardTable == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
fForwardTable->buildForwardTable();
// State table and character category optimization.
// Merge equivalent rows and columns.
// Note that this process alters the initial set of character categories,
// causing the representation of UnicodeSets in the parse tree to become invalid.
optimizeTables();
fForwardTable->buildSafeReverseTable(status);
#ifdef RBBI_DEBUG
if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
fForwardTable->printStates();
fForwardTable->printRuleStatusTable();
fForwardTable->printReverseTable();
}
#endif
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
fSetBuilder->buildTrie();
//
// Package up the compiled data into a memory image
// in the run-time format.
//
RBBIDataHeader *data = flattenData(); // returns NULL if error
if (U_FAILURE(status)) {
return nullptr;
}
return data;
}
void RBBIRuleBuilder::optimizeTables() {
bool didSomething;
do {
didSomething = false;
// Begin looking for duplicates with char class 3.
// Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
// and should not have other categories merged into them.
IntPair duplPair = {3, 0};
while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
fSetBuilder->mergeCategories(duplPair);
fForwardTable->removeColumn(duplPair.second);
didSomething = true;
}
while (fForwardTable->removeDuplicateStates() > 0) {
didSomething = true;
}
} while (didSomething);
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */