2017-01-20 00:20:31 +00:00
|
|
|
// © 2016 and later: Unicode, Inc. and others.
|
2016-06-15 18:58:17 +00:00
|
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
2002-06-25 17:23:07 +00:00
|
|
|
//
|
|
|
|
// file: rbbirb.cpp
|
|
|
|
//
|
2016-05-31 21:45:07 +00:00
|
|
|
// Copyright (C) 2002-2011, International Business Machines Corporation and others.
|
2002-06-25 17:23:07 +00:00
|
|
|
// All Rights Reserved.
|
|
|
|
//
|
|
|
|
// This file contains the RBBIRuleBuilder class implementation. This is the main class for
|
|
|
|
// building (compiling) break rules into the tables required by the runtime
|
|
|
|
// RBBI engine.
|
|
|
|
//
|
|
|
|
|
2002-09-20 01:54:48 +00:00
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_BREAK_ITERATION
|
2002-06-25 17:23:07 +00:00
|
|
|
|
|
|
|
#include "unicode/brkiter.h"
|
|
|
|
#include "unicode/rbbi.h"
|
|
|
|
#include "unicode/ubrk.h"
|
|
|
|
#include "unicode/unistr.h"
|
|
|
|
#include "unicode/uniset.h"
|
|
|
|
#include "unicode/uchar.h"
|
|
|
|
#include "unicode/uchriter.h"
|
|
|
|
#include "unicode/parsepos.h"
|
|
|
|
#include "unicode/parseerr.h"
|
2017-09-19 18:17:22 +00:00
|
|
|
|
2002-06-25 17:23:07 +00:00
|
|
|
#include "cmemory.h"
|
2002-08-01 16:17:41 +00:00
|
|
|
#include "cstring.h"
|
2002-06-25 17:23:07 +00:00
|
|
|
#include "rbbirb.h"
|
|
|
|
#include "rbbinode.h"
|
|
|
|
#include "rbbiscan.h"
|
|
|
|
#include "rbbisetb.h"
|
|
|
|
#include "rbbitblb.h"
|
2002-07-22 22:02:08 +00:00
|
|
|
#include "rbbidata.h"
|
2017-09-19 18:17:22 +00:00
|
|
|
#include "uassert.h"
|
2002-06-25 17:23:07 +00:00
|
|
|
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
|
|
|
|
//----------------------------------------------------------------------------------------
|
|
|
|
//
|
|
|
|
// Constructor.
|
|
|
|
//
|
|
|
|
//----------------------------------------------------------------------------------------
|
|
|
|
RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
                                       UParseError     *parseErr,
                                       UErrorCode      &status)
 : fRules(rules), fStrippedRules(rules)
{
    // Set up a rule builder for the given rule source.
    //   rules     the break rule source; copied into fRules and fStrippedRules.
    //   parseErr  optional (may be NULL); zeroed here when supplied.
    //   status    error code; a pointer to it is retained in fStatus.
    //
    // Every member is put into a well defined empty state first; the helper
    // objects are only allocated when the incoming status is not a failure.
    fStatus      = &status;       // the incoming value is examined further down
    fParseError  = parseErr;

#ifdef RBBI_DEBUG
    fDebugEnv    = getenv("U_RBBIDEBUG");
#else
    fDebugEnv    = NULL;
#endif

    // Parse trees.  Rules go to the forward tree by default.
    fForwardTree = fReverseTree = fSafeFwdTree = fSafeRevTree = NULL;
    fDefaultTree = &fForwardTree;

    // State table builders.
    fForwardTables = fReverseTables = fSafeFwdTables = fSafeRevTables = NULL;

    // Rule option flags.
    fChainRules = fLBCMNoChain = fLookAheadHardBreak = FALSE;

    // Helper objects, allocated below.
    fUSetNodes      = NULL;
    fRuleStatusVals = NULL;
    fScanner        = NULL;
    fSetBuilder     = NULL;

    // The parse error struct is cleared even when status already holds a failure.
    if (parseErr != NULL) {
        uprv_memset(parseErr, 0, sizeof(*parseErr));
    }

    if (U_FAILURE(status)) {
        return;
    }

    // The UVector constructors may store an error into status.
    fUSetNodes      = new UVector(status);
    fRuleStatusVals = new UVector(status);
    fScanner        = new RBBIRuleScanner(this);
    fSetBuilder     = new RBBISetBuilder(this);

    // An error already recorded in status takes precedence; otherwise report
    // any allocation that came back NULL.
    if (U_SUCCESS(status) &&
        (fUSetNodes == NULL || fRuleStatusVals == NULL ||
         fScanner   == NULL || fSetBuilder     == NULL)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//----------------------------------------------------------------------------------------
|
|
|
|
//
|
|
|
|
// Destructor
|
|
|
|
//
|
|
|
|
//----------------------------------------------------------------------------------------
|
|
|
|
RBBIRuleBuilder::~RBBIRuleBuilder() {
    // Release everything the builder owns: the nodes referenced from
    // fUSetNodes, the vectors, the scanner, the set builder, the parse trees
    // and the table builders.

    // fUSetNodes is still NULL when the constructor returned early (incoming
    // failure status) or when its allocation failed, so it must be checked
    // before it is dereferenced.
    if (fUSetNodes != NULL) {
        // The walk ends at the first NULL that elementAt() hands back.
        for (int32_t i = 0; ; i++) {
            RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
            if (n == NULL) {
                break;
            }
            delete n;
        }
    }

    delete fUSetNodes;
    delete fSetBuilder;
    delete fForwardTables;
    delete fReverseTables;
    delete fSafeFwdTables;
    delete fSafeRevTables;

    delete fForwardTree;
    delete fReverseTree;
    delete fSafeFwdTree;
    delete fSafeRevTree;
    delete fScanner;
    delete fRuleStatusVals;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//----------------------------------------------------------------------------------------
|
|
|
|
//
|
|
|
|
// flattenData() - Collect up the compiled RBBI rule data and put it into
|
|
|
|
// the format for saving in ICU data files,
|
|
|
|
// which is also the format needed by the RBBI runtime engine.
|
|
|
|
//
|
|
|
|
//----------------------------------------------------------------------------------------
|
2002-11-30 04:41:53 +00:00
|
|
|
// Round i up to the nearest multiple of 8; a value that is already a
// multiple of 8 is returned unchanged.
static int32_t align8(int32_t i) {
    const int32_t lowBits = 7;
    return (i + lowBits) & ~lowBits;
}
|
|
|
|
|
2002-06-25 17:23:07 +00:00
|
|
|
RBBIDataHeader *RBBIRuleBuilder::flattenData() {
    // Lay out the compiled rule data in one uprv_malloc'ed buffer: header,
    // forward state table, safe reverse state table, trie, rule status table
    // and the stripped rule source, in that order.
    //
    // Returns the buffer, which the caller must release with uprv_free(), or
    // NULL with *fStatus set to a failure code if anything went wrong.

    if (U_FAILURE(*fStatus)) {
        return NULL;
    }

    // Remove whitespace from the rules to make it smaller.
    // The rule parser has already removed comments.
    fStrippedRules = fScanner->stripRules(fStrippedRules);

    // Calculate the size of each section in the data.
    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
    //   The trie and rule source lengths stored in the header are the
    //   unpadded sizes; the other lengths stored are the padded ones.
    //
    int32_t headerSize        = align8(sizeof(RBBIDataHeader));
    int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
    int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
    int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
    int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
    int32_t rulesSize         = align8((fStrippedRules.length()+1) * sizeof(UChar));

    (void)safeFwdTableSize;    // computed but not stored; see below.

    // Only one reverse-direction table is saved: the safe reverse table when
    // there is one, otherwise the reverse table stands in for it.
    int32_t         totalSize = headerSize
                                + forwardTableSize
                                + /* reverseTableSize */ 0
                                + /* safeFwdTableSize */ 0
                                + (safeRevTableSize ? safeRevTableSize : reverseTableSize)
                                + statusTableSize + trieSize + rulesSize;

    RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
    if (data == NULL) {
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    uprv_memset(data, 0, totalSize);


    data->fMagic            = 0xb1a0;
    data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
    data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
    data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
    data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
    data->fLength           = totalSize;
    data->fCatCount         = fSetBuilder->getNumCharCategories();

    // Only save the forward table and the safe reverse table,
    // because these are the only ones used at run-time.
    //
    // For the moment, we still build the other tables if they are present in the rule source files,
    // for backwards compatibility. Old rule files need to work, and this is the simplest approach.
    //
    // Additional backwards compatibility consideration: if no safe rules are provided, consider the
    // reverse rules to actually be the safe reverse rules.

    data->fFTable        = headerSize;
    data->fFTableLen     = forwardTableSize;

    // Do not save Reverse Table.
    data->fRTable        = data->fFTable  + forwardTableSize;
    data->fRTableLen     = 0;

    // Do not save the Safe Forward table.
    data->fSFTable       = data->fRTable + 0;
    data->fSFTableLen    = 0;

    data->fSRTable       = data->fSFTable + 0;
    if (safeRevTableSize > 0) {
        data->fSRTableLen    = safeRevTableSize;
    } else if (reverseTableSize > 0) {
        data->fSRTableLen    = reverseTableSize;
    } else {
        U_ASSERT(FALSE);  // Rule build should have failed for lack of a reverse table
                          // before reaching this point.
    }

    data->fTrie          = data->fSRTable + data->fSRTableLen;
    data->fTrieLen       = fSetBuilder->getTrieSize();
    data->fStatusTable   = data->fTrie    + trieSize;
    data->fStatusTableLen= statusTableSize;
    data->fRuleSource    = data->fStatusTable + statusTableSize;
    data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);

    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));

    fForwardTables->exportTable((uint8_t *)data + data->fFTable);
    // fReverseTables->exportTable((uint8_t *)data + data->fRTable);
    // fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
    if (safeRevTableSize > 0) {
        fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
    } else {
        fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
    }

    fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);

    int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
    for (int32_t i=0; i<fRuleStatusVals->size(); i++) {
        ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
    }

    // The rule source section holds exactly rulesSize bytes, i.e.
    // rulesSize/sizeof(UChar) code units.  That is at least length()+1, so the
    // string and its terminating NUL always fit.
    fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource),
                           rulesSize / (int32_t)sizeof(UChar), *fStatus);

    // A failure recorded while the buffer was being filled leaves it unusable.
    // Release it here so that callers which only test the status do not leak it.
    if (U_FAILURE(*fStatus)) {
        uprv_free(data);
        return NULL;
    }

    return data;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2002-06-27 01:50:22 +00:00
|
|
|
//----------------------------------------------------------------------------------------
|
2002-06-25 17:23:07 +00:00
|
|
|
//
|
2002-06-27 01:50:22 +00:00
|
|
|
// createRuleBasedBreakIterator construct from source rules that are passed in
|
|
|
|
// in a UnicodeString
|
2002-06-25 17:23:07 +00:00
|
|
|
//
|
2002-06-27 01:50:22 +00:00
|
|
|
//----------------------------------------------------------------------------------------
|
2002-07-08 22:45:04 +00:00
|
|
|
BreakIterator *
RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
                                    UParseError      *parseError,
                                    UErrorCode       &status)
{
    // Compile the rule source in 'rules' and wrap the resulting data in a new
    // RuleBasedBreakIterator.
    //   rules       the break rule source text.
    //   parseError  handed to the builder's constructor; may be NULL.
    //   status      error code, checked below; set to a failure if anything
    //               goes wrong.
    // Returns the new iterator, or NULL on failure.

    //
    // Read the input rules, generate a parse tree, symbol table,
    // and list of all Unicode Sets referenced by the rules.
    //
    RBBIRuleBuilder  builder(rules, parseError, status);
    if (U_FAILURE(status)) { // status checked here bcos build below doesn't
        return NULL;
    }
    builder.fScanner->parse();

    //
    // UnicodeSet processing.
    //    Munge the Unicode Sets to create a set of character categories.
    //    Generate the mapping tables (TRIE) from input 32-bit characters to
    //    the character categories.
    //
    builder.fSetBuilder->build();


    //
    //   Generate the DFA state transition table.
    //
    builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
    builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
    builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
    builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
    if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
        builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete builder.fForwardTables; builder.fForwardTables = NULL;
        delete builder.fReverseTables; builder.fReverseTables = NULL;
        delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
        delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
        return NULL;
    }

    builder.fForwardTables->build();
    builder.fReverseTables->build();
    builder.fSafeFwdTables->build();
    builder.fSafeRevTables->build();

#ifdef RBBI_DEBUG
    if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
        builder.fForwardTables->printRuleStatusTable();
    }
#endif

    //
    //   Package up the compiled data into a memory image
    //      in the run-time format.
    //
    RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
    if (U_FAILURE(*builder.fStatus)) {
        // Nothing has taken ownership of the flattened image yet, so release
        // whatever flattenData() handed back (uprv_free accepts NULL).
        uprv_free(data);
        return NULL;
    }


    //
    //  Clean up the compiler related stuff
    //     (done by the destructor of 'builder' when it goes out of scope)
    //


    //
    //  Create a break iterator from the compiled rules.
    //     (Identical to creation from stored pre-compiled rules)
    //
    // status is checked after init in construction.
    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
    if (U_FAILURE(status)) {
        // NOTE(review): presumably the iterator owns 'data' once it has been
        // constructed, so deleting it covers the data too - confirm against
        // the RuleBasedBreakIterator(RBBIDataHeader*, UErrorCode&) constructor.
        delete This;
        This = NULL;
    }
    else if(This == NULL) { // test for NULL
        // No iterator was created, so the data buffer was never handed over.
        status = U_MEMORY_ALLOCATION_ERROR;
        uprv_free(data);
    }
    return This;
}
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
2002-09-20 01:54:48 +00:00
|
|
|
|
|
|
|
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|