scuffed-code/icu4c/source/common/rbbistbl.cpp
Andy Heninger 92f758f6fc ICU-12797 RBBI rule compiler fix.
X-SVN-Rev: 39432
2016-10-10 21:49:01 +00:00

271 lines
8.6 KiB
C++

// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// file: rbbistbl.cpp Implementation of the ICU RBBISymbolTable class
//
/*
***************************************************************************
* Copyright (C) 2002-2014 International Business Machines Corporation
* and others. All rights reserved.
***************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/parsepos.h"
#include "cstr.h"
#include "rbbinode.h"
#include "rbbirb.h"
#include "umutex.h"
//
// RBBISymbolTableEntry_deleter Used by the UHashTable to delete the contents
// when the hash table is deleted.
//
U_CDECL_BEGIN
static void U_CALLCONV RBBISymbolTableEntry_deleter(void *p) {
    // The hash table hands the stored value back as an untyped pointer;
    // restore its real type so that the entry's destructor runs.
    delete static_cast<icu::RBBISymbolTableEntry *>(p);
}
U_CDECL_END
U_NAMESPACE_BEGIN
//
//  RBBISymbolTable constructor   Remembers the rule source text and the owning
//                                rule scanner, and creates the hash table that
//                                maps variable names to symbol table entries.
//                                On failure, status is set and no value deleter
//                                is installed.
//
RBBISymbolTable::RBBISymbolTable(RBBIRuleScanner *rs, const UnicodeString &rules, UErrorCode &status)
    : fRules(rules), fRuleScanner(rs), ffffString(UChar(0xffff))
{
    fCachedSetLookup = NULL;

    // uhash_open checks the incoming status itself.
    fHashTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, &status);
    if (U_SUCCESS(status)) {
        // Entries stored as values are destroyed through RBBISymbolTableEntry_deleter.
        uhash_setValueDeleter(fHashTable, RBBISymbolTableEntry_deleter);
    }
}
//
//  RBBISymbolTable destructor    Closes the hash table.  The stored entries are
//                                released through the value deleter that the
//                                constructor registered on the table.
//
RBBISymbolTable::~RBBISymbolTable() {
    uhash_close(fHashTable);
}
//
// RBBISymbolTable::lookup This function from the abstract symbol table interface
// looks up a variable name and returns a UnicodeString
// containing the substitution text.
//
// The variable name does NOT include the leading $.
//
const UnicodeString *RBBISymbolTable::lookup(const UnicodeString& s) const
{
    // This method is const in the interface, but it has to update the cached set.
    RBBISymbolTable *mutableThis = const_cast<RBBISymbolTable *>(this);

    RBBISymbolTableEntry *entry = static_cast<RBBISymbolTableEntry *>(uhash_get(fHashTable, &s));
    if (entry == NULL) {
        // Unknown variable name.
        return NULL;
    }

    // The entry's value is the variable reference node; its left child is the
    // root node of the expression that was assigned to the variable.
    RBBINode *definition = entry->val->fLeftChild;

    if (definition->fType != RBBINode::setRef) {
        // The variable refers to something other than just a set.
        // Hand back the original source string for the expression.
        mutableThis->fCachedSetLookup = NULL;
        return &definition->fText;
    }

    // The $variable refers to a single UnicodeSet.  Remember that set and
    // return the ffffString, which will subsequently be interpreted as a
    // stand-in character for the set by RBBISymbolTable::lookupMatcher().
    mutableThis->fCachedSetLookup = definition->fLeftChild->fInputSet;
    return &ffffString;
}
//
// RBBISymbolTable::lookupMatcher This function from the abstract symbol table
// interface maps a single stand-in character to a
// pointer to a Unicode Set. The Unicode Set code uses this
// mechanism to get all references to the same $variable
// name to refer to a single common Unicode Set instance.
//
// This implementation cheats a little, and does not maintain a map of stand-in chars
// to sets. Instead, it takes advantage of the fact that the UnicodeSet
// constructor will always call this function right after calling lookup(),
// and we just need to remember what set to return between these two calls.
const UnicodeFunctor *RBBISymbolTable::lookupMatcher(UChar32 ch) const
{
    // Only the 0xffff stand-in character maps to a set.
    if (ch != 0xffff) {
        return NULL;
    }

    // Return whatever set is currently cached and clear the cache, so that
    // the same set is returned at most once per caching.
    RBBISymbolTable *mutableThis = const_cast<RBBISymbolTable *>(this);
    UnicodeSet *cachedSet = fCachedSetLookup;
    mutableThis->fCachedSetLookup = NULL;
    return cachedSet;
}
//
// RBBISymbolTable::parseReference This function from the abstract symbol table interface
// looks for a $variable name in the source text.
// It does not look it up, only scans for it.
// It is used by the UnicodeSet parser.
//
// This implementation is lifted pretty much verbatim
// from the rules based transliterator implementation.
// I didn't see an obvious way of sharing it.
//
UnicodeString RBBISymbolTable::parseReference(const UnicodeString& text,
                                              ParsePosition& pos, int32_t limit) const
{
    const int32_t nameStart = pos.getIndex();
    int32_t nameEnd = nameStart;

    // Advance over the longest run of identifier characters: the first one
    // must satisfy both u_isIDStart and u_isIDPart, later ones only u_isIDPart.
    for (; nameEnd < limit; ++nameEnd) {
        const UChar ch = text.charAt(nameEnd);
        if (nameEnd == nameStart && !u_isIDStart(ch)) {
            break;
        }
        if (!u_isIDPart(ch)) {
            break;
        }
    }

    // An empty result indicates failure; pos is left untouched in that case.
    UnicodeString name;
    if (nameEnd > nameStart) {
        pos.setIndex(nameEnd);
        text.extractBetween(nameStart, nameEnd, name);
    }
    return name;
}
//
// RBBISymbolTable::lookupNode Given a key (a variable name), return the
// corresponding RBBI Node. If there is no entry
// in the table for this name, return NULL.
//
RBBINode *RBBISymbolTable::lookupNode(const UnicodeString &key) const {
    const RBBISymbolTableEntry *entry =
        static_cast<const RBBISymbolTableEntry *>(uhash_get(fHashTable, &key));
    if (entry == NULL) {
        // No variable with this name has been defined.
        return NULL;
    }
    return entry->val;
}
//
// RBBISymbolTable::addEntry Add a new entry to the symbol table.
// Indicate an error if the name already exists -
// this will only occur in the case of duplicate
// variable assignments.
//
void RBBISymbolTable::addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err) {
RBBISymbolTableEntry *e;
/* test for buffer overflows */
if (U_FAILURE(err)) {
return;
}
e = (RBBISymbolTableEntry *)uhash_get(fHashTable, &key);
if (e != NULL) {
err = U_BRK_VARIABLE_REDFINITION;
return;
}
e = new RBBISymbolTableEntry;
if (e == NULL) {
err = U_MEMORY_ALLOCATION_ERROR;
return;
}
e->key = key;
e->val = val;
uhash_put( fHashTable, &e->key, e, &err);
}
//
//  RBBISymbolTableEntry constructor   Creates an entry with an empty key and
//                                     no node attached (val == NULL).
//
RBBISymbolTableEntry::RBBISymbolTableEntry()
    : UMemory(),
      key(),
      val(NULL)
{
}
//
//  RBBISymbolTableEntry destructor    Releases the variable reference node held
//                                     by this entry, together with the expression
//                                     subtree hanging off its left child.
//
RBBISymbolTableEntry::~RBBISymbolTableEntry() {
    // The constructor leaves val NULL, and nothing guarantees a node was ever
    // attached, so an entry without a node must be destructible without
    // dereferencing a null pointer.
    if (val != NULL) {
        // The "val" of a symbol table entry is a variable reference node.
        // The l. child of the val is the rhs expression from the assignment.
        // Unlike other node types, children of variable reference nodes are not
        // automatically recursively deleted.  We do it manually here.
        delete val->fLeftChild;
        val->fLeftChild = NULL;

        delete val;
    }

    // Note: the key UnicodeString is destructed by virtue of being in the object by value.
}
//
// RBBISymbolTable::rbbiSymtablePrint Debugging function, dump out the symbol table contents.
//
#ifdef RBBI_DEBUG
void RBBISymbolTable::rbbiSymtablePrint() const {
    RBBIDebugPrintf("Variable Definitions Symbol Table\n"
                    "Name Node serial String Val\n"
                    "-------------------------------------------------------------------\n");

    // First pass: one summary line per variable (name, node address, node
    // serial number), followed by the text of the expression assigned to it.
    int32_t cursor = UHASH_FIRST;
    const UHashElement *elem = NULL;
    while ((elem = uhash_nextElement(fHashTable, &cursor)) != NULL) {
        RBBISymbolTableEntry *entry = static_cast<RBBISymbolTableEntry *>(elem->value.pointer);
        RBBIDebugPrintf("%-19s %8p %7d ", CStr(entry->key)(), (void *)entry->val, entry->val->fSerialNum);
        RBBIDebugPrintf(" %s\n", CStr(entry->val->fLeftChild->fText)());
    }

    // Second pass: restart the iteration and dump the parse tree of every variable.
    RBBIDebugPrintf("\nParsed Variable Definitions\n");
    cursor = UHASH_FIRST;
    while ((elem = uhash_nextElement(fHashTable, &cursor)) != NULL) {
        RBBISymbolTableEntry *entry = static_cast<RBBISymbolTableEntry *>(elem->value.pointer);
        RBBIDebugPrintf("%s\n", CStr(entry->key)());
        RBBINode::printTree(entry->val, TRUE);
        RBBINode::printTree(entry->val->fLeftChild, FALSE);
        RBBIDebugPrintf("\n");
    }
}
#endif
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */