scuffed-code/icu4c/source/tools/genrb/parse.c
Vladimir Weinstein b126b1ac93 ICU-402 genrb now handles \u0000 again
X-SVN-Rev: 1420
2000-05-20 02:56:59 +00:00

512 lines
16 KiB
C

/*
*******************************************************************************
*
* Copyright (C) 1998-2000, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File parse.c
*
* Modification History:
*
* Date Name Description
* 05/26/99 stephen Creation.
* 02/25/00 weiv Overhaul to write udata
*******************************************************************************
*/
#include "parse.h"
#include "error.h"
#include "uhash.h"
#include "cmemory.h"
#include "read.h"
#include "unicode/ustdio.h"
#include "ustr.h"
#include "reslist.h"
#include "unicode/ustring.h"
#include "unicode/ucol.h"
U_CAPI const UChar * U_EXPORT2 ucol_getDefaultRulesArray(uint32_t *size);
U_STRING_DECL(k_start_string, "string", 6);
U_STRING_DECL(k_start_binary, "binary", 6);
U_STRING_DECL(k_start_table, "table", 5);
U_STRING_DECL(k_start_int, "int", 3);
U_STRING_DECL(k_start_array, "array", 5);
U_STRING_DECL(k_start_intvector, "intvector", 9);
U_STRING_DECL(k_start_reserved, "reserved", 8);
static UBool didInit=FALSE;
/* Node IDs for the state transition table. */
enum ENode {
eError,
eInitial, /* Next: Locale name */
eGotLoc, /* Next: { */
eIdle, /* Next: Tag name | } */
eGotTag, /* Next: { | : */
eNode5, /* Next: Data | Subtag */
eNode6, /* Next: } | { | , */
eList, /* Next: List data */
eNode8, /* Next: , */
eTagList, /* Next: Subtag data */
eNode10, /* Next: } */
eNode11, /* Next: Subtag */
eNode12, /* Next: { */
e2dArray, /* Next: Data | } */
eNode14, /* Next: , | } */
eNode15, /* Next: , | } */
eNode16, /* Next: { | } */
eTypeStart, /* Next: Type name */
eGotType /* Next: { */
};
/* Action codes for the state transtiion table. */
enum EAction {
/* Generic actions */
eNOP = 0x0100, /* Do nothing */
eOpen = 0x0200, /* Open a new locale data block with the data
string as the locale name */
eClose = 0x0300, /* Close a locale data block */
eSetTag = 0x0400, /* Record the last string as the tag name */
/* Comma-delimited lists */
eBegList = 0x1100, /* Start a new string list with the last string
as the first element */
eEndList = 0x1200, /* Close a string list being built */
eListStr = 0x1300, /* Record the last string as a data string and
increment the index */
eStr = 0x1400, /* Record the last string as a singleton string */
/* 2-d lists */
eBeg2dList = 0x2100, /* Start a new 2d string list with no elements as yet */
eEnd2dList = 0x2200, /* Close a 2d string list being built */
e2dStr = 0x2300, /* Record the last string as a 2d string */
eNewRow = 0x2400, /* Start a new row */
/* Tagged lists */
eBegTagged = 0x3100, /* Start a new tagged list with the last
string as the first subtag */
eEndTagged = 0x3200, /* Close a tagged list being build */
eSubtag = 0x3300, /* Record the last string as the subtag */
eTaggedStr = 0x3400, /* Record the last string as a tagged string */
/* Type support */
eBegType = 0x4100, /* Start getting a type */
eSetType = 0x4200 /* Record and init type */
};
/* A struct which encapsulates a node ID and an action. */
struct STransition {
enum ENode fNext;
enum EAction fAction;
};
/* This table describes an ATM (state machine) which parses resource
bundle text files rather strictly. Each row represents a node. The
columns of that row represent transitions into other nodes. Most
transitions are "eError" because most transitions are
disallowed. For example, if the parser has just seen a tag name, it
enters node 4 ("eGotTag"). The state table then marks only one
valid transition, which is into node 5, upon seeing an eOpenBrace
token. We allow an extra comma after the last element in a
comma-delimited list (transition from eList to eIdle on
kCloseBrace). */
static struct STransition gTransitionTable [] = {
/* kString kOpenBrace kCloseBrace kComma
/*eError*/ {eError,eNOP}, {eError,eNOP}, {eError,eNOP}, {eError,eNOP},
/*eInitial*/ {eGotLoc,eOpen}, {eError,eNOP}, {eError,eNOP}, {eError,eNOP},
/*eGotLoc*/ {eError,eNOP}, {eIdle,eNOP}, {eError,eNOP}, {eError,eNOP},
/*eIdle*/ {eGotTag,eSetTag}, {eError,eNOP}, {eInitial,eClose}, {eError,eNOP},
/*eGotTag*/ {eError,eNOP}, {eNode5,eNOP}, {eError,eNOP}, {eError,eNOP},
/*eNode5*/ {eNode6,eNOP}, {e2dArray,eBeg2dList},{eError,eNOP}, {eError,eNOP},
/*eNode6*/ {eError,eNOP}, {eTagList,eBegTagged},{eIdle,eStr}, {eList,eBegList},
/*eList*/ {eNode8,eListStr}, {eError,eNOP}, {eIdle,eEndList}, {eError,eNOP},
/*eNode8*/ {eError,eNOP}, {eError,eNOP}, {eIdle,eEndList}, {eList,eNOP},
/*eTagList*/ {eNode10,eTaggedStr},{eError,eNOP}, {eError,eNOP}, {eError,eNOP},
/*eNode10*/ {eError,eNOP}, {eError,eNOP}, {eNode11,eNOP}, {eError,eNOP},
/*eNode11*/ {eNode12,eNOP}, {eError,eNOP}, {eIdle,eEndTagged},{eError,eNOP},
/*eNode12*/ {eError,eNOP}, {eTagList,eSubtag}, {eError,eNOP}, {eError,eNOP},
/*e2dArray*/ {eNode14,e2dStr}, {eError,eNOP}, {eNode15,eNOP}, {eError,eNOP},
/*eNode14*/ {eError,eNOP}, {eError,eNOP}, {eNode15,eNOP}, {e2dArray,eNOP},
/*eNode15*/ {eError,eNOP}, {e2dArray,eNewRow}, {eIdle,eEnd2dList},{eNode16,eNOP},
/*eNode16*/ {eError,eNOP}, {e2dArray,eNewRow}, {eIdle,eEnd2dList},{eError,eNOP},
/*eTypeStart*/{eGotType,eSetType}, {eError,eNOP}, {eError,eNOP}, {eError,eNOP},
/*eGotType*/ {eError,eNOP}, {eError,eNOP}, {eError,eNOP}, {eError,eNOP}
};
/* Row length is 4 */
#define GETTRANSITION(row,col) (gTransitionTable[col + (row<<2)])
/* Not anymore, it is 5 now */
/*#define GETTRANSITION(row,col) (gTransitionTable[col + (row*5)])*/
/*********************************************************************
* Hashtable glue
********************************************************************/
static UBool get(UHashtable *hash, const struct UString* tag) {
return uhash_get(hash, tag) != NULL;
}
static void put(UHashtable *hash, const struct UString *tag,
UErrorCode* status) {
struct UString* key = uprv_malloc(sizeof(struct UString));
ustr_init(key);
ustr_cpy(key, tag, status);
uhash_put(hash, key, (void*)1, status);
}
static void freeUString(void* ustr) {
ustr_deinit(ustr);
uprv_free(ustr);
}
static int32_t hashUString(const void* ustr) {
return uhash_hashUChars(((struct UString*)ustr)->fChars);
}
static UBool compareUString(const void* ustr1, const void* ustr2) {
return uhash_compareUChars(((struct UString*)ustr1)->fChars,
((struct UString*)ustr2)->fChars);
}
/*********************************************************************
* parse
********************************************************************/
struct SRBRoot*
parse(FileStream *f, const char *cp,
UErrorCode *status)
{
struct UFILE *file;
enum ETokenType type;
enum ENode node;
struct STransition t;
struct UString token;
struct UString tag;
char cTag[1024];
char cSubTag[1024];
struct SRBRoot *bundle = NULL;
struct SResource *rootTable = NULL;
struct SResource *temp = NULL;
struct SResource *temp1 = NULL;
struct SResource *temp2 = NULL;
UBool colEl = FALSE;
/* Hashtable for keeping track of seen tag names */
struct UHashtable *data;
if(U_FAILURE(*status)) return NULL;
/* setup */
ustr_init(&token);
ustr_init(&tag);
/*
cTag = uprv_malloc(1024);
if(cTag == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
cSubTag = uprv_malloc(1024);
if(cSubTag == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
*/
node = eInitial;
data = 0;
file = u_finit((FILE *)f, 0, cp);
/* file = u_finit(f, cp, status); */
bundle = bundle_open(status);
rootTable = bundle -> fRoot;
if(U_FAILURE(*status) || file == NULL) goto finish;
/* iterate through the stream */
for(;;) {
/* get next token from stream */
type = getNextToken(file, &token, status);
if(U_FAILURE(*status)) goto finish;
switch(type) {
case tok_EOF:
*status = (node == eInitial) ? U_ZERO_ERROR : U_INVALID_FORMAT_ERROR;
if(U_FAILURE(*status)) {
setErrorText("Unexpected EOF encountered");
}
goto finish;
/*break;*/
case tok_error:
*status = U_INVALID_FORMAT_ERROR;
goto finish;
/*break;*/
default:
break;
}
t = GETTRANSITION(node, type);
node = t.fNext;
if(node == eError) {
*status = U_INVALID_FORMAT_ERROR;
goto finish;
}
switch(t.fAction) {
case eNOP:
break;
/* Record the last string as the tag name */
case eSetTag:
ustr_cpy(&tag, &token, status);
u_UCharsToChars(tag.fChars, cTag, u_strlen(tag.fChars)+1);
if(U_FAILURE(*status)) goto finish;
if(uprv_strchr(cTag, ':')) {
/* type modificator - do the type modification*/
} else if(uprv_strcmp(cTag, "CollationElements") == 0) {
colEl = TRUE;
}
/*if(uhash_get(data, uhash_hashUString(tag.fChars)) != 0) {*/
if(get(data, &tag)) {
char *s;
*status = U_INVALID_FORMAT_ERROR;
s = uprv_malloc(1024);
strcpy(s, "Duplicate tag name detected: ");
u_austrcpy(s+strlen(s), tag.fChars);
setErrorText(s);
goto finish;
}
break;
/* Record a singleton string */
case eStr:
if(temp != NULL) {
*status = U_INTERNAL_PROGRAM_ERROR;
goto finish;
}
temp = string_open(bundle, cTag, token.fChars, token.fLength, status);
table_add(rootTable, temp, status);
if(colEl == TRUE) {
const UChar * defaultRulesArray;
uint32_t defaultRulesArrayLength = 0;
/* do the collation elements */
int32_t len = 0;
uint8_t *data = NULL;
UCollator *coll = NULL;
UChar *rules = NULL;
defaultRulesArray = ucol_getDefaultRulesArray(&defaultRulesArrayLength);
rules = uprv_malloc(sizeof(defaultRulesArray[0])*(defaultRulesArrayLength + token.fLength));
uprv_memcpy(rules, defaultRulesArray, defaultRulesArrayLength*sizeof(defaultRulesArray[0]));
uprv_memcpy(rules + defaultRulesArrayLength, token.fChars, token.fLength*sizeof(token.fChars[0]));
coll = ucol_openRules(rules, defaultRulesArrayLength + token.fLength, 0, 0, status);
if(U_SUCCESS(*status) && coll !=NULL) {
/* This is just for testing & should be removed
temp1 = bin_open(bundle, "%%Collation", sizeof(defaultRulesArray[0])*(defaultRulesArrayLength + token.fLength), (uint8_t *) rules, status);
table_add(rootTable, temp1, status);
*/
/*
data = ucol_cloneRuleData(coll, &len, status);
if(U_SUCCESS(*status) && data != NULL) {
temp1 = bin_open(bundle, "%%Collation", len, data, status);
table_add(rootTable, temp1, status);
uprv_free(data);
}
*/
ucol_close(coll);
}
uprv_free(rules);
colEl = FALSE;
}
/*uhash_put(data, tag.fChars, status);*/
put(data, &tag, status);
if(U_FAILURE(*status)) goto finish;
temp = NULL;
break;
/* Begin a string list */
case eBegList:
if(temp != NULL) {
*status = U_INTERNAL_PROGRAM_ERROR;
goto finish;
}
temp = array_open(bundle, cTag, status);
temp1 = string_open(bundle, NULL, token.fChars, token.fLength, status);
array_add(temp, temp1, status);
temp1 = NULL;
if(U_FAILURE(*status)) goto finish;
break;
/* Record a comma-delimited list string */
case eListStr:
temp1 = string_open(bundle, NULL, token.fChars, token.fLength, status);
array_add(temp, temp1, status);
temp1 = NULL;
if(U_FAILURE(*status)) goto finish;
break;
/* End a string list */
case eEndList:
/*uhash_put(data, tag.fChars, status);*/
put(data, &tag, status);
table_add(rootTable, temp, status);
temp = NULL;
if(U_FAILURE(*status)) goto finish;
break;
case eBeg2dList:
if(temp != NULL) {
*status = U_INTERNAL_PROGRAM_ERROR;
goto finish;
}
temp = array_open(bundle, cTag, status);
temp1 = array_open(bundle, NULL, status);
if(U_FAILURE(*status)) goto finish;
break;
case eEnd2dList:
/*uhash_put(data, tag.fChars, status);*/
put(data, &tag, status);
array_add(temp, temp1, status);
table_add(rootTable, temp, status);
temp1 = NULL;
temp = NULL;
if(U_FAILURE(*status)) goto finish;
break;
case e2dStr:
temp2 = string_open(bundle, NULL, token.fChars, token.fLength, status);
array_add(temp1, temp2, status);
temp2 = NULL;
if(U_FAILURE(*status)) goto finish;
break;
case eNewRow:
array_add(temp, temp1, status);
temp1 = array_open(bundle, NULL, status);
if(U_FAILURE(*status)) goto finish;
break;
case eBegTagged:
if(temp != NULL) {
*status = U_INTERNAL_PROGRAM_ERROR;
goto finish;
}
temp = table_open(bundle, cTag, status);
u_UCharsToChars(token.fChars, cSubTag, u_strlen(token.fChars)+1);
if(U_FAILURE(*status)) goto finish;
break;
case eEndTagged:
/*uhash_put(data, tag.fChars, status);*/
put(data, &tag, status);
table_add(rootTable, temp, status);
temp = NULL;
if(U_FAILURE(*status)) goto finish;
break;
case eTaggedStr:
temp1 = string_open(bundle, cSubTag, token.fChars, token.fLength, status);
table_add(temp, temp1, status);
temp1 = NULL;
if(U_FAILURE(*status)) goto finish;
break;
/* Record the last string as the subtag */
case eSubtag:
u_UCharsToChars(token.fChars, cSubTag, u_strlen(token.fChars)+1);
if(U_FAILURE(*status)) goto finish;
if(table_get(temp, cSubTag, status) != 0) {
*status = U_INVALID_FORMAT_ERROR;
setErrorText("Duplicate subtag found in tagged list");
goto finish;
}
break;
case eOpen:
if(data != 0) {
*status = U_INTERNAL_PROGRAM_ERROR;
goto finish;
}
bundle_setlocale(bundle, token.fChars, status);
if(U_FAILURE(*status)) goto finish;
data = uhash_open(hashUString, compareUString, status);
uhash_setKeyDeleter(data, freeUString);
break;
case eClose:
if(data == 0) {
*status = U_INTERNAL_PROGRAM_ERROR;
goto finish;
}
break;
case eSetType:
/* type recognition */
if(!didInit) {
U_STRING_INIT(k_start_string, "string", 6);
U_STRING_INIT(k_start_binary, "binary", 6);
U_STRING_INIT(k_start_table, "table", 5);
U_STRING_INIT(k_start_int, "int", 3);
U_STRING_INIT(k_start_array, "array", 5);
U_STRING_INIT(k_start_intvector, "intvector", 9);
U_STRING_INIT(k_start_reserved, "reserved", 8);
didInit=TRUE;
}
if(u_strcmp(token.fChars, k_start_string) == 0) {
node = eGotTag;
} else if(u_strcmp(token.fChars, k_start_array) == 0) {
node = eGotTag;
} else if(u_strcmp(token.fChars, k_start_table) == 0) {
node = eGotTag;
} else if(u_strcmp(token.fChars, k_start_binary) == 0) {
/* start of binary */
} else if(u_strcmp(token.fChars, k_start_int) == 0) {
/* start of integer */
} else if(u_strcmp(token.fChars, k_start_intvector) == 0) {
/* start of intvector */
} else if(u_strcmp(token.fChars, k_start_reserved) == 0) {
/* start of reserved */
} else {
*status = U_INTERNAL_PROGRAM_ERROR;
goto finish;
}
break;
}
}
finish:
/* clean up */
if(data != 0)
uhash_close(data);
ustr_deinit(&token);
ustr_deinit(&tag);
/*uprv_free(cTag);*/
/*uprv_free(cSubTag);*/
if(file != 0)
u_fclose(file);
return bundle;
}