/*
*******************************************************************************
*
*   Copyright (C) 1998-2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*
* File parse.c
*
* Modification History:
*
*   Date        Name        Description
*   05/26/99    stephen     Creation.
*   02/25/00    weiv        Overhaul to write udata
*******************************************************************************
*/

#include "ucol_imp.h"
#include "parse.h"
#include "error.h"
#include "uhash.h"
#include "cmemory.h"
#include "read.h"
#include "unicode/ustdio.h"
#include "ustr.h"
#include "reslist.h"
#include "unicode/ustring.h"
#include "unicode/putil.h"

/* Sub-directory of the input directory in which the %%UCARULES file is looked up. */
#define U_ICU_UNIDATA "unidata"

/* Type-name keywords compared against the token in the eSetType action. */
U_STRING_DECL(k_start_string, "string", 6);
U_STRING_DECL(k_start_binary, "binary", 6);
U_STRING_DECL(k_start_table, "table", 5);
U_STRING_DECL(k_start_int, "int", 3);
U_STRING_DECL(k_start_array, "array", 5);
U_STRING_DECL(k_start_intvector, "intvector", 9);
U_STRING_DECL(k_start_reserved, "reserved", 8);
/*
U_STRING_DECL(rootName, "root", 4);*/

/* Set once the k_start_* strings above have been initialized (see eSetType). */
static UBool didInit=FALSE;
/*
static UBool didInitRoot=FALSE;*/

/* Node IDs for the state transition table. */
enum ENode {
    eError,
    eInitial,   /* Next: Locale name */
    eGotLoc,    /* Next: { */
    eIdle,      /* Next: Tag name | } */
    eGotTag,    /* Next: { | : */
    eNode5,     /* Next: Data | Subtag */
    eNode6,     /* Next: } | { | , */
    eList,      /* Next: List data */
    eNode8,     /* Next: , */
    eTagList,   /* Next: Subtag data */
    eNode10,    /* Next: } */
    eNode11,    /* Next: Subtag */
    eNode12,    /* Next: { */
    e2dArray,   /* Next: Data | } */
    eNode14,    /* Next: , | } */
    eNode15,    /* Next: , | } */
    eNode16,    /* Next: { | } */
    eTypeStart, /* Next: Type name */
    eGotType    /* Next: { */
};

/* Action codes for the state transition table. */
enum EAction {
    /* Generic actions */
    eNOP       = 0x0100, /* Do nothing */
    eOpen      = 0x0200, /* Open a new locale data block with the data
                            string as the locale name */
    eClose     = 0x0300, /* Close a locale data block */
    eSetTag    = 0x0400, /* Record the last string as the tag name */

    /* Comma-delimited lists */
    eBegList   = 0x1100, /* Start a new string list with the last string
                            as the first element */
    eEndList   = 0x1200, /* Close a string list being built */
    eListStr   = 0x1300, /* Record the last string as a data string and
                            increment the index */
    eStr       = 0x1400, /* Record the last string as a singleton string */

    /* 2-d lists */
    eBeg2dList = 0x2100, /* Start a new 2d string list with no elements as yet */
    eEnd2dList = 0x2200, /* Close a 2d string list being built */
    e2dStr     = 0x2300, /* Record the last string as a 2d string */
    eNewRow    = 0x2400, /* Start a new row */

    /* Tagged lists */
    eBegTagged = 0x3100, /* Start a new tagged list with the last string
                            as the first subtag */
    eEndTagged = 0x3200, /* Close a tagged list being built */
    eSubtag    = 0x3300, /* Record the last string as the subtag */
    eTaggedStr = 0x3400, /* Record the last string as a tagged string */

    /* Type support */
    eBegType   = 0x4100, /* Start getting a type */
    eSetType   = 0x4200  /* Record and init type */
};

/* A struct which encapsulates a node ID and an action. */
struct STransition {
    enum ENode fNext;
    enum EAction fAction;
};

/* This table describes an ATM (state machine) which parses resource
   bundle text files rather strictly. Each row represents a node. The
   columns of that row represent transitions into other nodes. Most
   transitions are "eError" because most transitions are
   disallowed. For example, if the parser has just seen a tag name, it
   enters node 4 ("eGotTag"). The state table then marks only one
   valid transition, which is into node 5, upon seeing an eOpenBrace
   token. We allow an extra comma after the last element in a
   comma-delimited list (transition from eList to eIdle on
   kCloseBrace).

   NOTE(review): no entry in the table has eTypeStart as its next node and
   no entry uses eBegType, so the eTypeStart/eGotType rows and the eSetType
   action are not reachable through the table as written. */
static struct STransition gTransitionTable [] = {
    /*            kString             kOpenBrace            kCloseBrace         kComma */
    /*eError*/    {eError,eNOP},      {eError,eNOP},        {eError,eNOP},      {eError,eNOP},

    /*eInitial*/  {eGotLoc,eOpen},    {eError,eNOP},        {eError,eNOP},      {eError,eNOP},
    /*eGotLoc*/   {eError,eNOP},      {eIdle,eNOP},         {eError,eNOP},      {eError,eNOP},

    /*eIdle*/     {eGotTag,eSetTag},  {eError,eNOP},        {eInitial,eClose},  {eError,eNOP},
    /*eGotTag*/   {eError,eNOP},      {eNode5,eNOP},        {eError,eNOP},      {eError,eNOP},
    /*eNode5*/    {eNode6,eNOP},      {e2dArray,eBeg2dList},{eError,eNOP},      {eError,eNOP},
    /*eNode6*/    {eError,eNOP},      {eTagList,eBegTagged},{eIdle,eStr},       {eList,eBegList},

    /*eList*/     {eNode8,eListStr},  {eError,eNOP},        {eIdle,eEndList},   {eError,eNOP},
    /*eNode8*/    {eError,eNOP},      {eError,eNOP},        {eIdle,eEndList},   {eList,eNOP},

    /*eTagList*/  {eNode10,eTaggedStr},{eError,eNOP},       {eError,eNOP},      {eError,eNOP},
    /*eNode10*/   {eError,eNOP},      {eError,eNOP},        {eNode11,eNOP},     {eError,eNOP},
    /*eNode11*/   {eNode12,eNOP},     {eError,eNOP},        {eIdle,eEndTagged},{eError,eNOP},
    /*eNode12*/   {eError,eNOP},      {eTagList,eSubtag},   {eError,eNOP},      {eError,eNOP},

    /*e2dArray*/  {eNode14,e2dStr},   {eError,eNOP},        {eNode15,eNOP},     {eError,eNOP},
    /*eNode14*/   {eError,eNOP},      {eError,eNOP},        {eNode15,eNOP},     {e2dArray,eNOP},
    /*eNode15*/   {eError,eNOP},      {e2dArray,eNewRow},   {eIdle,eEnd2dList},{eNode16,eNOP},
    /*eNode16*/   {eError,eNOP},      {e2dArray,eNewRow},   {eIdle,eEnd2dList},{eError,eNOP},

    /*eTypeStart*/{eGotType,eSetType}, {eError,eNOP},       {eError,eNOP},      {eError,eNOP},
    /*eGotType*/  {eError,eNOP},      {eError,eNOP},        {eError,eNOP},      {eError,eNOP}
};

/* Row length is 4 */
/* Looks up the transition for node `row` on token `col`.  Both arguments are
   parenthesised so that expression arguments expand correctly.
   NOTE(review): the token value is used directly as the column index, so this
   assumes the string / open-brace / close-brace / comma token codes declared
   in read.h are 0..3 in that order -- confirm against read.h. */
#define GETTRANSITION(row,col) (gTransitionTable[(col) + ((row)<<2)])
/* Not anymore, it is 5 now */
/*#define GETTRANSITION(row,col) (gTransitionTable[col + (row*5)])*/

/*********************************************************************
 * Hashtable glue
 ********************************************************************/

/* Returns TRUE when uhash_get() finds a non-NULL value stored under `tag`. */
static UBool
get(UHashtable *hash, const struct UString* tag)
{
    return (UBool)(uhash_get(hash, tag) != NULL);
}

/* Stores a heap-allocated copy of `tag` in `hash` as a key, with the dummy
   value (void*)1.  The table is a set of seen tag names; keys are released by
   freeUString(), which parse() installs as the key deleter.
   Sets U_MEMORY_ALLOCATION_ERROR (without overwriting an earlier failure) and
   stores nothing when the key cannot be allocated. */
static void
put(UHashtable *hash, const struct UString *tag, UErrorCode* status)
{
    struct UString* key = (struct UString*)uprv_malloc(sizeof(struct UString));

    if(key == NULL) {
        if(U_SUCCESS(*status)) {
            *status = U_MEMORY_ALLOCATION_ERROR;
        }
        return;
    }

    ustr_init(key);
    ustr_cpy(key, tag, status);
    uhash_put(hash, key, (void*)1, status);
}

/* Key deleter: releases the UString contents and then the UString itself. */
static void
freeUString(void* ustr)
{
    ustr_deinit((struct UString*)ustr);
    uprv_free(ustr);
}

/* Key hasher: hashes the UChar buffer of a UString key. */
static int32_t
hashUString(const void* ustr)
{
    return uhash_hashUChars(((struct UString*)ustr)->fChars);
}

/* Key comparator: compares the UChar buffers of two UString keys. */
static UBool
compareUString(const void* ustr1, const void* ustr2)
{
    return uhash_compareUChars(((struct UString*)ustr1)->fChars,
                               ((struct UString*)ustr2)->fChars);
}

/* Builds the "Duplicate tag name detected: <tag>" message and hands it to
   setErrorText().  The buffer is sized from the tag, so long tags cannot
   overrun it.  The buffer is not freed here, exactly as in the code this
   replaces.
   NOTE(review): whether setErrorText() keeps the pointer or the buffer is
   simply leaked cannot be told from this file -- confirm in error.h. */
static void
reportDuplicateTag(const struct UString *tag)
{
    static const char prefix[] = "Duplicate tag name detected: ";
    /* sizeof(prefix) already counts the terminating NUL */
    char *s = uprv_malloc(sizeof(prefix) + (size_t)u_strlen(tag->fChars));

    if(s == NULL) {
        setErrorText("Duplicate tag name detected");
        return;
    }

    strcpy(s, prefix);
    u_austrcpy(s+strlen(s), tag->fChars);
    setErrorText(s);
}

/* Reads the argument of a type modificator, i.e. the token sequence
   `{ string }`, and returns the string converted with u_UCharsToChars().

   Returns a uprv_malloc()ed, NUL-terminated buffer that the caller must
   release with uprv_free().  Returns NULL otherwise: *status is set to
   U_INVALID_FORMAT_ERROR when the three tokens are not read as expected
   (this also replaces whatever failure getNextToken() reported), or to
   U_MEMORY_ALLOCATION_ERROR when the buffer cannot be allocated.

   The scratch token is released on every path. */
char *getModificationData(struct UFILE *file, UErrorCode *status) {
    enum ETokenType modType;
    struct UString modToken;
    char *retValue = NULL;
    int32_t length;

    ustr_init(&modToken);
    modType = getNextToken(file, &modToken, status);
    if(U_SUCCESS(*status) && modType == tok_open_brace) {
        modType = getNextToken(file, &modToken, status);
        if(U_SUCCESS(*status) && modType == tok_string) {
            /* +1 so that the terminator is converted as well */
            length = u_strlen(modToken.fChars)+1;
            retValue = uprv_malloc(length);
            if(retValue == NULL) {
                ustr_deinit(&modToken);
                *status = U_MEMORY_ALLOCATION_ERROR;
                return NULL;
            }
            u_UCharsToChars(modToken.fChars, retValue, length);
            modType = getNextToken(file, &modToken, status);
            if(U_SUCCESS(*status) && modType == tok_close_brace) {
                ustr_deinit(&modToken);
                return retValue;
            } else {
                uprv_free(retValue);
            }
        }
    }
    ustr_deinit(&modToken);
    setErrorText("Invalid modificator directive");
    *status = U_INVALID_FORMAT_ERROR;
    return NULL;
}

/*********************************************************************
 * parse
 ********************************************************************/
/* Reset to 1 at the start of parse(); not modified anywhere else in this file. */
int32_t lineCount = 0;
/* Name of the most recently seen tag, truncated to fit this buffer. */
char lastTag[200] = "";

/* Parses one resource bundle text stream into a bundle.
 *
 * f         input stream; it is cast to FILE* and wrapped with u_finit()
 * cp        codepage name passed to u_finit()
 * inputDir  directory used to locate ":import" files (may be NULL there) and
 *           the %%UCARULES file
 * status    in/out error code; nothing is done if it is already a failure
 *
 * Tokens are read with getNextToken() and fed through gTransitionTable; the
 * action of each transition builds resources with the string_open /
 * array_open / table_open / bin_open helpers and adds them to the bundle's
 * root table.  A hashtable of tag names seen so far is used to reject
 * duplicate tags.  A tag of the form "name:modificator" ("bin", "import",
 * "intarray" are visible below) is handled inside the eSetTag action.
 *
 * Returns NULL when *status is a failure on entry, when the input cannot be
 * wrapped, or when the bundle cannot be opened.  Otherwise the bundle is
 * returned EVEN IF parsing failed, so callers must check *status.
 *
 * NOTE(review): part of this function's text is missing in this copy of the
 * file (see the marker inside the "bin" branch).  Everything from that marker
 * to the end of the file is reproduced verbatim and was NOT reviewed for
 * formatting.  Issues noticed in that verbatim part, left untouched:
 *   - "import": fileName is not freed on the path where the file cannot be
 *     opened, and the uprv_malloc() results are used unchecked.
 *   - %%UCARULES: fileName[256] and cs[128] are filled without bounds checks,
 *     inputDir is concatenated without a NULL check, T_FileStream_read() is
 *     called before `in` is tested, and no close call for `in` / `ufile` is
 *     visible.
 *   - "Version": tVer is presumably left without a terminating NUL before
 *     u_versionFromString() -- confirm u_UCharsToChars() semantics.
 *   - `version` is declared inside the loop body but written and read in
 *     different iterations.
 *   - cSubTag[1024] is filled without a length check.
 */
struct SRBRoot*
parse(FileStream *f, const char *cp, const char *inputDir,
      UErrorCode *status)
{
    struct UFILE *file;
    enum ETokenType type;
    enum ENode node;
    struct STransition t;

    struct UString token;
    struct UString tag;

    char cTag[1024];
    char cSubTag[1024];
    struct SRBRoot *bundle = NULL;
    struct SResource *rootTable = NULL;
    struct SResource *temp = NULL;
    struct SResource *temp1 = NULL;
    struct SResource *temp2 = NULL;
    UBool colEl = FALSE, colOverride = FALSE, ucaEl = FALSE;
    UChar trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000}; /* Just to store "TRUE" and "FALSE" */
    /* UChar falseValue[] = {0x0046, 0x0041, 0x004C, 0x0053, 0x0045, 0x0000}; */ /* Unicode for "FALSE" */

    /* Hashtable for keeping track of seen tag names */
    struct UHashtable *data;

    strcpy(lastTag, "");

    if(U_FAILURE(*status)) return NULL;

    /* setup */

    ustr_init(&token);
    ustr_init(&tag);
    /*
    cTag = uprv_malloc(1024);
    if(cTag == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    cSubTag = uprv_malloc(1024);
    if(cSubTag == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    */

    node = eInitial;
    data = 0;

    file = u_finit((FILE *)f, 0, cp);
    lineCount = 1;
    /* file = u_finit(f, cp, status); */

    if(file == NULL) {
        setErrorText("Could not initialize input file - most probably because of wrong converter\n");
        *status = U_INVALID_FORMAT_ERROR;
        goto finish;
    }

    bundle = bundle_open(status);
    /* Only touch the bundle once it is known to exist. */
    if(bundle == NULL && U_SUCCESS(*status)) {
        *status = U_MEMORY_ALLOCATION_ERROR;
    }
    if(U_FAILURE(*status)) {
        goto finish;
    }
    rootTable = bundle -> fRoot;

    /* iterate through the stream */
    for(;;) {
        /* Collation tailoring rules version */
        UVersionInfo version;

        /* get next token from stream */
        type = getNextToken(file, &token, status);
        if(U_FAILURE(*status)) {
            goto finish;
        }

        switch(type) {
        case tok_EOF:
            /* End of input is only legal between locale blocks. */
            *status = (node == eInitial) ? U_ZERO_ERROR : U_INVALID_FORMAT_ERROR;
            if(U_FAILURE(*status)) {
                setErrorText("Unexpected EOF encountered");
            }
            goto finish;
            /*break;*/

        case tok_error:
            *status = U_INVALID_FORMAT_ERROR;
            goto finish;
            /*break;*/

        default:
            break;
        }

        t = GETTRANSITION(node, type);
        node = t.fNext;

        if(node == eError) {
            *status = U_INVALID_FORMAT_ERROR;
            goto finish;
        }

        switch(t.fAction) {
        case eNOP:
            break;

            /* Record the last string as the tag name */
        case eSetTag:
            ustr_cpy(&tag, &token, status);
            /* Check the copy before the tag buffer is read. */
            if(U_FAILURE(*status)) {
                goto finish;
            }
            /* cTag is a fixed-size buffer: reject names that do not fit
               (terminator included) instead of writing past its end. */
            if(u_strlen(tag.fChars) >= (int32_t)sizeof(cTag)) {
                *status = U_INVALID_FORMAT_ERROR;
                setErrorText("Tag name too long");
                goto finish;
            }
            u_UCharsToChars(tag.fChars, cTag, u_strlen(tag.fChars)+1);
            /* lastTag is smaller than cTag; keep a truncated, terminated copy. */
            lastTag[0] = '\0';
            strncat(lastTag, cTag, sizeof(lastTag) - 1);
            /* fprintf(stdout, "%d: %s\n", lineCount, lastTag); //[prints all tags] */
            if(get(data, &tag)) {
                *status = U_INVALID_FORMAT_ERROR;
                reportDuplicateTag(&tag);
                goto finish;
            }
            {
                char *modificator = uprv_strchr(cTag, ':');
                if(modificator != NULL) {
                    /* type modificator - do the type modification*/
                    /* Cut cTag at the colon and rebuild `tag` from the bare name. */
                    *modificator = '\0';
                    ustr_deinit(&tag);
                    ustr_setlen(&tag, uprv_strlen(cTag), status);
                    u_charsToUChars(cTag, tag.fChars, uprv_strlen(cTag));
                    /* we need to test whether we have the same name, different type here */
                    if(get(data, &tag)) {
                        *status = U_INVALID_FORMAT_ERROR;
                        reportDuplicateTag(&tag);
                        goto finish;
                    }
                    modificator++;
                    /* including streams of binary data */
                    if(uprv_strcmp(modificator, "bin") == 0) {
                        char *binaryValue;
                        char toConv[3];
                        uint32_t i = 0, bytesConverted = 0;
                        uint8_t val = 0;
                        uint8_t *newValue;
                        fprintf(stdout, "bin\n");
                        binaryValue = getModificationData(file, status);
                        if(U_SUCCESS(*status) && binaryValue != NULL) {
                            /* do the parsing & outputting of the data */
                            fprintf(stdout, "Will parse binary value %s and store it in tag: %s\n", binaryValue, cTag);
                            newValue = uprv_malloc(sizeof(uint8_t)*uprv_strlen(binaryValue));
                            /* NOTE(review): SOURCE TEXT IS MISSING HERE.  The rest of this
                               loop and the beginning of the following modificator branch
                               were lost from this copy of the file (the text jumps from the
                               loop condition into the middle of a format string).  It has
                               NOT been reconstructed; everything below is the remaining
                               text exactly as found and must be restored from the original
                               file before this can compile. */
for(i = 0; i %s\n", cTag, val, u_errorName(*status) ); 
table_add(rootTable, temp1, status); put(data, &tag, status); node = eIdle; } else { if(intValue != NULL) { uprv_free(intValue); } node = eError; } } /* importing a file and storing it in a binary object */ else if(uprv_strcmp(modificator, "import") == 0) { FileStream *importFile; int32_t len; uint8_t *binData; char *fileName; fprintf(stdout, "import\n"); fileName = getModificationData(file, status); if(U_SUCCESS(*status) && fileName != NULL) { /* do the reading & outputing of the file */ fprintf(stdout, "Will read %s and store it in tag: %s\n", fileName, cTag); /* Open the input file for reading */ if(inputDir == NULL) { importFile = T_FileStream_open(fileName, "rb"); } else { char *openFileName = NULL; int32_t dirlen = uprv_strlen(inputDir); int32_t filelen = uprv_strlen(fileName); if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) { openFileName = (char *) uprv_malloc(dirlen+filelen+2); uprv_strcpy(openFileName, inputDir); openFileName[dirlen] = U_FILE_SEP_CHAR; openFileName[dirlen+1] = '\0'; uprv_strcat(openFileName, fileName); } else { openFileName = (char *) uprv_malloc(dirlen+filelen+1); uprv_strcpy(openFileName, inputDir); uprv_strcat(openFileName, fileName); } importFile = T_FileStream_open(openFileName, "rb"); uprv_free(openFileName); } if(importFile == NULL) { fprintf(stderr, "Error! 
Couldn't open input file %s for tag %s\n", fileName, cTag); node = eError; continue; } len = T_FileStream_size(importFile); binData = uprv_malloc(len); T_FileStream_read(importFile,binData,len); T_FileStream_close(importFile); temp1 = bin_open(bundle, cTag, len, binData, status); fprintf(stdout, "Added %s, len %d -> %s\n", cTag, len, u_errorName(*status) ); table_add(rootTable, temp1, status); uprv_free(binData); uprv_free(fileName); put(data, &tag, status); node = eIdle; } else { if(fileName != NULL) { uprv_free(fileName); } node = eError; } } /* array of integers, still unimplemented */ else if(uprv_strcmp(modificator, "intarray") == 0) { fprintf(stdout, "intarray\n"); } /* unknown tupe - an error */ else { fprintf(stderr, "Unknown %s\n", modificator); } } else if(uprv_strcmp(cTag, "CollationElements") == 0) { colEl = TRUE; } else if(uprv_strcmp(cTag, "%%UCARULES")==0){ ucaEl =TRUE; } } break; /* Record a singleton string */ case eStr: /* check if we have reached here after finding %%UCARULES */ if(ucaEl==TRUE){ UChar *c,*end,*ucaRulesStr; FileStream *in =NULL; UFILE* ufile=NULL; int fileLength = 0; char fileName[256]={'\0'}; char cs[128] = { '\0'}; const char* cpStr=NULL; char start[3] ={'0'}; ucaEl=FALSE; /* reset ucaEL */ /* make the fileName including the directory */ uprv_strcat(fileName,inputDir); uprv_strcat(fileName,U_FILE_SEP_STRING); uprv_strcat(fileName,U_ICU_UNIDATA); uprv_strcat(fileName,U_FILE_SEP_STRING); u_UCharsToChars(token.fChars,cs,token.fLength); uprv_strcat(fileName, cs); /* open the file */ in = T_FileStream_open(fileName, "rb"); T_FileStream_read(in, start, 3); if(start[0] == '\xFE' && start[1] == '\xFF') { cpStr = "UTF16_BigEndian"; } else if(start[0] == '\xFF' && start[1] == '\xFE') { cpStr = "UTF16_LittleEndian"; } else if(start[0] == '\xEF' && start[1] == '\xBB' && start[2] == '\xBF') { cpStr = "UTF8"; } ufile = u_finit((FILE*) in,0, cpStr); if(in && ufile){ fileLength =T_FileStream_size(in); ucaRulesStr = 
(UChar*)uprv_malloc(sizeof(UChar) * fileLength); c= ucaRulesStr; end = ucaRulesStr + fileLength/2; /* read in the rulses */ while(c < end) { *c++ = u_fgetc(ufile); } /* couldn't read all chars */ if(c < end) { fprintf(stderr, "Error! Couldn't read all chars from input file %s for tag %s\n", fileName, cTag); }else{ /* Add it to bundle */ temp = string_open(bundle,cTag, ucaRulesStr, fileLength/2, status); table_add(rootTable, temp, status); put(data, &tag, status); if(U_FAILURE(*status)) { goto finish; } temp = NULL; } uprv_free(ucaRulesStr); }else{ fprintf(stderr, "Error! Couldn't open input file %s for tag %s\n", fileName, cTag ); goto finish; } }else{ if(temp != NULL) { *status = U_INTERNAL_PROGRAM_ERROR; goto finish; } temp = string_open(bundle, cTag, token.fChars, token.fLength, status); table_add(rootTable, temp, status); /*uhash_put(data, tag.fChars, status);*/ put(data, &tag, status); if(U_FAILURE(*status)) { goto finish; } temp = NULL; } break; /* Begin a string list */ case eBegList: if(temp != NULL) { *status = U_INTERNAL_PROGRAM_ERROR; goto finish; } temp = array_open(bundle, cTag, status); temp1 = string_open(bundle, NULL, token.fChars, token.fLength, status); array_add(temp, temp1, status); temp1 = NULL; if(U_FAILURE(*status)) { goto finish; } break; /* Record a comma-delimited list string */ case eListStr: temp1 = string_open(bundle, NULL, token.fChars, token.fLength, status); array_add(temp, temp1, status); temp1 = NULL; if(U_FAILURE(*status)) { goto finish; } break; /* End a string list */ case eEndList: /*uhash_put(data, tag.fChars, status);*/ put(data, &tag, status); table_add(rootTable, temp, status); temp = NULL; if(U_FAILURE(*status)) { goto finish; } break; case eBeg2dList: if(temp != NULL) { *status = U_INTERNAL_PROGRAM_ERROR; goto finish; } temp = array_open(bundle, cTag, status); temp1 = array_open(bundle, NULL, status); if(U_FAILURE(*status)) { goto finish; } break; case eEnd2dList: /*uhash_put(data, tag.fChars, status);*/ put(data, &tag, 
status); array_add(temp, temp1, status); table_add(rootTable, temp, status); temp1 = NULL; temp = NULL; if(U_FAILURE(*status)) { goto finish; } break; case e2dStr: temp2 = string_open(bundle, NULL, token.fChars, token.fLength, status); array_add(temp1, temp2, status); temp2 = NULL; if(U_FAILURE(*status)) { goto finish; } break; case eNewRow: array_add(temp, temp1, status); temp1 = array_open(bundle, NULL, status); if(U_FAILURE(*status)) { goto finish; } break; case eBegTagged: if(temp != NULL) { *status = U_INTERNAL_PROGRAM_ERROR; goto finish; } temp = table_open(bundle, cTag, status); u_UCharsToChars(token.fChars, cSubTag, u_strlen(token.fChars)+1); if(U_FAILURE(*status)) { goto finish; } break; case eEndTagged: /*uhash_put(data, tag.fChars, status);*/ put(data, &tag, status); table_add(rootTable, temp, status); temp = NULL; if(U_FAILURE(*status)) { goto finish; } break; case eTaggedStr: temp1 = string_open(bundle, cSubTag, token.fChars, token.fLength, status); table_add(temp, temp1, status); temp1 = NULL; if(U_FAILURE(*status)) { goto finish; } /* We have seen the Override tag aleady, now checks if the value is "TRUE" or "FALSE". 
*/ if (uprv_strcmp(cSubTag, "Override") == 0) { if (u_strncmp(token.fChars, trueValue, u_strlen(trueValue)) == 0) { colOverride = TRUE; } else { colOverride = FALSE; } } if(colEl && (uprv_strcmp(cSubTag, "Version") == 0)){ char tVer[40]; int32_t length=u_strlen(token.fChars); if(length>=(int32_t)sizeof(tVer)) { length=(int32_t)sizeof(tVer)-1; } u_UCharsToChars(token.fChars, tVer, length); u_versionFromString(version,tVer); } if (colEl && (uprv_strcmp(cSubTag, "Sequence") == 0)) { UErrorCode intStatus = U_ZERO_ERROR; /* uint32_t defaultRulesArrayLength = 0;*/ /* do the collation elements */ int32_t len = 0; uint8_t *binColData = NULL; UCollator *coll = NULL; UChar *rules = NULL; struct UString newTag; coll = ucol_openRules(token.fChars, token.fLength, UCOL_DECOMP_CAN, 0, &intStatus); if(U_SUCCESS(intStatus) && coll !=NULL) { binColData = ucol_cloneRuleData(coll, &len, &intStatus); coll->dataInfo.dataVersion[1] = version[0]; /*tailoring rules version*/ if(U_SUCCESS(*status) && data != NULL) { temp1 = bin_open(bundle, "%%CollationNew", len, binColData, status); table_add(rootTable, temp1, status); uprv_free(binColData); } else { setErrorText("Warning: could not obtain rules from collator"); } ucol_close(coll); } else { setErrorText("Warning: %%Collation could not be constructed from CollationElements - check context!"); } uprv_free(rules); colEl = FALSE; colOverride = FALSE; intStatus = U_ZERO_ERROR; ustr_initChars(&newTag, "CollationElements", -1, &intStatus); if(U_FAILURE(intStatus)) { goto finish; } put(data, &newTag, &intStatus); ustr_deinit(&newTag); if(U_FAILURE(intStatus)) { goto finish; } } break; /* Record the last string as the subtag */ case eSubtag: u_UCharsToChars(token.fChars, cSubTag, u_strlen(token.fChars)+1); if(U_FAILURE(*status)) { goto finish; } if(table_get(temp, cSubTag, status) != 0) { *status = U_INVALID_FORMAT_ERROR; setErrorText("Duplicate subtag found in tagged list"); goto finish; } break; case eOpen: if(data != 0) { *status = 
U_INTERNAL_PROGRAM_ERROR; goto finish; } bundle_setlocale(bundle, token.fChars, status); if(U_FAILURE(*status)) { goto finish; } data = uhash_open(hashUString, compareUString, status); uhash_setKeyDeleter(data, freeUString); break; case eClose: if(data == 0) { *status = U_INTERNAL_PROGRAM_ERROR; goto finish; } break; case eSetType: /* type recognition */ if(!didInit) { U_STRING_INIT(k_start_string, "string", 6); U_STRING_INIT(k_start_binary, "binary", 6); U_STRING_INIT(k_start_table, "table", 5); U_STRING_INIT(k_start_int, "int", 3); U_STRING_INIT(k_start_array, "array", 5); U_STRING_INIT(k_start_intvector, "intvector", 9); U_STRING_INIT(k_start_reserved, "reserved", 8); didInit=TRUE; } if(u_strcmp(token.fChars, k_start_string) == 0) { node = eGotTag; } else if(u_strcmp(token.fChars, k_start_array) == 0) { node = eGotTag; } else if(u_strcmp(token.fChars, k_start_table) == 0) { node = eGotTag; } else if(u_strcmp(token.fChars, k_start_binary) == 0) { /* start of binary */ } else if(u_strcmp(token.fChars, k_start_int) == 0) { /* start of integer */ } else if(u_strcmp(token.fChars, k_start_intvector) == 0) { /* start of intvector */ } else if(u_strcmp(token.fChars, k_start_reserved) == 0) { /* start of reserved */ } else { *status = U_INTERNAL_PROGRAM_ERROR; goto finish; } break; } } finish: /* clean up */ if(data != 0) uhash_close(data); ustr_deinit(&token); ustr_deinit(&tag); /*uprv_free(cTag);*/ /*uprv_free(cSubTag);*/ if(file != 0) u_fclose(file); return bundle; }