/* ******************************************************************************* * * Copyright (C) 2001-2002, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: genidn.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2003-02-06 * created by: Ram Viswanadha * * This program reads the rfc3454_*.txt files, * parses them, and extracts the data for Nameprep conformance. * It then preprocesses it and writes a binary file for efficient use * in various IDNA conversion processes. */ #include #include #include "unicode/utypes.h" #include "unicode/uchar.h" #include "unicode/putil.h" #include "cmemory.h" #include "cstring.h" #include "unicode/udata.h" #include "unewdata.h" #include "uoptions.h" #include "uparse.h" #include "utrie.h" #include "umutex.h" #include "sprpimpl.h" #include "testidna.h" #ifdef WIN32 # pragma warning(disable: 4100) #endif UBool beVerbose=FALSE, haveCopyright=TRUE; /* prototypes --------------------------------------------------------------- */ static UBool isDataLoaded = FALSE; static UTrie idnTrie={ 0,0,0,0,0,0,0 }; static UDataMemory *idnData=NULL; static UErrorCode dataErrorCode =U_ZERO_ERROR; static const uint16_t* mappingData = NULL; static int32_t indexes[_IDNA_INDEX_TOP]={ 0 }; static void parseMappings(const char *filename, UBool withNorm, UBool reportError,TestIDNA& test, UErrorCode *pErrorCode); static void parseTable(const char *filename, UBool isUnassigned, TestIDNA& test, UErrorCode *pErrorCode); static UBool loadIDNData(UErrorCode &errorCode); static UBool cleanup(); static void compareMapping(uint32_t codepoint, uint32_t* mapping, int32_t mapLength, UBool withNorm, UErrorCode *status); static void compareFlagsForRange(uint32_t start, uint32_t end, UBool isUnassigned, UErrorCode *status); static void testAllCodepoints(TestIDNA& test); static TestIDNA* pTestIDNA =NULL; static const char* fileNames[] = { "rfc3454_A_1.txt", /* contains unassigned code points */ "rfc3454_C_X.txt", /* contains code points that are prohibited */ "rfc3454_B_1.txt", /* contains case mappings when normalization is turned off */ "rfc3454_B_2.txt", /* contains case mappings when normalization it turned on */ /* "NormalizationCorrections.txt",contains NFKC case mappings whicha are not included in UTR 21 */ }; /* -------------------------------------------------------------------------- */ static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, UOPTION_VERBOSE, UOPTION_COPYRIGHT, UOPTION_DESTDIR, UOPTION_SOURCEDIR, { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 } }; /* file definitions */ #define DATA_NAME "uidna" #define DATA_TYPE "icu" #define MISC_DIR "misc" extern int testData(TestIDNA& test) { char filename[300]; //TODO get the srcDir dynamically const char *srcDir=IntlTest::pathToDataDirectory(), *destDir=NULL, *suffix=NULL; char *basename=NULL; UErrorCode errorCode=U_ZERO_ERROR; char *saveBasename =NULL; loadIDNData(errorCode); if(U_FAILURE(dataErrorCode)){ test.errln( "Could not load data. Error: %s\n",u_errorName(dataErrorCode)); return dataErrorCode; } //initialize pTestIDNA = &test; /* prepare the filename beginning with the source dir */ if(srcDir[0] == U_FILE_SEP_CHAR){ filename[0]= 0x2E; uprv_strcat(filename+1,srcDir); }else if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL){ filename[0] = 0x2E; filename[1] = U_FILE_SEP_CHAR; uprv_strcpy(filename+2,srcDir); }else{ uprv_strcpy(filename, srcDir); } /* process unassigned */ basename=filename+uprv_strlen(filename); if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { *basename++=U_FILE_SEP_CHAR; } uprv_strcpy(basename,MISC_DIR); basename= basename + uprv_strlen(MISC_DIR); *basename++ = U_FILE_SEP_CHAR; uprv_strcpy(basename,fileNames[0]); parseTable(filename,TRUE, test,&errorCode); if(U_FAILURE(errorCode)) { test.errln( "Could not open file %s for reading \n", filename); return errorCode; } /* process prohibited */ uprv_strcpy(basename,fileNames[1]); parseTable(filename,FALSE, test, &errorCode); if(U_FAILURE(errorCode)) { test.errln( "Could not open file %s for reading \n", filename); return errorCode; } /* process mappings */ uprv_strcpy(basename,fileNames[2]); parseMappings(filename, FALSE, FALSE,test, &errorCode); if(U_FAILURE(errorCode)) { test.errln( "Could not open file %s for reading \n", filename); return errorCode; } uprv_strcpy(basename,fileNames[3]); parseMappings(filename, TRUE, FALSE,test, &errorCode); if(U_FAILURE(errorCode)) { test.errln( "Could not open file %s for reading \n", filename); return errorCode; } testAllCodepoints(test); cleanup(); pTestIDNA = NULL; return errorCode; } U_CDECL_BEGIN static void U_CALLCONV caseMapLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode) { uint32_t mapping[40]; char *end, *s; uint32_t code; int32_t length; UBool* mapWithNorm = (UBool*) context; /* get the character code, field 0 */ code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); if(end<=fields[0][0] || end!=fields[0][1]) { *pErrorCode=U_PARSE_ERROR; } s = fields[1][0]; /* parse the mapping string */ length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode); /* store the mapping */ compareMapping(code,mapping, length, *mapWithNorm, pErrorCode); } U_CDECL_END static void parseMappings(const char *filename,UBool withNorm, UBool reportError, TestIDNA& test, UErrorCode *pErrorCode) { char *fields[3][2]; if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return; } u_parseDelimitedFile(filename, ';', fields, 3, caseMapLineFn, &withNorm, pErrorCode); //fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len); if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { test.errln( "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); } } /* parser for UnicodeData.txt ----------------------------------------------- */ U_CDECL_BEGIN static void U_CALLCONV unicodeDataLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode) { uint32_t rangeStart=0,rangeEnd =0; UBool* isUnassigned = (UBool*) context; u_parseCodePointRange(fields[0][0], &rangeStart,&rangeEnd, pErrorCode); if(U_FAILURE(*pErrorCode)){ *pErrorCode = U_PARSE_ERROR; return; } compareFlagsForRange(rangeStart,rangeEnd,*isUnassigned, pErrorCode); } U_CDECL_END static void parseTable(const char *filename,UBool isUnassigned,TestIDNA& test, UErrorCode *pErrorCode) { char *fields[2][2]; int32_t len=0; if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return; } u_parseDelimitedFile(filename, ';', fields, 1, unicodeDataLineFn, &isUnassigned, pErrorCode); if(U_FAILURE(*pErrorCode)) { test.errln( "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); } } static void testAllCodepoints(TestIDNA& test){ if(isDataLoaded){ uint32_t i = 0; int32_t unassigned = 0; int32_t prohibited = 0; int32_t mappedWithNorm = 0; int32_t mapped = 0; int32_t noValueInTrie = 0; for(i=0;i<=0x10FFFF;i++){ uint32_t result = 0; UTRIE_GET16(&idnTrie,i, result); if(result != UIDNA_NO_VALUE ){ if((result & 0x07) == UIDNA_UNASSIGNED){ unassigned++; } if((result & 0x07) == UIDNA_PROHIBITED){ prohibited++; } if((result>>5) == _IDNA_MAP_TO_NOTHING){ mapped++; } if((result & 0x07) == UIDNA_MAP_NFKC){ mappedWithNorm++; } }else{ noValueInTrie++; if(result > 0){ test.errln("The return value for 0x%06X is wrong. %i\n",i,result); } } } test.logln("Number of Unassinged code points : %i \n",unassigned); test.logln("Number of Prohibited code points : %i \n",prohibited); test.logln("Number of Mapped code points : %i \n",mapped); test.logln("Number of Mapped with NFKC code points : %i \n",mappedWithNorm); test.logln("Number of code points that have no value in Trie: %i \n",noValueInTrie); } } static inline void getValues(uint32_t result, int8_t& flag, int8_t& length, int32_t& index){ /* first 3 bits contain the flag */ flag = (int8_t) (result & 0x07); /* next 2 bits contain the length */ length = (int8_t) ((result>>3) & 0x03); /* next 11 bits contain the index */ index = (result>> 5); } static void compareMapping(uint32_t codepoint, uint32_t* mapping,int32_t mapLength, UBool withNorm, UErrorCode *status){ if(isDataLoaded){ uint32_t result = 0; UTRIE_GET16(&idnTrie,codepoint, result); int8_t flag, length; int32_t index; getValues(result,flag,length, index); if(withNorm){ if(flag != UIDNA_MAP_NFKC){ pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n",codepoint, UIDNA_MAP_NFKC, flag); } }else{ if(flag==UIDNA_NO_VALUE || flag == UIDNA_PROHIBITED){ if(index != _IDNA_MAP_TO_NOTHING ){ pTestIDNA->errln( "Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i\n", codepoint, _IDNA_MAP_TO_NOTHING, index); } } } if(length ==_IDNA_LENGTH_IN_MAPPING_TABLE){ length = (int8_t)mappingData[index]; index++; } if(mapLength != length){ pTestIDNA->errln( "Did not get the expected length. Expected: %i Got: %i\n", mapLength, length); } for(int8_t i =0; i< mapLength; i++){ if(mapping[i] <= 0xFFFF){ if(mappingData[index+i] != (uint16_t)mapping[i]){ pTestIDNA->errln("Did not get the expected result. Expected: 0x%04X Got: 0x%04X \n", mapping[i], mappingData[index+i]); } }else{ UChar lead = UTF16_LEAD(mapping[i]); UChar trail = UTF16_TRAIL(mapping[i]); if(mappingData[index+i] != lead || mappingData[index+i+1] != trail){ pTestIDNA->errln( "Did not get the expected result. Expected: 0x%04X 0x%04X Got: 0x%04X 0x%04X", lead, trail, mappingData[index+i], mappingData[index+i+1]); } } } } } static void compareFlagsForRange(uint32_t start, uint32_t end, UBool isUnassigned, UErrorCode *status){ if(isDataLoaded){ uint32_t result =0 ; while(start < end+1){ UTRIE_GET16(&idnTrie,start, result); if(isUnassigned){ if(result != UIDNA_UNASSIGNED){ pTestIDNA->errln( "UIDNA_UASSIGNED flag failed for 0x%06X. Expected: %04X Got: %04X\n",start,UIDNA_UNASSIGNED, result); } }else{ if((result & 0x03) != UIDNA_PROHIBITED){ pTestIDNA->errln( "UIDNA_PROHIBITED flag failed for 0x%06X. Expected: %04X Got: %04X\n\n",start,UIDNA_PROHIBITED, result); } } start++; } } } UBool cleanup() { if(idnData!=NULL) { udata_close(idnData); idnData=NULL; } dataErrorCode=U_ZERO_ERROR; isDataLoaded=FALSE; return TRUE; } U_CDECL_BEGIN static UBool U_CALLCONV isAcceptable(void * /* context */, const char * /* type */, const char * /* name */, const UDataInfo *pInfo) { if( pInfo->size>=20 && pInfo->isBigEndian==U_IS_BIG_ENDIAN && pInfo->charsetFamily==U_CHARSET_FAMILY && pInfo->dataFormat[0]==0x49 && /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */ pInfo->dataFormat[1]==0x44 && pInfo->dataFormat[2]==0x4e && pInfo->dataFormat[3]==0x41 && pInfo->formatVersion[0]==2 && pInfo->formatVersion[2]==UTRIE_SHIFT && pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT ) { return TRUE; } else { return FALSE; } } /* idnTrie: the folding offset is the lead FCD value itself */ static int32_t U_CALLCONV getFoldingOffset(uint32_t data) { if(data&0x8000) { return (int32_t)(data&0x7fff); } else { return 0; } } U_CDECL_END static UBool loadIDNData(UErrorCode &errorCode) { /* load Unicode normalization data from file */ if(isDataLoaded==FALSE) { UTrie _idnTrie={ 0,0,0,0,0,0,0 }; UDataMemory *data; const int32_t *p=NULL; const uint8_t *pb; if(&errorCode==NULL || U_FAILURE(errorCode)) { return 0; } /* open the data outside the mutex block */ data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode); dataErrorCode=errorCode; if(U_FAILURE(errorCode)) { return isDataLoaded=FALSE; } p=(const int32_t *)udata_getMemory(data); pb=(const uint8_t *)(p+_IDNA_INDEX_TOP); utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode); _idnTrie.getFoldingOffset=getFoldingOffset; if(U_FAILURE(errorCode)) { dataErrorCode=errorCode; udata_close(data); return isDataLoaded=FALSE; } /* in the mutex block, set the data for this process */ umtx_lock(NULL); if(idnData==NULL) { idnData=data; data=NULL; uprv_memcpy(&indexes, p, sizeof(indexes)); uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie)); } else { p=(const int32_t *)udata_getMemory(idnData); } umtx_unlock(NULL); /* initialize some variables */ mappingData=(uint16_t *)((uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]); isDataLoaded = TRUE; /* if a different thread set it first, then close the extra data */ if(data!=NULL) { udata_close(data); /* NULL if it was set correctly */ } } return isDataLoaded; } /* * Hey, Emacs, please set the following: * * Local Variables: * indent-tabs-mode: nil * End: * */