/* ****************************************************************************** * * Copyright (C) 1999-2001, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * * * ucnv_io.c: * initializes global variables and defines functions pertaining to file access, * and name resolution aspect of the library. * * new implementation: * * created on: 1999nov22 * created by: Markus W. Scherer * * Use the binary cnvalias.dat (created from convrtrs.txt) to work * with aliases for converter names. ******************************************************************************* */ #include "unicode/utypes.h" #include "unicode/putil.h" #include "unicode/ucnv.h" /* This file implements ucnv_xXXX() APIs */ #include "umutex.h" #include "cstring.h" #include "cmemory.h" #include "ucnv_io.h" #include "unicode/udata.h" #include "ucln_cmn.h" /* Format of cnvalias.dat ------------------------------------------------------ * * cnvalias.dat is a binary, memory-mappable form of convrtrs.txt . * It contains two sorted tables and a block of zero-terminated strings. * Each table is preceded by the number of table entries. * * The first table maps from aliases to converter indexes. * The converter names themselves are listed as aliases in this table. * Each entry in this table has an offset to the alias and * an index of the converter in the converter table. * * The second table lists only the converters themselves. * Each entry in this table has an offset to the converter name and * the number of aliases, including the converter itself. * A count of 1 means that there is no alias, only the converter name. * * In the block of strings after the tables, each converter name is directly * followed by its aliases. All offsets to strings are offsets from the * beginning of the data. 
* * More formal file data structure (data format 2.1): * * uint16_t aliasCount; * uint16_t aliasOffsets[aliasCount]; * uint16_t converterIndexes[aliasCount]; * * uint16_t converterCount; * struct { * uint16_t converterOffset; * uint16_t aliasCount; * } converters[converterCount]; * * uint16_t tagCount; * uint16_t taggedAliasesOffsets[tagCount][converterCount]; * char tags[] = { "Tag0\Tag1\0..." }; * * char strings[]={ * "Converter0\0Alias1\0Alias2\0...Converter1\0Converter2\0Alias0\Alias1\0..." * }; * * The code included here can read versions 2 and 2.1 of the data format. * Version 2 does not have tag information, but since the code never refers * to strings[] by its base offset, it's okay. * */ #define DATA_NAME "cnvalias" #define DATA_TYPE "dat" static UDataMemory *aliasData=NULL; static const uint16_t *aliasTable=NULL; static const uint16_t *converterTable = NULL; static const uint16_t *tagTable = NULL; static char defaultConverterNameBuffer[100]; static const char *defaultConverterName = NULL; static UBool isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo) { return (UBool)( pInfo->size>=20 && pInfo->isBigEndian==U_IS_BIG_ENDIAN && pInfo->charsetFamily==U_CHARSET_FAMILY && pInfo->dataFormat[0]==0x43 && /* dataFormat="CvAl" */ pInfo->dataFormat[1]==0x76 && pInfo->dataFormat[2]==0x41 && pInfo->dataFormat[3]==0x6c && pInfo->formatVersion[0]>1); } static UBool haveAliasData(UErrorCode *pErrorCode) { if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return FALSE; } /* load converter alias data from file if necessary */ if(aliasData==NULL) { UDataMemory *data; UDataInfo info; const uint16_t *table=NULL; /* open the data outside the mutex block */ data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); if(U_FAILURE(*pErrorCode)) { return FALSE; } table=(const uint16_t *)udata_getMemory(data); info.size=sizeof(UDataInfo); udata_getInfo(data, &info); /* in the mutex block, set the data for this process */ 
umtx_lock(NULL); if(aliasData==NULL) { aliasData=data; data=NULL; aliasTable=table; table=NULL; converterTable = aliasTable + 1 + 2 * *aliasTable; if (info.formatVersion[0] > 1 && info.formatVersion[1] > 0) { tagTable = converterTable + 1 + 2 * *converterTable; } } umtx_unlock(NULL); /* if a different thread set it first, then close the extra data */ if(data!=NULL) { udata_close(data); /* NULL if it was set correctly */ } } return TRUE; } static UBool isAlias(const char *alias, UErrorCode *pErrorCode) { if(alias==NULL) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return FALSE; } else if(*alias==0) { return FALSE; } else { return TRUE; } } UBool ucnv_io_cleanup() { if (aliasData) { udata_close(aliasData); } aliasData = NULL; aliasTable = NULL; converterTable = NULL; tagTable = NULL; defaultConverterName = NULL; return TRUE; /* Everything was cleaned up */ } static int16_t getTagNumber(const char *tagname) { if (tagTable) { int16_t tag, count = (int16_t) *tagTable; const char *tags = (const char *) (tagTable + 1 + count * *converterTable); #if 0 char name[100]; int i; /* convert the tag name to lowercase to do case-insensitive comparisons */ for(i = 0; i < sizeof(name) - 1 && *tagname; ++i) { name[i] = (char)uprv_tolower(*tagname++); } name[i] = 0; #else const char *name = tagname; #endif for (tag = 0; count--; ++tag) { if (!uprv_stricmp(name, tags)) { return tag; } tags += strlen(tags) + 1; } } return -1; } /** * Do a fuzzy compare of a two converter/alias names. The comparison * is case-insensitive. It also ignores the characters '-', '_', and * ' ' (dash, underscore, and space). Thus the strings "UTF-8", * "utf_8", and "Utf 8" are exactly equivalent. * * This is a symmetrical (commutative) operation; order of arguments * is insignificant. This is an important property for sorting the * list (when the list is preprocessed into binary form) and for * performing binary searches on it at run time. 
* * @param name1 a converter name or alias, zero-terminated * @param name2 a converter name or alias, zero-terminated * @return 0 if the names match, or a negative value if the name1 * lexically precedes name2, or a positive value if the name1 * lexically follows name2. */ U_CAPI int U_EXPORT2 ucnv_compareNames(const char *name1, const char *name2) { int rc; unsigned char c1, c2; for (;;) { /* Ignore delimiters '-', '_', and ' ' */ while ((c1 = (unsigned char)*name1) == '-' || c1 == '_' || c1 == ' ') ++name1; while ((c2 = (unsigned char)*name2) == '-' || c2 == '_' || c2 == ' ') ++name2; /* If we reach the ends of both strings then they match */ if ((c1|c2)==0) { return 0; } /* Case-insensitive comparison */ rc = (int)(unsigned char)uprv_tolower(c1) - (int)(unsigned char)uprv_tolower(c2); if (rc!=0) { return rc; } ++name1; ++name2; } } /* * search for an alias * return NULL or a pointer to the converter table entry */ static const uint16_t * findAlias(const char *alias) { char name[100]; const uint16_t *p=aliasTable; uint16_t i, start, limit; limit=*p++; if(limit==0) { /* there are no aliases */ return NULL; } /* convert the alias name to lowercase to do case-insensitive comparisons */ for(i=0; i0) { /* skip a name, first the canonical converter name */ aliases+=uprv_strlen(aliases)+1; --n; } return aliases; } } } return NULL; } U_CFUNC uint16_t ucnv_io_countStandards(UErrorCode *pErrorCode) { if (haveAliasData(pErrorCode)) { if (!tagTable) { *pErrorCode = U_INVALID_FORMAT_ERROR; return 0; } return *tagTable; } return 0; } U_CAPI const char * U_EXPORT2 ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) { if (haveAliasData(pErrorCode) && tagTable) { int16_t count = (int16_t) *tagTable; const char *tags = (const char *) (tagTable + 1 + count * *converterTable); while (n-- && count--) { tags += strlen(tags) + 1; } return count ? 
tags : NULL; } return NULL; } U_CFUNC const char * U_EXPORT2 ucnv_getStandardName(const char *alias, const char *standard, UErrorCode *pErrorCode) { if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) { const uint16_t *p = findAlias(alias); if(p != NULL) { int16_t tag = getTagNumber(standard); if (tag > -1) { uint16_t offset = tagTable[1 + tag * *converterTable + (p - converterTable) / 2]; return offset ? (const char *) aliasTable + offset : NULL; } } } return NULL; } U_CFUNC uint16_t ucnv_io_countAvailableConverters(UErrorCode *pErrorCode) { if(haveAliasData(pErrorCode)) { return *converterTable; } return 0; } U_CFUNC const char * ucnv_io_getAvailableConverter(uint16_t n, UErrorCode *pErrorCode) { if(haveAliasData(pErrorCode)) { const uint16_t *p=converterTable; if(n<*p) { return (const char *)aliasTable+p[1+2*n]; } } return NULL; } U_CFUNC void ucnv_io_fillAvailableConverters(const char **aliases, UErrorCode *pErrorCode) { if(haveAliasData(pErrorCode)) { const uint16_t *p=converterTable; uint16_t count=*p++; while(count>0) { *aliases++=(const char *)aliasTable+*p; p+=2; --count; } } } U_CFUNC uint16_t ucnv_io_countAvailableAliases(UErrorCode *pErrorCode) { if(haveAliasData(pErrorCode)) { return *aliasTable; } return 0; } #if 0 /* * We are not currently using these functions, so I am commenting them out * to reduce the binary file size and improve the code coverage; * I do not currently want to remove this entirely because it may be useful * in the future and also serves to some degree as another piece of * documentation of the data structure. 
*/

/*
 * (Disabled) Return the n-th alias string, or NULL if the alias data is
 * not available or n is not less than the alias count.
 */
U_CFUNC const char *
ucnv_io_getAvailableAlias(uint16_t n, UErrorCode *pErrorCode) {
    if(haveAliasData(pErrorCode) && n<*aliasTable) {
        /* aliasTable[1+n] holds the offset of alias n from the start of the data */
        return (const char *)aliasTable+*(aliasTable+1+n);
    }
    return NULL;
}

/*
 * (Disabled) Store a pointer to every alias string into aliases[].
 * The array is written without a bounds check.
 */
U_CFUNC void
ucnv_io_fillAvailableAliases(const char **aliases, UErrorCode *pErrorCode) {
    if(haveAliasData(pErrorCode)) {
        const uint16_t *p=aliasTable;
        uint16_t count=*p++;
        while(count>0) {
            *aliases++=(const char *)aliasTable+*p;
            ++p;
            --count;
        }
    }
}
#endif

/* default converter name --------------------------------------------------- */

/*
 * In order to be really thread-safe, the get function would have to take
 * a buffer parameter and copy the current string inside a mutex block.
 * This implementation only tries to be really thread-safe while
 * setting the name.
 * It assumes that setting a pointer is atomic.
 */

/*
 * Return the default converter name, determining and caching it on the
 * first call (or after the cached name has been reset to NULL).
 *
 * The platform codepage is mapped to a converter name where possible,
 * the result is test-opened, and a hard-coded fallback is used when the
 * open fails.
 *
 * @return the default converter name; NULL only if no codepage name could
 *         be obtained at all
 */
U_CFUNC const char *
ucnv_io_getDefaultConverterName() {
    /* local variable to be thread-safe */
    const char *name=defaultConverterName;
    if(name==NULL) {
        const char *codepage=0;
        /* the platform codepage query is made between umtx_lock/umtx_unlock */
        umtx_lock(NULL);
        codepage = uprv_getDefaultCodepage();
        umtx_unlock(NULL);
        if(codepage!=NULL) {
            UErrorCode errorCode=U_ZERO_ERROR;
            /* prefer the converter name the alias data gives for this codepage */
            name=ucnv_io_getConverterName(codepage, &errorCode);
            if(U_FAILURE(errorCode) || name==NULL) {
                /* lookup failed: fall back to the raw codepage string */
                name=codepage;
            }
        }

        /* if the name is there, test it out */
        if(name != NULL) {
            UErrorCode errorCode = U_ZERO_ERROR;
            UConverter *cnv;
            cnv = ucnv_open(name, &errorCode);
            if(U_FAILURE(errorCode) || (cnv == NULL)) {
                /* Panic time, let's use a fallback. */
#if (U_CHARSET_FAMILY == U_ASCII_FAMILY)
                name = "US-ASCII";
                /* there is no 'algorithmic' converter for EBCDIC */
#elif defined(OS390)
                name = "ibm-1047-s390";
#else
                name = "ibm-37";
#endif
            }
            /*
             * NOTE(review): ucnv_close() is reached even when the open
             * failed and cnv may be NULL; presumably it tolerates NULL, confirm.
             */
            ucnv_close(cnv);
        }

        if(name != NULL) {
            /* Did find a name. And it works (or was replaced by the fallback). */
            defaultConverterName=name;
        }
    }

    return name;
}

/*
 * Set the default converter name. Passing NULL clears the cached name.
 * (The rest of this definition lies beyond the visible part of the file
 * and is left exactly as found.)
 */
U_CFUNC void
ucnv_io_setDefaultConverterName(const char *converterName) {
    if(converterName==NULL) {
        /* reset to the default codepage */
        defaultConverterName=NULL;
    } else {
        UErrorCode errorCode=U_ZERO_ERROR;
        const char *name=ucnv_io_getConverterName(converterName, &errorCode);
        if(U_SUCCESS(errorCode) && name!=NULL) {
            /* known name: store the pointer returned by the lookup */
            defaultConverterName=name;
        } else {
            /* do not set the name if the alias lookup failed and it is too long */
            int32_t length=uprv_strlen(converterName);
            if(length