/*
*******************************************************************************
*
*   Copyright (C) 2003-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  icuswap.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003aug08
*   created by: Markus W. Scherer
*
*   This tool takes an ICU data file and "swaps" it, that is, changes its
*   platform properties between big-/little-endianness and ASCII/EBCDIC charset
*   families.
*   The modified data file is written to a new file.
*   Useful as an install-time tool for shipping only one flavor of ICU data
*   and preparing data files for the target platform.
*   Will not work with data DLLs (shared libraries).
*/

#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "unicode/udata.h"
#include "cmemory.h"
#include "cstring.h"
#include "uinvchar.h"
#include "uarrsort.h"
#include "ucmndata.h"
#include "udataswp.h"
#include "toolutil.h"
#include "uoptions.h"

/* swapping implementations in common */

#include "uresdata.h"
#include "ucnv_io.h"
#include "uprops.h"
#include "ucase.h"
#include "ubidi_props.h"
#include "ucol_swp.h"
#include "ucnv_bld.h"
#include "unormimp.h"
#include "sprpimpl.h"
#include "propname.h"
#include "rbbidata.h"

/*
 * NOTE(review): the three system header names were lost from the source text
 * (only bare "#include" tokens remained). These are the headers that the
 * FILE/fopen, malloc/free and strrchr uses below require.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* swapping implementations in i18n */

/* definitions */

#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

/*
 * Number of extra bytes reserved after the input data so that
 * udata_swapPackage() can pad a newly-non-last item to a multiple of 16 bytes.
 */
#define DEFAULT_PADDING_LENGTH 15

static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_DEF("type", 't', UOPT_REQUIRES_ARG)
};

/* indexes into options[]; must stay in the same order as the array above */
enum {
    OPT_HELP_H,
    OPT_HELP_QUESTION_MARK,
    OPT_OUT_TYPE
};

/**
 * Determine the size of an open file and rewind it to the beginning.
 *
 * @param f open file
 * @return the file size as reported by ftell() at the end of the file,
 *         or -1 if seeking or ftell() fails
 */
static int32_t
fileSize(FILE *f) {
    int32_t size;

    if(fseek(f, 0, SEEK_END)!=0) {
        return -1;
    }
    size=(int32_t)ftell(f);
    if(fseek(f, 0, SEEK_SET)!=0) {
        return -1;
    }
    return size;
}

/**
 * Identifies and then transforms the ICU data piece in-place, or determines
 * its length. See UDataSwapFn.
 * This function handles .dat data packages as well as single data pieces
 * and internally dispatches to per-type swap functions.
 * Sets a U_UNSUPPORTED_ERROR if the data format is not recognized.
 *
 * @see UDataSwapFn
 * @see udata_openSwapper
 * @see udata_openSwapperForInputData
 * @draft ICU 2.8
 */
static int32_t
udata_swap(const UDataSwapper *ds,
           const void *inData, int32_t length, void *outData,
           UErrorCode *pErrorCode);

/**
 * Swap an ICU .dat package, including swapping of enclosed items.
 */
U_CFUNC int32_t U_CALLCONV
udata_swapPackage(const UDataSwapper *ds,
                  const void *inData, int32_t length, void *outData,
                  UErrorCode *pErrorCode);

/*
 * udata_swapPackage() needs to rename ToC name entries from the old package
 * name to the new one.
 * We store the filenames here, and udata_swapPackage() will extract the
 * package names.
 */
static const char *inFilename, *outFilename;

U_CDECL_BEGIN

/* UDataSwapper printError callback: context is the FILE * to print to */
static void U_CALLCONV
printError(void *context, const char *fmt, va_list args) {
    vfprintf((FILE *)context, fmt, args);
}

U_CDECL_END

/**
 * Print the usage line and, if help was requested, the option descriptions.
 *
 * @param pname program name to show in the usage line
 * @param ishelp TRUE if the user explicitly asked for help
 * @return the process exit code: 0 for explicit help, 1 for a usage error
 */
static int
printUsage(const char *pname, UBool ishelp) {
    fprintf(stderr,
            "%csage: %s [ -h, -?, --help ] -tl|-tb|-te|--type=b|... infilename outfilename\n",
            ishelp ? 'U' : 'u', pname);
    if(ishelp) {
        fprintf(stderr,
                "\nOptions: -h, -?, --help print this message and exit\n"
                " Read the input file, swap its platform properties according\n"
                " to the -t or --type option, and write the result to the output file.\n"
                " -tl change to little-endian/ASCII charset family\n"
                " -tb change to big-endian/ASCII charset family\n"
                " -te change to big-endian/EBCDIC charset family\n");
    }

    return !ishelp;
}

/**
 * Program entry point: reads argv[1], swaps it to the platform type selected
 * with -t/--type, and writes the result to argv[2].
 *
 * Exit codes: 0 success, 1 usage error, 2 cannot open/size/allocate input,
 * 3 read error, 4 swapping failed, 5 cannot open output, 6 write error.
 */
extern int
main(int argc, char *argv[]) {
    FILE *in, *out;
    const char *pname;
    char *data;
    int32_t length;
    UBool ishelp;
    int rc;

    UDataSwapper *ds;
    UErrorCode errorCode;
    uint8_t outCharset;
    UBool outIsBigEndian;

    U_MAIN_INIT_ARGS(argc, argv);

    /* get the program basename */
    pname=strrchr(argv[0], U_FILE_SEP_CHAR);
    if(pname==NULL) {
        pname=strrchr(argv[0], '/');
    }
    if(pname!=NULL) {
        ++pname;
    } else {
        pname=argv[0];
    }

    argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
    ishelp=options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur;
    if(ishelp || argc!=3) {
        return printUsage(pname, ishelp);
    }

    /* parse the output type option */
    data=(char *)options[OPT_OUT_TYPE].value;
    if(data==NULL || data[0]==0 || data[1]!=0) {
        /*
         * The option is mandatory (value stays NULL if -t was not given),
         * and the type must be exactly one letter.
         */
        return printUsage(pname, FALSE);
    }
    switch(data[0]) {
    case 'l':
        outIsBigEndian=FALSE;
        outCharset=U_ASCII_FAMILY;
        break;
    case 'b':
        outIsBigEndian=TRUE;
        outCharset=U_ASCII_FAMILY;
        break;
    case 'e':
        outIsBigEndian=TRUE;
        outCharset=U_EBCDIC_FAMILY;
        break;
    default:
        return printUsage(pname, FALSE);
    }

    in=out=NULL;
    data=NULL;

    /* udata_swapPackage() needs the filenames */
    inFilename=argv[1];
    outFilename=argv[2];

    /* open the input file, get its length, allocate memory for it, read the file */
    in=fopen(argv[1], "rb");
    if(in==NULL) {
        fprintf(stderr, "%s: unable to open input file \"%s\"\n", pname, argv[1]);
        rc=2;
        goto done;
    }

    length=fileSize(in);
    if(length<=0) {
        fprintf(stderr, "%s: empty input file \"%s\"\n", pname, argv[1]);
        rc=2;
        goto done;
    }

    /*
     * +15: udata_swapPackage() may need to add a few padding bytes to the
     * last item if charset swapping is done,
     * because the last item may be resorted into the middle and then needs
     * additional padding bytes
     */
    data=(char *)malloc(length+DEFAULT_PADDING_LENGTH);
    if(data==NULL) {
        fprintf(stderr, "%s: error allocating memory for \"%s\"\n", pname, argv[1]);
        rc=2;
        goto done;
    }

    /*
     * Set the 15 extra bytes after the file contents to the usual padding
     * byte, see udata_swapPackage().
     * (Filling data+length-15 instead would be overwritten by fread() below
     * and would write before the buffer for files shorter than 15 bytes.)
     */
    uprv_memset(data+length, 0xaa, DEFAULT_PADDING_LENGTH);

    if(length!=(int32_t)fread(data, 1, length, in)) {
        fprintf(stderr, "%s: error reading \"%s\"\n", pname, argv[1]);
        rc=3;
        goto done;
    }

    fclose(in);
    in=NULL;

    /* swap the data in-place */
    errorCode=U_ZERO_ERROR;
    ds=udata_openSwapperForInputData(data, length, outIsBigEndian, outCharset, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "%s: udata_openSwapperForInputData(\"%s\") failed - %s\n",
                pname, argv[1], u_errorName(errorCode));
        rc=4;
        goto done;
    }

    ds->printError=printError;
    ds->printErrorContext=stderr;

    length=udata_swap(ds, data, length, data, &errorCode);
    udata_closeSwapper(ds);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "%s: udata_swap(\"%s\") failed - %s\n",
                pname, argv[1], u_errorName(errorCode));
        rc=4;
        goto done;
    }

    out=fopen(argv[2], "wb");
    if(out==NULL) {
        fprintf(stderr, "%s: unable to open output file \"%s\"\n", pname, argv[2]);
        rc=5;
        goto done;
    }

    if(length!=(int32_t)fwrite(data, 1, length, out)) {
        fprintf(stderr, "%s: error writing \"%s\"\n", pname, argv[2]);
        rc=6;
        goto done;
    }

    /* fclose() flushes buffered output, so a failure here is a write error too */
    if(fclose(out)!=0) {
        out=NULL;
        fprintf(stderr, "%s: error writing \"%s\"\n", pname, argv[2]);
        rc=6;
        goto done;
    }
    out=NULL;

    /* all done */
    rc=0;

done:
    if(in!=NULL) {
        fclose(in);
    }
    if(out!=NULL) {
        fclose(out);
    }
    if(data!=NULL) {
        free(data);
    }
    return rc;
}

/* swap the data ------------------------------------------------------------ */

/* dispatch table: maps the 4-byte UDataInfo.dataFormat to its swap function */
static const struct {
    uint8_t dataFormat[4];
    UDataSwapFn *swapFn;
} swapFns[]={
    { { 0x52, 0x65, 0x73, 0x42 }, ures_swap },          /* dataFormat="ResB" */
#if !UCONFIG_NO_LEGACY_CONVERSION
    { { 0x63, 0x6e, 0x76, 0x74 }, ucnv_swap },          /* dataFormat="cnvt" */
    { { 0x43, 0x76, 0x41, 0x6c }, ucnv_swapAliases },   /* dataFormat="CvAl" */
#endif
    { { 0x43, 0x6d, 0x6e, 0x44 }, udata_swapPackage },  /* dataFormat="CmnD" */
#if !UCONFIG_NO_IDNA
    { { 0x53, 0x50, 0x52, 0x50 }, usprep_swap },        /* dataFormat="SPRP" */
#endif
    /* insert data formats here, descending by expected frequency of occurrence */
    { { 0x55, 0x50, 0x72, 0x6f }, uprops_swap },        /* dataFormat="UPro" */
    { { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, ucase_swap },
                                                        /* dataFormat="cAsE" */
    { { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 }, ubidi_swap },
                                                        /* dataFormat="BiDi" */
#if !UCONFIG_NO_NORMALIZATION
    { { 0x4e, 0x6f, 0x72, 0x6d }, unorm_swap },         /* dataFormat="Norm" */
#endif
#if !UCONFIG_NO_COLLATION
    { { 0x55, 0x43, 0x6f, 0x6c }, ucol_swap },          /* dataFormat="UCol" */
    { { 0x49, 0x6e, 0x76, 0x43 }, ucol_swapInverseUCA },/* dataFormat="InvC" */
#endif
#if !UCONFIG_NO_BREAK_ITERATION
    { { 0x42, 0x72, 0x6b, 0x20 }, ubrk_swap },          /* dataFormat="Brk " */
#endif
    { { 0x70, 0x6e, 0x61, 0x6d }, upname_swap },        /* dataFormat="pnam" */
    { { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames }     /* dataFormat="unam" */
};

/*
 * Looks up the data format from the UDataInfo that follows the first 4 header
 * bytes and dispatches to the matching entry of swapFns[].
 * Returns the value of the per-format swap function, or 0 with
 * *pErrorCode set if the header is bad or the format is unknown.
 */
static int32_t
udata_swap(const UDataSwapper *ds,
           const void *inData, int32_t length, void *outData,
           UErrorCode *pErrorCode) {
    char dataFormatChars[4];
    const UDataInfo *pInfo;
    int32_t headerSize, i, swappedLength;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /*
     * Preflight the header first; checks for illegal arguments, too.
     * Do not swap the header right away because the format-specific swapper
     * will swap it, get the headerSize again, and also use the header
     * information. Otherwise we would have to pass some of the information
     * and not be able to use the UDataSwapFn signature.
     */
    headerSize=udata_swapDataHeader(ds, inData, -1, NULL, pErrorCode);
    (void)headerSize; /* only the error code of the preflight is used here */

    /*
     * If we wanted udata_swap() to also handle non-loadable data like a UTrie,
     * then we could check here for further known magic values and structures.
     */
    if(U_FAILURE(*pErrorCode)) {
        return 0; /* the data format was not recognized */
    }

    pInfo=(const UDataInfo *)((const char *)inData+4);

    {
        /* convert the data format from ASCII to Unicode to the system charset */
        UChar u[4]={
            pInfo->dataFormat[0], pInfo->dataFormat[1],
            pInfo->dataFormat[2], pInfo->dataFormat[3]
        };

        if(uprv_isInvariantUString(u, 4)) {
            u_UCharsToChars(u, dataFormatChars, 4);
        } else {
            dataFormatChars[0]=dataFormatChars[1]=dataFormatChars[2]=dataFormatChars[3]='?';
        }
    }

    /*
     * dispatch to the swap function for the dataFormat
     * NOTE(review): the loop header and the comparison call were partly lost
     * from the source text and are reconstructed here.
     */
    for(i=0; i<LENGTHOF(swapFns); ++i) {
        if(0==uprv_memcmp(swapFns[i].dataFormat, pInfo->dataFormat, 4)) {
            swappedLength=swapFns[i].swapFn(ds, inData, length, outData, pErrorCode);

            if(U_FAILURE(*pErrorCode)) {
                udata_printError(ds, "udata_swap(): failure swapping data format %02x.%02x.%02x.%02x (\"%c%c%c%c\") - %s\n",
                                 pInfo->dataFormat[0], pInfo->dataFormat[1],
                                 pInfo->dataFormat[2], pInfo->dataFormat[3],
                                 dataFormatChars[0], dataFormatChars[1],
                                 dataFormatChars[2], dataFormatChars[3],
                                 u_errorName(*pErrorCode));
            } else if(swappedLength<(length-DEFAULT_PADDING_LENGTH)) {
                /* swapped less than expected */
                udata_printError(ds, "udata_swap() warning: swapped only %d out of %d bytes - data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n",
                                 swappedLength, length,
                                 pInfo->dataFormat[0], pInfo->dataFormat[1],
                                 pInfo->dataFormat[2], pInfo->dataFormat[3],
                                 dataFormatChars[0], dataFormatChars[1],
                                 dataFormatChars[2], dataFormatChars[3]);
            }

            return swappedLength;
        }
    }

    /* the dataFormat was not recognized */
    udata_printError(ds, "udata_swap(): unknown data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n",
                     pInfo->dataFormat[0], pInfo->dataFormat[1],
                     pInfo->dataFormat[2], pInfo->dataFormat[3],
                     dataFormatChars[0], dataFormatChars[1],
                     dataFormatChars[2], dataFormatChars[3]);

    *pErrorCode=U_UNSUPPORTED_ERROR;
    return 0;
}

/* swap .dat package files -------------------------------------------------- */

/**
 * Copy the basename of filename, without its ".dat" suffix, into pkg[].
 *
 * @param ds swapper, used only for error output
 * @param filename path of a package file; its basename must end with ".dat"
 * @param pkg output buffer, NUL-terminated on success
 * @param capacity size of pkg[] in bytes, including the terminating NUL
 * @param pErrorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR if the
 *                   name does not end with ".dat" or does not fit
 * @return the length of the package name, or 0 on failure
 */
static int32_t
extractPackageName(const UDataSwapper *ds, const char *filename,
                   char pkg[], int32_t capacity,
                   UErrorCode *pErrorCode) {
    const char *basename;
    int32_t len;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    basename=findBasename(filename);
    len=(int32_t)uprv_strlen(basename)-4; /* -4: subtract the length of ".dat" */

    if(len<=0 || 0!=uprv_strcmp(basename+len, ".dat")) {
        udata_printError(ds, "udata_swapPackage(): \"%s\" is not recognized as a package filename (must end with .dat)\n",
                         basename);
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(len>=capacity) {
        /* pass basename for the %s; it was missing, which is undefined behavior */
        udata_printError(ds, "udata_swapPackage(): the package name \"%s\" is too long (>=%ld)\n",
                         basename, (long)capacity);
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    uprv_memcpy(pkg, basename, len);
    pkg[len]=0;
    return len;
}

/* working copy of one ToC entry, with both the input and output data offsets */
struct ToCEntry {
    uint32_t nameOffset, inOffset, outOffset, length;
};

U_CDECL_BEGIN

/*
 * Comparison callback: orders ToC entries by their name strings.
 * context is the base address that the entries' nameOffset values refer to.
 */
static int32_t U_CALLCONV
compareToCEntries(const void *context, const void *left, const void *right) {
    const char *chars=(const char *)context;
    return (int32_t)uprv_strcmp(chars+((const ToCEntry *)left)->nameOffset,
                                chars+((const ToCEntry *)right)->nameOffset);
}

U_CDECL_END

/*
 * Swaps a "CmnD" format version 1 package: the header, the item count,
 * the ToC, the item name strings (with the package name prefix changed from
 * the input to the output filename's package name), and each enclosed item
 * via udata_swap(). Items that fail to swap are copied unchanged.
 * With length<0, only preflights and returns the package length.
 */
U_CFUNC int32_t U_CALLCONV
udata_swapPackage(const UDataSwapper *ds,
                  const void *inData, int32_t length, void *outData,
                  UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;

    const uint8_t *inBytes;
    uint8_t *outBytes;

    uint32_t itemCount, offset, i;
    int32_t itemLength;

    const UDataOffsetTOCEntry *inEntries;
    UDataOffsetTOCEntry *outEntries;

    ToCEntry *table;

    char inPkgName[32], outPkgName[32];
    int32_t inPkgNameLength, outPkgNameLength;

    /* udata_swapDataHeader checks the arguments */
    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    pInfo=(const UDataInfo *)((const char *)inData+4);
    if(!(
        pInfo->dataFormat[0]==0x43 &&   /* dataFormat="CmnD" */
        pInfo->dataFormat[1]==0x6d &&
        pInfo->dataFormat[2]==0x6e &&
        pInfo->dataFormat[3]==0x44 &&
        pInfo->formatVersion[0]==1
    )) {
        udata_printError(ds, "udata_swapPackage(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as an ICU .dat package\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    /*
     * We need to change the ToC name entries so that they have the correct
     * package name prefix.
     * Extract the package names from the in/out filenames.
     */
    inPkgNameLength=extractPackageName(
                        ds, inFilename,
                        inPkgName, (int32_t)sizeof(inPkgName),
                        pErrorCode);
    outPkgNameLength=extractPackageName(
                        ds, outFilename,
                        outPkgName, (int32_t)sizeof(outPkgName),
                        pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /*
     * It is possible to work with inPkgNameLength!=outPkgNameLength,
     * but then the length of the data file would change more significantly,
     * which we are not currently prepared for.
     */
    if(inPkgNameLength!=outPkgNameLength) {
        udata_printError(ds, "udata_swapPackage(): the package names \"%s\" and \"%s\" must have the same length\n",
                         inPkgName, outPkgName);
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    inBytes=(const uint8_t *)inData+headerSize;
    inEntries=(const UDataOffsetTOCEntry *)(inBytes+4);

    if(length<0) {
        /* preflighting */
        itemCount=ds->readUInt32(*(const uint32_t *)inBytes);
        if(itemCount==0) {
            /* no items: count only the item count and return */
            return headerSize+4;
        }

        /* read the last item's offset and preflight it */
        offset=ds->readUInt32(inEntries[itemCount-1].dataOffset);
        itemLength=udata_swap(ds, inBytes+offset, -1, NULL, pErrorCode);

        if(U_SUCCESS(*pErrorCode)) {
            return headerSize+offset+(uint32_t)itemLength;
        } else {
            return 0;
        }
    } else {
        /* check that the itemCount fits, then the ToC table, then at least the header of the last item */
        length-=headerSize;
        if(length<4) {
            /* itemCount does not fit */
            offset=0xffffffff;
            itemCount=0; /* make compilers happy */
        } else {
            itemCount=ds->readUInt32(*(const uint32_t *)inBytes);
            if(itemCount==0) {
                offset=4;
            } else if((uint32_t)length<(4+8*itemCount)) {
                /* ToC table does not fit */
                offset=0xffffffff;
            } else {
                /* offset of the last item plus at least 20 bytes for its header */
                offset=20+ds->readUInt32(inEntries[itemCount-1].dataOffset);
            }
        }

        /*
         * NOTE(review): the statements from this length check through the
         * "swap the item count" call were partly lost from the source text
         * and are reconstructed; confirm the message text and error code.
         */
        if((uint32_t)length<offset) {
            udata_printError(ds, "udata_swapPackage(): too few bytes (%d after header) for a .dat package\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        outBytes=(uint8_t *)outData+headerSize;

        /* swap the item count */
        ds->swapArray32(ds, inBytes, 4, outBytes, pErrorCode);

        if(itemCount==0) {
            /* no items: just return now */
            return headerSize+4;
        }

        /* swap the item name strings */
        offset=4+8*itemCount;
        itemLength=(int32_t)(ds->readUInt32(inEntries[0].dataOffset)-offset);
        udata_swapInvStringBlock(ds, inBytes+offset, itemLength, outBytes+offset, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            udata_printError(ds, "udata_swapPackage() failed to swap the data item name strings\n");
            return 0;
        }
        /* keep offset and itemLength in case we allocate and copy the strings below */

        /* swap the package names into the output charset */
        if(ds->outCharset!=U_CHARSET_FAMILY) {
            UDataSwapper *ds2;
            ds2=udata_openSwapper(TRUE, U_CHARSET_FAMILY, TRUE, ds->outCharset, pErrorCode);
            if(ds2!=NULL) {
                /* only use the helper swapper if it was actually created */
                ds2->swapInvChars(ds2, inPkgName, inPkgNameLength, inPkgName, pErrorCode);
                ds2->swapInvChars(ds2, outPkgName, outPkgNameLength, outPkgName, pErrorCode);
                udata_closeSwapper(ds2);
            }
            if(U_FAILURE(*pErrorCode)) {
                udata_printError(ds, "udata_swapPackage() failed to swap the input/output package names\n");
                /*
                 * Stop here: continuing would let the per-item loop below
                 * reset the error code and hide this failure.
                 */
                return 0;
            }
        }

        /* change the prefix of each ToC entry name from the old to the new package name */
        {
            char *entryName;

            /*
             * NOTE(review): the loop header and the base pointer of entryName
             * were lost from the source text. outBytes is used because the
             * swapped name strings were written there; for in-place swapping
             * it is the same address as inBytes.
             */
            for(i=0; i<itemCount; ++i) {
                entryName=(char *)outBytes+ds->readUInt32(inEntries[i].nameOffset);

                if(0==uprv_memcmp(entryName, inPkgName, inPkgNameLength)) {
                    uprv_memcpy(entryName, outPkgName, inPkgNameLength);
                } else {
                    udata_printError(ds, "udata_swapPackage() failed: ToC item %ld does not have the input package name as a prefix\n",
                                     (long)i);
                    *pErrorCode=U_INVALID_FORMAT_ERROR;
                    return 0;
                }
            }
        }

        /*
         * Allocate the ToC table and, if necessary, a temporary buffer for
         * pseudo-in-place swapping.
         *
         * We cannot swap in-place because:
         *
         * 1. If the swapping of an item fails mid-way, then in-place swapping
         * has destroyed its data.
         * Out-of-place swapping allows us to then copy its original data.
         *
         * 2. If swapping changes the charset family, then we must resort
         * not only the ToC table but also the data items themselves.
         * This requires a permutation and is best done with separate in/out
         * buffers.
         *
         * We swapped the strings above to avoid the malloc below if string swapping fails.
         */
        if(inData==outData) {
            /* +15: prepare for extra padding of a newly-last item */
            table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH);
            if(table!=NULL) {
                outBytes=(uint8_t *)(table+itemCount);

                /* copy the item count and the swapped strings */
                uprv_memcpy(outBytes, inBytes, 4);
                uprv_memcpy(outBytes+offset, inBytes+offset, itemLength);
            }
        } else {
            table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry));
        }
        if(table==NULL) {
            /* cast: the size expression is a size_t but the format uses %d */
            udata_printError(ds, "udata_swapPackage(): out of memory allocating %d bytes\n",
                             (int)(inData==outData ?
                                   itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH :
                                   itemCount*sizeof(ToCEntry)));
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        outEntries=(UDataOffsetTOCEntry *)(outBytes+4);

        /* read the ToC table */
        for(i=0; i<itemCount; ++i) {
            table[i].nameOffset=ds->readUInt32(inEntries[i].nameOffset);
            table[i].inOffset=ds->readUInt32(inEntries[i].dataOffset);
            if(i>0) {
                /* an item extends up to the start of the next one */
                table[i-1].length=table[i].inOffset-table[i-1].inOffset;
            }
        }
        table[itemCount-1].length=(uint32_t)length-table[itemCount-1].inOffset;

        if(ds->inCharset==ds->outCharset) {
            /* no charset swapping, no resorting: keep item offsets the same */
            for(i=0; i<itemCount; ++i) {
                table[i].outOffset=table[i].inOffset;
            }
        } else {
            /*
             * charset swapping: resort items by their swapped names
             *
             * NOTE(review): this whole branch was lost from the source text.
             * It is reconstructed from the comments in this file about
             * resorting and the 15 padding bytes, and from the presence of
             * compareToCEntries(); confirm against the original.
             */

            /*
             * Items must stay 16-aligned after being permuted, so the old
             * last item is padded up to a multiple of 16 bytes. main()
             * reserved the extra input bytes and set them to 0xaa.
             */
            if((length&0xf)!=0) {
                int32_t delta=16-(length&0xf);
                length+=delta;
                table[itemCount-1].length+=(uint32_t)delta;
            }

            /* remember where the first item starts before the order changes */
            offset=table[0].inOffset;

            /* sort the ToC entries by their names in the output charset */
            uprv_sortArray(table, (int32_t)itemCount, (int32_t)sizeof(ToCEntry),
                           compareToCEntries, outBytes, FALSE, pErrorCode);
            if(U_FAILURE(*pErrorCode)) {
                udata_printError(ds, "udata_swapPackage() failed to sort the ToC entries - %s\n",
                                 u_errorName(*pErrorCode));
                uprv_free(table);
                return 0;
            }

            /* assign consecutive outOffset values in the new order */
            for(i=0; i<itemCount; ++i) {
                table[i].outOffset=offset;
                offset+=table[i].length;
            }
        }

        /* write the output ToC table */
        for(i=0; i<itemCount; ++i) {
            ds->writeUInt32(&outEntries[i].nameOffset, table[i].nameOffset);
            ds->writeUInt32(&outEntries[i].dataOffset, table[i].outOffset);
        }

        /*
         * swap each data item
         * NOTE(review): the start of this loop body (copy, udata_swap() call)
         * was lost from the source text and is reconstructed.
         */
        for(i=0; i<itemCount; ++i) {
            /* first copy the item bytes to make sure that unreachable bytes are copied */
            uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length);

            /* swap the item */
            udata_swap(ds, inBytes+table[i].inOffset, (int32_t)table[i].length,
                       outBytes+table[i].outOffset, pErrorCode);

            if(U_FAILURE(*pErrorCode)) {
                if(ds->outCharset==U_CHARSET_FAMILY) {
                    /* the item name is printable only in this platform's charset */
                    udata_printError(ds, "warning: udata_swapPackage() failed to swap item \"%s\"\n"
                                         " at inOffset 0x%x length 0x%x - %s\n"
                                         " the data item will be copied, not swapped\n\n",
                                     (char *)outBytes+table[i].nameOffset,
                                     table[i].inOffset, table[i].length, u_errorName(*pErrorCode));
                } else {
                    udata_printError(ds, "warning: udata_swapPackage() failed to swap an item\n"
                                         " at inOffset 0x%x length 0x%x - %s\n"
                                         " the data item will be copied, not swapped\n\n",
                                     table[i].inOffset, table[i].length, u_errorName(*pErrorCode));
                }
                /* reset the error code, copy the data item, and continue */
                *pErrorCode=U_ZERO_ERROR;
                uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length);
            }
        }

        if(inData==outData) {
            /* copy the data from the temporary buffer to the in-place buffer */
            uprv_memcpy((uint8_t *)outData+headerSize, outBytes, length);
        }
        uprv_free(table);

        return headerSize+length;
    }
}

/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */