/******************************************************************** * COPYRIGHT: * Copyright (C) 2001-2011 IBM, Inc. All Rights Reserved. * ********************************************************************/ /******************************************************************************** * * File CALLCOLL.C * * Modification History: * Name Description * Andy Heninger First Version * ********************************************************************************* */ // // This program tests string collation and sort key generation performance. // Three APIs can be teste: ICU C , Unix strcoll, strxfrm and Windows LCMapString // A file of names is required as input, one per line. It must be in utf-8 or utf-16 format, // and include a byte order mark. Either LE or BE format is OK. // const char gUsageString[] = "usage: collperf options...\n" "-help Display this message.\n" "-file file_name utf-16 format file of names.\n" "-locale name ICU locale to use. Default is en_US\n" "-rules file_name Collation rules file (overrides locale)\n" "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n" " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n" "-win Run test using Windows native services. (ICU is default)\n" "-unix Run test using Unix strxfrm, strcoll services.\n" "-uselen Use API with string lengths. Default is null-terminated strings\n" "-usekeys Run tests using sortkeys rather than strcoll\n" "-strcmp Run tests using u_strcmp rather than strcoll\n" "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n" "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n" "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n" " under test at each call point. For measuring test overhead.\n" "-terse Terse numbers-only output. 
Intended for use by scripts.\n" "-french French accent ordering\n" "-frenchoff No French accent ordering (for use with French locales.)\n" "-norm Normalizing mode on\n" "-shifted Shifted mode\n" "-lower Lower case first\n" "-upper Upper case first\n" "-case Enable separate case level\n" "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n" "-keyhist Produce a table sort key size vs. string length\n" "-binsearch Binary Search timing test\n" "-keygen Sort Key Generation timing test\n" "-qsort Quicksort timing test\n" "-iter Iteration Performance Test\n" "-dump Display strings, sort keys and CEs.\n" ; #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef WIN32 #include #else // // Stubs for Windows API functions when building on UNIXes. // typedef int DWORD; inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} #include unsigned long timeGetTime() { struct timeval t; gettimeofday(&t, 0); unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares. val += t.tv_usec / 1000; return val; } inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} const int LCMAP_SORTKEY = 0; #define MAKELCID(a,b) 0 const int SORT_DEFAULT = 0; #endif // // Command line option variables // These global variables are set according to the options specified // on the command line by the user. char * opt_fName = 0; const char * opt_locale = "en_US"; int opt_langid = 0; // Defaults to value corresponding to opt_locale. char * opt_rules = 0; UBool opt_help = FALSE; int opt_loopCount = 1; int opt_iLoopCount = 1; UBool opt_terse = FALSE; UBool opt_qsort = FALSE; UBool opt_binsearch = FALSE; UBool opt_icu = TRUE; UBool opt_win = FALSE; // Run with Windows native functions. UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. 
UBool opt_uselen = FALSE;       // -uselen:    call the APIs with explicit string lengths instead of null-terminated strings.
UBool opt_usekeys = FALSE;      // -usekeys:   run tests using sort keys rather than strcoll.
UBool opt_strcmp = FALSE;       // -strcmp:    run tests using u_strcmp rather than strcoll.
UBool opt_strcmpCPO = FALSE;    // -strcmpCPO: run tests using u_strcmpCodePointOrder rather than strcoll.
UBool opt_norm = FALSE;         // -norm:      normalizing mode on.
UBool opt_keygen = FALSE;       // -keygen:    sort key generation timing test.
UBool opt_french = FALSE;       // -french:    French accent ordering.
UBool opt_frenchoff = FALSE;    // -frenchoff: no French accent ordering (for use with French locales).
UBool opt_shifted = FALSE;      // -shifted:   shifted mode.
UBool opt_lower = FALSE;        // -lower:     lower case first.
UBool opt_upper = FALSE;        // -upper:     upper case first.
UBool opt_case = FALSE;         // -case:      enable separate case level.
int opt_level = 0;              // -level n:   sort level 1..5; 0 means "not specified".
UBool opt_keyhist = FALSE;      // -keyhist:   produce a table of sort key size vs. string length.
UBool opt_itertest = FALSE;     // -iter:      iteration performance test.
UBool opt_dump = FALSE;         // -dump:      display strings, sort keys and CEs.

//
//   Definitions for the command line options
//
//   Each OptSpec ties an option name to the global variable that receives its
//   value.  'type' tells ProcessOptions() how to store through pVar:
//       FLAG   - pVar points to a UBool, set to TRUE when the option is seen.
//       NUM    - pVar points to an int, filled from the following argument.
//       STRING - pVar points to a char pointer, set to the following argument.
//
struct OptSpec {
    const char *name;
    enum {FLAG, NUM, STRING} type;
    void *pVar;
};

// Option table.  The entry with a null name terminates the table.
OptSpec opts[] = {
    {"-file", OptSpec::STRING, &opt_fName},
    {"-locale", OptSpec::STRING, &opt_locale},
    {"-langid", OptSpec::NUM, &opt_langid},
    {"-rules", OptSpec::STRING, &opt_rules},
    {"-qsort", OptSpec::FLAG, &opt_qsort},
    {"-binsearch", OptSpec::FLAG, &opt_binsearch},
    {"-iter", OptSpec::FLAG, &opt_itertest},
    {"-win", OptSpec::FLAG, &opt_win},
    {"-unix", OptSpec::FLAG, &opt_unix},
    {"-uselen", OptSpec::FLAG, &opt_uselen},
    {"-usekeys", OptSpec::FLAG, &opt_usekeys},
    {"-strcmp", OptSpec::FLAG, &opt_strcmp},
    {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO},
    {"-norm", OptSpec::FLAG, &opt_norm},
    {"-french", OptSpec::FLAG, &opt_french},
    {"-frenchoff", OptSpec::FLAG, &opt_frenchoff},
    {"-shifted", OptSpec::FLAG, &opt_shifted},
    {"-lower", OptSpec::FLAG, &opt_lower},
    {"-upper", OptSpec::FLAG, &opt_upper},
    {"-case", OptSpec::FLAG, &opt_case},
    {"-level", OptSpec::NUM, &opt_level},
    {"-keyhist", OptSpec::FLAG, &opt_keyhist},
    {"-keygen", OptSpec::FLAG, &opt_keygen},
    {"-loop", OptSpec::NUM, &opt_loopCount},
    {"-iloop", OptSpec::NUM, &opt_iLoopCount},
    {"-terse", OptSpec::FLAG, &opt_terse},
    {"-dump", OptSpec::FLAG, &opt_dump},
    {"-help", OptSpec::FLAG, &opt_help},
    {"-?", OptSpec::FLAG, &opt_help},
    {0, OptSpec::FLAG, 0}
};

//---------------------------------------------------------------------------
//
//    Global variables pointing to and describing the test file
//
//---------------------------------------------------------------------------

//
//    struct Line
// // Each line from the source file (containing a name, presumably) gets // one of these structs. // struct Line { UChar *name; int len; char *winSortKey; char *icuSortKey; char *unixSortKey; char *unixName; }; Line *gFileLines; // Ptr to array of Line structs, one per line in the file. int gNumFileLines; UCollator *gCol; DWORD gWinLCID; Line **gSortedLines; Line **gRandomLines; int gCount; //--------------------------------------------------------------------------- // // ProcessOptions() Function to read the command line options. // //--------------------------------------------------------------------------- UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) { int i; int argNum; const char *pArgName; OptSpec *pOpt; for (argNum=1; argNumname != 0; pOpt++) { if (strcmp(pOpt->name, pArgName) == 0) { switch (pOpt->type) { case OptSpec::FLAG: *(UBool *)(pOpt->pVar) = TRUE; break; case OptSpec::STRING: argNum ++; if (argNum >= argc) { fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); return FALSE; } *(const char **)(pOpt->pVar) = argv[argNum]; break; case OptSpec::NUM: argNum ++; if (argNum >= argc) { fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); return FALSE; } char *endp; i = strtol(argv[argNum], &endp, 0); if (endp == argv[argNum]) { fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); return FALSE; } *(int *)(pOpt->pVar) = i; } break; } } if (pOpt->name == 0) { fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); return FALSE; } } return TRUE; } //--------------------------------------------------------------------------------------- // // Comparison functions for use by qsort. // // Six flavors, ICU or Windows, SortKey or String Compare, Strings with length // or null terminated. 
// //--------------------------------------------------------------------------------------- int ICUstrcmpK(const void *a, const void *b) { gCount++; int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey); return t; } int ICUstrcmpL(const void *a, const void *b) { gCount++; UCollationResult t; t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); if (t == UCOL_LESS) return -1; if (t == UCOL_GREATER) return +1; return 0; } int ICUstrcmp(const void *a, const void *b) { gCount++; UCollationResult t; t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); if (t == UCOL_LESS) return -1; if (t == UCOL_GREATER) return +1; return 0; } int Winstrcmp(const void *a, const void *b) { gCount++; int t; t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); return t-2; } int UNIXstrcmp(const void *a, const void *b) { gCount++; int t; t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName); return t; } int WinstrcmpL(const void *a, const void *b) { gCount++; int t; t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); return t-2; } int WinstrcmpK(const void *a, const void *b) { gCount++; int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey); return t; } //--------------------------------------------------------------------------------------- // // Function for sorting the names (lines) into a random order. // Order is based on a hash of the ICU Sort key for the lines // The randomized order is used as input for the sorting timing tests. 
// //--------------------------------------------------------------------------------------- int ICURandomCmp(const void *a, const void *b) { char *ask = (*(Line **)a)->icuSortKey; char *bsk = (*(Line **)b)->icuSortKey; int aVal = 0; int bVal = 0; int retVal; while (*ask != 0) { aVal += aVal*37 + *ask++; } while (*bsk != 0) { bVal += bVal*37 + *bsk++; } retVal = -1; if (aVal == bVal) { retVal = 0; } else if (aVal > bVal) { retVal = 1; } return retVal; } //--------------------------------------------------------------------------------------- // // doKeyGen() Key Generation Timing Test // //--------------------------------------------------------------------------------------- void doKeyGen() { int line; int loops = 0; int iLoop; int t; int len=-1; // Adjust loop count to compensate for file size. Should be order n double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines)); int adj_loopCount = int(dLoopCount); if (adj_loopCount < 1) adj_loopCount = 1; unsigned long startTime = timeGetTime(); if (opt_win) { for (loops=0; loopsname, (gSortedLines[guess])->name); } gCount++; if (r== 0) break; if (r < 0) hi = guess; else lo = guess; } } } elapsedTime = timeGetTime() - startTime; break; } if (opt_icu) { unsigned long startTime = timeGetTime(); UCollationResult r = UCOL_EQUAL; for (loops=0; loopslen; } int hi = gNumFileLines-1; int lo = 0; int guess = -1; for (;;) { int newGuess = (hi + lo) / 2; if (newGuess == guess) break; guess = newGuess; int ri = 0; if (opt_usekeys) { for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey); } gCount++; r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;} } else { if (opt_uselen) { guessLen = (gSortedLines[guess])->len; } for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); } gCount++; } if (r== UCOL_EQUAL) break; if (r == 
UCOL_LESS) hi = guess; else lo = guess; } } } elapsedTime = timeGetTime() - startTime; break; } if (opt_win) { unsigned long startTime = timeGetTime(); int r = 0; for (loops=0; loopslen; } int hi = gNumFileLines-1; int lo = 0; int guess = -1; for (;;) { int newGuess = (hi + lo) / 2; if (newGuess == guess) break; guess = newGuess; if (opt_usekeys) { for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey); } gCount++; r+=2; } else { if (opt_uselen) { guessLen = (gSortedLines[guess])->len; } for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); } if (r == 0) { if (opt_terse == FALSE) { fprintf(stderr, "Error returned from Windows CompareStringW.\n"); } exit(-1); } gCount++; } if (r== 2) // strings == break; if (r == 1) // line < guess hi = guess; else // line > guess lo = guess; } } } elapsedTime = timeGetTime() - startTime; break; } if (opt_unix) { unsigned long startTime = timeGetTime(); int r = 0; for (loops=0; loopsunixSortKey, (gSortedLines[guess])->unixSortKey); } gCount++; } else { for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName); } errno = 0; if (errno != 0) { fprintf(stderr, "Error %d returned from strcoll.\n", errno); exit(-1); } gCount++; } if (r == 0) // strings == break; if (r < 0) // line < guess hi = guess; else // line > guess lo = guess; } } } elapsedTime = timeGetTime() - startTime; break; } break; } int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); if (opt_terse == FALSE) { printf("binary search: total # of string compares = %d\n", gCount); printf("binary search: compares per loop = %d\n", gCount / loops); printf("binary search: time per compare = %d ns\n", ns); } else { printf("%d, ", ns); } } 
//--------------------------------------------------------------------------------------- // // doQSort() The quick sort timing test. Uses the C library qsort function. // //--------------------------------------------------------------------------------------- void doQSort() { int i; Line **sortBuf = new Line *[gNumFileLines]; // Adjust loop count to compensate for file size. QSort should be n log(n) double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines)); if (opt_usekeys) dLoopCount *= 5; int adj_loopCount = int(dLoopCount); if (adj_loopCount < 1) adj_loopCount = 1; gCount = 0; unsigned long startTime = timeGetTime(); if (opt_win && opt_usekeys) { for (i=0; i maxLen) maxLen = gFileLines[i].len; } // Allocate arrays to hold the histogram data int *accumulatedLen = new int[maxLen+1]; int *numKeysOfSize = new int[maxLen+1]; for (i=0; i<=maxLen; i++) { accumulatedLen[i] = 0; numKeysOfSize[i] = 0; } // Fill the arrays... for (i=0; i 0) { printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i], (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i)); } } delete []accumulatedLen; delete []numKeysOfSize ; } //--------------------------------------------------------------------------------------- // // doForwardIterTest(UBool) Forward iteration test // argument null-terminated string used // //--------------------------------------------------------------------------------------- void doForwardIterTest(UBool haslen) { int count = 0; UErrorCode error = U_ZERO_ERROR; printf("\n\nPerforming forward iteration performance test with "); if (haslen) { printf("non-null terminated data -----------\n"); } else { printf("null terminated data -----------\n"); } printf("performance test on strings from file -----------\n"); UChar dummytext[] = {0, 0}; UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); ucol_setText(iter, dummytext, 1, &error); gCount = 0; unsigned long startTime = timeGetTime(); 
while (count < opt_loopCount) { int linecount = 0; while (linecount < gNumFileLines) { UChar *str = gFileLines[linecount].name; int strlen = haslen?gFileLines[linecount].len:-1; ucol_setText(iter, str, strlen, &error); while (ucol_next(iter, &error) != UCOL_NULLORDER) { gCount++; } linecount ++; } count ++; } unsigned long elapsedTime = timeGetTime() - startTime; printf("elapsedTime %ld\n", elapsedTime); // empty loop recalculation count = 0; startTime = timeGetTime(); while (count < opt_loopCount) { int linecount = 0; while (linecount < gNumFileLines) { UChar *str = gFileLines[linecount].name; int strlen = haslen?gFileLines[linecount].len:-1; ucol_setText(iter, str, strlen, &error); linecount ++; } count ++; } elapsedTime -= (timeGetTime() - startTime); printf("elapsedTime %ld\n", elapsedTime); ucol_closeElements(iter); int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); printf("Total number of strings compared %d in %d loops\n", gNumFileLines, opt_loopCount); printf("Average time per ucol_next() nano seconds %d\n", ns); printf("performance test on skipped-5 concatenated strings from file -----------\n"); UChar *str; int strlen = 0; // appending all the strings int linecount = 0; while (linecount < gNumFileLines) { strlen += haslen?gFileLines[linecount].len: u_strlen(gFileLines[linecount].name); linecount ++; } str = (UChar *)malloc(sizeof(UChar) * strlen); int strindex = 0; linecount = 0; while (strindex < strlen) { int len = 0; len += haslen?gFileLines[linecount].len: u_strlen(gFileLines[linecount].name); memcpy(str + strindex, gFileLines[linecount].name, sizeof(UChar) * len); strindex += len; linecount ++; } printf("Total size of strings %d\n", strlen); gCount = 0; count = 0; if (!haslen) { strlen = -1; } iter = ucol_openElements(gCol, str, strlen, &error); if (!haslen) { strlen = u_strlen(str); } strlen -= 5; // any left over characters are not iterated, // this is to ensure the backwards and forwards iterators // gets the same position 
startTime = timeGetTime(); while (count < opt_loopCount) { int count5 = 5; strindex = 0; ucol_setOffset(iter, strindex, &error); while (TRUE) { if (ucol_next(iter, &error) == UCOL_NULLORDER) { break; } gCount++; count5 --; if (count5 == 0) { strindex += 10; if (strindex > strlen) { break; } ucol_setOffset(iter, strindex, &error); count5 = 5; } } count ++; } elapsedTime = timeGetTime() - startTime; printf("elapsedTime %ld\n", elapsedTime); // empty loop recalculation int tempgCount = 0; count = 0; startTime = timeGetTime(); while (count < opt_loopCount) { int count5 = 5; strindex = 0; ucol_setOffset(iter, strindex, &error); while (TRUE) { tempgCount ++; count5 --; if (count5 == 0) { strindex += 10; if (strindex > strlen) { break; } ucol_setOffset(iter, strindex, &error); count5 = 5; } } count ++; } elapsedTime -= (timeGetTime() - startTime); printf("elapsedTime %ld\n", elapsedTime); ucol_closeElements(iter); printf("gCount %d\n", gCount); ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); printf("Average time per ucol_next() nano seconds %d\n", ns); } //--------------------------------------------------------------------------------------- // // doBackwardIterTest(UBool) Backwards iteration test // argument null-terminated string used // //--------------------------------------------------------------------------------------- void doBackwardIterTest(UBool haslen) { int count = 0; UErrorCode error = U_ZERO_ERROR; printf("\n\nPerforming backward iteration performance test with "); if (haslen) { printf("non-null terminated data -----------\n"); } else { printf("null terminated data -----------\n"); } printf("performance test on strings from file -----------\n"); UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); UChar dummytext[] = {0, 0}; ucol_setText(iter, dummytext, 1, &error); gCount = 0; unsigned long startTime = timeGetTime(); while (count < opt_loopCount) { int linecount = 0; while (linecount < gNumFileLines) { UChar *str = 
gFileLines[linecount].name; int strlen = haslen?gFileLines[linecount].len:-1; ucol_setText(iter, str, strlen, &error); while (ucol_previous(iter, &error) != UCOL_NULLORDER) { gCount ++; } linecount ++; } count ++; } unsigned long elapsedTime = timeGetTime() - startTime; printf("elapsedTime %ld\n", elapsedTime); // empty loop recalculation count = 0; startTime = timeGetTime(); while (count < opt_loopCount) { int linecount = 0; while (linecount < gNumFileLines) { UChar *str = gFileLines[linecount].name; int strlen = haslen?gFileLines[linecount].len:-1; ucol_setText(iter, str, strlen, &error); linecount ++; } count ++; } elapsedTime -= (timeGetTime() - startTime); printf("elapsedTime %ld\n", elapsedTime); ucol_closeElements(iter); int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); printf("Total number of strings compared %d in %d loops\n", gNumFileLines, opt_loopCount); printf("Average time per ucol_previous() nano seconds %d\n", ns); printf("performance test on skipped-5 concatenated strings from file -----------\n"); UChar *str; int strlen = 0; // appending all the strings int linecount = 0; while (linecount < gNumFileLines) { strlen += haslen?gFileLines[linecount].len: u_strlen(gFileLines[linecount].name); linecount ++; } str = (UChar *)malloc(sizeof(UChar) * strlen); int strindex = 0; linecount = 0; while (strindex < strlen) { int len = 0; len += haslen?gFileLines[linecount].len: u_strlen(gFileLines[linecount].name); memcpy(str + strindex, gFileLines[linecount].name, sizeof(UChar) * len); strindex += len; linecount ++; } printf("Total size of strings %d\n", strlen); gCount = 0; count = 0; if (!haslen) { strlen = -1; } iter = ucol_openElements(gCol, str, strlen, &error); if (!haslen) { strlen = u_strlen(str); } startTime = timeGetTime(); while (count < opt_loopCount) { int count5 = 5; strindex = 5; ucol_setOffset(iter, strindex, &error); while (TRUE) { if (ucol_previous(iter, &error) == UCOL_NULLORDER) { break; } gCount ++; count5 --; if (count5 
== 0) { strindex += 10; if (strindex > strlen) { break; } ucol_setOffset(iter, strindex, &error); count5 = 5; } } count ++; } elapsedTime = timeGetTime() - startTime; printf("elapsedTime %ld\n", elapsedTime); // empty loop recalculation count = 0; int tempgCount = 0; startTime = timeGetTime(); while (count < opt_loopCount) { int count5 = 5; strindex = 5; ucol_setOffset(iter, strindex, &error); while (TRUE) { tempgCount ++; count5 --; if (count5 == 0) { strindex += 10; if (strindex > strlen) { break; } ucol_setOffset(iter, strindex, &error); count5 = 5; } } count ++; } elapsedTime -= (timeGetTime() - startTime); printf("elapsedTime %ld\n", elapsedTime); ucol_closeElements(iter); printf("gCount %d\n", gCount); ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); printf("Average time per ucol_previous() nano seconds %d\n", ns); } //--------------------------------------------------------------------------------------- // // doIterTest() Iteration test // //--------------------------------------------------------------------------------------- void doIterTest() { doForwardIterTest(opt_uselen); doBackwardIterTest(opt_uselen); } //---------------------------------------------------------------------------------------- // // UnixConvert -- Convert the lines of the file to the encoding for UNIX // Since it appears that Unicode support is going in the general // direction of the use of UTF-8 locales, that is the approach // that is used here. // //---------------------------------------------------------------------------------------- void UnixConvert() { int line; UConverter *cvrtr; // An ICU code page converter. UErrorCode status = U_ZERO_ERROR; cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now. if (U_FAILURE(status)) { fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status)); exit(-1); } for (line=0; line < gNumFileLines; line++) { int sizeNeeded = ucnv_fromUChars(cvrtr, 0, // ptr to target buffer. 
0, // length of target buffer. gFileLines[line].name, -1, // source is null terminated &status); if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) { //fprintf(stderr, "Conversion from Unicode, something is wrong.\n"); //exit(-1); } status = U_ZERO_ERROR; gFileLines[line].unixName = new char[sizeNeeded+1]; sizeNeeded = ucnv_fromUChars(cvrtr, gFileLines[line].unixName, // ptr to target buffer. sizeNeeded+1, // length of target buffer. gFileLines[line].name, -1, // source is null terminated &status); if (U_FAILURE(status)) { fprintf(stderr, "ICU Conversion Failed.: %d\n", status); exit(-1); } gFileLines[line].unixName[sizeNeeded] = 0; }; ucnv_close(cvrtr); } //---------------------------------------------------------------------------------------- // // class UCharFile Class to hide all the gorp to read a file in // and produce a stream of UChars. // //---------------------------------------------------------------------------------------- class UCharFile { public: UCharFile(const char *fileName); ~UCharFile(); UChar get(); UBool eof() {return fEof;}; UBool error() {return fError;}; private: UCharFile (const UCharFile & /*other*/) {}; // No copy constructor. UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No assignment op FILE *fFile; const char *fName; UBool fEof; UBool fError; UChar fPending2ndSurrogate; enum {UTF16LE, UTF16BE, UTF8} fEncoding; }; UCharFile::UCharFile(const char * fileName) { fEof = FALSE; fError = FALSE; fName = fileName; fFile = fopen(fName, "rb"); fPending2ndSurrogate = 0; if (fFile == NULL) { fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); fError = TRUE; return; } // // Look for the byte order mark at the start of the file. 
// int BOMC1, BOMC2, BOMC3; BOMC1 = fgetc(fFile); BOMC2 = fgetc(fFile); if (BOMC1 == 0xff && BOMC2 == 0xfe) { fEncoding = UTF16LE; } else if (BOMC1 == 0xfe && BOMC2 == 0xff) { fEncoding = UTF16BE; } else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { fEncoding = UTF8; } else { fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " "must include a BOM.\n", fileName); fError = true; return; } } UCharFile::~UCharFile() { fclose(fFile); } UChar UCharFile::get() { UChar c; switch (fEncoding) { case UTF16LE: { int cL, cH; cL = fgetc(fFile); cH = fgetc(fFile); c = cL | (cH << 8); if (cH == EOF) { c = 0; fEof = TRUE; } break; } case UTF16BE: { int cL, cH; cH = fgetc(fFile); cL = fgetc(fFile); c = cL | (cH << 8); if (cL == EOF) { c = 0; fEof = TRUE; } break; } case UTF8: { if (fPending2ndSurrogate != 0) { c = fPending2ndSurrogate; fPending2ndSurrogate = 0; break; } int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. if (ch == EOF) { c = 0; fEof = TRUE; break; } if (ch <= 0x7f) { // It's ascii. No further utf-8 conversion. c = ch; break; } // Figure out the lenght of the char and read the rest of the bytes // into a temp array. int nBytes; if (ch >= 0xF0) {nBytes=4;} else if (ch >= 0xE0) {nBytes=3;} else if (ch >= 0xC0) {nBytes=2;} else { fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); fError = TRUE; return 0; } unsigned char bytes[10]; bytes[0] = (unsigned char)ch; int i; for (i=1; i= 0xc0) { fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); fError = TRUE; return 0; } } // Convert the bytes from the temp array to a Unicode char. i = 0; uint32_t cp; UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp); c = (UChar)cp; if (cp >= 0x10000) { // The code point needs to be broken up into a utf-16 surrogate pair. // Process first half this time through the main loop, and // remember the other half for the next time through. 
UChar utf16Buf[3]; i = 0; UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); fPending2ndSurrogate = utf16Buf[1]; c = utf16Buf[0]; } break; }; default: c = 0xFFFD; /* Error, unspecified codepage*/ fprintf(stderr, "UCharFile: Error: unknown fEncoding\n"); exit(1); } return c; } //---------------------------------------------------------------------------------------- // // openRulesCollator - Command line specified a rules file. Read it in // and open a collator with it. // //---------------------------------------------------------------------------------------- UCollator *openRulesCollator() { UCharFile f(opt_rules); if (f.error()) { return 0; } int bufLen = 10000; UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar)); UChar *tmp; int i = 0; for(;;) { buf[i] = f.get(); if (f.eof()) { break; } if (f.error()) { return 0; } i++; if (i >= bufLen) { tmp = buf; bufLen += 10000; buf = (UChar *)realloc(buf, bufLen); if (buf == NULL) { free(tmp); return 0; } } } buf[i] = 0; UErrorCode status = U_ZERO_ERROR; UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status); if (U_FAILURE(status)) { fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status); return 0; } free(buf); return coll; } //---------------------------------------------------------------------------------------- // // Main -- process command line, read in and pre-process the test file, // call other functions to do the actual tests. // //---------------------------------------------------------------------------------------- int main(int argc, const char** argv) { if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { printf(gUsageString); exit (1); } // Make sure that we've only got one API selected. 
if (opt_unix || opt_win) opt_icu = FALSE; if (opt_unix) opt_win = FALSE; // // Set up an ICU collator // UErrorCode status = U_ZERO_ERROR; if (opt_rules != 0) { gCol = openRulesCollator(); if (gCol == 0) {return -1;} } else { gCol = ucol_open(opt_locale, &status); if (U_FAILURE(status)) { fprintf(stderr, "Collator creation failed.: %d\n", status); return -1; } } if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); } if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); } if (opt_norm) { ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); } if (opt_french && opt_frenchoff) { fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options."); exit(-1); } if (opt_french) { ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status); } if (opt_frenchoff) { ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status); } if (opt_lower) { ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status); } if (opt_upper) { ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status); } if (opt_case) { ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status); } if (opt_shifted) { ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status); } if (opt_level != 0) { switch (opt_level) { case 1: ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status); break; case 2: ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status); break; case 3: ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status); break; case 4: ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status); break; case 5: ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status); break; default: fprintf(stderr, "-level param must be between 1 and 5\n"); exit(-1); } } if (U_FAILURE(status)) { fprintf(stderr, "Collator attribute setting failed.: %d\n", status); return 
-1; } // // Set up a Windows LCID // if (opt_langid != 0) { gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); } else { gWinLCID = uloc_getLCID(opt_locale); } // // Set the UNIX locale // if (opt_unix) { if (setlocale(LC_ALL, opt_locale) == 0) { fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); exit(-1); } } // Read in the input file. // File assumed to be utf-16. // Lines go onto heap buffers. Global index array to line starts is created. // Lines themselves are null terminated. // UCharFile f(opt_fName); if (f.error()) { exit(-1); } const int MAXLINES = 100000; gFileLines = new Line[MAXLINES]; UChar buf[1024]; int column = 0; // Read the file, split into lines, and save in memory. // Loop runs once per utf-16 value from the input file, // (The number of bytes read from file per loop iteration depends on external encoding.) for (;;) { UChar c = f.get(); if (f.error()){ exit(-1); } // We now have a good UTF-16 value in c. // Watch for CR, LF, EOF; these finish off a line. if (c == 0xd) { continue; } if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line separators! buf[column++] = 0; if (column > 1) { gFileLines[gNumFileLines].name = new UChar[column]; gFileLines[gNumFileLines].len = column-1; memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar)); gNumFileLines++; column = 0; if (gNumFileLines >= MAXLINES) { fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES); exit(-1); } } if (c == 0xa || c == 0x2028) continue; else break; // EOF } buf[column++] = c; if (column >= 1023) { static UBool warnFlag = TRUE; if (warnFlag) { fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n"); warnFlag = FALSE; } column--; } } if (opt_terse == FALSE) { printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines); } // Convert the lines to the UNIX encoding. if (opt_unix) { UnixConvert(); } // // Pre-compute ICU sort keys for the lines of the file. 
// int line; int32_t t; for (line=0; line (int32_t)sizeof(buf)) { t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t); } else { memcpy(gFileLines[line].icuSortKey, buf, t); } } // // Pre-compute Windows sort keys for the lines of the file. // for (line=0; line (int32_t)sizeof(buf)) { t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t); } else { memcpy(gFileLines[line].winSortKey, buf, t); } } // // Pre-compute UNIX sort keys for the lines of the file. // if (opt_unix) { for (line=0; line (int32_t)sizeof(buf)) { t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf)); } else { memcpy(gFileLines[line].unixSortKey, buf, t); } } } // // Dump file lines, CEs, Sort Keys if requested. // if (opt_dump) { int i; for (line=0; line 0x7e) { printf("\\u%.4x", c); } else { printf("%c", c); } } printf("\n"); printf(" CEs: "); UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status); int32_t ce; i = 0; for (;;) { ce = ucol_next(CEiter, &status); if (ce == UCOL_NULLORDER) { break; } printf(" %.8x", ce); if (++i > 8) { printf("\n "); i = 0; } } printf("\n"); ucol_closeElements(CEiter); printf(" ICU Sort Key: "); for (i=0; ; i++) { unsigned char c = gFileLines[line].icuSortKey[i]; printf("%02x ", c); if (c == 0) { break; } if (i > 0 && i % 20 == 0) { printf("\n "); } } printf("\n"); } } // // Pre-sort the lines. // int i; gSortedLines = new Line *[gNumFileLines]; for (i=0; i