scuffed-code/icu4c/source/test/collperf/collperf.cpp

/********************************************************************
 * COPYRIGHT:
 * Copyright (C) 2001 IBM, Inc.   All Rights Reserved.
 *
 ********************************************************************/
/********************************************************************************
*
* File CALLCOLL.C
*
* Modification History:
*        Name                     Description
*     Andy Heninger             First Version
*
*********************************************************************************
*/

//
//  This program tests string collation and sort key generation performance.
//      Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
//      A file of names is required as input, one per line.  It must be in utf-16 format, and
//      include a byte order mark.  Either LE or BE format is OK.
//
//      Usage:
//         collperf options...
//            -file file_name            utf-16 format file of names to sort/search
//            -locale name               ICU locale to use.  Default is en_US
//            -langid 0x1234             Windows Language ID number.  Default 0x409 (en_US)
//                                          see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm
//            -win                       Run test using Windows native services.  (ICU is default)
//            -unix                      Run test using Unix strxfrm, strcoll services.
//            -uselen                    Use API with string lengths.  Default is null-terminated strings
//            -usekeys                   Run tests using sortkeys rather than strcoll
//            -loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.
//            -terse                     Terse numbers-only output.  Intended for use by scripts.
//            -help                      Display this message.
//            -qsort                     Quicksort timing test
//            -binsearch                 Binary Search timing test
//            -keygen                    Sort Key Generation timing test
//            -french                    French accent ordering
//            -norm                      Normalizing mode on
//            -shifted                   Shifted mode
//            -lower                     Lower case first
//            -upper                     Upper case first
//            -case                      Enable separate case level
//            -level n                   Sort level, 1 to 5, for Primary, Secondary, Tertiary, Quaternary, Identical
//            -keyhist                   Sort Key size histogram


#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <locale.h>
#include <errno.h>

#include <unicode/utypes.h>
#include <unicode/ucol.h>
#include <unicode/uloc.h>
#include <unicode/ustring.h>
#include <unicode/ures.h>
#include <unicode/uchar.h>
#include <unicode/ucnv.h>

#ifdef WIN32
#include <windows.h>
#else
//
//  Stubs for Windows API functions when building on UNIXes.
//
// Stand-in for the Windows DWORD type so the Windows-style declarations below compile.
typedef int DWORD;
// Do-nothing replacement for the Windows string comparison call.  Ignores all
// of its arguments and always returns 0.
inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
#include <sys/time.h>
//
//  timeGetTime()   Stand-in for the Windows function of the same name.
//                  Returns the time reported by gettimeofday(), in milliseconds.
//                  The value may wrap around; callers only use the difference
//                  between two readings.
//
unsigned long timeGetTime() {
    struct timeval t;
    gettimeofday(&t, 0);
    // Convert to unsigned before multiplying so that any wrap-around is well
    // defined modular arithmetic rather than signed overflow.
    unsigned long val = (unsigned long)t.tv_sec * 1000UL;
    val += (unsigned long)t.tv_usec / 1000UL;
    return val;
}
// Do-nothing replacement for the Windows sort key generation call.  Ignores
// all of its arguments and always returns 0.
inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
// Placeholder definitions so that code written against the Windows API compiles here.
const int LCMAP_SORTKEY = 0;
#define MAKELCID(a,b) 0
const int SORT_DEFAULT = 0;
#endif


//
//  Command line option variables
//     These global variables are set according to the options specified
//     on the command line by the user.
char * opt_fName      = 0;          // -file       name of the input file; usage is printed if it stays 0
char * opt_locale     = "en_US";    // -locale     ICU locale name
int    opt_langid     = 0x409;      // English, US
UBool  opt_help       = FALSE;      // -help, -?   display the usage message
int    opt_loopCount  = 1;          // -loop       base loop count; the tests scale it by file size
UBool  opt_terse      = FALSE;      // -terse      numbers-only output
UBool  opt_qsort      = FALSE;      // -qsort      quicksort timing test
UBool  opt_binsearch  = FALSE;      // -binsearch  binary search timing test
UBool  opt_icu        = TRUE;       // ICU is the default API; no entry in opts[] sets this directly
UBool  opt_win        = FALSE;      // Run with Windows native functions.
UBool  opt_unix       = FALSE;      // Run with UNIX strcoll, strxfrm functions.
UBool  opt_uselen     = FALSE;      // -uselen     pass explicit string lengths instead of -1
UBool  opt_usekeys    = FALSE;      // -usekeys    compare sort keys rather than strings
UBool  opt_norm       = FALSE;      // -norm       normalizing mode on
UBool  opt_keygen     = FALSE;      // -keygen     sort key generation timing test
UBool  opt_french     = FALSE;      // -french     French accent ordering
UBool  opt_shifted    = FALSE;      // -shifted    shifted mode
UBool  opt_lower      = FALSE;      // -lower      lower case first
UBool  opt_upper      = FALSE;      // -upper      upper case first
UBool  opt_case       = FALSE;      // -case       enable separate case level
int    opt_level      = 0;          // -level      sort level, 1 to 5; 0 when not given
UBool  opt_keyhist    = FALSE;      // -keyhist    sort key size histogram


//
//   Definitions for the command line options
//
// One entry of the command line option table.
struct OptSpec {
    const char *name;                   // Option text as typed by the user, e.g. "-file"; 0 ends the table.
    enum {FLAG, NUM, STRING} type;      // FLAG takes no value; NUM and STRING consume the next argument.
    void *pVar;                         // Variable to set: UBool for FLAG, int for NUM, char * for STRING.
};

// Table of all recognized command line options, mapping each option name to
// the global variable that ProcessOptions() stores its value in.
// "-help" and "-?" both set opt_help.  The entry with a 0 name ends the table.
OptSpec opts[] = {
    {"-file",        OptSpec::STRING, &opt_fName},
    {"-locale",      OptSpec::STRING, &opt_locale},
    {"-langid",      OptSpec::NUM,    &opt_langid},
    {"-qsort",       OptSpec::FLAG,   &opt_qsort},
    {"-binsearch",   OptSpec::FLAG,   &opt_binsearch},
    {"-win",         OptSpec::FLAG,   &opt_win},
    {"-unix",        OptSpec::FLAG,   &opt_unix},
    {"-uselen",      OptSpec::FLAG,   &opt_uselen},
    {"-usekeys",     OptSpec::FLAG,   &opt_usekeys},
    {"-norm",        OptSpec::FLAG,   &opt_norm},
    {"-french",      OptSpec::FLAG,   &opt_french},
    {"-shifted",     OptSpec::FLAG,   &opt_shifted},
    {"-lower",       OptSpec::FLAG,   &opt_lower},
    {"-upper",       OptSpec::FLAG,   &opt_upper},
    {"-case",        OptSpec::FLAG,   &opt_case},
    {"-level",       OptSpec::NUM,    &opt_level},
    {"-keyhist",     OptSpec::FLAG,   &opt_keyhist},
    {"-keygen",      OptSpec::FLAG,   &opt_keygen},
    {"-loop",        OptSpec::NUM,    &opt_loopCount},
    {"-terse",       OptSpec::FLAG,   &opt_terse},
    {"-help",        OptSpec::FLAG,   &opt_help},
    {"-?",           OptSpec::FLAG,   &opt_help},
    {0, OptSpec::FLAG, 0}
};


//---------------------------------------------------------------------------
//
//  Global variables pointing to and describing the test file
//
//---------------------------------------------------------------------------

//
//   struct Line
//
//      Each line from the source file (containing a name, presumably) gets
//      one of these structs.
//
struct  Line {
    UChar     *name;          // The line's text; passed to the collation APIs with length -1 when -uselen is off.
    int        len;           // Length of name, passed to the collation APIs when -uselen is on.
    char      *winSortKey;    // Buffer that doKeyGen() fills through LCMapStringW; compared with strcmp.
    char      *icuSortKey;    // Buffer that doKeyGen() fills through ucol_getSortKey; compared with strcmp.
    char      *unixSortKey;   // Buffer that doKeyGen() fills through strxfrm; compared with strcmp.
    char      *unixName;      // UTF-8 copy of name, allocated by UnixConvert(); used with strcoll and strxfrm.
};


Line          *gFileLines;           // Ptr to array of Line structs, one per line in the file.
int            gNumFileLines;        // Number of entries in gFileLines (and in the two arrays below).
UCollator     *gCol;                 // ICU collator used for all ucol_strcoll / ucol_getSortKey calls.
DWORD          gWinLCID;             // Locale ID passed to CompareStringW and LCMapStringW.

Line          **gSortedLines;        // Pointers to the lines; searched by doBinarySearch().
Line          **gRandomLines;        // Pointers to the lines; copied as the input of each doQSort() pass.
int            gCount;               // Running count of comparisons made by the current test.


//---------------------------------------------------------------------------
//
//  ProcessOptions()    Function to read the command line options.
//
//---------------------------------------------------------------------------
//
//   argc, argv   The command line, as received by main().  argv[0] is skipped.
//   opts         Table of recognized options, ended by an entry whose name is 0.
//
//   Each argument must match an option name from the table exactly.  A FLAG
//   option sets its UBool variable to TRUE.  STRING and NUM options consume the
//   following argument as their value.
//
//   Returns TRUE when every argument was accepted.  Otherwise a message is
//   written to stderr and FALSE is returned.
//
UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
{
    int argIdx = 1;
    while (argIdx < argc) {
        const char *argText = argv[argIdx];

        // Find the table entry for this argument; stop on the terminator if there is none.
        OptSpec *spec = opts;
        while (spec->name != 0 && strcmp(spec->name, argText) != 0) {
            spec++;
        }
        if (spec->name == 0) {
            fprintf(stderr, "Unrecognized option \"%s\"\n", argText);
            return FALSE;
        }

        if (spec->type == OptSpec::FLAG) {
            *(UBool *)(spec->pVar) = TRUE;
        }
        else if (spec->type == OptSpec::STRING) {
            // The value is the next argument; it is stored by pointer, not copied.
            argIdx++;
            if (argIdx >= argc) {
                fprintf(stderr, "value expected for \"%s\" option.\n", spec->name);
                return FALSE;
            }
            *(const char **)(spec->pVar) = argv[argIdx];
        }
        else if (spec->type == OptSpec::NUM) {
            argIdx++;
            if (argIdx >= argc) {
                fprintf(stderr, "value expected for \"%s\" option.\n", spec->name);
                return FALSE;
            }
            // Base 0 lets strtol accept decimal, octal (0...) and hex (0x...) values.
            char *numEnd;
            int   parsed = strtol(argv[argIdx], &numEnd, 0);
            if (numEnd == argv[argIdx]) {
                fprintf(stderr, "integer value expected for \"%s\" option.\n", spec->name);
                return FALSE;
            }
            *(int *)(spec->pVar) = parsed;
        }

        argIdx++;
    }
    return TRUE;
}

//---------------------------------------------------------------------------------------
//
//   Comparison functions for use by qsort.
//
//       Seven flavors: ICU or Windows, each with SortKey or String Compare (strings
//           with length or null terminated), plus UNIX strcoll.
//
//---------------------------------------------------------------------------------------
int ICUstrcmpK(const void *a, const void *b) {
    gCount++;
    int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
    return t;
}


int ICUstrcmpL(const void *a, const void *b) {
    gCount++;
    UCollationResult t;
    t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
    if (t == UCOL_LESS) return -1;
    if (t == UCOL_GREATER) return +1;
    return 0;
}


int ICUstrcmp(const void *a, const void *b) {
    gCount++;
    UCollationResult t;
    t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
    if (t == UCOL_LESS) return -1;
    if (t == UCOL_GREATER) return +1;
    return 0;
}


int Winstrcmp(const void *a, const void *b) {
    gCount++;
    int t;
    t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
    return t-2;
}


int UNIXstrcmp(const void *a, const void *b) {
    gCount++;
    int t;
    t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
    return t;
}


int WinstrcmpL(const void *a, const void *b) {
    gCount++;
    int t;
    t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
    return t-2;
}


int WinstrcmpK(const void *a, const void *b) {
    gCount++;
    int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
    return t;
}


//---------------------------------------------------------------------------------------
//
//   Function for sorting the names (lines) into a random order.
//      Order is based on a hash of the  ICU Sort key for the lines
//      The randomized order is used as input for the sorting timing tests.
//
//---------------------------------------------------------------------------------------
// qsort comparator: orders two Line pointers by a hash of their ICU sort keys.
// Returns -1, 0 or 1.  Does not touch gCount.
int ICURandomCmp(const void *a, const void *b) {
    const char  *ask = (*(Line **)a)->icuSortKey;
    const char  *bsk = (*(Line **)b)->icuSortKey;

    // The hash is expected to overflow.  Accumulate it in unsigned arithmetic,
    // where wrap-around is well defined, instead of relying on signed overflow.
    unsigned int aHash = 0;
    unsigned int bHash = 0;
    while (*ask != 0) {
        aHash += aHash*37u + (unsigned int)*ask++;
    }
    while (*bsk != 0) {
        bHash += bHash*37u + (unsigned int)*bsk++;
    }

    // Compare as signed values, which keeps the ordering that the original
    // int based hash produced.
    int   aVal = (int)aHash;
    int   bVal = (int)bHash;
    if (aVal == bVal) {
        return 0;
    }
    return (aVal > bVal) ? 1 : -1;
}

//---------------------------------------------------------------------------------------
//
//   doKeyGen()     Key Generation Timing Test
//
//---------------------------------------------------------------------------------------
void doKeyGen()
{
    int  line;
    int  loops;
    int  t;
    int  len=-1;

    // Adjust loop count to compensate for file size.   Should be order n
    double dLoopCount = double(opt_loopCount) * (1000. /  double(gNumFileLines));
    int adj_loopCount = int(dLoopCount);
    if (adj_loopCount < 1) adj_loopCount = 1;


    unsigned long startTime = timeGetTime();

    if (opt_win) {
        for (loops=0; loops<adj_loopCount; loops++) {
            for (line=0; line < gNumFileLines; line++) {
                if (opt_uselen) {
                    len = gFileLines[line].len;
                }
                t=LCMapStringW(gWinLCID, LCMAP_SORTKEY,
                    gFileLines[line].name, len,
                    (unsigned short *)gFileLines[line].winSortKey, 5000);    // TODO  something with length.
            }
        }
    }
    else if (opt_icu)
    {
        for (loops=0; loops<adj_loopCount; loops++) {
            for (line=0; line < gNumFileLines; line++) {
                if (opt_uselen) {
                    len = gFileLines[line].len;
                }
                t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
            }
        }
    }
    else if (opt_unix)
    {
        for (loops=0; loops<adj_loopCount; loops++) {
            for (line=0; line < gNumFileLines; line++) {
                t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
            }
        }
    }

    unsigned long elapsedTime = timeGetTime() - startTime;
    int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));

    if (opt_terse == FALSE) {
        printf("Sort Key Generation:  total # of keys = %d\n", loops*gNumFileLines);
        printf("Sort Key Generation:  time per key = %d ns\n", ns);
    }
    else {
        printf("%d,  ", ns);
    }

    int   totalKeyLen = 0;
    int   totalChars  = 0;
    for (line=0; line<gNumFileLines; line++) {
        totalChars += u_strlen(gFileLines[line].name);
        if (opt_win) {
            totalKeyLen += strlen(gFileLines[line].winSortKey);
        }
        else if (opt_icu) {
            totalKeyLen += strlen(gFileLines[line].icuSortKey);
        }
        else if (opt_unix) {
            totalKeyLen += strlen(gFileLines[line].unixSortKey);
        }

    }
    if (opt_terse == FALSE) {
        printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
    } else {
        printf("%f, ", (float)totalKeyLen / (float)totalChars);
    }
}


//---------------------------------------------------------------------------------------
//
//    doBinarySearch()    Binary Search timing test.  Each name from the list
//                        is looked up in the full sorted list of names.
//
//---------------------------------------------------------------------------------------
void doBinarySearch()
{

    gCount = 0;
    int  line;
    int  loops;

    // Adjust loop count to compensate for file size.   Should be order n (lookups) * log n  (compares/lookup)
    // Accurate timings do not depend on this being perfect.  The correction is just to try to
    //   get total running times of about the right order, so the that user doesn't need to
    //   manually adjust the loop count for every different file size.
    double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines));
    if (opt_usekeys) dLoopCount *= 5;
    int adj_loopCount = int(dLoopCount);
    if (adj_loopCount < 1) adj_loopCount = 1;

    unsigned long startTime = timeGetTime();

    if (opt_icu )
    {
        UCollationResult  r;
        for (loops=0; loops<adj_loopCount; loops++) {

            for (line=0; line < gNumFileLines; line++) {
                int lineLen  = -1;
                int guessLen = -1;
                if (opt_uselen) {
                    lineLen = (gSortedLines[line])->len;
                }
                int hi      = gNumFileLines-1;
                int lo      = 0;
                int  guess = -1;
                for (;;) {
                    int newGuess = (hi + lo) / 2;
                    if (newGuess == guess)
                        break;
                    guess = newGuess;
                    if (opt_usekeys) {
                        int ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
                        gCount++;
                        r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
                    }
                    else
                    {
                        if (opt_uselen) {
                            guessLen = (gSortedLines[guess])->len;
                        }
                        r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
                        gCount++;
                    }
                    if (r== UCOL_EQUAL)
                        break;
                    if (r == UCOL_LESS)
                        hi = guess;
                    else
                        lo   = guess;
                }
            }
        }
    }

    if (opt_win)
    {
        int r;
        for (loops=0; loops<adj_loopCount; loops++) {

            for (line=0; line < gNumFileLines; line++) {
                int lineLen  = -1;
                int guessLen = -1;
                if (opt_uselen) {
                    lineLen = (gSortedLines[line])->len;
                }
                int hi   = gNumFileLines-1;
                int lo   = 0;
                int  guess = -1;
                for (;;) {
                    int newGuess = (hi + lo) / 2;
                    if (newGuess == guess)
                        break;
                    guess = newGuess;
                    if (opt_usekeys) {
                        r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
                        gCount++;
                        r+=2;
                    }
                    else
                    {
                        if (opt_uselen) {
                            guessLen = (gSortedLines[guess])->len;
                        }
                        r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
                        if (r == 0) {
                            fprintf(stderr, "Error returned from Windows CompareStringW.\n");
                            exit(-1);
                        }
                        gCount++;
                    }
                    if (r== 2)   //  strings ==
                        break;
                    if (r == 1)  //  line < guess
                        hi = guess;
                    else         //  line > guess
                        lo   = guess;
                }
            }
        }
    }

    if (opt_unix)
    {
        int r;
        for (loops=0; loops<adj_loopCount; loops++) {

            for (line=0; line < gNumFileLines; line++) {
                int hi   = gNumFileLines-1;
                int lo   = 0;
                int  guess = -1;
                for (;;) {
                    int newGuess = (hi + lo) / 2;
                    if (newGuess == guess)
                        break;
                    guess = newGuess;
                    if (opt_usekeys) {
                        r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
                        gCount++;
                    }
                    else
                    {
                        r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
                        errno = 0;
                        if (errno != 0) {
                            fprintf(stderr, "Error %d returned from strcoll.\n", errno);
                            exit(-1);
                        }
                        gCount++;
                    }
                    if (r == 0)   //  strings ==
                        break;
                    if (r < 0)  //  line < guess
                        hi = guess;
                    else         //  line > guess
                        lo   = guess;
                }
            }
        }
    }

    unsigned long elapsedTime = timeGetTime() - startTime;
    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    if (opt_terse == FALSE) {
        printf("binary search:  total # of string compares = %d\n", gCount);
        printf("binary search:  compares per loop = %d\n", gCount / loops);
        printf("binary search:  time per compare = %d ns\n", ns);
    } else {
        printf("%d, ", ns);
    }

}


//---------------------------------------------------------------------------------------
//
//   doQSort()    The quick sort timing test.  Uses the C library qsort function.
//
//---------------------------------------------------------------------------------------
void doQSort() {
    int i;
    Line **sortBuf = new Line *[gNumFileLines];

    // Adjust loop count to compensate for file size.   QSort should be n log(n)
    double dLoopCount = double(opt_loopCount) * 3000. / (log10(gNumFileLines) * double(gNumFileLines));
    if (opt_usekeys) dLoopCount *= 5;
    int adj_loopCount = int(dLoopCount);
    if (adj_loopCount < 1) adj_loopCount = 1;


    gCount = 0;
    unsigned long startTime = timeGetTime();
    if (opt_win && opt_usekeys) {
        for (i=0; i<opt_loopCount; i++) {
            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
            qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
        }
    }

    else if (opt_win && opt_uselen) {
        for (i=0; i<adj_loopCount; i++) {
            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
            qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
        }
    }


    else if (opt_win && !opt_uselen) {
        for (i=0; i<adj_loopCount; i++) {
            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
            qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
        }
    }

    else if (opt_icu && opt_usekeys) {
        for (i=0; i<adj_loopCount; i++) {
            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
            qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
        }
    }

    else if (opt_icu && opt_uselen) {
        for (i=0; i<adj_loopCount; i++) {
            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
            qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
        }
    }


    else if (opt_icu && !opt_uselen) {
        for (i=0; i<adj_loopCount; i++) {
            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
            qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
        }
    }

    else if (opt_unix && !opt_usekeys) {
        for (i=0; i<adj_loopCount; i++) {
            memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
            qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
        }
    }

    unsigned long elapsedTime = timeGetTime() - startTime;
    int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
    if (opt_terse == FALSE) {
        printf("qsort:  total # of string compares = %d\n", gCount);
        printf("qsort:  time per compare = %d ns\n", ns);
    } else {
        printf("%d, ", ns);
    }
};


//---------------------------------------------------------------------------------------
//
//    doKeyHist()       Output a table of data for
//                        average sort key size vs. string length.
//
//---------------------------------------------------------------------------------------
//
//   Groups the lines by string length (the len field) and prints, for every
//   length from 1 up that occurs in the file, the average ICU sort key length
//   and the average key length per character.  Output is comma separated.
//
void doKeyHist() {
    int     i;
    int     maxLen = 0;

    // Find the maximum string length
    for (i=0; i<gNumFileLines; i++) {
        if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
    }

    // Allocate arrays to hold the histogram data, indexed by string length.
    int *accumulatedLen  = new int[maxLen+1];
    int *numKeysOfSize   = new int[maxLen+1];
    for (i=0; i<=maxLen; i++) {
        accumulatedLen[i] = 0;
        numKeysOfSize[i] = 0;
    }

    // Fill the arrays...
    for (i=0; i<gNumFileLines; i++) {
        int len = gFileLines[i].len;
        accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
        numKeysOfSize[len] += 1;
    }

    // And write out averages.  Length 0 is skipped, it has no per-character value.
    printf("String Length,  Avg Key Length,  Avg Key Len per char\n");
    for (i=1; i<=maxLen; i++) {
        if (numKeysOfSize[i] > 0) {
            printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
                (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
        }
    }

    // Release the histogram arrays.
    delete [] accumulatedLen;
    delete [] numKeysOfSize;
}


//----------------------------------------------------------------------------------------
//
//   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
//                    Since it appears that Unicode support is going in the general
//                    direction of the use of UTF-8 locales, that is the approach
//                    that is used here.
//
//----------------------------------------------------------------------------------------
//
//   For every line of the file, converts the UTF-16 name to UTF-8 with an ICU
//   converter and stores the result, null terminated, in a newly allocated
//   buffer pointed to by the line's unixName field.
//   Any converter failure is reported on stderr and ends the program.
//
void  UnixConvert() {
    int    line;

    UConverter   *cvrtr;    // An ICU code page converter.
    UErrorCode    status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        fprintf(stderr, "ICU Converter open failed.: %d\n", (int)status);
        exit(-1);
    }

    for (line=0; line < gNumFileLines; line++) {
        // First call, with no output buffer, only measures the space needed.
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           //  source is null terminated
                                         &status);
        // A buffer overflow is the expected outcome of the measuring call.
        // Warning codes are not failures and are accepted too.
        if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                         gFileLines[line].unixName, // ptr to target buffer.
                                         sizeNeeded+1, // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           //  source is null terminated
                                         &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", (int)status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
}


//----------------------------------------------------------------------------------------
//
//    Main   --  process command line, read in and pre-process the test file,
//                 call other functions to do the actual tests.
//
//      Steps, in order:
//         1. Parse options; print usage and exit(1) on bad options, -help, or no -file.
//         2. Open the ICU collator for opt_locale and apply the requested attributes.
//         3. Build the Windows LCID and, for -unix, set the C library locale.
//         4. Read the UTF-16 input file (BOM required) into gFileLines.
//         5. Pre-compute ICU, Windows and (for -unix) UNIX sort keys for every line.
//         6. Build gSortedLines (sorted with the selected API) and gRandomLines.
//         7. Run whichever of the qsort / binsearch / keygen / keyhist tests were requested.
//
//      Returns 0 on success.  Setup failures either return -1 or call exit(-1)
//      after printing a message to stderr.
//
//----------------------------------------------------------------------------------------
int main(int argc, const char** argv) {
    if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
        printf("Usage:  collperf options...\n"
            "-help                      Display this message.\n"
            "-file file_name            utf-16 format file of names.\n"
            "-locale name               ICU locale to use.  Default is en_US\n"
            "-langid 0x1234             Windows Language ID number.  Default 0x409 (en_US)\n"
            "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
            "-win                       Run test using Windows native services.  (ICU is default)\n"
            "-unix                      Run test using Unix strxfrm, strcoll services.\n"
            "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
            "-usekeys                   Run tests using sortkeys rather than strcoll\n"
            "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
            "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
            "-french                    French accent ordering\n"
            "-norm                      Normalizing mode on\n"
            "-shifted                   Shifted mode\n"
            "-lower                     Lower case first\n"
            "-upper                     Upper case first\n"
            "-case                      Enable separate case level\n"
            "-level n                   Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
            "-keyhist                   Produce a table sort key size vs. string length\n"
            "-binsearch                 Binary Search timing test\n"
            "-keygen                    Sort Key Generation timing test\n"
            "-qsort                     Quicksort timing test\n"
            );
        exit (1);
    }

    // Make sure that we've only got one API selected.
    //   -unix takes precedence over -win; either one turns ICU off.
    if (opt_unix || opt_win) opt_icu = FALSE;
    if (opt_unix) opt_win = FALSE;

    //
    //  Set up an ICU collator
    //
    UErrorCode          status = U_ZERO_ERROR;

    gCol = ucol_open(opt_locale, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "Collator creation failed.: %d\n", status);
        return -1;
    }
    if (opt_norm) {
        ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
    }
    if (opt_french) {
        ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
    }
    if (opt_lower) {
        ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
    }
    if (opt_upper) {
        ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
    }
    if (opt_case) {
        ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
    }
    if (opt_shifted) {
        ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
    }
    if (opt_level != 0) {
        switch (opt_level) {
        case 1:
            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
            break;
        case 2:
            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
            break;
        case 3:
            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
            break;
        case 4:
            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
            break;
        case 5:
            ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
            break;
        default:
            fprintf(stderr, "-level param must be between 1 and 5\n");
            exit(-1);
        }
    }

    // status accumulates across all of the ucol_setAttribute calls above,
    //   so a single check here covers every one of them.
    if (U_FAILURE(status)) {
        fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
        return -1;
    }


    //
    //  Set up a Windows LCID
    //
    gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);

    //
    //  Set the UNIX locale
    //
    if (opt_unix) {
        if (setlocale(LC_ALL, opt_locale) == 0) {
            fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
            exit(-1);
        }
    }

    // Read in  the input file.
    //   File assumed to be utf-16.
    //   Lines go onto heap buffers.  Global index array to line starts is created.
    //   Lines themselves are null terminated.
    //
    //   The file is opened in binary mode: it is raw UTF-16 code units, and text
    //   mode translation of line endings (or an 0x1A end-of-file byte) would corrupt them.
    //
    FILE *f;
    f = fopen(opt_fName, "rb");
    if (f == NULL) {
        fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
        exit(-1);
    }

    const int MAXLINES = 10000;
    gFileLines = new Line[MAXLINES];
    UChar buf[1024];                 // Current line; later reused as scratch space for sort keys.
    int   column = 0;                // Index of the next free slot in buf.
    UBool  littleEndian = TRUE;      // Assumed until a big endian BOM is seen.
    UBool  sawBOM       = FALSE;
    for (;;) {
        UChar c;
        int  cL, cH;

        // Get next utf-16 UChar
        //
        if (littleEndian) {
            cL = fgetc(f);
            cH = fgetc(f);
        }
        else
        {
            cH = fgetc(f);
            cL = fgetc(f);
        }

        // Only combine the two bytes when both were really read; shifting EOF (-1) is not
        //   well defined.  0xffff is what the truncated EOF|EOF combination yields, and it
        //   matches none of the BOM / CR / LF tests below.
        UBool atEOF = (cL == EOF || cH == EOF);
        c  = atEOF ? (UChar)0xffff : (UChar)(cL  | (cH << 8));

        //
        //  Look for the byte order mark at the start of the file.
        //
        if (sawBOM == FALSE) {

            if (c == 0xfeff) {   // Little Endian BOM
                sawBOM = TRUE;
                continue;
            }
            if (c == 0xfffe) {  // Big endian BOM
                sawBOM = TRUE;
                littleEndian = FALSE;
                continue;
            }
            fprintf(stderr, "Error - no BOM in file.  File format must be UTF-16.\n");
            exit(-1);
        }

        // Watch for CR, LF, EOF; these finish off a line.
        if (c == 0xd) {
            continue;
        }

        if (atEOF || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
            buf[column++] = 0;
            if (column > 1) {
                // Non-empty line: copy it, terminator included, onto the heap.
                gFileLines[gNumFileLines].name  = new UChar[column];
                gFileLines[gNumFileLines].len   = column-1;
                memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
                gNumFileLines++;
                if (gNumFileLines >= MAXLINES) {
                    fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
                    exit(-1);
                }

            }
            // Always restart at the beginning of buf.  Empty lines are dropped, but they
            //   must not leave their terminator behind as the first char of the next line.
            column = 0;
            if (atEOF)
                break;
            else
                continue;
        }
        buf[column++] = c;
        if (column >= 1023)
        {
            // Line too long; keep overwriting the last slot so that there is
            //   always room left for the terminating null.
            static UBool warnFlag = TRUE;
            if (warnFlag) {
                fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
                warnFlag = FALSE;
            }
            column--;
        }
    }

    fclose(f);
    if (opt_terse == FALSE) {
        printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
    }


    // Convert the lines to the UNIX encoding.
    if (opt_unix) {
        UnixConvert();
    }

    //
    //  Pre-compute ICU sort keys for the lines of the file.
    //    buf is reused as a byte buffer; keys that do not fit in it are
    //    regenerated directly into their exactly sized heap buffer.
    //
    int line;
    int t;

    for (line=0; line<gNumFileLines; line++) {
         t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
         gFileLines[line].icuSortKey  = new char[t];

         if (t > (int)sizeof(buf)) {
             t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
         }
         else
         {
             memcpy(gFileLines[line].icuSortKey, buf, t);
         }
    }


    //
    //  Pre-compute Windows sort keys for the lines of the file.
    //    LCMapStringW returns 0, not the needed size, when the destination is too small.
    //    In that case ask for the required size (destination size of 0) and generate
    //    the key straight into a buffer of that size.
    //
    for (line=0; line<gNumFileLines; line++) {
         t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
         if (t > 0) {
             gFileLines[line].winSortKey  = new char[t];
             memcpy(gFileLines[line].winSortKey, buf, t);
         }
         else
         {
             t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, 0);
             if (t < 0) {
                 t = 0;
             }
             gFileLines[line].winSortKey  = new char[t];
             if (t > 0) {
                 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t);
             }
         }
    }

    //
    //  Pre-compute UNIX sort keys for the lines of the file.
    //    strxfrm returns the key length NOT counting the terminating null, and the
    //    key only fits in the buffer if that length is less than the buffer size.
    //
    if (opt_unix) {
        for (line=0; line<gNumFileLines; line++) {
            t=strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
            gFileLines[line].unixSortKey  = new char[t+1];
            if (t >= (int)sizeof(buf)) {
                strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  t+1);
            }
            else
            {
                memcpy(gFileLines[line].unixSortKey, buf, t+1);
            }
        }
    }


    //
    //  Pre-sort the lines.
    //
    int i;
    gSortedLines = new Line *[gNumFileLines];
    for (i=0; i<gNumFileLines; i++) {
        gSortedLines[i] = &gFileLines[i];
    }

    if (opt_win) {
        qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
    }
    else if (opt_unix) {
        qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
    }
    else   /* ICU */
    {
        qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
    }


    //
    //  Make up a randomized order, will be used for sorting tests.
    //
    gRandomLines = new Line *[gNumFileLines];
    for (i=0; i<gNumFileLines; i++) {
        gRandomLines[i] = &gFileLines[i];
    }
    qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);


    //
    //  We've got the file read into memory.  Go do something with it.
    //

    if (opt_qsort)     doQSort();
    if (opt_binsearch) doBinarySearch();
    if (opt_keygen)    doKeyGen();
    if (opt_keyhist)   doKeyHist();

    return 0;

}