/* ********************************************************************** * Copyright (C) 2002-2005, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: utfperf.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2002apr17 * created by: Markus W. Scherer * * Performance test program for Unicode converters * (converters that support all Unicode code points). * Takes a UTF-8 file as input. */ #include #include #include /* for _O_BINARY */ #include /* for _setmode() */ #include "unicode/utypes.h" #include "unicode/ucnv.h" #include "unicode/ustring.h" #if defined(U_WINDOWS) # include #else # include static unsigned long timeGetTime() { struct timeval t; gettimeofday(&t, 0); return t.tv_sec*1000+t.tv_usec/1000; }; #endif /* definitions and text buffers */ #define INPUT_CAPACITY (1024*1024) #define INTERMEDIATE_CAPACITY 4096 #define INTERMEDIATE_SMALL_CAPACITY 20 #define OUTPUT_CAPACITY INPUT_CAPACITY #define TARGET_MEASURE_TIME_MS 2000 #define PERCENT(a, b) (int)(((a)*200+1)/(2*(b))) #define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) static UChar input[INPUT_CAPACITY], output[OUTPUT_CAPACITY]; static char intermediate[INTERMEDIATE_CAPACITY]; static int32_t inputLength, encodedLength, outputLength, countInputCodePoints; static int32_t utf8Length=0; static double utf8Time=0.; static const char *const utfNames[]={ "UTF-8", /* UTF-8 should always be first to serve as percentage reference */ "SCSU", "BOCU-1" /*, "CESU-8" *//*, "UTF-16BE", "UTF-16LE"*//*, "GB18030"*/ }; /* functions */ typedef void RoundtripFn(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode); static void roundtrip(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) { const UChar *pIn, *pInLimit; UChar *pOut, *pOutLimit; char *pInter, *pInterLimit, *p; UBool flush; ucnv_reset(cnv); pIn=input; pInLimit=input+inputLength; pOut=output; pOutLimit=output+OUTPUT_CAPACITY; pInterLimit=intermediate+intermediateCapacity; encodedLength=outputLength=0; flush=FALSE; while(pIn0); _time=timeGetTime()-_time; if(U_FAILURE(errorCode)) { fprintf(stderr, "error in roundtrip conversion (%s): %s\n", encName, u_errorName(errorCode)); return 0x7fffffff; } if(0!=u_memcmp(input, output, inputLength)) { fprintf(stderr, "error: roundtrip failed, input[]!=output[]\n"); return 0x7fffffff; } return _time; } static void perEncAndCapacity(UConverter *cnv, const char *encName, int32_t intermediateCapacity) { double rtTime; unsigned long _time; int32_t n; /*printf("test performance for %s with intermediate capacity %d\n", encName, intermediateCapacity);*/ /* warm up caches and estimate loop time */ n=10; for(;;) { _time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n); if(_time<500 && _time0) { /* convert the block */ p=intermediate; limit=p+length; ucnv_toUnicode(cnv, &pOut, pOutLimit, &p, limit, NULL, FALSE, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode)); ucnv_close(cnv); return FALSE; } /* read the next block */ length=readBlock(in); if(length<0) { ucnv_close(cnv); return FALSE; } } /* flush the converter */ ucnv_toUnicode(cnv, &pOut, pOutLimit, &p, p, NULL, TRUE, &errorCode); ucnv_close(cnv); if(U_FAILURE(errorCode)) { fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode)); return FALSE; } inputLength=(int32_t)(pOut-input); countInputCodePoints=u_countChar32(input, inputLength); if(inputLength<=0) { fprintf(stderr, "warning: input is empty\n"); return FALSE; } return TRUE; } static void showUsage(const char *myName) { fprintf(stderr, "Usage:\n" "%s [-e encoding-name] filename | '-'\n" " encoding-name must be the name of an encoding supported by ICU\n" " the filename of the input file with text to be used\n" " can be a dash (-) for standard input\n", myName); } /* * Read file using some encoding, convert to 1M UTF-16 input buffer. * For each UTF to be tested: * n times: * convert from UTF-16 input buffer to UTF, 4kB buffer * convert from 4kB buffer to 1M UTF-16 output buffer * adjust n so that time elapsed is 10s (#define) * ->divide 10s by time, increase n by that factor, run 2nd time * n times: * empty function * subtract out loop/function overhead * display #code points - #UTF bytes - time per roundtrip * * * do the same again with an intermediate buffer size of 20 instead of 4kB * * Test following UTFs: * UTF-16BE, UTF-16LE, UTF-8, SCSU, BOCU-1, CESU-8 * * Command-line arguments: * - encoding (default UTF-8, detect BOM) * - filename (allow "-") */ extern int main(int argc, const char *argv[]) { FILE *in; const char *myName, *encName, *filename, *basename; myName=argv[0]; if(argc<2) { showUsage(myName); return 1; } /* get encoding name argument */ if(argv[1][0]=='-' && argv[1][1]=='e') { encName=argv[1]+2; --argc; ++argv; if(*encName==0) { if(argc<2) { showUsage(myName); return 1; } encName=argv[1]; --argc; ++argv; } } else { encName=NULL; } /* get filename argument */ if(argc<2) { showUsage(myName); return 1; } filename=argv[1]; if(filename[0]=='-' && filename[1]==0) { filename="(standard input)"; in=stdin; /* set stdin to binary mode */ _setmode(_fileno(stdin), _O_BINARY); } else { in=fopen(filename, "rb"); if(in==NULL) { fprintf(stderr, "error opening \"%s\"\n", filename); showUsage(myName); return 2; } } /* read input */ basename=strrchr(filename, U_FILE_SEP_CHAR); if(basename!=NULL) { ++basename; } else { basename=filename; } printf("# testing converter performance with file \"%s\"\n", basename); if(!readInput(in, encName)) { fprintf(stderr, "error reading \"%s\" (encoding %s)\n", filename, encName); showUsage(myName); return 2; } if(in!=stdin) { fclose(in); } /* test performance */ testPerformance(); return 0; }