433 lines
12 KiB
C
433 lines
12 KiB
C
|
#include <stdio.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
#include <fcntl.h> /* for _O_BINARY */
|
||
|
#include <io.h> /* for _setmode() */
|
||
|
|
||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||
|
# include <windows.h>
|
||
|
#else
|
||
|
# include <sys/time.h>
|
||
|
/*
 * Millisecond clock for builds that do not take the Windows branch of the
 * surrounding #if (that branch includes <windows.h> instead).
 *
 * Returns the gettimeofday() time converted to milliseconds and reduced
 * modulo the range of unsigned long. Callers in this file only use the
 * difference between two readings.
 */
static unsigned long
timeGetTime(void) {
    struct timeval t;

    gettimeofday(&t, NULL);
    /*
     * Convert to unsigned before scaling: with a 32-bit long the signed
     * product tv_sec*1000 overflows, which is undefined behavior, whereas
     * unsigned arithmetic wraps in a defined way.
     */
    return (unsigned long)t.tv_sec*1000UL+(unsigned long)t.tv_usec/1000UL;
}
|
||
|
#endif
|
||
|
|
||
|
#include "unicode/utypes.h"
|
||
|
#include "unicode/ucnv.h"
|
||
|
#include "unicode/ustring.h"
|
||
|
|
||
|
/* definitions and text buffers */
|
||
|
|
||
|
#define INPUT_CAPACITY (1024*1024)
|
||
|
#define INTERMEDIATE_CAPACITY 4096
|
||
|
#define INTERMEDIATE_SMALL_CAPACITY 20
|
||
|
#define OUTPUT_CAPACITY INPUT_CAPACITY
|
||
|
|
||
|
#define TARGET_MEASURE_TIME_MS 2000
|
||
|
|
||
|
#define PERCENT(a, b) (int)(((a)*200+1)/(2*(b)))
|
||
|
|
||
|
#define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0]))
|
||
|
|
||
|
static UChar input[INPUT_CAPACITY], output[OUTPUT_CAPACITY];
|
||
|
static char intermediate[INTERMEDIATE_CAPACITY];
|
||
|
|
||
|
static int32_t inputLength, encodedLength, outputLength, countInputCodePoints;
|
||
|
|
||
|
static int32_t utf8Length=0;
|
||
|
static double utf8Time=0.;
|
||
|
|
||
|
static const char *const
|
||
|
utfNames[]={
|
||
|
"UTF-8", /* UTF-8 should always be first to serve as percentage reference */
|
||
|
"SCSU", "BOCU-1" /*, "CESU-8" *//*, "UTF-16BE", "UTF-16LE"*//*, "GB18030"*/
|
||
|
};
|
||
|
|
||
|
/* functions */
|
||
|
|
||
|
typedef void
|
||
|
RoundtripFn(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode);
|
||
|
|
||
|
static void
|
||
|
roundtrip(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
|
||
|
const UChar *pIn, *pInLimit;
|
||
|
UChar *pOut, *pOutLimit;
|
||
|
char *pInter, *pInterLimit, *p;
|
||
|
UBool flush;
|
||
|
|
||
|
ucnv_reset(cnv);
|
||
|
|
||
|
pIn=input;
|
||
|
pInLimit=input+inputLength;
|
||
|
|
||
|
pOut=output;
|
||
|
pOutLimit=output+OUTPUT_CAPACITY;
|
||
|
|
||
|
pInterLimit=intermediate+intermediateCapacity;
|
||
|
|
||
|
encodedLength=outputLength=0;
|
||
|
flush=FALSE;
|
||
|
|
||
|
while(pIn<pInLimit || !flush) {
|
||
|
/* convert a block of [pIn..pInLimit[ to the encoding in intermediate[] */
|
||
|
pInter=intermediate;
|
||
|
flush=(UBool)(pIn==pInLimit);
|
||
|
ucnv_fromUnicode(cnv,
|
||
|
&pInter, pInterLimit,
|
||
|
&pIn, pInLimit,
|
||
|
NULL, flush,
|
||
|
pErrorCode);
|
||
|
encodedLength+=(int32_t)(pInter-intermediate);
|
||
|
|
||
|
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||
|
/* in case flush was TRUE make sure that we convert once more to really flush */
|
||
|
flush=FALSE;
|
||
|
*pErrorCode=U_ZERO_ERROR;
|
||
|
} else if(U_FAILURE(*pErrorCode)) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/* convert the block [intermediate..pInter[ back to UTF-16 */
|
||
|
p=intermediate;
|
||
|
ucnv_toUnicode(cnv,
|
||
|
&pOut, pOutLimit,
|
||
|
&p, pInter,
|
||
|
NULL, flush,
|
||
|
pErrorCode);
|
||
|
if(U_FAILURE(*pErrorCode)) {
|
||
|
return;
|
||
|
}
|
||
|
/* intermediate must have been consumed (p==pInter) because of the converter semantics */
|
||
|
}
|
||
|
|
||
|
outputLength=pOut-output;
|
||
|
if(inputLength!=outputLength) {
|
||
|
fprintf(stderr, "error: roundtrip failed, inputLength %d!=outputLength %d\n", inputLength, outputLength);
|
||
|
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
noop(UConverter *cnv, int32_t intermediateCapacity, UErrorCode *pErrorCode) {
|
||
|
/* do nothing */
|
||
|
}
|
||
|
|
||
|
static unsigned long
|
||
|
measureRoundtrips(RoundtripFn *fn, UConverter *cnv, const char *encName, int32_t intermediateCapacity, int32_t n) {
|
||
|
unsigned long _time;
|
||
|
UErrorCode errorCode;
|
||
|
|
||
|
_time=timeGetTime();
|
||
|
errorCode=U_ZERO_ERROR;
|
||
|
do {
|
||
|
fn(cnv, intermediateCapacity, &errorCode);
|
||
|
} while(U_SUCCESS(errorCode) && --n>0);
|
||
|
_time=timeGetTime()-_time;
|
||
|
|
||
|
if(U_FAILURE(errorCode)) {
|
||
|
fprintf(stderr, "error in roundtrip conversion (%s): %s\n", encName, u_errorName(errorCode));
|
||
|
return 0x7fffffff;
|
||
|
}
|
||
|
|
||
|
if(0!=u_memcmp(input, output, inputLength)) {
|
||
|
fprintf(stderr, "error: roundtrip failed, input[]!=output[]\n");
|
||
|
return 0x7fffffff;
|
||
|
}
|
||
|
|
||
|
return _time;
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
perEncAndCapacity(UConverter *cnv, const char *encName, int32_t intermediateCapacity) {
|
||
|
double rtTime;
|
||
|
unsigned long _time;
|
||
|
int32_t n;
|
||
|
|
||
|
/*printf("test performance for %s with intermediate capacity %d\n", encName, intermediateCapacity);*/
|
||
|
|
||
|
/* warm up caches and estimate loop time */
|
||
|
n=10;
|
||
|
for(;;) {
|
||
|
_time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
|
||
|
if(_time<500 && _time<TARGET_MEASURE_TIME_MS/10) {
|
||
|
n*=10;
|
||
|
} else {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(_time<TARGET_MEASURE_TIME_MS) {
|
||
|
n=(n*TARGET_MEASURE_TIME_MS)/_time+1;
|
||
|
}
|
||
|
|
||
|
/* run actual measurement with a target test time of 10s */
|
||
|
_time=measureRoundtrips(roundtrip, cnv, encName, intermediateCapacity, n);
|
||
|
|
||
|
/* subtract same number of loops over no-operation function */
|
||
|
_time-=measureRoundtrips(noop, cnv, encName, intermediateCapacity, n);
|
||
|
|
||
|
rtTime=((double)_time*1000.)/(double)n;
|
||
|
|
||
|
/* report */
|
||
|
printf("* performance report for %8s:\n", encName);
|
||
|
printf(" intermediate buffer capacity %8d B\n", intermediateCapacity);
|
||
|
if(intermediateCapacity==INTERMEDIATE_CAPACITY && utf8Length!=0) {
|
||
|
printf(" number of encoding bytes %8d B (%3d%% of UTF-8)\n", encodedLength, PERCENT(encodedLength, utf8Length));
|
||
|
printf(" roundtrip conversion time %8g μs (%3d%% of UTF-8)\n", rtTime, PERCENT(rtTime, utf8Time));
|
||
|
} else {
|
||
|
printf(" number of encoding bytes %8d B\n", encodedLength);
|
||
|
printf(" roundtrip conversion time %8g μs\n", rtTime);
|
||
|
}
|
||
|
printf(" average bytes/code point %8g B/cp\n", (double)encodedLength/countInputCodePoints);
|
||
|
puts("");
|
||
|
|
||
|
/* set UTF-8 values */
|
||
|
if(intermediateCapacity==INTERMEDIATE_CAPACITY && 0==strcmp(encName, "UTF-8")) {
|
||
|
utf8Length=encodedLength;
|
||
|
utf8Time=rtTime;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
perEnc(UConverter *cnv, const char *encName) {
|
||
|
/*printf("test performance for %s\n", encName);*/
|
||
|
perEncAndCapacity(cnv, encName, INTERMEDIATE_CAPACITY);
|
||
|
perEncAndCapacity(cnv, encName, INTERMEDIATE_SMALL_CAPACITY);
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
testPerformance() {
|
||
|
UConverter *cnv;
|
||
|
UErrorCode errorCode;
|
||
|
int32_t i;
|
||
|
|
||
|
printf("number of code points %8d cp\n", countInputCodePoints);
|
||
|
printf("platform endianness: %8s-endian\n", U_IS_BIG_ENDIAN ? "big" : "little");
|
||
|
puts("");
|
||
|
for(i=0; i<ARRAY_LENGTH(utfNames); ++i) {
|
||
|
errorCode=U_ZERO_ERROR;
|
||
|
cnv=ucnv_open(utfNames[i], &errorCode);
|
||
|
if(U_SUCCESS(errorCode)) {
|
||
|
perEnc(cnv, utfNames[i]);
|
||
|
ucnv_close(cnv);
|
||
|
} else {
|
||
|
fprintf(stderr, "error opening converter for \"%s\" - %s\n", utfNames[i], u_errorName(errorCode));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* read a complete block from the input file */
|
||
|
static int32_t
|
||
|
readBlock(FILE *in) {
|
||
|
int length, blockLength;
|
||
|
|
||
|
blockLength=0;
|
||
|
while(blockLength<INTERMEDIATE_CAPACITY && !feof(in)) {
|
||
|
length=fread(intermediate, 1, INTERMEDIATE_CAPACITY-blockLength, in);
|
||
|
if(length<0 || ferror(in)) {
|
||
|
return -1;
|
||
|
}
|
||
|
blockLength+=length;
|
||
|
}
|
||
|
|
||
|
return (int32_t)blockLength;
|
||
|
}
|
||
|
|
||
|
static UBool
|
||
|
readInput(FILE *in, const char *encName) {
|
||
|
UConverter *cnv;
|
||
|
UChar *pOut, *pOutLimit;
|
||
|
const char *p, *limit;
|
||
|
int32_t length;
|
||
|
UErrorCode errorCode;
|
||
|
|
||
|
pOut=input;
|
||
|
pOutLimit=input+INPUT_CAPACITY;
|
||
|
|
||
|
errorCode=U_ZERO_ERROR;
|
||
|
|
||
|
/* read the first block and open the converter */
|
||
|
length=readBlock(in);
|
||
|
if(length<0) {
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
if(encName==NULL) {
|
||
|
int32_t signatureLength;
|
||
|
encName=ucnv_detectUnicodeSignature(intermediate, length,
|
||
|
&signatureLength,
|
||
|
&errorCode);
|
||
|
if(U_FAILURE(errorCode) || encName==NULL) {
|
||
|
/* default to UTF-8 */
|
||
|
printf("no Unicode signature - using UTF-8\n");
|
||
|
encName="UTF-8";
|
||
|
errorCode=U_ZERO_ERROR;
|
||
|
} else {
|
||
|
printf("detected signature for %s (removing %d bytes)\n", encName, signatureLength);
|
||
|
/* remove signature byte sequence */
|
||
|
memmove(intermediate, intermediate+signatureLength, length-=signatureLength);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
cnv=ucnv_open(encName, &errorCode);
|
||
|
if(U_FAILURE(errorCode)) {
|
||
|
fprintf(stderr, "error: unable to ucnv_open(\"%s\") - %s\n", encName, u_errorName(errorCode));
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
while(length>0) {
|
||
|
/* convert the block */
|
||
|
p=intermediate;
|
||
|
limit=p+length;
|
||
|
|
||
|
ucnv_toUnicode(cnv,
|
||
|
&pOut, pOutLimit,
|
||
|
&p, limit,
|
||
|
NULL, FALSE,
|
||
|
&errorCode);
|
||
|
if(U_FAILURE(errorCode)) {
|
||
|
fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
|
||
|
ucnv_close(cnv);
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
/* read the next block */
|
||
|
length=readBlock(in);
|
||
|
if(length<0) {
|
||
|
ucnv_close(cnv);
|
||
|
return FALSE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* flush the converter */
|
||
|
ucnv_toUnicode(cnv,
|
||
|
&pOut, pOutLimit,
|
||
|
&p, p,
|
||
|
NULL, TRUE,
|
||
|
&errorCode);
|
||
|
ucnv_close(cnv);
|
||
|
|
||
|
if(U_FAILURE(errorCode)) {
|
||
|
fprintf(stderr, "error converting input to UTF-16: %s\n", u_errorName(errorCode));
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
inputLength=(int32_t)(pOut-input);
|
||
|
countInputCodePoints=u_countChar32(input, inputLength);
|
||
|
if(inputLength<=0) {
|
||
|
fprintf(stderr, "warning: input is empty\n");
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
/*
 * Print the command-line help to stderr.
 *
 * myName  program name to show in the usage line
 */
static void
showUsage(const char *myName) {
    fputs("Usage:\n", stderr);
    fprintf(stderr, "%s [-e encoding-name] filename | '-'\n", myName);
    fputs(" encoding-name must be the name of an encoding supported by ICU\n"
          " the filename of the input file with text to be used\n"
          " can be a dash (-) for standard input\n",
          stderr);
}
|
||
|
|
||
|
/*
|
||
|
* Read file using some encoding, convert to 1M UTF-16 input buffer.
|
||
|
* For each UTF to be tested:
|
||
|
* n times:
|
||
|
* convert from UTF-16 input buffer to UTF, 4kB buffer
|
||
|
* convert from 4kB buffer to 1M UTF-16 output buffer
|
||
|
* adjust n so that time elapsed is 10s (#define)
|
||
|
* ->divide 10s by time, increase n by that factor, run 2nd time
|
||
|
* n times:
|
||
|
* empty function
|
||
|
* subtract out loop/function overhead
|
||
|
* display #code points - #UTF bytes - time per roundtrip
|
||
|
*
|
||
|
* * do the same again with an intermediate buffer size of 20 instead of 4kB
|
||
|
*
|
||
|
* Test following UTFs:
|
||
|
* UTF-16BE, UTF-16LE, UTF-8, SCSU, BOCU-1, CESU-8
|
||
|
*
|
||
|
* Command-line arguments:
|
||
|
* - encoding (default UTF-8, detect BOM)
|
||
|
* - filename (allow "-")
|
||
|
*/
|
||
|
extern int
|
||
|
main(int argc, const char *argv[]) {
|
||
|
FILE *in;
|
||
|
const char *myName, *encName, *filename, *basename;
|
||
|
|
||
|
myName=argv[0];
|
||
|
if(argc<2) {
|
||
|
showUsage(myName);
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
/* get encoding name argument */
|
||
|
if(argv[1][0]=='-' && argv[1][1]=='e') {
|
||
|
encName=argv[1]+2;
|
||
|
--argc;
|
||
|
++argv;
|
||
|
if(*encName==0) {
|
||
|
if(argc<2) {
|
||
|
showUsage(myName);
|
||
|
return 1;
|
||
|
}
|
||
|
encName=argv[1];
|
||
|
--argc;
|
||
|
++argv;
|
||
|
}
|
||
|
} else {
|
||
|
encName=NULL;
|
||
|
}
|
||
|
|
||
|
/* get filename argument */
|
||
|
if(argc<2) {
|
||
|
showUsage(myName);
|
||
|
return 1;
|
||
|
}
|
||
|
filename=argv[1];
|
||
|
if(filename[0]=='-' && filename[1]==0) {
|
||
|
filename="(standard input)";
|
||
|
in=stdin;
|
||
|
/* set stdin to binary mode */
|
||
|
_setmode(_fileno(stdin), _O_BINARY);
|
||
|
} else {
|
||
|
in=fopen(filename, "rb");
|
||
|
if(in==NULL) {
|
||
|
fprintf(stderr, "error opening \"%s\"\n", filename);
|
||
|
showUsage(myName);
|
||
|
return 2;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* read input */
|
||
|
basename=strrchr(filename, U_FILE_SEP_CHAR);
|
||
|
if(basename!=NULL) {
|
||
|
++basename;
|
||
|
} else {
|
||
|
basename=filename;
|
||
|
}
|
||
|
printf("# testing converter performance with file \"%s\"\n", basename);
|
||
|
if(!readInput(in, encName)) {
|
||
|
fprintf(stderr, "error reading \"%s\" (encoding %s)\n", filename, encName);
|
||
|
showUsage(myName);
|
||
|
return 2;
|
||
|
}
|
||
|
if(in!=stdin) {
|
||
|
fclose(in);
|
||
|
}
|
||
|
|
||
|
/* test performance */
|
||
|
testPerformance();
|
||
|
return 0;
|
||
|
}
|