From 8f67b83ff7408ccffacb4e60ff6425fe846616d3 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 28 May 2002 02:09:10 +0000 Subject: [PATCH] ICU-1864 add tests for BOCU-1: compare ICU and reference implementations X-SVN-Rev: 8711 --- icu4c/source/test/cintltst/Makefile.in | 2 +- icu4c/source/test/cintltst/bocu1tst.c | 1012 +++++++++++++++++++++++ icu4c/source/test/cintltst/cconvtst.c | 5 +- icu4c/source/test/cintltst/cintltst.dsp | 4 + 4 files changed, 1021 insertions(+), 2 deletions(-) create mode 100644 icu4c/source/test/cintltst/bocu1tst.c diff --git a/icu4c/source/test/cintltst/Makefile.in b/icu4c/source/test/cintltst/Makefile.in index 3094199434..50e05a3285 100644 --- a/icu4c/source/test/cintltst/Makefile.in +++ b/icu4c/source/test/cintltst/Makefile.in @@ -42,7 +42,7 @@ cdattst.o cdetst.o cdtdptst.o cdtrgtst.o cestst.o cfintst.o cformtst.o \ cfrtst.o cg7coll.o chashtst.o cintltst.o citertst.o cjaptst.o cloctst.o \ cmsccoll.o cmsgtst.o \ cnmdptst.o cnormtst.o cnumtst.o cregrtst.o crestst.o creststn.o cturtst.o \ -cucdtst.o cstrcase.o cutiltst.o encoll.o nucnvtst.o susctest.o nccbtst.o \ +cucdtst.o cstrcase.o cutiltst.o encoll.o nucnvtst.o susctest.o nccbtst.o bocu1tst.o \ cbiditst.o cbididat.o dadrcoll.o eurocreg.o udatatst.o utf16tst.o utransts.o \ ncnvfbts.o ncnvtst.o putiltst.o cstrtest.o mstrmtst.o utf8tst.o ucmptst.o \ stdnmtst.o ctstdep.o usrchtst.o custrtrn.o trietest.o diff --git a/icu4c/source/test/cintltst/bocu1tst.c b/icu4c/source/test/cintltst/bocu1tst.c new file mode 100644 index 0000000000..c7581b1ca7 --- /dev/null +++ b/icu4c/source/test/cintltst/bocu1tst.c @@ -0,0 +1,1012 @@ +/* +****************************************************************************** +* +* Copyright (C) 2002, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +****************************************************************************** +* file name: bocu1tst.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002may27 +* created by: Markus W. Scherer +* +* This is the reference implementation of BOCU-1, +* the MIME-friendly form of the Binary Ordered Compression for Unicode, +* taken directly from ### http://oss.software.ibm.com/cvs/icu/icuhtml/design/conversion/bocu1/ +* The files bocu1.h and bocu1.c from the design folder are taken +* verbatim (minus copyright and #include) and copied together into this file. +* The reference code and some of the reference bocu1tst.c +* is modified to run as part of the ICU cintltst +* test framework (minus main(), log_ln() etc. instead of printf()). +* +* This reference implementation is used here to verify +* the ICU BOCU-1 implementation, which is +* adapted for ICU conversion APIs and optimized. +* ### links in design doc to here and to ucnvbocu.c +*/ + +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "unicode/ucnv.h" +#include "cmemory.h" +#include "cintltst.h" + +#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) + +/* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */ + +/* BOCU-1 constants and macros ---------------------------------------------- */ + +/* + * BOCU-1 encodes the code points of a Unicode string as + * a sequence of byte-encoded differences (slope detection), + * preserving lexical order. + * + * Optimize the difference-taking for runs of Unicode text within + * small scripts: + * + * Most small scripts are allocated within aligned 128-blocks of Unicode + * code points. Lexical order is preserved if the "previous code point" state + * is always moved into the middle of such a block. + * + * Additionally, "prev" is moved from anywhere in the Unihan and Hangul + * areas into the middle of those areas. 
+ * + * C0 control codes and space are encoded with their US-ASCII bytes. + * "prev" is reset for C0 controls but not for space. + */ + +/* initial value for "prev": middle of the ASCII range */ +#define BOCU1_ASCII_PREV 0x40 + +/* bounding byte values for differences */ +#define BOCU1_MIN 0x21 +#define BOCU1_MIDDLE 0x90 +#define BOCU1_MAX_LEAD 0xfe +#define BOCU1_MAX_TRAIL 0xff +#define BOCU1_RESET 0xff + +/* number of lead bytes */ +#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) + +/* adjust trail byte counts for the use of some C0 control byte values */ +#define BOCU1_TRAIL_CONTROLS_COUNT 20 +#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) + +/* number of trail bytes */ +#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) + +/* + * number of positive and negative single-byte codes + * (counting 0==BOCU1_MIDDLE among the positive ones) + */ +#define BOCU1_SINGLE 64 + +/* number of lead bytes for positive and negative 2/3/4-byte sequences */ +#define BOCU1_LEAD_2 43 +#define BOCU1_LEAD_3 3 +#define BOCU1_LEAD_4 1 + +/* The difference value range for single-byters. */ +#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) +#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) + +/* The difference value range for double-byters. */ +#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) +#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) + +/* The difference value range for 3-byters. */ +#define BOCU1_REACH_POS_3 \ + (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) + +#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) + +/* The lead byte start values. 
*/ +#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) +#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) +#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) + /* ==BOCU1_MAX_LEAD */ + +#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) +#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) +#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) + /* ==BOCU1_MIN+1 */ + +/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ +#define BOCU1_LENGTH_FROM_LEAD(lead) \ + ((BOCU1_START_NEG_2<=(lead) && (lead)>24 : 4) + +/* + * 12 commonly used C0 control codes (and space) are only used to encode + * themselves directly, + * which makes BOCU-1 MIME-usable and reasonably safe for + * ASCII-oriented software. + * + * These controls are + * 0 NUL + * + * 7 BEL + * 8 BS + * + * 9 TAB + * a LF + * b VT + * c FF + * d CR + * + * e SO + * f SI + * + * 1a SUB + * 1b ESC + * + * The other 20 C0 controls are also encoded directly (to preserve order) + * but are also used as trail bytes in difference encoding + * (for better compression). + */ +#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) + +/* + * Byte value map for control codes, + * from external byte values 0x00..0x20 + * to trail byte values 0..19 (0..0x13) as used in the difference calculation. + * External byte values that are illegal as trail bytes are mapped to -1. + */ +static int8_t +bocu1ByteToTrail[BOCU1_MIN]={ +/* 0 1 2 3 4 5 6 7 */ + -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, + +/* 8 9 a b c d e f */ + -1, -1, -1, -1, -1, -1, -1, -1, + +/* 10 11 12 13 14 15 16 17 */ + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + +/* 18 19 1a 1b 1c 1d 1e 1f */ + 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, + +/* 20 */ + -1 +}; + +/* + * Byte value map for control codes, + * from trail byte values 0..19 (0..0x13) as used in the difference calculation + * to external byte values 0x00..0x20. 
+ */ +static int8_t +bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ +/* 0 1 2 3 4 5 6 7 */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, + +/* 8 9 a b c d e f */ + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + +/* 10 11 12 13 */ + 0x1c, 0x1d, 0x1e, 0x1f +}; + +/** + * Integer division and modulo with negative numerators + * yield negative modulo results and quotients that are one more than + * what we need here. + * This macro adjusts the results so that the modulo-value m is always >=0. + * + * For positive n, the if() condition is always FALSE. + * + * @param n Number to be split into quotient and rest. + * Will be modified to contain the quotient. + * @param d Divisor. + * @param m Output variable for the rest (modulo result). + */ +#define NEGDIVMOD(n, d, m) { \ + (m)=(n)%(d); \ + (n)/=(d); \ + if((m)<0) { \ + --(n); \ + (m)+=(d); \ + } \ +} + +/* State for BOCU-1 decoder function. */ +struct Bocu1Rx { + int32_t prev, count, diff; +}; + +typedef struct Bocu1Rx Bocu1Rx; + +/* Function prototypes ------------------------------------------------------ */ + +/* see bocu1.c */ +U_CFUNC int32_t +packDiff(int32_t diff); + +U_CFUNC int32_t +encodeBocu1(int32_t *pPrev, int32_t c); + +U_CFUNC int32_t +decodeBocu1(Bocu1Rx *pRx, uint8_t b); + +/* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */ + +/* BOCU-1 implementation functions ------------------------------------------ */ + +/** + * Compute the next "previous" value for differencing + * from the current code point.
+ * + * @param c current code point, 0..0x10ffff + * @return "previous code point" state value + */ +U_INLINE int32_t +bocu1Prev(int32_t c) { + /* compute new prev */ + if(0x3040<=c && c<=0x309f) { + /* Hiragana is not 128-aligned */ + return 0x3070; + } else if(0x4e00<=c && c<=0x9fa5) { + /* CJK Unihan */ + return 0x4e00-BOCU1_REACH_NEG_2; + } else if(0xac00<=c && c<=0xd7a3) { + /* Korean Hangul */ + return (0xd7a3+0xac00)/2; + } else { + /* mostly small scripts */ + return (c&~0x7f)+BOCU1_ASCII_PREV; + } +} + +/** + * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes + * and return a packed integer with them. + * + * The encoding favors small absolute differences with short encodings + * to compress runs of same-script characters. + * + * @param diff difference value -0x10ffff..0x10ffff + * @return + * 0x010000zz for 1-byte sequence zz + * 0x0200yyzz for 2-byte sequence yy zz + * 0x03xxyyzz for 3-byte sequence xx yy zz + * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) + */ +U_CFUNC int32_t +packDiff(int32_t diff) { + int32_t result, m, lead, count, shift; + + if(diff>=BOCU1_REACH_NEG_1) { + /* mostly positive differences, and single-byte negative ones */ + if(diff<=BOCU1_REACH_POS_1) { + /* single byte */ + return 0x01000000|(BOCU1_MIDDLE+diff); + } else if(diff<=BOCU1_REACH_POS_2) { + /* two bytes */ + diff-=BOCU1_REACH_POS_1+1; + lead=BOCU1_START_POS_2; + count=1; + } else if(diff<=BOCU1_REACH_POS_3) { + /* three bytes */ + diff-=BOCU1_REACH_POS_2+1; + lead=BOCU1_START_POS_3; + count=2; + } else { + /* four bytes */ + diff-=BOCU1_REACH_POS_3+1; + lead=BOCU1_START_POS_4; + count=3; + } + } else { + /* two-, three-, and four-byte negative differences */ + if(diff>=BOCU1_REACH_NEG_2) { + /* two bytes */ + diff-=BOCU1_REACH_NEG_1; + lead=BOCU1_START_NEG_2; + count=1; + } else if(diff>=BOCU1_REACH_NEG_3) { + /* three bytes */ + diff-=BOCU1_REACH_NEG_2; + lead=BOCU1_START_NEG_3; + count=2; + } else { + /* four bytes */ + diff-=BOCU1_REACH_NEG_3; +
lead=BOCU1_START_NEG_4; + count=3; + } + } + + /* encode the length of the packed result */ + if(count<3) { + result=(count+1)<<24; + } else /* count==3, MSB used for the lead byte */ { + result=0; + } + + /* calculate trail bytes like digits in itoa() */ + shift=0; + do { + NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m)<0); + + /* add lead byte */ + result|=(lead+diff)<0x10ffff) { + /* illegal argument */ + return 0; + } + + prev=*pPrev; + if(prev==0) { + /* lenient handling of initial value 0 */ + prev=*pPrev=BOCU1_ASCII_PREV; + } + + if(c<=0x20) { + /* + * ISO C0 control & space: + * Encode directly for MIME compatibility, + * and reset state except for space, to not disrupt compression. + */ + if(c!=0x20) { + *pPrev=BOCU1_ASCII_PREV; + } + return 0x01000000|c; + } + + /* + * all other Unicode code points c==U+0021..U+10ffff + * are encoded with the difference c-prev + * + * a new prev is computed from c, + * placed in the middle of a 0x80-block (for most small scripts) or + * in the middle of the Unihan and Hangul blocks + * to statistically minimize the following difference + */ + *pPrev=bocu1Prev(c); + return packDiff(c-prev); +} + +/** + * Function for BOCU-1 decoder; handles multi-byte lead bytes. + * + * @param pRx pointer to the decoder state structure + * @param b lead byte; + * BOCU1_MIN<=b=BOCU1_START_NEG_2) { + /* positive difference */ + if(b=BOCU1_START_NEG_3) { + /* two bytes */ + c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; + count=1; + } else if(b>BOCU1_MIN) { + /* three bytes */ + c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; + count=2; + } else { + /* four bytes */ + c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; + count=3; + } + } + + /* set the state for decoding the trail byte(s) */ + pRx->diff=c; + pRx->count=count; + return -1; +} + +/** + * Function for BOCU-1 decoder; handles multi-byte trail bytes. 
+ * + * @param pRx pointer to the decoder state structure + * @param b trail byte + * @return result value, same as decodeBocu1 + * + * @see decodeBocu1 + */ +static int32_t +decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) { + int32_t t, c, count; + + if(b<=0x20) { + /* skip some C0 controls and make the trail byte range contiguous */ + t=bocu1ByteToTrail[b]; + if(t<0) { + /* illegal trail byte value */ + pRx->prev=BOCU1_ASCII_PREV; + pRx->count=0; + return -99; + } +#if BOCU1_MAX_TRAIL<0xff + } else if(b>BOCU1_MAX_TRAIL) { + return -99; +#endif + } else { + t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET; + } + + /* add trail byte into difference and decrement count */ + c=pRx->diff; + count=pRx->count; + + if(count==1) { + /* final trail byte, deliver a code point */ + c=pRx->prev+c+t; + if(0<=c && c<=0x10ffff) { + /* valid code point result */ + pRx->prev=bocu1Prev(c); + pRx->count=0; + return c; + } else { + /* illegal code point result */ + pRx->prev=BOCU1_ASCII_PREV; + pRx->count=0; + return -99; + } + } + + /* intermediate trail byte */ + if(count==2) { + pRx->diff=c+t*BOCU1_TRAIL_COUNT; + } else /* count==3 */ { + pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT; + } + pRx->count=count-1; + return -1; +} + +/** + * BOCU-1 decoder function. 
+ * + * @param pRx pointer to the decoder state structure; + * the initial values should be 0 which + * decodeBocu1 will set to actual initial state values + * @param b an input byte + * @return + * 0..0x10ffff for a result code point + * -1 if only the state changed without code point output + * <-1 if an error occurs + */ +U_CFUNC int32_t +decodeBocu1(Bocu1Rx *pRx, uint8_t b) { + int32_t prev, c, count; + + if(pRx==NULL) { + /* illegal argument */ + return -99; + } + + prev=pRx->prev; + if(prev==0) { + /* lenient handling of initial 0 values */ + prev=pRx->prev=BOCU1_ASCII_PREV; + count=pRx->count=0; + } else { + count=pRx->count; + } + + if(count==0) { + /* byte in lead position */ + if(b<=0x20) { + /* + * Direct-encoded C0 control code or space. + * Reset prev for C0 control codes but not for space. + */ + if(b!=0x20) { + pRx->prev=BOCU1_ASCII_PREV; + } + return b; + } + + /* + * b is a difference lead byte. + * + * Return a code point directly from a single-byte difference. + * + * For multi-byte difference lead bytes, set the decoder state + * with the partial difference value from the lead byte and + * with the number of trail bytes. + * + * For four-byte differences, the signedness also affects the + * first trail byte, which has special handling farther below. + */ + if(b>=BOCU1_START_NEG_2 && bprev=bocu1Prev(c); + return c; + } else if(b==BOCU1_RESET) { + /* only reset the state, no code point */ + pRx->prev=BOCU1_ASCII_PREV; + return -1; + } else { + return decodeBocu1LeadByte(pRx, b); + } + } else { + /* trail byte in any position */ + return decodeBocu1TrailByte(pRx, b); + } +} + +/* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */ + +/* test code ---------------------------------------------------------------- */ + +/* test code options */ + +/* ignore comma when processing name lists in testText() */ +#define TEST_IGNORE_COMMA 1 + +/** + * Write a packed BOCU-1 byte sequence into a byte array, + * without overflow check. 
+ * Test function. + * + * @param packed packed BOCU-1 byte sequence, see packDiff() + * @param p pointer to byte array + * @return number of bytes + * + * @see packDiff + */ +static int32_t +writePacked(int32_t packed, uint8_t *p) { + int32_t count=BOCU1_LENGTH_FROM_PACKED(packed); + switch(count) { + case 4: + *p++=(uint8_t)(packed>>24); + case 3: + *p++=(uint8_t)(packed>>16); + case 2: + *p++=(uint8_t)(packed>>8); + case 1: + *p++=(uint8_t)packed; + default: + break; + } + + return count; +} + +/** + * Unpack a packed BOCU-1 non-C0/space byte sequence and get + * the difference to initialPrev. + * Used only for round-trip testing of the difference encoding and decoding. + * Test function. + * + * @param initialPrev bogus "previous code point" value to make sure that + * the resulting code point is in the range 0..0x10ffff + * @param packed packed BOCU-1 byte sequence + * @return the difference to initialPrev + * + * @see packDiff + * @see writeDiff + */ +static int32_t +unpackDiff(int32_t initialPrev, int32_t packed) { + Bocu1Rx rx={ 0, 0, 0 }; + int32_t count; + + rx.prev=initialPrev; + count=BOCU1_LENGTH_FROM_PACKED(packed); + switch(count) { + case 4: + decodeBocu1(&rx, (uint8_t)(packed>>24)); + case 3: + decodeBocu1(&rx, (uint8_t)(packed>>16)); + case 2: + decodeBocu1(&rx, (uint8_t)(packed>>8)); + case 1: + /* subtract initial prev */ + return decodeBocu1(&rx, (uint8_t)packed)-initialPrev; + default: + return -0x7fffffff; + } +} + +/** + * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes, + * preserving lexical order. + * Also checks for roundtripping of the difference encoding. + * Test function. 
+ * + * @param diff difference value to test, -0x10ffff..0x10ffff + * @param p pointer to output byte array + * @return p advanced by number of bytes output + * + * @see unpackDiff + */ +static uint8_t * +writeDiff(int32_t diff, uint8_t *p) { + /* generate the difference as a packed value and serialize it */ + int32_t packed, initialPrev; + + packed=packDiff(diff); + + /* + * bogus initial "prev" to work around + * code point range check in decodeBocu1() + */ + if(diff<=0) { + initialPrev=0x10ffff; + } else { + initialPrev=-1; + } + + if(diff!=unpackDiff(initialPrev, packed)) { + log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n", + diff, packed, unpackDiff(initialPrev, packed)); + } + return p+writePacked(packed, p); +} + +/** + * Encode a UTF-16 string in BOCU-1. + * Does not check for overflows, but otherwise useful function. + * + * @param s input UTF-16 string + * @param length number of UChar code units in s + * @param p pointer to output byte array + * @return number of bytes output + */ +static int32_t +writeString(const UChar *s, int32_t length, uint8_t *p) { + uint8_t *p0; + int32_t c, prev, i; + + prev=0; + p0=p; + i=0; + while(i=0) { + UTF_APPEND_CHAR_UNSAFE(s, sLength, c); + } + } + return sLength; +} + +static U_INLINE char +hexDigit(uint8_t digit) { + return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit); +} + +/** + * Pretty-print 0-terminated byte values. + * Helper function for test output. + * + * @param bytes 0-terminated byte array to print + */ +static void +printBytes(uint8_t *bytes, char *out) { + int i; + uint8_t b; + + i=0; + while((b=*bytes++)!=0) { + *out++=' '; + *out++=hexDigit((uint8_t)(b>>4)); + *out++=hexDigit((uint8_t)(b&0xf)); + ++i; + } + i=3*(5-i); + while(i>0) { + *out++=' '; + --i; + } + *out=0; +} + +/** + * Basic BOCU-1 test function, called when there are no command line arguments. + * Prints some of the #define values and performs round-trip tests of the + * difference encoding and decoding. 
+ */ +static void +TestBOCU1RefDiff(void) { + char buf1[80], buf2[80]; + uint8_t prev[5], level[5]; + int32_t i, cmp, countErrors; + + log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1); + log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2); + log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3); + + log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1); + log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2); + log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3); + + log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE); + log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2); + log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3); + + /* test packDiff() & unpackDiff() with some specific values */ + writeDiff(0, level); + writeDiff(1, level); + writeDiff(65, level); + writeDiff(130, level); + writeDiff(30000, level); + writeDiff(1000000, level); + writeDiff(-65, level); + writeDiff(-130, level); + writeDiff(-30000, level); + writeDiff(-1000000, level); + + /* test that each value is smaller than any following one */ + countErrors=0; + i=-0x10ffff; + *writeDiff(i, prev)=0; + + /* show first number and bytes */ + printBytes(prev, buf1); + log_verbose(" wD(%8ld) %s\n", i, buf1); + + for(++i; i<=0x10ffff; ++i) { + *writeDiff(i, level)=0; + cmp=strcmp((const char *)prev, (const char *)level); + if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) { + log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n", + level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i); + } + if(cmp<0) { + if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) { + /* + * if 
the result is good, then print only if the length changed + * to get little but interesting output + */ + printBytes(prev, buf1); + printBytes(level, buf2); + log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); + } + } else { + ++countErrors; + printBytes(prev, buf1); + printBytes(level, buf2); + log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); + } + /* remember the previous bytes */ + memcpy(prev, level, 4); + } + + /* show last number and bytes */ + printBytes((uint8_t *)"", buf1); + printBytes(prev, buf2); + log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2); + + if(countErrors==0) { + log_info("writeDiff(-0x10ffff..0x10ffff) works fine\n"); + } else { + log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors); + } + + /* output signature byte sequence */ + i=0; + writePacked(encodeBocu1(&i, 0xfeff), level); + log_info("\nBOCU-1 signature byte sequence: %02x %02x %02x\n", + level[0], level[1], level[2]); +} + +/* cintltst code ------------------------------------------------------------ */ + +/* test one string with the ICU and the reference BOCU-1 implementations */ +static void +roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) { + static UChar roundtripRef[30000], roundtripICU[30000]; + static char bocu1Ref[30000], bocu1ICU[30000]; + + int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength; + UErrorCode errorCode; + + /* Unicode -> BOCU-1 */ + bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref); + + errorCode=U_ZERO_ERROR; + bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, sizeof(bocu1ICU), text, length, &errorCode); + if(U_FAILURE(errorCode)) { + log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); + return; + } + + if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) { + log_err("Unicode(%d)[%d] -> 
BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength); + return; + } + + /* BOCU-1 -> Unicode */ + roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef); + if(roundtripRefLength<0) { + return; /* readString() found an error and reported it */ + } + + roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, sizeof(roundtripICU)/U_SIZEOF_UCHAR, bocu1ICU, bocu1ICULength, &errorCode); + if(U_FAILURE(errorCode)) { + log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); + return; + } + + if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) { + log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength); + return; + } + if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) { + log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength); + return; + } +} + +static const UChar feff[]={ 0xfeff }; +static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 }; +static const UChar crlf[]={ 0xd, 0xa, 0x20 }; +static const UChar nul[]={ 0 }; +static const UChar latin[]={ 0xdf, 0xe6 }; +static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 }; +static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 }; +static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 }; +static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 }; +static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! 
*/ +static const UChar plane1[]={ 0xd800, 0xdc00 }; +static const UChar plane2[]={ 0xd845, 0xdddd }; +static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 }; +static const UChar plane16[]={ 0xdbff, 0xdfff }; +static const UChar c0[]={ 1, 0xe40, 0x20, 9 }; + +static const struct { + const UChar *s; + int32_t length; +} strings[]={ + { feff, LENGTHOF(feff) }, + { ascii, LENGTHOF(ascii) }, + { crlf, LENGTHOF(crlf) }, + { nul, LENGTHOF(nul) }, + { latin, LENGTHOF(latin) }, + { devanagari, LENGTHOF(devanagari) }, + { hiragana, LENGTHOF(hiragana) }, + { unihan, LENGTHOF(unihan) }, + { hangul, LENGTHOF(hangul) }, + { surrogates, LENGTHOF(surrogates) }, + { plane1, LENGTHOF(plane1) }, + { plane2, LENGTHOF(plane2) }, + { plane15, LENGTHOF(plane15) }, + { plane16, LENGTHOF(plane16) }, + { c0, LENGTHOF(c0) } +}; + +/* + * Verify that the ICU BOCU-1 implementation produces the same results as + * the reference implementation from the design folder. + * Generate some texts and convert them with both converters, verifying + * identical results and roundtripping. 
+ */ +static void +TestBOCU1(void) { + UChar text[30000]; + int32_t i, length; + + UConverter *bocu1; + UErrorCode errorCode; + + errorCode=U_ZERO_ERROR; + bocu1=ucnv_open("BOCU-1", &errorCode); + if(U_FAILURE(errorCode)) { + log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode)); + return; + } + + /* text 1: each of strings[] once */ + length=0; + for(i=0; i=LENGTHOF(strings)) { + i-=LENGTHOF(strings); + } + u_memcpy(text+length, strings[i].s, strings[i].length); + length+=strings[i].length; + } + roundtripBOCU1(bocu1, 3, text, length); + + ucnv_close(bocu1); +} + +U_CFUNC void +addBOCU1Tests(TestNode** root) { + addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff"); + addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1"); +} diff --git a/icu4c/source/test/cintltst/cconvtst.c b/icu4c/source/test/cintltst/cconvtst.c index 7a845212aa..5d154fe7b9 100644 --- a/icu4c/source/test/cintltst/cconvtst.c +++ b/icu4c/source/test/cintltst/cconvtst.c @@ -21,12 +21,15 @@ void addTestEuroRegression(TestNode** root); void addTestConverterFallBack(TestNode** root); void addExtraTests(TestNode** root); -void addConvert(TestNode** root); +/* bocu1tst.c */ +U_CFUNC void +addBOCU1Tests(TestNode** root); void addConvert(TestNode** root) { addTestConvert(root); addTestNewConvert(root); + addBOCU1Tests(root); addTestConvertErrorCallBack(root); addTestEuroRegression(root); addTestConverterFallBack(root); diff --git a/icu4c/source/test/cintltst/cintltst.dsp b/icu4c/source/test/cintltst/cintltst.dsp index 30a60e1c5f..5ce139364b 100644 --- a/icu4c/source/test/cintltst/cintltst.dsp +++ b/icu4c/source/test/cintltst/cintltst.dsp @@ -145,6 +145,10 @@ LINK32=link.exe # PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat" # Begin Source File +SOURCE=.\bocu1tst.c +# End Source File +# Begin Source File + SOURCE=.\callcoll.c # End Source File # Begin Source File