995a589267
X-SVN-Rev: 8481
1621 lines
51 KiB
C
1621 lines
51 KiB
C
/*
******************************************************************************
*
*   Copyright (C) 2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  ucnvbocu.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002mar27
*   created by: Markus W. Scherer
*
*   This is an implementation of the Binary Ordered Compression for Unicode,
*   in its MIME-friendly form as defined in ### TODO http://... 1. doc/papers 2. design
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/ucnv.h"
|
|
#include "unicode/ucnv_cb.h"
|
|
#include "ucnv_bld.h"
|
|
#include "ucnv_cnv.h"
|
|
|
|
/* BOCU-1 constants and macros ---------------------------------------------- */
|
|
|
|
/*
 * BOCU-1 encodes the code points of a Unicode string as
 * a sequence of byte-encoded differences (slope detection),
 * preserving lexical order.
 *
 * Optimize the difference-taking for runs of Unicode text within
 * small scripts:
 *
 * Most small scripts are allocated within aligned 128-blocks of Unicode
 * code points. Lexical order is preserved if the "previous code point" state
 * is always moved into the middle of such a block.
 *
 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
 * areas into the middle of those areas.
 *
 * C0 control codes and space are encoded with their US-ASCII bytes.
 * "prev" is reset for C0 controls but not for space.
 */
|
|
|
|
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The ranges are tested from the innermost (single-byte) outwards, so a lead
 * byte outside all three ranges is a four-byte lead.
 * Note: (lead) is evaluated several times.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed values below 0x04000000 carry their length (1..3) in the top byte;
 * anything else has a real lead byte there and is a four-byte sequence.
 * Note: (packed) is evaluated twice.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
|
|
|
|
/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 */

/*
 * Map a trail value t, as used in the difference calculation, to its
 * external byte value:
 * values >=BOCU1_TRAIL_CONTROLS_COUNT are shifted by BOCU1_TRAIL_BYTE_OFFSET,
 * smaller values are looked up in bocu1TrailToByte[].
 * Note: t is evaluated twice; pass an expression without side effects.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
|
|
|
|
/*
|
|
* Byte value map for control codes,
|
|
* from external byte values 0x00..0x20
|
|
* to trail byte values 0..19 (0..0x13) as used in the difference calculation.
|
|
* External byte values that are illegal as trail bytes are mapped to -1.
|
|
*/
|
|
static int8_t
|
|
bocu1ByteToTrail[BOCU1_MIN]={
|
|
/* 0 1 2 3 4 5 6 7 */
|
|
-1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
|
|
|
|
/* 8 9 a b c d e f */
|
|
-1, -1, -1, -1, -1, -1, -1, -1,
|
|
|
|
/* 10 11 12 13 14 15 16 17 */
|
|
0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
|
|
|
|
/* 18 19 1a 1b 1c 1d 1e 1f */
|
|
0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
|
|
|
|
/* 20 */
|
|
-1
|
|
};
|
|
|
|
/*
|
|
* Byte value map for control codes,
|
|
* from trail byte values 0..19 (0..0x13) as used in the difference calculation
|
|
* to external byte values 0x00..0x20.
|
|
*/
|
|
static int8_t
|
|
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
|
|
/* 0 1 2 3 4 5 6 7 */
|
|
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
|
|
|
|
/* 8 9 a b c d e f */
|
|
0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
|
|
|
|
/* 10 11 12 13 */
|
|
0x1c, 0x1d, 0x1e, 0x1f
|
|
};
|
|
|
|
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro, followed by
 * its semicolon, is exactly one statement and can be used safely in an
 * unbraced if/else.
 * n, d and m are each evaluated more than once; do not pass expressions
 * with side effects.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
|
|
|
|
/* BOCU-1 implementation functions ------------------------------------------ */
|
|
|
|
#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
|
|
|
|
/**
|
|
* Compute the next "previous" value for differencing
|
|
* from the current code point.
|
|
*
|
|
* @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
|
|
* @return "previous code point" state value
|
|
*/
|
|
static U_INLINE int32_t
|
|
bocu1Prev(int32_t c) {
|
|
/* compute new prev */
|
|
if(/* 0x3040<=c && */ c<=0x309f) {
|
|
/* Hiragana is not 128-aligned */
|
|
return 0x3070;
|
|
} else if(0x4e00<=c && c<=0x9fa5) {
|
|
/* CJK Unihan */
|
|
return 0x4e00-BOCU1_REACH_NEG_2;
|
|
} else if(0xac00<=c /* && c<=0xd7a3 */) {
|
|
/* Korean Hangul */
|
|
return (0xd7a3+0xac00)/2;
|
|
} else {
|
|
/* mostly small scripts */
|
|
return BOCU1_SIMPLE_PREV(c);
|
|
}
|
|
}
|
|
|
|
/** Fast version of bocu1Prev() for most scripts. */
|
|
#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
|
|
|
|
/*
 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
 * The UConverter fields are used as follows:
 *
 * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 *
 * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
 */
|
|
|
|
/* BOCU-1-from-Unicode conversion functions --------------------------------- */
|
|
|
|
/**
|
|
* Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
|
|
* and return a packed integer with them.
|
|
*
|
|
* The encoding favors small absolut differences with short encodings
|
|
* to compress runs of same-script characters.
|
|
*
|
|
* Optimized version with unrolled loops and fewer floating-point operations
|
|
* than the standard packDiff().
|
|
*
|
|
* @param diff difference value -0x10ffff..0x10ffff
|
|
* @return
|
|
* 0x010000zz for 1-byte sequence zz
|
|
* 0x0200yyzz for 2-byte sequence yy zz
|
|
* 0x03xxyyzz for 3-byte sequence xx yy zz
|
|
* 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
|
|
*/
|
|
static int32_t
|
|
packDiff(int32_t diff) {
|
|
int32_t result, m;
|
|
|
|
if(diff>=BOCU1_REACH_NEG_1) {
|
|
/* mostly positive differences, and single-byte negative ones */
|
|
#if 0 /* single-byte case handled in macros, see below */
|
|
if(diff<=BOCU1_REACH_POS_1) {
|
|
/* single byte */
|
|
return 0x01000000|(BOCU1_MIDDLE+diff);
|
|
} else
|
|
#endif
|
|
if(diff<=BOCU1_REACH_POS_2) {
|
|
/* two bytes */
|
|
diff-=BOCU1_REACH_POS_1+1;
|
|
result=0x02000000;
|
|
|
|
m=diff%BOCU1_TRAIL_COUNT;
|
|
diff/=BOCU1_TRAIL_COUNT;
|
|
result|=BOCU1_TRAIL_TO_BYTE(m);
|
|
|
|
result|=(BOCU1_START_POS_2+diff)<<8;
|
|
} else if(diff<=BOCU1_REACH_POS_3) {
|
|
/* three bytes */
|
|
diff-=BOCU1_REACH_POS_2+1;
|
|
result=0x03000000;
|
|
|
|
m=diff%BOCU1_TRAIL_COUNT;
|
|
diff/=BOCU1_TRAIL_COUNT;
|
|
result|=BOCU1_TRAIL_TO_BYTE(m);
|
|
|
|
m=diff%BOCU1_TRAIL_COUNT;
|
|
diff/=BOCU1_TRAIL_COUNT;
|
|
result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
|
|
|
|
result|=(BOCU1_START_POS_3+diff)<<16;
|
|
} else {
|
|
/* four bytes */
|
|
diff-=BOCU1_REACH_POS_3+1;
|
|
|
|
m=diff%BOCU1_TRAIL_COUNT;
|
|
diff/=BOCU1_TRAIL_COUNT;
|
|
result=BOCU1_TRAIL_TO_BYTE(m);
|
|
|
|
m=diff%BOCU1_TRAIL_COUNT;
|
|
diff/=BOCU1_TRAIL_COUNT;
|
|
result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
|
|
|
|
/*
|
|
* We know that / and % would deliver quotient 0 and rest=diff.
|
|
* Avoid division and modulo for performance.
|
|
*/
|
|
result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
|
|
|
|
result|=BOCU1_START_POS_4<<24;
|
|
}
|
|
} else {
|
|
/* two- to four-byte negative differences */
|
|
if(diff>=BOCU1_REACH_NEG_2) {
|
|
/* two bytes */
|
|
diff-=BOCU1_REACH_NEG_1;
|
|
result=0x02000000;
|
|
|
|
NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
|
|
result|=BOCU1_TRAIL_TO_BYTE(m);
|
|
|
|
result|=(BOCU1_START_NEG_2+diff)<<8;
|
|
} else if(diff>=BOCU1_REACH_NEG_3) {
|
|
/* three bytes */
|
|
diff-=BOCU1_REACH_NEG_2;
|
|
result=0x03000000;
|
|
|
|
NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
|
|
result|=BOCU1_TRAIL_TO_BYTE(m);
|
|
|
|
NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
|
|
result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
|
|
|
|
result|=(BOCU1_START_NEG_3+diff)<<16;
|
|
} else {
|
|
/* four bytes */
|
|
diff-=BOCU1_REACH_NEG_3;
|
|
|
|
NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
|
|
result=BOCU1_TRAIL_TO_BYTE(m);
|
|
|
|
NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
|
|
result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
|
|
|
|
/*
|
|
* We know that NEGDIVMOD would deliver
|
|
* quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
|
|
* Avoid division and modulo for performance.
|
|
*/
|
|
m=diff+BOCU1_TRAIL_COUNT;
|
|
result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
|
|
|
|
result|=BOCU1_MIN<<24;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/* Faster versions of packDiff() for single-byte-encoded diff values. */

/** Is a diff value encodable in a single byte? (diff is evaluated twice) */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte. Only valid if DIFF_IS_SINGLE(diff). */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/** Is a diff value encodable in two bytes? (also true for single-byte diffs) */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
|
|
|
|
static void
|
|
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
|
UErrorCode *pErrorCode) {
|
|
UConverter *cnv;
|
|
const UChar *source, *sourceLimit;
|
|
uint8_t *target;
|
|
int32_t targetCapacity;
|
|
int32_t *offsets;
|
|
|
|
int32_t prev, c, diff;
|
|
|
|
int32_t sourceIndex, nextSourceIndex;
|
|
|
|
U_ALIGN_CODE(16)
|
|
|
|
/* set up the local pointers */
|
|
cnv=pArgs->converter;
|
|
source=pArgs->source;
|
|
sourceLimit=pArgs->sourceLimit;
|
|
target=(uint8_t *)pArgs->target;
|
|
targetCapacity=pArgs->targetLimit-pArgs->target;
|
|
offsets=pArgs->offsets;
|
|
|
|
/* get the converter state from UConverter */
|
|
c=cnv->fromUSurrogateLead;
|
|
prev=(int32_t)cnv->fromUnicodeStatus;
|
|
if(prev==0) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
|
|
/* sourceIndex=-1 if the current character began in the previous buffer */
|
|
sourceIndex= c==0 ? 0 : -1;
|
|
nextSourceIndex=0;
|
|
|
|
/* conversion loop */
|
|
if(c!=0 && targetCapacity>0) {
|
|
goto getTrail;
|
|
}
|
|
|
|
fastSingle:
|
|
/* fast loop for single-byte differences */
|
|
/* use only one loop counter variable, targetCapacity, not also source */
|
|
diff=sourceLimit-source;
|
|
if(targetCapacity>diff) {
|
|
targetCapacity=diff;
|
|
}
|
|
/* ### TODO if WithOffsets is never used without offsets, then remove all offsets==NULL branches and checks */
|
|
if(offsets==NULL) {
|
|
while(targetCapacity>0 && (c=*source)<0x3000) {
|
|
if(c<=0x20) {
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(uint8_t)c;
|
|
} else {
|
|
diff=c-prev;
|
|
if(DIFF_IS_SINGLE(diff)) {
|
|
prev=BOCU1_SIMPLE_PREV(c);
|
|
*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
++source;
|
|
--targetCapacity;
|
|
}
|
|
} else {
|
|
while(targetCapacity>0 && (c=*source)<0x3000) {
|
|
if(c<=0x20) {
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(uint8_t)c;
|
|
*offsets++=nextSourceIndex++;
|
|
++source;
|
|
--targetCapacity;
|
|
} else {
|
|
diff=c-prev;
|
|
if(DIFF_IS_SINGLE(diff)) {
|
|
prev=BOCU1_SIMPLE_PREV(c);
|
|
*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
|
|
*offsets++=nextSourceIndex++;
|
|
++source;
|
|
--targetCapacity;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/* restore real values */
|
|
targetCapacity=(const uint8_t *)pArgs->targetLimit-target;
|
|
sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
|
|
|
|
/* regular loop for all cases */
|
|
while(source<sourceLimit) {
|
|
if(targetCapacity>0) {
|
|
c=*source++;
|
|
++nextSourceIndex;
|
|
|
|
if(c<=0x20) {
|
|
/*
|
|
* ISO C0 control & space:
|
|
* Encode directly for MIME compatibility,
|
|
* and reset state except for space, to not disrupt compression.
|
|
*/
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(uint8_t)c;
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
}
|
|
--targetCapacity;
|
|
|
|
sourceIndex=nextSourceIndex;
|
|
continue;
|
|
}
|
|
|
|
if(UTF_IS_LEAD(c)) {
|
|
getTrail:
|
|
if(source<sourceLimit) {
|
|
/* test the following code unit */
|
|
UChar trail=*source;
|
|
if(UTF_IS_SECOND_SURROGATE(trail)) {
|
|
++source;
|
|
++nextSourceIndex;
|
|
c=UTF16_GET_PAIR_VALUE(c, trail);
|
|
}
|
|
} else {
|
|
/* no more input */
|
|
c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* all other Unicode code points c==U+0021..U+10ffff
|
|
* are encoded with the difference c-prev
|
|
*
|
|
* a new prev is computed from c,
|
|
* placed in the middle of a 0x80-block (for most small scripts) or
|
|
* in the middle of the Unihan and Hangul blocks
|
|
* to statistically minimize the following difference
|
|
*/
|
|
diff=c-prev;
|
|
prev=BOCU1_PREV(c);
|
|
if(DIFF_IS_SINGLE(diff)) {
|
|
*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
}
|
|
--targetCapacity;
|
|
sourceIndex=nextSourceIndex;
|
|
if(c<0x3000) {
|
|
goto fastSingle;
|
|
}
|
|
} else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
|
|
/* optimize 2-byte case */
|
|
int32_t m;
|
|
|
|
if(diff>=0) {
|
|
diff-=BOCU1_REACH_POS_1+1;
|
|
m=diff%BOCU1_TRAIL_COUNT;
|
|
diff/=BOCU1_TRAIL_COUNT;
|
|
diff+=BOCU1_START_POS_2;
|
|
} else {
|
|
diff-=BOCU1_REACH_NEG_1;
|
|
NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
|
|
diff+=BOCU1_START_NEG_2;
|
|
}
|
|
*target++=(uint8_t)diff;
|
|
*target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
*offsets++=sourceIndex;
|
|
}
|
|
targetCapacity-=2;
|
|
sourceIndex=nextSourceIndex;
|
|
} else {
|
|
int32_t length; /* will be 2..4 */
|
|
|
|
diff=packDiff(diff);
|
|
length=BOCU1_LENGTH_FROM_PACKED(diff);
|
|
|
|
/* write the output character bytes from diff and length */
|
|
/* from the first if in the loop we know that targetCapacity>0 */
|
|
if(length<=targetCapacity) {
|
|
if(offsets==NULL) {
|
|
switch(length) {
|
|
/* each branch falls through to the next one */
|
|
case 4:
|
|
*target++=(uint8_t)(diff>>24);
|
|
case 3:
|
|
*target++=(uint8_t)(diff>>16);
|
|
/* case 2: handled above */
|
|
*target++=(uint8_t)(diff>>8);
|
|
/* case 1: handled above */
|
|
*target++=(uint8_t)diff;
|
|
default:
|
|
/* will never occur */
|
|
break;
|
|
}
|
|
} else {
|
|
switch(length) {
|
|
/* each branch falls through to the next one */
|
|
case 4:
|
|
*target++=(uint8_t)(diff>>24);
|
|
*offsets++=sourceIndex;
|
|
case 3:
|
|
*target++=(uint8_t)(diff>>16);
|
|
*offsets++=sourceIndex;
|
|
case 2:
|
|
*target++=(uint8_t)(diff>>8);
|
|
*offsets++=sourceIndex;
|
|
/* case 1: handled above */
|
|
*target++=(uint8_t)diff;
|
|
*offsets++=sourceIndex;
|
|
default:
|
|
/* will never occur */
|
|
break;
|
|
}
|
|
}
|
|
targetCapacity-=length;
|
|
sourceIndex=nextSourceIndex;
|
|
} else {
|
|
uint8_t *charErrorBuffer;
|
|
|
|
/*
|
|
* We actually do this backwards here:
|
|
* In order to save an intermediate variable, we output
|
|
* first to the overflow buffer what does not fit into the
|
|
* regular target.
|
|
*/
|
|
/* we know that 1<=targetCapacity<length<=4 */
|
|
length-=targetCapacity;
|
|
charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
|
|
switch(length) {
|
|
/* each branch falls through to the next one */
|
|
case 3:
|
|
*charErrorBuffer++=(uint8_t)(diff>>16);
|
|
case 2:
|
|
*charErrorBuffer++=(uint8_t)(diff>>8);
|
|
case 1:
|
|
*charErrorBuffer=(uint8_t)diff;
|
|
default:
|
|
/* will never occur */
|
|
break;
|
|
}
|
|
cnv->charErrorBufferLength=(int8_t)length;
|
|
|
|
/* now output what fits into the regular target */
|
|
diff>>=8*length; /* length was reduced by targetCapacity */
|
|
switch(targetCapacity) {
|
|
/* each branch falls through to the next one */
|
|
case 3:
|
|
*target++=(uint8_t)(diff>>16);
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
}
|
|
case 2:
|
|
*target++=(uint8_t)(diff>>8);
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
}
|
|
case 1:
|
|
*target++=(uint8_t)diff;
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
}
|
|
default:
|
|
/* will never occur */
|
|
break;
|
|
}
|
|
|
|
/* target overflow */
|
|
targetCapacity=0;
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
/* target is full */
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(pArgs->flush && source>=sourceLimit) {
|
|
/* reset the state for the next conversion */
|
|
if(c<0 && U_SUCCESS(*pErrorCode)) {
|
|
/* a Unicode code point remains incomplete (only a first surrogate) */
|
|
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
|
}
|
|
cnv->fromUSurrogateLead=0;
|
|
cnv->fromUnicodeStatus=BOCU1_ASCII_PREV;
|
|
} else {
|
|
/* set the converter state back into UConverter */
|
|
cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0;
|
|
cnv->fromUnicodeStatus=(uint32_t)prev;
|
|
}
|
|
|
|
/* write back the updated pointers */
|
|
pArgs->source=source;
|
|
pArgs->target=(char *)target;
|
|
pArgs->offsets=offsets;
|
|
}
|
|
|
|
/*
|
|
* Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
|
|
* If a change is made in the original function, then either
|
|
* change this function the same way or
|
|
* re-copy the original function and remove the variables
|
|
* offsets, sourceIndex, and nextSourceIndex.
|
|
*/
|
|
static void
|
|
_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
|
|
UErrorCode *pErrorCode) {
|
|
UConverter *cnv;
|
|
const UChar *source, *sourceLimit;
|
|
uint8_t *target;
|
|
int32_t targetCapacity;
|
|
|
|
int32_t prev, c, diff;
|
|
|
|
/* set up the local pointers */
|
|
cnv=pArgs->converter;
|
|
source=pArgs->source;
|
|
sourceLimit=pArgs->sourceLimit;
|
|
target=(uint8_t *)pArgs->target;
|
|
targetCapacity=pArgs->targetLimit-pArgs->target;
|
|
|
|
/* get the converter state from UConverter */
|
|
c=cnv->fromUSurrogateLead;
|
|
prev=(int32_t)cnv->fromUnicodeStatus;
|
|
if(prev==0) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
|
|
/* conversion loop */
|
|
if(c!=0 && targetCapacity>0) {
|
|
goto getTrail;
|
|
}
|
|
|
|
fastSingle:
|
|
/* fast loop for single-byte differences */
|
|
/* use only one loop counter variable, targetCapacity, not also source */
|
|
diff=sourceLimit-source;
|
|
if(targetCapacity>diff) {
|
|
targetCapacity=diff;
|
|
}
|
|
while(targetCapacity>0 && (c=*source)<0x3000) {
|
|
if(c<=0x20) {
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(uint8_t)c;
|
|
} else {
|
|
diff=c-prev;
|
|
if(DIFF_IS_SINGLE(diff)) {
|
|
prev=BOCU1_SIMPLE_PREV(c);
|
|
*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
++source;
|
|
--targetCapacity;
|
|
}
|
|
/* restore real values */
|
|
targetCapacity=(const uint8_t *)pArgs->targetLimit-target;
|
|
|
|
/* regular loop for all cases */
|
|
while(source<sourceLimit) {
|
|
if(targetCapacity>0) {
|
|
c=*source++;
|
|
|
|
if(c<=0x20) {
|
|
/*
|
|
* ISO C0 control & space:
|
|
* Encode directly for MIME compatibility,
|
|
* and reset state except for space, to not disrupt compression.
|
|
*/
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(uint8_t)c;
|
|
--targetCapacity;
|
|
continue;
|
|
}
|
|
|
|
if(UTF_IS_LEAD(c)) {
|
|
getTrail:
|
|
if(source<sourceLimit) {
|
|
/* test the following code unit */
|
|
UChar trail=*source;
|
|
if(UTF_IS_SECOND_SURROGATE(trail)) {
|
|
++source;
|
|
c=UTF16_GET_PAIR_VALUE(c, trail);
|
|
}
|
|
} else {
|
|
/* no more input */
|
|
c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* all other Unicode code points c==U+0021..U+10ffff
|
|
* are encoded with the difference c-prev
|
|
*
|
|
* a new prev is computed from c,
|
|
* placed in the middle of a 0x80-block (for most small scripts) or
|
|
* in the middle of the Unihan and Hangul blocks
|
|
* to statistically minimize the following difference
|
|
*/
|
|
diff=c-prev;
|
|
prev=BOCU1_PREV(c);
|
|
if(DIFF_IS_SINGLE(diff)) {
|
|
*target++=(uint8_t)PACK_SINGLE_DIFF(diff);
|
|
--targetCapacity;
|
|
if(c<0x3000) {
|
|
goto fastSingle;
|
|
}
|
|
} else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
|
|
/* optimize 2-byte case */
|
|
int32_t m;
|
|
|
|
if(diff>=0) {
|
|
diff-=BOCU1_REACH_POS_1+1;
|
|
m=diff%BOCU1_TRAIL_COUNT;
|
|
diff/=BOCU1_TRAIL_COUNT;
|
|
diff+=BOCU1_START_POS_2;
|
|
} else {
|
|
diff-=BOCU1_REACH_NEG_1;
|
|
NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
|
|
diff+=BOCU1_START_NEG_2;
|
|
}
|
|
*target++=(uint8_t)diff;
|
|
*target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
|
|
targetCapacity-=2;
|
|
} else {
|
|
int32_t length; /* will be 2..4 */
|
|
|
|
diff=packDiff(diff);
|
|
length=BOCU1_LENGTH_FROM_PACKED(diff);
|
|
|
|
/* write the output character bytes from diff and length */
|
|
/* from the first if in the loop we know that targetCapacity>0 */
|
|
if(length<=targetCapacity) {
|
|
switch(length) {
|
|
/* each branch falls through to the next one */
|
|
case 4:
|
|
*target++=(uint8_t)(diff>>24);
|
|
case 3:
|
|
*target++=(uint8_t)(diff>>16);
|
|
/* case 2: handled above */
|
|
*target++=(uint8_t)(diff>>8);
|
|
/* case 1: handled above */
|
|
*target++=(uint8_t)diff;
|
|
default:
|
|
/* will never occur */
|
|
break;
|
|
}
|
|
targetCapacity-=length;
|
|
} else {
|
|
uint8_t *charErrorBuffer;
|
|
|
|
/*
|
|
* We actually do this backwards here:
|
|
* In order to save an intermediate variable, we output
|
|
* first to the overflow buffer what does not fit into the
|
|
* regular target.
|
|
*/
|
|
/* we know that 1<=targetCapacity<length<=4 */
|
|
length-=targetCapacity;
|
|
charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
|
|
switch(length) {
|
|
/* each branch falls through to the next one */
|
|
case 3:
|
|
*charErrorBuffer++=(uint8_t)(diff>>16);
|
|
case 2:
|
|
*charErrorBuffer++=(uint8_t)(diff>>8);
|
|
case 1:
|
|
*charErrorBuffer=(uint8_t)diff;
|
|
default:
|
|
/* will never occur */
|
|
break;
|
|
}
|
|
cnv->charErrorBufferLength=(int8_t)length;
|
|
|
|
/* now output what fits into the regular target */
|
|
diff>>=8*length; /* length was reduced by targetCapacity */
|
|
switch(targetCapacity) {
|
|
/* each branch falls through to the next one */
|
|
case 3:
|
|
*target++=(uint8_t)(diff>>16);
|
|
case 2:
|
|
*target++=(uint8_t)(diff>>8);
|
|
case 1:
|
|
*target++=(uint8_t)diff;
|
|
default:
|
|
/* will never occur */
|
|
break;
|
|
}
|
|
|
|
/* target overflow */
|
|
targetCapacity=0;
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
/* target is full */
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(pArgs->flush && source>=sourceLimit) {
|
|
/* reset the state for the next conversion */
|
|
if(c<0 && U_SUCCESS(*pErrorCode)) {
|
|
/* a Unicode code point remains incomplete (only a first surrogate) */
|
|
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
|
}
|
|
cnv->fromUSurrogateLead=0;
|
|
cnv->fromUnicodeStatus=BOCU1_ASCII_PREV;
|
|
} else {
|
|
/* set the converter state back into UConverter */
|
|
cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0;
|
|
cnv->fromUnicodeStatus=(uint32_t)prev;
|
|
}
|
|
|
|
/* write back the updated pointers */
|
|
pArgs->source=source;
|
|
pArgs->target=(char *)target;
|
|
}
|
|
|
|
/* BOCU-1-to-Unicode conversion functions ----------------------------------- */
|
|
|
|
/**
|
|
* Function for BOCU-1 decoder; handles multi-byte lead bytes.
|
|
*
|
|
* @param b lead byte;
|
|
* BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
|
|
* @return (diff<<2)|count
|
|
*/
|
|
static U_INLINE int32_t
|
|
decodeBocu1LeadByte(int32_t b) {
|
|
int32_t diff, count;
|
|
|
|
if(b>=BOCU1_START_NEG_2) {
|
|
/* positive difference */
|
|
if(b<BOCU1_START_POS_3) {
|
|
/* two bytes */
|
|
diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
|
|
count=1;
|
|
} else if(b<BOCU1_START_POS_4) {
|
|
/* three bytes */
|
|
diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
|
|
count=2;
|
|
} else {
|
|
/* four bytes */
|
|
diff=BOCU1_REACH_POS_3+1;
|
|
count=3;
|
|
}
|
|
} else {
|
|
/* negative difference */
|
|
if(b>=BOCU1_START_NEG_3) {
|
|
/* two bytes */
|
|
diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
|
|
count=1;
|
|
} else if(b>BOCU1_MIN) {
|
|
/* three bytes */
|
|
diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
|
|
count=2;
|
|
} else {
|
|
/* four bytes */
|
|
diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
|
|
count=3;
|
|
}
|
|
}
|
|
|
|
/* return the state for decoding the trail byte(s) */
|
|
return (diff<<2)|count;
|
|
}
|
|
|
|
/**
|
|
* Function for BOCU-1 decoder; handles multi-byte trail bytes.
|
|
*
|
|
* @param count number of remaining trail bytes including this one
|
|
* @param b trail byte
|
|
* @return new delta for diff including b - <0 indicates an error
|
|
*
|
|
* @see decodeBocu1
|
|
*/
|
|
static U_INLINE int32_t
|
|
decodeBocu1TrailByte(int32_t count, int32_t b) {
|
|
if(b<=0x20) {
|
|
/* skip some C0 controls and make the trail byte range contiguous */
|
|
b=bocu1ByteToTrail[b];
|
|
/* b<0 for an illegal trail byte value will result in return<0 below */
|
|
#if BOCU1_MAX_TRAIL<0xff
|
|
} else if(b>BOCU1_MAX_TRAIL) {
|
|
return -99;
|
|
#endif
|
|
} else {
|
|
b-=BOCU1_TRAIL_BYTE_OFFSET;
|
|
}
|
|
|
|
/* add trail byte into difference and decrement count */
|
|
if(count==1) {
|
|
return b;
|
|
} else if(count==2) {
|
|
return b*BOCU1_TRAIL_COUNT;
|
|
} else /* count==3 */ {
|
|
return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
|
|
}
|
|
}
|
|
|
|
static void
|
|
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
|
UErrorCode *pErrorCode) {
|
|
UConverter *cnv;
|
|
const uint8_t *source, *sourceLimit;
|
|
UChar *target;
|
|
const UChar *targetLimit;
|
|
int32_t *offsets;
|
|
|
|
int32_t prev, count, diff, c;
|
|
|
|
int8_t byteIndex;
|
|
uint8_t *bytes;
|
|
|
|
int32_t sourceIndex, nextSourceIndex;
|
|
|
|
/* set up the local pointers */
|
|
cnv=pArgs->converter;
|
|
source=(const uint8_t *)pArgs->source;
|
|
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
|
target=pArgs->target;
|
|
targetLimit=pArgs->targetLimit;
|
|
offsets=pArgs->offsets;
|
|
|
|
/* get the converter state from UConverter */
|
|
prev=(int32_t)cnv->toUnicodeStatus;
|
|
if(prev==0) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
|
|
count=diff&3;
|
|
diff>>=2;
|
|
|
|
byteIndex=cnv->toULength;
|
|
bytes=cnv->toUBytes;
|
|
|
|
/* sourceIndex=-1 if the current character began in the previous buffer */
|
|
sourceIndex=byteIndex==0 ? 0 : -1;
|
|
nextSourceIndex=0;
|
|
|
|
/* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
|
|
loop:
|
|
if(count>0 && byteIndex>0 && target<targetLimit) {
|
|
goto getTrail;
|
|
}
|
|
|
|
fastSingle:
|
|
/* fast loop for single-byte differences */
|
|
/* use count as the only loop counter variable */
|
|
diff=sourceLimit-source;
|
|
count=pArgs->targetLimit-target;
|
|
if(count>diff) {
|
|
count=diff;
|
|
}
|
|
if(offsets==NULL) {
|
|
while(count>0) {
|
|
if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
|
|
c=prev+(c-BOCU1_MIDDLE);
|
|
if(c<0x3000) {
|
|
*target++=(UChar)c;
|
|
prev=BOCU1_SIMPLE_PREV(c);
|
|
} else {
|
|
break;
|
|
}
|
|
} else if(c<=0x20) {
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(UChar)c;
|
|
} else {
|
|
break;
|
|
}
|
|
++source;
|
|
--count;
|
|
}
|
|
/* sourceIndex and nextSourceIndex are wrong but does not matter */
|
|
} else {
|
|
while(count>0) {
|
|
if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
|
|
c=prev+(c-BOCU1_MIDDLE);
|
|
if(c<0x3000) {
|
|
*target++=(UChar)c;
|
|
*offsets++=nextSourceIndex++;
|
|
prev=BOCU1_SIMPLE_PREV(c);
|
|
} else {
|
|
break;
|
|
}
|
|
} else if(c<=0x20) {
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(UChar)c;
|
|
*offsets++=nextSourceIndex++;
|
|
} else {
|
|
break;
|
|
}
|
|
++source;
|
|
--count;
|
|
}
|
|
sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
|
|
}
|
|
|
|
/* decode a sequence of single and lead bytes */
|
|
while(source<sourceLimit) {
|
|
if(target>=targetLimit) {
|
|
/* target is full */
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
|
|
++nextSourceIndex;
|
|
c=*source++;
|
|
if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
|
|
/* Write a code point directly from a single-byte difference. */
|
|
c=prev+(c-BOCU1_MIDDLE);
|
|
if(c<0x3000) {
|
|
*target++=(UChar)c;
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
}
|
|
prev=BOCU1_SIMPLE_PREV(c);
|
|
sourceIndex=nextSourceIndex;
|
|
goto fastSingle;
|
|
}
|
|
} else if(c<=0x20) {
|
|
/*
|
|
* Direct-encoded C0 control code or space.
|
|
* Reset prev for C0 control codes but not for space.
|
|
*/
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(UChar)c;
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
}
|
|
sourceIndex=nextSourceIndex;
|
|
continue;
|
|
} else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
|
|
/* Optimize two-byte case. */
|
|
if(c>=BOCU1_MIDDLE) {
|
|
diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
|
|
} else {
|
|
diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
|
|
}
|
|
|
|
/* trail byte */
|
|
++nextSourceIndex;
|
|
c=decodeBocu1TrailByte(1, *source++);
|
|
if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
|
|
bytes[0]=source[-2];
|
|
bytes[1]=source[-1];
|
|
byteIndex=2;
|
|
goto callback;
|
|
}
|
|
} else if(c==BOCU1_RESET) {
|
|
/* only reset the state, no code point */
|
|
prev=BOCU1_ASCII_PREV;
|
|
sourceIndex=nextSourceIndex;
|
|
continue;
|
|
} else {
|
|
/*
|
|
* For multi-byte difference lead bytes, set the decoder state
|
|
* with the partial difference value from the lead byte and
|
|
* with the number of trail bytes.
|
|
*/
|
|
bytes[0]=(uint8_t)c;
|
|
byteIndex=1;
|
|
|
|
diff=decodeBocu1LeadByte(c);
|
|
count=diff&3;
|
|
diff>>=2;
|
|
getTrail:
|
|
for(;;) {
|
|
if(source>=sourceLimit) {
|
|
goto endloop;
|
|
}
|
|
++nextSourceIndex;
|
|
c=bytes[byteIndex++]=*source++;
|
|
|
|
/* trail byte in any position */
|
|
c=decodeBocu1TrailByte(count, c);
|
|
if(c<0) {
|
|
goto callback;
|
|
}
|
|
|
|
diff+=c;
|
|
if(--count==0) {
|
|
/* final trail byte, deliver a code point */
|
|
byteIndex=0;
|
|
c=prev+diff;
|
|
if((uint32_t)c>0x10ffff) {
|
|
goto callback;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* calculate the next prev and output c */
|
|
prev=BOCU1_PREV(c);
|
|
if(c<=0xffff) {
|
|
*target++=(UChar)c;
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
}
|
|
} else {
|
|
/* output surrogate pair */
|
|
*target++=UTF16_LEAD(c);
|
|
if(target<targetLimit) {
|
|
*target++=UTF16_TRAIL(c);
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
*offsets++=sourceIndex;
|
|
}
|
|
} else {
|
|
/* target overflow */
|
|
if(offsets!=NULL) {
|
|
*offsets++=sourceIndex;
|
|
}
|
|
cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
|
|
cnv->UCharErrorBufferLength=1;
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
sourceIndex=nextSourceIndex;
|
|
}
|
|
endloop:
|
|
|
|
if(pArgs->flush && source>=sourceLimit) {
|
|
/* reset the state for the next conversion */
|
|
if(byteIndex>0 && U_SUCCESS(*pErrorCode)) {
|
|
/* a character byte sequence remains incomplete */
|
|
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
|
}
|
|
cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
|
|
cnv->mode=0;
|
|
cnv->toULength=0;
|
|
} else {
|
|
/* set the converter state back into UConverter */
|
|
cnv->toUnicodeStatus=(uint32_t)prev;
|
|
cnv->mode=(diff<<2)|count;
|
|
cnv->toULength=byteIndex;
|
|
}
|
|
|
|
finish:
|
|
/* write back the updated pointers */
|
|
pArgs->source=(const char *)source;
|
|
pArgs->target=target;
|
|
pArgs->offsets=offsets;
|
|
return;
|
|
|
|
callback:
|
|
/* call the callback function with all the preparations and post-processing */
|
|
/* update the arguments structure */
|
|
pArgs->source=(const char *)source;
|
|
pArgs->target=target;
|
|
pArgs->offsets=offsets;
|
|
|
|
/* copy the current bytes to invalidCharBuffer */
|
|
cnv->invalidCharBuffer[0]=bytes[0];
|
|
cnv->invalidCharBuffer[1]=bytes[1];
|
|
cnv->invalidCharBuffer[2]=bytes[2];
|
|
cnv->invalidCharBuffer[3]=bytes[3];
|
|
cnv->invalidCharLength=(int8_t)byteIndex;
|
|
|
|
/* set the converter state in UConverter to deal with the next character */
|
|
cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
|
|
cnv->mode=0;
|
|
cnv->toULength=0;
|
|
|
|
/* call the callback function */
|
|
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
|
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, (const char *)bytes, byteIndex, UCNV_ILLEGAL, pErrorCode);
|
|
|
|
/* get the converter state from UConverter */
|
|
prev=(int32_t)cnv->toUnicodeStatus;
|
|
if(prev==0) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
diff=cnv->mode;
|
|
count=diff&3;
|
|
diff>>=2;
|
|
|
|
byteIndex=cnv->toULength;
|
|
|
|
/* update target and deal with offsets if necessary */
|
|
offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
|
|
target=pArgs->target;
|
|
|
|
/* update the source pointer and index */
|
|
sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
|
|
source=(const uint8_t *)pArgs->source;
|
|
|
|
/*
|
|
* If the callback overflowed the target, then we need to
|
|
* stop here with an overflow indication.
|
|
*/
|
|
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
|
goto endloop;
|
|
} else if(cnv->UCharErrorBufferLength>0) {
|
|
/* target is full */
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
goto endloop;
|
|
} else if(U_FAILURE(*pErrorCode)) {
|
|
/* reset and break on error */
|
|
cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
|
|
cnv->mode=0;
|
|
cnv->toULength=0;
|
|
goto finish;
|
|
} else {
|
|
goto loop;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
|
|
* If a change is made in the original function, then either
|
|
* change this function the same way or
|
|
* re-copy the original function and remove the variables
|
|
* offsets, sourceIndex, and nextSourceIndex.
|
|
*/
|
|
static void
|
|
_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
|
|
UErrorCode *pErrorCode) {
|
|
UConverter *cnv;
|
|
const uint8_t *source, *sourceLimit;
|
|
UChar *target;
|
|
const UChar *targetLimit;
|
|
|
|
int32_t prev, count, diff, c;
|
|
|
|
int8_t byteIndex;
|
|
uint8_t *bytes;
|
|
|
|
U_ALIGN_CODE(16)
|
|
|
|
/* set up the local pointers */
|
|
cnv=pArgs->converter;
|
|
source=(const uint8_t *)pArgs->source;
|
|
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
|
target=pArgs->target;
|
|
targetLimit=pArgs->targetLimit;
|
|
|
|
/* get the converter state from UConverter */
|
|
prev=(int32_t)cnv->toUnicodeStatus;
|
|
if(prev==0) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
|
|
count=diff&3;
|
|
diff>>=2;
|
|
|
|
byteIndex=cnv->toULength;
|
|
bytes=cnv->toUBytes;
|
|
|
|
/* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
|
|
loop:
|
|
if(count>0 && byteIndex>0 && target<targetLimit) {
|
|
goto getTrail;
|
|
}
|
|
|
|
fastSingle:
|
|
/* fast loop for single-byte differences */
|
|
/* use count as the only loop counter variable */
|
|
diff=sourceLimit-source;
|
|
count=pArgs->targetLimit-target;
|
|
if(count>diff) {
|
|
count=diff;
|
|
}
|
|
while(count>0) {
|
|
if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
|
|
c=prev+(c-BOCU1_MIDDLE);
|
|
if(c<0x3000) {
|
|
*target++=(UChar)c;
|
|
prev=BOCU1_SIMPLE_PREV(c);
|
|
} else {
|
|
break;
|
|
}
|
|
} else if(c<=0x20) {
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(UChar)c;
|
|
} else {
|
|
break;
|
|
}
|
|
++source;
|
|
--count;
|
|
}
|
|
|
|
/* decode a sequence of single and lead bytes */
|
|
while(source<sourceLimit) {
|
|
if(target>=targetLimit) {
|
|
/* target is full */
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
|
|
c=*source++;
|
|
if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
|
|
/* Write a code point directly from a single-byte difference. */
|
|
c=prev+(c-BOCU1_MIDDLE);
|
|
if(c<0x3000) {
|
|
*target++=(UChar)c;
|
|
prev=BOCU1_SIMPLE_PREV(c);
|
|
goto fastSingle;
|
|
}
|
|
} else if(c<=0x20) {
|
|
/*
|
|
* Direct-encoded C0 control code or space.
|
|
* Reset prev for C0 control codes but not for space.
|
|
*/
|
|
if(c!=0x20) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
*target++=(UChar)c;
|
|
continue;
|
|
} else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
|
|
/* Optimize two-byte case. */
|
|
if(c>=BOCU1_MIDDLE) {
|
|
diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
|
|
} else {
|
|
diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
|
|
}
|
|
|
|
/* trail byte */
|
|
c=decodeBocu1TrailByte(1, *source++);
|
|
if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
|
|
bytes[0]=source[-2];
|
|
bytes[1]=source[-1];
|
|
byteIndex=2;
|
|
goto callback;
|
|
}
|
|
} else if(c==BOCU1_RESET) {
|
|
/* only reset the state, no code point */
|
|
prev=BOCU1_ASCII_PREV;
|
|
continue;
|
|
} else {
|
|
/*
|
|
* For multi-byte difference lead bytes, set the decoder state
|
|
* with the partial difference value from the lead byte and
|
|
* with the number of trail bytes.
|
|
*/
|
|
bytes[0]=(uint8_t)c;
|
|
byteIndex=1;
|
|
|
|
diff=decodeBocu1LeadByte(c);
|
|
count=diff&3;
|
|
diff>>=2;
|
|
getTrail:
|
|
for(;;) {
|
|
if(source>=sourceLimit) {
|
|
goto endloop;
|
|
}
|
|
c=bytes[byteIndex++]=*source++;
|
|
|
|
/* trail byte in any position */
|
|
c=decodeBocu1TrailByte(count, c);
|
|
if(c<0) {
|
|
goto callback;
|
|
}
|
|
|
|
diff+=c;
|
|
if(--count==0) {
|
|
/* final trail byte, deliver a code point */
|
|
byteIndex=0;
|
|
c=prev+diff;
|
|
if((uint32_t)c>0x10ffff) {
|
|
goto callback;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* calculate the next prev and output c */
|
|
prev=BOCU1_PREV(c);
|
|
if(c<=0xffff) {
|
|
*target++=(UChar)c;
|
|
} else {
|
|
/* output surrogate pair */
|
|
*target++=UTF16_LEAD(c);
|
|
if(target<targetLimit) {
|
|
*target++=UTF16_TRAIL(c);
|
|
} else {
|
|
/* target overflow */
|
|
cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
|
|
cnv->UCharErrorBufferLength=1;
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
endloop:
|
|
|
|
if(pArgs->flush && source>=sourceLimit) {
|
|
/* reset the state for the next conversion */
|
|
if(byteIndex>0 && U_SUCCESS(*pErrorCode)) {
|
|
/* a character byte sequence remains incomplete */
|
|
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
|
}
|
|
cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
|
|
cnv->mode=0;
|
|
cnv->toULength=0;
|
|
} else {
|
|
/* set the converter state back into UConverter */
|
|
cnv->toUnicodeStatus=(uint32_t)prev;
|
|
cnv->mode=(diff<<2)|count;
|
|
cnv->toULength=byteIndex;
|
|
}
|
|
|
|
finish:
|
|
/* write back the updated pointers */
|
|
pArgs->source=(const char *)source;
|
|
pArgs->target=target;
|
|
return;
|
|
|
|
callback:
|
|
/* call the callback function with all the preparations and post-processing */
|
|
/* update the arguments structure */
|
|
pArgs->source=(const char *)source;
|
|
pArgs->target=target;
|
|
|
|
/* copy the current bytes to invalidCharBuffer */
|
|
cnv->invalidCharBuffer[0]=bytes[0];
|
|
cnv->invalidCharBuffer[1]=bytes[1];
|
|
cnv->invalidCharBuffer[2]=bytes[2];
|
|
cnv->invalidCharBuffer[3]=bytes[3];
|
|
cnv->invalidCharLength=(int8_t)byteIndex;
|
|
|
|
/* set the converter state in UConverter to deal with the next character */
|
|
cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
|
|
cnv->mode=0;
|
|
cnv->toULength=0;
|
|
|
|
/* call the callback function */
|
|
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
|
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, (const char *)bytes, byteIndex, UCNV_ILLEGAL, pErrorCode);
|
|
|
|
/* get the converter state from UConverter */
|
|
prev=(int32_t)cnv->toUnicodeStatus;
|
|
if(prev==0) {
|
|
prev=BOCU1_ASCII_PREV;
|
|
}
|
|
diff=cnv->mode;
|
|
count=diff&3;
|
|
diff>>=2;
|
|
|
|
byteIndex=cnv->toULength;
|
|
|
|
target=pArgs->target;
|
|
|
|
/* update the source pointer and index */
|
|
source=(const uint8_t *)pArgs->source;
|
|
|
|
/*
|
|
* If the callback overflowed the target, then we need to
|
|
* stop here with an overflow indication.
|
|
*/
|
|
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
|
goto endloop;
|
|
} else if(cnv->UCharErrorBufferLength>0) {
|
|
/* target is full */
|
|
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
goto endloop;
|
|
} else if(U_FAILURE(*pErrorCode)) {
|
|
/* reset and break on error */
|
|
cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
|
|
cnv->mode=0;
|
|
cnv->toULength=0;
|
|
goto finish;
|
|
} else {
|
|
goto loop;
|
|
}
|
|
}
|
|
|
|
/* miscellaneous ------------------------------------------------------------ */
|
|
|
|
static const UConverterImpl _Bocu1Impl={
|
|
UCNV_BOCU1,
|
|
|
|
NULL,
|
|
NULL,
|
|
|
|
NULL,
|
|
NULL,
|
|
NULL,
|
|
|
|
_Bocu1ToUnicode,
|
|
_Bocu1ToUnicodeWithOffsets,
|
|
_Bocu1FromUnicode,
|
|
_Bocu1FromUnicodeWithOffsets,
|
|
NULL,
|
|
|
|
NULL,
|
|
NULL,
|
|
NULL,
|
|
NULL
|
|
};
|
|
|
|
static const UConverterStaticData _Bocu1StaticData={
|
|
sizeof(UConverterStaticData),
|
|
"BOCU-1",
|
|
0, /* CCSID for BOCU-1 */
|
|
UCNV_IBM, UCNV_BOCU1,
|
|
1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
|
|
{ 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
|
|
FALSE, FALSE,
|
|
0,
|
|
0,
|
|
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
|
};
|
|
|
|
const UConverterSharedData _Bocu1Data={
|
|
sizeof(UConverterSharedData), ~((uint32_t)0),
|
|
NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
|
|
0
|
|
};
|