c4997713ef
X-SVN-Rev: 4238
1400 lines
43 KiB
C
1400 lines
43 KiB
C
/*
******************************************************************************
*
*   Copyright (C) 1999-2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*
* File scsu.c
*
* Modification History:
*
*   Date        Name        Description
*   05/17/99    stephen     Creation (ported from java UnicodeCompressor.java)
*   09/21/99    stephen     Updated to handle data splits on decompression.
******************************************************************************
*/
|
|
|
|
#include "unicode/scsu.h"
|
|
|
|
#include "cmemory.h"
|
|
|
|
/* Generic window shift: added to (char - window offset) to form a
   single-byte-mode data byte in the range 0x80..0xFF */
#define COMPRESSIONOFFSET       0x80

/* Indicates a window index is invalid.
   Parenthesized so the expansion is safe inside any expression. */
#define INVALIDWINDOW           (-1)

/* Indicates a character doesn't exist in input.
   Parenthesized so the expansion is safe inside any expression. */
#define INVALIDCHAR             (-1)

/* Compression modes */
#define SINGLEBYTEMODE          0
#define UNICODEMODE             1

/* Reserved index value */
#define RESERVEDINDEX           0x00

/* Indices for scripts which cross a half-block boundary */
#define LATININDEX              0xF9
#define IPAEXTENSIONINDEX       0xFA
#define GREEKINDEX              0xFB
#define ARMENIANINDEX           0xFC
#define HIRAGANAINDEX           0xFD
#define KATAKANAINDEX           0xFE
#define HALFWIDTHKATAKANAINDEX  0xFF

/* Single-byte mode tags */
#define SDEFINEX                0x0B
/* 0x0C is a reserved value */
#define SRESERVED               0x0C
#define SQUOTEU                 0x0E
#define SCHANGEU                0x0F

/* Quote one character from window n (tag = SQUOTE0 + n) */
#define SQUOTE0                 0x01
#define SQUOTE1                 0x02
#define SQUOTE2                 0x03
#define SQUOTE3                 0x04
#define SQUOTE4                 0x05
#define SQUOTE5                 0x06
#define SQUOTE6                 0x07
#define SQUOTE7                 0x08

/* Change to dynamic window n (tag = SCHANGE0 + n) */
#define SCHANGE0                0x10
#define SCHANGE1                0x11
#define SCHANGE2                0x12
#define SCHANGE3                0x13
#define SCHANGE4                0x14
#define SCHANGE5                0x15
#define SCHANGE6                0x16
#define SCHANGE7                0x17

/* Define dynamic window n (tag = SDEFINE0 + n) */
#define SDEFINE0                0x18
#define SDEFINE1                0x19
#define SDEFINE2                0x1A
#define SDEFINE3                0x1B
#define SDEFINE4                0x1C
#define SDEFINE5                0x1D
#define SDEFINE6                0x1E
#define SDEFINE7                0x1F

/* Unicode mode tags */

/* Change to dynamic window n and single-byte mode (tag = UCHANGE0 + n) */
#define UCHANGE0                0xE0
#define UCHANGE1                0xE1
#define UCHANGE2                0xE2
#define UCHANGE3                0xE3
#define UCHANGE4                0xE4
#define UCHANGE5                0xE5
#define UCHANGE6                0xE6
#define UCHANGE7                0xE7

/* Define dynamic window n and go to single-byte mode (tag = UDEFINE0 + n) */
#define UDEFINE0                0xE8
#define UDEFINE1                0xE9
#define UDEFINE2                0xEA
#define UDEFINE3                0xEB
#define UDEFINE4                0xEC
#define UDEFINE5                0xED
#define UDEFINE6                0xEE
#define UDEFINE7                0xEF

#define UQUOTEU                 0xF0
#define UDEFINEX                0xF1
/* 0xF2 is a reserved value */
#define URESERVED               0xF2
|
|
|
|
/* Local function prototypes */
|
|
static int32_t scsu_makeIndex(int32_t c);
|
|
static UBool scsu_inDynamicWindow(const UnicodeCompressor *comp,
|
|
int32_t c,
|
|
int32_t whichWindow);
|
|
static UBool scsu_inStaticWindow(int32_t c,
|
|
int32_t whichWindow);
|
|
static UBool scsu_isCompressible(int32_t c);
|
|
static int32_t scsu_findDynamicWindow(const UnicodeCompressor *comp,
|
|
int32_t c);
|
|
static int32_t scsu_findStaticWindow(int32_t c);
|
|
static int32_t scsu_getLRDefinedWindow(const UnicodeCompressor *comp);
|
|
|
|
/* Static tables generated by CompressionTableGenerator */
|
|
|
|
/**
 * For window offset mapping: sOffsetTable[index] is the first code unit of
 * the dynamic window selected by a one-byte window index.
 *   0x00..0x67  index * 0x80
 *   0x68..0xA7  index * 0x80 + 0xAC00
 *   0xA8..0xF8  unused (0)
 *   0xF9..0xFF  fixed offsets for scripts crossing a half-block boundary
 * The table is never written to, hence const.
 */
static const int32_t sOffsetTable [] = {
    /* 0x00 */ 0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
    /* 0x08 */ 0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
    /* 0x10 */ 0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
    /* 0x18 */ 0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
    /* 0x20 */ 0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
    /* 0x28 */ 0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
    /* 0x30 */ 0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
    /* 0x38 */ 0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
    /* 0x40 */ 0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
    /* 0x48 */ 0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
    /* 0x50 */ 0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
    /* 0x58 */ 0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
    /* 0x60 */ 0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3380,
    /* 0x68 */ 0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
    /* 0x70 */ 0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
    /* 0x78 */ 0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
    /* 0x80 */ 0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
    /* 0x88 */ 0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
    /* 0x90 */ 0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
    /* 0x98 */ 0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
    /* 0xA0 */ 0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
    /* 0xA8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xB0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xB8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xC0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xC8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xD0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xD8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xE0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xE8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xF0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xF8 */ 0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60
};
|
|
|
|
/** For quick identification of a byte as a single-byte mode tag */
|
|
static UBool sSingleTagTable [] = {
|
|
FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE,
|
|
FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
|
|
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
|
|
TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
|
|
};
|
|
|
|
/** For quick identification of a byte as a unicode mode tag */
|
|
static UBool sUnicodeTagTable [] = {
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,
|
|
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
|
|
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE,
|
|
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
|
|
FALSE
|
|
};
|
|
|
|
/**
 * Static compression window offsets, indexed by static window number (0..7).
 * The table is never written to, hence const.
 */
static const int32_t sOffsets [] = {
    0x0000,   /* for quoting single-byte mode tags */
    0x0080,   /* Latin-1 Supplement */
    0x0100,   /* Latin Extended-A */
    0x0300,   /* Combining Diacritical Marks */
    0x2000,   /* General Punctuation */
    0x2080,   /* Currency Symbols */
    0x2100,   /* Letterlike Symbols and Number Forms */
    0x3000    /* CJK Symbols and Punctuation */
};
|
|
|
|
|
|
void
|
|
scsu_init(UnicodeCompressor *comp)
|
|
{
|
|
/* initialize to defaults*/
|
|
scsu_reset(comp);
|
|
}
|
|
|
|
void
|
|
scsu_compress(UnicodeCompressor *comp,
|
|
uint8_t **target,
|
|
const uint8_t *targetLimit,
|
|
const UChar **source,
|
|
const UChar *sourceLimit,
|
|
UErrorCode *status)
|
|
{
|
|
/* the current position in the source unichar buffer*/
|
|
const UChar *unicharBuffer = *source;
|
|
|
|
/* the current position in the target byte buffer*/
|
|
uint8_t *byteBuffer = *target;
|
|
|
|
/* the current unicode character from the source buffer*/
|
|
int32_t curUC = INVALIDCHAR;
|
|
|
|
/* the index for the current character*/
|
|
int32_t curIndex = -1;
|
|
|
|
/* look ahead*/
|
|
int32_t nextUC = INVALIDCHAR;
|
|
int32_t forwardUC = INVALIDCHAR;
|
|
|
|
/* temporary for window searching*/
|
|
int32_t whichWindow = 0;
|
|
|
|
/* high and low bytes of the current unicode character*/
|
|
int32_t hiByte = 0;
|
|
int32_t loByte = 0;
|
|
|
|
|
|
/* verify we weren't passed a failing error code */
|
|
if(U_FAILURE(*status)) {
|
|
return;
|
|
}
|
|
/* verify the target buffer can hold at least 4 bytes */
|
|
else if(targetLimit - byteBuffer < 4) {
|
|
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
|
|
mainLoop:
|
|
while( unicharBuffer < sourceLimit && byteBuffer < targetLimit) {
|
|
switch( comp->fMode ) {
|
|
|
|
/* main single byte mode compression loop*/
|
|
case SINGLEBYTEMODE:
|
|
while( unicharBuffer < sourceLimit && byteBuffer < targetLimit ) {
|
|
|
|
/* get current char*/
|
|
curUC = *unicharBuffer++;
|
|
|
|
/* get next char*/
|
|
if( unicharBuffer < sourceLimit )
|
|
nextUC = *unicharBuffer;
|
|
else
|
|
nextUC = INVALIDCHAR;
|
|
|
|
/* chars less than 0x0080 (excluding tags) go straight in
|
|
stream */
|
|
if( curUC < 0x0080 ) {
|
|
loByte = curUC;
|
|
|
|
/* we need to check and make sure we don't
|
|
accidentally write a single byte mode tag to
|
|
the stream unless it's quoted */
|
|
if(sSingleTagTable[loByte]) {
|
|
/* make sure there is enough room to write
|
|
both bytes and if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 1) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
/* since we know the byte is less than 0x80, SQUOTE0
|
|
will use static window 0, or Latin-1*/
|
|
*byteBuffer++ = (uint8_t) SQUOTE0;
|
|
}
|
|
|
|
*byteBuffer++ = (uint8_t) loByte;
|
|
}
|
|
|
|
/* if the char belongs to current window, convert it
|
|
to a byte by adding the generic compression offset
|
|
and subtracting the window's offset*/
|
|
else if(scsu_inDynamicWindow(comp,
|
|
curUC, comp->fCurrentWindow) ) {
|
|
*byteBuffer++ = (uint8_t)
|
|
(curUC - comp->fOffsets[ comp->fCurrentWindow ]
|
|
+ COMPRESSIONOFFSET);
|
|
}
|
|
|
|
/* if char is not in compressible range, either switch
|
|
to or quote from unicode*/
|
|
else if( ! scsu_isCompressible(curUC) ) {
|
|
/* only check next character if it is valid*/
|
|
if(nextUC != INVALIDCHAR && scsu_isCompressible(nextUC)) {
|
|
/* make sure there is enough room to write all
|
|
three bytes if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 2) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
*byteBuffer++ = (uint8_t) SQUOTEU;
|
|
*byteBuffer++ = (uint8_t) (curUC >> 8);
|
|
*byteBuffer++ = (uint8_t) curUC;
|
|
}
|
|
else {
|
|
/* make sure there is enough room to write all
|
|
four bytes and if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 3) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
*byteBuffer++ = (uint8_t) SCHANGEU;
|
|
|
|
hiByte = curUC >> 8;
|
|
loByte = curUC;
|
|
|
|
/* add quote Unicode tag */
|
|
if( sUnicodeTagTable[hiByte] )
|
|
*byteBuffer++ = (uint8_t) UQUOTEU;
|
|
|
|
*byteBuffer++ = (uint8_t) hiByte;
|
|
*byteBuffer++ = (uint8_t) loByte;
|
|
|
|
comp->fMode = UNICODEMODE;
|
|
|
|
/* use a goto here for speed, to avoid having
|
|
to check fMode in the while loop at the top
|
|
of the case */
|
|
goto mainLoop;
|
|
}
|
|
}
|
|
|
|
/* if the char is in a currently defined dynamic
|
|
window, figure out which one, and either switch to
|
|
it or quote from it*/
|
|
else if( (whichWindow = scsu_findDynamicWindow(comp, curUC))
|
|
!= INVALIDWINDOW ) {
|
|
/* look ahead*/
|
|
if( (unicharBuffer + 1) < sourceLimit )
|
|
forwardUC = *(unicharBuffer + 1);
|
|
else
|
|
forwardUC = INVALIDCHAR;
|
|
|
|
/* all three chars in same window, switch to that
|
|
window- inDynamicWindow will return FALSE for
|
|
INVALIDCHAR*/
|
|
if( scsu_inDynamicWindow(comp, nextUC, whichWindow)
|
|
&& scsu_inDynamicWindow(comp, forwardUC, whichWindow)){
|
|
/* make sure there is enough room to write
|
|
both bytes and if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 1) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
*byteBuffer++ = (uint8_t) (SCHANGE0 + whichWindow);
|
|
*byteBuffer++ = (uint8_t)
|
|
(curUC - comp->fOffsets[whichWindow]
|
|
+ COMPRESSIONOFFSET);
|
|
comp->fTimeStamps [ whichWindow ] = ++(comp->fTimeStamp);
|
|
comp->fCurrentWindow = whichWindow;
|
|
}
|
|
|
|
/* either only next char or neither in same
|
|
window, so quote*/
|
|
else {
|
|
/* make sure there is enough room to write
|
|
both bytes and if not, rewind the source stream
|
|
and break out*/
|
|
if( (byteBuffer + 1) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
*byteBuffer++ = (uint8_t) (SQUOTE0 + whichWindow);
|
|
*byteBuffer++ = (uint8_t)
|
|
(curUC - comp->fOffsets[whichWindow]
|
|
+ COMPRESSIONOFFSET);
|
|
}
|
|
}
|
|
|
|
/* if a static window is defined, and the following
|
|
character is not in that static window, quote from
|
|
the static window Note: to quote from a static
|
|
window, don't add 0x80*/
|
|
else if( (whichWindow = scsu_findStaticWindow(curUC))
|
|
!= INVALIDWINDOW
|
|
&& ! scsu_inStaticWindow(nextUC, whichWindow) ) {
|
|
/* make sure there is enough room to write both
|
|
bytes if not, rewind the source stream and
|
|
break out*/
|
|
if( (byteBuffer + 1) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
*byteBuffer++ = (uint8_t) (SQUOTE0 + whichWindow);
|
|
*byteBuffer++ = (uint8_t) (curUC - sOffsets[whichWindow]);
|
|
}
|
|
|
|
/* if a window is not defined, decide if we want to
|
|
define a new one or switch to unicode mode*/
|
|
else {
|
|
/* determine index for current char (char is
|
|
compressible)*/
|
|
curIndex = scsu_makeIndex(curUC);
|
|
comp->fIndexCount[curIndex]++;
|
|
|
|
/* look ahead*/
|
|
if( (unicharBuffer + 1) < sourceLimit )
|
|
forwardUC = *(unicharBuffer + 1);
|
|
else
|
|
forwardUC = INVALIDCHAR;
|
|
|
|
/* if we have encountered this index at least once
|
|
before, define a new window*/
|
|
if( comp->fIndexCount[curIndex] > 1 ) {
|
|
/* make sure there is enough room to write all
|
|
three bytes and if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 2) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
/* get least recently defined window*/
|
|
whichWindow = scsu_getLRDefinedWindow(comp);
|
|
|
|
*byteBuffer++ = (uint8_t) (SDEFINE0 + whichWindow);
|
|
*byteBuffer++ = (uint8_t) curIndex;
|
|
*byteBuffer++ = (uint8_t)
|
|
(curUC - sOffsetTable[curIndex]
|
|
+ COMPRESSIONOFFSET);
|
|
|
|
comp->fOffsets[whichWindow] = sOffsetTable[curIndex];
|
|
comp->fCurrentWindow = whichWindow;
|
|
comp->fTimeStamps [whichWindow] = ++(comp->fTimeStamp);
|
|
}
|
|
|
|
/* three chars in a row with same index, define a
|
|
new window- makeIndex will return RESERVEDINDEX
|
|
for INVALIDCHAR*/
|
|
else if( curIndex == scsu_makeIndex(nextUC)
|
|
&& curIndex == scsu_makeIndex(forwardUC) ) {
|
|
/* make sure there is enough room to write all
|
|
three bytes if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 2) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
whichWindow = scsu_getLRDefinedWindow(comp);
|
|
|
|
*byteBuffer++ = (uint8_t) (SDEFINE0 + whichWindow);
|
|
*byteBuffer++ = (uint8_t) curIndex;
|
|
*byteBuffer++ = (uint8_t)
|
|
(curUC - sOffsetTable[curIndex]
|
|
+ COMPRESSIONOFFSET);
|
|
|
|
comp->fOffsets[whichWindow] = sOffsetTable[curIndex];
|
|
comp->fCurrentWindow = whichWindow;
|
|
comp->fTimeStamps [whichWindow] = ++(comp->fTimeStamp);
|
|
}
|
|
|
|
/* only two chars in a row with same index, so
|
|
switch to unicode mode makeIndex will return
|
|
RESERVEDINDEX for INVALIDCHAR*/
|
|
else if( curIndex == scsu_makeIndex(nextUC)
|
|
&& curIndex != scsu_makeIndex(forwardUC) ) {
|
|
/* make sure there is enough room to write all
|
|
four bytes if not, rewind the source stream
|
|
and break out*/
|
|
if( (byteBuffer + 3) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
*byteBuffer++ = (uint8_t) SCHANGEU;
|
|
|
|
hiByte = curUC >> 8;
|
|
loByte = curUC;
|
|
|
|
/* add quote Unicode tag */
|
|
if( sUnicodeTagTable[hiByte] )
|
|
*byteBuffer++ = (uint8_t) UQUOTEU;
|
|
|
|
*byteBuffer++ = (uint8_t) hiByte;
|
|
*byteBuffer++ = (uint8_t) loByte;
|
|
|
|
comp->fMode = UNICODEMODE;
|
|
|
|
/* use a goto here for speed, to avoid having
|
|
to check fMode in the while loop at the top
|
|
of the case */
|
|
goto mainLoop;
|
|
}
|
|
|
|
/* three chars have different indices, so switch
|
|
to unicode mode*/
|
|
else {
|
|
/* make sure there is enough room to write all
|
|
four bytes and if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 3) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
*byteBuffer++ = (uint8_t) SCHANGEU;
|
|
|
|
hiByte = curUC >> 8;
|
|
loByte = curUC;
|
|
|
|
/* add quote Unicode tag*/
|
|
if( sUnicodeTagTable[ hiByte ] )
|
|
*byteBuffer++ = (uint8_t) UQUOTEU;
|
|
|
|
*byteBuffer++ = (uint8_t) hiByte;
|
|
*byteBuffer++ = (uint8_t) loByte;
|
|
|
|
comp->fMode = UNICODEMODE;
|
|
|
|
/* use a goto here for speed, to avoid having
|
|
to check fMode in the while loop at the top
|
|
of the case */
|
|
goto mainLoop;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
|
|
/* main unicode mode compression loop*/
|
|
case UNICODEMODE:
|
|
while(unicharBuffer < sourceLimit && byteBuffer < targetLimit) {
|
|
|
|
/* get current char*/
|
|
curUC = *unicharBuffer++;
|
|
|
|
/* get next char*/
|
|
if( unicharBuffer < sourceLimit )
|
|
nextUC = *unicharBuffer;
|
|
else
|
|
nextUC = INVALIDCHAR;
|
|
|
|
/* if we have two uncompressible unichars in a row,
|
|
put the current char's bytes in the stream*/
|
|
if( ! scsu_isCompressible(curUC)
|
|
|| (nextUC != INVALIDCHAR
|
|
&& ! scsu_isCompressible(nextUC)) ) {
|
|
/* make sure there is enough room to write all
|
|
three bytes and if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 2) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
hiByte = curUC >> 8;
|
|
loByte = curUC;
|
|
|
|
/* add quote Unicode tag*/
|
|
if( sUnicodeTagTable[ hiByte ] )
|
|
*byteBuffer++ = (uint8_t) UQUOTEU;
|
|
|
|
*byteBuffer++ = (uint8_t) hiByte;
|
|
*byteBuffer++ = (uint8_t) loByte;
|
|
}
|
|
|
|
/* bytes less than 0x80 can go straight in the stream,
|
|
but in single-byte mode*/
|
|
else if( curUC < 0x0080 ) {
|
|
loByte = curUC;
|
|
|
|
/* if two chars in a row below 0x80 and the
|
|
current char is not a single-byte mode tag,
|
|
switch to single-byte mode*/
|
|
if(nextUC != INVALIDCHAR
|
|
&& nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) {
|
|
/* make sure there is enough room to write
|
|
both bytes and if not, rewind the source stream
|
|
and break out*/
|
|
if( (byteBuffer + 1) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
/* use window 0, but any would work*/
|
|
*byteBuffer++ = (uint8_t) UCHANGE0;
|
|
*byteBuffer++ = (uint8_t) loByte;
|
|
|
|
comp->fCurrentWindow = 0;
|
|
comp->fTimeStamps [0] = ++(comp->fTimeStamp);
|
|
comp->fMode = SINGLEBYTEMODE;
|
|
|
|
/* use a goto here for speed, to avoid having
|
|
to check fMode in the while loop at the top
|
|
of the case */
|
|
goto mainLoop;
|
|
}
|
|
|
|
/* otherwise, just write the bytes to the stream
|
|
(this will cover the case of only 1 char less
|
|
than 0x80 and single-byte mode tags)*/
|
|
else {
|
|
/* make sure there is enough room to write
|
|
both bytes and if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 1) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
/* since the character is less than 0x80, the
|
|
high byte is always 0x00 - no need for
|
|
(curUC >> 8)*/
|
|
*byteBuffer++ = (uint8_t) 0x00;
|
|
*byteBuffer++ = (uint8_t) loByte;
|
|
}
|
|
}
|
|
|
|
/* figure out if the current unichar is in a defined
|
|
window*/
|
|
else if( (whichWindow = scsu_findDynamicWindow(comp, curUC))
|
|
!= INVALIDWINDOW ) {
|
|
/* if two chars in a row in the same window,
|
|
switch to that window and go to single-byte
|
|
mode inDynamicWindow will return FALSE for
|
|
INVALIDCHAR*/
|
|
if( scsu_inDynamicWindow(comp, nextUC, whichWindow) ) {
|
|
/* make sure there is enough room to write
|
|
both bytes if not, rewind the source stream
|
|
and break out*/
|
|
if( (byteBuffer + 1) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
*byteBuffer++ = (uint8_t) (UCHANGE0 + whichWindow);
|
|
*byteBuffer++ = (uint8_t)
|
|
(curUC - comp->fOffsets[whichWindow]
|
|
+ COMPRESSIONOFFSET);
|
|
|
|
comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp);
|
|
comp->fCurrentWindow = whichWindow;
|
|
comp->fMode = SINGLEBYTEMODE;
|
|
|
|
/* use a goto here for speed, to avoid having
|
|
to check fMode in the while loop at the top
|
|
of the case */
|
|
goto mainLoop;
|
|
}
|
|
|
|
/* otherwise, just quote the unicode for the
|
|
char*/
|
|
else {
|
|
/* make sure there is enough room to write all
|
|
three bytes and if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 2) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
hiByte = curUC >> 8;
|
|
loByte = curUC;
|
|
|
|
/* add quote Unicode tag*/
|
|
if( sUnicodeTagTable[ hiByte ] )
|
|
*byteBuffer++ = (uint8_t) UQUOTEU;
|
|
|
|
*byteBuffer++ = (uint8_t) hiByte;
|
|
*byteBuffer++ = (uint8_t) loByte;
|
|
}
|
|
}
|
|
|
|
/* char is not in a defined window*/
|
|
else {
|
|
/* determine index for current char (char is
|
|
compressible)*/
|
|
curIndex = scsu_makeIndex(curUC);
|
|
comp->fIndexCount[curIndex]++;
|
|
|
|
/* look ahead*/
|
|
if( (unicharBuffer + 1) < sourceLimit )
|
|
forwardUC = *unicharBuffer;
|
|
else
|
|
forwardUC = INVALIDCHAR;
|
|
|
|
/* if we have encountered this index at least once
|
|
before, define a new window for it that hasn't
|
|
previously been redefined*/
|
|
if( comp->fIndexCount[curIndex] > 1 ) {
|
|
/* make sure there is enough room to write all
|
|
three bytes if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 2) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
/* get least recently defined window*/
|
|
whichWindow = scsu_getLRDefinedWindow(comp);
|
|
|
|
*byteBuffer++ = (uint8_t) (UDEFINE0 + whichWindow);
|
|
*byteBuffer++ = (uint8_t) curIndex;
|
|
*byteBuffer++ = (uint8_t)
|
|
(curUC - sOffsetTable[curIndex]
|
|
+ COMPRESSIONOFFSET);
|
|
|
|
comp->fOffsets[whichWindow] = sOffsetTable[curIndex];
|
|
comp->fCurrentWindow = whichWindow;
|
|
comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp);
|
|
comp->fMode = SINGLEBYTEMODE;
|
|
|
|
/* use a goto here for speed, to avoid having
|
|
to check fMode in the while loop at the top
|
|
of the case */
|
|
goto mainLoop;
|
|
}
|
|
|
|
/* if three chars in a row with the same index,
|
|
define a new window makeIndex will return
|
|
RESERVEDINDEX for INVALIDCHAR*/
|
|
else if( curIndex == scsu_makeIndex(nextUC)
|
|
&& curIndex == scsu_makeIndex(forwardUC) ) {
|
|
/* make sure there is enough room to write all
|
|
three bytes if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 2) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
whichWindow = scsu_getLRDefinedWindow(comp);
|
|
|
|
*byteBuffer++ = (uint8_t) (UDEFINE0 + whichWindow);
|
|
*byteBuffer++ = (uint8_t) curIndex;
|
|
*byteBuffer++ = (uint8_t)
|
|
(curUC - sOffsetTable[curIndex]
|
|
+ COMPRESSIONOFFSET);
|
|
|
|
comp->fOffsets[whichWindow] = sOffsetTable[curIndex];
|
|
comp->fCurrentWindow = whichWindow;
|
|
comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp);
|
|
comp->fMode = SINGLEBYTEMODE;
|
|
|
|
/* use a goto here for speed, to avoid having
|
|
to check fMode in the while loop at the top
|
|
of the case */
|
|
goto mainLoop;
|
|
}
|
|
|
|
/* otherwise just quote the unicode, and save our
|
|
windows for longer runs*/
|
|
else {
|
|
/* make sure there is enough room to write all
|
|
three bytes and if not, rewind the source
|
|
stream and break out*/
|
|
if( (byteBuffer + 2) >= targetLimit) {
|
|
--unicharBuffer;
|
|
goto finish;
|
|
}
|
|
|
|
hiByte = curUC >> 8;
|
|
loByte = curUC;
|
|
|
|
/* add quote Unicode tag*/
|
|
if( sUnicodeTagTable[ hiByte ] )
|
|
*byteBuffer++ = (uint8_t) UQUOTEU;
|
|
|
|
*byteBuffer++ = (uint8_t) hiByte;
|
|
*byteBuffer++ = (uint8_t) loByte;
|
|
}
|
|
}
|
|
}
|
|
} /* end switch*/
|
|
}
|
|
|
|
finish:
|
|
|
|
/* fill in output parameters*/
|
|
*target = byteBuffer;
|
|
*source = unicharBuffer;
|
|
|
|
if(unicharBuffer < sourceLimit)
|
|
*status = U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
|
|
void
|
|
scsu_decompress(UnicodeCompressor *comp,
|
|
UChar **target,
|
|
const UChar *targetLimit,
|
|
const uint8_t **source,
|
|
const uint8_t *sourceLimit,
|
|
UErrorCode *status)
|
|
{
|
|
/* the current position in the source byte buffer*/
|
|
const uint8_t *byteBuffer = *source;
|
|
|
|
/* the current position in the target unichar buffer*/
|
|
UChar *unicharBuffer = *target;
|
|
|
|
/* the current byte from the source buffer*/
|
|
int32_t aByte = 0x00;
|
|
|
|
/* temporary for calculating surrogate pairs */
|
|
int32_t normalizedBase;
|
|
|
|
/* temporary used for look-ahead */
|
|
int32_t dByte;
|
|
|
|
|
|
/* verify we weren't passed a failing error code */
|
|
if(U_FAILURE(*status)) {
|
|
return;
|
|
}
|
|
/* verify the target buffer can hold at least 1 UChar */
|
|
else if(targetLimit - unicharBuffer < sizeof(UChar)) {
|
|
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
|
|
/* if our internal buffer isn't empty, flush its contents
|
|
to the output buffer before doing any more decompression */
|
|
if(comp->fBufferLength > 0) {
|
|
|
|
int32_t newBytes = 0;
|
|
const uint8_t *newSource = comp->fBuffer;
|
|
const uint8_t *newSourceLimit = comp->fBuffer + USCSU_BUFSIZE;
|
|
|
|
/* fill the buffer completely, to guarantee one full character */
|
|
if(comp->fBufferLength != USCSU_BUFSIZE) {
|
|
newBytes = USCSU_BUFSIZE - comp->fBufferLength;
|
|
|
|
/* verify there are newBytes bytes in byteBuffer */
|
|
if(sourceLimit - byteBuffer < newBytes)
|
|
newBytes = sourceLimit - byteBuffer;
|
|
|
|
uprv_memcpy(comp->fBuffer + comp->fBufferLength, byteBuffer, newBytes);
|
|
}
|
|
|
|
/* reset buffer length to 0 before recursive call */
|
|
comp->fBufferLength = 0;
|
|
|
|
/* call self recursively to decompress the buffer */
|
|
scsu_decompress(comp, &unicharBuffer, targetLimit,
|
|
&newSource, newSourceLimit, status);
|
|
|
|
/* update the positions into the arrays */
|
|
/* unicharBuffer was updated by the call to decompress above */
|
|
byteBuffer += newBytes;
|
|
}
|
|
|
|
/* the main decompression loop*/
|
|
mainLoop:
|
|
while(byteBuffer < sourceLimit && unicharBuffer < targetLimit) {
|
|
|
|
switch(comp->fMode) {
|
|
|
|
/* single-byte mode decompression loop*/
|
|
case SINGLEBYTEMODE:
|
|
while(byteBuffer < sourceLimit && unicharBuffer < targetLimit) {
|
|
|
|
/* get the next byte */
|
|
aByte = *byteBuffer++;
|
|
|
|
switch(aByte) {
|
|
/* All bytes from 0x80 through 0xFF are remapped to
|
|
chars or surrogate pairs according to the currently
|
|
active window */
|
|
case 0x80: case 0x81: case 0x82: case 0x83: case 0x84:
|
|
case 0x85: case 0x86: case 0x87: case 0x88: case 0x89:
|
|
case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E:
|
|
case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93:
|
|
case 0x94: case 0x95: case 0x96: case 0x97: case 0x98:
|
|
case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D:
|
|
case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2:
|
|
case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7:
|
|
case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC:
|
|
case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1:
|
|
case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6:
|
|
case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB:
|
|
case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0:
|
|
case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5:
|
|
case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA:
|
|
case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF:
|
|
case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4:
|
|
case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9:
|
|
case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE:
|
|
case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3:
|
|
case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8:
|
|
case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED:
|
|
case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2:
|
|
case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7:
|
|
case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC:
|
|
case 0xFD: case 0xFE: case 0xFF:
|
|
|
|
/* For offsets <= 0xFFFF, convert to a single char by
|
|
adding the window's offset and subtracting the
|
|
generic compression offset*/
|
|
if(comp->fOffsets[ comp->fCurrentWindow ] <= 0xFFFF) {
|
|
*unicharBuffer++ = (UChar)
|
|
(aByte + comp->fOffsets[comp->fCurrentWindow]
|
|
- COMPRESSIONOFFSET);
|
|
}
|
|
/* For offsets > 0x10000, convert to a surrogate pair by
|
|
normBase = window's offset - 0x10000
|
|
high surrogate = 0xD800 + (normBase >> 10)
|
|
low surrogate = 0xDC00 + (normBase & 0x3FF)
|
|
+ (byte & 0x7F) */
|
|
else {
|
|
/* make sure there is enough room to write
|
|
both characters
|
|
if not, save state and break out */
|
|
if((unicharBuffer + 1) >= targetLimit) {
|
|
--byteBuffer;
|
|
uprv_memcpy(comp->fBuffer, byteBuffer,
|
|
sourceLimit - byteBuffer);
|
|
comp->fBufferLength = sourceLimit - byteBuffer;
|
|
byteBuffer += comp->fBufferLength;
|
|
goto finish;
|
|
}
|
|
|
|
normalizedBase = comp->fOffsets[comp->fCurrentWindow]
|
|
- 0x10000;
|
|
*unicharBuffer++ =
|
|
(UChar) (0xD800 + (normalizedBase >> 10));
|
|
*unicharBuffer++ = (UChar)
|
|
(0xDC00 + (normalizedBase & 0x3FF)
|
|
+ (aByte & 0x7F));
|
|
}
|
|
break;
|
|
|
|
/* bytes from 0x20 through 0x7F are treated as ASCII
|
|
and are remapped to chars by padding the high byte
|
|
(this is the same as quoting from static window 0)
|
|
NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D) are
|
|
treated as ASCII as well*/
|
|
case 0x00: case 0x09: case 0x0A: case 0x0D:
|
|
case 0x20: case 0x21: case 0x22: case 0x23: case 0x24:
|
|
case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:
|
|
case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E:
|
|
case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33:
|
|
case 0x34: case 0x35: case 0x36: case 0x37: case 0x38:
|
|
case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D:
|
|
case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42:
|
|
case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
|
|
case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:
|
|
case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51:
|
|
case 0x52: case 0x53: case 0x54: case 0x55: case 0x56:
|
|
case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B:
|
|
case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60:
|
|
case 0x61: case 0x62: case 0x63: case 0x64: case 0x65:
|
|
case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A:
|
|
case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:
|
|
case 0x70: case 0x71: case 0x72: case 0x73: case 0x74:
|
|
case 0x75: case 0x76: case 0x77: case 0x78: case 0x79:
|
|
case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E:
|
|
case 0x7F:
|
|
*unicharBuffer++ = (UChar) aByte;
|
|
break;
|
|
|
|
/* quote unicode*/
|
|
case SQUOTEU:
|
|
/* verify we have two bytes following tag and if not,
|
|
rewind the source stream and break out */
|
|
if( (byteBuffer + 1) >= sourceLimit ) {
|
|
--byteBuffer;
|
|
uprv_memcpy(comp->fBuffer, byteBuffer,
|
|
sourceLimit - byteBuffer);
|
|
comp->fBufferLength = sourceLimit - byteBuffer;
|
|
byteBuffer += comp->fBufferLength;
|
|
goto finish;
|
|
}
|
|
|
|
aByte = *byteBuffer++;
|
|
*unicharBuffer++ =
|
|
(UChar) (aByte << 8 | *byteBuffer++);
|
|
break;
|
|
|
|
/* switch to Unicode mode*/
|
|
case SCHANGEU:
|
|
comp->fMode = UNICODEMODE;
|
|
/* use a goto here for speed, to avoid having to check
|
|
fMode in the while loop at the top of the case */
|
|
goto mainLoop;
|
|
break;
|
|
|
|
/* handle all quote tags*/
|
|
case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3:
|
|
case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7:
|
|
/* verify there is a byte following the tag and if
|
|
not, rewind the source stream and break out*/
|
|
if( byteBuffer >= sourceLimit ) {
|
|
--byteBuffer;
|
|
uprv_memcpy(comp->fBuffer, byteBuffer,
|
|
sourceLimit - byteBuffer);
|
|
comp->fBufferLength = sourceLimit - byteBuffer;
|
|
byteBuffer += comp->fBufferLength;
|
|
goto finish;
|
|
}
|
|
|
|
/* if the byte is in the range 0x00 - 0x7F, use static
|
|
window n- otherwise, use dynamic window n */
|
|
dByte = *byteBuffer++;
|
|
*unicharBuffer++ = (UChar)
|
|
(dByte + (dByte >= 0x00 && dByte < 0x80
|
|
? sOffsets[aByte - SQUOTE0]
|
|
: (comp->fOffsets[aByte - SQUOTE0]
|
|
- COMPRESSIONOFFSET)));
|
|
break;
|
|
|
|
/* handle all change tags*/
|
|
case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3:
|
|
case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7:
|
|
comp->fCurrentWindow = (aByte - SCHANGE0);
|
|
break;
|
|
|
|
/* handle all define tags*/
|
|
case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3:
|
|
case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7:
|
|
/* verify there is a byte following the tag and if
|
|
not, rewind the source stream and break out*/
|
|
if( byteBuffer >= sourceLimit ) {
|
|
--byteBuffer;
|
|
uprv_memcpy(comp->fBuffer, byteBuffer,
|
|
sourceLimit - byteBuffer);
|
|
comp->fBufferLength = sourceLimit - byteBuffer;
|
|
byteBuffer += comp->fBufferLength;
|
|
goto finish;
|
|
}
|
|
|
|
comp->fCurrentWindow = (aByte - SDEFINE0);
|
|
comp->fOffsets[comp->fCurrentWindow] =
|
|
sOffsetTable[*byteBuffer++];
|
|
break;
|
|
|
|
/* handle define extended tag*/
|
|
case SDEFINEX:
|
|
/* verify we have two bytes following tag and if not,
|
|
rewind the source stream and break out*/
|
|
if( (byteBuffer + 1) >= sourceLimit ) {
|
|
--byteBuffer;
|
|
uprv_memcpy(comp->fBuffer, byteBuffer,
|
|
sourceLimit - byteBuffer);
|
|
comp->fBufferLength = sourceLimit - byteBuffer;
|
|
byteBuffer += comp->fBufferLength;
|
|
goto finish;
|
|
}
|
|
|
|
aByte = *byteBuffer++;
|
|
comp->fCurrentWindow = (aByte & 0xE0) >> 5;
|
|
comp->fOffsets[comp->fCurrentWindow] = 0x10000
|
|
+ (0x80
|
|
* (((aByte & 0x1F) << 8) | *byteBuffer++));
|
|
break;
|
|
|
|
/* reserved, shouldn't happen*/
|
|
case SRESERVED:
|
|
break;
|
|
|
|
} /* end switch*/
|
|
} /* end while*/
|
|
break;
|
|
|
|
/* unicode mode decompression loop*/
|
|
case UNICODEMODE:
|
|
while( byteBuffer < sourceLimit && unicharBuffer < targetLimit ) {
|
|
|
|
/* get the next byte */
|
|
aByte = *byteBuffer++;
|
|
|
|
switch( aByte ) {
|
|
/* handle all define tags*/
|
|
case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3:
|
|
case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7:
|
|
/* verify there is a byte following tag and if not,
|
|
rewind the source stream and break out*/
|
|
if( byteBuffer >= sourceLimit ) {
|
|
--byteBuffer;
|
|
uprv_memcpy(comp->fBuffer, byteBuffer,
|
|
sourceLimit - byteBuffer);
|
|
comp->fBufferLength = sourceLimit - byteBuffer;
|
|
byteBuffer += comp->fBufferLength;
|
|
goto finish;
|
|
}
|
|
|
|
comp->fCurrentWindow = (aByte - UDEFINE0);
|
|
comp->fOffsets[comp->fCurrentWindow] =
|
|
sOffsetTable[*byteBuffer++];
|
|
comp->fMode = SINGLEBYTEMODE;
|
|
/* use a goto here for speed, to avoid having to check
|
|
fMode in the while loop at the top of the case */
|
|
goto mainLoop;
|
|
break;
|
|
|
|
/* handle define extended tag*/
|
|
case UDEFINEX:
|
|
/* verify we have two bytes following tag if not,
|
|
rewind the source stream and break out*/
|
|
if( (byteBuffer + 1) >= sourceLimit ) {
|
|
--byteBuffer;
|
|
uprv_memcpy(comp->fBuffer, byteBuffer,
|
|
sourceLimit - byteBuffer);
|
|
comp->fBufferLength = sourceLimit - byteBuffer;
|
|
byteBuffer += comp->fBufferLength;
|
|
goto finish;
|
|
}
|
|
|
|
aByte = *byteBuffer++;
|
|
comp->fCurrentWindow = (aByte & 0xE0) >> 5;
|
|
comp->fOffsets[comp->fCurrentWindow] = 0x10000
|
|
+ (0x80
|
|
* (((aByte & 0x1F) << 8) | *byteBuffer++));
|
|
comp->fMode = SINGLEBYTEMODE;
|
|
/* use a goto here for speed, to avoid having to check
|
|
fMode in the while loop at the top of the case */
|
|
goto mainLoop;
|
|
break;
|
|
|
|
/* handle all change tags*/
|
|
case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3:
|
|
case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7:
|
|
comp->fCurrentWindow = (aByte - UCHANGE0);
|
|
comp->fMode = SINGLEBYTEMODE;
|
|
/* use a goto here for speed, to avoid having to check
|
|
fMode in the while loop at the top of the case */
|
|
goto mainLoop;
|
|
break;
|
|
|
|
/* quote unicode*/
|
|
case UQUOTEU:
|
|
/* verify we have two bytes following tag if not,
|
|
rewind the source stream and break out*/
|
|
if( byteBuffer >= sourceLimit - 1) {
|
|
--byteBuffer;
|
|
uprv_memcpy(comp->fBuffer, byteBuffer,
|
|
sourceLimit - byteBuffer);
|
|
comp->fBufferLength = sourceLimit - byteBuffer;
|
|
byteBuffer += comp->fBufferLength;
|
|
goto finish;
|
|
}
|
|
|
|
aByte = *byteBuffer++;
|
|
*unicharBuffer++ = (UChar)
|
|
(aByte << 8 | *byteBuffer++);
|
|
break;
|
|
|
|
default:
|
|
/* verify there is a byte following tag if not, rewind
|
|
the source stream and break out*/
|
|
if( byteBuffer >= sourceLimit ) {
|
|
--byteBuffer;
|
|
uprv_memcpy(comp->fBuffer, byteBuffer,
|
|
sourceLimit - byteBuffer);
|
|
comp->fBufferLength = sourceLimit - byteBuffer;
|
|
byteBuffer += comp->fBufferLength;
|
|
goto finish;
|
|
}
|
|
|
|
*unicharBuffer++ = (UChar) (aByte << 8 | *byteBuffer++);
|
|
break;
|
|
|
|
} /* end switch*/
|
|
} /* end while*/
|
|
break;
|
|
|
|
} /* end switch( comp->fMode )*/
|
|
} /* end while*/
|
|
|
|
|
|
finish:
|
|
|
|
/* fill in return values*/
|
|
*target = unicharBuffer;
|
|
*source = byteBuffer;
|
|
|
|
if(byteBuffer < sourceLimit)
|
|
*status = U_BUFFER_OVERFLOW_ERROR;
|
|
}
|
|
|
|
/** Reset the compressor to its initial state. */
|
|
void
|
|
scsu_reset(UnicodeCompressor *comp)
|
|
{
|
|
int32_t i;
|
|
|
|
/* reset dynamic windows*/
|
|
comp->fOffsets[0] = 0x0080; /* Latin-1*/
|
|
comp->fOffsets[1] = 0x00C0; /* Latin-1 Supplement + Latin Extended-A*/
|
|
comp->fOffsets[2] = 0x0400; /* Cyrillic*/
|
|
comp->fOffsets[3] = 0x0600; /* Arabic*/
|
|
comp->fOffsets[4] = 0x0900; /* Devanagari*/
|
|
comp->fOffsets[5] = 0x3040; /* Hiragana*/
|
|
comp->fOffsets[6] = 0x30A0; /* Katakana*/
|
|
comp->fOffsets[7] = 0xFF00; /* Fullwidth ASCII*/
|
|
|
|
/* reset time stamps*/
|
|
for(i = 0; i < USCSU_NUM_WINDOWS; i++) {
|
|
comp->fTimeStamps[i] = 0;
|
|
}
|
|
|
|
/* reset count of seen indices*/
|
|
for( i = 0; i <= USCSU_MAX_INDEX; i++ ) {
|
|
comp->fIndexCount[i] = 0;
|
|
}
|
|
|
|
comp->fTimeStamp = 0; /* Reset current time stamp*/
|
|
comp->fCurrentWindow = 0; /* Make current window Latin-1*/
|
|
comp->fMode = SINGLEBYTEMODE; /* Start in single-byte mode*/
|
|
comp->fBufferLength = 0; /* Empty buffer */
|
|
}
|
|
|
|
/**
|
|
* Create the index value for a character.
|
|
* For more information on this function, refer to table X-3
|
|
* <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
|
|
* @param c The character in question.
|
|
* @return An index for c
|
|
*/
|
|
static int32_t
|
|
scsu_makeIndex(int32_t c)
|
|
{
|
|
/* check the predefined indices*/
|
|
if( c >= 0x00C0 && c < 0x0140)
|
|
return LATININDEX;
|
|
else if( c >= 0x0250 && c < 0x02D0 )
|
|
return IPAEXTENSIONINDEX;
|
|
else if( c >= 0x0370 && c < 0x03F0 )
|
|
return GREEKINDEX;
|
|
else if( c >= 0x0530 && c < 0x0590 )
|
|
return ARMENIANINDEX;
|
|
else if( c >= 0x3040 && c < 0x30A0 )
|
|
return HIRAGANAINDEX;
|
|
else if( c >= 0x30A0 && c < 0x3120)
|
|
return KATAKANAINDEX;
|
|
else if( c >= 0xFF60 && c < 0xFF9F )
|
|
return HALFWIDTHKATAKANAINDEX;
|
|
|
|
/* calculate index*/
|
|
else if( c >= 0x0080 && c < 0x3400 )
|
|
return (c / 0x80) & 0xFF;
|
|
else if( c >= 0xE000 && c <= 0xFFFF )
|
|
return ((c - 0xAC00) / 0x80) & 0xFF;
|
|
|
|
/* should never happen*/
|
|
else {
|
|
return RESERVEDINDEX;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Determine if a character is in a dynamic window.
|
|
* @param c The character to test
|
|
* @param whichWindow The dynamic window the test
|
|
* @return TRUE if <TT>c</TT> will fit in <TT>whichWindow</TT>, FALSE
|
|
* otherwise.
|
|
*/
|
|
static UBool
|
|
scsu_inDynamicWindow(const UnicodeCompressor *comp,
|
|
int32_t c,
|
|
int32_t whichWindow)
|
|
{
|
|
return (UBool)(c >= comp->fOffsets[whichWindow]
|
|
&& c < (comp->fOffsets[whichWindow] + 0x80));
|
|
}
|
|
|
|
/**
|
|
* Determine if a character is in a static window.
|
|
* @param c The character to test
|
|
* @param whichWindow The static window the test
|
|
* @return TRUE if <TT>c</TT> will fit in <TT>whichWindow</TT>, FALSE
|
|
* otherwise.
|
|
*/
|
|
static UBool
|
|
scsu_inStaticWindow(int32_t c,
|
|
int32_t whichWindow)
|
|
{
|
|
return (UBool)(c >= sOffsets[whichWindow] && c < (sOffsets[whichWindow] + 0x80));
|
|
}
|
|
|
|
/**
|
|
* Determine if a character is compressible.
|
|
* @param c The character to test.
|
|
* @return TRUE if the <TT>c</TT> is compressible, FALSE otherwise.
|
|
*/
|
|
static UBool
|
|
scsu_isCompressible(int32_t c)
|
|
{
|
|
return (UBool)(c < 0x3400 || c >= 0xE000);
|
|
}
|
|
|
|
/**
|
|
* Determine if a dynamic window for a certain character is defined
|
|
* @param c The character in question
|
|
* @return The dynamic window containing <TT>c</TT>, or INVALIDWINDOW if
|
|
* not defined.
|
|
*/
|
|
static int32_t
|
|
scsu_findDynamicWindow(const UnicodeCompressor *comp,
|
|
int32_t c)
|
|
{
|
|
int32_t i;
|
|
|
|
for(i = 0; i < USCSU_NUM_WINDOWS; i++) {
|
|
if(scsu_inDynamicWindow(comp, c, i)) {
|
|
return i;
|
|
}
|
|
}
|
|
|
|
return INVALIDWINDOW;
|
|
}
|
|
|
|
/**
|
|
* Determine if a static window for a certain character is defined
|
|
* @param c The character in question
|
|
* @return The static window containing <TT>c</TT>, or INVALIDWINDOW if
|
|
* not defined.
|
|
*/
|
|
static int32_t
|
|
scsu_findStaticWindow(int32_t c)
|
|
{
|
|
int32_t i;
|
|
|
|
for(i = 0; i < USCSU_NUM_STATIC_WINDOWS; i++) {
|
|
if(scsu_inStaticWindow(c, i)) {
|
|
return i;
|
|
}
|
|
}
|
|
|
|
return INVALIDWINDOW;
|
|
}
|
|
|
|
/** Find the least-recently defined window */
|
|
static int32_t
|
|
scsu_getLRDefinedWindow(const UnicodeCompressor *comp)
|
|
{
|
|
int32_t leastRU = INT32_MAX;
|
|
int32_t whichWindow = INVALIDWINDOW;
|
|
int32_t i;
|
|
|
|
/* find least recently used window*/
|
|
for(i = 0; i < USCSU_NUM_WINDOWS; i++ ) {
|
|
if(comp->fTimeStamps[i] < leastRU) {
|
|
leastRU = comp->fTimeStamps[i];
|
|
whichWindow = i;
|
|
}
|
|
}
|
|
|
|
return whichWindow;
|
|
}
|