scuffed-code/icu4c/source/common/scsu.c

1400 lines
43 KiB
C
Raw Normal View History

1999-08-16 21:50:52 +00:00
/*
******************************************************************************
*
* Copyright (C) 1999-2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
1999-08-16 21:50:52 +00:00
*
* File scsu.c
*
* Modification History:
*
* Date Name Description
* 05/17/99 stephen Creation (ported from java UnicodeCompressor.java)
* 09/21/99 stephen Updated to handle data splits on decompression.
******************************************************************************
1999-08-16 21:50:52 +00:00
*/
#include "unicode/scsu.h"
1999-08-16 21:50:52 +00:00
#include "cmemory.h"
1999-08-16 21:50:52 +00:00
/* Generic window shift: a character inside a dynamic window is emitted as
   (character - window offset) + COMPRESSIONOFFSET */
#define COMPRESSIONOFFSET 0x80

/* Indicates a window index is invalid.  Negative replacement lists are
   parenthesized so the macro always expands to a single operand. */
#define INVALIDWINDOW (-1)

/* Indicates a character doesn't exist in input (used for look-ahead past
   the end of the source buffer) */
#define INVALIDCHAR (-1)

/* Compression modes (values stored in the compressor's fMode field) */
#define SINGLEBYTEMODE 0
#define UNICODEMODE 1

/* Reserved index value */
#define RESERVEDINDEX 0x00

/* Indices for scripts which cross a half-block boundary */
#define LATININDEX 0xF9
#define IPAEXTENSIONINDEX 0xFA
#define GREEKINDEX 0xFB
#define ARMENIANINDEX 0xFC
#define HIRAGANAINDEX 0xFD
#define KATAKANAINDEX 0xFE
#define HALFWIDTHKATAKANAINDEX 0xFF

/* Single-byte mode tags */
#define SDEFINEX 0x0B
/* 0x0C is a reserved value */
#define SRESERVED 0x0C
#define SQUOTEU 0x0E
#define SCHANGEU 0x0F

/* SQUOTEn: quote one character from window n (SQUOTE0 + n) */
#define SQUOTE0 0x01
#define SQUOTE1 0x02
#define SQUOTE2 0x03
#define SQUOTE3 0x04
#define SQUOTE4 0x05
#define SQUOTE5 0x06
#define SQUOTE6 0x07
#define SQUOTE7 0x08

/* SCHANGEn: switch to dynamic window n (SCHANGE0 + n) */
#define SCHANGE0 0x10
#define SCHANGE1 0x11
#define SCHANGE2 0x12
#define SCHANGE3 0x13
#define SCHANGE4 0x14
#define SCHANGE5 0x15
#define SCHANGE6 0x16
#define SCHANGE7 0x17

/* SDEFINEn: define dynamic window n and switch to it (SDEFINE0 + n) */
#define SDEFINE0 0x18
#define SDEFINE1 0x19
#define SDEFINE2 0x1A
#define SDEFINE3 0x1B
#define SDEFINE4 0x1C
#define SDEFINE5 0x1D
#define SDEFINE6 0x1E
#define SDEFINE7 0x1F

/* Unicode mode tags */

/* UCHANGEn: switch to dynamic window n and to single-byte mode
   (UCHANGE0 + n) */
#define UCHANGE0 0xE0
#define UCHANGE1 0xE1
#define UCHANGE2 0xE2
#define UCHANGE3 0xE3
#define UCHANGE4 0xE4
#define UCHANGE5 0xE5
#define UCHANGE6 0xE6
#define UCHANGE7 0xE7

/* UDEFINEn: define dynamic window n and switch to single-byte mode
   (UDEFINE0 + n) */
#define UDEFINE0 0xE8
#define UDEFINE1 0xE9
#define UDEFINE2 0xEA
#define UDEFINE3 0xEB
#define UDEFINE4 0xEC
#define UDEFINE5 0xED
#define UDEFINE6 0xEE
#define UDEFINE7 0xEF

#define UQUOTEU 0xF0
#define UDEFINEX 0xF1
/* 0xF2 is a reserved value */
#define URESERVED 0xF2
/* Local function prototypes */

/* Map a character to its window index; the result is used to index
   sOffsetTable and the compressor's fIndexCount array. */
static int32_t scsu_makeIndex(int32_t c);

/* Test whether c falls in dynamic window whichWindow of comp. */
static UBool scsu_inDynamicWindow(const UnicodeCompressor *comp,
                                  int32_t c,
                                  int32_t whichWindow);

/* Test whether c falls in static window whichWindow. */
static UBool scsu_inStaticWindow(int32_t c,
                                 int32_t whichWindow);

/* Test whether c is in the compressible range. */
static UBool scsu_isCompressible(int32_t c);

/* Find a currently defined dynamic window containing c; callers compare
   the result against INVALIDWINDOW. */
static int32_t scsu_findDynamicWindow(const UnicodeCompressor *comp,
                                      int32_t c);

/* Find a static window containing c; callers compare the result against
   INVALIDWINDOW. */
static int32_t scsu_findStaticWindow(int32_t c);

/* Pick the dynamic window to redefine (the least recently defined one). */
static int32_t scsu_getLRDefinedWindow(const UnicodeCompressor *comp);
/* Static tables generated by CompressionTableGenerator */

/**
 * For window offset mapping: sOffsetTable[index] is the window start offset
 * written into the compressor's fOffsets when a window is defined with that
 * index byte.  The table is read-only and has exactly 256 entries, laid out
 * eight per row; the comment on each row gives the index of its first entry.
 */
static const int32_t sOffsetTable [] = {
    /* 0x00 */ 0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
    /* 0x08 */ 0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
    /* 0x10 */ 0x0800, 0x0880, 0x0900, 0x0980, 0x0a00, 0x0a80, 0x0b00, 0x0b80,
    /* 0x18 */ 0x0c00, 0x0c80, 0x0d00, 0x0d80, 0x0e00, 0x0e80, 0x0f00, 0x0f80,
    /* 0x20 */ 0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
    /* 0x28 */ 0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
    /* 0x30 */ 0x1800, 0x1880, 0x1900, 0x1980, 0x1a00, 0x1a80, 0x1b00, 0x1b80,
    /* 0x38 */ 0x1c00, 0x1c80, 0x1d00, 0x1d80, 0x1e00, 0x1e80, 0x1f00, 0x1f80,
    /* 0x40 */ 0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
    /* 0x48 */ 0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
    /* 0x50 */ 0x2800, 0x2880, 0x2900, 0x2980, 0x2a00, 0x2a80, 0x2b00, 0x2b80,
    /* 0x58 */ 0x2c00, 0x2c80, 0x2d00, 0x2d80, 0x2e00, 0x2e80, 0x2f00, 0x2f80,
    /* 0x60 */ 0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3380,
    /* 0x68 */ 0xe000, 0xe080, 0xe100, 0xe180, 0xe200, 0xe280, 0xe300, 0xe380,
    /* 0x70 */ 0xe400, 0xe480, 0xe500, 0xe580, 0xe600, 0xe680, 0xe700, 0xe780,
    /* 0x78 */ 0xe800, 0xe880, 0xe900, 0xe980, 0xea00, 0xea80, 0xeb00, 0xeb80,
    /* 0x80 */ 0xec00, 0xec80, 0xed00, 0xed80, 0xee00, 0xee80, 0xef00, 0xef80,
    /* 0x88 */ 0xf000, 0xf080, 0xf100, 0xf180, 0xf200, 0xf280, 0xf300, 0xf380,
    /* 0x90 */ 0xf400, 0xf480, 0xf500, 0xf580, 0xf600, 0xf680, 0xf700, 0xf780,
    /* 0x98 */ 0xf800, 0xf880, 0xf900, 0xf980, 0xfa00, 0xfa80, 0xfb00, 0xfb80,
    /* 0xA0 */ 0xfc00, 0xfc80, 0xfd00, 0xfd80, 0xfe00, 0xfe80, 0xff00, 0xff80,
    /* 0xA8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xB0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xB8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xC0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xC8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xD0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xD8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xE0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xE8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xF0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    /* 0xF8 */ 0x0000, 0x00c0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30a0, 0xff60
};
/**
 * For quick identification of a byte as a single-byte mode tag.
 * Indexed by byte value (256 entries, eight per row; the comment on each row
 * gives the index of its first entry).  TRUE for 0x01-0x08, 0x0B, 0x0C and
 * 0x0E-0x1F; FALSE everywhere else.  The table is read-only.
 */
static const UBool sSingleTagTable [] = {
    /* 0x00 */ FALSE, TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,
    /* 0x08 */ TRUE,  FALSE, FALSE, TRUE,  TRUE,  FALSE, TRUE,  TRUE,
    /* 0x10 */ TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,
    /* 0x18 */ TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,
    /* 0x20 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x28 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x30 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x38 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x40 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x48 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x50 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x58 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x60 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x68 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x70 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x78 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x80 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x88 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x90 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x98 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xA0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xA8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xB0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xB8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xC0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xC8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xD0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xD8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xE0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xE8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xF0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xF8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
};
/**
 * For quick identification of a byte as a unicode mode tag.
 * Indexed by byte value (256 entries, eight per row; the comment on each row
 * gives the index of its first entry).  TRUE for 0xE0-0xF2 (UCHANGE0 through
 * URESERVED); FALSE everywhere else.  The table is read-only.
 */
static const UBool sUnicodeTagTable [] = {
    /* 0x00 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x08 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x10 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x18 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x20 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x28 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x30 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x38 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x40 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x48 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x50 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x58 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x60 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x68 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x70 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x78 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x80 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x88 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x90 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0x98 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xA0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xA8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xB0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xB8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xC0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xC8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xD0 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xD8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xE0 */ TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,
    /* 0xE8 */ TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,  TRUE,
    /* 0xF0 */ TRUE,  TRUE,  TRUE,  FALSE, FALSE, FALSE, FALSE, FALSE,
    /* 0xF8 */ FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
};
/**
 * Static compression window offsets, indexed by static window number (0-7).
 * A character quoted from static window n is emitted as
 * (character - sOffsets[n]).  The table is read-only.
 */
static const int32_t sOffsets [] = {
    0x0000, /* for quoting single-byte mode tags */
    0x0080, /* Latin-1 Supplement */
    0x0100, /* Latin Extended-A */
    0x0300, /* Combining Diacritical Marks */
    0x2000, /* General Punctuation */
    0x2080, /* Currency Symbols */
    0x2100, /* Letterlike Symbols and Number Forms */
    0x3000  /* CJK Symbols and Punctuation */
};
/**
 * Initialize a compressor.
 *
 * Simply delegates to scsu_reset(), which puts the compressor into its
 * default state.
 *
 * @param comp the compressor to initialize
 */
void
scsu_init(UnicodeCompressor *comp)
{
    /* initialize to defaults */
    scsu_reset(comp);
}
/**
 * Compress a run of UChars into SCSU bytes.
 *
 * The routine alternates between single-byte mode and Unicode mode, as
 * recorded in comp->fMode, and keeps going until either the source or the
 * target buffer is exhausted.  A character is only consumed when every byte
 * of its encoding fits in the target; otherwise the source position is
 * rewound to that character and the routine stops.
 *
 * @param comp        compressor state; fMode, fCurrentWindow, fOffsets,
 *                    fTimeStamps, fTimeStamp and fIndexCount are read and
 *                    updated
 * @param target      in: where to start writing; out: one past the last
 *                    byte written
 * @param targetLimit end of the target buffer; at least 4 bytes must be
 *                    available on entry
 * @param source      in: first UChar to compress; out: first UChar that was
 *                    not consumed
 * @param sourceLimit end of the source buffer
 * @param status      in: if already a failure code nothing is done.
 *                    out: U_ILLEGAL_ARGUMENT_ERROR if fewer than 4 target
 *                    bytes are available or comp->fMode is not a known
 *                    mode; U_BUFFER_OVERFLOW_ERROR if unconsumed source
 *                    remains on return
 */
void
scsu_compress(UnicodeCompressor *comp,
              uint8_t **target,
              const uint8_t *targetLimit,
              const UChar **source,
              const UChar *sourceLimit,
              UErrorCode *status)
{
    /* the current position in the source unichar buffer */
    const UChar *unicharBuffer = *source;

    /* the current position in the target byte buffer */
    uint8_t *byteBuffer = *target;

    /* the current unicode character from the source buffer */
    int32_t curUC = INVALIDCHAR;

    /* the index for the current character */
    int32_t curIndex = -1;

    /* look ahead: the character after curUC, and the one after that */
    int32_t nextUC = INVALIDCHAR;
    int32_t forwardUC = INVALIDCHAR;

    /* temporary for window searching */
    int32_t whichWindow = 0;

    /* high and low bytes of the current unicode character */
    int32_t hiByte = 0;
    int32_t loByte = 0;

    /* verify we weren't passed a failing error code */
    if(U_FAILURE(*status)) {
        return;
    }
    /* verify the target buffer can hold at least 4 bytes (the longest
       sequence emitted for one character) */
    else if(targetLimit - byteBuffer < 4) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* All space checks below are written as pointer differences
       ("fewer than N bytes left") so that no pointer beyond one past the
       end of either buffer is ever formed. */

mainLoop:
    while(unicharBuffer < sourceLimit && byteBuffer < targetLimit) {
        switch(comp->fMode) {

        /* main single byte mode compression loop */
        case SINGLEBYTEMODE:
            while(unicharBuffer < sourceLimit && byteBuffer < targetLimit) {

                /* get current char and peek at the next one */
                curUC = *unicharBuffer++;
                if(unicharBuffer < sourceLimit) {
                    nextUC = *unicharBuffer;
                }
                else {
                    nextUC = INVALIDCHAR;
                }

                /* chars less than 0x0080 (excluding tags) go straight in
                   the stream */
                if(curUC < 0x0080) {
                    loByte = curUC;

                    /* a byte that doubles as a single-byte mode tag must
                       be quoted so it is not misread as a tag */
                    if(sSingleTagTable[loByte]) {
                        /* need room for both bytes; if there is none,
                           rewind the source stream and stop */
                        if(targetLimit - byteBuffer < 2) {
                            --unicharBuffer;
                            goto finish;
                        }
                        /* SQUOTE0 quotes from static window 0, whose
                           offset is 0x0000 */
                        *byteBuffer++ = (uint8_t) SQUOTE0;
                    }

                    *byteBuffer++ = (uint8_t) loByte;
                }

                /* the char belongs to the current window: convert it to a
                   byte by subtracting the window's offset and adding the
                   generic compression offset */
                else if(scsu_inDynamicWindow(comp, curUC,
                                             comp->fCurrentWindow)) {
                    *byteBuffer++ = (uint8_t)
                        (curUC - comp->fOffsets[comp->fCurrentWindow]
                         + COMPRESSIONOFFSET);
                }

                /* the char is not in the compressible range: either quote
                   it from unicode or switch to unicode mode */
                else if(!scsu_isCompressible(curUC)) {
                    /* a compressible char follows, so a one-off quote is
                       enough */
                    if(nextUC != INVALIDCHAR && scsu_isCompressible(nextUC)) {
                        /* need room for all three bytes */
                        if(targetLimit - byteBuffer < 3) {
                            --unicharBuffer;
                            goto finish;
                        }
                        *byteBuffer++ = (uint8_t) SQUOTEU;
                        *byteBuffer++ = (uint8_t) (curUC >> 8);
                        *byteBuffer++ = (uint8_t) curUC;
                    }
                    else {
                        /* need room for up to four bytes */
                        if(targetLimit - byteBuffer < 4) {
                            --unicharBuffer;
                            goto finish;
                        }
                        *byteBuffer++ = (uint8_t) SCHANGEU;

                        hiByte = curUC >> 8;
                        loByte = curUC;

                        /* a high byte that looks like a unicode mode tag
                           must be preceded by the quote tag */
                        if(sUnicodeTagTable[hiByte]) {
                            *byteBuffer++ = (uint8_t) UQUOTEU;
                        }
                        *byteBuffer++ = (uint8_t) hiByte;
                        *byteBuffer++ = (uint8_t) loByte;

                        comp->fMode = UNICODEMODE;

                        /* use a goto here for speed, to avoid having to
                           check fMode in the while loop at the top of the
                           case */
                        goto mainLoop;
                    }
                }

                /* the char is in some other currently defined dynamic
                   window: either switch to that window or quote from it */
                else if((whichWindow = scsu_findDynamicWindow(comp, curUC))
                        != INVALIDWINDOW) {

                    /* look ahead; unicharBuffer already points at nextUC,
                       so the char after that is one further on */
                    if(sourceLimit - unicharBuffer > 1) {
                        forwardUC = *(unicharBuffer + 1);
                    }
                    else {
                        forwardUC = INVALIDCHAR;
                    }

                    /* all three chars in the same window: switch to that
                       window (inDynamicWindow returns FALSE for
                       INVALIDCHAR) */
                    if(scsu_inDynamicWindow(comp, nextUC, whichWindow)
                       && scsu_inDynamicWindow(comp, forwardUC, whichWindow)) {
                        /* need room for both bytes */
                        if(targetLimit - byteBuffer < 2) {
                            --unicharBuffer;
                            goto finish;
                        }
                        *byteBuffer++ = (uint8_t) (SCHANGE0 + whichWindow);
                        *byteBuffer++ = (uint8_t)
                            (curUC - comp->fOffsets[whichWindow]
                             + COMPRESSIONOFFSET);
                        comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp);
                        comp->fCurrentWindow = whichWindow;
                    }
                    /* only the next char, or neither, is in the same
                       window, so quote */
                    else {
                        /* need room for both bytes */
                        if(targetLimit - byteBuffer < 2) {
                            --unicharBuffer;
                            goto finish;
                        }
                        *byteBuffer++ = (uint8_t) (SQUOTE0 + whichWindow);
                        *byteBuffer++ = (uint8_t)
                            (curUC - comp->fOffsets[whichWindow]
                             + COMPRESSIONOFFSET);
                    }
                }

                /* the char is in a static window and the following char is
                   not in that static window: quote from the static window.
                   Note: to quote from a static window, don't add 0x80 */
                else if((whichWindow = scsu_findStaticWindow(curUC))
                        != INVALIDWINDOW
                        && !scsu_inStaticWindow(nextUC, whichWindow)) {
                    /* need room for both bytes */
                    if(targetLimit - byteBuffer < 2) {
                        --unicharBuffer;
                        goto finish;
                    }
                    *byteBuffer++ = (uint8_t) (SQUOTE0 + whichWindow);
                    *byteBuffer++ = (uint8_t) (curUC - sOffsets[whichWindow]);
                }

                /* no window covers the char: decide whether to define a
                   new one or switch to unicode mode */
                else {
                    /* determine index for current char (char is
                       compressible) and count the sighting */
                    curIndex = scsu_makeIndex(curUC);
                    comp->fIndexCount[curIndex]++;

                    /* look ahead */
                    if(sourceLimit - unicharBuffer > 1) {
                        forwardUC = *(unicharBuffer + 1);
                    }
                    else {
                        forwardUC = INVALIDCHAR;
                    }

                    /* define a new window if this index has been seen
                       before, or if three chars in a row share the index
                       (makeIndex returns RESERVEDINDEX for INVALIDCHAR) */
                    if(comp->fIndexCount[curIndex] > 1
                       || (curIndex == scsu_makeIndex(nextUC)
                           && curIndex == scsu_makeIndex(forwardUC))) {
                        /* need room for all three bytes */
                        if(targetLimit - byteBuffer < 3) {
                            /* the char is being handed back, so take back
                               its sighting too; otherwise it would be
                               counted again on the next call */
                            --(comp->fIndexCount[curIndex]);
                            --unicharBuffer;
                            goto finish;
                        }

                        /* redefine the least recently defined window */
                        whichWindow = scsu_getLRDefinedWindow(comp);

                        *byteBuffer++ = (uint8_t) (SDEFINE0 + whichWindow);
                        *byteBuffer++ = (uint8_t) curIndex;
                        *byteBuffer++ = (uint8_t)
                            (curUC - sOffsetTable[curIndex]
                             + COMPRESSIONOFFSET);

                        comp->fOffsets[whichWindow] = sOffsetTable[curIndex];
                        comp->fCurrentWindow = whichWindow;
                        comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp);
                    }
                    /* at most two chars in a row share the index: not
                       worth a window, so switch to unicode mode */
                    else {
                        /* need room for up to four bytes */
                        if(targetLimit - byteBuffer < 4) {
                            --(comp->fIndexCount[curIndex]);
                            --unicharBuffer;
                            goto finish;
                        }

                        *byteBuffer++ = (uint8_t) SCHANGEU;

                        hiByte = curUC >> 8;
                        loByte = curUC;

                        /* add quote Unicode tag */
                        if(sUnicodeTagTable[hiByte]) {
                            *byteBuffer++ = (uint8_t) UQUOTEU;
                        }
                        *byteBuffer++ = (uint8_t) hiByte;
                        *byteBuffer++ = (uint8_t) loByte;

                        comp->fMode = UNICODEMODE;

                        /* use a goto here for speed, to avoid having to
                           check fMode in the while loop at the top of the
                           case */
                        goto mainLoop;
                    }
                }
            }
            break;

        /* main unicode mode compression loop */
        case UNICODEMODE:
            while(unicharBuffer < sourceLimit && byteBuffer < targetLimit) {

                /* get current char and peek at the next one */
                curUC = *unicharBuffer++;
                if(unicharBuffer < sourceLimit) {
                    nextUC = *unicharBuffer;
                }
                else {
                    nextUC = INVALIDCHAR;
                }

                /* the current char, or the one after it, is
                   uncompressible: put the current char's bytes in the
                   stream */
                if(!scsu_isCompressible(curUC)
                   || (nextUC != INVALIDCHAR
                       && !scsu_isCompressible(nextUC))) {
                    /* need room for up to three bytes */
                    if(targetLimit - byteBuffer < 3) {
                        --unicharBuffer;
                        goto finish;
                    }

                    hiByte = curUC >> 8;
                    loByte = curUC;

                    /* add quote Unicode tag */
                    if(sUnicodeTagTable[hiByte]) {
                        *byteBuffer++ = (uint8_t) UQUOTEU;
                    }
                    *byteBuffer++ = (uint8_t) hiByte;
                    *byteBuffer++ = (uint8_t) loByte;
                }

                /* chars less than 0x80 can go straight in the stream, but
                   in single-byte mode */
                else if(curUC < 0x0080) {
                    loByte = curUC;

                    /* two chars in a row below 0x80 and the current char
                       is not a single-byte mode tag: switch to single-byte
                       mode */
                    if(nextUC != INVALIDCHAR
                       && nextUC < 0x0080 && !sSingleTagTable[loByte]) {
                        /* need room for both bytes */
                        if(targetLimit - byteBuffer < 2) {
                            --unicharBuffer;
                            goto finish;
                        }

                        /* use window 0, but any would work */
                        *byteBuffer++ = (uint8_t) UCHANGE0;
                        *byteBuffer++ = (uint8_t) loByte;

                        comp->fCurrentWindow = 0;
                        comp->fTimeStamps[0] = ++(comp->fTimeStamp);
                        comp->fMode = SINGLEBYTEMODE;

                        /* use a goto here for speed, to avoid having to
                           check fMode in the while loop at the top of the
                           case */
                        goto mainLoop;
                    }
                    /* otherwise, just write the bytes to the stream (this
                       covers a lone char below 0x80 and single-byte mode
                       tags) */
                    else {
                        /* need room for both bytes */
                        if(targetLimit - byteBuffer < 2) {
                            --unicharBuffer;
                            goto finish;
                        }

                        /* the character is less than 0x80, so the high
                           byte is always 0x00 */
                        *byteBuffer++ = (uint8_t) 0x00;
                        *byteBuffer++ = (uint8_t) loByte;
                    }
                }

                /* the current char is in a defined dynamic window */
                else if((whichWindow = scsu_findDynamicWindow(comp, curUC))
                        != INVALIDWINDOW) {
                    /* two chars in a row in the same window: switch to
                       that window and go to single-byte mode
                       (inDynamicWindow returns FALSE for INVALIDCHAR) */
                    if(scsu_inDynamicWindow(comp, nextUC, whichWindow)) {
                        /* need room for both bytes */
                        if(targetLimit - byteBuffer < 2) {
                            --unicharBuffer;
                            goto finish;
                        }

                        *byteBuffer++ = (uint8_t) (UCHANGE0 + whichWindow);
                        *byteBuffer++ = (uint8_t)
                            (curUC - comp->fOffsets[whichWindow]
                             + COMPRESSIONOFFSET);

                        comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp);
                        comp->fCurrentWindow = whichWindow;
                        comp->fMode = SINGLEBYTEMODE;

                        /* use a goto here for speed, to avoid having to
                           check fMode in the while loop at the top of the
                           case */
                        goto mainLoop;
                    }
                    /* otherwise, just write the unicode for the char */
                    else {
                        /* need room for up to three bytes */
                        if(targetLimit - byteBuffer < 3) {
                            --unicharBuffer;
                            goto finish;
                        }

                        hiByte = curUC >> 8;
                        loByte = curUC;

                        /* add quote Unicode tag */
                        if(sUnicodeTagTable[hiByte]) {
                            *byteBuffer++ = (uint8_t) UQUOTEU;
                        }
                        *byteBuffer++ = (uint8_t) hiByte;
                        *byteBuffer++ = (uint8_t) loByte;
                    }
                }

                /* the char is not in a defined window */
                else {
                    /* determine index for current char (char is
                       compressible) and count the sighting */
                    curIndex = scsu_makeIndex(curUC);
                    comp->fIndexCount[curIndex]++;

                    /* look ahead; unicharBuffer already points at nextUC,
                       so the third char of the run is one further on */
                    if(sourceLimit - unicharBuffer > 1) {
                        forwardUC = *(unicharBuffer + 1);
                    }
                    else {
                        forwardUC = INVALIDCHAR;
                    }

                    /* define a new window and go to single-byte mode if
                       this index has been seen before, or if three chars
                       in a row share the index (makeIndex returns
                       RESERVEDINDEX for INVALIDCHAR) */
                    if(comp->fIndexCount[curIndex] > 1
                       || (curIndex == scsu_makeIndex(nextUC)
                           && curIndex == scsu_makeIndex(forwardUC))) {
                        /* need room for all three bytes */
                        if(targetLimit - byteBuffer < 3) {
                            /* the char is being handed back, so take back
                               its sighting too; otherwise it would be
                               counted again on the next call */
                            --(comp->fIndexCount[curIndex]);
                            --unicharBuffer;
                            goto finish;
                        }

                        /* redefine the least recently defined window */
                        whichWindow = scsu_getLRDefinedWindow(comp);

                        *byteBuffer++ = (uint8_t) (UDEFINE0 + whichWindow);
                        *byteBuffer++ = (uint8_t) curIndex;
                        *byteBuffer++ = (uint8_t)
                            (curUC - sOffsetTable[curIndex]
                             + COMPRESSIONOFFSET);

                        comp->fOffsets[whichWindow] = sOffsetTable[curIndex];
                        comp->fCurrentWindow = whichWindow;
                        comp->fTimeStamps[whichWindow] = ++(comp->fTimeStamp);
                        comp->fMode = SINGLEBYTEMODE;

                        /* use a goto here for speed, to avoid having to
                           check fMode in the while loop at the top of the
                           case */
                        goto mainLoop;
                    }
                    /* otherwise just write the unicode, and save our
                       windows for longer runs */
                    else {
                        /* need room for up to three bytes */
                        if(targetLimit - byteBuffer < 3) {
                            --(comp->fIndexCount[curIndex]);
                            --unicharBuffer;
                            goto finish;
                        }

                        hiByte = curUC >> 8;
                        loByte = curUC;

                        /* add quote Unicode tag */
                        if(sUnicodeTagTable[hiByte]) {
                            *byteBuffer++ = (uint8_t) UQUOTEU;
                        }
                        *byteBuffer++ = (uint8_t) hiByte;
                        *byteBuffer++ = (uint8_t) loByte;
                    }
                }
            }
            break;

        /* fMode holds neither known mode.  Neither loop above would make
           progress, so report the bad state instead of spinning forever.
           Return directly so the status is not replaced by the overflow
           code below. */
        default:
            *target = byteBuffer;
            *source = unicharBuffer;
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        } /* end switch */
    }

finish:
    /* fill in output parameters */
    *target = byteBuffer;
    *source = unicharBuffer;

    /* anything left in the source means the target filled up first */
    if(unicharBuffer < sourceLimit) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
}
/*
 * Copy the incomplete sequence [seqStart, sourceLimit) to the front of the
 * compressor's internal buffer so that a later call can finish decoding it.
 * Returns sourceLimit, i.e. the source position with every byte consumed.
 *
 * seqStart may point into comp->fBuffer itself (this happens during the
 * recursive flush in scsu_decompress), so the bytes are copied one at a
 * time in ascending order instead of with a memcpy on overlapping storage.
 * Callers only pass a sequence that is shorter than one complete SCSU
 * sequence, so the copy is expected to fit in fBuffer.
 */
static const uint8_t *
scsu_saveIncompleteSequence(UnicodeCompressor *comp,
                            const uint8_t *seqStart,
                            const uint8_t *sourceLimit)
{
    const int32_t length = (int32_t)(sourceLimit - seqStart);
    int32_t i;

    for(i = 0; i < length; i++) {
        comp->fBuffer[i] = seqStart[i];
    }
    comp->fBufferLength = length;
    return sourceLimit;
}

/**
 * Decompress a chunk of SCSU-encoded bytes into UChars.
 *
 * The function may be called repeatedly on consecutive chunks of one
 * stream: the mode, the window state and any tag sequence that was cut off
 * at the end of a chunk are kept in comp between calls.
 *
 * @param comp        The decompressor state.
 * @param target      In/out: position in the output buffer; updated to
 *                    point just past the last UChar written.
 * @param targetLimit One past the end of the output buffer.
 * @param source      In/out: position in the input buffer; updated to
 *                    point just past the last byte consumed.
 * @param sourceLimit One past the end of the input buffer.
 * @param status      In/out error code.  Nothing is done if it already
 *                    indicates a failure.  Set to U_ILLEGAL_ARGUMENT_ERROR
 *                    if the target cannot hold a single UChar, and to
 *                    U_BUFFER_OVERFLOW_ERROR if the function returns before
 *                    all source bytes were consumed.
 */
void
scsu_decompress(UnicodeCompressor *comp,
                UChar **target,
                const UChar *targetLimit,
                const uint8_t **source,
                const uint8_t *sourceLimit,
                UErrorCode *status)
{
    /* the current position in the source byte buffer */
    const uint8_t *byteBuffer = *source;

    /* the current position in the target unichar buffer */
    UChar *unicharBuffer = *target;

    /* the current byte from the source buffer */
    int32_t aByte = 0x00;

    /* temporary for calculating surrogate pairs */
    int32_t normalizedBase;

    /* temporary used for look-ahead */
    int32_t dByte;

    /* verify we weren't passed a failing error code */
    if(U_FAILURE(*status)) {
        return;
    }

    /* verify the target buffer can hold at least 1 UChar; the pointer
       difference is already a count of UChars, not of bytes */
    if(targetLimit - unicharBuffer < 1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* if a previous call left an incomplete sequence in the internal
       buffer, complete it with bytes from the new source and decode it
       before doing any more decompression */
    if(comp->fBufferLength > 0) {
        int32_t pending = comp->fBufferLength;
        int32_t newBytes;
        int32_t leftover;
        const uint8_t *newSource = comp->fBuffer;
        const uint8_t *newSourceLimit;

        /* defensive: never trust a length larger than the buffer */
        if(pending > USCSU_BUFSIZE) {
            pending = USCSU_BUFSIZE;
        }

        /* top the buffer up, to guarantee one full sequence whenever the
           source holds enough bytes */
        newBytes = USCSU_BUFSIZE - pending;
        if(sourceLimit - byteBuffer < newBytes) {
            newBytes = (int32_t)(sourceLimit - byteBuffer);
        }
        if(newBytes > 0) {
            uprv_memcpy(comp->fBuffer + pending, byteBuffer, newBytes);
        }

        /* only the bytes actually present may be decoded; anything past
           this limit is stale data from an earlier call */
        newSourceLimit = comp->fBuffer + pending + newBytes;

        /* reset buffer length to 0 before recursive call */
        comp->fBufferLength = 0;

        /* call self recursively to decompress the buffer;
           unicharBuffer is advanced by the call */
        scsu_decompress(comp, &unicharBuffer, targetLimit,
                        &newSource, newSourceLimit, status);

        /* bytes of the internal buffer that were not decoded: either not
           reached (target full) or saved again as an incomplete sequence */
        leftover = (int32_t)(newSourceLimit - newSource)
            + comp->fBufferLength;

        if(leftover <= newBytes) {
            /* every undecoded byte came from the new source: hand those
               bytes back so the main loop decodes them in stream order */
            comp->fBufferLength = 0;
            byteBuffer += newBytes - leftover;
        }
        else {
            /* the old sequence is still incomplete; the recursive call
               has saved it (including the new bytes) in the buffer */
            byteBuffer += newBytes;
        }
    }

    /* the main decompression loop */
mainLoop:
    while(byteBuffer < sourceLimit && unicharBuffer < targetLimit) {

        switch(comp->fMode) {

        /* single-byte mode decompression loop */
        case SINGLEBYTEMODE:
            while(byteBuffer < sourceLimit && unicharBuffer < targetLimit) {
                /* get the next byte */
                aByte = *byteBuffer++;

                switch(aByte) {
                /* All bytes from 0x80 through 0xFF are remapped to
                   chars or surrogate pairs according to the currently
                   active window */
                case 0x80: case 0x81: case 0x82: case 0x83:
                case 0x84: case 0x85: case 0x86: case 0x87:
                case 0x88: case 0x89: case 0x8A: case 0x8B:
                case 0x8C: case 0x8D: case 0x8E: case 0x8F:
                case 0x90: case 0x91: case 0x92: case 0x93:
                case 0x94: case 0x95: case 0x96: case 0x97:
                case 0x98: case 0x99: case 0x9A: case 0x9B:
                case 0x9C: case 0x9D: case 0x9E: case 0x9F:
                case 0xA0: case 0xA1: case 0xA2: case 0xA3:
                case 0xA4: case 0xA5: case 0xA6: case 0xA7:
                case 0xA8: case 0xA9: case 0xAA: case 0xAB:
                case 0xAC: case 0xAD: case 0xAE: case 0xAF:
                case 0xB0: case 0xB1: case 0xB2: case 0xB3:
                case 0xB4: case 0xB5: case 0xB6: case 0xB7:
                case 0xB8: case 0xB9: case 0xBA: case 0xBB:
                case 0xBC: case 0xBD: case 0xBE: case 0xBF:
                case 0xC0: case 0xC1: case 0xC2: case 0xC3:
                case 0xC4: case 0xC5: case 0xC6: case 0xC7:
                case 0xC8: case 0xC9: case 0xCA: case 0xCB:
                case 0xCC: case 0xCD: case 0xCE: case 0xCF:
                case 0xD0: case 0xD1: case 0xD2: case 0xD3:
                case 0xD4: case 0xD5: case 0xD6: case 0xD7:
                case 0xD8: case 0xD9: case 0xDA: case 0xDB:
                case 0xDC: case 0xDD: case 0xDE: case 0xDF:
                case 0xE0: case 0xE1: case 0xE2: case 0xE3:
                case 0xE4: case 0xE5: case 0xE6: case 0xE7:
                case 0xE8: case 0xE9: case 0xEA: case 0xEB:
                case 0xEC: case 0xED: case 0xEE: case 0xEF:
                case 0xF0: case 0xF1: case 0xF2: case 0xF3:
                case 0xF4: case 0xF5: case 0xF6: case 0xF7:
                case 0xF8: case 0xF9: case 0xFA: case 0xFB:
                case 0xFC: case 0xFD: case 0xFE: case 0xFF:
                    /* For offsets <= 0xFFFF, convert to a single char by
                       adding the window's offset and subtracting the
                       generic compression offset */
                    if(comp->fOffsets[comp->fCurrentWindow] <= 0xFFFF) {
                        *unicharBuffer++ = (UChar)
                            (aByte + comp->fOffsets[comp->fCurrentWindow]
                             - COMPRESSIONOFFSET);
                    }
                    /* For offsets >= 0x10000, convert to a surrogate pair:
                       normBase = window's offset - 0x10000
                       high surrogate = 0xD800 + (normBase >> 10)
                       low surrogate = 0xDC00 + (normBase & 0x3FF)
                                       + (byte & 0x7F) */
                    else {
                        /* make sure there is room to write both chars;
                           if not, un-read the byte and stop.  The byte
                           stays in the source (reported through *source
                           and U_BUFFER_OVERFLOW_ERROR); the rest of the
                           source must NOT be copied into the small
                           internal buffer */
                        if(targetLimit - unicharBuffer < 2) {
                            --byteBuffer;
                            goto finish;
                        }
                        normalizedBase =
                            comp->fOffsets[comp->fCurrentWindow] - 0x10000;
                        *unicharBuffer++ =
                            (UChar) (0xD800 + (normalizedBase >> 10));
                        *unicharBuffer++ = (UChar)
                            (0xDC00 + (normalizedBase & 0x3FF)
                             + (aByte & 0x7F));
                    }
                    break;

                /* bytes from 0x20 through 0x7F are treated as ASCII
                   and are remapped to chars by padding the high byte
                   (this is the same as quoting from static window 0).
                   NUL (0x00), HT (0x09), LF (0x0A), CR (0x0D) are
                   treated as ASCII as well */
                case 0x00: case 0x09: case 0x0A: case 0x0D:
                case 0x20: case 0x21: case 0x22: case 0x23:
                case 0x24: case 0x25: case 0x26: case 0x27:
                case 0x28: case 0x29: case 0x2A: case 0x2B:
                case 0x2C: case 0x2D: case 0x2E: case 0x2F:
                case 0x30: case 0x31: case 0x32: case 0x33:
                case 0x34: case 0x35: case 0x36: case 0x37:
                case 0x38: case 0x39: case 0x3A: case 0x3B:
                case 0x3C: case 0x3D: case 0x3E: case 0x3F:
                case 0x40: case 0x41: case 0x42: case 0x43:
                case 0x44: case 0x45: case 0x46: case 0x47:
                case 0x48: case 0x49: case 0x4A: case 0x4B:
                case 0x4C: case 0x4D: case 0x4E: case 0x4F:
                case 0x50: case 0x51: case 0x52: case 0x53:
                case 0x54: case 0x55: case 0x56: case 0x57:
                case 0x58: case 0x59: case 0x5A: case 0x5B:
                case 0x5C: case 0x5D: case 0x5E: case 0x5F:
                case 0x60: case 0x61: case 0x62: case 0x63:
                case 0x64: case 0x65: case 0x66: case 0x67:
                case 0x68: case 0x69: case 0x6A: case 0x6B:
                case 0x6C: case 0x6D: case 0x6E: case 0x6F:
                case 0x70: case 0x71: case 0x72: case 0x73:
                case 0x74: case 0x75: case 0x76: case 0x77:
                case 0x78: case 0x79: case 0x7A: case 0x7B:
                case 0x7C: case 0x7D: case 0x7E: case 0x7F:
                    *unicharBuffer++ = (UChar) aByte;
                    break;

                /* quote unicode */
                case SQUOTEU:
                    /* verify we have two bytes following the tag and if
                       not, save the partial sequence and break out */
                    if(sourceLimit - byteBuffer < 2) {
                        byteBuffer = scsu_saveIncompleteSequence(
                            comp, byteBuffer - 1, sourceLimit);
                        goto finish;
                    }
                    aByte = *byteBuffer++;
                    *unicharBuffer++ =
                        (UChar) (aByte << 8 | *byteBuffer++);
                    break;

                /* switch to Unicode mode */
                case SCHANGEU:
                    comp->fMode = UNICODEMODE;
                    /* use a goto here for speed, to avoid having to check
                       fMode in the while loop at the top of the case */
                    goto mainLoop;

                /* handle all quote tags */
                case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3:
                case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7:
                    /* verify there is a byte following the tag and if
                       not, save the partial sequence and break out */
                    if(byteBuffer >= sourceLimit) {
                        byteBuffer = scsu_saveIncompleteSequence(
                            comp, byteBuffer - 1, sourceLimit);
                        goto finish;
                    }
                    /* if the byte is in the range 0x00 - 0x7F, use static
                       window n - otherwise, use dynamic window n */
                    dByte = *byteBuffer++;
                    *unicharBuffer++ = (UChar)
                        (dByte + (dByte >= 0x00 && dByte < 0x80
                                  ? sOffsets[aByte - SQUOTE0]
                                  : (comp->fOffsets[aByte - SQUOTE0]
                                     - COMPRESSIONOFFSET)));
                    break;

                /* handle all change tags */
                case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3:
                case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7:
                    comp->fCurrentWindow = (aByte - SCHANGE0);
                    break;

                /* handle all define tags */
                case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3:
                case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7:
                    /* verify there is a byte following the tag and if
                       not, save the partial sequence and break out */
                    if(byteBuffer >= sourceLimit) {
                        byteBuffer = scsu_saveIncompleteSequence(
                            comp, byteBuffer - 1, sourceLimit);
                        goto finish;
                    }
                    comp->fCurrentWindow = (aByte - SDEFINE0);
                    comp->fOffsets[comp->fCurrentWindow] =
                        sOffsetTable[*byteBuffer++];
                    break;

                /* handle define extended tag */
                case SDEFINEX:
                    /* verify we have two bytes following the tag and if
                       not, save the partial sequence and break out */
                    if(sourceLimit - byteBuffer < 2) {
                        byteBuffer = scsu_saveIncompleteSequence(
                            comp, byteBuffer - 1, sourceLimit);
                        goto finish;
                    }
                    /* top 3 bits select the window, the remaining 13 bits
                       (spread over two bytes) give the offset in units of
                       0x80 above 0x10000 */
                    aByte = *byteBuffer++;
                    comp->fCurrentWindow = (aByte & 0xE0) >> 5;
                    comp->fOffsets[comp->fCurrentWindow] = 0x10000
                        + (0x80
                           * (((aByte & 0x1F) << 8) | *byteBuffer++));
                    break;

                /* reserved, shouldn't happen; the byte is skipped */
                case SRESERVED:
                    break;
                } /* end switch */
            } /* end while */
            break;

        /* unicode mode decompression loop */
        case UNICODEMODE:
            while(byteBuffer < sourceLimit && unicharBuffer < targetLimit) {
                /* get the next byte */
                aByte = *byteBuffer++;

                switch(aByte) {
                /* handle all define tags */
                case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3:
                case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7:
                    /* verify there is a byte following the tag and if
                       not, save the partial sequence and break out */
                    if(byteBuffer >= sourceLimit) {
                        byteBuffer = scsu_saveIncompleteSequence(
                            comp, byteBuffer - 1, sourceLimit);
                        goto finish;
                    }
                    comp->fCurrentWindow = (aByte - UDEFINE0);
                    comp->fOffsets[comp->fCurrentWindow] =
                        sOffsetTable[*byteBuffer++];
                    comp->fMode = SINGLEBYTEMODE;
                    /* use a goto here for speed, to avoid having to check
                       fMode in the while loop at the top of the case */
                    goto mainLoop;

                /* handle define extended tag */
                case UDEFINEX:
                    /* verify we have two bytes following the tag and if
                       not, save the partial sequence and break out */
                    if(sourceLimit - byteBuffer < 2) {
                        byteBuffer = scsu_saveIncompleteSequence(
                            comp, byteBuffer - 1, sourceLimit);
                        goto finish;
                    }
                    aByte = *byteBuffer++;
                    comp->fCurrentWindow = (aByte & 0xE0) >> 5;
                    comp->fOffsets[comp->fCurrentWindow] = 0x10000
                        + (0x80
                           * (((aByte & 0x1F) << 8) | *byteBuffer++));
                    comp->fMode = SINGLEBYTEMODE;
                    /* use a goto here for speed, to avoid having to check
                       fMode in the while loop at the top of the case */
                    goto mainLoop;

                /* handle all change tags */
                case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3:
                case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7:
                    comp->fCurrentWindow = (aByte - UCHANGE0);
                    comp->fMode = SINGLEBYTEMODE;
                    /* use a goto here for speed, to avoid having to check
                       fMode in the while loop at the top of the case */
                    goto mainLoop;

                /* quote unicode */
                case UQUOTEU:
                    /* verify we have two bytes following the tag and if
                       not, save the partial sequence and break out */
                    if(sourceLimit - byteBuffer < 2) {
                        byteBuffer = scsu_saveIncompleteSequence(
                            comp, byteBuffer - 1, sourceLimit);
                        goto finish;
                    }
                    aByte = *byteBuffer++;
                    *unicharBuffer++ = (UChar)
                        (aByte << 8 | *byteBuffer++);
                    break;

                /* any other byte is the high byte of a big-endian pair */
                default:
                    /* verify the low byte is available and if not, save
                       the high byte and break out */
                    if(byteBuffer >= sourceLimit) {
                        byteBuffer = scsu_saveIncompleteSequence(
                            comp, byteBuffer - 1, sourceLimit);
                        goto finish;
                    }
                    *unicharBuffer++ = (UChar) (aByte << 8 | *byteBuffer++);
                    break;
                } /* end switch */
            } /* end while */
            break;

        } /* end switch( comp->fMode ) */
    } /* end while */

finish:
    /* fill in return values */
    *target = unicharBuffer;
    *source = byteBuffer;

    /* unconsumed source means the target ran out of room */
    if(byteBuffer < sourceLimit) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
}
/**
 * Reset the compressor to its initial state: the eight dynamic windows get
 * their default offsets, all time stamps and index counts are cleared,
 * window 0 becomes current, single-byte mode is selected and the internal
 * byte buffer is emptied.
 * @param comp The compressor to reset.
 */
void
scsu_reset(UnicodeCompressor *comp)
{
    /* default start offsets of the dynamic windows */
    static const int32_t initialOffsets[] = {
        0x0080,   /* Latin-1 */
        0x00C0,   /* Latin-1 Supplement + Latin Extended-A */
        0x0400,   /* Cyrillic */
        0x0600,   /* Arabic */
        0x0900,   /* Devanagari */
        0x3040,   /* Hiragana */
        0x30A0,   /* Katakana */
        0xFF00    /* Fullwidth ASCII */
    };
    const int32_t offsetCount =
        (int32_t)(sizeof(initialOffsets) / sizeof(initialOffsets[0]));
    int32_t n;

    /* scalar state first */
    comp->fBufferLength  = 0;               /* nothing buffered */
    comp->fMode          = SINGLEBYTEMODE;  /* streams start in single-byte mode */
    comp->fCurrentWindow = 0;               /* the Latin-1 window is active */
    comp->fTimeStamp     = 0;               /* restart the window clock */

    /* dynamic windows back to their defaults */
    for(n = 0; n < offsetCount; ++n) {
        comp->fOffsets[n] = initialOffsets[n];
    }

    /* no window has been (re)defined yet */
    n = 0;
    while(n < USCSU_NUM_WINDOWS) {
        comp->fTimeStamps[n] = 0;
        ++n;
    }

    /* no character index has been seen yet */
    n = 0;
    while(n <= USCSU_MAX_INDEX) {
        comp->fIndexCount[n] = 0;
        ++n;
    }
}
/**
* Create the index value for a character.
* For more information on this function, refer to table X-3
* <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
* @param c The character in question.
* @return An index for c
*/
static int32_t
scsu_makeIndex(int32_t c)
1999-08-16 21:50:52 +00:00
{
/* check the predefined indices*/
if( c >= 0x00C0 && c < 0x0140)
return LATININDEX;
else if( c >= 0x0250 && c < 0x02D0 )
return IPAEXTENSIONINDEX;
else if( c >= 0x0370 && c < 0x03F0 )
return GREEKINDEX;
else if( c >= 0x0530 && c < 0x0590 )
return ARMENIANINDEX;
else if( c >= 0x3040 && c < 0x30A0 )
return HIRAGANAINDEX;
else if( c >= 0x30A0 && c < 0x3120)
return KATAKANAINDEX;
else if( c >= 0xFF60 && c < 0xFF9F )
return HALFWIDTHKATAKANAINDEX;
1999-08-16 21:50:52 +00:00
/* calculate index*/
else if( c >= 0x0080 && c < 0x3400 )
return (c / 0x80) & 0xFF;
else if( c >= 0xE000 && c <= 0xFFFF )
return ((c - 0xAC00) / 0x80) & 0xFF;
1999-08-16 21:50:52 +00:00
/* should never happen*/
else {
return RESERVEDINDEX;
}
1999-08-16 21:50:52 +00:00
}
/**
* Determine if a character is in a dynamic window.
* @param c The character to test
* @param whichWindow The dynamic window the test
* @return TRUE if <TT>c</TT> will fit in <TT>whichWindow</TT>, FALSE
* otherwise.
*/
static UBool
scsu_inDynamicWindow(const UnicodeCompressor *comp,
int32_t c,
int32_t whichWindow)
1999-08-16 21:50:52 +00:00
{
return (UBool)(c >= comp->fOffsets[whichWindow]
&& c < (comp->fOffsets[whichWindow] + 0x80));
1999-08-16 21:50:52 +00:00
}
/**
* Determine if a character is in a static window.
* @param c The character to test
* @param whichWindow The static window the test
* @return TRUE if <TT>c</TT> will fit in <TT>whichWindow</TT>, FALSE
* otherwise.
*/
static UBool
scsu_inStaticWindow(int32_t c,
int32_t whichWindow)
1999-08-16 21:50:52 +00:00
{
return (UBool)(c >= sOffsets[whichWindow] && c < (sOffsets[whichWindow] + 0x80));
1999-08-16 21:50:52 +00:00
}
/**
* Determine if a character is compressible.
* @param c The character to test.
* @return TRUE if the <TT>c</TT> is compressible, FALSE otherwise.
*/
static UBool
scsu_isCompressible(int32_t c)
1999-08-16 21:50:52 +00:00
{
return (UBool)(c < 0x3400 || c >= 0xE000);
1999-08-16 21:50:52 +00:00
}
/**
* Determine if a dynamic window for a certain character is defined
* @param c The character in question
* @return The dynamic window containing <TT>c</TT>, or INVALIDWINDOW if
* not defined.
*/
static int32_t
scsu_findDynamicWindow(const UnicodeCompressor *comp,
int32_t c)
1999-08-16 21:50:52 +00:00
{
int32_t i;
1999-08-16 21:50:52 +00:00
for(i = 0; i < USCSU_NUM_WINDOWS; i++) {
if(scsu_inDynamicWindow(comp, c, i)) {
return i;
1999-08-16 21:50:52 +00:00
}
}
1999-08-16 21:50:52 +00:00
return INVALIDWINDOW;
1999-08-16 21:50:52 +00:00
}
/**
* Determine if a static window for a certain character is defined
* @param c The character in question
* @return The static window containing <TT>c</TT>, or INVALIDWINDOW if
* not defined.
*/
static int32_t
scsu_findStaticWindow(int32_t c)
1999-08-16 21:50:52 +00:00
{
int32_t i;
for(i = 0; i < USCSU_NUM_STATIC_WINDOWS; i++) {
if(scsu_inStaticWindow(c, i)) {
return i;
1999-08-16 21:50:52 +00:00
}
}
return INVALIDWINDOW;
1999-08-16 21:50:52 +00:00
}
/** Find the least-recently defined window */
static int32_t
scsu_getLRDefinedWindow(const UnicodeCompressor *comp)
1999-08-16 21:50:52 +00:00
{
int32_t leastRU = INT32_MAX;
int32_t whichWindow = INVALIDWINDOW;
int32_t i;
/* find least recently used window*/
for(i = 0; i < USCSU_NUM_WINDOWS; i++ ) {
if(comp->fTimeStamps[i] < leastRU) {
leastRU = comp->fTimeStamps[i];
whichWindow = i;
1999-08-16 21:50:52 +00:00
}
}
return whichWindow;
1999-08-16 21:50:52 +00:00
}