1999-08-16 21:50:52 +00:00
/*
*******************************************************************************
*   Copyright (C) 1996-2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ucol.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
* Modification history
* Date        Name      Comments
* 1996-1999   various members of ICU team maintained C API for collation framework
* 02/16/2001  synwee    Added internal method getPrevSpecialCE
* 03/01/2001  synwee    Added maxexpansion functionality.
* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
*/
2001-03-08 17:40:42 +00:00
2002-09-20 01:54:48 +00:00
# include "unicode/utypes.h"
2003-02-06 23:29:56 +00:00
# include "uassert.h"
2002-09-20 01:54:48 +00:00
# if !UCONFIG_NO_COLLATION
2002-07-12 21:42:24 +00:00
# include "unicode/uloc.h"
# include "unicode/coll.h"
# include "unicode/tblcoll.h"
# include "unicode/coleitr.h"
# include "unicode/unorm.h"
# include "unicode/udata.h"
# include "unicode/uchar.h"
2002-09-17 06:27:51 +00:00
# include "unicode/caniter.h"
2002-07-12 21:42:24 +00:00
2001-03-08 17:40:42 +00:00
# include "ucol_bld.h"
# include "ucol_imp.h"
# include "ucol_tok.h"
# include "ucol_elm.h"
2001-05-18 19:49:04 +00:00
# include "bocsu.h"
1999-08-16 21:50:52 +00:00
2001-11-30 00:06:13 +00:00
# include "unormimp.h"
2003-01-23 01:52:34 +00:00
# include "unorm_it.h"
2002-02-28 07:20:52 +00:00
# include "uresimp.h"
2001-11-30 00:06:13 +00:00
# include "umutex.h"
# include "uhash.h"
# include "ucln_in.h"
2002-07-12 21:42:24 +00:00
# include "cstring.h"
2000-12-06 00:53:48 +00:00
2001-06-01 22:08:06 +00:00
# ifdef UCOL_DEBUG
2000-11-30 23:20:14 +00:00
# include <stdio.h>
2001-06-01 22:08:06 +00:00
# endif
2000-11-30 23:20:14 +00:00
2001-10-08 23:26:58 +00:00
U_NAMESPACE_USE
2001-04-12 00:08:26 +00:00
/* added by synwee for trie manipulation*/
# define STAGE_1_SHIFT_ 10
# define STAGE_2_SHIFT_ 4
# define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
# define STAGE_3_MASK_ 0xF
# define LAST_BYTE_MASK_ 0xFF
# define SECOND_LAST_BYTE_SHIFT_ 8
2001-01-04 00:45:41 +00:00
2001-04-20 22:29:53 +00:00
# define ZERO_CC_LIMIT_ 0xC0
2001-01-15 07:28:54 +00:00
static UCollator * UCA = NULL ;
2002-07-02 22:32:14 +00:00
static UCAConstants * UCAconsts = NULL ;
2001-08-30 02:59:19 +00:00
static UDataMemory * UCA_DATA_MEM = NULL ;
2001-01-04 00:45:41 +00:00
2001-07-02 22:56:52 +00:00
U_CDECL_BEGIN
2001-03-02 00:19:43 +00:00
/*
 * Acceptance callback for the UCA data file.
 *
 * Returns TRUE only when all of the following hold for pInfo:
 *   - the info block is at least 20 bytes,
 *   - endianness and charset family match this platform,
 *   - dataFormat matches ucaDataInfo.dataFormat ("UCol"),
 *   - formatVersion[0] is equal to, and formatVersion[1] is at least,
 *     the corresponding ucaDataInfo value,
 *   - dataVersion[0] is equal to, and dataVersion[1] is at least,
 *     the corresponding field of the runtime Unicode version.
 *
 * The remaining version fields are deliberately not compared; exact
 * matching on them was judged "too harsh".
 */
static UBool U_CALLCONV
isAcceptableUCA(void * /*context*/,
                const char * /*type*/, const char * /*name*/,
                const UDataInfo *pInfo) {
    /* context, type & name are intentionally not used */

    /* Platform properties and minimum header size. */
    if (pInfo->size < 20 ||
        pInfo->isBigEndian != U_IS_BIG_ENDIAN ||
        pInfo->charsetFamily != U_CHARSET_FAMILY) {
        return FALSE;
    }

    /* dataFormat="UCol" */
    for (int32_t i = 0; i < 4; ++i) {
        if (pInfo->dataFormat[i] != ucaDataInfo.dataFormat[i]) {
            return FALSE;
        }
    }

    /* Major format version must match, minor must not be older. */
    if (pInfo->formatVersion[0] != ucaDataInfo.formatVersion[0] ||
        pInfo->formatVersion[1] <  ucaDataInfo.formatVersion[1]) {
        return FALSE;
    }

    /* Data must have been built for a compatible Unicode version. */
    UVersionInfo UCDVersion;
    u_getUnicodeVersion(UCDVersion);
    if (pInfo->dataVersion[0] != UCDVersion[0]) {
        return FALSE;
    }
    return (pInfo->dataVersion[1] >= UCDVersion[1]) ? TRUE : FALSE;
}
2002-07-02 22:32:14 +00:00
/*
 * Extracts the folding offset from a data word: the low 24 bits of
 * `data`, returned as a signed 32-bit value.
 */
static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data) {
    const uint32_t lowBits = data & 0x00FFFFFF;
    return (int32_t)lowBits;
}
2001-07-02 22:56:52 +00:00
U_CDECL_END
2001-04-06 23:37:48 +00:00
2001-10-22 05:30:22 +00:00
static
2001-04-23 01:53:49 +00:00
inline void IInit_collIterate ( const UCollator * collator , const UChar * sourceString ,
int32_t sourceLen , collIterate * s ) {
( s ) - > string = ( s ) - > pos = ( UChar * ) ( sourceString ) ;
2001-05-10 17:49:24 +00:00
( s ) - > origFlags = 0 ;
2001-04-06 23:37:48 +00:00
( s ) - > flags = 0 ;
if ( sourceLen > = 0 ) {
s - > flags | = UCOL_ITER_HASLEN ;
2001-04-26 01:15:34 +00:00
( s ) - > endp = ( UChar * ) sourceString + sourceLen ;
}
else {
/* change to enable easier checking for end of string for fcdpositon */
( s ) - > endp = NULL ;
2001-04-06 23:37:48 +00:00
}
2001-04-23 01:53:49 +00:00
( s ) - > CEpos = ( s ) - > toReturn = ( s ) - > CEs ;
( s ) - > writableBuffer = ( s ) - > stackWritableBuffer ;
( s ) - > writableBufSize = UCOL_WRITABLE_BUFFER_SIZE ;
( s ) - > coll = ( collator ) ;
( s ) - > fcdPosition = 0 ;
2001-04-12 19:59:28 +00:00
if ( collator - > normalizationMode = = UCOL_ON ) {
2001-10-05 02:07:51 +00:00
( s ) - > flags | = UCOL_ITER_NORM ;
}
2002-09-04 06:02:13 +00:00
if ( collator - > hiraganaQ = = UCOL_ON & & collator - > strength > = UCOL_QUATERNARY ) {
2001-10-05 02:07:51 +00:00
( s ) - > flags | = UCOL_HIRAGANA_Q ;
}
2003-01-20 07:43:32 +00:00
( s ) - > iterator = NULL ;
//(s)->iteratorIndex = 0;
2001-04-06 23:37:48 +00:00
}
2001-11-21 01:08:55 +00:00
/*
 * Exported, out-of-line entry point for initialising a collIterate.
 * Other source files call this; it simply forwards to the inline
 * IInit_collIterate() with the same arguments.
 */
U_CAPI void U_EXPORT2
uprv_init_collIterate(const UCollator *collator,
                      const UChar *sourceString,
                      int32_t sourceLen,
                      collIterate *s) {
    IInit_collIterate(collator, sourceString, sourceLen, s);
}
2003-01-20 07:43:32 +00:00
2001-04-23 01:53:49 +00:00
/**
2001-04-18 01:34:26 +00:00
* Backup the state of the collIterate struct data
* @ param data collIterate to backup
2001-04-23 01:53:49 +00:00
* @ param backup storage
2001-04-18 01:34:26 +00:00
*/
2001-10-22 05:30:22 +00:00
static
2001-04-23 01:53:49 +00:00
inline void backupState ( const collIterate * data , collIterateState * backup )
2001-04-18 01:34:26 +00:00
{
2001-04-23 01:53:49 +00:00
backup - > fcdPosition = data - > fcdPosition ;
backup - > flags = data - > flags ;
backup - > origFlags = data - > origFlags ;
backup - > pos = data - > pos ;
2001-05-02 01:36:29 +00:00
backup - > bufferaddress = data - > writableBuffer ;
2001-04-20 22:29:53 +00:00
backup - > buffersize = data - > writableBufSize ;
2003-01-20 07:43:32 +00:00
if ( data - > iterator ! = NULL ) {
2003-01-23 01:52:34 +00:00
//backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
backup - > iteratorIndex = data - > iterator - > getState ( data - > iterator ) ;
2003-02-06 23:29:56 +00:00
// no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
backup - > iteratorMove = 0 ;
if ( backup - > iteratorIndex = = UITER_NO_STATE ) {
while ( ( backup - > iteratorIndex = data - > iterator - > getState ( data - > iterator ) ) = = UITER_NO_STATE ) {
backup - > iteratorMove + + ;
data - > iterator - > move ( data - > iterator , - 1 , UITER_CURRENT ) ;
}
data - > iterator - > move ( data - > iterator , backup - > iteratorMove , UITER_CURRENT ) ;
}
2003-01-20 07:43:32 +00:00
}
2001-04-18 01:34:26 +00:00
}
2001-04-23 01:53:49 +00:00
/**
2001-04-18 01:34:26 +00:00
* Loads the state into the collIterate struct data
* @ param data collIterate to backup
2001-04-23 01:53:49 +00:00
* @ param backup storage
2001-05-11 01:13:08 +00:00
* @ param forwards boolean to indicate if forwards iteration is used ,
2001-05-02 01:36:29 +00:00
* false indicates backwards iteration
2001-04-18 01:34:26 +00:00
*/
2001-10-22 05:30:22 +00:00
static
2001-05-11 01:13:08 +00:00
inline void loadState ( collIterate * data , const collIterateState * backup ,
2001-05-02 01:36:29 +00:00
UBool forwards )
2001-04-18 01:34:26 +00:00
{
2003-01-23 01:52:34 +00:00
UErrorCode status = U_ZERO_ERROR ;
2001-04-18 01:34:26 +00:00
data - > flags = backup - > flags ;
data - > origFlags = backup - > origFlags ;
2003-01-20 07:43:32 +00:00
if ( data - > iterator ! = NULL ) {
2003-01-23 01:52:34 +00:00
//data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
data - > iterator - > setState ( data - > iterator , backup - > iteratorIndex , & status ) ;
2003-02-06 23:29:56 +00:00
if ( backup - > iteratorMove ! = 0 ) {
data - > iterator - > move ( data - > iterator , backup - > iteratorMove , UITER_CURRENT ) ;
}
2003-01-20 07:43:32 +00:00
}
2001-04-18 01:34:26 +00:00
data - > pos = backup - > pos ;
2001-05-11 01:13:08 +00:00
if ( ( data - > flags & UCOL_ITER_INNORMBUF ) & &
2001-05-02 01:36:29 +00:00
data - > writableBuffer ! = backup - > bufferaddress ) {
2001-05-11 01:13:08 +00:00
/*
this is when a new buffer has been reallocated and we ' ll have to
2001-04-20 22:29:53 +00:00
calculate the new position .
note the new buffer has to contain the contents of the old buffer .
*/
2001-05-02 01:36:29 +00:00
if ( forwards ) {
2001-05-11 01:13:08 +00:00
data - > pos = data - > writableBuffer +
2001-05-02 01:36:29 +00:00
( data - > pos - backup - > bufferaddress ) ;
}
else {
/* backwards direction */
2001-05-11 01:13:08 +00:00
uint32_t temp = backup - > buffersize -
2001-05-02 01:36:29 +00:00
( data - > pos - backup - > bufferaddress ) ;
data - > pos = data - > writableBuffer + ( data - > writableBufSize - temp ) ;
}
2001-04-25 23:28:55 +00:00
}
if ( ( data - > flags & UCOL_ITER_INNORMBUF ) = = 0 ) {
2001-05-11 01:13:08 +00:00
/*
2001-04-25 23:28:55 +00:00
this is alittle tricky .
2001-05-11 01:13:08 +00:00
if we are initially not in the normalization buffer , even if we
2001-04-25 23:28:55 +00:00
normalize in the later stage , the data in the buffer will be
ignored , since we skip back up to the data string .
however if we are already in the normalization buffer , any
2001-05-11 01:13:08 +00:00
further normalization will pull data into the normalization
2001-04-25 23:28:55 +00:00
buffer and modify the fcdPosition .
2001-05-11 01:13:08 +00:00
since we are keeping the data in the buffer for use , the
2001-04-25 23:28:55 +00:00
fcdPosition can not be reverted back .
arrgghh . . . .
*/
data - > fcdPosition = backup - > fcdPosition ;
}
2001-04-18 01:34:26 +00:00
}
2001-06-06 23:26:50 +00:00
/*
* collIter_eos ( )
* Checks for a collIterate being positioned at the end of
* its source string .
*
*/
2001-10-22 05:30:22 +00:00
static
2001-06-06 23:26:50 +00:00
inline UBool collIter_eos ( collIterate * s ) {
2003-01-20 07:43:32 +00:00
if ( s - > flags & UCOL_USE_ITERATOR ) {
return ! ( s - > iterator - > hasNext ( s - > iterator ) ) ;
}
2001-06-06 23:26:50 +00:00
if ( ( s - > flags & UCOL_ITER_HASLEN ) = = 0 & & * s - > pos ! = 0 ) {
// Null terminated string, but not at null, so not at end.
// Whether in main or normalization buffer doesn't matter.
return FALSE ;
}
// String with length. Can't be in normalization buffer, which is always
// null termintated.
if ( s - > flags & UCOL_ITER_HASLEN ) {
return ( s - > pos = = s - > endp ) ;
}
// We are at a null termination, could be either normalization buffer or main string.
if ( ( s - > flags & UCOL_ITER_INNORMBUF ) = = 0 ) {
// At null at end of main string.
return TRUE ;
}
// At null at end of normalization buffer. Need to check whether there there are
// any characters left in the main buffer.
2003-01-20 07:43:32 +00:00
if ( s - > origFlags & UCOL_USE_ITERATOR ) {
return ! ( s - > iterator - > hasNext ( s - > iterator ) ) ;
} else if ( ( s - > origFlags & UCOL_ITER_HASLEN ) = = 0 ) {
2001-06-06 23:26:50 +00:00
// Null terminated main string. fcdPosition is the 'return' position into main buf.
return ( * s - > fcdPosition = = 0 ) ;
}
else {
// Main string with an end pointer.
return s - > fcdPosition = = s - > endp ;
}
}
2002-07-02 22:32:14 +00:00
/*
*   collIter_bos()
*       Reports whether a collIterate is positioned at the start of its
*       source text. Returns TRUE at the start, FALSE otherwise.
*/
static
inline UBool collIter_bos(collIterate *source) {
    /* When going backwards we need to know whether there is more in the
       iterator, even if we are currently in the side buffer; hence both
       the current and the original flags are consulted. */
    if ((source->flags | source->origFlags) & UCOL_USE_ITERATOR) {
        return !source->iterator->hasPrevious(source->iterator);
    }

    /* at (or before) the first character of the string */
    if (source->pos <= source->string) {
        return TRUE;
    }

    /* in the normalization buffer, just after a null, with no position
       recorded in the main string */
    if ((source->flags & UCOL_ITER_INNORMBUF) &&
        *(source->pos - 1) == 0 &&
        source->fcdPosition == NULL) {
        return TRUE;
    }

    return FALSE;
}
2003-02-08 02:16:54 +00:00
/*
*   collIter_SimpleBos()
*       Simplified start-of-text check: for iterator-backed text it asks
*       the iterator whether anything precedes the current position,
*       otherwise it only compares the position with the string start.
*/
static
inline UBool collIter_SimpleBos(collIterate *source) {
    /* When going backwards we need to know whether there is more in the
       iterator, even if we are currently in the side buffer. */
    if ((source->flags | source->origFlags) & UCOL_USE_ITERATOR) {
        return !source->iterator->hasPrevious(source->iterator);
    }
    return (source->pos == source->string) ? TRUE : FALSE;
}
//return (data->pos == data->string) ||
2001-06-06 23:26:50 +00:00
2001-05-10 17:49:24 +00:00
/**
* Checks and free writable buffer if it is not the original stack buffer
* in collIterate . This function does not reassign the writable buffer .
* @ param data collIterate struct to determine and free the writable buffer
*/
2001-10-22 05:30:22 +00:00
static
2001-05-10 17:49:24 +00:00
inline void freeHeapWritableBuffer ( collIterate * data )
{
if ( data - > writableBuffer ! = data - > stackWritableBuffer ) {
uprv_free ( data - > writableBuffer ) ;
}
}
2001-06-06 23:26:50 +00:00
2001-01-16 00:28:40 +00:00
/****************************************************************************/
/* Following are the open/close functions */
/* */
/****************************************************************************/
2003-04-24 07:00:27 +00:00
/*
 * Builds a collator from the rule string stored under the "Sequence" key
 * of a collation resource, using default normalization and strength.
 * Returns whatever ucol_openRules() returns for those rules.
 */
static UCollator*
tryOpeningFromRules(UResourceBundle *collElem, UErrorCode *status) {
    int32_t ruleCount = 0;
    const UChar *ruleText = ures_getStringByKey(collElem, "Sequence", &ruleCount, status);

    return ucol_openRules(ruleText, ruleCount,
                          UCOL_DEFAULT, UCOL_DEFAULT,
                          NULL, status);
}
2001-11-21 01:08:55 +00:00
/*
 * Opens a collator for a locale.
 *
 * loc    - locale name; when NULL, the locale of the resource bundle that
 *          ends up attached to the collator is recorded as the requested
 *          locale
 * status - in/out error code
 *
 * Returns the new collator, or NULL on failure. On every failure path all
 * resources acquired here (resource bundles and the partially built
 * collator) are released.
 *
 * Behaviour:
 *  - If the locale has no collation data (U_MISSING_RESOURCE_ERROR), the
 *    collator is built on the UCA image, status becomes
 *    U_USING_DEFAULT_WARNING and the root bundle is attached.
 *  - If the binary image was built for a different UCA/UCD version than
 *    the loaded UCA, the collator is rebuilt from the locale's rules.
 *  - If the binary is larger than header + options it is used as the
 *    image; otherwise the UCA image is used with the locale's options.
 *
 * NOTE(review): ucol_close() does not free the image while a resource
 * bundle is attached, so a table built by the rules path looks like it is
 * never released once `rb` is set here - confirm against ucol_close().
 */
U_CAPI UCollator* U_EXPORT2
ucol_open(const char *loc,
          UErrorCode *status)
{
    /* Everything is declared up front so that `goto fail` never jumps
       over an initialisation. */
    UCollator *result = NULL;
    UResourceBundle *b = NULL;
    UResourceBundle *collElem = NULL;
    UResourceBundle *binary = NULL;
    UResourceBundle *rootBundle = NULL;
    UResourceBundle *rootBinary = NULL;
    const uint8_t *inData = NULL;
    const UCATableHeader *colData = NULL;
    int32_t len = 0;
    UBool bundlesAttached = FALSE;  /* result->rb / result->binary are owned by result */
    UBool builtFromRules = FALSE;   /* result came from ucol_openRules() */

    ucol_initUCA(status);
    if (U_FAILURE(*status)) {
        return NULL;
    }

    b = ures_open(NULL, loc, status);
    collElem = ures_getByKey(b, "CollationElements", NULL, status);
    binary = ures_getByKey(collElem, "%%CollationBin", NULL, status);

    if (*status == U_MISSING_RESOURCE_ERROR) {
        /* if we don't find tailoring, we'll fallback to UCA */
        *status = U_USING_DEFAULT_WARNING;
        result = ucol_initCollator(UCA->image, NULL, status);
        if (U_FAILURE(*status) || result == NULL) {
            goto fail;
        }
        /* if we use UCA, real locale is root */
        rootBundle = ures_open(NULL, "", status);
        rootBinary = ures_open(NULL, "", status);
        if (U_FAILURE(*status)) {
            goto fail;
        }
        result->rb = rootBundle;
        result->binary = rootBinary;
        rootBundle = NULL;
        rootBinary = NULL;
        result->hasRealData = FALSE;
        bundlesAttached = TRUE;

        /* the locale's own bundles are not needed any more */
        ures_close(binary);
        binary = NULL;
        ures_close(b);
        b = NULL;
    } else if (U_SUCCESS(*status)) {
        /* otherwise, we'll pick a collation data that exists */
        inData = ures_getBinary(binary, &len, status);
        if (U_FAILURE(*status) || inData == NULL) {
            /* must be checked before the header is read below */
            goto fail;
        }
        colData = (const UCATableHeader *)inData;

        if (uprv_memcmp(colData->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
            uprv_memcmp(colData->UCDVersion, UCA->image->UCDVersion, sizeof(UVersionInfo)) != 0) {
            /* binary built for another UCA/UCD version: rebuild from rules */
            result = tryOpeningFromRules(collElem, status);
            builtFromRules = TRUE;
        } else if ((uint32_t)len > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
            /* a complete tailoring image */
            result = ucol_initCollator(colData, NULL, status);
            if (U_SUCCESS(*status) && result != NULL) {
                result->hasRealData = TRUE;
            }
        } else {
            /* header + options only: UCA image with this locale's options */
            result = ucol_initCollator(UCA->image, NULL, status);
            if (U_SUCCESS(*status) && result != NULL) {
                ucol_setOptionsFromHeader(result, (UColOptionSet *)(inData + colData->options), status);
                result->hasRealData = FALSE;
            }
        }
        if (U_FAILURE(*status) || result == NULL) {
            goto fail;
        }

        /* hand the bundles over to the collator */
        result->binary = binary;
        result->rb = b;
        binary = NULL;
        b = NULL;
        bundlesAttached = TRUE;
    } else {
        /* There is another error, and we're just gonna clean up */
        goto fail;
    }

    /* make sure ucol_close() sees a defined value if we bail out below */
    result->requestedLocale = NULL;

    if (loc == NULL) {
        loc = ures_getLocale(result->rb, status);
        if (loc == NULL) {
            goto fail;
        }
    }
    result->requestedLocale = (char *)uprv_malloc((uprv_strlen(loc) + 1) * sizeof(char));
    if (result->requestedLocale == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        goto fail;
    }
    uprv_strcpy(result->requestedLocale, loc);

    ures_close(collElem);
    return result;

fail:
    if (result != NULL) {
        if (!bundlesAttached) {
            /* the collator does not own any bundle yet */
            result->rb = NULL;
            result->binary = NULL;
            if (!builtFromRules) {
                /* the image is the UCA image or lives inside `binary`;
                   neither may be freed by ucol_close() */
                result->requestedLocale = NULL;
                result->hasRealData = FALSE;
            }
        }
        ucol_close(result);
    }
    /* The pre-existing cleanup already passed possibly-NULL bundles to
       ures_close(), so the same is relied on here. */
    ures_close(rootBundle);
    ures_close(rootBinary);
    ures_close(binary);
    ures_close(collElem);
    ures_close(b);
    return NULL;
}
2002-12-17 22:44:46 +00:00
# ifdef U_USE_DEPRECATED_UCOL_API
2001-03-14 00:22:56 +00:00
/*
 * Opens a collator for `loc` and accepts it only if its version is
 * exactly `version`.
 *
 * On a version mismatch the collator is closed, status is set to
 * U_MISSING_RESOURCE_ERROR and NULL is returned. If ucol_open() itself
 * fails, its return value is passed through unchanged.
 */
U_CAPI UCollator* U_EXPORT2
ucol_openVersion(const char *loc,
                 UVersionInfo version,
                 UErrorCode *status) {
    UCollator *collator = ucol_open(loc, status);
    if (U_FAILURE(*status)) {
        return collator;
    }

    UVersionInfo info;
    ucol_getVersion(collator, info);
    if (uprv_memcmp(version, info, sizeof(UVersionInfo)) == 0) {
        return collator;
    }

    /* wrong version: give the collator back and report it as missing */
    ucol_close(collator);
    *status = U_MISSING_RESOURCE_ERROR;
    return NULL;
}
2002-12-17 22:44:46 +00:00
# endif
2001-02-21 21:43:17 +00:00
2001-07-06 04:57:28 +00:00
2001-11-21 01:08:55 +00:00
/*
 * Closes a collator and releases what it owns. A NULL collator is
 * ignored.
 *
 * Here, it would be advisable to close:
 *  - UData for UCA (unless we stuff it in the root resb)
 * Again, do we need additional housekeeping... HMMM!
 */
U_CAPI void U_EXPORT2
ucol_close(UCollator *coll)
{
    if (coll == NULL) {
        return;
    }
    if (coll->freeOnClose == FALSE) {
        /* for safeClone, if freeOnClose is FALSE,
           don't free the other instance data */
        return;
    }

    /* the option set is released only when this collator owns it */
    if (coll->freeOptionsOnClose != FALSE && coll->options != NULL) {
        uprv_free(coll->options);
    }

    if (coll->mapping != NULL) {
        /*ucmpe32_close(coll->mapping);*/
        uprv_free(coll->mapping);
    }

    if (coll->rules != NULL && coll->freeRulesOnClose) {
        uprv_free((UChar *)coll->rules);
    }

    if (coll->rb != NULL) {
        /* pointing to read-only memory */
        ures_close(coll->rb);
    } else if (coll->hasRealData == TRUE) {
        /* no bundle attached: the image is freed here */
        uprv_free((UCATableHeader *)coll->image);
    }

    if (coll->binary != NULL) {
        ures_close(coll->binary);
    }
    if (coll->requestedLocale != NULL) {
        uprv_free(coll->requestedLocale);
    }
    if (coll->latinOneCEs != NULL) {
        uprv_free(coll->latinOneCEs);
    }

    uprv_free(coll);
}
2001-05-11 01:13:08 +00:00
2001-11-21 01:08:55 +00:00
/*
 * Builds a collator from a rule string.
 *
 * rules             - rule text; may be NULL only when rulesLength is 0
 * rulesLength       - number of UChars in rules, or -1 if null terminated
 * normalizationMode - UCOL_OFF, UCOL_ON or UCOL_DEFAULT
 * strength          - strength attribute applied to the new collator
 * parseError        - receives parse error details; may be NULL
 * status            - in/out error code; must not be NULL
 *
 * Returns the new collator, or NULL when the arguments are invalid, the
 * rules cannot be parsed, or the collator cannot be built. If the rules
 * produce no tailoring (options only), the collator uses the UCA image
 * with its own copy of the parsed options. The rule text is copied into
 * the collator.
 */
U_CAPI UCollator* U_EXPORT2
ucol_openRules(const UChar        *rules,
               int32_t            rulesLength,
               UColAttributeValue normalizationMode,
               UCollationStrength strength,
               UParseError        *parseError,
               UErrorCode         *status)
{
    /* Everything is declared up front so that `goto cleanup` never jumps
       over an initialisation. */
    UColTokenParser src;
    UColAttributeValue norm;
    UParseError tErr;
    UCollator *result = NULL;
    UCATableHeader *table = NULL;
    UColOptionSet *opts = NULL;
    UChar *newRules = NULL;

    if (status == NULL || U_FAILURE(*status)) {
        return NULL;
    }
    if (rulesLength < -1 || (rules == NULL && rulesLength != 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    if (rulesLength == -1) {
        rulesLength = u_strlen(rules);
    }
    if (parseError == NULL) {
        parseError = &tErr;
    }

    switch (normalizationMode) {  // TODO friendly deprecation helper, remove the (int) cast >2002-sep-30
    case UCOL_OFF:
#ifdef ICU_NORMALIZER_USE_DEPRECATES
    case UNORM_NONE:  // TODO friendly deprecation helper, remove >2002-sep-30
#endif
        norm = UCOL_OFF;
        break;
    case UCOL_ON:
#ifdef ICU_NORMALIZER_USE_DEPRECATES
    case UNORM_NFD:  // TODO friendly deprecation helper, remove >2002-sep-30
#endif
        norm = UCOL_ON;
        break;
    case UCOL_DEFAULT:
#ifdef ICU_NORMALIZER_USE_DEPRECATES
    case UCOL_DEFAULT_NORMALIZATION:  // TODO friendly deprecation helper, remove >2002-sep-30
#endif
        norm = UCOL_DEFAULT;
        break;
    default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    ucol_initUCA(status);
    if (U_FAILURE(*status)) {
        return NULL;
    }

    ucol_tok_initTokenList(&src, rules, rulesLength, UCA, status);
    /* the returned list length is not needed here */
    (void)ucol_tok_assembleTokenList(&src, parseError, status);

    if (U_FAILURE(*status)) {
        /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */
        /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */
        /* so something might be done here... or on lower level */
#ifdef UCOL_DEBUG
        /* a pointer difference is not an int; convert it to match %i */
        if (*status == U_ILLEGAL_ARGUMENT_ERROR) {
            fprintf(stderr, "bad option starting at offset %i\n", (int)(src.current - src.source));
        } else {
            fprintf(stderr, "invalid rule just before offset %i\n", (int)(src.current - src.source));
        }
#endif
        ucol_tok_closeTokenList(&src);
        return NULL;
    }

    if (src.resultLen > 0 || src.removeSet != NULL) {
        /* we have a set of rules, let's make something of it */
        /* also, if we wanted to remove some contractions, we should make a tailoring */
        table = ucol_assembleTailoringTable(&src, status);
        if (U_SUCCESS(*status) && table != NULL) {
            // builder version
            table->version[0] = UCOL_BUILDER_VERSION;
            // no tailoring information on this level
            table->version[1] = table->version[2] = table->version[3] = 0;
            // set UCD version
            u_getUnicodeVersion(table->UCDVersion);
            // set UCA version
            uprv_memcpy(table->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo));
            result = ucol_initCollator(table, 0, status);
            if (result != NULL) {
                /* from here on the collator owns the table */
                result->hasRealData = TRUE;
            }
        }
    } else {
        /* no rules, but no error either: must be only options */
        // We will init the collator from UCA
        result = ucol_initCollator(UCA->image, 0, status);
        if (U_SUCCESS(*status) && result != NULL) {
            result->hasRealData = FALSE;
            // And set only the options
            opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
            if (opts == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
            } else {
                uprv_memcpy(opts, src.opts, sizeof(UColOptionSet));
                /* status is a success here, so this stores opts in the
                   collator, which owns it from now on */
                ucol_setOptionsFromHeader(result, opts, status);
                result->freeOptionsOnClose = TRUE;
            }
        }
    }

    if (U_FAILURE(*status) || result == NULL) {
        goto cleanup;
    }

    /* a rule-built collator has no bundle or locale attached; set this
       before anything below can fail so ucol_close() sees defined values */
    result->rb = NULL;
    result->binary = NULL;
    result->requestedLocale = NULL;

    result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION;
    if (rulesLength > 0) {
        /* keep a null-terminated private copy of the rule text */
        newRules = (UChar *)uprv_malloc((rulesLength + 1) * U_SIZEOF_UCHAR);
        if (newRules == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto cleanup;
        }
        uprv_memcpy(newRules, rules, rulesLength * U_SIZEOF_UCHAR);
        newRules[rulesLength] = 0;
        result->rules = newRules;
        result->rulesLength = rulesLength;
        result->freeRulesOnClose = TRUE;
    }
    ucol_setAttribute(result, UCOL_STRENGTH, strength, status);
    ucol_setAttribute(result, UCOL_NORMALIZATION_MODE, norm, status);

    ucol_tok_closeTokenList(&src);
    return result;

cleanup:
    if (result != NULL) {
        result->rb = NULL;
        result->binary = NULL;
        result->requestedLocale = NULL;
        /* with no bundle attached, ucol_close() frees the image when
           hasRealData is TRUE, i.e. the tailoring table built above */
        ucol_close(result);
    } else if (table != NULL) {
        uprv_free(table);
    }
    ucol_tok_closeTokenList(&src);
    return NULL;
}
2001-09-22 01:24:15 +00:00
2001-01-16 00:28:40 +00:00
/* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
/* you should be able to get the binary chunk to write out... Doesn't look very full now */
2001-11-21 01:08:55 +00:00
U_CAPI uint8_t * U_EXPORT2
2001-07-09 22:24:23 +00:00
ucol_cloneRuleData ( const UCollator * coll , int32_t * length , UErrorCode * status )
2001-01-16 00:28:40 +00:00
{
2001-02-26 10:28:56 +00:00
uint8_t * result = NULL ;
2001-05-28 20:57:59 +00:00
if ( U_FAILURE ( * status ) ) {
return NULL ;
}
2001-02-26 10:28:56 +00:00
if ( coll - > hasRealData = = TRUE ) {
* length = coll - > image - > size ;
result = ( uint8_t * ) uprv_malloc ( * length ) ;
2002-07-02 15:10:30 +00:00
/* test for NULL */
2002-06-29 09:32:36 +00:00
if ( result = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
return NULL ;
}
2001-02-26 10:28:56 +00:00
uprv_memcpy ( result , coll - > image , * length ) ;
} else {
2002-02-20 18:13:29 +00:00
* length = ( int32_t ) ( paddedsize ( sizeof ( UCATableHeader ) ) + paddedsize ( sizeof ( UColOptionSet ) ) ) ;
2001-03-30 00:23:46 +00:00
result = ( uint8_t * ) uprv_malloc ( * length ) ;
2002-07-02 15:10:30 +00:00
/* test for NULL */
2002-06-29 09:32:36 +00:00
if ( result = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
return NULL ;
}
2001-03-30 00:23:46 +00:00
uprv_memcpy ( result , UCA - > image , sizeof ( UCATableHeader ) ) ;
uprv_memcpy ( result + paddedsize ( sizeof ( UCATableHeader ) ) , coll - > options , sizeof ( UColOptionSet ) ) ;
2001-02-26 10:28:56 +00:00
}
return result ;
}
2001-03-30 00:23:46 +00:00
/*
 * Copies the attribute values of an option set into a collator, marks
 * the attributes as being at their defaults, refreshes the collator's
 * internal state and finally stores the option set pointer in the
 * collator.
 *
 * Does nothing when status already indicates a failure.
 *
 * NOTE(review): unlike the other attributes, no "isDefault" flag is set
 * for alternateHandling here - confirm whether that is intentional.
 */
void ucol_setOptionsFromHeader(UCollator *result, UColOptionSet *opts, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return;
    }

    /* attribute values, taken from the option set */
    result->strength          = (UColAttributeValue)opts->strength;
    result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    result->frenchCollation   = (UColAttributeValue)opts->frenchCollation;
    result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    result->caseFirst         = (UColAttributeValue)opts->caseFirst;
    result->caseLevel         = (UColAttributeValue)opts->caseLevel;
    result->hiraganaQ         = (UColAttributeValue)opts->hiraganaQ;
    result->variableTopValue  = opts->variableTopValue;

    /* every value above is the default for this collator */
    result->strengthisDefault          = TRUE;
    result->normalizationModeisDefault = TRUE;
    result->frenchCollationisDefault   = TRUE;
    result->caseFirstisDefault         = TRUE;
    result->caseLevelisDefault         = TRUE;
    result->hiraganaQisDefault         = TRUE;
    result->variableTopValueisDefault  = TRUE;

    ucol_updateInternalState(result, status);

    /* the option set pointer is stored only after the state update */
    result->options = opts;
}
2001-11-13 22:55:05 +00:00
#if 0
// doesn't look like anybody is using this
2001-03-30 00:23:46 +00:00
/*
 * Inverse of ucol_setOptionsFromHeader(): writes the collator's current
 * attribute values into an option set.
 *
 * Does nothing when status already indicates a failure.
 */
void ucol_putOptionsToHeader(UCollator *result, UColOptionSet *opts, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return;
    }
    opts->caseFirst = result->caseFirst;
    opts->caseLevel = result->caseLevel;
    opts->frenchCollation = result->frenchCollation;
    opts->normalizationMode = result->normalizationMode;
    opts->strength = result->strength;
    opts->variableTopValue = result->variableTopValue;
    opts->alternateHandling = result->alternateHandling;
    /* was `opts->hiraganaQ = opts->hiraganaQ`, a self-assignment that
       never stored the collator's value */
    opts->hiraganaQ = result->hiraganaQ;
}
2001-11-13 22:55:05 +00:00
# endif
2001-01-16 00:28:40 +00:00
2001-08-17 00:21:18 +00:00
/* FCD trie index shared by this file: fetched once through unorm_getFCDTrie() */
/* in ucol_initCollator() and passed to unorm_getFCD16() by collIterFCD().     */
static const uint16_t * fcdTrieIndex = NULL ;
2001-04-10 22:06:25 +00:00
2001-06-06 23:26:50 +00:00
/**
2001-05-17 01:06:25 +00:00
* Approximate determination if a character is at a contraction end .
2001-06-06 23:26:50 +00:00
* Guaranteed to be TRUE if a character is at the end of a contraction ,
2001-05-17 01:06:25 +00:00
* otherwise it is not deterministic .
* @ param c character to be determined
* @ param coll collator
*/
2001-10-22 05:30:22 +00:00
static
2001-05-10 17:49:24 +00:00
inline UBool ucol_contractionEndCP ( UChar c , const UCollator * coll ) {
2001-09-18 18:37:57 +00:00
if ( UTF_IS_TRAIL ( c ) ) {
return TRUE ;
}
2001-05-10 22:12:53 +00:00
if ( c < coll - > minContrEndCP ) {
return FALSE ;
}
int32_t hash = c ;
uint8_t htbyte ;
if ( hash > = UCOL_UNSAFECP_TABLE_SIZE * 8 ) {
hash = ( hash & UCOL_UNSAFECP_TABLE_MASK ) + 256 ;
}
htbyte = coll - > contrEndCP [ hash > > 3 ] ;
2001-05-11 01:13:08 +00:00
return ( ( ( htbyte > > ( hash & 7 ) ) & 1 ) = = 1 ) ;
2001-05-10 17:49:24 +00:00
}
2001-04-10 22:06:25 +00:00
2001-03-03 03:35:17 +00:00
2001-05-11 01:13:08 +00:00
2001-06-06 23:26:50 +00:00
/*
* i_getCombiningClass ( )
* A fast , at least partly inline version of u_getCombiningClass ( )
* This is a candidate for further optimization . Used heavily
* in contraction processing .
*/
2001-10-22 05:30:22 +00:00
static
2001-06-06 23:26:50 +00:00
inline uint8_t i_getCombiningClass ( UChar c , const UCollator * coll ) {
uint8_t sCC = 0 ;
if ( c > = 0x300 & & ucol_unsafeCP ( c , coll ) ) {
sCC = u_getCombiningClass ( c ) ;
}
return sCC ;
}
2001-01-15 07:28:54 +00:00
/**
 * Sets up a UCollator on top of a binary collation image.
 *
 * @param image  collation data image: a UCATableHeader whose fields give the
 *               byte offsets of the tables that follow it. NULL is rejected.
 * @param fillIn collator to initialize, or NULL to have one allocated with
 *               uprv_malloc() (such a collator is marked freeOnClose).
 * @param status in/out error code; nothing is done if it already signals failure.
 * @return the initialized collator. NULL is returned when the arguments are
 *         rejected, or when a collator allocated here had to be released
 *         because the mapping trie could not be allocated or unserialized.
 *         In those trie failure cases a caller supplied fillIn is returned
 *         as is, with the error reported through status.
 */
UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UErrorCode *status) {
    UChar c;
    UCollator *result = fillIn;

    if (U_FAILURE(*status) || image == NULL) {
        return NULL;
    }

    if (result != NULL) {
        result->freeOnClose = FALSE;
    } else {
        result = (UCollator *)uprv_malloc(sizeof(UCollator));
        if (result == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return result;
        }
        result->freeOnClose = TRUE;
    }

    result->image = image;
    /* every table is located by a byte offset from the start of the image */
    uint8_t *base = (uint8_t *)image;

    /* main code point -> CE mapping, stored as a serialized trie */
    const uint8_t *trieData = base + image->mappingPosition;
    UTrie *trie = (UTrie *)uprv_malloc(sizeof(UTrie));
    if (trie == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        if (result->freeOnClose == TRUE) {
            uprv_free(result);
            result = NULL;
        }
        return result;
    }
    utrie_unserialize(trie, trieData, image->endExpansionCE - image->mappingPosition, status);
    if (U_FAILURE(*status)) {
        if (result->freeOnClose == TRUE) {
            uprv_free(result);
            result = NULL;
        }
        uprv_free(trie);
        return result;
    }
    result->mapping = trie;

    result->latinOneMapping = UTRIE_GET32_LATIN1(result->mapping);
    result->contractionCEs = (uint32_t *)(base + image->contractionCEs);
    result->contractionIndex = (UChar *)(base + image->contractionIndex);
    result->expansion = (uint32_t *)(base + image->expansion);

    /* the option set lives inside the image, so it must not be freed on close */
    UColOptionSet *imageOptions = (UColOptionSet *)(base + image->options);
    result->options = imageOptions;
    result->freeOptionsOnClose = FALSE;

    /* set attributes */
    result->caseFirst = (UColAttributeValue)imageOptions->caseFirst;
    result->caseLevel = (UColAttributeValue)imageOptions->caseLevel;
    result->frenchCollation = (UColAttributeValue)imageOptions->frenchCollation;
    result->normalizationMode = (UColAttributeValue)imageOptions->normalizationMode;
    result->strength = (UColAttributeValue)imageOptions->strength;
    result->variableTopValue = imageOptions->variableTopValue;
    result->alternateHandling = (UColAttributeValue)imageOptions->alternateHandling;
    result->hiraganaQ = (UColAttributeValue)imageOptions->hiraganaQ;

    /* every attribute starts out at the value found in the image */
    result->caseFirstisDefault = TRUE;
    result->caseLevelisDefault = TRUE;
    result->frenchCollationisDefault = TRUE;
    result->normalizationModeisDefault = TRUE;
    result->strengthisDefault = TRUE;
    result->variableTopValueisDefault = TRUE;
    result->alternateHandlingisDefault = TRUE;
    result->hiraganaQisDefault = TRUE;

    result->scriptOrder = NULL;

    result->rules = NULL;
    result->rulesLength = 0;

    /* get the version info from UCATableHeader and populate the Collator struct */
    result->dataInfo.dataVersion[0] = image->version[0]; /* UCA Builder version */
    result->dataInfo.dataVersion[1] = image->version[1]; /* UCA Tailoring rules version */

    /* Find the smallest unsafe char. minUnsafeCP is zeroed before the scan, */
    /* as ucol_unsafeCP() is handed the collator that is still being set up. */
    result->unsafeCP = base + image->unsafeCP;
    result->minUnsafeCP = 0;
    c = 0;
    while (c < 0x300 && !ucol_unsafeCP(c, result)) {
        c++;
    }
    result->minUnsafeCP = c;

    /* Find the smallest contraction-ending char, in the same way. */
    result->contrEndCP = base + image->contrEndCP;
    result->minContrEndCP = 0;
    c = 0;
    while (c < 0x300 && !ucol_contractionEndCP(c, result)) {
        c++;
    }
    result->minContrEndCP = c;

    /* max expansion tables */
    result->endExpansionCE = (uint32_t *)(base + image->endExpansionCE);
    result->lastEndExpansionCE = result->endExpansionCE + image->endExpansionCECount - 1;
    result->expansionCESize = base + image->expansionCESize;

    /* the FCD trie is shared by the whole file and fetched only once */
    if (fcdTrieIndex == NULL) {
        fcdTrieIndex = unorm_getFCDTrie(status);
    }

    result->latinOneCEs = NULL;
    result->latinOneRegenTable = FALSE;
    result->latinOneFailed = FALSE;

    ucol_updateInternalState(result, status);

    return result;
}
2001-08-30 02:59:19 +00:00
/**
 * Releases the file level UCA state: closes the UCA data memory and the UCA
 * collator if they are set, and resets both pointers to NULL.
 * @return always TRUE
 */
U_CFUNC UBool
ucol_cleanup(void)
{
    if (UCA_DATA_MEM != NULL) {
        udata_close(UCA_DATA_MEM);
        UCA_DATA_MEM = NULL;
    }

    if (UCA != NULL) {
        ucol_close(UCA);
        UCA = NULL;
    }

    return TRUE;
}
2001-01-31 07:20:56 +00:00
2002-06-13 18:34:41 +00:00
/* Following is a port of Mark's code for new treatment of implicits.
 * It is positioned here, since ucol_initUCA needs to initialize the
 * variables below according to the data in the fractional UCA.
 */
2002-08-01 23:09:41 +00:00
2002-06-13 18:34:41 +00:00
/**
 * Function used to:
 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
 * b) bump any non-CJK characters by 0x110000 (NON_CJK_OFFSET).
 * The relevant blocks are:
 * A:    4E00..9FFF; CJK Unified Ideographs
 *       F900..FAFF; CJK Compatibility Ideographs
 * B:    3400..4DBF; CJK Unified Ideographs Extension A
 *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
 * As long as
 *   no new B characters are allocated between 4E00 and FAFF, and
 *   no new A characters are outside of this range,
 * (very high probability) this simple code will work.
 * The reordered blocks are:
 * Block1 is CJK
 * Block2 is CJK_COMPAT_USED
 * Block3 is CJK_A
 * Any other CJK gets its normal code point
 * Any non-CJK gets +0x110000
 * When we reorder Block1, we make sure that it is at the very start,
 * so that it will use a 3-byte form.
 */
// CONSTANTS
static const uint32_t
NON_CJK_OFFSET = 0x110000 ,
BYTES_TO_AVOID = 3 ,
OTHER_COUNT = 256 - BYTES_TO_AVOID ,
LAST_COUNT = OTHER_COUNT / 2 ,
LAST_COUNT2 = OTHER_COUNT / 21 , // room for intervening, without expanding to 5 bytes
IMPLICIT_3BYTE_COUNT = 1 ;
// These depend on initUCA, and are initialized at that time
static uint32_t
IMPLICIT_BASE_BYTE = 0 ,
IMPLICIT_LIMIT_BYTE = 0 , // leave room for 1 3-byte and 2 4-byte forms
IMPLICIT_4BYTE_BOUNDARY = 0 ,
LAST_MULTIPLIER = 0 ,
LAST2_MULTIPLIER = 0 ,
IMPLICIT_BASE_3BYTE = 0 ,
IMPLICIT_BASE_4BYTE = 0 ;
2002-07-25 17:38:34 +00:00
static const UChar32
2002-06-13 18:34:41 +00:00
CJK_BASE = 0x4E00 ,
CJK_LIMIT = 0x9FFF + 1 ,
CJK_COMPAT_USED_BASE = 0xFA0E ,
CJK_COMPAT_USED_LIMIT = 0xFA2F + 1 ,
CJK_A_BASE = 0x3400 ,
CJK_A_LIMIT = 0x4DBF + 1 ,
CJK_B_BASE = 0x20000 ,
CJK_B_LIMIT = 0x2A6DF + 1 ;
/**
 * Reorders code points ahead of implicit weight generation.
 *   [CJK_BASE, CJK_LIMIT)                         -> starts at 0
 *   [CJK_COMPAT_USED_BASE, CJK_COMPAT_USED_LIMIT) -> directly after the block above
 *   [CJK_A_BASE, CJK_A_LIMIT)                     -> directly after the block above
 *   [CJK_B_BASE, CJK_B_LIMIT)                     -> unchanged
 *   everything else                               -> cp + NON_CJK_OFFSET
 * @param cp code point to remap
 * @return the remapped value
 */
static inline UChar32 swapCJK(UChar32 cp) {
    if (cp >= CJK_BASE && cp < CJK_LIMIT) {
        return cp - CJK_BASE;
    }
    if (cp >= CJK_COMPAT_USED_BASE && cp < CJK_COMPAT_USED_LIMIT) {
        return cp - CJK_COMPAT_USED_BASE
                  + (CJK_LIMIT - CJK_BASE);
    }
    if (cp >= CJK_A_BASE && cp < CJK_A_LIMIT) {
        return cp - CJK_A_BASE
                  + (CJK_LIMIT - CJK_BASE)
                  + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
    }
    if (cp >= CJK_B_BASE && cp < CJK_B_LIMIT) {
        return cp; // non-BMP-CJK
    }
    return cp + NON_CJK_OFFSET; // non-CJK
}
// GET IMPLICIT PRIMARY WEIGHTS
// Return value is left justified primary key.
//
// The code point is first remapped by swapCJK(). Remapped values below
// IMPLICIT_4BYTE_BOUNDARY get a 3-byte form on top of IMPLICIT_BASE_3BYTE,
// all others get a 4-byte form on top of IMPLICIT_BASE_4BYTE.
// The value is split into "digits": the non-final bytes count in base
// OTHER_COUNT, the final byte counts in base LAST_COUNT (3-byte form) or
// LAST_COUNT2 (4-byte form), and the final digit is stretched by
// LAST_MULTIPLIER / LAST2_MULTIPLIER so that a gap is left between
// neighbouring weights.
// The bases and multipliers are set by uprv_uca_initImplicitConstants().
static inline uint32_t getImplicitPrimary(UChar32 cp) {
    cp = swapCJK(cp);

    int32_t low = cp - IMPLICIT_4BYTE_BOUNDARY;
    if (low >= 0) {
        // four byte form
        int32_t mid = low / LAST_COUNT2;
        low %= LAST_COUNT2;

        int32_t high = mid / OTHER_COUNT;
        mid %= OTHER_COUNT;

        int32_t top = high / OTHER_COUNT;
        high %= OTHER_COUNT;

        return IMPLICIT_BASE_4BYTE + (top << 24) + (high << 16) + (mid << 8) + (low * LAST2_MULTIPLIER);
    }

    // three byte form
    int32_t mid = cp / LAST_COUNT;
    low = cp % LAST_COUNT;

    int32_t high = mid / OTHER_COUNT;
    mid %= OTHER_COUNT;

    return IMPLICIT_BASE_3BYTE + (high << 24) + (mid << 16) + ((low * LAST_MULTIPLIER) << 8);
}
/* This function is either called from initUCA or from genUCA before
 * doing canonical closure for the UCA.
 * It derives every run time constant used by getImplicitPrimary() from
 * the lead byte at which implicit weights start.
 * @param baseByte first lead byte used for implicit primary weights
 */
U_CAPI void U_EXPORT2
uprv_uca_initImplicitConstants(uint32_t baseByte)
{
    /* 4-byte forms use the lead bytes that follow the 3-byte ones */
    const uint32_t fourByteLead = baseByte + IMPLICIT_3BYTE_COUNT;

    IMPLICIT_BASE_BYTE = baseByte;
    IMPLICIT_LIMIT_BYTE = baseByte + 4; // leave room for 1 3-byte and 2 4-byte forms

    IMPLICIT_4BYTE_BOUNDARY = IMPLICIT_3BYTE_COUNT * OTHER_COUNT * LAST_COUNT;

    LAST_MULTIPLIER = OTHER_COUNT / LAST_COUNT;
    LAST2_MULTIPLIER = OTHER_COUNT / LAST_COUNT2;

    IMPLICIT_BASE_3BYTE = (baseByte << 24) + 0x030300;
    IMPLICIT_BASE_4BYTE = (fourByteLead << 24) + 0x030303;
}
2001-08-30 02:59:19 +00:00
/**
 * Loads the UCA collation data and publishes the shared UCA collator, unless
 * that has already been done.
 *
 * The data is opened and a collator is built outside of the mutex; only the
 * assignment to UCA / UCA_DATA_MEM happens under umtx_lock(). If UCA turns out
 * to be set already at that point, the objects built here are discarded.
 * After UCA is available, the implicit weight constants are initialized from
 * the UCA constants found in the image.
 *
 * @param status in/out error code; nothing is done if it already signals
 *               failure. Set by udata_openChoice() or ucol_initCollator()
 *               when they fail.
 */
void ucol_initUCA(UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }
    if(UCA != NULL) {
        return; /* already initialized */
    }

    UCollator *newUCA = NULL;
    UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status);

    if(U_FAILURE(*status)) {
        if(result != NULL) {
            udata_close(result);
        }
        /* Stop here. Falling through used to hand the handle that was just */
        /* closed to udata_getMemory() and then to udata_close() again.     */
        return;
    }

    if(result == NULL) { /* It looks like sometimes we can fail to find the data file */
        return;
    }

    newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, status);
    if(U_FAILURE(*status)) {
        udata_close(result);
        uprv_free(newUCA);
        UCA = NULL;
        return;
    }

    newUCA->rb = NULL;
    newUCA->binary = NULL;
    newUCA->requestedLocale = NULL;
    newUCA->hasRealData = FALSE; // real data lives in .dat file...

    /* publish the collator, unless UCA has been set in the meantime */
    umtx_lock(NULL);
    if(UCA == NULL) {
        UCA = newUCA;
        UCA_DATA_MEM = result;
        result = NULL;
        newUCA = NULL;
    }
    umtx_unlock(NULL);

    if(newUCA != NULL) {
        /* UCA was already set: drop what was built here. */
        /* NOTE(review): uprv_free() does not release newUCA->mapping, which */
        /* ucol_initCollator() allocates separately - confirm whether        */
        /* ucol_close() should be used here instead.                         */
        udata_close(result);
        uprv_free(newUCA);
    }
    else {
        ucln_i18n_registerCleanup();
    }

    // Initalize variables for implicit generation
    UCAconsts = (UCAConstants *)((uint8_t *)UCA->image + UCA->image->UCAConsts);
    uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN);
    UCA->mapping->getFoldingOffset = _getFoldingOffset;
}
2000-11-30 23:20:14 +00:00
2003-01-20 07:43:32 +00:00
2001-04-23 21:29:14 +00:00
/* collIterNormalize Incremental Normalization happens here. */
/* pick up the range of chars identifed by FCD, */
/* normalize it into the collIterate's writable buffer, */
/* switch the collIterate's state to use the writable buffer. */
/* */
2001-10-20 01:09:31 +00:00
static
2001-04-23 21:29:14 +00:00
void collIterNormalize ( collIterate * collationSource )
{
UErrorCode status = U_ZERO_ERROR ;
2003-01-20 07:43:32 +00:00
int32_t normLen ;
2001-04-23 21:29:14 +00:00
UChar * srcP = collationSource - > pos - 1 ; /* Start of chars to normalize */
UChar * endP = collationSource - > fcdPosition ; /* End of region to normalize+1 */
2001-10-19 17:36:02 +00:00
normLen = unorm_decompose ( collationSource - > writableBuffer , ( int32_t ) collationSource - > writableBufSize ,
2001-09-27 01:01:30 +00:00
srcP , ( int32_t ) ( endP - srcP ) ,
2003-02-15 02:02:13 +00:00
FALSE , 0 ,
2001-09-27 01:01:30 +00:00
& status ) ;
2001-10-19 17:36:02 +00:00
if ( status = = U_BUFFER_OVERFLOW_ERROR | | status = = U_STRING_NOT_TERMINATED_WARNING ) {
2001-09-27 01:01:30 +00:00
// reallocate and terminate
if ( ! u_growBufferFromStatic ( collationSource - > stackWritableBuffer ,
& collationSource - > writableBuffer ,
( int32_t * ) & collationSource - > writableBufSize , normLen + 1 ,
2001-10-19 17:36:02 +00:00
0 )
2001-09-27 01:01:30 +00:00
) {
2001-05-17 23:09:35 +00:00
# ifdef UCOL_DEBUG
2001-09-27 01:01:30 +00:00
fprintf ( stderr , " collIterNormalize(), out of memory \n " ) ;
2001-05-17 23:09:35 +00:00
# endif
2001-04-23 21:29:14 +00:00
return ;
}
2001-10-19 17:36:02 +00:00
status = U_ZERO_ERROR ;
normLen = unorm_decompose ( collationSource - > writableBuffer , ( int32_t ) collationSource - > writableBufSize ,
srcP , ( int32_t ) ( endP - srcP ) ,
2003-02-15 02:02:13 +00:00
FALSE , 0 ,
2001-10-19 17:36:02 +00:00
& status ) ;
}
if ( U_FAILURE ( status ) ) {
# ifdef UCOL_DEBUG
fprintf ( stderr , " collIterNormalize(), unorm_decompose() failed, status = %s \n " , u_errorName ( status ) ) ;
# endif
return ;
2001-04-23 21:29:14 +00:00
}
2001-04-24 03:18:54 +00:00
2003-01-20 07:43:32 +00:00
if ( collationSource - > writableBuffer ! = collationSource - > stackWritableBuffer ) {
collationSource - > flags | = UCOL_ITER_ALLOCATED ;
}
collationSource - > pos = collationSource - > writableBuffer ;
collationSource - > origFlags = collationSource - > flags ;
collationSource - > flags | = UCOL_ITER_INNORMBUF ;
collationSource - > flags & = ~ ( UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR ) ;
2001-04-23 21:29:14 +00:00
}
2003-01-20 07:43:32 +00:00
// Iterator flavour of collIterNormalize(): reads from the character iterator
// up to the next boundary with unorm_next() (UNORM_FCD), leaves the zero
// terminated result in the writable buffer and switches the collIterate over
// to that buffer.
// When the result does not fit, or fits without room for the terminator, the
// buffer is grown, the iterator is put back to the state saved on entry and
// the text is read again. If growing fails the function returns at once.
static
inline void normalizeIterator(collIterate *collationSource) {
    UErrorCode errorCode = U_ZERO_ERROR;
    UBool wasNormalized = FALSE;

    // saved so that the same text can be read a second time after growing
    uint32_t savedState = collationSource->iterator->getState(collationSource->iterator);

    int32_t length = unorm_next(collationSource->iterator, collationSource->writableBuffer,
                                (int32_t)collationSource->writableBufSize,
                                UNORM_FCD, 0, TRUE, &wasNormalized, &errorCode);

    const UBool noRoomForTerminator = (length == (int32_t)collationSource->writableBufSize);
    if (errorCode == U_BUFFER_OVERFLOW_ERROR || noRoomForTerminator) {
        const UBool grown = u_growBufferFromStatic(collationSource->stackWritableBuffer,
                                                   &collationSource->writableBuffer,
                                                   (int32_t *)&collationSource->writableBufSize,
                                                   length + 1,
                                                   0);
        if (!grown) {
#ifdef UCOL_DEBUG
            fprintf(stderr, " normalizeIterator(), out of memory \n ");
#endif
            return;
        }
        errorCode = U_ZERO_ERROR;
        collationSource->iterator->setState(collationSource->iterator, savedState, &errorCode);
        length = unorm_next(collationSource->iterator, collationSource->writableBuffer,
                            (int32_t)collationSource->writableBufSize,
                            UNORM_FCD, 0, TRUE, &wasNormalized, &errorCode);
    }

    // Terminate the buffer - the check above made sure there is room for it
    collationSource->writableBuffer[length] = 0;

    if (collationSource->writableBuffer != collationSource->stackWritableBuffer) {
        collationSource->flags |= UCOL_ITER_ALLOCATED;
    }

    // Continue from the start of the side buffer; the flags are saved in
    // origFlags before they are changed.
    collationSource->pos = collationSource->writableBuffer;
    collationSource->origFlags = collationSource->flags;
    collationSource->flags |= UCOL_ITER_INNORMBUF;
    collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
}
// This function takes the iterator and extracts normalized stuff up to the previous boundary.
// There is one assumption I use here: due to the nature of how the iterative collation works,
// we will never arrive here unless the normalization mode is turned on and in that case
// we are always in the normalization buffer and want to preserve the original flags (see
// below).
//
// Buffer layout on return: writableBuffer[0] holds a zero, the normalized text
// follows from writableBuffer[1], and pos points just behind that text.
// Since the text starts at index 1, only (writableBufSize - 1) code units are
// available to unorm_previous(); this is the capacity that is passed to it.
// When the text does not fit, the buffer is grown, the iterator is put back to
// the state saved on entry and the text is read again. If growing fails the
// function returns at once.
static
inline void normalizeIteratorBackwards(collIterate *collationSource) {
    UErrorCode status = U_ZERO_ERROR;
    UBool wasNormalized = FALSE;

    collationSource->iterator->move(collationSource->iterator, -1, UITER_CURRENT);
    // saved so that the same text can be read a second time after growing
    uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);

    // one slot is taken by the leading zero
    int32_t capacity = (int32_t)collationSource->writableBufSize - 1;
    if(capacity < 0) {
        capacity = 0;
    }

    *(collationSource->writableBuffer) = 0;
    int32_t normLen = unorm_previous(collationSource->iterator, collationSource->writableBuffer + 1,
                                     capacity, UNORM_FCD, 0, TRUE, &wasNormalized, &status);

    if(status == U_BUFFER_OVERFLOW_ERROR) {
        // reallocate: normLen code units plus the leading zero
        if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
                                   &collationSource->writableBuffer,
                                   (int32_t *)&collationSource->writableBufSize, normLen + 1,
                                   0)
        ) {
#ifdef UCOL_DEBUG
            fprintf(stderr, " normalizeIteratorBackwards(), out of memory \n ");
#endif
            return;
        }
        *(collationSource->writableBuffer) = 0;
        status = U_ZERO_ERROR;
        collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
        normLen = unorm_previous(collationSource->iterator, collationSource->writableBuffer + 1,
                                 (int32_t)collationSource->writableBufSize - 1,
                                 UNORM_FCD, 0, TRUE, &wasNormalized, &status);
    }

    if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
        collationSource->flags |= UCOL_ITER_ALLOCATED;
    }

    // Continue backwards from the end of the normalized text.
    collationSource->pos = collationSource->writableBuffer + 1 + normLen;
    // Do not copy the original flags, they were already copied. See the comment
    // on the opening line of the function.
    collationSource->flags |= UCOL_ITER_INNORMBUF;
    collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
}
2001-04-23 21:29:14 +00:00
/* Incremental FCD check and normalize */
/* Called from getNextCE when normalization state is suspect. */
/* When entering, the state is known to be this: */
/* o We are working in the main buffer of the collIterate, not the side */
/* writable buffer. When in the side buffer, normalization mode is always off, */
/* so we won't get here. */
/* o The leading combining class from the current character is 0 or */
/* the trailing combining class of the previous char was zero. */
/* True because the previous call to this function will have always exited */
/* that way, and we get called for every char where cc might be non-zero. */
2001-10-22 05:30:22 +00:00
static
2001-04-24 03:18:54 +00:00
inline UBool collIterFCD ( collIterate * collationSource ) {
2001-08-17 00:21:18 +00:00
UChar c , c2 ;
const UChar * srcP , * endP ;
2001-04-23 21:29:14 +00:00
uint8_t leadingCC ;
uint8_t prevTrailingCC = 0 ;
uint16_t fcd ;
UBool needNormalize = FALSE ;
srcP = collationSource - > pos - 1 ;
2001-08-17 00:21:18 +00:00
if ( collationSource - > flags & UCOL_ITER_HASLEN ) {
endP = collationSource - > endp ;
} else {
endP = NULL ;
}
2001-04-23 21:29:14 +00:00
// Get the trailing combining class of the current character. If it's zero,
// we are OK.
2001-08-17 00:21:18 +00:00
c = * srcP + + ;
2001-04-23 21:29:14 +00:00
/* trie access */
2001-08-17 00:21:18 +00:00
fcd = unorm_getFCD16 ( fcdTrieIndex , c ) ;
if ( fcd ! = 0 ) {
if ( UTF_IS_FIRST_SURROGATE ( c ) ) {
if ( ( endP = = NULL | | srcP ! = endP ) & & UTF_IS_SECOND_SURROGATE ( c2 = * srcP ) ) {
+ + srcP ;
fcd = unorm_getFCD16FromSurrogatePair ( fcdTrieIndex , fcd , c2 ) ;
} else {
fcd = 0 ;
2001-04-23 21:29:14 +00:00
}
2001-08-17 00:21:18 +00:00
}
2001-04-23 21:29:14 +00:00
2001-08-17 00:21:18 +00:00
prevTrailingCC = ( uint8_t ) ( fcd & LAST_BYTE_MASK_ ) ;
if ( prevTrailingCC ! = 0 ) {
// The current char has a non-zero trailing CC. Scan forward until we find
// a char with a leading cc of zero.
while ( endP = = NULL | | srcP ! = endP )
{
const UChar * savedSrcP = srcP ;
2001-04-23 21:29:14 +00:00
2001-08-17 00:21:18 +00:00
c = * srcP + + ;
/* trie access */
fcd = unorm_getFCD16 ( fcdTrieIndex , c ) ;
if ( fcd ! = 0 & & UTF_IS_FIRST_SURROGATE ( c ) ) {
if ( ( endP = = NULL | | srcP ! = endP ) & & UTF_IS_SECOND_SURROGATE ( c2 = * srcP ) ) {
+ + srcP ;
fcd = unorm_getFCD16FromSurrogatePair ( fcdTrieIndex , fcd , c2 ) ;
} else {
fcd = 0 ;
}
}
leadingCC = ( uint8_t ) ( fcd > > SECOND_LAST_BYTE_SHIFT_ ) ;
if ( leadingCC = = 0 ) {
srcP = savedSrcP ; // Hit char that is not part of combining sequence.
// back up over it. (Could be surrogate pair!)
break ;
}
if ( leadingCC < prevTrailingCC ) {
needNormalize = TRUE ;
}
prevTrailingCC = ( uint8_t ) ( fcd & LAST_BYTE_MASK_ ) ;
}
2001-04-23 21:29:14 +00:00
}
}
2001-05-11 01:13:08 +00:00
2001-08-17 00:21:18 +00:00
collationSource - > fcdPosition = ( UChar * ) srcP ;
2001-04-23 21:29:14 +00:00
2001-04-24 03:18:54 +00:00
return needNormalize ;
2001-04-23 21:29:14 +00:00
}
2001-01-16 00:28:40 +00:00
/****************************************************************************/
/* Following are the CE retrieval functions */
/* */
/****************************************************************************/
1999-08-16 21:50:52 +00:00
2001-01-16 00:28:40 +00:00
/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
2001-10-22 05:30:22 +00:00
static
2001-04-18 19:31:05 +00:00
inline uint32_t ucol_IGetNextCE ( const UCollator * coll , collIterate * collationSource , UErrorCode * status ) {
2002-12-04 00:28:06 +00:00
uint32_t order = 0 ;
2001-02-06 00:36:48 +00:00
if ( collationSource - > CEpos > collationSource - > toReturn ) { /* Are there any CEs from previous expansions? */
order = * ( collationSource - > toReturn + + ) ; /* if so, return them */
if ( collationSource - > CEpos = = collationSource - > toReturn ) {
2001-04-23 01:53:49 +00:00
collationSource - > CEpos = collationSource - > toReturn = collationSource - > CEs ;
2001-02-06 00:36:48 +00:00
}
2001-04-06 23:37:48 +00:00
return order ;
}
2002-12-04 00:28:06 +00:00
UChar ch = 0 ;
2001-04-06 23:37:48 +00:00
2001-04-12 19:59:28 +00:00
for ( ; ; ) /* Loop handles case when incremental normalize switches */
{ /* to or from the side buffer / original string, and we */
/* need to start again to get the next character. */
2001-04-23 01:53:49 +00:00
2003-01-20 07:43:32 +00:00
if ( ( collationSource - > flags & ( UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR ) ) = = 0 )
2001-04-23 01:53:49 +00:00
{
2001-04-06 23:37:48 +00:00
// The source string is null terminated and we're not working from the side buffer,
// and we're not normalizing. This is the fast path.
2001-04-23 21:29:14 +00:00
// (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
2001-04-06 23:37:48 +00:00
ch = * collationSource - > pos + + ;
2001-04-12 19:59:28 +00:00
if ( ch ! = 0 ) {
2001-04-06 23:37:48 +00:00
break ;
2001-04-12 19:59:28 +00:00
}
else {
2001-04-06 23:37:48 +00:00
return UCOL_NO_MORE_CES ;
2001-04-12 19:59:28 +00:00
}
2001-04-06 23:37:48 +00:00
}
if ( collationSource - > flags & UCOL_ITER_HASLEN ) {
// Normal path for strings when length is specified.
// (We can't be in side buffer because it is always null terminated.)
if ( collationSource - > pos > = collationSource - > endp ) {
// Ran off of the end of the main source string. We're done.
return UCOL_NO_MORE_CES ;
}
ch = * collationSource - > pos + + ;
}
2003-01-20 07:43:32 +00:00
else if ( collationSource - > flags & UCOL_USE_ITERATOR ) {
2003-01-23 01:52:34 +00:00
//if(!(collationSource->flags & UCOL_ITER_NORM)) {
2003-02-06 23:29:56 +00:00
UChar32 iterCh = collationSource - > iterator - > next ( collationSource - > iterator ) ;
if ( iterCh = = U_SENTINEL ) {
2003-01-20 07:43:32 +00:00
return UCOL_NO_MORE_CES ;
}
2003-02-06 23:29:56 +00:00
ch = ( UChar ) iterCh ;
2003-01-23 01:52:34 +00:00
#if 0
2003-01-20 07:43:32 +00:00
} else {
// do the incremental normalization of the iterator contents.
// God knows how we're going to get back from it :)
if ( collationSource - > iterator - > hasNext ( collationSource - > iterator ) ) {
normalizeIterator ( collationSource ) ;
continue ;
} else {
return UCOL_NO_MORE_CES ;
}
}
2003-01-23 01:52:34 +00:00
# endif
2003-01-20 07:43:32 +00:00
}
2001-04-06 23:37:48 +00:00
else
{
2001-04-23 21:29:14 +00:00
// Null terminated string.
2001-04-06 23:37:48 +00:00
ch = * collationSource - > pos + + ;
2001-04-23 21:29:14 +00:00
if ( ch = = 0 ) {
// Ran off end of buffer.
if ( ( collationSource - > flags & UCOL_ITER_INNORMBUF ) = = 0 ) {
2001-06-20 18:14:51 +00:00
// Ran off end of main string. backing up one character.
collationSource - > pos - - ;
2001-04-23 21:29:14 +00:00
return UCOL_NO_MORE_CES ;
}
else
{
2001-05-25 19:30:01 +00:00
// Hit null in the normalize side buffer.
// Usually this means the end of the normalized data,
// except for one odd case: a null followed by combining chars,
// which is the case if we are at the start of the buffer.
2003-01-20 07:43:32 +00:00
// iterTODO - this seems to be fine with the iterator code.
2001-05-25 19:30:01 +00:00
if ( collationSource - > pos = = collationSource - > writableBuffer + 1 ) {
break ;
}
// Null marked end of side buffer.
// Revert to the main string and
2001-04-23 21:29:14 +00:00
// loop back to top to try again to get a character.
2003-01-20 07:43:32 +00:00
// iterTODO - this also seems to be fine - fcdPosition should be NULL
// when we constructed the side buffer. origFlags will put the iterator
// back in control.
2001-04-23 21:29:14 +00:00
collationSource - > pos = collationSource - > fcdPosition ;
collationSource - > flags = collationSource - > origFlags ;
continue ;
}
2001-04-12 19:59:28 +00:00
}
2001-04-06 23:37:48 +00:00
}
2001-04-23 01:53:49 +00:00
2001-10-05 02:07:51 +00:00
if ( collationSource - > flags & UCOL_HIRAGANA_Q ) {
2001-11-01 00:00:15 +00:00
if ( ( ch > = 0x3040 & & ch < = 0x3094 ) | | ch = = 0x309d | | ch = = 0x309e ) {
2001-10-05 02:07:51 +00:00
collationSource - > flags | = UCOL_WAS_HIRAGANA ;
} else {
collationSource - > flags & = ~ UCOL_WAS_HIRAGANA ;
}
}
2001-04-06 23:37:48 +00:00
// We've got a character. See if there's any fcd and/or normalization stuff to do.
2001-04-23 21:29:14 +00:00
// Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
2001-04-12 19:59:28 +00:00
if ( ( collationSource - > flags & UCOL_ITER_NORM ) = = 0 ) {
2001-04-06 23:37:48 +00:00
break ;
2001-04-12 19:59:28 +00:00
}
2001-04-23 01:53:49 +00:00
2003-01-20 07:43:32 +00:00
// iterTODO
2001-04-06 23:37:48 +00:00
if ( collationSource - > fcdPosition > = collationSource - > pos ) {
// An earlier FCD check has already covered the current character.
// We can go ahead and process this char.
break ;
}
2001-04-23 01:53:49 +00:00
2001-04-20 22:29:53 +00:00
if ( ch < ZERO_CC_LIMIT_ ) {
2001-04-06 23:37:48 +00:00
// Fast fcd safe path. Trailing combining class == 0. This char is OK.
break ;
}
2001-04-23 01:53:49 +00:00
2001-04-06 23:37:48 +00:00
if ( ch < NFC_ZERO_CC_BLOCK_LIMIT_ ) {
// We need to peek at the next character in order to tell if we are FCD
2003-01-20 07:43:32 +00:00
// iterTODO
2001-04-12 19:59:28 +00:00
if ( ( collationSource - > flags & UCOL_ITER_HASLEN ) & & collationSource - > pos > = collationSource - > endp ) {
2001-04-06 23:37:48 +00:00
// We are at the last char of source string.
// It is always OK for FCD check.
break ;
2001-04-12 19:59:28 +00:00
}
2001-04-23 01:53:49 +00:00
2001-04-06 23:37:48 +00:00
// Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
2003-01-20 07:43:32 +00:00
// iterTODO
2001-04-12 19:59:28 +00:00
if ( * collationSource - > pos < NFC_ZERO_CC_BLOCK_LIMIT_ ) {
2001-04-06 23:37:48 +00:00
break ;
2001-04-12 19:59:28 +00:00
}
2001-04-06 23:37:48 +00:00
}
2001-04-23 01:53:49 +00:00
2003-01-20 07:43:32 +00:00
2001-04-06 23:37:48 +00:00
// Need a more complete FCD check and possible normalization.
2001-04-24 03:18:54 +00:00
if ( collIterFCD ( collationSource ) ) {
2003-01-20 07:43:32 +00:00
// iterTODO done above!
2001-04-24 03:18:54 +00:00
collIterNormalize ( collationSource ) ;
}
2001-04-06 23:37:48 +00:00
if ( ( collationSource - > flags & UCOL_ITER_INNORMBUF ) = = 0 ) {
// No normalization was needed. Go ahead and process the char we already had.
break ;
}
2001-04-23 01:53:49 +00:00
2001-04-06 23:37:48 +00:00
// Some normalization happened. Next loop iteration will pick up a char
// from the normalization buffer.
2001-04-23 01:53:49 +00:00
} // end for (;;)
2001-04-18 19:31:05 +00:00
if ( ch < = 0xFF ) {
/* For latin-1 characters we never need to fall back to the UCA table */
/* because all of the UCA data is replicated in the latinOneMapping array */
order = coll - > latinOneMapping [ ch ] ;
if ( order > UCOL_NOT_FOUND ) {
2001-10-20 01:09:31 +00:00
order = ucol_prv_getSpecialCE ( coll , ch , order , collationSource , status ) ;
2001-04-18 19:31:05 +00:00
}
}
else
{
2001-12-19 07:00:45 +00:00
/*order = ucmpe32_get(coll->mapping, ch);*/ /* we'll go for slightly slower trie */
order = UTRIE_GET32_FROM_LEAD ( coll - > mapping , ch ) ;
2001-04-18 19:31:05 +00:00
if ( order > UCOL_NOT_FOUND ) { /* if a CE is special */
2001-10-20 01:09:31 +00:00
order = ucol_prv_getSpecialCE ( coll , ch , order , collationSource , status ) ; /* and try to get the special CE */
2001-04-18 19:31:05 +00:00
}
2001-04-23 01:53:49 +00:00
if ( order = = UCOL_NOT_FOUND ) { /* We couldn't find a good CE in the tailoring */
2001-09-20 20:16:39 +00:00
/* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
2001-12-19 07:00:45 +00:00
/*order = ucmpe32_get(UCA->mapping, ch);*/
order = UTRIE_GET32_FROM_LEAD ( UCA - > mapping , ch ) ;
2001-09-20 20:16:39 +00:00
if ( order > UCOL_NOT_FOUND ) { /* UCA also gives us a special CE */
2001-10-20 01:09:31 +00:00
order = ucol_prv_getSpecialCE ( UCA , ch , order , collationSource , status ) ;
2001-09-20 20:16:39 +00:00
}
2001-04-23 01:53:49 +00:00
}
}
2001-02-06 00:36:48 +00:00
return order ; /* return the CE */
}
2001-04-18 19:31:05 +00:00
/* ucol_getNextCE, out-of-line version for use from other files. */
2001-11-21 01:08:55 +00:00
U_CAPI uint32_t U_EXPORT2
ucol_getNextCE ( const UCollator * coll , collIterate * collationSource , UErrorCode * status ) {
2001-04-18 19:31:05 +00:00
return ucol_IGetNextCE ( coll , collationSource , status ) ;
}
2001-04-12 00:08:26 +00:00
/**
2001-04-23 01:53:49 +00:00
* Incremental previous normalization happens here . Pick up the range of chars
* identifed by FCD , normalize it into the collIterate ' s writable buffer ,
* switch the collIterate ' s state to use the writable buffer .
2001-04-12 00:08:26 +00:00
* @ param data collation iterator data
*/
2001-10-20 01:09:31 +00:00
static
2001-04-12 00:08:26 +00:00
void collPrevIterNormalize ( collIterate * data )
{
2001-04-17 02:43:35 +00:00
UErrorCode status = U_ZERO_ERROR ;
2001-04-24 03:18:54 +00:00
UChar * pEnd = data - > pos ; /* End normalize + 1 */
2001-04-12 18:25:07 +00:00
UChar * pStart ;
2001-04-12 00:08:26 +00:00
uint32_t normLen ;
2001-04-20 22:29:53 +00:00
UChar * pStartNorm ;
2001-04-12 00:08:26 +00:00
2001-04-12 18:25:07 +00:00
/* Start normalize */
if ( data - > fcdPosition = = NULL ) {
pStart = data - > string ;
}
else {
2001-04-23 01:53:49 +00:00
pStart = data - > fcdPosition + 1 ;
2001-04-12 18:25:07 +00:00
}
2001-04-24 03:18:54 +00:00
normLen = unorm_normalize ( pStart , ( pEnd - pStart ) + 1 , UNORM_NFD , 0 ,
2001-04-20 22:29:53 +00:00
data - > writableBuffer , 0 , & status ) ;
2001-05-11 01:13:08 +00:00
2001-04-20 22:29:53 +00:00
if ( data - > writableBufSize < = normLen ) {
2001-05-10 17:49:24 +00:00
freeHeapWritableBuffer ( data ) ;
2001-04-23 01:53:49 +00:00
data - > writableBuffer = ( UChar * ) uprv_malloc ( ( normLen + 1 ) *
sizeof ( UChar ) ) ;
2002-07-20 06:00:04 +00:00
if ( data - > writableBuffer = = NULL ) { // something is wrong here, return
return ;
}
2001-09-27 01:01:30 +00:00
data - > flags | = UCOL_ITER_ALLOCATED ;
2001-04-23 01:53:49 +00:00
/* to handle the zero termination */
data - > writableBufSize = normLen + 1 ;
2001-04-12 00:08:26 +00:00
}
2001-04-23 01:53:49 +00:00
status = U_ZERO_ERROR ;
2001-05-11 01:13:08 +00:00
/*
2001-04-20 22:29:53 +00:00
this puts the null termination infront of the normalized string instead
of the end
*/
pStartNorm = data - > writableBuffer + ( data - > writableBufSize - normLen ) ;
* ( pStartNorm - 1 ) = 0 ;
2001-05-11 01:13:08 +00:00
unorm_normalize ( pStart , ( pEnd - pStart ) + 1 , UNORM_NFD , 0 , pStartNorm ,
2001-04-24 03:18:54 +00:00
normLen , & status ) ;
2001-04-23 01:53:49 +00:00
2001-04-20 22:29:53 +00:00
data - > pos = data - > writableBuffer + data - > writableBufSize ;
2001-04-12 00:08:26 +00:00
data - > origFlags = data - > flags ;
data - > flags | = UCOL_ITER_INNORMBUF ;
data - > flags & = ~ ( UCOL_ITER_NORM | UCOL_ITER_HASLEN ) ;
}
2001-04-06 23:37:48 +00:00
2001-04-12 00:08:26 +00:00
/**
2001-04-23 01:53:49 +00:00
* Incremental FCD check for previous iteration and normalize . Called from
* getPrevCE when normalization state is suspect .
* When entering , the state is known to be this :
* o We are working in the main buffer of the collIterate , not the side
* writable buffer . When in the side buffer , normalization mode is always
* off , so we won ' t get here .
* o The leading combining class from the current character is 0 or the
* trailing combining class of the previous char was zero .
* True because the previous call to this function will have always exited
* that way , and we get called for every char where cc might be non - zero .
2001-04-12 00:08:26 +00:00
* @ param data collation iterate struct
2001-05-11 01:13:08 +00:00
* @ return normalization status , TRUE for normalization to be done , FALSE
2001-04-20 22:29:53 +00:00
* otherwise
2001-04-12 00:08:26 +00:00
*/
2001-10-22 05:30:22 +00:00
static
2001-05-11 01:13:08 +00:00
inline UBool collPrevIterFCD ( collIterate * data )
2001-04-12 00:08:26 +00:00
{
2001-08-17 00:21:18 +00:00
const UChar * src , * start ;
UChar c , c2 ;
2001-04-12 00:08:26 +00:00
uint8_t leadingCC ;
uint8_t trailingCC = 0 ;
uint16_t fcd ;
2001-04-20 22:29:53 +00:00
UBool result = FALSE ;
2001-04-23 01:53:49 +00:00
2001-08-17 00:21:18 +00:00
start = data - > string ;
src = data - > pos + 1 ;
2001-04-06 23:37:48 +00:00
2001-04-12 00:08:26 +00:00
/* Get the trailing combining class of the current character. */
2001-08-17 00:21:18 +00:00
c = * - - src ;
if ( ! UTF_IS_SURROGATE ( c ) ) {
fcd = unorm_getFCD16 ( fcdTrieIndex , c ) ;
} else if ( UTF_IS_SECOND_SURROGATE ( c ) & & start < src & & UTF_IS_FIRST_SURROGATE ( c2 = * ( src - 1 ) ) ) {
- - src ;
fcd = unorm_getFCD16 ( fcdTrieIndex , c2 ) ;
if ( fcd ! = 0 ) {
fcd = unorm_getFCD16FromSurrogatePair ( fcdTrieIndex , fcd , c ) ;
}
} else /* unpaired surrogate */ {
fcd = 0 ;
}
2001-04-06 23:37:48 +00:00
2001-04-12 00:08:26 +00:00
leadingCC = ( uint8_t ) ( fcd > > SECOND_LAST_BYTE_SHIFT_ ) ;
2001-04-23 01:53:49 +00:00
2001-04-12 00:08:26 +00:00
if ( leadingCC ! = 0 ) {
/*
2001-04-23 01:53:49 +00:00
The current char has a non - zero leading combining class .
2001-04-12 00:08:26 +00:00
Scan backward until we find a char with a trailing cc of zero .
*/
2001-05-02 23:36:22 +00:00
for ( ; ; )
2001-04-12 00:08:26 +00:00
{
2001-08-17 00:21:18 +00:00
if ( start = = src ) {
data - > fcdPosition = NULL ;
return result ;
2001-04-12 00:08:26 +00:00
}
2001-04-06 23:37:48 +00:00
2001-08-17 00:21:18 +00:00
c = * - - src ;
if ( ! UTF_IS_SURROGATE ( c ) ) {
fcd = unorm_getFCD16 ( fcdTrieIndex , c ) ;
} else if ( UTF_IS_SECOND_SURROGATE ( c ) & & start < src & & UTF_IS_FIRST_SURROGATE ( c2 = * ( src - 1 ) ) ) {
- - src ;
fcd = unorm_getFCD16 ( fcdTrieIndex , c2 ) ;
if ( fcd ! = 0 ) {
fcd = unorm_getFCD16FromSurrogatePair ( fcdTrieIndex , fcd , c ) ;
}
} else /* unpaired surrogate */ {
fcd = 0 ;
}
2001-04-12 00:08:26 +00:00
trailingCC = ( uint8_t ) ( fcd & LAST_BYTE_MASK_ ) ;
if ( trailingCC = = 0 ) {
break ;
}
2001-04-23 01:53:49 +00:00
2001-04-12 00:08:26 +00:00
if ( leadingCC < trailingCC ) {
2001-04-20 22:29:53 +00:00
result = TRUE ;
2001-04-12 00:08:26 +00:00
}
2001-04-23 01:53:49 +00:00
leadingCC = ( uint8_t ) ( fcd > > SECOND_LAST_BYTE_SHIFT_ ) ;
2001-04-12 00:08:26 +00:00
}
}
2001-04-23 01:53:49 +00:00
2001-08-17 00:21:18 +00:00
data - > fcdPosition = ( UChar * ) src ;
2001-04-12 00:08:26 +00:00
2001-04-20 22:29:53 +00:00
return result ;
2001-04-12 00:08:26 +00:00
}
2003-02-06 23:29:56 +00:00
/**
 * Reads the code unit found `offset` units away from the current position.
 * Handles both the pointer based and the character iterator based case:
 *  o  when source->pos is set, the unit is read straight from memory;
 *  o  otherwise, when an iterator is present, the iterator is moved to the
 *     requested unit, read, and moved back by the same distance;
 *  o  when neither is available, U_SENTINEL is returned (converted to UChar).
 * No error checking - caller beware!
 * @param source collation iterator data
 * @param offset distance from the current position, may be negative
 * @return the code unit at that offset
 */
inline static
UChar peekCharacter(collIterate *source, int32_t offset)
{
    if (source->pos != NULL) {
        return source->pos[offset];
    }

    if (source->iterator == NULL) {
        return U_SENTINEL;
    }

    if (offset == 0) {
        return (UChar)source->iterator->current(source->iterator);
    }

    /* next() itself advances by one, hence the extra -1 when going back */
    source->iterator->move(source->iterator, offset, UITER_CURRENT);
    UChar peeked = (UChar)source->iterator->next(source->iterator);
    source->iterator->move(source->iterator, -offset - 1, UITER_CURRENT);
    return peeked;
}
2001-05-17 01:06:25 +00:00
/**
* Determines if we are at the start of the data string in the backwards
* collation iterator
* @ param data collation iterator
* @ return TRUE if we are at the start
*/
2001-10-22 05:30:22 +00:00
static
2001-05-17 01:06:25 +00:00
inline UBool isAtStartPrevIterate ( collIterate * data ) {
2003-02-08 02:16:54 +00:00
if ( data - > pos = = NULL & & data - > iterator ! = NULL ) {
return ! data - > iterator - > hasPrevious ( data - > iterator ) ;
}
//return (collIter_bos(data)) ||
return ( data - > pos = = data - > string ) | |
2001-05-17 01:06:25 +00:00
( ( data - > flags & UCOL_ITER_INNORMBUF ) & &
* ( data - > pos - 1 ) = = 0 & & data - > fcdPosition = = NULL ) ;
}
2001-04-12 00:08:26 +00:00
/**
 * Inline function that gets the previous collation element (CE) for
 * backwards iteration.
 * It first checks the expansion (CE) buffer. If that buffer is not empty,
 * i.e. data->CEpos is beyond data->CEs, the return pointer is decremented and
 * the collation element found there is returned.
 * Otherwise the previous code unit is fetched (from the length-delimited
 * string, from the character iterator, or from the normalization side buffer),
 * an incremental FCD check and normalization is done where needed, and the CE
 * is looked up: latinOneMapping for units up to 0xFF, the collator's trie
 * otherwise, with the UCA trie as fallback. Special CEs are resolved through
 * ucol_prv_getSpecialPrevCE.
 * @param coll collator data
 * @param data collation iterator struct
 * @param status error status
 * @return the previous CE, or UCOL_NO_MORE_CES once the start of the text is
 *         reached
 */
2001-10-22 05:30:22 +00:00
static
2001-04-18 19:31:05 +00:00
inline uint32_t ucol_IGetPrevCE ( const UCollator * coll , collIterate * data ,
2001-04-23 01:53:49 +00:00
UErrorCode * status )
2001-04-12 00:08:26 +00:00
{
uint32_t result = UCOL_NULLORDER ;
2001-04-23 01:53:49 +00:00
/* CEs are still buffered: hand out the one in front of toReturn */
if ( data - > CEpos > data - > CEs ) {
data - > toReturn - - ;
result = * ( data - > toReturn ) ;
/* buffer exhausted: reset CEpos too, so the test above fails next time */
if ( data - > CEs = = data - > toReturn ) {
2001-09-07 21:56:18 +00:00
data - > CEpos = data - > toReturn ;
2001-04-23 01:53:49 +00:00
}
}
else {
2002-12-04 00:28:06 +00:00
UChar ch = 0 ;
2001-04-23 01:53:49 +00:00
/*
Loop handles the case when incremental normalization switches to or from
the side buffer / original string, and we need to start again to get the
next character.
*/
2001-05-02 23:36:22 +00:00
for ( ; ; ) {
2001-06-20 18:14:51 +00:00
if ( data - > flags & UCOL_ITER_HASLEN ) {
2001-04-23 01:53:49 +00:00
/*
Normal path for strings when the length is specified.
Not in the side buffer, because that one is always null terminated.
*/
if ( data - > pos < = data - > string ) {
/* End of the main source string */
return UCOL_NO_MORE_CES ;
}
2001-06-20 18:14:51 +00:00
data - > pos - - ;
ch = * data - > pos ;
2001-04-12 00:08:26 +00:00
}
2003-02-06 23:29:56 +00:00
// we are using an iterator to go back. Pray for us!
else if ( data - > flags & UCOL_USE_ITERATOR ) {
UChar32 iterCh = data - > iterator - > previous ( data - > iterator ) ;
if ( iterCh = = U_SENTINEL ) {
return UCOL_NO_MORE_CES ;
} else {
ch = ( UChar ) iterCh ;
}
}
2001-04-12 00:08:26 +00:00
else {
2001-06-20 18:14:51 +00:00
data - > pos - - ;
ch = * data - > pos ;
2001-04-12 00:08:26 +00:00
/* we are in the side buffer. */
2001-06-20 18:14:51 +00:00
if ( ch = = 0 ) {
2001-04-23 01:53:49 +00:00
/*
At the start of the normalize side buffer.
Go back to the string.
fcdPosition points to the last accessed character,
hence we have to increment it by one here.
*/
2001-04-12 18:25:07 +00:00
if ( data - > fcdPosition = = NULL ) {
data - > pos = data - > string ;
return UCOL_NO_MORE_CES ;
}
else {
data - > pos = data - > fcdPosition + 1 ;
}
2001-04-12 00:08:26 +00:00
data - > flags = data - > origFlags ;
continue ;
}
}
2001-10-05 02:07:51 +00:00
/*
Remember whether the unit just read is Hiragana.
NOTE(review): ucol_IGetNextCE tests a narrower range for the same flag
(0x3040..0x3094 plus 0x309d and 0x309e), while this test accepts all of
0x3040..0x309f - confirm whether the difference is intended.
*/
if ( data - > flags & UCOL_HIRAGANA_Q ) {
if ( ch > = 0x3040 & & ch < = 0x309f ) {
data - > flags | = UCOL_WAS_HIRAGANA ;
} else {
data - > flags & = ~ UCOL_WAS_HIRAGANA ;
}
}
2001-06-20 18:14:51 +00:00
2001-04-23 01:53:49 +00:00
/*
 * Got a character; determine whether there is FCD and/or normalization
 * work to do. The character can be processed right away when
 *   o it is below ZERO_CC_LIMIT_,
 *   o normalization is off (UCOL_ITER_NORM is not set; note that this is
 *     always so when pos is in the writable buffer),
 *   o fcdPosition is set and not beyond the current position, or
 *   o the current character is at the start of the string.
 */
2001-06-28 20:42:56 +00:00
if ( ch < ZERO_CC_LIMIT_ | |
2003-02-06 23:29:56 +00:00
// this should propel us out of the loop in the iterator case
2001-06-28 20:42:56 +00:00
( data - > flags & UCOL_ITER_NORM ) = = 0 | |
2001-06-20 18:14:51 +00:00
( data - > fcdPosition ! = NULL & & data - > fcdPosition < = data - > pos )
| | data - > string = = data - > pos ) {
2001-04-12 00:08:26 +00:00
break ;
}
2001-04-23 01:53:49 +00:00
2001-04-12 00:08:26 +00:00
if ( ch < NFC_ZERO_CC_BLOCK_LIMIT_ ) {
/* fast test against the neighbouring (previous) character */
if ( data - > pos = = data - > string ) {
/* First char of string is always OK for FCD check */
break ;
}
2001-04-23 01:53:49 +00:00
2001-04-12 00:08:26 +00:00
/* Not first char of string, do the FCD fast test */
if ( * ( data - > pos - 1 ) < NFC_ZERO_CC_BLOCK_LIMIT_ ) {
break ;
}
}
2001-04-23 01:53:49 +00:00
2001-04-12 00:08:26 +00:00
/* Need a more complete FCD check and possible normalization. */
2001-04-20 22:29:53 +00:00
if ( collPrevIterFCD ( data ) ) {
collPrevIterNormalize ( data ) ;
}
2001-04-12 00:08:26 +00:00
if ( ( data - > flags & UCOL_ITER_INNORMBUF ) = = 0 ) {
/* No normalization. Go ahead and process the char. */
break ;
}
2001-04-23 01:53:49 +00:00
/*
Some normalization happened.
Next loop picks up a char from the normalization buffer.
*/
}
2001-04-23 01:53:49 +00:00
2001-06-06 23:26:50 +00:00
/* attempt to handle contractions, after removal of the backwards
contraction
*/
2001-06-28 20:42:56 +00:00
if ( ucol_contractionEndCP ( ch , coll ) & & ! isAtStartPrevIterate ( data ) ) {
2001-10-20 01:09:31 +00:00
result = ucol_prv_getSpecialPrevCE ( coll , ch , UCOL_CONTRACTION , data , status ) ;
2001-04-23 01:53:49 +00:00
}
else {
2001-05-17 01:06:25 +00:00
/* units up to 0xFF are looked up in the latinOneMapping array */
if ( ch < = 0xFF ) {
result = coll - > latinOneMapping [ ch ] ;
2001-06-20 18:14:51 +00:00
if ( result > UCOL_NOT_FOUND ) {
2001-10-20 01:09:31 +00:00
result = ucol_prv_getSpecialPrevCE ( coll , ch , result , data , status ) ;
2001-06-20 18:14:51 +00:00
}
2001-04-23 01:53:49 +00:00
}
else {
2003-02-06 23:29:56 +00:00
// TODO: fix me for THAI - I reference *(data->pos-1)
2001-05-17 01:06:25 +00:00
if ( ( data - > flags & UCOL_ITER_INNORMBUF ) = = 0 & &
2002-07-09 23:57:45 +00:00
/*UCOL_ISTHAIBASECONSONANT(ch) &&*/ // This is from the old specs - we now rearrange unconditionally
data - > pos > data - > string & &
2003-02-06 23:29:56 +00:00
UCOL_ISTHAIPREVOWEL ( peekCharacter ( data , - 1 ) ) )
//UCOL_ISTHAIPREVOWEL(*(data->pos -1)))
2001-05-17 01:06:25 +00:00
{
result = UCOL_THAI ;
}
else {
2001-12-19 07:00:45 +00:00
/*result = ucmpe32_get(coll->mapping, ch);*/
result = UTRIE_GET32_FROM_LEAD ( coll - > mapping , ch ) ;
2001-05-17 01:06:25 +00:00
}
2001-06-20 18:14:51 +00:00
if ( result > UCOL_NOT_FOUND ) {
2001-10-20 01:09:31 +00:00
result = ucol_prv_getSpecialPrevCE ( coll , ch , result , data , status ) ;
2001-06-20 18:14:51 +00:00
}
/* nothing found with this collator: fall back to the UCA data */
if ( result = = UCOL_NOT_FOUND ) {
2001-09-20 20:16:39 +00:00
if ( ! isAtStartPrevIterate ( data ) & &
ucol_contractionEndCP ( ch , data - > coll ) ) {
result = UCOL_CONTRACTION ;
}
else {
2001-12-19 07:00:45 +00:00
/*result = ucmpe32_get(UCA->mapping, ch);*/
result = UTRIE_GET32_FROM_LEAD ( UCA - > mapping , ch ) ;
2001-09-20 20:16:39 +00:00
}
if ( result > UCOL_NOT_FOUND ) {
2001-10-20 01:09:31 +00:00
result = ucol_prv_getSpecialPrevCE ( UCA , ch , result , data , status ) ;
2001-09-20 20:16:39 +00:00
}
2001-06-20 18:14:51 +00:00
}
2001-04-18 19:31:05 +00:00
}
2001-04-23 01:53:49 +00:00
}
2001-04-12 00:08:26 +00:00
}
return result ;
}
2001-04-06 23:37:48 +00:00
2001-04-18 19:31:05 +00:00
/* ucol_getPrevCE, out-of-line version for use from other files. */
2001-11-21 01:08:55 +00:00
U_CAPI uint32_t U_EXPORT2
ucol_getPrevCE ( const UCollator * coll , collIterate * data ,
2001-04-18 19:31:05 +00:00
UErrorCode * status ) {
return ucol_IGetPrevCE ( coll , data , status ) ;
}
2001-03-09 00:50:37 +00:00
/* this should be connected to special Jamo handling */
2001-11-21 01:08:55 +00:00
U_CAPI uint32_t U_EXPORT2
ucol_getFirstCE ( const UCollator * coll , UChar u , UErrorCode * status ) {
2001-03-09 00:50:37 +00:00
collIterate colIt ;
uint32_t order ;
2001-04-06 23:37:48 +00:00
IInit_collIterate ( coll , & u , 1 , & colIt ) ;
2001-04-18 19:31:05 +00:00
order = ucol_IGetNextCE ( coll , & colIt , status ) ;
2001-04-06 23:37:48 +00:00
/*UCOL_GETNEXTCE(order, coll, colIt, status);*/
2001-03-09 00:50:37 +00:00
return order ;
}
2001-04-24 03:18:54 +00:00
/**
2001-05-11 01:13:08 +00:00
* Inserts the argument character into the end of the buffer pushing back the
2001-04-24 03:18:54 +00:00
* null terminator .
2001-04-28 01:22:25 +00:00
* @ param data collIterate struct data
* @ param pNull pointer to the null termination
2001-04-24 03:18:54 +00:00
* @ param ch character to be appended
2001-04-25 23:28:55 +00:00
* @ return the position of the new addition
2001-04-24 03:18:54 +00:00
*/
2001-10-22 05:30:22 +00:00
static
2001-05-11 01:13:08 +00:00
inline UChar * insertBufferEnd ( collIterate * data , UChar * pNull , UChar ch )
2001-04-24 03:18:54 +00:00
{
uint32_t size = data - > writableBufSize ;
UChar * newbuffer ;
const uint32_t incsize = 5 ;
2001-04-28 01:22:25 +00:00
if ( ( data - > writableBuffer + size ) > ( pNull + 1 ) ) {
* pNull = ch ;
* ( pNull + 1 ) = 0 ;
return pNull ;
2001-04-24 03:18:54 +00:00
}
2001-05-11 01:13:08 +00:00
/*
2001-04-24 03:18:54 +00:00
buffer will always be null terminated at the end .
giving extra space since it is likely that more characters will be added .
*/
size + = incsize ;
newbuffer = ( UChar * ) uprv_malloc ( sizeof ( UChar ) * size ) ;
2002-07-20 06:00:04 +00:00
if ( newbuffer ! = NULL ) { // something wrong, but no status
uprv_memcpy ( newbuffer , data - > writableBuffer ,
data - > writableBufSize * sizeof ( UChar ) ) ;
2001-04-24 03:18:54 +00:00
2002-07-20 06:00:04 +00:00
freeHeapWritableBuffer ( data ) ;
data - > writableBufSize = size ;
data - > writableBuffer = newbuffer ;
2001-04-25 23:28:55 +00:00
2002-07-20 06:00:04 +00:00
newbuffer = newbuffer + data - > writableBufSize ;
* newbuffer = ch ;
* ( newbuffer + 1 ) = 0 ;
}
2001-04-25 23:28:55 +00:00
return newbuffer ;
2001-04-24 03:18:54 +00:00
}
2001-05-10 17:49:24 +00:00
/**
2001-05-11 01:13:08 +00:00
* Inserts the argument string into the end of the buffer pushing back the
2001-05-10 17:49:24 +00:00
* null terminator .
* @ param data collIterate struct data
* @ param pNull pointer to the null termination
* @ param string to be appended
* @ param length of the string to be appended
* @ return the position of the new addition
*/
2001-10-22 05:30:22 +00:00
static
2001-05-10 17:49:24 +00:00
inline UChar * insertBufferEnd ( collIterate * data , UChar * pNull , UChar * str ,
2001-06-03 23:40:41 +00:00
int32_t length )
2001-05-10 17:49:24 +00:00
{
uint32_t size = pNull - data - > writableBuffer ;
UChar * newbuffer ;
2001-05-11 01:13:08 +00:00
2001-05-10 17:49:24 +00:00
if ( data - > writableBuffer + data - > writableBufSize > pNull + length + 1 ) {
uprv_memcpy ( pNull , str , length * sizeof ( UChar ) ) ;
* ( pNull + length ) = 0 ;
return pNull ;
}
2001-05-11 01:13:08 +00:00
/*
2001-05-10 17:49:24 +00:00
buffer will always be null terminated at the end .
giving extra space since it is likely that more characters will be added .
*/
newbuffer = ( UChar * ) uprv_malloc ( sizeof ( UChar ) * ( size + length + 1 ) ) ;
2002-07-20 06:00:04 +00:00
if ( newbuffer ! = NULL ) {
uprv_memcpy ( newbuffer , data - > writableBuffer , size * sizeof ( UChar ) ) ;
uprv_memcpy ( newbuffer + size , str , length * sizeof ( UChar ) ) ;
2001-05-10 17:49:24 +00:00
2002-07-20 06:00:04 +00:00
freeHeapWritableBuffer ( data ) ;
data - > writableBufSize = size + length + 1 ;
data - > writableBuffer = newbuffer ;
}
2001-05-10 17:49:24 +00:00
return newbuffer ;
}
2001-04-24 03:18:54 +00:00
/**
* Special normalization function for contraction in the forwards iterator .
* This normalization sequence will place the current character at source - > pos
* and its following normalized sequence into the buffer .
2001-05-11 01:13:08 +00:00
* The fcd position , pos will be changed .
2001-04-24 03:18:54 +00:00
* pos will now point to positions in the buffer .
* Flags will be changed accordingly .
* @ param data collation iterator data
*/
2001-10-22 05:30:22 +00:00
static
2001-04-24 03:18:54 +00:00
inline void normalizeNextContraction ( collIterate * data )
2001-05-11 01:13:08 +00:00
{
2001-04-24 03:18:54 +00:00
UChar * buffer = data - > writableBuffer ;
uint32_t buffersize = data - > writableBufSize ;
uint32_t strsize ;
UErrorCode status = U_ZERO_ERROR ;
2001-04-25 23:28:55 +00:00
/* because the pointer points to the next character */
2001-05-11 01:13:08 +00:00
UChar * pStart = data - > pos - 1 ;
2001-04-24 03:18:54 +00:00
UChar * pEnd ;
uint32_t normLen ;
UChar * pStartNorm ;
2001-04-25 23:28:55 +00:00
if ( ( data - > flags & UCOL_ITER_INNORMBUF ) = = 0 ) {
2001-04-24 03:18:54 +00:00
* data - > writableBuffer = * ( pStart - 1 ) ;
strsize = 1 ;
}
else {
strsize = u_strlen ( data - > writableBuffer ) ;
}
2001-05-11 01:13:08 +00:00
pEnd = data - > fcdPosition ;
normLen = unorm_normalize ( pStart , pEnd - pStart , UNORM_NFD , 0 , buffer , 0 ,
2001-04-24 03:18:54 +00:00
& status ) ;
if ( buffersize < = normLen + strsize ) {
uint32_t size = strsize + normLen + 1 ;
UChar * temp = ( UChar * ) uprv_malloc ( size * sizeof ( UChar ) ) ;
2002-07-20 06:00:04 +00:00
if ( temp ! = NULL ) {
uprv_memcpy ( temp , buffer , sizeof ( UChar ) * strsize ) ;
freeHeapWritableBuffer ( data ) ;
data - > writableBuffer = temp ;
data - > writableBufSize = size ;
data - > flags | = UCOL_ITER_ALLOCATED ;
}
2001-04-24 03:18:54 +00:00
}
status = U_ZERO_ERROR ;
pStartNorm = buffer + strsize ;
/* null-termination will be added here */
2001-05-11 01:13:08 +00:00
unorm_normalize ( pStart , pEnd - pStart , UNORM_NFD , 0 , pStartNorm ,
2001-04-25 23:28:55 +00:00
normLen + 1 , & status ) ;
2001-05-11 01:13:08 +00:00
2001-04-24 03:18:54 +00:00
data - > pos = data - > writableBuffer + strsize ;
data - > origFlags = data - > flags ;
data - > flags | = UCOL_ITER_INNORMBUF ;
data - > flags & = ~ ( UCOL_ITER_NORM | UCOL_ITER_HASLEN ) ;
}
/**
 * Contraction character management function that returns the next character
 * for the forwards iterator.
 * Does nothing special if the next character is in the buffer and is not the
 * first character in it.
 * Else it checks the next character in the data string to see if it is
 * normalizable. If it is not, the character is simply copied into the buffer,
 * else the whole normalized substring is copied into the buffer, including
 * the current character.
 * When neither normalization is on nor the normalization buffer is in use,
 * the next unit is returned directly from the character iterator or from pos.
 * @param data collation element iterator data
 * @return next character
 */
2001-10-22 05:30:22 +00:00
static
2001-05-11 01:13:08 +00:00
inline UChar getNextNormalizedChar ( collIterate * data )
2001-04-24 03:18:54 +00:00
{
2001-04-25 23:28:55 +00:00
UChar nextch ;
2001-04-24 03:18:54 +00:00
UChar ch ;
2003-01-20 07:43:32 +00:00
// Here we need to add the iterator code. One problem is the way
// end of string is handled. If we just return next char, it could
// be the sentinel. Most of the cases already check for this, but we
// need to be sure.
2001-06-06 23:26:50 +00:00
if ( ( data - > flags & ( UCOL_ITER_NORM | UCOL_ITER_INNORMBUF ) ) = = 0 ) {
/* if no normalization and not in buffer. */
2003-01-20 07:43:32 +00:00
if ( data - > flags & UCOL_USE_ITERATOR ) {
return ( UChar ) data - > iterator - > next ( data - > iterator ) ;
} else {
2001-06-06 23:26:50 +00:00
return * ( data - > pos + + ) ;
2003-01-20 07:43:32 +00:00
}
}
2003-01-23 01:52:34 +00:00
//if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
//normalizeIterator(data);
//}
2001-06-06 23:26:50 +00:00
2001-05-10 17:49:24 +00:00
/* end of the buffer contents (its terminator), set when we leave the buffer below */
UChar * pEndWritableBuffer = NULL ;
2001-06-06 23:26:50 +00:00
UBool innormbuf = ( UBool ) ( data - > flags & UCOL_ITER_INNORMBUF ) ;
if ( ( innormbuf & & * data - > pos ! = 0 ) | |
2001-05-11 01:13:08 +00:00
( data - > fcdPosition ! = NULL & & ! innormbuf & &
2001-05-10 17:49:24 +00:00
data - > pos < data - > fcdPosition ) ) {
2001-05-11 01:13:08 +00:00
/*
if the next character is in the normalized buffer (or in the part of the
string in front of fcdPosition), no further normalization is required
*/
return * ( data - > pos + + ) ;
}
if ( data - > flags & UCOL_ITER_HASLEN ) {
/* in data string */
/* the last character of the string is returned as is */
if ( data - > pos + 1 = = data - > endp ) {
2001-04-25 23:28:55 +00:00
return * ( data - > pos + + ) ;
2001-04-24 03:18:54 +00:00
}
}
else {
2001-04-26 01:15:34 +00:00
if ( innormbuf ) {
2003-01-20 07:43:32 +00:00
// inside the normalization buffer, but at the end
// (since we encountered zero). This means, in the
// case we're using char iterator, that we need to
// do another round of normalization.
2003-01-23 01:52:34 +00:00
//if(data->origFlags & UCOL_USE_ITERATOR) {
2003-01-20 07:43:32 +00:00
// we need to restore original flags,
// otherwise, we'll lose them
2003-01-23 01:52:34 +00:00
//data->flags = data->origFlags;
//normalizeIterator(data);
//return *(data->pos++);
//} else {
2001-05-11 01:13:08 +00:00
/*
in writable buffer, at this point fcdPosition can not be
pointing to the end of the data string. see contracting tag.
*/
2003-02-20 08:18:37 +00:00
if ( data - > fcdPosition ) {
2001-04-25 23:28:55 +00:00
if ( * ( data - > fcdPosition + 1 ) = = 0 | |
data - > fcdPosition + 1 = = data - > endp ) {
/* at the end of the string, dump it into the normalizer */
2001-05-11 01:13:08 +00:00
data - > pos = insertBufferEnd ( data , data - > pos ,
2001-04-28 01:22:25 +00:00
* ( data - > fcdPosition ) ) + 1 ;
2001-04-25 23:28:55 +00:00
return * ( data - > fcdPosition + + ) ;
2001-04-24 03:18:54 +00:00
}
2001-05-10 17:49:24 +00:00
/* remember the end of the buffer and continue reading from the string */
pEndWritableBuffer = data - > pos ;
2001-04-24 03:18:54 +00:00
data - > pos = data - > fcdPosition ;
2003-02-20 08:18:37 +00:00
} else if ( data - > origFlags & UCOL_USE_ITERATOR ) {
// if we are here, we're using a normalizing iterator.
// we should just continue further.
data - > flags = data - > origFlags ;
data - > pos = NULL ;
return ( UChar ) data - > iterator - > next ( data - > iterator ) ;
}
2003-01-23 01:52:34 +00:00
//}
2001-04-24 03:18:54 +00:00
}
else {
/* null terminated string: the last character is returned as is */
if ( * ( data - > pos + 1 ) = = 0 ) {
return * ( data - > pos + + ) ;
}
}
}
2001-04-25 23:28:55 +00:00
ch = * data - > pos + + ;
nextch = * data - > pos ;
2001-05-11 01:13:08 +00:00
/*
 * A full FCD check is only needed when no earlier check covers this position
 * (fcdPosition is NULL or in front of pos) and the current or the next
 * character is at or above NFC_ZERO_CC_BLOCK_LIMIT_.
 */
2001-05-11 01:13:08 +00:00
if ( ( data - > fcdPosition = = NULL | | data - > fcdPosition < data - > pos ) & &
2001-04-24 03:18:54 +00:00
( nextch > = NFC_ZERO_CC_BLOCK_LIMIT_ | |
2001-04-25 23:28:55 +00:00
ch > = NFC_ZERO_CC_BLOCK_LIMIT_ ) ) {
2001-05-11 01:13:08 +00:00
/*
Need a more complete FCD check and possible normalization.
The normalized substring will be appended to the buffer.
*/
2001-04-25 23:28:55 +00:00
if ( collIterFCD ( data ) ) {
2001-04-24 03:18:54 +00:00
normalizeNextContraction ( data ) ;
2001-04-25 23:28:55 +00:00
return * ( data - > pos + + ) ;
}
2001-05-10 17:49:24 +00:00
else if ( innormbuf ) {
2001-05-11 01:13:08 +00:00
/* fcdPosition shifted even when there's no normalization; if we
don't input the rest into the buffer, we'll get the wrong position when
we reach the end of the writableBuffer */
2001-06-03 23:40:41 +00:00
int32_t length = data - > fcdPosition - data - > pos + 1 ;
2001-05-11 01:13:08 +00:00
data - > pos = insertBufferEnd ( data , pEndWritableBuffer ,
2001-05-10 17:49:24 +00:00
data - > pos - 1 , length ) ;
return * ( data - > pos + + ) ;
}
2001-04-24 03:18:54 +00:00
}
2001-05-11 01:13:08 +00:00
2001-04-26 01:15:34 +00:00
if ( innormbuf ) {
2001-05-11 01:13:08 +00:00
/*
no normalization is to be done, hence only one character will be
appended to the buffer.
*/
2001-05-10 17:49:24 +00:00
data - > pos = insertBufferEnd ( data , pEndWritableBuffer , ch ) + 1 ;
2001-04-24 03:18:54 +00:00
}
2001-05-11 01:13:08 +00:00
2001-04-24 03:18:54 +00:00
/* points back to the pos in string */
return ch ;
}
2003-01-20 07:43:32 +00:00
static
2003-02-06 23:29:56 +00:00
inline void goBackOne ( collIterate * data ) {
2003-01-20 07:43:32 +00:00
# if 0
// somehow, it looks like we need to keep iterator synced up
// at all times, as above.
if ( data - > pos ) {
data - > pos - - ;
}
if ( data - > iterator ) {
data - > iterator - > previous ( data - > iterator ) ;
}
# endif
if ( data - > iterator & & ( data - > flags & UCOL_USE_ITERATOR ) ) {
data - > iterator - > previous ( data - > iterator ) ;
}
if ( data - > pos ) {
data - > pos - - ;
}
}
2001-06-06 23:26:50 +00:00
/**
* Function to copy the buffer into writableBuffer and sets the fcd position to
2001-05-14 18:57:54 +00:00
* the correct position
* @ param source data string source
* @ param buffer character buffer
* @ param tempdb current position in buffer that has been used up
*/
2001-10-22 05:30:22 +00:00
static
2001-06-06 23:26:50 +00:00
inline void setDiscontiguosAttribute ( collIterate * source , UChar * buffer ,
2001-05-14 18:57:54 +00:00
UChar * tempdb )
{
2001-06-06 23:26:50 +00:00
/* okay confusing part here. to ensure that the skipped characters are
considered later , we need to place it in the appropriate position in the
normalization buffer and reassign the pos pointer . simple case if pos
reside in string , simply copy to normalization buffer and
fcdposition = pos , pos = start of normalization buffer . if pos in
normalization buffer , we ' ll insert the copy infront of pos and point pos
to the start of the normalization buffer . why am i doing these copies ?
2001-10-20 01:09:31 +00:00
well , so that the whole chunk of codes in the getNextCE , ucol_prv_getSpecialCE does
2001-05-14 18:57:54 +00:00
not require any changes , which be really painful . */
uint32_t length = u_strlen ( buffer ) ; ;
if ( source - > flags & UCOL_ITER_INNORMBUF ) {
u_strcpy ( tempdb , source - > pos ) ;
}
else {
source - > fcdPosition = source - > pos ;
source - > origFlags = source - > flags ;
source - > flags | = UCOL_ITER_INNORMBUF ;
2003-02-14 07:46:20 +00:00
source - > flags & = ~ ( UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR ) ;
2001-05-14 18:57:54 +00:00
}
if ( length > = source - > writableBufSize ) {
freeHeapWritableBuffer ( source ) ;
source - > writableBuffer =
( UChar * ) uprv_malloc ( ( length + 1 ) * sizeof ( UChar ) ) ;
2002-07-20 06:00:04 +00:00
if ( source - > writableBuffer = = NULL ) {
return ;
}
2001-05-14 18:57:54 +00:00
source - > writableBufSize = length ;
}
u_strcpy ( source - > writableBuffer , buffer ) ;
source - > pos = source - > writableBuffer ;
}
2001-05-10 17:49:24 +00:00
/**
* Tries to complete a discontiguous contraction starting from the current
* source position .
*
* Characters are read with getNextNormalizedChar ( ) . Every character that does
* not match the contraction table currently being searched is copied into a
* local buffer and skipped . Scanning stops at the end of the input , at a null
* character , or at a character whose combining class is 0 .
*
* On a successful match the skipped characters are handed to
* setDiscontiguosAttribute ( ) so that they are iterated over later , and the
* matching CE is returned . On failure the iterator state and flags saved on
* entry are restored , the iterator is moved back by one , and the CE stored for
* the start of the contraction table is returned .
*
* @ param coll current collator used ; its contractionCEs , contractionIndex and
*        image tables are read
* @ param source data string source ; its position and flags may be changed
* @ param constart pointer to the first entry of the contraction table to search
* @ return the collation element read from coll->contractionCEs for the match ,
*         or the one stored for constart when no match is found
*/
2001-10-20 01:09:31 +00:00
static
2001-06-06 23:26:50 +00:00
uint32_t getDiscontiguous ( const UCollator * coll , collIterate * source ,
2001-05-10 17:49:24 +00:00
const UChar * constart )
{
/* source->pos currently points to the second combining character after
the start character */
/* temppos: position recorded when a usable multi-contraction prefix is seen;
   source->pos is set to temppos - 1 if that prefix ends up being used */
UChar * temppos = source - > pos ;
2002-07-27 05:16:44 +00:00
/* buffer: local copy of the characters examined but not consumed by a match */
UChar buffer [ 4 * UCOL_MAX_BUFFER ] ;
2001-05-10 17:49:24 +00:00
/* tempdb: write cursor into buffer */
UChar * tempdb = buffer ;
/* tempconstart: contraction table currently being searched (changes for
   multi-contractions) */
const UChar * tempconstart = constart ;
/* tempflags: flags on entry, restored on the failure path */
uint8_t tempflags = source - > flags ;
2001-05-14 18:57:54 +00:00
UBool multicontraction = FALSE ;
/* tempbufferpos: where buffer is terminated if the multi-contraction prefix is used */
UChar * tempbufferpos = 0 ;
2003-02-14 07:46:20 +00:00
/* discState: iterator state on entry, reloaded on the failure path */
collIterateState discState ;
backupState ( source , & discState ) ;
2001-05-11 01:13:08 +00:00
2003-02-06 23:29:56 +00:00
//*tempdb = *(source->pos - 1);
/* seed the buffer with the character just before the current position */
* tempdb = peekCharacter ( source , - 1 ) ;
2001-05-10 17:49:24 +00:00
tempdb + + ;
while ( TRUE ) {
UChar * UCharOffset ;
UChar schar ,
tchar ;
uint32_t result ;
2001-05-11 01:13:08 +00:00
/* Stop condition: end of a length-delimited string, a null character that
   really ends the input, or a character with combining class 0. */
if ( ( ( source - > flags & UCOL_ITER_HASLEN ) & & source - > pos > = source - > endp )
2003-02-06 23:29:56 +00:00
| | ( peekCharacter ( source , 0 ) = = 0 & &
//|| (*source->pos == 0 &&
2001-05-10 17:49:24 +00:00
( ( source - > flags & UCOL_ITER_INNORMBUF ) = = 0 | |
2001-05-11 01:13:08 +00:00
source - > fcdPosition = = NULL | |
2001-05-10 17:49:24 +00:00
source - > fcdPosition = = source - > endp | |
* ( source - > fcdPosition ) = = 0 | |
u_getCombiningClass ( * ( source - > fcdPosition ) ) = = 0 ) ) | |
2001-05-11 01:13:08 +00:00
/* end of string in null terminated string or stopped by a
null character , note fcd does not always point to a base
2001-05-10 17:49:24 +00:00
character after the discontiguos change */
2003-02-06 23:29:56 +00:00
u_getCombiningClass ( peekCharacter ( source , 0 ) ) = = 0 ) {
//u_getCombiningClass(*(source->pos)) == 0) {
2001-05-12 00:12:55 +00:00
//constart = (UChar *)coll->image + getContractOffset(CE);
2001-05-14 18:57:54 +00:00
/* A shorter multi-contraction was matched earlier: cut the buffer at the
   recorded point, reposition the source and return that contraction's CE. */
if ( multicontraction ) {
* tempbufferpos = 0 ;
source - > pos = temppos - 1 ;
setDiscontiguosAttribute ( source , buffer , tempdb ) ;
2001-06-06 23:26:50 +00:00
return * ( coll - > contractionCEs +
2001-05-14 18:57:54 +00:00
( tempconstart - coll - > contractionIndex ) ) ;
}
2001-05-12 00:12:55 +00:00
constart = tempconstart ;
2001-05-10 17:49:24 +00:00
break ;
}
UCharOffset = ( UChar * ) ( tempconstart + 1 ) ; /* skip the backward offset*/
schar = getNextNormalizedChar ( source ) ;
2001-05-11 01:13:08 +00:00
/* advance past table entries that are smaller than the source character */
while ( schar > ( tchar = * UCharOffset ) ) {
2001-05-10 17:49:24 +00:00
UCharOffset + + ;
}
2001-05-11 01:13:08 +00:00
if ( schar ! = tchar ) {
/* not the correct codepoint. we stuff the current codepoint into
2001-05-10 17:49:24 +00:00
the discontiguos buffer and try the next character */
* tempdb = schar ;
tempdb + + ;
continue ;
}
else {
2001-05-11 01:13:08 +00:00
/* The character is in the table, but it has the same combining class as
   the character two positions back; it is treated as skipped as well
   (presumably because it is blocked by that character - confirm). */
if ( u_getCombiningClass ( schar ) = =
2003-02-06 23:29:56 +00:00
u_getCombiningClass ( peekCharacter ( source , - 2 ) ) ) {
//u_getCombiningClass(*(source->pos - 2))) {
2001-05-10 17:49:24 +00:00
* tempdb = schar ;
tempdb + + ;
continue ;
}
2001-05-11 01:13:08 +00:00
result = * ( coll - > contractionCEs +
2001-05-10 17:49:24 +00:00
( UCharOffset - coll - > contractionIndex ) ) ;
}
/* terminate the buffer; the matched character itself is not stored */
* tempdb = 0 ;
if ( result = = UCOL_NOT_FOUND ) {
break ;
2001-05-11 01:13:08 +00:00
} else if ( isContraction ( result ) ) {
2001-05-10 17:49:24 +00:00
/* this is a multi-contraction*/
tempconstart = ( UChar * ) coll - > image + getContractOffset ( result ) ;
2001-05-14 18:57:54 +00:00
/* remember this point only if the CE stored for constart is usable */
if ( * ( coll - > contractionCEs + ( constart - coll - > contractionIndex ) )
! = UCOL_NOT_FOUND ) {
multicontraction = TRUE ;
temppos = source - > pos + 1 ;
tempbufferpos = buffer + u_strlen ( buffer ) ;
2001-05-10 17:49:24 +00:00
}
2001-05-14 18:57:54 +00:00
} else {
/* complete match: queue the skipped characters and return the CE */
setDiscontiguosAttribute ( source , buffer , tempdb ) ;
2001-05-10 17:49:24 +00:00
return result ;
}
}
2001-05-11 01:13:08 +00:00
/* no problems simply reverting just like that,
if we are in string before getting into this function , points back to
2001-05-10 17:49:24 +00:00
string hence no problem .
2001-05-11 01:13:08 +00:00
if we are in normalization buffer before getting into this function ,
since we ' ll never use another normalization within this function , we
know that fcdposition points to a base character . the normalization buffer
2001-05-10 17:49:24 +00:00
never change , hence this revert works . */
2003-02-14 07:46:20 +00:00
loadState ( source , & discState , TRUE ) ;
goBackOne ( source ) ;
//source->pos = temppos - 1;
2001-05-10 17:49:24 +00:00
source - > flags = tempflags ;
return * ( coll - > contractionCEs + ( constart - coll - > contractionIndex ) ) ;
}
2001-10-22 05:30:22 +00:00
static
2002-07-02 22:32:14 +00:00
inline UBool isNonChar ( UChar32 cp ) {
2002-08-01 23:09:41 +00:00
if ( ( cp & 0xFFFE ) = = 0xFFFE | | ( 0xFDD0 < = cp & & cp < = 0xFDEF ) | | ( 0xD800 < = cp & & cp < = 0xDFFF ) ) {
2002-07-02 22:32:14 +00:00
return TRUE ;
2001-09-21 21:22:44 +00:00
}
2002-07-02 22:32:14 +00:00
return FALSE ;
2001-09-20 20:16:39 +00:00
}
2002-06-13 18:34:41 +00:00
/**
 * Produces the implicit collation elements for a code point, using
 * getImplicitPrimary().
 *
 * For code points rejected by isNonChar() nothing is queued and 0 is
 * returned. Otherwise two CEs are produced: the second one is appended to the
 * CE buffer through collationSource->CEpos, the first one is returned.
 *
 * @param cp               code point to get implicit CEs for
 * @param collationSource  iteration state; CEpos is advanced by one when a CE
 *                         is queued
 * @return the first CE, or 0 for code points rejected by isNonChar()
 */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    if (isNonChar(cp)) {
        return 0;
    }

    uint32_t implicitPrimary = getImplicitPrimary(cp);

    /* Second CE: low 16 bits of the implicit primary moved into the upper
       half, with 0xC0 in the low byte. */
    uint32_t secondCE = ((implicitPrimary & 0x0000FFFF) << 16) | 0x000000C0;
    *collationSource->CEpos = secondCE;
    collationSource->CEpos++;

    /* First CE: primary part of the implicit primary plus 0x0505 in the low
       16 bits (presumably the common secondary/tertiary weights - confirm).
       This was 'order'. */
    return (implicitPrimary & UCOL_PRIMARYMASK) | 0x00000505;
}
2001-09-20 20:16:39 +00:00
2001-10-20 01:09:31 +00:00
/* Forward declaration: ucol_prv_getSpecialCE calls this while scanning
   backwards through the source for prefix (SPEC_PROC_TAG) matches. */
static
2001-10-15 00:09:23 +00:00
inline UChar getPrevNormalizedChar ( collIterate * data ) ;
2001-01-16 00:28:40 +00:00
/**
 * Resolves a special CE for the forward iterator. This function handles the
 * special CEs like contractions, expansions, surrogates, Thai.
 *
 * The tag returned by getCETag(CE) selects the handling. Depending on the tag
 * the function may read further characters from source, move source into its
 * writable buffer (Thai reordering, special Jamo), or append additional CEs
 * through source->CEpos.
 *
 * @param coll   collator whose mapping, image, contractionCEs and
 *               contractionIndex tables are consulted
 * @param ch     code unit for which CE was looked up
 * @param CE     the special CE to resolve
 * @param source iteration state; position, flags and CE buffer may change
 * @param status set to U_INTERNAL_PROGRAM_ERROR when the tag is not handled
 * @return the first resulting CE; any further CEs are queued at source->CEpos
 */
2001-09-20 20:16:39 +00:00
/* It is called by getNextCE */
2001-10-20 01:09:31 +00:00
uint32_t ucol_prv_getSpecialCE ( const UCollator * coll , UChar ch , uint32_t CE , collIterate * source , UErrorCode * status ) {
2001-10-15 00:09:23 +00:00
// entryState: iterator state on entry. SPEC_PROC_TAG reloads it to look at the
// characters that precede the current one.
collIterateState entryState ;
backupState ( source , & entryState ) ;
2001-11-08 21:16:38 +00:00
// cp starts as the code unit; SURROGATE_TAG / LEAD_SURROGATE_TAG replace it with
// the supplementary code point when a trail surrogate is consumed.
UChar32 cp = ch ;
2002-07-09 23:57:45 +00:00
2001-10-15 00:09:23 +00:00
//UChar *entryPos = source->pos;
2001-01-15 23:18:06 +00:00
for ( ; ; ) {
2001-06-06 23:26:50 +00:00
// This loop will repeat only in the case of contractions, and only when a contraction
// is found and the first CE resulting from that contraction is itself a special
// (an expansion, for example.) All other special CE types are fully handled the
// first time through, and the loop exits.
2001-01-05 00:47:25 +00:00
const uint32_t * CEOffset = NULL ;
switch ( getCETag ( CE ) ) {
case NOT_FOUND_TAG :
/* This one is not found, and we'll let somebody else bother about it... no more games */
return CE ;
case SURROGATE_TAG :
2001-08-10 20:30:44 +00:00
/* we encountered a leading surrogate. We shall get the CE by using the following code unit */
/* two things can happen here: next code point can be a trailing surrogate - we will use it */
/* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
/* we return 0 (completely ignorable - per UCA specification */
{
UChar trail ;
2001-10-08 19:29:12 +00:00
collIterateState state ;
backupState ( source , & state ) ;
2001-08-10 20:30:44 +00:00
if ( collIter_eos ( source ) | | ! ( UTF16_IS_TRAIL ( ( trail = getNextNormalizedChar ( source ) ) ) ) ) {
2001-11-10 05:08:51 +00:00
// we could have stepped one char forward and it might have turned out that it
// was not a trail surrogate. In that case, we have to backup.
loadState ( source , & state , TRUE ) ;
2001-08-10 20:30:44 +00:00
return 0 ;
} else {
2001-12-19 07:00:45 +00:00
/* CE = ucmpe32_getSurrogate(coll->mapping, CE, trail); */
/* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
CE = UTRIE_GET32_FROM_OFFSET_TRAIL ( coll - > mapping , CE & 0xFFFFFF , trail ) ;
2001-10-08 19:29:12 +00:00
if ( CE = = UCOL_NOT_FOUND ) { // there are tailored surrogates in this block, but not this one.
// We need to backup
loadState ( source , & state , TRUE ) ;
2001-11-08 21:16:38 +00:00
return CE ;
}
// calculate the supplementary code point value, if surrogate was not tailored
cp = ( ( ( ( uint32_t ) ch ) < < 10UL ) + ( trail ) - ( ( ( uint32_t ) 0xd800 < < 10UL ) + 0xdc00 - 0x10000 ) ) ;
2001-08-10 20:30:44 +00:00
}
}
break ;
2001-01-05 00:47:25 +00:00
case THAI_TAG :
2001-01-05 06:36:10 +00:00
/* Thai/Lao reordering */
// No swap is done when we are already in the normalization buffer, when the
// iterator has no next character, or when pos is at the end of the string.
if ( ( ( source - > flags ) & UCOL_ITER_INNORMBUF ) /* Already Swapped || */
2003-02-14 07:46:20 +00:00
| | ( source - > iterator & & ! source - > iterator - > hasNext ( source - > iterator ) )
| | ( source - > pos & & source - > endp = = source - > pos ) /* At end of string. No swap possible || */
2002-07-09 23:57:45 +00:00
/*|| UCOL_ISTHAIBASECONSONANT(*(source->pos)) == 0*/ ) /* next char not Thai base cons.*/ // This is from the old specs - we now rearrange unconditionally
2001-04-06 23:37:48 +00:00
{
// Treat Thai as a length one expansion */
CEOffset = ( uint32_t * ) coll - > image + getExpansionOffset ( CE ) ; /* find the offset to expansion table */
CE = * CEOffset + + ;
2001-01-05 00:47:25 +00:00
}
2001-04-06 23:37:48 +00:00
else
{
// Move the prevowel and the following base Consonant into the normalization buffer
// with their order swapped
2003-02-14 07:46:20 +00:00
source - > writableBuffer [ 0 ] = peekCharacter ( source , 0 ) ;
//source->writableBuffer[0] = *source->pos;
source - > writableBuffer [ 1 ] = peekCharacter ( source , - 1 ) ;
//source->writableBuffer[1] = *(source->pos - 1);
2001-04-06 23:37:48 +00:00
source - > writableBuffer [ 2 ] = 0 ;
2001-04-23 01:53:49 +00:00
2003-02-20 01:06:06 +00:00
if ( source - > pos ) {
source - > fcdPosition = source - > pos + 1 ; // Indicate where to continue in main input string
// after exhausting the writableBuffer
} else if ( source - > iterator ) {
source - > iterator - > next ( source - > iterator ) ;
}
2001-03-12 20:05:46 +00:00
// Continue iteration from the writable buffer; the previous flags are kept in origFlags.
source - > pos = source - > writableBuffer ;
2001-04-06 23:37:48 +00:00
source - > origFlags = source - > flags ;
source - > flags | = UCOL_ITER_INNORMBUF ;
2003-02-14 07:46:20 +00:00
source - > flags & = ~ ( UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR ) ;
2001-04-23 01:53:49 +00:00
2001-01-05 06:36:10 +00:00
CE = UCOL_IGNORABLE ;
}
2001-01-05 00:47:25 +00:00
break ;
2001-09-27 23:18:14 +00:00
case SPEC_PROC_TAG :
{
// Special processing is getting a CE that is preceded by a certain prefix
// Currently this is only needed for optimizing Japanese length and iteration marks.
// When we encounter a special processing tag, we go backwards and try to see if
// we have a match.
// Contraction tables are used - so the whole process is not unlike contraction.
// prefix data is stored backwards in the table.
const UChar * UCharOffset ;
2001-10-08 02:31:50 +00:00
UChar schar , tchar ;
2001-10-20 01:09:31 +00:00
//UChar32 normOutput = 0;
2001-10-15 00:09:23 +00:00
// prefixState: the current (forward) position, reloaded once the prefix is found.
collIterateState prefixState ;
backupState ( source , & prefixState ) ;
loadState ( source , & entryState , TRUE ) ;
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2003-01-20 07:43:32 +00:00
//source->pos--;
2001-10-15 00:09:23 +00:00
//UChar *sourcePointer = --entryPos; //source->pos; // We want to look at the point where we entered - actually one
// before that...
2001-09-27 23:18:14 +00:00
for ( ; ; ) {
// This loop will run once per source string character, for as long as we
// are matching a potential contraction sequence
2001-10-15 00:09:23 +00:00
// First we position ourselves at the beginning of contraction sequence
const UChar * ContractionStart = UCharOffset = ( UChar * ) coll - > image + getContractOffset ( CE ) ;
2002-07-02 22:32:14 +00:00
// At the beginning of the source there is nothing left to match: take the
// CE stored for the start of this table.
if ( collIter_bos ( source ) ) {
2001-10-08 02:31:50 +00:00
CE = * ( coll - > contractionCEs + ( UCharOffset - coll - > contractionIndex ) ) ;
break ;
}
2001-10-15 00:09:23 +00:00
schar = getPrevNormalizedChar ( source ) ;
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2003-01-20 07:43:32 +00:00
//source->pos--;
2001-10-15 00:09:23 +00:00
//schar = *(--sourcePointer);
2001-04-06 23:37:48 +00:00
2001-10-15 00:09:23 +00:00
while ( schar > ( tchar = * UCharOffset ) ) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset + + ;
}
2001-10-13 16:20:01 +00:00
2001-10-15 00:09:23 +00:00
if ( schar = = tchar ) {
// Found the source string char in the table.
// Pick up the corresponding CE from the table.
CE = * ( coll - > contractionCEs +
( UCharOffset - coll - > contractionIndex ) ) ;
}
else
{
2002-07-02 22:32:14 +00:00
// if there is a completely ignorable code point in the middle of
// a prefix, we need to act as if it's not there
// assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
// lone surrogates cannot be set to zero as it would break other processing
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD ( coll - > mapping , schar ) ;
// it's easy for BMP code points
if ( isZeroCE = = 0 ) {
continue ;
} else if ( UTF_IS_TRAIL ( schar ) | | UTF_IS_LEAD ( schar ) ) {
// for supplementary code points, we have to check the next one
// situations where we are going to ignore
// 1. beginning of the string: schar is a lone surrogate
// 2. schar is a lone surrogate
// 3. schar is a trail surrogate in a valid surrogate sequence
// that is explicitly set to zero.
if ( ! collIter_bos ( source ) ) {
UChar lead ;
if ( UTF_IS_LEAD ( lead = getPrevNormalizedChar ( source ) ) ) {
isZeroCE = UTRIE_GET32_FROM_LEAD ( coll - > mapping , lead ) ;
if ( getCETag ( isZeroCE ) = = SURROGATE_TAG ) {
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL ( coll - > mapping , isZeroCE & 0xFFFFFF , schar ) ;
if ( finalCE = = 0 ) {
// this is a real, assigned completely ignorable code point
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2003-01-20 07:43:32 +00:00
//source->pos--;
2002-07-02 22:32:14 +00:00
continue ;
}
}
} else {
// lone surrogate, completely ignorable
continue ;
}
} else {
// lone surrogate at the beginning, completely ignorable
continue ;
}
}
2001-10-15 00:09:23 +00:00
// Source string char was not in the table.
// We have not found the prefix.
CE = * ( coll - > contractionCEs +
( ContractionStart - coll - > contractionIndex ) ) ;
}
2001-09-27 23:18:14 +00:00
2001-10-15 00:09:23 +00:00
if ( ! isPrefix ( CE ) ) {
// The source string char was in the contraction table, and the corresponding
// CE is not a prefix CE. We found the prefix, break
// out of loop, this CE will end up being returned. This is the normal
// way out of prefix handling when the source actually contained
// the prefix.
break ;
}
2001-09-27 23:18:14 +00:00
}
2001-11-01 00:00:15 +00:00
if ( CE ! = UCOL_NOT_FOUND ) { // we found something and we can merilly continue
loadState ( source , & prefixState , TRUE ) ;
2003-01-20 07:43:32 +00:00
if ( source - > origFlags & UCOL_USE_ITERATOR ) {
source - > flags = source - > origFlags ;
}
2001-11-01 00:00:15 +00:00
} else { // prefix search was a failure, we have to backup all the way to the start
loadState ( source , & entryState , TRUE ) ;
}
2001-09-27 23:18:14 +00:00
break ;
}
2001-01-05 00:47:25 +00:00
case CONTRACTION_TAG :
2001-06-06 23:26:50 +00:00
{
2001-01-05 00:47:25 +00:00
/* This should handle contractions */
2001-06-06 23:26:50 +00:00
// state: position to return to when the scan fails.
// firstCE: CE of the longest complete contraction seen so far.
collIterateState state ;
backupState ( source , & state ) ;
uint32_t firstCE = UCOL_NOT_FOUND ;
const UChar * UCharOffset ;
UChar schar , tchar ;
2001-01-15 23:18:06 +00:00
for ( ; ; ) {
2001-06-06 23:26:50 +00:00
/* This loop will run once per source string character, for as long as we */
/* are matching a potential contraction sequence */
2001-01-05 00:47:25 +00:00
/* First we position ourselves at the beginning of contraction sequence */
2001-01-09 00:52:18 +00:00
const UChar * ContractionStart = UCharOffset = ( UChar * ) coll - > image + getContractOffset ( CE ) ;
2001-01-05 00:47:25 +00:00
2001-06-06 23:26:50 +00:00
if ( collIter_eos ( source ) ) {
// Ran off the end of the source string.
2001-05-11 01:13:08 +00:00
CE = * ( coll - > contractionCEs + ( UCharOffset - coll - > contractionIndex ) ) ;
2001-05-10 17:49:24 +00:00
// So we'll pick whatever we have at the point...
2001-03-21 00:22:54 +00:00
if ( CE = = UCOL_NOT_FOUND ) {
2001-06-06 23:26:50 +00:00
// back up the source over all the chars we scanned going into this contraction.
CE = firstCE ;
loadState ( source , & state , TRUE ) ;
2003-01-20 07:43:32 +00:00
if ( source - > origFlags & UCOL_USE_ITERATOR ) {
source - > flags = source - > origFlags ;
}
2001-03-09 21:35:31 +00:00
}
2001-06-06 23:26:50 +00:00
break ;
2001-01-05 00:47:25 +00:00
}
2001-02-07 00:57:39 +00:00
2001-08-28 18:53:23 +00:00
// The first table entry is not a character: its low byte is read as maxCC and
// its high byte as allSame; the post-increment then moves to the first character.
uint8_t maxCC = ( uint8_t ) ( * ( UCharOffset ) & 0xFF ) ; /*get the discontiguos stuff */ /* skip the backward offset, see above */
uint8_t allSame = ( uint8_t ) ( * ( UCharOffset + + ) > > 8 ) ;
2001-03-03 09:27:42 +00:00
2001-04-24 03:18:54 +00:00
schar = getNextNormalizedChar ( source ) ;
2001-01-10 00:52:06 +00:00
while ( schar > ( tchar = * UCharOffset ) ) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset + + ;
}
2001-06-06 23:26:50 +00:00
if ( schar = = tchar ) {
// Found the source string char in the contraction table.
// Pick up the corresponding CE from the table.
2001-05-15 23:00:04 +00:00
CE = * ( coll - > contractionCEs +
2001-06-06 23:26:50 +00:00
( UCharOffset - coll - > contractionIndex ) ) ;
}
else
{
2002-07-02 22:32:14 +00:00
// if there is a completely ignorable code point in the middle of
// contraction, we need to act as if it's not there
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD ( coll - > mapping , schar ) ;
// it's easy for BMP code points
if ( isZeroCE = = 0 ) {
continue ;
} else if ( UTF_IS_LEAD ( schar ) ) {
if ( ! collIter_eos ( source ) ) {
backupState ( source , & state ) ;
UChar trail = getNextNormalizedChar ( source ) ;
if ( UTF_IS_TRAIL ( trail ) ) { // do stuff with trail
if ( getCETag ( isZeroCE ) = = SURROGATE_TAG ) {
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL ( coll - > mapping , isZeroCE & 0xFFFFFF , trail ) ;
if ( finalCE = = 0 ) {
continue ;
}
}
} else {
// broken surrogate sequence, thus completely ignorable
loadState ( source , & state , TRUE ) ;
continue ;
}
loadState ( source , & state , TRUE ) ;
} else { // no more characters, so broken surrogate pair...
// this contraction will ultimately fail, but not because of us
continue ;
}
2003-01-20 07:43:32 +00:00
} // else if(UTF_IS_LEAD(schar))
2002-07-02 22:32:14 +00:00
2001-06-06 23:26:50 +00:00
// Source string char was not in contraction table.
// Unless we have a discontiguous contraction, we have finished
// with this contraction.
uint8_t sCC ;
if ( schar < 0x300 | |
maxCC = = 0 | |
( sCC = i_getCombiningClass ( schar , coll ) ) = = 0 | |
sCC > maxCC | |
( allSame ! = 0 & & sCC = = maxCC ) | |
collIter_eos ( source ) ) {
// Contraction can not be discontiguous.
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2003-01-20 07:43:32 +00:00
//source->pos --; // back up the source string pointer by one,
2001-06-06 23:26:50 +00:00
// because the character we just looked at was
// not part of the contraction. */
CE = * ( coll - > contractionCEs +
( ContractionStart - coll - > contractionIndex ) ) ;
} else {
//
// Contraction is possibly discontiguous.
// Scan more of source string looking for a match
//
UChar tempchar ;
2001-05-16 18:23:26 +00:00
/* find the next character if schar is not a base character
2001-06-06 23:26:50 +00:00
and we are not yet at the end of the string */
2001-05-16 18:23:26 +00:00
tempchar = getNextNormalizedChar ( source ) ;
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2003-01-20 07:43:32 +00:00
//source->pos --;
2001-06-06 23:26:50 +00:00
if ( i_getCombiningClass ( tempchar , coll ) = = 0 ) {
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2003-01-20 07:43:32 +00:00
//source->pos --;
2001-06-06 23:26:50 +00:00
/* Spit out the last char of the string, wasn't tasty enough */
CE = * ( coll - > contractionCEs +
( ContractionStart - coll - > contractionIndex ) ) ;
} else {
CE = getDiscontiguous ( coll , source , ContractionStart ) ;
}
2001-05-16 18:23:26 +00:00
}
2003-01-20 07:43:32 +00:00
} // else after if(schar == tchar)
2001-03-21 00:22:54 +00:00
2001-03-09 07:18:33 +00:00
if ( CE = = UCOL_NOT_FOUND ) {
2001-06-06 23:26:50 +00:00
/* The Source string did not match the contraction that we were checking. */
/* Back up the source position to undo the effects of having partially */
/* scanned through what ultimately proved to not be a contraction. */
2001-05-02 01:36:29 +00:00
loadState ( source , & state , TRUE ) ;
2001-05-10 17:49:24 +00:00
CE = firstCE ;
2003-01-20 07:43:32 +00:00
if ( source - > origFlags & UCOL_USE_ITERATOR ) {
source - > flags = source - > origFlags ;
}
2001-03-16 02:14:37 +00:00
break ;
2001-06-06 23:26:50 +00:00
}
if ( ! isContraction ( CE ) ) {
// The source string char was in the contraction table, and the corresponding
// CE is not a contraction CE. We completed the contraction, break
// out of loop, this CE will end up being returned. This is the normal
// way out of contraction handling when the source actually contained
// the contraction.
break ;
}
// The source string char was in the contraction table, and the corresponding
// CE is IS a contraction CE. We will continue looping to check the source
// string for the remaining chars in the contraction.
uint32_t tempCE = * ( coll - > contractionCEs + ( ContractionStart - coll - > contractionIndex ) ) ;
if ( tempCE ! = UCOL_NOT_FOUND ) {
// We have scanned a a section of source string for which there is a
// CE from the contraction table. Remember the CE and scan position, so
// that we can return to this point if further scanning fails to
// match a longer contraction sequence.
2001-04-25 23:28:55 +00:00
firstCE = tempCE ;
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2001-04-24 03:18:54 +00:00
backupState ( source , & state ) ;
2003-02-06 23:29:56 +00:00
getNextNormalizedChar ( source ) ;
// Another way to do this is:
//collIterateState tempState;
//backupState(source, &tempState);
//goBackOne(source);
//backupState(source, &state);
//loadState(source, &tempState, TRUE);
// The problem is that for incomplete contractions we have to remember the previous
// position. Before, the only thing I needed to do was state.pos--;
// After iterator introduction and especially after introduction of normalizing
// iterators, it became much more difficult to decrease the saved state.
// I'm not yet sure which of the two methods above is faster.
2001-03-09 07:18:33 +00:00
}
2003-01-20 07:43:32 +00:00
} // for(;;)
2001-01-05 00:47:25 +00:00
break ;
2003-01-20 07:43:32 +00:00
} // case CONTRACTION_TAG:
2002-01-08 00:11:54 +00:00
case LONG_PRIMARY_TAG :
{
// Unpack a long primary into two CEs: the low byte of CE becomes the top byte of
// a continuation CE that is queued, and bits 8..23 are shifted up and combined
// with UCOL_BYTE_COMMON in the two low bytes to form the returned CE.
* ( source - > CEpos + + ) = ( ( CE & 0xFF ) < < 24 ) | UCOL_CONTINUATION_MARKER ;
CE = ( ( CE & 0xFFFF00 ) < < 8 ) | ( UCOL_BYTE_COMMON < < 8 ) | UCOL_BYTE_COMMON ;
return CE ;
}
2001-01-05 00:47:25 +00:00
case EXPANSION_TAG :
2001-06-06 23:26:50 +00:00
{
2001-01-05 00:47:25 +00:00
/* This should handle expansion. */
/* NOTE: we can encounter both continuations and expansions in an expansion! */
/* I have to decide where continuations are going to be dealt with */
2001-06-06 23:26:50 +00:00
uint32_t size ;
uint32_t i ; /* general counter */
2001-01-10 00:52:06 +00:00
CEOffset = ( uint32_t * ) coll - > image + getExpansionOffset ( CE ) ; /* find the offset to expansion table */
2001-01-05 00:47:25 +00:00
size = getExpansionCount ( CE ) ;
// The first CE of the expansion is returned; the remaining ones are queued.
CE = * CEOffset + + ;
if ( size ! = 0 ) { /* if there are less than 16 elements in expansion, we don't terminate */
for ( i = 1 ; i < size ; i + + ) {
* ( source - > CEpos + + ) = * CEOffset + + ;
}
} else { /* else, we do */
while ( * CEOffset ! = 0 ) {
* ( source - > CEpos + + ) = * CEOffset + + ;
}
}
2001-01-10 00:52:06 +00:00
return CE ;
2001-06-06 23:26:50 +00:00
}
2001-09-20 20:16:39 +00:00
/* various implicits optimization */
2002-06-13 18:34:41 +00:00
// TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
2001-09-20 20:16:39 +00:00
case CJK_IMPLICIT_TAG : /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
2002-06-13 18:34:41 +00:00
//return getImplicit(cp, source, 0x04000000);
return getImplicit ( cp , source ) ;
2001-09-20 20:16:39 +00:00
case IMPLICIT_TAG : /* everything that is not defined otherwise */
/* UCA is filled with these. Tailorings are NOT_FOUND */
2002-06-13 18:34:41 +00:00
//return getImplicit(cp, source, 0);
return getImplicit ( cp , source ) ;
2001-09-20 20:16:39 +00:00
case TRAIL_SURROGATE_TAG : /* DC00-DFFF*/
return 0 ; /* broken surrogate sequence */
case LEAD_SURROGATE_TAG : /* D800-DBFF*/
// If a trail surrogate follows, consume it and return the implicit CE of the
// combined supplementary code point; otherwise return 0.
UChar nextChar ;
2003-01-20 07:43:32 +00:00
if ( source - > flags & UCOL_USE_ITERATOR ) {
if ( U_IS_TRAIL ( nextChar = ( UChar ) source - > iterator - > current ( source - > iterator ) ) ) {
cp = U16_GET_SUPPLEMENTARY ( ch , nextChar ) ;
source - > iterator - > next ( source - > iterator ) ;
return getImplicit ( cp , source ) ;
} else {
return 0 ;
}
} else if ( ( ( ( source - > flags & UCOL_ITER_HASLEN ) = = 0 ) | | ( source - > pos < source - > endp ) ) & &
U_IS_TRAIL ( ( nextChar = * source - > pos ) ) ) {
cp = U16_GET_SUPPLEMENTARY ( ch , nextChar ) ;
//cp = ((((uint32_t)ch)<<10UL)+(nextChar)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2001-09-20 20:16:39 +00:00
source - > pos + + ;
2002-06-13 18:34:41 +00:00
#if 0
// CJKs handled in the getImplicit function. No need for fixup
2001-11-10 06:53:03 +00:00
if ( ( cp > = 0x20000 & & cp < = 0x2a6d6 ) | |
( cp > = 0x2F800 & & cp < = 0x2FA1D ) ) { // this might be a CJK supplementary cp
2002-10-16 22:34:16 +00:00
return getImplicit ( cp , source , 0x04000000 ) ;
2001-11-10 06:53:03 +00:00
} else { // or a regular one
return getImplicit ( cp , source , 0 ) ;
}
2002-06-13 18:34:41 +00:00
# endif
return getImplicit ( cp , source ) ;
2001-09-20 20:16:39 +00:00
} else {
return 0 ; /* completely ignorable */
}
case HANGUL_SYLLABLE_TAG : /* AC00-D7AF*/
{
// Arithmetic decomposition of the syllable into L, V and (optional) T Jamo.
const uint32_t
2001-10-08 02:31:50 +00:00
SBase = 0xAC00 , LBase = 0x1100 , VBase = 0x1161 , TBase = 0x11A7 ;
//const uint32_t LCount = 19;
const uint32_t VCount = 21 ;
const uint32_t TCount = 28 ;
//const uint32_t NCount = VCount * TCount; // 588
//const uint32_t SCount = LCount * NCount; // 11172
2001-09-20 20:16:39 +00:00
uint32_t L = ch - SBase ;
// divide into pieces
uint32_t T = L % TCount ; // we do it in this order since some compilers can do % and / in one operation
L / = TCount ;
uint32_t V = L % VCount ;
L / = VCount ;
// offset them
L + = LBase ;
V + = VBase ;
T + = TBase ;
// return the first CE, but first put the rest into the expansion buffer
if ( ! source - > coll - > image - > jamoSpecial ) { // FAST PATH
2001-12-19 07:00:45 +00:00
/**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/
2002-06-13 18:34:41 +00:00
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
* ( source - > CEpos + + ) = UTRIE_GET32_FROM_LEAD ( coll - > mapping , V ) ;
2001-09-20 20:16:39 +00:00
// T == TBase means the syllable has no trailing consonant
if ( T ! = TBase ) {
2001-12-19 07:00:45 +00:00
/**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/
2002-06-13 18:34:41 +00:00
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
* ( source - > CEpos + + ) = UTRIE_GET32_FROM_LEAD ( coll - > mapping , T ) ;
2001-09-20 20:16:39 +00:00
}
2001-12-19 07:00:45 +00:00
/*return ucmpe32_get(UCA->mapping, L);*/ // return first one
2002-06-13 18:34:41 +00:00
/*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
return UTRIE_GET32_FROM_LEAD ( coll - > mapping , L ) ;
2001-09-20 20:16:39 +00:00
} else { // Jamo is Special
2002-02-28 01:42:40 +00:00
// Since Hanguls pass the FCD check, it is
2001-11-14 21:55:21 +00:00
// guaranteed that we won't be in
// the normalization buffer if something like this happens
2003-01-20 07:43:32 +00:00
// However, if we are using a uchar iterator and normalization
// is ON, the Hangul that lead us here is going to be in that
// normalization buffer. Here we want to restore the uchar
// iterator state and pull out of the normalization buffer
if ( source - > iterator ! = NULL & & source - > flags & UCOL_ITER_INNORMBUF ) {
source - > flags = source - > origFlags ; // restore the iterator
source - > pos = NULL ;
}
2001-09-27 18:36:18 +00:00
// Move Jamos into normalization buffer
source - > writableBuffer [ 0 ] = ( UChar ) L ;
source - > writableBuffer [ 1 ] = ( UChar ) V ;
if ( T ! = TBase ) {
source - > writableBuffer [ 2 ] = ( UChar ) T ;
source - > writableBuffer [ 3 ] = 0 ;
2001-09-20 20:16:39 +00:00
} else {
2001-09-27 18:36:18 +00:00
source - > writableBuffer [ 2 ] = 0 ;
}
2001-09-20 20:16:39 +00:00
2001-09-27 18:36:18 +00:00
source - > fcdPosition = source - > pos ; // Indicate where to continue in main input string
// after exhausting the writableBuffer
source - > pos = source - > writableBuffer ;
source - > origFlags = source - > flags ;
source - > flags | = UCOL_ITER_INNORMBUF ;
source - > flags & = ~ ( UCOL_ITER_NORM | UCOL_ITER_HASLEN ) ;
2001-09-27 17:17:54 +00:00
2001-09-27 18:36:18 +00:00
return ( UCOL_IGNORABLE ) ;
2001-09-20 20:16:39 +00:00
}
}
2001-01-05 00:47:25 +00:00
case CHARSET_TAG :
2001-09-20 20:16:39 +00:00
/* not yet implemented */
2001-01-05 06:36:10 +00:00
/* probably after 1.8 */
return UCOL_NOT_FOUND ;
2001-01-05 00:47:25 +00:00
default :
// Unknown tag: report an internal error and return 0.
* status = U_INTERNAL_PROGRAM_ERROR ;
2001-01-10 00:52:06 +00:00
CE = 0 ;
2001-01-05 00:47:25 +00:00
break ;
}
// Go around again only while CE is greater than UCOL_NOT_FOUND
// (presumably that means CE is still a special CE - confirm against isSpecial).
if ( CE < = UCOL_NOT_FOUND ) break ;
}
return CE ;
1999-08-16 21:50:52 +00:00
}
2001-04-18 19:31:05 +00:00
/**
2001-05-11 01:13:08 +00:00
* Inserts the argument character into the front of the buffer replacing the
2001-04-20 22:29:53 +00:00
* front null terminator .
* @ param data collation element iterator data
2001-04-28 01:22:25 +00:00
* @ param pNull pointer to the null terminator
2001-04-20 22:29:53 +00:00
* @ param ch character to be appended
2001-04-25 23:28:55 +00:00
* @ return positon of added character
2001-04-20 22:29:53 +00:00
*/
2001-10-22 05:30:22 +00:00
static
2001-05-11 01:13:08 +00:00
inline UChar * insertBufferFront ( collIterate * data , UChar * pNull , UChar ch )
2001-04-20 22:29:53 +00:00
{
uint32_t size = data - > writableBufSize ;
2001-04-28 01:22:25 +00:00
UChar * end ;
2001-04-20 22:29:53 +00:00
UChar * newbuffer ;
const uint32_t incsize = 5 ;
2001-04-28 01:22:25 +00:00
if ( pNull > data - > writableBuffer + 1 ) {
* pNull = ch ;
* ( pNull - 1 ) = 0 ;
return pNull ;
2001-04-20 22:29:53 +00:00
}
2001-05-11 01:13:08 +00:00
/*
2001-04-20 22:29:53 +00:00
buffer will always be null terminated infront .
giving extra space since it is likely that more characters will be added .
*/
size + = incsize ;
newbuffer = ( UChar * ) uprv_malloc ( sizeof ( UChar ) * size ) ;
2002-07-20 06:00:04 +00:00
if ( newbuffer = = NULL ) {
return NULL ;
}
2001-04-20 22:29:53 +00:00
end = newbuffer + incsize ;
2001-05-11 01:13:08 +00:00
uprv_memcpy ( end , data - > writableBuffer ,
2001-04-20 22:29:53 +00:00
data - > writableBufSize * sizeof ( UChar ) ) ;
* end = ch ;
* ( end - 1 ) = 0 ;
2001-05-10 17:49:24 +00:00
freeHeapWritableBuffer ( data ) ;
2001-04-20 22:29:53 +00:00
data - > writableBufSize = size ;
data - > writableBuffer = newbuffer ;
2001-04-25 23:28:55 +00:00
return end ;
2001-04-20 22:29:53 +00:00
}
/**
* Special normalization function for contraction in the previous iterator .
* This normalization sequence will place the current character at source - > pos
* and its following normalized sequence into the buffer .
2001-05-11 01:13:08 +00:00
* The fcd position , pos will be changed .
2001-04-20 22:29:53 +00:00
* pos will now point to positions in the buffer .
* Flags will be changed accordingly .
* @ param data collation iterator data
*/
2001-10-22 05:30:22 +00:00
static
2001-04-20 22:29:53 +00:00
inline void normalizePrevContraction ( collIterate * data )
2001-05-11 01:13:08 +00:00
{
2001-04-20 22:29:53 +00:00
UChar * buffer = data - > writableBuffer ;
uint32_t buffersize = data - > writableBufSize ;
uint32_t nulltermsize ;
UErrorCode status = U_ZERO_ERROR ;
UChar * pEnd = data - > pos + 1 ; /* End normalize + 1 */
UChar * pStart ;
uint32_t normLen ;
UChar * pStartNorm ;
if ( data - > flags & UCOL_ITER_HASLEN ) {
2001-05-11 01:13:08 +00:00
/*
normalization buffer not used yet , we ' ll pull down the next
2001-04-20 22:29:53 +00:00
character into the end of the buffer
*/
2001-04-25 23:28:55 +00:00
* ( buffer + ( buffersize - 1 ) ) = * ( data - > pos + 1 ) ;
2001-04-20 22:29:53 +00:00
nulltermsize = buffersize - 1 ;
}
else {
nulltermsize = buffersize ;
UChar * temp = buffer + ( nulltermsize - 1 ) ;
while ( * ( temp - - ) ! = 0 ) {
nulltermsize - - ;
}
}
/* Start normalize */
if ( data - > fcdPosition = = NULL ) {
pStart = data - > string ;
}
else {
2001-05-11 01:13:08 +00:00
pStart = data - > fcdPosition + 1 ;
2001-04-20 22:29:53 +00:00
}
2001-05-11 01:13:08 +00:00
normLen = unorm_normalize ( pStart , pEnd - pStart , UNORM_NFD , 0 , buffer , 0 ,
2001-04-20 22:29:53 +00:00
& status ) ;
if ( nulltermsize < = normLen ) {
uint32_t size = buffersize - nulltermsize + normLen + 1 ;
UChar * temp = ( UChar * ) uprv_malloc ( size * sizeof ( UChar ) ) ;
2002-07-20 06:00:04 +00:00
if ( temp ! = NULL ) {
nulltermsize = normLen + 1 ;
uprv_memcpy ( temp + normLen , buffer ,
sizeof ( UChar ) * ( buffersize - nulltermsize ) ) ;
freeHeapWritableBuffer ( data ) ;
data - > writableBuffer = temp ;
data - > writableBufSize = size ;
}
2001-04-20 22:29:53 +00:00
}
status = U_ZERO_ERROR ;
2001-05-11 01:13:08 +00:00
/*
2001-04-20 22:29:53 +00:00
this puts the null termination infront of the normalized string instead
of the end
*/
pStartNorm = buffer + ( nulltermsize - normLen ) ;
* ( pStartNorm - 1 ) = 0 ;
2001-05-11 01:13:08 +00:00
unorm_normalize ( pStart , pEnd - pStart , UNORM_NFD , 0 , pStartNorm , normLen ,
2001-04-20 22:29:53 +00:00
& status ) ;
2001-05-11 01:13:08 +00:00
2001-04-20 22:29:53 +00:00
data - > pos = data - > writableBuffer + nulltermsize ;
data - > origFlags = data - > flags ;
data - > flags | = UCOL_ITER_INNORMBUF ;
data - > flags & = ~ ( UCOL_ITER_NORM | UCOL_ITER_HASLEN ) ;
}
/**
* Contraction character management function that returns the previous character
* for the backwards iterator .
2001-05-11 01:13:08 +00:00
* Does nothing if the previous character is in buffer and not the first
2001-04-20 22:29:53 +00:00
* character in it .
2001-05-11 01:13:08 +00:00
* Else it checks previous character in data string to see if it is
2001-04-20 22:29:53 +00:00
* normalizable .
* If it is not , the character is simply copied into the buffer , else
2001-05-11 01:13:08 +00:00
* the whole normalized substring is copied into the buffer , including the
2001-04-20 22:29:53 +00:00
* current character .
* @ param data collation element iterator data
* @ return previous character
*/
2001-10-20 01:09:31 +00:00
static
2001-05-11 01:13:08 +00:00
inline UChar getPrevNormalizedChar ( collIterate * data )
2001-04-20 22:29:53 +00:00
{
UChar prevch ;
UChar ch ;
UChar * start ;
2001-05-02 23:36:22 +00:00
UBool innormbuf = ( UBool ) ( data - > flags & UCOL_ITER_INNORMBUF ) ;
2001-04-28 01:22:25 +00:00
UChar * pNull = NULL ;
2001-05-11 01:13:08 +00:00
if ( ( data - > flags & ( UCOL_ITER_NORM | UCOL_ITER_INNORMBUF ) ) = = 0 | |
2001-04-26 01:15:34 +00:00
( innormbuf & & * ( data - > pos - 1 ) ! = 0 ) ) {
2001-05-11 01:13:08 +00:00
/*
2001-04-20 22:29:53 +00:00
if no normalization .
if previous character is in normalized buffer , no further normalization
is required
*/
2003-01-20 07:43:32 +00:00
if ( data - > flags & UCOL_USE_ITERATOR ) {
data - > iterator - > move ( data - > iterator , - 1 , UITER_CURRENT ) ;
return ( UChar ) data - > iterator - > next ( data - > iterator ) ;
} else if ( ( data - > origFlags & UCOL_USE_ITERATOR ) & & innormbuf ) {
normalizeIteratorBackwards ( data ) ;
2001-04-20 22:29:53 +00:00
return * ( data - > pos - 1 ) ;
2003-01-20 07:43:32 +00:00
} else {
return * ( data - > pos - 1 ) ;
}
}
2003-01-23 01:52:34 +00:00
//if(data->flags & UCOL_USE_ITERATOR) {
//normalizeIteratorBackwards(data);
//return *(data->pos - 1);
//}
2001-04-20 22:29:53 +00:00
start = data - > pos ;
if ( data - > flags & UCOL_ITER_HASLEN ) {
/* in data string */
if ( ( start - 1 ) = = data - > string ) {
return * ( start - 1 ) ;
}
2001-04-25 23:28:55 +00:00
start - - ;
ch = * start ;
prevch = * ( start - 1 ) ;
2001-04-20 22:29:53 +00:00
}
else {
2001-05-11 01:13:08 +00:00
/*
2001-04-20 22:29:53 +00:00
in writable buffer , at this point fcdPosition can not be NULL .
see contracting tag .
*/
if ( data - > fcdPosition = = data - > string ) {
/* at the start of the string, just dump it into the normalizer */
2001-04-28 01:22:25 +00:00
insertBufferFront ( data , data - > pos - 1 , * ( data - > fcdPosition ) ) ;
2001-04-25 23:28:55 +00:00
data - > fcdPosition = NULL ;
return * ( data - > pos - 1 ) ;
2001-04-20 22:29:53 +00:00
}
2001-04-28 01:22:25 +00:00
pNull = data - > pos - 1 ;
2001-04-25 23:28:55 +00:00
start = data - > fcdPosition ;
ch = * start ;
prevch = * ( start - 1 ) ;
2001-04-20 22:29:53 +00:00
}
2001-05-11 01:13:08 +00:00
/*
2001-04-20 22:29:53 +00:00
* if the current character is not fcd .
* Trailing combining class = = 0 .
*/
2001-05-11 01:13:08 +00:00
if ( data - > fcdPosition > start & &
( ch > = NFC_ZERO_CC_BLOCK_LIMIT_ | | prevch > = NFC_ZERO_CC_BLOCK_LIMIT_ ) )
2001-04-25 23:28:55 +00:00
{
2001-05-11 01:13:08 +00:00
/*
Need a more complete FCD check and possible normalization .
normalize substring will be appended to buffer
2001-04-25 23:28:55 +00:00
*/
UChar * backuppos = data - > pos ;
data - > pos = start ;
if ( collPrevIterFCD ( data ) ) {
2001-04-20 22:29:53 +00:00
normalizePrevContraction ( data ) ;
2001-04-25 23:28:55 +00:00
return * ( data - > pos - 1 ) ;
}
data - > pos = backuppos ;
data - > fcdPosition + + ;
2001-04-20 22:29:53 +00:00
}
2001-05-11 01:13:08 +00:00
2001-04-26 01:15:34 +00:00
if ( innormbuf ) {
2001-05-11 01:13:08 +00:00
/*
no normalization is to be done hence only one character will be
2001-04-25 23:28:55 +00:00
appended to the buffer .
*/
2001-04-28 01:22:25 +00:00
insertBufferFront ( data , pNull , ch ) ;
2001-04-25 23:28:55 +00:00
data - > fcdPosition - - ;
2001-04-20 22:29:53 +00:00
}
2001-05-11 01:13:08 +00:00
2001-04-20 22:29:53 +00:00
return ch ;
}
2002-06-13 18:34:41 +00:00
/*
 * Implicit CE generation for the backward iterator, based on Mark's
 * getImplicitPrimary code.
 * For a code point accepted by isNonChar() nothing is stored and 0 is
 * returned. Otherwise one CE built from the upper part of the implicit
 * primary is appended to the CE buffer, toReturn is set to the slot right
 * behind it, and the CE built from the lower 16 bits of the implicit primary
 * is returned.
 */
static
inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
    if (!isNonChar(cp)) {
        const uint32_t implicitPrimary = getImplicitPrimary(cp);
        uint32_t *slot = collationSource->CEpos;

        /* buffered CE: primary bytes plus the 05/05 lower bytes */
        *slot = (implicitPrimary & UCOL_PRIMARYMASK) | 0x00000505;
        collationSource->CEpos    = slot + 1;
        collationSource->toReturn = collationSource->CEpos;

        /* returned CE: lower 16 bits moved to the top, 0xC0 in the low byte */
        return ((implicitPrimary & 0x0000FFFF) << 16) | 0x000000C0;
    }
    return 0;
}
2001-09-20 20:16:39 +00:00
2001-05-11 01:13:08 +00:00
/**
* Resolves a "special" collation element for the backward iterator.
* The tag returned by getCETag(CE) selects the handling: Thai reordering,
* prefix (SPEC_PROC_TAG) matching, contractions, long primaries, expansions,
* Hangul syllables, surrogates and implicit CEs.
* The outer loop repeats as long as the handling of a tag produced another
* special CE; it ends once CE <= UCOL_NOT_FOUND or a case returns directly.
* NOTE(review): the previous comment said this is called from getPrevCE; the
* callers are not visible in this part of the file - confirm.
* @param coll collator whose image, contraction tables and mapping trie are consulted
* @param ch code unit whose CE is being resolved (read by the Hangul, trail
*        surrogate, CJK implicit and implicit cases)
* @param CE the special CE to resolve
* @param source backward iterator state; its position, flags, writable buffer
*        and CE buffer may be modified
* @param status set to U_MEMORY_ALLOCATION_ERROR, U_BUFFER_OVERFLOW_ERROR or
*        U_INTERNAL_PROGRAM_ERROR on the failure paths below
* @return the resolved CE; UCOL_IGNORABLE when characters were moved into the
*         writable buffer to be processed from there; 0 for broken surrogate
*         sequences; UCOL_NOT_FOUND for CHARSET_TAG; UCOL_NO_MORE_CES when the
*         contraction string buffer cannot be allocated; UCOL_NULLORDER when
*         the CE buffer overflows
*/
2001-10-20 01:09:31 +00:00
uint32_t ucol_prv_getSpecialPrevCE ( const UCollator * coll , UChar ch , uint32_t CE ,
2001-04-23 01:53:49 +00:00
collIterate * source ,
UErrorCode * status )
2001-02-20 00:26:50 +00:00
{
const uint32_t * CEOffset = NULL ;
2001-05-10 17:49:24 +00:00
UChar * UCharOffset = NULL ;
UChar schar ;
2001-02-20 00:26:50 +00:00
const UChar * constart = NULL ;
uint32_t size ;
2001-05-11 16:23:29 +00:00
UChar buffer [ UCOL_MAX_BUFFER ] ;
2001-05-17 21:34:04 +00:00
uint32_t * endCEBuffer ;
2001-05-17 01:06:25 +00:00
UChar * strbuffer ;
2002-07-02 22:32:14 +00:00
int32_t noChars = 0 ;
2001-04-20 22:29:53 +00:00
2001-02-28 19:30:41 +00:00
for ( ; ; )
2001-02-20 00:26:50 +00:00
{
2001-03-09 23:09:21 +00:00
/* the only ces that loops are thai and contractions */
2001-04-23 01:53:49 +00:00
switch ( getCETag ( CE ) )
2001-02-20 00:26:50 +00:00
{
2001-03-09 23:09:21 +00:00
case NOT_FOUND_TAG : /* this tag always returns */
2001-02-20 00:26:50 +00:00
return CE ;
2001-09-18 18:37:57 +00:00
case SURROGATE_TAG : /* This is a surrogate pair */
/* essentially an engaged lead surrogate. */
/* if you have encountered it here, it means that a */
/* broken sequence was encountered and this is an error */
return 0 ;
2001-02-20 00:26:50 +00:00
case THAI_TAG :
2001-04-12 00:08:26 +00:00
if ( ( source - > flags & UCOL_ITER_INNORMBUF ) | | /* Already Swapped || */
source - > string = = source - > pos | | /* At start of string.|| */
2001-04-23 01:53:49 +00:00
/* previous char not Thai prevowel */
2002-07-09 23:57:45 +00:00
/*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally
2003-02-06 23:29:56 +00:00
UCOL_ISTHAIPREVOWEL ( peekCharacter ( source , - 1 ) ) = = FALSE )
//UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE)
2001-04-12 00:08:26 +00:00
{
/* Treat Thai as a length one expansion */
/* find the offset to expansion table */
2001-04-23 01:53:49 +00:00
CEOffset = ( uint32_t * ) coll - > image + getExpansionOffset ( CE ) ;
2001-04-12 00:08:26 +00:00
CE = * CEOffset + + ;
}
else
{
2001-04-23 01:53:49 +00:00
/*
Move the prevowel and the following base Consonant into the
2001-04-12 00:08:26 +00:00
normalization buffer with their order swapped
2001-02-20 00:26:50 +00:00
*/
/* the two characters are stored in the last two slots of the writable buffer, with a null terminator in front of them */
2001-05-11 01:13:08 +00:00
UChar * tempbuffer = source - > writableBuffer +
2001-04-20 22:29:53 +00:00
( source - > writableBufSize - 1 ) ;
* ( tempbuffer - 2 ) = 0 ;
2003-02-06 23:29:56 +00:00
* ( tempbuffer - 1 ) = peekCharacter ( source , 0 ) ;
* ( tempbuffer ) = peekCharacter ( source , - 1 ) ;
//*(tempbuffer - 1) = *source->pos;
//*(tempbuffer) = *(source->pos - 1);
2001-04-23 01:53:49 +00:00
/*
Indicate where to continue in main input string after exhausting
the writableBuffer
2001-04-12 00:08:26 +00:00
*/
2001-04-12 18:25:07 +00:00
if ( source - > pos - 1 = = source - > string ) {
source - > fcdPosition = NULL ;
2001-09-27 17:17:54 +00:00
} else {
source - > fcdPosition = source - > pos - 2 ;
2001-04-12 18:25:07 +00:00
}
2001-04-20 22:29:53 +00:00
source - > pos = tempbuffer ;
2001-04-12 00:08:26 +00:00
source - > origFlags = source - > flags ;
source - > flags | = UCOL_ITER_INNORMBUF ;
source - > flags & = ~ ( UCOL_ITER_NORM | UCOL_ITER_HASLEN ) ;
2001-04-23 01:53:49 +00:00
2001-09-27 17:17:54 +00:00
//CE = UCOL_IGNORABLE;
return ( UCOL_IGNORABLE ) ;
2001-02-20 00:26:50 +00:00
}
break ;
2001-10-02 01:25:25 +00:00
case SPEC_PROC_TAG :
{
// Special processing is getting a CE that is preceded by a certain prefix
// Currently this is only needed for optimizing Japanese length and iteration marks.
// When we encounter a special processing tag, we go backwards and try to see if
// we have a match.
// Contraction tables are used - so the whole process is not unlike contraction.
// prefix data is stored backwards in the table.
// NOTE: the two declarations below shadow the function-level UCharOffset and schar.
const UChar * UCharOffset ;
2001-10-08 02:31:50 +00:00
UChar schar , tchar ;
2001-10-15 00:09:23 +00:00
collIterateState prefixState ;
// the iterator state is saved here and restored by loadState() once the prefix search is over
backupState ( source , & prefixState ) ;
2001-10-08 02:31:50 +00:00
//UChar *sourcePointer = source->pos;
2001-10-20 01:09:31 +00:00
//UChar32 normOutput = 0;
2001-10-02 01:25:25 +00:00
for ( ; ; ) {
// This loop will run once per source string character, for as long as we
// are matching a potential contraction sequence
2001-10-15 00:09:23 +00:00
// First we position ourselves at the beginning of contraction sequence
const UChar * ContractionStart = UCharOffset = ( UChar * ) coll - > image + getContractOffset ( CE ) ;
2002-07-02 22:32:14 +00:00
if ( collIter_bos ( source ) ) {
2001-10-15 00:09:23 +00:00
//if(sourcePointer == source->string) {
2001-10-08 02:31:50 +00:00
CE = * ( coll - > contractionCEs + ( UCharOffset - coll - > contractionIndex ) ) ;
break ;
}
2001-10-15 00:09:23 +00:00
schar = getPrevNormalizedChar ( source ) ;
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2003-01-20 07:43:32 +00:00
//source->pos--;
2001-10-15 00:09:23 +00:00
//schar = *(--sourcePointer);
2001-10-02 01:25:25 +00:00
2001-10-15 00:09:23 +00:00
while ( schar > ( tchar = * UCharOffset ) ) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset + + ;
}
2001-10-02 01:25:25 +00:00
2001-10-15 00:09:23 +00:00
if ( schar = = tchar ) {
// Found the source string char in the table.
// Pick up the corresponding CE from the table.
CE = * ( coll - > contractionCEs +
( UCharOffset - coll - > contractionIndex ) ) ;
}
else
2002-07-02 22:32:14 +00:00
{
// if there is a completely ignorable code point in the middle of
// a prefix, we need to act as if it's not there
// assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
// lone surrogates cannot be set to zero as it would break other processing
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD ( coll - > mapping , schar ) ;
// it's easy for BMP code points
if ( isZeroCE = = 0 ) {
continue ;
} else if ( UTF_IS_TRAIL ( schar ) | | UTF_IS_LEAD ( schar ) ) {
// for supplementary code points, we have to check the next one
// situations where we are going to ignore
// 1. beginning of the string: schar is a lone surrogate
// 2. schar is a lone surrogate
// 3. schar is a trail surrogate in a valid surrogate sequence
// that is explicitly set to zero.
if ( ! collIter_bos ( source ) ) {
UChar lead ;
if ( UTF_IS_LEAD ( lead = getPrevNormalizedChar ( source ) ) ) {
isZeroCE = UTRIE_GET32_FROM_LEAD ( coll - > mapping , lead ) ;
if ( getCETag ( isZeroCE ) = = SURROGATE_TAG ) {
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL ( coll - > mapping , isZeroCE & 0xFFFFFF , schar ) ;
if ( finalCE = = 0 ) {
// this is a real, assigned completely ignorable code point
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2003-01-20 07:43:32 +00:00
//source->pos--;
2002-07-02 22:32:14 +00:00
continue ;
}
}
} else {
// lone surrogate, completely ignorable
continue ;
}
} else {
// lone surrogate at the beginning, completely ignorable
continue ;
}
}
2001-10-15 00:09:23 +00:00
// Source string char was not in the table.
// We have not found the prefix.
CE = * ( coll - > contractionCEs +
( ContractionStart - coll - > contractionIndex ) ) ;
}
2001-10-02 01:25:25 +00:00
2001-10-15 00:09:23 +00:00
if ( ! isPrefix ( CE ) ) {
// The source string char was in the contraction table, and the corresponding
// CE is not a prefix CE. We found the prefix, break
// out of loop, this CE will end up being returned. This is the normal
// way out of prefix handling when the source actually contained
// the prefix.
break ;
}
2001-10-02 01:25:25 +00:00
}
2001-10-15 00:09:23 +00:00
loadState ( source , & prefixState , TRUE ) ;
2001-10-02 01:25:25 +00:00
break ;
}
2001-02-20 00:26:50 +00:00
case CONTRACTION_TAG :
2001-05-11 01:13:08 +00:00
/* to ensure that the backwards and forwards iteration matches, we
2001-05-10 17:49:24 +00:00
take the current region of most possible match and pass it through
the forward iteration . this will ensure that the obstinate problem of
overlapping contractions will not occur .
*/
2003-02-06 23:29:56 +00:00
schar = peekCharacter ( source , 0 ) ;
//schar = *(source->pos);
2001-05-10 17:49:24 +00:00
constart = ( UChar * ) coll - > image + getContractOffset ( CE ) ;
2001-05-17 01:06:25 +00:00
if ( isAtStartPrevIterate ( source )
/* commented away contraction end checks after adding the checks
2001-09-20 20:16:39 +00:00
in getPrevCE */ ) {
2001-05-10 17:49:24 +00:00
/* start of string or this is not the end of any contraction */
2001-06-06 23:26:50 +00:00
CE = * ( coll - > contractionCEs +
2001-05-17 01:06:25 +00:00
( constart - coll - > contractionIndex ) ) ;
2001-05-10 23:05:23 +00:00
break ;
2001-05-10 17:49:24 +00:00
}
/* the characters are collected back to front: the string is built from the end of the stack buffer towards its start */
2001-05-17 01:06:25 +00:00
strbuffer = buffer ;
UCharOffset = strbuffer + ( UCOL_MAX_BUFFER - 1 ) ;
2001-05-10 17:49:24 +00:00
* ( UCharOffset - - ) = 0 ;
2002-07-02 22:32:14 +00:00
noChars = 0 ;
2003-02-22 01:26:28 +00:00
// have to swap thai characters
while ( ucol_unsafeCP ( schar , coll ) | | UCOL_ISTHAIBASECONSONANT ( schar ) ) {
2001-05-10 17:49:24 +00:00
* ( UCharOffset ) = schar ;
2002-07-02 22:32:14 +00:00
noChars + + ;
2001-05-10 17:49:24 +00:00
UCharOffset - - ;
2001-04-20 22:29:53 +00:00
schar = getPrevNormalizedChar ( source ) ;
2003-02-06 23:29:56 +00:00
goBackOne ( source ) ;
2003-01-20 07:43:32 +00:00
//source->pos --;
2003-02-06 23:29:56 +00:00
// TODO: when we exhaust the contraction buffer,
// it needs to get reallocated. The problem is
// that the size depends on the string which is
// not iterated over. However, since we're travelling
// backwards, we already had to set the iterator at
// the end - so we might as well know where we are?
// NOTE(review): this growth check compares against the stack buffer only. Once
// strbuffer points to heap storage nothing visible here re-checks its start; in
// iterator mode that storage has the fixed size 4 * UCOL_MAX_BUFFER + UCOL_MAX_BUFFER.
// Confirm that it cannot be overrun by long runs of unsafe characters.
2001-05-17 01:06:25 +00:00
if ( UCharOffset + 1 = = buffer ) {
/* we have exhausted the buffer */
2003-02-06 23:29:56 +00:00
int32_t newsize = 0 ;
if ( source - > pos ) { // actually dealing with a position
newsize = source - > pos - source - > string + 1 ;
} else { // iterator
newsize = 4 * UCOL_MAX_BUFFER ;
}
2001-06-06 23:26:50 +00:00
strbuffer = ( UChar * ) uprv_malloc ( sizeof ( UChar ) *
2001-05-17 01:06:25 +00:00
( newsize + UCOL_MAX_BUFFER ) ) ;
2002-07-02 15:10:30 +00:00
/* test for NULL */
2002-06-29 09:32:36 +00:00
if ( strbuffer = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
2002-07-02 01:28:49 +00:00
return UCOL_NO_MORE_CES ;
2002-06-29 09:32:36 +00:00
}
/* the collected characters keep their place at the end: the stack buffer is copied behind newsize free slots */
2001-05-17 01:06:25 +00:00
UCharOffset = strbuffer + newsize ;
2001-06-06 23:26:50 +00:00
uprv_memcpy ( UCharOffset , buffer ,
2001-05-17 01:06:25 +00:00
UCOL_MAX_BUFFER * sizeof ( UChar ) ) ;
UCharOffset - - ;
}
2003-02-08 02:16:54 +00:00
//if (collIter_bos(source) ||
if ( ( source - > pos & & ( source - > pos = = source - > string | |
2001-05-11 01:13:08 +00:00
( ( source - > flags & UCOL_ITER_INNORMBUF ) & &
2003-02-08 02:16:54 +00:00
* ( source - > pos - 1 ) = = 0 & & source - > fcdPosition = = NULL ) ) )
| | ( source - > iterator & & ! source - > iterator - > hasPrevious ( source - > iterator ) ) ) {
2001-05-10 17:49:24 +00:00
break ;
2001-04-18 01:34:26 +00:00
}
2001-05-10 17:49:24 +00:00
}
/* adds the initial base character to the string */
* ( UCharOffset ) = schar ;
2002-07-02 22:32:14 +00:00
noChars + + ;
2001-05-10 17:49:24 +00:00
/* a new collIterate is used to simplify things, since using the current
2001-05-11 01:13:08 +00:00
collIterate will mean that the forward and backwards iteration will
2001-05-10 17:49:24 +00:00
share and change the same buffers . we don ' t want to get into that . */
collIterate temp ;
2002-07-02 22:32:14 +00:00
//IInit_collIterate(coll, UCharOffset, -1, &temp);
IInit_collIterate ( coll , UCharOffset , noChars , & temp ) ;
2001-05-10 17:49:24 +00:00
temp . flags & = ~ UCOL_ITER_NORM ;
/* every CE the forward iteration produces for the collected string is appended to the CE buffer of source */
CE = ucol_IGetNextCE ( coll , & temp , status ) ;
2001-05-17 21:34:04 +00:00
endCEBuffer = source - > CEs + UCOL_EXPAND_CE_BUFFER_SIZE ;
2001-05-10 17:49:24 +00:00
while ( CE ! = UCOL_NO_MORE_CES ) {
2001-05-11 01:13:08 +00:00
* ( source - > CEpos + + ) = CE ;
2001-05-17 21:34:04 +00:00
if ( source - > CEpos = = endCEBuffer ) {
/* ran out of CE space, bail.
2001-06-06 23:26:50 +00:00
there ' s no guarantee of the right character position after
2001-05-17 21:34:04 +00:00
this bail */
* status = U_BUFFER_OVERFLOW_ERROR ;
2001-05-17 23:09:35 +00:00
source - > CEpos = source - > CEs ;
2001-11-14 22:22:50 +00:00
freeHeapWritableBuffer ( & temp ) ;
if ( strbuffer ! = buffer ) {
uprv_free ( strbuffer ) ;
}
2001-05-17 21:34:04 +00:00
return UCOL_NULLORDER ;
}
2001-05-10 17:49:24 +00:00
CE = ucol_IGetNextCE ( coll , & temp , status ) ;
}
freeHeapWritableBuffer ( & temp ) ;
2001-05-17 01:06:25 +00:00
if ( strbuffer ! = buffer ) {
uprv_free ( strbuffer ) ;
}
/* the last CE stored is the one handed back; when it is the only one the CE buffer is reset */
2001-05-10 17:49:24 +00:00
source - > toReturn = source - > CEpos - 1 ;
if ( source - > toReturn = = source - > CEs ) {
source - > CEpos = source - > CEs ;
}
return * ( source - > toReturn ) ;
2002-01-08 00:11:54 +00:00
case LONG_PRIMARY_TAG :
{
/* two CEs are stored for a long primary; the second one, carrying UCOL_CONTINUATION_MARKER, is returned */
* ( source - > CEpos + + ) = ( ( CE & 0xFFFF00 ) < < 8 ) | ( UCOL_BYTE_COMMON < < 8 ) | UCOL_BYTE_COMMON ;
* ( source - > CEpos + + ) = ( ( CE & 0xFF ) < < 24 ) | UCOL_CONTINUATION_MARKER ;
source - > toReturn = source - > CEpos - 1 ;
return * ( source - > toReturn ) ;
}
2001-03-09 23:09:21 +00:00
case EXPANSION_TAG : /* this tag always returns */
2001-04-23 01:53:49 +00:00
/*
2001-02-20 00:26:50 +00:00
This should handle expansion .
2001-04-23 01:53:49 +00:00
NOTE : we can encounter both continuations and expansions in an expansion !
I have to decide where continuations are going to be dealt with
2001-02-20 00:26:50 +00:00
*/
/* find the offset to expansion table */
2001-04-23 01:53:49 +00:00
CEOffset = ( uint32_t * ) coll - > image + getExpansionOffset ( CE ) ;
2001-02-20 00:26:50 +00:00
size = getExpansionCount ( CE ) ;
2001-03-07 21:01:53 +00:00
if ( size ! = 0 ) {
2001-04-23 01:53:49 +00:00
/*
if there are less than 16 elements in expansion , we don ' t terminate
2001-02-20 00:26:50 +00:00
*/
2001-05-10 17:49:24 +00:00
uint32_t count ;
2001-03-07 21:01:53 +00:00
for ( count = 0 ; count < size ; count + + ) {
2001-02-20 00:26:50 +00:00
* ( source - > CEpos + + ) = * CEOffset + + ;
2001-03-07 21:01:53 +00:00
}
}
else {
2001-02-20 00:26:50 +00:00
/* else, we do */
2001-03-07 21:01:53 +00:00
while ( * CEOffset ! = 0 ) {
2001-02-20 00:26:50 +00:00
* ( source - > CEpos + + ) = * CEOffset + + ;
2001-03-07 21:01:53 +00:00
}
}
2001-02-20 00:26:50 +00:00
source - > toReturn = source - > CEpos - 1 ;
2001-02-23 23:36:42 +00:00
return * ( source - > toReturn ) ;
2001-09-20 20:16:39 +00:00
case HANGUL_SYLLABLE_TAG : /* AC00-D7AF*/
{
2001-10-08 02:31:50 +00:00
const uint32_t
SBase = 0xAC00 , LBase = 0x1100 , VBase = 0x1161 , TBase = 0x11A7 ;
//const uint32_t LCount = 19;
const uint32_t VCount = 21 ;
const uint32_t TCount = 28 ;
//const uint32_t NCount = VCount * TCount; /* 588 */
//const uint32_t SCount = LCount * NCount; /* 11172 */
2001-09-20 20:16:39 +00:00
uint32_t L = ch - SBase ;
/*
divide into pieces .
we do it in this order since some compilers can do % and / in one
operation
*/
uint32_t T = L % TCount ;
L / = TCount ;
uint32_t V = L % VCount ;
L / = VCount ;
/* offset them */
L + = LBase ;
V + = VBase ;
T + = TBase ;
/*
return the first CE , but first put the rest into the expansion buffer
*/
if ( ! source - > coll - > image - > jamoSpecial )
{
2001-12-19 07:00:45 +00:00
/**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/
2002-06-13 18:34:41 +00:00
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
* ( source - > CEpos + + ) = UTRIE_GET32_FROM_LEAD ( coll - > mapping , L ) ;
2001-12-19 07:00:45 +00:00
/**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/
2002-06-13 18:34:41 +00:00
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
* ( source - > CEpos + + ) = UTRIE_GET32_FROM_LEAD ( coll - > mapping , V ) ;
2001-09-20 20:16:39 +00:00
/* T == TBase means that the syllable has no trailing consonant; the unbraced if below guards only the T store */
if ( T ! = TBase )
2001-12-19 07:00:45 +00:00
/**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/
2002-06-13 18:34:41 +00:00
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
* ( source - > CEpos + + ) = UTRIE_GET32_FROM_LEAD ( coll - > mapping , T ) ;
2001-09-20 20:16:39 +00:00
source - > toReturn = source - > CEpos - 1 ;
return * ( source - > toReturn ) ;
} else {
2002-02-28 01:42:40 +00:00
// Since Hanguls pass the FCD check, it is
2001-11-14 21:55:21 +00:00
// guaranteed that we won't be in
// the normalization buffer if something like this happens
2001-09-27 18:36:18 +00:00
// Move Jamos into normalization buffer
/*
Move the Jamos into the
normalization buffer
*/
UChar * tempbuffer = source - > writableBuffer +
( source - > writableBufSize - 1 ) ;
* ( tempbuffer ) = 0 ;
if ( T ! = TBase ) {
* ( tempbuffer - 1 ) = ( UChar ) T ;
* ( tempbuffer - 2 ) = ( UChar ) V ;
* ( tempbuffer - 3 ) = ( UChar ) L ;
* ( tempbuffer - 4 ) = 0 ;
} else {
* ( tempbuffer - 1 ) = ( UChar ) V ;
* ( tempbuffer - 2 ) = ( UChar ) L ;
* ( tempbuffer - 3 ) = 0 ;
}
2001-09-24 02:24:09 +00:00
2001-09-27 18:36:18 +00:00
/*
Indicate where to continue in main input string after exhausting
the writableBuffer
*/
if ( source - > pos = = source - > string ) {
source - > fcdPosition = NULL ;
2001-09-20 20:16:39 +00:00
} else {
2001-09-27 18:36:18 +00:00
source - > fcdPosition = source - > pos - 1 ;
}
2001-09-20 20:16:39 +00:00
2001-09-27 18:36:18 +00:00
source - > pos = tempbuffer ;
source - > origFlags = source - > flags ;
source - > flags | = UCOL_ITER_INNORMBUF ;
source - > flags & = ~ ( UCOL_ITER_NORM | UCOL_ITER_HASLEN ) ;
2001-09-27 17:17:54 +00:00
2001-09-27 18:36:18 +00:00
return ( UCOL_IGNORABLE ) ;
2001-09-20 20:16:39 +00:00
}
}
case LEAD_SURROGATE_TAG : /* D800-DBFF*/
return 0 ; /* broken surrogate sequence */
case TRAIL_SURROGATE_TAG : /* DC00-DFFF*/
{
UChar32 cp = 0 ;
UChar prevChar ;
UChar * prev ;
if ( isAtStartPrevIterate ( source ) ) {
/* we are at the start of the string, wrong place to be at */
return 0 ;
}
/* the character in front of ch is read from the current buffer, or from fcdPosition when pos is at the start of the writable buffer */
if ( source - > pos ! = source - > writableBuffer ) {
prev = source - > pos - 1 ;
} else {
prev = source - > fcdPosition ;
}
prevChar = * prev ;
/* Handles Han and Supplementary characters here.*/
if ( UTF_IS_FIRST_SURROGATE ( prevChar ) ) {
/* combine lead and trail into the supplementary code point and step over the lead */
cp = ( ( ( ( uint32_t ) prevChar ) < < 10UL ) + ( ch ) - ( ( ( uint32_t ) 0xd800 < < 10UL ) + 0xdc00 - 0x10000 ) ) ;
source - > pos = prev ;
} else {
return 0 ; /* completely ignorable */
}
2002-06-13 18:34:41 +00:00
return getPrevImplicit ( cp , source ) ;
2001-09-20 20:16:39 +00:00
}
2002-06-13 18:34:41 +00:00
// TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
2001-09-20 20:16:39 +00:00
case CJK_IMPLICIT_TAG : /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
2002-06-13 18:34:41 +00:00
return getPrevImplicit ( ch , source ) ;
2001-09-20 20:16:39 +00:00
case IMPLICIT_TAG : /* everything that is not defined otherwise */
2002-06-13 18:34:41 +00:00
return getPrevImplicit ( ch , source ) ;
2001-09-20 20:16:39 +00:00
/* UCA is filled with these. Tailorings are NOT_FOUND */
/* not yet implemented */
2001-03-09 23:09:21 +00:00
case CHARSET_TAG : /* this tag always returns */
2001-02-20 00:26:50 +00:00
/* probably after 1.8 */
return UCOL_NOT_FOUND ;
2001-03-09 23:09:21 +00:00
default : /* this tag always returns */
2001-02-20 00:26:50 +00:00
* status = U_INTERNAL_PROGRAM_ERROR ;
CE = 0 ;
break ;
}
/* a CE that is no longer special ends the loop; otherwise the new special CE is dispatched again */
2001-03-07 21:01:53 +00:00
if ( CE < = UCOL_NOT_FOUND ) {
break ;
}
2001-02-20 00:26:50 +00:00
}
return CE ;
}
2001-01-16 00:28:40 +00:00
/* This should really be a macro */
/* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
/* anyway */
2001-10-20 01:09:31 +00:00
static
2001-03-14 07:49:03 +00:00
uint8_t * reallocateBuffer ( uint8_t * * secondaries , uint8_t * secStart , uint8_t * second , uint32_t * secSize , uint32_t newSize , UErrorCode * status ) {
2001-05-17 23:09:35 +00:00
# ifdef UCOL_DEBUG
2001-03-14 07:49:03 +00:00
fprintf ( stderr , " . " ) ;
2001-05-17 18:19:42 +00:00
# endif
2001-01-09 00:52:18 +00:00
uint8_t * newStart = NULL ;
2002-08-07 18:26:18 +00:00
uint32_t offset = * secondaries - secStart ;
2001-01-09 00:52:18 +00:00
if ( secStart = = second ) {
2001-03-14 07:49:03 +00:00
newStart = ( uint8_t * ) uprv_malloc ( newSize ) ;
2001-01-09 00:52:18 +00:00
if ( newStart = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
return NULL ;
}
uprv_memcpy ( newStart , secStart , * secondaries - secStart ) ;
} else {
2001-03-14 07:49:03 +00:00
newStart = ( uint8_t * ) uprv_realloc ( secStart , newSize ) ;
2001-01-09 00:52:18 +00:00
if ( newStart = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
return NULL ;
}
}
2002-08-07 18:26:18 +00:00
* secondaries = newStart + offset ;
2001-03-14 07:49:03 +00:00
* secSize = newSize ;
2001-01-09 00:52:18 +00:00
return newStart ;
}
2001-01-16 00:28:40 +00:00
/* This should really be a macro */
/* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
/* secondaries in French */
/*
2001-01-09 00:52:18 +00:00
void uprv_ucol_reverse_buffer ( uint8_t * start , uint8_t * end ) {
uint8_t temp ;
while ( start < end ) {
temp = * start ;
* start + + = * end ;
* end - - = temp ;
}
}
2001-01-16 00:28:40 +00:00
*/
2001-01-09 00:52:18 +00:00
2001-01-16 00:28:40 +00:00
/*
 * Reverses in place the elements between the pointers start and end, both
 * inclusive. TYPE is the element type the pointers refer to.
 * start and end must be modifiable pointer lvalues: they are evaluated
 * several times and moved towards each other, so they no longer delimit the
 * range afterwards.
 * The body is wrapped in do { } while(0) so that an invocation followed by a
 * semicolon forms exactly one statement and can be used in an unbraced
 * if/else.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
    TYPE tempA; \
    while((start) < (end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
} while(0)
2001-01-15 07:28:54 +00:00
2001-01-16 00:28:40 +00:00
/****************************************************************************/
/* Following are the sortkey generation functions */
/* */
/****************************************************************************/
2001-01-15 07:28:54 +00:00
2001-09-18 18:37:57 +00:00
/**
* Merge two sort keys .
* This is useful , for example , to combine sort keys from first and last names
* to sort such pairs .
* Merged sort keys consider on each collation level the first part first entirely ,
* then the second one .
* It is possible to merge multiple sort keys by consecutively merging
* another one with the intermediate result .
*
* The length of the merge result is the sum of the lengths of the input sort keys
* minus 1.
*
* @ param src1 the first sort key
* @ param src1Length the length of the first sort key , including the zero byte at the end ;
* can be - 1 if the function is to find the length
* @ param src2 the second sort key
* @ param src2Length the length of the second sort key , including the zero byte at the end ;
* can be - 1 if the function is to find the length
* @ param dest the buffer where the merged sort key is written ,
* can be NULL if destCapacity = = 0
* @ param destCapacity the number of bytes in the dest buffer
* @ return the length of the merged sort key , src1Length + src2Length - 1 ;
* can be larger than destCapacity , or 0 if an error occurs ( only for illegal arguments ) ,
* in which cases the contents of dest is undefined
*
* @ draft
*/
2001-11-21 01:08:55 +00:00
U_CAPI int32_t U_EXPORT2
2001-09-18 18:37:57 +00:00
ucol_mergeSortkeys ( const uint8_t * src1 , int32_t src1Length ,
const uint8_t * src2 , int32_t src2Length ,
uint8_t * dest , int32_t destCapacity ) {
int32_t destLength ;
uint8_t b ;
/* check arguments */
if ( src1 = = NULL | | src1Length < - 2 | | src1Length = = 0 | | ( src1Length > 0 & & src1 [ src1Length - 1 ] ! = 0 ) | |
src2 = = NULL | | src2Length < - 2 | | src2Length = = 0 | | ( src2Length > 0 & & src2 [ src2Length - 1 ] ! = 0 ) | |
destCapacity < 0 | | ( destCapacity > 0 & & dest = = NULL )
) {
/* error, attempt to write a zero byte and return 0 */
if ( dest ! = NULL & & destCapacity > 0 ) {
* dest = 0 ;
}
return 0 ;
}
/* check lengths and capacity */
if ( src1Length < 0 ) {
src1Length = ( int32_t ) uprv_strlen ( ( const char * ) src1 ) + 1 ;
}
if ( src2Length < 0 ) {
src2Length = ( int32_t ) uprv_strlen ( ( const char * ) src2 ) + 1 ;
}
destLength = src1Length + src2Length - 1 ;
if ( destLength > destCapacity ) {
/* the merged sort key does not fit into the destination */
return destLength ;
}
/* merge the sort keys with the same number of levels */
while ( * src1 ! = 0 & & * src2 ! = 0 ) { /* while both have another level */
/* copy level from src1 not including 00 or 01 */
while ( ( b = * src1 ) > = 2 ) {
+ + src1 ;
* dest + + = b ;
}
/* add a 02 merge separator */
* dest + + = 2 ;
/* copy level from src2 not including 00 or 01 */
while ( ( b = * src2 ) > = 2 ) {
+ + src2 ;
* dest + + = b ;
}
/* if both sort keys have another level, then add a 01 level separator and continue */
if ( * src1 = = 1 & & * src2 = = 1 ) {
+ + src1 ;
+ + src2 ;
* dest + + = 1 ;
}
}
/*
* here , at least one sort key is finished now , but the other one
* might have some contents left from containing more levels ;
* that contents is just appended to the result
*/
if ( * src1 ! = 0 ) {
/* src1 is not finished, therefore *src2==0, and src1 is appended */
src2 = src1 ;
}
/* append src2, "the other, unfinished sort key" */
uprv_strcpy ( ( char * ) dest , ( const char * ) src2 ) ;
/* trust that neither sort key contained illegally embedded zero bytes */
return destLength ;
}
2001-01-16 00:28:40 +00:00
/* sortkey API */
2001-11-21 01:08:55 +00:00
U_CAPI int32_t U_EXPORT2
2001-01-16 00:28:40 +00:00
ucol_getSortKey ( const UCollator * coll ,
const UChar * source ,
int32_t sourceLength ,
uint8_t * result ,
int32_t resultLength )
{
UErrorCode status = U_ZERO_ERROR ;
2001-03-14 07:49:03 +00:00
/* this uses the function pointer that is set in updateinternalstate */
/* currently, there are two funcs: */
/*ucol_calcSortKey(...);*/
/*ucol_calcSortKeySimpleTertiary(...);*/
int32_t keySize = coll - > sortKeyGen ( coll , source , sourceLength , & result , resultLength , FALSE , & status ) ;
2003-03-27 20:09:38 +00:00
//((UCollator *)coll)->errorCode = status; /*semantically const */
2001-03-14 07:49:03 +00:00
return keySize ;
2001-01-16 00:28:40 +00:00
}
/* this function is called by the C++ API for sortkey generation */
2001-09-27 01:01:30 +00:00
U_CFUNC int32_t
ucol_getSortKeyWithAllocation ( const UCollator * coll ,
const UChar * source , int32_t sourceLength ,
uint8_t * * pResult ,
UErrorCode * pErrorCode ) {
* pResult = 0 ;
return coll - > sortKeyGen ( coll , source , sourceLength , pResult , 0 , TRUE , pErrorCode ) ;
2001-01-15 07:28:54 +00:00
}
2001-09-24 00:20:59 +00:00
# define UCOL_FSEC_BUF_SIZE 256
2001-01-16 00:28:40 +00:00
/* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
/* or if we run out of space while making a sortkey and want to return ASAP */
2002-08-02 06:55:51 +00:00
int32_t ucol_getSortKeySize ( const UCollator * coll , collIterate * s , int32_t currentSize , UColAttributeValue strength , int32_t len ) {
2001-01-05 00:47:25 +00:00
UErrorCode status = U_ZERO_ERROR ;
2001-02-28 19:30:41 +00:00
uint8_t compareSec = ( uint8_t ) ( ( strength > = UCOL_SECONDARY ) ? 0 : 0xFF ) ;
uint8_t compareTer = ( uint8_t ) ( ( strength > = UCOL_TERTIARY ) ? 0 : 0xFF ) ;
uint8_t compareQuad = ( uint8_t ) ( ( strength > = UCOL_QUATERNARY ) ? 0 : 0xFF ) ;
2001-01-05 00:47:25 +00:00
UBool compareIdent = ( strength = = UCOL_IDENTICAL ) ;
2001-01-09 00:52:18 +00:00
UBool doCase = ( coll - > caseLevel = = UCOL_ON ) ;
2001-03-15 02:49:35 +00:00
UBool shifted = ( coll - > alternateHandling = = UCOL_SHIFTED ) ;
2001-10-20 01:09:31 +00:00
//UBool qShifted = shifted && (compareQuad == 0);
2001-10-10 01:48:36 +00:00
UBool doHiragana = ( coll - > hiraganaQ = = UCOL_ON ) & & ( compareQuad = = 0 ) ;
2001-01-26 00:12:23 +00:00
UBool isFrenchSec = ( coll - > frenchCollation = = UCOL_ON ) & & ( compareSec = = 0 ) ;
2001-09-24 00:20:59 +00:00
uint8_t fSecsBuff [ UCOL_FSEC_BUF_SIZE ] ;
uint8_t * fSecs = fSecsBuff ;
uint32_t fSecsLen = 0 , fSecsMaxLen = UCOL_FSEC_BUF_SIZE ;
uint8_t * frenchStartPtr = NULL , * frenchEndPtr = NULL ;
2001-01-09 00:52:18 +00:00
2001-06-26 22:24:10 +00:00
uint32_t variableTopValue = coll - > variableTopValue ;
2001-10-02 16:49:57 +00:00
uint8_t UCOL_COMMON_BOT4 = ( uint8_t ) ( ( coll - > variableTopValue > > 8 ) + 1 ) ;
if ( doHiragana ) {
UCOL_COMMON_BOT4 + + ;
/* allocate one more space for hiragana */
}
2001-02-28 19:30:41 +00:00
uint8_t UCOL_BOT_COUNT4 = ( uint8_t ) ( 0xFF - UCOL_COMMON_BOT4 ) ;
2001-01-09 00:52:18 +00:00
2001-03-22 21:16:20 +00:00
uint32_t order = UCOL_NO_MORE_CES ;
2001-01-09 00:52:18 +00:00
uint8_t primary1 = 0 ;
uint8_t primary2 = 0 ;
2001-01-05 00:47:25 +00:00
uint8_t secondary = 0 ;
uint8_t tertiary = 0 ;
2001-01-09 00:52:18 +00:00
int32_t caseShift = 0 ;
2001-01-18 00:46:19 +00:00
uint32_t c2 = 0 , c3 = 0 , c4 = 0 ; /* variables for compression */
2001-03-02 00:19:43 +00:00
uint8_t caseSwitch = coll - > caseSwitch ;
uint8_t tertiaryMask = coll - > tertiaryMask ;
2001-05-10 22:33:50 +00:00
uint8_t tertiaryCommon = coll - > tertiaryCommon ;
2001-03-02 00:19:43 +00:00
2001-02-26 10:28:56 +00:00
UBool wasShifted = FALSE ;
UBool notIsContinuation = FALSE ;
2001-05-03 23:33:29 +00:00
uint8_t leadPrimary = 0 ;
2001-04-23 01:53:49 +00:00
1999-08-16 21:50:52 +00:00
2001-01-05 00:47:25 +00:00
for ( ; ; ) {
2001-04-18 19:31:05 +00:00
order = ucol_IGetNextCE ( coll , s , & status ) ;
2001-01-24 16:18:48 +00:00
if ( order = = UCOL_NO_MORE_CES ) {
2001-01-09 00:52:18 +00:00
break ;
}
2001-05-29 04:59:29 +00:00
if ( order = = 0 ) {
2001-03-15 02:49:35 +00:00
continue ;
}
1999-08-16 21:50:52 +00:00
2001-05-29 04:59:29 +00:00
notIsContinuation = ! isContinuation ( order ) ;
2001-01-09 00:52:18 +00:00
2001-03-16 19:06:07 +00:00
if ( notIsContinuation ) {
2001-05-10 22:33:50 +00:00
tertiary = ( uint8_t ) ( ( order & UCOL_BYTE_SIZE_MASK ) ) ;
2001-03-16 19:06:07 +00:00
} else {
2001-05-10 22:33:50 +00:00
tertiary = ( uint8_t ) ( ( order & UCOL_REMOVE_CONTINUATION ) ) ;
2001-04-23 01:53:49 +00:00
}
2001-06-25 04:01:49 +00:00
secondary = ( uint8_t ) ( ( order > > = 8 ) & UCOL_BYTE_SIZE_MASK ) ;
primary2 = ( uint8_t ) ( ( order > > = 8 ) & UCOL_BYTE_SIZE_MASK ) ;
2001-05-29 04:59:29 +00:00
primary1 = ( uint8_t ) ( order > > 8 ) ;
2001-01-09 00:52:18 +00:00
2001-06-26 22:24:10 +00:00
if ( shifted & & ( ( notIsContinuation & & order < = variableTopValue & & primary1 > 0 )
2002-06-05 21:14:41 +00:00
| | ( ! notIsContinuation & & wasShifted ) )
| | ( wasShifted & & primary1 = = 0 ) ) { /* amendment to the UCA says that primary ignorables */
/* and other ignorables should be removed if following a shifted code point */
if ( primary1 = = 0 ) { /* if we were shifted and we got an ignorable code point */
/* we should just completely ignore it */
continue ;
}
2001-03-15 02:49:35 +00:00
if ( compareQuad = = 0 ) {
if ( c4 > 0 ) {
currentSize + = ( c2 / UCOL_BOT_COUNT4 ) + 1 ;
c4 = 0 ;
}
2001-01-09 00:52:18 +00:00
currentSize + + ;
2001-03-15 02:49:35 +00:00
if ( primary2 ! = 0 ) {
currentSize + + ;
}
2001-01-05 00:47:25 +00:00
}
2001-02-26 10:28:56 +00:00
wasShifted = TRUE ;
2001-01-09 00:52:18 +00:00
} else {
2001-02-26 10:28:56 +00:00
wasShifted = FALSE ;
2001-01-09 00:52:18 +00:00
/* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
/* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
2001-04-25 18:00:41 +00:00
/* calculate sortkey size */
2001-04-23 23:41:46 +00:00
if ( primary1 ! = UCOL_IGNORABLE ) {
if ( notIsContinuation ) {
2001-04-25 18:00:41 +00:00
if ( leadPrimary = = primary1 ) {
2001-04-24 23:32:40 +00:00
currentSize + + ;
2001-04-25 18:00:41 +00:00
} else {
if ( leadPrimary ! = 0 ) {
2001-04-23 23:41:46 +00:00
currentSize + + ;
}
2001-04-25 18:00:41 +00:00
if ( primary2 = = UCOL_IGNORABLE ) {
/* one byter, not compressed */
currentSize + + ;
leadPrimary = 0 ;
} else if ( primary1 < UCOL_BYTE_FIRST_NON_LATIN_PRIMARY | |
2002-07-02 22:32:14 +00:00
//(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
( primary1 > ( * UCAconsts - > UCA_LAST_NON_VARIABLE > > 24 ) & & primary1 < ( * UCAconsts - > UCA_FIRST_IMPLICIT > > 24 ) ) ) {
2001-04-25 18:00:41 +00:00
/* not compressible */
leadPrimary = 0 ;
currentSize + = 2 ;
} else { /* compress */
2001-05-11 01:13:08 +00:00
leadPrimary = primary1 ;
2001-04-25 18:00:41 +00:00
currentSize + = 2 ;
2001-05-11 01:13:08 +00:00
}
2001-04-23 23:41:46 +00:00
}
2001-04-24 23:32:40 +00:00
} else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
2001-04-23 23:41:46 +00:00
currentSize + + ;
2001-04-24 23:32:40 +00:00
if ( primary2 ! = UCOL_IGNORABLE ) {
currentSize + + ;
}
2001-04-23 23:41:46 +00:00
}
2001-05-11 01:13:08 +00:00
}
2001-01-09 00:52:18 +00:00
if ( secondary > compareSec ) { /* I think that != 0 test should be != IGNORABLE */
2001-01-26 00:12:23 +00:00
if ( ! isFrenchSec ) {
2001-02-26 10:28:56 +00:00
if ( secondary = = UCOL_COMMON2 & & notIsContinuation ) {
2001-01-26 00:12:23 +00:00
c2 + + ;
} else {
if ( c2 > 0 ) {
2001-03-03 03:35:17 +00:00
if ( secondary > UCOL_COMMON2 ) { // not necessary for 4th level.
2001-05-02 23:36:22 +00:00
currentSize + = ( c2 / ( uint32_t ) UCOL_TOP_COUNT2 ) + 1 ;
2001-01-26 00:12:23 +00:00
} else {
2001-05-02 23:36:22 +00:00
currentSize + = ( c2 / ( uint32_t ) UCOL_BOT_COUNT2 ) + 1 ;
2001-01-26 00:12:23 +00:00
}
c2 = 0 ;
2001-01-18 00:46:19 +00:00
}
2001-01-26 00:12:23 +00:00
currentSize + + ;
2001-01-18 00:46:19 +00:00
}
2001-01-26 00:12:23 +00:00
} else {
2001-09-24 00:20:59 +00:00
fSecs [ fSecsLen + + ] = secondary ;
if ( fSecsLen = = fSecsMaxLen ) {
if ( fSecs = = fSecsBuff ) {
fSecs = ( uint8_t * ) uprv_malloc ( 2 * fSecsLen ) ;
} else {
fSecs = ( uint8_t * ) uprv_realloc ( fSecs , 2 * fSecsLen ) ;
}
2002-07-20 06:00:04 +00:00
if ( fSecs = = NULL ) {
status = U_MEMORY_ALLOCATION_ERROR ;
return - 1 ;
}
2001-09-24 00:20:59 +00:00
fSecsMaxLen * = 2 ;
}
if ( notIsContinuation ) {
if ( frenchStartPtr ! = NULL ) {
/* reverse secondaries from frenchStartPtr up to frenchEndPtr */
uprv_ucol_reverse_buffer ( uint8_t , frenchStartPtr , frenchEndPtr ) ;
frenchStartPtr = NULL ;
}
} else {
if ( frenchStartPtr = = NULL ) {
frenchStartPtr = fSecs + fSecsLen - 2 ;
}
frenchEndPtr = fSecs + fSecsLen - 1 ;
}
2001-01-18 00:46:19 +00:00
}
2001-01-05 00:47:25 +00:00
}
2001-01-09 00:52:18 +00:00
if ( doCase ) {
if ( caseShift = = 0 ) {
2001-01-05 00:47:25 +00:00
currentSize + + ;
2001-03-16 19:06:07 +00:00
caseShift = UCOL_CASE_SHIFT_START ;
2001-01-09 00:52:18 +00:00
}
2001-05-10 22:33:50 +00:00
if ( ( tertiary & 0x3F ) > 0 & & notIsContinuation ) {
2001-01-26 00:12:23 +00:00
caseShift - - ;
2001-05-10 22:33:50 +00:00
if ( ( tertiary & 0xC0 ) ! = 0 ) {
if ( caseShift = = 0 ) {
currentSize + + ;
caseShift = UCOL_CASE_SHIFT_START ;
}
caseShift - - ;
2001-05-11 01:13:08 +00:00
}
2001-05-10 22:33:50 +00:00
}
} else {
if ( notIsContinuation ) {
tertiary ^ = caseSwitch ;
2001-01-26 00:12:23 +00:00
}
2001-01-05 00:47:25 +00:00
}
2001-01-09 00:52:18 +00:00
2001-05-10 22:33:50 +00:00
tertiary & = tertiaryMask ;
2001-01-09 00:52:18 +00:00
if ( tertiary > compareTer ) { /* I think that != 0 test should be != IGNORABLE */
2001-05-10 22:33:50 +00:00
if ( tertiary = = tertiaryCommon & & notIsContinuation ) {
2001-01-18 00:46:19 +00:00
c3 + + ;
} else {
if ( c3 > 0 ) {
2001-05-11 01:13:08 +00:00
if ( ( tertiary > tertiaryCommon & & tertiaryCommon = = UCOL_COMMON3_NORMAL )
2001-05-10 22:33:50 +00:00
| | ( tertiary < = tertiaryCommon & & tertiaryCommon = = UCOL_COMMON3_UPPERFIRST ) ) {
currentSize + = ( c3 / ( uint32_t ) coll - > tertiaryTopCount ) + 1 ;
2001-01-18 00:46:19 +00:00
} else {
2001-05-10 22:33:50 +00:00
currentSize + = ( c3 / ( uint32_t ) coll - > tertiaryBottomCount ) + 1 ;
2001-01-18 00:46:19 +00:00
}
c3 = 0 ;
}
currentSize + + ;
}
2001-01-05 00:47:25 +00:00
}
2001-01-09 00:52:18 +00:00
2001-10-02 16:49:57 +00:00
if ( /*qShifted*/ ( compareQuad = = 0 ) & & notIsContinuation ) {
2001-10-05 02:07:51 +00:00
if ( s - > flags & UCOL_WAS_HIRAGANA ) { // This was Hiragana and we need to note it
if ( c4 > 0 ) { // Close this part
currentSize + = ( c4 / UCOL_BOT_COUNT4 ) + 1 ;
c4 = 0 ;
}
currentSize + + ; // Add the Hiragana
} else { // This wasn't Hiragana, so we can continue adding stuff
c4 + + ;
}
2001-01-05 00:47:25 +00:00
}
2001-01-09 00:52:18 +00:00
}
2001-01-05 00:47:25 +00:00
}
2000-11-20 19:17:17 +00:00
2001-09-24 00:20:59 +00:00
if ( ! isFrenchSec ) {
if ( c2 > 0 ) {
2003-01-20 07:43:32 +00:00
currentSize + = ( c2 / ( uint32_t ) UCOL_BOT_COUNT2 ) + ( ( c2 % ( uint32_t ) UCOL_BOT_COUNT2 ! = 0 ) ? 1 : 0 ) ;
2001-09-24 00:20:59 +00:00
}
} else {
uint32_t i = 0 ;
if ( frenchStartPtr ! = NULL ) {
uprv_ucol_reverse_buffer ( uint8_t , frenchStartPtr , frenchEndPtr ) ;
}
for ( i = 0 ; i < fSecsLen ; i + + ) {
secondary = * ( fSecs + fSecsLen - i - 1 ) ;
/* This is compression code. */
if ( secondary = = UCOL_COMMON2 ) {
+ + c2 ;
} else {
if ( c2 > 0 ) {
if ( secondary > UCOL_COMMON2 ) { // not necessary for 4th level.
2003-01-20 07:43:32 +00:00
currentSize + = ( c2 / ( uint32_t ) UCOL_TOP_COUNT2 ) + ( ( c2 % ( uint32_t ) UCOL_TOP_COUNT2 ! = 0 ) ? 1 : 0 ) ;
2001-09-24 00:20:59 +00:00
} else {
2003-01-20 07:43:32 +00:00
currentSize + = ( c2 / ( uint32_t ) UCOL_BOT_COUNT2 ) + ( ( c2 % ( uint32_t ) UCOL_BOT_COUNT2 ! = 0 ) ? 1 : 0 ) ;
2001-09-24 00:20:59 +00:00
}
c2 = 0 ;
}
currentSize + + ;
}
}
if ( c2 > 0 ) {
2003-01-20 07:43:32 +00:00
currentSize + = ( c2 / ( uint32_t ) UCOL_BOT_COUNT2 ) + ( ( c2 % ( uint32_t ) UCOL_BOT_COUNT2 ! = 0 ) ? 1 : 0 ) ;
2001-09-24 00:20:59 +00:00
}
if ( fSecs ! = fSecsBuff ) {
uprv_free ( fSecs ) ;
}
2001-01-23 06:58:22 +00:00
}
if ( c3 > 0 ) {
2003-01-20 07:43:32 +00:00
currentSize + = ( c3 / ( uint32_t ) coll - > tertiaryBottomCount ) + ( ( c3 % ( uint32_t ) coll - > tertiaryBottomCount ! = 0 ) ? 1 : 0 ) ;
2001-01-23 06:58:22 +00:00
}
2001-03-15 02:49:35 +00:00
if ( c4 > 0 & & compareQuad = = 0 ) {
2003-01-20 07:43:32 +00:00
currentSize + = ( c4 / ( uint32_t ) UCOL_BOT_COUNT4 ) + ( ( c4 % ( uint32_t ) UCOL_BOT_COUNT4 ! = 0 ) ? 1 : 0 ) ;
2001-01-23 06:58:22 +00:00
}
2001-01-05 00:47:25 +00:00
if ( compareIdent ) {
2001-05-18 19:49:04 +00:00
currentSize + = u_lengthOfIdenticalLevelRun ( s - > string , len ) ;
2001-04-19 19:02:01 +00:00
}
return currentSize ;
2001-04-06 23:37:48 +00:00
2001-04-19 19:02:01 +00:00
}
1999-08-16 21:50:52 +00:00
2001-10-22 05:30:22 +00:00
static
2001-05-10 22:33:50 +00:00
inline void doCaseShift ( uint8_t * * cases , uint32_t & caseShift ) {
if ( caseShift = = 0 ) {
* ( * cases ) + + = UCOL_CASE_BYTE_START ;
caseShift = UCOL_CASE_SHIFT_START ;
}
}
2001-04-23 01:53:49 +00:00
2002-07-27 05:16:44 +00:00
// Appends value at primaries and advances primaries, but only while primaries is still below limit.
// size is incremented in every case, so it records how many values the caller wanted to add
// even when some of them did not fit.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    if (primaries < limit) {
        *primaries = value;
        ++primaries;
    }
    ++size;
}
// Packs the secondary buffer when processing French locale. Adds the terminator.
2001-10-22 05:30:22 +00:00
static
2002-07-27 05:16:44 +00:00
inline uint8_t * packFrench ( uint8_t * primaries , uint8_t * primEnd , uint8_t * secondaries , uint32_t * secsize , uint8_t * frenchStartPtr , uint8_t * frenchEndPtr ) {
2001-09-24 00:20:59 +00:00
uint8_t secondary ;
int32_t count2 = 0 ;
2002-07-27 05:16:44 +00:00
uint32_t i = 0 , size = 0 ;
// we use i here since the key size already accounts for terminators, so we'll discard the increment
addWithIncrement ( primaries , primEnd , i , UCOL_LEVELTERMINATOR ) ;
2001-09-24 00:20:59 +00:00
/* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
if ( frenchStartPtr ! = NULL ) {
uprv_ucol_reverse_buffer ( uint8_t , frenchStartPtr , frenchEndPtr ) ;
}
for ( i = 0 ; i < * secsize ; i + + ) {
secondary = * ( secondaries - i - 1 ) ;
/* This is compression code. */
if ( secondary = = UCOL_COMMON2 ) {
+ + count2 ;
} else {
if ( count2 > 0 ) {
if ( secondary > UCOL_COMMON2 ) { // not necessary for 4th level.
while ( count2 > UCOL_TOP_COUNT2 ) {
2002-07-27 05:16:44 +00:00
addWithIncrement ( primaries , primEnd , size , ( uint8_t ) ( UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2 ) ) ;
2001-09-24 00:20:59 +00:00
count2 - = ( uint32_t ) UCOL_TOP_COUNT2 ;
}
2002-07-27 05:16:44 +00:00
addWithIncrement ( primaries , primEnd , size , ( uint8_t ) ( UCOL_COMMON_TOP2 - ( count2 - 1 ) ) ) ;
2001-09-24 00:20:59 +00:00
} else {
while ( count2 > UCOL_BOT_COUNT2 ) {
2002-07-27 05:16:44 +00:00
addWithIncrement ( primaries , primEnd , size , ( uint8_t ) ( UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2 ) ) ;
2001-09-24 00:20:59 +00:00
count2 - = ( uint32_t ) UCOL_BOT_COUNT2 ;
}
2002-07-27 05:16:44 +00:00
addWithIncrement ( primaries , primEnd , size , ( uint8_t ) ( UCOL_COMMON_BOT2 + ( count2 - 1 ) ) ) ;
2001-09-24 00:20:59 +00:00
}
count2 = 0 ;
}
2002-07-27 05:16:44 +00:00
addWithIncrement ( primaries , primEnd , size , secondary ) ;
2001-09-24 00:20:59 +00:00
}
}
if ( count2 > 0 ) {
while ( count2 > UCOL_BOT_COUNT2 ) {
2002-07-27 05:16:44 +00:00
addWithIncrement ( primaries , primEnd , size , ( uint8_t ) ( UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2 ) ) ;
2001-09-24 00:20:59 +00:00
count2 - = ( uint32_t ) UCOL_BOT_COUNT2 ;
}
2002-07-27 05:16:44 +00:00
addWithIncrement ( primaries , primEnd , size , ( uint8_t ) ( UCOL_COMMON_BOT2 + ( count2 - 1 ) ) ) ;
2001-09-24 00:20:59 +00:00
}
2002-07-27 05:16:44 +00:00
* secsize = size ;
2001-09-24 00:20:59 +00:00
return primaries ;
}
2001-01-16 00:28:40 +00:00
/* This is the sortkey work horse function */
2002-07-11 17:06:51 +00:00
U_CFUNC int32_t U_CALLCONV
2001-01-15 07:28:54 +00:00
ucol_calcSortKey ( const UCollator * coll ,
2001-01-05 00:47:25 +00:00
const UChar * source ,
int32_t sourceLength ,
uint8_t * * result ,
2001-02-28 19:01:23 +00:00
uint32_t resultLength ,
2001-11-14 21:55:21 +00:00
UBool allocateSKBuffer ,
2001-01-05 06:36:10 +00:00
UErrorCode * status )
2001-01-05 00:47:25 +00:00
{
2001-01-05 06:36:10 +00:00
uint32_t i = 0 ; /* general purpose counter */
1999-08-16 21:50:52 +00:00
2001-01-05 06:36:10 +00:00
/* Stack allocated buffers for buffers we use */
2001-03-14 07:49:03 +00:00
uint8_t prim [ UCOL_PRIMARY_MAX_BUFFER ] , second [ UCOL_SECONDARY_MAX_BUFFER ] , tert [ UCOL_TERTIARY_MAX_BUFFER ] , caseB [ UCOL_CASE_MAX_BUFFER ] , quad [ UCOL_QUAD_MAX_BUFFER ] ;
1999-10-22 00:43:39 +00:00
2001-01-05 06:36:10 +00:00
uint8_t * primaries = * result , * secondaries = second , * tertiaries = tert , * cases = caseB , * quads = quad ;
1999-08-16 21:50:52 +00:00
2001-01-05 06:36:10 +00:00
if ( U_FAILURE ( * status ) ) {
return 0 ;
}
1999-08-16 21:50:52 +00:00
2001-11-14 21:55:21 +00:00
if ( primaries = = NULL & & allocateSKBuffer = = TRUE ) {
2001-03-14 07:49:03 +00:00
primaries = * result = prim ;
resultLength = UCOL_PRIMARY_MAX_BUFFER ;
2001-01-05 00:47:25 +00:00
}
1999-08-16 21:50:52 +00:00
2001-04-23 01:53:49 +00:00
uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER , terSize = UCOL_TERTIARY_MAX_BUFFER ,
2001-03-14 07:49:03 +00:00
caseSize = UCOL_CASE_MAX_BUFFER , quadSize = UCOL_QUAD_MAX_BUFFER ;
1999-08-16 21:50:52 +00:00
2001-02-28 19:01:23 +00:00
uint32_t sortKeySize = 1 ; /* it is always \0 terminated */
1999-08-16 21:50:52 +00:00
2001-03-14 07:49:03 +00:00
UChar normBuffer [ UCOL_NORMALIZATION_MAX_BUFFER ] ;
2001-01-05 00:47:25 +00:00
UChar * normSource = normBuffer ;
2001-03-14 07:49:03 +00:00
int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER ;
1999-08-16 21:50:52 +00:00
2001-03-03 03:35:17 +00:00
int32_t len = ( sourceLength = = - 1 ? u_strlen ( source ) : sourceLength ) ;
1999-08-16 21:50:52 +00:00
2001-01-09 00:52:18 +00:00
UColAttributeValue strength = coll - > strength ;
1999-08-16 21:50:52 +00:00
2001-02-28 19:30:41 +00:00
uint8_t compareSec = ( uint8_t ) ( ( strength > = UCOL_SECONDARY ) ? 0 : 0xFF ) ;
uint8_t compareTer = ( uint8_t ) ( ( strength > = UCOL_TERTIARY ) ? 0 : 0xFF ) ;
uint8_t compareQuad = ( uint8_t ) ( ( strength > = UCOL_QUATERNARY ) ? 0 : 0xFF ) ;
2001-01-05 00:47:25 +00:00
UBool compareIdent = ( strength = = UCOL_IDENTICAL ) ;
2001-01-09 00:52:18 +00:00
UBool doCase = ( coll - > caseLevel = = UCOL_ON ) ;
2001-01-16 00:28:40 +00:00
UBool isFrenchSec = ( coll - > frenchCollation = = UCOL_ON ) & & ( compareSec = = 0 ) ;
2001-03-15 02:49:35 +00:00
UBool shifted = ( coll - > alternateHandling = = UCOL_SHIFTED ) ;
2001-10-20 01:09:31 +00:00
//UBool qShifted = shifted && (compareQuad == 0);
2001-10-10 01:48:36 +00:00
UBool doHiragana = ( coll - > hiraganaQ = = UCOL_ON ) & & ( compareQuad = = 0 ) ;
2001-01-18 00:46:19 +00:00
const uint8_t * scriptOrder = coll - > scriptOrder ;
2001-01-08 06:51:18 +00:00
2001-10-02 16:49:57 +00:00
uint32_t variableTopValue = coll - > variableTopValue ;
2001-10-05 02:07:51 +00:00
// TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
// qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
2001-10-02 16:49:57 +00:00
uint8_t UCOL_COMMON_BOT4 = ( uint8_t ) ( ( coll - > variableTopValue > > 8 ) + 1 ) ;
2001-10-05 02:07:51 +00:00
uint8_t UCOL_HIRAGANA_QUAD = 0 ;
2001-10-02 16:49:57 +00:00
if ( doHiragana ) {
2001-10-05 02:07:51 +00:00
UCOL_HIRAGANA_QUAD = UCOL_COMMON_BOT4 + + ;
/* allocate one more space for hiragana, value for hiragana */
2001-10-02 16:49:57 +00:00
}
uint8_t UCOL_BOT_COUNT4 = ( uint8_t ) ( 0xFF - UCOL_COMMON_BOT4 ) ;
2001-01-08 06:51:18 +00:00
/* support for special features like caselevel and funky secondaries */
uint8_t * frenchStartPtr = NULL ;
uint8_t * frenchEndPtr = NULL ;
uint32_t caseShift = 0 ;
2000-12-07 07:22:55 +00:00
2001-10-02 16:49:57 +00:00
sortKeySize + = ( ( compareSec ? 0 : 1 ) + ( compareTer ? 0 : 1 ) + ( doCase ? 1 : 0 ) + /*(qShifted?1:0)*/ ( compareQuad ? 0 : 1 ) + ( compareIdent ? 1 : 0 ) ) ;
2000-12-07 07:22:55 +00:00
2001-09-27 01:01:30 +00:00
/* If we need to normalize, we'll do it all at once at the beginning! */
UNormalizationMode normMode ;
2001-03-14 02:45:39 +00:00
if ( compareIdent ) {
2001-09-27 01:01:30 +00:00
normMode = UNORM_NFD ;
} else if ( coll - > normalizationMode ! = UCOL_OFF ) {
normMode = UNORM_FCD ;
} else {
normMode = UNORM_NONE ;
}
if ( normMode ! = UNORM_NONE & & UNORM_YES ! = unorm_quickCheck ( source , len , normMode , status ) ) {
2001-10-19 17:36:02 +00:00
len = unorm_internalNormalize ( normSource , normSourceLen ,
2001-09-27 01:01:30 +00:00
source , len ,
normMode , FALSE ,
status ) ;
2001-10-19 17:36:02 +00:00
if ( * status = = U_BUFFER_OVERFLOW_ERROR ) {
normSourceLen = len ;
normSource = ( UChar * ) uprv_malloc ( len * U_SIZEOF_UCHAR ) ;
if ( normSource = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
return 0 ;
}
* status = U_ZERO_ERROR ;
len = unorm_internalNormalize ( normSource , normSourceLen ,
source , len ,
normMode , FALSE ,
status ) ;
}
2001-01-05 06:36:10 +00:00
if ( U_FAILURE ( * status ) ) {
2001-09-27 01:01:30 +00:00
return 0 ;
2001-01-05 00:47:25 +00:00
}
2001-09-27 01:01:30 +00:00
source = normSource ;
}
2000-12-07 07:22:55 +00:00
2001-09-27 01:01:30 +00:00
collIterate s ;
IInit_collIterate ( coll , ( UChar * ) source , len , & s ) ;
if ( source = = normSource ) {
s . flags & = ~ UCOL_ITER_NORM ;
2001-04-06 23:37:48 +00:00
}
2000-12-07 07:22:55 +00:00
2001-03-23 07:48:40 +00:00
if ( resultLength = = 0 | | primaries = = NULL ) {
2002-08-02 06:55:51 +00:00
int32_t keyLen = ucol_getSortKeySize ( coll , & s , sortKeySize , strength , len ) ;
2001-11-07 00:01:29 +00:00
if ( normSource ! = normBuffer ) {
uprv_free ( normSource ) ;
}
return keyLen ;
2000-12-07 07:22:55 +00:00
}
2001-03-23 07:48:40 +00:00
uint8_t * primarySafeEnd = primaries + resultLength - 2 ;
2000-12-07 07:22:55 +00:00
2001-02-28 19:01:23 +00:00
uint32_t minBufferSize = UCOL_MAX_BUFFER ;
2000-11-07 00:00:17 +00:00
2001-01-05 00:47:25 +00:00
uint8_t * primStart = primaries ;
uint8_t * secStart = secondaries ;
uint8_t * terStart = tertiaries ;
2001-01-05 06:36:10 +00:00
uint8_t * caseStart = cases ;
uint8_t * quadStart = quads ;
2000-11-22 23:52:43 +00:00
2001-01-05 00:47:25 +00:00
uint32_t order = 0 ;
2000-11-22 00:04:03 +00:00
2001-01-08 06:51:18 +00:00
uint8_t primary1 = 0 ;
uint8_t primary2 = 0 ;
2001-01-05 00:47:25 +00:00
uint8_t secondary = 0 ;
uint8_t tertiary = 0 ;
2001-03-02 00:19:43 +00:00
uint8_t caseSwitch = coll - > caseSwitch ;
uint8_t tertiaryMask = coll - > tertiaryMask ;
2001-06-26 22:24:10 +00:00
int8_t tertiaryAddition = ( int8_t ) coll - > tertiaryAddition ;
2001-05-10 22:33:50 +00:00
uint8_t tertiaryTop = coll - > tertiaryTop ;
2001-05-17 23:14:44 +00:00
uint8_t tertiaryBottom = coll - > tertiaryBottom ;
2001-05-10 22:33:50 +00:00
uint8_t tertiaryCommon = coll - > tertiaryCommon ;
uint8_t caseBits = 0 ;
2000-11-22 00:04:03 +00:00
2001-01-05 00:47:25 +00:00
UBool finished = FALSE ;
2001-01-24 16:18:48 +00:00
UBool wasShifted = FALSE ;
2001-02-06 00:36:48 +00:00
UBool notIsContinuation = FALSE ;
2000-11-22 23:52:43 +00:00
2001-02-28 19:01:23 +00:00
uint32_t prevBuffSize = 0 ;
2001-01-15 07:28:54 +00:00
2001-01-23 06:58:22 +00:00
uint32_t count2 = 0 , count3 = 0 , count4 = 0 ;
2001-05-03 23:33:29 +00:00
uint8_t leadPrimary = 0 ;
2001-01-18 00:46:19 +00:00
2001-01-05 00:47:25 +00:00
for ( ; ; ) {
for ( i = prevBuffSize ; i < minBufferSize ; + + i ) {
2000-11-07 00:00:17 +00:00
2001-04-18 19:31:05 +00:00
order = ucol_IGetNextCE ( coll , & s , status ) ;
2001-01-24 16:18:48 +00:00
if ( order = = UCOL_NO_MORE_CES ) {
2001-01-05 00:47:25 +00:00
finished = TRUE ;
break ;
}
2000-11-30 23:20:14 +00:00
2001-05-29 04:59:29 +00:00
if ( order = = 0 ) {
2001-03-15 02:49:35 +00:00
continue ;
}
2001-05-29 04:59:29 +00:00
notIsContinuation = ! isContinuation ( order ) ;
2001-01-08 06:51:18 +00:00
2001-05-10 22:33:50 +00:00
if ( notIsContinuation ) {
tertiary = ( uint8_t ) ( order & UCOL_BYTE_SIZE_MASK ) ;
} else {
tertiary = ( uint8_t ) ( ( order & UCOL_REMOVE_CONTINUATION ) ) ;
2001-03-16 19:06:07 +00:00
}
2001-04-23 01:53:49 +00:00
2001-03-14 07:49:03 +00:00
secondary = ( uint8_t ) ( ( order > > = 8 ) & UCOL_BYTE_SIZE_MASK ) ;
primary2 = ( uint8_t ) ( ( order > > = 8 ) & UCOL_BYTE_SIZE_MASK ) ;
2001-05-29 04:59:29 +00:00
primary1 = ( uint8_t ) ( order > > 8 ) ;
2001-01-12 08:18:12 +00:00
2001-02-06 00:36:48 +00:00
if ( notIsContinuation ) {
2001-01-18 00:46:19 +00:00
if ( scriptOrder ! = NULL ) {
primary1 = scriptOrder [ primary1 ] ;
}
2001-01-08 06:51:18 +00:00
}
2001-06-26 22:24:10 +00:00
if ( shifted & & ( ( notIsContinuation & & order < = variableTopValue & & primary1 > 0 )
2002-06-05 21:14:41 +00:00
| | ( ! notIsContinuation & & wasShifted ) )
| | ( wasShifted & & primary1 = = 0 ) ) { /* amendment to the UCA says that primary ignorables */
/* and other ignorables should be removed if following a shifted code point */
if ( primary1 = = 0 ) { /* if we were shifted and we got an ignorable code point */
/* we should just completely ignore it */
continue ;
}
2002-08-02 06:55:51 +00:00
if ( compareQuad = = 0 ) {
if ( count4 > 0 ) {
while ( count4 > UCOL_BOT_COUNT4 ) {
* quads + + = ( uint8_t ) ( UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4 ) ;
count4 - = UCOL_BOT_COUNT4 ;
}
* quads + + = ( uint8_t ) ( UCOL_COMMON_BOT4 + ( count4 - 1 ) ) ;
count4 = 0 ;
}
/* We are dealing with a variable and we're treating them as shifted */
/* This is a shifted ignorable */
if ( primary1 ! = 0 ) { /* we need to check this since we could be in continuation */
* quads + + = primary1 ;
}
if ( primary2 ! = 0 ) {
* quads + + = primary2 ;
2001-03-03 03:35:17 +00:00
}
2001-01-08 06:51:18 +00:00
}
2001-01-24 16:18:48 +00:00
wasShifted = TRUE ;
2001-01-08 06:51:18 +00:00
} else {
2001-01-24 16:18:48 +00:00
wasShifted = FALSE ;
2001-01-08 06:51:18 +00:00
/* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
/* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
2001-05-29 04:59:29 +00:00
/* regular and simple sortkey calc */
2001-04-25 18:00:41 +00:00
if ( primary1 ! = UCOL_IGNORABLE ) {
if ( notIsContinuation ) {
if ( leadPrimary = = primary1 ) {
* primaries + + = primary2 ;
} else {
2001-04-24 23:32:40 +00:00
if ( leadPrimary ! = 0 ) {
2001-08-28 18:53:23 +00:00
* primaries + + = ( uint8_t ) ( ( primary1 > leadPrimary ) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN ) ;
2001-04-24 23:32:40 +00:00
}
2001-04-25 18:00:41 +00:00
if ( primary2 = = UCOL_IGNORABLE ) {
/* one byter, not compressed */
* primaries + + = primary1 ;
leadPrimary = 0 ;
} else if ( primary1 < UCOL_BYTE_FIRST_NON_LATIN_PRIMARY | |
2002-07-02 22:32:14 +00:00
( primary1 > ( * UCAconsts - > UCA_LAST_NON_VARIABLE > > 24 ) & & primary1 < ( * UCAconsts - > UCA_FIRST_IMPLICIT > > 24 ) ) ) {
2001-04-25 18:00:41 +00:00
/* not compressible */
leadPrimary = 0 ;
* primaries + + = primary1 ;
* primaries + + = primary2 ;
} else { /* compress */
2001-05-11 01:13:08 +00:00
* primaries + + = leadPrimary = primary1 ;
2001-04-25 18:00:41 +00:00
* primaries + + = primary2 ;
2001-05-11 01:13:08 +00:00
}
2001-04-23 23:41:46 +00:00
}
2001-04-25 18:00:41 +00:00
} else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
2001-05-11 01:13:08 +00:00
* primaries + + = primary1 ;
2001-04-25 18:00:41 +00:00
if ( primary2 ! = UCOL_IGNORABLE ) {
* primaries + + = primary2 ; /* second part */
2001-04-24 23:32:40 +00:00
}
2001-04-23 23:41:46 +00:00
}
2001-05-11 01:13:08 +00:00
}
2001-01-08 06:51:18 +00:00
2001-04-23 01:53:49 +00:00
if ( secondary > compareSec ) {
2001-01-26 00:12:23 +00:00
if ( ! isFrenchSec ) {
2001-01-18 00:46:19 +00:00
/* This is compression code. */
2001-02-06 00:36:48 +00:00
if ( secondary = = UCOL_COMMON2 & & notIsContinuation ) {
2001-03-03 03:35:17 +00:00
+ + count2 ;
2001-01-12 08:18:12 +00:00
} else {
2001-03-03 03:35:17 +00:00
if ( count2 > 0 ) {
if ( secondary > UCOL_COMMON2 ) { // not necessary for 4th level.
2001-09-21 21:22:44 +00:00
while ( count2 > UCOL_TOP_COUNT2 ) {
2001-05-02 23:36:22 +00:00
* secondaries + + = ( uint8_t ) ( UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2 ) ;
count2 - = ( uint32_t ) UCOL_TOP_COUNT2 ;
2001-03-03 03:35:17 +00:00
}
2001-09-21 21:22:44 +00:00
* secondaries + + = ( uint8_t ) ( UCOL_COMMON_TOP2 - ( count2 - 1 ) ) ;
2001-03-03 03:35:17 +00:00
} else {
2001-09-21 21:22:44 +00:00
while ( count2 > UCOL_BOT_COUNT2 ) {
2001-05-02 23:36:22 +00:00
* secondaries + + = ( uint8_t ) ( UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2 ) ;
count2 - = ( uint32_t ) UCOL_BOT_COUNT2 ;
2001-03-03 03:35:17 +00:00
}
2001-09-21 21:22:44 +00:00
* secondaries + + = ( uint8_t ) ( UCOL_COMMON_BOT2 + ( count2 - 1 ) ) ;
2001-03-03 03:35:17 +00:00
}
count2 = 0 ;
}
2001-01-12 08:18:12 +00:00
* secondaries + + = secondary ;
2001-01-26 00:12:23 +00:00
}
} else {
* secondaries + + = secondary ;
/* Do the special handling for French secondaries */
/* We need to get continuation elements and do intermediate restore */
/* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
2001-06-06 23:26:50 +00:00
if ( notIsContinuation ) {
2001-05-29 04:59:29 +00:00
if ( frenchStartPtr ! = NULL ) {
/* reverse secondaries from frenchStartPtr up to frenchEndPtr */
uprv_ucol_reverse_buffer ( uint8_t , frenchStartPtr , frenchEndPtr ) ;
frenchStartPtr = NULL ;
}
} else {
2001-01-26 00:12:23 +00:00
if ( frenchStartPtr = = NULL ) {
frenchStartPtr = secondaries - 2 ;
2001-01-08 06:51:18 +00:00
}
2001-01-26 00:12:23 +00:00
frenchEndPtr = secondaries - 1 ;
2001-06-06 23:26:50 +00:00
}
2001-04-23 01:53:49 +00:00
}
2001-01-26 00:12:23 +00:00
}
if ( doCase ) {
2001-05-10 22:33:50 +00:00
doCaseShift ( & cases , caseShift ) ;
2001-03-16 19:06:07 +00:00
if ( notIsContinuation ) {
2001-08-28 18:53:23 +00:00
caseBits = ( uint8_t ) ( tertiary & 0xC0 ) ;
2001-05-10 22:33:50 +00:00
2001-03-16 19:06:07 +00:00
if ( tertiary ! = 0 ) {
2001-05-10 22:33:50 +00:00
if ( coll - > caseFirst = = UCOL_UPPER_FIRST ) {
if ( ( caseBits & 0xC0 ) = = 0 ) {
* ( cases - 1 ) | = 1 < < ( - - caseShift ) ;
} else {
* ( cases - 1 ) | = 0 < < ( - - caseShift ) ;
/* second bit */
doCaseShift ( & cases , caseShift ) ;
* ( cases - 1 ) | = ( ( caseBits > > 6 ) & 1 ) < < ( - - caseShift ) ;
}
} else {
if ( ( caseBits & 0xC0 ) = = 0 ) {
* ( cases - 1 ) | = 0 < < ( - - caseShift ) ;
} else {
* ( cases - 1 ) | = 1 < < ( - - caseShift ) ;
/* second bit */
doCaseShift ( & cases , caseShift ) ;
* ( cases - 1 ) | = ( ( caseBits > > 7 ) & 1 ) < < ( - - caseShift ) ;
}
}
2001-03-16 19:06:07 +00:00
}
2001-05-10 22:33:50 +00:00
}
} else {
if ( notIsContinuation ) {
tertiary ^ = caseSwitch ;
2001-01-08 06:51:18 +00:00
}
2001-01-25 06:51:18 +00:00
}
2001-01-26 00:12:23 +00:00
2001-05-10 22:33:50 +00:00
tertiary & = tertiaryMask ;
2001-04-23 01:53:49 +00:00
if ( tertiary > compareTer ) {
2001-01-25 06:51:18 +00:00
/* This is compression code. */
/* sequence size check is included in the if clause */
2001-05-10 22:33:50 +00:00
if ( tertiary = = tertiaryCommon & & notIsContinuation ) {
2001-03-03 03:35:17 +00:00
+ + count3 ;
2001-01-25 06:51:18 +00:00
} else {
2001-05-11 01:13:08 +00:00
if ( ( tertiary > tertiaryCommon & & tertiaryCommon = = UCOL_COMMON3_NORMAL )
2001-05-10 22:33:50 +00:00
| | ( tertiary < = tertiaryCommon & & tertiaryCommon = = UCOL_COMMON3_UPPERFIRST ) ) {
tertiary + = tertiaryAddition ;
2001-01-25 06:51:18 +00:00
}
2001-03-03 03:35:17 +00:00
if ( count3 > 0 ) {
2001-05-17 23:14:44 +00:00
if ( ( tertiary > tertiaryCommon ) ) {
2001-09-21 21:22:44 +00:00
while ( count3 > coll - > tertiaryTopCount ) {
2001-05-10 22:33:50 +00:00
* tertiaries + + = ( uint8_t ) ( tertiaryTop - coll - > tertiaryTopCount ) ;
count3 - = ( uint32_t ) coll - > tertiaryTopCount ;
2001-03-03 03:35:17 +00:00
}
2001-09-21 21:22:44 +00:00
* tertiaries + + = ( uint8_t ) ( tertiaryTop - ( count3 - 1 ) ) ;
2001-03-03 03:35:17 +00:00
} else {
2001-09-21 21:22:44 +00:00
while ( count3 > coll - > tertiaryBottomCount ) {
2001-05-17 23:14:44 +00:00
* tertiaries + + = ( uint8_t ) ( tertiaryBottom + coll - > tertiaryBottomCount ) ;
2001-05-10 22:33:50 +00:00
count3 - = ( uint32_t ) coll - > tertiaryBottomCount ;
2001-03-03 03:35:17 +00:00
}
2001-09-21 21:22:44 +00:00
* tertiaries + + = ( uint8_t ) ( tertiaryBottom + ( count3 - 1 ) ) ;
2001-03-03 03:35:17 +00:00
}
count3 = 0 ;
}
2001-01-25 06:51:18 +00:00
* tertiaries + + = tertiary ;
2001-01-12 08:18:12 +00:00
}
2001-01-08 06:51:18 +00:00
}
2001-02-07 00:57:39 +00:00
2001-10-05 02:07:51 +00:00
if ( /*qShifted*/ ( compareQuad = = 0 ) & & notIsContinuation ) {
if ( s . flags & UCOL_WAS_HIRAGANA ) { // This was Hiragana and we need to note it
if ( count4 > 0 ) { // Close this part
while ( count4 > UCOL_BOT_COUNT4 ) {
* quads + + = ( uint8_t ) ( UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4 ) ;
count4 - = UCOL_BOT_COUNT4 ;
}
* quads + + = ( uint8_t ) ( UCOL_COMMON_BOT4 + ( count4 - 1 ) ) ;
count4 = 0 ;
}
* quads + + = UCOL_HIRAGANA_QUAD ; // Add the Hiragana
} else { // This wasn't Hiragana, so we can continue adding stuff
count4 + + ;
}
2001-01-25 06:51:18 +00:00
}
2001-01-08 06:51:18 +00:00
}
2001-02-06 00:36:48 +00:00
if ( primaries > primarySafeEnd ) { /* We have stepped over the primary buffer */
2001-11-14 21:55:21 +00:00
if ( allocateSKBuffer = = FALSE ) { /* need to save our butts if we cannot reallocate */
2002-08-02 06:55:51 +00:00
IInit_collIterate ( coll , ( UChar * ) source , len , & s ) ;
if ( source = = normSource ) {
s . flags & = ~ UCOL_ITER_NORM ;
}
sortKeySize = ucol_getSortKeySize ( coll , & s , sortKeySize , strength , len ) ;
2002-07-27 05:16:44 +00:00
* status = U_BUFFER_OVERFLOW_ERROR ;
2001-01-08 06:51:18 +00:00
finished = TRUE ;
break ;
} else { /* It's much nicer if we can actually reallocate */
2002-08-02 06:55:51 +00:00
int32_t sks = sortKeySize + ( primaries - primStart ) + ( secondaries - secStart ) + ( tertiaries - terStart ) + ( cases - caseStart ) + ( quads - quadStart ) ;
2001-03-14 07:49:03 +00:00
primStart = reallocateBuffer ( & primaries , * result , prim , & resultLength , 2 * sks , status ) ;
2002-07-27 05:16:44 +00:00
if ( U_SUCCESS ( * status ) ) {
* result = primStart ;
primarySafeEnd = primStart + resultLength - 2 ;
} else {
2002-08-02 06:55:51 +00:00
IInit_collIterate ( coll , ( UChar * ) source , len , & s ) ;
if ( source = = normSource ) {
s . flags & = ~ UCOL_ITER_NORM ;
}
sortKeySize = ucol_getSortKeySize ( coll , & s , sortKeySize , strength , len ) ;
2002-07-27 05:16:44 +00:00
finished = TRUE ;
break ;
}
2001-01-08 06:51:18 +00:00
}
2001-01-05 00:47:25 +00:00
}
}
if ( finished ) {
break ;
} else {
2001-01-05 06:36:10 +00:00
prevBuffSize = minBufferSize ;
2001-03-14 07:49:03 +00:00
secStart = reallocateBuffer ( & secondaries , secStart , second , & secSize , 2 * secSize , status ) ;
terStart = reallocateBuffer ( & tertiaries , terStart , tert , & terSize , 2 * terSize , status ) ;
caseStart = reallocateBuffer ( & cases , caseStart , caseB , & caseSize , 2 * caseSize , status ) ;
quadStart = reallocateBuffer ( & quads , quadStart , quad , & quadSize , 2 * quadSize , status ) ;
2001-01-05 06:36:10 +00:00
minBufferSize * = 2 ;
2002-07-27 05:16:44 +00:00
if ( U_FAILURE ( * status ) ) { // if we cannot reallocate buffers, we can at least give the sortkey size
2002-08-02 06:55:51 +00:00
IInit_collIterate ( coll , ( UChar * ) source , len , & s ) ;
if ( source = = normSource ) {
s . flags & = ~ UCOL_ITER_NORM ;
}
sortKeySize = ucol_getSortKeySize ( coll , & s , sortKeySize , strength , len ) ;
2002-07-27 05:16:44 +00:00
break ;
}
2000-11-30 23:20:14 +00:00
}
}
2000-12-14 01:11:11 +00:00
2001-03-14 07:49:03 +00:00
/* Here, we are generally done with processing */
/* bailing out would not be too productive */
2001-01-05 06:36:10 +00:00
if ( U_SUCCESS ( * status ) ) {
2001-02-06 06:50:16 +00:00
sortKeySize + = ( primaries - primStart ) ;
2001-01-05 06:36:10 +00:00
/* we have done all the CE's, now let's put them together to form a key */
2001-01-09 00:52:18 +00:00
if ( compareSec = = 0 ) {
2001-03-03 03:35:17 +00:00
if ( count2 > 0 ) {
2001-09-21 21:22:44 +00:00
while ( count2 > UCOL_BOT_COUNT2 ) {
2001-05-02 23:36:22 +00:00
* secondaries + + = ( uint8_t ) ( UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2 ) ;
count2 - = ( uint32_t ) UCOL_BOT_COUNT2 ;
2001-03-03 03:35:17 +00:00
}
2001-09-21 21:22:44 +00:00
* secondaries + + = ( uint8_t ) ( UCOL_COMMON_BOT2 + ( count2 - 1 ) ) ;
2001-03-03 03:35:17 +00:00
}
2001-01-05 06:36:10 +00:00
uint32_t secsize = secondaries - secStart ;
2002-07-27 05:16:44 +00:00
if ( ! isFrenchSec ) { // Regular situation, we know the length of secondaries
sortKeySize + = secsize ;
if ( sortKeySize < = resultLength ) {
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
2001-04-23 01:53:49 +00:00
uprv_memcpy ( primaries , secStart , secsize ) ;
2001-03-14 07:49:03 +00:00
primaries + = secsize ;
2002-07-27 05:16:44 +00:00
} else {
if ( allocateSKBuffer = = TRUE ) { /* need to save our butts if we cannot reallocate */
primStart = reallocateBuffer ( & primaries , * result , prim , & resultLength , 2 * sortKeySize , status ) ;
if ( U_SUCCESS ( * status ) ) {
* result = primStart ;
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
uprv_memcpy ( primaries , secStart , secsize ) ;
primaries + = secsize ;
}
} else {
* status = U_BUFFER_OVERFLOW_ERROR ;
}
2001-03-14 07:49:03 +00:00
}
2002-07-27 05:16:44 +00:00
} else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
uint8_t * newPrim = packFrench ( primaries , primStart + resultLength , secondaries , & secsize , frenchStartPtr , frenchEndPtr ) ;
sortKeySize + = secsize ;
if ( sortKeySize < = resultLength ) { // if we managed to pack fine
primaries = newPrim ; // update the primary pointer
} else { // overflow, need to reallocate and redo
if ( allocateSKBuffer = = TRUE ) { /* need to save our butts if we cannot reallocate */
primStart = reallocateBuffer ( & primaries , * result , prim , & resultLength , 2 * sortKeySize , status ) ;
if ( U_SUCCESS ( * status ) ) {
primaries = packFrench ( primaries , primStart + resultLength , secondaries , & secsize , frenchStartPtr , frenchEndPtr ) ;
}
2001-04-23 01:53:49 +00:00
} else {
2002-07-27 05:16:44 +00:00
* status = U_BUFFER_OVERFLOW_ERROR ;
2001-03-14 07:49:03 +00:00
}
2001-01-05 00:47:25 +00:00
}
2001-01-08 06:51:18 +00:00
}
2001-01-05 06:36:10 +00:00
}
2000-12-14 01:11:11 +00:00
2001-01-05 06:36:10 +00:00
if ( doCase ) {
uint32_t casesize = cases - caseStart ;
2001-02-06 06:50:16 +00:00
sortKeySize + = casesize ;
2001-03-14 07:49:03 +00:00
if ( sortKeySize < = resultLength ) {
2002-07-27 05:16:44 +00:00
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
2001-03-14 07:49:03 +00:00
uprv_memcpy ( primaries , caseStart , casesize ) ;
primaries + = casesize ;
} else {
2001-11-14 21:55:21 +00:00
if ( allocateSKBuffer = = TRUE ) {
2001-03-14 07:49:03 +00:00
primStart = reallocateBuffer ( & primaries , * result , prim , & resultLength , 2 * sortKeySize , status ) ;
2002-07-27 05:16:44 +00:00
if ( U_SUCCESS ( * status ) ) {
* result = primStart ;
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
uprv_memcpy ( primaries , caseStart , casesize ) ;
}
2001-03-14 07:49:03 +00:00
} else {
2002-07-27 05:16:44 +00:00
* status = U_BUFFER_OVERFLOW_ERROR ;
2001-03-14 07:49:03 +00:00
}
}
2001-01-05 06:36:10 +00:00
}
2001-01-05 00:47:25 +00:00
2001-01-09 00:52:18 +00:00
if ( compareTer = = 0 ) {
2001-03-03 03:35:17 +00:00
if ( count3 > 0 ) {
2001-05-10 22:33:50 +00:00
if ( coll - > tertiaryCommon ! = UCOL_COMMON_BOT3 ) {
while ( count3 > = coll - > tertiaryTopCount ) {
* tertiaries + + = ( uint8_t ) ( tertiaryTop - coll - > tertiaryTopCount ) ;
count3 - = ( uint32_t ) coll - > tertiaryTopCount ;
}
* tertiaries + + = ( uint8_t ) ( tertiaryTop - count3 ) ;
} else {
2001-09-21 21:22:44 +00:00
while ( count3 > coll - > tertiaryBottomCount ) {
2001-05-17 23:14:44 +00:00
* tertiaries + + = ( uint8_t ) ( tertiaryBottom + coll - > tertiaryBottomCount ) ;
2001-05-10 22:33:50 +00:00
count3 - = ( uint32_t ) coll - > tertiaryBottomCount ;
}
2001-09-21 21:22:44 +00:00
* tertiaries + + = ( uint8_t ) ( tertiaryBottom + ( count3 - 1 ) ) ;
2001-03-03 03:35:17 +00:00
}
}
2001-01-05 06:36:10 +00:00
uint32_t tersize = tertiaries - terStart ;
2001-02-06 06:50:16 +00:00
sortKeySize + = tersize ;
2001-03-14 07:49:03 +00:00
if ( sortKeySize < = resultLength ) {
2002-07-27 05:16:44 +00:00
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
2001-03-14 07:49:03 +00:00
uprv_memcpy ( primaries , terStart , tersize ) ;
primaries + = tersize ;
2002-07-27 05:16:44 +00:00
} else {
if ( allocateSKBuffer = = TRUE ) {
primStart = reallocateBuffer ( & primaries , * result , prim , & resultLength , 2 * sortKeySize , status ) ;
if ( U_SUCCESS ( * status ) ) {
* result = primStart ;
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
uprv_memcpy ( primaries , terStart , tersize ) ;
}
} else {
* status = U_BUFFER_OVERFLOW_ERROR ;
}
}
if ( compareQuad = = 0 /*qShifted == TRUE*/ ) {
if ( count4 > 0 ) {
while ( count4 > UCOL_BOT_COUNT4 ) {
* quads + + = ( uint8_t ) ( UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4 ) ;
count4 - = UCOL_BOT_COUNT4 ;
2001-03-03 03:35:17 +00:00
}
2002-07-27 05:16:44 +00:00
* quads + + = ( uint8_t ) ( UCOL_COMMON_BOT4 + ( count4 - 1 ) ) ;
}
uint32_t quadsize = quads - quadStart ;
sortKeySize + = quadsize ;
if ( sortKeySize < = resultLength ) {
2001-03-14 07:49:03 +00:00
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
2002-07-27 05:16:44 +00:00
uprv_memcpy ( primaries , quadStart , quadsize ) ;
primaries + = quadsize ;
} else {
if ( allocateSKBuffer = = TRUE ) {
primStart = reallocateBuffer ( & primaries , * result , prim , & resultLength , 2 * sortKeySize , status ) ;
if ( U_SUCCESS ( * status ) ) {
2001-03-14 07:49:03 +00:00
* result = primStart ;
2002-07-27 05:16:44 +00:00
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
2001-03-14 07:49:03 +00:00
uprv_memcpy ( primaries , quadStart , quadsize ) ;
}
2002-07-27 05:16:44 +00:00
} else {
* status = U_BUFFER_OVERFLOW_ERROR ;
2001-03-14 07:49:03 +00:00
}
2002-07-27 05:16:44 +00:00
}
2001-01-18 00:46:19 +00:00
}
2001-01-05 06:36:10 +00:00
2001-01-18 00:46:19 +00:00
if ( compareIdent ) {
2001-05-29 04:59:29 +00:00
sortKeySize + = u_lengthOfIdenticalLevelRun ( s . string , len ) ;
if ( sortKeySize < = resultLength ) {
2002-07-27 05:16:44 +00:00
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
2001-05-29 04:59:29 +00:00
primaries + = u_writeIdenticalLevelRun ( s . string , len , primaries ) ;
} else {
2001-11-14 21:55:21 +00:00
if ( allocateSKBuffer = = TRUE ) {
2001-05-29 04:59:29 +00:00
primStart = reallocateBuffer ( & primaries , * result , prim , & resultLength , sortKeySize , status ) ;
2002-07-27 05:16:44 +00:00
if ( U_SUCCESS ( * status ) ) {
* result = primStart ;
* ( primaries + + ) = UCOL_LEVELTERMINATOR ;
u_writeIdenticalLevelRun ( s . string , len , primaries ) ;
}
} else {
* status = U_BUFFER_OVERFLOW_ERROR ;
2001-05-29 04:59:29 +00:00
}
2001-02-06 06:50:16 +00:00
}
2001-04-19 19:02:01 +00:00
}
2001-01-05 06:36:10 +00:00
}
* ( primaries + + ) = ' \0 ' ;
}
2001-01-05 00:47:25 +00:00
if ( terStart ! = tert ) {
uprv_free ( terStart ) ;
uprv_free ( secStart ) ;
2001-01-05 06:36:10 +00:00
uprv_free ( caseStart ) ;
uprv_free ( quadStart ) ;
2001-01-05 00:47:25 +00:00
}
2001-01-05 06:36:10 +00:00
2001-01-05 00:47:25 +00:00
if ( normSource ! = normBuffer ) {
uprv_free ( normSource ) ;
}
2001-11-14 21:55:21 +00:00
if ( allocateSKBuffer = = TRUE ) {
2001-03-14 07:49:03 +00:00
* result = ( uint8_t * ) uprv_malloc ( sortKeySize ) ;
2002-07-02 15:10:30 +00:00
/* test for NULL */
2002-06-29 09:32:36 +00:00
if ( * result = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
2002-07-27 05:16:44 +00:00
return sortKeySize ;
2002-06-29 09:32:36 +00:00
}
2001-03-14 07:49:03 +00:00
uprv_memcpy ( * result , primStart , sortKeySize ) ;
if ( primStart ! = prim ) {
uprv_free ( primStart ) ;
}
}
2001-01-05 00:47:25 +00:00
return sortKeySize ;
}
2001-05-10 22:12:53 +00:00
2002-07-11 17:06:51 +00:00
/**
 * Builds a sort key containing exactly three levels (primary, secondary,
 * tertiary), each run-length compressed, followed by a terminating zero byte.
 *
 * @param coll             collator supplying normalization mode, strength and
 *                         the tertiary/case parameters read below
 * @param source           text to generate the key for
 * @param sourceLength     length of source, passed through unchanged to the
 *                         normalizer and the collation iterator
 * @param result           in: destination buffer (may point to NULL);
 *                         out: when allocateSKBuffer is TRUE, replaced by a
 *                         uprv_malloc()ed buffer holding the key
 * @param resultLength     capacity of *result in bytes
 * @param allocateSKBuffer TRUE if this function may grow/allocate the buffer;
 *                         FALSE if the caller's buffer must be used as is
 * @param status           error code; nothing is done if it already indicates
 *                         a failure. Set to U_BUFFER_OVERFLOW_ERROR when the
 *                         key does not fit and the buffer may not be grown,
 *                         U_MEMORY_ALLOCATION_ERROR when an allocation fails.
 * @return the size of the sort key in bytes (the required size when the key
 *         did not fit), or 0 if status was already a failure on entry or
 *         normalization failed
 */
U_CFUNC int32_t U_CALLCONV
ucol_calcSortKeySimpleTertiary(const UCollator *coll,
                               const UChar *source,
                               int32_t sourceLength,
                               uint8_t **result,
                               uint32_t resultLength,
                               UBool allocateSKBuffer,
                               UErrorCode *status)
{
    U_ALIGN_CODE(16);
    uint32_t i = 0; /* general purpose counter */
    /* Stack allocated buffers for the three levels */
    uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
    uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
    if(U_FAILURE(*status)) {
        return 0;
    }
    if(primaries == NULL && allocateSKBuffer == TRUE) {
        /* no caller buffer: start on the stack, the key is copied out at the end */
        primaries = *result = prim;
        resultLength = UCOL_PRIMARY_MAX_BUFFER;
    }
    uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;

    uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */

    UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
    UChar *normSource = normBuffer;
    int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;

    int32_t len = sourceLength;

    /* If we need to normalize, we'll do it all at once at the beginning! */
    if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
        len = unorm_internalNormalize(normSource, normSourceLen,
                                      source, len,
                                      UNORM_FCD, FALSE,
                                      status);
        if(*status == U_BUFFER_OVERFLOW_ERROR) {
            /* len now holds the required output length, not the input length */
            normSourceLen = len;
            normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
            if(normSource == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
                return 0;
            }
            *status = U_ZERO_ERROR;
            /* the input length is still sourceLength; len was overwritten above */
            len = unorm_internalNormalize(normSource, normSourceLen,
                                          source, sourceLength,
                                          UNORM_FCD, FALSE,
                                          status);
        }
        if(U_FAILURE(*status)) {
            /* do not leak the heap buffer allocated for the retry */
            if(normSource != normBuffer) {
                uprv_free(normSource);
            }
            return 0;
        }
        source = normSource;
    }

    collIterate s;
    IInit_collIterate(coll, (UChar *)source, len, &s);
    if(source == normSource) {
        /* text is already normalized, the iterator need not do it again */
        s.flags &= ~UCOL_ITER_NORM;
    }

    if(resultLength == 0 || primaries == NULL) {
        /* nowhere to write: only report the size */
        int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
        if(normSource != normBuffer) {
            uprv_free(normSource);
        }
        return t;
    }

    uint8_t *primarySafeEnd = primaries + resultLength - 2;

    uint32_t minBufferSize = UCOL_MAX_BUFFER;

    uint8_t *primStart = primaries;
    uint8_t *secStart = secondaries;
    uint8_t *terStart = tertiaries;

    uint32_t order = 0;

    uint8_t primary1 = 0;
    uint8_t primary2 = 0;
    uint8_t secondary = 0;
    uint8_t tertiary = 0;
    uint8_t caseSwitch = coll->caseSwitch;
    uint8_t tertiaryMask = coll->tertiaryMask;
    int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
    uint8_t tertiaryTop = coll->tertiaryTop;
    uint8_t tertiaryBottom = coll->tertiaryBottom;
    uint8_t tertiaryCommon = coll->tertiaryCommon;

    uint32_t prevBuffSize = 0;

    UBool finished = FALSE;
    UBool notIsContinuation = FALSE;

    uint32_t count2 = 0, count3 = 0; /* pending runs of common secondary/tertiary weights */
    uint8_t leadPrimary = 0;         /* lead byte currently being compressed, 0 if none */

    for(;;) {
        for(i = prevBuffSize; i < minBufferSize; ++i) {

            order = ucol_IGetNextCE(coll, &s, status);

            if(order == 0) {
                continue;
            }

            if(order == UCOL_NO_MORE_CES) {
                finished = TRUE;
                break;
            }

            notIsContinuation = !isContinuation(order);

            if(notIsContinuation) {
                tertiary = (uint8_t)((order & tertiaryMask));
            } else {
                tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
            }
            /* peel the CE apart one byte at a time, low byte first */
            secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
            primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
            primary1 = (uint8_t)(order >> 8);

            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
            /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
            /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
            /* regular and simple sortkey calc */
            if(primary1 != UCOL_IGNORABLE) {
                if(notIsContinuation) {
                    if(leadPrimary == primary1) {
                        /* same lead byte as the previous CE: emit only the trail byte */
                        *primaries++ = primary2;
                    } else {
                        if(leadPrimary != 0) {
                            /* close the compressed run with a byte ordering it against the new lead */
                            *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
                        }
                        if(primary2 == UCOL_IGNORABLE) {
                            /* one byter, not compressed */
                            *primaries++ = primary1;
                            leadPrimary = 0;
                        } else if(primary1 < UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
                                  (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE >> 24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT >> 24))) {
                            /* not compressible */
                            leadPrimary = 0;
                            *primaries++ = primary1;
                            *primaries++ = primary2;
                        } else { /* compress */
                            *primaries++ = leadPrimary = primary1;
                            *primaries++ = primary2;
                        }
                    }
                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
                    *primaries++ = primary1;
                    if(primary2 != UCOL_IGNORABLE) {
                        *primaries++ = primary2; /* second part */
                    }
                }
            }

            if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
                /* This is compression code. */
                if(secondary == UCOL_COMMON2 && notIsContinuation) {
                    ++count2;
                } else {
                    if(count2 > 0) {
                        /* flush the pending run of common secondaries */
                        if(secondary > UCOL_COMMON2) { // not necessary for 4th level.
                            while(count2 > UCOL_TOP_COUNT2) {
                                *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
                                count2 -= (uint32_t)UCOL_TOP_COUNT2;
                            }
                            *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2 - 1));
                        } else {
                            while(count2 > UCOL_BOT_COUNT2) {
                                *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
                                count2 -= (uint32_t)UCOL_BOT_COUNT2;
                            }
                            *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2 - 1));
                        }
                        count2 = 0;
                    }
                    *secondaries++ = secondary;
                }
            }

            if(notIsContinuation) {
                tertiary ^= caseSwitch;
            }

            if(tertiary > 0) {
                /* This is compression code. */
                /* sequence size check is included in the if clause */
                if(tertiary == tertiaryCommon && notIsContinuation) {
                    ++count3;
                } else {
                    if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
                        tertiary += tertiaryAddition;
                    } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
                        tertiary -= tertiaryAddition;
                    }
                    if(count3 > 0) {
                        /* flush the pending run of common tertiaries */
                        if((tertiary > tertiaryCommon)) {
                            while(count3 > coll->tertiaryTopCount) {
                                *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
                                count3 -= (uint32_t)coll->tertiaryTopCount;
                            }
                            *tertiaries++ = (uint8_t)(tertiaryTop - (count3 - 1));
                        } else {
                            while(count3 > coll->tertiaryBottomCount) {
                                *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
                                count3 -= (uint32_t)coll->tertiaryBottomCount;
                            }
                            *tertiaries++ = (uint8_t)(tertiaryBottom + (count3 - 1));
                        }
                        count3 = 0;
                    }
                    *tertiaries++ = tertiary;
                }
            }

            if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
                if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
                    /* restart the iterator and just measure the key */
                    IInit_collIterate(coll, (UChar *)source, len, &s);
                    if(source == normSource) {
                        s.flags &= ~UCOL_ITER_NORM;
                    }
                    sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
                    *status = U_BUFFER_OVERFLOW_ERROR;
                    finished = TRUE;
                    break;
                } else { /* It's much nicer if we can actually reallocate */
                    int32_t sks = sortKeySize + (primaries - primStart) + (secondaries - secStart) + (tertiaries - terStart);
                    primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
                    if(U_SUCCESS(*status)) {
                        *result = primStart;
                        primarySafeEnd = primStart + resultLength - 2;
                    } else {
                        IInit_collIterate(coll, (UChar *)source, len, &s);
                        if(source == normSource) {
                            s.flags &= ~UCOL_ITER_NORM;
                        }
                        sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
                        finished = TRUE;
                        break;
                    }
                }
            }
        }
        if(finished) {
            break;
        } else {
            /* processed minBufferSize CEs: double the secondary and tertiary buffers */
            prevBuffSize = minBufferSize;
            secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
            terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
            minBufferSize *= 2;
            if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
                IInit_collIterate(coll, (UChar *)source, len, &s);
                if(source == normSource) {
                    s.flags &= ~UCOL_ITER_NORM;
                }
                sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
                break;
            }
        }
    }

    if(U_SUCCESS(*status)) {
        sortKeySize += (primaries - primStart);
        /* we have done all the CE's, now let's put them together to form a key */
        if(count2 > 0) {
            while(count2 > UCOL_BOT_COUNT2) {
                *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
                count2 -= (uint32_t)UCOL_BOT_COUNT2;
            }
            *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2 - 1));
        }
        uint32_t secsize = secondaries - secStart;
        sortKeySize += secsize;
        if(sortKeySize <= resultLength) {
            *(primaries++) = UCOL_LEVELTERMINATOR;
            uprv_memcpy(primaries, secStart, secsize);
            primaries += secsize;
        } else {
            if(allocateSKBuffer == TRUE) {
                primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
                if(U_SUCCESS(*status)) {
                    *(primaries++) = UCOL_LEVELTERMINATOR;
                    *result = primStart;
                    uprv_memcpy(primaries, secStart, secsize);
                    /* step past the copied level so the tertiaries do not overwrite it */
                    primaries += secsize;
                }
            } else {
                *status = U_BUFFER_OVERFLOW_ERROR;
            }
        }

        /* NOTE(review): processing deliberately continues after an overflow above */
        /* so that sortKeySize still accumulates the full required size.           */
        if(count3 > 0) {
            if(coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
                while(count3 >= coll->tertiaryTopCount) {
                    *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
                    count3 -= (uint32_t)coll->tertiaryTopCount;
                }
                *tertiaries++ = (uint8_t)(tertiaryTop - count3);
            } else {
                while(count3 > coll->tertiaryBottomCount) {
                    *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
                    count3 -= (uint32_t)coll->tertiaryBottomCount;
                }
                *tertiaries++ = (uint8_t)(tertiaryBottom + (count3 - 1));
            }
        }
        uint32_t tersize = tertiaries - terStart;
        sortKeySize += tersize;
        if(sortKeySize <= resultLength) {
            *(primaries++) = UCOL_LEVELTERMINATOR;
            uprv_memcpy(primaries, terStart, tersize);
            primaries += tersize;
        } else {
            if(allocateSKBuffer == TRUE) {
                primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
                if(U_SUCCESS(*status)) {
                    *result = primStart;
                    *(primaries++) = UCOL_LEVELTERMINATOR;
                    uprv_memcpy(primaries, terStart, tersize);
                    /* step past the copied level so the final \0 lands after it */
                    primaries += tersize;
                }
            } else {
                /* the caller's buffer is too small; this is not an allocation failure */
                *status = U_BUFFER_OVERFLOW_ERROR;
            }
        }

        *(primaries++) = '\0';
    }

    /* secondary and tertiary buffers are always reallocated together */
    if(terStart != tert) {
        uprv_free(terStart);
        uprv_free(secStart);
    }

    if(normSource != normBuffer) {
        uprv_free(normSource);
    }

    if(allocateSKBuffer == TRUE) {
        /* hand the caller an exactly sized heap copy of the key */
        *result = (uint8_t *)uprv_malloc(sortKeySize);
        /* test for NULL */
        if(*result == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            /* the working buffer is no longer reachable through *result; release it */
            if(primStart != NULL && primStart != prim) {
                uprv_free(primStart);
            }
            return sortKeySize;
        }
        uprv_memcpy(*result, primStart, sortKeySize);
        if(primStart != prim) {
            uprv_free(primStart);
        }
    }

    return sortKeySize;
}
2003-02-06 23:29:56 +00:00
/**
 * Tells whether a collation element is to be treated as shifted and keeps the
 * caller's "previous element was shifted" flag up to date.
 *
 * A CE counts as shifted when
 *  - LVT is non-zero and either
 *      - the CE is not a continuation, its upper 16 bits are <= LVT and its
 *        first primary byte is non-zero, or
 *      - the CE is a continuation and the previous element was shifted; or
 *  - the previous element was shifted and the first primary byte is zero.
 *
 * @param CE         collation element to examine
 * @param LVT        threshold compared against (CE & 0xFFFF0000); 0 turns the
 *                   threshold-based tests off
 * @param wasShifted in/out flag. Cleared when the CE is not shifted, set when
 *                   a shifted CE has a non-zero first primary byte, and left
 *                   untouched for a shifted CE whose first primary byte is 0.
 * @return TRUE if the CE is shifted, FALSE otherwise
 */
static inline
UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
    const uint8_t leadByte = (uint8_t)(CE >> 24);
    UBool shifted = FALSE;

    if(LVT != 0) {
        if(isContinuation(CE)) {
            /* a continuation follows whatever happened to its lead element */
            if(*wasShifted) {
                shifted = TRUE;
            }
        } else if((CE & 0xFFFF0000) <= LVT && leadByte > 0) {
            shifted = TRUE;
        }
    }
    /* an element with a zero first primary byte right after a shifted one is shifted as well */
    if(!shifted && *wasShifted && leadByte == 0) {
        shifted = TRUE;
    }

    if(!shifted) {
        *wasShifted = FALSE;
        return FALSE;
    }
    if(leadByte != 0) {
        *wasShifted = TRUE;
    }
    return TRUE;
}
2003-03-25 18:25:09 +00:00
/**
 * Writes the byte that closes a level of a partial sort key and advances the
 * write index by one.
 *
 * @param level    level that has just been completed
 * @param maxLevel last level that will be produced
 * @param i        in/out index into dest; incremented past the written byte
 * @param dest     output buffer
 *
 * UCOL_LEVELTERMINATOR is written while level < maxLevel; otherwise a zero
 * byte is written.
 */
static inline
void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
    uint8_t closingByte = 0;
    if(level < maxLevel) {
        closingByte = UCOL_LEVELTERMINATOR;
    }
    dest[i] = closingByte;
    ++i;
}
2003-01-23 01:52:34 +00:00
2003-03-07 07:00:53 +00:00
/**
 * Level identifiers used by partial sort key generation.
 * The value is kept in the three-bit level field of state[1].
 */
enum {
    /** primary weights */
    UCOL_PSK_PRIMARY = 0,
    /** secondary weights */
    UCOL_PSK_SECONDARY = 1,
    /** case level */
    UCOL_PSK_CASE = 2,
    /** tertiary weights */
    UCOL_PSK_TERTIARY = 3,
    /** quaternary weights */
    UCOL_PSK_QUATERNARY = 4,
    /** an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN = 5,
    /** identical level */
    UCOL_PSK_IDENTICAL = 6,
    /** level for the end of the sort key; will just produce zeros */
    UCOL_PSK_NULL = 7,
    /** number of level identifiers */
    UCOL_PSK_LIMIT
};
/**
 * Layout of the collation state word stored in state[1].
 * For every field, the *_SHIFT value is how far the word has to be shifted
 * right to bring the field to the bottom, and the *_MASK value is ANDed
 * with the shifted word to isolate the field.
 */
enum {
    /** level identifier; holds one of the UCOL_PSK_* level values */
    UCOL_PSK_LEVEL_SHIFT = 0,
    /** three bits */
    UCOL_PSK_LEVEL_MASK = 0x7,

    /**
     * Number of bytes of a primary or quaternary weight already written.
     * Can be only 0 or 1, since we get up to two bytes from a primary or
     * quaternary. The same bit is also used to denote that the French
     * secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /** was the last value shifted - a Boolean, 0 or 1 */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /**
     * How many bytes of a French continuation sequence were already written
     * (two bits). When we do French we need to reverse secondary values.
     * However, continuations need to stay the same: if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /** number of already used elements (ten bits) */
    UCOL_PSK_USED_ELEMENTS_SHIFT = 7,
    UCOL_PSK_USED_ELEMENTS_MASK = 0x3FF,

    /** number of iterator moves to skip (fifteen bits) */
    UCOL_PSK_ITER_SKIP_SHIFT = 17,
    UCOL_PSK_ITER_SKIP_MASK = 0x7FFF
};
2003-03-13 17:15:53 +00:00
/** main sortkey part procedure. On the first call,
* you should pass in a collator , an iterator , empty state
* state [ 0 ] = = state [ 1 ] = = 0 , a buffer to hold results
* number of bytes you need and an error code pointer .
* Make sure your buffer is big enough to hold the wanted
* number of sortkey bytes . I don ' t check .
* The only meaningful status you can get back is
* U_BUFFER_OVERFLOW_ERROR , which basically means that you
* have been dealt a raw deal and that you probably won ' t
* be able to use partial sortkey generation for this
* particular combination of string and collator . This
* is highly unlikely , but you should still check the error code .
* Any other status means that you ' re not in a sane situation
* anymore . After the first call , preserve state values and
* use them on subsequent calls to obtain more bytes of a sortkey .
* Use until the number of bytes written is smaller than the requested
* number of bytes . Generated sortkey is not compatible with the
* one generated by ucol_getSortKey , as we don ' t do any compression .
* However , levels are still terminated by a 1 ( one ) and the sortkey
* is terminated by a 0 ( zero ) . Identical level is the same as in the
* regular sortkey - internal bocu - 1 implementation is used .
* For curious , although you cannot do much about this , here is
* the structure of state words .
* state [ 0 ] - iterator state . Depends on the iterator implementation ,
* but allows the iterator to continue where it stopped in
* the last iteration .
* state [ 1 ] - collation processing state . Here is the distribution
* of the bits :
* 0 , 1 , 2 - level of the sortkey - primary , secondary , case , tertiary
* quaternary , quin ( we don ' t use this one ) , identical and
* null ( producing only zeroes - first one to terminate the
* sortkey and subsequent to fill the buffer ) .
* 3 - byte count . Number of bytes written on the primary level .
* 4 - was shifted . Whether the previous iteration finished in the
* shifted state .
* 5 , 6 - French continuation bytes written . See the comment in the enum
* 7. .16 - Used elements . Number of CEs that were already used from the
* expansion buffer or number of bytes from a bocu sequence on
* the identical level .
* 17. .31 - iterator skip . Number of move operations iterator needs to
* skip from the current state in order to continue . This is used
* only if normalization is turned on , since the normalizing iterator
* can return undefined state , which means that it ' s in the middle
* of normalizing sequence .
*/
2003-01-23 01:52:34 +00:00
U_CAPI int32_t U_EXPORT2
2003-03-27 20:09:38 +00:00
ucol_nextSortKeyPart ( const UCollator * coll ,
2003-01-23 01:52:34 +00:00
UCharIterator * iter ,
uint32_t state [ 2 ] ,
uint8_t * dest , int32_t count ,
UErrorCode * status ) {
/* error checking */
if ( status = = NULL | | U_FAILURE ( * status ) ) {
return 0 ;
}
if ( coll = = NULL | | iter = = NULL | |
state = = NULL | |
count < 0 | | ( count > 0 & & dest = = NULL )
) {
* status = U_ILLEGAL_ARGUMENT_ERROR ;
}
if ( count = = 0 ) {
/* nothing to do */
return 0 ;
}
2003-03-13 17:15:53 +00:00
/** Setting up situation according to the state we got from the previous iteration */
// The state of the iterator from the previous invocation
uint32_t iterState = state [ 0 ] ;
// Has the last iteration ended in the shifted state
UBool wasShifted = ( ( state [ 1 ] > > UCOL_PSK_WAS_SHIFTED_SHIFT ) & UCOL_PSK_WAS_SHIFTED_MASK ) ? TRUE : FALSE ;
// What is the current level of the sortkey?
int32_t level = ( state [ 1 ] > > UCOL_PSK_LEVEL_SHIFT ) & UCOL_PSK_LEVEL_MASK ;
// Have we written only one byte from a two byte primary in the previous iteration?
// Also on secondary level - have we finished with the French secondary?
int32_t byteCountOrFrenchDone = ( state [ 1 ] > > UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT ) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK ;
// number of bytes in the continuation buffer for French
int32_t usedFrench = ( state [ 1 ] > > UCOL_PSK_USED_FRENCH_SHIFT ) & UCOL_PSK_USED_FRENCH_MASK ;
// Skip the CEs that we got from an extraction
// and delivered in the previous call
int32_t usedElements = ( state [ 1 ] > > UCOL_PSK_USED_ELEMENTS_SHIFT ) & UCOL_PSK_USED_ELEMENTS_MASK ;
// Number of times to skip because the iterator returned
// UITER_NO_STATE when it was stopped in the last iteration, so we had to save the
// last valid state.
int32_t iterSkips = ( state [ 1 ] > > UCOL_PSK_ITER_SKIP_SHIFT ) & UCOL_PSK_ITER_SKIP_MASK ;
/** values that depend on the collator attributes */
// strength of the collator.
2003-01-23 01:52:34 +00:00
int32_t strength = ucol_getAttribute ( coll , UCOL_STRENGTH , status ) ;
2003-03-25 18:25:09 +00:00
// maximal level of the partial sortkey. Need to take whether case level is done
int32_t maxLevel = 0 ;
if ( strength < UCOL_TERTIARY ) {
if ( ucol_getAttribute ( coll , UCOL_CASE_LEVEL , status ) = = UCOL_ON ) {
maxLevel = UCOL_PSK_CASE ;
} else {
maxLevel = strength ;
}
} else {
if ( strength = = UCOL_TERTIARY ) {
maxLevel = UCOL_PSK_TERTIARY ;
} else if ( strength = = UCOL_QUATERNARY ) {
maxLevel = UCOL_PSK_QUATERNARY ;
} else { // identical
maxLevel = UCOL_IDENTICAL ;
}
}
2003-03-13 17:15:53 +00:00
// value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
2003-01-23 01:52:34 +00:00
uint8_t UCOL_HIRAGANA_QUAD =
( ucol_getAttribute ( coll , UCOL_HIRAGANA_QUATERNARY_MODE , status ) = = UCOL_ON ) ? 0xFE : 0xFF ;
2003-02-06 23:29:56 +00:00
// Boundary value that decides whether a CE is shifted or not
uint32_t LVT = ( coll - > alternateHandling = = UCOL_SHIFTED ) ? ( coll - > variableTopValue < < 16 ) : 0 ;
2003-03-13 17:15:53 +00:00
// Are we doing French collation?
UBool doingFrench = ( ucol_getAttribute ( coll , UCOL_FRENCH_COLLATION , status ) = = UCOL_ON ) ;
2003-02-06 23:29:56 +00:00
2003-03-13 17:15:53 +00:00
/** initializing the collation state */
UBool notIsContinuation = FALSE ;
uint32_t CE = UCOL_NO_MORE_CES ;
2003-01-23 01:52:34 +00:00
2003-03-13 17:15:53 +00:00
collIterate s ;
2003-01-23 01:52:34 +00:00
IInit_collIterate ( coll , NULL , - 1 , & s ) ;
s . iterator = iter ;
s . flags | = UCOL_USE_ITERATOR ;
2003-03-13 17:15:53 +00:00
// This variable tells us whether we have produced some other levels in this iteration
// before we moved to the identical level. In that case, we need to switch the
// type of the iterator.
2003-02-14 07:46:20 +00:00
UBool doingIdenticalFromStart = FALSE ;
2003-03-13 17:15:53 +00:00
// Normalizing iterator
2003-03-17 21:20:36 +00:00
UAlignedMemory stackNormIter [ UNORM_ITER_SIZE / sizeof ( UAlignedMemory ) ] ;
2003-01-23 01:52:34 +00:00
UNormIterator * normIter = NULL ;
2003-03-13 17:15:53 +00:00
// If the normalization is turned on for the collator and we are below identical level
// we will use a FCD normalizing iterator
2003-02-06 23:29:56 +00:00
if ( ucol_getAttribute ( coll , UCOL_NORMALIZATION_MODE , status ) = = UCOL_ON & & level < UCOL_PSK_IDENTICAL ) {
2003-03-17 21:20:36 +00:00
normIter = unorm_openIter ( stackNormIter , sizeof ( stackNormIter ) , status ) ;
2003-01-23 01:52:34 +00:00
s . iterator = unorm_setIter ( normIter , iter , UNORM_FCD , status ) ;
s . flags & = ~ UCOL_ITER_NORM ;
if ( U_FAILURE ( * status ) ) {
return 0 ;
}
2003-02-14 07:46:20 +00:00
} else if ( level = = UCOL_PSK_IDENTICAL ) {
2003-03-13 17:15:53 +00:00
// for identical level, we need a NFD iterator. We need to instantiate it here, since we
// will be updating the state - and this cannot be done on an ordinary iterator.
2003-03-17 21:20:36 +00:00
normIter = unorm_openIter ( stackNormIter , sizeof ( stackNormIter ) , status ) ;
2003-02-14 07:46:20 +00:00
s . iterator = unorm_setIter ( normIter , iter , UNORM_NFD , status ) ;
s . flags & = ~ UCOL_ITER_NORM ;
if ( U_FAILURE ( * status ) ) {
return 0 ;
}
doingIdenticalFromStart = TRUE ;
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
// This is the tentative new state of the iterator. The problem
// is that the iterator might return an undefined state, in
// which case we should save the last valid state and increase
// the iterator skip value.
2003-01-23 01:52:34 +00:00
uint32_t newState = 0 ;
2003-03-13 17:15:53 +00:00
// First, we set the iterator to the last valid position
// from the last iteration. This was saved in state[0].
2003-01-23 01:52:34 +00:00
if ( iterState = = 0 ) {
/* initial state */
2003-03-13 17:15:53 +00:00
if ( level = = UCOL_PSK_SECONDARY & & doingFrench & & ! byteCountOrFrenchDone ) {
s . iterator - > move ( s . iterator , 0 , UITER_LIMIT ) ;
} else {
s . iterator - > move ( s . iterator , 0 , UITER_START ) ;
}
2003-01-23 01:52:34 +00:00
} else {
/* reset to previous state */
2003-03-13 17:15:53 +00:00
s . iterator - > setState ( s . iterator , iterState , status ) ;
2003-01-23 01:52:34 +00:00
if ( U_FAILURE ( * status ) ) {
return 0 ;
}
}
2003-03-13 17:15:53 +00:00
// Then, we may have to move more, if the normalizing iterator
// was going through a normalizing sequence.
2003-01-23 01:52:34 +00:00
if ( iterSkips ) {
2003-02-06 23:29:56 +00:00
// if we are on secondary level AND we do French, we need to go backward instead of forward
2003-03-13 17:15:53 +00:00
if ( level = = UCOL_PSK_SECONDARY & & doingFrench ) {
2003-02-06 23:29:56 +00:00
s . iterator - > move ( s . iterator , - iterSkips , UITER_CURRENT ) ;
} else {
s . iterator - > move ( s . iterator , iterSkips , UITER_CURRENT ) ;
}
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
// Number of expansion CEs that were already consumed in the
// previous iteration for the last code point processed. We
// want to clean out the expansion buffer, so that we can
// get correct CEs. This value is persistent over iterations,
// since we can have several iterations on the one expansion
// buffer.
2003-02-06 23:29:56 +00:00
int32_t consumedExpansionCEs = usedElements ;
2003-03-13 17:15:53 +00:00
// Number of bytes already writted from a bocsu sequence. Since
// the longes bocsu sequence is 4 long, this can be up to 3. It
// shares the state field with consumedExpansionCEs value, since
// they cannot simultanously appear on the same level
int32_t bocsuBytesUsed = 0 ;
2003-02-06 23:29:56 +00:00
// Clean out the expansion buffer unless we are on
// identical level. In that case we use this field
// to store the number of bytes already written
// from the previous bocsu sequence.
if ( level < UCOL_PSK_IDENTICAL & & usedElements ! = 0 ) {
while ( usedElements - - > 0 ) {
2003-03-13 17:15:53 +00:00
// If we're doing French and we are on the secondary level,
// we go backwards.
if ( level = = UCOL_PSK_SECONDARY & & doingFrench ) {
2003-02-06 23:29:56 +00:00
CE = ucol_IGetPrevCE ( coll , & s , status ) ;
} else {
CE = ucol_IGetNextCE ( coll , & s , status ) ;
}
2003-01-23 01:52:34 +00:00
if ( CE = = UCOL_NO_MORE_CES ) {
2003-02-06 23:29:56 +00:00
/* should not happen */
* status = U_INTERNAL_PROGRAM_ERROR ;
return 0 ;
2003-01-23 01:52:34 +00:00
}
2003-02-06 23:29:56 +00:00
}
} else {
bocsuBytesUsed = usedElements ;
2003-01-23 01:52:34 +00:00
}
2003-02-06 23:29:56 +00:00
2003-03-13 17:15:53 +00:00
// This variable prevents the adjusting of iterator
// skip variable when we are the first time on a
// level. I hope there is a better way to do it, but
// I could not think of it.
UBool firstTimeOnLevel = TRUE ;
// French secondary needs to know whether the iterator state of zero came from previous level OR
// from a new invocation...
UBool wasDoingPrimary = FALSE ;
// Case level is kind of goofy. This variable tells us that
// we are still not done with the case level.
2003-01-23 01:52:34 +00:00
UBool dontAdvanceIteratorBecauseWeNeedALevelTerminator = FALSE ;
2003-03-13 17:15:53 +00:00
// destination buffer byte counter. When this guy
// gets to count, we're done with the iteration
int32_t i = 0 ;
// used to count the zero bytes written after we
// have finished with the sort key
2003-01-23 01:52:34 +00:00
int32_t j = 0 ;
2003-03-13 17:15:53 +00:00
// Hm.... I think we're ready to plunge in. Basic story is as following:
// we have a fall through case based on level. This is used for initial
// positioning on iteration start. Every level processor contains a
// for(;;) which will be broken when we exhaust all the CEs. Other
// way to exit is a goto saveState, which happens when we have filled
// out our buffer.
switch ( level ) {
case UCOL_PSK_PRIMARY :
wasDoingPrimary = TRUE ;
for ( ; ; ) {
if ( i = = count ) {
goto saveState ;
}
// We should save the state only if we
// are sure that we are done with the
// previous iterator state
if ( consumedExpansionCEs = = 0 & & byteCountOrFrenchDone = = 0 ) {
newState = s . iterator - > getState ( s . iterator ) ;
if ( newState ! = UITER_NO_STATE ) {
iterState = newState ;
iterSkips = 0 ;
} else {
if ( ! firstTimeOnLevel & & ! byteCountOrFrenchDone ) {
2003-02-20 01:06:06 +00:00
iterSkips + + ;
}
2003-03-13 17:15:53 +00:00
}
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
firstTimeOnLevel = FALSE ;
CE = ucol_IGetNextCE ( coll , & s , status ) ;
if ( CE = = UCOL_NO_MORE_CES ) {
// Add the level separator
2003-03-25 18:25:09 +00:00
terminatePSKLevel ( level , maxLevel , i , dest ) ;
//dest[i++] = UCOL_LEVELTERMINATOR;
2003-03-13 17:15:53 +00:00
byteCountOrFrenchDone = 0 ;
// Restart the iteration an move to the
// second level
s . iterator - > move ( s . iterator , 0 , UITER_START ) ;
level = UCOL_PSK_SECONDARY ;
break ;
}
if ( ! isShiftedCE ( CE , LVT , & wasShifted ) ) {
CE > > = 16 ; /* get primary */
if ( CE ! = 0 ) {
if ( byteCountOrFrenchDone = = 0 ) {
dest [ i + + ] = ( uint8_t ) ( CE > > 8 ) ;
} else {
byteCountOrFrenchDone = 0 ;
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
if ( ( CE & = 0xff ) ! = 0 ) {
if ( i = = count ) {
/* overflow */
byteCountOrFrenchDone = 1 ;
goto saveState ;
2003-02-06 23:29:56 +00:00
}
2003-03-13 17:15:53 +00:00
dest [ i + + ] = ( uint8_t ) CE ;
2003-01-23 01:52:34 +00:00
}
}
2003-03-13 17:15:53 +00:00
}
if ( s . CEpos - s . toReturn | | ( s . pos & & * s . pos ! = 0 ) ) {
// s.pos != NULL means there is a normalization buffer in effect
// in iterative case, this means that we are doing Thai (maybe discontiguos)
consumedExpansionCEs + + ;
2003-01-23 01:52:34 +00:00
} else {
2003-03-13 17:15:53 +00:00
consumedExpansionCEs = 0 ;
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
if ( s . pos & & * s . pos = = 0 ) {
// maybe it is the end of Thai - we have to have
// an extra skip
iterSkips + + ;
}
}
/* fall through to next level */
case UCOL_PSK_SECONDARY :
if ( strength > = UCOL_SECONDARY ) {
if ( ! doingFrench ) {
for ( ; ; ) {
if ( i = = count ) {
goto saveState ;
}
// We should save the state only if we
// are sure that we are done with the
// previous iterator state
if ( consumedExpansionCEs = = 0 ) {
newState = s . iterator - > getState ( s . iterator ) ;
if ( newState ! = UITER_NO_STATE ) {
iterState = newState ;
iterSkips = 0 ;
} else {
if ( ! firstTimeOnLevel ) {
iterSkips + + ;
2003-01-23 01:52:34 +00:00
}
}
2003-03-13 17:15:53 +00:00
}
firstTimeOnLevel = FALSE ;
CE = ucol_IGetNextCE ( coll , & s , status ) ;
if ( CE = = UCOL_NO_MORE_CES ) {
// Add the level separator
2003-03-25 18:25:09 +00:00
terminatePSKLevel ( level , maxLevel , i , dest ) ;
//dest[i++] = UCOL_LEVELTERMINATOR;
2003-03-13 17:15:53 +00:00
byteCountOrFrenchDone = 0 ;
// Restart the iteration an move to the
// second level
s . iterator - > move ( s . iterator , 0 , UITER_START ) ;
level = UCOL_PSK_CASE ;
2003-01-23 01:52:34 +00:00
break ;
2003-03-13 17:15:53 +00:00
}
if ( ! isShiftedCE ( CE , LVT , & wasShifted ) ) {
CE > > = 8 ; /* get secondary */
if ( CE ! = 0 ) {
dest [ i + + ] = ( uint8_t ) CE ;
2003-02-20 01:06:06 +00:00
}
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
if ( s . CEpos - s . toReturn | | ( s . pos & & * s . pos ! = 0 ) ) {
consumedExpansionCEs + + ;
} else {
consumedExpansionCEs = 0 ;
}
if ( s . pos & & * s . pos = = 0 ) {
iterSkips + + ;
}
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
} else { // French secondary processing
uint8_t frenchBuff [ UCOL_MAX_BUFFER ] ;
int32_t frenchIndex = 0 ;
// Here we are going backwards.
// If the iterator is at the beggining, it should be
// moved to end.
if ( wasDoingPrimary ) {
s . iterator - > move ( s . iterator , 0 , UITER_LIMIT ) ;
}
for ( ; ; ) {
if ( i = = count ) {
goto saveState ;
}
if ( consumedExpansionCEs = = 0 ) {
newState = s . iterator - > getState ( s . iterator ) ;
if ( newState ! = UITER_NO_STATE ) {
iterState = newState ;
iterSkips = 0 ;
} else {
if ( ! firstTimeOnLevel ) {
iterSkips + + ;
2003-01-23 01:52:34 +00:00
}
}
2003-03-13 17:15:53 +00:00
}
firstTimeOnLevel = FALSE ;
CE = ucol_IGetPrevCE ( coll , & s , status ) ;
if ( CE = = UCOL_NO_MORE_CES ) {
// Add the level separator
2003-03-25 18:25:09 +00:00
terminatePSKLevel ( level , maxLevel , i , dest ) ;
//dest[i++] = UCOL_LEVELTERMINATOR;
2003-03-13 17:15:53 +00:00
byteCountOrFrenchDone = 0 ;
// Restart the iteration an move to the next level
s . iterator - > move ( s . iterator , 0 , UITER_START ) ;
level = UCOL_PSK_CASE ;
break ;
}
if ( isContinuation ( CE ) ) { // if it's a continuation, we want to save it and
// reverse when we get a first non-continuation CE.
CE > > = 8 ;
frenchBuff [ frenchIndex + + ] = ( uint8_t ) CE ;
} else if ( ! isShiftedCE ( CE , LVT , & wasShifted ) ) {
CE > > = 8 ; /* get secondary */
if ( ! frenchIndex ) {
2003-02-06 23:29:56 +00:00
if ( CE ! = 0 ) {
dest [ i + + ] = ( uint8_t ) CE ;
}
2003-01-23 01:52:34 +00:00
} else {
2003-03-13 17:15:53 +00:00
frenchBuff [ frenchIndex + + ] = ( uint8_t ) CE ;
frenchIndex - = usedFrench ;
usedFrench = 0 ;
while ( i < count & & frenchIndex ) {
dest [ i + + ] = frenchBuff [ - - frenchIndex ] ;
usedFrench + + ;
}
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
}
if ( s . CEpos - s . toReturn | | ( s . pos & & * s . pos ! = 0 ) ) {
consumedExpansionCEs + + ;
} else {
consumedExpansionCEs = 0 ;
}
if ( s . pos & & * s . pos = = 0 ) {
iterSkips + + ;
}
}
}
} else {
level = UCOL_PSK_CASE ;
}
/* fall through to next level */
case UCOL_PSK_CASE :
if ( ucol_getAttribute ( coll , UCOL_CASE_LEVEL , status ) = = UCOL_ON ) {
uint32_t caseShift = UCOL_CASE_SHIFT_START ;
uint8_t caseByte = UCOL_CASE_BYTE_START ;
uint8_t caseBits = 0 ;
for ( ; ; ) {
if ( i = = count ) {
goto saveState ;
}
// We should save the state only if we
// are sure that we are done with the
// previous iterator state
if ( consumedExpansionCEs = = 0 ) {
newState = s . iterator - > getState ( s . iterator ) ;
if ( newState ! = UITER_NO_STATE ) {
iterState = newState ;
iterSkips = 0 ;
} else {
if ( ! firstTimeOnLevel ) {
2003-02-20 01:06:06 +00:00
iterSkips + + ;
}
2003-01-23 01:52:34 +00:00
}
}
2003-03-13 17:15:53 +00:00
firstTimeOnLevel = FALSE ;
CE = ucol_IGetNextCE ( coll , & s , status ) ;
if ( CE = = UCOL_NO_MORE_CES ) {
// On the case level we might have an unfinished
// case byte. Add one if it's started.
if ( caseShift ! = UCOL_CASE_SHIFT_START ) {
dest [ i + + ] = caseByte ;
}
// This is kind of tricky - situation where
// we need to keep the iterator in the old
// state, but don't need to bring anything
// to the next invocation
if ( i < count ) {
// Add the level separator
2003-03-25 18:25:09 +00:00
terminatePSKLevel ( level , maxLevel , i , dest ) ;
//dest[i++] = UCOL_LEVELTERMINATOR;
2003-03-13 17:15:53 +00:00
// Restart the iteration and move to the
// next level
s . iterator - > move ( s . iterator , 0 , UITER_START ) ;
level = UCOL_PSK_TERTIARY ;
} else {
dontAdvanceIteratorBecauseWeNeedALevelTerminator = TRUE ;
}
break ;
}
if ( ! isShiftedCE ( CE , LVT , & wasShifted ) ) {
if ( ! isContinuation ( CE ) ) {
CE = ( uint8_t ) ( CE & UCOL_BYTE_SIZE_MASK ) ;
caseBits = ( uint8_t ) ( CE & 0xC0 ) ;
// this copies the case level logic from the
// sort key generation code
if ( CE ! = 0 ) {
if ( coll - > caseFirst = = UCOL_UPPER_FIRST ) {
if ( ( caseBits & 0xC0 ) = = 0 ) {
caseByte | = 1 < < ( - - caseShift ) ;
2003-02-06 23:29:56 +00:00
} else {
2003-03-13 17:15:53 +00:00
caseByte | = 0 < < ( - - caseShift ) ;
/* second bit */
if ( caseShift = = 0 ) {
dest [ i + + ] = caseByte ;
caseShift = UCOL_CASE_SHIFT_START ;
caseByte = UCOL_CASE_BYTE_START ;
}
caseByte | = ( ( caseBits > > 6 ) & 1 ) < < ( - - caseShift ) ;
2003-02-06 23:29:56 +00:00
}
2003-03-13 17:15:53 +00:00
} else {
if ( ( caseBits & 0xC0 ) = = 0 ) {
caseByte | = 0 < < ( - - caseShift ) ;
2003-02-06 23:29:56 +00:00
} else {
2003-03-13 17:15:53 +00:00
caseByte | = 1 < < ( - - caseShift ) ;
/* second bit */
if ( caseShift = = 0 ) {
dest [ i + + ] = caseByte ;
caseShift = UCOL_CASE_SHIFT_START ;
caseByte = UCOL_CASE_BYTE_START ;
}
caseByte | = ( ( caseBits > > 7 ) & 1 ) < < ( - - caseShift ) ;
2003-02-06 23:29:56 +00:00
}
2003-01-23 01:52:34 +00:00
}
}
2003-03-13 17:15:53 +00:00
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
}
// Not sure this is correct for the case level - revisit
if ( s . CEpos - s . toReturn | | ( s . pos & & * s . pos ! = 0 ) ) {
consumedExpansionCEs + + ;
2003-01-23 01:52:34 +00:00
} else {
2003-03-13 17:15:53 +00:00
consumedExpansionCEs = 0 ;
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
if ( s . pos & & * s . pos = = 0 ) {
iterSkips + + ;
}
}
} else {
level = UCOL_PSK_TERTIARY ;
}
/* fall through to next level */
case UCOL_PSK_TERTIARY :
if ( strength > = UCOL_TERTIARY ) {
for ( ; ; ) {
if ( i = = count ) {
goto saveState ;
}
// We should save the state only if we
// are sure that we are done with the
// previous iterator state
if ( consumedExpansionCEs = = 0 ) {
newState = s . iterator - > getState ( s . iterator ) ;
if ( newState ! = UITER_NO_STATE ) {
iterState = newState ;
iterSkips = 0 ;
} else {
if ( ! firstTimeOnLevel ) {
iterSkips + + ;
}
2003-02-06 23:29:56 +00:00
}
2003-03-13 17:15:53 +00:00
}
firstTimeOnLevel = FALSE ;
CE = ucol_IGetNextCE ( coll , & s , status ) ;
if ( CE = = UCOL_NO_MORE_CES ) {
// Add the level separator
2003-03-25 18:25:09 +00:00
terminatePSKLevel ( level , maxLevel , i , dest ) ;
//dest[i++] = UCOL_LEVELTERMINATOR;
2003-03-13 17:15:53 +00:00
byteCountOrFrenchDone = 0 ;
// Restart the iteration an move to the
// second level
s . iterator - > move ( s . iterator , 0 , UITER_START ) ;
level = UCOL_PSK_QUATERNARY ;
break ;
}
if ( ! isShiftedCE ( CE , LVT , & wasShifted ) ) {
notIsContinuation = ! isContinuation ( CE ) ;
if ( notIsContinuation ) {
CE = ( uint8_t ) ( CE & UCOL_BYTE_SIZE_MASK ) ;
CE ^ = coll - > caseSwitch ;
CE & = coll - > tertiaryMask ;
2003-02-06 23:29:56 +00:00
} else {
2003-03-13 17:15:53 +00:00
CE = ( uint8_t ) ( ( CE & UCOL_REMOVE_CONTINUATION ) ) ;
2003-02-06 23:29:56 +00:00
}
2003-03-13 17:15:53 +00:00
if ( CE ! = 0 ) {
dest [ i + + ] = ( uint8_t ) CE ;
}
}
if ( s . CEpos - s . toReturn | | ( s . pos & & * s . pos ! = 0 ) ) {
consumedExpansionCEs + + ;
} else {
consumedExpansionCEs = 0 ;
}
if ( s . pos & & * s . pos = = 0 ) {
iterSkips + + ;
}
}
} else {
// if we're not doing tertiary
// skip to the end
level = UCOL_PSK_NULL ;
}
/* fall through to next level */
case UCOL_PSK_QUATERNARY :
if ( strength > = UCOL_QUATERNARY ) {
for ( ; ; ) {
if ( i = = count ) {
goto saveState ;
}
// We should save the state only if we
// are sure that we are done with the
// previous iterator state
if ( consumedExpansionCEs = = 0 ) {
newState = s . iterator - > getState ( s . iterator ) ;
if ( newState ! = UITER_NO_STATE ) {
iterState = newState ;
iterSkips = 0 ;
} else {
if ( ! firstTimeOnLevel ) {
iterSkips + + ;
2003-02-06 23:29:56 +00:00
}
2003-03-13 17:15:53 +00:00
}
}
firstTimeOnLevel = FALSE ;
CE = ucol_IGetNextCE ( coll , & s , status ) ;
if ( CE = = UCOL_NO_MORE_CES ) {
// Add the level separator
2003-03-25 18:25:09 +00:00
terminatePSKLevel ( level , maxLevel , i , dest ) ;
//dest[i++] = UCOL_LEVELTERMINATOR;
2003-03-13 17:15:53 +00:00
byteCountOrFrenchDone = 0 ;
// Restart the iteration an move to the
// second level
s . iterator - > move ( s . iterator , 0 , UITER_START ) ;
level = UCOL_PSK_QUIN ;
break ;
}
if ( isShiftedCE ( CE , LVT , & wasShifted ) ) {
CE > > = 16 ; /* get primary */
if ( CE ! = 0 ) {
if ( byteCountOrFrenchDone = = 0 ) {
dest [ i + + ] = ( uint8_t ) ( CE > > 8 ) ;
2003-02-06 23:29:56 +00:00
} else {
2003-03-13 17:15:53 +00:00
byteCountOrFrenchDone = 0 ;
2003-02-06 23:29:56 +00:00
}
2003-03-13 17:15:53 +00:00
if ( ( CE & = 0xff ) ! = 0 ) {
if ( i = = count ) {
/* overflow */
byteCountOrFrenchDone = 1 ;
goto saveState ;
}
dest [ i + + ] = ( uint8_t ) CE ;
2003-02-06 23:29:56 +00:00
}
2003-03-13 17:15:53 +00:00
}
} else {
notIsContinuation = ! isContinuation ( CE ) ;
if ( notIsContinuation ) {
if ( s . flags & UCOL_WAS_HIRAGANA ) { // This was Hiragana and we need to note it
dest [ i + + ] = UCOL_HIRAGANA_QUAD ;
} else {
dest [ i + + ] = 0xFF ;
2003-02-06 23:29:56 +00:00
}
2003-03-13 17:15:53 +00:00
}
}
if ( s . CEpos - s . toReturn | | ( s . pos & & * s . pos ! = 0 ) ) {
consumedExpansionCEs + + ;
} else {
consumedExpansionCEs = 0 ;
}
if ( s . pos & & * s . pos = = 0 ) {
iterSkips + + ;
}
}
} else {
// if we're not doing quaternary
// skip to the end
level = UCOL_PSK_NULL ;
}
/* fall through to next level */
case UCOL_PSK_QUIN :
level = UCOL_PSK_IDENTICAL ;
/* fall through to next level */
case UCOL_PSK_IDENTICAL :
if ( strength > = UCOL_IDENTICAL ) {
UChar32 first , second ;
int32_t bocsuBytesWritten = 0 ;
// We always need to do identical on
// the NFD form of the string.
if ( normIter = = NULL ) {
// we arrived from the level below and
// normalization was not turned on.
// therefore, we need to make a fresh NFD iterator
2003-03-17 21:20:36 +00:00
normIter = unorm_openIter ( stackNormIter , sizeof ( stackNormIter ) , status ) ;
2003-03-13 17:15:53 +00:00
s . iterator = unorm_setIter ( normIter , iter , UNORM_NFD , status ) ;
} else if ( ! doingIdenticalFromStart ) {
// there is an iterator, but we did some other levels.
// therefore, we have a FCD iterator - need to make
// a NFD one.
// normIter being at the beginning does not guarantee
// that the underlying iterator is at the beginning
iter - > move ( iter , 0 , UITER_START ) ;
s . iterator = unorm_setIter ( normIter , iter , UNORM_NFD , status ) ;
}
// At this point we have a NFD iterator that is positioned
// in the right place
if ( U_FAILURE ( * status ) ) {
return 0 ;
}
first = uiter_previous32 ( s . iterator ) ;
// maybe we're at the start of the string
if ( first = = U_SENTINEL ) {
first = 0 ;
} else {
uiter_next32 ( s . iterator ) ;
}
2003-02-06 23:29:56 +00:00
2003-03-13 17:15:53 +00:00
j = 0 ;
for ( ; ; ) {
if ( i = = count ) {
if ( j + 1 < bocsuBytesWritten ) {
bocsuBytesUsed = j + 1 ;
2003-02-06 23:29:56 +00:00
}
2003-03-13 17:15:53 +00:00
goto saveState ;
}
2003-02-06 23:29:56 +00:00
2003-03-13 17:15:53 +00:00
// On identical level, we will always save
// the state if we reach this point, since
// we don't depend on getNextCE for content
// all the content is in our buffer and we
// already either stored the full buffer OR
// otherwise we won't arrive here.
newState = s . iterator - > getState ( s . iterator ) ;
if ( newState ! = UITER_NO_STATE ) {
iterState = newState ;
iterSkips = 0 ;
2003-01-23 01:52:34 +00:00
} else {
2003-03-13 17:15:53 +00:00
iterSkips + + ;
}
uint8_t buff [ 4 ] ;
second = uiter_next32 ( s . iterator ) ;
// end condition for identical level
if ( second = = U_SENTINEL ) {
2003-03-25 18:25:09 +00:00
terminatePSKLevel ( level , maxLevel , i , dest ) ;
2003-01-23 01:52:34 +00:00
level = UCOL_PSK_NULL ;
2003-03-13 17:15:53 +00:00
break ;
2003-01-23 01:52:34 +00:00
}
2003-03-13 17:15:53 +00:00
bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars ( first , second , buff ) ;
first = second ;
j = 0 ;
if ( bocsuBytesUsed ! = 0 ) {
while ( bocsuBytesUsed - - > 0 ) {
j + + ;
}
}
while ( i < count & & j < bocsuBytesWritten ) {
dest [ i + + ] = buff [ j + + ] ;
2003-01-23 01:52:34 +00:00
}
}
2003-03-13 17:15:53 +00:00
} else {
level = UCOL_PSK_NULL ;
}
/* fall through to next level */
case UCOL_PSK_NULL :
j = i ;
while ( j < count ) {
dest [ j + + ] = 0 ;
}
break ;
default :
* status = U_INTERNAL_PROGRAM_ERROR ;
return 0 ;
}
2003-01-23 01:52:34 +00:00
2003-03-13 17:15:53 +00:00
saveState :
// Now we need to return stuff. First we want to see whether we have
2003-01-23 01:52:34 +00:00
// done everything for the current state of iterator.
2003-03-13 17:15:53 +00:00
if ( consumedExpansionCEs | | byteCountOrFrenchDone
2003-02-06 23:29:56 +00:00
| | dontAdvanceIteratorBecauseWeNeedALevelTerminator ) {
2003-03-13 17:15:53 +00:00
// Any of above mean that the previous transaction
// wasn't finished and that we should store the
// previous iterator state.
2003-01-23 01:52:34 +00:00
state [ 0 ] = iterState ;
} else {
2003-03-13 17:15:53 +00:00
// The transaction is complete. We will continue in
// next iteration.
2003-01-23 01:52:34 +00:00
if ( ( newState = s . iterator - > getState ( s . iterator ) ) ! = UITER_NO_STATE ) {
state [ 0 ] = s . iterator - > getState ( s . iterator ) ;
2003-02-14 07:46:20 +00:00
iterSkips = 0 ;
2003-01-23 01:52:34 +00:00
} else {
state [ 0 ] = iterState ;
iterSkips + + ;
}
}
2003-03-13 17:15:53 +00:00
// Store the number of elements processed. On CE levels, this is
// the number of expansion CEs processed. On identical level, this
// is the number of bocsu bytes written.
2003-02-06 23:29:56 +00:00
if ( level < UCOL_PSK_IDENTICAL ) {
2003-03-04 06:31:20 +00:00
if ( ( consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK ) ! = consumedExpansionCEs ) {
* status = U_BUFFER_OVERFLOW_ERROR ;
}
2003-02-06 23:29:56 +00:00
state [ 1 ] = ( consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK ) < < UCOL_PSK_USED_ELEMENTS_SHIFT ;
} else {
2003-03-04 06:31:20 +00:00
if ( ( bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK ) ! = bocsuBytesUsed ) {
* status = U_BUFFER_OVERFLOW_ERROR ;
}
2003-02-06 23:29:56 +00:00
state [ 1 ] = ( bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK ) < < UCOL_PSK_USED_ELEMENTS_SHIFT ;
}
2003-03-13 17:15:53 +00:00
// Next we put in the level of comparison
state [ 1 ] | = ( ( level & UCOL_PSK_LEVEL_MASK ) < < UCOL_PSK_LEVEL_SHIFT ) ;
// If we are doing French, we need to store whether we have just finished the French level
if ( level = = UCOL_PSK_SECONDARY & & doingFrench ) {
state [ 1 ] | = ( ( ( state [ 0 ] = = 0 ) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK ) < < UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT ) ;
} else {
state [ 1 ] | = ( ( byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK ) < < UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT ) ;
}
// Was the latest CE shifted
2003-02-06 23:29:56 +00:00
if ( wasShifted ) {
state [ 1 ] | = 1 < < UCOL_PSK_WAS_SHIFTED_SHIFT ;
}
2003-03-13 17:15:53 +00:00
// Check for iterSkips overflow
2003-03-04 06:31:20 +00:00
if ( ( iterSkips & UCOL_PSK_ITER_SKIP_MASK ) ! = iterSkips ) {
* status = U_BUFFER_OVERFLOW_ERROR ;
}
2003-03-13 17:15:53 +00:00
// Store iterSkips
2003-02-06 23:29:56 +00:00
state [ 1 ] | = ( ( iterSkips & UCOL_PSK_ITER_SKIP_MASK ) < < UCOL_PSK_ITER_SKIP_SHIFT ) ;
2003-03-13 17:15:53 +00:00
// Check for French overflow
2003-03-04 06:31:20 +00:00
if ( ( usedFrench & UCOL_PSK_USED_FRENCH_MASK ) ! = usedFrench ) {
* status = U_BUFFER_OVERFLOW_ERROR ;
}
2003-03-13 17:15:53 +00:00
// Store number of bytes written in the French secondary continuation sequence
2003-02-06 23:29:56 +00:00
state [ 1 ] | = ( ( usedFrench & UCOL_PSK_USED_FRENCH_MASK ) < < UCOL_PSK_USED_FRENCH_SHIFT ) ;
2003-01-23 01:52:34 +00:00
2003-03-13 17:15:53 +00:00
// If we have used normalizing iterator, get rid of it
2003-01-23 01:52:34 +00:00
if ( normIter ! = NULL ) {
unorm_closeIter ( normIter ) ;
}
2003-03-13 17:15:53 +00:00
// Return number of meaningful sortkey bytes.
2003-01-23 01:52:34 +00:00
return i ;
}
2002-01-21 23:54:34 +00:00
/**
 * Produce a bound for a given sortkey and a number of levels.
 *
 * The leading part of the sort key that holds the requested number of levels
 * is copied into result, followed by the extra bytes required by the bound
 * type and a terminating zero byte.
 *
 * @param source       sort key to derive the bound from; must not be NULL
 * @param sourceLength number of bytes readable in source; when it is zero or
 *                     negative the key is delimited only by its zero byte
 * @param boundType    UCOL_BOUND_LOWER, UCOL_BOUND_UPPER or
 *                     UCOL_BOUND_UPPER_LONG
 * @param noOfLevels   number of levels of the key that go into the bound
 * @param result       destination buffer; may be NULL to ask for the size
 * @param resultLength capacity of result in bytes
 * @param status       receives U_ILLEGAL_ARGUMENT_ERROR for a NULL source or an
 *                     unknown bound type, and U_SORT_KEY_TOO_SHORT_WARNING if
 *                     the key ends before noOfLevels levels were found
 * @return the number of bytes written (terminator included) if result can
 *         hold the bound, otherwise the number of bytes required; 0 on error
 */
U_CAPI int32_t U_EXPORT2
ucol_getBound(const uint8_t       *source,
        int32_t             sourceLength,
        UColBoundMode       boundType,
        uint32_t            noOfLevels,
        uint8_t             *result,
        int32_t             resultLength,
        UErrorCode          *status) {
    // consistency checks
    if(status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(source == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t sourceIndex = 0;
    UBool reachedEnd = FALSE;
    // Scan the string until we skip enough of the key OR reach the end of the key.
    // As before, the byte at index 0 is never examined. The scan must stop at
    // the zero terminator and must not read at or beyond sourceLength.
    do {
        sourceIndex++;
        if(sourceLength > 0 && sourceIndex >= sourceLength) {
            // ran out of input before finding the terminator
            reachedEnd = TRUE;
            break;
        }
        if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
            noOfLevels--;
        }
        if(source[sourceIndex] == 0) {
            reachedEnd = TRUE;
        }
    } while(noOfLevels > 0 && !reachedEnd);

    if(reachedEnd && noOfLevels > 0) {
        *status = U_SORT_KEY_TOO_SHORT_WARNING;
    }

    // READ ME: this code assumes that the values for boundType
    // enum will not change. They are set so that the enum value
    // corresponds to the number of extra bytes each bound type
    // needs.
    // The "+ 1" accounts for the terminating zero byte written below;
    // without it the last write lands one byte past the buffer.
    if(result != NULL && resultLength >= sourceIndex + boundType + 1) {
        uprv_memcpy(result, source, sourceIndex);
        switch(boundType) {
            // Lower bound just gets terminated. No extra bytes
        case UCOL_BOUND_LOWER: // = 0
            break;
            // Upper bound needs one extra byte
        case UCOL_BOUND_UPPER: // = 1
            result[sourceIndex++] = 2;
            break;
            // Upper long bound needs two extra bytes
        case UCOL_BOUND_UPPER_LONG: // = 2
            result[sourceIndex++] = 0xFF;
            result[sourceIndex++] = 0xFF;
            break;
        default:
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        result[sourceIndex++] = 0;

        return sourceIndex;
    } else {
        // preflighting or insufficient capacity: report the size needed
        return sourceIndex + boundType + 1;
    }
}
2001-10-22 05:30:22 +00:00
static
2001-06-03 23:40:41 +00:00
inline void uprv_appendByteToHexString ( char * dst , uint8_t val ) {
2001-08-28 18:53:23 +00:00
uint32_t len = ( uint32_t ) uprv_strlen ( dst ) ;
2001-06-03 23:40:41 +00:00
* ( dst + len ) = T_CString_itosOffset ( ( val > > 4 ) ) ;
* ( dst + len + 1 ) = T_CString_itosOffset ( ( val & 0xF ) ) ;
* ( dst + len + 2 ) = 0 ;
}
2001-03-20 00:56:37 +00:00
/* this function makes a string with representation of a sortkey */
2001-11-21 01:08:55 +00:00
U_CAPI char * U_EXPORT2 ucol_sortKeyToString ( const UCollator * coll , const uint8_t * sortkey , char * buffer , uint32_t * len ) {
2001-03-22 21:16:20 +00:00
int32_t strength = UCOL_PRIMARY ;
2001-03-20 00:56:37 +00:00
uint32_t res_size = 0 ;
UBool doneCase = FALSE ;
char * current = buffer ;
const uint8_t * currentSk = sortkey ;
2001-06-03 23:40:41 +00:00
uprv_strcpy ( current , " [ " ) ;
2001-03-20 00:56:37 +00:00
while ( strength < = UCOL_QUATERNARY & & strength < = coll - > strength ) {
if ( strength > UCOL_PRIMARY ) {
2001-06-03 23:40:41 +00:00
strcat ( current , " . " ) ;
2001-03-20 00:56:37 +00:00
}
while ( * currentSk ! = 0x01 & & * currentSk ! = 0x00 ) { /* print a level */
2001-06-03 23:40:41 +00:00
uprv_appendByteToHexString ( current , * currentSk + + ) ;
uprv_strcat ( current , " " ) ;
2001-03-20 00:56:37 +00:00
}
if ( coll - > caseLevel = = UCOL_ON & & strength = = UCOL_SECONDARY & & doneCase = = FALSE ) {
doneCase = TRUE ;
} else if ( coll - > caseLevel = = UCOL_OFF | | doneCase = = TRUE | | strength ! = UCOL_SECONDARY ) {
strength + + ;
2001-04-23 01:53:49 +00:00
}
2001-06-03 23:40:41 +00:00
uprv_appendByteToHexString ( current , * currentSk + + ) ; /* This should print '01' */
2001-03-20 00:56:37 +00:00
if ( strength = = UCOL_QUATERNARY & & coll - > alternateHandling = = UCOL_NON_IGNORABLE ) {
break ;
}
}
if ( coll - > strength = = UCOL_IDENTICAL ) {
2001-06-03 23:40:41 +00:00
uprv_strcat ( current , " . " ) ;
2001-03-20 00:56:37 +00:00
while ( * currentSk ! = 0 ) {
2001-06-03 23:40:41 +00:00
uprv_appendByteToHexString ( current , * currentSk + + ) ;
uprv_strcat ( current , " " ) ;
2001-03-20 00:56:37 +00:00
}
2001-06-03 23:40:41 +00:00
uprv_appendByteToHexString ( current , * currentSk + + ) ;
2001-03-20 00:56:37 +00:00
}
2001-06-03 23:40:41 +00:00
uprv_strcat ( current , " ] " ) ;
2001-03-20 00:56:37 +00:00
2001-03-22 21:16:20 +00:00
if ( res_size > * len ) {
return NULL ;
}
2001-03-20 00:56:37 +00:00
return buffer ;
}
2001-01-16 00:28:40 +00:00
/****************************************************************************/
/* Following are the functions that deal with the properties of a collator */
/* there are new APIs and some compatibility APIs */
/****************************************************************************/
2002-09-04 06:02:13 +00:00
/*
 * Packs the weights of one collation element into the Latin-1 fast path table.
 *
 * coll->latinOneCEs holds three consecutive sub-tables of latinOneTableLen
 * entries each (primary, secondary, tertiary). The non-zero weight bytes of CE
 * are OR-ed into the entry for ch at the bit position given by the matching
 * shift counter, which is then lowered by 8 for the next weight byte.
 *
 * If a second primary byte has to be stored but primShift is already negative,
 * all three entries for ch are set to UCOL_BAIL_OUT_CE and the function returns.
 * The other shift counters are not checked here; callers in this file test
 * them before calling.
 *
 * The weight bytes are widened to uint32_t before shifting so that shifting a
 * byte >= 0x80 by 24 bits does not overflow a signed int.
 */
static inline void
ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
                      int32_t *primShift, int32_t *secShift, int32_t *terShift) {
    uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
    UBool reverseSecondary = FALSE;
    if(!isContinuation(CE)) {
        tertiary = (uint8_t)((CE & coll->tertiaryMask));
        tertiary ^= coll->caseSwitch;
        reverseSecondary = TRUE;
    } else {
        tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
        tertiary &= UCOL_REMOVE_CASE;
        reverseSecondary = FALSE;
    }

    // peel the remaining weight bytes off, lowest byte first
    secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
    primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
    primary1 = (uint8_t)(CE >> 8);

    if(primary1 != 0) {
        coll->latinOneCEs[ch] |= ((uint32_t)primary1 << *primShift);
        *primShift -= 8;
    }
    if(primary2 != 0) {
        if(*primShift < 0) {
            // no room left for another primary byte
            coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
            coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
            coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
            return;
        }
        coll->latinOneCEs[ch] |= ((uint32_t)primary2 << *primShift);
        *primShift -= 8;
    }
    if(secondary != 0) {
        if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
            coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
            coll->latinOneCEs[coll->latinOneTableLen+ch] |= ((uint32_t)secondary << 24);
        } else { // normal case
            coll->latinOneCEs[coll->latinOneTableLen+ch] |= ((uint32_t)secondary << *secShift);
        }
        *secShift -= 8;
    }
    if(tertiary != 0) {
        coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= ((uint32_t)tertiary << *terShift);
        *terShift -= 8;
    }
}
/*
 * Replaces coll->latinOneCEs with a zero-filled table of three sub-tables of
 * `size` entries each, carrying over the leading entries of every old
 * sub-table (as many as fit in the smaller of the old and new lengths).
 *
 * Returns TRUE on success. On allocation failure sets *status to
 * U_MEMORY_ALLOCATION_ERROR, marks coll->latinOneFailed and returns FALSE,
 * leaving the old table in place.
 */
static inline UBool
ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
    uint32_t *resized = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
    if(resized == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        coll->latinOneFailed = TRUE;
        return FALSE;
    }

    const int32_t oldLen = coll->latinOneTableLen;
    int32_t bytesPerLevel = ((size < oldLen) ? size : oldLen)*sizeof(uint32_t);

    uprv_memset(resized, 0, size*sizeof(uint32_t)*3);
    // primary, secondary and tertiary sub-tables, in that order
    for(int32_t level = 0; level < 3; level++) {
        uprv_memcpy(resized + level*size, coll->latinOneCEs + level*oldLen, bytesPerLevel);
    }

    coll->latinOneTableLen = size;
    uprv_free(coll->latinOneCEs);
    coll->latinOneCEs = resized;
    return TRUE;
}
/*
 * Builds the Latin-1 fast path table (coll->latinOneCEs) for the collator.
 *
 * For every code unit up to UCOL_ENDOFLATINONERANGE the collation element is
 * looked up and its weights are packed with ucol_addLatinOneEntry. Expansions
 * are walked with a collation element iterator. Contractions get their
 * results stored in extra slots appended after the Latin-1 range; the table
 * is doubled when those slots run out and trimmed to the used size at the end.
 *
 * Returns TRUE if the table can be used. Returns FALSE if
 *  - memory allocation or opening the iterator fails,
 *  - a contraction CE already has bits set in the field used for the slot
 *    index (*status = U_UNSUPPORTED_ERROR),
 *  - a special CE other than an expansion or contraction is met
 *    (coll->latinOneFailed is set in that case).
 *
 * The iterator is closed on every exit path once it has been opened.
 */
UBool ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
    UBool result = TRUE;
    if(coll->latinOneCEs == NULL) {
        coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
        if(coll->latinOneCEs == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return FALSE;
        }
        coll->latinOneTableLen = UCOL_LATINONETABLELEN;
    }
    UChar ch = 0;
    UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
    if(it == NULL || U_FAILURE(*status)) {
        // without an iterator expansions cannot be processed
        if(it != NULL) {
            ucol_closeElements(it);
        }
        return FALSE;
    }
    uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);

    int32_t primShift = 24, secShift = 24, terShift = 24;
    uint32_t CE = 0;
    // first free slot for contraction results, right after the Latin-1 range
    int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;

    // TODO: make safe if you get more than you wanted...
    for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
        primShift = 24; secShift = 24; terShift = 24;
        if(ch < 0x100) {
            CE = coll->latinOneMapping[ch];
        } else {
            CE = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
            if(CE == UCOL_NOT_FOUND) {
                CE = UTRIE_GET32_FROM_LEAD(UCA->mapping, ch);
            }
        }
        if(CE < UCOL_NOT_FOUND) {
            ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
        } else {
            switch(getCETag(CE)) {
            case EXPANSION_TAG:
                ucol_setText(it, &ch, 1, status);
                while((CE = ucol_next(it, status)) != UCOL_NULLORDER) {
                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                        // the expansion does not fit into one table entry
                        coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        break;
                    }
                    ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                }
                break;
            case CONTRACTION_TAG:
                // here is the trick
                // F2 is contraction. We do something very similar to contractions
                // but have two indices, one in the real contraction table and the
                // other to where we stuffed things. This hopes that we don't have
                // many contractions (this should work for latin-1 tables).
                {
                    if((CE & 0x00FFF000) != 0) {
                        *status = U_UNSUPPORTED_ERROR;
                        ucol_closeElements(it);
                        return FALSE;
                    }

                    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);

                    CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

                    coll->latinOneCEs[ch] = CE;
                    coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
                    coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;

                    // We're going to jump into contraction table, pick the elements
                    // and use them
                    do {
                        CE = *(coll->contractionCEs +
                            (UCharOffset - coll->contractionIndex));
                        if(getCETag(CE) == EXPANSION_TAG) {
                            uint32_t size;
                            uint32_t i;    /* general counter */
                            uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
                            size = getExpansionCount(CE);
                            if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
                                for(i = 0; i < size; i++) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            } else { /* else, we do */
                                while(*CEOffset != 0) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            }
                            contractionOffset++;
                        } else if(CE < UCOL_NOT_FOUND) {
                            ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
                        } else {
                            coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            contractionOffset++;
                        }
                        UCharOffset++;
                        primShift = 24; secShift = 24; terShift = 24;
                        if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
                            if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
                                ucol_closeElements(it);
                                return FALSE;
                            }
                        }
                    } while(*UCharOffset != 0xFFFF);
                }
                break;
            default:
                coll->latinOneFailed = TRUE;
                result = FALSE;
                break;
            }
        }
    }
    ucol_closeElements(it);
    // compact table
    if(contractionOffset < coll->latinOneTableLen) {
        if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
            return FALSE;
        }
    }
    return result;
}
/*
 * Recomputes the settings derived from the collator attributes: the case
 * switch, the tertiary mask/common/top/bottom values and their compression
 * counts, the sort key generator and whether the Latin-1 fast path table is
 * used (building it when it is missing or flagged for regeneration).
 * Does nothing unless *status indicates success.
 */
void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
    if(!U_SUCCESS(*status)) {
        return;
    }

    const UBool upperFirst = (coll->caseFirst == UCOL_UPPER_FIRST);

    if(upperFirst) {
        coll->caseSwitch = UCOL_CASE_SWITCH;
    } else {
        coll->caseSwitch = UCOL_NO_CASE_SWITCH;
    }

    if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
        // case bits are stripped from the tertiary weight
        coll->tertiaryMask = UCOL_REMOVE_CASE;
        coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF;
        coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
        coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
        coll->tertiaryBottom = UCOL_COMMON_BOT3;
    } else if(upperFirst) {
        coll->tertiaryMask = UCOL_KEEP_CASE;
        coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
        coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
        coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
        coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
    } else {
        coll->tertiaryMask = UCOL_KEEP_CASE;
        coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
        coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
        coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
        coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
    }

    /* Set the compression values */
    uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
    /* we multiply double with int, but need only int */
    coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal);
    coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);

    // the simple generator only covers plain tertiary keys
    if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
        && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) {
        coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
    } else {
        coll->sortKeyGen = ucol_calcSortKey;
    }

    const UBool latinOneApplies =
        coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY
        && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed;

    if(!latinOneApplies) {
        coll->latinOneUse = FALSE;
    } else if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
        // if we succeed in building latin1 table, we'll use it
        if(ucol_setUpLatinOne(coll, status)) {
            coll->latinOneUse = TRUE;
        } else {
            coll->latinOneUse = FALSE;
        }
    } else {
        // latin1Table exists and it doesn't need to be regenerated, just use it
        coll->latinOneUse = TRUE;
    }
}
2001-01-16 00:28:40 +00:00
2001-11-21 01:08:55 +00:00
/*
 * Sets the variable top of the collator from a string.
 *
 * varTop/len must map to exactly one collation element (a single character or
 * a contraction); a len of -1 means varTop is NUL-terminated.
 * Errors reported through status:
 *   U_ILLEGAL_ARGUMENT_ERROR  - len is 0
 *   U_CE_NOT_FOUND_ERROR      - no CE, or the string was not fully consumed
 *   U_PRIMARY_TOO_LONG_ERROR  - the CE is followed by a continuation that
 *                               carries primary weight
 * Returns the masked primary weight that was stored, or 0 on error.
 */
U_CAPI uint32_t U_EXPORT2
ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
    if(U_FAILURE(*status) || coll == NULL) {
        return 0;
    }
    if(len == -1) {
        len = u_strlen(varTop);
    }
    if(len == 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    collIterate s;
    IInit_collIterate(coll, varTop, len, &s);

    const uint32_t firstCE = ucol_IGetNextCE(coll, &s, status);

    /* here we check if we have consumed all characters */
    /* you can put in either one character or a contraction */
    /* you shouldn't put more... */
    if(s.pos != s.endp || firstCE == UCOL_NO_MORE_CES) {
        *status = U_CE_NOT_FOUND_ERROR;
        return 0;
    }

    const uint32_t followingCE = ucol_IGetNextCE(coll, &s, status);
    if(isContinuation(followingCE) && (followingCE & UCOL_PRIMARYMASK) != 0) {
        *status = U_PRIMARY_TOO_LONG_ERROR;
        return 0;
    }

    const uint32_t primary = firstCE & UCOL_PRIMARYMASK;
    coll->variableTopValue = primary >> 16;
    return primary;
}
2002-06-29 09:32:36 +00:00
/*
 * Returns the stored variable top value shifted left by 16 bits,
 * or 0 if status already signals failure or coll is NULL.
 */
U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
    uint32_t variableTop = 0;
    if(!(U_FAILURE(*status) || coll == NULL)) {
        variableTop = coll->variableTopValue<<16;
    }
    return variableTop;
}
2001-06-25 04:01:49 +00:00
2001-11-21 01:08:55 +00:00
/*
 * Stores the primary part of varTop (masked with UCOL_PRIMARYMASK and shifted
 * right by 16 bits) as the collator's variable top.
 * Does nothing if status already signals failure or coll is NULL.
 */
U_CAPI void U_EXPORT2
ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
    if(!(U_FAILURE(*status) || coll == NULL)) {
        coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
    }
}
2001-01-16 00:28:40 +00:00
/* Attribute setter API */
2001-11-21 01:08:55 +00:00
/*
 * Sets one collator attribute.
 *
 * For every attribute an explicit value is stored and the matching
 * "isDefault" flag cleared, while UCOL_DEFAULT restores the value from
 * coll->options and sets the flag. Any other value, or an unknown attribute,
 * sets *status to U_ILLEGAL_ARGUMENT_ERROR.
 *
 * Afterwards the Latin-1 table is flagged for regeneration if French
 * collation or case-first changed, and the derived state is refreshed through
 * ucol_updateInternalState.
 * NOTE(review): latinOneRegenTable is overwritten on every call, so a request
 * raised by an earlier call is dropped if the table was not rebuilt at that
 * time -- confirm this is intended.
 */
U_CAPI void U_EXPORT2
ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
    if(U_FAILURE(*status) || coll == NULL) {
        return;
    }

    // remembered to detect changes that invalidate the Latin-1 table
    const UColAttributeValue frenchBefore = coll->frenchCollation;
    const UColAttributeValue caseFirstBefore = coll->caseFirst;

    switch(attr) {
    case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
        if(value == UCOL_ON || value == UCOL_OFF) {
            coll->hiraganaQ = value;
            coll->hiraganaQisDefault = FALSE;
        } else if(value == UCOL_DEFAULT) {
            coll->hiraganaQisDefault = TRUE;
            coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
        if(value == UCOL_ON || value == UCOL_OFF) {
            coll->frenchCollation = value;
            coll->frenchCollationisDefault = FALSE;
        } else if(value == UCOL_DEFAULT) {
            coll->frenchCollationisDefault = TRUE;
            coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
        if(value == UCOL_SHIFTED || value == UCOL_NON_IGNORABLE) {
            coll->alternateHandling = value;
            coll->alternateHandlingisDefault = FALSE;
        } else if(value == UCOL_DEFAULT) {
            coll->alternateHandlingisDefault = TRUE;
            coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
        if(value == UCOL_LOWER_FIRST || value == UCOL_UPPER_FIRST || value == UCOL_OFF) {
            coll->caseFirst = value;
            coll->caseFirstisDefault = FALSE;
        } else if(value == UCOL_DEFAULT) {
            coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
            coll->caseFirstisDefault = TRUE;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_CASE_LEVEL: /* do we have an extra case level */
        if(value == UCOL_ON || value == UCOL_OFF) {
            coll->caseLevel = value;
            coll->caseLevelisDefault = FALSE;
        } else if(value == UCOL_DEFAULT) {
            coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
            coll->caseLevelisDefault = TRUE;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
        if(value == UCOL_ON || value == UCOL_OFF) {
            coll->normalizationMode = value;
            coll->normalizationModeisDefault = FALSE;
        } else if(value == UCOL_DEFAULT) {
            coll->normalizationModeisDefault = TRUE;
            coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_STRENGTH: /* attribute for strength */
        if(value == UCOL_DEFAULT) {
            coll->strengthisDefault = TRUE;
            coll->strength = (UColAttributeValue)coll->options->strength;
        } else if(value <= UCOL_IDENTICAL) {
            coll->strengthisDefault = FALSE;
            coll->strength = value;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_ATTRIBUTE_COUNT:
    default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        break;
    }

    const UBool tableIsStale =
        (frenchBefore != coll->frenchCollation || caseFirstBefore != coll->caseFirst);
    coll->latinOneRegenTable = tableIsStale ? TRUE : FALSE;

    ucol_updateInternalState(coll, status);
}
2001-11-21 01:08:55 +00:00
/*
 * Returns the current value of a collator attribute.
 * Returns UCOL_DEFAULT if status already signals failure, if coll is NULL, or
 * if attr is not a known attribute (in which case *status is set to
 * U_ILLEGAL_ARGUMENT_ERROR).
 */
U_CAPI UColAttributeValue U_EXPORT2
ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
    UColAttributeValue current = UCOL_DEFAULT;

    if(U_FAILURE(*status) || coll == NULL) {
        return current;
    }

    switch(attr) {
    case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
        current = coll->hiraganaQ;
        break;
    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
        current = coll->frenchCollation;
        break;
    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
        current = coll->alternateHandling;
        break;
    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
        current = coll->caseFirst;
        break;
    case UCOL_CASE_LEVEL: /* do we have an extra case level */
        current = coll->caseLevel;
        break;
    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
        current = coll->normalizationMode;
        break;
    case UCOL_STRENGTH: /* attribute for strength */
        current = coll->strength;
        break;
    case UCOL_ATTRIBUTE_COUNT:
    default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        break;
    }
    return current;
}
2002-08-21 19:12:24 +00:00
# ifdef U_USE_DEPRECATED_UCOL_API
2001-09-22 01:24:15 +00:00
// deprecated
2001-11-21 01:08:55 +00:00
/*
 * Compatibility setter: maps UNORM_NONE to normalization mode off and
 * UNORM_NFD to normalization mode on. Any other mode is ignored, and errors
 * from ucol_setAttribute are discarded because this API has no way to
 * report them.
 */
U_CAPI void U_EXPORT2
ucol_setNormalization(  UCollator            *coll,
            UNormalizationMode    mode)
{
    UErrorCode ignored = U_ZERO_ERROR;

    if(mode == UNORM_NONE) {
        ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_OFF, &ignored);
    } else if(mode == UNORM_NFD) {
        ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &ignored);
    }
    /* Shouldn't get here with any other mode. */
    /* This is quite a bad API: nothing can be reported to the caller. */
}
2001-09-22 01:24:15 +00:00
// deprecated
2001-11-21 01:08:55 +00:00
/*
 * Compatibility getter: returns UNORM_NFD if the normalization mode attribute
 * is UCOL_ON and UNORM_NONE otherwise.
 */
U_CAPI UNormalizationMode U_EXPORT2
ucol_getNormalization(const UCollator* coll)
{
    UErrorCode ignored = U_ZERO_ERROR;
    const UColAttributeValue mode = ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, &ignored);
    return (mode == UCOL_ON) ? UNORM_NFD : UNORM_NONE;
}
2002-08-21 19:12:24 +00:00
# endif
2001-01-16 00:28:40 +00:00
2001-11-21 01:08:55 +00:00
/*
 * Convenience wrapper that sets the UCOL_STRENGTH attribute.
 * Errors from ucol_setAttribute are discarded.
 */
U_CAPI void U_EXPORT2
ucol_setStrength(    UCollator                *coll,
            UCollationStrength        strength)
{
    UErrorCode ignored = U_ZERO_ERROR;
    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &ignored);
}
2001-11-21 01:08:55 +00:00
/*
 * Convenience wrapper that returns the UCOL_STRENGTH attribute.
 * Errors from ucol_getAttribute are discarded.
 */
U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator *coll)
{
    UErrorCode ignored = U_ZERO_ERROR;
    return ucol_getAttribute(coll, UCOL_STRENGTH, &ignored);
}
/****************************************************************************/
/* Following are misc functions */
/* there are new APIs and some compatibility APIs */
/****************************************************************************/
2001-11-21 01:08:55 +00:00
/*
 * Clones a collator, into caller-provided memory when possible.
 *
 * stackBuffer is first advanced to the next aligned address, and *pBufferSize
 * is reduced by the bytes skipped. If the remaining *pBufferSize is <= 0 the
 * call is a preflighting request: the required size is written to
 * *pBufferSize and NULL is returned.
 *
 * If the buffer is missing or too small, a new collator is opened from the
 * rules, normalization mode and strength of coll, and a successful status is
 * replaced by U_SAFECLONE_ALLOCATED_WARNING. Otherwise the UCollator struct is
 * copied byte for byte into the buffer and marked as not to be freed on close.
 *
 * Returns the clone, or NULL on error or preflighting.
 */
U_CAPI UCollator* U_EXPORT2
ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
{
    const int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
    char *alignedBuffer = (char *)stackBuffer;

    if(status == NULL || U_FAILURE(*status)) {
        return NULL;
    }
    if(!pBufferSize || !coll) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* Pointers on 64-bit platforms need to be aligned
     * on a 64-bit boundary in memory.
     */
    if(U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
        int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(alignedBuffer);
        *pBufferSize -= offsetUp;
        alignedBuffer += offsetUp;
    }
    stackBuffer = (void *)alignedBuffer;

    if(*pBufferSize <= 0) {
        /* 'preflighting' request - set needed size into *pBufferSize */
        *pBufferSize = bufferSizeNeeded;
        return NULL;
    }

    if(stackBuffer != NULL && *pBufferSize >= bufferSizeNeeded) {
        /* the caller's buffer is big enough: copy the struct in place */
        UCollator *inPlace = (UCollator *)stackBuffer;
        memcpy(inPlace, coll, sizeof(UCollator));
        inPlace->freeOnClose = FALSE;
        return inPlace;
    }

    /* allocate one here...*/
    int32_t rulesLength;
    const UChar *rules = ucol_getRules(coll, &rulesLength);
    const UColAttributeValue normalization = ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status);
    const UCollationStrength strength = ucol_getStrength(coll);

    UCollator *allocated = ucol_openRules(rules,
                                          rulesLength,
                                          normalization,
                                          strength,
                                          NULL,
                                          status);
    if(U_SUCCESS(*status)) {
        *status = U_SAFECLONE_ALLOCATED_WARNING;
    }
    return allocated;
}
2001-11-21 01:08:55 +00:00
/*
 * Copies the collator rules into buffer.
 *
 * With UCOL_FULL_RULES the UCA rules fetched from the collator's resource
 * bundle are written first, followed by the collator's own rules; otherwise
 * only the collator's own rules are written. Copies are truncated to
 * bufferLen. Nothing is copied if buffer is NULL or bufferLen is not positive.
 *
 * Returns the result of u_terminateUChars for the combined rule length, or 0
 * if the UCA rules could not be fetched.
 */
U_CAPI int32_t U_EXPORT2
ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t tailoringLen = 0;
    int32_t ucaLen = 0;
    const UChar *ucaRules = 0;
    const UChar *tailoring = ucol_getRules(coll, &tailoringLen);

    if(delta == UCOL_FULL_RULES) {
        /* take the UCA rules and append real rules at the end */
        /* UCA rules will be probably coming from the root RB */
        ucaRules = ures_getStringByKey(coll->rb, " %%UCARULES ", &ucaLen, &status);
        if(U_FAILURE(status)) {
            return 0;
        }
    }

    if(buffer != 0 && bufferLen > 0) {
        *buffer = 0;
        if(ucaLen > 0) {
            u_memcpy(buffer, ucaRules, uprv_min(ucaLen, bufferLen));
        }
        /* the tailoring goes right behind the UCA rules, if any room is left */
        if(tailoringLen > 0 && bufferLen > ucaLen) {
            u_memcpy(buffer+ucaLen, tailoring, uprv_min(tailoringLen, bufferLen-ucaLen));
        }
    }
    return u_terminateUChars(buffer, bufferLen, tailoringLen+ucaLen, &status);
}
2001-09-27 01:01:30 +00:00
/* Empty, zero-terminated UChar string; ucol_getRules returns its address */
/* (with a length of 0) when a collator has no rule text available.      */
static const UChar _NUL = 0 ;
2001-11-21 01:08:55 +00:00
/**
 * Returns the tailoring rule text of a collator.
 *
 * If the collator already carries a rule string it is returned directly.
 * Otherwise the rules are looked up in the collator's resource bundle
 * (" CollationElements " / " Sequence ") and, when found, cached on the
 * collator (a "semantic const" write through the const pointer) so that
 * later calls take the fast path.
 *
 * @param coll   collator to query
 * @param length receives the number of UChars in the returned string
 * @return the rule text; never NULL. When no rules are available - no
 *         resource bundle, or either resource lookup fails - a pointer to an
 *         empty string is returned and *length is set to 0.
 */
U_CAPI const UChar* U_EXPORT2
ucol_getRules(const UCollator *coll,
              int32_t *length)
{
    if (coll->rules != NULL) {
        *length = coll->rulesLength;
        return coll->rules;
    }

    if (coll->rb != NULL) {
        UErrorCode status = U_ZERO_ERROR;
        UResourceBundle *collElem = ures_getByKey(coll->rb, " CollationElements ", NULL, &status);
        if (U_SUCCESS(status)) {
            int32_t ruleLength = 0;
            const UChar *ruleText = ures_getStringByKey(collElem, " Sequence ", &ruleLength, &status);
            ures_close(collElem);
            /* Only cache and hand out the string if the lookup really      */
            /* succeeded; otherwise fall through to the empty-string result */
            /* instead of returning NULL with an unspecified length.        */
            if (U_SUCCESS(status) && ruleText != NULL) {
                UCollator *writableColl = (UCollator *)coll; /* Semantic const */
                writableColl->rules = ruleText;
                writableColl->rulesLength = ruleLength;
                writableColl->freeRulesOnClose = FALSE;
                *length = ruleLength;
                return ruleText;
            }
        }
    }

    *length = 0;
    return &_NUL;
}
2001-11-21 01:08:55 +00:00
/**
 * Produces the display name of the collator for objLoc, localized for
 * dispLoc, by delegating to Collator::getDisplayName.
 *
 * @param objLoc       locale whose collator name is wanted
 * @param dispLoc      locale in which the name should be expressed
 * @param result       destination buffer; NULL together with
 *                     resultLength == 0 requests pure preflighting
 * @param resultLength capacity of result in UChars
 * @param status       in/out error code; nothing is done if it already
 *                     indicates a failure
 * @return the value of UnicodeString::extract for the name, or -1 if
 *         *status indicated a failure on entry
 */
U_CAPI int32_t U_EXPORT2
ucol_getDisplayName(const char *objLoc,
                    const char *dispLoc,
                    UChar *result,
                    int32_t resultLength,
                    UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return -1;
    }

    UnicodeString name;
    const UBool preflightOnly = (UBool)(result == NULL && resultLength == 0);
    if (!preflightOnly) {
        // Alias the caller's buffer so the name is written in place;
        // for pure preflighting the string stays an empty dummy.
        name.setTo(result, 0, resultLength);
    }

    Collator::getDisplayName(Locale(objLoc), Locale(dispLoc), name);
    return name.extract(result, resultLength, *status);
}
2001-11-21 01:08:55 +00:00
/**
 * Returns the name of the index-th available locale.
 * The answer is taken directly from uloc_getAvailable.
 *
 * @param index position in the list of available locales
 * @return whatever uloc_getAvailable returns for that index
 */
U_CAPI const char* U_EXPORT2
ucol_getAvailable(int32_t index)
{
    const char *localeName = uloc_getAvailable(index);
    return localeName;
}
2001-11-21 01:08:55 +00:00
/**
 * Returns the number of available locales.
 * The answer is taken directly from uloc_countAvailable.
 */
U_CAPI int32_t U_EXPORT2
ucol_countAvailable()
{
    const int32_t localeCount = uloc_countAvailable();
    return localeCount;
}
2001-11-21 01:08:55 +00:00
/**
 * Fills versionInfo with the version of the given collator.
 *
 * Layout of the four bytes written:
 *   [0],[1]  high and low byte of a 16-bit value combining the runtime
 *            version (shifted left by 11), the builder version taken from
 *            the collator image (shifted left by 6) and a charset version
 *            (currently always 0)
 *   [2]      second version byte of the collator image
 *   [3]      first UCAVersion byte of the UCA image
 *
 * @param coll        collator to query
 * @param versionInfo receives the four version bytes
 */
U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator *coll,
                UVersionInfo versionInfo)
{
    /* RunTime version */
    const uint8_t runtimeVer = UCOL_RUNTIME_VERSION;
    /* Builder version */
    const uint8_t builderVer = coll->image->version[0];
    /* Charset version. Need to get the version from cnv files:       */
    /* makeconv should populate cnv files with a version and an API   */
    /* has to be provided in ucnv.h to obtain it. Until then it is 0. */
    const uint8_t charsetVer = 0;

    /* combine the version info */
    const uint16_t combined =
        (uint16_t)((runtimeVer << 11) | (builderVer << 6) | (charsetVer));

    versionInfo[0] = (uint8_t)(combined >> 8);
    versionInfo[1] = (uint8_t)combined;
    versionInfo[2] = coll->image->version[1];
    versionInfo[3] = UCA->image->UCAVersion[0];
}
2001-02-26 23:52:44 +00:00
2001-01-16 00:28:40 +00:00
2001-03-22 18:45:31 +00:00
/* This internal API checks whether a character is tailored or not */
2001-11-21 01:08:55 +00:00
U_CAPI UBool U_EXPORT2
2002-07-16 01:46:42 +00:00
ucol_isTailored ( const UCollator * coll , const UChar u , UErrorCode * status ) {
2001-03-22 18:45:31 +00:00
uint32_t CE = UCOL_NOT_FOUND ;
const UChar * ContractionStart = NULL ;
if ( U_SUCCESS ( * status ) & & coll ! = NULL ) {
if ( coll = = UCA ) {
return FALSE ;
} else if ( u < 0x100 ) { /* latin-1 */
CE = coll - > latinOneMapping [ u ] ;
if ( CE = = UCA - > latinOneMapping [ u ] ) {
return FALSE ;
2001-04-23 01:53:49 +00:00
}
2001-03-22 18:45:31 +00:00
} else { /* regular */
2001-12-19 07:00:45 +00:00
/*CE = ucmpe32_get(coll->mapping, u);*/
CE = UTRIE_GET32_FROM_LEAD ( coll - > mapping , u ) ;
2001-03-22 18:45:31 +00:00
}
if ( isContraction ( CE ) ) {
ContractionStart = ( UChar * ) coll - > image + getContractOffset ( CE ) ;
CE = * ( coll - > contractionCEs + ( ContractionStart - coll - > contractionIndex ) ) ;
}
if ( CE = = UCOL_NOT_FOUND ) {
return FALSE ;
} else {
return TRUE ;
}
} else {
return FALSE ;
}
}
2001-03-03 03:35:17 +00:00
2001-01-16 00:28:40 +00:00
/****************************************************************************/
/* Following are the string compare functions */
/* */
/****************************************************************************/
2001-04-06 23:37:48 +00:00
/* ucol_checkIdent internal function. Does byte level string compare. */
/* Used by strcoll if strength == identical and strings */
/* are otherwise equal. Moved out-of-line because this */
/* is a rare case. */
/* */
/* Comparison must be done on NFD normalized strings. */
/* FCD is not good enough. */
/* */
/* TODO: make an incremental NFD Comparison function, which could */
/* be of general use */
2001-10-20 01:09:31 +00:00
static
2003-01-20 07:43:32 +00:00
UCollationResult ucol_checkIdent ( collIterate * sColl , collIterate * tColl , UBool normalize , UErrorCode * status )
2001-04-06 23:37:48 +00:00
{
2001-04-23 01:53:49 +00:00
2003-01-20 07:43:32 +00:00
// TODO: When we have an UChar iterator, we need to access the whole string. One
// useful modification would be a UChar iterator extract API, since reset next next...
// is not optimal.
// TODO: Handle long strings. Do the same in compareUsingSortKeys.
2001-04-23 01:53:49 +00:00
2003-01-20 07:43:32 +00:00
// When we arrive here, we can have normal strings or UCharIterators. Currently they are both
// of same type, but that doesn't really mean that it will stay that way.
2003-03-17 21:20:36 +00:00
UAlignedMemory stackNormIter1 [ UNORM_ITER_SIZE / sizeof ( UAlignedMemory ) ] ;
UAlignedMemory stackNormIter2 [ UNORM_ITER_SIZE / sizeof ( UAlignedMemory ) ] ;
2003-02-06 23:29:56 +00:00
//UChar sStackBuf[256], tStackBuf[256];
//int32_t sBufSize = 256, tBufSize = 256;
2003-01-20 07:43:32 +00:00
int32_t comparison ;
int32_t sLen = 0 ;
UChar * sBuf = NULL ;
int32_t tLen = 0 ;
UChar * tBuf = NULL ;
UBool freeSBuf = FALSE , freeTBuf = FALSE ;
if ( sColl - > flags & UCOL_USE_ITERATOR ) {
2003-02-06 23:29:56 +00:00
UNormIterator * sNIt = NULL , * tNIt = NULL ;
2003-03-17 21:20:36 +00:00
sNIt = unorm_openIter ( stackNormIter1 , sizeof ( stackNormIter1 ) , status ) ;
tNIt = unorm_openIter ( stackNormIter2 , sizeof ( stackNormIter2 ) , status ) ;
2003-01-20 07:43:32 +00:00
sColl - > iterator - > move ( sColl - > iterator , 0 , UITER_START ) ;
tColl - > iterator - > move ( tColl - > iterator , 0 , UITER_START ) ;
2003-02-06 23:29:56 +00:00
UCharIterator * sIt = unorm_setIter ( sNIt , sColl - > iterator , UNORM_NFD , status ) ;
UCharIterator * tIt = unorm_setIter ( tNIt , tColl - > iterator , UNORM_NFD , status ) ;
comparison = u_strCompareIter ( sIt , tIt , TRUE ) ;
unorm_closeIter ( sNIt ) ;
unorm_closeIter ( tNIt ) ;
#if 0
2003-01-20 07:43:32 +00:00
sBuf = sStackBuf ;
UChar * sBufp = sBuf ;
tBuf = tStackBuf ;
UChar * tBufp = tBuf ;
while ( sColl - > iterator - > hasNext ( sColl - > iterator ) ) {
* sBufp + + = ( UChar ) sColl - > iterator - > next ( sColl - > iterator ) ;
if ( sBufp - sBuf = = sBufSize ) {
int32_t sSize = sColl - > iterator - > getIndex ( sColl - > iterator , UITER_LENGTH ) ;
UChar * newBuf = ( UChar * ) uprv_malloc ( 2 * sSize * sizeof ( UChar ) ) ; // Two times bigger, for normalization.
if ( newBuf = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
return UCOL_LESS ;
}
uprv_memcpy ( newBuf , sBuf , sBufSize * sizeof ( UChar ) ) ;
sBufp = newBuf + sBufSize ;
sBuf = newBuf ;
freeSBuf = TRUE ;
}
}
while ( tColl - > iterator - > hasNext ( tColl - > iterator ) ) {
* tBufp + + = ( UChar ) tColl - > iterator - > next ( tColl - > iterator ) ;
if ( tBufp - tBuf = = tBufSize ) {
int32_t tSize = tColl - > iterator - > getIndex ( tColl - > iterator , UITER_LENGTH ) ;
UChar * newBuf = ( UChar * ) uprv_malloc ( 2 * tSize * sizeof ( UChar ) ) ; // Two times bigger, for normalization.
if ( newBuf = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
return UCOL_LESS ;
}
uprv_memcpy ( newBuf , tBuf , tBufSize * sizeof ( UChar ) ) ;
tBufp = newBuf + tBufSize ;
tBuf = newBuf ;
freeTBuf = TRUE ;
}
}
sLen = sBufp - sBuf ;
tLen = tBufp - tBuf ;
2003-02-06 23:29:56 +00:00
# endif
2003-01-20 07:43:32 +00:00
} else {
sLen = ( sColl - > flags & UCOL_ITER_HASLEN ) ? sColl - > endp - sColl - > string : - 1 ;
sBuf = sColl - > string ;
tLen = ( tColl - > flags & UCOL_ITER_HASLEN ) ? tColl - > endp - tColl - > string : - 1 ;
tBuf = tColl - > string ;
2003-02-20 01:06:06 +00:00
2003-02-06 23:29:56 +00:00
if ( normalize ) {
* status = U_ZERO_ERROR ;
if ( unorm_quickCheck ( sBuf , sLen , UNORM_NFD , status ) ! = UNORM_YES ) {
sLen = unorm_decompose ( sColl - > writableBuffer , ( int32_t ) sColl - > writableBufSize ,
sBuf , sLen ,
2003-02-15 02:02:13 +00:00
FALSE , 0 ,
2003-02-06 23:29:56 +00:00
status ) ;
if ( * status = = U_BUFFER_OVERFLOW_ERROR ) {
if ( ! u_growBufferFromStatic ( sColl - > stackWritableBuffer ,
& sColl - > writableBuffer ,
( int32_t * ) & sColl - > writableBufSize , sLen ,
0 )
) {
* status = U_MEMORY_ALLOCATION_ERROR ;
return UCOL_LESS ; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
}
* status = U_ZERO_ERROR ;
sLen = unorm_decompose ( sColl - > writableBuffer , ( int32_t ) sColl - > writableBufSize ,
sBuf , sLen ,
2003-02-15 02:02:13 +00:00
FALSE , 0 ,
2003-02-06 23:29:56 +00:00
status ) ;
}
if ( freeSBuf ) {
uprv_free ( sBuf ) ;
freeSBuf = FALSE ;
}
sBuf = sColl - > writableBuffer ;
if ( sBuf ! = sColl - > stackWritableBuffer ) {
sColl - > flags | = UCOL_ITER_ALLOCATED ;
}
}
2001-04-23 01:53:49 +00:00
2003-02-06 23:29:56 +00:00
* status = U_ZERO_ERROR ;
if ( unorm_quickCheck ( tBuf , tLen , UNORM_NFD , status ) ! = UNORM_YES ) {
tLen = unorm_decompose ( tColl - > writableBuffer , ( int32_t ) tColl - > writableBufSize ,
tBuf , tLen ,
2003-02-15 02:02:13 +00:00
FALSE , 0 ,
2003-02-06 23:29:56 +00:00
status ) ;
if ( * status = = U_BUFFER_OVERFLOW_ERROR ) {
if ( ! u_growBufferFromStatic ( tColl - > stackWritableBuffer ,
& tColl - > writableBuffer ,
( int32_t * ) & tColl - > writableBufSize , tLen ,
0 )
) {
* status = U_MEMORY_ALLOCATION_ERROR ;
return UCOL_LESS ; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
}
* status = U_ZERO_ERROR ;
tLen = unorm_decompose ( tColl - > writableBuffer , ( int32_t ) tColl - > writableBufSize ,
tBuf , tLen ,
2003-02-15 02:02:13 +00:00
FALSE , 0 ,
2003-02-06 23:29:56 +00:00
status ) ;
}
if ( freeTBuf ) {
uprv_free ( tBuf ) ;
freeTBuf = FALSE ;
}
tBuf = tColl - > writableBuffer ;
if ( tBuf ! = tColl - > stackWritableBuffer ) {
tColl - > flags | = UCOL_ITER_ALLOCATED ;
}
}
}
2001-04-23 01:53:49 +00:00
2003-02-06 23:29:56 +00:00
if ( sLen = = - 1 & & tLen = = - 1 ) {
comparison = u_strcmpCodePointOrder ( sBuf , tBuf ) ;
} else {
if ( sLen = = - 1 ) {
sLen = u_strlen ( sBuf ) ;
}
if ( tLen = = - 1 ) {
tLen = u_strlen ( tBuf ) ;
}
comparison = u_memcmpCodePointOrder ( sBuf , tBuf , uprv_min ( sLen , tLen ) ) ;
if ( comparison = = 0 ) {
comparison = sLen - tLen ;
}
}
2001-04-06 23:37:48 +00:00
}
2003-02-20 01:06:06 +00:00
2001-09-27 01:01:30 +00:00
if ( comparison < 0 ) {
return UCOL_LESS ;
} else if ( comparison = = 0 ) {
return UCOL_EQUAL ;
} else /* comparison > 0 */ {
return UCOL_GREATER ;
2001-04-12 19:59:28 +00:00
}
2001-04-06 23:37:48 +00:00
}
2001-04-18 19:31:05 +00:00
/* CEBuf - A struct and some inline functions to handle the saving */
/* of CEs in a buffer within ucol_strcoll */
/* Number of CEs that fit into the in-struct array before ucol_CEBuf_Expand */
/* has to move the contents to a heap block. */
# define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
/* Start of the CE storage: localArray after UCOL_INIT_CEBUF, a block */
/* from uprv_malloc once ucol_CEBuf_Expand has succeeded. */
uint32_t * buf ;
/* One past the last usable slot of buf. */
uint32_t * endp ;
/* Slot that the next UCOL_CEBUF_PUT writes to. */
uint32_t * pos ;
/* Initial storage, so short inputs need no allocation. */
uint32_t localArray [ UCOL_CEBUF_SIZE ] ;
} ucol_CEBuf ;
2001-04-06 23:37:48 +00:00
2001-04-18 19:31:05 +00:00
2001-10-22 05:30:22 +00:00
static
2001-04-18 19:31:05 +00:00
inline void UCOL_INIT_CEBUF ( ucol_CEBuf * b ) {
( b ) - > buf = ( b ) - > pos = ( b ) - > localArray ;
( b ) - > endp = ( b ) - > buf + UCOL_CEBUF_SIZE ;
} ;
2001-10-20 01:09:31 +00:00
static
2001-04-18 19:31:05 +00:00
void ucol_CEBuf_Expand ( ucol_CEBuf * b , collIterate * ci ) {
2001-04-06 23:37:48 +00:00
uint32_t oldSize ;
uint32_t newSize ;
uint32_t * newBuf ;
2001-04-18 19:31:05 +00:00
ci - > flags | = UCOL_ITER_ALLOCATED ;
2001-04-06 23:37:48 +00:00
oldSize = b - > pos - b - > buf ;
newSize = oldSize * 2 ;
newBuf = ( uint32_t * ) uprv_malloc ( newSize * sizeof ( uint32_t ) ) ;
2002-07-20 06:00:04 +00:00
if ( newBuf ! = NULL ) {
uprv_memcpy ( newBuf , b - > buf , oldSize * sizeof ( uint32_t ) ) ;
if ( b - > buf ! = b - > localArray ) {
uprv_free ( b - > buf ) ;
}
b - > buf = newBuf ;
b - > endp = b - > buf + newSize ;
b - > pos = b - > buf + oldSize ;
2001-04-12 19:59:28 +00:00
}
2001-04-06 23:37:48 +00:00
}
2001-10-22 05:30:22 +00:00
static
2001-05-17 23:09:35 +00:00
inline void UCOL_CEBUF_PUT ( ucol_CEBuf * b , uint32_t ce , collIterate * ci ) {
if ( b - > pos = = b - > endp ) {
ucol_CEBuf_Expand ( b , ci ) ;
2001-04-18 19:31:05 +00:00
}
* ( b ) - > pos + + = ce ;
} ;
2001-10-13 16:20:01 +00:00
/* This is a trick string compare function that goes in and uses sortkeys to compare */
/* It is used when compare gets in trouble and needs to bail out */
2003-01-20 07:43:32 +00:00
static UCollationResult ucol_compareUsingSortKeys ( collIterate * sColl ,
collIterate * tColl )
2001-10-13 16:20:01 +00:00
{
uint8_t sourceKey [ UCOL_MAX_BUFFER ] , targetKey [ UCOL_MAX_BUFFER ] ;
uint8_t * sourceKeyP = sourceKey ;
uint8_t * targetKeyP = targetKey ;
int32_t sourceKeyLen = UCOL_MAX_BUFFER , targetKeyLen = UCOL_MAX_BUFFER ;
2003-01-20 07:43:32 +00:00
const UCollator * coll = sColl - > coll ;
UChar * source = NULL ;
UChar * target = NULL ;
UChar sStackBuf [ 256 ] , tStackBuf [ 256 ] ;
int32_t sBufSize = 256 , tBufSize = 256 ;
int32_t sourceLength = ( sColl - > flags & UCOL_ITER_HASLEN ) ? ( sColl - > endp - sColl - > string ) : - 1 ;
int32_t targetLength = ( tColl - > flags & UCOL_ITER_HASLEN ) ? ( tColl - > endp - tColl - > string ) : - 1 ;
// TODO: Handle long strings. Do the same in ucol_checkIdent.
if ( sColl - > flags & UCOL_USE_ITERATOR ) {
sColl - > iterator - > move ( sColl - > iterator , 0 , UITER_START ) ;
tColl - > iterator - > move ( tColl - > iterator , 0 , UITER_START ) ;
source = sStackBuf ;
UChar * sBufp = source ;
target = tStackBuf ;
UChar * tBufp = target ;
while ( sColl - > iterator - > hasNext ( sColl - > iterator ) ) {
* sBufp + + = ( UChar ) sColl - > iterator - > next ( sColl - > iterator ) ;
}
while ( tColl - > iterator - > hasNext ( tColl - > iterator ) ) {
* tBufp + + = ( UChar ) tColl - > iterator - > next ( tColl - > iterator ) ;
}
sourceLength = sBufp - source ;
targetLength = tBufp - target ;
} else { // no iterators
sourceLength = ( sColl - > flags & UCOL_ITER_HASLEN ) ? ( sColl - > endp - sColl - > string ) : - 1 ;
targetLength = ( tColl - > flags & UCOL_ITER_HASLEN ) ? ( tColl - > endp - tColl - > string ) : - 1 ;
source = sColl - > string ;
target = tColl - > string ;
}
2001-10-13 16:20:01 +00:00
sourceKeyLen = ucol_getSortKey ( coll , source , sourceLength , sourceKeyP , sourceKeyLen ) ;
if ( sourceKeyLen > UCOL_MAX_BUFFER ) {
sourceKeyP = ( uint8_t * ) uprv_malloc ( sourceKeyLen * sizeof ( uint8_t ) ) ;
2002-07-20 06:00:04 +00:00
if ( sourceKeyP ! = NULL ) {
sourceKeyLen = ucol_getSortKey ( coll , source , sourceLength , sourceKeyP , sourceKeyLen ) ;
}
2001-10-13 16:20:01 +00:00
}
targetKeyLen = ucol_getSortKey ( coll , target , targetLength , targetKeyP , targetKeyLen ) ;
if ( targetKeyLen > UCOL_MAX_BUFFER ) {
targetKeyP = ( uint8_t * ) uprv_malloc ( targetKeyLen * sizeof ( uint8_t ) ) ;
2002-07-20 06:00:04 +00:00
if ( targetKeyP ! = NULL ) {
targetKeyLen = ucol_getSortKey ( coll , target , targetLength , targetKeyP , targetKeyLen ) ;
}
2001-10-13 16:20:01 +00:00
}
int32_t result = uprv_strcmp ( ( const char * ) sourceKeyP , ( const char * ) targetKeyP ) ;
if ( sourceKeyP ! = sourceKey ) {
uprv_free ( sourceKeyP ) ;
}
if ( targetKeyP ! = targetKey ) {
uprv_free ( targetKeyP ) ;
}
if ( result < 0 ) {
return UCOL_LESS ;
} else if ( result > 0 ) {
return UCOL_GREATER ;
} else {
return UCOL_EQUAL ;
}
}
2001-04-06 23:37:48 +00:00
2003-01-23 01:52:34 +00:00
/* ucol_strcollRegular - general, level by level comparison of two texts.      */
/*                                                                             */
/*   sColl, tColl  collation iterators over the source and the target text;   */
/*                 the collator and its attributes are taken from sColl->coll */
/*   status        error code handed on to ucol_IGetNextCE and ucol_checkIdent */
/*                                                                             */
/* Returns UCOL_LESS, UCOL_EQUAL or UCOL_GREATER.                              */
/*                                                                             */
/* The primary level is decided while the CEs are fetched; every fetched CE    */
/* is also stored in a CE buffer (sCEs / tCEs). The following levels           */
/* (secondary - forwards or French/backwards -, case, tertiary, quaternary     */
/* for shifted, Hiragana, identical) only re-read those buffers. All exits     */
/* after the buffers are set up go through commonReturn, which releases heap   */
/* memory. If Hiragana quaternary and shifted are both on, the whole           */
/* comparison is delegated to ucol_compareUsingSortKeys.                       */
static inline UCollationResult
2003-01-20 07:43:32 +00:00
ucol_strcollRegular ( collIterate * sColl , collIterate * tColl ,
// const UCollator *coll,
// const UChar *source,
// int32_t sourceLength,
// const UChar *target,
// int32_t targetLength,
2002-09-04 06:02:13 +00:00
UErrorCode * status )
2001-01-05 00:47:25 +00:00
{
2001-05-10 22:12:53 +00:00
U_ALIGN_CODE ( 16 ) ;
2001-01-05 00:47:25 +00:00
2003-01-20 07:43:32 +00:00
const UCollator * coll = sColl - > coll ;
2001-04-23 01:53:49 +00:00
2001-05-22 22:26:58 +00:00
// setting up the collator parameters
2001-03-14 02:45:39 +00:00
UColAttributeValue strength = coll - > strength ;
UBool initialCheckSecTer = ( strength > = UCOL_SECONDARY ) ;
UBool checkSecTer = initialCheckSecTer ;
UBool checkTertiary = ( strength > = UCOL_TERTIARY ) ;
UBool checkQuad = ( strength > = UCOL_QUATERNARY ) ;
UBool checkIdent = ( strength = = UCOL_IDENTICAL ) ;
UBool checkCase = ( coll - > caseLevel = = UCOL_ON ) ;
UBool isFrenchSec = ( coll - > frenchCollation = = UCOL_ON ) & & checkSecTer ;
2001-03-15 02:49:35 +00:00
UBool shifted = ( coll - > alternateHandling = = UCOL_SHIFTED ) ;
UBool qShifted = shifted & & checkQuad ;
2001-10-10 01:48:36 +00:00
UBool doHiragana = ( coll - > hiraganaQ = = UCOL_ON ) & & checkQuad ;
2001-03-14 02:45:39 +00:00
// Hiragana quaternary together with shifted is not compared CE by CE here;
// the result comes from comparing sort keys instead.
2001-10-13 16:20:01 +00:00
if ( doHiragana & & shifted ) {
2003-01-20 07:43:32 +00:00
return ( ucol_compareUsingSortKeys ( sColl , tColl ) ) ;
2001-10-13 16:20:01 +00:00
}
2001-05-22 22:26:58 +00:00
uint8_t caseSwitch = coll - > caseSwitch ;
uint8_t tertiaryMask = coll - > tertiaryMask ;
// This is the lowest primary value that will not be ignored if shifted
2001-06-26 22:24:10 +00:00
uint32_t LVT = ( shifted ) ? ( coll - > variableTopValue < < 16 ) : 0 ;
2001-05-22 22:26:58 +00:00
2001-01-15 07:28:54 +00:00
UCollationResult result = UCOL_EQUAL ;
2001-10-05 02:07:51 +00:00
UCollationResult hirResult = UCOL_EQUAL ;
2001-01-05 00:47:25 +00:00
2001-05-22 22:26:58 +00:00
// Preparing the CE buffers. They will be filled during the primary phase
2001-04-06 23:37:48 +00:00
ucol_CEBuf sCEs ;
ucol_CEBuf tCEs ;
UCOL_INIT_CEBUF ( & sCEs ) ;
UCOL_INIT_CEBUF ( & tCEs ) ;
2001-01-24 16:18:48 +00:00
uint32_t secS = 0 , secT = 0 ;
2001-01-24 00:35:19 +00:00
uint32_t sOrder = 0 , tOrder = 0 ;
2001-05-22 22:26:58 +00:00
// Non shifted primary processing is quite simple
2001-01-25 06:51:18 +00:00
if ( ! shifted ) {
for ( ; ; ) {
2001-05-22 22:26:58 +00:00
// We fetch CEs until we hit a non ignorable primary or end.
do {
// We get the next CE
2003-01-20 07:43:32 +00:00
sOrder = ucol_IGetNextCE ( coll , sColl , status ) ;
2001-05-22 22:26:58 +00:00
// Stuff it in the buffer
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & sCEs , sOrder , sColl ) ;
2001-05-22 22:26:58 +00:00
// And keep just the primary part.
2001-05-21 22:30:49 +00:00
sOrder & = UCOL_PRIMARYMASK ;
2001-05-22 22:26:58 +00:00
} while ( sOrder = = 0 ) ;
2001-01-05 00:47:25 +00:00
2001-05-22 22:26:58 +00:00
// see the comments on the above block
do {
2003-01-20 07:43:32 +00:00
tOrder = ucol_IGetNextCE ( coll , tColl , status ) ;
UCOL_CEBUF_PUT ( & tCEs , tOrder , tColl ) ;
2001-05-21 22:30:49 +00:00
tOrder & = UCOL_PRIMARYMASK ;
2001-05-22 22:26:58 +00:00
} while ( tOrder = = 0 ) ;
2001-01-05 00:47:25 +00:00
2001-05-22 22:26:58 +00:00
// if both primaries are the same
2001-01-25 06:51:18 +00:00
if ( sOrder = = tOrder ) {
2001-11-01 00:00:15 +00:00
// and there are no more CEs, we advance to the next level
if ( sOrder = = UCOL_NO_MORE_CES_PRIMARY ) {
break ;
}
// Remember the first position where only one side had UCOL_WAS_HIRAGANA set;
// hirResult is applied after the tertiary level.
2001-10-05 02:07:51 +00:00
if ( doHiragana & & hirResult = = UCOL_EQUAL ) {
2003-01-20 07:43:32 +00:00
if ( ( sColl - > flags & UCOL_WAS_HIRAGANA ) ! = ( tColl - > flags & UCOL_WAS_HIRAGANA ) ) {
hirResult = ( ( sColl - > flags & UCOL_WAS_HIRAGANA ) > ( tColl - > flags & UCOL_WAS_HIRAGANA ) )
2001-10-05 02:07:51 +00:00
? UCOL_LESS : UCOL_GREATER ;
}
}
2001-01-25 06:51:18 +00:00
} else {
2001-05-22 22:26:58 +00:00
// if two primaries are different, we are done
2001-04-06 23:37:48 +00:00
result = ( sOrder < tOrder ) ? UCOL_LESS : UCOL_GREATER ;
goto commonReturn ;
2001-04-23 01:53:49 +00:00
}
2001-05-22 22:26:58 +00:00
} // no primary difference... do the rest from the buffers
} else { // shifted - do a slightly more complicated processing :)
2001-01-25 06:51:18 +00:00
for ( ; ; ) {
UBool sInShifted = FALSE ;
UBool tInShifted = FALSE ;
2001-05-22 22:26:58 +00:00
// This version of code can be refactored. However, it seems easier to understand this way.
2001-06-06 23:26:50 +00:00
// Source loop. Same as the target loop.
// Each pass fetches CEs until one with a primary above LVT (or the end) is found.
// CEs whose primary is non-zero but not above LVT are stored with the primary
// bits only and switch on sInShifted.
2001-01-26 00:12:23 +00:00
for ( ; ; ) {
2003-01-20 07:43:32 +00:00
sOrder = ucol_IGetNextCE ( coll , sColl , status ) ;
2001-03-02 00:19:43 +00:00
if ( sOrder = = UCOL_NO_MORE_CES ) {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & sCEs , sOrder , sColl ) ;
2001-01-26 00:12:23 +00:00
break ;
2002-06-05 21:14:41 +00:00
} else if ( sOrder = = 0
| | ( sInShifted & & ( sOrder & UCOL_PRIMARYMASK ) = = 0 ) ) {
/* UCA amendment - ignore ignorables that follow shifted code points */
2001-01-26 00:12:23 +00:00
continue ;
} else if ( isContinuation ( sOrder ) ) {
2001-05-21 22:30:49 +00:00
if ( ( sOrder & UCOL_PRIMARYMASK ) > 0 ) { /* There is primary value */
2001-01-26 00:12:23 +00:00
if ( sInShifted ) {
2001-06-02 01:01:18 +00:00
sOrder = ( sOrder & UCOL_PRIMARYMASK ) | 0xC0 ; /* preserve interesting continuation */
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & sCEs , sOrder , sColl ) ;
2001-01-26 00:12:23 +00:00
continue ;
} else {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & sCEs , sOrder , sColl ) ;
2001-01-26 00:12:23 +00:00
break ;
}
} else { /* Just lower level values */
if ( sInShifted ) {
continue ;
} else {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & sCEs , sOrder , sColl ) ;
2001-01-26 00:12:23 +00:00
continue ;
}
}
} else { /* regular */
2001-06-25 04:01:49 +00:00
if ( ( sOrder & UCOL_PRIMARYMASK ) > LVT ) {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & sCEs , sOrder , sColl ) ;
2001-01-26 00:12:23 +00:00
break ;
} else {
2001-05-21 22:30:49 +00:00
if ( ( sOrder & UCOL_PRIMARYMASK ) > 0 ) {
2001-01-26 00:12:23 +00:00
sInShifted = TRUE ;
2001-05-21 22:30:49 +00:00
sOrder & = UCOL_PRIMARYMASK ;
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & sCEs , sOrder , sColl ) ;
2001-01-26 00:12:23 +00:00
continue ;
} else {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & sCEs , sOrder , sColl ) ;
2001-05-15 17:38:18 +00:00
sInShifted = FALSE ;
2001-01-26 00:12:23 +00:00
continue ;
}
}
}
}
2001-05-21 22:30:49 +00:00
sOrder & = UCOL_PRIMARYMASK ;
2001-01-26 00:12:23 +00:00
sInShifted = FALSE ;
2001-01-22 23:48:48 +00:00
// Target loop: mirrors the source loop above.
2001-01-25 06:51:18 +00:00
for ( ; ; ) {
2003-01-20 07:43:32 +00:00
tOrder = ucol_IGetNextCE ( coll , tColl , status ) ;
2001-03-02 00:19:43 +00:00
if ( tOrder = = UCOL_NO_MORE_CES ) {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & tCEs , tOrder , tColl ) ;
2001-01-25 06:51:18 +00:00
break ;
2002-06-05 21:14:41 +00:00
} else if ( tOrder = = 0
| | ( tInShifted & & ( tOrder & UCOL_PRIMARYMASK ) = = 0 ) ) {
/* UCA amendment - ignore ignorables that follow shifted code points */
2001-01-25 06:51:18 +00:00
continue ;
} else if ( isContinuation ( tOrder ) ) {
2001-05-21 22:30:49 +00:00
if ( ( tOrder & UCOL_PRIMARYMASK ) > 0 ) { /* There is primary value */
2001-01-25 06:51:18 +00:00
if ( tInShifted ) {
2001-06-02 01:01:18 +00:00
tOrder = ( tOrder & UCOL_PRIMARYMASK ) | 0xC0 ; /* preserve interesting continuation */
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & tCEs , tOrder , tColl ) ;
2001-01-25 06:51:18 +00:00
continue ;
} else {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & tCEs , tOrder , tColl ) ;
2001-01-24 00:35:19 +00:00
break ;
2001-01-25 06:51:18 +00:00
}
} else { /* Just lower level values */
if ( tInShifted ) {
continue ;
2001-01-23 06:58:22 +00:00
} else {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & tCEs , tOrder , tColl ) ;
2001-01-24 00:35:19 +00:00
continue ;
2001-01-23 06:58:22 +00:00
}
2001-01-25 06:51:18 +00:00
}
} else { /* regular */
2001-06-25 04:01:49 +00:00
if ( ( tOrder & UCOL_PRIMARYMASK ) > LVT ) {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & tCEs , tOrder , tColl ) ;
2001-01-25 06:51:18 +00:00
break ;
2001-01-24 00:35:19 +00:00
} else {
2001-05-21 22:30:49 +00:00
if ( ( tOrder & UCOL_PRIMARYMASK ) > 0 ) {
2001-01-25 06:51:18 +00:00
tInShifted = TRUE ;
2001-05-21 22:30:49 +00:00
tOrder & = UCOL_PRIMARYMASK ;
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & tCEs , tOrder , tColl ) ;
2001-01-25 06:51:18 +00:00
continue ;
} else {
2003-01-20 07:43:32 +00:00
UCOL_CEBUF_PUT ( & tCEs , tOrder , tColl ) ;
2001-05-15 17:38:18 +00:00
tInShifted = FALSE ;
2001-01-25 06:51:18 +00:00
continue ;
2001-01-24 16:18:48 +00:00
}
2000-11-20 19:17:17 +00:00
}
2001-01-25 06:51:18 +00:00
}
}
2001-05-21 22:30:49 +00:00
tOrder & = UCOL_PRIMARYMASK ;
2001-01-26 00:12:23 +00:00
tInShifted = FALSE ;
2001-01-16 00:28:40 +00:00
2001-01-25 06:51:18 +00:00
if ( sOrder = = tOrder ) {
2001-10-13 16:20:01 +00:00
/*
2001-10-05 02:07:51 +00:00
if ( doHiragana & & hirResult = = UCOL_EQUAL ) {
if ( ( sColl . flags & UCOL_WAS_HIRAGANA ) ! = ( tColl . flags & UCOL_WAS_HIRAGANA ) ) {
hirResult = ( ( sColl . flags & UCOL_WAS_HIRAGANA ) > ( tColl . flags & UCOL_WAS_HIRAGANA ) )
? UCOL_LESS : UCOL_GREATER ;
}
}
2001-10-13 16:20:01 +00:00
*/
2001-05-21 22:30:49 +00:00
if ( sOrder = = UCOL_NO_MORE_CES_PRIMARY ) {
2001-01-25 06:51:18 +00:00
break ;
} else {
sOrder = 0 ; tOrder = 0 ;
continue ;
2000-11-29 00:16:15 +00:00
}
2001-01-25 06:51:18 +00:00
} else {
2001-04-06 23:37:48 +00:00
result = ( sOrder < tOrder ) ? UCOL_LESS : UCOL_GREATER ;
goto commonReturn ;
2001-04-23 01:53:49 +00:00
}
2001-01-25 06:51:18 +00:00
} /* no primary difference... do the rest from the buffers */
}
2000-11-29 00:16:15 +00:00
2001-01-25 06:51:18 +00:00
/* now, we're gonna reexamine collected CEs */
2001-04-06 23:37:48 +00:00
uint32_t * sCE ;
uint32_t * tCE ;
2001-01-25 06:51:18 +00:00
2001-02-28 19:01:23 +00:00
/* This is the secondary level of comparison */
2001-01-25 06:51:18 +00:00
if ( checkSecTer ) {
if ( ! isFrenchSec ) { /* normal */
2001-04-06 23:37:48 +00:00
sCE = sCEs . buf ;
tCE = tCEs . buf ;
2001-01-25 06:51:18 +00:00
for ( ; ; ) {
2001-03-02 00:19:43 +00:00
while ( secS = = 0 ) {
2001-05-21 22:30:49 +00:00
secS = * ( sCE + + ) & UCOL_SECONDARYMASK ;
2001-01-25 06:51:18 +00:00
}
2001-03-02 00:19:43 +00:00
while ( secT = = 0 ) {
2001-05-21 22:30:49 +00:00
secT = * ( tCE + + ) & UCOL_SECONDARYMASK ;
2001-01-25 06:51:18 +00:00
}
if ( secS = = secT ) {
2001-05-21 22:30:49 +00:00
if ( secS = = UCOL_NO_MORE_CES_SECONDARY ) {
2001-01-25 06:51:18 +00:00
break ;
2001-01-24 16:18:48 +00:00
} else {
2001-04-23 01:53:49 +00:00
secS = 0 ; secT = 0 ;
2001-01-25 06:51:18 +00:00
continue ;
}
} else {
2001-04-06 23:37:48 +00:00
result = ( secS < secT ) ? UCOL_LESS : UCOL_GREATER ;
goto commonReturn ;
2001-04-23 01:53:49 +00:00
}
2001-01-25 06:51:18 +00:00
}
} else { /* do the French */
/* The buffers are walked backwards, starting two slots before pos. */
/* A run of continuation CEs is replayed forwards: sCESave / tCESave */
/* remember where the backward walk resumes after the run. */
2001-01-26 00:12:23 +00:00
uint32_t * sCESave = NULL ;
uint32_t * tCESave = NULL ;
2001-04-06 23:37:48 +00:00
sCE = sCEs . pos - 2 ; /* this could also be sCEs-- if needs to be optimized */
tCE = tCEs . pos - 2 ;
2001-01-26 00:12:23 +00:00
for ( ; ; ) {
2001-04-06 23:37:48 +00:00
while ( secS = = 0 & & sCE > = sCEs . buf ) {
2001-01-26 00:12:23 +00:00
if ( sCESave = = 0 ) {
2001-05-21 22:30:49 +00:00
secS = * ( sCE - - ) ;
2001-01-26 00:12:23 +00:00
if ( isContinuation ( secS ) ) {
2001-05-21 22:30:49 +00:00
while ( isContinuation ( secS = * ( sCE - - ) ) ) ;
2001-01-26 00:12:23 +00:00
/* after this, secS has the start of continuation, and sCEs points before that */
2001-04-06 23:37:48 +00:00
sCESave = sCE ; /* we save it, so that we know where to come back AND that we need to go forward */
sCE + = 2 ; /* need to point to the first continuation CP */
2001-01-26 00:12:23 +00:00
/* However, now you can just continue doing stuff */
}
} else {
2001-05-21 22:30:49 +00:00
secS = * ( sCE + + ) ;
2001-01-26 00:12:23 +00:00
if ( ! isContinuation ( secS ) ) { /* This means we have finished with this cont */
2001-04-06 23:37:48 +00:00
sCE = sCESave ; /* reset the pointer to before continuation */
2001-01-26 00:12:23 +00:00
sCESave = 0 ;
continue ;
}
}
2001-05-21 22:30:49 +00:00
secS & = UCOL_SECONDARYMASK ; /* remove the continuation bit */
2001-01-26 00:12:23 +00:00
}
2001-04-06 23:37:48 +00:00
while ( secT = = 0 & & tCE > = tCEs . buf ) {
2001-01-26 00:12:23 +00:00
if ( tCESave = = 0 ) {
2001-05-21 22:30:49 +00:00
secT = * ( tCE - - ) ;
2001-01-26 00:12:23 +00:00
if ( isContinuation ( secT ) ) {
2001-05-21 22:30:49 +00:00
while ( isContinuation ( secT = * ( tCE - - ) ) ) ;
2001-01-26 00:12:23 +00:00
/* after this, secT has the start of continuation, and tCE points before that */
2001-04-06 23:37:48 +00:00
tCESave = tCE ; /* we save it, so that we know where to come back AND that we need to go forward */
tCE + = 2 ; /* need to point to the first continuation CP */
2001-01-26 00:12:23 +00:00
/* However, now you can just continue doing stuff */
}
} else {
2001-05-21 22:30:49 +00:00
secT = * ( tCE + + ) ;
2001-01-26 00:12:23 +00:00
if ( ! isContinuation ( secT ) ) { /* This means we have finished with this cont */
2001-04-06 23:37:48 +00:00
tCE = tCESave ; /* reset the pointer to before continuation */
2001-01-26 00:12:23 +00:00
tCESave = 0 ;
continue ;
}
}
2001-05-21 22:30:49 +00:00
secT & = UCOL_SECONDARYMASK ; /* remove the continuation bit */
2001-01-26 00:12:23 +00:00
}
if ( secS = = secT ) {
2001-05-21 22:30:49 +00:00
if ( secS = = UCOL_NO_MORE_CES_SECONDARY | | ( sCE < sCEs . buf & & tCE < tCEs . buf ) ) {
2001-01-26 00:12:23 +00:00
break ;
} else {
2001-04-23 01:53:49 +00:00
secS = 0 ; secT = 0 ;
2001-01-26 00:12:23 +00:00
continue ;
}
} else {
2001-04-06 23:37:48 +00:00
result = ( secS < secT ) ? UCOL_LESS : UCOL_GREATER ;
goto commonReturn ;
2001-04-23 01:53:49 +00:00
}
2001-01-26 00:12:23 +00:00
}
2001-01-25 06:51:18 +00:00
}
}
2001-01-26 00:12:23 +00:00
/* doing the case bit */
/* Continuation CEs are skipped here; for the others the bits selected by */
/* UCOL_TERT_CASE_MASK are XORed with caseSwitch before the case bits are compared. */
2001-01-25 06:51:18 +00:00
if ( checkCase ) {
2001-04-06 23:37:48 +00:00
sCE = sCEs . buf ;
tCE = tCEs . buf ;
2001-01-25 06:51:18 +00:00
for ( ; ; ) {
2001-03-02 00:19:43 +00:00
while ( ( secS & UCOL_REMOVE_CASE ) = = 0 ) {
2001-04-06 23:37:48 +00:00
if ( ! isContinuation ( * sCE + + ) ) {
secS = * ( sCE - 1 ) & UCOL_TERT_CASE_MASK ;
2001-05-10 22:33:50 +00:00
secS ^ = caseSwitch ;
} else {
secS = 0 ;
2001-04-23 01:53:49 +00:00
}
2001-01-26 00:12:23 +00:00
}
2001-03-02 00:19:43 +00:00
while ( ( secT & UCOL_REMOVE_CASE ) = = 0 ) {
2001-04-06 23:37:48 +00:00
if ( ! isContinuation ( * tCE + + ) ) {
secT = * ( tCE - 1 ) & UCOL_TERT_CASE_MASK ;
2001-05-10 22:33:50 +00:00
secT ^ = caseSwitch ;
} else {
secT = 0 ;
2001-03-16 19:06:07 +00:00
}
2001-01-26 00:12:23 +00:00
}
2001-01-25 06:51:18 +00:00
2001-03-02 00:19:43 +00:00
if ( ( secS & UCOL_CASE_BIT_MASK ) < ( secT & UCOL_CASE_BIT_MASK ) ) {
2001-04-06 23:37:48 +00:00
result = UCOL_LESS ;
goto commonReturn ;
2001-03-02 00:19:43 +00:00
} else if ( ( secS & UCOL_CASE_BIT_MASK ) > ( secT & UCOL_CASE_BIT_MASK ) ) {
2001-04-06 23:37:48 +00:00
result = UCOL_GREATER ;
goto commonReturn ;
2001-04-23 01:53:49 +00:00
}
2001-03-02 00:19:43 +00:00
2001-05-21 22:30:49 +00:00
if ( ( secS & UCOL_REMOVE_CASE ) = = UCOL_NO_MORE_CES_TERTIARY | | ( secT & UCOL_REMOVE_CASE ) = = UCOL_NO_MORE_CES_TERTIARY ) {
2001-03-02 00:19:43 +00:00
break ;
} else {
secS = 0 ;
secT = 0 ;
}
2001-01-25 06:51:18 +00:00
}
}
2001-02-28 19:01:23 +00:00
/* Tertiary level */
2001-01-25 06:51:18 +00:00
if ( checkTertiary ) {
2001-04-23 01:53:49 +00:00
secS = 0 ;
2001-01-25 06:51:18 +00:00
secT = 0 ;
2001-04-06 23:37:48 +00:00
sCE = sCEs . buf ;
tCE = tCEs . buf ;
2001-01-25 06:51:18 +00:00
for ( ; ; ) {
2001-03-02 00:19:43 +00:00
while ( ( secS & UCOL_REMOVE_CASE ) = = 0 ) {
2001-04-06 23:37:48 +00:00
secS = * ( sCE + + ) & tertiaryMask ;
2001-05-10 22:33:50 +00:00
if ( ! isContinuation ( secS ) ) {
secS ^ = caseSwitch ;
} else {
secS & = UCOL_REMOVE_CASE ;
}
2001-01-25 06:51:18 +00:00
}
2001-03-02 00:19:43 +00:00
while ( ( secT & UCOL_REMOVE_CASE ) = = 0 ) {
2001-05-10 22:33:50 +00:00
secT = * ( tCE + + ) & tertiaryMask ;
if ( ! isContinuation ( secT ) ) {
secT ^ = caseSwitch ;
} else {
secT & = UCOL_REMOVE_CASE ;
}
2001-01-25 06:51:18 +00:00
}
if ( secS = = secT ) {
2001-03-02 00:19:43 +00:00
if ( ( secS & UCOL_REMOVE_CASE ) = = 1 ) {
2001-01-25 06:51:18 +00:00
break ;
} else {
2001-04-23 01:53:49 +00:00
secS = 0 ; secT = 0 ;
2001-01-25 06:51:18 +00:00
continue ;
}
} else {
2001-04-06 23:37:48 +00:00
result = ( secS < secT ) ? UCOL_LESS : UCOL_GREATER ;
goto commonReturn ;
2001-04-23 01:53:49 +00:00
}
2001-01-25 06:51:18 +00:00
}
}
2001-04-23 01:53:49 +00:00
2001-01-25 06:51:18 +00:00
/* Quaternary level, only for shifted with quaternary (or higher) strength. */
/* A non-continuation CE that is above LVT or has no primary bits is replaced */
/* by UCOL_PRIMARYMASK; other CEs are compared by their primary bits. */
2001-10-05 02:07:51 +00:00
if ( qShifted /*checkQuad*/ ) {
2001-01-25 06:51:18 +00:00
UBool sInShifted = TRUE ;
UBool tInShifted = TRUE ;
2001-04-23 01:53:49 +00:00
secS = 0 ;
2001-01-25 06:51:18 +00:00
secT = 0 ;
2001-04-06 23:37:48 +00:00
sCE = sCEs . buf ;
tCE = tCEs . buf ;
2001-01-25 06:51:18 +00:00
for ( ; ; ) {
2001-05-21 22:30:49 +00:00
while ( secS = = 0 & & secS ! = UCOL_NO_MORE_CES | | ( isContinuation ( secS ) & & ! sInShifted ) ) {
2001-04-06 23:37:48 +00:00
secS = * ( sCE + + ) ;
2001-06-02 01:01:18 +00:00
if ( isContinuation ( secS ) ) {
if ( ! sInShifted ) {
continue ;
}
} else if ( secS > LVT | | ( secS & UCOL_PRIMARYMASK ) = = 0 ) { /* non continuation */
2001-05-21 22:30:49 +00:00
secS = UCOL_PRIMARYMASK ;
2001-01-25 06:51:18 +00:00
sInShifted = FALSE ;
} else {
sInShifted = TRUE ;
}
}
2001-05-21 22:30:49 +00:00
secS & = UCOL_PRIMARYMASK ;
2000-11-29 00:16:15 +00:00
2001-01-25 06:51:18 +00:00
2001-05-21 22:30:49 +00:00
while ( secT = = 0 & & secT ! = UCOL_NO_MORE_CES | | ( isContinuation ( secT ) & & ! tInShifted ) ) {
2001-04-06 23:37:48 +00:00
secT = * ( tCE + + ) ;
2001-06-02 01:01:18 +00:00
if ( isContinuation ( secT ) ) {
if ( ! tInShifted ) {
continue ;
}
} else if ( secT > LVT | | ( secT & UCOL_PRIMARYMASK ) = = 0 ) {
2001-05-21 22:30:49 +00:00
secT = UCOL_PRIMARYMASK ;
2001-01-25 06:51:18 +00:00
tInShifted = FALSE ;
} else {
tInShifted = TRUE ;
}
2000-11-20 19:17:17 +00:00
}
2001-05-21 22:30:49 +00:00
secT & = UCOL_PRIMARYMASK ;
2001-01-25 06:51:18 +00:00
if ( secS = = secT ) {
2001-05-21 22:30:49 +00:00
if ( secS = = UCOL_NO_MORE_CES_PRIMARY ) {
2001-01-25 06:51:18 +00:00
break ;
} else {
secS = 0 ; secT = 0 ;
continue ;
}
} else {
2001-04-06 23:37:48 +00:00
result = ( secS < secT ) ? UCOL_LESS : UCOL_GREATER ;
goto commonReturn ;
2001-04-23 01:53:49 +00:00
}
2001-01-25 06:51:18 +00:00
}
2001-10-05 02:07:51 +00:00
} else if ( doHiragana & & hirResult ! = UCOL_EQUAL ) {
// If we're fine on quaternaries, we might be different
// on Hiragana. This, however, might fail us in shifted.
result = hirResult ;
goto commonReturn ;
2001-01-25 06:51:18 +00:00
}
2000-11-20 19:17:17 +00:00
2001-01-16 00:28:40 +00:00
/* For IDENTICAL comparisons, we use a bitwise character comparison */
2001-04-06 23:37:48 +00:00
/* as a tiebreaker if all else is equal. */
/* Getting here should be quite rare - strings are not identical - */
/* that is checked first, but compared == through all other checks. */
2001-02-28 19:01:23 +00:00
if ( checkIdent )
2000-11-20 19:17:17 +00:00
{
2001-11-01 00:00:15 +00:00
//result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
2003-01-20 07:43:32 +00:00
result = ucol_checkIdent ( sColl , tColl , TRUE , status ) ;
2001-04-06 23:37:48 +00:00
}
2001-03-14 02:45:39 +00:00
2001-04-06 23:37:48 +00:00
commonReturn :
/* Heap memory is only looked for when one of the iterators carries */
/* UCOL_ITER_ALLOCATED; ucol_CEBuf_Expand sets that flag whenever it runs. */
2003-01-20 07:43:32 +00:00
if ( ( sColl - > flags | tColl - > flags ) & UCOL_ITER_ALLOCATED ) {
freeHeapWritableBuffer ( sColl ) ;
freeHeapWritableBuffer ( tColl ) ;
2000-11-20 19:17:17 +00:00
2001-04-23 01:53:49 +00:00
if ( sCEs . buf ! = sCEs . localArray ) {
uprv_free ( sCEs . buf ) ;
}
if ( tCEs . buf ! = tCEs . localArray ) {
uprv_free ( tCEs . buf ) ;
2001-04-18 19:31:05 +00:00
}
2001-04-12 19:59:28 +00:00
}
2000-11-20 19:17:17 +00:00
return result ;
}
2002-09-04 06:02:13 +00:00
/**
 * Resolves a contraction special found in the Latin-1 fast table.
 *
 * CE carries two fields: the low 12 bits are passed to getContractOffset()
 * to locate the contraction's code units inside coll->image, and bits
 * 12..23 give the position of the contraction's first entry in the
 * latinOneCEs table for the requested strength.
 *
 * @param coll     collator owning the image, the mapping trie and latinOneCEs
 * @param strength level whose table is consulted (used as a multiplier of
 *                 coll->latinOneTableLen)
 * @param CE       the contraction special read from the table
 * @param s        string being collated
 * @param index    in/out position of the next unread code unit in s; advanced
 *                 past a matched continuation and past skipped ignorables
 * @param len      length of s, or -1 if s is NUL-terminated
 * @return the table entry of the matched continuation, the contraction's
 *         first entry when the input ends or nothing matches, or
 *         UCOL_BAIL_OUT_CE when the next code unit has bits set in 0xFF00
 */
static inline uint32_t
ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
                            uint32_t CE, const UChar *s, int32_t *index, int32_t len) {
    const UChar *continuations = (UChar *)coll->image + getContractOffset(CE & 0xFFF);
    int32_t firstEntry = (CE & 0x00FFF000) >> 12;
    // ces[0] is the entry returned when the contraction does not continue,
    // ces[k] the entry for the k-th possible continuation.
    const uint32_t *ces = &coll->latinOneCEs[strength*coll->latinOneTableLen + firstEntry];
    // Deliberately kept across iterations: the scan resumes where it stopped.
    int32_t slot = 1;

    for(;;) {
        // Input exhausted (NUL terminator or explicit length reached)?
        if((len == -1) ? (s[*index] == 0) : (*index == len)) {
            return ces[0];
        }
        UChar next = s[*index];

        // The continuation code units are ordered, so skip all smaller ones.
        UChar candidate = continuations[slot];
        while(next > candidate) {
            candidate = continuations[++slot];
        }

        if(next == candidate) {
            (*index)++;
            return ces[slot];
        }
        if(next & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
            return UCOL_BAIL_OUT_CE;
        }
        uint32_t mappedCE = UTRIE_GET32_FROM_LEAD(coll->mapping, next);
        if(mappedCE != 0) {
            return ces[0];
        }
        // Completely ignorable code unit: step over it and look again.
        (*index)++;
    }
}
/**
 * This is a fast strcoll, geared towards text in Latin-1.
 * It supports contractions of size two, French secondaries
 * and case switching. You can use it with strengths primary
 * to tertiary. It does not support shifted and case level.
 * It relies on the table built by setupLatin1Table. If it
 * doesn't understand something, it will go to the regular
 * strcoll.
 *
 * The strings are compared one level at a time. coll->latinOneCEs is read
 * as consecutive per-level tables, each coll->latinOneTableLen entries long
 * and indexed by code unit; the table pointer is advanced by that length
 * before the secondary and again before the tertiary pass.
 *
 * The function hands over to ucol_strcollRegular when a code unit has bits
 * set in 0xFF00, when a table entry is a special that cannot be resolved as
 * a contraction, or when French secondary ordering is on and a contraction
 * was seen during the primary pass.
 *
 * @param coll   collator providing latinOneCEs, latinOneTableLen, strength
 *               and frenchCollation
 * @param source first string
 * @param sLen   length of source, or -1 if it is zero terminated
 * @param target second string
 * @param tLen   length of target, or -1 if it is zero terminated
 * @param status passed through to ucol_strcollRegular on fallback; not
 *               otherwise touched here
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static inline UCollationResult
ucol_strcollUseLatin1 ( const UCollator * coll ,
const UChar * source ,
int32_t sLen ,
const UChar * target ,
int32_t tLen ,
UErrorCode * status )
{
U_ALIGN_CODE ( 16 ) ;
int32_t strength = coll - > strength ;
int32_t sIndex = 0 , tIndex = 0 ;
UChar sChar = 0 , tChar = 0 ;
uint32_t sOrder = 0 , tOrder = 0 ;
// NOTE(review): endOfTarget is assigned below but never read in this function.
UBool endOfSource = FALSE , endOfTarget = FALSE ;
uint32_t * elements = coll - > latinOneCEs ;
UBool haveContractions = FALSE ; // if we have contractions in our string
// we cannot do French secondary
// Do the primary level
for ( ; ; ) {
while ( sOrder = = 0 ) { // this loop skips primary ignorables
// sOrder=getNextlatinOneCE(source);
if ( sLen = = - 1 ) { // handling zero terminated strings
sChar = source [ sIndex + + ] ;
if ( sChar = = 0 ) {
endOfSource = TRUE ;
break ;
}
} else { // handling strings with known length
if ( sIndex = = sLen ) {
endOfSource = TRUE ;
break ;
}
sChar = source [ sIndex + + ] ;
}
if ( sChar & 0xFF00 ) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
//fprintf(stderr, "R");
2003-01-20 07:43:32 +00:00
goto returnRegular ;
//return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
2002-09-04 06:02:13 +00:00
}
sOrder = elements [ sChar ] ;
if ( sOrder > = UCOL_NOT_FOUND ) { // if we got a special
// specials can basically be either contractions or bail-out signs. If we get anything
// else, we'll bail out anyway
if ( getCETag ( sOrder ) = = CONTRACTION_TAG ) {
sOrder = ucol_getLatinOneContraction ( coll , UCOL_PRIMARY , sOrder , source , & sIndex , sLen ) ;
haveContractions = TRUE ; // if there are contractions, we cannot do French secondary
// However, if there are contractions in the table, but we always use just one char,
// we might be able to do French. This should be checked out.
}
if ( sOrder > = UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/ ) {
//fprintf(stderr, "S");
2003-01-20 07:43:32 +00:00
goto returnRegular ;
//return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
2002-09-04 06:02:13 +00:00
}
}
}
while ( tOrder = = 0 ) { // this loop skips primary ignorables
// tOrder=getNextlatinOneCE(target);
if ( tLen = = - 1 ) { // handling zero terminated strings
tChar = target [ tIndex + + ] ;
if ( tChar = = 0 ) {
if ( endOfSource ) { // this is different than source loop,
// as we already know that source loop is done here,
// so we can either finish the primary loop if both
// strings are done or announce the result if only
// target is done. Same below.
goto endOfPrimLoop ;
} else {
return UCOL_GREATER ;
}
}
} else { // handling strings with known length
if ( tIndex = = tLen ) {
if ( endOfSource ) {
goto endOfPrimLoop ;
} else {
return UCOL_GREATER ;
}
}
tChar = target [ tIndex + + ] ;
}
if ( tChar & 0xFF00 ) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
//fprintf(stderr, "R");
2003-01-20 07:43:32 +00:00
goto returnRegular ;
//return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
2002-09-04 06:02:13 +00:00
}
tOrder = elements [ tChar ] ;
if ( tOrder > = UCOL_NOT_FOUND ) {
// Handling specials, see the comments for source
if ( getCETag ( tOrder ) = = CONTRACTION_TAG ) {
tOrder = ucol_getLatinOneContraction ( coll , UCOL_PRIMARY , tOrder , target , & tIndex , tLen ) ;
haveContractions = TRUE ;
}
if ( tOrder > = UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/ ) {
//fprintf(stderr, "S");
2003-01-20 07:43:32 +00:00
goto returnRegular ;
//return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
2002-09-04 06:02:13 +00:00
}
}
}
if ( endOfSource ) { // source is finished, but target is not, say the result.
return UCOL_LESS ;
}
if ( sOrder = = tOrder ) { // if we have same CEs, we continue the loop
sOrder = 0 ; tOrder = 0 ;
continue ;
} else {
// compare current top bytes
if ( ( ( sOrder ^ tOrder ) & 0xFF000000 ) ! = 0 ) {
// top bytes differ, return difference
if ( sOrder < tOrder ) {
return UCOL_LESS ;
} else if ( sOrder > tOrder ) {
return UCOL_GREATER ;
}
// instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
// since we must return enum value
}
// top bytes match, continue with following bytes
// (shifting moves the next byte of each value into the top position; once a
// value has been shifted down to 0 its skip loop above fetches a new entry)
sOrder < < = 8 ;
tOrder < < = 8 ;
}
}
endOfPrimLoop :
// after primary loop, we definitely know the sizes of strings,
// so we set it and use simpler loop for secondaries and tertiaries
// (for zero terminated input the index has already been stepped past the
// terminator, so the recorded size counts it as well)
sLen = sIndex ; tLen = tIndex ;
if ( strength > = UCOL_SECONDARY ) {
// adjust the table beginning
elements + = coll - > latinOneTableLen ;
endOfSource = FALSE ; endOfTarget = FALSE ;
if ( coll - > frenchCollation = = UCOL_OFF ) { // non French
// This loop is a simplified copy of primary loop
// at this point we know that whole strings are latin-1, so we don't
// check for that. We also know that we only have contractions as
// specials.
sIndex = 0 ; tIndex = 0 ;
for ( ; ; ) {
while ( sOrder = = 0 ) {
if ( sIndex = = sLen ) {
endOfSource = TRUE ;
break ;
}
sChar = source [ sIndex + + ] ;
sOrder = elements [ sChar ] ;
if ( sOrder > UCOL_NOT_FOUND ) {
sOrder = ucol_getLatinOneContraction ( coll , UCOL_SECONDARY , sOrder , source , & sIndex , sLen ) ;
}
}
while ( tOrder = = 0 ) {
if ( tIndex = = tLen ) {
if ( endOfSource ) {
goto endOfSecLoop ;
} else {
return UCOL_GREATER ;
}
}
tChar = target [ tIndex + + ] ;
tOrder = elements [ tChar ] ;
if ( tOrder > UCOL_NOT_FOUND ) {
tOrder = ucol_getLatinOneContraction ( coll , UCOL_SECONDARY , tOrder , target , & tIndex , tLen ) ;
}
}
if ( endOfSource ) {
return UCOL_LESS ;
}
if ( sOrder = = tOrder ) {
sOrder = 0 ; tOrder = 0 ;
continue ;
} else {
// see primary loop for comments on this
if ( ( ( sOrder ^ tOrder ) & 0xFF000000 ) ! = 0 ) {
if ( sOrder < tOrder ) {
return UCOL_LESS ;
} else if ( sOrder > tOrder ) {
return UCOL_GREATER ;
}
}
sOrder < < = 8 ;
tOrder < < = 8 ;
}
}
} else { // French
if ( haveContractions ) { // if we have contractions, we have to bail out
// since we don't really know how to handle them here
2003-01-20 07:43:32 +00:00
goto returnRegular ;
//return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
2002-09-04 06:02:13 +00:00
}
// For French, we go backwards
sIndex = sLen ; tIndex = tLen ;
for ( ; ; ) {
while ( sOrder = = 0 ) {
if ( sIndex = = 0 ) {
endOfSource = TRUE ;
break ;
}
sChar = source [ - - sIndex ] ;
sOrder = elements [ sChar ] ;
// don't even look for contractions
}
while ( tOrder = = 0 ) {
if ( tIndex = = 0 ) {
if ( endOfSource ) {
goto endOfSecLoop ;
} else {
return UCOL_GREATER ;
}
}
tChar = target [ - - tIndex ] ;
tOrder = elements [ tChar ] ;
// don't even look for contractions
}
if ( endOfSource ) {
return UCOL_LESS ;
}
if ( sOrder = = tOrder ) {
sOrder = 0 ; tOrder = 0 ;
continue ;
} else {
// see the primary loop for comments
if ( ( ( sOrder ^ tOrder ) & 0xFF000000 ) ! = 0 ) {
if ( sOrder < tOrder ) {
return UCOL_LESS ;
} else if ( sOrder > tOrder ) {
return UCOL_GREATER ;
}
}
sOrder < < = 8 ;
tOrder < < = 8 ;
}
}
}
}
endOfSecLoop :
if ( strength > = UCOL_TERTIARY ) {
// tertiary loop is the same as secondary (except no French)
elements + = coll - > latinOneTableLen ;
sIndex = 0 ; tIndex = 0 ;
endOfSource = FALSE ; endOfTarget = FALSE ;
for ( ; ; ) {
while ( sOrder = = 0 ) {
if ( sIndex = = sLen ) {
endOfSource = TRUE ;
break ;
}
sChar = source [ sIndex + + ] ;
sOrder = elements [ sChar ] ;
if ( sOrder > UCOL_NOT_FOUND ) {
sOrder = ucol_getLatinOneContraction ( coll , UCOL_TERTIARY , sOrder , source , & sIndex , sLen ) ;
}
}
while ( tOrder = = 0 ) {
if ( tIndex = = tLen ) {
if ( endOfSource ) {
return UCOL_EQUAL ; // if both strings are at the end, they are equal
} else {
return UCOL_GREATER ;
}
}
tChar = target [ tIndex + + ] ;
tOrder = elements [ tChar ] ;
if ( tOrder > UCOL_NOT_FOUND ) {
tOrder = ucol_getLatinOneContraction ( coll , UCOL_TERTIARY , tOrder , target , & tIndex , tLen ) ;
}
}
if ( endOfSource ) {
return UCOL_LESS ;
}
if ( sOrder = = tOrder ) {
sOrder = 0 ; tOrder = 0 ;
continue ;
} else {
if ( ( ( sOrder ^ tOrder ) & 0xff000000 ) ! = 0 ) {
if ( sOrder < tOrder ) {
return UCOL_LESS ;
} else if ( sOrder > tOrder ) {
return UCOL_GREATER ;
}
}
sOrder < < = 8 ;
tOrder < < = 8 ;
}
}
}
// Reached when the strength is below tertiary and no difference was found
// at the levels that were compared.
return UCOL_EQUAL ;
2003-01-20 07:43:32 +00:00
returnRegular :
// Fallback for everything the fast path cannot handle: the regular
// comparison is run with whatever sLen/tLen hold at the time of the jump.
// Preparing the context objects for iterating over strings
collIterate sColl , tColl ;
IInit_collIterate ( coll , source , sLen , & sColl ) ;
IInit_collIterate ( coll , target , tLen , & tColl ) ;
return ucol_strcollRegular ( & sColl , & tColl , status ) ;
}
/**
 * Compares two strings that are supplied through character iterators.
 *
 * Both iterators are first advanced in lock step over their common prefix.
 * They are then moved back so that the comparison proper, done by
 * ucol_strcollRegular, does not start in the middle of a sequence that
 * ucol_unsafeCP reports as unsafe. When the collator's normalization mode
 * is on, each iterator is wrapped in a stack-allocated FCD normalizing
 * iterator first.
 *
 * @param coll   collator to use; must not be NULL
 * @param sIter  iterator over the source string; must not be NULL
 * @param tIter  iterator over the target string; must not be NULL
 * @param status in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when a
 *               required argument is NULL.
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER. UCOL_EQUAL is also returned
 *         when status is NULL or already a failure, when both arguments are
 *         the same iterator, and when an error prevents the comparison.
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter( const UCollator    *coll,
                  UCharIterator *sIter,
                  UCharIterator *tIter,
                  UErrorCode         *status) {
    if(!status || U_FAILURE(*status) || sIter == tIter) {
        return UCOL_EQUAL;
    }
    // Everything below dereferences these; report bad arguments instead.
    if(coll == NULL || sIter == NULL || tIter == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_EQUAL;
    }

    UCollationResult result = UCOL_EQUAL;
    // Declared ahead of the first goto so that no initialization is jumped over.
    UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;

    // Preparing the context objects for iterating over strings
    collIterate sColl, tColl;
    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
    UNormIterator *sNormIter = NULL, *tNormIter = NULL;

    IInit_collIterate(coll, NULL, -1, &sColl);
    sColl.iterator = sIter;
    sColl.flags |= UCOL_USE_ITERATOR;
    IInit_collIterate(coll, NULL, -1, &tColl);
    tColl.flags |= UCOL_USE_ITERATOR;
    tColl.iterator = tIter;

    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
        sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
        sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
        sColl.flags &= ~UCOL_ITER_NORM;

        tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
        tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
        tColl.flags &= ~UCOL_ITER_NORM;
    }

    // If the attribute lookup or the normalizing iterators failed, the
    // iterators may not be usable; do not call through them.
    if(U_FAILURE(*status)) {
        goto end_compare;
    }

    // Consume the common prefix of both strings.
    while((sChar = sColl.iterator->next(sColl.iterator)) ==
          (tChar = tColl.iterator->next(tColl.iterator))) {
        if(sChar == U_SENTINEL) {
            // Both iterators ran out together: the strings are identical.
            result = UCOL_EQUAL;
            goto end_compare;
        }
    }

    if(sChar == U_SENTINEL) {
        tChar = tColl.iterator->previous(tColl.iterator);
    }

    if(tChar == U_SENTINEL) {
        sChar = sColl.iterator->previous(sColl.iterator);
    }

    sChar = sColl.iterator->previous(sColl.iterator);
    tChar = tColl.iterator->previous(tColl.iterator);

    if(ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
    {
        // We are stopped in the middle of a contraction.
        // Scan backwards through the == part of the string looking for the start of the contraction.
        //   It doesn't matter which string we scan, since they are the same in this region.
        do
        {
            sChar = sColl.iterator->previous(sColl.iterator);
            tChar = tColl.iterator->previous(tColl.iterator);
        }
        while(sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
    }

    if(U_SUCCESS(*status)) {
        result = ucol_strcollRegular(&sColl, &tColl, status);
    }

end_compare:
    if(sNormIter || tNormIter) {
        unorm_closeIter(sNormIter);
        unorm_closeIter(tNormIter);
    }

    return result;
}
2003-01-20 07:43:32 +00:00
2002-09-04 06:02:13 +00:00
/*                                                                      */
/* ucol_strcoll     Main public API string comparison function          */
/*                                                                      */
/* A length of -1 means the corresponding string is NUL-terminated.     */
/* Bitwise-identical input is answered with UCOL_EQUAL straight away;   */
/* otherwise the identical leading portion is skipped (backing up to a  */
/* safe position) and the rest is compared either by the Latin-1 fast   */
/* path or by the regular comparison.                                   */
/* A NULL string with a non-zero length cannot be compared and is       */
/* reported as UCOL_EQUAL, since there is no status argument to carry   */
/* an error.                                                            */
/*                                                                      */
U_CAPI UCollationResult U_EXPORT2
ucol_strcoll( const UCollator    *coll,
              const UChar        *source,
              int32_t            sourceLength,
              const UChar        *target,
              int32_t            targetLength) {
    U_ALIGN_CODE(16);
    UErrorCode status = U_ZERO_ERROR;
    collIterate sColl, tColl;

    // Do not crash on a missing string. Only inputs that would have been
    // dereferenced are rejected; a NULL pointer with length 0 is still
    // treated as an empty string by the code below.
    if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
        return UCOL_EQUAL;
    }

    /* Scan the strings.  Find:                                                             */
    /*    The length of any leading portion that is equal                                   */
    /*    Whether they are exactly equal.  (in which case we just return)                   */
    const UChar *pSrc  = source;
    const UChar *pTarg = target;
    int32_t      equalLength;

    if(sourceLength == -1 && targetLength == -1) {
        // Both strings are null terminated.
        //    Check for them being the same string, and scan through
        //    any leading equal portion.
        if(source == target) {
            return UCOL_EQUAL;
        }

        for(;;) {
            if(*pSrc != *pTarg || *pSrc == 0) {
                break;
            }
            pSrc++;
            pTarg++;
        }
        if(*pSrc == 0 && *pTarg == 0) {
            return UCOL_EQUAL;
        }
        equalLength = (int32_t)(pSrc - source);
    }
    else
    {
        // One or both strings has an explicit length.
        /* check if source and target are same strings */
        if(source == target && sourceLength == targetLength) {
            return UCOL_EQUAL;
        }
        // For a NUL-terminated string (length -1) the "end" pointer lies one
        // before the start, so it never matches the scan position; the
        // pXEnd < pX tests below rely on that to recognise this case.
        const UChar *pSrcEnd  = source + sourceLength;
        const UChar *pTargEnd = target + targetLength;

        // Scan while the strings are bitwise ==, or until one is exhausted.
        for(;;) {
            if(pSrc == pSrcEnd || pTarg == pTargEnd) {
                break;
            }
            if((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
                break;
            }
            if(*pSrc != *pTarg) {
                break;
            }
            pSrc++;
            pTarg++;
        }
        equalLength = (int32_t)(pSrc - source);

        // If we made it all the way through both strings, we are done.  They are ==
        if((pSrc == pSrcEnd  || (pSrcEnd < pSrc  && *pSrc == 0))  &&   /* At end of src string, however it was specified. */
           (pTarg == pTargEnd || (pTargEnd < pTarg && *pTarg == 0))) { /* and also at end of dest string                  */
            return UCOL_EQUAL;
        }
    }

    if(equalLength > 0) {
        /* There is an identical portion at the beginning of the two strings.        */
        /*   If the identical portion ends within a contraction or a combining       */
        /*   character sequence, back up to the start of that sequence.              */
        pSrc  = source + equalLength;        /* point to the first differing chars   */
        pTarg = target + equalLength;
        if((pSrc  != source + sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
           (pTarg != target + targetLength && ucol_unsafeCP(*pTarg, coll)))
        {
            // We are stopped in the middle of a contraction.
            // Scan backwards through the == part of the string looking for the start of the contraction.
            //   It doesn't matter which string we scan, since they are the same in this region.
            do
            {
                equalLength--;
                pSrc--;
            }
            while(equalLength > 0 && ucol_unsafeCP(*pSrc, coll));
        }

        source += equalLength;
        target += equalLength;
        // A length of -1 (NUL-terminated) must stay -1, hence the > 0 tests.
        if(sourceLength > 0) {
            sourceLength -= equalLength;
        }
        if(targetLength > 0) {
            targetLength -= equalLength;
        }
    }

    // The Latin-1 fast path is only tried when the collator has it enabled and
    // neither remaining string visibly starts with a code unit above 0xFF.
    if(!coll->latinOneUse || (sourceLength > 0 && *source & 0xff00) || (targetLength > 0 && *target & 0xff00)) {
        // Preparing the context objects for iterating over strings
        IInit_collIterate(coll, source, sourceLength, &sColl);
        IInit_collIterate(coll, target, targetLength, &tColl);
        return ucol_strcollRegular(&sColl, &tColl, &status);
    } else {
        return ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
    }
}
2001-01-16 00:28:40 +00:00
/* convenience function for comparing strings */
2001-11-21 01:08:55 +00:00
U_CAPI UBool U_EXPORT2
2001-01-16 00:28:40 +00:00
ucol_greater ( const UCollator * coll ,
const UChar * source ,
int32_t sourceLength ,
const UChar * target ,
int32_t targetLength )
{
2001-04-23 01:53:49 +00:00
return ( ucol_strcoll ( coll , source , sourceLength , target , targetLength )
2001-01-16 00:28:40 +00:00
= = UCOL_GREATER ) ;
}
2001-01-04 00:45:41 +00:00
2001-01-16 00:28:40 +00:00
/* convenience function for comparing strings */
2001-11-21 01:08:55 +00:00
U_CAPI UBool U_EXPORT2
2001-01-16 00:28:40 +00:00
ucol_greaterOrEqual ( const UCollator * coll ,
const UChar * source ,
int32_t sourceLength ,
const UChar * target ,
int32_t targetLength )
{
2001-04-23 01:53:49 +00:00
return ( ucol_strcoll ( coll , source , sourceLength , target , targetLength )
2001-01-16 00:28:40 +00:00
! = UCOL_LESS ) ;
}
/* convenience function for comparing strings */
2001-11-21 01:08:55 +00:00
U_CAPI UBool U_EXPORT2
2001-01-16 00:28:40 +00:00
ucol_equal ( const UCollator * coll ,
const UChar * source ,
int32_t sourceLength ,
const UChar * target ,
int32_t targetLength )
{
2001-04-23 01:53:49 +00:00
return ( ucol_strcoll ( coll , source , sourceLength , target , targetLength )
2001-01-16 00:28:40 +00:00
= = UCOL_EQUAL ) ;
2001-01-15 07:28:54 +00:00
}
2001-08-16 00:55:16 +00:00
2002-02-28 07:20:52 +00:00
/**
 * Returns the locale name the collation data comes from.
 *
 * @param coll   collator to query
 * @param type   which locale to report:
 *               ULOC_ACTUAL_LOCALE    - locale of coll->binary, if present
 *               ULOC_VALID_LOCALE     - locale of coll->rb, if present
 *               ULOC_REQUESTED_LOCALE - coll->requestedLocale
 * @param status in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for any
 *               other type value
 * @return the locale name, or NULL when status is NULL or already a
 *         failure, when the needed resource bundle is absent, or when type
 *         is not recognised
 */
U_CAPI const char * U_EXPORT2
ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
    if(status == NULL || U_FAILURE(*status)) {
        return NULL;
    }

    if(type == ULOC_ACTUAL_LOCALE) {
        return (coll->binary != NULL) ? ures_getLocale(coll->binary, status) : NULL;
    }
    if(type == ULOC_VALID_LOCALE) {
        return (coll->rb != NULL) ? ures_getLocale(coll->rb, status) : NULL;
    }
    if(type == ULOC_REQUESTED_LOCALE) {
        return coll->requestedLocale;
    }

    // Unknown locale type requested.
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return NULL;
}
2002-09-04 06:08:04 +00:00
/**
 * Builds the set of strings that the collator's rules tailor.
 *
 * The rule string obtained from ucol_getRules is tokenized. For every token
 * that is not a reset, each canonically equivalent form produced by a
 * CanonicalIterator is added to the set, unless the FCD quick check
 * answers UNORM_NO for it.
 *
 * @param coll   collator whose rules are examined; must not be NULL
 * @param status in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when coll
 *               is NULL and to U_MEMORY_ALLOCATION_ERROR when the set cannot
 *               be allocated.
 * @return a newly allocated set (a UnicodeSet cast to USet *), or NULL when
 *         status is NULL or already a failure, coll is NULL, or allocation
 *         fails. The set is returned even if status reports a failure that
 *         occurred while parsing; callers should check status.
 */
U_CAPI USet * U_EXPORT2
ucol_getTailoredSet(const UCollator *coll, UErrorCode *status)
{
    if(status == NULL || U_FAILURE(*status)) {
        return NULL;
    }
    if(coll == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        // Must not fall through: the code below dereferences coll.
        return NULL;
    }

    UParseError parseError;
    UColTokenParser src;
    int32_t rulesLen = 0;
    const UChar *rules = ucol_getRules(coll, &rulesLen);
    UBool startOfRules = TRUE;

    // we internally use the C++ class, for the following reasons:
    // 1. we need to utilize canonical iterator, which is a C++ only class
    // 2. canonical iterator returns UnicodeStrings - USet cannot take them
    // 3. USet is internally really UnicodeSet, C is just a wrapper
    UnicodeSet *tailored = new UnicodeSet();
    // Checked before the token list is initialized so nothing needs undoing.
    if(tailored == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    UnicodeString pattern;
    CanonicalIterator it(" ", *status);

    // The idea is to tokenize the rule set. For each non-reset token,
    // we add all the canonically equivalent FCD sequences
    ucol_tok_initTokenList(&src, rules, rulesLen, UCA, status);
    while(ucol_tok_parseNextToken(&src, startOfRules, &parseError, status) != NULL) {
        startOfRules = FALSE;
        if(src.parsedToken.strength != UCOL_TOK_RESET) {
            const UChar *stuff = src.source + (src.parsedToken.charsOffset);
            it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status);
            pattern = it.next();
            while(!pattern.isBogus()) {
                if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) {
                    tailored->add(pattern);
                }
                pattern = it.next();
            }
        }
    }
    ucol_tok_closeTokenList(&src);
    return (USet *)tailored;
}
2002-09-17 06:27:51 +00:00
2002-09-20 01:54:48 +00:00
# endif /* #if !UCONFIG_NO_COLLATION */