2001-02-23 04:57:47 +00:00
/*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*
2011-10-30 00:20:58 +00:00
* Copyright ( C ) 2000 - 2011 , International Business Machines
2001-02-23 04:57:47 +00:00
* Corporation and others . All Rights Reserved .
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* file name : genuca . cpp
* encoding : US - ASCII
* tab size : 8 ( not used )
* indentation : 4
*
2001-02-26 10:28:56 +00:00
* created at the end of XX century
* created by : Vladimir Weinstein
*
2001-02-23 04:57:47 +00:00
* This program reads the Fractional UCA table and generates
* internal format for UCA table as well as inverse UCA table .
* It then writes binary files containing the data : ucadata . dat
* & invuca . dat
2001-02-23 19:10:28 +00:00
* Change history :
* 02 / 23 / 2001 grhoten Made it into a tool
2001-02-26 10:28:56 +00:00
* 02 / 23 / 2001 weiv Moved element & table handling code to i18n
2001-05-10 22:32:08 +00:00
* 05 / 09 / 2001 weiv Case bits are now in the CEs , not in front
2010-10-28 19:05:02 +00:00
* 10 / 26 / 2010 sgill Support for reordering codes
2001-02-23 04:57:47 +00:00
*/
2011-12-07 06:14:56 +00:00
# define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1
2002-10-01 01:26:49 +00:00
# include "unicode/utypes.h"
2004-09-23 00:24:36 +00:00
# include "unicode/putil.h"
2002-10-01 17:44:04 +00:00
# include "unicode/udata.h"
2003-08-14 21:34:54 +00:00
# include "unicode/uclean.h"
2010-11-04 20:41:35 +00:00
# include "unicode/uscript.h"
2010-10-28 19:05:02 +00:00
# include "unicode/ustring.h"
2011-12-07 06:14:56 +00:00
# include "unicode/utf16.h"
2010-10-28 19:05:02 +00:00
# include "ucol_bld.h"
2002-10-01 17:44:04 +00:00
# include "ucol_imp.h"
2001-02-22 23:30:21 +00:00
# include "genuca.h"
2001-02-23 01:21:38 +00:00
# include "uoptions.h"
2010-10-07 19:46:41 +00:00
# include "uparse.h"
2001-02-23 01:21:38 +00:00
# include "toolutil.h"
2002-10-01 17:44:04 +00:00
# include "unewdata.h"
2001-02-23 04:57:47 +00:00
# include "cstring.h"
2002-02-28 01:42:40 +00:00
# include "cmemory.h"
2001-02-22 21:18:29 +00:00
2004-11-12 00:26:54 +00:00
# include <stdio.h>
2010-10-07 19:46:41 +00:00
# define LENGTHOF(array) (int32_t)(sizeof(array) / sizeof((array)[0]))
2011-12-07 06:14:56 +00:00
/** The maximum UTF-16 length (number of UChars) in a UCA contraction. */
static const int32_t MAX_UCA_CONTRACTION_LENGTH = 4 ;
2010-10-28 19:05:02 +00:00
// script reordering structures
/* One index record: a reorder code together with an offset. ReorderIndexComparer orders these records by reorderCode. */
typedef struct {
uint16_t reorderCode ; /* key compared by ReorderIndexComparer */
uint16_t offset ; /* NOTE(review): presumably an offset into SCRIPT_TO_LEAD_BYTES_DATA; the code that fills it is not visible here - confirm */
} ReorderIndex ;
/*
 * Lead-byte <-> reorder-code tables filled in by readAnElement() while it
 * processes the "[top_byte" and "[reorderingTokens" directives.
 */
typedef struct {
uint16_t LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH ; /* lead bytes >= this value are rejected by readAnElement() */
uint16_t * LEAD_BYTE_TO_SCRIPTS_INDEX ; /* indexed by lead byte: 0 = no reorder codes, 0x8000|code = exactly one code, otherwise an offset into LEAD_BYTE_TO_SCRIPTS_DATA */
uint16_t LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH ; /* NOTE(review): presumably the capacity of LEAD_BYTE_TO_SCRIPTS_DATA; the only visible check against it has an empty body */
uint16_t * LEAD_BYTE_TO_SCRIPTS_DATA ; /* entries for lead bytes with several codes: a count followed by that many reorder codes */
uint16_t LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET ; /* next free slot in LEAD_BYTE_TO_SCRIPTS_DATA */
/* NOTE(review): the SCRIPT_TO_LEAD_BYTES_* members are presumably the reverse mapping (reorder code -> lead bytes); the code using them is outside this view - confirm */
uint16_t SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH ;
ReorderIndex * SCRIPT_TO_LEAD_BYTES_INDEX ;
uint16_t SCRIPT_TO_LEAD_BYTES_INDEX_COUNT ;
uint16_t SCRIPT_TO_LEAD_BYTES_DATA_LENGTH ;
uint16_t * SCRIPT_TO_LEAD_BYTES_DATA ;
uint16_t SCRIPT_TO_LEAD_BYTES_DATA_OFFSET ;
} LeadByteConstants ;
int ReorderIndexComparer ( const void * a , const void * b ) {
return reinterpret_cast < const ReorderIndex * > ( a ) - > reorderCode - reinterpret_cast < const ReorderIndex * > ( b ) - > reorderCode ;
}
2001-02-23 01:21:38 +00:00
/*
* Global verbosity flag. When it is TRUE, functions in this file print
* progress and diagnostic messages to stdout.
* NOTE(review): presumably set from a command-line option; the code that sets it is not visible here.
*/
UBool VERBOSE = FALSE ;
2003-05-06 16:50:18 +00:00
/* UCA version parsed from the "[UCA version =" directive in readAnElement(), where it is also checked against the Unicode (UCD) version. */
static UVersionInfo UCAVersion ;
2002-10-01 17:44:04 +00:00
# if UCONFIG_NO_COLLATION
/* dummy UDataInfo cf. udata.h */
/* Only compiled when UCONFIG_NO_COLLATION is set; every format and version field is zero. */
static UDataInfo dummyDataInfo = {
sizeof ( UDataInfo ) ,
0 ,
U_IS_BIG_ENDIAN ,
U_CHARSET_FAMILY ,
U_SIZEOF_UCHAR ,
0 ,
{ 0 , 0 , 0 , 0 } , /* dummy dataFormat */
{ 0 , 0 , 0 , 0 } , /* dummy formatVersion */
{ 0 , 0 , 0 , 0 } /* dummy dataVersion */
} ;
# else
2004-08-28 05:50:39 +00:00
/*
 * UDataInfo header template for the UCA data ("UCol").
 * The initializers are positional; the field names noted below follow unicode/udata.h.
 */
static const UDataInfo ucaDataInfo = {
sizeof ( UDataInfo ) , /* size */
0 , /* reservedWord */
U_IS_BIG_ENDIAN , /* isBigEndian */
U_CHARSET_FAMILY , /* charsetFamily */
sizeof ( UChar ) , /* sizeofUChar */
0 , /* reservedByte */
{ UCA_DATA_FORMAT_0 , UCA_DATA_FORMAT_1 , UCA_DATA_FORMAT_2 , UCA_DATA_FORMAT_3 } , /* dataFormat="UCol" */
/* 03/26/2002 bumped up version since format has changed */
/* 09/16/2002 bumped up version since we went from UColAttributeValue */
/* to int32_t in UColOptionSet */
/* 05/13/2003 This one also updated since we added UCA and UCD versions */
/* to header */
/* 09/11/2003 Adding information required by data swapper */
{ UCA_FORMAT_VERSION_0 , UCA_FORMAT_VERSION_1 , UCA_FORMAT_VERSION_2 , UCA_FORMAT_VERSION_3 } , /* formatVersion */
{ 0 , 0 , 0 , 0 } /* dataVersion = Unicode Version*/
} ;
/*
 * UDataInfo header template for the inverse UCA data ("InvC").
 * writeOutInverseData() copies it and fills dataVersion with the current
 * Unicode version before creating the output file.
 * The initializers are positional; the field names noted below follow unicode/udata.h.
 */
static const UDataInfo invUcaDataInfo = {
sizeof ( UDataInfo ) , /* size */
0 , /* reservedWord */
U_IS_BIG_ENDIAN , /* isBigEndian */
U_CHARSET_FAMILY , /* charsetFamily */
sizeof ( UChar ) , /* sizeofUChar */
0 , /* reservedByte */
{ INVUCA_DATA_FORMAT_0 , INVUCA_DATA_FORMAT_1 , INVUCA_DATA_FORMAT_2 , INVUCA_DATA_FORMAT_3 } , /* dataFormat="InvC" */
/* 03/26/2002 bumped up version since format has changed */
/* 04/29/2003 2.1 format - we have added UCA version to header */
{ INVUCA_FORMAT_VERSION_0 , INVUCA_FORMAT_VERSION_1 , INVUCA_FORMAT_VERSION_2 , INVUCA_FORMAT_VERSION_3 } , /* formatVersion */
{ 0 , 0 , 0 , 0 } /* dataVersion = Unicode Version*/
} ;
2002-10-01 17:44:04 +00:00
/* Shared scratch element: readAnElement() zeroes it for each parsed line and works on a pointer to it. */
UCAElements le ;
2010-10-28 19:05:02 +00:00
// returns number of characters read
2001-02-22 21:18:29 +00:00
int32_t readElement ( char * * from , char * to , char separator , UErrorCode * status ) {
if ( U_FAILURE ( * status ) ) {
return 0 ;
}
char buffer [ 1024 ] ;
int32_t i = 0 ;
2011-11-03 23:09:27 +00:00
for ( ; ; ) {
char c = * * from ;
if ( c = = separator | | ( separator = = ' ' & & c = = ' \t ' ) ) {
break ;
}
if ( c = = ' \0 ' ) {
2010-10-28 19:05:02 +00:00
return 0 ;
}
2011-11-03 23:09:27 +00:00
if ( c ! = ' ' ) {
* ( buffer + i + + ) = c ;
2001-02-22 21:18:29 +00:00
}
( * from ) + + ;
}
( * from ) + + ;
* ( buffer + i ) = 0 ;
//*to = (char *)malloc(strlen(buffer)+1);
strcpy ( to , buffer ) ;
2010-10-28 19:05:02 +00:00
return i ;
2001-02-22 21:18:29 +00:00
}
2010-10-28 19:05:02 +00:00
/**
 * Advances *from to the next space, tab or terminating NUL.
 *
 * @param from   In/out cursor into a NUL-terminated string.
 * @param status ICU error code; nothing is done if it already indicates a failure.
 * @return Number of characters skipped (0 if status was already a failure).
 */
int32_t skipUntilWhiteSpace(char **from, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }
    char *cursor = *from;
    int32_t skipped = 0;
    while (!(*cursor == ' ' || *cursor == '\t' || *cursor == '\0')) {
        ++cursor;
        ++skipped;
    }
    *from = cursor;
    return skipped;
}
/**
 * Advances *from past any run of spaces and tabs.
 *
 * @param from   In/out cursor into a NUL-terminated string.
 * @param status ICU error code; nothing is done if it already indicates a failure.
 * @return Number of characters skipped (0 if status was already a failure).
 */
int32_t skipWhiteSpace(char **from, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }
    char *cursor = *from;
    int32_t skipped = 0;
    for (; *cursor == ' ' || *cursor == '\t'; ++cursor) {
        ++skipped;
    }
    *from = cursor;
    return skipped;
}
2001-02-22 21:18:29 +00:00
2001-05-10 22:32:08 +00:00
uint32_t getSingleCEValue ( char * primary , char * secondary , char * tertiary , UErrorCode * status ) {
2001-02-22 21:18:29 +00:00
if ( U_FAILURE ( * status ) ) {
return 0 ;
}
uint32_t value = 0 ;
char primsave = ' \0 ' ;
char secsave = ' \0 ' ;
char tersave = ' \0 ' ;
char * primend = primary + 4 ;
if ( strlen ( primary ) > 4 ) {
primsave = * primend ;
* primend = ' \0 ' ;
}
char * secend = secondary + 2 ;
if ( strlen ( secondary ) > 2 ) {
secsave = * secend ;
* secend = ' \0 ' ;
}
char * terend = tertiary + 2 ;
if ( strlen ( tertiary ) > 2 ) {
tersave = * terend ;
* terend = ' \0 ' ;
}
2001-05-22 22:25:48 +00:00
uint32_t primvalue = ( uint32_t ) ( ( * primary ! = ' \0 ' ) ? strtoul ( primary , & primend , 16 ) : 0 ) ;
uint32_t secvalue = ( uint32_t ) ( ( * secondary ! = ' \0 ' ) ? strtoul ( secondary , & secend , 16 ) : 0 ) ;
uint32_t tervalue = ( uint32_t ) ( ( * tertiary ! = ' \0 ' ) ? strtoul ( tertiary , & terend , 16 ) : 0 ) ;
2001-02-22 21:18:29 +00:00
if ( primvalue < = 0xFF ) {
primvalue < < = 8 ;
}
value = ( ( primvalue < < UCOL_PRIMARYORDERSHIFT ) & UCOL_PRIMARYORDERMASK ) |
( ( secvalue < < UCOL_SECONDARYORDERSHIFT ) & UCOL_SECONDARYORDERMASK ) |
( tervalue & UCOL_TERTIARYORDERMASK ) ;
if ( primsave ! = ' \0 ' ) {
* primend = primsave ;
}
if ( secsave ! = ' \0 ' ) {
* secend = secsave ;
}
if ( tersave ! = ' \0 ' ) {
* terend = tersave ;
}
return value ;
}
2001-05-03 23:10:45 +00:00
/*
 * Inverse UCA table under construction. Each row is
 * { lead CE, continuation CE or 0, single code unit or (size << UCOL_INV_SHIFTVALUE) | offset into stringContinue }.
 * Row 0 is zeroed by addToInverse() before the first real row is added.
 */
static uint32_t inverseTable [ 0xFFFF ] [ 3 ] ;
/* Index of the last row in use (addNewInverse() increments it before writing); not a row count until assembleInverseTable() has run. */
static uint32_t inversePos = 0 ;
/* UTF-16 storage referenced from column 2 of inverseTable for rows that need more than one code unit. */
static UChar stringContinue [ 0xFFFF ] ;
/* Next free index in stringContinue. */
static uint32_t sContPos = 0 ;
2001-02-22 21:18:29 +00:00
2001-05-03 23:10:45 +00:00
/**
 * Appends a new row for `element` at the end of inverseTable.
 *
 * Column 0 receives the first CE, column 1 the second CE if it exists and is
 * a continuation (otherwise 0). Column 2 receives the single code unit, or,
 * for longer strings, a packed (size << UCOL_INV_SHIFTVALUE) | offset value
 * pointing at a copy of the string in stringContinue.
 *
 * @param element Element to add.
 * @param status  ICU error code; nothing is done if it already indicates a failure.
 */
static void addNewInverse(UCAElements *element, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return;
    }

    uint32_t *row = inverseTable[++inversePos];
    row[0] = element->CEs[0];
    row[1] = 0;
    if (element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
        row[1] = element->CEs[1];
    }

    if (element->cSize >= 2) {
        /* Multi-unit string: store it out of line and leave one slot after it. */
        row[2] = ((element->cSize + 1) << UCOL_INV_SHIFTVALUE) | sContPos;
        memcpy(stringContinue + sContPos, element->cPoints, element->cSize * sizeof(UChar));
        sContPos += element->cSize + 1;
    } else {
        row[2] = element->cPoints[0];
    }
}
2001-05-03 23:10:45 +00:00
/**
 * Inserts a row for `element` at index `position` of inverseTable, shifting
 * the rows from `position` up to the last used row one slot towards the end.
 *
 * The row is filled the same way as in addNewInverse(): first CE, optional
 * continuation CE, then either the single code unit or a packed size/offset
 * reference into stringContinue.
 *
 * @param element  Element to insert.
 * @param position Row index the new row will occupy.
 * @param status   ICU error code; nothing is done if it already indicates a failure.
 */
static void insertInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return;
    }

    if (position <= inversePos) {
        /* Open a gap: rows position..inversePos move up by one. */
        uint32_t bytesToShift = (inversePos - position + 1) * sizeof(inverseTable[0]);
        uprv_memmove(inverseTable[position + 1], inverseTable[position], bytesToShift);
    }

    uint32_t *row = inverseTable[position];
    row[0] = element->CEs[0];
    row[1] = 0;
    if (element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
        row[1] = element->CEs[1];
    }

    if (element->cSize >= 2) {
        /* Multi-unit string: store it out of line and leave one slot after it. */
        row[2] = ((element->cSize + 1) << UCOL_INV_SHIFTVALUE) | sContPos;
        memcpy(stringContinue + sContPos, element->cPoints, element->cSize * sizeof(UChar));
        sContPos += element->cSize + 1;
    } else {
        row[2] = element->cPoints[0];
    }

    inversePos++;
}
2001-05-03 23:10:45 +00:00
/**
 * Adds the string of `element` to a row that already exists for the same CEs.
 *
 * If the row so far held a single code unit, a new list is started in
 * stringContinue: old unit, 0xFFFF, new string, 0xFFFE. Otherwise the new
 * string is appended to the row's existing list: the slot before the insert
 * position becomes 0xFFFF and the list is closed with 0xFFFE again.
 * Column 2 of the row is updated with the new packed size and offset.
 *
 * @param element  Element whose cPoints are added.
 * @param position Index of the existing row.
 * @param status   ICU error code; nothing is done if it already indicates a failure.
 */
static void addToExistingInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return;
    }

    uint32_t *row = inverseTable[position];

    if ((row[2] & UCOL_INV_SIZEMASK) != 0) {
        /* The row already refers to a list in stringContinue: extend it. */
        const uint32_t listStart = row[2] & UCOL_INV_OFFSETMASK;
        const uint32_t listSize = (row[2] & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE;
        UChar *listEnd = stringContinue + listStart + listSize;

        if (listStart + listSize < sContPos) {
            /* Other data follows this list; move the units at the insert
             * position out of the way.
             * NOTE(review): only cSize + 1 units are moved and no other row's
             * stored offset is adjusted here - confirm this is sufficient. */
            memcpy(listEnd + element->cSize + 1, listEnd, (element->cSize + 1) * sizeof(UChar));
        }
        listEnd[-1] = 0xFFFF;
        memcpy(listEnd, element->cPoints, element->cSize * sizeof(UChar));
        sContPos += element->cSize + 1;
        listEnd[element->cSize] = 0xFFFE;
        row[2] = ((listSize + element->cSize + 1) << UCOL_INV_SHIFTVALUE) | listStart;
    } else {
        /* Single code unit so far: start a list holding the old unit and the new string. */
        const uint32_t listStart = sContPos;
        stringContinue[sContPos++] = (UChar)row[2];
        stringContinue[sContPos++] = 0xFFFF;
        memcpy(stringContinue + sContPos, element->cPoints, element->cSize * sizeof(UChar));
        sContPos += element->cSize;
        stringContinue[sContPos++] = 0xFFFE;
        row[2] = ((element->cSize + 3) << UCOL_INV_SHIFTVALUE) | listStart;
    }
}
2003-08-23 01:38:31 +00:00
/*
 * Compares two CE pairs (lead CE in [0], possible continuation CE in [1])
 * level by level: primary weights first, then secondary, then tertiary.
 * A second CE that is not a continuation is treated as 0.
 *
 * Returns 0 if both pairs are equal, -1 if source sorts before target
 * and 1 otherwise.
 */
static int32_t compareCEs(uint32_t *source, uint32_t *target) {
    const uint32_t srcLead = source[0];
    const uint32_t tgtLead = target[0];
    const uint32_t srcCont = isContinuation(source[1]) ? source[1] : 0;
    const uint32_t tgtCont = isContinuation(target[1]) ? target[1] : 0;

    if (srcLead == tgtLead && srcCont == tgtCont) {
        return 0;
    }

    /* Primary: upper 16 bits of the lead CE followed by those of the continuation. */
    uint32_t s = (srcLead & 0xFFFF0000) | ((srcCont & 0xFFFF0000) >> 16);
    uint32_t t = (tgtLead & 0xFFFF0000) | ((tgtCont & 0xFFFF0000) >> 16);
    if (s != t) {
        return (s < t) ? -1 : 1;
    }

    /* Secondary: bits 8..15 of the lead CE followed by those of the continuation. */
    s = (srcLead & 0x0000FF00) | ((srcCont & 0x0000FF00) >> 8);
    t = (tgtLead & 0x0000FF00) | ((tgtCont & 0x0000FF00) >> 8);
    if (s != t) {
        return (s < t) ? -1 : 1;
    }

    /* Tertiary: low byte of the lead CE followed by that of the continuation. */
    s = ((srcLead & 0x000000FF) << 8) | (srcCont & 0x000000FF);
    t = ((tgtLead & 0x000000FF) << 8) | (tgtCont & 0x000000FF);
    return (s < t) ? -1 : 1;
}
2001-05-03 23:10:45 +00:00
/**
 * Adds `element` to the inverse table, keeping the rows ordered by CE.
 *
 * For the duration of the call, bits 0xC0 of the first CE are masked out
 * (the original value is restored before returning). If the element has a
 * single CE, CEs[1] is set to 0 and stays 0.
 *
 * Depending on how the element compares with the last row, it is appended as
 * a new row, merged into an existing row with equal CEs, or inserted at its
 * sorted position. Row 0 is zeroed on first use and ends the backwards search.
 *
 * @param element Element to add.
 * @param status  ICU error code, passed on to the helpers.
 * @return inversePos after the operation.
 */
static uint32_t addToInverse(UCAElements *element, UErrorCode *status) {
    const uint32_t savedLeadCE = element->CEs[0];
    element->CEs[0] &= 0xFFFFFF3F;
    if (element->noOfCEs == 1) {
        element->CEs[1] = 0;
    }

    if (inversePos == 0) {
        /* First element: set up the all-zero row 0, then append. */
        inverseTable[0][0] = 0;
        inverseTable[0][1] = 0;
        inverseTable[0][2] = 0;
        addNewInverse(element, status);
    } else {
        const int32_t lastVsNew = compareCEs(inverseTable[inversePos], element->CEs);
        if (lastVsNew > 0) {
            /* Out of order: walk backwards to the first row that is not greater. */
            uint32_t slot = inversePos;
            int32_t cmp;
            do {
                cmp = compareCEs(inverseTable[--slot], element->CEs);
            } while (cmp > 0);
            if (VERBOSE) {
                fprintf(stdout, "p:%u ", (int)slot);
            }
            if (cmp == 0) {
                addToExistingInverse(element, slot, status);
            } else {
                insertInverse(element, slot + 1, status);
            }
        } else if (lastVsNew == 0) {
            addToExistingInverse(element, inversePos, status);
        } else {
            addNewInverse(element, status);
        }
    }

    element->CEs[0] = savedLeadCE;
    if (VERBOSE) {
        fprintf(stdout, "+");
    }
    return inversePos;
}
2003-04-30 00:49:01 +00:00
/**
 * Packs the inverse UCA table built so far into a single heap block:
 * the padded header, then the table rows, then the contents of stringContinue.
 *
 * A terminating row {0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF} is appended to
 * inverseTable, and inversePos is left holding the number of rows. Ordering
 * problems between adjacent rows are reported on stderr but do not stop the
 * assembly.
 *
 * @param status Set to U_MEMORY_ALLOCATION_ERROR if the block cannot be allocated.
 * @return Block obtained from uprv_malloc (it is not freed here), or NULL if
 *         the allocation failed. On failure the global table state is left
 *         unchanged.
 */
static InverseUCATableHeader *assembleInverseTable(UErrorCode *status)
{
    uint32_t headerByteSize = paddedsize(sizeof(InverseUCATableHeader));
    /* +2: inversePos is the index of the last row, and one terminating row is added below. */
    uint32_t inverseTableByteSize = (inversePos + 2) * sizeof(uint32_t) * 3;
    uint32_t contsByteSize = sContPos * sizeof(UChar);
    uint32_t totalByteSize = headerByteSize + inverseTableByteSize + contsByteSize;
    uint32_t i = 0;

    InverseUCATableHeader *result = (InverseUCATableHeader *)uprv_malloc(totalByteSize);
    /* Check the allocation before touching the block. */
    if (result == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    uprv_memset(result, 0, totalByteSize);

    result->byteSize = totalByteSize;

    /* Append the terminating row; afterwards inversePos is the row count. */
    inversePos++;
    inverseTable[inversePos][0] = 0xFFFFFFFF;
    inverseTable[inversePos][1] = 0xFFFFFFFF;
    inverseTable[inversePos][2] = 0x0000FFFF;
    inversePos++;

    /* Sanity check: rows must be in ascending CE order. */
    for (i = 2; i < inversePos; i++) {
        if (compareCEs(inverseTable[i - 1], inverseTable[i]) > 0) {
            fprintf(stderr, "Error at %i: %08X & %08X\n", (int)i, (int)inverseTable[i - 1][0], (int)inverseTable[i][0]);
        } else if (inverseTable[i - 1][0] == inverseTable[i][0] && !(inverseTable[i - 1][1] < inverseTable[i][1])) {
            fprintf(stderr, "Continuation error at %i: %08X %08X & %08X %08X\n", (int)i, (int)inverseTable[i - 1][0], (int)inverseTable[i - 1][1], (int)inverseTable[i][0], (int)inverseTable[i][1]);
        }
    }

    result->tableSize = inversePos;
    result->contsSize = sContPos;

    /* Both sections are addressed by byte offsets from the start of the block. */
    result->table = headerByteSize;
    result->conts = headerByteSize + inverseTableByteSize;

    memcpy((uint8_t *)result + result->table, inverseTable, inverseTableByteSize);
    memcpy((uint8_t *)result + result->conts, stringContinue, contsByteSize);

    return result;
}
2003-04-30 00:49:01 +00:00
static void writeOutInverseData ( InverseUCATableHeader * data ,
2002-07-02 22:28:40 +00:00
const char * outputDir ,
2001-02-23 19:10:28 +00:00
const char * copyright ,
2001-02-23 01:21:38 +00:00
UErrorCode * status )
{
2001-02-22 21:18:29 +00:00
UNewDataMemory * pData ;
long dataLength ;
2003-04-24 06:57:36 +00:00
UDataInfo invUcaInfo ;
uprv_memcpy ( & invUcaInfo , & invUcaDataInfo , sizeof ( UDataInfo ) ) ;
u_getUnicodeVersion ( invUcaInfo . dataVersion ) ;
2004-04-14 20:08:16 +00:00
pData = udata_create ( outputDir , INVC_DATA_TYPE , INVC_DATA_NAME , & invUcaInfo ,
2001-02-23 19:10:28 +00:00
copyright , status ) ;
2001-02-22 21:18:29 +00:00
if ( U_FAILURE ( * status ) ) {
2004-07-18 22:04:47 +00:00
fprintf ( stderr , " Error: unable to create %s " INVC_DATA_NAME " , error %s \n " , outputDir , u_errorName ( * status ) ) ;
2001-02-23 01:21:38 +00:00
return ;
2001-02-22 21:18:29 +00:00
}
/* write the data to the file */
2002-01-04 23:48:46 +00:00
if ( VERBOSE ) {
fprintf ( stdout , " Writing out inverse UCA table: %s%c%s.%s \n " , outputDir , U_FILE_SEP_CHAR ,
2004-04-14 20:08:16 +00:00
INVC_DATA_NAME ,
2001-02-23 19:10:28 +00:00
INVC_DATA_TYPE ) ;
2002-01-04 23:48:46 +00:00
}
2001-02-23 01:21:38 +00:00
udata_writeBlock ( pData , data , data - > byteSize ) ;
2001-02-22 21:18:29 +00:00
/* finish up */
dataLength = udata_finish ( pData , status ) ;
if ( U_FAILURE ( * status ) ) {
fprintf ( stderr , " Error: error %d writing the output file \n " , * status ) ;
2001-02-23 01:21:38 +00:00
return ;
2001-02-22 21:18:29 +00:00
}
}
2001-05-03 23:10:45 +00:00
/**
 * Converts one hexadecimal digit character to its numeric value.
 *
 * @param hex Character to convert; '0'-'9', 'a'-'f' and 'A'-'F' are digits.
 * @return 0..15 for a hexadecimal digit, 0 for any other character.
 */
static int32_t hex2num(char hex) {
    int32_t digit = 0;
    if (hex >= 'A' && hex <= 'F') {
        digit = 10 + (hex - 'A');
    } else if (hex >= 'a' && hex <= 'f') {
        digit = 10 + (hex - 'a');
    } else if (hex >= '0' && hex <= '9') {
        digit = hex - '0';
    }
    return digit;
}
2010-10-28 19:05:02 +00:00
// static char* CHARACTER_CATEGORY_REORDER_CODES[] = {
// "Zs", "Nd", "Sc"
// };
// static const uint16_t CHARACTER_CATEGORY_REORDER_CODE_OFFSET = 0x1000;
// static uint16_t CHARACTER_CATEGORY_REORDER_CODES_VALUE[] = {
// U_SPACE_SEPARATOR + CHARACTER_CATEGORY_REORDER_CODE_OFFSET,
// U_DECIMAL_DIGIT_NUMBER + CHARACTER_CATEGORY_REORDER_CODE_OFFSET,
// U_CURRENCY_SYMBOL + CHARACTER_CATEGORY_REORDER_CODE_OFFSET
// };
2011-11-03 23:09:27 +00:00
/*
 * Names that getReorderCode() accepts in addition to the reordering entries
 * and script names it looks up first. Each name maps either to a reorder
 * code or to -2, which tells the caller to skip the token.
 */
static const struct {
const char * name ;
int32_t code ;
} specialReorderTokens [ ] = {
{ " TERMINATOR " , - 2 } , // -2 means "ignore"
{ " LEVEL-SEPARATOR " , - 2 } ,
{ " FIELD-SEPARATOR " , - 2 } ,
{ " COMPRESS " , - 2 } , // TODO: We should parse/store which lead bytes are compressible; there is a ticket for that.
{ " PUNCTUATION " , UCOL_REORDER_CODE_PUNCTUATION } ,
{ " IMPLICIT " , USCRIPT_HAN } , // Implicit weights are usually for Han characters. Han & unassigned share a lead byte.
{ " TRAILING " , - 2 } , // We do not reorder trailing weights (those after implicits).
{ " SPECIAL " , - 2 } // We must never reorder internal, special CE lead bytes.
} ;
int32_t getReorderCode ( const char * name ) {
int32_t code = ucol_findReorderingEntry ( name ) ;
if ( code > = 0 ) {
return code ;
2010-10-28 19:05:02 +00:00
}
2011-11-03 23:09:27 +00:00
code = u_getPropertyValueEnum ( UCHAR_SCRIPT , name ) ;
if ( code > = 0 ) {
return code ;
2010-10-28 19:05:02 +00:00
}
2011-11-03 23:09:27 +00:00
for ( int32_t i = 0 ; i < LENGTHOF ( specialReorderTokens ) ; + + i ) {
if ( 0 = = strcmp ( name , specialReorderTokens [ i ] . name ) ) {
return specialReorderTokens [ i ] . code ;
}
2010-10-28 19:05:02 +00:00
}
2011-11-03 23:09:27 +00:00
return - 1 ; // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE.
2010-10-28 19:05:02 +00:00
}
UCAElements * readAnElement ( FILE * data , tempUCATable * t , UCAConstants * consts , LeadByteConstants * leadByteConstants , UErrorCode * status ) {
static int itemsToDataBlock = 0 ;
static int scriptDataWritten = 0 ;
2001-02-22 21:18:29 +00:00
char buffer [ 2048 ] , primary [ 100 ] , secondary [ 100 ] , tertiary [ 100 ] ;
2010-10-28 19:05:02 +00:00
UChar uBuffer [ 2048 ] ;
UChar uBuffer2 [ 2048 ] ;
UChar leadByte [ 100 ] , scriptCode [ 100 ] ;
2001-02-22 21:18:29 +00:00
int32_t i = 0 ;
2001-02-27 20:09:03 +00:00
unsigned int theValue ;
2001-02-22 21:18:29 +00:00
char * pointer = NULL ;
char * commentStart = NULL ;
char * startCodePoint = NULL ;
char * endCodePoint = NULL ;
char * result = fgets ( buffer , 2048 , data ) ;
2003-12-11 05:00:40 +00:00
int32_t buflen = ( int32_t ) uprv_strlen ( buffer ) ;
2001-02-22 21:18:29 +00:00
if ( U_FAILURE ( * status ) ) {
return 0 ;
}
* primary = * secondary = * tertiary = ' \0 ' ;
2010-10-28 19:05:02 +00:00
* leadByte = * scriptCode = ' \0 ' ;
2001-02-22 21:18:29 +00:00
if ( result = = NULL ) {
if ( feof ( data ) ) {
return NULL ;
} else {
fprintf ( stderr , " empty line but no EOF! \n " ) ;
* status = U_INVALID_FORMAT_ERROR ;
return NULL ;
}
}
2002-07-19 21:59:26 +00:00
while ( buflen > 0 & & ( buffer [ buflen - 1 ] = = ' \r ' | | buffer [ buflen - 1 ] = = ' \n ' ) ) {
buffer [ - - buflen ] = 0 ;
}
if ( buffer [ 0 ] = = 0 | | buffer [ 0 ] = = ' # ' ) {
2001-02-22 21:18:29 +00:00
return NULL ; // just a comment, skip whole line
}
2010-10-07 19:46:41 +00:00
UCAElements * element = & le ;
memset ( element , 0 , sizeof ( * element ) ) ;
2001-02-22 21:18:29 +00:00
2002-07-02 22:28:40 +00:00
enum ActionType {
READCE ,
2010-10-13 17:40:32 +00:00
READHEX1 ,
READHEX2 ,
2010-10-28 19:05:02 +00:00
READUCAVERSION ,
READLEADBYTETOSCRIPTS ,
READSCRIPTTOLEADBYTES ,
IGNORE ,
2002-07-02 22:28:40 +00:00
} ;
2002-06-13 18:24:36 +00:00
// Directives.
2001-02-22 21:18:29 +00:00
if ( buffer [ 0 ] = = ' [ ' ) {
2002-06-13 18:24:36 +00:00
uint32_t cnt = 0 ;
2007-08-29 02:57:42 +00:00
static const struct {
char name [ 128 ] ;
2002-06-13 18:24:36 +00:00
uint32_t * what ;
2002-07-02 22:28:40 +00:00
ActionType what_to_do ;
} vt [ ] = { { " [first tertiary ignorable " , consts - > UCA_FIRST_TERTIARY_IGNORABLE , READCE } ,
{ " [last tertiary ignorable " , consts - > UCA_LAST_TERTIARY_IGNORABLE , READCE } ,
{ " [first secondary ignorable " , consts - > UCA_FIRST_SECONDARY_IGNORABLE , READCE } ,
{ " [last secondary ignorable " , consts - > UCA_LAST_SECONDARY_IGNORABLE , READCE } ,
{ " [first primary ignorable " , consts - > UCA_FIRST_PRIMARY_IGNORABLE , READCE } ,
{ " [last primary ignorable " , consts - > UCA_LAST_PRIMARY_IGNORABLE , READCE } ,
{ " [first variable " , consts - > UCA_FIRST_VARIABLE , READCE } ,
{ " [last variable " , consts - > UCA_LAST_VARIABLE , READCE } ,
2002-07-11 22:42:57 +00:00
{ " [first regular " , consts - > UCA_FIRST_NON_VARIABLE , READCE } ,
{ " [last regular " , consts - > UCA_LAST_NON_VARIABLE , READCE } ,
2002-07-02 22:28:40 +00:00
{ " [first implicit " , consts - > UCA_FIRST_IMPLICIT , READCE } ,
{ " [last implicit " , consts - > UCA_LAST_IMPLICIT , READCE } ,
{ " [first trailing " , consts - > UCA_FIRST_TRAILING , READCE } ,
{ " [last trailing " , consts - > UCA_LAST_TRAILING , READCE } ,
2010-10-28 19:05:02 +00:00
{ " [fixed top " , & consts - > UCA_PRIMARY_TOP_MIN , READHEX1 } ,
{ " [fixed first implicit byte " , & consts - > UCA_PRIMARY_IMPLICIT_MIN , READHEX1 } ,
{ " [fixed last implicit byte " , & consts - > UCA_PRIMARY_IMPLICIT_MAX , READHEX1 } ,
{ " [fixed first trail byte " , & consts - > UCA_PRIMARY_TRAILING_MIN , READHEX1 } ,
{ " [fixed last trail byte " , & consts - > UCA_PRIMARY_TRAILING_MAX , READHEX1 } ,
{ " [fixed first special byte " , & consts - > UCA_PRIMARY_SPECIAL_MIN , READHEX1 } ,
{ " [fixed last special byte " , & consts - > UCA_PRIMARY_SPECIAL_MAX , READHEX1 } ,
{ " [variable top = " , & t - > options - > variableTopValue , READHEX2 } ,
{ " [UCA version = " , NULL , READUCAVERSION } ,
2011-12-07 00:34:11 +00:00
{ " [top_byte " , NULL , READLEADBYTETOSCRIPTS } ,
{ " [reorderingTokens " , NULL , READSCRIPTTOLEADBYTES } ,
{ " [categories " , NULL , IGNORE } ,
2010-10-28 19:05:02 +00:00
{ " [first tertiary in secondary non-ignorable " , NULL , IGNORE } ,
{ " [last tertiary in secondary non-ignorable " , NULL , IGNORE } ,
{ " [first secondary in primary non-ignorable " , NULL , IGNORE } ,
{ " [last secondary in primary non-ignorable " , NULL , IGNORE } ,
2002-06-13 18:24:36 +00:00
} ;
for ( cnt = 0 ; cnt < sizeof ( vt ) / sizeof ( vt [ 0 ] ) ; cnt + + ) {
uint32_t vtLen = ( uint32_t ) uprv_strlen ( vt [ cnt ] . name ) ;
if ( uprv_strncmp ( buffer , vt [ cnt ] . name , vtLen ) = = 0 ) {
2010-10-13 17:40:32 +00:00
ActionType what_to_do = vt [ cnt ] . what_to_do ;
2010-10-28 19:05:02 +00:00
if ( what_to_do = = IGNORE ) { //vt[cnt].what_to_do == IGNORE
return NULL ;
} else if ( what_to_do = = READHEX1 | | what_to_do = = READHEX2 ) {
2010-10-13 17:40:32 +00:00
pointer = buffer + vtLen ;
2010-10-28 19:05:02 +00:00
int32_t numBytes = readElement ( & pointer , primary , ' ] ' , status ) / 2 ;
2010-10-13 17:40:32 +00:00
if ( numBytes ! = ( what_to_do = = READHEX1 ? 1 : 2 ) ) {
fprintf ( stderr , " Value of \" %s \" has unexpected number of %d bytes \n " ,
buffer , ( int ) numBytes ) ;
2010-10-28 19:05:02 +00:00
//*status = U_INVALID_FORMAT_ERROR;
2010-10-13 17:40:32 +00:00
return NULL ;
2002-07-02 22:28:40 +00:00
}
2010-10-13 17:40:32 +00:00
* ( vt [ cnt ] . what ) = ( uint32_t ) uprv_strtoul ( primary , & pointer , 16 ) ;
if ( * pointer ! = 0 ) {
fprintf ( stderr , " Value of \" %s \" is not a hexadecimal number \n " , buffer ) ;
2010-10-28 19:05:02 +00:00
//*status = U_INVALID_FORMAT_ERROR;
2010-10-13 17:40:32 +00:00
return NULL ;
}
} else if ( what_to_do = = READCE ) {
2009-11-13 19:25:21 +00:00
// TODO: combine & clean up the two CE parsers
2002-07-02 22:28:40 +00:00
pointer = strchr ( buffer + vtLen , ' [ ' ) ;
if ( pointer ) {
pointer + + ;
2010-10-28 19:05:02 +00:00
element - > sizePrim [ 0 ] = readElement ( & pointer , primary , ' , ' , status ) / 2 ;
element - > sizeSec [ 0 ] = readElement ( & pointer , secondary , ' , ' , status ) / 2 ;
element - > sizeTer [ 0 ] = readElement ( & pointer , tertiary , ' ] ' , status ) / 2 ;
2002-07-02 22:28:40 +00:00
vt [ cnt ] . what [ 0 ] = getSingleCEValue ( primary , secondary , tertiary , status ) ;
if ( element - > sizePrim [ 0 ] > 2 | | element - > sizeSec [ 0 ] > 1 | | element - > sizeTer [ 0 ] > 1 ) {
uint32_t CEi = 1 ;
uint32_t value = UCOL_CONTINUATION_MARKER ; /* Continuation marker */
if ( 2 * CEi < element - > sizePrim [ i ] ) {
value | = ( ( hex2num ( * ( primary + 4 * CEi ) ) & 0xF ) < < 28 ) ;
value | = ( ( hex2num ( * ( primary + 4 * CEi + 1 ) ) & 0xF ) < < 24 ) ;
}
if ( 2 * CEi + 1 < element - > sizePrim [ i ] ) {
value | = ( ( hex2num ( * ( primary + 4 * CEi + 2 ) ) & 0xF ) < < 20 ) ;
value | = ( ( hex2num ( * ( primary + 4 * CEi + 3 ) ) & 0xF ) < < 16 ) ;
}
if ( CEi < element - > sizeSec [ i ] ) {
value | = ( ( hex2num ( * ( secondary + 2 * CEi ) ) & 0xF ) < < 12 ) ;
value | = ( ( hex2num ( * ( secondary + 2 * CEi + 1 ) ) & 0xF ) < < 8 ) ;
}
if ( CEi < element - > sizeTer [ i ] ) {
value | = ( ( hex2num ( * ( tertiary + 2 * CEi ) ) & 0x3 ) < < 4 ) ;
value | = ( hex2num ( * ( tertiary + 2 * CEi + 1 ) ) & 0xF ) ;
}
CEi + + ;
vt [ cnt ] . what [ 1 ] = value ;
//element->CEs[CEindex++] = value;
} else {
vt [ cnt ] . what [ 1 ] = 0 ;
}
} else {
fprintf ( stderr , " Failed to read a CE from line %s \n " , buffer ) ;
}
2010-10-28 19:05:02 +00:00
} else if ( what_to_do = = READUCAVERSION ) { //vt[cnt].what_to_do == READUCAVERSION
2003-04-24 06:57:36 +00:00
u_versionFromString ( UCAVersion , buffer + vtLen ) ;
if ( VERBOSE ) {
2011-10-30 00:20:58 +00:00
char uca [ U_MAX_VERSION_STRING_LENGTH ] ;
u_versionToString ( UCAVersion , uca ) ;
fprintf ( stdout , " UCA version %s \n " , uca ) ;
}
UVersionInfo UCDVersion ;
u_getUnicodeVersion ( UCDVersion ) ;
if ( UCAVersion [ 0 ] ! = UCDVersion [ 0 ] | | UCAVersion [ 1 ] ! = UCDVersion [ 1 ] ) {
char uca [ U_MAX_VERSION_STRING_LENGTH ] ;
char ucd [ U_MAX_VERSION_STRING_LENGTH ] ;
u_versionToString ( UCAVersion , uca ) ;
u_versionToString ( UCDVersion , ucd ) ;
fprintf ( stderr , " error: UCA version %s != UCD version %s (temporarily change the FractionalUCA.txt UCA version during Unicode version upgrade) \n " , uca , ucd ) ;
* status = U_INVALID_FORMAT_ERROR ;
return NULL ;
2003-04-24 06:57:36 +00:00
}
2010-10-28 19:05:02 +00:00
} else if ( what_to_do = = READLEADBYTETOSCRIPTS ) { //vt[cnt].what_to_do == READLEADBYTETOSCRIPTS
pointer = buffer + vtLen ;
2011-12-07 00:34:11 +00:00
skipWhiteSpace ( & pointer , status ) ;
2010-10-28 19:05:02 +00:00
uint16_t leadByte = ( hex2num ( * pointer + + ) * 16 ) + hex2num ( * pointer + + ) ;
//fprintf(stdout, "~~~~ processing lead byte = %02x\n", leadByte);
if ( leadByte > = leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH ) {
fprintf ( stderr , " Lead byte larger than allocated table! " ) ;
// set status and return
* status = U_INTERNAL_PROGRAM_ERROR ;
return NULL ;
}
skipWhiteSpace ( & pointer , status ) ;
2011-11-03 23:09:27 +00:00
2010-10-28 19:05:02 +00:00
int32_t reorderCodeArray [ 100 ] ;
uint32_t reorderCodeArrayCount = 0 ;
char scriptName [ 100 ] ;
int32_t elementLength = 0 ;
while ( ( elementLength = readElement ( & pointer , scriptName , ' ' , status ) ) > 0 ) {
if ( scriptName [ 0 ] = = ' ] ' ) {
break ;
}
2011-11-03 23:09:27 +00:00
int32_t reorderCode = getReorderCode ( scriptName ) ;
if ( reorderCode = = - 2 ) {
continue ; // Ignore "TERMINATOR" etc.
}
if ( reorderCode < 0 ) {
fprintf ( stdout , " Syntax error: unable to parse reorder code from '%s' \n " , scriptName ) ;
* status = U_INVALID_FORMAT_ERROR ;
return NULL ;
2010-10-28 19:05:02 +00:00
}
2011-11-03 23:09:27 +00:00
if ( reorderCodeArrayCount > = LENGTHOF ( reorderCodeArray ) ) {
fprintf ( stdout , " reorder code array count is greater than allocated size! \n " ) ;
2010-10-28 19:05:02 +00:00
* status = U_INTERNAL_PROGRAM_ERROR ;
return NULL ;
}
2011-11-03 23:09:27 +00:00
reorderCodeArray [ reorderCodeArrayCount + + ] = reorderCode ;
2010-10-28 19:05:02 +00:00
}
//fprintf(stdout, "reorderCodeArrayCount = %d\n", reorderCodeArrayCount);
switch ( reorderCodeArrayCount ) {
case 0 :
leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX [ leadByte ] = 0 ;
break ;
case 1 :
// TODO = move 0x8000 into defined constant
leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX [ leadByte ] = 0x8000 | reorderCodeArray [ 0 ] ;
break ;
default :
if ( reorderCodeArrayCount + leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET > leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH ) {
// Error condition
}
leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX [ leadByte ] = leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET ;
leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA [ leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET + + ] = reorderCodeArrayCount ;
for ( int reorderCodeIndex = 0 ; reorderCodeIndex < reorderCodeArrayCount ; reorderCodeIndex + + ) {
leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA [ leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET + + ] = reorderCodeArray [ reorderCodeIndex ] ;
}
}
} else if ( what_to_do = = READSCRIPTTOLEADBYTES ) { //vt[cnt].what_to_do == READSCRIPTTOLEADBYTES
uint16_t leadByteArray [ 100 ] ;
uint32_t leadByteArrayCount = 0 ;
char scriptName [ 100 ] ;
pointer = buffer + vtLen ;
2011-12-07 00:34:11 +00:00
skipWhiteSpace ( & pointer , status ) ;
2010-10-28 19:05:02 +00:00
uint32_t scriptNameLength = readElement ( & pointer , scriptName , ' \t ' , status ) ;
2011-11-03 23:09:27 +00:00
int32_t reorderCode = getReorderCode ( scriptName ) ;
if ( reorderCode > = 0 ) {
//fprintf(stdout, "^^^ processing reorder code = %04x (%s)\n", reorderCode, scriptName);
2010-10-28 19:05:02 +00:00
skipWhiteSpace ( & pointer , status ) ;
2011-11-03 23:09:27 +00:00
2010-10-28 19:05:02 +00:00
int32_t elementLength = 0 ;
char leadByteString [ 100 ] ;
while ( ( elementLength = readElement ( & pointer , leadByteString , ' = ' , status ) ) = = 2 ) {
//fprintf(stdout, "\tleadByteArrayCount = %d, elementLength = %d, leadByteString = %s\n", leadByteArrayCount, elementLength, leadByteString);
uint32_t leadByte = ( hex2num ( leadByteString [ 0 ] ) * 16 ) + hex2num ( leadByteString [ 1 ] ) ;
leadByteArray [ leadByteArrayCount + + ] = ( uint16_t ) leadByte ;
skipUntilWhiteSpace ( & pointer , status ) ;
}
2011-11-03 23:09:27 +00:00
2010-10-28 19:05:02 +00:00
if ( leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT > = leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH ) {
//fprintf(stdout, "\tError condition\n");
//fprintf(stdout, "\tindex count = %d, total index size = %d\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT, sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX) / sizeof(leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[0]));
// Error condition
* status = U_INTERNAL_PROGRAM_ERROR ;
return NULL ;
}
2011-11-03 23:09:27 +00:00
leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX [ leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT ] . reorderCode = reorderCode ;
2010-10-28 19:05:02 +00:00
//fprintf(stdout, "\tlead byte count = %d\n", leadByteArrayCount);
//fprintf(stdout, "\tlead byte array = ");
//for (int i = 0; i < leadByteArrayCount; i++) {
// fprintf(stdout, "%02x, ", leadByteArray[i]);
//}
//fprintf(stdout, "\n");
2011-11-03 23:09:27 +00:00
2010-10-28 19:05:02 +00:00
switch ( leadByteArrayCount ) {
case 0 :
leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX [ leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT ] . offset = 0 ;
break ;
case 1 :
// TODO = move 0x8000 into defined constant
//fprintf(stdout, "\t+++++ lead byte = &x\n", leadByteArray[0]);
leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX [ leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT ] . offset = 0x8000 | leadByteArray [ 0 ] ;
break ;
default :
//fprintf(stdout, "\t+++++ lead bytes written to data block - %d\n", itemsToDataBlock++);
//fprintf(stdout, "\tlead bytes = ");
//for (int i = 0; i < leadByteArrayCount; i++) {
// fprintf(stdout, "%02x, ", leadByteArray[i]);
//}
//fprintf(stdout, "\n");
//fprintf(stdout, "\tBEFORE data bytes = ");
//for (int i = 0; i < leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; i++) {
// fprintf(stdout, "%02x, ", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[i]);
//}
//fprintf(stdout, "\n");
//fprintf(stdout, "\tdata offset = %d, data length = %d\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET, leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_LENGTH);
if ( ( leadByteArrayCount + leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET ) > leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_LENGTH ) {
//fprintf(stdout, "\tError condition\n");
// Error condition
* status = U_INTERNAL_PROGRAM_ERROR ;
return NULL ;
}
leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX [ leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT ] . offset = leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET ;
leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA [ leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET + + ] = leadByteArrayCount ;
scriptDataWritten + + ;
memcpy ( & leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA [ leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET ] ,
leadByteArray , leadByteArrayCount * sizeof ( leadByteArray [ 0 ] ) ) ;
scriptDataWritten + = leadByteArrayCount ;
//fprintf(stdout, "\tlead byte data written = %d\n", scriptDataWritten);
//fprintf(stdout, "\tcurrentIndex.reorderCode = %04x, currentIndex.offset = %04x\n",
// leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT.reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT.offset);
leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET + = leadByteArrayCount ;
//fprintf(stdout, "\tdata offset = %d\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET);
//fprintf(stdout, "\tAFTER data bytes = ");
//for (int i = 0; i < leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; i++) {
// fprintf(stdout, "%02x, ", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[i]);
//}
//fprintf(stdout, "\n");
}
2011-11-03 23:09:27 +00:00
//if (reorderCode >= 0x1000) {
2010-10-28 19:05:02 +00:00
// fprintf(stdout, "@@@@ reorderCode = %x, offset = %x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT].offset);
// for (int i = 0; i < leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA_OFFSET; i++) {
// fprintf(stdout, "%02x, ", leadByteConstants->SCRIPT_TO_LEAD_BYTES_DATA[i]);
// }
// fprintf(stdout, "\n");
// }
leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT + + ;
}
2002-06-13 18:24:36 +00:00
}
2002-04-30 18:04:07 +00:00
return NULL ;
2001-04-19 17:08:07 +00:00
}
2002-06-13 18:24:36 +00:00
}
2002-07-02 22:28:40 +00:00
fprintf ( stderr , " Warning: unrecognized option: %s \n " , buffer ) ;
//*status = U_INVALID_FORMAT_ERROR;
2002-06-13 18:24:36 +00:00
return NULL ;
2001-02-22 21:18:29 +00:00
}
startCodePoint = buffer ;
endCodePoint = strchr ( startCodePoint , ' ; ' ) ;
if ( endCodePoint = = 0 ) {
fprintf ( stderr , " error - line with no code point! \n " ) ;
* status = U_INVALID_FORMAT_ERROR ; /* No code point - could be an error, but probably only an empty line */
return NULL ;
} else {
* ( endCodePoint ) = 0 ;
}
2010-10-07 19:46:41 +00:00
char * pipePointer = strchr ( buffer , ' | ' ) ;
if ( pipePointer ! = NULL ) {
// Read the prefix string which precedes the actual string.
* pipePointer = 0 ;
element - > prefixSize =
u_parseString ( startCodePoint ,
element - > prefixChars , LENGTHOF ( element - > prefixChars ) ,
NULL , status ) ;
if ( U_FAILURE ( * status ) ) {
fprintf ( stderr , " error - parsing of prefix \" %s \" failed: %s \n " ,
startCodePoint , u_errorName ( * status ) ) ;
* status = U_INVALID_FORMAT_ERROR ;
return NULL ;
2001-02-22 21:18:29 +00:00
}
2010-10-07 19:46:41 +00:00
element - > prefix = element - > prefixChars ;
startCodePoint = pipePointer + 1 ;
}
2001-02-22 21:18:29 +00:00
2010-10-07 19:46:41 +00:00
// Read the string which gets the CE(s) assigned.
element - > cSize =
u_parseString ( startCodePoint ,
element - > uchars , LENGTHOF ( element - > uchars ) ,
NULL , status ) ;
if ( U_FAILURE ( * status ) ) {
fprintf ( stderr , " error - parsing of code point(s) \" %s \" failed: %s \n " ,
startCodePoint , u_errorName ( * status ) ) ;
* status = U_INVALID_FORMAT_ERROR ;
return NULL ;
2001-02-22 21:18:29 +00:00
}
2010-10-07 19:46:41 +00:00
element - > cPoints = element - > uchars ;
2001-02-22 21:18:29 +00:00
startCodePoint = endCodePoint + 1 ;
2001-05-10 22:32:08 +00:00
2001-02-22 21:18:29 +00:00
commentStart = strchr ( startCodePoint , ' # ' ) ;
if ( commentStart = = NULL ) {
2002-07-19 21:59:26 +00:00
commentStart = strlen ( startCodePoint ) + startCodePoint ;
2001-02-22 21:18:29 +00:00
}
i = 0 ;
uint32_t CEindex = 0 ;
element - > noOfCEs = 0 ;
for ( ; ; ) {
endCodePoint = strchr ( startCodePoint , ' ] ' ) ;
if ( endCodePoint = = NULL | | endCodePoint > = commentStart ) {
break ;
}
pointer = strchr ( startCodePoint , ' [ ' ) ;
pointer + + ;
2010-10-28 19:05:02 +00:00
element - > sizePrim [ i ] = readElement ( & pointer , primary , ' , ' , status ) / 2 ;
element - > sizeSec [ i ] = readElement ( & pointer , secondary , ' , ' , status ) / 2 ;
element - > sizeTer [ i ] = readElement ( & pointer , tertiary , ' ] ' , status ) / 2 ;
2001-02-22 21:18:29 +00:00
/* I want to get the CEs entered right here, including continuation */
2001-05-10 22:32:08 +00:00
element - > CEs [ CEindex + + ] = getSingleCEValue ( primary , secondary , tertiary , status ) ;
uint32_t CEi = 1 ;
while ( 2 * CEi < element - > sizePrim [ i ] | | CEi < element - > sizeSec [ i ] | | CEi < element - > sizeTer [ i ] ) {
uint32_t value = UCOL_CONTINUATION_MARKER ; /* Continuation marker */
if ( 2 * CEi < element - > sizePrim [ i ] ) {
value | = ( ( hex2num ( * ( primary + 4 * CEi ) ) & 0xF ) < < 28 ) ;
value | = ( ( hex2num ( * ( primary + 4 * CEi + 1 ) ) & 0xF ) < < 24 ) ;
}
if ( 2 * CEi + 1 < element - > sizePrim [ i ] ) {
value | = ( ( hex2num ( * ( primary + 4 * CEi + 2 ) ) & 0xF ) < < 20 ) ;
value | = ( ( hex2num ( * ( primary + 4 * CEi + 3 ) ) & 0xF ) < < 16 ) ;
}
if ( CEi < element - > sizeSec [ i ] ) {
value | = ( ( hex2num ( * ( secondary + 2 * CEi ) ) & 0xF ) < < 12 ) ;
value | = ( ( hex2num ( * ( secondary + 2 * CEi + 1 ) ) & 0xF ) < < 8 ) ;
}
if ( CEi < element - > sizeTer [ i ] ) {
value | = ( ( hex2num ( * ( tertiary + 2 * CEi ) ) & 0x3 ) < < 4 ) ;
value | = ( hex2num ( * ( tertiary + 2 * CEi + 1 ) ) & 0xF ) ;
}
CEi + + ;
element - > CEs [ CEindex + + ] = value ;
2001-02-22 21:18:29 +00:00
}
2001-05-10 22:32:08 +00:00
startCodePoint = endCodePoint + 1 ;
i + + ;
2001-02-22 21:18:29 +00:00
}
element - > noOfCEs = CEindex ;
2005-05-19 06:51:31 +00:00
#if 0
2001-03-05 20:08:31 +00:00
element - > isThai = UCOL_ISTHAIPREVOWEL ( element - > cPoints [ 0 ] ) ;
2005-05-19 06:51:31 +00:00
# endif
2001-02-22 21:18:29 +00:00
// we don't want any strange stuff after useful data!
2007-08-29 02:57:42 +00:00
if ( pointer = = NULL ) {
/* huh? Did we get ']' without the '['? Pair your brackets! */
* status = U_INVALID_FORMAT_ERROR ;
}
else {
while ( pointer < commentStart ) {
if ( * pointer ! = ' ' & & * pointer ! = ' \t ' )
{
* status = U_INVALID_FORMAT_ERROR ;
break ;
}
pointer + + ;
2001-02-22 21:18:29 +00:00
}
}
2010-10-12 23:54:40 +00:00
if ( element - > cSize = = 1 & & element - > cPoints [ 0 ] = = 0xfffe ) {
// UCA 6.0 gives U+FFFE a special minimum weight using the
// byte 02 which is the merge-sort-key separator and illegal for any
// other characters.
} else {
// Rudimentary check for valid bytes in CE weights.
// For a more comprehensive check see cintltst /tscoll/citertst/TestCEValidity
for ( i = 0 ; i < ( int32_t ) CEindex ; + + i ) {
uint32_t value = element - > CEs [ i ] ;
uint8_t bytes [ 4 ] = {
( uint8_t ) ( value > > 24 ) ,
( uint8_t ) ( value > > 16 ) ,
( uint8_t ) ( value > > 8 ) ,
( uint8_t ) ( value & UCOL_NEW_TERTIARYORDERMASK )
} ;
for ( int j = 0 ; j < 4 ; + + j ) {
if ( 0 ! = bytes [ j ] & & bytes [ j ] < 3 ) {
fprintf ( stderr , " Warning: invalid UCA weight byte %02X for %s \n " , bytes [ j ] , buffer ) ;
return NULL ;
}
}
// Primary second bytes 03 and FF are compression terminators.
if ( ! isContinuation ( value ) & & ( bytes [ 1 ] = = 3 | | bytes [ 1 ] = = 0xFF ) ) {
fprintf ( stderr , " Warning: invalid UCA primary second weight byte %02X for %s \n " ,
bytes [ 1 ] , buffer ) ;
2010-06-15 06:06:24 +00:00
return NULL ;
2009-11-13 19:25:21 +00:00
}
}
}
2001-02-22 21:18:29 +00:00
if ( U_FAILURE ( * status ) ) {
2003-08-23 01:38:31 +00:00
fprintf ( stderr , " problem putting stuff in hash table %s \n " , u_errorName ( * status ) ) ;
2001-02-22 21:18:29 +00:00
* status = U_INTERNAL_PROGRAM_ERROR ;
return NULL ;
}
return element ;
}
2001-02-23 01:21:38 +00:00
void writeOutData ( UCATableHeader * data ,
2002-06-13 18:24:36 +00:00
UCAConstants * consts ,
2010-10-28 19:05:02 +00:00
LeadByteConstants * leadByteConstants ,
2011-12-07 06:14:56 +00:00
UChar contractions [ ] [ MAX_UCA_CONTRACTION_LENGTH ] ,
2001-06-05 22:52:56 +00:00
uint32_t noOfcontractions ,
2001-02-23 01:21:38 +00:00
const char * outputDir ,
2001-02-23 19:10:28 +00:00
const char * copyright ,
2001-02-23 01:21:38 +00:00
UErrorCode * status )
{
2001-02-22 21:18:29 +00:00
if ( U_FAILURE ( * status ) ) {
return ;
}
2001-06-05 22:52:56 +00:00
uint32_t size = data - > size ;
2003-09-11 18:56:13 +00:00
data - > UCAConsts = data - > size ;
data - > size + = paddedsize ( sizeof ( UCAConstants ) ) ;
2001-06-05 22:52:56 +00:00
if ( noOfcontractions ! = 0 ) {
2011-12-07 06:14:56 +00:00
uprv_memset ( & contractions [ noOfcontractions ] [ 0 ] , 0 , MAX_UCA_CONTRACTION_LENGTH * U_SIZEOF_UCHAR ) ;
2001-06-05 22:52:56 +00:00
noOfcontractions + + ;
2002-07-02 22:28:40 +00:00
data - > contractionUCACombos = data - > size ;
2011-12-07 06:14:56 +00:00
data - > contractionUCACombosWidth = ( uint8_t ) MAX_UCA_CONTRACTION_LENGTH ;
2003-09-11 18:56:13 +00:00
data - > contractionUCACombosSize = noOfcontractions ;
2011-12-07 06:14:56 +00:00
data - > size + = paddedsize ( ( noOfcontractions * MAX_UCA_CONTRACTION_LENGTH * U_SIZEOF_UCHAR ) ) ;
2001-06-05 22:52:56 +00:00
}
2010-10-28 19:05:02 +00:00
data - > scriptToLeadByte = data - > size ;
//fprintf(stdout, "@@@@ script to lead byte offset = 0x%x (%d)\n", data->size, data->size);
data - > size + =
sizeof ( leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT ) + // index table header
leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT * sizeof ( leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX [ 0 ] ) + // index table
sizeof ( leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET ) + // data table header
leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET * sizeof ( leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA [ 0 ] ) ; // data table
data - > leadByteToScript = data - > size ;
//fprintf(stdout, "@@@@ lead byte to script offset = 0x%x (%d)\n", data->size, data->size);
data - > size + =
sizeof ( leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH ) + // index table header
leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof ( leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX [ 0 ] ) + // index table
sizeof ( leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET ) + // data table header
leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET * sizeof ( leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA [ 0 ] ) ; // data table
2001-06-05 22:52:56 +00:00
2001-02-22 21:18:29 +00:00
UNewDataMemory * pData ;
2001-02-26 10:28:56 +00:00
2001-02-22 21:18:29 +00:00
long dataLength ;
2003-04-24 06:57:36 +00:00
UDataInfo ucaInfo ;
uprv_memcpy ( & ucaInfo , & ucaDataInfo , sizeof ( UDataInfo ) ) ;
u_getUnicodeVersion ( ucaInfo . dataVersion ) ;
2001-02-22 21:18:29 +00:00
2004-04-14 20:08:16 +00:00
pData = udata_create ( outputDir , UCA_DATA_TYPE , UCA_DATA_NAME , & ucaInfo ,
2001-02-23 19:10:28 +00:00
copyright , status ) ;
2001-02-22 21:18:29 +00:00
if ( U_FAILURE ( * status ) ) {
2004-07-18 22:04:47 +00:00
fprintf ( stderr , " Error: unable to create %s " UCA_DATA_NAME " , error %s \n " , outputDir , u_errorName ( * status ) ) ;
2001-02-22 21:18:29 +00:00
return ;
}
/* write the data to the file */
2002-01-04 23:48:46 +00:00
if ( VERBOSE ) {
fprintf ( stdout , " Writing out UCA table: %s%c%s.%s \n " , outputDir ,
U_FILE_SEP_CHAR ,
2002-07-17 03:56:50 +00:00
U_ICUDATA_NAME " _ " UCA_DATA_NAME ,
2001-02-23 19:10:28 +00:00
UCA_DATA_TYPE ) ;
2002-01-04 23:48:46 +00:00
}
2001-06-05 22:52:56 +00:00
udata_writeBlock ( pData , data , size ) ;
2002-06-13 18:24:36 +00:00
// output the constants here
udata_writeBlock ( pData , consts , sizeof ( UCAConstants ) ) ;
2010-10-28 19:05:02 +00:00
if ( VERBOSE ) {
fprintf ( stdout , " first tertiary ignorable = %x %x \n " , consts - > UCA_FIRST_TERTIARY_IGNORABLE [ 0 ] , consts - > UCA_FIRST_TERTIARY_IGNORABLE [ 1 ] ) ;
fprintf ( stdout , " last tertiary ignorable = %x %x \n " , consts - > UCA_LAST_TERTIARY_IGNORABLE [ 0 ] , consts - > UCA_LAST_TERTIARY_IGNORABLE [ 1 ] ) ;
fprintf ( stdout , " first secondary ignorable = %x %x \n " , consts - > UCA_FIRST_SECONDARY_IGNORABLE [ 0 ] , consts - > UCA_FIRST_SECONDARY_IGNORABLE [ 1 ] ) ;
fprintf ( stdout , " contractionUCACombosSize = %d \n " , data - > contractionUCACombosSize ) ;
fprintf ( stdout , " contractionSize = %d \n " , data - > contractionSize ) ;
fprintf ( stdout , " number of UCA contractions = %d \n " , noOfcontractions ) ;
}
2001-06-05 22:52:56 +00:00
if ( noOfcontractions ! = 0 ) {
2011-12-07 06:14:56 +00:00
udata_writeBlock ( pData , contractions , noOfcontractions * MAX_UCA_CONTRACTION_LENGTH * U_SIZEOF_UCHAR ) ;
udata_writePadding ( pData , paddedsize ( ( noOfcontractions * MAX_UCA_CONTRACTION_LENGTH * U_SIZEOF_UCHAR ) ) - noOfcontractions * MAX_UCA_CONTRACTION_LENGTH * U_SIZEOF_UCHAR ) ;
2001-06-05 22:52:56 +00:00
}
2001-02-22 21:18:29 +00:00
2010-10-28 19:05:02 +00:00
// output the script to lead bytes table here
if ( VERBOSE ) {
fprintf ( stdout , " Writing Script to Lead Byte Data \n " ) ;
2010-11-04 20:41:35 +00:00
fprintf ( stdout , " \t index table size = %x \n " , leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT ) ;
fprintf ( stdout , " \t data block size = %x \n " , leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET ) ;
2010-10-28 19:05:02 +00:00
}
udata_write16 ( pData , leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT ) ;
udata_write16 ( pData , leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET ) ;
2010-11-04 20:41:35 +00:00
// fprintf(stdout, "#### Script to Lead Byte Index Before Sort\n");
2010-10-28 19:05:02 +00:00
// for (int reorderCodeIndex = 0; reorderCodeIndex < leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT; reorderCodeIndex++) {
// fprintf(stdout, "\t%04x = %04x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].offset);
// }
qsort ( leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX , leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT , sizeof ( leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX [ 0 ] ) , ReorderIndexComparer ) ;
udata_writeBlock ( pData , leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX , leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX_COUNT * sizeof ( leadByteConstants - > SCRIPT_TO_LEAD_BYTES_INDEX [ 0 ] ) ) ;
2010-11-04 20:41:35 +00:00
// fprintf(stdout, "#### Script to Lead Byte Index After Sort\n");
// for (int reorderCodeIndex = 0; reorderCodeIndex < leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX_COUNT; reorderCodeIndex++) {
// fprintf(stdout, "\t%04x = %04x\n", leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].reorderCode, leadByteConstants->SCRIPT_TO_LEAD_BYTES_INDEX[reorderCodeIndex].offset);
// }
2010-10-28 19:05:02 +00:00
// write out the script to lead bytes data block
udata_writeBlock ( pData , leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA , leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA_OFFSET * sizeof ( * leadByteConstants - > SCRIPT_TO_LEAD_BYTES_DATA ) ) ;
if ( VERBOSE ) {
fprintf ( stdout , " Writing Lead Byte To Script Data \n " ) ;
2010-11-04 20:41:35 +00:00
fprintf ( stdout , " \t index table size = %x \n " , leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH ) ;
fprintf ( stdout , " \t data block size = %x \n " , leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET ) ;
2010-10-28 19:05:02 +00:00
}
// output the header info
udata_write16 ( pData , leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH ) ;
udata_write16 ( pData , leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET ) ;
// output the index table
udata_writeBlock ( pData , leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX ,
leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof ( leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_INDEX ) [ 0 ] ) ;
2010-11-04 20:41:35 +00:00
// for (int leadByte = 0; leadByte < leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH; leadByte++) {
// fprintf(stdout, "\t%02x = %04x\n", leadByte, leadByteConstants->LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte]);
// }
2010-10-28 19:05:02 +00:00
// output the data
udata_writeBlock ( pData , leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA ,
leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET * sizeof ( * leadByteConstants - > LEAD_BYTE_TO_SCRIPTS_DATA ) ) ;
2001-02-22 21:18:29 +00:00
/* finish up */
dataLength = udata_finish ( pData , status ) ;
if ( U_FAILURE ( * status ) ) {
fprintf ( stderr , " Error: error %d writing the output file \n " , * status ) ;
return ;
}
}
2009-11-13 19:25:21 +00:00
enum {
    /*
     * Maximum number of UCA contractions we can store: the number of rows in
     * the contractions[][] table that write_uca_table() fills while reading
     * the input. Both real contractions and characters with a prefix
     * (pre-context) take one row each; the tool exits with
     * U_BUFFER_OVERFLOW_ERROR when the input needs more rows than this.
     * May need to be increased for a new Unicode version.
     *
     * NOTE(review): writeOutData() appends one extra all-zero terminator row
     * after the last used row - confirm that a completely full table still
     * has room for it.
     */
    MAX_UCA_CONTRACTIONS = 2048
};
2001-02-23 01:21:38 +00:00
static int32_t
write_uca_table ( const char * filename ,
const char * outputDir ,
2001-02-23 19:10:28 +00:00
const char * copyright ,
2001-02-23 01:21:38 +00:00
UErrorCode * status )
{
FILE * data = fopen ( filename , " r " ) ;
2006-03-28 07:40:46 +00:00
if ( data = = NULL ) {
fprintf ( stderr , " Couldn't open file: %s \n " , filename ) ;
return - 1 ;
}
2001-03-22 21:16:20 +00:00
uint32_t line = 0 ;
2001-02-22 21:18:29 +00:00
UCAElements * element = NULL ;
2001-02-26 10:28:56 +00:00
UCATableHeader * myD = ( UCATableHeader * ) uprv_malloc ( sizeof ( UCATableHeader ) ) ;
2002-07-02 15:10:30 +00:00
/* test for NULL */
2004-11-11 23:34:58 +00:00
if ( myD = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
fclose ( data ) ;
return 0 ;
}
2003-11-04 18:56:31 +00:00
uprv_memset ( myD , 0 , sizeof ( UCATableHeader ) ) ;
2001-03-30 00:23:46 +00:00
UColOptionSet * opts = ( UColOptionSet * ) uprv_malloc ( sizeof ( UColOptionSet ) ) ;
2002-07-02 15:10:30 +00:00
/* test for NULL */
2004-11-11 23:34:58 +00:00
if ( opts = = NULL ) {
* status = U_MEMORY_ALLOCATION_ERROR ;
uprv_free ( myD ) ;
fclose ( data ) ;
return 0 ;
}
2003-11-04 18:56:31 +00:00
uprv_memset ( opts , 0 , sizeof ( UColOptionSet ) ) ;
2011-12-07 06:14:56 +00:00
UChar contractions [ MAX_UCA_CONTRACTIONS ] [ MAX_UCA_CONTRACTION_LENGTH ] ;
uprv_memset ( contractions , 0 , sizeof ( contractions ) ) ;
2001-06-05 22:52:56 +00:00
uint32_t noOfContractions = 0 ;
2002-07-02 22:28:40 +00:00
UCAConstants consts ;
2004-02-27 20:13:47 +00:00
uprv_memset ( & consts , 0 , sizeof ( consts ) ) ;
2002-07-02 22:28:40 +00:00
#if 0
2002-06-13 18:24:36 +00:00
UCAConstants consts = {
UCOL_RESET_TOP_VALUE ,
UCOL_FIRST_PRIMARY_IGNORABLE ,
UCOL_LAST_PRIMARY_IGNORABLE ,
UCOL_LAST_PRIMARY_IGNORABLE_CONT ,
UCOL_FIRST_SECONDARY_IGNORABLE ,
UCOL_LAST_SECONDARY_IGNORABLE ,
UCOL_FIRST_TERTIARY_IGNORABLE ,
UCOL_LAST_TERTIARY_IGNORABLE ,
UCOL_FIRST_VARIABLE ,
UCOL_LAST_VARIABLE ,
UCOL_FIRST_NON_VARIABLE ,
UCOL_LAST_NON_VARIABLE ,
UCOL_NEXT_TOP_VALUE ,
/*
UCOL_NEXT_FIRST_PRIMARY_IGNORABLE ,
UCOL_NEXT_LAST_PRIMARY_IGNORABLE ,
UCOL_NEXT_FIRST_SECONDARY_IGNORABLE ,
UCOL_NEXT_LAST_SECONDARY_IGNORABLE ,
UCOL_NEXT_FIRST_TERTIARY_IGNORABLE ,
UCOL_NEXT_LAST_TERTIARY_IGNORABLE ,
UCOL_NEXT_FIRST_VARIABLE ,
UCOL_NEXT_LAST_VARIABLE ,
*/
PRIMARY_IMPLICIT_MIN ,
PRIMARY_IMPLICIT_MAX
} ;
2002-07-02 22:28:40 +00:00
# endif
2001-02-26 10:28:56 +00:00
2010-10-28 19:05:02 +00:00
//fprintf(stdout, "Allocating LeadByteConstants\n");
LeadByteConstants leadByteConstants ;
uprv_memset ( & leadByteConstants , 0x00 , sizeof ( LeadByteConstants ) ) ;
leadByteConstants . SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH = 256 ;
leadByteConstants . SCRIPT_TO_LEAD_BYTES_INDEX = ( ReorderIndex * ) uprv_malloc ( leadByteConstants . SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH * sizeof ( ReorderIndex ) ) ;
uprv_memset ( leadByteConstants . SCRIPT_TO_LEAD_BYTES_INDEX , 0x00 , leadByteConstants . SCRIPT_TO_LEAD_BYTES_INDEX_LENGTH * sizeof ( ReorderIndex ) ) ;
leadByteConstants . SCRIPT_TO_LEAD_BYTES_DATA_LENGTH = 1024 ;
leadByteConstants . SCRIPT_TO_LEAD_BYTES_DATA = ( uint16_t * ) uprv_malloc ( leadByteConstants . SCRIPT_TO_LEAD_BYTES_DATA_LENGTH * sizeof ( uint16_t ) ) ;
uprv_memset ( leadByteConstants . SCRIPT_TO_LEAD_BYTES_DATA , 0x00 , leadByteConstants . SCRIPT_TO_LEAD_BYTES_DATA_LENGTH * sizeof ( uint16_t ) ) ;
//fprintf(stdout, "\tFinished Allocating LeadByteConstants\n");
leadByteConstants . LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH = 256 ;
leadByteConstants . LEAD_BYTE_TO_SCRIPTS_INDEX = ( uint16_t * ) uprv_malloc ( leadByteConstants . LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof ( uint16_t ) ) ;
uprv_memset ( leadByteConstants . LEAD_BYTE_TO_SCRIPTS_INDEX , 0x8000 | USCRIPT_INVALID_CODE , leadByteConstants . LEAD_BYTE_TO_SCRIPTS_INDEX_LENGTH * sizeof ( uint16_t ) ) ;
leadByteConstants . LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH = 1024 ;
leadByteConstants . LEAD_BYTE_TO_SCRIPTS_DATA_OFFSET = 1 ; // offset by 1 to leave zero location for those lead bytes with no reorder codes
leadByteConstants . LEAD_BYTE_TO_SCRIPTS_DATA = ( uint16_t * ) uprv_malloc ( leadByteConstants . LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH * sizeof ( uint16_t ) ) ;
uprv_memset ( leadByteConstants . LEAD_BYTE_TO_SCRIPTS_DATA , 0x00 , leadByteConstants . LEAD_BYTE_TO_SCRIPTS_DATA_LENGTH * sizeof ( uint16_t ) ) ;
2001-02-22 21:18:29 +00:00
2003-11-04 18:56:31 +00:00
uprv_memset ( inverseTable , 0xDA , sizeof ( int32_t ) * 3 * 0xFFFF ) ;
2001-02-22 21:18:29 +00:00
2010-10-13 17:40:32 +00:00
opts - > variableTopValue = 0 ;
2001-03-30 00:23:46 +00:00
opts - > strength = UCOL_TERTIARY ;
opts - > frenchCollation = UCOL_OFF ;
opts - > alternateHandling = UCOL_NON_IGNORABLE ; /* attribute for handling variable elements*/
opts - > caseFirst = UCOL_OFF ; /* who goes first, lower case or uppercase */
opts - > caseLevel = UCOL_OFF ; /* do we have an extra case level */
2001-06-05 22:52:56 +00:00
opts - > normalizationMode = UCOL_OFF ; /* attribute for normalization */
2001-10-10 01:48:36 +00:00
opts - > hiraganaQ = UCOL_OFF ; /* attribute for JIS X 4061, used only in Japanese */
2003-10-06 21:05:04 +00:00
opts - > numericCollation = UCOL_OFF ;
2001-03-08 21:01:34 +00:00
myD - > jamoSpecial = FALSE ;
2001-02-22 21:18:29 +00:00
2003-07-24 23:29:34 +00:00
tempUCATable * t = uprv_uca_initTempTable ( myD , opts , NULL , IMPLICIT_TAG , LEAD_SURROGATE_TAG , status ) ;
2002-07-17 03:56:50 +00:00
if ( U_FAILURE ( * status ) )
{
fprintf ( stderr , " Failed to init UCA temp table: %s \n " , u_errorName ( * status ) ) ;
2006-03-28 07:40:46 +00:00
uprv_free ( opts ) ;
uprv_free ( myD ) ;
fclose ( data ) ;
2002-07-17 03:56:50 +00:00
return - 1 ;
}
2001-09-20 20:16:39 +00:00
2010-10-13 17:40:32 +00:00
// * set to zero
struct {
UChar32 start ;
UChar32 end ;
int32_t value ;
2001-11-06 22:55:29 +00:00
} ranges [ ] =
{
2010-10-13 17:40:32 +00:00
{ 0xAC00 , 0xD7B0 , UCOL_SPECIAL_FLAG | ( HANGUL_SYLLABLE_TAG < < 24 ) } , //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
//{0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24) }, //1 LEAD_SURROGATE_TAG, already set in utrie_open() /* D800-DBFF*/
{ 0xDC00 , 0xE000 , UCOL_SPECIAL_FLAG | ( TRAIL_SURROGATE_TAG < < 24 ) } , //2 TRAIL_SURROGATE DC00-DFFF
// Now directly handled in the collation code by the swapCJK function.
//{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //3 CJK_IMPLICIT_TAG, /* 0x3400-0x4DB5*/
//{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //4 CJK_IMPLICIT_TAG, /* 0x4E00-0x9FA5*/
//{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //5 CJK_IMPLICIT_TAG, /* 0xF900-0xFA2D*/
//{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //6 CJK_IMPLICIT_TAG, /* 0x20000-0x2A6D6*/
//{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24) }, //7 CJK_IMPLICIT_TAG, /* 0x2F800-0x2FA1D*/
2001-11-06 22:55:29 +00:00
} ;
uint32_t i = 0 ;
for ( i = 0 ; i < sizeof ( ranges ) / sizeof ( ranges [ 0 ] ) ; i + + ) {
2001-12-19 07:00:45 +00:00
/*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */
utrie_setRange32 ( t - > mapping , ranges [ i ] . start , ranges [ i ] . end , ranges [ i ] . value , TRUE ) ;
2001-11-06 22:55:29 +00:00
}
2002-02-08 04:53:41 +00:00
int32_t surrogateCount = 0 ;
2001-02-22 21:18:29 +00:00
while ( ! feof ( data ) ) {
2001-02-23 01:21:38 +00:00
if ( U_FAILURE ( * status ) ) {
2004-05-19 04:01:34 +00:00
fprintf ( stderr , " Something returned an error %i (%s) while processing line %u of %s. Exiting... \n " ,
* status , u_errorName ( * status ) , ( int ) line , filename ) ;
2001-02-23 01:21:38 +00:00
exit ( * status ) ;
2001-02-22 21:18:29 +00:00
}
line + + ;
2002-07-02 22:28:40 +00:00
if ( VERBOSE ) {
2004-05-19 04:01:34 +00:00
fprintf ( stdout , " %u " , ( int ) line ) ;
2002-07-02 22:28:40 +00:00
}
2010-10-28 19:05:02 +00:00
element = readAnElement ( data , t , & consts , & leadByteConstants , status ) ;
2001-02-22 21:18:29 +00:00
if ( element ! = NULL ) {
// we have read the line, now do something sensible with the read data!
2002-06-13 18:24:36 +00:00
2011-12-07 06:14:56 +00:00
// if element is a contraction, we want to add it to contractions[]
int32_t length = ( int32_t ) element - > cSize ;
if ( length > 1 & & element - > cPoints [ 0 ] ! = 0xFDD0 ) { // this is a contraction
if ( U16_IS_LEAD ( element - > cPoints [ 0 ] ) & & U16_IS_TRAIL ( element - > cPoints [ 1 ] ) & & length = = 2 ) {
2001-11-10 00:13:03 +00:00
surrogateCount + + ;
2001-06-05 22:52:56 +00:00
} else {
2011-12-07 06:14:56 +00:00
if ( noOfContractions > = MAX_UCA_CONTRACTIONS ) {
2009-11-13 19:25:21 +00:00
fprintf ( stderr ,
2011-12-07 06:14:56 +00:00
" \n More than %d contractions. Please increase MAX_UCA_CONTRACTIONS in genuca.cpp. "
2009-11-13 19:25:21 +00:00
" Exiting... \n " ,
2011-12-07 06:14:56 +00:00
( int ) MAX_UCA_CONTRACTIONS ) ;
exit ( U_BUFFER_OVERFLOW_ERROR ) ;
2009-11-13 19:25:21 +00:00
}
2011-12-07 06:14:56 +00:00
if ( length > MAX_UCA_CONTRACTION_LENGTH ) {
fprintf ( stderr ,
" \n Line %d: Contraction of length %d is too long. Please increase MAX_UCA_CONTRACTION_LENGTH in genuca.cpp. "
" Exiting... \n " ,
( int ) line , ( int ) length ) ;
exit ( U_BUFFER_OVERFLOW_ERROR ) ;
}
UChar * t = & contractions [ noOfContractions ] [ 0 ] ;
u_memcpy ( t , element - > cPoints , length ) ;
t + = length ;
for ( ; length < MAX_UCA_CONTRACTION_LENGTH ; + + length ) {
* t + + = 0 ;
2001-11-06 22:55:29 +00:00
}
noOfContractions + + ;
2001-06-05 22:52:56 +00:00
}
}
2008-04-04 22:47:43 +00:00
else {
// TODO (claireho): does this work? Need more tests
// The following code is to handle the UCA pre-context rules
// for L/l with middle dot. We share the structures for contractionCombos.
// The format for pre-context character is
2011-12-07 06:14:56 +00:00
// contractions[0]: codepoint in element->cPoints[0]
// contractions[1]: '\0' to differentiate from a contraction
// contractions[2]: prefix char
2008-04-04 22:47:43 +00:00
if ( element - > prefixSize > 0 ) {
2011-12-07 06:14:56 +00:00
if ( length > 1 | | element - > prefixSize > 1 ) {
2010-10-11 21:01:31 +00:00
fprintf ( stderr ,
2011-12-07 06:14:56 +00:00
" \n Line %d: Character with prefix, "
" either too many characters or prefix too long. \n " ,
( int ) line ) ;
exit ( U_INTERNAL_PROGRAM_ERROR ) ;
2010-10-11 21:01:31 +00:00
}
2011-12-07 06:14:56 +00:00
if ( noOfContractions > = MAX_UCA_CONTRACTIONS ) {
2009-11-13 19:25:21 +00:00
fprintf ( stderr ,
2011-12-07 06:14:56 +00:00
" \n More than %d contractions. Please increase MAX_UCA_CONTRACTIONS in genuca.cpp. "
2009-11-13 19:25:21 +00:00
" Exiting... \n " ,
2011-12-07 06:14:56 +00:00
( int ) MAX_UCA_CONTRACTIONS ) ;
exit ( U_BUFFER_OVERFLOW_ERROR ) ;
}
UChar * t = & contractions [ noOfContractions ] [ 0 ] ;
t [ 0 ] = element - > cPoints [ 0 ] ;
t [ 1 ] = 0 ;
t [ 2 ] = element - > prefixChars [ 0 ] ;
t + = 3 ;
for ( length = 3 ; length < MAX_UCA_CONTRACTION_LENGTH ; + + length ) {
* t + + = 0 ;
2009-11-13 19:25:21 +00:00
}
2008-04-04 22:47:43 +00:00
noOfContractions + + ;
}
}
2001-06-05 22:52:56 +00:00
2001-02-22 21:18:29 +00:00
/* we're first adding to inverse, because addAnElement will reverse the order */
/* of code points and stuff... we don't want that to happen */
2010-10-13 22:20:26 +00:00
if ( ( element - > CEs [ 0 ] > > 24 ) ! = 2 ) {
// Add every element except for the special minimum-weight character U+FFFE
// which has 02 weights.
// If we had 02 weights in the invuca table, then tailoring primary
// after an ignorable would try to put a weight before 02 which is not valid.
// We could fix this in a complicated way in the from-rule-string builder,
// but omitting this special element from invuca is simple and effective.
addToInverse ( element , status ) ;
}
2011-12-07 06:14:56 +00:00
if ( ! ( length > 1 & & element - > cPoints [ 0 ] = = 0xFDD0 ) ) {
2002-07-02 22:28:40 +00:00
uprv_uca_addAnElement ( t , element , status ) ;
}
2001-02-22 21:18:29 +00:00
}
}
2003-04-30 00:49:01 +00:00
if ( UCAVersion [ 0 ] = = 0 & & UCAVersion [ 1 ] = = 0 & & UCAVersion [ 2 ] = = 0 & & UCAVersion [ 3 ] = = 0 ) {
2006-03-28 07:40:46 +00:00
fprintf ( stderr , " UCA version not specified. Cannot create data file! \n " ) ;
uprv_uca_closeTempTable ( t ) ;
uprv_free ( opts ) ;
uprv_free ( myD ) ;
fclose ( data ) ;
return - 1 ;
2003-04-30 00:49:01 +00:00
}
2003-12-02 02:31:40 +00:00
/* {
2003-07-24 23:29:34 +00:00
uint32_t trieWord = utrie_get32 ( t - > mapping , 0xDC01 , NULL ) ;
2003-12-02 02:31:40 +00:00
} */
2001-02-22 21:18:29 +00:00
2001-02-23 01:21:38 +00:00
if ( VERBOSE ) {
2004-05-19 04:01:34 +00:00
fprintf ( stdout , " \n Lines read: %u \n " , ( int ) line ) ;
fprintf ( stdout , " Surrogate count: %i \n " , ( int ) surrogateCount ) ;
2001-11-10 00:13:03 +00:00
fprintf ( stdout , " Raw data breakdown: \n " ) ;
2001-12-19 07:00:45 +00:00
/*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
2004-05-19 04:01:34 +00:00
fprintf ( stdout , " Number of contractions: %u \n " , ( int ) noOfContractions ) ;
fprintf ( stdout , " Contraction image size: %u \n " , ( int ) t - > image - > contractionSize ) ;
fprintf ( stdout , " Expansions size: %i \n " , ( int ) t - > expansions - > position ) ;
2001-02-23 01:21:38 +00:00
}
2001-02-22 21:18:29 +00:00
2002-06-13 18:24:36 +00:00
/* produce canonical closure for table */
/* first set up constants for implicit calculation */
2008-01-05 01:27:56 +00:00
uprv_uca_initImplicitConstants ( status ) ;
2002-06-13 18:24:36 +00:00
/* do the closure */
2010-10-07 19:46:41 +00:00
UnicodeSet closed ;
int32_t noOfClosures = uprv_uca_canonicalClosure ( t , NULL , & closed , status ) ;
2002-07-13 05:24:35 +00:00
if ( noOfClosures ! = 0 ) {
2010-10-07 19:46:41 +00:00
fprintf ( stderr , " Warning: %i canonical closures occured! \n " , ( int ) noOfClosures ) ;
UnicodeString pattern ;
std : : string utf8 ;
closed . toPattern ( pattern , TRUE ) . toUTF8String ( utf8 ) ;
fprintf ( stderr , " UTF-8 pattern string: %s \n " , utf8 . c_str ( ) ) ;
2002-07-13 05:24:35 +00:00
}
2002-06-13 18:24:36 +00:00
2001-02-22 21:18:29 +00:00
/* test */
2001-02-26 10:28:56 +00:00
UCATableHeader * myData = uprv_uca_assembleTable ( t , status ) ;
2001-11-10 00:13:03 +00:00
if ( VERBOSE ) {
fprintf ( stdout , " Compacted data breakdown: \n " ) ;
2001-12-19 07:00:45 +00:00
/*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
2004-05-19 04:01:34 +00:00
fprintf ( stdout , " Number of contractions: %u \n " , ( int ) noOfContractions ) ;
fprintf ( stdout , " Contraction image size: %u \n " , ( int ) t - > image - > contractionSize ) ;
fprintf ( stdout , " Expansions size: %i \n " , ( int ) t - > expansions - > position ) ;
2001-11-10 00:13:03 +00:00
}
2004-01-16 07:12:35 +00:00
if ( U_FAILURE ( * status ) ) {
fprintf ( stderr , " Error creating table: %s \n " , u_errorName ( * status ) ) ;
2006-03-28 07:40:46 +00:00
uprv_uca_closeTempTable ( t ) ;
uprv_free ( opts ) ;
uprv_free ( myD ) ;
fclose ( data ) ;
2004-01-16 07:12:35 +00:00
return - 1 ;
}
2003-04-24 06:57:36 +00:00
/* populate the version info struct with version info*/
myData - > version [ 0 ] = UCOL_BUILDER_VERSION ;
myData - > version [ 1 ] = UCAVersion [ 0 ] ;
myData - > version [ 2 ] = UCAVersion [ 1 ] ;
myData - > version [ 3 ] = UCAVersion [ 2 ] ;
/*TODO:The fractional rules version should be taken from FractionalUCA.txt*/
// Removed this macro. Instead, we use the fields below
//myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION;
//myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt
uprv_memcpy ( myData - > UCAVersion , UCAVersion , sizeof ( UVersionInfo ) ) ;
u_getUnicodeVersion ( myData - > UCDVersion ) ;
2011-12-07 06:14:56 +00:00
writeOutData ( myData , & consts , & leadByteConstants , contractions , noOfContractions , outputDir , copyright , status ) ;
2001-02-22 21:18:29 +00:00
2003-04-30 00:49:01 +00:00
InverseUCATableHeader * inverse = assembleInverseTable ( status ) ;
uprv_memcpy ( inverse - > UCAVersion , UCAVersion , sizeof ( UVersionInfo ) ) ;
2001-02-23 19:10:28 +00:00
writeOutInverseData ( inverse , outputDir , copyright , status ) ;
2001-02-22 21:18:29 +00:00
2006-03-28 07:40:46 +00:00
uprv_uca_closeTempTable ( t ) ;
2001-02-26 10:28:56 +00:00
uprv_free ( myD ) ;
2001-03-30 00:23:46 +00:00
uprv_free ( opts ) ;
2001-02-22 21:18:29 +00:00
2001-05-03 23:10:45 +00:00
uprv_free ( myData ) ;
uprv_free ( inverse ) ;
2010-10-28 19:05:02 +00:00
uprv_free ( leadByteConstants . LEAD_BYTE_TO_SCRIPTS_INDEX ) ;
uprv_free ( leadByteConstants . LEAD_BYTE_TO_SCRIPTS_DATA ) ;
uprv_free ( leadByteConstants . SCRIPT_TO_LEAD_BYTES_INDEX ) ;
uprv_free ( leadByteConstants . SCRIPT_TO_LEAD_BYTES_DATA ) ;
2001-02-23 01:21:38 +00:00
fclose ( data ) ;
2001-02-22 21:18:29 +00:00
2002-04-02 02:55:31 +00:00
return 0 ;
2001-02-22 21:18:29 +00:00
}
2001-02-23 01:21:38 +00:00
2002-10-01 17:44:04 +00:00
# endif /* #if !UCONFIG_NO_COLLATION */
2001-02-23 01:21:38 +00:00
/*
 * Command-line option table passed to u_parseArgs() by main().
 * main() refers to the entries by their numeric position, so the order of
 * the initializers below must stay exactly as it is.
 */
static UOption options[] = {
    /* [0] */ UOPTION_HELP_H,
    /* [1] */ UOPTION_HELP_QUESTION_MARK,
    /* [2] */ UOPTION_COPYRIGHT,
    /* [3] */ UOPTION_VERSION,
    /* [4] */ UOPTION_DESTDIR,
    /* [5] */ UOPTION_SOURCEDIR,
    /* [6] */ UOPTION_VERBOSE,
    /* [7] */ UOPTION_ICUDATADIR
};
int main ( int argc , char * argv [ ] ) {
UErrorCode status = U_ZERO_ERROR ;
2001-02-23 04:57:47 +00:00
const char * destdir = NULL ;
const char * srcDir = NULL ;
char filename [ 300 ] ;
char * basename = NULL ;
2001-02-23 19:10:28 +00:00
const char * copyright = NULL ;
2003-04-30 00:49:01 +00:00
uprv_memset ( & UCAVersion , 0 , 4 ) ;
2001-02-23 01:21:38 +00:00
2002-03-15 23:41:40 +00:00
U_MAIN_INIT_ARGS ( argc , argv ) ;
2001-02-23 01:21:38 +00:00
/* preset then read command line options */
options [ 4 ] . value = u_getDataDirectory ( ) ;
2001-02-23 04:57:47 +00:00
options [ 5 ] . value = " " ;
2001-02-23 01:21:38 +00:00
argc = u_parseArgs ( argc , argv , sizeof ( options ) / sizeof ( options [ 0 ] ) , options ) ;
/* error handling, printing usage message */
if ( argc < 0 ) {
fprintf ( stderr ,
" error in command line argument \" %s \" \n " ,
argv [ - argc ] ) ;
} else if ( argc < 2 ) {
argc = - 1 ;
}
2001-02-23 04:57:47 +00:00
if ( options [ 0 ] . doesOccur | | options [ 1 ] . doesOccur ) {
2001-02-23 01:21:38 +00:00
fprintf ( stderr ,
" usage: %s [-options] file \n "
" \t Read in UCA collation text data and write out the binary collation data \n "
2002-03-15 23:41:40 +00:00
" options: \n "
" \t -h or -? or --help this usage text \n "
" \t -V or --version show a version message \n "
" \t -c or --copyright include a copyright notice \n "
" \t -d or --destdir destination directory, followed by the path \n "
" \t -s or --sourcedir source directory, followed by the path \n "
" \t -v or --verbose turn on verbose output \n "
" \t -i or --icudatadir directory for locating any needed intermediate data files, \n "
" \t followed by path, defaults to %s \n " ,
2002-04-04 18:41:16 +00:00
argv [ 0 ] , u_getDataDirectory ( ) ) ;
2001-02-23 01:21:38 +00:00
return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR ;
}
if ( options [ 3 ] . doesOccur ) {
2004-07-18 22:04:47 +00:00
fprintf ( stdout , " genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation. \n " ,
2002-10-01 17:44:04 +00:00
# if UCONFIG_NO_COLLATION
0 , 0
# else
2004-08-28 05:50:39 +00:00
UCA_FORMAT_VERSION_0 , UCA_FORMAT_VERSION_1
2002-10-01 17:44:04 +00:00
# endif
) ;
2004-07-18 22:04:47 +00:00
fprintf ( stdout , U_COPYRIGHT_STRING " \n " ) ;
2001-02-23 01:21:38 +00:00
exit ( 0 ) ;
}
/* get the options values */
destdir = options [ 4 ] . value ;
2001-02-23 04:57:47 +00:00
srcDir = options [ 5 ] . value ;
VERBOSE = options [ 6 ] . doesOccur ;
2001-02-23 19:10:28 +00:00
if ( options [ 2 ] . doesOccur ) {
copyright = U_COPYRIGHT_STRING ;
}
2002-03-15 23:41:40 +00:00
if ( options [ 7 ] . doesOccur ) {
u_setDataDirectory ( options [ 7 ] . value ) ;
}
2003-08-15 01:26:22 +00:00
/* Initialize ICU */
u_init ( & status ) ;
if ( U_FAILURE ( status ) & & status ! = U_FILE_ACCESS_ERROR ) {
fprintf ( stderr , " %s: can not initialize ICU. status = %s \n " ,
argv [ 0 ] , u_errorName ( status ) ) ;
exit ( 1 ) ;
}
status = U_ZERO_ERROR ;
2002-03-15 23:41:40 +00:00
2001-11-10 00:13:03 +00:00
/* prepare the filename beginning with the source dir */
uprv_strcpy ( filename , srcDir ) ;
basename = filename + uprv_strlen ( filename ) ;
2001-02-23 04:57:47 +00:00
2001-11-10 00:13:03 +00:00
if ( basename > filename & & * ( basename - 1 ) ! = U_FILE_SEP_CHAR ) {
2002-03-15 23:41:40 +00:00
* basename + + = U_FILE_SEP_CHAR ;
2001-11-10 00:13:03 +00:00
}
2001-02-23 04:57:47 +00:00
2001-11-10 00:13:03 +00:00
if ( argc < 0 ) {
2001-02-23 04:57:47 +00:00
uprv_strcpy ( basename , " FractionalUCA.txt " ) ;
} else {
argv + + ;
2001-11-10 00:13:03 +00:00
uprv_strcpy ( basename , getLongPathname ( * argv ) ) ;
2001-02-23 04:57:47 +00:00
}
2001-02-23 01:21:38 +00:00
2002-07-17 03:56:50 +00:00
#if 0
if ( u_getCombiningClass ( 0x0053 ) = = 0 )
{
fprintf ( stderr , " SEVERE ERROR: Normalization data is not functioning! Bailing out. Was not able to load unorm.dat. \n " ) ;
exit ( 1 ) ;
}
# endif
2002-10-01 17:44:04 +00:00
# if UCONFIG_NO_COLLATION
UNewDataMemory * pData ;
const char * msg ;
2004-04-14 20:08:16 +00:00
msg = " genuca writes dummy " UCA_DATA_NAME " . " UCA_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h " ;
2002-10-01 17:44:04 +00:00
fprintf ( stderr , " %s \n " , msg ) ;
2004-04-14 20:08:16 +00:00
pData = udata_create ( destdir , UCA_DATA_TYPE , UCA_DATA_NAME , & dummyDataInfo ,
2002-10-01 17:44:04 +00:00
NULL , & status ) ;
udata_writeBlock ( pData , msg , strlen ( msg ) ) ;
udata_finish ( pData , & status ) ;
2004-04-14 20:08:16 +00:00
msg = " genuca writes dummy " INVC_DATA_NAME " . " INVC_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h " ;
2002-10-01 17:44:04 +00:00
fprintf ( stderr , " %s \n " , msg ) ;
2004-04-14 20:08:16 +00:00
pData = udata_create ( destdir , INVC_DATA_TYPE , INVC_DATA_NAME , & dummyDataInfo ,
2002-10-01 17:44:04 +00:00
NULL , & status ) ;
udata_writeBlock ( pData , msg , strlen ( msg ) ) ;
udata_finish ( pData , & status ) ;
return ( int ) status ;
# else
2001-02-23 19:10:28 +00:00
return write_uca_table ( filename , destdir , copyright , & status ) ;
2001-02-23 01:21:38 +00:00
2002-10-01 17:44:04 +00:00
# endif
}
2002-10-01 01:26:49 +00:00
2001-02-23 01:21:38 +00:00
/*
* Hey , Emacs , please set the following :
*
* Local Variables :
* indent - tabs - mode : nil
* End :
*
*/