2001-08-31 00:30:17 +00:00
/ * *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright ( C ) 1996 - 2001 , International Business Machines Corporation and *
* others . All Rights Reserved . *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*
* $Source : / xsrl / Nsvn / icu / unicodetools / com / ibm / text / UCD / ConvertUCD . java , v $
2004-11-12 23:17:15 +00:00
* $Date : 2004 / 11 / 12 23 : 17 : 15 $
* $Revision : 1 . 16 $
2001-08-31 00:30:17 +00:00
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* /
2001-08-30 20:50:18 +00:00
package com.ibm.text.UCD ;
import com.ibm.text.utility.* ;
import java.util.* ;
import java.text.NumberFormat ;
import java.io.* ;
2001-08-31 00:30:17 +00:00
/ * * Simple program to merge UCD files into XML . Not yet documented ! !
2001-08-30 20:50:18 +00:00
* @author Mark Davis
* /
public final class ConvertUCD implements UCD_Types {
2002-03-20 00:21:43 +00:00
// When true, readBlocks/readSemi echo every 500th input line as a progress trace.
public static final boolean SHOW = false ;
2001-08-30 20:50:18 +00:00
// When true, enables extra diagnostic printing in the reader and writer methods.
public static final boolean DEBUG = false ;
2004-02-06 18:32:05 +00:00
// When true, addCharData prints the entry for code point 0x221 before and after each update.
static final boolean SHOW_SAMPLE = false ;
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
// Numeric components of the version being converted; parsed from the version string in toJava.
int major ;
int minor ;
int update ;
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
// Version string being converted; assigned at the start of toJava.
String version ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// varies by version
/ *
public static final String BASE_DIR11 = DATA_DIR + " \\ Versions \\ " ;
public static final String BASE_DIR20 = DATA_DIR + " \\ Versions \\ " ;
public static final String BASE_DIR21 = DATA_DIR + " \\ Versions \\ " ;
public static final String BASE_DIR30 = DATA_DIR + " \\ Update 3.0.1 \\ " ;
public static final String BASE_DIR31 = DATA_DIR + " \\ 3.1-Update \\ " ;
* /
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
//public static final String blocksnamePlain = "Blocks.txt";
//public static final String blocksname31 = "Blocks-4d2.beta";
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/ * * First item is file name , rest are field names ( skipping character ) .
* " OMIT " is special - - means don ' t record
* /
// One row per input file, consumed in order by toJava (each row is passed to readSemi).
// A label marked with '*' is treated as a hex-valued field: the static initializer that
// follows this table strips the marker in place and records the label in isHex.
// NOTE(review): the string literals in this table carry surrounding spaces as written here;
// confirm that matches what the file opener and the label comparisons expect.
static String [ ] [ ] labelList = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{ " UnicodeData " , " n " , " gc " , " cc " , " bc " , " dm " , " dd " , " dv " , " nv " , " bm " , " on " , " OMIT " , " *uc " , " *lc " , " *tc " } ,
2002-03-15 00:34:46 +00:00
//{"ExtraProperties", "xp"},
2001-08-31 00:30:17 +00:00
// "binary" values are handed to appendCharProperties by readSemi.
{ " PropList " , " binary " } ,
2001-08-30 20:50:18 +00:00
//{"ExtraProperties", "xp"},
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
{ " EastAsianWidth " , " ea " , " OMIT " } ,
{ " LineBreak " , " lb " , " OMIT " } ,
// readSemi special-cases this file: rows with a non-empty condition field are stored under "sc".
{ " SpecialCasing " , " *sl " , " *st " , " *su " , " sc " } ,
{ " CompositionExclusions " , " ce " } ,
// The omitted first field is the folding status, which readSemi reads back via parts[i-1].
{ " CaseFolding " , " OMIT " , " *fc " } ,
{ " ArabicShaping " , " OMIT " , " jt " , " jg " } ,
{ " BidiMirroring " , " *bg " } ,
{ " Scripts " , " sn " } ,
//{"Jamo", "jn"},
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/ *
//*/
} ;
2004-02-06 18:32:05 +00:00
/** Labels flagged as hex-valued; a label is a key here only if it carried the '*' marker. */
static HashMap isHex = new HashMap();

/** Per-label default value; every field label is registered, always with a null default. */
static HashMap defaults = new HashMap();

static {
    // Walk every field label of every file row, normalising the table in place.
    for (int row = 0; row < labelList.length; ++row) {
        String[] fileLabels = labelList[row];
        // Slot 0 holds the file name, so field labels start at slot 1.
        for (int field = 1; field < fileLabels.length; ++field) {
            String label = fileLabels[field];
            if (label.charAt(0) == '*') {
                // Strip the hex marker first so that the maps are keyed by the clean label.
                label = label.substring(1);
                fileLabels[field] = label;
                isHex.put(label, " ");
            }
            defaults.put(label, null);
        }
    }
}
2001-08-30 20:50:18 +00:00
/ *
static String [ ] [ ] labelList31 = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
// 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; <compat> 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB
// n gc cc bc dm dd dv nv bm on cm, uc lc tc
{ " UnicodeData-3.1.0d8.beta " , " n " , " gc " , " cc " , " bc " , " dm " , " dd " , " dv " , " nv " , " bm " , " on " , " OMIT " , " *uc " , " *lc " , " *tc " } ,
2001-08-31 00:30:17 +00:00
{ " PropList-3.1.0d5.beta " , " binary " } ,
2001-08-30 20:50:18 +00:00
{ " ExtraProperties " , " xp " } ,
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
{ " EastAsianWidth-4d7.beta " , " ea " , " OMIT " } ,
{ " LineBreak-6d6.beta " , " lb " , " OMIT " } ,
{ " SpecialCasing-4d1.beta " , " *sl " , " *st " , " *su " , " sc " } ,
{ " CompositionExclusions-3d6.beta " , " ce " } ,
{ " CaseFolding-3d4.beta " , " OMIT " , " *fc " } ,
{ " ArabicShaping " , " OMIT " , " jt " , " jg " } ,
{ " BidiMirroring " , " *bg " } ,
{ " Scripts-3.1.0d4.beta " , " sn " } ,
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/ *
{ " Jamo " , " jn " } ,
//
} ;
/ *
{ " UnicodeData-3.1.0d8.beta " , " n " , " gc " , " cc " , " bc " , " dm " , " dd " , " dv " , " nv " , " bm " , " on " , " OMIT " , " *uc " , " *lc " , " *tc " } ,
{ " ExtraProperties " , " xp " } ,
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
{ " EastAsianWidth-4d7.beta " , " ea " , " OMIT " } ,
{ " LineBreak-6d6.beta " , " lb " , " OMIT " } ,
{ " SpecialCasing-4d1.beta " , " *sl " , " *st " , " *su " , " sc " } ,
{ " CompositionExclusions-3d6.beta " , " ce " } ,
{ " CaseFolding-3d4.beta " , " OMIT " , " *fc " } ,
2001-08-31 00:30:17 +00:00
{ " PropList-3.1.0d2.beta " , " PROP " , " OMIT " } ,
2001-08-30 20:50:18 +00:00
{ " ArabicShaping " , " OMIT " , " jt " , " jg " } ,
{ " BidiMirroring " , " *bg " } ,
{ " Scripts-1d4 " , " sn " } ,
//{"Scripts-1d4", "RANGE", "sn"},
//{"Age", "*sn"},
//*/
/ *
{ " Jamo " , " jn " } ,
//
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
//"NamesList-3.1.0d1.beta"
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static String [ ] [ ] labelList30 = {
// Labels for the incoming files. Labels MUST match field order in file.
// IMPORTANT - defaults of form y-=x must occur after x is encountered!
// The one exception is "st", which is handled specially.
// So file order is important.
//*
{ " UnicodeData " , " n " , " gc " , " cc " , " bc " , " dm " , " dd " , " dv " , " nv " , " bm " , " on " , " OMIT " , " *uc " , " *lc " , " *tc " } ,
{ " CompositionExclusions " , " ce " } ,
{ " EastAsianWidth " , " ea " , " OMIT " } ,
{ " LineBreak " , " lb " , " OMIT " } ,
{ " SpecialCasing " , " *sl " , " *st " , " *su " , " sc " } ,
{ " CaseFolding " , " OMIT " , " *fc " } ,
{ " ArabicShaping " , " OMIT " , " jt " , " jg " } ,
{ " BidiMirroring " , " *bg " } ,
/ *
{ " Jamo " , " jn " } ,
2001-08-31 00:30:17 +00:00
{ " PropList.alpha " , " RANGE " , " OMIT " } ,
2001-08-30 20:50:18 +00:00
//
} ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static String [ ] [ ] labelList11 = {
{ " UnicodeData-1.1 " , " n " , " gc " , " cc " , " bc " , " dm " , " dd " , " dv " , " nv " , " bm " , " on " , " OMIT " , " *uc " , " *lc " , " *tc " } ,
} ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static String [ ] [ ] labelList20 = {
{ " UnicodeData-2.0 " , " n " , " gc " , " cc " , " bc " , " dm " , " dd " , " dv " , " nv " , " bm " , " on " , " OMIT " , " *uc " , " *lc " , " *tc " } ,
} ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static String [ ] [ ] labelList21 = {
{ " UnicodeData-2.1 " , " n " , " gc " , " cc " , " bc " , " dm " , " dd " , " dv " , " nv " , " bm " , " on " , " OMIT " , " *uc " , " *lc " , " *tc " } ,
} ;
* /
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// handles
// Name of the blocks file; readBlocks passes it to Utility.openUnicodeFile.
public static final String blocksname = " Blocks " ;
//public static final String[][] labelList;
// NOTE(review): NEWPROPS is only mentioned in a comment within the visible part of this file; confirm it is still used.
public static final boolean NEWPROPS = true ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/ *
static {
switch ( major * 10 + minor ) {
case 31 :
blocksname = blocksname31 ;
labelList = labelList31 ;
break ;
case 30 :
blocksname = blocksnamePlain ;
labelList = labelList30 ;
break ;
case 21 :
blocksname = blocksnamePlain ;
labelList = labelList21 ;
break ;
case 20 :
blocksname = blocksnamePlain ;
labelList = labelList20 ;
break ;
default :
blocksname = blocksnamePlain ;
labelList = labelList11 ;
break ;
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
* /
// Leading part of the generated file name; writeJavaData appends the version and an extension to it.
static final String dataFilePrefix = " UCD_Data " ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// MAIN!!
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
public static void main ( String [ ] args ) throws Exception {
2002-03-20 00:21:43 +00:00
System . out . println ( " Building binary version of UCD " ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
log = new PrintWriter ( new BufferedWriter (
new OutputStreamWriter (
new FileOutputStream ( GEN_DIR + " UCD-log.txt " ) ,
" UTF8 " ) ,
32 * 1024 ) ) ;
log . write ( " \ uFEFF " ) ; // BOM
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
try {
for ( int i = 0 ; i < args . length ; + + i ) {
2004-02-06 18:32:05 +00:00
String version = args [ i ] ;
2001-08-30 20:50:18 +00:00
if ( version . length ( ) = = 0 ) version = UCD . latestVersion ;
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
new ConvertUCD ( ) . toJava ( version ) ;
2001-08-30 20:50:18 +00:00
}
} finally {
log . close ( ) ;
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/ *
static void toXML ( ) throws Exception {
// Blocks is special
// Unihan is special
// collect all the other .txt files in the directory
if ( false ) readBlocks ( ) ;
if ( true ) for ( int i = 0 ; i < labelList . length ; + + i ) {
readSemi ( labelList [ i ] ) ;
} else {
readSemi ( labelList [ 0 ] ) ; // TESTING ONLY
}
writeXML ( ) ;
}
* /
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
void toJava ( String version ) throws Exception {
this . version = version ;
String [ ] parts = new String [ 3 ] ;
Utility . split ( version , '.' , parts ) ;
major = Integer . parseInt ( parts [ 0 ] ) ;
minor = Integer . parseInt ( parts [ 1 ] ) ;
update = Integer . parseInt ( parts [ 2 ] ) ;
2001-09-01 01:11:13 +00:00
System . out . println ( " Building " + version ) ;
2001-08-30 20:50:18 +00:00
// Blocks is special
// Unihan is special
// collect all the other .txt files in the directory
if ( false ) readBlocks ( ) ;
if ( true ) for ( int i = 0 ; i < labelList . length ; + + i ) {
readSemi ( labelList [ i ] ) ;
} else {
readSemi ( labelList [ 0 ] ) ; // TESTING ONLY
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
Iterator it = charData . keySet ( ) . iterator ( ) ;
while ( it . hasNext ( ) ) {
Object key = it . next ( ) ;
UData value = ( UData ) charData . get ( key ) ;
value . compact ( ) ;
}
2002-03-20 00:21:43 +00:00
2004-03-11 19:04:00 +00:00
/ *
2002-03-20 00:21:43 +00:00
UData ud ;
ud = getEntry ( 0x5e ) ;
System . out . println ( " SPOT-CHECK: 5e: " + ud ) ;
2004-02-06 18:32:05 +00:00
2002-03-20 00:21:43 +00:00
ud = getEntry ( 0x130 ) ;
System . out . println ( " SPOT-CHECK: 130: " + ud ) ;
2004-02-06 18:32:05 +00:00
ud = getEntry ( 0x1f6 ) ;
System . out . println ( " SPOT-CHECK: 1f6: " + ud ) ;
2002-03-20 00:21:43 +00:00
ud = getEntry ( 0x2A6D6 ) ;
2001-08-30 20:50:18 +00:00
System . out . println ( " SPOT-CHECK: 2A6D6: " + ud ) ;
2002-03-20 00:21:43 +00:00
2001-08-30 20:50:18 +00:00
ud = getEntry ( 0xFFFF ) ;
System . out . println ( " SPOT-CHECK: FFFF: " + ud ) ;
2004-03-11 19:04:00 +00:00
* /
2001-08-30 20:50:18 +00:00
writeJavaData ( ) ;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// Shared log writer; main opens it before converting and closes it afterwards.
static PrintWriter log ;
//static String directory = BASE_DIR;
//static Map appendDuplicates = new HashMap();
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/ * * First item in labels is file name , rest are field names ( skipping character ) .
* " OMIT " is special - - means don ' t record
* /
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
// Rows collected by readBlocks; each element is a String[3] of {start, end, block name}.
List blockData = new LinkedList ( ) ;
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
void readBlocks ( ) throws Exception {
2001-08-30 20:50:18 +00:00
System . out . println ( " Reading 'Blocks' " ) ;
2002-10-05 01:28:58 +00:00
BufferedReader input = Utility . openUnicodeFile ( blocksname , version , true , Utility . LATIN1 ) ;
2001-08-30 20:50:18 +00:00
String line = " " ;
try {
String [ ] parts = new String [ 20 ] ;
for ( int lineNumber = 1 ; ; + + lineNumber ) {
line = input . readLine ( ) ;
if ( line = = null ) break ;
if ( SHOW & & ( lineNumber % 500 ) = = 0 ) System . out . println ( " // " + lineNumber + " : ' " + line + " ' " ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
//String original = line;
String comment = " " ;
int commentPos = line . indexOf ( '#' ) ;
if ( commentPos > = 0 ) {
comment = line . substring ( commentPos + 1 ) . trim ( ) ;
line = line . substring ( 0 , commentPos ) ;
}
line = line . trim ( ) ;
if ( line . length ( ) = = 0 ) continue ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
int count = Utility . split ( line , ';' , parts ) ;
if ( count ! = 3 ) throw new ChainException ( " Bad count in Blocks " , null ) ;
blockData . add ( new String [ ] { Utility . fromHex ( parts [ 0 ] ) , Utility . fromHex ( parts [ 1 ] ) , parts [ 2 ] . trim ( ) } ) ;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} catch ( Exception e ) {
System . out . println ( " Exception at: " + line ) ;
throw e ;
} finally {
input . close ( ) ;
}
}
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
// Property names seen by readSemi while reading a PROP-style file, kept in sorted order.
Set properties = new TreeSet ( ) ;
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
void readSemi ( String [ ] labels ) throws Exception {
2001-08-30 20:50:18 +00:00
System . out . println ( ) ;
System . out . println ( " Reading ' " + labels [ 0 ] + " ' " ) ;
if ( major < 3 | | ( major = = 3 & & minor < 1 ) ) {
if ( labels [ 0 ] = = " PropList " ) {
System . out . println ( " SKIPPING old format of Proplist for " + version ) ;
return ;
}
}
String tempVersion = version ;
if ( version . equals ( UCD . latestVersion ) ) tempVersion = " " ;
2002-10-05 01:28:58 +00:00
BufferedReader input = Utility . openUnicodeFile ( labels [ 0 ] , tempVersion , true , Utility . LATIN1 ) ;
2001-08-30 20:50:18 +00:00
if ( input = = null ) {
System . out . println ( " COULDN'T OPEN: " + labels [ 0 ] ) ;
return ;
}
boolean showedSemi = false ;
boolean showedShort = false ;
String line = " " ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
try {
String [ ] parts = new String [ 20 ] ;
for ( int lineNumber = 1 ; ; + + lineNumber ) {
line = input . readLine ( ) ;
if ( line = = null ) break ;
if ( SHOW & & ( lineNumber % 500 ) = = 0 ) System . out . println ( " // " + lineNumber + " : ' " + line + " ' " ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
String original = line ;
String comment = " " ;
int commentPos = line . indexOf ( '#' ) ;
if ( commentPos > = 0 ) {
comment = line . substring ( commentPos + 1 ) . trim ( ) ;
line = line . substring ( 0 , commentPos ) ;
}
line = line . trim ( ) ;
if ( line . length ( ) = = 0 ) continue ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
int count = Utility . split ( line , ';' , parts ) ;
2001-08-31 00:30:17 +00:00
2004-03-11 19:04:00 +00:00
if ( false & & parts [ 0 ] . equals ( " 2801 " ) ) {
2001-08-30 20:50:18 +00:00
System . out . println ( " debug? " ) ;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// fix malformed or simple lists.
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
if ( count ! = labels . length ) {
if ( count = = labels . length + 1 & & parts [ count - 1 ] . equals ( " " ) ) {
if ( ! showedSemi ) System . out . println ( " Extra semicolon in: " + original ) ;
showedSemi = true ;
} else if ( count = = 1 ) { // fix simple list
+ + count ;
parts [ 1 ] = " Y " ;
} else if ( count < labels . length ) {
if ( ! showedShort ) System . out . println ( " Line shorter than labels: " + original ) ;
showedShort = true ;
for ( int i = count ; i < labels . length ; + + i ) {
parts [ i ] = " " ;
}
} else {
2001-08-31 00:30:17 +00:00
throw new ChainException ( " wrong count: {0} " ,
2001-08-30 20:50:18 +00:00
new Object [ ] { new Integer ( line ) , new Integer ( count ) } ) ;
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// store char
// first field is always character OR range. May be UTF-32
int cpTop ;
int cpStart ;
int ddot = parts [ 0 ] . indexOf ( " . " ) ;
if ( ddot > = 0 ) {
cpStart = UTF32 . char32At ( Utility . fromHex ( parts [ 0 ] . substring ( 0 , ddot ) ) , 0 ) ;
cpTop = UTF32 . char32At ( Utility . fromHex ( parts [ 0 ] . substring ( ddot + 2 ) ) , 0 ) ;
2002-03-15 00:34:46 +00:00
// System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop));
2001-08-30 20:50:18 +00:00
} else {
cpStart = UTF32 . char32At ( Utility . fromHex ( parts [ 0 ] ) , 0 ) ;
cpTop = cpStart ;
if ( labels [ 1 ] . equals ( " RANGE " ) ) UTF32 . char32At ( Utility . fromHex ( parts [ 1 ] ) , 0 ) ;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// properties first
if ( labels [ 1 ] . equals ( " PROP " ) ) {
String prop = parts [ 2 ] . trim ( ) ;
// FIX!!
boolean skipLetters = false ;
if ( prop . equals ( " Alphabetic " ) ) {
prop = " Other_Alphabetic " ;
skipLetters = true ;
}
// END FIX!!
properties . add ( prop ) ;
2002-03-15 00:34:46 +00:00
if ( Utility . find ( prop , UCD_Names . DeletedProperties , true ) = = - 1 ) { // only undeleted
2001-08-30 20:50:18 +00:00
int end = UTF32 . char32At ( Utility . fromHex ( parts [ 1 ] ) , 0 ) ;
2001-08-31 00:30:17 +00:00
if ( end = = 0 ) end = cpStart ;
2001-08-30 20:50:18 +00:00
for ( int j = cpStart ; j < = end ; + + j ) {
2004-03-11 19:04:00 +00:00
if ( j ! = UCD . mapToRepresentative ( j , Integer . MAX_VALUE ) ) continue ;
2001-08-30 20:50:18 +00:00
if ( skipLetters & & getEntry ( cpStart ) . isLetter ( ) ) continue ;
appendCharProperties ( j , prop ) ;
}
}
} else { // not range!
String val = " " ;
String lastVal ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
for ( int i = 1 ; i < labels . length ; + + i ) {
String key = labels [ i ] ;
lastVal = val ;
if ( isHex . get ( key ) ! = null ) {
val = Utility . fromHex ( parts [ i ] ) ;
} else {
val = parts [ i ] . trim ( ) ;
}
if ( key . equals ( " OMIT " ) ) continue ; // do after val, so lastVal is correct
if ( key . equals ( " RANGE " ) ) continue ; // do after val, so lastVal is correct
if ( val . equals ( " " ) ) continue ; // skip empty values, they mean default
for ( int cps = cpStart ; cps < = cpTop ; + + cps ) {
2004-03-11 19:04:00 +00:00
if ( UCD . mapToRepresentative ( cps , Integer . MAX_VALUE ) ! = cps ) continue ; // skip condensed ranges
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
if ( key . equals ( " binary " ) ) {
appendCharProperties ( cps , val ) ;
} else if ( key . equals ( " fc " ) ) {
UData data = getEntry ( cps ) ;
String type = parts [ i - 1 ] . trim ( ) ;
if ( type . equals ( " F " ) | | type . equals ( " C " ) | | type . equals ( " E " ) | | type . equals ( " L " ) ) {
data . fullCaseFolding = val ;
//System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
}
if ( type . equals ( " S " ) | | type . equals ( " C " ) | | type . equals ( " L " ) ) {
data . simpleCaseFolding = val ;
//System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
}
if ( type . equals ( " I " ) ) {
data . simpleCaseFolding = val ;
setBinaryProperty ( cps , CaseFoldTurkishI ) ;
2004-03-11 19:04:00 +00:00
if ( DEBUG ) System . out . println ( " SPOT-CHECK: < " + parts [ i - 1 ] + " > Setting "
2002-03-20 00:21:43 +00:00
+ Utility . hex ( cps ) + " : " + Utility . hex ( val ) ) ;
}
} else if ( labels [ 0 ] . equals ( " SpecialCasing " ) // special handling for special casing
& & labels [ 4 ] . equals ( " sc " )
& & parts [ 4 ] . trim ( ) . length ( ) > 0 ) {
if ( i < 4 ) {
if ( DEBUG ) System . out . println ( " Got special: " + Utility . hex ( cps ) + " , "
+ Utility . hex ( key ) + " : " + Utility . hex ( val ) ) ;
addCharData ( cps , " sc " , parts [ 4 ] . trim ( ) + " : " + key + " : " + val ) ;
2001-08-30 20:50:18 +00:00
}
} else {
/ * if ( key . equals ( " sn " ) ) { // SKIP UNDEFINED!!
UData data = getEntryIfExists ( cps ) ;
if ( data = = null | | data . generalCategory = = Cn ) continue ;
}
* /
addCharData ( cps , key , val ) ;
}
}
}
}
}
} catch ( Exception e ) {
System . out . println ( " Exception at: " + line + " , " + e . getMessage ( ) ) ;
throw e ;
} finally {
input . close ( ) ;
}
//printValues("JOINING_TYPE", jtSet);
//printValues("JOINING_GROUP", jgSet);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
static void printValues ( String title , Set s ) {
Iterator it = s . iterator ( ) ;
System . out . println ( " public static String[] " + title + " = { " ) ;
while ( it . hasNext ( ) ) {
String value = ( String ) it . next ( ) ;
System . out . println ( " \" " + value + " \" , " ) ;
}
System . out . println ( " }; " ) ;
it = s . iterator ( ) ;
System . out . println ( " public static byte " ) ;
int count = 0 ;
while ( it . hasNext ( ) ) {
String value = ( String ) it . next ( ) ;
System . out . println ( " " + value . replace ( ' ' , '-' ) . toUpperCase ( ) + " = " + ( count + + ) + " , " ) ;
}
System . out . println ( " LIMIT_ " + title + " = " + count ) ;
System . out . println ( " ; " ) ;
}
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
// Collected character records: Integer code point -> UData, kept sorted by key (entries are created in getEntry).
Map charData = new TreeMap ( ) ;
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
/ *
2001-08-30 20:50:18 +00:00
static void writeXML ( ) throws IOException {
System . out . println ( " Writing 'UCD-Main.xml' " ) ;
BufferedWriter output = new BufferedWriter (
new OutputStreamWriter (
new FileOutputStream ( UCD . BIN_DIR + " UCD_Data.xml " ) ,
" UTF8 " ) ,
32 * 1024 ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
try {
// write header
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
output . write ( " <?xml version='1.0' encoding='utf-8'?> \ r \ n " ) ;
output . write ( " <UnicodeCharacterDatabase> \ r \ n " ) ;
output . write ( " <!-- IMPORTANT: see UCD-Notes.html for information on the format. This file CANNOT be read correctly without that information. --> \ r \ n " ) ;
output . write ( " <unicode version=' " + major + " ' minor=' " + minor + " ' update=' " + update + " '/> \ r \ n " ) ;
output . write ( " <fileVersion status='DRAFT' date=' " + new Date ( ) + " '/> \ r \ n " ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write blocks
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
Iterator it = blockData . iterator ( ) ;
while ( it . hasNext ( ) ) {
String [ ] block = ( String [ ] ) it . next ( ) ;
2001-08-31 00:30:17 +00:00
output . write ( " <block start=' " + Utility . quoteXML ( block [ 0 ] )
2001-08-30 20:50:18 +00:00
+ " ' end=' " + Utility . quoteXML ( block [ 1 ] )
+ " ' name=' " + Utility . quoteXML ( block [ 2 ] )
+ " '/> \ r \ n " ) ;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write char data
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
it = charData . keySet ( ) . iterator ( ) ;
while ( it . hasNext ( ) ) {
Integer cc = ( Integer ) it . next ( ) ;
2002-04-24 02:38:53 +00:00
output . write ( " <e c=' " + Utility . quoteXML ( cc . intValue ( ) ) + " ' " ) ;
2001-08-30 20:50:18 +00:00
/ *
UData data = ( UData ) charData . get ( cc ) ;
Iterator dataIt = data . keySet ( ) . iterator ( ) ;
while ( dataIt . hasNext ( ) ) {
String label = ( String ) dataIt . next ( ) ;
if ( label . equals ( " c " ) ) continue ; // already wrote it.
if ( label . equals ( " fc " ) ) {
String fc = getResolved ( data , " fc " ) ;
String lc = getResolved ( data , " lc " ) ;
if ( ! fc . equals ( lc ) & & ! lc . equals ( cc ) ) log . println ( " FC " + fc . length ( ) + " : " + toString ( cc ) ) ;
}
String value = Utility . quoteXML ( ( String ) data . get ( label ) ) ;
output . write ( " " + label + " =' " + value + " ' " ) ;
}
2004-02-06 18:32:05 +00:00
* //*
2001-08-30 20:50:18 +00:00
output . write ( " /> \ r \ n " ) ;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write footer
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
output . write ( " </UnicodeCharacterDatabase> \ r \ n " ) ;
} finally {
output . close ( ) ;
}
}
2004-02-06 18:32:05 +00:00
* /
void writeJavaData ( ) throws IOException {
2001-08-30 20:50:18 +00:00
Iterator it = charData . keySet ( ) . iterator ( ) ;
int codePoint = - 1 ;
System . out . println ( " Writing " + dataFilePrefix + version ) ;
DataOutputStream dataOut = new DataOutputStream (
new BufferedOutputStream (
new FileOutputStream ( UCD . BIN_DIR + dataFilePrefix + version + " .bin " ) ,
128 * 1024 ) ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write header
dataOut . writeByte ( BINARY_FORMAT ) ;
dataOut . writeByte ( major ) ;
dataOut . writeByte ( minor ) ;
dataOut . writeByte ( update ) ;
long millis = System . currentTimeMillis ( ) ;
dataOut . writeLong ( millis ) ;
dataOut . writeInt ( charData . size ( ) ) ;
System . out . println ( " Data Size: " + NumberFormat . getInstance ( ) . format ( charData . size ( ) ) ) ;
int count = 0 ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// write records
try {
// write char data
while ( it . hasNext ( ) ) {
Object cc = ( Object ) it . next ( ) ;
//codePoint = UTF32.char32At(cc,0);
if ( DEBUG ) System . out . println ( Utility . hex ( cc ) ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
UData uData = ( UData ) charData . get ( cc ) ;
if ( false & & uData . name = = null ) {
System . out . println ( " Warning: NULL name \ r \ n " + uData ) ;
System . out . println ( ) ;
}
2004-03-11 19:04:00 +00:00
if ( false & & uData . codePoint = = 0x2801 ) {
2001-08-30 20:50:18 +00:00
System . out . println ( " SPOT-CHECK: " + uData ) ;
}
uData . writeBytes ( dataOut ) ;
count + + ;
if ( DEBUG ) System . out . println ( " Setting2 " ) ;
}
System . out . println ( " Wrote Data " + count ) ;
} catch ( Exception e ) {
throw new ChainException ( " Bad data write {0} " , new Object [ ] { Utility . hex ( codePoint ) } , e ) ;
} finally {
dataOut . close ( ) ;
}
}
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
//static String[] xsSplit = new String[40];
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// Cache a little bit for speed
2004-02-06 18:32:05 +00:00
// One-entry lookup cache shared by getEntry and getEntryIfExists:
// the code point most recently resolved (-1 before any lookup) ...
int getEntryCodePoint = - 1 ;
// ... and the record that was returned for it.
UData getEntryUData = null ;
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
UData getEntryIfExists ( int cp ) {
2001-08-30 20:50:18 +00:00
if ( cp = = getEntryCodePoint ) return getEntryUData ;
Integer cc = new Integer ( cp ) ;
UData charEntry = ( UData ) charData . get ( cc ) ;
if ( charEntry = = null ) return null ;
getEntryCodePoint = cp ;
getEntryUData = charEntry ;
return charEntry ;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/ * Get entry in table for cc
* /
2004-02-06 18:32:05 +00:00
UData getEntry ( int cp ) {
2001-08-30 20:50:18 +00:00
if ( cp = = getEntryCodePoint ) return getEntryUData ;
Integer cc = new Integer ( cp ) ;
UData charEntry = ( UData ) charData . get ( cc ) ;
if ( charEntry = = null ) {
charEntry = new UData ( cp ) ;
charData . put ( cc , charEntry ) ;
//charEntry.put("c", cc);
}
getEntryCodePoint = cp ;
getEntryUData = charEntry ;
return charEntry ;
}
/ * * Adds the character data . Signals duplicates with an exception
* /
2004-02-06 18:32:05 +00:00
void setBinaryProperty ( int cp , int binProp ) {
2001-08-30 20:50:18 +00:00
UData charEntry = getEntry ( cp ) ;
2003-07-21 15:50:07 +00:00
charEntry . binaryProperties | = ( 1L < < binProp ) ;
2001-08-30 20:50:18 +00:00
}
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
void appendCharProperties ( int cp , String key ) {
2001-08-30 20:50:18 +00:00
int ind ;
//if (true || NEWPROPS) {
2002-03-15 00:34:46 +00:00
ind = Utility . lookup ( key , UCD_Names . BP , true ) ;
2001-08-30 20:50:18 +00:00
/ * } else {
ind = Utility . lookup ( key , UCD_Names . BP_OLD ) ;
}
* /
//charEntry.binaryProperties |= (1 << ind);
setBinaryProperty ( cp , ind ) ;
}
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
// NOTE(review): within the visible part of this file these two sets are referenced only
// from commented-out code (jt/jg handling in addCharData and the printValues calls) — confirm they are still needed.
Set jtSet = new TreeSet ( ) ;
Set jgSet = new TreeSet ( ) ;
2003-02-25 23:38:23 +00:00
2001-08-30 20:50:18 +00:00
/ * * Adds the character data . Signals duplicates with an exception
* /
2004-02-06 18:32:05 +00:00
void addCharData ( int cp , String key , String value ) {
2001-08-30 20:50:18 +00:00
//if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
UData charEntry = getEntry ( cp ) ;
//if (cp < 10) System.out.println(" " + charEntry);
2003-02-25 23:38:23 +00:00
if ( SHOW_SAMPLE & & cp = = 0x221 ) {
System . out . println ( " Sample: " + cp + " , " + key + " , " + value ) ;
System . out . println ( charEntry ) ;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
if ( key . equals ( " bm " ) ) {
if ( value . equals ( " Y " ) ) charEntry . binaryProperties | = 1 ;
} else if ( key . equals ( " ce " ) ) {
charEntry . binaryProperties | = 2 ;
} else if ( key . equals ( " on " ) ) {
if ( charEntry . name . charAt ( 0 ) = = '<' ) {
charEntry . name = '<' + value + '>' ;
}
} else if ( key . equals ( " dm " ) ) {
charEntry . decompositionType = CANONICAL ;
if ( value . charAt ( 0 ) = = '<' ) {
int pos = value . indexOf ( '>' ) ;
String dType = value . substring ( 1 , pos ) ;
if ( major < 2 ) if ( dType . charAt ( 0 ) = = '+' ) dType = dType . substring ( 1 ) ;
value = value . substring ( pos + 1 ) ;
setField ( charEntry , " dt " , dType ) ;
}
// FIX OLD
if ( major < 2 ) {
int oldStyle = value . indexOf ( '<' ) ;
if ( oldStyle > 0 ) {
value = value . substring ( 0 , oldStyle ) ;
}
oldStyle = value . indexOf ( '{' ) ;
if ( oldStyle > 0 ) {
value = value . substring ( 0 , oldStyle ) ;
}
}
setField ( charEntry , key , Utility . fromHex ( value ) ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// fix the numeric fields to be more sensible
} else if ( key . equals ( " dd " ) ) {
if ( charEntry . numericType < UCD_Types . DECIMAL ) {
charEntry . numericType = UCD_Types . DECIMAL ;
}
setField ( charEntry , " nv " , value ) ;
} else if ( key . equals ( " dv " ) ) {
if ( charEntry . numericType < UCD_Types . DIGIT ) {
charEntry . numericType = UCD_Types . DIGIT ;
}
setField ( charEntry , " nv " , value ) ;
} else if ( key . equals ( " nv " ) ) {
if ( charEntry . numericType < UCD_Types . NUMERIC ) {
charEntry . numericType = UCD_Types . NUMERIC ;
}
setField ( charEntry , " nv " , value ) ;
/ * } else if ( key . equals ( " jt " ) ) {
jtSet . add ( value ) ;
} else if ( key . equals ( " jg " ) ) {
jgSet . add ( value ) ;
* /
} else {
setField ( charEntry , key , value ) ;
}
2003-02-25 23:38:23 +00:00
if ( SHOW_SAMPLE & & cp = = 0x221 ) {
System . out . println ( " Sample Result: " ) ;
System . out . println ( charEntry ) ;
}
2001-08-30 20:50:18 +00:00
}
2001-08-31 00:30:17 +00:00
2004-02-06 18:32:05 +00:00
/**
 * Stores one textual property value into the field of {@code uData} selected by
 * {@code fieldName}.
 *
 * <p>The method is a single if/else-if chain over the key in {@code fieldName}:
 * some keys assign {@code fieldValue} to a string field unchanged, some convert it
 * through {@code Utility.lookup} against a {@code UCD_Names} table, and some parse
 * it as a number ({@code Utility.doubleFrom}, {@code Utility.intFrom},
 * {@code Utility.longFrom}). Two branches consult the instance field {@code major}
 * and behave differently when it is less than 2.
 *
 * @param uData      record whose field is written
 * @param fieldName  key selecting which field of {@code uData} to set
 * @param fieldValue textual value to store or convert
 * @throws ChainException wrapping any exception raised while handling the value,
 *         including the {@code IllegalArgumentException} thrown here when the key
 *         matches none of the branches; the message carries the key and the value
 */
public void setField ( UData uData , String fieldName , String fieldValue ) {
2001-08-30 20:50:18 +00:00
try {
// String-valued fields: the value is stored exactly as passed in.
if ( fieldName . equals ( " n " ) ) {
uData . name = fieldValue ;
} else if ( fieldName . equals ( " dm " ) ) {
uData . decompositionMapping = fieldValue ;
} else if ( fieldName . equals ( " bg " ) ) {
uData . bidiMirror = fieldValue ;
} else if ( fieldName . equals ( " uc " ) ) {
uData . simpleUppercase = fieldValue ;
} else if ( fieldName . equals ( " lc " ) ) {
uData . simpleLowercase = fieldValue ;
} else if ( fieldName . equals ( " tc " ) ) {
uData . simpleTitlecase = fieldValue ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " su " ) ) {
uData . fullUppercase = fieldValue ;
} else if ( fieldName . equals ( " sl " ) ) {
2002-03-20 00:21:43 +00:00
// Debug trace only; printed before the field is overwritten.
if ( DEBUG ) System . out . println ( " Setting full lowercase to " + Utility . hex ( fieldValue ) + uData ) ;
2001-08-30 20:50:18 +00:00
uData . fullLowercase = fieldValue ;
} else if ( fieldName . equals ( " st " ) ) {
uData . fullTitlecase = fieldValue ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " sc " ) ) {
2002-03-20 00:21:43 +00:00
// Accumulates instead of overwriting: each call appends to the existing
// specialCasing string, inserting a separator when it is already non-empty.
if ( uData . specialCasing . length ( ) > 0 ) {
uData . specialCasing + = " ; " ;
}
uData . specialCasing + = fieldValue ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " xp " ) ) {
2003-07-21 15:50:07 +00:00
// Sets a single bit in binaryProperties; the bit position is the index that
// Utility.lookup returns for the value in UCD_Names.BP. Other bits are kept.
uData . binaryProperties | = 1L < < Utility . lookup ( fieldValue , UCD_Names . BP , true ) ;
2001-08-30 20:50:18 +00:00
//UCD_Names.BP_OLD
} else if ( fieldName . equals ( " gc " ) ) {
2004-02-12 08:23:19 +00:00
// From here on, most branches convert the value with Utility.lookup against
// the UCD_Names table named in the call and store the result.
uData . generalCategory = Utility . lookup ( fieldValue , UCD_Names . GENERAL_CATEGORY , true ) ;
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " bc " ) ) {
2004-02-12 08:23:19 +00:00
uData . bidiClass = Utility . lookup ( fieldValue , UCD_Names . BIDI_CLASS , true ) ;
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " dt " ) ) {
// When major < 2, these specific spellings are rewritten before the lookup;
// presumably they are names used by old data files - TODO confirm.
if ( major < 2 ) {
if ( fieldValue . equals ( " no-break " ) ) fieldValue = " noBreak " ;
else if ( fieldValue . equals ( " circled " ) ) fieldValue = " circle " ;
else if ( fieldValue . equals ( " sup " ) ) fieldValue = " super " ;
else if ( fieldValue . equals ( " break " ) ) fieldValue = " compat " ;
else if ( fieldValue . equals ( " font variant " ) ) fieldValue = " font " ;
else if ( fieldValue . equals ( " no-join " ) ) fieldValue = " compat " ;
else if ( fieldValue . equals ( " join " ) ) fieldValue = " compat " ;
}
2004-02-12 08:23:19 +00:00
uData . decompositionType = Utility . lookup ( fieldValue , UCD_Names . LONG_DECOMPOSITION_TYPE , true ) ;
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " nt " ) ) {
2004-02-12 08:23:19 +00:00
uData . numericType = Utility . lookup ( fieldValue , UCD_Names . LONG_NUMERIC_TYPE , true ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " ea " ) ) {
2004-02-12 08:23:19 +00:00
uData . eastAsianWidth = Utility . lookup ( fieldValue , UCD_Names . EAST_ASIAN_WIDTH , true ) ;
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " lb " ) ) {
2004-02-12 08:23:19 +00:00
uData . lineBreak = Utility . lookup ( fieldValue , UCD_Names . LINE_BREAK , true ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " sn " ) ) {
2004-02-12 08:23:19 +00:00
uData . script = Utility . lookup ( fieldValue , UCD_Names . LONG_SCRIPT , true ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " jt " ) ) {
2002-03-15 00:34:46 +00:00
uData . joiningType = Utility . lookup ( fieldValue , UCD_Names . JOINING_TYPE , true ) ;
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " jg " ) ) {
2004-11-12 23:17:15 +00:00
// Two-step resolution: try UCD_Names.OLD_JOINING_GROUP via Utility.find first;
// only when that yields -1 fall back to Utility.lookup on UCD_Names.JOINING_GROUP.
byte temp = ( byte ) Utility . find ( fieldValue , UCD_Names . OLD_JOINING_GROUP , true ) ;
if ( temp ! = - 1 ) uData . joiningGroup = temp ;
else uData . joiningGroup = Utility . lookup ( fieldValue , UCD_Names . JOINING_GROUP , true ) ;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " nv " ) ) {
// When major < 2 this specific value is skipped: the method returns early
// and numericValue is left untouched.
if ( major < 2 ) {
if ( fieldValue . equals ( " - " ) ) return ;
}
2003-03-12 16:01:26 +00:00
uData . numericValue = Utility . doubleFrom ( fieldValue ) ;
2001-08-30 20:50:18 +00:00
} else if ( fieldName . equals ( " cc " ) ) {
uData . combiningClass = ( byte ) Utility . intFrom ( fieldValue ) ;
} else if ( fieldName . equals ( " bp " ) ) {
2003-07-21 15:50:07 +00:00
// Replaces binaryProperties wholesale (plain assignment, unlike the OR above).
// NOTE(review): the parsed value is narrowed to byte before being assigned, while
// the bit-setting branch above shifts 1L by a lookup index. If binaryProperties
// is wider than 8 bits this cast discards the upper bits - confirm and, if so,
// drop the cast.
uData . binaryProperties = ( byte ) Utility . longFrom ( fieldValue ) ;
2001-08-30 20:50:18 +00:00
} else {
// Unrecognized key; this exception is caught just below and re-thrown wrapped.
throw new IllegalArgumentException ( " Unknown fieldName " ) ;
}
} catch ( Exception e ) {
// Every failure (unknown key, failed lookup, failed parse) is wrapped together
// with the offending key and value; the original exception is kept as the cause.
throw new ChainException (
" Bad field name= \" {0} \" , value= \" {1} \" " , new Object [ ] { fieldName , fieldValue } , e ) ;
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
}