ICU-3170 update code for Unicode 4.0.1

X-SVN-Rev: 14844
This commit is contained in:
Markus Scherer 2004-04-02 17:41:06 +00:00
parent 72265d2950
commit 9b10636685
6 changed files with 75 additions and 28 deletions

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1997-2003, International Business Machines
* Copyright (C) 1997-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -286,6 +286,17 @@ typedef enum UProperty {
mapping or _in_ the target of a case mapping. Not the same as
the general category Cased_Letter. @draft ICU 2.6 */
UCHAR_CASE_SENSITIVE,
/** Binary property STerm (new in Unicode 4.0.1).
Sentence Terminal. Used in UAX #29: Text Boundaries
(http://www.unicode.org/reports/tr29/)
@draft ICU 3.0 */
UCHAR_S_TERM,
/** Binary property Variation_Selector (new in Unicode 4.0.1).
Indicates all those characters that qualify as Variation Selectors.
For details on the behavior of these characters,
see StandardizedVariants.html and section 15.6, Variation Selectors, of The Unicode Standard.
@draft ICU 3.0 */
UCHAR_VARIATION_SELECTOR,
/** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */
UCHAR_BINARY_LIMIT,
@ -956,6 +967,11 @@ enum UBlockCode {
/** @stable ICU 2.2 */
UBLOCK_CYRILLIC_SUPPLEMENTARY = 97, /*[0500]*/
/**
* Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
* @draft ICU 3.0
*/
UBLOCK_CYRILLIC_SUPPLEMENT = 97, /*[0500]*/
/** @stable ICU 2.2 */
UBLOCK_TAGALOG = 98, /*[1700]*/
/** @stable ICU 2.2 */
@ -1215,6 +1231,8 @@ typedef enum ULineBreak {
U_LB_HYPHEN, /*[HY]*/
U_LB_IDEOGRAPHIC, /*[ID]*/
U_LB_INSEPERABLE, /*[IN]*/
/** Renamed from the misspelled "inseperable" in Unicode 4.0.1/ICU 3.0 @draft ICU 3.0 */
U_LB_INSEPARABLE=U_LB_INSEPERABLE,/*[IN]*/
U_LB_INFIX_NUMERIC, /*[IS]*/
U_LB_LINE_FEED, /*[LF]*/
U_LB_NONSTARTER, /*[NS]*/

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1997-2003, International Business Machines
* Copyright (C) 1997-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -83,6 +83,9 @@ typedef enum UScriptCode {
USCRIPT_TAI_LE, /* Tale */
USCRIPT_UGARITIC, /* Ugar */
/** New script code in Unicode 4.0.1 @draft ICU 3.0 */
USCRIPT_KATAKANA_OR_HIRAGANA,/*Hrkt */
USCRIPT_CODE_LIMIT
} UScriptCode;

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -214,7 +214,9 @@ static const struct {
{ 1, U_MASK(UPROPS_WHITE_SPACE) },
{ 1, U_MASK(UPROPS_XID_CONTINUE) },
{ 1, U_MASK(UPROPS_XID_START) },
{ -1, U_MASK(UPROPS_CASE_SENSITIVE_SHIFT) }
{ -1, U_MASK(UPROPS_CASE_SENSITIVE_SHIFT) },
{ 2, U_MASK(UPROPS_V2_S_TERM) },
{ 2, U_MASK(UPROPS_V2_VARIATION_SELECTOR) }
};
U_CAPI UBool U_EXPORT2

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -164,6 +164,7 @@ enum {
/*
* Properties in vector word 2
* Bits
* 31..24 More binary properties
* 13..11 Joining Type
* 10.. 5 Joining Group
* 4.. 0 Decomposition Type
@ -176,6 +177,12 @@ enum {
#define UPROPS_DT_MASK 0x0000001f
enum {
UPROPS_V2_S_TERM=24, /* new in ICU 3.0 and Unicode 4.0.1 */
UPROPS_V2_VARIATION_SELECTOR,
UPROPS_V2_TOP /* must be <=32 */
};
/**
* Get a properties vector word for a code point.
* Implemented in uchar.c for uprops.c.

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2003, International Business Machines
* Copyright (C) 2001-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -246,8 +246,22 @@ derivedNormalizationPropertiesLineFn(void *context,
qcFlags&=0xf;
} else if(0==uprv_memcmp(s, "MAYBE", 5)) {
qcFlags&=0x30;
} else if(0==uprv_memcmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
/*
* Unicode 4.0.1:
* changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
*/
/* start of the field */
s=(char *)u_skipWhitespace(s+1);
if(*s=='N') {
qcFlags&=0xf;
} else if(*s=='M') {
qcFlags&=0x30;
} else {
return; /* do nothing for "Yes" because it's the default value */
}
} else {
return;
return; /* do nothing for "Yes" because it's the default value */
}
/* set this flag for all code points in this range */
@ -259,7 +273,10 @@ derivedNormalizationPropertiesLineFn(void *context,
while(start<=end) {
setCompositionExclusion(start++);
}
} else if(0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') {
} else if(
/*
 * FC_NFKC_Closure is accepted under two property names: the older "FNC"
 * and "FC_NFKC". Each alternative skips exactly the length of the name it
 * matched before looking for the ';' that follows it; skipping only 3
 * characters after "FC_NFKC" would land inside the name and never find ';'.
 */
(0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') ||
(0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';')
) {
/* FC_NFKC_Closure, parse field 2 to get the string */
char *t;

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2003, International Business Machines
* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -265,7 +265,11 @@ propListNames[]={
{ "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH },
{ "Deprecated", 1, UPROPS_DEPRECATED },
{ "Soft_Dotted", 1, UPROPS_SOFT_DOTTED },
{ "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION }
{ "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
/* new properties in Unicode 4.0.1 */
{ "STerm", 2, UPROPS_V2_S_TERM },
{ "Variation_Selector", 2, UPROPS_V2_VARIATION_SELECTOR }
};
static const Binaries
@ -399,7 +403,7 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
/* process various UCD .txt files */
/* add Han numeric types & values */
parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 3, numericLineFn, pErrorCode);
parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
/* set proper bidi class for unassigned code points (Cn) */
parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, pErrorCode);
@ -537,7 +541,7 @@ numericLineFn(void *context,
Props newProps;
char *s, *end;
uint32_t start, limit, value, oldProps32;
int32_t type, oldType;
int32_t oldType;
char c;
UBool isFraction;
@ -586,26 +590,22 @@ numericLineFn(void *context,
}
}
/* parse numeric type */
s=trimTerminateField(fields[2][0], fields[2][1]);
type=u_getPropertyValueEnum(UCHAR_NUMERIC_TYPE, s);
if(type<=0) {
fprintf(stderr, "genprops error: unknown numeric type in DerivedNumericValues.txt field 1 at %s\n", s);
exit(U_PARSE_ERROR);
}
/*
* Unicode 4.0.1 removes the third column that used to list the numeric type.
* Assume that either the data is the same as in UnicodeData.txt,
* or else that the numeric type is "numeric".
* This should work because we only expect to add numeric values for
* Han characters; for those, UnicodeData.txt lists only ranges without
* specific properties for single characters.
*/
for(; start<limit; ++start) {
oldProps32=getProps(start);
oldType=(int32_t)GET_NUMERIC_TYPE(oldProps32);
if(oldType==type) {
if(oldType!=0) {
/* this code point was already listed with its numeric value in UnicodeData.txt */
continue;
}
if(oldType!=0) {
/* the numeric type differs from what we got from UnicodeData.txt */
fprintf(stderr, "genprops error: new numeric value for an already numeric character in DerivedNumericValues.txt at %s\n", fields[0][0]);
exit(U_PARSE_ERROR);
}
/*
* Do not set a numeric value for code points that have other
@ -630,7 +630,7 @@ numericLineFn(void *context,
}
if(beVerbose) {
printf("adding U+%04x numeric type %d value %u\n", start, type, value);
printf("adding U+%04x numeric type %d value %u\n", start, U_NT_NUMERIC, value);
}
/* reconstruct the properties and set the new numeric type and value */
@ -639,8 +639,8 @@ numericLineFn(void *context,
newProps.generalCategory=(uint8_t)GET_CATEGORY(oldProps32);
newProps.bidi=(uint8_t)GET_BIDI_CLASS(oldProps32);
newProps.isMirrored=(uint8_t)(oldProps32&(1UL<<UPROPS_MIRROR_SHIFT) ? TRUE : FALSE);
newProps.numericType=(uint8_t)type; /* newly parsed numeric type */
newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
addProps(start, makeProps(&newProps));
}
}