ICU-1709 do not tweak the UCD data to mark some control characters as spacing ones.

X-SVN-Rev: 7749
This commit is contained in:
Yves Arrouye 2002-02-24 21:00:19 +00:00
parent 3699fd2f22
commit 504a75e931
7 changed files with 46 additions and 147 deletions

View File

@ -30,17 +30,11 @@
#include "utrie.h" #include "utrie.h"
#include "ustr_imp.h" #include "ustr_imp.h"
/*
* Since genprops overrides the general category for some control codes,
* we need to hardcode ISO 8 controls for u_iscntrl(), u_isprint(), etc.
*/
#define IS_ISO_8_CONTROL(c) ((uint32_t)(c)<0x20 || (uint32_t)((c)-0x7f)<=0x20)
/* dynamically loaded Unicode character properties -------------------------- */ /* dynamically loaded Unicode character properties -------------------------- */
/* fallback properties for the ASCII range if the data cannot be loaded */ /* fallback properties for the ASCII range if the data cannot be loaded */
/* these are printed by genprops in verbose mode */ /* these are printed by genprops in verbose mode */
static const uint32_t staticProps32Table[]={ static uint32_t staticProps32Table[0xa0]={
/* 0x00 */ 0x48f, /* 0x00 */ 0x48f,
/* 0x01 */ 0x48f, /* 0x01 */ 0x48f,
/* 0x02 */ 0x48f, /* 0x02 */ 0x48f,
@ -50,11 +44,11 @@ static const uint32_t staticProps32Table[]={
/* 0x06 */ 0x48f, /* 0x06 */ 0x48f,
/* 0x07 */ 0x48f, /* 0x07 */ 0x48f,
/* 0x08 */ 0x48f, /* 0x08 */ 0x48f,
/* 0x09 */ 0x20c, /* 0x09 */ 0x20f,
/* 0x0a */ 0x1ce, /* 0x0a */ 0x1cf,
/* 0x0b */ 0x20c, /* 0x0b */ 0x20f,
/* 0x0c */ 0x24d, /* 0x0c */ 0x24f,
/* 0x0d */ 0x1ce, /* 0x0d */ 0x1cf,
/* 0x0e */ 0x48f, /* 0x0e */ 0x48f,
/* 0x0f */ 0x48f, /* 0x0f */ 0x48f,
/* 0x10 */ 0x48f, /* 0x10 */ 0x48f,
@ -69,10 +63,10 @@ static const uint32_t staticProps32Table[]={
/* 0x19 */ 0x48f, /* 0x19 */ 0x48f,
/* 0x1a */ 0x48f, /* 0x1a */ 0x48f,
/* 0x1b */ 0x48f, /* 0x1b */ 0x48f,
/* 0x1c */ 0x1ce, /* 0x1c */ 0x1cf,
/* 0x1d */ 0x1ce, /* 0x1d */ 0x1cf,
/* 0x1e */ 0x1ce, /* 0x1e */ 0x1cf,
/* 0x1f */ 0x20c, /* 0x1f */ 0x20f,
/* 0x20 */ 0x24c, /* 0x20 */ 0x24c,
/* 0x21 */ 0x297, /* 0x21 */ 0x297,
/* 0x22 */ 0x297, /* 0x22 */ 0x297,
@ -114,8 +108,8 @@ static const uint32_t staticProps32Table[]={
/* 0x46 */ 0x2000001, /* 0x46 */ 0x2000001,
/* 0x47 */ 0x2000001, /* 0x47 */ 0x2000001,
/* 0x48 */ 0x2000001, /* 0x48 */ 0x2000001,
/* 0x49 */ 0x2000001, /* 0x49 */ 0x1, /* has exception */
/* 0x4a */ 0x2000001, /* 0x4a */ 0x300001, /* has exception */
/* 0x4b */ 0x2000001, /* 0x4b */ 0x2000001,
/* 0x4c */ 0x2000001, /* 0x4c */ 0x2000001,
/* 0x4d */ 0x2000001, /* 0x4d */ 0x2000001,
@ -146,7 +140,7 @@ static const uint32_t staticProps32Table[]={
/* 0x66 */ 0x2000002, /* 0x66 */ 0x2000002,
/* 0x67 */ 0x2000002, /* 0x67 */ 0x2000002,
/* 0x68 */ 0x2000002, /* 0x68 */ 0x2000002,
/* 0x69 */ 0x2000002, /* 0x69 */ 0x600002, /* has exception */
/* 0x6a */ 0x2000002, /* 0x6a */ 0x2000002,
/* 0x6b */ 0x2000002, /* 0x6b */ 0x2000002,
/* 0x6c */ 0x2000002, /* 0x6c */ 0x2000002,
@ -174,7 +168,7 @@ static const uint32_t staticProps32Table[]={
/* 0x82 */ 0x48f, /* 0x82 */ 0x48f,
/* 0x83 */ 0x48f, /* 0x83 */ 0x48f,
/* 0x84 */ 0x48f, /* 0x84 */ 0x48f,
/* 0x85 */ 0x1ce, /* 0x85 */ 0x1cf,
/* 0x86 */ 0x48f, /* 0x86 */ 0x48f,
/* 0x87 */ 0x48f, /* 0x87 */ 0x48f,
/* 0x88 */ 0x48f, /* 0x88 */ 0x48f,
@ -200,7 +194,7 @@ static const uint32_t staticProps32Table[]={
/* 0x9c */ 0x48f, /* 0x9c */ 0x48f,
/* 0x9d */ 0x48f, /* 0x9d */ 0x48f,
/* 0x9e */ 0x48f, /* 0x9e */ 0x48f,
/* 0x9f */ 0x48f /* 0x9f */ 0x48f,
}; };
/* /*
@ -424,18 +418,6 @@ u_charType(UChar32 c) {
return (int8_t)GET_CATEGORY(props); return (int8_t)GET_CATEGORY(props);
} }
/* Gets the Unicode character's general category, as per the UCD.*/
U_CAPI int8_t U_EXPORT2
u_charUCDType(UChar32 c) {
if (IS_ISO_8_CONTROL(c)) {
return U_CONTROL_CHAR;
} else {
uint32_t props;
GET_PROPS(c, props);
return (int8_t)GET_CATEGORY(props);
}
}
/* Enumerate all code points with their general categories. */ /* Enumerate all code points with their general categories. */
struct _EnumTypeCallback { struct _EnumTypeCallback {
UCharEnumTypeRange *enumRange; UCharEnumTypeRange *enumRange;
@ -547,26 +529,26 @@ u_isbase(UChar32 c) {
/* Checks if the Unicode character is a control character.*/ /* Checks if the Unicode character is a control character.*/
U_CAPI UBool U_EXPORT2 U_CAPI UBool U_EXPORT2
u_iscntrl(UChar32 c) { u_iscntrl(UChar32 c) {
if(IS_ISO_8_CONTROL(c)) { uint32_t props;
return TRUE; GET_PROPS(c, props);
} else { return (UBool)(
uint32_t props; ((1UL<<GET_CATEGORY(props))&
GET_PROPS(c, props); (1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
return (UBool)( )!=0);
((1UL<<GET_CATEGORY(props))&
(1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
)!=0);
}
} }
/* Some control characters that are used as space. */
#define IS_THAT_CONTROL_SPACE(c) \
((c>=0x09 && c <= 0x0d) || (c>=0x1c && c <=0x1f) || c==0x85)
/* Checks if the Unicode character is a space character.*/ /* Checks if the Unicode character is a space character.*/
U_CAPI UBool U_EXPORT2 U_CAPI UBool U_EXPORT2
u_isspace(UChar32 c) { u_isspace(UChar32 c) {
uint32_t props; uint32_t props;
GET_PROPS(c, props); GET_PROPS(c, props);
return (UBool)(((1UL<<GET_CATEGORY(props))& return (UBool)((((1UL<<GET_CATEGORY(props))&
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR) (1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
)!=0); )!=0) || IS_THAT_CONTROL_SPACE(c));
} }
/* Checks if the Unicode character is a whitespace character.*/ /* Checks if the Unicode character is a whitespace character.*/
@ -574,27 +556,24 @@ U_CAPI UBool U_EXPORT2
u_isWhitespace(UChar32 c) { u_isWhitespace(UChar32 c) {
uint32_t props; uint32_t props;
GET_PROPS(c, props); GET_PROPS(c, props);
return (UBool)(((1UL<<GET_CATEGORY(props))& return (UBool)((((1UL<<GET_CATEGORY(props))&
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR) (1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
)!=0 && )!=0 &&
c!=0xa0 && c!=0x202f && c!=0xfeff); /* exclude no-break spaces */ c!=0xa0 && c!=0x202f && c!=0xfeff) || /* exclude no-break spaces */
IS_THAT_CONTROL_SPACE(c));
} }
/* Checks if the Unicode character is printable.*/ /* Checks if the Unicode character is printable.*/
U_CAPI UBool U_EXPORT2 U_CAPI UBool U_EXPORT2
u_isprint(UChar32 c) { u_isprint(UChar32 c) {
if(IS_ISO_8_CONTROL(c)) { uint32_t props;
return FALSE; GET_PROPS(c, props);
} else { return (UBool)(
uint32_t props; ((1UL<<GET_CATEGORY(props))&
GET_PROPS(c, props); ~(1UL<<U_UNASSIGNED|
return (UBool)( 1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
((1UL<<GET_CATEGORY(props))& 1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
~(1UL<<U_UNASSIGNED| )!=0);
1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
)!=0);
}
} }
/* Checks if the Unicode character can start a Unicode identifier.*/ /* Checks if the Unicode character can start a Unicode identifier.*/
@ -1091,9 +1070,6 @@ u_charCellWidth(UChar32 ch)
/* these Unicode character types are scattered throughout the Unicode range, so /* these Unicode character types are scattered throughout the Unicode range, so
special-case for them*/ special-case for them*/
if(IS_ISO_8_CONTROL(ch)) {
return U_ZERO_WIDTH;
}
switch (type) { switch (type) {
case U_UNASSIGNED: case U_UNASSIGNED:
case U_NON_SPACING_MARK: case U_NON_SPACING_MARK:

View File

@ -1299,15 +1299,8 @@ static uint8_t getCharCat(UChar32 cp) {
return U_NONCHARACTER_CODE_POINT; return U_NONCHARACTER_CODE_POINT;
} }
/* Undo ICU exceptions to the UCD when determining the if ((cat = u_charType(cp)) == U_SURROGATE) {
category. */ cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
if (u_iscntrl(cp)) {
cat = U_CONTROL_CHAR;
} else {
if ((cat = u_charType(cp)) == U_SURROGATE) {
cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
}
} }
return cat; return cat;

View File

@ -993,8 +993,6 @@ u_charCellWidth(UChar32 c);
* Returns a value indicating a character category. * Returns a value indicating a character category.
* The categories are taken from the Unicode Character Database (UCD) in * The categories are taken from the Unicode Character Database (UCD) in
* UnicodeData.txt. * UnicodeData.txt.
* ICU changes the category of some of the ISO control characters to various
* separators categories.
* *
* @param c the character to be tested * @param c the character to be tested
* @return a value of type int, the character category. * @return a value of type int, the character category.

View File

@ -495,29 +495,8 @@ unicodeDataLineFn(void *context,
} }
/* get general category, field 2 */ /* get general category, field 2 */
/* we override the general category of some control characters */ *fields[2][1]=0;
switch(c) { type = (int8_t)tagValues[MakeProp(fields[2][0])];
case 9:
case 0xb:
case 0x1f:
type = U_SPACE_SEPARATOR;
break;
case 0xc:
type = U_LINE_SEPARATOR;
break;
case 0xa:
case 0xd:
case 0x1c:
case 0x1d:
case 0x1e:
case 0x85:
type = U_PARAGRAPH_SEPARATOR;
break;
default:
*fields[2][1]=0;
type = (int8_t)tagValues[MakeProp(fields[2][0])];
break;
}
if(u_charType(c)!=type) { if(u_charType(c)!=type) {
log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type); log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
} }

View File

@ -209,7 +209,7 @@ void TransliteratorTest::TestInstantiation() {
rules, UTRANS_FORWARD, parseError,status); rules, UTRANS_FORWARD, parseError,status);
if (u == 0) { if (u == 0) {
errln(UnicodeString("FAIL: ") + id + errln(UnicodeString("FAIL: ") + id +
".toRules() => bad rules" + ".createFromRules() => bad rules" +
/*", parse error " + parseError.code +*/ /*", parse error " + parseError.code +*/
", line " + parseError.line + ", line " + parseError.line +
", offset " + parseError.offset + ", offset " + parseError.offset +
@ -2967,7 +2967,7 @@ void TransliteratorTest::TestAnchorMasking(){
Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status); Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
if(U_FAILURE(status)){ if(U_FAILURE(status)){
errln(UnicodeString("FAIL: ") + "ID" + errln(UnicodeString("FAIL: ") + "ID" +
".toRules() => bad rules" + ".createFromRules() => bad rules" +
/*", parse error " + parseError.code +*/ /*", parse error " + parseError.code +*/
", line " + parseError.line + ", line " + parseError.line +
", offset " + parseError.offset + ", offset " + parseError.offset +

View File

@ -342,29 +342,8 @@ UnicodeTest::unicodeDataLineFn(void *context,
} }
/* get general category, field 2 */ /* get general category, field 2 */
/* we override the general category of some control characters */ *fields[2][1]=0;
switch(c) { type = (int8_t)tagValues[me->MakeProp(fields[2][0])];
case 9:
case 0xb:
case 0x1f:
type = U_SPACE_SEPARATOR;
break;
case 0xc:
type = U_LINE_SEPARATOR;
break;
case 0xa:
case 0xd:
case 0x1c:
case 0x1d:
case 0x1e:
case 0x85:
type = U_PARAGRAPH_SEPARATOR;
break;
default:
*fields[2][1]=0;
type = (int8_t)tagValues[me->MakeProp(fields[2][0])];
break;
}
if(Unicode::getType(c)!=type) { if(Unicode::getType(c)!=type) {
me->errln("error: Unicode::getType(U+%04lx)==%u instead of %u\n", c, Unicode::getType(c), type); me->errln("error: Unicode::getType(U+%04lx)==%u instead of %u\n", c, Unicode::getType(c), type);
*pErrorCode = U_PARSE_ERROR; *pErrorCode = U_PARSE_ERROR;

View File

@ -525,23 +525,6 @@ bidiNames[U_CHAR_DIRECTION_COUNT]={
"WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN" "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
}; };
/* control code properties */
static const struct {
uint32_t code;
uint8_t generalCategory;
} controlProps[]={
/* TAB */ {0x9, U_SPACE_SEPARATOR},
/* VT */ {0xb, U_SPACE_SEPARATOR},
/* LF */ {0xa, U_PARAGRAPH_SEPARATOR},
/* FF */ {0xc, U_LINE_SEPARATOR},
/* CR */ {0xd, U_PARAGRAPH_SEPARATOR},
/* FS */ {0x1c, U_PARAGRAPH_SEPARATOR},
/* GS */ {0x1d, U_PARAGRAPH_SEPARATOR},
/* RS */ {0x1e, U_PARAGRAPH_SEPARATOR},
/* US */ {0x1f, U_SPACE_SEPARATOR},
/* NL */ {0x85, U_PARAGRAPH_SEPARATOR}
};
static struct { static struct {
uint32_t first, last, props; uint32_t first, last, props;
char name[80]; char name[80];
@ -714,15 +697,6 @@ unicodeDataLineFn(void *context,
} }
p.titleCase=value; p.titleCase=value;
/* override properties for some common control characters */
if(p.generalCategory==U_CONTROL_CHAR) {
for(i=0; i<sizeof(controlProps)/sizeof(controlProps[0]); ++i) {
if(controlProps[i].code==p.code) {
p.generalCategory=controlProps[i].generalCategory;
}
}
}
/* set additional properties from previously parsed files */ /* set additional properties from previously parsed files */
if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) { if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
p.mirrorMapping=mirrorMappings[mirrorIndex++][1]; p.mirrorMapping=mirrorMappings[mirrorIndex++][1];