ICU-1709 do not tweak the UCD data to mark some control characters as spacing ones.

X-SVN-Rev: 7749
2002-02-24 21:00:19 +00:00 · 2002-02-24 21:00:19 +00:00 · 504a75e931
commit 504a75e931
parent 3699fd2f22
7 changed files with 46 additions and 147 deletions
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@ -30,17 +30,11 @@
 #include "utrie.h"
 #include "ustr_imp.h"
 /*
 * Since genprops overrides the general category for some control codes,
 * we need to hardcode ISO 8 controls for u_iscntrl(), u_isprint(), etc.
 */
 #define IS_ISO_8_CONTROL(c) ((uint32_t)(c)<0x20 || (uint32_t)((c)-0x7f)<=0x20)
 /* dynamically loaded Unicode character properties -------------------------- */
 /* fallback properties for the ASCII range if the data cannot be loaded */
 /* these are printed by genprops in verbose mode */
-static const uint32_t staticProps32Table[]={
+static uint32_t staticProps32Table[0xa0]={
    /* 0x00 */ 0x48f,
    /* 0x01 */ 0x48f,
    /* 0x02 */ 0x48f,
@ -50,11 +44,11 @@ static const uint32_t staticProps32Table[]={
    /* 0x06 */ 0x48f,
    /* 0x07 */ 0x48f,
    /* 0x08 */ 0x48f,
-    /* 0x09 */ 0x20c,
+    /* 0x09 */ 0x20f,
-    /* 0x0a */ 0x1ce,
+    /* 0x0a */ 0x1cf,
-    /* 0x0b */ 0x20c,
+    /* 0x0b */ 0x20f,
-    /* 0x0c */ 0x24d,
+    /* 0x0c */ 0x24f,
-    /* 0x0d */ 0x1ce,
+    /* 0x0d */ 0x1cf,
    /* 0x0e */ 0x48f,
    /* 0x0f */ 0x48f,
    /* 0x10 */ 0x48f,
@ -69,10 +63,10 @@ static const uint32_t staticProps32Table[]={
    /* 0x19 */ 0x48f,
    /* 0x1a */ 0x48f,
    /* 0x1b */ 0x48f,
-    /* 0x1c */ 0x1ce,
+    /* 0x1c */ 0x1cf,
-    /* 0x1d */ 0x1ce,
+    /* 0x1d */ 0x1cf,
-    /* 0x1e */ 0x1ce,
+    /* 0x1e */ 0x1cf,
-    /* 0x1f */ 0x20c,
+    /* 0x1f */ 0x20f,
    /* 0x20 */ 0x24c,
    /* 0x21 */ 0x297,
    /* 0x22 */ 0x297,
@ -114,8 +108,8 @@ static const uint32_t staticProps32Table[]={
    /* 0x46 */ 0x2000001,
    /* 0x47 */ 0x2000001,
    /* 0x48 */ 0x2000001,
-    /* 0x49 */ 0x2000001,
+    /* 0x49 */ 0x1, /* has exception */
-    /* 0x4a */ 0x2000001,
+    /* 0x4a */ 0x300001, /* has exception */
    /* 0x4b */ 0x2000001,
    /* 0x4c */ 0x2000001,
    /* 0x4d */ 0x2000001,
@ -146,7 +140,7 @@ static const uint32_t staticProps32Table[]={
    /* 0x66 */ 0x2000002,
    /* 0x67 */ 0x2000002,
    /* 0x68 */ 0x2000002,
-    /* 0x69 */ 0x2000002,
+    /* 0x69 */ 0x600002, /* has exception */
    /* 0x6a */ 0x2000002,
    /* 0x6b */ 0x2000002,
    /* 0x6c */ 0x2000002,
@ -174,7 +168,7 @@ static const uint32_t staticProps32Table[]={
    /* 0x82 */ 0x48f,
    /* 0x83 */ 0x48f,
    /* 0x84 */ 0x48f,
-    /* 0x85 */ 0x1ce,
+    /* 0x85 */ 0x1cf,
    /* 0x86 */ 0x48f,
    /* 0x87 */ 0x48f,
    /* 0x88 */ 0x48f,
@ -200,7 +194,7 @@ static const uint32_t staticProps32Table[]={
    /* 0x9c */ 0x48f,
    /* 0x9d */ 0x48f,
    /* 0x9e */ 0x48f,
-    /* 0x9f */ 0x48f
+    /* 0x9f */ 0x48f,
 };
 /*
@ -424,18 +418,6 @@ u_charType(UChar32 c) {
    return (int8_t)GET_CATEGORY(props);
 }
 /* Gets the Unicode character's general category, as per the UCD.*/
 U_CAPI int8_t U_EXPORT2
 u_charUCDType(UChar32 c) {
    if (IS_ISO_8_CONTROL(c)) {
        return U_CONTROL_CHAR;
    } else {
        uint32_t props;
        GET_PROPS(c, props);
        return (int8_t)GET_CATEGORY(props);
    }
 }
 /* Enumerate all code points with their general categories. */
 struct _EnumTypeCallback {
    UCharEnumTypeRange *enumRange;
@ -547,26 +529,26 @@ u_isbase(UChar32 c) {
 /* Checks if the Unicode character is a control character.*/
 U_CAPI UBool U_EXPORT2
 u_iscntrl(UChar32 c) {
-    if(IS_ISO_8_CONTROL(c)) {
+    uint32_t props;
-        return TRUE;
+    GET_PROPS(c, props);
-    } else {
+    return (UBool)(
-        uint32_t props;
+           ((1UL<<GET_CATEGORY(props))&
-        GET_PROPS(c, props);
+            (1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
-        return (UBool)(
+           )!=0);
               ((1UL<<GET_CATEGORY(props))&
                (1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
               )!=0);
    }
 }
 /* Some control characters that are used as space. */
 #define IS_THAT_CONTROL_SPACE(c) \
    ((c>=0x09 && c <= 0x0d) || (c>=0x1c && c <=0x1f) || c==0x85)
 /* Checks if the Unicode character is a space character.*/
 U_CAPI UBool U_EXPORT2
 u_isspace(UChar32 c) {
    uint32_t props;
    GET_PROPS(c, props);
-    return (UBool)(((1UL<<GET_CATEGORY(props))&
+    return (UBool)((((1UL<<GET_CATEGORY(props))&
            (1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
-           )!=0);
+           )!=0) || IS_THAT_CONTROL_SPACE(c));
 }
 /* Checks if the Unicode character is a whitespace character.*/
@ -574,27 +556,24 @@ U_CAPI UBool U_EXPORT2
 u_isWhitespace(UChar32 c) {
    uint32_t props;
    GET_PROPS(c, props);
-    return (UBool)(((1UL<<GET_CATEGORY(props))&
+    return (UBool)((((1UL<<GET_CATEGORY(props))&
            (1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
           )!=0 &&
-           c!=0xa0 && c!=0x202f && c!=0xfeff); /* exclude no-break spaces */
+           c!=0xa0 && c!=0x202f && c!=0xfeff) || /* exclude no-break spaces */
           IS_THAT_CONTROL_SPACE(c));
 }
 /* Checks if the Unicode character is printable.*/
 U_CAPI UBool U_EXPORT2
 u_isprint(UChar32 c) {
-    if(IS_ISO_8_CONTROL(c)) {
+    uint32_t props;
-        return FALSE;
+    GET_PROPS(c, props);
-    } else {
+    return (UBool)(
-        uint32_t props;
+            ((1UL<<GET_CATEGORY(props))&
-        GET_PROPS(c, props);
+            ~(1UL<<U_UNASSIGNED|
-        return (UBool)(
+              1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
-                ((1UL<<GET_CATEGORY(props))&
+              1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
-                ~(1UL<<U_UNASSIGNED|
+           )!=0);
                  1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
                  1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
               )!=0);
    }
 }
 /* Checks if the Unicode character can start a Unicode identifier.*/
@ -1091,9 +1070,6 @@ u_charCellWidth(UChar32 ch)
    /* these Unicode character types are scattered throughout the Unicode range, so
     special-case for them*/
    if(IS_ISO_8_CONTROL(ch)) {
        return U_ZERO_WIDTH;
    }
    switch (type) {
        case U_UNASSIGNED:
        case U_NON_SPACING_MARK:
--- a/icu4c/source/common/unames.c
+++ b/icu4c/source/common/unames.c
@ -1299,15 +1299,8 @@ static uint8_t getCharCat(UChar32 cp) {
        return U_NONCHARACTER_CODE_POINT;
    }
-    /* Undo ICU exceptions to the UCD when determining the
+    if ((cat = u_charType(cp)) == U_SURROGATE) {
-       category. */
+        cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
    if (u_iscntrl(cp)) {
        cat = U_CONTROL_CHAR;
    } else {
        if ((cat = u_charType(cp)) == U_SURROGATE) {
            cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
        }
    }
    return cat;
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@ -993,8 +993,6 @@ u_charCellWidth(UChar32 c);
 * Returns a value indicating a character category.
 * The categories are taken from the Unicode Character Database (UCD) in
 * UnicodeData.txt.
 * ICU changes the category of some of the ISO control characters to various
 * separators categories.
 *
 * @param c            the character to be tested
 * @return a value of type int, the character category.
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@ -495,29 +495,8 @@ unicodeDataLineFn(void *context,
    }
    /* get general category, field 2 */
-    /* we override the general category of some control characters */
+    *fields[2][1]=0;
-    switch(c) {
+    type = (int8_t)tagValues[MakeProp(fields[2][0])];
    case 9:
    case 0xb:
    case 0x1f:
        type = U_SPACE_SEPARATOR;
        break;
    case 0xc:
        type = U_LINE_SEPARATOR;
        break;
    case 0xa:
    case 0xd:
    case 0x1c:
    case 0x1d:
    case 0x1e:
    case 0x85:
        type = U_PARAGRAPH_SEPARATOR;
        break;
    default:
        *fields[2][1]=0;
        type = (int8_t)tagValues[MakeProp(fields[2][0])];
        break;
    }
    if(u_charType(c)!=type) {
        log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
    }
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@ -209,7 +209,7 @@ void TransliteratorTest::TestInstantiation() {
                                    rules, UTRANS_FORWARD, parseError,status);
            if (u == 0) {
                errln(UnicodeString("FAIL: ") + id +
-                      ".toRules() => bad rules" +
+                      ".createFromRules() => bad rules" +
                      /*", parse error " + parseError.code +*/
                      ", line " + parseError.line +
                      ", offset " + parseError.offset +
@ -2967,7 +2967,7 @@ void TransliteratorTest::TestAnchorMasking(){
    Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
    if(U_FAILURE(status)){
        errln(UnicodeString("FAIL: ") + "ID" +
-              ".toRules() => bad rules" +
+              ".createFromRules() => bad rules" +
              /*", parse error " + parseError.code +*/
              ", line " + parseError.line +
              ", offset " + parseError.offset +
--- a/icu4c/source/test/intltest/ucdtest.cpp
+++ b/icu4c/source/test/intltest/ucdtest.cpp
@ -342,29 +342,8 @@ UnicodeTest::unicodeDataLineFn(void *context,
    }
    /* get general category, field 2 */
-    /* we override the general category of some control characters */
+    *fields[2][1]=0;
-    switch(c) {
+    type = (int8_t)tagValues[me->MakeProp(fields[2][0])];
    case 9:
    case 0xb:
    case 0x1f:
        type = U_SPACE_SEPARATOR;
        break;
    case 0xc:
        type = U_LINE_SEPARATOR;
        break;
    case 0xa:
    case 0xd:
    case 0x1c:
    case 0x1d:
    case 0x1e:
    case 0x85:
        type = U_PARAGRAPH_SEPARATOR;
        break;
    default:
        *fields[2][1]=0;
        type = (int8_t)tagValues[me->MakeProp(fields[2][0])];
        break;
    }
    if(Unicode::getType(c)!=type) {
        me->errln("error: Unicode::getType(U+%04lx)==%u instead of %u\n", c, Unicode::getType(c), type);
        *pErrorCode = U_PARSE_ERROR;
--- a/icu4c/source/tools/genprops/genprops.c
+++ b/icu4c/source/tools/genprops/genprops.c
@ -525,23 +525,6 @@ bidiNames[U_CHAR_DIRECTION_COUNT]={
    "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
 };
 /* control code properties */
 static const struct {
    uint32_t code;
    uint8_t generalCategory;
 } controlProps[]={
    /* TAB */   {0x9, U_SPACE_SEPARATOR},
    /* VT */    {0xb, U_SPACE_SEPARATOR},
    /* LF */    {0xa, U_PARAGRAPH_SEPARATOR},
    /* FF */    {0xc, U_LINE_SEPARATOR},
    /* CR */    {0xd, U_PARAGRAPH_SEPARATOR},
    /* FS */    {0x1c, U_PARAGRAPH_SEPARATOR},
    /* GS */    {0x1d, U_PARAGRAPH_SEPARATOR},
    /* RS */    {0x1e, U_PARAGRAPH_SEPARATOR},
    /* US */    {0x1f, U_SPACE_SEPARATOR},
    /* NL */    {0x85, U_PARAGRAPH_SEPARATOR}
 };
 static struct {
    uint32_t first, last, props;
    char name[80];
@ -714,15 +697,6 @@ unicodeDataLineFn(void *context,
    }
    p.titleCase=value;
    /* override properties for some common control characters */
    if(p.generalCategory==U_CONTROL_CHAR) {
        for(i=0; i<sizeof(controlProps)/sizeof(controlProps[0]); ++i) {
            if(controlProps[i].code==p.code) {
                p.generalCategory=controlProps[i].generalCategory;
            }
        }
    }
    /* set additional properties from previously parsed files */
    if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
        p.mirrorMapping=mirrorMappings[mirrorIndex++][1];