ICU-1709 do not tweak the UCD data to mark some control characters as spacing ones.

X-SVN-Rev: 7749
2002-02-24 21:00:19 +00:00 · 2002-02-24 21:00:19 +00:00 · 504a75e931
commit 504a75e931
parent 3699fd2f22
7 changed files with 46 additions and 147 deletions
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@@ -30,17 +30,11 @@
 #include "utrie.h"
 #include "ustr_imp.h"

-/*
- * Since genprops overrides the general category for some control codes,
- * we need to hardcode ISO 8 controls for u_iscntrl(), u_isprint(), etc.
- */
-#define IS_ISO_8_CONTROL(c) ((uint32_t)(c)<0x20 || (uint32_t)((c)-0x7f)<=0x20)
-
 /* dynamically loaded Unicode character properties -------------------------- */

 /* fallback properties for the ASCII range if the data cannot be loaded */
 /* these are printed by genprops in verbose mode */
-static const uint32_t staticProps32Table[]={
+static uint32_t staticProps32Table[0xa0]={
    /* 0x00 */ 0x48f,
    /* 0x01 */ 0x48f,
    /* 0x02 */ 0x48f,
@@ -50,11 +44,11 @@ static const uint32_t staticProps32Table[]={
    /* 0x06 */ 0x48f,
    /* 0x07 */ 0x48f,
    /* 0x08 */ 0x48f,
-    /* 0x09 */ 0x20c,
-    /* 0x0a */ 0x1ce,
-    /* 0x0b */ 0x20c,
-    /* 0x0c */ 0x24d,
-    /* 0x0d */ 0x1ce,
+    /* 0x09 */ 0x20f,
+    /* 0x0a */ 0x1cf,
+    /* 0x0b */ 0x20f,
+    /* 0x0c */ 0x24f,
+    /* 0x0d */ 0x1cf,
    /* 0x0e */ 0x48f,
    /* 0x0f */ 0x48f,
    /* 0x10 */ 0x48f,
@@ -69,10 +63,10 @@ static const uint32_t staticProps32Table[]={
    /* 0x19 */ 0x48f,
    /* 0x1a */ 0x48f,
    /* 0x1b */ 0x48f,
-    /* 0x1c */ 0x1ce,
-    /* 0x1d */ 0x1ce,
-    /* 0x1e */ 0x1ce,
-    /* 0x1f */ 0x20c,
+    /* 0x1c */ 0x1cf,
+    /* 0x1d */ 0x1cf,
+    /* 0x1e */ 0x1cf,
+    /* 0x1f */ 0x20f,
    /* 0x20 */ 0x24c,
    /* 0x21 */ 0x297,
    /* 0x22 */ 0x297,
@@ -114,8 +108,8 @@ static const uint32_t staticProps32Table[]={
    /* 0x46 */ 0x2000001,
    /* 0x47 */ 0x2000001,
    /* 0x48 */ 0x2000001,
-    /* 0x49 */ 0x2000001,
-    /* 0x4a */ 0x2000001,
+    /* 0x49 */ 0x1, /* has exception */
+    /* 0x4a */ 0x300001, /* has exception */
    /* 0x4b */ 0x2000001,
    /* 0x4c */ 0x2000001,
    /* 0x4d */ 0x2000001,
@@ -146,7 +140,7 @@ static const uint32_t staticProps32Table[]={
    /* 0x66 */ 0x2000002,
    /* 0x67 */ 0x2000002,
    /* 0x68 */ 0x2000002,
-    /* 0x69 */ 0x2000002,
+    /* 0x69 */ 0x600002, /* has exception */
    /* 0x6a */ 0x2000002,
    /* 0x6b */ 0x2000002,
    /* 0x6c */ 0x2000002,
@@ -174,7 +168,7 @@ static const uint32_t staticProps32Table[]={
    /* 0x82 */ 0x48f,
    /* 0x83 */ 0x48f,
    /* 0x84 */ 0x48f,
-    /* 0x85 */ 0x1ce,
+    /* 0x85 */ 0x1cf,
    /* 0x86 */ 0x48f,
    /* 0x87 */ 0x48f,
    /* 0x88 */ 0x48f,
@@ -200,7 +194,7 @@ static const uint32_t staticProps32Table[]={
    /* 0x9c */ 0x48f,
    /* 0x9d */ 0x48f,
    /* 0x9e */ 0x48f,
-    /* 0x9f */ 0x48f
+    /* 0x9f */ 0x48f,
 };

 /*
@@ -424,18 +418,6 @@ u_charType(UChar32 c) {
    return (int8_t)GET_CATEGORY(props);
 }

-/* Gets the Unicode character's general category, as per the UCD.*/
-U_CAPI int8_t U_EXPORT2
-u_charUCDType(UChar32 c) {
-    if (IS_ISO_8_CONTROL(c)) {
-        return U_CONTROL_CHAR;
-    } else {
-        uint32_t props;
-        GET_PROPS(c, props);
-        return (int8_t)GET_CATEGORY(props);
-    }
-}
-
 /* Enumerate all code points with their general categories. */
 struct _EnumTypeCallback {
    UCharEnumTypeRange *enumRange;
@@ -547,26 +529,26 @@ u_isbase(UChar32 c) {
 /* Checks if the Unicode character is a control character.*/
 U_CAPI UBool U_EXPORT2
 u_iscntrl(UChar32 c) {
-    if(IS_ISO_8_CONTROL(c)) {
-        return TRUE;
-    } else {
-        uint32_t props;
-        GET_PROPS(c, props);
-        return (UBool)(
-               ((1UL<<GET_CATEGORY(props))&
-                (1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
-               )!=0);
-    }
+    uint32_t props;
+    GET_PROPS(c, props);
+    return (UBool)(
+           ((1UL<<GET_CATEGORY(props))&
+            (1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
+           )!=0);
 }

+/* Some control characters that are used as space. */
+#define IS_THAT_CONTROL_SPACE(c) \
+    ((c>=0x09 && c <= 0x0d) || (c>=0x1c && c <=0x1f) || c==0x85)
+
 /* Checks if the Unicode character is a space character.*/
 U_CAPI UBool U_EXPORT2
 u_isspace(UChar32 c) {
    uint32_t props;
    GET_PROPS(c, props);
-    return (UBool)(((1UL<<GET_CATEGORY(props))&
+    return (UBool)((((1UL<<GET_CATEGORY(props))&
            (1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
-           )!=0);
+           )!=0) || IS_THAT_CONTROL_SPACE(c));
 }

 /* Checks if the Unicode character is a whitespace character.*/
@@ -574,27 +556,24 @@ U_CAPI UBool U_EXPORT2
 u_isWhitespace(UChar32 c) {
    uint32_t props;
    GET_PROPS(c, props);
-    return (UBool)(((1UL<<GET_CATEGORY(props))&
+    return (UBool)((((1UL<<GET_CATEGORY(props))&
            (1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
           )!=0 &&
-           c!=0xa0 && c!=0x202f && c!=0xfeff); /* exclude no-break spaces */
+           c!=0xa0 && c!=0x202f && c!=0xfeff) || /* exclude no-break spaces */
+           IS_THAT_CONTROL_SPACE(c));
 }

 /* Checks if the Unicode character is printable.*/
 U_CAPI UBool U_EXPORT2
 u_isprint(UChar32 c) {
-    if(IS_ISO_8_CONTROL(c)) {
-        return FALSE;
-    } else {
-        uint32_t props;
-        GET_PROPS(c, props);
-        return (UBool)(
-                ((1UL<<GET_CATEGORY(props))&
-                ~(1UL<<U_UNASSIGNED|
-                  1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
-                  1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
-               )!=0);
-    }
+    uint32_t props;
+    GET_PROPS(c, props);
+    return (UBool)(
+            ((1UL<<GET_CATEGORY(props))&
+            ~(1UL<<U_UNASSIGNED|
+              1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
+              1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
+           )!=0);
 }

 /* Checks if the Unicode character can start a Unicode identifier.*/
@@ -1091,9 +1070,6 @@ u_charCellWidth(UChar32 ch)

    /* these Unicode character types are scattered throughout the Unicode range, so
     special-case for them*/
-    if(IS_ISO_8_CONTROL(ch)) {
-        return U_ZERO_WIDTH;
-    }
    switch (type) {
        case U_UNASSIGNED:
        case U_NON_SPACING_MARK:
--- a/icu4c/source/common/unames.c
+++ b/icu4c/source/common/unames.c
@@ -1299,15 +1299,8 @@ static uint8_t getCharCat(UChar32 cp) {
        return U_NONCHARACTER_CODE_POINT;
    }

-    /* Undo ICU exceptions to the UCD when determining the
-       category. */
-
-    if (u_iscntrl(cp)) {
-        cat = U_CONTROL_CHAR;
-    } else {
-        if ((cat = u_charType(cp)) == U_SURROGATE) {
-            cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
-        }
+    if ((cat = u_charType(cp)) == U_SURROGATE) {
+        cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
    }

    return cat;
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@@ -993,8 +993,6 @@ u_charCellWidth(UChar32 c);
 * Returns a value indicating a character category.
 * The categories are taken from the Unicode Character Database (UCD) in
 * UnicodeData.txt.
- * ICU changes the category of some of the ISO control characters to various
- * separators categories.
 *
 * @param c            the character to be tested
 * @return a value of type int, the character category.
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@@ -495,29 +495,8 @@ unicodeDataLineFn(void *context,
    }

    /* get general category, field 2 */
-    /* we override the general category of some control characters */
-    switch(c) {
-    case 9:
-    case 0xb:
-    case 0x1f:
-        type = U_SPACE_SEPARATOR;
-        break;
-    case 0xc:
-        type = U_LINE_SEPARATOR;
-        break;
-    case 0xa:
-    case 0xd:
-    case 0x1c:
-    case 0x1d:
-    case 0x1e:
-    case 0x85:
-        type = U_PARAGRAPH_SEPARATOR;
-        break;
-    default:
-        *fields[2][1]=0;
-        type = (int8_t)tagValues[MakeProp(fields[2][0])];
-        break;
-    }
+    *fields[2][1]=0;
+    type = (int8_t)tagValues[MakeProp(fields[2][0])];
    if(u_charType(c)!=type) {
        log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
    }
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@@ -209,7 +209,7 @@ void TransliteratorTest::TestInstantiation() {
                                    rules, UTRANS_FORWARD, parseError,status);
            if (u == 0) {
                errln(UnicodeString("FAIL: ") + id +
-                      ".toRules() => bad rules" +
+                      ".createFromRules() => bad rules" +
                      /*", parse error " + parseError.code +*/
                      ", line " + parseError.line +
                      ", offset " + parseError.offset +
@@ -2967,7 +2967,7 @@ void TransliteratorTest::TestAnchorMasking(){
    Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
    if(U_FAILURE(status)){
        errln(UnicodeString("FAIL: ") + "ID" +
-              ".toRules() => bad rules" +
+              ".createFromRules() => bad rules" +
              /*", parse error " + parseError.code +*/
              ", line " + parseError.line +
              ", offset " + parseError.offset +
--- a/icu4c/source/test/intltest/ucdtest.cpp
+++ b/icu4c/source/test/intltest/ucdtest.cpp
@@ -342,29 +342,8 @@ UnicodeTest::unicodeDataLineFn(void *context,
    }

    /* get general category, field 2 */
-    /* we override the general category of some control characters */
-    switch(c) {
-    case 9:
-    case 0xb:
-    case 0x1f:
-        type = U_SPACE_SEPARATOR;
-        break;
-    case 0xc:
-        type = U_LINE_SEPARATOR;
-        break;
-    case 0xa:
-    case 0xd:
-    case 0x1c:
-    case 0x1d:
-    case 0x1e:
-    case 0x85:
-        type = U_PARAGRAPH_SEPARATOR;
-        break;
-    default:
-        *fields[2][1]=0;
-        type = (int8_t)tagValues[me->MakeProp(fields[2][0])];
-        break;
-    }
+    *fields[2][1]=0;
+    type = (int8_t)tagValues[me->MakeProp(fields[2][0])];
    if(Unicode::getType(c)!=type) {
        me->errln("error: Unicode::getType(U+%04lx)==%u instead of %u\n", c, Unicode::getType(c), type);
        *pErrorCode = U_PARSE_ERROR;
--- a/icu4c/source/tools/genprops/genprops.c
+++ b/icu4c/source/tools/genprops/genprops.c
@@ -525,23 +525,6 @@ bidiNames[U_CHAR_DIRECTION_COUNT]={
    "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
 };

-/* control code properties */
-static const struct {
-    uint32_t code;
-    uint8_t generalCategory;
-} controlProps[]={
-    /* TAB */   {0x9, U_SPACE_SEPARATOR},
-    /* VT */    {0xb, U_SPACE_SEPARATOR},
-    /* LF */    {0xa, U_PARAGRAPH_SEPARATOR},
-    /* FF */    {0xc, U_LINE_SEPARATOR},
-    /* CR */    {0xd, U_PARAGRAPH_SEPARATOR},
-    /* FS */    {0x1c, U_PARAGRAPH_SEPARATOR},
-    /* GS */    {0x1d, U_PARAGRAPH_SEPARATOR},
-    /* RS */    {0x1e, U_PARAGRAPH_SEPARATOR},
-    /* US */    {0x1f, U_SPACE_SEPARATOR},
-    /* NL */    {0x85, U_PARAGRAPH_SEPARATOR}
-};
-
 static struct {
    uint32_t first, last, props;
    char name[80];
@@ -714,15 +697,6 @@ unicodeDataLineFn(void *context,
    }
    p.titleCase=value;

-    /* override properties for some common control characters */
-    if(p.generalCategory==U_CONTROL_CHAR) {
-        for(i=0; i<sizeof(controlProps)/sizeof(controlProps[0]); ++i) {
-            if(controlProps[i].code==p.code) {
-                p.generalCategory=controlProps[i].generalCategory;
-            }
-        }
-    }
-
    /* set additional properties from previously parsed files */
    if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
        p.mirrorMapping=mirrorMappings[mirrorIndex++][1];