ICU-1970 add ISO_Comment

X-SVN-Rev: 9046
2002-07-08 16:31:25 +00:00 · 2002-07-08 16:31:25 +00:00 · da30a01ee9
commit da30a01ee9
parent 98dbc49f16
2 changed files with 117 additions and 49 deletions
--- a/icu4c/source/common/unames.c
+++ b/icu4c/source/common/unames.c
@ -215,6 +215,31 @@ u_charName(UChar32 code, UCharNameChoice nameChoice,
    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
 }

+#define _U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT /* internal nameChoice value that selects the ISO comment in getName()/expandName() */
+
+/* Fetch the ISO 10646 comment stored for c into dest; the result of u_terminateChars() is returned. */
+U_CAPI int32_t U_EXPORT2
+u_getISOComment(UChar32 c,
+                char *dest, int32_t destCapacity,
+                UErrorCode *pErrorCode) {
+    int32_t length;
+
+    /* check the argument values */
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
+        return u_terminateChars(dest, destCapacity, 0, pErrorCode); /* out of range or no data: length 0 */
+    }
+    /* stored like a normal character name; clamp: a bare (uint16_t) cast wraps for capacities >0xffff */
+    length=getName(uCharNames, (uint32_t)c, _U_ISO_COMMENT, dest,
+                   (uint16_t)(destCapacity>0xffff ? 0xffff : destCapacity));
+    return u_terminateChars(dest, destCapacity, length, pErrorCode);
+}
+
 U_CAPI UChar32 U_EXPORT2
 u_charFromName(UCharNameChoice nameChoice,
               const char *name,
@ -596,7 +621,7 @@ expandName(UCharNames *names,
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;

-    if(nameChoice==U_UNICODE_10_CHAR_NAME) {
+    if(nameChoice==U_UNICODE_10_CHAR_NAME || nameChoice==_U_ISO_COMMENT) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
@ -608,6 +633,15 @@ expandName(UCharNames *names,
                    break;
                }
            }
+            if(nameChoice==_U_ISO_COMMENT) {
+                /* skip the Unicode 1.0 name as well to get the ISO comment */
+                while(nameLength>0) {
+                    --nameLength;
+                    if(*name++==';') {
+                        break;
+                    }
+                }
+            }
        } else {
            /*
             * the semicolon byte value is a token number, therefore
--- a/icu4c/source/tools/gennames/gennames.c
+++ b/icu4c/source/tools/gennames/gennames.c
@ -16,7 +16,7 @@
 *   This program reads the Unicode character database text file,
 *   parses it, and extracts the character code,
 *   the "modern" character name, and optionally the
-*   Unicode 1.0 character name.
+*   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
 *   It then tokenizes and compresses the names and builds
 *   compact binary tables for random-access lookup
 *   in a u_charName() API function.
@ -53,6 +53,9 @@
 *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
 *           append zero-terminated tokenString;
 *
+*    Different strings for a code point - normal name, 1.0 name, and ISO comment -
+*    are separated by ';'.
+*
 * uint16_t groupCount;
 * struct {
 *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
@ -242,7 +245,7 @@ static void
 countWord(Word *word);

 static void
-addLine(uint32_t code, char *name1, int16_t name1Length, char *name2, int16_t name2Length);
+addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);

 static void
 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
@ -257,7 +260,7 @@ static void
 appendLineLengthNibble(uint8_t nibble);

 static uint8_t *
-allocLine(uint32_t length);
+allocLine(int32_t length);

 static uint8_t *
 allocWord(uint32_t length);
@ -284,7 +287,7 @@ main(int argc, char* argv[]) {

    /* preset then read command line options */
    options[5].value=u_getDataDirectory();
-    options[6].value="3.1.1";
+    options[6].value="3.2";
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);

    /* error handling, printing usage message */
@ -355,10 +358,10 @@ static void
 lineFn(void *context,
       char *fields[][2], int32_t fieldCount,
       UErrorCode *pErrorCode) {
+    char *names[3];
+    int16_t lengths[3];
    static uint32_t prevCode=0;
    uint32_t code=0;
-    char *name1Start, *name2Start;
-    int16_t name1Length, name2Length;

    if(U_FAILURE(*pErrorCode)) {
        return;
@ -367,25 +370,32 @@ lineFn(void *context,
    code=uprv_strtoul(fields[0][0], NULL, 16);

    /* get the character name */
-    name1Start=fields[1][0];
+    names[0]=fields[1][0];
    if(fields[1][0][0]!='<') {
-        name1Length=(int16_t)(fields[1][1]-name1Start);
+        lengths[0]=(int16_t)(fields[1][1]-names[0]);
    } else {
        /* do not store pseudo-names in <> brackets */
-        name1Length=0;
+        lengths[0]=0;
    }

    /* store 1.0 names */
    /* get the second character name, the one from Unicode 1.0 */
    /* do not store pseudo-names in <> brackets */
-    name2Start=fields[10][0];
+    names[1]=fields[10][0];
    if(*(UBool *)context && fields[10][0][0]!='<') {
-        name2Length=(int16_t)(fields[10][1]-name2Start);
+        lengths[1]=(int16_t)(fields[10][1]-names[1]);
    } else {
-        name2Length=0;
+        lengths[1]=0;
    }

-    if(name1Length+name2Length==0) {
+    /* get the ISO 10646 comment */
+    names[2]=fields[11][0];
+    lengths[2]=(int16_t)(fields[11][1]-names[2]);
+    if(lengths[2]!=0) {
+        char *s=names[2];
+    }
+
+    if(lengths[0]+lengths[1]+lengths[2]==0) {
        return;
    }

@ -406,22 +416,27 @@ lineFn(void *context,
    }
    prevCode=code;

-    /* printf("%lx:%.*s(%.*s)\n", code, name1Length, line+name1Start, name2Length, line+name2Start); */
+    parseName(names[0], lengths[0]);
+    parseName(names[1], lengths[1]);
+    parseName(names[2], lengths[2]);

-    parseName(name1Start, name1Length);
-    parseName(name2Start, name2Length);
-
-    addLine(code, name1Start, name1Length, name2Start, name2Length);
+    /*
+     * set the count argument to
+     * 1: only store regular names
+     * 2: store regular and 1.0 names
+     * 3: store names and ISO 10646 comment
+     */
+    addLine(code, names, lengths, 3);
 }

 static void
 parseDB(const char *filename, UBool store10Names) {
-    char *fields[11][2];
+    char *fields[15][2];
    UErrorCode errorCode=U_ZERO_ERROR;

-    /* parsing the 11 fields 0..10 is enough for gennames */
-    u_parseDelimitedFile(filename, ';', fields, 11, lineFn, &store10Names, &errorCode);
+    u_parseDelimitedFile(filename, ';', fields, 15, lineFn, &store10Names, &errorCode);
    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
        exit(errorCode);
    }

@ -482,17 +497,23 @@ parseName(char *name, int16_t length) {
    }
 }

+/* Returns nonzero if c is a letter A-Z or a-z or a digit 0-9, i.e., a character that is part of a word. */
+static U_INLINE UBool
+isWordChar(char c) {
+    /* each alphabet is tested as three separate ranges: EBCDIC-safe check for letters */
+    return ('A'<=c && c<='I') ||
+           ('J'<=c && c<='R') ||
+           ('S'<=c && c<='Z') ||
+           ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
+           ('j'<=c && c<='r') ||
+           ('s'<=c && c<='z') ||
+           ('0'<=c && c<='9');
+}
+
 static int16_t
 skipNoise(char *line, int16_t start, int16_t limit) {
-    char c;
-
    /* skip anything that is not part of a word in this sense */
-    while(start<limit &&
-          !(('A'<=(c=line[start]) && c<='I') || /* EBCDIC-safe check for letters */
-            ('J'<=c && c<='R') ||
-            ('S'<=c && c<='Z') ||
-            ('0'<=c && c<='9'))
-    ) {
+    while(start<limit && !isWordChar(line[start])) {
        ++start;
    }

@ -504,17 +525,12 @@ getWord(char *line, int16_t start, int16_t limit) {
    char c=0; /* initialize to avoid a compiler warning although the code was safe */

    /* a unicode character name word consists of A-Z0-9 */
-    while(start<limit &&
-          (('A'<=(c=line[start]) && c<='I') || /* EBCDIC-safe check for letters */
-           ('J'<=c && c<='R') ||
-           ('S'<=c && c<='Z') ||
-           ('0'<=c && c<='9'))
-    ) {
+    while(start<limit && isWordChar(line[start])) {
        ++start;
    }

    /* include a following space or dash */
-    if(start<limit && (c==' ' || c=='-')) {
+    if(start<limit && ((c=line[start])==' ' || c=='-')) {
        ++start;
    }

@ -1103,28 +1119,46 @@ countWord(Word *word) {
 }

 static void
-addLine(uint32_t code, char *name1, int16_t name1Length, char *name2, int16_t name2Length) {
+addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
    uint8_t *stringStart;
    Line *line;
-    int16_t length;
+    int16_t i, length;

    if(lineCount==MAX_LINE_COUNT) {
        fprintf(stderr, "gennames: too many lines\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

-    length=name1Length;
-    if(name2Length>0) {
-        length=(int16_t)(length+1+name2Length);
+    /* find the last non-empty name */
+    while(count>0 && lengths[count-1]==0) {
+        --count;
+    }
+    if(count==0) {
+        return; /* should not occur: caller should not have called */
    }

-    stringStart=allocLine(length);
-    if(name1Length>0) {
-        uprv_memcpy(stringStart, name1, name1Length);
+    /* there will be (count-1) separator characters */
+    i=count;
+    length=count-1;
+
+    /* add lengths of strings */
+    while(i>0) {
+        length+=lengths[--i];
    }
-    if(name2Length>0) {
-        stringStart[name1Length]=NAME_SEPARATOR_CHAR;
-        uprv_memcpy(stringStart+name1Length+1, name2, name2Length);
+
+    /* allocate line memory */
+    stringStart=allocLine(length);
+
+    /* copy all strings into the line memory */
+    length=0; /* number of chars copied so far */
+    for(i=0; i<count; ++i) {
+        if(i>0) {
+            stringStart[length++]=NAME_SEPARATOR_CHAR;
+        }
+        if(lengths[i]>0) {
+            uprv_memcpy(stringStart+length, names[i], lengths[i]);
+            length+=lengths[i];
+        }
    }

    line=lines+lineCount;
@ -1201,7 +1235,7 @@ appendLineLengthNibble(uint8_t nibble) {
 }

 static uint8_t *
-allocLine(uint32_t length) {
+allocLine(int32_t length) {
    uint32_t top=lineTop+length;
    uint8_t *p;