ICU-158 add u_charName()

X-SVN-Rev: 217
1999-11-22 19:43:31 +00:00 · 1999-11-22 19:43:31 +00:00 · f8abd4f79a
commit f8abd4f79a
parent 77586e9d8b
2 changed files with 491 additions and 0 deletions
--- a/icu4c/source/common/uchar.h
+++ b/icu4c/source/common/uchar.h
@ -387,6 +387,24 @@ enum UCellWidth
 };

 typedef enum UCellWidth UCellWidth;
+
+/**
+ * Selector constants for u_charName().
+ * <code>u_charName()</code> returns either the "modern" name of a
+ * Unicode character or the name that was defined in
+ * Unicode version 1.0, before the Unicode standard merged
+ * with ISO-10646.
+ *
+ * @see u_charName()
+ */
+enum UCharNameChoice {
+    U_UNICODE_CHAR_NAME,        /* select the "modern" character name */
+    U_UNICODE_10_CHAR_NAME,     /* select the Unicode 1.0 character name */
+    U_CHAR_NAME_CHOICE_COUNT    /* number of selectors; u_charName() rejects it as a choice */
+};
+
+typedef enum UCharNameChoice UCharNameChoice;
+
 /**
 * Functions to classify characters.
 */
@ -618,6 +636,30 @@ u_charDigitValue(UChar c);
 U_CAPI UCharScript     U_EXPORT2
 u_charScript(UChar    ch);

+/**
+ * Retrieve the name of a Unicode character.
+ * Depending on <code>nameChoice</code>, the character name written
+ * into the buffer is the "modern" name or the name that was defined
+ * in Unicode version 1.0.
+ * The name contains only "invariant" characters
+ * like A-Z, 0-9, space, and '-'.
+ *
+ * @param code The character (code point) for which to get the name.
+ *             It must be <code>0&lt;=code&lt;=0x10ffff</code>.
+ * @param nameChoice Selector for which name to get.
+ * @param buffer Destination address for copying the name.
+ *               It must not be <code>NULL</code>.
+ * @param bufferLength The capacity of <code>buffer</code> in
+ *        <code>char</code>s, for example <code>sizeof(buffer)</code>
+ *        for a <code>char</code> array.
+ * @param pErrorCode Pointer to a UErrorCode variable;
+ *        check for <code>U_SUCCESS()</code> after <code>u_charName()</code>
+ *        returns.
+ * @return The length of the name, not counting the terminating NUL.
+ *         If this is not less than <code>bufferLength</code>, then the
+ *         name in the buffer is truncated and/or not NUL-terminated.
+ *         0 is returned if no name is found for the character
+ *         or if an error occurred.
+ *
+ * @see UCharNameChoice
+ */
+U_CAPI UTextOffset U_EXPORT2
+u_charName(uint32_t code, UCharNameChoice nameChoice,
+           char *buffer, UTextOffset bufferLength,
+           UErrorCode *pErrorCode);
+
 /** 
 * The following functions are java specific.
 */
--- a/icu4c/source/common/unames.c
+++ b/icu4c/source/common/unames.c
@ -0,0 +1,449 @@
+/*
+*******************************************************************************
+*                                                                             *
+* COPYRIGHT:                                                                  *
+*   (C) Copyright International Business Machines Corporation, 1999           *
+*   Licensed Material - Program-Property of IBM - All Rights Reserved.        *
+*   US Government Users Restricted Rights - Use, duplication, or disclosure   *
+*   restricted by GSA ADP Schedule Contract with IBM Corp.                    *
+*                                                                             *
+*******************************************************************************
+*   file name:  unames.c
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 1999oct04
+*   created by: Markus W. Scherer
+*/
+
+/* set import/export definitions */
+#ifndef U_COMMON_IMPLEMENTATION
+#   define U_COMMON_IMPLEMENTATION
+#endif
+
+#include "utypes.h"
+#include "umutex.h"
+#include "cmemory.h"
+#include "uchar.h"
+#include "udata.h"
+
+/* prototypes --------------------------------------------------------------- */
+/* name and type of the data item that u_charName() asks udata_openChoice() to load */
+#define DATA_NAME "unames"
+#define DATA_TYPE "dat"
+/*
+ * Names are stored in groups of LINES_PER_GROUP (32) consecutive code points:
+ * code>>GROUP_SHIFT selects the group, code&GROUP_MASK the line in the group.
+ */
+#define GROUP_SHIFT 5
+#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
+#define GROUP_MASK (LINES_PER_GROUP-1)
+/*
+ * Entry of the groups table that getName() searches.
+ * groupMSB is compared with code>>GROUP_SHIFT; offsetHigh and offsetLow are
+ * the upper and lower 16 bits of the offset of the group's data, relative to
+ * UCharNames.groupStringOffset.
+ */
+typedef struct {
+    uint16_t groupMSB,
+             offsetHigh, offsetLow; /* avoid padding */
+} Group;
+/*
+ * Header of one entry in the list of algorithmic name ranges.
+ * start..end is the inclusive range of code points, type selects the
+ * algorithm in getAlgName() and variant is its parameter (number of hex
+ * digits for type 0, number of factors for type 1).
+ * size is the byte distance from this entry to the next one;
+ * the data for the algorithm follows the header.
+ */
+typedef struct {
+    uint32_t start, end;
+    uint8_t type, variant;
+    uint16_t size;
+} AlgorithmicRange;
+/*
+ * Start of the loaded data; each field is a byte offset from the beginning
+ * of this structure to the respective part of the data.
+ */
+typedef struct {
+    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
+} UCharNames;
+/* set once by u_charName() inside its umtx_lock(NULL) block: the opened data item and its contents */
+static UDataMemory *uCharNamesData=NULL;
+static UCharNames *uCharNames=NULL;
+/* callback passed to udata_openChoice(): accepts or rejects a data item based on its UDataInfo */
+static bool_t
+isAcceptable(void *context,
+             const char *type, const char *name,
+             UDataInfo *pInfo);
+/* finds the group of names that contains code and expands the name of code from it */
+static uint16_t
+getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
+        char *buffer, uint16_t bufferLength);
+/* decodes the line lengths of one group and expands the name on line lineNumber */
+static uint16_t
+expandGroupName(UCharNames *names, Group *group,
+                uint16_t lineNumber, UCharNameChoice nameChoice,
+                char *buffer, uint16_t bufferLength);
+/* writes one stored (tokenized) name into the buffer as plain text */
+static uint16_t
+expandName(UCharNames *names,
+           uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
+           char *buffer, uint16_t bufferLength);
+/* builds the name of a code point that lies in an algorithmic range */
+static uint16_t
+getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
+        char *buffer, uint16_t bufferLength);
+
+/* public API --------------------------------------------------------------- */
+
+/*
+ * Public entry point; the API documentation is in uchar.h.
+ *
+ * Loads the names data on first use, then looks for code in the
+ * algorithmic ranges and, if it is not in one of them, in the regular
+ * name groups.
+ * Returns the length of the name (which may be larger than what fit into
+ * the buffer), or 0 if there is no name or an error occurred.
+ */
+U_CAPI UTextOffset U_EXPORT2
+u_charName(uint32_t code, UCharNameChoice nameChoice,
+           char *buffer, UTextOffset bufferLength,
+           UErrorCode *pErrorCode) {
+    AlgorithmicRange *algRange;
+    uint32_t *p;
+    uint32_t i;
+    uint16_t capacity;
+
+    /* check the argument values */
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || buffer==NULL || bufferLength<0) {
+        /* a negative length must not be turned into a huge capacity by the cast below */
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    /*
+     * the internal functions work with 16-bit lengths:
+     * clamp a larger capacity instead of letting the cast wrap it around
+     */
+    if(bufferLength>0xffff) {
+        capacity=0xffff;
+    } else {
+        capacity=(uint16_t)bufferLength;
+    }
+
+    if(code>0x10ffff) {
+        /* not a code point: no name, but zero-terminate like the other "no name" paths */
+        if(capacity>0) {
+            *buffer=0;
+        }
+        return 0;
+    }
+
+    /* load UCharNames from file if necessary */
+    if(uCharNames==NULL) {
+        UCharNames *names;
+        UDataMemory *data;
+
+        /* open the data outside the mutex block */
+        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            return 0;
+        }
+
+        names=(UCharNames *)udata_getMemory(data);
+
+        /* in the mutex block, set the data for this process */
+        {
+            umtx_lock(NULL);
+            if(uCharNames==NULL) {
+                uCharNames=names;
+                uCharNamesData=data;
+                data=NULL;
+                names=NULL;
+            }
+            umtx_unlock(NULL);
+        }
+
+        /* if a different thread set it first, then close the extra data */
+        if(data!=NULL) {
+            udata_close(data); /* NULL if it was set correctly */
+        }
+    }
+
+    /*
+     * try algorithmic names first:
+     * the list starts with the number of ranges, followed by the ranges,
+     * each of which carries its own size for stepping to the next one
+     */
+    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
+    i=*p;
+    algRange=(AlgorithmicRange *)(p+1);
+    while(i>0) {
+        if(algRange->start<=code && code<=algRange->end) {
+            return getAlgName(algRange, code, nameChoice, buffer, capacity);
+        }
+        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
+        --i;
+    }
+
+    /* normal character name */
+    return getName(uCharNames, code, nameChoice, buffer, capacity);
+}
+
+/* implementation ----------------------------------------------------------- */
+
+/*
+ * Data acceptance callback for udata_openChoice().
+ * Only pInfo is examined; context, type and name are not used.
+ * Returns non-zero if the data item can be used by this implementation.
+ */
+static bool_t
+isAcceptable(void *context,
+             const char *type, const char *name,
+             UDataInfo *pInfo) {
+    /* the size field of the info structure must be at least 20 */
+    if(pInfo->size<20) {
+        return 0;
+    }
+
+    /* the data must match this platform's byte order and charset family */
+    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN) {
+        return 0;
+    }
+    if(pInfo->charsetFamily!=U_CHARSET_FAMILY) {
+        return 0;
+    }
+
+    /* dataFormat="unam" */
+    if(pInfo->dataFormat[0]!=0x75) {
+        return 0;
+    }
+    if(pInfo->dataFormat[1]!=0x6e) {
+        return 0;
+    }
+    if(pInfo->dataFormat[2]!=0x61) {
+        return 0;
+    }
+    if(pInfo->dataFormat[3]!=0x6d) {
+        return 0;
+    }
+
+    /* only major format version 1 is understood */
+    return pInfo->formatVersion[0]==1;
+}
+
+/*
+ * Look up the name of a code point that is not covered by an algorithmic range.
+ *
+ * The groups table starts with a 16-bit count that is followed by the
+ * Group entries; it is searched for the entry whose groupMSB equals
+ * code>>GROUP_SHIFT.
+ * Returns the result of expandGroupName() for that group, or 0 (with an
+ * empty, zero-terminated buffer if there is room) if there is no such group.
+ */
+static uint16_t
+getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
+        char *buffer, uint16_t bufferLength) {
+    char *groupTable=(char *)names+names->groupsOffset;
+    uint16_t wanted=(uint16_t)(code>>GROUP_SHIFT);
+    uint16_t low=0;
+    uint16_t high=*(uint16_t *)groupTable;
+    Group *groups=(Group *)(groupTable+2);
+
+    /* binary search: narrow [low, high[ down to a single candidate entry */
+    while(low<high-1) {
+        uint16_t middle=(uint16_t)((low+high)/2);
+
+        if(wanted<groups[middle].groupMSB) {
+            high=middle;
+        } else {
+            low=middle;
+        }
+    }
+
+    if(wanted!=groups[low].groupMSB) {
+        /* group not found: return an empty name */
+        if(bufferLength>0) {
+            buffer[0]=0;
+        }
+        return 0;
+    }
+
+    return expandGroupName(names, &groups[low], (uint16_t)(code&GROUP_MASK), nameChoice,
+                           buffer, bufferLength);
+}
+/*
+ * Expand the name on line lineNumber of one group.
+ *
+ * The group's data is found at names+groupStringOffset plus the 32-bit
+ * offset that is assembled from group->offsetHigh and group->offsetLow.
+ * It starts with the nibble-encoded lengths of the LINES_PER_GROUP lines,
+ * followed by the line strings themselves, one right after the other.
+ *
+ * Length encoding, as decoded below:
+ *   a nibble 0..11 is the length itself;
+ *   a nibble 12..15 starts a double-nibble length, where the length is
+ *   ((first nibble&3)<<4 | second nibble)+12.
+ *
+ * The selected line is handed to expandName(), and its result is returned.
+ */
+static uint16_t
+expandGroupName(UCharNames *names, Group *group,
+                uint16_t lineNumber, UCharNameChoice nameChoice,
+                char *buffer, uint16_t bufferLength) {
+    uint8_t *s=(uint8_t *)names+names->groupStringOffset+
+                   (group->offsetHigh<<16|group->offsetLow);
+
+    /*
+     * read the length of this string and get the group strings offset
+     *
+     * i counts the lines decoded so far, offset is the sum of their lengths,
+     * and length carries an unfinished double-nibble lead (>=12) from the
+     * odd nibble of one byte into the next iteration
+     */
+    uint16_t i=0, offset=0, length=0, nameOffset, nameLength;
+    uint8_t lengthByte;
+
+    /* all 32 lengths must be read to get the offset of the first group string */
+    while(i<LINES_PER_GROUP) {
+        lengthByte=*s++;
+
+        /* read even nibble - MSBs of lengthByte */
+        if(length>=12) {
+            /* double-nibble length spread across two bytes */
+            length=((length&0x3)<<4|lengthByte>>4)+12;
+            lengthByte&=0xf;
+        } else if((lengthByte&0xf0)>=0xc0) {
+            /*
+             * double-nibble length spread across this one byte;
+             * lengthByte keeps its upper bits here, which marks the
+             * odd nibble as consumed for the check below
+             */
+            length=(lengthByte&0x3f)+12;
+        } else {
+            /* single-nibble length in MSBs */
+            length=lengthByte>>4;
+            lengthByte&=0xf;
+        }
+
+        if(i==lineNumber) {
+            nameOffset=offset;
+            nameLength=length;
+        }
+
+        offset+=length;
+        ++i;
+
+        /* read odd nibble - LSBs of lengthByte */
+        if((lengthByte&0xf0)==0) {
+            /* this nibble was not consumed for a double-nibble length above */
+            length=lengthByte;
+            if(length<12) {
+                /* single-nibble length in LSBs */
+                if(i==lineNumber) {
+                    nameOffset=offset;
+                    nameLength=length;
+                }
+
+                offset+=length;
+                ++i;
+            }
+        } else {
+            length=0;   /* prevent double-nibble detection in the next iteration */
+        }
+    }
+
+    /* s now points behind the length bytes, that is, to the first line string */
+    return expandName(names, s+nameOffset, nameLength, nameChoice,
+                      buffer, bufferLength);
+}
+
+#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
+    if((bufferLength)>0) { \
+        *(buffer)++=c; \
+        --(bufferLength); \
+    } \
+    ++(bufferPos); \
+}
+
+/*
+ * Write one stored name into the buffer as plain text.
+ *
+ * name/nameLength is one line of a group; it holds the modern name and,
+ * after a ';', the Unicode 1.0 name. nameChoice selects which one is written.
+ * Each byte of the stored name is either a letter that is written as is
+ * or a token (possibly a double-byte one) that stands for a word from the
+ * token strings.
+ *
+ * Returns the length of the expanded name, which may exceed what fit into
+ * the buffer. The buffer is zero-terminated if there is room left.
+ */
+static uint16_t
+expandName(UCharNames *names,
+           uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
+           char *buffer, uint16_t bufferLength) {
+    /*
+     * the token table starts 8 uint16_t units into the data, right behind
+     * the four uint32_t offsets: first the token count, then the entries
+     */
+    uint16_t *tokenTable=(uint16_t *)names+8;
+    uint16_t tokenCount=*tokenTable++;
+    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
+    uint16_t written=0;
+    uint16_t entry;
+    uint8_t *word;
+    uint8_t b;
+
+    if(nameChoice!=U_UNICODE_CHAR_NAME) {
+        /* skip the modern name, up to and including the ';' */
+        for(; nameLength>0;) {
+            --nameLength;
+            if(*name++==';') {
+                break;
+            }
+        }
+    }
+
+    /* write each letter directly, and write a token word per token */
+    for(; nameLength>0;) {
+        --nameLength;
+        b=*name++;
+
+        if(b==';') {
+            /* finished */
+            break;
+        }
+
+        if(b>=tokenCount) {
+            /* implicit letter: there is no table entry for this byte */
+            WRITE_CHAR(buffer, bufferLength, written, b);
+            continue;
+        }
+
+        entry=tokenTable[b];
+        if(entry==(uint16_t)(-2)) {
+            /* this is a lead byte for a double-byte token */
+            entry=tokenTable[b<<8|*name++];
+            --nameLength;
+        }
+
+        if(entry==(uint16_t)(-1)) {
+            /* explicit letter */
+            WRITE_CHAR(buffer, bufferLength, written, b);
+            continue;
+        }
+
+        /* write token word: a zero-terminated string at this offset */
+        for(word=tokenStrings+entry; (b=*word++)!=0;) {
+            WRITE_CHAR(buffer, bufferLength, written, b);
+        }
+    }
+
+    /* zero-terminate */
+    if(bufferLength>0) {
+        *buffer=0;
+    }
+
+    return written;
+}
+/*
+ * Build the name of a code point from an algorithmic range.
+ *
+ * range->type selects the algorithm:
+ *   0: the name is a prefix followed by range->variant hexadecimal digits
+ *      of code; the zero-terminated prefix follows the range header
+ *   1: the name is a prefix followed by one element per factor;
+ *      range->variant 16-bit factors follow the range header, then the
+ *      zero-terminated prefix, then for each factor that many
+ *      zero-terminated element strings
+ *   other types yield an empty name
+ *
+ * nameChoice is not consulted here.
+ * Returns the length of the complete name, which may exceed what fit into
+ * the buffer.
+ */
+static uint16_t
+getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
+        char *buffer, uint16_t bufferLength) {
+    uint16_t bufferPos=0; /* length of the complete name, even if it does not fit */
+
+    switch(range->type) {
+    case 0: {
+        /* name = prefix hex-digits */
+        char *s=(char *)(range+1);
+        char c;
+
+        uint16_t i, count;
+
+        /* copy prefix */
+        while((c=*s++)!=0) {
+            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+        }
+
+        /* write hexadecimal code point value */
+        count=range->variant;
+
+        /*
+         * zero-terminate
+         * (buffer and bufferLength were advanced past the prefix by
+         * WRITE_CHAR, so they describe the space for the digits;
+         * the terminator is written only if it fits behind the digits)
+         */
+        if(count<bufferLength) {
+            buffer[count]=0;
+        }
+
+        for(i=count; i>0;) { /* from the least significant digit backwards */
+            if(--i<bufferLength) { /* digits that do not fit are skipped */
+                c=(char)code&0xf;
+                if(c<10) {
+                    c+='0';
+                } else {
+                    c+='A'-10;
+                }
+                buffer[i]=c;
+            }
+            code>>=4;
+        }
+
+        bufferPos+=count;
+        break;
+    }
+    case 1: {
+        /* name = prefix factorized-elements */
+        uint16_t *factors=(uint16_t *)(range+1);
+        char *s=(char *)(factors+range->variant);
+        char c;
+
+        uint16_t indeces[8]; /* NOTE(review): assumes range->variant<=8; not checked here */
+        uint16_t i, count, factor;
+
+        /* copy prefix */
+        while((c=*s++)!=0) {
+            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+        }
+
+        /* write elements according to the factors */
+        code-=range->start;
+
+        /*
+         * the factorized elements are determined by modulo arithmetic
+         * with the factors of this algorithm
+         *
+         * note that for fewer operations, count is decremented here
+         */
+        count=range->variant-1;
+        for(i=count; i>0; --i) {
+            factor=factors[i];
+            indeces[i]=(uint16_t)(code%factor);
+            code/=factor;
+        }
+        /*
+         * we don't need to calculate the last modulus because start<=code<=end
+         * guarantees here that code<=factors[0]
+         *
+         * NOTE(review): presumably this should read code<factors[0], because
+         * indeces[0] is used below as an index among factors[0] strings - confirm
+         */
+        indeces[0]=(uint16_t)code;
+
+        /* write each element */
+        for(;;) { /* i==0 here, left over from the loop above */
+            /* skip indeces[i] strings */
+            factor=indeces[i];
+            while(factor>0) {
+                while(*s++!=0) {}
+                --factor;
+            }
+
+            /* write element */
+            while((c=*s++)!=0) {
+                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
+            }
+
+            /* we do not need to perform the rest of this loop for i==count - break here */
+            if(i>=count) {
+                break;
+            }
+
+            /* skip the rest of the strings for this factors[i] */
+            factor=factors[i]-indeces[i]-1;
+            while(factor>0) {
+                while(*s++!=0) {}
+                --factor;
+            }
+
+            ++i;
+        }
+
+        /* zero-terminate */
+        if(bufferLength>0) {
+            *buffer=0;
+        }
+        break;
+    }
+    default:
+        /* undefined type */
+        /* zero-terminate */
+        if(bufferLength>0) {
+            *buffer=0;
+        }
+        break;
+    }
+
+    return bufferPos;
+}