ICU-868 New alias scheme.

X-SVN-Rev: 8975
2002-06-28 23:13:30 +00:00 · 2002-06-28 23:13:30 +00:00 · 70debd215f
commit 70debd215f
parent a79775fe45
11 changed files with 1339 additions and 661 deletions
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@ -1724,7 +1724,9 @@ _uErrorInfoName[U_ERROR_WARNING_LIMIT-U_ERROR_WARNING_START]={
    "U_USING_DEFAULT_WARNING",
    "U_SAFECLONE_ALLOCATED_WARNING",
    "U_STATE_OLD_WARNING",
-    "U_STRING_NOT_TERMINATED_WARNING"
+    "U_STRING_NOT_TERMINATED_WARNING",
+    "U_SORT_KEY_TOO_SHORT_WARNING",
+    "U_AMBIGUOUS_ALIAS_WARNING"
 };

 static const char * const
--- a/icu4c/source/common/ucnv.c
+++ b/icu4c/source/common/ucnv.c
@ -283,8 +283,7 @@ ucnv_countAvailable ()
 U_CAPI uint16_t U_EXPORT2
 ucnv_countAliases(const char *alias, UErrorCode *pErrorCode)
 {
-    const char *p;
-    return ucnv_io_getAliases(alias, &p, pErrorCode);
+    return ucnv_io_countAliases(alias, pErrorCode);
 }


@ -297,14 +296,7 @@ ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode)
 U_CAPI void U_EXPORT2
 ucnv_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode)
 {
-    const char *p;
-    uint16_t count=ucnv_io_getAliases(alias, &p, pErrorCode);
-    while(count>0) {
-        *aliases++=p;
-        /* skip a name, first the canonical converter name */
-        p+=uprv_strlen(p)+1;
-        --count;
-    }
+    ucnv_io_getAliases(alias, 0, aliases, pErrorCode);
 }

 U_CAPI uint16_t U_EXPORT2
--- a/icu4c/source/common/ucnv_io.c
+++ b/icu4c/source/common/ucnv_io.c
@ -31,67 +31,140 @@
 #include "unicode/udata.h"
 #include "ucln_cmn.h"

-/* Format of cnvalias.dat ------------------------------------------------------
+/* Format of cnvalias.icu -----------------------------------------------------
 *
- * cnvalias.dat is a binary, memory-mappable form of convrtrs.txt .
- * It contains two sorted tables and a block of zero-terminated strings.
- * Each table is preceded by the number of table entries.
+ * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
+ * This binary form contains several tables. All indexes are to uint16_t
+ * units, and not to the bytes (uint8_t units). Addressing everything on
+ * 16-bit boundaries allows us to store more information with small index
+ * numbers, which are also 16-bit in size. The majority of the table (except
+ * the string table) are 16-bit numbers.
 *
- * The first table maps from aliases to converter indexes.
- * The converter names themselves are listed as aliases in this table.
- * Each entry in this table has an offset to the alias and
- * an index of the converter in the converter table.
+ * First there is the size of the Table of Contents (TOC). The TOC
+ * entries contain the size of each section. In order to find the offset
+ * you just need to sum up the previous offsets.
 *
- * The second table lists only the converters themselves.
- * Each entry in this table has an offset to the converter name and
- * the number of aliases, including the converter itself.
- * A count of 1 means that there is no alias, only the converter name.
+ * 1) This section contains a list of converters. This list contains indexes
+ * into the string table for the converter name. The index of this list is
+ * also used by other sections, which are mentioned later on.
 *
- * In the block of strings after the tables, each converter name is directly
- * followed by its aliases. All offsets to strings are offsets from the
- * beginning of the data.
+ * 2) This section contains a list of tags. This list contains indexes
+ * into the string table for the tag name. The index of this list is
+ * also used by other sections, which are mentioned later on.
 *
- * More formal file data structure (data format 2.1):
+ * 3) This section contains a sorted list of unique aliases. This
+ * list contains indexes into the string table for the alias name. The
+ * index of this list is also used by other sections, which are mentioned
+ * later on.
 *
- * uint16_t aliasCount;
- * uint16_t aliasOffsets[aliasCount];
- * uint16_t converterIndexes[aliasCount];
+ * 4) This section contains a list of mapped converter names. Consider this
+ * as a table that maps the 3rd section to the 1st section. This list contains
+ * indexes into the 1st section. The index of this list is the same index in
+ * the 3rd section. There is also some extra information in the high bits of
+ * each converter index in this table. Currently it's only used to say that
+ * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
+ * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
+ * the predigested form of the 5th section so that an alias lookup can be fast.
+ * 
+ * 5) This section contains a 2D array with indexes to the 6th section. This
+ * section is the full form of all alias mappings. The column index is the
+ * index into the converter list (column header). The row index is the index
+ * to the tag list (row header). This 2D array is the top part of a 3D array. The
+ * third dimension is in the 6th section.
 *
- * uint16_t converterCount;
- * struct {
- *     uint16_t converterOffset;
- *     uint16_t aliasCount;
- * } converters[converterCount];
+ * 6) This is a blob of variable-length arrays. Each array starts with a size,
+ * and is followed by indexes to alias names in the string table. This is
+ * the third dimension to the section 5. No other section should be referencing
+ * this section.
 *
- * uint16_t tagCount;
- * uint16_t taggedAliasesOffsets[tagCount][converterCount];
- * char tags[] = { "Tag0\Tag1\0..." };
+ * 7) Reserved at this time (There is no information). This _usually_ has a
+ * size of 0. Future versions may add more information here.
 *
- * char strings[]={
- *     "Converter0\0Alias1\0Alias2\0...Converter1\0Converter2\0Alias0\Alias1\0..."
- * };
+ * 8) This is the string table. All strings are indexed on an even address.
+ * There are two reasons for this. First many chip architectures locate strings
+ * faster on even address boundaries. Second, since all indexes are 16-bit
+ * numbers, this string table can be 128KB in size instead of 64KB when we
+ * only have strings starting on an even address.
 *
- * The code included here can read versions 2 and 2.1 of the data format.
- * Version 2 does not have tag information, but since the code never refers
- * to strings[] by its base offset, it's okay.
 *
+ * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
+ * has a unique alias among all converters. That same alias can
+ * be mentioned in other standards on different converters,
+ * but only one alias per tag can be unique.
+ *
+ *
+ *              Converter Names (Usually in TR22 form)
+ *           -------------------------------------------.
+ *     T    /                                          /|
+ *     a   /                                          / |
+ *     g  /                                          /  |
+ *     s /                                          /   |
+ *      /                                          /    |
+ *      ------------------------------------------/     |
+ *    A |                                         |     |
+ *    l |                                         |     |
+ *    i |                                         |    /
+ *    a |                                         |   /
+ *    s |                                         |  /
+ *    e |                                         | /
+ *    s |                                         |/
+ *      -------------------------------------------
+ *
+ *
+ *
+ * Here is what it really looks like. It's like swiss cheese.
+ * There are holes. Some converters aren't recognized by
+ * a standard, or they are really old converters that the
+ * standard doesn't recognize anymore.
+ *
+ *              Converter Names (Usually in TR22 form)
+ *           -------------------------------------------.
+ *     T    /##########################################/|
+ *     a   /     #            #                       /#
+ *     g  /  #      ##     ##     ### # ### ### ### #/  
+ *     s / #             #####  ####        ##  ## #/#  
+ *      / ### # # ##  #  #   #          ### # #   #/##  
+ *      ------------------------------------------/# #
+ *    A |### # # ##  #  #   #          ### # #   #|# #
+ *    l |# # #    #     #               ## #     #|# #
+ *    i |# # #    #     #                #       #|#
+ *    a |#                                       #|#
+ *    s |                                        #|#
+ *    e 
+ *    s 
+ *      
 */

 static const char DATA_NAME[] = "cnvalias";
-static const char DATA_TYPE[] = "dat";
+static const char DATA_TYPE[] = "icu";

 static UDataMemory *aliasData=NULL;
-static const uint16_t *aliasTable=NULL;
+
+static const uint16_t *converterList = NULL;
+static const uint16_t *tagList = NULL;
+static const uint16_t *aliasList = NULL;
+static const uint16_t *untaggedConvArray = NULL;
+static const uint16_t *taggedAliasArray = NULL;
+static const uint16_t *taggedAliasLists = NULL;
+static const uint16_t *stringTable = NULL;
+
+static uint32_t converterListNum;
+static uint32_t tagListNum;
+static uint32_t aliasListNum;
+static uint32_t untaggedConvArraySize;
+static uint32_t taggedAliasArraySize;
+static uint32_t taggedAliasListsSize;
+static uint32_t stringTableSize;

 static const char **availableConverters = NULL;
 static uint16_t availableConverterCount = 0;

-static const uint16_t *converterTable = NULL;
-static const uint16_t *tagTable = NULL;
-
-static char defaultConverterNameBuffer[100];
+static char defaultConverterNameBuffer[UCNV_MAX_CONVERTER_NAME_LENGTH + 1]; /* +1 for NULL */
 static const char *defaultConverterName = NULL;

+#define GET_STRING(idx) (const char *)(stringTable + (idx))
+#define NUM_RESERVED_TAGS 2
+
 static UBool
 isAcceptable(void *context,
             const char *type, const char *name,
@ -104,7 +177,7 @@ isAcceptable(void *context,
        pInfo->dataFormat[1]==0x76 &&
        pInfo->dataFormat[2]==0x41 &&
        pInfo->dataFormat[3]==0x6c &&
-        pInfo->formatVersion[0]==2);
+        pInfo->formatVersion[0]==3);
 }

 static UBool
@ -115,32 +188,64 @@ haveAliasData(UErrorCode *pErrorCode) {

    /* load converter alias data from file if necessary */
    if(aliasData==NULL) {
-        UDataMemory *data;
-        UDataInfo info;
-        const uint16_t *table=NULL;
+        UDataMemory *data = NULL;
+        const uint16_t *table = NULL;
+        uint32_t tableStart;
+        uint32_t currOffset;
+        uint32_t reservedSize1;

-        /* open the data outside the mutex block */
-        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
+        data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return FALSE;
        }

-        table=(const uint16_t *)udata_getMemory(data);
-        info.size=sizeof(UDataInfo);
-        udata_getInfo(data, &info);
+        table = (const uint16_t *)udata_getMemory(data);
+
+        tableStart      = ((const uint32_t *)(table))[0];
+        if (tableStart < 8) {
+            *pErrorCode = U_INVALID_FORMAT_ERROR;
+            return FALSE;
+        }

-        /* in the mutex block, set the data for this process */
        umtx_lock(NULL);
        if(aliasData==NULL) {
-            aliasData=data;
+            aliasData = data;
            data=NULL;
-            aliasTable=table;
-            table=NULL;
-            converterTable = aliasTable + 1 + 2 * *aliasTable;

-            if (info.formatVersion[0] == 2 && info.formatVersion[1] > 0) {
-                tagTable = converterTable + 1 + 2 * *converterTable;
-            }
+            converterListNum        = ((const uint32_t *)(table))[1];
+            tagListNum              = ((const uint32_t *)(table))[2];
+            aliasListNum            = ((const uint32_t *)(table))[3];
+            untaggedConvArraySize  = ((const uint32_t *)(table))[4];
+            taggedAliasArraySize    = ((const uint32_t *)(table))[5];
+            taggedAliasListsSize    = ((const uint32_t *)(table))[6];
+            reservedSize1           = ((const uint32_t *)(table))[7];   /* reserved */
+            stringTableSize         = ((const uint32_t *)(table))[8];
+
+            currOffset = tableStart * (sizeof(uint32_t)/sizeof(uint16_t)) + (sizeof(uint32_t)/sizeof(uint16_t));
+            converterList = table + currOffset;
+
+            currOffset += converterListNum;
+            tagList = table + currOffset;
+
+            currOffset += tagListNum;
+            aliasList = table + currOffset;
+
+            currOffset += aliasListNum;
+            untaggedConvArray = table + currOffset;
+
+            currOffset += untaggedConvArraySize;
+            taggedAliasArray = table + currOffset;
+
+            /* taggedAliasLists is a 1-based array, but it has a padding entry */
+            currOffset += taggedAliasArraySize;
+            taggedAliasLists = table + currOffset;
+
+            currOffset += taggedAliasListsSize;
+            /* reserved */
+
+            currOffset += reservedSize1;
+            stringTable = table + currOffset;
+
        }
        umtx_unlock(NULL);

@ -175,49 +280,40 @@ ucnv_io_cleanup()

    ucnv_io_flushAvailableConverterCache();

-    aliasData = NULL;
-    aliasTable = NULL;
+    converterListNum        = 0;
+    tagListNum              = 0;
+    aliasListNum            = 0;
+    untaggedConvArraySize  = 0;
+    taggedAliasArraySize    = 0;
+    taggedAliasListsSize    = 0;
+    stringTableSize         = 0;

-    converterTable = NULL;
-    tagTable = NULL;
+    converterList = NULL;
+    tagList = NULL;
+    aliasList = NULL;
+    untaggedConvArray = NULL;
+    taggedAliasArray = NULL;
+    taggedAliasLists = NULL;
+    stringTable = NULL;

    defaultConverterName = NULL;
+    defaultConverterNameBuffer[0] = 0;

    return TRUE;                   /* Everything was cleaned up */
 }


-static int16_t getTagNumber(const char *tagname) {
-    if (tagTable) {
-        int16_t tag, count = (int16_t) *tagTable;
-        const char *tags = (const char *) (tagTable + 1 + count * *converterTable);
-
-#if 0
-
-        char name[100];
-        int i;
-
-        /* convert the tag name to lowercase to do case-insensitive comparisons */
-        for(i = 0; i < sizeof(name) - 1 && *tagname; ++i) {
-            name[i] = (char)uprv_tolower(*tagname++);
-        }
-        name[i] = 0;
-
-#else
-
-        const char *name = tagname;
-
-#endif
-
-        for (tag = 0; count--; ++tag) {
-            if (!uprv_stricmp(name, tags)) {
-                return tag;
+static uint32_t getTagNumber(const char *tagname) {
+    if (tagList) {
+        uint32_t tagNum;
+        for (tagNum = 0; tagNum < tagListNum; tagNum++) {
+            if (!uprv_stricmp(GET_STRING(tagList[tagNum]), tagname)) {
+                return tagNum;
            }
-            tags += strlen(tags) + 1;
        }
    }

-    return -1;
+    return UINT32_MAX;
 }

 /**
@ -240,14 +336,16 @@ static int16_t getTagNumber(const char *tagname) {
 U_CAPI int U_EXPORT2
 ucnv_compareNames(const char *name1, const char *name2) {
    int rc;
-    unsigned char c1, c2;
+    char c1, c2;

    for (;;) {
        /* Ignore delimiters '-', '_', and ' ' */
-        while ((c1 = (unsigned char)*name1) == '-'
-               || c1 == '_' || c1 == ' ') ++name1;
-        while ((c2 = (unsigned char)*name2) == '-'
-               || c2 == '_' || c2 == ' ') ++name2;
+        while ((c1 = *name1) == '-' || c1 == '_' || c1 == ' ') {
+            ++name1;
+        }
+        while ((c2 = *name2) == '-' || c2 == '_' || c2 == ' ') {
+            ++name2;
+        }

        /* If we reach the ends of both strings then they match */
        if ((c1|c2)==0) {
@ -257,7 +355,7 @@ ucnv_compareNames(const char *name1, const char *name2) {
        /* Case-insensitive comparison */
        rc = (int)(unsigned char)uprv_tolower(c1) -
             (int)(unsigned char)uprv_tolower(c2);
-        if (rc!=0) {
+        if (rc != 0) {
            return rc;
        }
        ++name1;
@ -267,69 +365,87 @@ ucnv_compareNames(const char *name1, const char *name2) {

 /*
 * search for an alias
- * return NULL or a pointer to the converter table entry
+ * return the converter number index for converterList
 */
-static const uint16_t *
-findAlias(const char *alias) {
-    char name[100];
-    const uint16_t *p=aliasTable;
-    uint16_t i, start, limit;
-
-    limit=*p++;
-    if(limit==0) {
-        /* there are no aliases */
-        return NULL;
-    }
-
-    /* convert the alias name to lowercase to do case-insensitive comparisons */
-    for(i=0; i<sizeof(name)-1 && *alias!=0; ++i) {
-        name[i]=(char)uprv_tolower(*alias++);
-    }
-    name[i]=0;
+static uint32_t
+findConverter(const char *alias, UErrorCode *pErrorCode) {
+    uint32_t mid, start, limit;
+    int result;

    /* do a binary search for the alias */
-    start=0;
-    while(start<limit-1) {
-        i=(uint16_t)((start+limit)/2);
-        if(ucnv_compareNames(name, (const char *)aliasTable+p[i])<0) {
-            limit=i;
+    start = 0;
+    limit = untaggedConvArraySize - 1;
+    mid = limit;
+
+    /* Once mid == 0 we've already checked the 0'th element and we can stop */
+    while (start <= limit && mid != 0) {
+        mid = (uint32_t)((start + limit + 1) / 2);    /* +1 is to round properly */
+        result = ucnv_compareNames(alias, GET_STRING(aliasList[mid]));
+
+        if (result < 0) {
+            limit = mid-1;
+        } else if (result > 0) {
+            start = mid+1;
        } else {
-            start=i;
+            /* Since the gencnval tool folds duplicates into one entry,
+             * this alias in aliasList is unique, but different standards
+             * may map an alias to different converters.
+             */
+            if (untaggedConvArray[mid] & UCNV_AMBIGUOUS_ALIAS_MAP_BIT) {
+                *pErrorCode = U_AMBIGUOUS_ALIAS_WARNING;
+            }
+            return untaggedConvArray[mid] & UCNV_CONVERTER_INDEX_MASK;
        }
    }

-    /* did we really find it? */
-    if(ucnv_compareNames(name, (const char *)aliasTable+p[start])==0) {
-        limit=*(p-1);       /* aliasCount */
-        p+=limit;           /* advance to the second column of the alias table */
-        i=p[start];         /* converter index */
-        return
-            p+limit+        /* beginning of converter table */
-            1+              /* skip its count */
-            2*i;            /* go to this converter's entry and return a pointer to it */
-    } else {
-        return NULL;
-    }
+    return UINT32_MAX;
 }

 U_CFUNC const char *
 ucnv_io_getConverterName(const char *alias, UErrorCode *pErrorCode) {
    if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
-        const uint16_t *p=findAlias(alias);
-        if(p!=NULL) {
-            return (const char *)aliasTable+*p;
+        uint32_t convNum = findConverter(alias, pErrorCode);
+        if (convNum < converterListNum) {
+            return GET_STRING(converterList[convNum]);
        }
    }
    return NULL;
 }

 U_CFUNC uint16_t
-ucnv_io_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode) {
+ucnv_io_countAliases(const char *alias, UErrorCode *pErrorCode) {
    if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
-        const uint16_t *p=findAlias(alias);
-        if(p!=NULL) {
-            *aliases=(const char *)aliasTable+*p;
-            return *(p+1);
+        uint32_t convNum = findConverter(alias, pErrorCode);
+        if (convNum < converterListNum) {
+            /* tagListNum - 1 is the ALL tag */
+            int32_t listOffset = taggedAliasArray[(tagListNum - 1)*converterListNum + convNum];
+
+            if (listOffset) {
+                return taggedAliasLists[listOffset];
+            }
+        }
+    }
+    return 0;
+}
+
+U_CFUNC uint16_t
+ucnv_io_getAliases(const char *alias, uint16_t start, const char **aliases, UErrorCode *pErrorCode) {
+    if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
+        uint32_t currAlias;
+        uint32_t convNum = findConverter(alias, pErrorCode);
+        if (convNum < converterListNum) {
+            /* tagListNum - 1 is the ALL tag */
+            int32_t listOffset = taggedAliasArray[(tagListNum - 1)*converterListNum + convNum];
+
+            if (listOffset) {
+                uint32_t listCount = taggedAliasLists[listOffset];
+                /* +1 to skip listCount */
+                const uint16_t *currList = taggedAliasLists + listOffset + 1;
+
+                for (currAlias = start; currAlias < listCount; currAlias++) {
+                    aliases[currAlias] = GET_STRING(currList[currAlias]);
+                }
+            }
        }
    }
    return 0;
@ -338,17 +454,20 @@ ucnv_io_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCo
 U_CFUNC const char *
 ucnv_io_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) {
    if(haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
-        const uint16_t *p=findAlias(alias);
-        if(p!=NULL) {
-            uint16_t count=*(p+1);
-            if(n<count) {
-                const char *aliases=(const char *)aliasTable+*p;
-                while(n>0) {
-                    /* skip a name, first the canonical converter name */
-                    aliases+=uprv_strlen(aliases)+1;
-                    --n;
+        uint32_t convNum = findConverter(alias, pErrorCode);
+        if (convNum < converterListNum) {
+            /* tagListNum - 1 is the ALL tag */
+            int32_t listOffset = taggedAliasArray[(tagListNum - 1)*converterListNum + convNum];
+
+            if (listOffset) {
+                uint32_t listCount = taggedAliasLists[listOffset];
+                /* +1 to skip listCount */
+                const uint16_t *currList = taggedAliasLists + listOffset + 1;
+
+                if (n < listCount)  {
+                    return GET_STRING(currList[n]);
                }
-                return aliases;
+                *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            }
        }
    }
@ -358,12 +477,8 @@ ucnv_io_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) {
 U_CFUNC uint16_t
 ucnv_io_countStandards(UErrorCode *pErrorCode) {
    if (haveAliasData(pErrorCode)) {
-        if (!tagTable) {
-            *pErrorCode = U_INVALID_FORMAT_ERROR;
-            return 0;
-        }
-
-        return *tagTable;
+        /* Don't include the empty list */
+        return (uint16_t)(tagListNum - NUM_RESERVED_TAGS);
    }

    return 0;
@ -371,15 +486,11 @@ ucnv_io_countStandards(UErrorCode *pErrorCode) {

 U_CAPI const char * U_EXPORT2
 ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) {
-    if (haveAliasData(pErrorCode) && tagTable) {
-        int16_t count = (int16_t) *tagTable;
-        const char *tags = (const char *) (tagTable + 1 + count * *converterTable);
-
-        while (n-- && count--) {
-            tags += strlen(tags) + 1;
+    if (haveAliasData(pErrorCode)) {
+        if (n < tagListNum - NUM_RESERVED_TAGS) {
+            return GET_STRING(tagList[n]);
        }
-
-        return count ? tags : NULL;
+        *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    }

    return NULL;
@ -388,18 +499,56 @@ ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) {
 U_CFUNC const char * U_EXPORT2
 ucnv_getStandardName(const char *alias, const char *standard, UErrorCode *pErrorCode) {
    if (haveAliasData(pErrorCode) && isAlias(alias, pErrorCode)) {
-        const uint16_t *p = findAlias(alias);
-        if(p != NULL) {
-            int16_t tag = getTagNumber(standard);
+        uint32_t idx;
+        uint32_t listOffset;
+        uint32_t convNum;
+        uint32_t tagNum = getTagNumber(standard);
+        UErrorCode myErr = U_ZERO_ERROR;

-            if (tag > -1) {
-                uint16_t offset = tagTable[1 + tag * *converterTable + (p - converterTable) / 2];
-                return offset ? (const char *) aliasTable + offset : NULL;
+        /* Make a quick guess. Hopefully they used a TR22 canonical alias. */
+        convNum = findConverter(alias, &myErr);
+
+        if (tagNum < (tagListNum - NUM_RESERVED_TAGS) && convNum < converterListNum) {
+            if (myErr == U_AMBIGUOUS_ALIAS_WARNING) {
+                /* Uh Oh! They used an ambiguous alias.
+                   Hopefully the standard knows the alias.
+                   This may take a while.
+                */
+                for (idx = 0; idx < converterListNum; idx++) {
+                    listOffset = taggedAliasArray[tagNum*converterListNum + idx];
+                    if (listOffset) {
+                        uint32_t currAlias;
+                        uint32_t listCount = taggedAliasLists[listOffset];
+                        /* +1 to skip listCount */
+                        const uint16_t *currList = taggedAliasLists + listOffset + 1;
+                        for (currAlias = 0; currAlias < listCount; currAlias++) {
+                            if (currList[currAlias]
+                                && ucnv_compareNames(alias, GET_STRING(currList[currAlias]))==0)
+                            {
+                                if (currList[0]) {
+                                    return GET_STRING(currList[0]);
+                                }
+                                else {
+                                    /* Someone screwed up the alias table. */
+                                    return NULL;
+                                }
+                            }
+                        }
+                    }
+                }
+                /* The standard doesn't know about the alias */
+                *pErrorCode = U_AMBIGUOUS_ALIAS_WARNING;
            }
+            listOffset = taggedAliasArray[tagNum*converterListNum + convNum];
+            if (listOffset && taggedAliasLists[listOffset + 1]) {
+                return GET_STRING(taggedAliasLists[listOffset + 1]);
+            }
+            /* else no default name */
        }
+        /* else converter or tag not found */
    }

-   return NULL;
+    return NULL;
 }

 void
@ -413,41 +562,52 @@ ucnv_io_flushAvailableConverterCache() {
    availableConverterCount = 0;
 }

-static void ucnv_io_loadAvailableConverterList(void) {
-    uint16_t idx = 0;
-    uint16_t localConverterCount = 0;
-    UErrorCode status;
-    char *converterName;
-
-    /* We can't have more than "*converterTable" converters to open */
-    char **localConverterList = (char **) uprv_malloc(*converterTable * sizeof(char*));
-
-    for (; idx < *converterTable; idx++) {
-        status = U_ZERO_ERROR;
-        converterName = (char *)aliasTable+converterTable[1+2*idx];
-        ucnv_close(ucnv_open(converterName, &status));
-        if (U_SUCCESS(status)) {
-            localConverterList[localConverterCount++] = converterName;
-        }
-    }
-
-    umtx_lock(NULL);
+static UBool haveAvailableConverterList(UErrorCode *pErrorCode) {
    if (availableConverters == NULL) {
-        availableConverters = (const char **)localConverterList;
-        availableConverterCount = localConverterCount;
+        uint16_t idx;
+        uint16_t localConverterCount;
+        UErrorCode status;
+        const char *converterName;
+        const char **localConverterList;
+
+        if (!haveAliasData(pErrorCode)) {
+            return FALSE;
+        }
+
+        /* We can't have more than converterListNum converters to open */
+        localConverterList = (const char **) uprv_malloc(converterListNum * sizeof(char*));
+        if (!localConverterList) {
+            *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
+            return FALSE;
+        }
+
+        localConverterCount = 0;
+
+        for (idx = 0; idx < converterListNum; idx++) {
+            status = U_ZERO_ERROR;
+            converterName = GET_STRING(converterList[idx]);
+            ucnv_close(ucnv_open(converterName, &status));
+            if (U_SUCCESS(status)) {
+                localConverterList[localConverterCount++] = converterName;
+            }
+        }
+
+        umtx_lock(NULL);
+        if (availableConverters == NULL) {
+            availableConverters = localConverterList;
+            availableConverterCount = localConverterCount;
+        }
+        else {
+            uprv_free((char **)localConverterList);
+        }
+        umtx_unlock(NULL);
    }
-    else {
-        uprv_free(localConverterList);
-    }
-    umtx_unlock(NULL);
+    return TRUE;
 }

 U_CFUNC uint16_t
 ucnv_io_countAvailableConverters(UErrorCode *pErrorCode) {
-    if(haveAliasData(pErrorCode)) {
-        if (availableConverters == NULL) {
-            ucnv_io_loadAvailableConverterList();
-        }
+    if (haveAvailableConverterList(pErrorCode)) {
        return availableConverterCount;
    }
    return 0;
@ -455,20 +615,18 @@ ucnv_io_countAvailableConverters(UErrorCode *pErrorCode) {

 U_CFUNC const char *
 ucnv_io_getAvailableConverter(uint16_t n, UErrorCode *pErrorCode) {
-    if(haveAliasData(pErrorCode)) {
-        if (availableConverters == NULL) {
-            ucnv_io_loadAvailableConverterList();
-        }
-        if(n < availableConverterCount) {
+    if (haveAvailableConverterList(pErrorCode)) {
+        if (n < availableConverterCount) {
            return availableConverters[n];
        }
+        *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    }
    return NULL;
 }

 U_CFUNC void
 ucnv_io_fillAvailableConverters(const char **aliases, UErrorCode *pErrorCode) {
-    if(haveAliasData(pErrorCode)) {
+    if (haveAvailableConverterList(pErrorCode)) {
        uint16_t count = 0;
        while (count < availableConverterCount) {
            *aliases++=availableConverters[count++];
@ -478,42 +636,12 @@ ucnv_io_fillAvailableConverters(const char **aliases, UErrorCode *pErrorCode) {

 U_CFUNC uint16_t
 ucnv_io_countAvailableAliases(UErrorCode *pErrorCode) {
-    if(haveAliasData(pErrorCode)) {
-        return *aliasTable;
+    if (haveAliasData(pErrorCode)) {
+        return (uint16_t)aliasListNum;
    }
    return 0;
 }

-#if 0
-/*
- * We are not currently using these functions, so I am commenting them out
- * to reduce the binary file size and improve the code coverage;
- * I do not currently want to remove this entirely because it may be useful
- * in the future and also serves to some degree as another piece of
- * documentation of the data structure.
- */
-U_CFUNC const char *
-ucnv_io_getAvailableAlias(uint16_t n, UErrorCode *pErrorCode) {
-    if(haveAliasData(pErrorCode) && n<*aliasTable) {
-        return (const char *)aliasTable+*(aliasTable+1+n);
-    }
-    return NULL;
-}
-
-U_CFUNC void
-ucnv_io_fillAvailableAliases(const char **aliases, UErrorCode *pErrorCode) {
-    if(haveAliasData(pErrorCode)) {
-        const uint16_t *p=aliasTable;
-        uint16_t count=*p++;
-        while(count>0) {
-            *aliases++=(const char *)aliasTable+*p;
-            ++p;
-            --count;
-        }
-    }
-}
-#endif
-
 /* default converter name --------------------------------------------------- */

 /*
@ -529,10 +657,7 @@ ucnv_io_getDefaultConverterName() {
    /* local variable to be thread-safe */
    const char *name=defaultConverterName;
    if(name==NULL) {
-        const char *codepage=0;
-        umtx_lock(NULL);        
-        codepage = uprv_getDefaultCodepage();
-        umtx_unlock(NULL);
+        const char *codepage = uprv_getDefaultCodepage();
        if(codepage!=NULL) {
            UErrorCode errorCode=U_ZERO_ERROR;
            name=ucnv_io_getConverterName(codepage, &errorCode);
@ -543,26 +668,27 @@ ucnv_io_getDefaultConverterName() {

        /* if the name is there, test it out */
        if(name != NULL) {
-          UErrorCode errorCode = U_ZERO_ERROR;
-          UConverter *cnv;
-          cnv = ucnv_open(name, &errorCode);
-          if(U_FAILURE(errorCode) || (cnv == NULL)) {
-            /* Panic time, let's use a fallback. */
+            UErrorCode errorCode = U_ZERO_ERROR;
+            UConverter *cnv = ucnv_open(name, &errorCode);
+            if(U_FAILURE(errorCode) || (cnv == NULL)) {
+                /* Panic time, let's use a fallback. */
 #if (U_CHARSET_FAMILY == U_ASCII_FAMILY) 
-            name = "US-ASCII";
-            /* there is no 'algorithmic' converter for EBCDIC */
+                name = "US-ASCII";
+                /* there is no 'algorithmic' converter for EBCDIC */
 #elif defined(OS390)
-            name = "ibm-1047-s390";
+                name = "ibm-1047-s390";
 #else
-            name = "ibm-37";
+                name = "ibm-37";
 #endif
-          }
-          ucnv_close(cnv);
+            }
+            ucnv_close(cnv);
        }

        if(name != NULL) {
-           /* Did find a name. And it works.*/
-          defaultConverterName=name;
+            umtx_lock(NULL);
+            /* Did find a name. And it works.*/
+            defaultConverterName=name;
+            umtx_unlock(NULL);
        }
    }

--- a/icu4c/source/common/ucnv_io.h
+++ b/icu4c/source/common/ucnv_io.h
@ -15,6 +15,9 @@

 #include "unicode/utypes.h"

+#define UCNV_AMBIGUOUS_ALIAS_MAP_BIT 0x8000
+#define UCNV_CONVERTER_INDEX_MASK 0x7FF
+
 /**
 * Map a converter alias name to a canonical converter name.
 * The alias is searched for case-insensitively, the converter name
@ -24,6 +27,12 @@
 U_CFUNC const char *
 ucnv_io_getConverterName(const char *alias, UErrorCode *pErrorCode);

+/**
+ * Return the number of aliases for the given converter or alias name (the count for ucnv_io_getAliases and ucnv_io_getAlias).
+ */
+U_CFUNC uint16_t
+ucnv_io_countAliases(const char *alias, UErrorCode *pErrorCode);
+
 /**
 * Search case-insensitively for a converter alias and set aliases to
 * a pointer to the list of aliases for the actual converter.
@ -34,7 +43,7 @@ ucnv_io_getConverterName(const char *alias, UErrorCode *pErrorCode);
 * or 0 if the alias is not found.
 */
 U_CFUNC uint16_t
-ucnv_io_getAliases(const char *alias, const char **aliases, UErrorCode *pErrorCode);
+ucnv_io_getAliases(const char *alias, uint16_t start, const char **aliases, UErrorCode *pErrorCode);

 /**
 * Search case-insensitively for a converter alias and return
@ -85,21 +94,6 @@ ucnv_io_flushAvailableConverterCache(void);
 U_CFUNC uint16_t
 ucnv_io_countAvailableAliases(UErrorCode *pErrorCode);

-/**
- * Return the (n)th alias or converter name in mixed case, or NULL
- * if there is none (typically, if the data cannot be loaded).
- * 0<=index<ucnv_io_countAvailableAliases().
- */
-U_CFUNC const char *
-ucnv_io_getAvailableAlias(uint16_t n, UErrorCode *pErrorCode);
-
-/**
- * Fill an array const char *aliases[ucnv_io_countAvailableAliases()]
- * with pointers to all aliases and converter names in mixed-case.
- */
-U_CFUNC void
-ucnv_io_fillAvailableAliases(const char **aliases, UErrorCode *pErrorCode);
-
 /**
 * Get the name of the default converter.
 * This name is already resolved by <code>ucnv_io_getConverterName()</code>.
--- a/icu4c/source/common/unicode/ucnv.h
+++ b/icu4c/source/common/unicode/ucnv.h
@ -40,6 +40,7 @@ U_CDECL_BEGIN

 /* maximum length of the converter names */
 #define UCNV_MAX_CONVERTER_NAME_LENGTH 60
+/* maximum length of the converter name including path */
 #define UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH)

 #define  UCNV_SI 0x0F           /*Shift in for EBDCDIC_STATEFUL and iso2022 states */
@ -979,7 +980,6 @@ ucnv_getAvailableName (int32_t n);

 /**
 * Gives the number of aliases for a given converter or alias name.
- * Note that additional aliases are recognized by ucnv_open().
 * This method only enumerates the listed entries in the alias file.
 * @param alias alias name
 * @param pErrorCode error status
@ -991,7 +991,6 @@ ucnv_countAliases(const char *alias, UErrorCode *pErrorCode);

 /**
 * Gives the name of the alias at given index of alias list.
- * Note that additional aliases are recognized by ucnv_open().
 * This method only enumerates the listed entries in the alias file.
 * @param alias alias name
 * @param n index in alias list
@ -1005,7 +1004,6 @@ ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode);

 /**
 * Fill-up the list of alias names for the given alias.
- * Note that additional aliases are recognized by ucnv_open().
 * This method only enumerates the listed entries in the alias file.
 * @param alias alias name
 * @param aliases fill-in list, aliases is a pointer to an array of
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -379,8 +379,11 @@ enum UErrorCode {
    U_STATE_OLD_WARNING       = -125,   /**< ICU has to use compatibility layer to construct the service. Expect performance/memory usage degradation. Consider upgrading */

    U_STRING_NOT_TERMINATED_WARNING = -124,/**< An output string could not be NUL-terminated because output length==destCapacity. */
+
    U_SORT_KEY_TOO_SHORT_WARNING = -123,

+    U_AMBIGUOUS_ALIAS_WARNING = -122,
+
    U_ERROR_WARNING_LIMIT,                 /**< This must always be the last warning value to indicate the limit for UErrorCode warnings (last warning code +1) */
    
    /** @deprecated use the enum that ends in _WARNING */
@ -476,18 +479,18 @@ enum UErrorCode {
    /* 
     * the error code range 0x10200 0x10300 are reserved for Break Iterator related error
     */
-     U_BRK_ERROR_START=0x10200,
-     U_BRK_INTERNAL_ERROR,
-     U_BRK_HEX_DIGITS_EXPECTED,
-     U_BRK_SEMICOLON_EXPECTED,
-     U_BRK_RULE_SYNTAX,
-     U_BRK_UNCLOSED_SET,
-     U_BRK_ASSIGN_ERROR,
-     U_BRK_VARIABLE_REDFINITION,
-     U_BRK_MISMATCHED_PAREN,
-     U_BRK_NEW_LINE_IN_QUOTED_STRING,
-     U_BRK_UNDEFINED_VARIABLE,
-     U_BRK_ERROR_LIMIT,
+    U_BRK_ERROR_START=0x10200,
+    U_BRK_INTERNAL_ERROR,
+    U_BRK_HEX_DIGITS_EXPECTED,
+    U_BRK_SEMICOLON_EXPECTED,
+    U_BRK_RULE_SYNTAX,
+    U_BRK_UNCLOSED_SET,
+    U_BRK_ASSIGN_ERROR,
+    U_BRK_VARIABLE_REDFINITION,
+    U_BRK_MISMATCHED_PAREN,
+    U_BRK_NEW_LINE_IN_QUOTED_STRING,
+    U_BRK_UNDEFINED_VARIABLE,
+    U_BRK_ERROR_LIMIT,

    U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
 };
--- a/icu4c/source/data/makedata.mak
+++ b/icu4c/source/data/makedata.mak
@ -35,6 +35,8 @@ ICUOUT=$(ICUMAKE)\out
 #
 ICUP=$(ICUMAKE)\..\..
 ICUP=$(ICUP:\source\data\..\..=)
+# In case the substitution above didn't match, also try the doubled-backslash form of the path, which is what .NET produces.
+ICUP=$(ICUP:\source\data\\..\..=)
 !MESSAGE ICU root path is $(ICUP)


@ -238,14 +240,14 @@ BRK_FILES = "$(ICUBLD)\sent.brk" "$(ICUBLD)\char.brk" "$(ICUBLD)\line.brk" "$(IC
 #  move the .dll and .lib files to their final destination afterwards.
 #  The $(U_ICUDATA_NAME).lib and $(U_ICUDATA_NAME).exp should already be in the right place due to stubdata.
 #
-"$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll" : "$(ICUTOOLS)\pkgdata\$(CFG)\pkgdata.exe" $(CNV_FILES) $(BRK_FILES) "$(ICUBLD)\uprops.dat" "$(ICUBLD)\unames.dat" "$(ICUBLD)\unorm.dat" "$(ICUBLD)\cnvalias.dat" "$(ICUBLD)\tz.dat" "$(ICUBLD)\ucadata.dat" "$(ICUBLD)\invuca.dat" $(ALL_RES) "$(ICUBLD)\icudata.res" "$(ICUP)\source\stubdata\stubdatabuilt.txt"
+"$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll" : "$(ICUTOOLS)\pkgdata\$(CFG)\pkgdata.exe" $(CNV_FILES) $(BRK_FILES) "$(ICUBLD)\uprops.dat" "$(ICUBLD)\unames.dat" "$(ICUBLD)\unorm.dat" "$(ICUBLD)\cnvalias.icu" "$(ICUBLD)\tz.dat" "$(ICUBLD)\ucadata.dat" "$(ICUBLD)\invuca.dat" $(ALL_RES) "$(ICUBLD)\icudata.res" "$(ICUP)\source\stubdata\stubdatabuilt.txt"
 	@echo Building icu data
 	@cd "$(ICUBLD)"
 	"$(ICUTOOLS)\pkgdata\$(CFG)\pkgdata" -e $(U_ICUDATA_NAME) -v -m dll -c -p $(U_ICUDATA_NAME) -O "$(PKGOPT)" -d "$(ICUBLD)" -s . <<pkgdatain.txt
 uprops.dat
 unames.dat
 unorm.dat
-cnvalias.dat
+cnvalias.icu
 tz.dat
 ucadata.dat
 invuca.dat
@ -390,7 +392,7 @@ res_index {
 	@"$(ICUTOOLS)\gennorm\$(CFG)\gennorm" -u $(UNICODE_VERSION) -s "$(ICUUNIDATA)"

 # Targets for converters
-"$(ICUBLD)\cnvalias.dat" : {"$(ICUSRCDATA)\$(ICUUCM)"}\convrtrs.txt "$(ICUTOOLS)\gencnval\$(CFG)\gencnval.exe"
+"$(ICUBLD)\cnvalias.icu" : {"$(ICUSRCDATA)\$(ICUUCM)"}\convrtrs.txt "$(ICUTOOLS)\gencnval\$(CFG)\gencnval.exe"
 	@echo Creating data file for Converter Aliases
 	@set ICU_DATA=$(ICUBLD)
 	@"$(ICUTOOLS)\gencnval\$(CFG)\gencnval" "$(ICUSRCDATA)\$(ICUUCM)\convrtrs.txt"
--- a/icu4c/source/data/mappings/convrtrs.txt
+++ b/icu4c/source/data/mappings/convrtrs.txt
@ -11,6 +11,9 @@
 # run gencnval, and eventually pkgdata to update the representation that
 # ICU uses for aliases.

+# Please be friendly to the rest of us that edit this table by
+# keeping this table free of tabs.
+
 # This is an alias file used by the character set converter.
 #
 # Format:
@ -21,8 +24,8 @@
 # by whitespace.
 #
 # All names can be tagged by including a space-separated list of tags in
-# curly braces, as in ISO_8859-1:1987{IANA} iso-8859-1 { MIME } or
-# some-charset{MIME IANA}. The order of tags does not matter, and
+# curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or
+# some-charset{MIME* IANA*}. The order of tags does not matter, and
 # whitespace is allowed between the tagged name and the tags list.
 #
 # The tags can be used to get standard names using ucnv_getStandardName().
@ -31,6 +34,10 @@
 #
 # IANA          The IANA charset name, as documented in RFC 1700.
 # MIME          The MIME charset name, used for content type tagging. 
+#
+# The * after the standard tag denotes that the previous alias is the
+# preferred (default) charset name for that standard. There can only
+# be one of these default charset names per converter.

 # The world is getting more complicated...
 # Supporting XML parsers, HTML, MIME, and similar applications
@ -63,13 +70,17 @@
 # or names of algorithmic converters, and their case must not
 # be changed - or else code and/or file names must also be changed.

-# List of supported standard tags
-{   IANA MIME
+# This is the list of supported standard tags.
+# When multiple converters have the same alias under different standards,
+# the standard nearest to the top of this list with that alias will
+# be the first converter that will be opened.
+{   IANA            # Source: http://www.iana.org/assignments/character-sets
+    MIME            # Source: http://www.iana.org/assignments/character-sets
    #ICU             # Can also use ICU_FEATURE ICU_CANONICAL
    #IBM AIX DB2
    #WINDOWS MSIE    # MSIE is Internet Explorer, which is different from Windows
    #GLIBC
-    #JAVA
+    JAVA            # Source: Sun JDK.  Preferred name must be an exact match.  Alias name case is ignored, but dashes are not ignored.
    #SOLARIS
    #APPLE
    #HPUX
@ -80,20 +91,20 @@

 # Fully algorithmic converters

-UTF-8 { IANA MIME }             ibm-1208 cp1208 
+UTF-8 { IANA* MIME* }           ibm-1208 cp1208

 # The ICU 2.2 UTF-16/32 converters detect and write a BOM.
-UTF-16 { IANA MIME }            ISO-10646-UCS-2 { IANA } csUnicode ibm-17584 ibm-13488 ibm-1200 cp1200 ucs-2
-UTF-16BE { IANA MIME }          UTF16_BigEndian x-utf-16be
-UTF-16LE { IANA MIME }          UTF16_LittleEndian x-utf-16le
+UTF-16 { IANA* MIME* }          ISO-10646-UCS-2 { IANA } csUnicode ibm-17584 ibm-13488 ibm-1200 cp1200 ucs-2
+UTF-16BE { IANA* MIME* }        UTF16_BigEndian x-utf-16be
+UTF-16LE { IANA* MIME* }        UTF16_LittleEndian x-utf-16le

 # ICU-specific names for special uses
 UTF16_PlatformEndian
 UTF16_OppositeEndian

-UTF-32 { IANA MIME }            ISO-10646-UCS-4 { IANA } csUCS4 ucs-4 ibm-1232
-UTF-32BE { IANA }               UTF32_BigEndian
-UTF-32LE { IANA }               UTF32_LittleEndian
+UTF-32 { IANA* MIME* }          ISO-10646-UCS-4 { IANA } csUCS4 ucs-4 ibm-1232
+UTF-32BE { IANA* }              UTF32_BigEndian
+UTF-32LE { IANA* }              UTF32_LittleEndian

 # ICU-specific names for special uses
 UTF32_PlatformEndian
@ -108,31 +119,57 @@ UTF32_OppositeEndian
 # By choosing the option "version=1", set O will be escaped instead.
 # For example:
 #     utf7Converter=ucnv_open("UTF-7,version=1");
-UTF-7 { IANA MIME }
+UTF-7 { IANA* MIME* }

-SCSU { IANA }
+SCSU { IANA* }
 BOCU-1

 # See http://www.unicode.org/unicode/reports/tr26 for this Compatibility Encoding Scheme for UTF-16
 # The Unicode Consortium does not encourage the use of CESU-8
-CESU-8 { IANA }
+CESU-8 { IANA* }

-ISO-8859-1 { MIME }              LATIN_1 ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983   819 #!!!!! There's whole lot of names for this
-US-ASCII { MIME }                ascii ascii-7 ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6 cp367
+ISO-8859-1 { MIME* IANA }
+    LATIN_1     # Old ICU name
+    ibm-819
+    IBM819 { IANA }
+    cp819 { IANA }
+    latin1 { IANA }
+    8859-1
+    csISOLatin1 { IANA }
+    iso-ir-100 { IANA }
+    ISO_8859-1:1987 { IANA* }
+    l1 { IANA }
+    819
+    # ANSI_X3.110-1983  # This is for a different IANA alias.  This isn't iso-8859-1.
+
+US-ASCII { MIME* IANA }
+    ASCII { JAVA* IANA }
+    ascii-7 { JAVA }
+    ANSI_X3.4-1968 { IANA* }
+    ANSI_X3.4-1986 { IANA }
+    ISO_646.irv:1991 { IANA }
+    iso_646.irv:1983 { JAVA }
+    ISO646-US { JAVA IANA }
+    us { IANA }
+    csASCII { IANA }
+    646 { JAVA }
+    iso-ir-6 { IANA }
+    cp367 { IANA }
+    # Java says "default" too, but that makes no sense.

 # Partially algorithmic converters

-ISO_2022                         ISO-2022 { MIME } 2022 cp2022
-ISO_2022,locale=ja,version=0     ISO-2022-JP { IANA MIME } csISO2022JP
-ISO_2022,locale=ja,version=1     ISO-2022-JP-1 JIS JIS_Encoding { IANA }
-ISO_2022,locale=ja,version=2     ISO-2022-JP-2 { IANA MIME } csISO2022JP2
+ISO_2022                         ISO-2022 { MIME* } 2022 cp2022
+ISO_2022,locale=ja,version=0     ISO-2022-JP { IANA* MIME* } csISO2022JP
+ISO_2022,locale=ja,version=1     ISO-2022-JP-1 JIS JIS_Encoding { IANA* }
+ISO_2022,locale=ja,version=2     ISO-2022-JP-2 { IANA* MIME* } csISO2022JP2
 ISO_2022,locale=ja,version=3     JIS7 csJISEncoding
 ISO_2022,locale=ja,version=4     JIS8
-ISO_2022,locale=ko,version=0     ISO-2022-KR { IANA MIME } csISO2022KR
+ISO_2022,locale=ko,version=0     ISO-2022-KR { IANA* MIME* } csISO2022KR
 ISO_2022,locale=ko,version=1     ibm-25546 ibm-25546_P100 25546
-ISO_2022,locale=zh,version=0     ISO-2022-CN { IANA MIME } csISO2022CN
-ISO_2022,locale=zh,version=1     ISO-2022-CN-EXT { IANA MIME }
-HZ                               HZ-GB-2312 { IANA MIME }
+ISO_2022,locale=zh,version=0     ISO-2022-CN { IANA* MIME* } # csISO2022CN
+ISO_2022,locale=zh,version=1     ISO-2022-CN-EXT { IANA* MIME* }
+HZ                               HZ-GB-2312 { IANA* MIME* }
 LMBCS-1                          lmbcs
 LMBCS-2
 LMBCS-3
@ -155,82 +192,210 @@ ISCII,version=6           iscii-tlg x-iscii-te
 ISCII,version=7           iscii-knd x-iscii-ka
 ISCII,version=8           iscii-mlm x-iscii-ma

-# Table-based
+# Table-based interchange codepages

-ibm-367
+ibm-367                 IBM367 { IANA* }    # This is ASCII, but it has fallbacks
+
+# Central Europe
+# Standard iso-8859-1, which does not have the Euro update.
+# See iso-8859-15 (latin9) for the Euro update
+ibm-912                 iso-8859-2 { MIME* IANA }
+                        latin2 { IANA }
+                        # ISO8859_2 { JAVA* } # This is really the default for Java and many others.
+                        8859-2
+                        csISOLatin2 { IANA }
+                        iso-ir-101 { IANA }
+                        ISO_8859-2:1987 { IANA* }
+                        l2 { IANA }
+                        cp912
+                        912
+
+# Maltese Esperanto
+ibm-913                 iso-8859-3 { MIME* IANA }
+                        latin3 { IANA }
+                        8859-3
+                        csISOLatin3 { IANA }
+                        iso-ir-109
+                        ISO_8859-3:1988 { IANA* }
+                        l3 { IANA }
+                        cp913
+                        913
+
+# Baltic
+ibm-914                 iso-8859-4 { MIME* IANA }
+                        latin4 { IANA }
+                        8859-4
+                        csISOLatin4 { IANA }
+                        iso-ir-110 { IANA }
+                        ISO_8859-4:1988 { IANA* }
+                        l4 { IANA }
+                        cp914
+                        914
+
+# Cyrillic
+ibm-915                 iso-8859-5 { MIME* IANA }
+                        cyrillic { IANA }
+                        8859-5
+                        csISOLatinCyrillic { IANA }
+                        iso-ir-144 { IANA }
+                        ISO_8859-5:1988 { IANA* }
+                        cp915
+                        915
+
+# Arabic
+# ISO_8859-6-E and ISO_8859-6-I are similar to this charset, but they are not the same
+# -E means explicit. -I means implicit.  However those aliases are rarely used.
+ibm-1089                iso-8859-6 { MIME* IANA }
+                        arabic { IANA }
+                        8859-6
+                        csISOLatinArabic { IANA }
+                        iso-ir-127 { IANA }
+                        ISO_8859-6:1987 { IANA* }
+                        ecma-114 { IANA }
+                        asmo-708 { IANA }
+                        cp1089
+                        1089
+
+# ISO Greek (w/ euro update)
+ibm-4909                iso-8859-7 { MIME* IANA }
+                        greek { IANA }
+                        greek8 { IANA }
+                        elot_928 { IANA }
+                        ecma-118 { IANA }
+                        8859-7
+                        csISOLatinGreek { IANA }
+                        iso-ir-126 { IANA }
+                        ISO_8859-7:1987 { IANA* }
+                        cp813
+                        813
+ibm-813                 # Same as 4909 above but without the euro update
+
+# hebrew
+# ISO_8859-8-E and ISO_8859-8-I are similar to this charset, but they are not the same
+# -E means explicit. -I means implicit.
+ibm-916                 iso-8859-8 { MIME* IANA }
+                        hebrew { IANA }
+                        8859-8
+                        csISOLatinHebrew { IANA }
+                        iso-ir-138 { IANA }
+                        ISO_8859-8:1988 { IANA* }
+                        cp916
+                        916
+
+# Turkish
+ibm-920                 iso-8859-9 { MIME* IANA }
+                        ECMA-128    # IANA doesn't have this alias 6/24/2002
+                        latin5 { IANA }
+                        8859-9
+                        csISOLatin5 { IANA }
+                        iso-ir-148 { IANA }
+                        ISO_8859-9:1989 { IANA* }
+                        l5 { IANA }
+                        cp920
+                        920
+
+# Latin 9
+ibm-923                 iso-8859-15 { IANA* MIME* } # IANA only has iso-8859-15 (6/24/2002)
+                        # ISO8859_15 { JAVA* } # This is really the default for Java and many others.
+                        8859-15
+                        latin9
+                        latin0
+                        csisolatin0
+                        csisolatin9
+                        iso8859_15_fdis
+                        cp923
+                        923

-# Interchange codepages
-ibm-912                 iso-8859-2 { MIME } latin2 cp912 8859-2 csisolatin2     iso-ir-101  ISO_8859-2:1987 { IANA } l2 912 # Central Europe
-ibm-913                 iso-8859-3 { MIME } latin3 cp913 8859-3 csisolatin3     iso-ir-109  ISO_8859-3:1988 { IANA } l3 913 # Maltese Esperanto
-ibm-914                 iso-8859-4 { MIME } latin4 cp914 8859-4 csisolatin4     iso-ir-110  ISO_8859-4:1988 { IANA } l4 914 # Baltic
-ibm-915                 iso-8859-5 { MIME } cyrillic cp915 8859-5 csisolatincyrillic iso-ir-144 ISO_8859-5:1988 { IANA } 915 # Cyrillic
-ibm-1089                iso-8859-6 { MIME } arabic cp1089 8859-6 csisolatinarabic iso-ir-127 ISO_8859-6:1987 { IANA } ecma-114 asmo-708 1089   # Arabic
-ibm-4909                iso-8859-7 { MIME } greek cp813 greek8 elot_928 ecma-118 8859-7 csisolatingreek iso-ir-126 ISO_8859-7:1987 { IANA } 813 # ISO Greek (w/ euro update)
-ibm-813                 # Same as 4909 (w/o euro update)
-ibm-916                 iso-8859-8 { MIME } hebrew cp916 8859-8 csisolatinhebrew iso-ir-138 ISO_8859-8:1988 { IANA }   916 # hebrew iso-8859-8i - typo?
-ibm-920                 iso-8859-9 { MIME } ECMA-128 latin5 cp920 8859-9 csisolatin5 iso-ir-148 ISO_8859-9:1989 { IANA } l5 920 # Turkish
-ibm-923                 iso-8859-15 { IANA MIME } latin9 cp923 8859-15 latin0 csisolatin0 iso8859_15_fdis csisolatin9       923 # Latin 9
 ibm-1252                ibm-1004 cp1004      # Windows Latin 1 without Euro
 ibm-942_P120-2000       ibm-942_VASCII_VSUB_VPUA ibm-942 ibm-932 ibm-932_VASCII_VSUB_VPUA   # Old s_jis ibm-932 added!   
 ibm-942_P12A-2000       ibm-942_VSUB_VPUA shift_jis78 sjis78 ibm-932_VSUB_VPUA
 ibm-943_P130-2000       ibm-943_VASCII_VSUB_VPUA ibm-943 # japanese. Unicode name is \u30b7\u30d5\u30c8\u7b26\u53f7\u5316\u8868\u73fe Iana says that Windows-31J is an extension to csshiftjis ibm-932 removed
-ibm-943_P14A-2000       ibm-943_VSUB_VPUA Shift_JIS { MIME } csWindows31J sjis cp943 cp932 pck ms_kanji csshiftjis windows-31j  x-sjis 943
+ibm-943_P14A-2000       ibm-943_VSUB_VPUA Shift_JIS { MIME* } csWindows31J sjis cp943 cp932 pck ms_kanji csshiftjis windows-31j  x-sjis 943
 ibm-949_P110-2000       ibm-949_VASCII_VSUB_VPUA ibm-949
-ibm-949_P11A-2000       ibm-949_VSUB_VPUA KS_C_5601-1987 { IANA } iso-ir-149 KS_C_5601-1989 csKSC56011987 KSC_5601 { MIME } johab ks_x_1001:1992 949 ksc5601_1992  ksc5601_1987 # KSC-5601-1992, korean
-ibm-1370                Big5 { IANA MIME } csBig5 x-big5 cp950 950 # Taiwan Big-5 (w/ euro update)
+
+ibm-949_P11A-2000
+                        ibm-949_VSUB_VPUA
+                        KS_C_5601-1987 { IANA* }
+                        iso-ir-149 { IANA }
+                        KS_C_5601-1989 { IANA }
+                        csKSC56011987 { IANA }
+                        KSC_5601 { MIME* IANA }
+                        johab
+                        ks_x_1001:1992
+                        949
+                        korean { IANA }
+                        ksc5601_1992 # KSC-5601-1992
+                        ksc5601_1987 # Needed by Java
+
+ibm-1370                Big5 { IANA* MIME* } csBig5 x-big5 cp950 950 # Taiwan Big-5 (w/ euro update)
 ibm-950                 # Taiwan Big-5 (w/o euro update)
-ibm-1386                gbk { IANA } cp936 windows-936 ms936 zh_cn # Chinese GBK removed
+ibm-1386                gbk { IANA* } cp936 windows-936 ms936 zh_cn # Chinese GBK removed
 ibm-33722_P120-2000     ibm-33722_VASCII_VPUA ibm-33722 cp33722 33722 ibm-5050 # Japan EUC with \ <-> Yen mapping
-ibm-33722_P12A-2000     ibm-33722_VPUA EUC-JP { MIME } ibm-eucJP eucjis Extended_UNIX_Code_Packed_Format_for_Japanese { IANA } cseucpkdfmtjapanese X-EUC-JP # Japan EUC. x-euc-jp is a MIME name
-ibm-970                 EUC-KR { IANA MIME } ibm-eucKR csEUCKR  # Korean EUC. x-euc-kr is a MIME name
+ibm-33722_P12A-2000     ibm-33722_VPUA EUC-JP { MIME* } ibm-eucJP eucjis Extended_UNIX_Code_Packed_Format_for_Japanese { IANA* } cseucpkdfmtjapanese X-EUC-JP # Japan EUC. x-euc-jp is a MIME name
+ibm-970                 EUC-KR { IANA* MIME* } ibm-eucKR csEUCKR  # Korean EUC. x-euc-kr is a MIME name
 ibm-964                 EUC-TW ibm-eucTW cns11643               # Taiwan EUC. x-euc-tw is a MIME name
-ibm-1383_P110-2000      ibm-1383_VPUA ibm-1383 EUC-CN ibm-eucCN GB_2312-80 { IANA } chinese gb iso-ir-58 csISO58GB231280 GB2312 { MIME } gb2312-1980 cp1383 1383  csGB2312# China EUC. x-euc-cn is a MIME name
-ibm-1162                tis-620 { IANA } cp874 windows-874 ms874 cp9066 874 # Thai (w/ euro update)
+
+ibm-1383_P110-2000      ibm-1383_VPUA
+                ibm-1383
+                EUC-CN
+                ibm-eucCN
+                GB_2312-80 { IANA* }
+                chinese { IANA }
+                gb              # This is not an IANA name. gb in IANA means Great Britain.
+                iso-ir-58 { IANA }
+                csISO58GB231280 { IANA }
+                GB2312 { MIME* }
+                gb2312-1980
+                cp1383
+                1383
+                csGB2312    # China EUC. x-euc-cn is a MIME name
+
+ibm-1162                tis-620 { IANA* } cp874 windows-874 ms874 cp9066 874 # Thai (w/ euro update)
 ibm-874                 ibm-1161 # Same as 1162 (w/o euro update)

 # Platform codepages
 ibm-437                 cp437 csPC8CodePage437 437         # PC US
 # HSYS:
-ibm-850                 IBM850 { IANA } cp850 { MIME } 850 csPC850Multilingual    # PC latin1
-ibm-851                 IBM851 { IANA } cp851 { MIME } 851 csPC851           # PC DOS Greek (w/o euro)
-ibm-858                 cp858  { MIME } IBM00858 { IANA }       # PC latin1     with Euro cp850 removed
+ibm-850                 IBM850 { IANA* } cp850 { MIME* } 850 csPC850Multilingual    # PC latin1
+ibm-851                 IBM851 { IANA* } cp851 { MIME* } 851 csPC851           # PC DOS Greek (w/o euro)
+ibm-858                 cp858  { MIME* } IBM00858 { IANA* }       # PC latin1     with Euro cp850 removed
 ibm-9044                852 csPCp852 cp852  # PC latin2 (w/ euro update) cp852 is a MIME name for IBM-852
-ibm-852                 IBM852 { IANA }     # PC latin2 (w/o euro update)
+ibm-852                 IBM852 { IANA* }     # PC latin2 (w/o euro update)
 ibm-872                 855 csIBM855 cp855 csPCp855 # PC cyrillic (w/ euro update) cp855 is a MIME name for IBM-855
-ibm-855                 IBM855 { IANA }             # PC cyrillic (w/o euro update)
-ibm-856                 cp856 { MIME }  856 # PC Hebrew (old)
-ibm-9049                857 csIBM857 cp857 { MIME } # PC Latin 5 (Turkish) (w/ euro     update)
-ibm-857                 IBM857 { IANA }     # PC Latin 5 (w/o euro update)
-ibm-859                 cp859 { MIME }      # PC Latin     9 (w/ euro update)
-ibm-860                 IBM860 { IANA } cp860 { MIME } 860 csIBM860         # PC Portugal
-ibm-861                 IBM861 { IANA } cp861 { MIME } 861 cp-is csIBM861   # PC Iceland
-ibm-867                 cp867  862 cp862 { MIME } cspc862latinhebrew    # PC Hebrew (w/ euro update)
-ibm-862                 IBM862 { IANA }                                 # PC Hebrew     (w/o euro update)
-ibm-863                 IBM863 { IANA } cp863 { MIME } 863 csIBM863         # PC Canadian     French
-ibm-17248               cp864 { MIME } csIBM864 # PC Arabic (w/ euro update)
-ibm-864                 IBM864 { IANA }         # PC Arabic (w/o euro update)
-ibm-865                 IBM865 { IANA } cp865 { MIME } 865 csIBM865         # PC     Nordic
-ibm-808                 cp866 { MIME } 866 csIBM866 # PC Russian (w/ euro update)
+ibm-855                 IBM855 { IANA* }             # PC cyrillic (w/o euro update)
+ibm-856                 cp856 { MIME* }  856 # PC Hebrew (old)
+ibm-9049                857 csIBM857 cp857 { MIME* } # PC Latin 5 (Turkish) (w/ euro     update)
+ibm-857                 IBM857 { IANA* }     # PC Latin 5 (w/o euro update)
+ibm-859                 cp859 { MIME* }      # PC Latin     9 (w/ euro update)
+ibm-860                 IBM860 { IANA* } cp860 { MIME* } 860 csIBM860         # PC Portugal
+ibm-861                 IBM861 { IANA* } cp861 { MIME* } 861 cp-is csIBM861   # PC Iceland
+ibm-867                 cp867  862 cp862 { MIME* } cspc862latinhebrew    # PC Hebrew (w/ euro update)
+ibm-862                 IBM862 { IANA* }                                 # PC Hebrew     (w/o euro update)
+ibm-863                 IBM863 { IANA* } cp863 { MIME* } 863 csIBM863         # PC Canadian     French
+ibm-17248               cp864 { MIME* } csIBM864 # PC Arabic (w/ euro update)
+ibm-864                 IBM864 { IANA* }         # PC Arabic (w/o euro update)
+ibm-865                 IBM865 { IANA* } cp865 { MIME* } 865 csIBM865         # PC     Nordic
+ibm-808                 cp866 { MIME* } 866 csIBM866 # PC Russian (w/ euro update)
 ibm-866                                             # PC Russian (w/o euro update)
-ibm-868                 IBM868 { IANA } cp868 { MIME } cp-ar csIBM868 868   # PC Urdu
-ibm-9061                cp869 { MIME } 869 cp-gr csIBM869   # PC Greek (w/ euro update)
-ibm-869                 IBM869 { IANA }                     # PC Greek (w/o euro update)
-ibm-878                 KOI8-R { IANA MIME } cp878 koi8 cskoi8r  # Russian internet
-ibm-901                 cp921 { MIME } 921  # PC Baltic (w/ euro update)
+ibm-868                 IBM868 { IANA* } cp868 { MIME* } cp-ar csIBM868 868   # PC Urdu
+ibm-9061                cp869 { MIME* } 869 cp-gr csIBM869   # PC Greek (w/ euro update)
+ibm-869                 IBM869 { IANA* }                     # PC Greek (w/o euro update)
+ibm-878                 KOI8-R { IANA* MIME* } cp878 koi8 cskoi8r  # Russian internet
+ibm-901                 cp921 { MIME* } 921  # PC Baltic (w/ euro update)
 ibm-921                                     # PC Baltic (w/o euro update)
-ibm-902                 cp922 { MIME } 922  # PC Estonian (w/ euro update)
+ibm-902                 cp922 { MIME* } 922  # PC Estonian (w/ euro update)
 ibm-922                                     # PC Estonian (w/o euro update)
 #ibm-941                jis-208 jisx-208    # Pure DBCS jisx-208 # ibm-941 is not JISX 208 code page
 #ibm-1038               Adobe-Symbol-Encoding csHPPSMath symbol
-ibm-5346                windows-1250 { IANA  } cp1250       # Windows Latin2 (w/ euro update)
-ibm-5347                windows-1251 { IANA  } cp1251       # Windows Cyrillic (w/ euro update)
-ibm-5348                windows-1252 { IANA  } cp1252       # Windows Latin1 (w/ euro update)
-ibm-5349                windows-1253 { IANA  } cp1253       # Windows Greek (w/ euro update)
-ibm-5350                windows-1254 { IANA  } cp1254       # Windows Turkish (w/ euro update)
-ibm-5351                windows-1255 { IANA  } cp1255       # Windows Hebrew (w/ euro update)
-ibm-5352                windows-1256 { IANA  } cp1256       # Windows Arabic (w/ euro update)
-ibm-5353                windows-1257 { IANA  } cp1257       # Windows Baltic (w/ euro update)
-ibm-5354                windows-1258 { IANA  } cp1258       # Windows Vietnamese (w/ euro update)
+ibm-5346                windows-1250 { IANA*  } cp1250       # Windows Latin2 (w/ euro update)
+ibm-5347                windows-1251 { IANA*  } cp1251       # Windows Cyrillic (w/ euro update)
+ibm-5348                windows-1252 { IANA*  } cp1252       # Windows Latin1 (w/ euro update)
+ibm-5349                windows-1253 { IANA*  } cp1253       # Windows Greek (w/ euro update)
+ibm-5350                windows-1254 { IANA*  } cp1254       # Windows Turkish (w/ euro update)
+ibm-5351                windows-1255 { IANA*  } cp1255       # Windows Hebrew (w/ euro update)
+ibm-5352                windows-1256 { IANA*  } cp1256       # Windows Arabic (w/ euro update)
+ibm-5353                windows-1257 { IANA*  } cp1257       # Windows Baltic (w/ euro update)
+ibm-5354                windows-1258 { IANA*  } cp1258       # Windows Vietnamese (w/ euro update)
 ibm-1250                # Windows Latin2 (w/o euro update)
 ibm-1251                # Windows Cyrillic (w/o euro update)
 ibm-1253                # Windows Greek (w/o euro update)
@ -240,15 +405,15 @@ ibm-1256                # Windows Arabic (w/o euro update)
 ibm-1257                # Windows Baltic (w/o euro update)
 ibm-1258                # Windows Vietnamese (w/o euro update)

-ibm-1275                macintosh { IANA } mac { MIME } csMacintosh # Apple latin 1
-ibm-1276                Adobe-Standard-Encoding { IANA } csAdobeStandardEncoding # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276)
+ibm-1275                macintosh { IANA* } mac { MIME* } csMacintosh # Apple latin 1
+ibm-1276                Adobe-Standard-Encoding { IANA* } csAdobeStandardEncoding # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276)
 ibm-1277                Adobe-Latin1-Encoding
 ibm-1280                macgr                   # Apple Greek
 ibm-1281                mactr                   # Apple Turkish
 ibm-1282                macce                   # Apple Central Europe
 ibm-1283                maccy                   # Apple Cyrillic

-ibm-1051                hp-roman8 { IANA } roman8 r8 csHPRoman8  # HP Latin1
+ibm-1051                hp-roman8 { IANA* } roman8 r8 csHPRoman8  # HP Latin1

 ibm-806_P100-2000       ibm-806 ibm-806_VSUB    # PC ISCII-91: Indian Script Code
 ibm-1006_P100-2000      ibm-1006 ibm-1006_VPUA  # Urdu
@ -265,120 +430,127 @@ ibm-9066_P100-2000      ibm-9066 ibm-9066_VSUB  # Thai PC

 # Added for more euro support

-ibm-849                cp1131          # PC     Belarus (w/ euro update)
-ibm-848                cp1125          # PC     Ukraine (w/ euro update)
-ibm-5104               cp1008          # 8-bit Arabic (w/ euro update)
-ibm-9238               cp1046          # PC     Arabic Extended (w/     euro update)
-ibm-1363_P110-2000     ibm-1363 ibm-1363_VASCII_VSUB_VPUA ibm-1362 # Korean KSC Korean Windows MBCS 
-ibm-1363_P11B-2000     ibm-1363_VSUB_VPUA windows-949 cp949 cp1363 ksc korean 
-ibm-5210               cp1114          # PC     SBCS Big-5 (w/ euro     update)
-ibm-21427              cp947           # PC     DBCS Big-5 (w/ euro     update)
+ibm-849                 cp1131          # PC     Belarus (w/ euro update)
+ibm-848                 cp1125          # PC     Ukraine (w/ euro update)
+ibm-5104                cp1008          # 8-bit Arabic (w/ euro update)
+ibm-9238                cp1046          # PC     Arabic Extended (w/     euro update)
+ibm-1363_P110-2000      ibm-1363 ibm-1363_VASCII_VSUB_VPUA ibm-1362 # Korean KSC Korean Windows MBCS 
+
+ibm-1363_P11B-2000      ibm-1363_VSUB_VPUA
+                        windows-949
+                        cp949
+                        cp1363
+                        ksc
+                        # korean # The korean alias from IANA goes to ibm-949_P11A-2000
+
+ibm-5210                cp1114          # PC     SBCS Big-5 (w/ euro     update)
+ibm-21427               cp947           # PC     DBCS Big-5 (w/ euro     update)

 # EBCDIC codepages according to the CDRA

 # without Euro
-ibm-37                 IBM037 { IANA } ibm-037 cpibm37 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 cp37 cp037 037 # EBCDIC US
-ibm-273                IBM273 { IANA } csIBM273 ebcdic-de cp273 cpibm273 273 # EBCDIC Germanay, Austria...
-ibm-277                IBM277 { IANA } EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 ebcdic-dk cp277 cpibm277 277 # EBCDIC Denmark...
-ibm-278                IBM278 { IANA } ebcdic-cp-fi ebcdic-cp-se csIBM278 ebcdic-sv cp278 cpibm278 278 # EBCDIC Sweden
-ibm-280                IBM280 { IANA } ebcdic-cp-it csIBM280 cp280 cpibm280 280 # EBCDIC Italy
-ibm-284                IBM284 { IANA } ebcdic-cp-es csIBM284 cp284 cpibm284 284 # EBCDIC Spain
-ibm-285                IBM285 { IANA } ebcdic-cp-gb csIBM285 ebcdic-gb cp285 cpibm285 285 # EBCDIC UK Ireland
-ibm-290                IBM290 { IANA } EBCDIC-JP-kana csIBM290 cp290 # host SBCS (Katakana)
-ibm-297                IBM297 { IANA } ebcdic-cp-fr csIBM297 cp297 cpibm297 297 # EBCDIC France
-ibm-420                IBM420 { IANA } ebcdic-cp-ar1 csIBM420 cp420 420
-ibm-424                IBM424 { IANA } ebcdic-cp-he csIBM424 cp424 424
-ibm-500                IBM500 { IANA } cpibm500 csIBM500 cp500 ebcdic-cp-be ebcdic-cp-ch 500 # EBCDIC International Latin1
-ibm-803                cp803                # Old EBCDIC Hebrew
-ibm-834                cp834                # Korean DBCS Host
-ibm-835                cp835                # DBCS T-Ch Host
-ibm-870_P100-2000      IBM870 { IANA } ibm-870 CP870 ibm-870_STD ebcdic-cp-roece ebcdic-cp-yu csIBM870
-ibm-871                IBM871 { IANA } ebcdic-cp-is csIBM871 cpibm871 cp871 871 # EBCDIC Iceland
-ibm-875_P100-2000      ibm-875 cp875 ibm-875 875 ibm-875_STD 
-ibm-918_P100-2000      IBM918 { IANA } ibm-918 CP918 ibm-918_VPUA ebcdic-cp-ar2 csIBM918 
-ibm-918_X100-2000      ibm-918_STD
-ibm-930                cp930 cpibm930 930   # Japan EBCDIC MIXED
-ibm-933                cp933 cpibm933 933   # Korea EBCDIC MIXED 
-ibm-935                cp935 cpibm935 935   # China EBCDIC MIXED 
-ibm-937                cp937 cpibm937 937   # Taiwan EBCDIC MIXED
-ibm-939                cp939 939            # Host MBCS (Latin-Kanji) EBCDIC
-ibm-1025_P100-2000     ibm-1025 ibm-1025_STD
-ibm-1026_P100-2000     IBM1026 { IANA } ibm-1026 CP1026 csIBM1026 ibm-1026_STD
-ibm-1047               cpibm1047            # EBCDIC Open systems Latin1
-ibm-1097_P100-2000     ibm-1097 ibm-1097_VPUA
-ibm-1097_X100-2000     ibm-1097_STD 
-ibm-1112_P100-2000     ibm-1112 cp1112 1112  ibm-1112_STD
-ibm-1122_P100-2000     ibm-1122 cp1122 ibm-1122 1122 ibm-1122_STD
-ibm-1130_P100-2000     ibm-1130 ibm-1130_STD
-ibm-1132_P100-2000     ibm-1132 ibm-1132_STD
-ibm-1137_P100-2000     ibm-1137 ibm-1137_STD
-ibm-1388_P103-2001     ibm-1388             # S-Ch DBCS-Host Data GBK mixed MBCS
-ibm-9030_P100-2000     ibm-9030 ibm-9030_STD
+ibm-37                  IBM037 { IANA* } ibm-037 cpibm37 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 cp37 cp037 037 # EBCDIC US
+ibm-273                 IBM273 { IANA* } csIBM273 ebcdic-de cp273 cpibm273 273 # EBCDIC Germany, Austria...
+ibm-277                 IBM277 { IANA* } EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 ebcdic-dk cp277 cpibm277 277 # EBCDIC Denmark...
+ibm-278                 IBM278 { IANA* } ebcdic-cp-fi ebcdic-cp-se csIBM278 ebcdic-sv cp278 cpibm278 278 # EBCDIC Sweden
+ibm-280                 IBM280 { IANA* } ebcdic-cp-it csIBM280 cp280 cpibm280 280 # EBCDIC Italy
+ibm-284                 IBM284 { IANA* } ebcdic-cp-es csIBM284 cp284 cpibm284 284 # EBCDIC Spain
+ibm-285                 IBM285 { IANA* } ebcdic-cp-gb csIBM285 ebcdic-gb cp285 cpibm285 285 # EBCDIC UK Ireland
+ibm-290                 IBM290 { IANA* } EBCDIC-JP-kana csIBM290 cp290 # host SBCS (Katakana)
+ibm-297                 IBM297 { IANA* } ebcdic-cp-fr csIBM297 cp297 cpibm297 297 # EBCDIC France
+ibm-420                 IBM420 { IANA* } ebcdic-cp-ar1 csIBM420 cp420 420
+ibm-424                 IBM424 { IANA* } ebcdic-cp-he csIBM424 cp424 424
+ibm-500                 IBM500 { IANA* } cpibm500 csIBM500 cp500 ebcdic-cp-be ebcdic-cp-ch 500 # EBCDIC International Latin1
+ibm-803                 cp803                # Old EBCDIC Hebrew
+ibm-834                 cp834                # Korean DBCS Host
+ibm-835                 cp835                # DBCS T-Ch Host
+ibm-870_P100-2000       IBM870 { IANA* } ibm-870 CP870 ibm-870_STD ebcdic-cp-roece ebcdic-cp-yu csIBM870
+ibm-871                 IBM871 { IANA* } ebcdic-cp-is csIBM871 cpibm871 cp871 871 # EBCDIC Iceland
+ibm-875_P100-2000       ibm-875 cp875 875 ibm-875_STD 
+ibm-918_P100-2000       IBM918 { IANA* } ibm-918 CP918 ibm-918_VPUA ebcdic-cp-ar2 csIBM918 
+ibm-918_X100-2000       ibm-918_STD
+ibm-930                 cp930 cpibm930 930   # Japan EBCDIC MIXED
+ibm-933                 cp933 cpibm933 933   # Korea EBCDIC MIXED 
+ibm-935                 cp935 cpibm935 935   # China EBCDIC MIXED 
+ibm-937                 cp937 cpibm937 937   # Taiwan EBCDIC MIXED
+ibm-939                 cp939 939            # Host MBCS (Latin-Kanji) EBCDIC
+ibm-1025_P100-2000      ibm-1025 ibm-1025_STD
+ibm-1026_P100-2000      IBM1026 { IANA* } ibm-1026 CP1026 csIBM1026 ibm-1026_STD
+ibm-1047                cpibm1047            # EBCDIC Open systems Latin1
+ibm-1097_P100-2000      ibm-1097 ibm-1097_VPUA
+ibm-1097_X100-2000      ibm-1097_STD 
+ibm-1112_P100-2000      ibm-1112 cp1112 1112 ibm-1112_STD
+ibm-1122_P100-2000      ibm-1122 cp1122 1122 ibm-1122_STD
+ibm-1130_P100-2000      ibm-1130 ibm-1130_STD
+ibm-1132_P100-2000      ibm-1132 ibm-1132_STD
+ibm-1137_P100-2000      ibm-1137 ibm-1137_STD
+ibm-1388_P103-2001      ibm-1388             # S-Ch DBCS-Host Data GBK mixed MBCS
+ibm-9030_P100-2000      ibm-9030 ibm-9030_STD

-#ibm-1046              # PC     Arabic without EURO
+#ibm-1046               # PC     Arabic without EURO
 # with Euro
-ibm-1123               cpibm1123            # EBCDIC Cyrillic Ukraine
-ibm-1140               cpibm1140 IBM01140 { IANA } # EBCDIC US...
-ibm-1141               cpibm1141 IBM01141 { IANA } # EBCDIC Germanay,   Austria...
-ibm-1142               cpibm1142 IBM01142 { IANA } # EBCDIC Denmark...
-ibm-1143               cpibm1143 IBM01143 { IANA } # EBCDIC Sweden
-ibm-1144               cpibm1144            # EBCDIC Italy
-ibm-1145               cpibm1145            # EBCDIC Spain
-ibm-1146               cpibm1146            # EBCDIC UK Ireland
-ibm-1147               cpibm1147            # EBCDIC France
-ibm-1148               cpibm1148            # EBCDIC International Latin1
-ibm-1149               cpibm1149 ebcdic-is  # EBCDIC Iceland
-ibm-1153               cpibm1153            # EBCDIC latin 2
-ibm-1154               cp1025 cpibm1154     # EBCDIC Cyrillic Multilingual
-ibm-1155               cpibm1155            # EBCDIC Turkey
-ibm-1156               cpibm1156            # EBCDIC Baltic Multilingual
-ibm-1157               cpibm1157            # EBCDIC Estonia
-ibm-1158               cp1123 cpibm1158 1123    # EBCDIC Cyrillic Ukraine
-ibm-1159               cp28709              # SBCS T-Ch Host
-ibm-1160               cp9030 cpibm1160     # EBCDIC Thailand
-ibm-1164               cp1130 cpibm1164     # EBCDIC Viet Nam
-ibm-1364_P110-2000     ibm-1364_VPUA ibm-1364 cp1364    # Korean Host Mixed
-ibm-1371               cpibm1371            # Taiwan EBCDIC MIXED
-ibm-1390               cpibm1390            # Japan EBCDIC MIXED
-ibm-1399                                    # Host MBCS (Latin-Kanji)
-ibm-4899               cpibm4899            # Old EBCDIC Hebrew
-ibm-4971               cpibm4971            # EBCDIC Greek
-ibm-5123               cp1027               # Host Roman Jis
-ibm-8482                                    # host SBCS (Katakana)
-ibm-9027                                    # DBCS T-Ch Host
-ibm-12712              cpibm12712 ebcdic-he # EBCDIC Hebrew (new sheqel, control charaters update)
-ibm-16684              cp300                # Jis + Roman Jis Host
-ibm-16804              cpibm16804 ebcdic-ar # EBCDIC Arabic
+ibm-1123                cpibm1123            # EBCDIC Cyrillic Ukraine
+ibm-1140                cpibm1140 IBM01140 { IANA* } # EBCDIC US...
+ibm-1141                cpibm1141 IBM01141 { IANA* } # EBCDIC Germany,   Austria...
+ibm-1142                cpibm1142 IBM01142 { IANA* } # EBCDIC Denmark...
+ibm-1143                cpibm1143 IBM01143 { IANA* } # EBCDIC Sweden
+ibm-1144                cpibm1144            # EBCDIC Italy
+ibm-1145                cpibm1145            # EBCDIC Spain
+ibm-1146                cpibm1146            # EBCDIC UK Ireland
+ibm-1147                cpibm1147            # EBCDIC France
+ibm-1148                cpibm1148            # EBCDIC International Latin1
+ibm-1149                cpibm1149 ebcdic-is  # EBCDIC Iceland
+ibm-1153                cpibm1153            # EBCDIC latin 2
+ibm-1154                cp1025 cpibm1154     # EBCDIC Cyrillic Multilingual
+ibm-1155                cpibm1155            # EBCDIC Turkey
+ibm-1156                cpibm1156            # EBCDIC Baltic Multilingual
+ibm-1157                cpibm1157            # EBCDIC Estonia
+ibm-1158                cp1123 cpibm1158 1123    # EBCDIC Cyrillic Ukraine
+ibm-1159                cp28709              # SBCS T-Ch Host
+ibm-1160                cp9030 cpibm1160     # EBCDIC Thailand
+ibm-1164                cp1130 cpibm1164     # EBCDIC Viet Nam
+ibm-1364_P110-2000      ibm-1364_VPUA ibm-1364 cp1364    # Korean Host Mixed
+ibm-1371                cpibm1371            # Taiwan EBCDIC MIXED
+ibm-1390                cpibm1390            # Japan EBCDIC MIXED
+ibm-1399                                     # Host MBCS (Latin-Kanji)
+ibm-4899                cpibm4899            # Old EBCDIC Hebrew
+ibm-4971                cpibm4971            # EBCDIC Greek
+ibm-5123                cp1027               # Host Roman Jis
+ibm-8482                                     # host SBCS (Katakana)
+ibm-9027                                     # DBCS T-Ch Host
+ibm-12712               cpibm12712 ebcdic-he # EBCDIC Hebrew (new sheqel, control characters update)
+ibm-16684               cp300                # Jis + Roman Jis Host
+ibm-16804               cpibm16804 ebcdic-ar # EBCDIC Arabic

 # unsupported IANA names
 # ebcdic-it csEBCDICIT
 # ebcdic-es csEBCDICES
 # csEBCDICFR ebcdic-fr
-# ibm-274                IBM274 { IANA } cp274 csIBM274 ebcdic-be
-# ibm-870 IBM870 { IANA } ebcdic-cp-roece ebcdic-cp-yu csIBM870 cp870 870
+# ibm-274                IBM274 { IANA* } cp274 csIBM274 ebcdic-be
+# ibm-870 IBM870 { IANA* } ebcdic-cp-roece ebcdic-cp-yu csIBM870 cp870 870

 # EBCDIC codepages for S/390, with LF and NL codes swapped

 ebcdic-xml-us

 # without Euro
-ibm-37-s390            ibm037-s390   # EBCDIC US
-ibm-1047-s390          # EBCDIC for S/390 Open Edition
+ibm-37-s390             ibm037-s390   # EBCDIC US
+ibm-1047-s390           # EBCDIC for S/390 Open Edition

 # with Euro 
-ibm-1140-s390          # EBCDIC US
-ibm-1142-s390          # EBCDIC Denmark
-ibm-1143-s390          # EBCDIC Sweden
-ibm-1144-s390          # EBCDIC Italy
-ibm-1145-s390          # EBCDIC Spain
-ibm-1146-s390          # EBCDIC UK Ireland
-ibm-1147-s390          # EBCDIC France
-ibm-1148-s390          # EBCDIC International Latin1
-ibm-1149-s390          # EBCDIC Iceland
-ibm-1153-s390          # EBCDIC latin 2
-ibm-12712-s390         # EBCDIC Hebrew
-ibm-16804-s390         # EBCDIC Arabic
+ibm-1140-s390           # EBCDIC US
+ibm-1142-s390           # EBCDIC Denmark
+ibm-1143-s390           # EBCDIC Sweden
+ibm-1144-s390           # EBCDIC Italy
+ibm-1145-s390           # EBCDIC Spain
+ibm-1146-s390           # EBCDIC UK Ireland
+ibm-1147-s390           # EBCDIC France
+ibm-1148-s390           # EBCDIC International Latin1
+ibm-1149-s390           # EBCDIC Iceland
+ibm-1153-s390           # EBCDIC latin 2
+ibm-12712-s390          # EBCDIC Hebrew
+ibm-16804-s390          # EBCDIC Arabic

 # GB 18030 is partly algorithmic, using the MBCS converter
-gb18030                { IANA } ibm-1392
+gb18030 { IANA* }       ibm-1392

--- a/icu4c/source/test/cintltst/ccapitst.c
+++ b/icu4c/source/test/cintltst/ccapitst.c
@ -240,6 +240,8 @@ static void TestConvert()
     /*Testing ucnv_openU()*/
    {
        UChar converterName[]={ 0x0069, 0x0062, 0x006d, 0x002d, 0x0039, 0x0034, 0x0033, 0x0000}; /*ibm-943*/
+        UChar firstSortedName[]={ 0x0021, 0x0000}; /* ! */
+        UChar lastSortedName[]={ 0x007E, 0x0000}; /* ~ */
        const char *illegalNameChars={ "ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943 ibm-943"};
        UChar illegalName[100];
        UConverter *converter=NULL;
@ -269,8 +271,20 @@ static void TestConvert()
        if(!(err==U_ILLEGAL_ARGUMENT_ERROR)){
            log_err("FAILURE! ucnv_openU(illegalName, err) is expected to fail\n");
        }
+
+        err=U_ZERO_ERROR;
+        ucnv_openU(firstSortedName, &err);
+        if(err!=U_FILE_ACCESS_ERROR){
+            log_err("FAILURE! ucnv_openU(firstSortedName, err) is expected to fail\n");
+        }
+
+        err=U_ZERO_ERROR;
+        ucnv_openU(lastSortedName, &err);
+        if(err!=U_FILE_ACCESS_ERROR){
+            log_err("FAILURE! ucnv_openU(lastSortedName, err) is expected to fail\n");
+        }
+
        err=U_ZERO_ERROR;
-      
    }
    log_verbose("Testing ucnv_open() with converter name greater than 7 characters\n");
    {
@ -455,6 +469,11 @@ static void TestConvert()
        char* index = NULL;
        strcpy(ucs_file_name, loadTestData(&err));
        
+        if(U_FAILURE(err)){
+            log_err("Couldn't get the test data directory... Exiting...Error:%s\n", u_errorName(err));
+            return;
+        }
+
        index=strrchr(ucs_file_name,(char)U_FILE_SEP_CHAR);

        if((unsigned int)(index-ucs_file_name) != (strlen(ucs_file_name)-1)){
@ -462,11 +481,6 @@ static void TestConvert()
        }
        
        strcat(ucs_file_name,".."U_FILE_SEP_STRING);
-        
-        if(U_FAILURE(err)){
-            log_err("Couldn't get the test data directory... Exiting...Error:%s\n", u_errorName(err));
-            return;
-        }
        strcat(ucs_file_name, CodePagesTestFiles[codepage_index]);

        ucs_file_in = fopen(ucs_file_name,"rb");
--- a/icu4c/source/test/cintltst/udatatst.c
+++ b/icu4c/source/test/cintltst/udatatst.c
@ -67,7 +67,7 @@ static void TestUDataOpen(){
    UErrorCode status=U_ZERO_ERROR;
    const char* memMap[][2]={
        {"tz", "dat"},
-        {"cnvalias", "dat"},
+        {"cnvalias", "icu"},
        {"unames",   "dat"},
        {"ibm-1141", "cnv"}
    };
@ -388,7 +388,7 @@ isAcceptable1(void *context,
        pInfo->dataFormat[1]==0x76 &&
        pInfo->dataFormat[2]==0x41 &&
        pInfo->dataFormat[3]==0x6c &&
-        pInfo->formatVersion[0]==2 )
+        pInfo->formatVersion[0]==3 )
    {
        log_verbose("The data from \"%s.%s\" IS acceptable using the verifing function isAcceptable1()\n", name, type);
        return TRUE;
@ -473,7 +473,7 @@ static void TestUDataOpenChoiceDemo1() {

    strcat(strcpy(testPath, u_getDataDirectory()), "testdata");

-    result=udata_openChoice(NULL, type, name[0], isAcceptable1, NULL, &status);
+    result=udata_openChoice(NULL, "icu", name[0], isAcceptable1, NULL, &status);
    if(U_FAILURE(status)){
        log_err("FAIL: udata_openChoice() failed name=%s, type=%s, \n errorcode=%s\n", name[0], type, myErrorName(status));
    } else {
@ -624,7 +624,7 @@ static void TestUDataGetInfo() {


    log_verbose("Testing udata_getInfo() for cnvalias.dat\n");
-    result=udata_open(NULL, type, name, &status);
+    result=udata_open(NULL, "icu", name, &status);
    if(U_FAILURE(status)){
        log_err("FAIL: udata_open() failed for path = NULL, name=%s, type=%s, \n errorcode=%s\n", path, name, type, myErrorName(status));
        return;
@ -677,32 +677,34 @@ static void TestUDataGetInfo() {
 static void TestUDataGetMemory() {

    UDataMemory *result;
-    const uint16_t *table=NULL;
+    const int32_t *table=NULL;
    uint16_t* intValue=0;
    UErrorCode status=U_ZERO_ERROR;
    const char* name="cnvalias";
-    const char* type="dat";
+    const char* type;

    const char* name2="test";

-   char* testPath=(char*)malloc(sizeof(char) * (strlen(u_getDataDirectory()) + strlen("testdata") +1 ) );
+    char* testPath=(char*)malloc(sizeof(char) * (strlen(u_getDataDirectory()) + strlen("testdata") +1 ) );

-   strcat(strcpy(testPath, u_getDataDirectory()), "testdata");
+    strcat(strcpy(testPath, u_getDataDirectory()), "testdata");

+    type="icu";
    log_verbose("Testing udata_getMemory for \"cnvalias.dat()\"\n");
    result=udata_openChoice(NULL, type, name, isAcceptable1, NULL, &status);
    if(U_FAILURE(status)){
        log_err("FAIL: udata_openChoice() failed for name=%s, type=%s, \n errorcode=%s\n", name, type, myErrorName(status));
        return;
    }
-    table=(const uint16_t *)udata_getMemory(result);
+    table=(const uint32_t *)udata_getMemory(result);

    /* The alias table may list more converters than what's actually available now. [grhoten] */
-    if(ucnv_countAvailable() > table[1+2*(*table)])      /*???*/
+    if(ucnv_countAvailable() > table[1])      /*???*/
        log_err("FAIL: udata_getMemory() failed ucnv_countAvailable returned = %d, expected = %d\n", ucnv_countAvailable(), table[1+2*(*table)]);

    udata_close(result);

+    type="dat";
    log_verbose("Testing udata_getMemory for \"test.dat\"()\n");
    result=udata_openChoice(testPath, type, name2, isAcceptable3, NULL, &status);
    if(U_FAILURE(status)){
--- a/icu4c/source/tools/gencnval/gencnval.c
+++ b/icu4c/source/tools/gencnval/gencnval.c