ICU-3748 handle POSIX names in canonicalization, etc.

X-SVN-Rev: 15267
2004-05-12 00:11:26 +00:00 · 2004-05-12 00:11:26 +00:00 · f2732cc03c
commit f2732cc03c
parent 2dc9492004
2 changed files with 515 additions and 268 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/util/ULocaleTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/util/ULocaleTest.java
@ -500,95 +500,107 @@ public class ULocaleTest extends TestFmwk {

        }
    }
-    String [][] testData = new String[][]{
-        {"sv", "", "FI", "AL", "sv-fi-al", "sv_FI_AL" },
-        {"en", "", "GB", "", "en-gb", "en_GB" },
-        {"i-hakka", "", "MT", "XEMXIJA", "i-hakka_MT_XEMXIJA", "i-hakka_MT_XEMXIJA"},
-        {"i-hakka", "", "CN", "", "i-hakka_CN", "i-hakka_CN"},
-        {"i-hakka", "", "MX", "", "I-hakka_MX", "i-hakka_MX"},
-        {"x-klingon", "", "US", "SANJOSE", "X-KLINGON_us_SANJOSE", "x-klingon_US_SANJOSE"},
+
+  public void TestPrefixes() {
+    // POSIX ids are no longer handled by getName, so POSIX cases are removed
+    final String [][] testData = new String[][]{
+      {"sv", "", "FI", "AL", "sv-fi-al", "sv_FI_AL" },
+      {"en", "", "GB", "", "en-gb", "en_GB" },
+      {"i-hakka", "", "MT", "XEMXIJA", "i-hakka_MT_XEMXIJA", "i-hakka_MT_XEMXIJA"},
+      {"i-hakka", "", "CN", "", "i-hakka_CN", "i-hakka_CN"},
+      {"i-hakka", "", "MX", "", "I-hakka_MX", "i-hakka_MX"},
+      {"x-klingon", "", "US", "SANJOSE", "X-KLINGON_us_SANJOSE", "x-klingon_US_SANJOSE"},
        
-        {"mr", "", "", "", "mr.utf8", "mr"},
-        {"de", "", "TV", "", "de-tv.koi8r", "de_TV"},
-        {"x-piglatin", "", "ML", "", "x-piglatin_ML.MBE", "x-piglatin_ML"},  /* Multibyte English */
-        {"i-cherokee", "","US", "", "i-Cherokee_US.utf7", "i-cherokee_US"},
-        {"x-filfli", "", "MT", "FILFLA", "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA"},
-        {"no", "", "NO", "NY", "no-no-ny.utf32@B", "no_NO_NY"}, /* @ ignored unless variant is empty */
-// ULocale canonicalizes, so this does have a variant
-//      {"no", "", "NO", "",  "no-no.utf32@B", "no_NO_B" },
-        {"no", "", "NO", "B",  "no-no.utf32@B", "no_NO_B" },
-        {"no", "", "",   "NY", "no__ny", "no__NY" },
-// ULocale canonicalizes, so this does have a variant
-//      {"no", "", "",   "", "no@ny", "no__NY" },
-        {"no", "", "",   "NY", "no@ny", "no__NY" },
-        {"el", "Latn", "", "", "el-latn", "el_Latn" },
-        {"en", "Cyrl", "RU", "", "en-cyrl-ru", "en_Cyrl_RU" },
-        {"zh", "Hant", "TW", "STROKE", "zh-hant_TW_STROKE", "zh_Hant_TW_STROKE" },
-        {"qq", "Qqqq", "QQ", "QQ", "qq_Qqqq_QQ_QQ", "qq_Qqqq_QQ_QQ" },
-        {"qq", "Qqqq", "", "QQ", "qq_Qqqq__QQ", "qq_Qqqq__QQ" },
-        {"12", "3456", "78", "90", "12_3456_78_90", "12_3456_78_90" }, /* total garbage */
-        
-        { "","","","",""}
+      //      {"mr", "", "", "", "mr.utf8", "mr"},
+      //      {"de", "", "TV", "", "de-tv.koi8r", "de_TV"},
+      //      {"x-piglatin", "", "ML", "", "x-piglatin_ML.MBE", "x-piglatin_ML"},  /* Multibyte English */
+      //      {"i-cherokee", "","US", "", "i-Cherokee_US.utf7", "i-cherokee_US"},
+      //      {"x-filfli", "", "MT", "FILFLA", "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA"},
+      //      {"no", "", "NO", "NY", "no-no-ny.utf32@B", "no_NO_NY"}, /* @ ignored unless variant is empty */
+      // ULocale handles POSIX, so this does have a variant
+      //      {"no", "", "NO", "",  "no-no.utf32@B", "no_NO_B" },
+      // no we don't handle POSIX by default anymore
+      //      {"no", "", "NO", "B",  "no-no.utf32@B", "no_NO_B" },
+      {"no", "", "",   "NY", "no__ny", "no__NY" },
+      // ULocale handles POSIX, so this does have a variant
+      //      {"no", "", "",   "", "no@ny", "no__NY" },
+      // no we don't handle POSIX by default anymore
+      //      {"no", "", "",   "NY", "no@ny", "no__NY" },
+      {"el", "Latn", "", "", "el-latn", "el_Latn" },
+      {"en", "Cyrl", "RU", "", "en-cyrl-ru", "en_Cyrl_RU" },
+      {"zh", "Hant", "TW", "STROKE", "zh-hant_TW_STROKE", "zh_Hant_TW_STROKE" },
+      {"qq", "Qqqq", "QQ", "QQ", "qq_Qqqq_QQ_QQ", "qq_Qqqq_QQ_QQ" },
+      {"qq", "Qqqq", "", "QQ", "qq_Qqqq__QQ", "qq_Qqqq__QQ" },
+      {"12", "3456", "78", "90", "12_3456_78_90", "12_3456_78_90" }, /* total garbage */
+      { "","","","",""}
    };
-
-    public void TestPrefixes() {
        
-        String loc, buf,buf1;
-        String [] testTitles = new String[] { "ULocale.getLanguage()", "ULocale.getScript()", "ULocale.getCountry()", "ULocale.getVariant()", "name", "ULocale.getName()", "country3" };
-        ULocale uloc;
+    String loc, buf,buf1;
+    final String [] testTitles = { 
+      "ULocale.getLanguage()", 
+      "ULocale.getScript()", 
+      "ULocale.getCountry()", 
+      "ULocale.getVariant()", 
+      "name", 
+      "ULocale.getName()", 
+      "country3" 
+    };
+    ULocale uloc;
        
-        for(int row=0;testData[row][0].length()!= 0;row++) {
-            loc = testData[row][NAME];
-            logln("Test #"+row+": "+loc);
+    for(int row=0;testData[row][0].length()!= 0;row++) {
+      loc = testData[row][NAME];
+      logln("Test #"+row+": "+loc);
            
-            uloc = new ULocale(loc);    
+      uloc = new ULocale(loc);    
            
-            for(int n=0;n<=(NAME+1);n++) {
-                if(n==NAME) continue;
+      for(int n=0;n<=(NAME+1);n++) {
+	if(n==NAME) continue;

-                switch(n) {
-                case LANG:
-                    buf  = ULocale.getLanguage(loc);
-                    buf1 = uloc.getLanguage();
-                    break;
+	switch(n) {
+	case LANG:
+	  buf  = ULocale.getLanguage(loc);
+	  buf1 = uloc.getLanguage();
+	  break;
                    
-                case SCRIPT:
-                    buf  = ULocale.getScript(loc);
-                    buf1 = uloc.getScript();
-                    break;
+	case SCRIPT:
+	  buf  = ULocale.getScript(loc);
+	  buf1 = uloc.getScript();
+	  break;
                    
-                case CTRY:
-                    buf  = ULocale.getCountry(loc);
-                    buf1 = uloc.getCountry();
-                    break;
+	case CTRY:
+	  buf  = ULocale.getCountry(loc);
+	  buf1 = uloc.getCountry();
+	  break;
                    
-                case VAR:
-                    buf  = ULocale.getVariant(loc);
-                    buf1 = buf;
-                    break;
+	case VAR:
+	  buf  = ULocale.getVariant(loc);
+	  buf1 = buf;
+	  break;
                    
-                case NAME+1:
-                    buf  = ULocale.getName(loc);
-                    buf1 = uloc.getName();
-                    break;
+	case NAME+1:
+	  buf  = ULocale.getName(loc);
+	  buf1 = uloc.getName();
+	  break;
                    
-                default:
-                    buf = "**??";
-                    buf1 = buf;
-                }
+	default:
+	  buf = "**??";
+	  buf1 = buf;
+	}
                
-                logln("#"+row+": "+testTitles[n]+" on "+loc+": -> ["+buf+"]");
+	logln("#"+row+": "+testTitles[n]+" on "+loc+": -> ["+buf+"]");
                
-                if(buf.compareTo(testData[row][n])!=0) {
-                    errln("#"+row+": "+testTitles[n]+" on "+loc+": -> ["+buf+"] (expected '"+testData[row][n]+"'!)");
-                }
-                if(buf1.compareTo(testData[row][n])!=0) {
-                    errln("#"+row+": "+testTitles[n]+" on ULocale object "+loc+": -> ["+buf1+"] (expected '"+testData[row][n]+"'!)");
-                }
-            }
-        }
+	if(buf.compareTo(testData[row][n])!=0) {
+	  errln("#"+row+": "+testTitles[n]+" on "+loc+": -> ["+buf+"] (expected '"+testData[row][n]+"'!)");
+	}
+	if(buf1.compareTo(testData[row][n])!=0) {
+	  errln("#"+row+": "+testTitles[n]+" on ULocale object "+loc+": -> ["+buf1+"] (expected '"+testData[row][n]+"'!)");
+	}
+      }
    }
-    private static final String[][] tests = new String[][]{
+  }
+
+    public void TestObsoleteNames(){
+      final String[][] tests = new String[][]{
          /* locale, language3, language2, Country3, country2 */  
        { "eng_USA", "eng", "en", "USA", "US" },
        { "kok",  "kok", "kok", "", "" },
@ -613,8 +625,6 @@ public class ULocaleTest extends TestFmwk {
        { "sh", "srp", "sh", "", "" },
        { "", "", "", "", "" }
    };
-
-    public void TestObsoleteNames(){
        
        for(int i=0;i<tests.length;i++){
            String locale = tests[i][0];
@ -680,9 +690,10 @@ public class ULocaleTest extends TestFmwk {
            errln("ULocale.getLanguage(\"kok\") failed. Expected: kok Got: "+buff);   
        }
    }
-    private static final String[][]testCases = new String[][]{
+    public void TestCanonicalization(){      
+      final String[][]testCases = new String[][]{
        { "ca_ES_PREEURO-with-extra-stuff-that really doesn't make any sense-unless-you're trying to increase code coverage",
-            "ca_ES_PREEURO_WITH_EXTRA_STUFF_THAT REALLY DOESN'T MAKE ANY SENSE_UNLESS_YOU'RE TRYING TO INCREASE CODE COVERAGE"},
+	  "ca_ES_PREEURO_WITH_EXTRA_STUFF_THAT REALLY DOESN'T MAKE ANY SENSE_UNLESS_YOU'RE TRYING TO INCREASE CODE COVERAGE"},
        { "ca_ES_PREEURO", "ca_ES@currency=ESP" },
        { "de_AT_PREEURO", "de_AT@currency=ATS" },
        { "de_DE_PREEURO", "de_DE@currency=DEM" },
@ -716,9 +727,12 @@ public class ULocaleTest extends TestFmwk {
        { "zh_CN_CA@collation=pinyin", "zh_CN_CA@collation=pinyin" },
        { "en_US_POSIX", "en_US_POSIX" }, 
        { "hy_AM_REVISED", "hy_AM_REVISED" }, 
-        { "no_NO_NY",   "no_NO_NY" },
-        { "no@ny",      "no__NY" }, //POSIX ID
-        { "no-no.utf32@B", "no_NO_B" }, //POSIX ID
+        { "no_NO_NY",   "nn_NO" },
+        { "no@ny",      "nn" }, //POSIX ID
+        { "no-no.utf32@B", "nb_NO_B" }, //POSIX ID
+//          { "no_NO_NY",   "no_NO_NY" },
+//          { "no@ny",      "no__NY" }, //POSIX ID
+//          { "no-no.utf32@B", "no_NO_B" }, //POSIX ID
        { "qz-qz@Euro", "qz_QZ@currency=EUR" }, /* qz-qz uses private use iso codes */
        { "en-BOONT",   "en__BOONT" }, /* registered name */
        { "de-1901",    "de__1901" }, /* registered name */
@ -729,8 +743,30 @@ public class ULocaleTest extends TestFmwk {
        { "uz-UZ-Latn",     "uz_Latn_UZ" }, /* .NET name */
        { "zh-CHS",         "zh_Hans" }, /* .NET name */
        { "zh-CHT",         "zh_TW" }, /* .NET name This may change back to zh_Hant */
-    };
-    public void TestCanonicalization(){      
+
+	// posix behavior that used to be performed by getName
+	{ "mr.utf8", "mr" },
+	{ "de-tv.koi8r", "de_TV" },
+	{ "x-piglatin_ML.MBE", "x-piglatin_ML" },
+	{ "i-cherokee_US.utf7", "i-cherokee_US" },
+	{ "x-filfli_MT_FILFLA.gb-18030", "x-filfli_MT_FILFLA" },
+	{ "no-no-ny.utf8@B", "nn_NO" }, /* @ ignored unless variant is empty */
+
+	// fleshing out canonicalization
+	// trim space and sort keywords, ';' is separator so not present at end in canonical form
+	{ "en_Hant_IL_VALLEY_GIRL@ currency = EUR; calendar = Japanese ;", "en_Hant_IL_VALLEY_GIRL@calendar=Japanese;currency=EUR" },
+	// already-canonical ids are not changed
+	{ "en_Hant_IL_VALLEY_GIRL@calendar=Japanese;currency=EUR", "en_Hant_IL_VALLEY_GIRL@calendar=Japanese;currency=EUR" },
+	// PRE_EURO and EURO conversions don't affect other keywords
+	{ "es_ES_PREEURO@CALendar=Japanese", "es_ES@calendar=Japanese;currency=ESP" },
+	{ "es_ES_EURO@SHOUT=zipeedeedoodah", "es_ES@currency=EUR;shout=zipeedeedoodah" },
+	// currency keyword overrides PRE_EURO and EURO currency
+	{ "es_ES_PREEURO@currency=EUR", "es_ES@currency=EUR" },
+	{ "es_ES_EURO@currency=ESP", "es_ES@currency=ESP" },
+	// norwegian is just too weird, if we handle things in their full generality
+	{ "no-Hant-GB_NY@currency=$$$", "nn_Hant_GB@currency=$$$" },
+      };
+
        for(int i = 0; i< testCases.length;i++){
            String canonical = ULocale.canonicalize(testCases[i][0]);
            if(!canonical.equals(testCases[i][1])){
@ -739,8 +775,8 @@ public class ULocaleTest extends TestFmwk {
                        " Got: "+ canonical);   
            }
        }
-           
    }
+
    public void TestGetAvailable(){
        ULocale[] locales = ULocale.getAvailableLocales();
        if(locales.length<10){
--- a/icu4j/src/com/ibm/icu/util/ULocale.java
+++ b/icu4j/src/com/ibm/icu/util/ULocale.java
@ -19,20 +19,43 @@ import java.util.TreeMap;
 import com.ibm.icu.impl.ICUResourceBundle;

 /**
- * A class is analogous to {@link java.util.Locale} and provides additional
+ * A class analogous to {@link java.util.Locale} that provides additional
 * support for ICU protocol.  In ICU 3.0 this class is enhanced to support
 * RFC 3066 language identifiers.
 *
- * <p>Many classes and services in ICU follow a factory idiom, in which a
- * factory method or object responds to a client request with an
- * object.  The request includes a locale (the <i>requested</i>
+ * <p>Many classes and services in ICU follow a factory idiom, in
+ * which a factory method or object responds to a client request with
+ * an object.  The request includes a locale (the <i>requested</i>
 * locale), and the returned object is constructed using data for that
 * locale.  The system may lack data for the requested locale, in
 * which case the locale fallback mechanism will be invoked until a
 * populated locale is found (the <i>valid</i> locale).  Furthermore,
- * even when a valid locale is found, further fallback may be required
- * to reach a locale containing the specific data required by the
- * service (the <i>actual</i> locale).
+ * even when a populated locale is found (the <i>valid</i> locale),
+ * further fallback may be required to reach a locale containing the
+ * specific data required by the service (the <i>actual</i> locale).
+ *
+ * <p>ULocale performs <b>'normalization'</b> and <b>'canonicalization'</b> of locale ids.
+ * Normalization 'cleans up' ICU locale ids as follows:
+ * <ul>
+ * <li>language, script, country, variant, and keywords are properly cased<br>
+ * (lower, title, upper, upper, and lower case respectively)</li>
+ * <li>hyphens used as separators are converted to underscores</li>
+ * <li>three-letter language and country ids are converted to two-letter
+ * equivalents where available</li>
+ * <li>surrounding spaces are removed from keywords and values</li>
+ * <li>if there are multiple keywords, they are put in sorted order</li>
+ * </ul>
+ * Canonicalization additionally performs the following:
+ * <ul>
+ * <li>POSIX ids are converted to ICU format IDs</li>
+ * <li>'grandfathered' 3066 ids are converted to ICU standard form</li>
+ * <li>'PRE_EURO' and 'EURO' variants are converted to currency keyword form, with the currency
+ * id appropriate to the country of the locale (for PRE_EURO) or EUR (for EURO).</li>
+ * </ul>
+ * All ULocale constructors automatically normalize the locale id.  To handle
+ * POSIX ids, the <code>canonicalize</code> method can be called to convert the id
+ * to canonical form, or the <code>canonicalInstance</code> factory method
+ * can be called.</p>
 *
 * <p>This class provides selectors {@link #VALID_LOCALE} and {@link
 * #ACTUAL_LOCALE} intended for use in methods named
@ -56,7 +79,7 @@ import com.ibm.icu.impl.ICUResourceBundle;
 * @author weiv
 * @author Alan Liu
 * @author Ram Viswanadha
- * @draft ICU 2.8
+ * @draft ICU 2.8 
 */
 public final class ULocale implements Serializable {
    /** 
@ -284,8 +307,12 @@ public final class ULocale implements Serializable {
        "zu",  "zun", 
    };

+    private static final String[] replacementLanguages = {
+	"id", "he", "yi", "jv", "sr", "nb",/* replacement language codes */
+    };
+
    private static final String[] obsoleteLanguages = {
-        "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */         
+        "in",  "iw",  "ji",  "jw",  "sh", "no",    /* obsolete language codes */         
    };
    
    /* This list MUST contain a three-letter code for every two-letter code in the
@ -530,60 +557,64 @@ public final class ULocale implements Serializable {
        "FXX", "ROM", "TMP", "ZAR",    
    };

-    /* The left side is the result after getName is processes the name */
-    /* The right side is what the locale should be converted to. */
+    /**
+     * This table lists locale id mappings for canonicalization.  In each
+     * row the first item is the normalized id, the second item is the
+     * canonicalized base id, and the last two are a keyword and value, or null.
+     */
    private static final String[][] variantsToKeywords = {
-        { EMPTY_STRING,     "en_US_POSIX" }, /* .NET name */
-        { "C",              "en_US_POSIX" }, /* POSIX name */
-        { "art_LOJBAN",     "jbo" }, /* registered name */
-        { "az_AZ_CYRL",     "az_Cyrl_AZ" }, /* .NET name */
-        { "az_AZ_LATN",     "az_Latn_AZ" }, /* .NET name */
-        { "ca_ES_PREEURO",  "ca_ES@currency=ESP" },
-        { "cel_GAULISH",    "cel__GAULISH" }, /* registered name */
-        { "de_1901",        "de__1901" }, /* registered name */
-        { "de_1906",        "de__1906" }, /* registered name */
-        { "de__PHONEBOOK",  "de@collation=phonebook" },
-        { "de_AT_PREEURO",  "de_AT@currency=ATS" },
-        { "de_DE_PREEURO",  "de_DE@currency=DEM" },
-        { "de_LU_PREEURO",  "de_LU@currency=EUR" },
-        { "el_GR_PREEURO",  "el_GR@currency=GRD" },
-        { "en_BOONT",       "en__BOONT" }, /* registered name */
-        { "en_SCOUSE",      "en__SCOUSE" }, /* registered name */
-        { "en_BE_PREEURO",  "en_BE@currency=BEF" },
-        { "en_IE_PREEURO",  "en_IE@currency=IEP" },
-        { "es__TRADITIONAL", "es@collation=traditional" },
-        { "es_ES_PREEURO",  "es_ES@currency=ESP" },
-        { "eu_ES_PREEURO",  "eu_ES@currency=ESP" },
-        { "fi_FI_PREEURO",  "fi_FI@currency=FIM" },
-        { "fr_BE_PREEURO",  "fr_BE@currency=BEF" },
-        { "fr_FR_PREEURO",  "fr_FR@currency=FRF" },
-        { "fr_LU_PREEURO",  "fr_LU@currency=LUF" },
-        { "ga_IE_PREEURO",  "ga_IE@currency=IEP" },
-        { "gl_ES_PREEURO",  "gl_ES@currency=ESP" },
-        { "hi__DIRECT",     "hi@collation=direct" },
-        { "it_IT_PREEURO",  "it_IT@currency=ITL" },
-        { "ja_JP_TRADITIONAL", "ja_JP@calendar=japanese" },
-        { "nl_BE_PREEURO",  "nl_BE@currency=BEF" },
-        { "nl_NL_PREEURO",  "nl_NL@currency=NLG" },
-        { "pt_PT_PREEURO",  "pt_PT@currency=PTE" },
-        { "sl_ROZAJ",       "sl__ROZAJ" }, /* registered name */
-        { "sr_SP_CYRL",     "sr_Cyrl_SP" }, /* .NET name */
-        { "sr_SP_LATN",     "sr_Latn_SP" }, /* .NET name */
-        { "uz_UZ_CYRL",     "uz_Cyrl_UZ" }, /* .NET name */
-        { "uz_UZ_LATN",     "uz_Latn_UZ" }, /* .NET name */
-        { "zh_CHS",         "zh_Hans" }, /* .NET name */
-        { "zh_CHT",         "zh_TW" }, /* .NET name TODO: This should be zh_Hant once the locale structure is fixed. */
-        { "zh_GAN",         "zh__GAN" }, /* registered name */
-        { "zh_GUOYU",       "zh" }, /* registered name */
-        { "zh_HAKKA",       "zh__HAKKA" }, /* registered name */
-        { "zh_MIN",         "zh__MIN" }, /* registered name */
-        { "zh_MIN_NAN",     "zh__MINNAN" }, /* registered name */
-        { "zh_WUU",         "zh__WUU" }, /* registered name */
-        { "zh_XIANG",       "zh__XIANG" }, /* registered name */
-        { "zh_YUE",         "zh__YUE" }, /* registered name */
-        { "th_TH_TRADITIONAL", "th_TH@calendar=buddhist" },
-        { "zh_TW_STROKE",   "zh_TW@collation=stroke" },
-        { "zh__PINYIN",     "zh@collation=pinyin" }
+        { EMPTY_STRING,     "en_US_POSIX", null, null }, /* .NET name */
+        { "C",              "en_US_POSIX", null, null }, /* POSIX name */
+        { "art_LOJBAN",     "jbo", null, null }, /* registered name */
+        { "az_AZ_CYRL",     "az_Cyrl_AZ", null, null }, /* .NET name */
+        { "az_AZ_LATN",     "az_Latn_AZ", null, null }, /* .NET name */
+        { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
+        { "cel_GAULISH",    "cel__GAULISH", null, null }, /* registered name */
+        { "de_1901",        "de__1901", null, null }, /* registered name */
+        { "de_1906",        "de__1906", null, null }, /* registered name */
+        { "de__PHONEBOOK",  "de", "collation", "phonebook" },
+        { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
+        { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
+        { "de_LU_PREEURO",  "de_LU", "currency", "EUR" },
+        { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
+        { "en_BOONT",       "en__BOONT", null, null }, /* registered name */
+        { "en_SCOUSE",      "en__SCOUSE", null, null }, /* registered name */
+        { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
+        { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
+        { "es__TRADITIONAL", "es", "collation", "traditional" },
+        { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
+        { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
+        { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
+        { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
+        { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
+        { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
+        { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
+        { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
+        { "hi__DIRECT",     "hi", "collation", "direct" },
+        { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
+        { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" },
+	{ "nb_NO_NY",       "nn_NO", null, null }, 	// "markus said this was ok" :-)
+        { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
+        { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
+        { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
+        { "sl_ROZAJ",       "sl__ROZAJ", null, null }, /* registered name */
+        { "sr_SP_CYRL",     "sr_Cyrl_SP", null, null }, /* .NET name */
+        { "sr_SP_LATN",     "sr_Latn_SP", null, null }, /* .NET name */
+        { "uz_UZ_CYRL",     "uz_Cyrl_UZ", null, null }, /* .NET name */
+        { "uz_UZ_LATN",     "uz_Latn_UZ", null, null }, /* .NET name */
+        { "zh_CHS",         "zh_Hans", null, null }, /* .NET name */
+        { "zh_CHT",         "zh_TW", null, null }, /* .NET name TODO: This should be zh_Hant once the locale structure is fixed. */
+        { "zh_GAN",         "zh__GAN", null, null }, /* registered name */
+        { "zh_GUOYU",       "zh", null, null }, /* registered name */
+        { "zh_HAKKA",       "zh__HAKKA", null, null }, /* registered name */
+        { "zh_MIN",         "zh__MIN", null, null }, /* registered name */
+        { "zh_MIN_NAN",     "zh__MINNAN", null, null }, /* registered name */
+        { "zh_WUU",         "zh__WUU", null, null }, /* registered name */
+        { "zh_XIANG",       "zh__XIANG", null, null }, /* registered name */
+        { "zh_YUE",         "zh__YUE", null, null }, /* registered name */
+        { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" },
+        { "zh_TW_STROKE",   "zh_TW", "collation", "stroke" },
+        { "zh__PINYIN",     "zh", "collation", "pinyin" }
    };

    /**
@ -600,7 +631,7 @@ public final class ULocale implements Serializable {
     * @draft ICU 2.8
     */
    public ULocale(Locale loc) {
-        this.localeID = loc.toString();
+        this.localeID = getName(loc.toString());
        this.locale = loc;
    }

@ -614,7 +645,7 @@ public final class ULocale implements Serializable {
     * @draft ICU 2.8
     */ 
    public ULocale(String localeID) {
-        this.localeID = localeID;
+        this.localeID = getName(localeID);
    }

    /**
@ -637,26 +668,31 @@ public final class ULocale implements Serializable {
     * @draft ICU 3.0
     */
    public ULocale(String lang, String script, String country, String variant) {
+        localeID = getName(lscvToID(lang, script, country, variant));
+    }
+
+    private static String lscvToID(String lang, String script, String country, String variant) {
        StringBuffer buf = new StringBuffer();
-        
-        buf.append(lang.toLowerCase());
-        if(script!=null && script.length() > 0){
+     
+	if (lang != null && lang.length() > 0) {
+	    buf.append(lang);
+	}
+        if (script != null && script.length() > 0) {
            buf.append(UNDERSCORE);
-            buf.append(script.substring(0, 1).toUpperCase());
-            buf.append(script.substring(1).toLowerCase());
+	    buf.append(script);
        }
-        if(country!=null && country.length() > 0){
+        if (country != null && country.length() > 0) {
            buf.append(UNDERSCORE);
-            buf.append(country.toUpperCase());
+	    buf.append(country);
        }
-        if(variant!=null && variant.length() > 0){
+        if (variant != null && variant.length() > 0) {
            if (country == null || country.length() == 0) {
                buf.append(UNDERSCORE);
            }
            buf.append(UNDERSCORE);
-            buf.append(variant.toUpperCase());
+            buf.append(variant);
        }
-        localeID = buf.toString();
+	return buf.toString();
    }

    /**
@ -885,7 +921,7 @@ public final class ULocale implements Serializable {
     * @draft ICU 3.0
     */ 
    public String getName() {
-        return getName(localeID);   
+        return localeID; // always normalized
    }

    /**
@ -908,8 +944,9 @@ public final class ULocale implements Serializable {
    }

    /**
-     * Gets an iterator over keywords for the specified locale.
-     * @return iterator over keywords
+     * Gets an iterator over keywords for the specified locale.  If there 
+     * are no keywords, returns null.
+     * @return iterator over keywords, or null if there are no keywords.
     * @draft ICU 3.0
     */
    public Iterator getKeywords() {
@ -917,8 +954,10 @@ public final class ULocale implements Serializable {
    }

    /**
-     * Gets an iterator over keywords for the specified locale.
-     * @return an iterator over the keywords in the specified locale.
+     * Gets an iterator over keywords for the specified locale.  If there 
+     * are no keywords, returns null.
+     * @return an iterator over the keywords in the specified locale, or null
+     * if there are no keywords.
     * @draft ICU 3.0
     */
    public static Iterator getKeywords(String localeID){
@ -926,9 +965,9 @@ public final class ULocale implements Serializable {
    }

    /**
-     * Get the value for a keyword. Locale name does not need to be normalized.
+     * Get the value for a keyword. If the keyword is not defined, null is returned.
     * @param keywordName name of the keyword for which we want the value. Case insensitive.
-     * @return String the value of the keyword as a string
+     * @return the value of the keyword, or null.
     * @draft ICU 3.0
     */
    public String getKeywordValue(String keywordName){
@ -953,6 +992,12 @@ public final class ULocale implements Serializable {
        private int index;
        private char[] buffer;
        private int blen;
+	// POSIX ids are handled only when canonicalization is requested.
+	private boolean canonicalize;
+
+      // used when canonicalizing
+      Map keywords;
+      String baseName;

        /**
         * Parsing constants.
@ -964,11 +1009,16 @@ public final class ULocale implements Serializable {
        private static final char ITEM_SEPARATOR        = ';';
        private static final char DOT                   = '.';

-        private IDParser(String localeID){
+	private IDParser(String localeID) {
+	    this(localeID, false);
+	}
+
+        private IDParser(String localeID, boolean canonicalize) {
            id = localeID.toCharArray();
            index = 0;
            buffer = new char[id.length + 5];
            blen = 0;
+	    this.canonicalize = canonicalize;
        }

        private void reset() {
@ -1063,7 +1113,7 @@ public final class ULocale implements Serializable {
        /**
         * Return true if the character is an id separator (underscore or hyphen).
         */
-        private static boolean isIDSeparator(char c) {
+        private boolean isIDSeparator(char c) {
            return c == UNDERSCORE || c == HYPHEN;
        }

@ -1071,17 +1121,18 @@ public final class ULocale implements Serializable {
         * Return true if the character is a terminator (keyword separator, dot, or DONE).
         * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
         */
-        private static boolean isTerminator(char c){
-            return c == KEYWORD_SEPARATOR || c == DOT || c == DONE;   
+        private boolean isTerminator(char c) {
+	    // always terminate at DOT, even if not handling POSIX.  It's an error...
+            return c == KEYWORD_SEPARATOR || c == DONE || (c == DOT);
        }

        /**
         * Return true if the character is a terminator or id separator.
         */
-        private static boolean isTerminatorOrIDSeparator(char c) {
-            return (c == KEYWORD_SEPARATOR) || (c == DOT) || 
+        private boolean isTerminatorOrIDSeparator(char c) {
+            return (c == KEYWORD_SEPARATOR) || 
                (c == UNDERSCORE || c == HYPHEN) || 
-                (c == DONE);   
+                (c == DONE) || (canonicalize && c == DOT);   
        }

        /**
@ -1130,19 +1181,30 @@ public final class ULocale implements Serializable {
            }
            --index;

-            if (blen == 3) {
-                /* convert 3 character code to 2 character code if possible *CWB*/
-                String lang = getString(0);
-                int offset = findIndex(languages3, lang);
-                if (offset >= 0) {
-                    set(0, languages[offset]);
-                } else {
-                    offset = findIndex(obsoleteLanguages3, lang);
-                    if (offset >= 0) {
-                        set(0, obsoleteLanguages[offset]);
-                    }
-                }
-            }
+	    if (blen == 3) {
+	      /* convert 3 character code to 2 character code if possible *CWB*/
+	      String lang = getString(0);
+	      int offset = findIndex(languages3, lang);
+	      if (offset >= 0) {
+		set(0, languages[offset]);
+	      } else {
+		offset = findIndex(obsoleteLanguages3, lang);
+		if (offset >= 0) {
+		  if (canonicalize) {
+		    set(0, replacementLanguages[offset]);
+		  } else {
+		    set(0, obsoleteLanguages[offset]);
+		  }
+		}
+	      }
+	    } else if (canonicalize && blen == 2) {
+	      // update canonical languages
+	      String lang = getString(0);
+	      int offset = findIndex(obsoleteLanguages, lang);
+	      if (offset >= 0) {
+		set(0, replacementLanguages[offset]);
+	      }
+	    }

            return 0;
        }
@ -1300,17 +1362,20 @@ public final class ULocale implements Serializable {
         * We identify this use of '@' in POSIX ids by looking for an '=' following
         * the '@'.  If there is one, we consider '@' to start a keyword list, instead of
         * being part of a POSIX id.
+	 *
+	 * Note:  since it was decided that we want an option to not handle POSIX ids, this
+	 * becomes a bit more complex.
         */
        private int parseVariant() {
            int oldBlen = blen;

            boolean first = true;
            char c = next();
-            if (c == DOT) {
-		// if we have a DOT, we ignore anything after the '@'
+            if (canonicalize && c == DOT) {
+		// if we have a DOT, we ignore anything up to the '@'
                while (!isTerminator(c = next())); // skip to terminator, assume no more DOTs
            } 
-	    if (c != DONE && (c != KEYWORD_SEPARATOR || !haveKeywordAssign())) {
+	    if (c != DONE && (c != KEYWORD_SEPARATOR || (canonicalize && !haveKeywordAssign()))) {
                // we have more text, and either had an id separator, or 
                // had a keyword separator in a POSIX locale
                // either way, we accumulate text until a terminator
@ -1388,7 +1453,14 @@ public final class ULocale implements Serializable {
            };
        }

+      /**
+       * Install a precomputed base name.  While it is non-null, parseBaseName()
+       * calls set(0, baseName) instead of parsing the id, and getBaseName()
+       * returns it unchanged.
+       */
+      public void setBaseName(String baseName) {
+	this.baseName = baseName;
+      }
+
        public void parseBaseName() {
+	  if (baseName != null) {
+	    set(0, baseName);
+	  } else {
            reset();
            parseLanguage();
            parseScript();
@ -1399,6 +1471,7 @@ public final class ULocale implements Serializable {
            if (blen > 1 && buffer[blen-1] == UNDERSCORE) {
                --blen;
            }
+	  }
        }

        /**
@ -1406,6 +1479,9 @@ public final class ULocale implements Serializable {
         * form does not include keywords.
         */
        public String getBaseName() {
+	  // a base name installed via setBaseName() wins; the id is not parsed at all
+	  if (baseName != null) {
+	    return baseName;
+	  }
            parseBaseName();
            return getString(0);
        }
@ -1429,12 +1505,19 @@ public final class ULocale implements Serializable {
        private boolean setToKeywordStart() {
            for (int i = index; i < id.length; ++i) {
                if (id[i] == KEYWORD_SEPARATOR) {
-                    for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
-                        if (id[j] == KEYWORD_ASSIGN) {
-                            index = i;
-                            return true;
-                        }
-                    }
+		    if (canonicalize) {
+			for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
+			    if (id[j] == KEYWORD_ASSIGN) {
+				index = i;
+				return true;
+			    }
+			}
+		    } else {
+			if (++i < id.length) {
+			    index = i;
+			    return true;
+			}
+		    }
                }
            }
            return false;
@ -1462,41 +1545,47 @@ public final class ULocale implements Serializable {
            return new String(id, start, index-start).trim(); // leave case alone
        }

+	/**
+	 * Build the comparator handed to the TreeMap that holds the keywords.
+	 * The left-hand key is cast to String and compared with String.compareTo.
+	 */
+	private Comparator getKeyComparator() {
+	    return new Comparator() {
+		public int compare(Object first, Object second) {
+		    String firstKey = (String)first;
+		    return firstKey.compareTo(second);
+		}
+	    };
+	}
+
        /**
-         * Return a map of the keywords and values.
+         * Return a map of the keywords and values, or an empty map if there are none
+         * (this method never returns null).  The map is parsed from the id when the
+         * keywords field is still null, and is then cached in that field.
+         * @throws IllegalArgumentException if a keyword has no value or appears twice.
         */
        private Map getKeywordMap() {
-            Map m = null;
+	    if (keywords == null) {
+		TreeMap m = null;
+		if (setToKeywordStart()) {
+		    // trim spaces and convert to lower case, both keywords and values.
+		    do {
+			String key = getKeyword();
+			if (key.length() == 0) {
+			    break;
+			}
+			if (next() != KEYWORD_ASSIGN) {
+			    throw new IllegalArgumentException("key '" + key + "' missing a value.");
+			}
+			String value = getValue();
+			if (value.length() == 0) {
+			    throw new IllegalArgumentException("key '" + key + "' missing a value.");
+			}
+			if (m == null) {
+			    m = new TreeMap(getKeyComparator());
+			} else if (m.containsKey(key)) {
+			    throw new IllegalArgumentException("key '" + key + "' already has a value.");
+			}
+			m.put(key, value);
+		    } while (next() == ITEM_SEPARATOR);
+		}		
+		keywords = m != null ? m : Collections.EMPTY_MAP;
+	    }

-            if (setToKeywordStart()) {
-                // trim spaces and convert to lower case, both keywords and values.
-                do {
-                    String key = getKeyword();
-                    if (key.length() == 0) {
-                        break;
-                    }
-                    if (next() != KEYWORD_ASSIGN) {
-                        throw new IllegalArgumentException("key '" + key + "' missing a value.");
-                    }
-                    String value = getValue();
-                    if (value.length() == 0) {
-                        throw new IllegalArgumentException("key '" + key + "' missing a value.");
-                    }
-                    if (m == null) {
-                        final Comparator comp = new Comparator() {
-                                public int compare(Object lhs, Object rhs) {
-                                    return ((String)lhs).compareTo(rhs);
-                                }
-                            };
-                        m = new TreeMap(comp);
-                    }
-                    if (m.containsKey(key)) {
-                        throw new IllegalArgumentException("key '" + key + "' already has a value.");
-                    }
-                    m.put(key, value);
-                } while (next() == ITEM_SEPARATOR);
-            }
-            return m == null ? Collections.EMPTY_MAP : m;
+	    return keywords;
        }

        /**
@ -1505,27 +1594,30 @@ public final class ULocale implements Serializable {
        private int parseKeywords() {
            int oldBlen = blen;
            Map m = getKeywordMap();
-            Iterator iter = m.entrySet().iterator();
-            boolean first = true;
-            while (iter.hasNext()) {
-                append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
-                first = false;
-                Map.Entry e = (Map.Entry)iter.next();
-                append((String)e.getKey());
-                append(KEYWORD_ASSIGN);
-                append((String)e.getValue());
-            }
-            if (blen != oldBlen) {
-                ++oldBlen;
-            }
+	    if (!m.isEmpty()) {
+		// the first entry is introduced by KEYWORD_SEPARATOR, each later one by ITEM_SEPARATOR
+		Iterator iter = m.entrySet().iterator();
+		boolean first = true;
+		while (iter.hasNext()) {
+		    append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
+		    first = false;
+		    Map.Entry e = (Map.Entry)iter.next();
+		    append((String)e.getKey());
+		    append(KEYWORD_ASSIGN);
+		    append((String)e.getValue());
+		}
+		// something was appended: bump the returned index by one
+		// (presumably to step past the leading separator -- confirm against append())
+		if (blen != oldBlen) {
+		    ++oldBlen;
+		}
+	    }
            return oldBlen;
        }

        /**
-         * Return an iterator over the keywords.
+         * Return an iterator over the keywords, or null if we have an empty map.
         */
        public Iterator getKeywords() {
-            return getKeywordMap().keySet().iterator();
+	    Map m = getKeywordMap();
+	    // callers must null-check: an empty keyword map yields null, not an empty iterator
+            return m.isEmpty() ? null : m.keySet().iterator();
        }

        /**
@ -1533,8 +1625,70 @@ public final class ULocale implements Serializable {
         * present.
         */
        public String getKeywordValue(String keywordName) {
-            return (String)getKeywordMap().get(keywordName.trim().toLowerCase());
+	    Map m = getKeywordMap();
+	    // the name is trimmed and lower-cased before lookup; null is returned when there are no keywords
+            return m.isEmpty() ? null : (String)m.get(keywordName.trim().toLowerCase());
        }
+
+	/**
+	 * Set the keyword value only if it is not already set to something else.
+	 * Delegates to setKeywordValue(keywordName, value, false), which leaves an
+	 * existing value for the keyword untouched.
+	 */
+	public void defaultKeywordValue(String keywordName, String value) {
+	    setKeywordValue(keywordName, value, false);
+	}
+	    
+	/**
+	 * Set the value for the named keyword, or unset it if value is null.  If
+	 * keywordName itself is null, unset all keywords and ignore value.
+	 * @param keywordName the keyword (trimmed and lower-cased), or null to
+	 * remove every keyword
+	 * @param value the new value (trimmed), or null to remove the keyword
+	 * @throws IllegalArgumentException if keywordName, or a non-null value, is
+	 * empty after trimming.
+	 */
+	public void setKeywordValue(String keywordName, String value) {
+	    setKeywordValue(keywordName, value, true);
+	}
+
+	/**
+	 * Set, default, or remove a keyword.  If keywordName is null and reset is
+	 * true, all keywords are removed and value is ignored; if reset is false,
+	 * nothing changes.  Otherwise keywordName is trimmed and lower-cased, and
+	 * a non-null value (trimmed) is stored unless reset is false and the keyword
+	 * already has a value.  A null value removes the keyword when reset is true
+	 * and changes nothing when reset is false.
+	 * @throws IllegalArgumentException if keywordName, or a non-null value, is
+	 * empty after trimming.
+	 */
+	private void setKeywordValue(String keywordName, String value, boolean reset) {
+	    if (keywordName == null) {
+		if (reset) {
+		    // force new map, ignore value
+		    keywords = Collections.EMPTY_MAP;
+		}
+	    } else {
+		keywordName = keywordName.trim().toLowerCase();
+		if (keywordName.length() == 0) {
+		    throw new IllegalArgumentException("keyword must not be empty");
+		}
+		// null means "remove this keyword", so only a real value is trimmed and
+		// validated; trimming unconditionally would throw NullPointerException
+		if (value != null) {
+		    value = value.trim();
+		    if (value.length() == 0) {
+			throw new IllegalArgumentException("value must not be empty");
+		    }
+		}
+		Map m = getKeywordMap();
+		if (m.isEmpty()) { // it is EMPTY_MAP
+		    if (value != null) {
+			// force new map
+			keywords = new TreeMap(getKeyComparator());
+			keywords.put(keywordName, value);
+		    }
+		} else {
+		    if (reset || !m.containsKey(keywordName)) {
+			if (value != null) {
+			    m.put(keywordName, value);
+			} else {
+			    // a no-op when the keyword is absent
+			    m.remove(keywordName);
+			    if (m.isEmpty()) {
+				// force new map
+				keywords = Collections.EMPTY_MAP;
+			    }
+			}
+		    }
+		}
+	    }
+	}
    }

    /**
@ -1559,25 +1713,79 @@ public final class ULocale implements Serializable {
     * @draft ICU 3.0
     */
    public static String canonicalize(String localeID){
-        String locStr = new IDParser(localeID).getName();
-        // now we have an ID in the form xx_Yyyy_ZZ_KKKKK
+      IDParser parser = new IDParser(localeID, true);
+      String baseName = parser.getBaseName();
+      boolean foundVariant = false;
+      
+        // we have an ID in the form xx_Yyyy_ZZ_KKKKK
+
        /* See if this is an already known locale */
        for (int i = 0; i < variantsToKeywords.length; i++) {
-            if (variantsToKeywords[i][0].equals(locStr)) {
-                locStr = variantsToKeywords[i][1];
+            if (variantsToKeywords[i][0].equals(baseName)) {
+		foundVariant = true;
+
+		// row layout as used here: [0] id to match, [1] replacement base name,
+		// [2] keyword or null, [3] keyword value -- confirm against variantsToKeywords
+		String[] vals = variantsToKeywords[i];
+                parser.setBaseName(vals[1]);
+		if (vals[2] != null) {
+		    parser.defaultKeywordValue(vals[2], vals[3]);
+		}
                break;
            }
        }

        /* convert the Euro variant to appropriate ID */
-        int idx = locStr.indexOf("_EURO");
-        if (idx > -1) {
-            locStr = locStr.substring(0,idx)+"@currency=EUR";       
-        }
-        
-        return locStr;
+	if (!foundVariant) {
+	  // NOTE(review): indexOf matches "_EURO" anywhere in the base name, and everything
+	  // from that point on is dropped
+	  int idx = baseName.indexOf("_EURO");
+	  if (idx > -1) {
+	    parser.setBaseName(baseName.substring(0, idx));
+	    parser.defaultKeywordValue("currency", "EUR");
+	  }
+	}
+
+	/* total mondo hack for Norwegian, fortunately the main NY case is handled earlier */
+	if (!foundVariant) {
+	    if (parser.getLanguage().equals("nb") && parser.getVariant().equals("NY")) {
+		parser.setBaseName(lscvToID("nn", parser.getScript(), parser.getCountry(), null));
+	    }
+	}
+
        return parser.getName();
    }
    
+    /**
+     * Return a copy of a locale id with its keywords updated.  The id is parsed with
+     * an IDParser, the change is applied through IDParser.setKeywordValue(keyword, value),
+     * and the parser's resulting name is returned.  If keyword is null, this removes all
+     * keywords from the locale id.  Otherwise this adds/replaces the value for this keyword.
+     * The keyword and value must not be empty.
+     * NOTE(review): removing a single keyword by passing a null value depends on
+     * IDParser.setKeywordValue accepting null -- confirm.
+     * @param localeID the locale id to modify
+     * @param keyword the keyword to add/remove, or null to remove all keywords.
+     * @param value the value to add/set, or null to remove this particular keyword.
+     * @return the updated locale id
+     * @internal
+     */
+    private static String setKeywordValue(String localeID, String keyword, String value) {
+	final IDParser idParser = new IDParser(localeID);
+	idParser.setKeywordValue(keyword, value);
+	final String updatedID = idParser.getName();
+	return updatedID;
+    }
+
+    /**
+     * Return a copy of a locale id in which the keyword is given the value only if the
+     * keyword does not already have one.  The id is parsed with an IDParser, the default
+     * is applied through IDParser.defaultKeywordValue(keyword, value), and the parser's
+     * resulting name is returned.  The keyword and value must not be null or empty.
+     * @param localeID the locale id to modify
+     * @param keyword the keyword to add, if not already present
+     * @param value the value to add, if not already present
+     * @return the updated locale id
+     * @internal
+     */
+    private static String defaultKeywordValue(String localeID, String keyword, String value) {
+	final IDParser idParser = new IDParser(localeID);
+	idParser.defaultKeywordValue(keyword, value);
+	final String updatedID = idParser.getName();
+	return updatedID;
+    }
+
    /**
     * Returns a three-letter abbreviation for this locale's language.  If the locale
     * doesn't specify a language, this will be the empty string.  Otherwise, this will
@ -2010,23 +2218,26 @@ public final class ULocale implements Serializable {
            }
        }

-        Iterator keys = parser.getKeywordMap().entrySet().iterator();
-        while (keys.hasNext()) {
-            if (buf.length() > 0) {
-                if (haveLanguage & !openParen) {
-                    buf.append(" (");
-                    openParen = true;
-                } else {
-                    buf.append(", ");
-                }
-            }
-            Map.Entry e = (Map.Entry)keys.next();
-            String key = (String)e.getKey();
-            String val = (String)e.getValue();
-            buf.append(getTableString("Keys", null, key, bundle));
-            buf.append("=");
-            buf.append(getTableString("Types", key, val, bundle));
-        }
+	Map m = parser.getKeywordMap();
+	if (!m.isEmpty()) {
+	    Iterator keys = m.entrySet().iterator();
+	    while (keys.hasNext()) {
+		if (buf.length() > 0) {
+		    if (haveLanguage & !openParen) {
+			buf.append(" (");
+			openParen = true;
+		    } else {
+			buf.append(", ");
+		    }
+		}
+		Map.Entry e = (Map.Entry)keys.next();
+		String key = (String)e.getKey();
+		String val = (String)e.getValue();
+		buf.append(getTableString("Keys", null, key, bundle));
+		buf.append("=");
+		buf.append(getTableString("Types", key, val, bundle));
+	    }
+	}

        if (openParen) {
            buf.append(")");