ICU-1272 initial implementation of perl-ish character property syntax for UnicodeSet

X-SVN-Rev: 6281
2001-10-17 19:21:12 +00:00 · 2001-10-17 19:21:12 +00:00 · 68744138aa
commit 68744138aa
parent a4a66fdc7f
12 changed files with 1118 additions and 335 deletions
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@ -71,7 +71,7 @@ cpdtrans.o hextouni.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \
 dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o nultrans.o \
 remtrans.o titletrn.o tolowtrn.o toupptrn.o xformtrn.o \
 name2uni.o uni2name.o unitohex.o nortrans.o unifilt.o quant.o transreg.o \
-llong.o nfrs.o nfrule.o nfsubs.o rbnf.o
+llong.o nfrs.o nfrule.o nfsubs.o rbnf.o upropset.o



--- a/icu4c/source/i18n/i18n.dsp
+++ b/icu4c/source/i18n/i18n.dsp
@ -370,6 +370,10 @@ SOURCE=.\unum.cpp
 # End Source File
 # Begin Source File

+SOURCE=.\upropset.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\usearch.cpp
 # End Source File
 # Begin Source File
--- a/icu4c/source/i18n/quant.cpp
+++ b/icu4c/source/i18n/quant.cpp
@ -46,9 +46,15 @@ UMatchDegree Quantifier::matches(const Replaceable& text,
    int32_t start = offset;
    uint32_t count = 0;
    while (count < maxCount) {
+        int32_t pos = offset;
        UMatchDegree m = matcher->matches(text, offset, limit, incremental);
        if (m == U_MATCH) {
            ++count;
+            if (pos == offset) {
+                // If offset has not moved we have a zero-width match.
+                // Don't keep matching it infinitely.
+                break;
+            }
        } else if (incremental && m == U_PARTIAL_MATCH) {
            return U_PARTIAL_MATCH;
        } else {
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -41,8 +41,6 @@
 #define SEGMENT_CLOSE      ((UChar)0x0029) /*)*/
 #define CONTEXT_ANTE       ((UChar)0x007B) /*{*/
 #define CONTEXT_POST       ((UChar)0x007D) /*}*/
-#define SET_OPEN           ((UChar)0x005B) /*[*/
-#define SET_CLOSE          ((UChar)0x005D) /*]*/
 #define CURSOR_POS         ((UChar)0x007C) /*|*/
 #define CURSOR_OFFSET      ((UChar)0x0040) /*@*/
 #define ANCHOR_START       ((UChar)0x005E) /*^*/
@ -50,6 +48,13 @@
 #define ONE_OR_MORE        ((UChar)0x002B) /*+*/
 #define ZERO_OR_ONE        ((UChar)0x003F) /*?*/

+#define DOT                ((UChar)46)     /*.*/
+
+static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";
+    91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90,
+    108, 58, 93, 92, 114, 92, 110, 36, 93, 0
+};
+
 // By definition, the ANCHOR_END special character is a
 // trailing SymbolTable.SYMBOL_REF character.
 // private static final char ANCHOR_END       = '$';
@ -514,6 +519,15 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            // Text after a presumed end anchor is a syntax err
            return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start);
        }
+        if (UnicodeSet::resemblesPattern(rule, pos-1)) {
+            pp.setIndex(pos-1); // Back up to the start of the set or property pattern
+            buf.append(parser.parseSet(rule, pp));
+            if (U_FAILURE(parser.status)) {
+                return syntaxError(U_MALFORMED_SET, rule, start);
+            }
+            pos = pp.getIndex();                    
+            continue;
+        }
        // Handle escapes
        if (c == ESCAPE) {
            if (pos == limit) {
@ -653,14 +667,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            }
            post = buf.length();
            break;
-        case SET_OPEN:
-            pp.setIndex(pos-1); // Backup to opening '['
-            buf.append(parser.parseSet(rule, pp));
-            if (U_FAILURE(parser.status)) {
-                return syntaxError(U_MALFORMED_SET, rule, start);
-            }
-            pos = pp.getIndex();
-            break;
        case CURSOR_POS:
            if (cursor >= 0) {
                return syntaxError(U_MULTIPLE_CURSORS, rule, start);
@ -689,6 +695,9 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                }
            }
            break;
+        case DOT:
+            buf.append(parser.getDotStandIn());
+            break;
        case KLEENE_STAR:
        case ONE_OR_MORE:
        case ZERO_OR_ONE:
@ -749,7 +758,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                buf.append(parser.generateStandInFor(m));
            }
            break;
-        // case SET_CLOSE:
        default:
            // Disallow unquoted characters other than [0-9A-Za-z]
            // in the printable ASCII range.  These characters are
@ -892,6 +900,7 @@ void TransliteratorParser::parseRules(const UnicodeString& rules,
    }
    parseData->data = data;
    determineVariableRange(rules);
+    dotStandIn = (UChar) -1;

    UnicodeString str; // scratch
    idBlock.truncate(0);
@ -1257,6 +1266,17 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
    return variableNext++;
 }

+/**
+ * Return the stand-in for the dot set.  It is allocated the first
+ * time and reused thereafter.
+ */
+UChar TransliteratorParser::getDotStandIn() {
+    if (dotStandIn == (UChar) -1) {
+        dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET, status));
+    }
+    return dotStandIn;
+}
+
 /**
 * Append the value of the given variable name to the given
 * UnicodeString.
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@ -108,6 +108,13 @@ class TransliteratorParser {
     */
    UnicodeString undefinedVariableName;

+    /**
+     * The stand-in character for the 'dot' set, represented by '.' in
+     * patterns.  This is allocated the first time it is needed, and
+     * reused thereafter.
+     */
+    UChar dotStandIn;
+
 public:

    /**
@ -190,6 +197,12 @@ private:
     */
    UChar generateStandInFor(UnicodeMatcher* adopted);

+    /**
+     * Return the stand-in for the dot set.  It is allocated the first
+     * time and reused thereafter.
+     */
+    UChar getDotStandIn();
+
    /**
     * Append the value of the given variable name to the given
     * UnicodeString.
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@ -17,6 +17,7 @@
 #include "rbt_rule.h"
 #include "umutex.h"
 #include "ucln_in.h"
+#include "upropset.h"

 // HIGH_VALUE > all valid values. 110000 for codepoints
 #define UNICODESET_HIGH 0x0110000
@ -42,49 +43,40 @@
 #define UPPER_U         ((UChar)0x0055) /*U*/
 #define LOWER_U         ((UChar)0x0075) /*u*/

-// N.B.: This mapping is different in ICU and Java
-//const UnicodeString UnicodeSet::CATEGORY_NAMES(
-//    "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf", "");
-static const UChar CATEGORY_NAMES[] = {
-    0x43, 0x6E, /* "Cn" */
-    0x4C, 0x75, /* "Lu" */
-    0x4C, 0x6C, /* "Ll" */
-    0x4C, 0x74, /* "Lt" */
-    0x4C, 0x6D, /* "Lm" */
-    0x4C, 0x6F, /* "Lo" */
-    0x4D, 0x6E, /* "Mn" */
-    0x4D, 0x65, /* "Me" */
-    0x4D, 0x63, /* "Mc" */
-    0x4E, 0x64, /* "Nd" */
-    0x4E, 0x6C, /* "Nl" */
-    0x4E, 0x6F, /* "No" */
-    0x5A, 0x73, /* "Zs" */
-    0x5A, 0x6C, /* "Zl" */
-    0x5A, 0x70, /* "Zp" */
-    0x43, 0x63, /* "Cc" */
-    0x43, 0x66, /* "Cf" */
-    0x43, 0x6F, /* "Co" */
-    0x43, 0x73, /* "Cs" */
-    0x50, 0x64, /* "Pd" */
-    0x50, 0x73, /* "Ps" */
-    0x50, 0x65, /* "Pe" */
-    0x50, 0x63, /* "Pc" */
-    0x50, 0x6F, /* "Po" */
-    0x53, 0x6D, /* "Sm" */
-    0x53, 0x63, /* "Sc" */
-    0x53, 0x6B, /* "Sk" */
-    0x53, 0x6F, /* "So" */
-    0x50, 0x69, /* "Pi" */
-    0x50, 0x66, /* "Pf" */
-    0x00
-};
-
-/**
- * A cache mapping character category integers, as returned by
- * Unicode::getType(), to pairs strings.  Entries are initially
- * zero length and are filled in on demand.
- */
-static UnicodeSet* CATEGORY_CACHE = NULL;
+//// TEMPORARY: Remove when deprecated category code constructor is removed.
+//static const UChar CATEGORY_NAMES[] = {
+//    0x43, 0x6E, /* "Cn" */
+//    0x4C, 0x75, /* "Lu" */
+//    0x4C, 0x6C, /* "Ll" */
+//    0x4C, 0x74, /* "Lt" */
+//    0x4C, 0x6D, /* "Lm" */
+//    0x4C, 0x6F, /* "Lo" */
+//    0x4D, 0x6E, /* "Mn" */
+//    0x4D, 0x65, /* "Me" */
+//    0x4D, 0x63, /* "Mc" */
+//    0x4E, 0x64, /* "Nd" */
+//    0x4E, 0x6C, /* "Nl" */
+//    0x4E, 0x6F, /* "No" */
+//    0x5A, 0x73, /* "Zs" */
+//    0x5A, 0x6C, /* "Zl" */
+//    0x5A, 0x70, /* "Zp" */
+//    0x43, 0x63, /* "Cc" */
+//    0x43, 0x66, /* "Cf" */
+//    0x43, 0x6F, /* "Co" */
+//    0x43, 0x73, /* "Cs" */
+//    0x50, 0x64, /* "Pd" */
+//    0x50, 0x73, /* "Ps" */
+//    0x50, 0x65, /* "Pe" */
+//    0x50, 0x63, /* "Pc" */
+//    0x50, 0x6F, /* "Po" */
+//    0x53, 0x6D, /* "Sm" */
+//    0x53, 0x63, /* "Sc" */
+//    0x53, 0x6B, /* "Sk" */
+//    0x53, 0x6F, /* "So" */
+//    0x50, 0x69, /* "Pi" */
+//    0x50, 0x66, /* "Pf" */
+//    0x00
+//};

 /**
 * Delimiter string used in patterns to close a category reference:
@ -92,16 +84,12 @@ static UnicodeSet* CATEGORY_CACHE = NULL;
 */
 static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */

-
 /**
 * Cleanup function for transliterator component; delegates to
 * Transliterator::cleanupRegistry().
 */
 U_CFUNC UBool unicodeset_cleanup(void) {
-    if (CATEGORY_CACHE) {
-        delete []CATEGORY_CACHE;
-        CATEGORY_CACHE = NULL;
-    }
+    UnicodePropertySet::cleanup();
    return TRUE;
 }

@ -174,24 +162,24 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
    applyPattern(pattern, pos, &symbols, status);
 }

-/**
- * Constructs a set from the given Unicode character category.
- * @param category an integer indicating the character category as
- * returned by <code>Unicode::getType()</code>.
- */
-UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) :
-    len(0), capacity(START_EXTRA), bufferCapacity(0), list(0),
-    buffer(0)
-{
-    if (U_SUCCESS(status)) {
-        if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) {
-            status = U_ILLEGAL_ARGUMENT_ERROR;
-        } else {
-            list = new UChar32[capacity];
-            *this = getCategorySet(category);
-        }
-    }
-}
+///**
+// * Constructs a set from the given Unicode character category.
+// * @param category an integer indicating the character category as
+// * returned by <code>Unicode::getType()</code>.
+// */
+//UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) :
+//    len(0), capacity(START_EXTRA), bufferCapacity(0), list(0),
+//    buffer(0)
+//{
+//    if (U_SUCCESS(status)) {
+//        if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) {
+//            status = U_ILLEGAL_ARGUMENT_ERROR;
+//        } else {
+//            list = new UChar32[capacity];
+//            *this = getCategorySet(category);
+//        }
+//    }
+//}

 /**
 * Constructs a set that is identical to the given UnicodeSet.
@ -319,6 +307,16 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
    }
 }

+/**
+ * Return true if the given position, in the given pattern, appears
+ * to be the start of a UnicodeSet pattern.
+ */
+UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
+    return ((pos+1) < pattern.length() &&
+            pattern.charAt(pos) == (UChar)91/*[*/) ||
+        UnicodePropertySet::resemblesPattern(pattern, pos);
+}
+
 /**
 * Append the <code>toPattern()</code> representation of a
 * character to the given <code>StringBuffer</code>.
@ -339,6 +337,8 @@ void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool useHexEscape)
    case COMPLEMENT:
    case INTERSECTION:
    case BACKSLASH:
+    case 123/*{*/:
+    case 125/*}*/:
        buf.append(BACKSLASH);
        break;
    default:
@ -451,15 +451,15 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
                                            UBool escapeUnprintable) const {
    result.append(SET_OPEN);

-    // Check against the predefined categories.  We implicitly build
-    // up ALL category sets the first time toPattern() is called.
-    for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
-        if (*this == getCategorySet(cat)) {
-            result.append(COLON);
-            result.append(CATEGORY_NAMES, cat*2, 2);
-            return result.append(CATEGORY_CLOSE);
-        }
-    }
+//  // Check against the predefined categories.  We implicitly build
+//  // up ALL category sets the first time toPattern() is called.
+//  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
+//      if (*this == getCategorySet(cat)) {
+//          result.append(COLON);
+//          result.append(CATEGORY_NAMES, cat*2, 2);
+//          return result.append(CATEGORY_CLOSE);
+//      }
+//  }

    int32_t count = getRangeCount();

@ -940,9 +940,9 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
    // mode 1: '[' seen; if next is '^' or ':' then special
    // mode 2: '[' '^'? seen; parse pattern and close with ']'
    // mode 3: '[:' seen; parse category and close with ':]'
-    // mode 4: Pattern closed cleanly
+    // mode 4: ']' seen; parse complete
+    // mode 5: Top-level property pattern seen
    int8_t mode = 0;
-    int32_t colonPos = 0; // Expected pos of ':' in '[:'
    int32_t i = pos.getIndex();
    int32_t limit = pattern.length();
    UnicodeSet nestedAux;
@ -997,9 +997,11 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
        // Parse the opening '[' and optional following '^'
        switch (mode) {
        case 0:
-            if (c == SET_OPEN) {
+            if (UnicodePropertySet::resemblesPattern(pattern, i-1)) {
+                mode = 3;
+                break; // Fall through
+            } else if (c == SET_OPEN) {
                mode = 1; // Next look for '^' or ':'
-                colonPos = i; // Expect ':' at next offset
                continue;
            } else {
                // throw new IllegalArgumentException("Missing opening '['");
@ -1013,18 +1015,6 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
                invert = TRUE;
                newPat.append(c);
                continue; // Back to top to fetch next character
-            case COLON:
-                // '[:' cannot have whitespace in it.  'i' has already
-                // been advanced.
-                if (i-1 == colonPos) {
-                    --i; // Backup to the '['
-                    c = SET_OPEN;
-                    mode = 3;
-                    // Fall through and parse category using the same
-                    // code used to parse a nested category.  The mode
-                    // will indicate that this is actually top level.
-                }
-                break; // Fall through
            case HYPHEN:
                isLiteral = TRUE; // Treat leading '-' as a literal
                break; // Fall through
@ -1041,12 +1031,59 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
        // buffer.  Characters in the variable buffer have already
        // benn through escape and variable reference processing.
        if (varValueBuffer == NULL) {
+            /**
+             * Handle property set patterns.
+             */
+            if (UnicodePropertySet::resemblesPattern(pattern, i-1)) {
+                ParsePosition pp(i-1);
+                nestedSet = UnicodePropertySet::createFromPattern(pattern, pp);
+                if (nestedSet == NULL) {
+                    // assert(pp.getIndex() == i-1);
+                    //throw new IllegalArgumentException("Invalid property pattern " +
+                    //                                   pattern.substring(i-1));
+                    status = U_INVALID_PROPERTY_PATTERN;
+                    return;
+                }
+                // TODO This is very inefficient.  We create a new UnicodeSet,
+                // then do an assignment, then delete it.  Clean this up in
+                // the future so that either (1) we just use the new set
+                // directly, and delete it when we're done, or (2) even better,
+                // UnicodePropertySet takes an existing set.
+                nestedAux = *nestedSet;
+                delete nestedSet;
+                nestedSet = &nestedAux;
+                nestedPatStart = newPat.length();
+                nestedPatDone = TRUE; // we're going to do it just below
+                
+                // If we have a top-level property pattern, then trim
+                // off the opening '[' and use the property pattern
+                // as the entire pattern.
+                if (mode == 3) {
+                    newPat.truncate(0);
+                }
+                UnicodeString str;
+                pattern.extractBetween(i-1, pp.getIndex(), str);
+                newPat.append(str);
+                rebuildPattern = TRUE;
+                
+                i = pp.getIndex(); // advance past property pattern
+                
+                if (mode == 3) {
+                    // Entire pattern is a property pattern; leave the
+                    // parse loop.  This is one of 2 ways we leave this
+                    // loop if the pattern is well-formed.
+                    *this = nestedAux;
+                    mode = 5;
+                    break;
+                }
+            }
+            
            /* Handle escapes.  If a character is escaped, then it assumes its
             * literal value.  This is true for all characters, both special
             * characters and characters with no special meaning.  We also
             * interpret '\\uxxxx' Unicode escapes here (as literals).
             */
-            if (c == BACKSLASH) {
+            else if (c == BACKSLASH) {
                UChar32 escaped = pattern.unescapeAt(i);
                if (escaped == (UChar32) -1) {
                    status = U_ILLEGAL_ARGUMENT_ERROR;
@ -1084,73 +1121,28 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
            }

            /* An opening bracket indicates the first bracket of a nested
-             * subpattern, either a normal pattern or a category pattern.  We
-             * recognize these here and set nestedSet accordingly.
-             *
-             * The other way we wind up here is with a top level category.
-             * If that is the case, the mode will be set accordingly.
+             * subpattern.
             */
            else if (!isLiteral && c == SET_OPEN) {
                // Record position before nested pattern
                nestedPatStart = newPat.length();

-                // Handle "[:...:]", representing a character category
-                if (i < pattern.length() && pattern.charAt(i) == COLON) {
-                    ++i;
-                    int32_t j = pattern.indexOf(CATEGORY_CLOSE, i);
-                    if (j < 0) {
-                        // throw new IllegalArgumentException("Missing \":]\"");
-                        status = U_ILLEGAL_ARGUMENT_ERROR;
-                        return;
-                    }
-                    scratch.truncate(0);
-                    pattern.extractBetween(i, j, scratch);
-                    nestedAux.applyCategory(scratch, status);
-                    nestedSet = &nestedAux;
-                    nestedPatDone = TRUE; // We're going to do it just below
-                    if (U_FAILURE(status)) {
-                        return;
-                    }
-                    i = j+2; // Advance i past ":]"
-
-                    // Use a rebuilt pattern.  If we are top level,
-                    // then there is already a SET_OPEN in newPat, and
-                    // SET_CLOSE will be appended elsewhere.
-                    if (mode != 3) {
-                        newPat.append(SET_OPEN);
-                    }
-                    newPat.append(COLON).append(scratch).append(COLON);
-                    if (mode != 3) {
-                        newPat.append(SET_CLOSE);
-                    }
-                    rebuildPattern = TRUE;
-
-                    if (mode == 3) {
-                        // Entire pattern is a category; leave parse
-                        // loop.  This is one of 2 ways we leave this
-                        // loop if the pattern is well-formed.
-                        *this = *nestedSet;
-                        mode = 4;
-                        break;
-                    }
-                } else {
-                    // Recurse to get the pairs for this nested set.
-                    // Backup i to '['.
-                    pos.setIndex(--i);
-                    switch (lastOp) {
-                    case HYPHEN:
-                    case INTERSECTION:
-                        newPat.append(lastOp);
-                        break;
-                    }
-                    nestedAux._applyPattern(pattern, pos, symbols, newPat, status);
-                    nestedSet = &nestedAux;
-                    nestedPatDone =  TRUE;
-                    if (U_FAILURE(status)) {
-                        return;
-                    }
-                    i = pos.getIndex();
+                // Recurse to get the pairs for this nested set.
+                // Backup i to '['.
+                pos.setIndex(--i);
+                switch (lastOp) {
+                case HYPHEN:
+                case INTERSECTION:
+                    newPat.append(lastOp);
+                    break;
                }
+                nestedAux._applyPattern(pattern, pos, symbols, newPat, status);
+                nestedSet = &nestedAux;
+                nestedPatDone =  TRUE;
+                if (U_FAILURE(status)) {
+                    return;
+                }
+                i = pos.getIndex();
            }
        }

@ -1255,7 +1247,22 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
        }
    }

-    if (lastChar != NONE) {
+    if (mode < 4) {
+        // throw new IllegalArgumentException("Missing ']'");
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    // Treat a trailing '$' as indicating ETHER.  This code is only
+    // executed if symbols == NULL; otherwise other code parses the
+    // anchor.
+    if (lastChar == (UChar)SymbolTable::SYMBOL_REF) {
+        rebuildPattern = TRUE;
+        newPat.append(lastChar);
+        add(TransliterationRule::ETHER);
+    }
+
+    else if (lastChar != NONE) {
        add(lastChar, lastChar);
        _appendToPat(newPat, lastChar, FALSE);
    }
@ -1271,7 +1278,9 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
        return;
    }

-    newPat.append(SET_CLOSE);
+    if (mode == 4) {
+        newPat.append(SET_CLOSE);
+    }

    /**
     * If we saw a '^' after the initial '[' of this pattern, then perform
@ -1281,12 +1290,6 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
        complement();
    }

-    if (mode != 4) {
-        // throw new IllegalArgumentException("Missing ']'");
-        status = U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-
    pos.setIndex(i);

    // Use the rebuilt pattern (newPat) only if necessary.  Prefer the
@ -1298,157 +1301,6 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
    }
 }

-//----------------------------------------------------------------
-// Implementation: Generation of pairs for Unicode categories
-//----------------------------------------------------------------
-
-/**
- * Sets this object to the given category, given its name.
- * The category name must be either a two-letter name, such as
- * "Lu", or a one letter name, such as "L".  One-letter names
- * indicate the logical union of all two-letter names that start
- * with that letter.  Case is significant.  If the name starts
- * with the character '^' then the complement of the given
- * character set is returned.
- *
- * Although individual categories such as "Lu" are cached, we do
- * not currently cache single-letter categories such as "L" or
- * complements such as "^Lu" or "^L".  It would be easy to cache
- * these as well in a hashtable should the need arise.
- */
-void UnicodeSet::applyCategory(const UnicodeString& catName,
-                               UErrorCode& status) {
-    if (U_FAILURE(status)) {
-        return;
-    }
-
-    UnicodeString cat(catName);
-    UBool invert = (catName.length() > 1 &&
-                     catName.charAt(0) == COMPLEMENT);
-    if (invert) {
-        cat.remove(0, 1);
-    }
-
-    UBool match = FALSE;
-
-    // if we have two characters, search the category map for that
-    // code and either construct and return a UnicodeSet from the
-    // data in the category map or throw an exception
-    if (cat.length() == 2) {
-        int32_t i = 0;
-        int32_t numCategories = Unicode::GENERAL_TYPES_COUNT * 2;
-
-        while (i < numCategories)
-        {
-            if (CATEGORY_NAMES[i] == cat.charAt(0)
-                && CATEGORY_NAMES[i+1] == cat.charAt(1))
-            {
-                *this = getCategorySet((int8_t)(i/2));
-                match = TRUE;
-                break;
-            }
-            i += 2;
-        }
-    } else if (cat.length() == 1) {
-        // if we have one character, search the category map for
-        // codes beginning with that letter, and union together
-        // all of the matching sets that we find (or throw an
-        // exception if there are no matches)
-        clear();
-        for (int32_t i=0; i<Unicode::GENERAL_TYPES_COUNT; ++i) {
-            if (CATEGORY_NAMES[2*i] == cat.charAt(0)) {
-                addAll(getCategorySet((int8_t)i));
-                match = TRUE;
-            }
-        }
-    }
-
-    if (!match) {
-        // TODO: Add caching of these, if desired
-        char buf[128];
-        catName.extract(buf, sizeof(buf), NULL, status);
-        UScriptCode script = uscript_getCode(buf, &status);
-        if (script != USCRIPT_INVALID_CODE) {
-            match = TRUE;
-            clear();
-            int32_t start = -1;
-            int32_t end = -2;
-            for (UChar32 i=MIN_VALUE; i<=MAX_VALUE; ++i) {
-                if (uscript_getScript(i, &status) == script) {
-                    if ((end+1) == (int32_t) i) {
-                        end = i;
-                    } else {
-                        if (start >= 0) {
-                            add((UChar32) start, (UChar32) end);
-                        }
-                        start = end = i;
-                    }
-                }
-            }
-            if (start >= 0) {
-                add((UChar32) start, (UChar32) end);
-            }
-        }
-    }
-
-    if (!match) {
-        status = U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-
-    if (invert) {
-        complement();
-    }
-}
-
-/**
- * Returns a pairs string for the given category.  This string is
- * cached and returned again if this method is called again with
- * the same parameter.
- */
-const UnicodeSet& UnicodeSet::getCategorySet(int8_t cat) {
-    // In order to tell what cache entries are empty, we assume
-    // every category specifies at least one character.  Thus
-    // sets in the cache that are empty are uninitialized.
-    if (CATEGORY_CACHE == NULL) {
-        umtx_lock(NULL);
-        if (CATEGORY_CACHE == NULL) {
-            CATEGORY_CACHE = new UnicodeSet[Unicode::GENERAL_TYPES_COUNT];
-            ucln_i18n_registerCleanup();
-        }
-        umtx_unlock(NULL);
-    }
-    if (CATEGORY_CACHE[cat].isEmpty()) {
-        // Walk through all Unicode characters, noting the start
-        // and end of each range for which Character.getType(c)
-        // returns the given category integer.  Since we are
-        // iterating in order, we can simply append the resulting
-        // ranges to the pairs string.
-        UnicodeSet& set = CATEGORY_CACHE[cat];
-        int32_t start = -1;
-        int32_t end = -2;
-        // N.B.: There seems to be a bug that deadlocks if you
-		// call getType() with a supplemental character right now.
-		// TODO: Change 0xFFFF to MAX_VALUE later.
-        for (int32_t i=MIN_VALUE; i<=0xFFFF/*TEMPORARY*/; ++i) {
-            if (Unicode::getType((UChar)i) == cat) {
-                if ((end+1) == i) {
-                    end = i;
-                } else {
-                    if (start >= 0) {
-                        set.add((UChar32)start, (UChar32)end);
-                    }
-                    start = end = i;
-                }
-            }
-        }
-        if (start >= 0) {
-            set.add((UChar32)start, (UChar32)end);
-        }
-    }
-    return CATEGORY_CACHE[cat];
-}
-
 //----------------------------------------------------------------
 // Implementation: Utility methods
 //----------------------------------------------------------------
--- a/icu4c/source/i18n/upropset.cpp
+++ b/icu4c/source/i18n/upropset.cpp
@ -0,0 +1,597 @@
+/*
+**********************************************************************
+*   Copyright (c) 2001, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+* $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.cpp,v $
+* $Date: 2001/10/17 19:20:41 $
+* $Revision: 1.1 $
+**********************************************************************
+*/
+#include "upropset.h"
+#include "ustrfmt.h"
+#include "unicode/unistr.h"
+#include "unicode/uscript.h"
+#include "unicode/uniset.h"
+#include "unicode/parsepos.h"
+#include "hash.h"
+
+U_NAMESPACE_BEGIN
+
+static Hashtable* NAME_MAP = NULL;
+
+static Hashtable* CATEGORY_MAP = NULL;
+
+/**
+ * A cache mapping general category codes, as returned by
+ * u_charType(), to sets.  The array is allocated by init(); entries
+ * start out empty and are filled on demand by getCategorySet().
+ */
+static UnicodeSet* CATEGORY_CACHE = NULL;
+
+/**
+ * A cache mapping script codes, as defined by UScriptCode, to
+ * sets.  The array is allocated by init(); entries start out
+ * empty and are looked up through getScriptSet().
+ */
+static UnicodeSet* SCRIPT_CACHE = NULL;
+
+// Special value codes
+static const int32_t ANY = -1; // general category: all code points
+
+//----------------------------------------------------------------
+// Unicode string and character constants
+//----------------------------------------------------------------
+
+static const UChar POSIX_OPEN[]  = { 91,58,0 }; // "[:"
+static const UChar POSIX_CLOSE[] = { 58,93,0 }; // ":]"
+
+static const UChar PERL_OPEN[]  = { 92,112,0 }; // "\\p"
+static const UChar PERL_CLOSE[] = { 125,0 };    // "}"
+
+static const UChar HAT        = 0x005E; /*^*/
+static const UChar UPPER_P    = 0x0050; /*P*/
+static const UChar LEFT_BRACE = 0x007B; /*{*/
+static const UChar EQUALS     = 0x003D; /*=*/
+
+//----------------------------------------------------------------------
+// class _CharString
+// An identical class named CharString can be found in transreg.cpp.
+// If we find ourselves needing another copy of this utility class we
+// should probably pull it out into putil or some such place.
+//----------------------------------------------------------------------
+
+/**
+ * Holds a char* rendering of a UnicodeString for the lifetime of
+ * the object.  Short strings use the inline buffer; longer ones use
+ * a heap block that the destructor releases.
+ */
+class _CharString {
+ public:
+    _CharString(const UnicodeString& str);
+    ~_CharString();
+    operator char*() { return ptr; }
+ private:
+    // Not copyable: a compiler-generated copy would either point at
+    // the other object's inline buffer or delete[] the same heap
+    // block twice.  Declared but intentionally never defined.
+    _CharString(const _CharString& other);
+    _CharString& operator=(const _CharString& other);
+
+    char buf[128]; // inline storage used when the string is short
+    char* ptr;     // either buf, or a heap block owned by this object
+};
+
+/**
+ * Extract str into buf when it is shorter than buf, otherwise into
+ * a freshly allocated block of str.length() + 8 chars.
+ */
+_CharString::_CharString(const UnicodeString& str) {
+    if (str.length() >= (int32_t)sizeof(buf)) {
+        ptr = new char[str.length() + 8];
+    } else {
+        ptr = buf;
+    }
+    // NOTE(review): "" is presumably ICU's invariant-character
+    // codepage -- confirm against UnicodeString::extract().
+    str.extract(0, 0x7FFFFFFF, ptr, "");
+}
+
+/**
+ * Release the heap block, if one was allocated.
+ */
+_CharString::~_CharString() {
+    if (ptr != buf) {
+        delete[] ptr;
+    }
+}
+
+//----------------------------------------------------------------
+// Public API
+//----------------------------------------------------------------
+
+/**
+ * Quick syntactic check: does the text at pos look like the start
+ * of a property set pattern, i.e. [:foo:], \p{foo}, or \P{foo}?
+ * Only the opening delimiter is examined.
+ */
+UBool UnicodePropertySet::resemblesPattern(const UnicodeString& pattern,
+                                           int32_t pos) {
+    // Anything shorter than 5 characters cannot be a pattern
+    if (pattern.length() < (pos+5)) {
+        return FALSE;
+    }
+
+    // POSIX-style opener "[:" (which also covers "[:^")
+    if (pattern.compare(pos, 2, POSIX_OPEN) == 0) {
+        return TRUE;
+    }
+
+    // Perl-style opener; the case-folding compare accepts \p and \P
+    return pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT) == 0;
+}
+
+/**
+ * Parse a property set pattern at the given parse position and
+ * build the UnicodeSet it denotes.
+ *
+ * @param pattern the pattern string
+ * @param ppos on entry, the position at which to begin parsing.
+ * This should be one of the locations marked '^':
+ *
+ *   [:blah:]     \p{blah}     \P{blah}
+ *   ^       %    ^       %    ^       %
+ *
+ * On successful return, the position just past the close delimiter,
+ * that is, one of the locations marked '%'.  If the parse fails,
+ * ppos is left unchanged.
+ * @return a newly-constructed UnicodeSet object, or NULL upon
+ * failure.
+ */
+UnicodeSet* UnicodePropertySet::createFromPattern(const UnicodeString& pattern,
+                                                  ParsePosition& ppos) {
+    // Make sure the name maps and caches exist
+    init();
+
+    int32_t pos = ppos.getIndex();
+
+    // The shortest legal pattern is 5 characters long, e.g. \p{L}
+    if (pattern.length() < (pos+5)) {
+        return NULL;
+    }
+
+    UBool isPosix; // TRUE for [:pat:], FALSE for \p{pat} \P{pat}
+    UBool negate;  // TRUE for [:^pat:] and \P{pat}
+
+    // Consume the opening [:, [:^, \p{, or \P{
+    if (pattern.compare(pos, 2, POSIX_OPEN) == 0) {
+        isPosix = TRUE;
+        negate = FALSE;
+        pos = skipWhitespace(pattern, pos+2);
+        if (pos < pattern.length() && pattern.charAt(pos) == HAT) {
+            negate = TRUE;
+            ++pos;
+        }
+    } else if (pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT) == 0) {
+        isPosix = FALSE;
+        negate = (pattern.charAt(pos+1) == UPPER_P);
+        pos = skipWhitespace(pattern, pos+2);
+        if (pos == pattern.length()) {
+            // Syntax error; nothing follows "\p" or "\P"
+            return NULL;
+        }
+        if (pattern.charAt(pos) != LEFT_BRACE) {
+            // Syntax error; "\p" or "\P" not followed by "{"
+            return NULL;
+        }
+        ++pos;
+    } else {
+        // Open delimiter not seen
+        return NULL;
+    }
+
+    // Locate the close delimiter that goes with the opener, :] or }
+    int32_t closePos = pattern.indexOf(isPosix ? POSIX_CLOSE : PERL_CLOSE, pos);
+    if (closePos < 0) {
+        // Syntax error; close delimiter missing
+        return NULL;
+    }
+
+    UnicodeSet* result;
+
+    // An '=' before the close delimiter selects the medium
+    // \p{gc=Cf} or long \p{GeneralCategory=Format} form.
+    int32_t equalsPos = pattern.indexOf(EQUALS, pos);
+    if (equalsPos < 0 || equalsPos >= closePos) {
+        // No equals inside the delimiters; short form \p{Cf}.
+        // Try the name as a general category first, then as a script.
+        UnicodeString shortName = munge(pattern, pos, closePos);
+        result = createCategorySet(shortName);
+        if (result == NULL) {
+            result = createScriptSet(shortName);
+        }
+    } else {
+        // type=value form; the type name selects the set factory
+        UnicodeString typeName = munge(pattern, pos, equalsPos);
+        UnicodeString valueName = munge(pattern, equalsPos+1, closePos);
+        SetFactory factory = voidPtrToSetFactory(NAME_MAP->get(typeName));
+        if (factory == NULL) {
+            // Syntax error; type name not recognized
+            return NULL;
+        }
+        result = (*factory)(valueName);
+    }
+
+    // Upon failure, return NULL with ppos unchanged
+    if (result == NULL) {
+        return NULL;
+    }
+
+    if (negate) {
+        result->complement();
+    }
+
+    // Report the limit position just past the close delimiter
+    ppos.setIndex(closePos + (isPosix ? 2 : 1));
+
+    return result;
+}
+
+//----------------------------------------------------------------
+// Property set factory static methods
+// NOTE: This will change/go away when we implement UCharacter
+// based property retrieval.
+//----------------------------------------------------------------
+
+/**
+ * Filter callback: accepts c when its digit value equals the
+ * int32_t that context points at.
+ */
+static UBool _numericValueFilter(UChar32 c, void* context) {
+    const int32_t* wanted = (const int32_t*) context;
+    // TODO: Change this to a more generic function, like
+    // u_charNumericValue (when one exists).
+    return u_charDigitValue(c) == *wanted;
+}
+
+/**
+ * Given a numeric value name, create the set of code points whose
+ * digit value, as reported by u_charDigitValue(), equals that value.
+ * Never returns NULL: a name that does not parse completely as a
+ * non-negative integer that fits in an int32_t yields an empty set.
+ * @param valueName a pre-munged numeric value name
+ */
+UnicodeSet* UnicodePropertySet::createNumericValueSet(const UnicodeString& valueName) {
+    _CharString cvalueName(valueName);
+    UnicodeSet* set = new UnicodeSet();
+    char* end;
+    double value = uprv_strtod(cvalueName, &end);
+    // Range-check BEFORE narrowing: converting a double that is NaN
+    // or outside the int32_t range to an integer is undefined
+    // behavior.  The test is phrased so that NaN fails it.
+    if (*end != 0 || !(value >= 0 && value <= 2147483647.0)) {
+        // Trailing garbage, negative, or too large; return an
+        // empty set
+        return set;
+    }
+    int32_t ivalue = (int32_t) value;
+    if (ivalue != value) {
+        // UCharacter doesn't support non-integral values, so just
+        // return an empty set
+        return set;
+    }
+    initSetFromFilter(*set, _numericValueFilter, &ivalue);
+    return set;
+}
+
+/**
+ * Build the set for a general category value name.  The name is
+ * looked up in CATEGORY_MAP, which yields either the special code
+ * ANY or a bit mask with bit (1 << cat) set for every category the
+ * name covers.
+ * @param valueName a pre-munged general category value name
+ * @return a newly-allocated set, or NULL if the name is not in the
+ * map
+ */
+UnicodeSet* UnicodePropertySet::createCategorySet(const UnicodeString& valueName) {
+    const int32_t mask = CATEGORY_MAP->geti(valueName);
+    if (mask == 0) {
+        // Zero is what the lookup yields for an unknown name
+        return NULL;
+    }
+
+    UnicodeSet* result = new UnicodeSet();
+    if (mask == ANY) {
+        // Complementing the empty set gives all code points
+        result->complement();
+    } else {
+        // Union the cached set of every category whose bit is set
+        for (int32_t cat=0; cat<U_CHAR_CATEGORY_COUNT; ++cat) {
+            if ((mask & (1 << cat)) != 0) {
+                result->addAll(getCategorySet(cat));
+            }
+        }
+    }
+    return result;
+}
+
+/**
+ * Build the set for a script value name.  The name is resolved to
+ * a script code with uscript_getCode(); the result is a copy of
+ * the cached set for that script.
+ * @param valueName a pre-munged script value name
+ * @return a newly-allocated set, or NULL if the name is invalid
+ */
+UnicodeSet* UnicodePropertySet::createScriptSet(const UnicodeString& valueName) {
+    UErrorCode status = U_ZERO_ERROR;
+    _CharString cvalueName(valueName);
+    UScriptCode script = uscript_getCode(cvalueName, &status);
+    if (!U_FAILURE(status) && script != USCRIPT_INVALID_CODE) {
+        // Copy, so that the caller owns a set it is free to modify
+        return new UnicodeSet(getScriptSet(script));
+    }
+    // Syntax error; unknown short name
+    return NULL;
+}
+
+//----------------------------------------------------------------
+// Utility methods
+//----------------------------------------------------------------
+
+/**
+ * Filter callback: accepts c when u_charType(c) equals the int32_t
+ * category code that context points at.
+ */
+static UBool _categoryFilter(UChar32 c, void* context) {
+    const int32_t* wanted = (const int32_t*) context;
+    return u_charType(c) == *wanted;
+}
+
+/**
+ * Look up, building it on first use, the cached set of all code
+ * points whose general category is cat.
+ *
+ * The result is a reference into the shared cache, so callers MUST
+ * NOT MODIFY it.  Emptiness is what marks a cache slot as unfilled,
+ * so a category whose set comes out empty is rebuilt on every call.
+ */
+const UnicodeSet& UnicodePropertySet::getCategorySet(int32_t cat) {
+    UnicodeSet& slot = CATEGORY_CACHE[cat];
+    if (slot.isEmpty()) {
+        initSetFromFilter(slot, _categoryFilter, &cat);
+    }
+    return slot;
+}
+
+/**
+ * Filter callback: accepts c when uscript_getScript(c) equals the
+ * UScriptCode that context points at.  The error code set by
+ * uscript_getScript() is not inspected.
+ */
+static UBool _scriptFilter(UChar32 c, void* context) {
+    const UScriptCode wanted = * (const UScriptCode*) context;
+    UErrorCode status = U_ZERO_ERROR;
+    return uscript_getScript(c, &status) == wanted;
+}
+
+/**
+ * Returns a UnicodeSet for the given script.  This set is
+ * cached and returned again if this method is called again with
+ * the same parameter.
+ *
+ * Callers MUST NOT MODIFY the returned set.
+ */
+const UnicodeSet& UnicodePropertySet::getScriptSet(UScriptCode script) {
+    if (SCRIPT_CACHE[script].isEmpty()) {
+        // Fill the SCRIPT cache slot.  CATEGORY_CACHE is a different
+        // array, sized by U_CHAR_CATEGORY_COUNT and indexed by
+        // general category, so it must not be indexed by script.
+        initSetFromFilter(SCRIPT_CACHE[script], _scriptFilter, &script);
+    }
+    return SCRIPT_CACHE[script];
+}
+
+/**
+ * Given a string, munge it to lose the whitespace: copy the code
+ * points of str in [start, limit), dropping every one for which
+ * u_isspace() is true.  So "General Category " becomes
+ * "GeneralCategory".  We munge all type and value strings, and
+ * store all type and value keys pre-munged.  NOTE: Unlike the Java
+ * version, we do not modify the case, since we use a
+ * case-insensitive compare function.
+ */
+UnicodeString UnicodePropertySet::munge(const UnicodeString& str,
+                                        int32_t start, int32_t limit) {
+    UnicodeString result;
+    int32_t i = start;
+    while (i < limit) {
+        const UChar32 c = str.char32At(i);
+        if (!u_isspace(c)) {
+            result.append(c);
+        }
+        // Step over the whole code point
+        i += UTF_CHAR_LENGTH(c);
+    }
+    return result;
+}
+
+/**
+ * Advance past any run of white space (as judged by u_isspace())
+ * that begins at pos.
+ * @return the index of the first non-white-space character at or
+ * after pos, or a value no less than str.length() if there is none
+ */
+int32_t UnicodePropertySet::skipWhitespace(const UnicodeString& str,
+                                           int32_t pos) {
+    const int32_t length = str.length();
+    UChar32 c;
+    while (pos < length && u_isspace(c = str.char32At(pos))) {
+        // Step over the whole code point
+        pos += UTF_CHAR_LENGTH(c);
+    }
+    return pos;
+}
+
+//----------------------------------------------------------------
+// Generic filter-based scanning code
+//
+// NOTE: In general, we don't want to do this!  This is a temporary
+// implementation until we have time for something that examines
+// the underlying UCharacter data structures in an intelligent
+// way.  Iterating over all code points is dumb.  What we want to
+// do, for instance, is iterate over internally-stored ranges
+// of characters that have a given property.
+//----------------------------------------------------------------
+
+/**
+ * Replace the contents of set with every code point in the scanned
+ * range for which filter returns true.  The context pointer is
+ * handed to the filter unchanged on every call.
+ */
+void UnicodePropertySet::initSetFromFilter(UnicodeSet& set, Filter filter,
+                                           void* context) {
+    set.clear();
+
+    // Start of the run of accepted code points currently being
+    // scanned, or -1 when the scan is between runs.
+    int32_t runStart = -1;
+
+    // TODO Extend this up to UnicodeSet.MAX_VALUE when we have
+    // better performance; i.e., when this code can get moved into
+    // the UCharacter class and not have to iterate over code
+    // points.  Right now it's way too slow to iterate to 10FFFF.
+
+    for (int32_t i=UnicodeSet::MIN_VALUE; i<=0xFFFF/*TEMPORARY*/; ++i) {
+        if ((*filter)((UChar32) i, context)) {
+            if (runStart < 0) {
+                // First accepted code point after a gap
+                runStart = i;
+            }
+        } else if (runStart >= 0) {
+            // The run ended on the previous code point; record it
+            set.add((UChar32)runStart, (UChar32)(i-1));
+            runStart = -1;
+        }
+    }
+    if (runStart >= 0) {
+        // The last run extends to the end of the scanned range
+        set.add((UChar32)runStart, (UChar32)0xFFFF);
+    }
+}
+
+//----------------------------------------------------------------
+// Type and value name maps
+//----------------------------------------------------------------
+
+/**
+ * Register a property type in NAME_MAP under both its short and its
+ * long name, each mapping to the same set factory.
+ */
+void UnicodePropertySet::addType(const UnicodeString& shortName,
+                                 const UnicodeString& longName,
+                                 SetFactory factory) {
+    // The map stores void* values, so the function pointer is
+    // passed through the union-based conversion helper.
+    void* token = setFactoryToVoidPtr(factory);
+    UErrorCode status = U_ZERO_ERROR; // not inspected afterwards
+    NAME_MAP->put(shortName, token, status);
+    NAME_MAP->put(longName, token, status);
+}
+
+/**
+ * Register a property value in the given map under its short name
+ * and, when one is supplied, under its long name as well.
+ */
+void UnicodePropertySet::addValue(Hashtable* map,
+                                  const UnicodeString& shortName,
+                                  const UnicodeString& longName,
+                                  int32_t value) {
+    // assert(value != 0);
+    UErrorCode status = U_ZERO_ERROR; // not inspected afterwards
+    map->puti(shortName, value, status);
+    if (longName.length() == 0) {
+        // An empty long name means there is no long alias
+        return;
+    }
+    map->puti(longName, value, status);
+}
+
+/**
+ * One-time setup: allocate the name maps and the category and
+ * script caches, then register every supported property type and
+ * every general category value name.  Calls made after NAME_MAP
+ * has been set return immediately.
+ *
+ * NOTE(review): there is no locking here, and NAME_MAP is assigned
+ * before the other globals are populated -- confirm that callers
+ * cannot reach this concurrently.
+ */
+void UnicodePropertySet::init() {
+    if (NAME_MAP != NULL) {
+        return;
+    }
+
+    // NOTE(review): TRUE presumably requests case-insensitive keys;
+    // confirm against hash.h.
+    NAME_MAP = new Hashtable(TRUE);
+    CATEGORY_MAP = new Hashtable(TRUE);
+    CATEGORY_CACHE = new UnicodeSet[U_CHAR_CATEGORY_COUNT];
+    SCRIPT_CACHE = new UnicodeSet[USCRIPT_CODE_LIMIT];
+
+    // NOTE:  All search keys are munged to have no whitespace (see
+    // munge()), so the keys stored here must contain no whitespace.
+    // They are written in upper case; munge() does not change case.
+
+    // Load the map with type data
+
+    addType("GC", "GENERALCATEGORY", createCategorySet);
+
+    //addType("CC", "COMBININGCLASS", COMBINING_CLASS);
+    //addType("BC", "BIDICLASS", BIDI_CLASS);
+    //addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE);
+
+    addType("NV", "NUMERICVALUE", createNumericValueSet);
+
+    //addType("NT", "NUMERICTYPE", NUMERIC_TYPE);
+    //addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH);
+    //addType("LB", "LINEBREAK", LINE_BREAK);
+    //addType("JT", "JOININGTYPE", JOINING_TYPE);
+
+    addType("SC", "SCRIPT", createScriptSet);
+
+    // Load the map with value data
+
+    // General Category
+    // Each value is a bit mask with bit (1 << category) set for
+    // every category the name covers; createCategorySet() decodes it.
+
+    addValue(CATEGORY_MAP, "ANY", "", ANY); // special case
+
+    addValue(CATEGORY_MAP, "C", "OTHER",
+             (1 << U_CONTROL_CHAR) |
+             (1 << U_FORMAT_CHAR) |
+             (1 << U_GENERAL_OTHER_TYPES) |
+             (1 << U_PRIVATE_USE_CHAR) |
+             (1 << U_SURROGATE));
+
+    addValue(CATEGORY_MAP, "CC", "CONTROL",
+             1 << U_CONTROL_CHAR);
+    addValue(CATEGORY_MAP, "CF", "FORMAT",
+             1 << U_FORMAT_CHAR);
+    addValue(CATEGORY_MAP, "CN", "UNASSIGNED",
+             1 << U_GENERAL_OTHER_TYPES);
+    addValue(CATEGORY_MAP, "CO", "PRIVATEUSE",
+             1 << U_PRIVATE_USE_CHAR);
+    addValue(CATEGORY_MAP, "CS", "SURROGATE",
+             1 << U_SURROGATE);
+
+    addValue(CATEGORY_MAP, "L", "LETTER",
+             (1 << U_LOWERCASE_LETTER) |
+             (1 << U_MODIFIER_LETTER) |
+             (1 << U_OTHER_LETTER) |
+             (1 << U_TITLECASE_LETTER) |
+             (1 << U_UPPERCASE_LETTER));
+
+    addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER",
+             1 << U_LOWERCASE_LETTER);
+    addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER",
+             1 << U_MODIFIER_LETTER);
+    addValue(CATEGORY_MAP, "LO", "OTHERLETTER",
+             1 << U_OTHER_LETTER);
+    addValue(CATEGORY_MAP, "LT", "TITLECASELETTER",
+             1 << U_TITLECASE_LETTER);
+    addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER",
+             1 << U_UPPERCASE_LETTER);
+
+    addValue(CATEGORY_MAP, "M", "MARK",
+             (1 << U_NON_SPACING_MARK) |
+             (1 << U_COMBINING_SPACING_MARK) |
+             (1 << U_ENCLOSING_MARK));
+
+    addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK",
+             1 << U_NON_SPACING_MARK);
+    addValue(CATEGORY_MAP, "MC", "SPACINGMARK",
+             1 << U_COMBINING_SPACING_MARK);
+    addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK",
+             1 << U_ENCLOSING_MARK);
+
+    addValue(CATEGORY_MAP, "N", "NUMBER",
+             (1 << U_DECIMAL_DIGIT_NUMBER) |
+             (1 << U_LETTER_NUMBER) |
+             (1 << U_OTHER_NUMBER));
+
+    addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER",
+             1 << U_DECIMAL_DIGIT_NUMBER);
+    addValue(CATEGORY_MAP, "NL", "LETTERNUMBER",
+             1 << U_LETTER_NUMBER);
+    addValue(CATEGORY_MAP, "NO", "OTHERNUMBER",
+             1 << U_OTHER_NUMBER);
+
+    addValue(CATEGORY_MAP, "P", "PUNCTUATION",
+             (1 << U_CONNECTOR_PUNCTUATION) |
+             (1 << U_DASH_PUNCTUATION) |
+             (1 << U_END_PUNCTUATION) |
+             (1 << U_FINAL_PUNCTUATION) |
+             (1 << U_INITIAL_PUNCTUATION) |
+             (1 << U_OTHER_PUNCTUATION) |
+             (1 << U_START_PUNCTUATION));
+
+    addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION",
+             1 << U_CONNECTOR_PUNCTUATION);
+    addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION",
+             1 << U_DASH_PUNCTUATION);
+    addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION",
+             1 << U_END_PUNCTUATION);
+    addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION",
+             1 << U_FINAL_PUNCTUATION);
+    addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION",
+             1 << U_INITIAL_PUNCTUATION);
+    addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION",
+             1 << U_OTHER_PUNCTUATION);
+    addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION",
+             1 << U_START_PUNCTUATION);
+
+    addValue(CATEGORY_MAP, "S", "SYMBOL",
+             (1 << U_CURRENCY_SYMBOL) |
+             (1 << U_MODIFIER_SYMBOL) |
+             (1 << U_MATH_SYMBOL) |
+             (1 << U_OTHER_SYMBOL));
+
+    addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL",
+             1 << U_CURRENCY_SYMBOL);
+    addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL",
+             1 << U_MODIFIER_SYMBOL);
+    addValue(CATEGORY_MAP, "SM", "MATHSYMBOL",
+             1 << U_MATH_SYMBOL);
+    addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL",
+             1 << U_OTHER_SYMBOL);
+
+    addValue(CATEGORY_MAP, "Z", "SEPARATOR",
+             (1 << U_LINE_SEPARATOR) |
+             (1 << U_PARAGRAPH_SEPARATOR) |
+             (1 << U_SPACE_SEPARATOR));
+
+    addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR",
+             1 << U_LINE_SEPARATOR);
+    addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR",
+             1 << U_PARAGRAPH_SEPARATOR);
+    addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR",
+             1 << U_SPACE_SEPARATOR);
+}
+
+/**
+ * Release everything init() allocated and reset the globals to
+ * NULL.  Does nothing if NAME_MAP is NULL.
+ */
+void UnicodePropertySet::cleanup() {
+    // NAME_MAP doubles as the "init() has run" flag
+    if (NAME_MAP == NULL) {
+        return;
+    }
+
+    delete NAME_MAP;
+    NAME_MAP = NULL;
+
+    delete CATEGORY_MAP;
+    CATEGORY_MAP = NULL;
+
+    delete[] CATEGORY_CACHE;
+    CATEGORY_CACHE = NULL;
+
+    delete[] SCRIPT_CACHE;
+    SCRIPT_CACHE = NULL;
+}
+
+U_NAMESPACE_END
+
+//eof
--- a/icu4c/source/i18n/upropset.h
+++ b/icu4c/source/i18n/upropset.h
@ -0,0 +1,240 @@
+/*
+**********************************************************************
+*   Copyright (c) 2001, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+* $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.h,v $
+* $Date: 2001/10/17 19:20:41 $
+* $Revision: 1.1 $
+**********************************************************************
+*/
+#ifndef _UPROPSET_H_
+#define _UPROPSET_H_
+
+#include "unicode/utypes.h"
+#include "unicode/uscript.h"
+
+U_NAMESPACE_BEGIN
+
+class UnicodeString;
+class UnicodeSet;
+class ParsePosition;
+class Hashtable;
+
+/**
+ * INTERNAL CLASS implementing the UnicodeSet properties as outlined
+ * at:
+ *
+ * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
+ *
+ * Recognized syntax:
+ *
+ * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
+ * \p{foo} \P{foo}  - white space not allowed within "\p" or "\P"
+ *
+ * Other than the above restrictions, white space is ignored.  Case
+ * is ignored except in "\p" and "\P".
+ *
+ * This class has only static members and is not meant to be
+ * instantiated.  It has a public static method, createFromPattern(),
+ * which takes a pattern to be parsed and returns a new UnicodeSet.
+ * Another public static method, resemblesPattern(), returns true if
+ * a given pattern string appears to be a property set pattern, and
+ * therefore should be passed in to createFromPattern().
+ *
+ * NOTE: Current implementation is incomplete.  The following list
+ * indicates which properties are supported.
+ *
+ *    + GeneralCategory
+ *      CombiningClass
+ *      BidiClass
+ *      DecompositionType
+ *    + NumericValue
+ *      NumericType
+ *      EastAsianWidth
+ *      LineBreak
+ *      JoiningType
+ *    + Script
+ *
+ * '+' indicates a supported property.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: upropset.h,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:20:41 $
+ */
+class UnicodePropertySet {
+
+ public:
+
+    //----------------------------------------------------------------
+    // Public API
+    //----------------------------------------------------------------
+
+    /**
+     * Return true if the given position, in the given pattern, appears
+     * to be the start of a property set pattern [:foo:], \p{foo}, or
+     * \P{foo}.
+     */
+    static UBool resemblesPattern(const UnicodeString& pattern, int32_t pos);
+
+    /**
+     * Create a UnicodeSet by parsing the given pattern at the given
+     * parse position.
+     *
+     * @param pattern the pattern string
+     * @param ppos on entry, the position at which to begin parsing.
+     * This should be one of the locations marked '^':
+     *
+     *   [:blah:]     \p{blah}     \P{blah}
+     *   ^       %    ^       %    ^       %
+     *
+     * On return, the position after the last character parsed, that is,
+     * the locations marked '%'.  If the parse fails, ppos is returned
+     * unchanged.
+     * @return a newly-constructed UnicodeSet object, or null upon
+     * failure.
+     */
+    static UnicodeSet* createFromPattern(const UnicodeString& pattern,
+                                         ParsePosition& ppos);
+
+ private:
+
+    //----------------------------------------------------------------
+    // Property set factory static methods
+    // NOTE: This will change/go away when we implement UCharacter
+    // based property retrieval.
+    //----------------------------------------------------------------
+
+    /**
+     * Signature shared by the factory methods below: given a
+     * pre-munged value name, return a newly-created set.
+     */
+    typedef UnicodeSet* (*SetFactory)(const UnicodeString& valueName);
+
+    /**
+     * Given a numeric value name, create a corresponding set and
+     * return it.
+     * NOTE(review): the implementation in upropset.cpp matches on
+     * u_charDigitValue() and returns an empty set, not null, for a
+     * name it cannot use.
+     * @param valueName a pre-munged numeric value name
+     */
+    static UnicodeSet* createNumericValueSet(const UnicodeString& valueName);
+
+    /**
+     * Given a general category value name, create a corresponding
+     * set and return it, or return null if the name is invalid.
+     * @param valueName a pre-munged general category value name
+     */
+    static UnicodeSet* createCategorySet(const UnicodeString& valueName);
+
+    /**
+     * Given a script value name, create a corresponding set and
+     * return it, or return null if the name is invalid.
+     * @param valueName a pre-munged script value name
+     */
+    static UnicodeSet* createScriptSet(const UnicodeString& valueName);
+
+    //----------------------------------------------------------------
+    // Utility methods
+    //----------------------------------------------------------------
+
+    /**
+     * Returns a UnicodeSet for the given category.  This set is
+     * cached and returned again if this method is called again with
+     * the same parameter.
+     *
+     * Callers MUST NOT MODIFY the returned set.
+     */
+    static const UnicodeSet& getCategorySet(int32_t cat);
+
+    /**
+     * Returns a UnicodeSet for the given script.  This set is
+     * cached and returned again if this method is called again with
+     * the same parameter.
+     *
+     * Callers MUST NOT MODIFY the returned set.
+     */
+    static const UnicodeSet& getScriptSet(UScriptCode script);
+
+    /**
+     * Given a string, munge it to lose the whitespace.  So "General
+     * Category " becomes "GeneralCategory".  We munge all type and
+     * value strings, and store all type and value keys pre-munged.
+     * Case is not modified; per the comment on the implementation in
+     * upropset.cpp, lookups use a case-insensitive compare instead.
+     */
+    static UnicodeString munge(const UnicodeString& str, int32_t start, int32_t limit);
+
+    /**
+     * Skip over a sequence of zero or more white space characters
+     * at pos.  Return the index of the first non-white-space character
+     * at or after pos, or str.length(), if there is none.
+     */
+    static int32_t skipWhitespace(const UnicodeString& str, int32_t pos);
+
+    //----------------------------------------------------------------
+    // Generic filter-based scanning code
+    //
+    // NOTE: In general, we don't want to do this!  This is a temporary
+    // implementation until we have time for something that examines
+    // the underlying UCharacter data structures in an intelligent
+    // way.  Iterating over all code points is dumb.  What we want to
+    // do, for instance, is iterate over internally-stored ranges
+    // of characters that have a given property.
+    //----------------------------------------------------------------
+
+    /**
+     * A filter that returns TRUE if the given code point should be
+     * included in the UnicodeSet being constructed.
+     */
+    typedef UBool (*Filter)(UChar32 codePoint, void* context);
+
+    /**
+     * Set the given UnicodeSet to contain all code points for which
+     * filter returns TRUE.  The context parameter is passed unchanged
+     * to the filter function.
+     */
+    static void initSetFromFilter(UnicodeSet& set, Filter filter,
+                                  void* context);
+
+    //----------------------------------------------------------------
+    // Type and value name maps
+    //----------------------------------------------------------------
+
+    /**
+     * Add a type mapping to the name map.
+     */
+    static void addType(const UnicodeString& shortName,
+                        const UnicodeString& longName,
+                        SetFactory factory);
+
+    /**
+     * Add a value mapping to the name map.
+     */
+    static void addValue(Hashtable* map,
+                         const UnicodeString& shortName,
+                         const UnicodeString& longName,
+                         int32_t value);
+
+    /**
+     * Create and populate the name maps and allocate the set caches
+     * on the first call; later calls return immediately.
+     */
+    static void init();
+
+ public:
+    /**
+     * Delete the maps and caches created by init() and reset the
+     * corresponding pointers to NULL.
+     */
+    static void cleanup();
+
+ private:
+    //----------------------------------------------------------------
+    // SetFactory <=> void*
+    // Standard C++ defines no implicit conversion between a pointer
+    // to function and void*, which is why the compiler rejects a
+    // plain cast-free assignment.  The hashtable stores void*
+    // values, so the two helpers below convert through a union.
+    // NOTE(review): this relies on both pointer kinds having the
+    // same size and representation -- confirm for every supported
+    // platform.
+    //----------------------------------------------------------------
+    
+    union SetFactoryTok {
+        void*       voidPointer;
+        SetFactory  functionPointer;
+    };
+
+    /** Convert a SetFactory to the void* stored in the name map. */
+    inline static void* setFactoryToVoidPtr(SetFactory f) {
+        SetFactoryTok tok;
+        tok.functionPointer = f;
+        return tok.voidPointer;
+    }
+
+    /** Convert a void* fetched from the name map back to a SetFactory. */
+    inline static SetFactory voidPtrToSetFactory(void* p) {
+        SetFactoryTok tok;
+        tok.voidPointer = p;
+        return tok.functionPointer;
+    }
+};
+
+U_NAMESPACE_END
+
+#endif
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@ -124,10 +124,11 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
        TESTCASE(42,TestUndefinedVariable);
        TESTCASE(43,TestEmptyContext);
        TESTCASE(44,TestCompoundFilterID);
-        TESTCASE(45,TestDevanagariLatinRT);
-        TESTCASE(46,TestTeluguLatinRT);
-        TESTCASE(47,TestCompoundLatinRT);
-        TESTCASE(48,TestSanskritLatinRT);
+        TESTCASE(45,TestPropertySet);
+        TESTCASE(46,TestDevanagariLatinRT);
+        TESTCASE(47,TestTeluguLatinRT);
+        TESTCASE(48,TestCompoundLatinRT);
+        TESTCASE(49,TestSanskritLatinRT);
        default: name = ""; break;
    }
 }
@ -2066,6 +2067,15 @@ void TransliteratorTest::TestCompoundFilterID(void) {
    }
 }

+/**
+ * Test new property set syntax in transliterator rules.
+ * NOTE(review): assumes expect() takes (rules, input, expected
+ * output) -- confirm against its declaration.
+ */
+void TransliteratorTest::TestPropertySet() {
+    // 'a' becomes 'A', upper case letters (\p{Lu}) become 'x', and
+    // every other character (\p{ANY}) becomes 'y'.
+    expect("a>A; \\p{Lu}>x; \\p{ANY}>y;", "abcDEF", "Ayyxxx");
+    // No property syntax here.  The expected text brackets each
+    // stretch between the \n and \r characters separately.
+    // NOTE(review): presumably this covers the zero-width-match
+    // guard added to Quantifier::matches() -- confirm.
+    expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
+           "[ a stitch ]\n[ in time ]\r[ saves 9]");
+}
+
 //======================================================================
 // Ram's tests
 //======================================================================
--- a/icu4c/source/test/intltest/transtst.h
+++ b/icu4c/source/test/intltest/transtst.h
@ -210,6 +210,11 @@ class TransliteratorTest : public IntlTest {
     */
    void TestCompoundFilterID(void);

+    /**
+     * Test new property set syntax
+     */
+    void TestPropertySet(void);
+
    /* Devanagari-Latin rules Test */
    void TestDevanagariLatinRT(void);

--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@ -41,7 +41,8 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
        CASE(4,TestMinimalRep);
        CASE(5,TestAPI);
        CASE(6,TestScriptSet);
-        CASE(7,TestExhaustive);
+        CASE(7,TestPropertySet);
+        CASE(8,TestExhaustive);
        default: name = ""; break;
    }
 }
@ -103,16 +104,18 @@ UnicodeSetTest::TestCategories(void) {
 void
 UnicodeSetTest::TestCloneEqualHash(void) {
    UErrorCode status = U_ZERO_ERROR;
-    int8_t category=Unicode::LOWERCASE_LETTER;
-    UnicodeSet *set1=new UnicodeSet(category, status); //  :Li: Letter, lowercase
+    //int8_t category=Unicode::LOWERCASE_LETTER;
+    //UnicodeSet *set1=new UnicodeSet(category, status); //  :Li: Letter, lowercase
+    UnicodeSet *set1=new UnicodeSet("[:Ll:]", status); //  Letter, lowercase
    if (U_FAILURE(status)){
-        errln((UnicodeString)"FAIL: Can't construst set with cateegory->Ll");
+        errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
        return;
    }
-    category=Unicode::DECIMAL_DIGIT_NUMBER;
-    UnicodeSet *set2=new UnicodeSet(category, status);   //Number, Decimal digit
+    //category=Unicode::DECIMAL_DIGIT_NUMBER;
+    //UnicodeSet *set2=new UnicodeSet(category, status);   //Number, Decimal digit
+    UnicodeSet *set2=new UnicodeSet("[:Nd:]", status);   //Number, Decimal digit
    if (U_FAILURE(status)){
-        errln((UnicodeString)"FAIL: Can't construct set with cateegory->Nd");
+        errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
        return;
    }

@ -407,6 +410,22 @@ void UnicodeSetTest::TestScriptSet() {
    expectContainment(set2, "[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
 }

+/**
+ * Test property-set patterns: [:Latin:], [\p{Greek}] and the \P{...} form with a spelled-out property name.
+ */
+void UnicodeSetTest::TestPropertySet() {
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeSet set("[:Latin:]", status);  // bracket-colon form
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    expectContainment(set, "aA", CharsToUnicodeString("\\u0391\\u03B1"));  // expects a, A in; U+0391, U+03B1 out
+    set.applyPattern("[\\p{Greek}]", status);  // backslash-p form nested inside brackets
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    expectContainment(set, CharsToUnicodeString("\\u0391\\u03B1"), "aA");  // reverse of the Latin expectation
+    set.applyPattern("\\P{ GENERAL Category = upper case letter }", status);  // capital P, mixed case and embedded spaces
+    if (U_FAILURE(status)) { errln("FAIL"); return; }
+    expectContainment(set, "abc", "ABC");  // expects lowercase in and uppercase out of this set
+}
+
 void UnicodeSetTest::TestExhaustive() {
    // exhaustive tests. Simulate UnicodeSets with integers.
    // That gives us very solid tests (except for large memory tests).
@ -569,6 +588,15 @@ UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
    return pairs;
 }

+void
+UnicodeSetTest::expectContainment(const UnicodeSet& set,
+                                  const UnicodeString& charsIn,
+                                  const UnicodeString& charsOut) {  // overload that takes no setName
+    UnicodeString pat;  // receives the set's own pattern, used below in the setName position
+    set.toPattern(pat);
+    expectContainment(set, pat, charsIn, charsOut);  // delegate to the four-argument overload
+}
+
 void
 UnicodeSetTest::expectContainment(const UnicodeSet& set,
                                  const UnicodeString& setName,
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@ -41,6 +41,11 @@ private:

    void TestScriptSet(void);

+    /**
+     * Test property-set patterns: [:Latin:], [\p{Greek}] and \P{...}.
+     */
+    void TestPropertySet(void);
+
    void TestExhaustive(void);

 private:
@ -79,6 +84,9 @@ private:
     */
    static UnicodeString getPairs(const UnicodeSet& set);

+    void expectContainment(const UnicodeSet& set,  // overload without setName; the set's pattern is used as its name
+                           const UnicodeString& charsIn,
+                           const UnicodeString& charsOut);
    void expectContainment(const UnicodeSet& set,
                           const UnicodeString& setName,
                           const UnicodeString& charsIn,