ICU-4521 UText-based regex implementation

X-SVN-Rev: 27482
2010-02-03 02:59:35 +00:00 · 2010-02-03 02:59:35 +00:00 · 8216117f21
commit 8216117f21
parent 7a93a3c3e2
21 changed files with 9262 additions and 855 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -54,6 +54,8 @@ icu4c/source/data/in/nfkc.nrm -text
 icu4c/source/data/in/nfkc_cf.nrm -text
 icu4c/source/data/in/unorm.icu -text
 icu4c/source/data/locales/pool.res -text
+icu4c/source/i18n/regextxt.cpp -text
+icu4c/source/i18n/regextxt.h -text
 icu4c/source/samples/ucnv/data02.bin -text
 icu4c/source/test/perf/README -text
 icu4c/source/test/testdata/TestFont1.otf -text
--- a/icu4c/source/common/unicode/utext.h
+++ b/icu4c/source/common/unicode/utext.h
@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2004-2009, International Business Machines
+*   Copyright (C) 2004-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@ -136,6 +136,7 @@


 #include "unicode/utypes.h"
+#include "unicode/uchar.h"
 #if U_SHOW_CPLUSPLUS_API
 #include "unicode/localpointer.h"
 #include "unicode/rep.h"
@ -674,6 +675,148 @@ utext_extract(UText *ut,
             UErrorCode *status);


+/**
+ * Compare two UTexts (binary order). The comparison begins at each source text's
+ * iteration position. The iteration position of each UText will be left following
+ * the last character compared.
+ *
+ * The comparison is done in code point order; unlike u_strCompare, you
+ * cannot choose to use code unit order. This is because the characters
+ * in a UText are accessed one code point at a time, and may not be from a UTF-16
+ * context.
+ *
+ * This function works with strings of different explicitly specified lengths,
+ * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
+ * A length argument of -1 signifies that as much of the string should be used as
+ * is necessary to compare with the other string. If both length arguments are -1,
+ * the entire remaining portions of both strings are used.
+ *
+ * @param s1 First source string.
+ * @param length1 Length of first source string in UTF-32 code points.
+ *
+ * @param s2 Second source string.
+ * @param length2 Length of second source string in UTF-32 code points.
+ *
+ * @return <0 or 0 or >0 as usual for string comparisons
+ *
+ * @internal ICU 4.4 technology preview
+ */
+U_INTERNAL int32_t U_EXPORT2
+utext_compare(UText *s1, int32_t length1,
+              UText *s2, int32_t length2);    
+
+/**
+ * Compare two UTexts (binary order). The comparison begins at each source text's
+ * iteration position. The iteration position of each UText will be left following
+ * the last character compared. This method differs from utext_compare in that
+ * it accepts native limits rather than lengths for each string.
+ *
+ * The comparison is done in code point order; unlike u_strCompare, you
+ * cannot choose to use code unit order. This is because the characters
+ * in a UText are accessed one code point at a time, and may not be from a UTF-16
+ * context.
+ *
+ * This function works with strings of different explicitly specified limits,
+ * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
+ * A limit argument of -1 signifies that as much of the string should be used as
+ * is necessary to compare with the other string. If both limit arguments are -1,
+ * the entire remaining portions of both strings are used.
+ *
+ * @param s1 First source string.
+ * @param limit1 Native index in the first source string at which comparison stops (exclusive), or -1 for no limit.
+ *
+ * @param s2 Second source string.
+ * @param limit2 Native index in the second source string at which comparison stops (exclusive), or -1 for no limit.
+ *
+ * @return <0 or 0 or >0 as usual for string comparisons
+ *
+ * @internal ICU 4.4 technology preview
+ */
+U_INTERNAL int32_t U_EXPORT2
+utext_compareNativeLimit(UText *s1, int64_t limit1,
+                         UText *s2, int64_t limit2);    
+
+/**
+ * Compare two UTexts case-insensitively using full case folding. The comparison
+ * begins at each source text's iteration position. The iteration position of each
+ * UText will be left following the last character compared.
+ *
+ * The comparison is done in code point order; this is because the characters
+ * in a UText are accessed one code point at a time, and may not be from a UTF-16
+ * context.
+ *
+ * This function works with strings of different explicitly specified lengths,
+ * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
+ * A length argument of -1 signifies that as much of the string should be used as
+ * is necessary to compare with the other string. If both length arguments are -1,
+ * the entire remaining portions of both strings are used.
+ *
+ * @param s1 First source string.
+ * @param length1 Length of first source string in UTF-32 code points.
+ *
+ * @param s2 Second source string.
+ * @param length2 Length of second source string in UTF-32 code points.
+ *
+ * @param options A bit set of options:
+ *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
+ *     Comparison in code point order with default case folding.
+ *
+ *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
+ *
+ * @param pErrorCode Must be a valid pointer to an error code value,
+ *                  which must not indicate a failure before the function call.
+ *
+ * @return <0 or 0 or >0 as usual for string comparisons
+ *
+ * @internal ICU 4.4 technology preview
+ */
+U_INTERNAL int32_t U_EXPORT2
+utext_caseCompare(UText *s1, int32_t length1,
+                  UText *s2, int32_t length2,
+                  uint32_t options, UErrorCode *pErrorCode);    
+
+/**
+ * Compare two UTexts case-insensitively using full case folding. The comparison
+ * begins at each source text's iteration position. The iteration position of each
+ * UText will be left following the last character compared. This method differs from
+ * utext_caseCompare in that it accepts native limits rather than lengths for each
+ * string.
+ *
+ * The comparison is done in code point order; this is because the characters
+ * in a UText are accessed one code point at a time, and may not be from a UTF-16
+ * context.
+ *
+ * This function works with strings of different explicitly specified limits,
+ * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
+ * A limit argument of -1 signifies that as much of the string should be used as
+ * is necessary to compare with the other string. If both limit arguments are -1,
+ * the entire remaining portions of both strings are used.
+ *
+ * @param s1 First source string.
+ * @param limit1 Native index in the first source string at which comparison stops (exclusive), or -1 for no limit.
+ *
+ * @param s2 Second source string.
+ * @param limit2 Native index in the second source string at which comparison stops (exclusive), or -1 for no limit.
+ *
+ * @param options A bit set of options:
+ *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
+ *     Comparison in code point order with default case folding.
+ *
+ *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
+ *
+ * @param pErrorCode Must be a valid pointer to an error code value,
+ *                  which must not indicate a failure before the function call.
+ *
+ * @return <0 or 0 or >0 as usual for string comparisons
+ *
+ * @internal ICU 4.4 technology preview
+ */
+U_INTERNAL int32_t U_EXPORT2
+utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
+                             UText *s2, int64_t limit2,
+                             uint32_t options, UErrorCode *pErrorCode);    
+
+
 /************************************************************************************
 *
 *  #define inline versions of selected performance-critical text access functions
@ -689,6 +832,19 @@ utext_extract(UText *ut,
 *
 ************************************************************************************/

+/**
+ * inline version of utext_current32(), for performance-critical situations.
+ *
+ * Get the code point at the current iteration position of the UText.
+ * Returns U_SENTINEL (-1) if the position is at the end of the text.
+ * Fast path: when chunkOffset is inside the chunk and the code unit there is below
+ * 0xd800, that unit is the result; otherwise utext_current32() is called. ut is evaluated more than once.
+ * @internal ICU 4.4 technology preview
+ */
+#define UTEXT_CURRENT32(ut)  \
+    ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \
+    ((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut))
+
 /**
 * inline version of utext_next32(), for performance-critical situations.
 *
@ -1291,8 +1447,8 @@ struct UTextFuncs {
      * (private)  Spare function pointer
      * @internal
      */
-
    UTextClose  *spare1;
+    
    /**
      * (private)  Spare function pointer
      * @internal
--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2005-2009, International Business Machines
+*   Copyright (C) 2005-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@ -23,6 +23,7 @@
 #include "cmemory.h"
 #include "cstring.h"
 #include "uassert.h"
+#include "putilimp.h"

 U_NAMESPACE_USE

@ -450,6 +451,361 @@ utext_equals(const UText *a, const UText *b) {
    return TRUE;
 }

+U_CAPI int32_t U_EXPORT2
+utext_compare(UText *s1, int32_t length1,
+              UText *s2, int32_t length2) {
+    UChar32 c1, c2;
+    /* Compares code points read with UTEXT_NEXT32 from each text; a negative length means that side is not length-limited. */
+    if(length1<0 && length2<0) {
+        /* strcmp style: neither side is limited, so read until a mismatch or until both reads return U_SENTINEL */
+        for(;;) {
+            c1 = UTEXT_NEXT32(s1);
+            c2 = UTEXT_NEXT32(s2);
+            if(c1 != c2) {
+                break;
+            } else if(c1 == U_SENTINEL) {
+                return 0;
+            }
+        }
+    } else {
+        if(length1 < 0) {
+            length1 = INT32_MIN;   /* marker for "unlimited"; never decremented below */
+        } else if (length2 < 0) {
+            length2 = INT32_MIN;   /* marker for "unlimited"; never decremented below */
+        }
+        
+        /* memcmp/UnicodeString style: at least one side has an explicit length, counted down once per code point read */
+        while((length1 > 0 || length1 == INT32_MIN) && (length2 > 0 || length2 == INT32_MIN)) {
+            c1 = UTEXT_NEXT32(s1);
+            c2 = UTEXT_NEXT32(s2);
+                       
+            if(c1 != c2) {
+                break;
+            } else if(c1 == U_SENTINEL) {
+                return 0;   /* c1 == c2 == U_SENTINEL */
+            }
+            
+            if (length1 != INT32_MIN) {
+                length1 -= 1;
+            }
+            if (length2 != INT32_MIN) {
+                length2 -= 1;
+            }
+        }
+        /* A counted length that has reached 0 means that side is used up. */
+        if(length1 <= 0 && length1 != INT32_MIN) {
+            if(length2 <= 0) {   /* also true for the INT32_MIN marker, so an unlimited s2 compares equal here */
+                return 0;
+            } else {
+                return -1;
+            }
+        } else if(length2 <= 0 && length2 != INT32_MIN) {
+            if (length1 <= 0) {   /* also true for the INT32_MIN marker, so an unlimited s1 compares equal here */
+                return 0;
+            } else {
+                return 1;
+            }
+        }
+    }
+    /* Reached only after a mismatch break: the result is the difference of the two values just read. */
+    return (int32_t)c1-(int32_t)c2;
+}
+
+U_CAPI int32_t U_EXPORT2
+utext_compareNativeLimit(UText *s1, int64_t limit1,
+                         UText *s2, int64_t limit2) {
+    UChar32 c1, c2;
+    /* Compares code points read with UTEXT_NEXT32 from each text; a negative limit means that side has no native-index limit. */
+    if(limit1<0 && limit2<0) {
+        /* strcmp style, go until end of string */
+        for(;;) {
+            c1 = UTEXT_NEXT32(s1);
+            c2 = UTEXT_NEXT32(s2);
+            if(c1 != c2) {
+                return (int32_t)c1-(int32_t)c2;
+            } else if(c1 == U_SENTINEL) {
+                return 0;
+            }
+        }
+    } else {
+        /* at least one native limit was given; an index is tracked only for a side with a limit and stays 0 otherwise */
+        int64_t index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
+        int64_t index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);
+        
+        while((limit1 < 0 || index1 < limit1) && (limit2 < 0 || index2 < limit2)) {
+            c1 = UTEXT_NEXT32(s1);
+            c2 = UTEXT_NEXT32(s2);
+            
+            if(c1 != c2) {
+                return (int32_t)c1-(int32_t)c2;
+            } else if(c1 == U_SENTINEL) {
+                return 0;   /* c1 == c2 == U_SENTINEL */
+            }
+            /* refresh the tracked native indexes after the reads */
+            if (limit1 >= 0) {
+                index1 = UTEXT_GETNATIVEINDEX(s1);
+            }
+            if (limit2 >= 0) {
+                index2 = UTEXT_GETNATIVEINDEX(s2);
+            }
+        }
+        /* The loop ended without a mismatch, so at least one limited side reached its limit. */
+        if(limit1 >= 0 && index1 >= limit1) {
+            if(index2 >= limit2) {   /* also true when limit2 is negative, because index2 is then 0 */
+                return 0;
+            } else {
+                return -1;
+            }
+        } else {
+            if(index1 >= limit1) {   /* true when limit1 is negative, because index1 is then 0 */
+                return 0;
+            } else {
+                return 1;
+            }
+        }
+    }
+}
+
+U_CAPI int32_t U_EXPORT2
+utext_caseCompare(UText *s1, int32_t length1,
+                     UText *s2, int32_t length2,
+                     uint32_t options, UErrorCode *pErrorCode) {
+    const UCaseProps *csp;
+    /* Case-insensitive comparison: every code point read from s1/s2 goes through ucase_toFullFolding() before it is compared. */
+    /* case folding variables */
+    const UChar *p;
+    int32_t length;
+    
+    /* case folding buffers, only use current-level start/limit */
+    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
+    int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;
+    
+    /* current code points */
+    UChar32 c1, c2;
+    uint8_t cLength1, cLength2;   /* NOTE(review): assigned below but never read in this function */
+    
+    /* argument checking: returns 0 without comparing if the error pointer is null or already holds a failure */
+    if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    if(s1==NULL || s2==NULL) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }    
+    
+    csp=ucase_getSingleton(pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    
+    /* for variable-length strings: INT32_MIN marks a side with no length limit; it is never decremented in the loop */
+    if(length1 < 0) {
+        length1 = INT32_MIN;
+    }
+    if (length2 < 0) {
+        length2 = INT32_MIN;
+    }
+    
+    /* initialize: both fold buffers start out empty */
+    foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;
+    
+    /* comparison loop: each side first drains units still pending in its fold buffer, then reads the next code point */
+    while((foldOffset1 < foldLength1 || length1 > 0 || length1 == INT32_MIN) &&
+          (foldOffset2 < foldLength2 || length2 > 0 || length2 == INT32_MIN)) {
+        if(foldOffset1 < foldLength1) {
+            U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
+            cLength1 = 0;
+        } else {
+            c1 = UTEXT_NEXT32(s1);
+            if (c1 != U_SENTINEL) {
+                cLength1 = U16_LENGTH(c1);
+                /* result handling: negative keeps c1 as read; 0..UCASE_MAX_STRING_LENGTH is a string length at p; larger is used as the code point */
+                length = ucase_toFullFolding(csp, c1, &p, options);
+                if(length >= 0) {
+                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
+                        u_memcpy(fold1, p, length);
+                        foldOffset1 = 0;
+                        foldLength1 = length;
+                        U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
+                    } else {
+                        c1 = length;
+                    }
+                }
+            }
+            /* one source code point consumed from s1 (counted even when the read returned U_SENTINEL) */
+            if(length1 != INT32_MIN) {
+                length1 -= 1;
+            }
+        }
+        /* same steps for s2 */
+        if(foldOffset2 < foldLength2) {
+            U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
+            cLength2 = 0;
+        } else {
+            c2 = UTEXT_NEXT32(s2);
+            if (c2 != U_SENTINEL) {
+                cLength2 = U16_LENGTH(c2);
+                
+                length = ucase_toFullFolding(csp, c2, &p, options);
+                if(length >= 0) {
+                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
+                        u_memcpy(fold2, p, length);
+                        foldOffset2 = 0;
+                        foldLength2 = length;
+                        U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
+                    } else {
+                        c2 = length;
+                    }
+                }
+            } else if(c1 == U_SENTINEL) {
+                return 0; // end of both strings at once
+            }
+            
+            if(length2 != INT32_MIN) {
+                length2 -= 1;
+            }
+        }
+        
+        if(c1 != c2) {
+            return (int32_t)c1-(int32_t)c2;
+        }
+    }
+    
+    /* By now at least one of the strings is out of characters */
+    length1 += foldLength1 - foldOffset1;   /* count folded units still pending as remaining input */
+    length2 += foldLength2 - foldOffset2;
+    
+    if(length1 <= 0 && length1 != INT32_MIN) {
+        if(length2 <= 0) {
+            return 0;
+        } else {
+            return -1;
+        }
+    } else {
+        if (length1 <= 0) {
+            return 0;
+        } else {
+            return 1;
+        }
+    }
+}
+
+U_CAPI int32_t U_EXPORT2
+utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
+                                UText *s2, int64_t limit2,
+                                uint32_t options, UErrorCode *pErrorCode) {
+    const UCaseProps *csp;
+    /* Case-insensitive comparison bounded by native-index limits; a negative limit means that side has no limit. */
+    /* case folding variables */
+    const UChar *p;
+    int32_t length;
+    
+    /* case folding buffers, only use current-level start/limit */
+    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
+    int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;
+    
+    /* current code points */
+    UChar32 c1, c2;
+    
+    /* native indexes into s1 and s2 */
+    int64_t index1, index2;
+    
+    /* argument checking: returns 0 without comparing if the error pointer is null or already holds a failure */
+    if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    if(s1==NULL || s2==NULL) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }    
+    
+    csp=ucase_getSingleton(pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    
+    /* initialize: an index is tracked only for a side that has a limit; otherwise it starts at 0 and is not refreshed */
+    index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
+    index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);
+
+    foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;
+    
+    /* comparison loop: each side first drains units still pending in its fold buffer, then reads the next code point */
+    while((foldOffset1 < foldLength1 || limit1 < 0 || index1 < limit1) &&
+          (foldOffset2 < foldLength2 || limit2 < 0 || index2 < limit2)) {
+        if(foldOffset1 < foldLength1) {
+            U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
+        } else {
+            c1 = UTEXT_NEXT32(s1);
+            if (c1 != U_SENTINEL) {
+                length = ucase_toFullFolding(csp, c1, &p, options);   /* negative: c1 kept as read */
+                if(length >= 0) {
+                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
+                        u_memcpy(fold1, p, length);
+                        foldOffset1 = 0;
+                        foldLength1 = length;
+                        U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
+                    } else {
+                        c1 = length;   /* a result above UCASE_MAX_STRING_LENGTH is used directly as the code point */
+                    }
+                }
+            }
+            
+            if (limit1 >= 0) {
+                index1 = UTEXT_GETNATIVEINDEX(s1);
+            }
+        }
+        /* same steps for s2 */
+        if(foldOffset2 < foldLength2) {
+            U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
+        } else {
+            c2 = UTEXT_NEXT32(s2);
+            if (c2 != U_SENTINEL) {
+                length = ucase_toFullFolding(csp, c2, &p, options);
+                if(length >= 0) {
+                    if(length <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle 0-length folded-case strings
+                        u_memcpy(fold2, p, length);
+                        foldOffset2 = 0;
+                        foldLength2 = length;
+                        U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
+                    } else {
+                        c2 = length;
+                    }
+                }
+            } else if(c1 == U_SENTINEL) {
+                return 0;   /* both reads returned U_SENTINEL */
+            }
+            
+            if (limit2 >= 0) {
+                index2 = UTEXT_GETNATIVEINDEX(s2);
+            }
+        }
+        
+        if(c1 != c2) {
+            return (int32_t)c1-(int32_t)c2;
+        }
+    }
+    /* NOTE(review): with a negative limit the index starts at 0, so the subtraction below can drop it under the limit and alter the tests that follow - confirm intended. */
+    /* By now at least one of the strings is out of characters */
+    index1 -= foldLength1 - foldOffset1;   /* step back by folded units still pending, so they count as remaining input */
+    index2 -= foldLength2 - foldOffset2;
+    
+    if(limit1 >= 0 && index1 >= limit1) {
+        if(index2 >= limit2) {
+            return 0;
+        } else {
+            return -1;
+        }
+    } else {
+        if(index1 >= limit1) {
+            return 0;
+        } else {
+            return 1;
+        }
+    }
+}
+
+
 U_CAPI UBool U_EXPORT2
 utext_isWritable(const UText *ut)
 {
@ -800,7 +1156,7 @@ shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
    adjustPointer(dest, &dest->p, src);
    adjustPointer(dest, &dest->q, src);
    adjustPointer(dest, &dest->r, src);
-	adjustPointer(dest, (const void **)&dest->chunkContents, src);
+    adjustPointer(dest, (const void **)&dest->chunkContents, src);

    return dest;
 }
@ -932,7 +1288,7 @@ utf8TextAccess(UText *ut, int64_t index, UBool forward) {
    if (ix>length) {
        if (length>=0) {
            ix=length;
-        } else if (ix>ut->c) {
+        } else if (ix>=ut->c) {
            // Zero terminated string, and requested index is beyond
            //   the region that has already been scanned.
            //   Scan up to either the end of the string or to the
@ -1415,7 +1771,7 @@ utext_strFromUTF8(UChar *dest,
            if(ch<0){
                ch = 0xfffd;
            }
-            if(ch<=0xFFFF){
+            if(U_IS_BMP(ch)){
                *(pDest++)=(UChar)ch;
            }else{
                *(pDest++)=UTF16_LEAD(ch);
@ -1438,7 +1794,7 @@ utext_strFromUTF8(UChar *dest,
            if(ch<0){
                ch = 0xfffd;
            }
-            reqLength+=UTF_CHAR_LENGTH(ch);
+            reqLength+=U16_LENGTH(ch);
        }
    }

@ -1485,7 +1841,7 @@ utf8TextExtract(UText *ut,
    int i;
    if (start32 < ut->chunkNativeLimit) {
        for (i=0; i<3; i++) {
-            if (U8_IS_LEAD(buf[start32]) || start32==0) {
+            if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
                break;
            }
            start32--;
@ -1494,7 +1850,7 @@ utf8TextExtract(UText *ut,

    if (limit32 < ut->chunkNativeLimit) {
        for (i=0; i<3; i++) {
-            if (U8_IS_LEAD(buf[limit32]) || limit32==0) {
+            if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
                break;
            }
            limit32--;
@ -1506,6 +1862,7 @@ utf8TextExtract(UText *ut,
    utext_strFromUTF8(dest, destCapacity, &destLength,
                    (const char *)ut->context+start32, limit32-start32,
                    pErrorCode);
+    utf8TextAccess(ut, limit32, TRUE);
    return destLength;
 }

@ -1870,6 +2227,8 @@ repTextExtract(UText *ut,
    }
    UnicodeString buffer(dest, 0, destCapacity); // writable alias
    rep->extractBetween(start32, limit32, buffer);
+    repTextAccess(ut, limit32, TRUE);
+    
    return u_terminateUChars(dest, destCapacity, length, status);
 }

@ -2138,6 +2497,9 @@ unistrTextExtract(UText *t,
            trimmedLength=destCapacity;
        }
        us->extract(start32, trimmedLength, dest);
+        t->chunkOffset = start32+trimmedLength;
+    } else {
+        t->chunkOffset = start32;
    }
    u_terminateUChars(dest, destCapacity, length, pErrorCode);
    return length;
@ -2528,7 +2890,7 @@ ucstrTextExtract(UText *ut,
            if (strLength>=0) {
                // We have filled the destination buffer, and the string length is known.
                //  Cut the loop short.  There is no need to scan string termination.
-                di = strLength;
+                di = limit32 - start32;
                si = limit32;
                break;
            }
@ -2548,7 +2910,7 @@ ucstrTextExtract(UText *ut,
    }

    // Put iteration position at the point just following the extracted text
-    ut->chunkOffset = si;
+    ut->chunkOffset = uprv_min(strLength, start32 + destCapacity);

    // Add a terminating NUL if space in the buffer permits,
    // and set the error status as required.
@ -2754,21 +3116,26 @@ charIterTextExtract(UText *ut,
    int32_t  limit32 = pinIndex(limit, length);
    int32_t  desti   = 0;
    int32_t  srci;
+    int32_t  copyLimit;

    CharacterIterator *ci = (CharacterIterator *)ut->context;
    ci->setIndex32(start32);   // Moves ix to lead of surrogate pair, if needed.
    srci = ci->getIndex();
+    copyLimit = srci;
    while (srci<limit32) {
        UChar32 c = ci->next32PostInc();
        int32_t  len = U16_LENGTH(c);
        if (desti+len <= destCapacity) {
            U16_APPEND_UNSAFE(dest, desti, c);
+            copyLimit = srci+len;
        } else {
            desti += len;
            *status = U_BUFFER_OVERFLOW_ERROR;
        }
        srci += len;
    }
+    
+    charIterTextAccess(ut, copyLimit, TRUE);

    u_terminateUChars(dest, destCapacity, desti, status);
    return desti;
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@ -76,7 +76,7 @@ translit.o utrans.o esctrn.o unesctrn.o funcrepl.o strrepl.o tridpars.o \
 cpdtrans.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \
 nultrans.o remtrans.o casetrn.o titletrn.o tolowtrn.o toupptrn.o anytrans.o \
 name2uni.o uni2name.o nortrans.o quant.o transreg.o brktrans.o \
-regexcmp.o rematch.o repattrn.o regexst.o udatpg.o uregex.o uregexc.o \
+regexcmp.o rematch.o repattrn.o regexst.o regextxt.o udatpg.o uregex.o uregexc.o \
 ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
 csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
 wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \
--- a/icu4c/source/i18n/i18n.vcproj
+++ b/icu4c/source/i18n/i18n.vcproj
@ -3602,6 +3602,14 @@
 				RelativePath=".\regexst.h"
 				>
 			</File>
+			<File
+				RelativePath=".\regextxt.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\regextxt.h"
+				>
+			</File>
 			<File
 				RelativePath=".\rematch.cpp"
 				>
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -1,7 +1,7 @@
 //
 //  file:  regexcmp.cpp
 //
-//  Copyright (C) 2002-2009 International Business Machines Corporation and others.
+//  Copyright (C) 2002-2010 International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains the ICU regular expression compiler, which is responsible
@ -13,6 +13,7 @@

 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

+#include "unicode/ustring.h"
 #include "unicode/unistr.h"
 #include "unicode/uniset.h"
 #include "unicode/uchar.h"
@ -21,6 +22,7 @@
 #include "unicode/parseerr.h"
 #include "unicode/regex.h"
 #include "util.h"
+#include "putilimp.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "uvectr32.h"
@ -33,6 +35,7 @@
                        //   generated by a Perl script.
 #include "regexcmp.h"
 #include "regexst.h"
+#include "regextxt.h"



@ -47,11 +50,13 @@ U_NAMESPACE_BEGIN
 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
   fParenStack(status), fSetStack(status), fSetOpStack(status)
 {
+    // Lazy init of all shared global sets (needed for init()'s empty text)
+    RegexStaticSets::initGlobals(&status);
+
    fStatus           = &status;

    fRXPat            = rxp;
    fScanIndex        = 0;
-    fNextIndex        = 0;
    fPeekChar         = -1;
    fLineNum          = 1;
    fCharNum          = 0;
@ -97,6 +102,24 @@ void    RegexCompile::compile(
                         const UnicodeString &pat,   // Source pat to be compiled.
                         UParseError &pp,            // Error position info
                         UErrorCode &e)              // Error Code
+{
+    UText patternText = UTEXT_INITIALIZER;
+    utext_openConstUnicodeString(&patternText, &pat, &e);
+    
+    if (U_SUCCESS(e)) {
+        compile(&patternText, pp, e);
+        utext_close(&patternText);
+    }
+}
+
+//
+//   compile, UText mode
+//     All the work is actually done here.
+//
+void    RegexCompile::compile(
+                         UText *pat,                 // Source pat to be compiled.
+                         UParseError &pp,            // Error position info
+                         UErrorCode &e)              // Error Code
 {
    fStatus             = &e;
    fParseErr           = &pp;
@ -108,16 +131,16 @@ void    RegexCompile::compile(
    }

    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
-    U_ASSERT(fRXPat->fPattern.length() == 0);
+    U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);

    // Prepare the RegexPattern object to receive the compiled pattern.
-    fRXPat->fPattern        = pat;
+    fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
    fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
    fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;


    // Initialize the pattern scanning state machine
-    fPatternLength = pat.length();
+    fPatternLength = utext_nativeLength(pat);
    uint16_t                state = 1;
    const RegexTableEl      *tableEl;
    nextChar(fC);                        // Fetch the first char from the pattern string.
@ -250,34 +273,13 @@ void    RegexCompile::compile(
    // The pattern has now been read and processed, and the compiled code generated.
    //

-    // Back-reference fixup
-    //
-    int32_t loc;
-    for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) {
-        int32_t op = fRXPat->fCompiledPat->elementAti(loc);
-        int32_t opType = URX_TYPE(op);
-        if (opType == URX_BACKREF || opType == URX_BACKREF_I) {
-            int32_t where = URX_VAL(op);
-            if (where > fRXPat->fGroupMap->size()) {
-                error(U_REGEX_INVALID_BACK_REF);
-                break;
-            }
-            where = fRXPat->fGroupMap->elementAti(where-1);
-            op    = URX_BUILD(opType, where);
-            fRXPat->fCompiledPat->setElementAt(op, loc);
-        }
-    }
-
-
    //
    // Compute the number of digits requried for the largest capture group number.
    //
    fRXPat->fMaxCaptureDigits = 1;
    int32_t  n = 10;
-    for (;;) {
-        if (n > fRXPat->fGroupMap->size()) {
-            break;
-        }
+    int32_t  groupCount = fRXPat->fGroupMap->size();
+    while (n <= groupCount) {
        fRXPat->fMaxCaptureDigits++;
        n *= 10;
    }
@ -286,10 +288,15 @@ void    RegexCompile::compile(
    // The pattern's fFrameSize so far has accumulated the requirements for
    //   storage for capture parentheses, counters, etc. that are encountered
    //   in the pattern.  Add space for the two variables that are always
-    //   present in the saved state:  the input string position and the
-    //   position in the compiled pattern.
+    //   present in the saved state:  the input string position (int64_t) and
+    //   the position in the compiled pattern.
    //
-    fRXPat->fFrameSize+=2;
+    fRXPat->fFrameSize+=3;
+
+    //
+    // Optimization pass 1: NOPs, back-references, and case-folding
+    //
+    stripNOPs();

    //
    // Get bounds for the minimum and maximum length of a string that this
@ -299,10 +306,9 @@ void    RegexCompile::compile(
    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);

    //
-    // Optimization passes
+    // Optimization pass 2: match start type
    //
    matchStartType();
-    stripNOPs();

    //
    // Set up fast latin-1 range sets
@ -426,19 +432,19 @@ UBool RegexCompile::doParseActions(int32_t action)
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        //
-        //    Each capture group gets three slots in the save stack frame:
-        //         0:   Capture Group start position (in input string being matched.)
-        //         1:   Capture Group end   positino.
-        //         2:   Start of Match-in-progress.
+        //    Each capture group gets three double-width slots in the save stack frame:
+        //         0-1: Capture Group start position (in input string being matched.)
+        //         2-3: Capture Group end position.
+        //         4-5: Start of Match-in-progress.
        //    The first two locations are for a completed capture group, and are
        //     referred to by back references and the like.
        //    The third location stores the capture start position when an START_CAPTURE is
        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
-        //      END_CAPure is encountered.
+        //      END_CAPTURE is encountered.
        {
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
-            int32_t  varsLoc    = fRXPat->fFrameSize;    // Reserve three slots in match stack frame.
-            fRXPat->fFrameSize += 3;
+            int32_t  varsLoc    = fRXPat->fFrameSize;    // Reserve six slots in match stack frame.
+            fRXPat->fFrameSize += 6;
            int32_t  cop        = URX_BUILD(URX_START_CAPTURE, varsLoc);
            fRXPat->fCompiledPat->addElement(cop, *fStatus);
            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
@ -532,10 +538,10 @@ UBool RegexCompile::doParseActions(int32_t action)
        //    8.     code for parenthesized stuff.
        //    9.   LA_END
        //
-        //  Two data slots are reserved, for saving the stack ptr and the input position.
+        //  Three data slots are reserved, for saving the stack ptr and the (double-width) input position.
        {
            int32_t dataLoc = fRXPat->fDataSize;
-            fRXPat->fDataSize += 2;
+            fRXPat->fDataSize += 3;
            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

@ -576,9 +582,10 @@ UBool RegexCompile::doParseActions(int32_t action)
        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
        //    7.    END_LA                // Restore match region, in case look-ahead was using
        //                                        an alternate (transparent) region.
+        //  Three data slots are reserved, for saving the stack ptr and the (double-width) input position.
        {
            int32_t dataLoc = fRXPat->fDataSize;
-            fRXPat->fDataSize += 2;
+            fRXPat->fDataSize += 3;
            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
            fRXPat->fCompiledPat->addElement(op, *fStatus);

@ -617,12 +624,12 @@ UBool RegexCompile::doParseActions(int32_t action)
            //          Allocate a block of matcher data, to contain (when running a match)
            //              0:    Stack ptr on entry
            //              1:    Input Index on entry
-            //              2:    Start index of match current match attempt.
-            //              3:    Original Input String len.
+            //              2-3:  Start index of match current match attempt.
+            //              4-5:  Original Input String len.

            // Allocate data space
            int32_t dataLoc = fRXPat->fDataSize;
-            fRXPat->fDataSize += 4;
+            fRXPat->fDataSize += 6;

            // Emit URX_LB_START
            int32_t op = URX_BUILD(URX_LB_START, dataLoc);
@ -670,12 +677,12 @@ UBool RegexCompile::doParseActions(int32_t action)
            //          Allocate a block of matcher data, to contain (when running a match)
            //              0:    Stack ptr on entry
            //              1:    Input Index on entry
-            //              2:    Start index of match current match attempt.
-            //              3:    Original Input String len.
+            //              2-3:    Start index of match current match attempt.
+            //              4-5:    Original Input String len.

            // Allocate data space
            int32_t dataLoc = fRXPat->fDataSize;
-            fRXPat->fDataSize += 4;
+            fRXPat->fDataSize += 6;

            // Emit URX_LB_START
            int32_t op = URX_BUILD(URX_LB_START, dataLoc);
@ -764,7 +771,7 @@ UBool RegexCompile::doParseActions(int32_t action)
                    int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
                    fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
                    frameLoc = fRXPat->fFrameSize;
-                    fRXPat->fFrameSize++;
+                    fRXPat->fFrameSize += 2; // double-width index
                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
                    break;
@ -784,7 +791,7 @@ UBool RegexCompile::doParseActions(int32_t action)
                    }
                    fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
                    frameLoc = fRXPat->fFrameSize;
-                    fRXPat->fFrameSize++;
+                    fRXPat->fFrameSize += 2; // double-width index
                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
                    break;
@ -801,7 +808,7 @@ UBool RegexCompile::doParseActions(int32_t action)
                // Emit the code sequence that can handle it.
                insertOp(topLoc);
                frameLoc =  fRXPat->fFrameSize;
-                fRXPat->fFrameSize++;
+                fRXPat->fFrameSize += 2; // double-width index

                int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
                fRXPat->fCompiledPat->setElementAt(op, topLoc);
@ -907,7 +914,7 @@ UBool RegexCompile::doParseActions(int32_t action)
                    int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
                    dataLoc = fRXPat->fFrameSize;
-                    fRXPat->fFrameSize++;
+                    fRXPat->fFrameSize += 2; // double-width index
                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
                    break;
@ -927,7 +934,7 @@ UBool RegexCompile::doParseActions(int32_t action)
                    }
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
                    dataLoc = fRXPat->fFrameSize;
-                    fRXPat->fFrameSize++;
+                    fRXPat->fFrameSize += 2; // double-width index
                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
                    break;
@ -945,7 +952,7 @@ UBool RegexCompile::doParseActions(int32_t action)
            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
                insertOp(saveStateLoc);
                dataLoc =  fRXPat->fFrameSize;
-                fRXPat->fFrameSize++;
+                fRXPat->fFrameSize += 2; // double-width index

                int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
@ -1658,7 +1665,7 @@ UBool RegexCompile::doParseActions(int32_t action)
        }


-        case  doSetNegate:
+    case  doSetNegate:
        // Scanned a '^' at the start of a set.
        // Push the negation operator onto the set op stack.
        // A twist for case-insensitive matching:
@ -1770,17 +1777,12 @@ void RegexCompile::literalChar(UChar32 c)  {
        // First char of a string in the pattern.
        // Emit a OneChar op into the compiled pattern.
        emitONE_CHAR(c);
-
-        // Also add it to the string pool, in case we get a second adjacent literal
-        //   and want to change form ONE_CHAR to STRING
+        
+        // Mark that we might actually be starting a string here
        fStringOpStart = fRXPat->fLiteralText.length();
-        fRXPat->fLiteralText.append(c);
        return;
    }

-    // We are adding onto an existing string
-    fRXPat->fLiteralText.append(c);
-
    op     = fRXPat->fCompiledPat->lastElementi();
    opType = URX_TYPE(op);
    U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
@ -1797,10 +1799,12 @@ void RegexCompile::literalChar(UChar32 c)  {
            fRXPat->fCompiledPat->setElementAt(op, patternLoc);
            return;
        }
-
+        
        // The most recently emitted op is a ONECHAR.
        //  We've now received another adjacent char.  Change the ONECHAR op
        //   to a string op.
+        fRXPat->fLiteralText.append(URX_VAL(op));
+
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
            op     = URX_BUILD(URX_STRING_I, fStringOpStart);
        } else {
@ -1811,7 +1815,10 @@ void RegexCompile::literalChar(UChar32 c)  {
        op         = URX_BUILD(URX_STRING_LEN, 0);
        fRXPat->fCompiledPat->addElement(op, *fStatus);
    }
-
+    
+    // We are adding onto an existing string
+    fRXPat->fLiteralText.append(c);
+    
    // The pattern contains a URX_SRING / URX_STRING_LEN.  Update the
    //  string length to reflect the new char we just added to the string.
    stringLen  = fRXPat->fLiteralText.length() - fStringOpStart;
@ -1834,7 +1841,7 @@ void RegexCompile::emitONE_CHAR(UChar32  c) {
    if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
        u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
        // We have a cased character, and are in case insensitive matching mode.
-        c  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+        //c  = u_foldCase(c, U_FOLD_CASE_DEFAULT);  // !!!: handled in stripNOPs() now
        op = URX_BUILD(URX_ONECHAR_I, c);
    } else {
        // Uncased char, or case sensitive match mode.
@ -2245,7 +2252,6 @@ void        RegexCompile::compileSet(UnicodeSet *theSet)
    //      ignored strings, that would be better.)
    theSet->removeAllStrings();
    int32_t  setSize = theSet->size();
-    UChar32  firstSetChar = theSet->charAt(0);

    switch (setSize) {
    case 0:
@ -2261,7 +2267,7 @@ void        RegexCompile::compileSet(UnicodeSet *theSet)
            // The set contains only a single code point.  Put it into
            //   the compiled pattern as a single char operation rather
            //   than a set, and discard the set itself.
-            literalChar(firstSetChar);
+            literalChar(theSet->charAt(0));
            delete theSet;
        }
        break;
@ -2472,7 +2478,7 @@ void   RegexCompile::matchStartType() {
        case URX_STO_INP_LOC:
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
        case URX_BACKREF_I:
-
+                
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
        case URX_LD_SP:
            break;
@ -3378,6 +3384,14 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
 //                code generation to provide locations that may be patched later.
 //                Many end up unneeded, and are removed by this function.
 //
+//                In order to minimize the number of passes through the pattern,
+//                back-reference fixup is also performed here (adjusting
+//                back-reference operands to point to the correct frame offsets).
+//
+//                In addition, case-insensitive character and string literals are
+//                now case-folded here, rather than when first parsed or at match
+//                time.
+//
 //------------------------------------------------------------------------------
 void RegexCompile::stripNOPs() {

@ -3399,6 +3413,9 @@ void RegexCompile::stripNOPs() {
            d++;
        }
    }
+    
+    UnicodeString caseStringBuffer;
+    int32_t stringDelta = 0;

    // Make a second pass over the code, removing the NOPs by moving following
    //  code up, and patching operands that refer to code locations that
@ -3432,12 +3449,69 @@ void RegexCompile::stripNOPs() {
                break;
            }

+        case URX_ONECHAR_I:
+            {
+                UChar32 c = URX_VAL(op);
+                if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
+                    // We have a cased character to fold
+                    c  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
+                    op = URX_BUILD(URX_ONECHAR_I, c);
+                }
+                
+                fRXPat->fCompiledPat->setElementAt(op, dst);
+                dst++;
+                break;
+            }
+        case URX_STRING_I:
+            {
+                op = URX_BUILD(URX_STRING_I, URX_VAL(op)+stringDelta);
+                
+                src++;
+                int32_t lengthOp = fRXPat->fCompiledPat->elementAti(src);
+                
+                caseStringBuffer.setTo(fRXPat->fLiteralText, URX_VAL(op), URX_VAL(lengthOp));
+                caseStringBuffer.foldCase(U_FOLD_CASE_DEFAULT);
+                
+                int32_t newLen = caseStringBuffer.length();
+                if (newLen <= URX_VAL(lengthOp)) {
+                    // don't shift if we don't have to, take the tiny memory hit of a smaller string
+                    fRXPat->fLiteralText.replace(URX_VAL(op), newLen, caseStringBuffer);
+                } else {
+                    // shift other strings over...at least UnicodeString handles this for us!
+                    fRXPat->fLiteralText.replace(URX_VAL(op), URX_VAL(lengthOp), caseStringBuffer);
+                    stringDelta += newLen - URX_VAL(lengthOp);
+                }
+                lengthOp = URX_BUILD(URX_STRING_LEN, newLen);
+                
+                fRXPat->fCompiledPat->setElementAt(op, dst);
+                fRXPat->fCompiledPat->setElementAt(lengthOp, dst+1);
+                dst += 2;
+                break;
+            }
+        case URX_BACKREF:
+        case URX_BACKREF_I:
+            {
+                int32_t where = URX_VAL(op);
+                if (where > fRXPat->fGroupMap->size()) {
+                    error(U_REGEX_INVALID_BACK_REF);
+                    break;
+                }
+                where = fRXPat->fGroupMap->elementAti(where-1);
+                op    = URX_BUILD(opType, where);
+                fRXPat->fCompiledPat->setElementAt(op, dst);
+                dst++;
+                
+                fRXPat->fNeedsAltInput = TRUE;
+                break;
+            }
+        case URX_STRING:
+            op = URX_BUILD(URX_STRING, URX_VAL(op)+stringDelta);
+            // fall through
        case URX_RESERVED_OP:
        case URX_RESERVED_OP_N:
        case URX_BACKTRACK:
        case URX_END:
        case URX_ONECHAR:
-        case URX_STRING:
        case URX_STRING_LEN:
        case URX_START_CAPTURE:
        case URX_END_CAPTURE:
@ -3460,13 +3534,9 @@ void RegexCompile::stripNOPs() {
        case URX_DOTANY_UNIX:
        case URX_STO_SP:
        case URX_LD_SP:
-        case URX_BACKREF:
        case URX_STO_INP_LOC:
        case URX_LA_START:
        case URX_LA_END:
-        case URX_ONECHAR_I:
-        case URX_STRING_I:
-        case URX_BACKREF_I:
        case URX_DOLLAR_M:
        case URX_CARET_M:
        case URX_CARET_M_UNIX:
@ -3509,15 +3579,15 @@ void RegexCompile::error(UErrorCode e) {
        *fStatus = e;
        fParseErr->line   = fLineNum;
        fParseErr->offset = fCharNum;
+        
+        UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context

        // Fill in the context.
        //   Note: extractBetween() pins supplied indicies to the string bounds.
        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
-        fRXPat->fPattern.extractBetween(fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex,
-            fParseErr->preContext,  0);
-        fRXPat->fPattern.extractBetween(fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1,
-            fParseErr->postContext, 0);
+        utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
+        utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
    }
 }

@ -3557,18 +3627,18 @@ static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
 //------------------------------------------------------------------------------
 UChar32  RegexCompile::nextCharLL() {
    UChar32       ch;
-    UnicodeString &pattern = fRXPat->fPattern;

    if (fPeekChar != -1) {
        ch = fPeekChar;
        fPeekChar = -1;
        return ch;
    }
-    if (fPatternLength==0 || fNextIndex >= fPatternLength) {
-        return (UChar32)-1;
+    
+    // assume we're already in the right place
+    ch = UTEXT_NEXT32(fRXPat->fPattern);
+    if (ch == U_SENTINEL) {
+        return ch;
    }
-    ch         = pattern.char32At(fNextIndex);
-    fNextIndex = pattern.moveIndex32(fNextIndex, 1);

    if (ch == chCR ||
        ch == chNEL ||
@ -3613,7 +3683,7 @@ UChar32  RegexCompile::peekCharLL() {
 //------------------------------------------------------------------------------
 void RegexCompile::nextChar(RegexPatternChar &c) {

-    fScanIndex = fNextIndex;
+    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
    c.fChar    = nextCharLL();
    c.fQuoted  = FALSE;

@ -3670,8 +3740,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
        //  check for backslash escaped characters.
        //
        if (c.fChar == chBackSlash) {
-            int32_t startX = fNextIndex;  // start and end positions of the
-            int32_t endX   = fNextIndex;  //   sequence following the '\'
+            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
                //
                // A '\' sequence that is handled by ICU's standard unescapeAt function.
@ -3680,19 +3749,39 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
                //
                nextCharLL();                 // get & discard the peeked char.
                c.fQuoted = TRUE;
-                c.fChar = fRXPat->fPattern.unescapeAt(endX);
-                if (startX == endX) {
-                    error(U_REGEX_BAD_ESCAPE_SEQUENCE);
+                
+                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
+                    int32_t endIndex = pos;
+                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, fPatternLength, (void *)fRXPat->fPattern->chunkContents);
+                    
+                    if (endIndex == pos) {
+                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
+                    }
+                    fCharNum += endIndex - pos;
+                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
+                } else {
+                    int32_t offset = 0;
+                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
+                    
+                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
+                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
+
+                    if (offset == 0) {
+                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
+                    } else if (context.lastOffset == offset) {
+                        UTEXT_PREVIOUS32(fRXPat->fPattern);
+                    } else if (context.lastOffset != offset-1) {
+                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
+                    }
+                    fCharNum += offset;
                }
-                fCharNum += endX - startX;
-                fNextIndex = endX;
            }
            else if (peekCharLL() == chDigit0) {
                //  Octal Escape, using Java Regexp Conventions
                //    which are \0 followed by 1-3 octal digits.
                //    Different from ICU Unescape handling of Octal, which does not
                //    require the leading 0.
-                //  Java also has the convention of only consuning 2 octal digits if
+                //  Java also has the convention of only consuming 2 octal digits if
                //    the three digit number would be > 0xff
                //
                c.fChar = 0;
@ -3873,13 +3962,13 @@ UnicodeSet *RegexCompile::scanPosixProp() {

    // Save the scanner state.
    // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
-    int32_t     savedScanIndex        = fScanIndex;
-    int32_t     savedNextIndex        = fNextIndex;
+    int64_t     savedScanIndex        = fScanIndex;
+    int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
    UBool       savedQuoteMode        = fQuoteMode;
    UBool       savedInBackslashQuote = fInBackslashQuote;
    UBool       savedEOLComments      = fEOLComments;
-    int32_t     savedLineNum          = fLineNum;
-    int32_t     savedCharNum          = fCharNum;
+    int64_t     savedLineNum          = fLineNum;
+    int64_t     savedCharNum          = fCharNum;
    UChar32     savedLastChar         = fLastChar;
    UChar32     savedPeekChar         = fPeekChar;
    RegexPatternChar savedfC          = fC;
@ -3926,7 +4015,6 @@ UnicodeSet *RegexCompile::scanPosixProp() {
        //  The main scanner will retry the input as a normal set expression,
        //    not a [:Property:] expression.
        fScanIndex        = savedScanIndex;
-        fNextIndex        = savedNextIndex;
        fQuoteMode        = savedQuoteMode;
        fInBackslashQuote = savedInBackslashQuote;
        fEOLComments      = savedEOLComments;
@ -3935,6 +4023,7 @@ UnicodeSet *RegexCompile::scanPosixProp() {
        fLastChar         = savedLastChar;
        fPeekChar         = savedPeekChar;
        fC                = savedfC;
+        UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
    }
    return uset;
 }
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -1,7 +1,7 @@
 //
 //  regexcmp.h
 //
-//  Copyright (C) 2002-2008, International Business Machines Corporation and others.
+//  Copyright (C) 2002-2010, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains declarations for the class RegexCompile
@ -54,7 +54,8 @@ public:
    RegexCompile(RegexPattern *rp, UErrorCode &e);

    void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
-
+    void       compile(UText *pat, UParseError &pp, UErrorCode &e);
+    

    virtual    ~RegexCompile();

@ -102,7 +103,7 @@ private:
    void        fixLiterals(UBool split=FALSE);      // Fix literal strings.
    void        insertOp(int32_t where);             // Open up a slot for a new op in the
                                                     //   generated code at the specified location.
-    void        emitONE_CHAR(UChar32 c);             // EMit a ONE_CHAR op into the compiled code,
+    void        emitONE_CHAR(UChar32 c);             // Emit a ONE_CHAR op into the compiled code,
                                                     //   taking case mode into account.
    int32_t     minMatchLength(int32_t start,
                               int32_t end);
@ -124,16 +125,14 @@ private:
    //
    //  Data associated with low level character scanning
    //
-    int32_t                       fScanIndex;        // Index of current character being processed
+    int64_t                       fScanIndex;        // Index of current character being processed
                                                     //   in the rule input string.
-    int32_t                       fNextIndex;        // Index of the next character, which
-                                                     //   is the first character not yet scanned.
    UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
    UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
    UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to
                                                     //   end of line comments, in favor of (?#...) comments.
-    int32_t                       fLineNum;          // Line number in input file.
-    int32_t                       fCharNum;          // Char position within the line.
+    int64_t                       fLineNum;          // Line number in input file.
+    int64_t                       fCharNum;          // Char position within the line.
    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
                                                     //   as a single line, not two.
    UChar32                       fPeekChar;         // Saved char, if we've scanned ahead.
@ -168,8 +167,8 @@ private:
                                                     //   holds the start index within RegexPattern.
                                                     //   fLiteralText where the string is being stored.

-    int32_t                       fPatternLength;    // Length of the input pattern string.
-
+    int64_t                       fPatternLength;    // Length of the input pattern string.
+    
    UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
                                                     //   the positions of compiled pattern operations
                                                     //   needing fixup, followed by negative value.  The
@ -196,7 +195,7 @@ private:
                                                     //   -1 for the upper interval value means none
                                                     //   was specified (unlimited occurences.)

-    int32_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
+    int64_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
                                                     //   pattern, valid while remainder of name is
                                                     //   scanned.

@ -208,7 +207,6 @@ private:
    UChar32                       fLastSetLiteral;   // The last single code point added to a set.
                                                     //   needed when "-y" is scanned, and we need
                                                     //   to turn "x-y" into a range.
-
 };

 // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions]
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -1,5 +1,5 @@
 //
-//   Copyright (C) 2002-2007 International Business Machines Corporation
+//   Copyright (C) 2002-2010 International Business Machines Corporation
 //   and others. All rights reserved.
 //
 //   file:  regeximp.h
@ -279,11 +279,12 @@ enum {
 //  Match Engine State Stack Frame Layout.
 //
 struct REStackFrame {
-    int32_t            fInputIdx;        // Position of next character in the input string
+    int64_t            fInputIdx;        // Position of next character in the input string
    int32_t            fPatIdx;          // Position of next Op in the compiled pattern
    int32_t            fExtra[2];        // Extra state, for capture group start/ends
                                         //   atomic parentheses, repeat counts, etc.
                                         //   Locations assigned at pattern compile time.
+                                         //   Note that this will likely end up longer than 64 bits.
 };

 //
@ -307,7 +308,6 @@ enum StartOfMatch {
                               (v)==START_STRING?  "START_STRING"  : \
                                                   "ILLEGAL")

-
 //
 //  8 bit set, to fast-path latin-1 set membership tests.
 //
@ -347,7 +347,6 @@ inline void Regex8BitSet::operator = (const Regex8BitSet &s) {
   uprv_memcpy(d, s.d, sizeof(d));
 }

-
 U_NAMESPACE_END
 #endif

--- a/icu4c/source/i18n/regexst.cpp
+++ b/icu4c/source/i18n/regexst.cpp
@ -1,7 +1,7 @@
 //
 //  regexst.h
 //
-//  Copyright (C) 2004-2008, International Business Machines Corporation and others.
+//  Copyright (C) 2004-2010, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains class RegexStaticSets
@ -214,6 +214,10 @@ fRuleDigitsAlias(NULL)
    for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
        fRuleSets[i].compact();
    }
+    
+    // Finally, initialize an empty string for utility purposes
+    fEmptyText = utext_openUChars(NULL, NULL, 0, status);
+    
    return; // If we reached this point, everything is fine so just exit

 ExitConstrDeleteAll: // Remove fPropSets and fRuleSets and return error
@ -233,6 +237,8 @@ RegexStaticSets::~RegexStaticSets() {
        fPropSets[i] = NULL;
    }
    fRuleDigitsAlias = NULL;
+    
+    utext_close(fEmptyText);
 }


--- a/icu4c/source/i18n/regexst.h
+++ b/icu4c/source/i18n/regexst.h
@ -1,7 +1,7 @@
 //
 //  regexst.h
 //
-//  Copyright (C) 2003-2008, International Business Machines Corporation and others.
+//  Copyright (C) 2003-2010, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains declarations for the class RegexStaticSets
@ -19,6 +19,7 @@
 #define REGEXST_H

 #include "unicode/utypes.h"
+#include "unicode/utext.h"
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

 #include "regeximp.h"
@ -45,7 +46,7 @@ public:
    UnicodeSet    fUnescapeCharSet;            // Set of chars handled by unescape when
                                               //   encountered with a \ in a pattern.
    UnicodeSet    *fRuleDigitsAlias;
-    UnicodeString fEmptyString;                // An empty string, to be used when a matcher
+    UText         *fEmptyText;                 // An empty string, to be used when a matcher
                                               //   is created with no input.

 };
--- a/icu4c/source/i18n/regextxt.cpp
+++ b/icu4c/source/i18n/regextxt.cpp
@ -0,0 +1,45 @@
+/********************************************************************
+ * COPYRIGHT:
+ * Copyright (c) 2008-2010, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ********************************************************************/
+//
+//  file:  regextxt.cpp
+//
+//  This file contains utility code for supporting UText in the regular expression engine.
+//
+
+#include "regextxt.h"
+
+U_NAMESPACE_BEGIN
+
+U_CFUNC UChar U_CALLCONV  // charAt-style callback passed to u_unescapeAt() when the pattern is read through a UText
+uregex_utext_unescape_charAt(int32_t offset, void *ct) {  // ct is really a URegexUTextUnescapeCharContext*
+    struct URegexUTextUnescapeCharContext *context = (struct URegexUTextUnescapeCharContext *)ct;
+    UChar32 c;
+    if (offset == context->lastOffset + 1) {  // next sequential position: just advance the iterator
+        c = UTEXT_NEXT32(context->text);
+        context->lastOffset++;
+    } else if (offset == context->lastOffset) {  // same position asked for again: step back to re-read it
+        c = UTEXT_PREVIOUS32(context->text);
+        UTEXT_NEXT32(context->text);  // then step forward again so the iterator is back where it was
+    } else {  // any other position: move by the difference (in code points), then read
+        utext_moveIndex32(context->text, offset - context->lastOffset - 1);
+        c = UTEXT_NEXT32(context->text);
+        context->lastOffset = offset;
+    }
+    
+    // !!!: Doesn't handle characters outside BMP -- anything U_IS_BMP() rejects is returned to the caller as 0
+    if (U_IS_BMP(c)) {
+        return (UChar)c;
+    } else {
+        return 0;
+    }
+}
+
+U_CFUNC UChar U_CALLCONV  // charAt-style callback passed to u_unescapeAt() when the pattern is available as a UChar array
+uregex_ucstr_unescape_charAt(int32_t offset, void *context) {  // context is the UChar buffer itself
+    return ((UChar *)context)[offset];  // plain array read; no bounds check is done here
+}
+
+U_NAMESPACE_END
--- a/icu4c/source/i18n/regextxt.h
+++ b/icu4c/source/i18n/regextxt.h
@ -0,0 +1,48 @@
+/********************************************************************
+ * COPYRIGHT:
+ * Copyright (c) 2008-2010, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ********************************************************************/
+//
+//  file:  regextxt.h
+//
+//  This file contains utility code for supporting UText in the regular expression engine.
+//
+//  This class is internal to the regular expression implementation.
+//  For the public Regular Expression API, see the file "unicode/regex.h"
+//
+
+#ifndef _REGEXTXT_H
+#define _REGEXTXT_H
+
+#include <unicode/utypes.h>
+#include <unicode/utext.h>
+
+U_NAMESPACE_BEGIN
+
+#define UTEXT_USES_U16(ut) (NULL==((ut)->pFuncs->mapNativeIndexToUTF16))
+
+#if 0
+#define REGEX_DISABLE_CHUNK_MODE 1
+#endif
+
+#ifdef REGEX_DISABLE_CHUNK_MODE
+#  define UTEXT_FULL_TEXT_IN_CHUNK(ut,len) (FALSE)
+#else
+#  define UTEXT_FULL_TEXT_IN_CHUNK(ut,len) ((0==((ut)->chunkNativeStart))&&((len)==((ut)->chunkNativeLimit))&&((len)==((ut)->nativeIndexingLimit)))
+#endif
+
+struct URegexUTextUnescapeCharContext {   // Cursor state carried between calls to uregex_utext_unescape_charAt().
+    UText *text;          // Text being read; the callback moves this UText's iteration position.
+    int32_t lastOffset;   // Offset served by the previous callback call; U_REGEX_UTEXT_UNESCAPE_CONTEXT starts it at -1.
+};
+#define U_REGEX_UTEXT_UNESCAPE_CONTEXT(text) { (text), -1 }
+
+U_CFUNC UChar U_CALLCONV
+uregex_utext_unescape_charAt(int32_t offset, void * /* struct URegexUTextUnescapeCharContext* */ context);
+U_CFUNC UChar U_CALLCONV
+uregex_ucstr_unescape_charAt(int32_t offset, void * /* UChar* */ context);
+
+U_NAMESPACE_END
+
+#endif
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -3,7 +3,7 @@
 //
 /*
 ***************************************************************************
-*   Copyright (C) 2002-2009 International Business Machines Corporation   *
+*   Copyright (C) 2002-2010 International Business Machines Corporation   *
 *   and others. All rights reserved.                                      *
 ***************************************************************************
 */
@ -29,11 +29,11 @@ U_NAMESPACE_BEGIN
 //
 //--------------------------------------------------------------------------
 RegexPattern::RegexPattern() {
-    // Init all of this instance's data.
-    init();
+    UErrorCode status = U_ZERO_ERROR;
+    u_init(&status);

-    // Lazy init of all shared global sets.
-    RegexStaticSets::initGlobals(&fDeferredStatus);
+    // Init all of this instance's data.
+    init();
 }


@ -52,7 +52,7 @@ RegexPattern::RegexPattern(const RegexPattern &other) :  UObject(other) {

 //--------------------------------------------------------------------------
 //
-//    Assignmenet Operator
+//    Assignment Operator
 //
 //--------------------------------------------------------------------------
 RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
@ -68,7 +68,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    init();

    // Copy simple fields
-    fPattern          = other.fPattern;
+    fPattern          = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
    fFlags            = other.fFlags;
    fLiteralText      = other.fLiteralText;
    fDeferredStatus   = other.fDeferredStatus;
@ -85,6 +85,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    *fInitialChars    = *other.fInitialChars;
    fInitialChar      = other.fInitialChar;
    *fInitialChars8   = *other.fInitialChars8;
+    fNeedsAltInput    = other.fNeedsAltInput;

    //  Copy the pattern.  It's just values, nothing deep to copy.
    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
@ -126,7 +127,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
 //
 //--------------------------------------------------------------------------
 void RegexPattern::init() {
-    fPattern.remove();
    fFlags            = 0;
    fCompiledPat      = 0;
    fLiteralText.remove();
@ -146,7 +146,9 @@ void RegexPattern::init() {
    fInitialChars     = NULL;
    fInitialChar      = 0;
    fInitialChars8    = NULL;
+    fNeedsAltInput    = FALSE;

+    fPattern          = NULL; // will be set later
    fCompiledPat      = new UVector32(fDeferredStatus);
    fGroupMap         = new UVector32(fDeferredStatus);
    fSets             = new UVector(fDeferredStatus);
@ -192,6 +194,9 @@ void RegexPattern::zap() {
    fInitialChars = NULL;
    delete fInitialChars8;
    fInitialChars8 = NULL;
+    if (fPattern != NULL) {
+        utext_close(fPattern);
+    }
 }


@ -220,13 +225,27 @@ RegexPattern  *RegexPattern::clone() const {
 //
 //   operator ==   (comparison)    Consider to patterns to be == if the
 //                                 pattern strings and the flags are the same.
+//                                 Note that pattern strings with the same
+//                                 characters can still be considered different.
 //
 //--------------------------------------------------------------------------
 UBool   RegexPattern::operator ==(const RegexPattern &other) const {
-    UBool r = this->fFlags    == other.fFlags &&
-              this->fPattern  == other.fPattern &&
-              this->fDeferredStatus == other.fDeferredStatus;
-    return r;
+    if (this->fPattern == NULL) {
+        if (other.fPattern == NULL) {
+            return this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus;
+        } else {
+            return FALSE;
+        }
+    } else {
+        if (other.fPattern == NULL) {
+            return FALSE;
+        } else {
+            UTEXT_SETNATIVEINDEX(this->fPattern, 0);
+            UTEXT_SETNATIVEINDEX(other.fPattern, 0);
+            return this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus &&
+                utext_equals(this->fPattern, other.fPattern);
+        }
+    }
 }

 //---------------------------------------------------------------------
@ -240,7 +259,57 @@ RegexPattern::compile(const UnicodeString &regex,
                      UParseError          &pe,
                      UErrorCode           &status)
 {
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+    
+    const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
+    UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
+    UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES;
+    
+    if ((flags & ~allFlags) != 0) {
+        status = U_REGEX_INVALID_FLAG;
+        return NULL;
+    }
+    
+    if ((flags & UREGEX_CANON_EQ) != 0) {
+        status = U_REGEX_UNIMPLEMENTED;
+        return NULL;
+    }
+    
+    RegexPattern *This = new RegexPattern;
+    if (This == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    if (U_FAILURE(This->fDeferredStatus)) {
+        status = This->fDeferredStatus;
+        delete This;
+        return NULL;
+    }
+    This->fFlags = flags;
+    
+    RegexCompile     compiler(This, status);
+    compiler.compile(regex, pe, status);
+    
+    if (U_FAILURE(status)) {
+        delete This;
+        This = NULL;
+    }
+    
+    return This;
+}

+
+//
+//   compile, UText mode
+//
+RegexPattern * U_EXPORT2
+RegexPattern::compile(UText                *regex,
+                      uint32_t             flags,
+                      UParseError          &pe,
+                      UErrorCode           &status)
+{
    if (U_FAILURE(status)) {
        return NULL;
    }
@ -294,20 +363,43 @@ RegexPattern::compile(const UnicodeString &regex,
 }


+//
+//   compile, UText mode, using the default (zero) match mode flags.
+//   Forwards to the four-argument UText overload.
+//
+RegexPattern * U_EXPORT2
+RegexPattern::compile(UText               *regex,
+                      UParseError         &pe,
+                      UErrorCode          &err)
+{
+    const uint32_t defaultFlags = 0;
+    return compile(regex, defaultFlags, pe, err);
+}
+

 //
 //   compile with no UParseErr parameter.
 //
 RegexPattern * U_EXPORT2
-RegexPattern::compile( const UnicodeString &regex,
-        uint32_t             flags,
-        UErrorCode           &err)
+RegexPattern::compile(const UnicodeString &regex,
+                      uint32_t             flags,
+                      UErrorCode          &err)
 {
    UParseError pe;
    return compile(regex, flags, pe, err);
 }


+//
+//   compile, UText mode, for callers that do not want parse error details.
+//   A local UParseError receives the position information and is then discarded.
+//
+RegexPattern * U_EXPORT2
+RegexPattern::compile(UText                *regex,
+                      uint32_t             flags,
+                      UErrorCode           &err)
+{
+    UParseError discardedParseError;
+    return compile(regex, flags, discardedParseError, err);
+}
+

 //---------------------------------------------------------------------
 //
@ -327,8 +419,21 @@ uint32_t RegexPattern::flags() const {
 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
                                    UErrorCode          &status)  const {
    RegexMatcher    *retMatcher = matcher(status);
-    retMatcher->fDeferredStatus = status;
    if (retMatcher != NULL) {
+        retMatcher->fDeferredStatus = status;
+        retMatcher->reset(input);
+    }
+    return retMatcher;
+}
+
+//
+//   matcher, UText mode
+//
+RegexMatcher *RegexPattern::matcher(UText               *input,
+                                    UErrorCode          &status)  const {
+    RegexMatcher    *retMatcher = matcher(status);
+    if (retMatcher != NULL) {
+        retMatcher->fDeferredStatus = status;
        retMatcher->reset(input);
    }
    return retMatcher;
@ -399,6 +504,31 @@ UBool U_EXPORT2 RegexPattern::matches(const UnicodeString   &regex,
 }


+//
+//   matches, UText mode
+//
+//   Convenience function: compiles regex with default flags, builds a matcher over
+//   input, and reports whether the whole input matches.
+//
+//   regex    The regular expression to compile.
+//   input    The text to be matched.
+//   pe       Receives the position of any syntax error in regex.
+//   status   In/out error code.  Nothing is done if it already indicates failure.
+//   returns  The result of RegexMatcher::matches(); FALSE if the pattern could not
+//            be compiled or the matcher could not be created.
+//
+UBool U_EXPORT2 RegexPattern::matches(UText                *regex,
+                    UText           *input,
+                    UParseError     &pe,
+                    UErrorCode      &status) {
+
+    if (U_FAILURE(status)) {return FALSE;}
+
+    UBool         retVal  = FALSE;
+    RegexPattern *pat     = NULL;
+    RegexMatcher *matcher = NULL;
+
+    // compile() returns NULL when it fails, so the pattern must be checked
+    // before it is used to create a matcher.
+    pat = RegexPattern::compile(regex, 0, pe, status);
+    if (pat != NULL && U_SUCCESS(status)) {
+        matcher = pat->matcher(input, status);
+    }
+
+    // matcher() can also come back NULL; only run the match on a usable matcher.
+    if (matcher != NULL && U_SUCCESS(status)) {
+        retVal = matcher->matches(status);
+    }
+
+    // Deleting a NULL pointer is a no-op, so no guards are needed here.
+    delete matcher;
+    delete pat;
+    return retVal;
+}
+
+
+


 //---------------------------------------------------------------------
@ -407,12 +537,43 @@ UBool U_EXPORT2 RegexPattern::matches(const UnicodeString   &regex,
 //
 //---------------------------------------------------------------------
 UnicodeString RegexPattern::pattern() const {
-    return fPattern;
+    if (fPattern == NULL) {
+        return UnicodeString();
+    } else {
+        UErrorCode status = U_ZERO_ERROR;
+        int64_t nativeLen = utext_nativeLength(fPattern);
+        int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
+        UnicodeString result;
+        
+        status = U_ZERO_ERROR;
+        UChar *resultChars = result.getBuffer(len16);
+        utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
+        result.releaseBuffer(len16);
+        
+        return result;
+    }
 }




+//---------------------------------------------------------------------
+//
+//   patternText
+//
+//   Returns the UText holding the pattern this object was compiled from.
+//   When no pattern text has been set, the shared empty UText owned by
+//   RegexStaticSets is returned instead.  Returns NULL if that shared
+//   data could not be initialized.
+//
+//---------------------------------------------------------------------
+UText *RegexPattern::patternText() const {
+    if (fPattern != NULL) {
+        return fPattern;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    RegexStaticSets::initGlobals(&status);
+    // This function has no way to report an error, so a failed initialization
+    // is signalled with NULL rather than by dereferencing the globals pointer.
+    if (U_FAILURE(status) || RegexStaticSets::gStaticSets == NULL) {
+        return NULL;
+    }
+    return RegexStaticSets::gStaticSets->fEmptyText;
+}
+
+
+
 //---------------------------------------------------------------------
 //
 //   split
@ -421,7 +582,28 @@ UnicodeString RegexPattern::pattern() const {
 int32_t  RegexPattern::split(const UnicodeString &input,
        UnicodeString    dest[],
        int32_t          destCapacity,
-        UErrorCode       &status) const
+        UErrorCode      &status) const
+{
+    if (U_FAILURE(status)) {
+        return 0;
+    };
+
+    RegexMatcher  m(this);
+    int32_t r = 0;
+    // Check m's status to make sure all is ok.
+    if (U_SUCCESS(m.fDeferredStatus)) {
+    	r = m.split(input, dest, destCapacity, status);
+    }
+    return r;
+}
+
+//
+//   split, UText mode
+//
+int32_t  RegexPattern::split(UText *input,
+        UText           *dest[],
+        int32_t          destCapacity,
+        UErrorCode      &status) const
 {
    if (U_FAILURE(status)) {
        return 0;
@ -572,17 +754,24 @@ RegexPatternDump(const RegexPattern *This) {
    int      i;

    REGEX_DUMP_DEBUG_PRINTF(("Original Pattern:  "));
-    for (i=0; i<This->fPattern.length(); i++) {
-        REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
+    UChar32 c = utext_next32From(This->fPattern, 0);
+    while (c != U_SENTINEL) {
+        if (c<32 || c>256) {
+            c = '.';
+        }
+        REGEX_DUMP_DEBUG_PRINTF(("%c", c));
+        
+        c = UTEXT_NEXT32(This->fPattern);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
    REGEX_DUMP_DEBUG_PRINTF(("   Min Match Length:  %d\n", This->fMinMatchLen));
    REGEX_DUMP_DEBUG_PRINTF(("   Match Start Type:  %s\n", START_OF_MATCH_STR(This->fStartType)));
    if (This->fStartType == START_STRING) {
-        REGEX_DUMP_DEBUG_PRINTF(("    Initial match sting: \""));
+        REGEX_DUMP_DEBUG_PRINTF(("    Initial match string: \""));
        for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
            REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i]));   // TODO:  non-printables, surrogates.
        }
+        REGEX_DUMP_DEBUG_PRINTF(("\"\n"));

    } else if (This->fStartType == START_SET) {
        int32_t numSetChars = This->fInitialChars->size();
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 2002-2009, International Business Machines
+*   Copyright (C) 2002-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  regex.h
@ -48,6 +48,7 @@

 #include "unicode/uobject.h"
 #include "unicode/unistr.h"
+#include "unicode/utext.h"
 #include "unicode/parseerr.h"

 #include "unicode/uregex.h"
@ -187,6 +188,35 @@ public:
        UParseError          &pe,
        UErrorCode           &status);

+
+   /**
+    * Compiles the regular expression in string form into a RegexPattern
+    * object.  These compile methods, rather than the constructors, are the usual
+    * way that RegexPattern objects are created.
+    *
+    * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
+    * objects created from the pattern are active.  RegexMatchers keep a pointer
+    * back to their pattern, so premature deletion of the pattern is a
+    * catastrophic error.</p>
+    *
+    * <p>All pattern match mode flags are set to their default values.</p>
+    *
+    * <p>Note that it is often more convenient to construct a RegexMatcher directly
+    *    from a pattern string rather than separately compiling the pattern and
+    *    then creating a RegexMatcher object from the pattern.</p>
+    *
+    * @param regex The regular expression to be compiled.
+    * @param pe    Receives the position (line and column numbers) of any error
+    *              within the regular expression.
+    * @param status A reference to a UErrorCode to receive any errors.
+    * @return      A regexPattern object for the compiled pattern.
+    *
+    * @internal ICU 4.4 technology preview
+    */
+    static RegexPattern * U_EXPORT2 compile( UText *regex,
+        UParseError          &pe,
+        UErrorCode           &status);
+
   /**
    * Compiles the regular expression in string form into a RegexPattern
    * object using the specified match mode flags.  These compile methods,
@ -204,7 +234,7 @@ public:
    *
    * @param regex The regular expression to be compiled.
    * @param flags The match mode flags to be used.
-    * @param pe    Receives the position (line and column nubers) of any error
+    * @param pe    Receives the position (line and column numbers) of any error
    *              within the regular expression.)
    * @param status   A reference to a UErrorCode to receive any errors.
    * @return      A regexPattern object for the compiled pattern.
@ -215,7 +245,37 @@ public:
        uint32_t             flags,
        UParseError          &pe,
        UErrorCode           &status);
-
+        
+        
+   /**
+    * Compiles the regular expression in string form into a RegexPattern
+    * object using the specified match mode flags.  These compile methods,
+    * rather than the constructors, are the usual way that RegexPattern objects
+    * are created.
+    *
+    * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
+    * objects created from the pattern are active.  RegexMatchers keep a pointer
+    * back to their pattern, so premature deletion of the pattern is a
+    * catastrophic error.</p>
+    *
+    * <p>Note that it is often more convenient to construct a RegexMatcher directly
+    *    from a pattern string rather than separately compiling the pattern and
+    *    then creating a RegexMatcher object from the pattern.</p>
+    *
+    * @param regex The regular expression to be compiled.
+    * @param flags The match mode flags to be used.
+    * @param pe    Receives the position (line and column numbers) of any error
+    *              within the regular expression.
+    * @param status   A reference to a UErrorCode to receive any errors.
+    * @return      A regexPattern object for the compiled pattern.
+    *
+    * @internal ICU 4.4 technology preview
+    */
+    static RegexPattern * U_EXPORT2 compile( UText *regex,
+        uint32_t             flags,
+        UParseError          &pe,
+        UErrorCode           &status);
+    

   /**
    * Compiles the regular expression in string form into a RegexPattern
@ -244,6 +304,33 @@ public:
        UErrorCode           &status);


+   /**
+    * Compiles the regular expression in string form into a RegexPattern
+    * object using the specified match mode flags.  These compile methods,
+    * rather than the constructors, are the usual way that RegexPattern objects
+    * are created.
+    *
+    * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
+    * objects created from the pattern are active.  RegexMatchers keep a pointer
+    * back to their pattern, so premature deletion of the pattern is a
+    * catastrophic error.</p>
+    *
+    * <p>Note that it is often more convenient to construct a RegexMatcher directly
+    *    from a pattern string rather than separately compiling the pattern and
+    *    then creating a RegexMatcher object from the pattern.</p>
+    *
+    * @param regex The regular expression to be compiled.
+    * @param flags The match mode flags to be used.
+    * @param status   A reference to a UErrorCode to receive any errors.
+    * @return      A regexPattern object for the compiled pattern.
+    *
+    * @internal ICU 4.4 technology preview
+    */
+    static RegexPattern * U_EXPORT2 compile( UText *regex,
+        uint32_t             flags,
+        UErrorCode           &status);
+    
+
   /**
    * Get the match mode flags that were used when compiling this pattern.
    * @return  the match mode flags
@ -270,6 +357,27 @@ public:
    */
    virtual RegexMatcher *matcher(const UnicodeString &input,
        UErrorCode          &status) const;
+        
+        
+   /**
+    * Creates a RegexMatcher that will match the given input against this pattern.  The
+    * RegexMatcher can then be used to perform match, find or replace operations
+    * on the input.  Note that a RegexPattern object must not be deleted while
+    * RegexMatchers created from it still exist and might possibly be used again.
+    * <p>
+    * The matcher will make a shallow clone of the supplied input text, and all regexp
+    * pattern matching operations happen on this clone.  While read-only operations on
+    * the supplied text are permitted, it is critical that the underlying string not be
+    * altered or deleted before use by the regular expression operations is complete.
+    *
+    * @param input    The input text to which the regular expression will be applied.
+    * @param status   A reference to a UErrorCode to receive any errors.
+    * @return         A RegexMatcher object for this pattern and input.
+    *
+    * @internal ICU 4.4 technology preview
+    */
+    virtual RegexMatcher *matcher(UText *input,
+        UErrorCode          &status) const;

 private:
    /**
@ -280,6 +388,8 @@ private:
     * To efficiently work with UChar *strings, wrap the data in a UnicodeString
     * using one of the aliasing constructors, such as
     * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
+     * or in a UText, using
+     * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
     *
     * @internal
     */
@ -318,15 +428,52 @@ public:
    */
    static UBool U_EXPORT2 matches(const UnicodeString   &regex,
        const UnicodeString   &input,
+              UParseError     &pe,
+              UErrorCode      &status);
+
+
+   /**
+    * Test whether a string matches a regular expression.  This convenience function
+    * both compiles the regular expression and applies it in a single operation.
+    * Note that if the same pattern needs to be applied repeatedly, this method will be
+    * less efficient than creating and reusing a RegexMatcher object.
+    *
+    * @param regex The regular expression
+    * @param input The string data to be matched
+    * @param pe Receives the position of any syntax errors within the regular expression
+    * @param status A reference to a UErrorCode to receive any errors.
+    * @return True if the regular expression exactly matches the full input string.
+    *
+    * @internal ICU 4.4 technology preview
+    */
+    static UBool U_EXPORT2 matches(UText *regex,
+        UText           *input,
        UParseError     &pe,
        UErrorCode      &status);


   /**
-    *    Returns the regular expression from which this pattern was compiled.
-    *    @stable ICU 2.4
+    * Returns the regular expression from which this pattern was compiled. This method will work
+    * even if the pattern was compiled from a UText.
+    *
+    * Note: If the pattern was originally compiled from a UText, and that UText was modified,
+    * the returned string may no longer reflect the RegexPattern object.
+    * @stable ICU 2.4
    */
    virtual UnicodeString pattern() const;
+    
+    
+   /**
+    * Returns the regular expression from which this pattern was compiled. This method will work
+    * even if the pattern was compiled from a UnicodeString.
+    *
+    * Note: This is the original input, not a clone. If the pattern was originally compiled from a
+    * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
+    * object.
+    *
+    * @internal ICU 4.4 technology preview
+    */
+    virtual UText *patternText() const;


    /**
@ -360,6 +507,37 @@ public:
        UErrorCode       &status) const;


+    /**
+     * Split a string into fields.  Somewhat like split() from Perl.
+     * The pattern matches identify delimiters that separate the input
+     *  into fields.  The input data between the matches becomes the
+     *  fields themselves.
+     * <p>
+     *  For the best performance on split() operations,
+     *  <code>RegexMatcher::split</code> is preferable to this function.
+     *
+     * @param input   The string to be split into fields.  The field delimiters
+     *                match the pattern (in the "this" object)
+     * @param dest    An array of mutable UText structs to receive the results of the split.
+     *                If a field is NULL, a new UText is allocated to contain the results for
+     *                that field. This new UText is not guaranteed to be mutable.
+     * @param destCapacity  The number of elements in the destination array.
+     *                If the number of fields found is less than destCapacity, the
+     *                extra strings in the destination array are not altered.
+     *                If the number of destination strings is less than the number
+     *                of fields, the trailing part of the input string, including any
+     *                field delimiters, is placed in the last destination string.
+     * @param status  A reference to a UErrorCode to receive any errors.
+     * @return        The number of fields into which the input string was split.
+     *
+     * @internal ICU 4.4 technology preview
+     */
+    virtual int32_t  split(UText *input,
+        UText            *dest[],
+        int32_t          destCapacity,
+        UErrorCode       &status) const;
+
+
    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
@ -378,7 +556,7 @@ private:
    //
    //  Implementation Data
    //
-    UnicodeString   fPattern;      // The original pattern string.
+    UText          *fPattern;      // The original pattern string.
    uint32_t        fFlags;        // The flags used when compiling the pattern.
                                   //
    UVector32       *fCompiledPat; // The compiled pattern p-code.
@ -396,7 +574,7 @@ private:
                                   //   >= this value.  For some patterns, this calculated
                                   //   value may be less than the true shortest
                                   //   possible match.
-
+    
    int32_t         fFrameSize;    // Size of a state stack frame in the
                                   //   execution engine.

@ -421,6 +599,7 @@ private:
    UnicodeSet     *fInitialChars;
    UChar32         fInitialChar;
    Regex8BitSet   *fInitialChars8;
+    UBool           fNeedsAltInput;

    friend class RegexCompile;
    friend class RegexMatcher;
@ -468,6 +647,23 @@ public:
      */
    RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);

+    /**
+      * Construct a RegexMatcher for a regular expression.
+      * This is a convenience method that avoids the need to explicitly create
+      * a RegexPattern object.  Note that if several RegexMatchers need to be
+      * created for the same expression, it will be more efficient to
+      * separately create and cache a RegexPattern object, and use
+      * its matcher() method to create the RegexMatcher objects.
+      *
+      *  @param regexp The regular expression to be compiled.
+      *  @param flags  Regular expression options, such as case insensitive matching.
+      *                @see UREGEX_CASE_INSENSITIVE
+      *  @param status Any errors are reported by setting this UErrorCode variable.
+      *
+      *  @internal ICU 4.4 technology preview
+      */
+    RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
+    
    /**
      * Construct a RegexMatcher for a regular expression.
      * This is a convenience method that avoids the need to explicitly create
@ -492,6 +688,30 @@ public:
    RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
        uint32_t flags, UErrorCode &status);

+    /**
+      * Construct a RegexMatcher for a regular expression.
+      * This is a convenience method that avoids the need to explicitly create
+      * a RegexPattern object.  Note that if several RegexMatchers need to be
+      * created for the same expression, it will be more efficient to
+      * separately create and cache a RegexPattern object, and use
+      * its matcher() method to create the RegexMatcher objects.
+      * <p>
+      * The matcher will make a shallow clone of the supplied input text, and all regexp
+      * pattern matching operations happen on this clone.  While read-only operations on
+      * the supplied text are permitted, it is critical that the underlying string not be
+      * altered or deleted before use by the regular expression operations is complete.
+      *
+      *  @param regexp The Regular Expression to be compiled.
+      *  @param input  The string to match.  The matcher retains a shallow clone of the text.
+      *  @param flags  Regular expression options, such as case insensitive matching.
+      *                @see UREGEX_CASE_INSENSITIVE
+      *  @param status Any errors are reported by setting this UErrorCode variable.
+      *
+      *  @internal ICU 4.4 technology preview
+      */
+    RegexMatcher(UText *regexp, UText *input,
+        uint32_t flags, UErrorCode &status);
+
 private:
    /**
     * Cause a compilation error if an application accidently attempts to
@ -501,6 +721,8 @@ private:
     * To efficiently work with UChar *strings, wrap the data in a UnicodeString
     * using one of the aliasing constructors, such as
     * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
+     * or in a UText, using
+     * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
     *
     * @internal
     */
@ -525,6 +747,7 @@ public:
    */
    virtual UBool matches(UErrorCode &status);

+
   /**
    *   Resets the matcher, then attempts to match the input beginning 
    *   at the specified startIndex, and extending to the end of the input.
@ -538,8 +761,6 @@ public:
    virtual UBool matches(int32_t startIndex, UErrorCode &status);


-
-
   /**
    *   Attempts to match the input string, starting from the beginning of the region,
    *   against the pattern.  Like the matches() method, this function 
@ -571,6 +792,7 @@ public:
    */
    virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);

+
   /**
    *  Find the next pattern match in the input string.
    *  The find begins searching the input at the location following the end of
@ -610,6 +832,22 @@ public:
    virtual UnicodeString group(UErrorCode &status) const;


+   /**
+    *   Returns a string containing the text matched by the previous match.
+    *   If the pattern can match an empty string, an empty string may be returned.
+    *   @param   dest        A mutable UText in which the matching text is placed.
+    *                        If NULL, a new UText will be created (which may not be mutable).
+    *   @param   status      A reference to a UErrorCode to receive any errors.
+    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
+    *                        has been attempted or the last match failed.
+    *   @return  A string containing the matched input text. If a pre-allocated UText
+    *            was provided, it will always be used and returned.
+    *
+    *   @internal ICU 4.4 technology preview
+    */
+    virtual UText *group(UText *dest, UErrorCode &status) const;
+
+
   /**
    *    Returns a string containing the text captured by the given group
    *    during the previous match operation.  Group(0) is the entire match.
@ -625,6 +863,24 @@ public:
    virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;


+   /**
+    *   Returns a string containing the text captured by the given group
+    *   during the previous match operation.  Group(0) is the entire match.
+    *
+    *   @param   groupNum    the capture group number
+    *   @param   dest        A mutable UText in which the matching text is placed.
+    *                        If NULL, a new UText will be created (which may not be mutable).
+    *   @param   status      A reference to a UErrorCode to receive any errors.
+    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
+    *                        has been attempted or the last match failed.
+    *   @return  A string containing the matched input text. If a pre-allocated UText
+    *            was provided, it will always be used and returned.
+    *
+    *   @internal ICU 4.4 technology preview
+    */
+    virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
+
+
   /**
    *   Returns the number of capturing groups in this matcher's pattern.
    *   @return the number of capture groups
@ -726,11 +982,31 @@ public:
    *                Because no copy of the string is made, it is essential that the
    *                caller not delete the string until after regexp operations on it
    *                are done.
+    *                Note: modifying the input string between or during matcher operations
+    *                after a reset() may currently happen to work for UnicodeString input,
+    *                but this was never intended behavior, and support for it is not
+    *                guaranteed in upcoming versions of ICU.
    *   @return this RegexMatcher.
    *   @stable ICU 2.4
    */
    virtual RegexMatcher &reset(const UnicodeString &input);

+
+   /**
+    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
+    *     to be reused, which is more efficient than creating a new RegexMatcher for
+    *     each input string to be processed.
+    *   @param input The new string on which subsequent pattern matches will operate.
+    *                The matcher makes a shallow clone of the given text; ownership of the
+    *                original string remains with the caller. Because no deep copy of the
+    *                text is made, it is essential that the caller not modify the string
+    *                until after regexp operations on it are done.
+    *   @return this RegexMatcher.
+    *
+    *   @internal ICU 4.4 technology preview
+    */
+    virtual RegexMatcher &reset(UText *input);
+
 private:
    /**
     * Cause a compilation error if an application accidently attempts to
@ -740,6 +1016,8 @@ private:
     * To efficiently work with UChar *strings, wrap the data in a UnicodeString
     * using one of the aliasing constructors, such as
     * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
+     * or in a UText, using
+     * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
     *
     * @internal
     */
@ -747,13 +1025,34 @@ private:
 public:

   /**
-    *   Returns the input string being matched.  The returned string is not a copy,
-    *   but the live input string.  It should not be altered or deleted.
+    *   Returns the input string being matched.  Ownership of the string belongs to
+    *   the matcher; it should not be altered or deleted. This method will work even if the input
+    *   was originally supplied as a UText.
    *   @return the input string
    *   @stable ICU 2.4
    */
    virtual const UnicodeString &input() const;
    
+   /**
+    *   Returns the input string being matched.  This is the live input text; it should not be
+    *   altered or deleted. This method will work even if the input was originally supplied as
+    *   a UnicodeString.
+    *   @return the input text
+    *
+    *   @internal ICU 4.4 technology preview
+    */
+    virtual UText *inputText() const;
+    
+   /**
+    *   Returns the input string being matched, either by copying it into the provided
+    *   UText parameter or by returning a shallow clone of the live input. Note that copying
+    *   the entire input may cause significant performance and memory issues.
+    *   @param dest The UText into which the input should be copied, or NULL to create a new UText
+    *   @return dest if non-NULL, a shallow copy of the input text otherwise
+    *
+    *   @internal ICU 4.4 technology preview
+    */
+    virtual UText *getInput(UText *dest) const;
    

   /** Sets the limits of this matcher's region.
@ -838,6 +1137,7 @@ public:
      */    
      virtual UBool hasAnchoringBounds() const;

+
    /**
      * Set whether this matcher is using Anchoring Bounds for its region.
      * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
@ -852,6 +1152,7 @@ public:
      */
      virtual RegexMatcher &useAnchoringBounds(UBool b);

+
    /**
      * Return TRUE if the most recent matching operation touched the
      *  end of the text being processed.  In this case, additional input text could
@ -878,9 +1179,6 @@ public:
      virtual UBool requireEnd() const;


-
-
-
   /**
    *    Returns the pattern that is interpreted by this matcher.
    *    @return  the RegexPattern for this RegexMatcher
@ -908,6 +1206,29 @@ public:
    virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);


+   /**
+    *    Replaces every substring of the input that matches the pattern
+    *    with the given replacement string.  This is a convenience function that
+    *    provides a complete find-and-replace-all operation.
+    *
+    *    This method first resets this matcher. It then scans the input string
+    *    looking for matches of the pattern. Input that is not part of any
+    *    match is left unchanged; each match is replaced in the result by the
+    *    replacement string. The replacement string may contain references to
+    *    capture groups.
+    *
+    *    @param   replacement a string containing the replacement text.
+    *    @param   dest        a mutable UText in which the results are placed.
+    *                          If NULL, a new UText will be created (which may not be mutable).
+    *    @param   status      a reference to a UErrorCode to receive any errors.
+    *    @return              a string containing the results of the find and replace.
+    *                          If a pre-allocated UText was provided, it will always be used and returned.
+    *
+    *    @internal ICU 4.4 technology preview
+    */
+    virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
+    
+
   /**
    * Replaces the first substring of the input that matches
    * the pattern with the replacement string.   This is a convenience
@ -929,7 +1250,35 @@ public:
    *    @stable ICU 2.4
    */
    virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
+    

+   /**
+    * Replaces the first substring of the input that matches
+    * the pattern with the replacement string.   This is a convenience
+    * function that provides a complete find-and-replace operation.
+    *
+    * <p>This function first resets this RegexMatcher. It then scans the input string
+    * looking for a match of the pattern. Input that is not part
+    * of the match is appended directly to the result string; the match is replaced
+    * in the result by the replacement string. The replacement string may contain
+    * references to captured groups.</p>
+    *
+    * <p>The state of the matcher (the position at which a subsequent find()
+    *    would begin) after completing a replaceFirst() is not specified.  The
+    *    RegexMatcher should be reset before doing additional find() operations.</p>
+    *
+    *    @param   replacement a string containing the replacement text.
+    *    @param   dest        a mutable UText in which the results are placed.
+    *                          If NULL, a new UText will be created (which may not be mutable).
+    *    @param   status      a reference to a UErrorCode to receive any errors.
+    *    @return              a string containing the results of the find and replace.
+    *                          If a pre-allocated UText was provided, it will always be used and returned.
+    *
+    *    @internal ICU 4.4 technology preview
+    */
+    virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
+    
+    
   /**
    *   Implements a replace operation intended to be used as part of an
    *   incremental find-and-replace.
@ -959,6 +1308,37 @@ public:
    */
    virtual RegexMatcher &appendReplacement(UnicodeString &dest,
        const UnicodeString &replacement, UErrorCode &status);
+    
+    
+   /**
+    *   Implements a replace operation intended to be used as part of an
+    *   incremental find-and-replace.
+    *
+    *   <p>The input string, starting from the end of the previous replacement and ending at
+    *   the start of the current match, is appended to the destination string.  Then the
+    *   replacement string is appended to the output string,
+    *   including handling any substitutions of captured text.</p>
+    *
+    *   <p>For simple, prepackaged, non-incremental find-and-replace
+    *   operations, see replaceFirst() or replaceAll().</p>
+    *
+    *   @param   dest        A mutable UText to which the results of the find-and-replace are appended.
+    *                         Must not be NULL.
+    *   @param   replacement A UText that provides the text to be substituted for
+    *                        the input text that matched the regexp pattern.  The replacement
+    *                        text may contain references to captured text from the input.
+    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
+    *                        errors are  U_REGEX_INVALID_STATE if no match has been
+    *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
+    *                        if the replacement text specifies a capture group that
+    *                        does not exist in the pattern.
+    *
+    *   @return  this  RegexMatcher
+    *
+    *   @internal ICU 4.4 technology preview
+    */
+    virtual RegexMatcher &appendReplacement(UText *dest,
+        UText *replacement, UErrorCode &status);


   /**
@ -974,13 +1354,26 @@ public:
    virtual UnicodeString &appendTail(UnicodeString &dest);


+   /**
+    * As the final step in a find-and-replace operation, append the remainder
+    * of the input string, starting at the position following the last appendReplacement(),
+    * to the destination string. <code>appendTail()</code> is intended to be invoked after one
+    * or more invocations of the <code>RegexMatcher::appendReplacement()</code>.
+    *
+    *  @param dest A mutable UText to which the results of the find-and-replace are appended.
+    *               Must not be NULL.
+    *  @return  the destination string.
+    *
+    *  @internal ICU 4.4 technology preview
+    */
+    virtual UText *appendTail(UText *dest);
+

    /**
     * Split a string into fields.  Somewhat like split() from Perl.
     * The pattern matches identify delimiters that separate the input
     *  into fields.  The input data between the matches becomes the
     *  fields themselves.
-     * <p>
     *
     * @param input   The string to be split into fields.  The field delimiters
     *                match the pattern (in the "this" object).  This matcher
@ -1004,6 +1397,35 @@ public:
        int32_t          destCapacity,
        UErrorCode       &status);

+
+    /**
+     * Split a string into fields.  Somewhat like split() from Perl.
+     * The pattern matches identify delimiters that separate the input
+     *  into fields.  The input data between the matches becomes the
+     *  fields themselves.
+     *
+     * @param input   The string to be split into fields.  The field delimiters
+     *                match the pattern (in the "this" object).  This matcher
+     *                will be reset to this input string.
+     * @param dest    An array of pointers to mutable UTexts that receive the results of the split.
+     *                If an array element is NULL, a new UText is allocated to contain the results
+     *                for that field. This new UText is not guaranteed to be mutable.
+     * @param destCapacity  The number of elements in the destination array.
+     *                If the number of fields found is less than destCapacity, the
+     *                extra strings in the destination array are not altered.
+     *                If the number of destination strings is less than the number
+     *                of fields, the trailing part of the input string, including any
+     *                field delimiters, is placed in the last destination string.
+     * @param status  A reference to a UErrorCode to receive any errors.
+     * @return        The number of fields into which the input string was split.
+     *
+     * @internal ICU 4.4 technology preview
+     */
+    virtual int32_t  split(UText *input,
+        UText           *dest[],
+        int32_t          destCapacity,
+        UErrorCode       &status);
+    
  /**
    *   Set a processing time limit for match operations with this Matcher.
    *  
@ -1086,7 +1508,6 @@ public:
                                  UErrorCode              &status);


-
  /**
    *  Get the callback function for this URegularExpression.
    *
@ -1132,7 +1553,7 @@ private:
    RegexMatcher(const RegexMatcher &other);
    RegexMatcher &operator =(const RegexMatcher &rhs);
    void init(UErrorCode &status);                      // Common initialization
-    void init2(const UnicodeString &s, UErrorCode &e);  // Common initialization, part 2.
+    void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.

    friend class RegexPattern;
    friend class RegexCImpl;
@ -1145,34 +1566,43 @@ private:
    //  MatchAt   This is the internal interface to the match engine itself.
    //            Match status comes back in matcher member variables.
    //
-    void                 MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
-    inline void          backTrack(int32_t &inputIdx, int32_t &patIdx);
-    UBool                isWordBoundary(int32_t pos);         // perform Perl-like  \b test
-    UBool                isUWordBoundary(int32_t pos);        // perform RBBI based \b test
+    void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
+    inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
+    UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
+    UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
    REStackFrame        *resetStack();
    inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status);
    void                 IncrementTime(UErrorCode &status);
-
+    
+    int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
+    
+    UBool                findUsingChunk();
+    void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
+    UBool                isChunkWordBoundary(int32_t pos);

    const RegexPattern  *fPattern;
    RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
                                           //   should delete it when through.

-    const UnicodeString *fInput;           // The text being matched. Is never NULL.
+    const UnicodeString *fInput;           // The string being matched. Only used for input()
+    UText               *fInputText;       // The text being matched. Is never NULL.
+    UText               *fAltInputText;    // A shallow copy of the text being matched.
+                                           //   Only created if the pattern contains backreferences.
+    int64_t              fInputLength;     // Full length of the input text.
    int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
    
-    int32_t              fRegionStart;     // Start of the input region, default = 0.
-    int32_t              fRegionLimit;     // End of input region, default to input.length.
+    int64_t              fRegionStart;     // Start of the input region, default = 0.
+    int64_t              fRegionLimit;     // End of input region, default to input.length.
    
-    int32_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
-    int32_t              fAnchorLimit;     //   See useAnchoringBounds
+    int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
+    int64_t              fAnchorLimit;     //   See useAnchoringBounds
    
-    int32_t              fLookStart;       // Region bounds for look-ahead/behind and
-    int32_t              fLookLimit;       //   and other boundary tests.  See
+    int64_t              fLookStart;       // Region bounds for look-ahead/behind and
+    int64_t              fLookLimit;       //   other boundary tests.  See
                                           //   useTransparentBounds

-    int32_t              fActiveStart;     // Currently active bounds for matching.
-    int32_t              fActiveLimit;     //   Usually is the same as region, but
+    int64_t              fActiveStart;     // Currently active bounds for matching.
+    int64_t              fActiveLimit;     //   Usually is the same as region, but
                                           //   is changed to fLookStart/Limit when
                                           //   entering look around regions.

@ -1180,13 +1610,13 @@ private:
    UBool                fAnchoringBounds; // True if using anchoring bounds.

    UBool                fMatch;           // True if the last attempted match was successful.
-    int32_t              fMatchStart;      // Position of the start of the most recent match
-    int32_t              fMatchEnd;        // First position after the end of the most recent match
+    int64_t              fMatchStart;      // Position of the start of the most recent match
+    int64_t              fMatchEnd;        // First position after the end of the most recent match
                                           //   Zero if no previous match, even when a region
                                           //   is active.
-    int32_t              fLastMatchEnd;    // First position after the end of the previous match,
+    int64_t              fLastMatchEnd;    // First position after the end of the previous match,
                                           //   or -1 if there was no previous match.
-    int32_t              fAppendPosition;  // First position after the end of the previous
+    int64_t              fAppendPosition;  // First position after the end of the previous
                                           //   appendReplacement().  As described by the
                                           //   JavaDoc for Java Matcher, where it is called 
                                           //   "append position"
@ -1218,6 +1648,8 @@ private:
                                           //   NULL if there is no callback.
    const void         *fCallbackContext;  // User Context ptr for callback function.

+    UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
+
    UBool               fTraceDebug;       // Set true for debug tracing of match engine.

    UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
--- a/icu4c/source/i18n/unicode/uregex.h
+++ b/icu4c/source/i18n/unicode/uregex.h
@ -3,7 +3,7 @@
 *   Copyright (C) 2004-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
-*   file name:  regex.h
+*   file name:  uregex.h
 *   encoding:   US-ASCII
 *   indentation:4
 *
@ -23,6 +23,7 @@
 #ifndef UREGEX_H
 #define UREGEX_H

+#include "unicode/utext.h"
 #include "unicode/utypes.h"

 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
@ -112,6 +113,7 @@ typedef enum URegexpFlag{
  *  string form into an internal representation using the specified match mode flags.
  *  The resulting regular expression handle can then be used to perform various
  *   matching operations.
+  * 
  *
  * @param pattern        The Regular Expression pattern to be compiled. 
  * @param patternLength  The length of the pattern, or -1 if the pattern is
@ -134,7 +136,36 @@ uregex_open( const  UChar          *pattern,
                    uint32_t        flags,
                    UParseError    *pe,
                    UErrorCode     *status);
-
+                    
+/**
+  *  Open (compile) an ICU regular expression.  Compiles the regular expression in
+  *  string form into an internal representation using the specified match mode flags.
+  *  The resulting regular expression handle can then be used to perform various
+  *   matching operations.
+  *  <p>
+  *  The contents of the pattern UText will be extracted and saved. Ownership of the
+  *   UText struct itself remains with the caller. This is to match the behavior of
+  *   uregex_open().
+  *
+  * @param pattern        The Regular Expression pattern to be compiled. 
+  * @param flags          Flags that alter the default matching behavior for
+  *                       the regular expression, UREGEX_CASE_INSENSITIVE, for
+  *                       example.  For default behavior, set this parameter to zero.
+  *                       See <code>enum URegexpFlag</code>.  All desired flags
+  *                       are bitwise-ORed together.
+  * @param pe             Receives the position (line and column numbers) of any syntax
+  *                       error within the source regular expression string.  If this
+  *                       information is not wanted, pass NULL for this parameter.
+  * @param status         Receives error detected by this function.
+  *
+  * @internal ICU 4.4 technology preview
+  */
+U_INTERNAL URegularExpression *  U_EXPORT2
+uregex_openUText(UText          *pattern,
+                 uint32_t        flags,
+                 UParseError    *pe,
+                 UErrorCode     *status);
+    
 /**
  *  Open (compile) an ICU regular expression.  The resulting regular expression
  *   handle can then be used to perform various matching operations.
@ -219,7 +250,8 @@ U_STABLE URegularExpression * U_EXPORT2
 uregex_clone(const URegularExpression *regexp, UErrorCode *status);

 /**
- *  Return a pointer to the source form of the pattern for this regular expression.
+ *  Returns a pointer to the source form of the pattern for this regular expression.
+ *  This function will work even if the pattern was originally specified as a UText.
 *
 * @param regexp     The compiled regular expression.
 * @param patLength  This output parameter will be set to the length of the
@ -235,9 +267,24 @@ uregex_clone(const URegularExpression *regexp, UErrorCode *status);
 * @stable ICU 3.0
 */
 U_STABLE const UChar * U_EXPORT2 
-uregex_pattern(const  URegularExpression   *regexp,
-                         int32_t           *patLength,
-                         UErrorCode        *status);
+uregex_pattern(const URegularExpression *regexp,
+                     int32_t            *patLength,
+                     UErrorCode         *status);
+
+/**
+ *  Returns the source text of the pattern for this regular expression.
+ *  This function will work even if the pattern was originally specified as a UChar string.
+ *
+ * @param regexp     The compiled regular expression.
+ * @param status     Receives errors detected by this function.
+ * @return the pattern text.  The storage for the text is owned by the regular expression
+ *                   object, and must not be altered or deleted.
+ *
+ * @internal ICU 4.4 technology preview
+ */
+U_INTERNAL UText * U_EXPORT2 
+uregex_patternUText(const URegularExpression *regexp,
+                          UErrorCode         *status);


 /**
@ -279,10 +326,36 @@ uregex_setText(URegularExpression *regexp,
               int32_t             textLength,
               UErrorCode         *status);

+
+/**
+  *  Set the subject text string upon which the regular expression will look for matches.
+  *  This function may be called any number of times, allowing the regular
+  *  expression pattern to be applied to different strings.
+  *  <p>
+  *  Regular expression matching operations work directly on the application's
+  *  string data; only a shallow clone is made.  The subject string data must not be
+  *  altered after calling this function until after all regular expression
+  *  operations involving this string data are completed.  
+  *
+  * @param regexp     The compiled regular expression.
+  * @param text       The subject text string.
+  * @param status     Receives errors detected by this function.
+  *
+  * @internal ICU 4.4 technology preview
+  */
+U_INTERNAL void U_EXPORT2 
+uregex_setUText(URegularExpression *regexp,
+                UText              *text,
+                UErrorCode         *status);
+
 /**
  *  Get the subject text that is currently associated with this 
-  *   regular expression object.  This simply returns whatever string
-  *   pointer was previously supplied via uregex_setText().
+  *   regular expression object.  If the input was supplied using uregex_setText(),
+  *   that pointer will be returned.  Otherwise, the characters in the input will
+  *   be extracted to a buffer and returned.  In either case, ownership remains
+  *   with the regular expression object.
+  *
+  *  This function will work even if the input was originally specified as a UText.
  *
  * @param regexp      The compiled regular expression.
  * @param textLength  The length of the string is returned in this output parameter. 
@ -291,7 +364,7 @@ uregex_setText(URegularExpression *regexp,
  *                    the text is known in advance to be a NUL terminated
  *                    string.
  * @param status      Receives errors detected by this function.
-  * @return            Poiner to the subject text string currently associated with
+  * @return            Pointer to the subject text string currently associated with
  *                    this regular expression.
  * @stable ICU 3.0
  */
@ -299,6 +372,28 @@ U_STABLE const UChar * U_EXPORT2
 uregex_getText(URegularExpression *regexp,
               int32_t            *textLength,
               UErrorCode         *status);
+               
+               
+/**
+  *  Get the subject text that is currently associated with this 
+  *   regular expression object.
+  *
+  *  This function will work even if the input was originally specified as a UChar string.
+  *
+  * @param regexp      The compiled regular expression.
+  * @param dest        A mutable UText in which to store the current input.
+  *                    If NULL, a new UText will be created as an immutable shallow clone
+  *                    of the actual input string.
+  * @param status      Receives errors detected by this function.
+  * @return            The subject text currently associated with this regular expression.
+  *                    If a pre-allocated UText was provided, it will always be used and returned.
+  *
+  * @internal ICU 4.4 technology preview
+  */
+U_INTERNAL UText * U_EXPORT2 
+uregex_getUText(URegularExpression *regexp,
+                UText              *dest,
+                UErrorCode         *status);

 /**
  *   Attempts to match the input string against the pattern.
@ -428,6 +523,29 @@ uregex_group(URegularExpression *regexp,
             int32_t             destCapacity,
             UErrorCode          *status);

+/** Extract the string for the specified matching expression or subexpression.
+  * Group #0 is the complete string of matched text.
+  * Group #1 is the text matched by the first set of capturing parentheses.
+  *
+  *   @param   regexp       The compiled regular expression.
+  *   @param   groupNum     The capture group to extract.  Group 0 is the complete
+  *                         match.  The value of this parameter must be
+  *                         less than or equal to the number of capture groups in
+  *                         the pattern.
+  *   @param   dest         Mutable UText to receive the matching string data.
+  *                         If NULL, a new UText will be created (which may not be mutable).
+  *   @param   status       A reference to a UErrorCode to receive any errors.
+  *   @return               The matching string data. If a pre-allocated UText was provided,
+  *                          it will always be used and returned.
+  *
+  *   @internal ICU 4.4 technology preview
+  */
+U_INTERNAL UText * U_EXPORT2 
+uregex_groupUText(URegularExpression *regexp,
+                  int32_t             groupNum,
+                  UText              *dest,
+                  UErrorCode         *status);
+

 /**
  *   Returns the index in the input string of the start of the text matched by the
@ -676,6 +794,32 @@ uregex_replaceAll(URegularExpression    *regexp,
                  int32_t                destCapacity,
                  UErrorCode            *status);

+/**
+  *    Replaces every substring of the input that matches the pattern
+  *    with the given replacement string.  This is a convenience function that
+  *    provides a complete find-and-replace-all operation.
+  *
+  *    This method scans the input string looking for matches of the pattern. 
+  *    Input that is not part of any match is copied unchanged to the
+  *    destination buffer.  Matched regions are replaced in the output
+  *    buffer by the replacement string.   The replacement string may contain
+  *    references to capture groups; these take the form of $1, $2, etc.
+  *
+  *    @param   regexp         The compiled regular expression.
+  *    @param   replacement    A string containing the replacement text.
+  *    @param   dest           A mutable UText that will receive the result.
+  *                             If NULL, a new UText will be created (which may not be mutable).
+  *    @param   status         A reference to a UErrorCode to receive any errors.
+  *    @return                 A UText containing the results of the find and replace.
+  *                             If a pre-allocated UText was provided, it will always be used and returned.
+  *
+  *    @internal ICU 4.4 technology preview
+  */
+U_INTERNAL UText * U_EXPORT2 
+uregex_replaceAllUText(URegularExpression *regexp,
+                       UText              *replacement,
+                       UText              *dest,
+                       UErrorCode         *status);

 /**
  *    Replaces the first substring of the input that matches the pattern
@ -709,6 +853,33 @@ uregex_replaceFirst(URegularExpression  *regexp,
                    int32_t              destCapacity,
                    UErrorCode          *status);

+/**
+  *    Replaces the first substring of the input that matches the pattern
+  *    with the given replacement string.  This is a convenience function that
+  *    provides a complete find-and-replace operation.
+  *
+  *    This method scans the input string looking for a match of the pattern. 
+  *    All input that is not part of the match is copied unchanged to the
+  *    destination buffer.  The matched region is replaced in the output
+  *    buffer by the replacement string.   The replacement string may contain
+  *    references to capture groups; these take the form of $1, $2, etc.
+  *
+  *    @param   regexp         The compiled regular expression.
+  *    @param   replacement    A string containing the replacement text.
+  *    @param   dest           A mutable UText that will receive the result.
+  *                             If NULL, a new UText will be created (which may not be mutable).
+  *    @param   status         A reference to a UErrorCode to receive any errors.
+  *    @return                 A UText containing the results of the find and replace.
+  *                             If a pre-allocated UText was provided, it will always be used and returned.
+  *
+  *    @internal ICU 4.4 technology preview
+  */
+U_INTERNAL UText * U_EXPORT2 
+uregex_replaceFirstUText(URegularExpression *regexp,
+                         UText              *replacement,
+                         UText              *dest,
+                         UErrorCode         *status);
+

 /**
  *   Implements a replace operation intended to be used as part of an
@ -758,11 +929,40 @@ uregex_replaceFirst(URegularExpression  *regexp,
  */
 U_STABLE int32_t U_EXPORT2 
 uregex_appendReplacement(URegularExpression    *regexp,
-                  const UChar           *replacementText,
-                  int32_t                replacementLength,
-                  UChar                **destBuf,
-                  int32_t               *destCapacity,
-                  UErrorCode            *status);
+                         const UChar           *replacementText,
+                         int32_t                replacementLength,
+                         UChar                **destBuf,
+                         int32_t               *destCapacity,
+                         UErrorCode            *status);
+
+
+/**
+  *   Implements a replace operation intended to be used as part of an
+  *   incremental find-and-replace.
+  *
+  *   <p>The input string, starting from the end of the previous match and ending at
+  *   the start of the current match, is appended to the destination string.  Then the
+  *   replacement string is appended to the output string,
+  *   including handling any substitutions of captured text.</p>
+  *
+  *   <p>For simple, prepackaged, non-incremental find-and-replace
+  *      operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().</p>
+  *
+  *   @param   regexp      The regular expression object.  
+  *   @param   replacementText The string that will replace the matched portion of the
+  *                        input string as it is copied to the destination buffer.
+  *                        The replacement text may contain references ($1, for
+  *                        example) to capture groups from the match.
+  *   @param   dest        A mutable UText that will receive the result. Must not be NULL.
+  *   @param   status      A reference to a UErrorCode to receive any errors. 
+  *
+  *   @internal ICU 4.4 technology preview
+  */
+U_INTERNAL void U_EXPORT2 
+uregex_appendReplacementUText(URegularExpression    *regexp,
+                              UText                 *replacementText,
+                              UText                 *dest,
+                              UErrorCode            *status);


 /**
@ -794,7 +994,27 @@ uregex_appendTail(URegularExpression    *regexp,
                  UChar                **destBuf,
                  int32_t               *destCapacity,
                  UErrorCode            *status);
+                  

+/**
+  * As the final step in a find-and-replace operation, append the remainder
+  * of the input string, starting at the position following the last match,
+  * to the destination string. <code>uregex_appendTailUText()</code> is intended 
+  *  to be invoked after one or more invocations of the
+  *  <code>uregex_appendReplacementUText()</code> function.
+  *
+  *   @param   regexp      The regular expression object.  This is needed to
+  *                        obtain the input string and the position
+  *                        of the last match within it.
+  *   @param   dest        A mutable UText that will receive the result. Must not be NULL.
+  *   @param   status      A reference to a UErrorCode to receive any errors. 
+  *   @return              The destination UText.
+  *
+  *   @internal ICU 4.4 technology preview
+  */
+U_INTERNAL UText * U_EXPORT2 
+uregex_appendTailUText(URegularExpression    *regexp,
+                       UText                 *dest);



@ -808,6 +1028,22 @@ uregex_appendTail(URegularExpression    *regexp,
   *  buffer, and NUL terminated.  The position of each field within
   *  the destination buffer is returned in the destFields array.
   *
+   *  Note:  another choice for the design of this function would be to not
+   *         copy the resulting fields at all, but to return indexes and
+   *         lengths within the source text.  
+   *           Advantages would be
+   *             o  Faster.  No Copying.
+   *             o  Nothing extra needed when field data may contain embedded NUL chars.
+   *             o  Less memory needed if working on large data.
+   *           Disadvantages
+   *             o  Less consistent with C++ split, which copies into an
+   *                array of UnicodeStrings.
+   *             o  No NUL termination, extracted fields would be less convenient
+   *                to use in most cases.
+   *             o  Possible problems in the future, when support for Unicode Normalization
+   *                could cause the fields to not correspond exactly to
+   *                a range of the source text.
+   * 
   *    @param   regexp      The compiled regular expression.
   *    @param   destBuf     A (UChar *) buffer to receive the fields that
   *                         are extracted from the input string. These
@ -846,6 +1082,39 @@ uregex_split(   URegularExpression      *regexp,
                  UErrorCode            *status);


+  /**
+   * Split a string into fields.  Somewhat like split() from Perl.
+   * The pattern matches identify delimiters that separate the input
+   *  into fields.  The input data between the matches becomes the
+   *  fields themselves.
+   * <p>
+   * The behavior of this function is not very closely aligned with uregex_split();
+   * instead, it is based on (and implemented directly on top of) the C++ split method.
+   *
+   * @param regexp  The compiled regular expression.
+   * @param destFields  An array of mutable UText structs to receive the results of the split.
+   *                If a field is NULL, a new UText is allocated to contain the results for
+   *                that field. This new UText is not guaranteed to be mutable.
+   * @param destFieldsCapacity  The number of elements in the destination array.
+   *                If the number of fields found is less than destFieldsCapacity, the
+   *                extra strings in the destination array are not altered.
+   *                If the number of destination strings is less than the number
+   *                of fields, the trailing part of the input string, including any
+   *                field delimiters, is placed in the last destination string.
+   *                This behavior mimics that of Perl.  It is not an error condition, and no
+   *                error status is returned when all destFields positions are used.
+   * @param status  A reference to a UErrorCode to receive any errors.
+   * @return        The number of fields into which the input string was split.
+   *
+   * @internal ICU 4.4 technology preview
+   */
+U_INTERNAL int32_t U_EXPORT2 
+uregex_splitUText(URegularExpression    *regexp,
+                  UText                 *destFields[],
+                  int32_t                destFieldsCapacity,
+                  UErrorCode            *status);
+
+


 /**
--- a/icu4c/source/i18n/uregex.cpp
+++ b/icu4c/source/i18n/uregex.cpp
--- a/icu4c/source/test/cintltst/reapits.c
+++ b/icu4c/source/test/cintltst/reapits.c
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 2004-2009, International Business Machines Corporation and
+ * Copyright (c) 2004-2010, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /********************************************************************************
@ -26,6 +26,7 @@
 #include "unicode/uloc.h"
 #include "unicode/uregex.h"
 #include "unicode/ustring.h"
+#include "unicode/utext.h"
 #include "cintltst.h"

 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
@ -86,11 +87,34 @@ static void test_assert_string(const char *expected, const UChar *actual, UBool
 #define TEST_ASSERT_STRING(expected, actual, nulTerm) test_assert_string(expected, actual, nulTerm, __FILE__, __LINE__)
             

+/**
+ * Compare the complete contents of a UText against an expected string and
+ * log a test failure, including a dump of the actual text, if they differ.
+ *
+ * @param expected  NUL-terminated expected contents; it is opened as UTF-8.
+ * @param actual    The UText to check.  Its iteration position is reset to 0
+ *                  before the comparison and is moved by it.
+ * @param file      Source file of the calling assertion, used in the log message.
+ * @param line      Source line of the calling assertion, used in the log message.
+ */
+static void test_assert_utext(const char *expected, UText *actual, const char *file, int line) {
+    UErrorCode status = U_ZERO_ERROR;
+    UText expectedText = UTEXT_INITIALIZER;
+
+    /* Report a missing result as a test failure rather than dereferencing NULL. */
+    if (actual == NULL) {
+        log_err("Failure at file %s, line %d, expected \"%s\", got a NULL UText\n", file, line, expected);
+        return;
+    }
+
+    utext_openUTF8(&expectedText, expected, -1, &status);
+    if (U_FAILURE(status)) {
+        log_err("Failure at file %s, line %d, utext_openUTF8 failed with \"%s\"\n", file, line, u_errorName(status));
+        return;
+    }
+
+    /* utext_compare() starts at each text's current iteration position. */
+    utext_setNativeIndex(actual, 0);
+    if (utext_compare(&expectedText, -1, actual, -1) != 0) {
+        UChar32 c;
+        log_err("Failure at file %s, line %d, expected \"%s\", got \"", file, line, expected);
+        c = utext_next32From(actual, 0);
+        while (c != U_SENTINEL) {
+            /* Printable ASCII (space through '~') is shown as-is, anything else as hex. */
+            if (0x20 <= c && c <= 0x7e) {
+                log_err("%c", c);
+            } else {
+                log_err("%#x", c);
+            }
+            c = UTEXT_NEXT32(actual);
+        }
+        log_err("\"\n");
+    }
+
+    /* Release whatever the UTF-8 provider attached to the stack-based UText. */
+    utext_close(&expectedText);
+}
+
+#define TEST_ASSERT_UTEXT(expected, actual) test_assert_utext(expected, actual, __FILE__, __LINE__)



 static void TestRegexCAPI(void);
 static void TestBug4315(void);
+static void TestUTextAPI(void);

 void addURegexTest(TestNode** root);

@ -98,6 +122,7 @@ void addURegexTest(TestNode** root)
 {
    addTest(root, &TestRegexCAPI, "regex/TestRegexCAPI");
    addTest(root, &TestBug4315,   "regex/TestBug4315");
+    addTest(root, &TestUTextAPI,  "regex/TestUTextAPI");
 }

 /*
@ -1319,4 +1344,697 @@ static void TestBug4315(void) {
    uregex_close(theRegEx);
 }

+/* Based on TestRegexCAPI() */
+static void TestUTextAPI(void) {
+    UErrorCode           status = U_ZERO_ERROR;
+    URegularExpression  *re;
+    UText                patternText = UTEXT_INITIALIZER;
+    UChar                pat[200];
+
+    /* Minimalist open/close */
+    utext_openUTF8(&patternText, "abc*", -1, &status);
+    re = uregex_openUText(&patternText, 0, 0, &status);
+    if (U_FAILURE(status)) {
+         log_err("Failed to open regular expression, line %d, error is \"%s\"\n", __LINE__, u_errorName(status));
+         return;
+    }
+    uregex_close(re);
+
+    /* Open with all flag values set */
+    status = U_ZERO_ERROR;
+    re = uregex_openUText(&patternText, 
+        UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD,
+        0, &status);
+    TEST_ASSERT_SUCCESS(status);
+    uregex_close(re);
+
+    /* Open with an invalid flag */
+    status = U_ZERO_ERROR;
+    re = uregex_openUText(&patternText, 0x40000000, 0, &status);
+    TEST_ASSERT(status == U_REGEX_INVALID_FLAG);
+    uregex_close(re);
+
+    /* open with an invalid parameter */
+    status = U_ZERO_ERROR;
+    re = uregex_openUText(NULL,
+        UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD, 0, &status);
+    TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR && re == NULL);
+
+    /*
+     *  clone
+     *
+     *  Two clones are taken from the original and a third from a clone; the
+     *  original is closed, and the clones are then matched against their own
+     *  input strings.
+     */
+    {
+        URegularExpression *clone1;
+        URegularExpression *clone2;
+        URegularExpression *clone3;
+        UChar  testString1[30];
+        UChar  testString2[30];
+        UBool  result;
+
+
+        status = U_ZERO_ERROR;
+        re = uregex_openUText(&patternText, 0, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+        clone1 = uregex_clone(re, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(clone1 != NULL);
+
+        status = U_ZERO_ERROR;
+        clone2 = uregex_clone(re, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(clone2 != NULL);
+        /* The original is closed here; only the clones are used below. */
+        uregex_close(re);
+
+        status = U_ZERO_ERROR;
+        clone3 = uregex_clone(clone2, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(clone3 != NULL);
+
+        /* Bound each copy by its own destination buffer, not by the larger pat[]. */
+        u_uastrncpy(testString1, "abcccd", sizeof(testString1)/2);
+        u_uastrncpy(testString2, "xxxabcccd", sizeof(testString2)/2);
+
+        status = U_ZERO_ERROR;
+        uregex_setText(clone1, testString1, -1, &status);
+        TEST_ASSERT_SUCCESS(status);
+        result = uregex_lookingAt(clone1, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(result==TRUE);
+
+        /* The second input has a leading "xxx": lookingAt() fails, find() succeeds. */
+        status = U_ZERO_ERROR;
+        uregex_setText(clone2, testString2, -1, &status);
+        TEST_ASSERT_SUCCESS(status);
+        result = uregex_lookingAt(clone2, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(result==FALSE);
+        result = uregex_find(clone2, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(result==TRUE);
+
+        uregex_close(clone1);
+        uregex_close(clone2);
+        uregex_close(clone3);
+
+    }
+
+    /*
+     *  pattern() and patternText()
+     */
+    {
+        const UChar  *resultPat;
+        int32_t       resultLen;
+        UText        *resultText;
+        u_uastrncpy(pat, "hello", sizeof(pat)/2); /* for comparison */
+        status = U_ZERO_ERROR;
+        
+        utext_openUTF8(&patternText, "hello", -1, &status);
+        re = uregex_open(pat, -1, 0, NULL, &status);
+        resultPat = uregex_pattern(re, &resultLen, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* The TEST_ASSERT_SUCCESS above should change too... */
+        if (U_SUCCESS(status)) {
+            TEST_ASSERT(resultLen == -1);
+            TEST_ASSERT(u_strcmp(resultPat, pat) == 0);
+        }
+        
+        resultText = uregex_patternUText(re, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_UTEXT("hello", resultText);
+
+        uregex_close(re);
+
+        status = U_ZERO_ERROR;
+        re = uregex_open(pat, 3, 0, NULL, &status);
+        resultPat = uregex_pattern(re, &resultLen, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* The TEST_ASSERT_SUCCESS above should change too... */
+        if (U_SUCCESS(status)) {
+            TEST_ASSERT(resultLen == 3);
+            TEST_ASSERT(u_strncmp(resultPat, pat, 3) == 0);
+            TEST_ASSERT(u_strlen(resultPat) == 3);
+        }
+        
+        resultText = uregex_patternUText(re, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_UTEXT("hel", resultText);
+
+        uregex_close(re);
+    }
+
+    /*
+     *  setUText() and lookingAt()
+     */
+    {
+        UText  text1 = UTEXT_INITIALIZER;
+        UText  text2 = UTEXT_INITIALIZER;
+        UBool  result;
+
+        status = U_ZERO_ERROR;
+        utext_openUTF8(&text1, "abcccd", -1, &status);
+        utext_openUTF8(&text2, "abcccxd", -1, &status);
+        
+        utext_openUTF8(&patternText, "abc*d", -1, &status);
+        re = uregex_openUText(&patternText, 0, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* Operation before doing a setText should fail... */
+        status = U_ZERO_ERROR;
+        uregex_lookingAt(re, 0, &status);
+        TEST_ASSERT( status== U_REGEX_INVALID_STATE);
+
+        status = U_ZERO_ERROR;
+        uregex_setUText(re, &text1, &status);
+        result = uregex_lookingAt(re, 0, &status);
+        TEST_ASSERT(result == TRUE);
+        TEST_ASSERT_SUCCESS(status);
+
+        status = U_ZERO_ERROR;
+        uregex_setUText(re, &text2, &status);
+        result = uregex_lookingAt(re, 0, &status);
+        TEST_ASSERT(result == FALSE);
+        TEST_ASSERT_SUCCESS(status);
+
+        status = U_ZERO_ERROR;
+        uregex_setUText(re, &text1, &status);
+        result = uregex_lookingAt(re, 0, &status);
+        TEST_ASSERT(result == TRUE);
+        TEST_ASSERT_SUCCESS(status);
+
+        uregex_close(re);
+        utext_close(&text1);
+        utext_close(&text2);
+    }
+
+
+    /*
+     *  getText() and getUText()
+     *
+     *  The input is set first as a UText and then as a UChar buffer; in both
+     *  cases the text read back through uregex_getUText() is compared with
+     *  the text that was set.
+     */
+    {
+        UText  text1 = UTEXT_INITIALIZER;
+        UText  text2 = UTEXT_INITIALIZER;
+        UChar  text2Chars[20];
+        UText  *resultText;
+        const UChar   *result;
+        int32_t  textLength;
+
+        status = U_ZERO_ERROR;
+        utext_openUTF8(&text1, "abcccd", -1, &status);
+        /* The bound is the UChar buffer's capacity, not the size of the UText struct. */
+        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2Chars)/2);
+        utext_openUChars(&text2, text2Chars, -1, &status);
+        
+        utext_openUTF8(&patternText, "abc*d", -1, &status);
+        re = uregex_openUText(&patternText, 0, NULL, &status);
+
+        /* First set a UText */
+        uregex_setUText(re, &text1, &status);
+        resultText = uregex_getUText(re, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(resultText != &text1);
+        /* utext_compare() starts at the current iteration positions, so rewind both. */
+        utext_setNativeIndex(resultText, 0);
+        utext_setNativeIndex(&text1, 0);
+        TEST_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
+        utext_close(resultText);
+        
+        result = uregex_getText(re, &textLength, &status); /* flattens UText into buffer */
+        TEST_ASSERT(textLength == -1 || textLength == 6);
+        resultText = uregex_getUText(re, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(resultText != &text1);
+        utext_setNativeIndex(resultText, 0);
+        utext_setNativeIndex(&text1, 0);
+        TEST_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
+        utext_close(resultText);
+
+        /* Then set a UChar * */
+        uregex_setText(re, text2Chars, 7, &status);
+        resultText = uregex_getUText(re, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        utext_setNativeIndex(resultText, 0);
+        utext_setNativeIndex(&text2, 0);
+        TEST_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
+        utext_close(resultText);
+        result = uregex_getText(re, &textLength, &status);
+        TEST_ASSERT(textLength == 7);
+        
+        uregex_close(re);
+        utext_close(&text1);
+        utext_close(&text2);
+    }
+
+    /*
+     *  matches()
+     */
+    {
+        UText   text1 = UTEXT_INITIALIZER;
+        UBool   result;
+        UText   nullText = UTEXT_INITIALIZER;
+
+        status = U_ZERO_ERROR;
+        utext_openUTF8(&text1, "abcccde", -1, &status);
+        utext_openUTF8(&patternText, "abc*d", -1, &status);
+        re = uregex_openUText(&patternText, 0, NULL, &status);
+
+        uregex_setUText(re, &text1, &status);
+        result = uregex_matches(re, 0, &status);
+        TEST_ASSERT(result == FALSE);
+        TEST_ASSERT_SUCCESS(status);
+        uregex_close(re);
+
+        status = U_ZERO_ERROR;
+        re = uregex_openC(".?", 0, NULL, &status);
+        uregex_setUText(re, &text1, &status);
+        result = uregex_matches(re, 7, &status);
+        TEST_ASSERT(result == TRUE);
+        TEST_ASSERT_SUCCESS(status);
+
+        status = U_ZERO_ERROR;
+        utext_openUTF8(&nullText, "", -1, &status);
+        uregex_setUText(re, &nullText, &status);
+        TEST_ASSERT_SUCCESS(status);
+        result = uregex_matches(re, 0, &status);
+        TEST_ASSERT(result == TRUE);
+        TEST_ASSERT_SUCCESS(status);
+        
+        uregex_close(re);
+        utext_close(&text1);
+        utext_close(&nullText);
+    }
+
+
+    /*
+     *  lookingAt()    Used in setText test.
+     */
+
+
+    /*
+     *  find(), findNext, start, end, reset
+     */
+    {
+        UChar    text1[50];
+        UBool    result;
+        u_uastrncpy(text1, "012rx5rx890rxrx...",  sizeof(text1)/2);
+        status = U_ZERO_ERROR;
+        re = uregex_openC("rx", 0, NULL, &status);
+
+        uregex_setText(re, text1, -1, &status);
+        result = uregex_find(re, 0, &status);
+        TEST_ASSERT(result == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 3);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 5);
+        TEST_ASSERT_SUCCESS(status);
+
+        result = uregex_find(re, 9, &status);
+        TEST_ASSERT(result == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 11);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 13);
+        TEST_ASSERT_SUCCESS(status);
+
+        result = uregex_find(re, 14, &status);
+        TEST_ASSERT(result == FALSE);
+        TEST_ASSERT_SUCCESS(status);
+
+        status = U_ZERO_ERROR;
+        uregex_reset(re, 0, &status);
+
+        result = uregex_findNext(re, &status);
+        TEST_ASSERT(result == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 3);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 5);
+        TEST_ASSERT_SUCCESS(status);
+
+        result = uregex_findNext(re, &status);
+        TEST_ASSERT(result == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 6);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 8);
+        TEST_ASSERT_SUCCESS(status);
+
+        status = U_ZERO_ERROR;
+        uregex_reset(re, 12, &status);
+
+        result = uregex_findNext(re, &status);
+        TEST_ASSERT(result == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 13);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 15);
+        TEST_ASSERT_SUCCESS(status);
+
+        result = uregex_findNext(re, &status);
+        TEST_ASSERT(result == FALSE);
+        TEST_ASSERT_SUCCESS(status);
+
+        uregex_close(re);
+    }
+
+    /*
+     *  group()
+     */
+    {
+        UChar    text1[80];
+        UText   *actual;
+        UBool    result;
+        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
+
+        status = U_ZERO_ERROR;
+        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        uregex_setText(re, text1, -1, &status);
+        result = uregex_find(re, 0, &status);
+        TEST_ASSERT(result==TRUE);
+
+        /*  Capture Group 0, the full match.  Should succeed.  */
+        status = U_ZERO_ERROR;
+        actual = uregex_groupUText(re, 0, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_UTEXT("abc interior def", actual);
+        utext_close(actual);
+
+        /*  Capture group #1.  Should succeed. */
+        status = U_ZERO_ERROR;
+        actual = uregex_groupUText(re, 1, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_UTEXT(" interior ", actual);
+        utext_close(actual);
+
+        /*  Capture group out of range.  Error. */
+        status = U_ZERO_ERROR;
+        actual = uregex_groupUText(re, 2, NULL, &status);
+        TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
+        TEST_ASSERT(utext_nativeLength(actual) == 0);
+        utext_close(actual);
+
+        uregex_close(re);
+
+    }
+    
+    /*
+     *  replaceFirst()
+     */
+    {
+        UChar    text1[80];
+        UChar    text2[80];
+        UText    replText = UTEXT_INITIALIZER;
+        UText   *result;
+        
+        status = U_ZERO_ERROR;
+        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
+        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
+        utext_openUTF8(&replText, "<$1>", -1, &status);
+
+        re = uregex_openC("x(.*?)x", 0, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /*  Normal case, with match */
+        uregex_setText(re, text1, -1, &status);
+        result = uregex_replaceFirstUText(re, &replText, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_UTEXT("Replace <aa> x1x x...x.", result);
+        utext_close(result);
+
+        /* No match.  Text should copy to output with no changes.  */
+        uregex_setText(re, text2, -1, &status);
+        result = uregex_replaceFirstUText(re, &replText, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_UTEXT("No match here.", result);
+        utext_close(result);
+        
+        /* Unicode escapes */
+        uregex_setText(re, text1, -1, &status);
+        utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
+        result = uregex_replaceFirstUText(re, &replText, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result);
+        utext_close(result);
+
+        uregex_close(re);
+        utext_close(&replText);
+    }
+
+
+    /*
+     *  replaceAll()
+     */
+    {
+        UChar    text1[80];
+        UChar    text2[80];
+        UText    replText = UTEXT_INITIALIZER;
+        UText   *result;
+
+        status = U_ZERO_ERROR;
+        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
+        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
+        utext_openUTF8(&replText, "<$1>", -1, &status);
+
+        re = uregex_openC("x(.*?)x", 0, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /*  Normal case, with match */
+        uregex_setText(re, text1, -1, &status);
+        result = uregex_replaceAllUText(re, &replText, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_UTEXT("Replace <aa> <1> <...>.", result);
+        utext_close(result);
+
+        /* No match.  Text should copy to output with no changes.  */
+        uregex_setText(re, text2, -1, &status);
+        result = uregex_replaceAllUText(re, &replText, NULL, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_UTEXT("No match here.", result);
+        utext_close(result);
+
+        uregex_close(re);
+        utext_close(&replText);
+    }
+
+
+    /*
+     *  appendReplacement()
+     */
+    {
+        UChar    text[100];
+        UChar    repl[100];
+        UChar    buf[100];
+        UChar   *bufPtr;
+        int32_t  bufCap;
+
+
+        status = U_ZERO_ERROR;
+        re = uregex_openC(".*", 0, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        u_uastrncpy(text, "whatever",  sizeof(text)/2);
+        u_uastrncpy(repl, "some other", sizeof(repl)/2);
+        uregex_setText(re, text, -1, &status);
+
+        /* match covers whole target string */
+        uregex_find(re, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+        bufPtr = buf;
+        bufCap = sizeof(buf) / 2;
+        uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_STRING("some other", buf, TRUE);
+
+        /* Match has \u \U escapes */
+        uregex_find(re, 0, &status);
+        TEST_ASSERT_SUCCESS(status);
+        bufPtr = buf;
+        bufCap = sizeof(buf) / 2;
+        u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", sizeof(repl)/2);
+        uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE); 
+
+        uregex_close(re);
+    }
+
+
+    /*
+     *  appendReplacement(), appendTail() checked in replaceFirst(), replaceAll().
+     */
+
+    /*
+     *  splitUText()
+     *
+     *  Every slot passed in as NULL is filled by uregex_splitUText() with a
+     *  newly created UText; those are closed here once they have been checked.
+     */
+    {
+        UChar    textToSplit[80];
+        UText    *fields[10];
+        int32_t  numFields;
+        int32_t  i;
+
+        u_uastrncpy(textToSplit, "first : second:  third",  sizeof(textToSplit)/2);
+
+        status = U_ZERO_ERROR;
+        re = uregex_openC(":", 0, NULL, &status);
+
+
+        /*  Simple split */ 
+
+        uregex_setText(re, textToSplit, -1, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* The TEST_ASSERT_SUCCESS call above should change too... */
+        if (U_SUCCESS(status)) {
+            memset(fields, 0, sizeof(fields));
+            numFields = uregex_splitUText(re, fields, 10, &status);
+            TEST_ASSERT_SUCCESS(status);
+
+            /* The TEST_ASSERT_SUCCESS call above should change too... */
+            if(U_SUCCESS(status)) {
+                TEST_ASSERT(numFields == 3);
+                TEST_ASSERT_UTEXT("first ",  fields[0]);
+                TEST_ASSERT_UTEXT(" second", fields[1]);
+                TEST_ASSERT_UTEXT("  third", fields[2]);
+                TEST_ASSERT(fields[3] == NULL);
+            }
+
+            /* Release the UTexts created for the returned fields. */
+            for (i = 0; i < numFields; i++) {
+                utext_close(fields[i]);
+            }
+        }
+
+        uregex_close(re);
+
+    
+        /*  Split with too few output strings available */
+        status = U_ZERO_ERROR;
+        re = uregex_openC(":", 0, NULL, &status);
+        uregex_setText(re, textToSplit, -1, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* The TEST_ASSERT_SUCCESS call above should change too... */
+        if(U_SUCCESS(status)) {
+            fields[0] = NULL;
+            fields[1] = NULL;
+            /* Sentinel just past the stated capacity; it must come back untouched. */
+            fields[2] = &patternText;
+            numFields = uregex_splitUText(re, fields, 2, &status);
+            TEST_ASSERT_SUCCESS(status);
+
+            /* The TEST_ASSERT_SUCCESS call above should change too... */
+            if(U_SUCCESS(status)) {
+                TEST_ASSERT(numFields == 2);
+                TEST_ASSERT_UTEXT("first ",  fields[0]);
+                TEST_ASSERT_UTEXT(" second:  third", fields[1]);
+                TEST_ASSERT(fields[2] == &patternText);
+            }
+
+            /* Only the returned fields are closed; the sentinel slot is not among them. */
+            for (i = 0; i < numFields; i++) {
+                utext_close(fields[i]);
+            }
+        }
+
+        uregex_close(re);
+    }
+
+    /* splitUText(), part 2.  Patterns with capture groups.  The capture group text
+     *                   comes out as additional fields.
+     *
+     * In each case the slot just past the stated capacity holds &patternText as a
+     * sentinel that must come back untouched.  The UTexts created by each split
+     * are closed before the slots are reused. */
+    {
+        UChar    textToSplit[80];
+        UText    *fields[10];
+        int32_t  numFields;
+        int32_t  i;
+
+        u_uastrncpy(textToSplit, "first <tag-a> second<tag-b>  third",  sizeof(textToSplit)/2);
+
+        status = U_ZERO_ERROR;
+        re = uregex_openC("<(.*?)>", 0, NULL, &status);
+
+        uregex_setText(re, textToSplit, -1, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* The TEST_ASSERT_SUCCESS call above should change too... */
+        if(U_SUCCESS(status)) {
+            memset(fields, 0, sizeof(fields));
+            numFields = uregex_splitUText(re, fields, 10, &status);
+            TEST_ASSERT_SUCCESS(status);
+
+            /* The TEST_ASSERT_SUCCESS call above should change too... */
+            if(U_SUCCESS(status)) {
+                TEST_ASSERT(numFields == 5);
+                TEST_ASSERT_UTEXT("first ",  fields[0]);
+                TEST_ASSERT_UTEXT("tag-a",   fields[1]);
+                TEST_ASSERT_UTEXT(" second", fields[2]);
+                TEST_ASSERT_UTEXT("tag-b",   fields[3]);
+                TEST_ASSERT_UTEXT("  third", fields[4]);
+                TEST_ASSERT(fields[5] == NULL);
+            }
+
+            for (i = 0; i < numFields; i++) {
+                utext_close(fields[i]);
+            }
+        }
+    
+        /*  Split with too few output strings available (2) */
+        status = U_ZERO_ERROR;
+        fields[0] = NULL;
+        fields[1] = NULL;
+        fields[2] = &patternText;
+        numFields = uregex_splitUText(re, fields, 2, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* The TEST_ASSERT_SUCCESS call above should change too... */
+        if(U_SUCCESS(status)) {
+            TEST_ASSERT(numFields == 2);
+            TEST_ASSERT_UTEXT("first ",  fields[0]);
+            TEST_ASSERT_UTEXT(" second<tag-b>  third", fields[1]);
+            TEST_ASSERT(fields[2] == &patternText);
+        }
+
+        for (i = 0; i < numFields; i++) {
+            utext_close(fields[i]);
+        }
+
+        /*  Split with too few output strings available (3) */
+        status = U_ZERO_ERROR;
+        fields[0] = NULL;
+        fields[1] = NULL;
+        fields[2] = NULL;
+        fields[3] = &patternText;
+        numFields = uregex_splitUText(re, fields, 3, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* The TEST_ASSERT_SUCCESS call above should change too... */
+        if(U_SUCCESS(status)) {
+            TEST_ASSERT(numFields == 3);
+            TEST_ASSERT_UTEXT("first ",  fields[0]);
+            TEST_ASSERT_UTEXT("tag-a",   fields[1]);
+            TEST_ASSERT_UTEXT(" second<tag-b>  third", fields[2]);
+            TEST_ASSERT(fields[3] == &patternText);
+        }
+
+        for (i = 0; i < numFields; i++) {
+            utext_close(fields[i]);
+        }
+
+        /*  Split with just enough output strings available (5) */
+        status = U_ZERO_ERROR;
+        fields[0] = NULL;
+        fields[1] = NULL;
+        fields[2] = NULL;
+        fields[3] = NULL;
+        fields[4] = NULL;
+        fields[5] = &patternText;
+        numFields = uregex_splitUText(re, fields, 5, &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* The TEST_ASSERT_SUCCESS call above should change too... */
+        if(U_SUCCESS(status)) {
+            TEST_ASSERT(numFields == 5);
+            TEST_ASSERT_UTEXT("first ",  fields[0]);
+            TEST_ASSERT_UTEXT("tag-a",   fields[1]);
+            TEST_ASSERT_UTEXT(" second", fields[2]);
+            TEST_ASSERT_UTEXT("tag-b",   fields[3]);
+            TEST_ASSERT_UTEXT("  third", fields[4]);
+            TEST_ASSERT(fields[5] == &patternText);
+        }
+
+        for (i = 0; i < numFields; i++) {
+            utext_close(fields[i]);
+        }
+
+        /* Split, end of text is a field delimiter.   */
+        status = U_ZERO_ERROR;
+        uregex_setText(re, textToSplit, (int32_t)strlen("first <tag-a> second<tag-b>"), &status);
+        TEST_ASSERT_SUCCESS(status);
+
+        /* The TEST_ASSERT_SUCCESS call above should change too... */
+        if(U_SUCCESS(status)) {
+            memset(fields, 0, sizeof(fields));
+            fields[9] = &patternText;
+            numFields = uregex_splitUText(re, fields, 9, &status);
+            TEST_ASSERT_SUCCESS(status);
+
+            /* The TEST_ASSERT_SUCCESS call above should change too... */
+            if(U_SUCCESS(status)) {
+                TEST_ASSERT(numFields == 4);
+                TEST_ASSERT_UTEXT("first ",  fields[0]);
+                TEST_ASSERT_UTEXT("tag-a",   fields[1]);
+                TEST_ASSERT_UTEXT(" second", fields[2]);
+                TEST_ASSERT_UTEXT("tag-b",   fields[3]);
+                TEST_ASSERT(fields[4] == NULL);
+                TEST_ASSERT(fields[8] == NULL);
+                TEST_ASSERT(fields[9] == &patternText);
+            }
+
+            for (i = 0; i < numFields; i++) {
+                utext_close(fields[i]);
+            }
+        }
+
+        uregex_close(re);
+
+        /* patternText was opened at function scope and is not used after this
+         * point; close it so its provider storage is released. */
+        utext_close(&patternText);
+    }
+}
+
 #endif   /*  !UCONFIG_NO_REGULAR_EXPRESSIONS */
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
--- a/icu4c/source/test/intltest/utxttest.cpp
+++ b/icu4c/source/test/intltest/utxttest.cpp
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 2005-2009, International Business Machines Corporation and
+ * Copyright (c) 2005-2010, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /************************************************************************
@ -58,6 +58,8 @@ UTextTest::runIndexedTest(int32_t index, UBool exec,
            if (exec) Ticket5560();  break;
        case 4: name = "Ticket6847";
            if (exec) Ticket6847();  break;
+        case 5: name = "ComparisonTest";
+            if (exec) ComparisonTest(); break;
        default: name = "";          break;
    }
 }
@ -836,6 +838,476 @@ void UTextTest::TestAccessNoClone(const UnicodeString &us, UText *ut, int cpCoun
 }


+//
+//  ComparisonTest()    Check the string comparison functions. Based on UnicodeStringTest::TestCompare()
+//
+//      Exercises utext_compare(), utext_compareNativeLimit(), utext_caseCompare() and
+//      utext_caseCompareNativeLimit() on UTexts opened over a UnicodeString, a UChar
+//      array and a UTF-8 char array.
+//
+//      A comparison begins at the current iteration position of each UText and leaves
+//      the position after the last character compared, so every non-empty operand is
+//      repositioned with UTEXT_SETNATIVEINDEX immediately before each call.
+//
+//      Failures are reported through errln(); nothing is returned.
+//
+void UTextTest::ComparisonTest()
+{
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString   test1Str("this is a test");
+    UnicodeString   test2Str("this is a test");
+    UnicodeString   test3Str("this is a test of the emergency broadcast system");
+    UnicodeString   test4Str("never say, \"this is a test\"!!");
+
+    UText test1 = UTEXT_INITIALIZER;
+    UText test2 = UTEXT_INITIALIZER;
+    UText test3 = UTEXT_INITIALIZER;
+    UText test4 = UTEXT_INITIALIZER;
+
+    // "this is a test" spelled out as UTF-16 code units and as bytes.
+    UChar        uniChars[] = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
+                                0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0 };
+    char            chars[] = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73,
+                                0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0 };
+
+    UText uniCharText = UTEXT_INITIALIZER;
+    UText charText = UTEXT_INITIALIZER;
+
+    utext_openUnicodeString(&test1, &test1Str, &status);
+    utext_openUnicodeString(&test2, &test2Str, &status);
+    utext_openUnicodeString(&test3, &test3Str, &status);
+    utext_openUnicodeString(&test4, &test4Str, &status);
+
+    utext_openUChars(&uniCharText, uniChars, -1, &status);
+    utext_openUTF8(&charText, chars, -1, &status);
+
+    TEST_SUCCESS(status);
+
+    // test utext_compare(), simple
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, -1, &test2, -1) != 0) errln("utext_compare() failed, simple setup");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test1, -1, &test3, -1) >= 0) errln("utext_compare() failed, simple setup");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compare(&test1, -1, &test4, -1) <= 0) errln("utext_compare() failed, simple setup");
+
+    // test utext_compareNativeLimit(), simple
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, -1, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, simple setup");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test1, -1, &test3, -1) >= 0) errln("utext_compareNativeLimit() failed, simple setup");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compareNativeLimit(&test1, -1, &test4, -1) <= 0) errln("utext_compareNativeLimit() failed, simple setup");
+
+    // test utext_compare(), one explicit length
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length");
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test3, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length");
+    // test4 holds "this is a test" starting at native index 12.
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 12);
+    if (utext_compare(&test4, 14, &test2, -1) != 0) errln("utext_compare() failed, one explicit length and offset");
+    // test2 was advanced by the previous comparison; rewind it so the 18-character
+    // prefix of test3 is compared against the whole of test2, not its empty tail.
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test3, 18, &test2, -1) <= 0) errln("utext_compare() failed, one explicit length");
+
+    // test utext_compareNativeLimit(), one explicit length
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, 14, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length");
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test3, 14, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length");
+    // Same substring of test4 as above, expressed as the native limit 26 (= 12 + 14).
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 12);
+    if (utext_compareNativeLimit(&test4, 26, &test2, -1) != 0) errln("utext_compareNativeLimit() failed, one explicit length and limit");
+    // As in the utext_compare() case, test2 must be rewound before it is reused.
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test3, 18, &test2, -1) <= 0) errln("utext_compareNativeLimit() failed, one explicit length");
+
+    // test utext_compare(), UChar-based UText
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test2, -1, &uniCharText, -1) != 0) errln("utext_compare() failed, UChar-based UText");
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test3, -1, &uniCharText, -1) <= 0) errln("utext_compare() failed, UChar-based UText");
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compare(&test4, -1, &uniCharText, -1) >= 0) errln("utext_compare() failed, UChar-based UText");
+
+    // test utext_compareNativeLimit(), UChar-based UText
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test2, -1, &uniCharText, -1) != 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test3, -1, &uniCharText, -1) <= 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
+    UTEXT_SETNATIVEINDEX(&uniCharText, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compareNativeLimit(&test4, -1, &uniCharText, -1) >= 0) errln("utext_compareNativeLimit() failed, UChar-based UText");
+
+    // test utext_compare(), UTF8-based UText
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test2, -1, &charText, -1) != 0) errln("utext_compare() failed, UTF8-based UText");
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test3, -1, &charText, -1) <= 0) errln("utext_compare() failed, UTF8-based UText");
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compare(&test4, -1, &charText, -1) >= 0) errln("utext_compare() failed, UTF8-based UText");
+
+    // test utext_compareNativeLimit(), UTF8-based UText
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test2, -1, &charText, -1) != 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test3, -1, &charText, -1) <= 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
+    UTEXT_SETNATIVEINDEX(&charText, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 0);
+    if (utext_compareNativeLimit(&test4, -1, &charText, -1) >= 0) errln("utext_compareNativeLimit() failed, UTF8-based UText");
+
+    // test utext_compare(), length
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, -1, &test2, 4) != 0) errln("utext_compare() failed, one length");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, 5, &test2, 4) <= 0) errln("utext_compare() failed, both lengths");
+
+    // test utext_compareNativeLimit(), limit
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, -1, &test2, 4) != 0) errln("utext_compareNativeLimit() failed, one limit");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, 5, &test2, 4) <= 0) errln("utext_compareNativeLimit() failed, both limits");
+
+    // test utext_compare(), both explicit offsets and lengths
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, 14, &test2, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compare(&test1, 14, &test3, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 12);
+    if (utext_compare(&test1, 14, &test4, 14) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    // "test" (test1 from 10) against "this" (test2 from 0): expected negative.
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compare(&test1, 4, &test2, 4) >= 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    // "test" against "emergency" (test3 from 22, 9 characters): expected positive.
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test3, 22);
+    if (utext_compare(&test1, 4, &test3, 9) <= 0) errln("utext_compare() failed, both explicit offsets and lengths");
+    // "test" against "test" (test4 from 22): expected equal.
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test4, 22);
+    if (utext_compare(&test1, 4, &test4, 4) != 0) errln("utext_compare() failed, both explicit offsets and lengths");
+
+    // test utext_compareNativeLimit(), both explicit offsets and limits
+    // (same substrings as the section above, with lengths rewritten as native limits)
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, 14, &test2, 14) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test3, 0);
+    if (utext_compareNativeLimit(&test1, 14, &test3, 14) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 0);
+    UTEXT_SETNATIVEINDEX(&test4, 12);
+    if (utext_compareNativeLimit(&test1, 14, &test4, 26) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test2, 0);
+    if (utext_compareNativeLimit(&test1, 14, &test2, 4) >= 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test3, 22);
+    if (utext_compareNativeLimit(&test1, 14, &test3, 31) <= 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+    UTEXT_SETNATIVEINDEX(&test1, 10);
+    UTEXT_SETNATIVEINDEX(&test4, 22);
+    if (utext_compareNativeLimit(&test1, 14, &test4, 26) != 0) errln("utext_compareNativeLimit() failed, both explicit offsets and limits");
+
+    // The shared fixtures are not used past this point. Close them, mirroring the
+    // utext_close() calls the later sections make for their own UTexts.
+    utext_close(&test1);
+    utext_close(&test2);
+    utext_close(&test3);
+    utext_close(&test4);
+    utext_close(&uniCharText);
+    utext_close(&charText);
+
+    /* test caseCompare() */
+    {
+        static const UChar
+        _mixed[]=               { 0x61, 0x42, 0x131, 0x3a3, 0xdf,       0x130,       0x49,  0xfb03,           0xd93f, 0xdfff, 0 },
+        _otherDefault[]=        { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x69, 0x307, 0x69,  0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
+        _otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x69,        0x131, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
+        _different[]=           { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x130,       0x49,  0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
+
+        UText
+        mixed = UTEXT_INITIALIZER,
+        otherDefault = UTEXT_INITIALIZER,
+        otherExcludeSpecialI = UTEXT_INITIALIZER,
+        different = UTEXT_INITIALIZER;
+
+        utext_openUChars(&mixed, _mixed, -1, &status);
+        utext_openUChars(&otherDefault, _otherDefault, -1, &status);
+        utext_openUChars(&otherExcludeSpecialI, _otherExcludeSpecialI, -1, &status);
+        utext_openUChars(&different, _different, -1, &status);
+
+        TEST_SUCCESS(status);
+
+        // The messages below print the result with %ld, so it is widened to long
+        // at each call to keep the variadic argument in step with the format.
+        int32_t result;
+
+        /* test default options */
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherDefault, 0);
+        result = utext_caseCompare(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (other, default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherDefault, 0);
+        result = utext_caseCompareNativeLimit(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (other, default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        /* test excluding special I */
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherExcludeSpecialI, 0);
+        result = utext_caseCompare(&mixed, -1, &otherExcludeSpecialI, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (otherExcludeSpecialI, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherExcludeSpecialI, 0);
+        result = utext_caseCompareNativeLimit(&mixed, -1, &otherExcludeSpecialI, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (otherExcludeSpecialI, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherDefault, 0);
+        result = utext_caseCompare(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
+        if (0 == result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (other, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be nonzero) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&otherDefault, 0);
+        result = utext_caseCompareNativeLimit(&mixed, -1, &otherDefault, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &status);
+        if (0 == result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (other, U_FOLD_CASE_EXCLUDE_SPECIAL_I) gives %ld (should be nonzero) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        /* test against different string */
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&different, 0);
+        result = utext_caseCompare(&mixed, -1, &different, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 >= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (different, default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 0);
+        UTEXT_SETNATIVEINDEX(&different, 0);
+        result = utext_caseCompareNativeLimit(&mixed, -1, &different, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 >= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (different, default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        /* test caseCompare() - include the folded sharp s (U+00df) with different lengths */
+        UTEXT_SETNATIVEINDEX(&mixed, 1);
+        UTEXT_SETNATIVEINDEX(&different, 1);
+        result = utext_caseCompare(&mixed, 4, &different, 5, U_FOLD_CASE_DEFAULT, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (mixed[1-5), different[1-6), default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 1);
+        UTEXT_SETNATIVEINDEX(&different, 1);
+        result = utext_caseCompareNativeLimit(&mixed, 5, &different, 6, U_FOLD_CASE_DEFAULT, &status);
+        if (0 != result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (mixed[1-5), different[1-6), default) gives %ld (should be 0) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        /* test caseCompare() - stop in the middle of the sharp s (U+00df) */
+        UTEXT_SETNATIVEINDEX(&mixed, 1);
+        UTEXT_SETNATIVEINDEX(&different, 1);
+        result = utext_caseCompare(&mixed, 4, &different, 4, U_FOLD_CASE_DEFAULT, &status);
+        if (0 >= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare (mixed[1-5), different[1-5), default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
+        }
+        UTEXT_SETNATIVEINDEX(&mixed, 1);
+        UTEXT_SETNATIVEINDEX(&different, 1);
+        result = utext_caseCompareNativeLimit(&mixed, 5, &different, 5, U_FOLD_CASE_DEFAULT, &status);
+        if (0 >= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompareNativeLimit (mixed[1-5), different[1-5), default) gives %ld (should be positive) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        utext_close(&mixed);
+        utext_close(&otherDefault);
+        utext_close(&otherExcludeSpecialI);
+        utext_close(&different);
+    }
+
+    /* test surrogates in comparison */
+    {
+        // _before has an unpaired lead surrogate followed by a surrogate pair;
+        // _after has a surrogate pair in the same place. Comparisons start at index 1.
+        static const UChar
+        _before[] = { 0x65, 0xd800, 0xd800, 0xdc01, 0x65, 0x00 },
+        _after[]  = { 0x65, 0xd800, 0xdc00, 0x65, 0x00 };
+
+        UText
+        before = UTEXT_INITIALIZER,
+        after  = UTEXT_INITIALIZER;
+
+        utext_openUChars(&before, _before, -1, &status);
+        utext_openUChars(&after, _after, -1, &status);
+
+        TEST_SUCCESS(status);
+        int32_t result;
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_compare(&before, -1, &after, -1);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_compare ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_compare(&before, 3, &after, 3);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_compare with lengths ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_caseCompare(&before, -1, &after, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_caseCompare(&before, 3, &after, 3, U_FOLD_CASE_DEFAULT, &status);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare with lengths ({ 65, d800, 10001, 65 }, { 65, 10000, 65 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        utext_close(&before);
+        utext_close(&after);
+    }
+
+    /* test surrogates at end of string */
+    {
+        // Same data as the previous section without the trailing 0x65.
+        static const UChar
+        _before[] = { 0x65, 0xd800, 0xd800, 0xdc01, 0x00 },
+        _after[]  = { 0x65, 0xd800, 0xdc00, 0x00 };
+
+        UText
+        before = UTEXT_INITIALIZER,
+        after  = UTEXT_INITIALIZER;
+
+        utext_openUChars(&before, _before, -1, &status);
+        utext_openUChars(&after, _after, -1, &status);
+
+        TEST_SUCCESS(status);
+        int32_t result;
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_compare(&before, -1, &after, -1);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_compare ({ 65, d800, 10001 }, { 65, 10000 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        UTEXT_SETNATIVEINDEX(&before, 1);
+        UTEXT_SETNATIVEINDEX(&after, 1);
+        result = utext_caseCompare(&before, -1, &after, -1, U_FOLD_CASE_DEFAULT, &status);
+        if (0 <= result || U_FAILURE(status)) {
+            errln("error: utext_caseCompare ({ 65, d800, 10001 }, { 65, 10000 }) gives %ld (should be negative) (%s)\n", (long)result, u_errorName(status));
+        }
+
+        utext_close(&before);
+        utext_close(&after);
+    }
+
+    /* test empty strings */
+    {
+        // Four flavours of "nothing": a lone terminator in each encoding, and a
+        // NULL pointer with length 0 in each encoding. All pairs must compare equal.
+        UChar zero16 = 0;
+        char zero8 = 0;
+        UText emptyUChar = UTEXT_INITIALIZER;
+        UText emptyUTF8 = UTEXT_INITIALIZER;
+        UText nullUChar = UTEXT_INITIALIZER;
+        UText nullUTF8 = UTEXT_INITIALIZER;
+
+        utext_openUChars(&emptyUChar, &zero16, -1, &status);
+        utext_openUTF8(&emptyUTF8, &zero8, -1, &status);
+        utext_openUChars(&nullUChar, NULL, 0, &status);
+        utext_openUTF8(&nullUTF8, NULL, 0, &status);
+
+        // Report a failed open explicitly, as the other sections do.
+        TEST_SUCCESS(status);
+
+        if (utext_compare(&emptyUChar, -1, &emptyUTF8, -1) != 0) {
+            errln("error: utext_compare(&emptyUChar, -1, &emptyUTF8, -1) != 0");
+        }
+        if (utext_compare(&emptyUChar, -1, &nullUChar, -1) != 0) {
+            errln("error: utext_compare(&emptyUChar, -1, &nullUChar, -1) != 0");
+        }
+        if (utext_compare(&emptyUChar, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compare(&emptyUChar, -1, &nullUTF8, -1) != 0");
+        }
+        if (utext_compare(&emptyUTF8, -1, &nullUChar, -1) != 0) {
+            errln("error: utext_compare(&emptyUTF8, -1, &nullUChar, -1) != 0");
+        }
+        if (utext_compare(&emptyUTF8, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compare(&emptyUTF8, -1, &nullUTF8, -1) != 0");
+        }
+        if (utext_compare(&nullUChar, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compare(&nullUChar, -1, &nullUTF8, -1) != 0");
+        }
+
+        if (utext_compareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&emptyUChar, -1, &nullUChar, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUChar, -1, &nullUChar, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&emptyUChar, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUChar, -1, &nullUTF8, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&emptyUTF8, -1, &nullUChar, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUTF8, -1, &nullUChar, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1) != 0");
+        }
+        if (utext_compareNativeLimit(&nullUChar, -1, &nullUTF8, -1) != 0) {
+            errln("error: utext_compareNativeLimit(&nullUChar, -1, &nullUTF8, -1) != 0");
+        }
+
+        if (utext_caseCompare(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompare(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompare(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+
+        if (utext_caseCompareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &emptyUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUChar, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUChar, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUChar, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&emptyUTF8, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+        if (utext_caseCompareNativeLimit(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0) {
+            errln("error: utext_caseCompareNativeLimit(&nullUChar, -1, &nullUTF8, -1, 0, &status) != 0");
+        }
+
+        // The case-insensitive calls above write to status; surface any error they set.
+        TEST_SUCCESS(status);
+
+        utext_close(&emptyUChar);
+        utext_close(&emptyUTF8);
+        utext_close(&nullUChar);
+        utext_close(&nullUTF8);
+    }
+}
+
+

 //
 //  ErrorTest()    Check various error and edge cases.
--- a/icu4c/source/test/intltest/utxttest.h
+++ b/icu4c/source/test/intltest/utxttest.h
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 2005-2009, International Business Machines Corporation and
+ * Copyright (c) 2005-2010, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /************************************************************************
@ -33,6 +33,7 @@ public:
    void FreezeTest();
    void Ticket5560();
    void Ticket6847();
+    void ComparisonTest();

 private:
    struct m {                              // Map between native indices & code points.