ICU-3944 text access, work in progress

X-SVN-Rev: 17742
2005-05-31 03:43:53 +00:00 · 2005-05-31 03:43:53 +00:00 · d3f5881c7a
commit d3f5881c7a
parent 611515f257
4 changed files with 580 additions and 194 deletions
--- a/icu4c/source/common/unicode/utext.h
+++ b/icu4c/source/common/unicode/utext.h
@ -95,70 +95,89 @@ typedef struct UTextChunk UTextChunk; /**< C typedef for struct UTextChunk. @dra
 *
 *   C Functions for creating UText wrappers around various kinds of text strings.
 *
- *         TODO:  Have a single generic close function
- *                     utext_close(UText *t) 
- *                so client code doesn't need to keep track of how one was opened.
- *
 ****************************************************************************************/


+/**
+  * utext_close    Close function for UText instances.
+  *                Cleans up, releases any resources being held by an
+  *                open UText instance.
+  * <p/>
+  *     If the UText was originally allocated by one of the utext_open functions,
+  *     the storage associated with the utext will also be deleted.
+  *     If the UText storage originated with the application, as it would with
+  *     a local or static instance, the storage will not be deleted.
+  * 
+  * @param ut  The UText instance object to be closed.
+  */
+U_DRAFT void U_EXPORT2
+utext_close(UText *ut);

 /**
 * Open a read-only UText implementation for UTF-8 strings.
+ * 
+ * @param t      Pointer to a UText struct.  If NULL, a new UText will be created.
+ *               If non-NULL, must refer to an already existing UText, which will then
+ *               be reset to reference the specified UTF-8 string.
+ * @param s      A utf-8 string
+ *               TODO:  does this want to be (uint8_t *) or (char *)?
+ * @param length The length of the utf-8 string in bytes, or -1 if the string is
+ *               zero terminated.
+ * @param pErrorCode Errors are returned here.
 */
 U_DRAFT UText * U_EXPORT2
-utext_openUTF8(const uint8_t *s, int32_t length, UErrorCode *pErrorCode);
-
-U_DRAFT void U_EXPORT2
-utext_closeUTF8(UText *t);
-
-U_DRAFT void U_EXPORT2
-utext_resetUTF8(UText *t, const uint8_t *s, int32_t length, UErrorCode *pErrorCode);
+utext_openUTF8(UText *t, const uint8_t *s, int32_t length, UErrorCode *pErrorCode);

 /**
 * Open a read-only UText implementation for SBCS strings.
 * The implementation converts 1:1 according to the provided mapping table.
 * Supplementary code points are not supported.
 *
- * @param toU Mapping table for conversion from SBCS to Unicode (BMP only).
- *            The mapping table must be available during the lifetime of the
- *            UText object.
+ * @param t      Pointer to a UText struct.  If NULL, a new UText will be created.
+ *               If non-NULL, must refer to an already existing UText, which will then
+ *               be reset to reference the specified input string.
+ * @param toU    Mapping table for conversion from SBCS to Unicode (BMP only).
+ *               The mapping table must be available during the lifetime of the
+ *               UText object.
+ * @param s      A byte text string
+ * @param length The length of the input string in bytes, or -1 if the string is
+ *               zero terminated.
+ * @param pErrorCode Errors are returned here.
 */
 U_DRAFT UText * U_EXPORT2
-utext_openSBCS(const UChar toU[256],
+utext_openSBCS(UText *t,
+               const UChar toU[256],
               const char *s, int32_t length,
               UErrorCode *pErrorCode);

-U_DRAFT void U_EXPORT2
-utext_closeSBCS(UText *t);
-
-U_DRAFT void U_EXPORT2
-utext_resetSBCS(UText *t, const char *s, int32_t length, UErrorCode *pErrorCode);
-


 /**
- * Set the UText object to handle a writable UnicodeString.
+ * Open a  UText object for a UnicodeString.
+ * 
+ * @param t      Pointer to a UText struct.  If NULL, a new UText will be created.
+ *               If non-NULL, must refer to an initialized UText, which will then
+ *               be reset to reference the specified UnicodeString.
+ * @param s      A UnicodeString
+ * @param pErrorCode Errors are returned here.
+ * @return       Pointer to the UText.  If a UText was supplied as input, this
+ *               will always be returned.
 */
-U_DRAFT void U_EXPORT2
-utext_setUnicodeString(UText *t, UnicodeString *s);
+U_DRAFT UText * U_EXPORT2
+utext_openUnicodeString(UText *t, UnicodeString *s, UErrorCode *pErrorCode);


-#if 0 // initially commented out to reduce testing

 /**
 * Open a writable UText implementation for Replaceable objects.
+ * @param t      Pointer to a UText struct.  If NULL, a new UText will be created.
+ *               If non-NULL, must refer to an already existing UText, which will then
+ *               be reset to reference the specified Replaceable object.
+ * @param rep    A Replaceable text object.
+ * @param pErrorCode Errors are returned here.
 */
 U_DRAFT UText * U_EXPORT2
-utext_openReplaceable(Replaceable *rep, UErrorCode *pErrorCode);
-
-U_DRAFT void U_EXPORT2
-utext_closeReplaceable(UText *t);
-
-U_DRAFT void U_EXPORT2
-utext_resetReplaceable(UText *t, Replaceable *rep, UErrorCode *pErrorCode);
-
-#endif
+utext_openReplaceable(UText *t, Replaceable *rep, UErrorCode *pErrorCode);


 struct UTextChunk {
@ -225,6 +244,12 @@ enum {
 * clone this UText.  
 * Text providers are not required to support clone.
 * Applications must be prepared for the possibility that clone is not supported.
+ *
+ * This is a shallow clone.  The underlying text is not copied, only the
+ * UText wrapper to the text is cloned.  A cloned UText allows having multiple
+ * UTextIterators active over the same underlying text.  (Any single instance
+ * of UText can only have one iterator active.)
+ *
 * TODO:  should we just drop clone altogether?
 *
 * @return a pointer to the newly created copy of the UTex object.
@ -281,24 +306,38 @@ UTextAccess(UText *t, int32_t index, UBool forward, UTextChunk *chunk);
 /**
 * Function type declaration for UText.extract().
 *
- * TBD
+ * Extract text from a UText into a UChar buffer.  The range of text to be extracted
+ * is specified in the native indices of the UText provider.  These may not necessarily
+ * be utf-16 indices.
+ * <p/>
+ * The size (number of 16 bit UChars) in the data to be extracted is returned.  The
+ * full amount is returned, even when the specified buffer size is smaller.
 *
- * The extracted string must be NUL-terminated if possible.
+ * The extracted string will (if you are a user) / must (if you are a text provider)
+ * be NUL-terminated if there is sufficient space in the destination buffer.
 *
- * @return Number of UChars extracted.
+ * @param  ut    the UText from which to extract data.
+ * @param  start the native index of the first character to extract.
+ * @param  limit the native string index of the position following the last
+ *               character to extract.
+ * @param  dest  the UChar (utf-16) buffer into which the extracted text is placed
+ * @param  destCapacity  The size, in UChars, of the destination buffer.  May be zero
+ *               for precomputing the required size.
+ * @param  status receives any error status.
 *         If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for
 *         preflighting.
- *         If U_INDEX_OUTOFBOUNDS_ERROR: Start and limit do not specify
- *         accessible text. Return value undefined.
+ * @return Number of UChars in the data.  Does not include a trailing NUL.
+ *
+ *  TODO: how should invalid source data be handled?  Corrupt utf-8, for example.
 *
 * @see UText
 * @draft ICU 3.4
 */
 typedef int32_t U_CALLCONV
-UTextExtract(UText *t,
+UTextExtract(UText *ut,
             int32_t start, int32_t limit,
             UChar *dest, int32_t destCapacity,
-             UErrorCode *pErrorCode);
+             UErrorCode *status);

 /**
 * Function type declaration for UText.replace().
@ -381,6 +420,20 @@ typedef int32_t U_CALLCONV
 UTextMapIndexToUTF16(UText *t, UTextChunk *chunk, int32_t index);


+/**
+ * Function type declaration for UText.close().
+ *
+ * Optional provider-supplied cleanup hook.  When the close field of a UText is
+ * non-NULL, the framework calls it from utext_close(), and also from
+ * utext_setup() when an already open UText is being reused for a new open.
+ * After the hook returns, the framework itself releases any framework-allocated
+ * pExtra storage and, for heap-allocated instances, the UText struct.
+ *
+ * @param t A UText object to be closed.
+ *
+ * @see UText
+ * @draft ICU 3.4
+ */
+typedef void U_CALLCONV
+UTextClose(UText *t);
+
+
 /**
  *   UText struct.  Provides the interface between the generic UText access code
  *                  and the UText provider code that works on specific kinds of
@ -410,6 +463,29 @@ struct UText {
     */
    const void *p, *q, *r;

+    /**
+     *  (protected)  Pointer to additional space requested by the
+     *               provider during the utext_open operation.
+     */
+    void     *pExtra;
+
+    /**
+     *   (protected)  Size in bytes of the extra space (pExtra).
+     */
+    int32_t   extraSize;
+
+    /**
+     *     (private)  Flags for managing the allocation and freeing of
+     *                memory associated with this UText.
+     */
+    int32_t   flags;
+
+    /**
+     *     (private)  Magic.  Try to detect when we are handed junk.
+     */
+    int32_t   magic;
+
+
    /**
     * (public) sizeOfStruct=sizeof(UText)
     * Allows possible backward compatible extension.
@ -515,8 +591,76 @@ struct UText {
     * @draft ICU 3.4
     */
    UTextMapIndexToUTF16 *mapIndexToUTF16;
+
+    /**
+      * (public)
+      *
+      * @see UTextClose
+      * @draft ICU 3.4
+      */
+    UTextClose  *close;
 };

+
+/**
+  *  Function for use by Text Provider implementations to allocate and/or initialize
+  *  a new UText struct.  To be called in the implementation of utext_open() functions.
+  *  If the supplied utxt parameter is null, a new UText struct will be allocated on the heap.
+  *  If the supplied UText is already open, the provider's close function will be called
+  *  so that the struct can be reused by the open that is in progress.
+  *
+  * @param utxt pointer to a UText struct to be re-used, or null if a new UText
+  *             should be allocated.
+  * @param extraSpace The amount of additional space to be allocated as part
+  *             of this UText, for use by types of providers that require
+  *             additional storage.
+  * @param status Errors are returned here.  Nothing is done if this already
+  *             indicates a failure on entry.
+  * @return     pointer to the UText that was set up.  If a UText was supplied
+  *             as input, that same pointer is returned.
+  */
+U_DRAFT UText * U_EXPORT2
+utext_setup(UText *utxt, int32_t extraSpace, UErrorCode *status);
+
+/**
+  * Marker value stored in UText.magic; used to detect when a pointer that is
+  * not an initialized UText is handed to the UText functions.
+  * The value must be representable as a (signed) int: C enumeration constants
+  * have type int, and the value is stored in the int32_t UText.magic field.
+  *
+  * @internal
+  */
+enum {
+    UTEXT_MAGIC = 0x345ad82c
+};
+
+
+/**
+ * @internal
+ */
+#define UTEXT_INITIALZIER_HEAD  \
+                  NULL,                 /* context      */ \
+                  NULL, NULL, NULL,     /* p, q, r      */ \
+                  NULL,                 /* pExtra       */ \
+                  0,                    /* extraSize    */ \
+                  0,                    /* flags        */ \
+                  UTEXT_MAGIC,          /* magic        */ \
+                  sizeof(UText),        /* sizeOfStruct */ \
+                  0, 0, 0        
+
+
+/**
+ * initializer to be used with local (stack) instances of a UText
+ *  struct.  UText structs must be initialized before passing
+ *  them to one of the utext_open functions.
+ *
+ *  Usage:   UText ut = UTEXT_INITIALIZER;
+ *  The macro expands to the brace-enclosed initializer only; the terminating
+ *  semicolon belongs to the declaration that uses it.
+ *
+ * @draft ICU 3.4
+ */
+#define UTEXT_INITIALIZER {                                \
+                  UTEXT_INITIALZIER_HEAD,                  \
+                  NULL,                 /* clone ()     */ \
+                  NULL,                 /* properties ()*/ \
+                  NULL,                 /* length ()    */ \
+                  NULL,                 /* access ()    */ \
+                  NULL,                 /* extract ()   */ \
+                  NULL,                 /* replace ()   */ \
+                  NULL,                 /* copy ()      */ \
+                  NULL, NULL,           /* map * 2 ()   */ \
+                  NULL                  /* close ()     */ \
+}
+
+
 U_CDECL_END


@ -735,12 +879,21 @@ UTextIterator::setIndex(int32_t index) {
    if(index<chunk.start || chunk.limit<index) {
        // The desired position is outside of the current chunk.  Invalidate it and
        // leave it to next32() or previous32() to access the text
-        // in the desired direction
+        // in the desired direction.
        setChunkInvalid(index);
    } else if(chunk.nonUTF16Indexes) {
        chunkOffset=t->mapIndexToUTF16(t, &chunk, index);
    } else {
        chunkOffset=index-chunk.start;
+        // Our convention is that the index must always be on a code point boundary.
+        //  If we are somewhere in the middle of a utf-16 buffer, check that new index
+        //  is not in the middle of a surrogate pair.
+        if (index>chunk.start && index < chunk.limit) {   // TODO:  clean up end-of-chunk / end of input handling.  Everywhere.
+            UChar c = chunk.contents[chunkOffset];
+            // U16_IS_TRAIL() tests whether c is a trail surrogate.  (U16_TRAIL() instead
+            //  computes the trail surrogate of a supplementary code point and is never zero.)
+            if (U16_IS_TRAIL(c)) {
+                this->getSupplementary();  // force index onto a code point boundary.
+            }
+        }
    }
 }

--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@ -28,7 +28,9 @@ U_NAMESPACE_BEGIN

 /*---------------------------------------------------------------------------
 *
- * UTextIterator implementation
+ * UTextIterator implementation.   Note: the most common UTextIterator 
+ *                                  functions are inline, implemented in
+ *                                  utext.h
 *
 * ---------------------------------------------------------------------------*/

@ -85,16 +87,16 @@ UTextIterator::moveIndex(int32_t delta) {
        } while(--delta>0);
    } else if (delta<0) {
        do {
-            if(chunkOffset<=chunk.start && !access(chunk.start, FALSE)) {
+            if(chunkOffset<=0 && !access(chunk.start, FALSE)) {
                retval = FALSE;
                break;
            }
-            U16_BACK_1(chunk.contents, chunk.start, chunkOffset);
+            U16_BACK_1(chunk.contents, 0, chunkOffset);
        } while(++delta<0);
    } else {
        // Delta == 0.
        // Need to trim current postion to be within the bounds of the text.
-        if (chunkOffset>=0 && chunkOffset<chunk.limit) {
+        if (chunkOffset>=0 && chunkOffset<chunk.length) {
            // Current position is within the current chunk.
            // No action needed.
        } else if (chunk.start<=0) {
@ -175,8 +177,136 @@ U_NAMESPACE_END



+//------------------------------------------------------------------------------
+//
+//   UText common functions implementation
+//
+//------------------------------------------------------------------------------

-/* No-Op UText implementation for illegal input ----------------------------- */
+//
+//  UText.flags bit definitions
+//
+enum {
+    UTEXT_HEAP_ALLOCATED  = 1,      //  1 if ICU has allocated this UText struct on the heap.
+                                    //  0 if caller provided storage for the UText.
+
+    UTEXT_EXTRA_HEAP_ALLOCATED = 2, //  1 if ICU has allocated extra storage as a separate
+                                    //     heap block.
+                                    //  0 if there is no separate allocation.  Either no extra
+                                    //     storage was requested, or it is appended to the end
+                                    //     of the main UText storage.
+
+    UTEXT_OPEN = 4                  //  1 if this UText is currently open
+                                    //  0 if this UText is not open.
+};
+
+
+//
+//  Extended form of a UText.  The purpose is to aid in computing the total size required
+//    when a provider asks for a UText to be allocated with extra storage.
+//
+struct ExtendedUText: public UText {
+    void  *extension;
+};
+
+static UText emptyText = UTEXT_INITIALIZER;
+
+/**
+ * Allocate and/or initialize a UText on behalf of a text provider's open function.
+ *
+ * If ut is NULL, a new UText is allocated on the heap, with any requested extra
+ * storage appended to the same heap block.  Otherwise the supplied UText is
+ * verified (magic value), closed through its provider close function if it is
+ * currently open, and its provider function pointers are cleared so that the
+ * struct can be reused.  Extra storage is (re)allocated as a separate heap block
+ * when the existing storage is missing or too small.
+ *
+ * @param ut          UText to reuse, or NULL to allocate a new one.
+ * @param extraSpace  Number of bytes of provider storage required, made available
+ *                    through ut->pExtra.  Zero if none is needed.
+ * @param status      In/out error code.  Set to U_ILLEGAL_ARGUMENT_ERROR if ut does
+ *                    not carry UTEXT_MAGIC, or to U_MEMORY_ALLOCATION_ERROR if an
+ *                    allocation fails.
+ * @return            The UText that was set up.  On failure the incoming ut is
+ *                    returned (NULL when a new UText could not be allocated).
+ */
+U_DRAFT UText * U_EXPORT2
+utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
+    if (U_FAILURE(*status)) {
+        return ut;
+    }
+    if (extraSpace < 0) {
+        // A negative request is treated the same as "no extra storage".
+        extraSpace = 0;
+    }
+
+    if (ut == NULL) {
+        // We need to heap-allocate storage for the new UText.
+        // Extra storage, if requested, starts at ExtendedUText.extension and
+        //   runs to the end of the same heap block.
+        int32_t spaceRequired = (int32_t)sizeof(UText);
+        if (extraSpace > 0) {
+            spaceRequired = (int32_t)(sizeof(ExtendedUText) + extraSpace - sizeof(void *));
+        }
+        ut = (UText *)uprv_malloc(spaceRequired);
+        if (ut == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+            return NULL;
+        }
+        *ut = emptyText;
+        ut->flags |= UTEXT_HEAP_ALLOCATED;
+        if (extraSpace > 0) {
+            // Record the size of the extra area only, not of the whole block.
+            ut->extraSize = extraSpace;
+            ut->pExtra    = &((ExtendedUText *)ut)->extension;
+        }
+    } else {
+        // We have been supplied with an already existing UText.
+        // Verify that it really appears to be a UText.
+        if (ut->magic != UTEXT_MAGIC) {
+            *status = U_ILLEGAL_ARGUMENT_ERROR;
+            return ut;
+        }
+        // If the ut is already open and there's a provider supplied close
+        //   function, call it.
+        if ((ut->flags & UTEXT_OPEN) && ut->close != NULL)  {
+            ut->close(ut);
+        }
+        ut->flags &= ~UTEXT_OPEN;
+
+        // Forget the previous provider's functions.  The open functions only set
+        //   the entries they implement, and rely on the others being NULL.
+        ut->clone             = NULL;
+        ut->properties        = NULL;
+        ut->length            = NULL;
+        ut->access            = NULL;
+        ut->extract           = NULL;
+        ut->replace           = NULL;
+        ut->copy              = NULL;
+        ut->mapOffsetToNative = NULL;
+        ut->mapIndexToUTF16   = NULL;
+        ut->close             = NULL;
+
+        // If extra space was requested by our caller, check whether
+        //   sufficient already exists, and allocate new if needed.
+        if (extraSpace > ut->extraSize || (extraSpace > 0 && ut->pExtra == NULL)) {
+            // Need more space.  If there is existing separately allocated space,
+            //   delete it first, then allocate new space.
+            // (Extra space that is part of the UText's own heap block is simply abandoned.)
+            if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
+                uprv_free(ut->pExtra);
+                ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
+            }
+            ut->extraSize = 0;
+            ut->pExtra    = uprv_malloc(extraSpace);
+            if (ut->pExtra == NULL) {
+                *status = U_MEMORY_ALLOCATION_ERROR;
+                return ut;
+            }
+            ut->extraSize = extraSpace;
+            // Remember that utext_close() (or a later setup) must free this block.
+            ut->flags    |= UTEXT_EXTRA_HEAP_ALLOCATED;
+        }
+    }
+
+    // Mark the UText as open, so that utext_close() will actually clean it up.
+    ut->flags |= UTEXT_OPEN;
+    return ut;
+}
+
+
+/**
+ * Close a UText: run the provider's close function (if any), then release
+ * whatever storage the framework itself allocated for it.
+ * Does nothing if ut is NULL, does not carry UTEXT_MAGIC, or is not marked open.
+ *
+ * @param ut  The UText to close.  If it was heap-allocated by the framework the
+ *            struct itself is freed, and the pointer must not be used afterwards.
+ */
+U_DRAFT void U_EXPORT2
+utext_close(UText *ut) {
+    if (ut==NULL ||
+        ut->magic != UTEXT_MAGIC ||
+        (ut->flags & UTEXT_OPEN) == 0)
+    {
+        // The supplied ut is not an open UText.
+        // Do nothing.
+        return;
+    }
+
+    // If the provider gave us a close function, call it now.
+    // This will clean up anything allocated specifically by the provider.
+    if (ut->close != NULL) {
+        ut->close(ut);
+    }
+    ut->flags &= ~UTEXT_OPEN;
+
+    // If we (the framework) allocated the UText or subsidiary storage,
+    //   delete it.
+    if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
+        uprv_free(ut->pExtra);
+        // Leave no trace of the freed block: a caller-owned UText may be
+        //   reopened later and must not appear to still own extra storage.
+        ut->pExtra    = NULL;
+        ut->extraSize = 0;
+        ut->flags    &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
+    }
+    if (ut->flags & UTEXT_HEAP_ALLOCATED) {
+        // This UText was allocated by UText setup.  We need to free it.
+        // Clear magic, so we can detect if the user messes up and immediately
+        //  tries to reopen another UText using the deleted storage.
+        ut->magic = 0;
+        uprv_free(ut);
+    }
+}
+
+
+
+
+
+//------------------------------------------------------------------------------
+//
+// No-Op UText implementation for illegal input 
+//
+//------------------------------------------------------------------------------

 static UText * U_CALLCONV
 noopTextClone(const UText *t) {
@ -219,8 +349,7 @@ noopTextMapIndexToUTF16(UText *t, UTextChunk *chunk, int32_t index) {
 }

 static const UText noopText={
-    NULL, NULL, NULL, NULL,
-    (int32_t)sizeof(UText), 0, 0, 0,
+    UTEXT_INITIALZIER_HEAD,
    noopTextClone,
    noopTextGetProperties,
    noopTextLength,
@ -229,7 +358,8 @@ static const UText noopText={
    NULL, // replace
    NULL, // copy
    noopTextMapOffsetToNative,
-    noopTextMapIndexToUTF16
+    noopTextMapIndexToUTF16,
+    NULL  // close
 };


@ -241,13 +371,14 @@ static const UText noopText={
 //         Use of UText data members:
 //            context    pointer to UTF-8 string
 //
+//      TODO:  make creation of the index mapping array lazy.
+//             Create it for a chunk the first time the user asks for an index.
+//
 //------------------------------------------------------------------------------

 enum { UTF8_TEXT_CHUNK_SIZE=10 };

-struct UTF8Text : public UText {
-    /* length of UTF-8 string (in bytes) */
-    int32_t length;
+struct UTF8Extra {
    /*
     * Chunk UChars.
     * +1 to simplify filling with surrogate pair at the end.
@ -261,10 +392,16 @@ struct UTF8Text : public UText {
     * of s[].
     */
    int32_t map[UTF8_TEXT_CHUNK_SIZE+2];
-    /* points into map[] corresponding to where chunk.contents starts in s[] */
-    int32_t *chunkMap;
 };

+//  utext.b  is the input string length (bytes).
+//  utext.q  pointer to the filled part of the Map array.
+//
+//     because backwards iteration fills the buffers starting at the end and
+//     working towards the front, the filled part of the buffers may not begin
+//     at the start of the available storage for the buffers.
+
+
 static int32_t U_CALLCONV
 utf8TextGetProperties(UText * /*t*/) {
    return
@ -275,16 +412,20 @@ utf8TextGetProperties(UText * /*t*/) {
 }

 static int32_t U_CALLCONV
-utf8TextLength(UText *t) {
-    return ((UTF8Text *)t)->length;
+utf8TextLength(UText *ut) {
+    return ut->b;
 }

 static int32_t U_CALLCONV
-utf8TextAccess(UText *t, int32_t index, UBool forward, UTextChunk *chunk) {
-    UTF8Text *t8=(UTF8Text *)t;
-    const uint8_t *s8=(const uint8_t *)t8->context;
-    UChar32 c;
-    int32_t i, length=t8->length;
+utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
+    const uint8_t *s8=(const uint8_t *)ut->context;
+    UChar32  c;
+    int32_t  i;
+    int32_t  length = ut->b;              // Length of original utf-8
+
+    UTF8Extra  *ut8e   = (UTF8Extra *)ut->pExtra;
+    UChar      *u16buf = ut8e->s;
+    int32_t    *map    = ut8e->map;

    if(forward) {
        if(length<=index) {
@ -294,39 +435,42 @@ utf8TextAccess(UText *t, int32_t index, UBool forward, UTextChunk *chunk) {
        chunk->start=index;
        c=s8[index];
        if(c<=0x7f) {
-            // get a chunk of ASCII characters
-            t8->s[0]=(UChar)c;
+            // get a run of ASCII characters.
+            // Even if we don't fill the buffer, we will stop with the first
+            //   non-ascii char, so that the buffer can use utf-16 indexing.
+            u16buf[0]=(UChar)c;
            for(i=1, ++index;
                i<UTF8_TEXT_CHUNK_SIZE && index<length && (c=s8[index])<=0x7f;
                ++i, ++index
            ) {
-                t8->s[i]=(UChar)c;
+                u16buf[i]=(UChar)c;
            }
            chunk->nonUTF16Indexes=FALSE;
        } else {
            // get a chunk of characters starting with a non-ASCII one
-            U8_SET_CP_START(s8, 0, index);
-            for(i=0;
-                i<UTF8_TEXT_CHUNK_SIZE && index<length;
-                ++i
-            ) {
-                t8->map[i]=index;
-                t8->map[i+1]=index; // in case there is a trail surrogate
+            U8_SET_CP_START(s8, 0, index);  // put utf-8 index at first byte of char, if not there already.
+            for(i=0;  i<UTF8_TEXT_CHUNK_SIZE && index<length;  ) {
+                //  i     is utf-16 index into chunk buffer.
+                //  index is utf-8 index into original string
+                map[i]=index;
+                map[i+1]=index; // in case there is a trail surrogate
                U8_NEXT(s8, index, length, c);
                if(c<0) {
                    c=0xfffd; // use SUB for illegal sequences
                }
-                U16_APPEND_UNSAFE(t8->s, i, c);
+                U16_APPEND_UNSAFE(u16buf, i, c);    // post-increments i.
            }
-            t8->map[i]=index;
-            t8->chunkMap=t8->map;
+            map[i]=index;
            chunk->nonUTF16Indexes=TRUE;
        }
-        chunk->contents=t8->s;
-        chunk->length=i;
-        chunk->limit=index;
+        chunk->contents = u16buf;
+        chunk->length   = i;
+        chunk->limit    = index;
+        ut->q           = map;  
        return 0; // chunkOffset corresponding to index
    } else {
+        // Reverse Access.  The chunk buffer must be filled so as to contain the
+        //                  character preceding the specified index.
        if(index<=0) {
            return -1;
        }
@ -334,11 +478,10 @@ utf8TextAccess(UText *t, int32_t index, UBool forward, UTextChunk *chunk) {
        chunk->limit=index;
        c=s8[index-1];
        if(c<=0x7f) {
-            // get a chunk of ASCII characters
+            // get a chunk of ASCII characters.  Don't build the index map
            i=UTF8_TEXT_CHUNK_SIZE;
-            t8->map[i]=index;
            do {
-                t8->s[--i]=(UChar)c;
+                u16buf[--i]=(UChar)c;
                --index;
            } while(i>0 && index>0 && (c=s8[index-1])<=0x7f);
            chunk->nonUTF16Indexes=FALSE;
@ -347,52 +490,65 @@ utf8TextAccess(UText *t, int32_t index, UBool forward, UTextChunk *chunk) {
            if(index<length) {
                U8_SET_CP_START(s8, 0, index);
            }
-            i=UTF8_TEXT_CHUNK_SIZE+1;
-            t8->map[i]=index;
+            i=UTF8_TEXT_CHUNK_SIZE;
+            map[i]=index;    // map position for char following the last one in the buffer.
            do {
+                //  i     is utf-16 index into chunk buffer.
+                //  index is utf-8 index into original string
                U8_PREV(s8, 0, index, c);
                if(c<0) {
                    c=0xfffd; // use SUB for illegal sequences
                }
                if(c<=0xffff) {
-                    t8->s[--i]=(UChar)c;
-                    t8->map[i]=index;
+                    u16buf[--i]=(UChar)c;
+                    map[i]=index;
                } else {
-                    t8->s[--i]=U16_TRAIL(c);
-                    t8->map[i]=index;
-                    t8->s[--i]=U16_LEAD(c);
-                    t8->map[i]=index;
+                    // We've got a supplementary char
+                    if (i<2) {
+                        // Both halves of the surrogate pair won't fit in the chunk buffer.
+                        // Stop without putting either half in.
+                        U8_NEXT(s8, index, length, c);  // restore index.
+                        break;
+                    }
+                    u16buf[--i]=U16_TRAIL(c);
+                    map[i]=index;
+                    u16buf[--i]=U16_LEAD(c);
+                    map[i]=index;
                }
-            } while(i>1 && index>0);
-            t8->chunkMap=t8->map+i;
+            } while(i>0 && index>0);
+
+            // Because we have filled the map & chunk buffers from back to front,
+            //   the start position for accesses may not be at the start of the
+            //   available storage.
+            ut->q = map+i;
            chunk->nonUTF16Indexes=TRUE;
        }
-        chunk->contents=t8->s+i;
-        chunk->length=(UTF8_TEXT_CHUNK_SIZE+1)-i;
+        // Common reverse iteration, for both UTF16 and non-UTF16 indexes.
+        chunk->contents=u16buf+i;
+        chunk->length=(UTF8_TEXT_CHUNK_SIZE)-i;
        chunk->start=index;
        return chunk->length; // chunkOffset corresponding to index
    }
 }

 static int32_t U_CALLCONV
-utf8TextExtract(UText *t,
+utf8TextExtract(UText *ut,
                int32_t start, int32_t limit,
                UChar *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
-    UTF8Text *t8=(UTF8Text *)t;
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    }
-    if(start<0 || start>limit || t8->length<limit) {
+    if(start<0 || start>limit || ut->b<limit) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    int32_t destLength=0;
    u_strFromUTF8(dest, destCapacity, &destLength,
-                    (const char *)t8->context+start, limit-start,
+                    (const char *)ut->context+start, limit-start,
                    pErrorCode);
    return destLength;
    // TODO: if U_INVALID|ILLEGAL_CHAR_FOUND, extract text anyway and use SUB for illegal sequences?
@ -400,16 +556,16 @@ utf8TextExtract(UText *t,

 // Assume nonUTF16Indexes and 0<=offset<=chunk->length
 static int32_t U_CALLCONV
-utf8TextMapOffsetToNative(UText *t, UTextChunk *chunk, int32_t offset) {
-    UTF8Text *t8=(UTF8Text *)t;
-    return t8->chunkMap[offset];
+utf8TextMapOffsetToNative(UText *ut, UTextChunk *chunk, int32_t offset) {
+    // UText.q points to the index mapping array that is allocated in the extra storage area.
+    int32_t *map=(int32_t *)(ut->q);
+    return map[offset];
 }

 // Assume nonUTF16Indexes and chunk->start<=index<=chunk->limit
 static int32_t U_CALLCONV
-utf8TextMapIndexToUTF16(UText *t, UTextChunk *chunk, int32_t index) {
-    UTF8Text *t8=(UTF8Text *)t;
-    int32_t *map=t8->chunkMap;
+utf8TextMapIndexToUTF16(UText *ut, UTextChunk *chunk, int32_t index) {
+    int32_t *map=(int32_t *)(ut->q);
    int32_t offset=0;

    while(index>map[offset]) {
@ -418,69 +574,43 @@ utf8TextMapIndexToUTF16(UText *t, UTextChunk *chunk, int32_t index) {
    return offset;
 }

-static const UText utf8Text={
-    NULL, NULL, NULL, NULL,
-    (int32_t)sizeof(UText), 0, 0, 0,
-    noopTextClone,
-    utf8TextGetProperties,
-    utf8TextLength,
-    utf8TextAccess,
-    utf8TextExtract,
-    NULL, // replace
-    NULL, // copy
-    utf8TextMapOffsetToNative,
-    utf8TextMapIndexToUTF16
-};
+
+

 U_DRAFT UText * U_EXPORT2
-utext_openUTF8(const uint8_t *s, int32_t length, UErrorCode *pErrorCode) {
-    if(U_FAILURE(*pErrorCode)) {
+utext_openUTF8(UText *ut, const uint8_t *s, int32_t length, UErrorCode *status) {
+    // Open (or re-open) a read-only UText over the UTF-8 string s.
+    //   ut      existing UText to reuse, or NULL to have one allocated.
+    //   length  length of s in bytes, or -1 if s is NUL-terminated.
+    //   status  receives U_ILLEGAL_ARGUMENT_ERROR for a NULL s or length < -1,
+    //           plus any error reported by utext_setup().
+    if(U_FAILURE(*status)) {
        return NULL;
    }
    if(s==NULL || length<-1) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
-    UTF8Text *t8=(UTF8Text *)uprv_malloc(sizeof(UTF8Text));
-    if(t8==NULL) {
-        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-        return NULL;
+
+    // The UTF-16 chunk buffer and the index map live in the UText's extra storage.
+    ut = utext_setup(ut, sizeof(UTF8Extra), status);
+    if (U_FAILURE(*status)) {
+        return ut;
    }
-    *((UText *)t8)=utf8Text;
-    t8->context=s;
+
+    ut->clone      = noopTextClone;
+    ut->properties = utf8TextGetProperties;
+    ut->length     = utf8TextLength;
+    ut->access     = utf8TextAccess;
+    ut->extract    = utf8TextExtract;
+    ut->mapOffsetToNative = utf8TextMapOffsetToNative;
+    ut->mapIndexToUTF16   = utf8TextMapIndexToUTF16;
+
+    // This provider does not implement replace, copy or close.  Clear them
+    //   explicitly: a reused UText may still hold another provider's pointers.
+    ut->replace    = NULL;
+    ut->copy       = NULL;
+    ut->close      = NULL;
+
+    ut->context=s;
    if(length>=0) {
-        t8->length=length;
+        ut->b=length;
    } else {
        // TODO:  really undesirable to do this scan upfront.
-        t8->length=(int32_t)uprv_strlen((const char *)s);
+        ut->b=(int32_t)uprv_strlen((const char *)s);
    }
-    return t8;
+
+    return ut;
 }

-U_DRAFT void U_EXPORT2
-utext_closeUTF8(UText *t) {
-    if(t!=NULL) {
-        uprv_free((UTF8Text *)t);
-    }
-}
-
-U_DRAFT void U_EXPORT2
-utext_resetUTF8(UText *t, const uint8_t *s, int32_t length, UErrorCode *pErrorCode) {
-    if(U_FAILURE(*pErrorCode)) {
-        return;
-    }
-    if(s==NULL || length<-1) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-    UTF8Text *t8=(UTF8Text *)t;
-    t8->context=s;
-    if(length>=0) {
-        t8->length=length;
-    } else {
-        t8->length=(int32_t)uprv_strlen((const char *)s);
-    }
-}



@ -595,8 +725,7 @@ sbcsTextExtract(UText *t,
 }

 static const UText sbcsText={
-    NULL, NULL, NULL, NULL,
-    (int32_t)sizeof(UText), 0, 0, 0,
+    UTEXT_INITIALZIER_HEAD,
    noopTextClone,
    sbcsTextGetProperties,
    sbcsTextLength,
@ -605,11 +734,13 @@ static const UText sbcsText={
    NULL, // replace
    NULL, // copy
    NULL, // mapOffsetToNative
-    NULL  // mapIndexToUTF16
+    NULL, // mapIndexToUTF16
+    NULL  // close
 };

 U_DRAFT UText * U_EXPORT2
-utext_openSBCS(const UChar toU[256],
+utext_openSBCS(UText *ut,
+               const UChar toU[256],
               const char *s, int32_t length,
               UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
@ -1025,11 +1156,15 @@ unistrTextExtract(UText *t,
        return 0;
    }
    length=limit-start;
-    if(length>destCapacity) {
-        length=destCapacity;
+    if (destCapacity>0 && dest!=NULL) {
+        int32_t trimmedLength = length;
+        if(trimmedLength>destCapacity) {
+            trimmedLength=destCapacity;
+        }
+        us->extract(start, trimmedLength, dest);
    }
-    us->extract(start, length, dest);
-    return u_terminateUChars(dest, destCapacity, length, pErrorCode);
+    u_terminateUChars(dest, destCapacity, length, pErrorCode);
+    return length;
 }

 static int32_t U_CALLCONV
@ -1107,28 +1242,23 @@ unistrTextCopy(UText *t,
    }
 };

-//
-//  Statically initialized utext object, pre-setup
-//   for UnicodeStrings.
-//  
-static const UText unistrText={
-    NULL, NULL, NULL, NULL,
-    (int32_t)sizeof(UText), 0, 0, 0,
-    unistrTextClone,
-    unistrTextGetProperties,
-    unistrTextLength,
-    unistrTextAccess,
-    unistrTextExtract,
-    unistrTextReplace,
-    unistrTextCopy,
-    NULL, // mapOffsetToNative
-    NULL  // mapIndexToUTF16
-};

-U_DRAFT void U_EXPORT2
-utext_setUnicodeString(UText *t, UnicodeString *s) {
-    *t=unistrText;
-    t->context=s;
+
+U_DRAFT UText * U_EXPORT2
+utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
+    ut = utext_setup(ut, 0, status);   // presumably allocates when ut is NULL - confirm in utext_setup
+    if (U_FAILURE(*status)) {
+        return ut;                     // setup failed: leave the function table untouched
+    }
+    ut->context    = s;                // the wrapped UnicodeString; stored, not copied
+    ut->clone      = unistrTextClone;
+    ut->properties = unistrTextGetProperties;
+    ut->length     = unistrTextLength;
+    ut->access     = unistrTextAccess;
+    ut->extract    = unistrTextExtract;
+    ut->replace    = unistrTextReplace;
+    ut->copy       = unistrTextCopy;
+    return ut;
+}


--- a/icu4c/source/test/intltest/utxttest.cpp
+++ b/icu4c/source/test/intltest/utxttest.cpp
@ -14,6 +14,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <unicode/utext.h>
+#include <unicode/utf8.h>
+#include <unicode/ustring.h>
 #include "utxttest.h"

 UBool  gFailed = FALSE;
@ -22,6 +24,13 @@ UBool  gFailed = FALSE;
                     gFailed = TRUE;\
   }}

+// TEST_SUCCESS(status): if U_FAILURE(status), report file, line and u_errorName(status) via errln() and set gFailed.
+#define TEST_SUCCESS(status) \
+   {if (U_FAILURE(status)) {errln("Test failure in file %s at line %d. Error = \"%s\"\n", \
+       __FILE__, __LINE__, u_errorName(status)); \
+       gFailed = TRUE;\
+   }}
+
 UTextTest::UTextTest() {
 }

@ -55,6 +64,7 @@ void UTextTest::TestString(const UnicodeString &s) {
    int         j;
    UChar32     c;
    int         cpCount = 0;
+    UErrorCode  status = U_ZERO_ERROR;

    UnicodeString sa = s.unescape();

@ -70,6 +80,7 @@ void UTextTest::TestString(const UnicodeString &s) {
        j++;
        cpCount++;
    }
+    cpMap[j].nativeIdx = i;   // position following the last char in utf-16 string.    


    // UChar * test, null term
@ -82,11 +93,37 @@ void UTextTest::TestString(const UnicodeString &s) {
    // const UChar * test, length

    // UnicodeString test
-    UText ut;
-    utext_setUnicodeString(&ut, &sa);
-    TestAccess(&ut, cpCount, cpMap);
+    UText *ut;
+    ut = utext_openUnicodeString(NULL, &sa, &status);
+    TEST_SUCCESS(status);
+    TestAccess(sa, ut, cpCount, cpMap);
+    utext_close(ut);

+    //
    // UTF-8 test
+    //
+    // Convert the test string from UnicodeString to (char *) in utf-8 format
+    int u8Len = sa.extract(0, sa.length(), NULL, 0, "utf-8");
+    char *u8String = new char[u8Len + 1];
+    sa.extract(0, sa.length(), u8String, u8Len+1, "utf-8");
+    // Build up the map of code point indices in the utf-8 string
+    m * u8Map = new m[sa.length() + 1];
+    i = 0;   // native utf-8 index
+    for (j=0; j<cpCount ; j++) {  // code point number
+        u8Map[j].nativeIdx = i;
+        U8_NEXT(u8String, i, u8Len, c)
+        u8Map[j].cp = c;
+    }
+    u8Map[cpCount].nativeIdx = u8Len;   // position following the last char in utf-8 string.
+
+    // Do the test itself
+    status = U_ZERO_ERROR;
+    ut = utext_openUTF8(NULL, (uint8_t *)u8String, -1, &status);
+    TEST_SUCCESS(status);
+    TestAccess(sa, ut, cpCount, u8Map);
+    utext_close(ut);
+    delete[] u8Map;      // release the code point map; it was allocated with new[] above
+    delete[] u8String;   // freed only after utext_close(), since the UText was opened on this buffer

    // UTF-32 test

@ -97,7 +134,16 @@ void UTextTest::TestString(const UnicodeString &s) {
 }


-void UTextTest::TestAccess(UText *ut, int cpCount, m *cpMap) {
+void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *cpMap) {
+    UErrorCode  status = U_ZERO_ERROR;
+
+    //
+    //  Check the length from the UText
+    //
+    int expectedLen = cpMap[cpCount].nativeIdx;
+    int utlen = ut->length(ut);
+    TEST_ASSERT(expectedLen == utlen);
+
    //
    //  Iterate forwards, verify that we get the correct code points
    //   at the correct native offsets.
@ -140,10 +186,10 @@ void UTextTest::TestAccess(UText *ut, int cpCount, m *cpMap) {
    len = uti.getIndex();
    uti.setIndex(len);
    for (i=cpCount-1; i>=0; i--) {
-        foundC        = uti.previous32();
        expectedC     = cpMap[i].cp;
-        foundIndex    = uti.getIndex();
        expectedIndex = cpMap[i].nativeIdx;
+        foundC        = uti.previous32();
+        foundIndex    = uti.getIndex();
        TEST_ASSERT(expectedIndex == foundIndex);
        TEST_ASSERT(expectedC == foundC);
        if (gFailed) {
@ -167,7 +213,7 @@ void UTextTest::TestAccess(UText *ut, int cpCount, m *cpMap) {
    }

    //
-    //  Iterate in a somewhat random order.
+    //  next32From(), previous32From(), Iterate in a somewhat random order.
    //
    int  cpIndex = 0;
    for (i=0; i<cpCount; i++) {
@ -185,9 +231,9 @@ void UTextTest::TestAccess(UText *ut, int cpCount, m *cpMap) {
    cpIndex = 0;
    for (i=0; i<cpCount; i++) {
        cpIndex = (cpIndex + 9973) % cpCount;
-        index         = cpMap[cpIndex].nativeIdx;
+        index         = cpMap[cpIndex+1].nativeIdx;
        expectedC     = cpMap[cpIndex].cp;
-        foundC        = uti.previous32From(index+1);
+        foundC        = uti.previous32From(index);
        TEST_ASSERT(expectedC == foundC);
        TEST_ASSERT(expectedIndex == foundIndex);
        if (gFailed) {
@ -198,6 +244,17 @@ void UTextTest::TestAccess(UText *ut, int cpCount, m *cpMap) {
    //
    // moveIndex(int32_t delta);
    //
+
+    // Walk through frontwards, incrementing by one
+    uti.setIndex(0);
+    for (i=1; i<=cpCount; i++) {
+        uti.moveIndex(1);
+        index = uti.getIndex();
+        expectedIndex = cpMap[i].nativeIdx;
+        TEST_ASSERT(expectedIndex == index);
+    }
+
+    // Walk through frontwards, incrementing by two
    uti.setIndex(0);
    for (i=2; i<cpCount; i+=2) {
        uti.moveIndex(2);
@ -206,17 +263,63 @@ void UTextTest::TestAccess(UText *ut, int cpCount, m *cpMap) {
        TEST_ASSERT(expectedIndex == index);
    }

-    i = cpMap[cpCount-1].nativeIdx;
+    // walk through the string backwards, decrementing by one.
+    i = cpMap[cpCount].nativeIdx;
    uti.setIndex(i);
-    for (i=cpCount-1; i>=0; i-=3) {
-        index = uti.getIndex();
+    for (i=cpCount; i>=0; i--) {
        expectedIndex = cpMap[i].nativeIdx;
+        index = uti.getIndex();
+        TEST_ASSERT(expectedIndex == index);
+        uti.moveIndex(-1);
+    }
+
+
+    // walk through backwards, decrementing by three
+    i = cpMap[cpCount].nativeIdx;
+    uti.setIndex(i);
+    for (i=cpCount; i>=0; i-=3) {
+        expectedIndex = cpMap[i].nativeIdx;
+        index = uti.getIndex();
        TEST_ASSERT(expectedIndex == index);
        uti.moveIndex(-3);
    }


+    //
+    // Extract: full copy, preflight with a NULL buffer, then truncation into a 1-UChar buffer.
+    //
+    int bufSize = us.length() + 10;
+    UChar *buf = new UChar[bufSize];
+    status = U_ZERO_ERROR;
+    expectedLen = us.length();
+    len = ut->extract(ut, 0, utlen, buf, bufSize, &status);
+    TEST_SUCCESS(status);
+    TEST_ASSERT(len == expectedLen);
+    int compareResult = us.compare(buf, -1);
+    TEST_ASSERT(compareResult == 0);

+    status = U_ZERO_ERROR;
+    len = ut->extract(ut, 0, utlen, NULL, 0, &status);
+    TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR)
+    TEST_ASSERT(len == expectedLen);
+
+    status = U_ZERO_ERROR;
+    u_memset(buf, 0x5555, bufSize);   // sentinel fill, so writes past the 1-UChar capacity are detectable
+    len = ut->extract(ut, 0, utlen, buf, 1, &status);
+    if (us.length() == 0) {
+        TEST_SUCCESS(status);
+        TEST_ASSERT(buf[0] == 0);
+    } else {
+        TEST_ASSERT(buf[0] == us.charAt(0));
+        TEST_ASSERT(buf[1] == 0x5555);   // sentinel must survive: nothing written beyond capacity
+        if (us.length() == 1) {
+            TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
+        } else {
+            TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
+        }
+    }
+
+    delete[] buf;   // buf was allocated with new UChar[]; the array form of delete is required

 }

--- a/icu4c/source/test/intltest/utxttest.h
+++ b/icu4c/source/test/intltest/utxttest.h
@ -37,7 +37,7 @@ private:
    };

    void TestString(const UnicodeString &s);
-    void TestAccess(UText *ut, int cpCount, m *cpMap);
+    void TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *cpMap);
 };