ICU-2397 split uciter8.c to make separate uit_len8.h/.c

X-SVN-Rev: 11002
2003-02-10 19:56:40 +00:00 · 2003-02-10 19:56:40 +00:00 · 7677ee0d22
commit 7677ee0d22
parent 9fcdfc0f88
6 changed files with 746 additions and 675 deletions
--- a/icu4c/source/samples/uciter8/Makefile
+++ b/icu4c/source/samples/uciter8/Makefile
@@ -11,7 +11,7 @@
 TARGET=uciter8

 # All object files (C or C++)
-OBJECTS=uciter8.o
+OBJECTS=uciter8.o uit_len8.o

 # Load in standard makefile definitions
 include ../defs.mk
--- a/icu4c/source/samples/uciter8/readme.txt
+++ b/icu4c/source/samples/uciter8/readme.txt
@@ -8,9 +8,20 @@ UTF-8-style macros are defined as well as a UCharIterator.
 The macros are incomplete (do not assemble code points from pairs of surrogates)
 but sufficient for the iterator.

+If you wish to use the lenient-UTF/CESU-8 UCharIterator in a context outside of
+this sample, then copy the uit_len8.c file,
+as well as either the uit_len8.h header or just the prototype that it contains.
+
+*** Warning: ***
+This UCharIterator reads an arbitrary mix of UTF-8 and CESU-8 text.
+It does not conform to any one Unicode charset specification,
+and its use may lead to security risks.
+

 Files:
    uciter8.c        Main source file in C
+    uit_len8.c       Lenient-UTF/CESU-8 UCharIterator implementation
+    uit_len8.h       Header file with the prototype for the lenient-UTF/CESU-8 UCharIterator
    uciter8.dsw      Windows MSVC workspace.  Double-click this to get started.
    uciter8.dsp      Windows MSVC project file

--- a/icu4c/source/samples/uciter8/uciter8.c
+++ b/icu4c/source/samples/uciter8/uciter8.c
@@ -16,684 +16,18 @@
 *   This file contains sample code that illustrates reading
 *   8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
 *   and also accepting single surrogates.
-*   UTF-8-style macros are defined as well as a UCharIterator.
-*   The macros are incomplete (do not assemble code points from pairs of
-*   surrogates, see comment below)
-*   but sufficient for the iterator.
 */

 #include <stdio.h>
 #include <string.h>
 #include "unicode/utypes.h"
 #include "unicode/uiter.h"
+#include "uit_len8.h"

 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

 #define log_err printf

-/* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
-
-/*
- * This code leniently reads 8-bit Unicode strings,
- * which could contain a mix of UTF-8 and CESU-8.
- * More precisely:
- * - supplementary code points may be encoded with dedicated 4-byte sequences
- *   (UTF-8 style)
- * - supplementary code points may be encoded with
- *   pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
- *   (CESU-8 style)
- * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
- *
- * Limitation:
- * Right now, the macros do not attempt to assemble code points from pairs of
- * separately encoded surrogates.
- * This would not be sufficient for processing based on these macros,
- * but it is sufficient for a UCharIterator that returns only UChars anyway.
- *
- * The code is copied and modified from utf_impl.c and utf8.h.
- * The "strict" argument in the implementation functions is completely removed,
- * using the "<0" branch from the original code.
- * Checks for surrogate code points are removed for the leniency
- * described above.
- */
-
-static const UChar32
-lenient8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
-static UChar32
-lenient8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c) {
-    int32_t i=*pi;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    if((i)+count<=(length)) {
-        uint8_t trail, illegal=0;
-
-        U8_MASK_LEAD_BYTE((c), count);
-        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-        switch(count) {
-        /* each branch falls through to the next one */
-        case 5:
-        case 4:
-            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-            illegal=1;
-            break;
-        case 3:
-            trail=s[(i)++];
-            (c)=((c)<<6)|(trail&0x3f);
-            if(c<0x110) {
-                illegal|=(trail&0xc0)^0x80;
-            } else {
-                /* code point>0x10ffff, outside Unicode */
-                illegal=1;
-                break;
-            }
-        case 2:
-            trail=s[(i)++];
-            (c)=((c)<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-        case 1:
-            trail=s[(i)++];
-            (c)=((c)<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-            break;
-        case 0:
-            return U_SENTINEL;
-        /* no default branch to optimize switch()  - all values are covered */
-        }
-
-        /* correct sequence - all trail bytes have (b7..b6)==(10)? */
-        /* illegal is also set if count>=4 */
-        if(illegal || (c)<lenient8_minLegal[count]) {
-            /* error handling */
-            uint8_t errorCount=count;
-            /* don't go beyond this sequence */
-            i=*pi;
-            while(count>0 && U8_IS_TRAIL(s[i])) {
-                ++(i);
-                --count;
-            }
-            c=U_SENTINEL;
-        }
-    } else /* too few bytes left */ {
-        /* error handling */
-        int32_t i0=i;
-        /* don't just set (i)=(length) in case there is an illegal sequence */
-        while((i)<(length) && U8_IS_TRAIL(s[i])) {
-            ++(i);
-        }
-        c=U_SENTINEL;
-    }
-    *pi=i;
-    return c;
-}
-
-static UChar32
-lenient8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c) {
-    int32_t i=*pi;
-    uint8_t b, count=1, shift=6;
-
-    /* extract value bits from the last trail byte */
-    c&=0x3f;
-
-    for(;;) {
-        if(i<=start) {
-            /* no lead byte at all */
-            return U_SENTINEL;
-        }
-
-        /* read another previous byte */
-        b=s[--i];
-        if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
-            if(b&0x40) {
-                /* lead byte, this will always end the loop */
-                uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
-
-                if(count==shouldCount) {
-                    /* set the new position */
-                    *pi=i;
-                    U8_MASK_LEAD_BYTE(b, count);
-                    c|=(UChar32)b<<shift;
-                    if(count>=4 || c>0x10ffff || c<lenient8_minLegal[count]) {
-                        /* illegal sequence */
-                        if(count>=4) {
-                            count=3;
-                        }
-                        c=U_SENTINEL;
-                    } else {
-                        /* exit with correct c */
-                    }
-                } else {
-                    /* the lead byte does not match the number of trail bytes */
-                    /* only set the position to the lead byte if it would
-                       include the trail byte that we started with */
-                    if(count<shouldCount) {
-                        *pi=i;
-                    }
-                    c=U_SENTINEL;
-                }
-                break;
-            } else if(count<5) {
-                /* trail byte */
-                c|=(UChar32)(b&0x3f)<<shift;
-                ++count;
-                shift+=6;
-            } else {
-                /* more than 5 trail bytes is illegal */
-                c=U_SENTINEL;
-                break;
-            }
-        } else {
-            /* single-byte character precedes trailing bytes */
-            c=U_SENTINEL;
-            break;
-        }
-    }
-    return c;
-}
-
-#define L8_NEXT(s, i, length, c) { \
-    (c)=(s)[(i)++]; \
-    if((c)>=0x80) { \
-        if(U8_IS_LEAD(c)) { \
-            (c)=lenient8_nextCharSafeBody(s, &(i), (int32_t)(length), c); \
-        } else { \
-            (c)=U_SENTINEL; \
-        } \
-    } \
-}
-
-#define L8_PREV(s, start, i, c) { \
-    (c)=(s)[--(i)]; \
-    if((c)>=0x80) { \
-        if((c)<=0xbf) { \
-            (c)=lenient8_prevCharSafeBody(s, start, &(i), c); \
-        } else { \
-            (c)=U_SENTINEL; \
-        } \
-    } \
-}
-
-/* lenient-8 UCharIterator -------------------------------------------------- */
-
-/*
- * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
- * except that it uses the lenient-8-bit-Unicode macros above.
- */
-
-/*
- * Minimal implementation:
- * Maintain a single-UChar buffer for an additional surrogate.
- * The caller must not modify start and limit because they are used internally.
- *
- * Use UCharIterator fields as follows:
- *   context        pointer to UTF-8 string
- *   length         UTF-16 length of the string; -1 until lazy evaluation
- *   start          current UTF-8 index
- *   index          current UTF-16 index; may be -1="unknown" after setState()
- *   limit          UTF-8 length of the string
- *   reservedField  supplementary code point
- *
- * Since UCharIterator delivers 16-bit code units, the iteration can be
- * currently in the middle of the byte sequence for a supplementary code point.
- * In this case, reservedField will contain that code point and start will
- * point to after the corresponding byte sequence. The UTF-16 index will be
- * one less than what it would otherwise be corresponding to the UTF-8 index.
- * Otherwise, reservedField will be 0.
- */
-
-/*
- * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
- * Add implementations that do not call strlen() for iteration but check for NUL.
- */
-
-static int32_t U_CALLCONV
-lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
-    switch(origin) {
-    case UITER_ZERO:
-    case UITER_START:
-        return 0;
-    case UITER_CURRENT:
-        if(iter->index<0) {
-            /* the current UTF-16 index is unknown after setState(), count from the beginning */
-            const uint8_t *s;
-            UChar32 c;
-            int32_t i, limit, index;
-
-            s=(const uint8_t *)iter->context;
-            i=index=0;
-            limit=iter->start; /* count up to the UTF-8 index */
-            while(i<limit) {
-                L8_NEXT(s, i, limit, c);
-                if(c<=0xffff) {
-                    ++index;
-                } else {
-                    index+=2;
-                }
-            }
-
-            iter->start=i; /* just in case setState() did not get us to a code point boundary */
-            if(i==iter->limit) {
-                iter->length=index; /* in case it was <0 or wrong */
-            }
-            if(iter->reservedField!=0) {
-                --index; /* we are in the middle of a supplementary code point */
-            }
-            iter->index=index;
-        }
-        return iter->index;
-    case UITER_LIMIT:
-    case UITER_LENGTH:
-        if(iter->length<0) {
-            const uint8_t *s;
-            UChar32 c;
-            int32_t i, limit, length;
-
-            s=(const uint8_t *)iter->context;
-            if(iter->index<0) {
-                /*
-                 * the current UTF-16 index is unknown after setState(),
-                 * we must first count from the beginning to here
-                 */
-                i=length=0;
-                limit=iter->start;
-
-                /* count from the beginning to the current index */
-                while(i<limit) {
-                    L8_NEXT(s, i, limit, c);
-                    if(c<=0xffff) {
-                        ++length;
-                    } else {
-                        length+=2;
-                    }
-                }
-
-                /* assume i==limit==iter->start, set the UTF-16 index */
-                iter->start=i; /* just in case setState() did not get us to a code point boundary */
-                iter->index= iter->reservedField!=0 ? length-1 : length;
-            } else {
-                i=iter->start;
-                length=iter->index;
-                if(iter->reservedField!=0) {
-                    ++length;
-                }
-            }
-
-            /* count from the current index to the end */
-            limit=iter->limit;
-            while(i<limit) {
-                L8_NEXT(s, i, limit, c);
-                if(c<=0xffff) {
-                    ++length;
-                } else {
-                    length+=2;
-                }
-            }
-            iter->length=length;
-        }
-        return iter->length;
-    default:
-        /* not a valid origin */
-        /* Should never get here! */
-        return -1;
-    }
-}
-
-static int32_t U_CALLCONV
-lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
-    const uint8_t *s;
-    UChar32 c;
-    int32_t pos; /* requested UTF-16 index */
-    int32_t i; /* UTF-8 index */
-    UBool havePos;
-
-    /* calculate the requested UTF-16 index */
-    switch(origin) {
-    case UITER_ZERO:
-    case UITER_START:
-        pos=delta;
-        havePos=TRUE;
-        /* iter->index<0 (unknown) is possible */
-        break;
-    case UITER_CURRENT:
-        if(iter->index>=0) {
-            pos=iter->index+delta;
-            havePos=TRUE;
-        } else {
-            /* the current UTF-16 index is unknown after setState(), use only delta */
-            pos=0;
-            havePos=FALSE;
-        }
-        break;
-    case UITER_LIMIT:
-    case UITER_LENGTH:
-        if(iter->length>=0) {
-            pos=iter->length+delta;
-            havePos=TRUE;
-        } else {
-            /* pin to the end, avoid counting the length */
-            iter->index=-1;
-            iter->start=iter->limit;
-            iter->reservedField=0;
-            if(delta>=0) {
-                return UITER_UNKNOWN_INDEX;
-            } else {
-                /* the current UTF-16 index is unknown, use only delta */
-                pos=0;
-                havePos=FALSE;
-            }
-        }
-        break;
-    default:
-        return -1;  /* Error */
-    }
-
-    if(havePos) {
-        /* shortcuts: pinning to the edges of the string */
-        if(pos<=0) {
-            iter->index=iter->start=iter->reservedField=0;
-            return 0;
-        } else if(iter->length>=0 && pos>=iter->length) {
-            iter->index=iter->length;
-            iter->start=iter->limit;
-            iter->reservedField=0;
-            return iter->index;
-        }
-
-        /* minimize the number of L8_NEXT/PREV operations */
-        if(iter->index<0 || pos<iter->index/2) {
-            /* go forward from the start instead of backward from the current index */
-            iter->index=iter->start=iter->reservedField=0;
-        } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
-            /*
-             * if we have the UTF-16 index and length and the new position is
-             * closer to the end than the current index,
-             * then go backward from the end instead of forward from the current index
-             */
-            iter->index=iter->length;
-            iter->start=iter->limit;
-            iter->reservedField=0;
-        }
-
-        delta=pos-iter->index;
-        if(delta==0) {
-            return iter->index; /* nothing to do */
-        }
-    } else {
-        /* move relative to unknown UTF-16 index */
-        if(delta==0) {
-            return UITER_UNKNOWN_INDEX; /* nothing to do */
-        } else if(-delta>=iter->start) {
-            /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
-            iter->index=iter->start=iter->reservedField=0;
-            return 0;
-        } else if(delta>=(iter->limit-iter->start)) {
-            /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
-            iter->index=iter->length; /* may or may not be <0 (unknown) */
-            iter->start=iter->limit;
-            iter->reservedField=0;
-            return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
-        }
-    }
-
-    /* delta!=0 */
-
-    /* move towards the requested position, pin to the edges of the string */
-    s=(const uint8_t *)iter->context;
-    pos=iter->index; /* could be <0 (unknown) */
-    i=iter->start;
-    if(delta>0) {
-        /* go forward */
-        int32_t limit=iter->limit;
-        if(iter->reservedField!=0) {
-            iter->reservedField=0;
-            ++pos;
-            --delta;
-        }
-        while(delta>0 && i<limit) {
-            L8_NEXT(s, i, limit, c);
-            if(c<0xffff) {
-                ++pos;
-                --delta;
-            } else if(delta>=2) {
-                pos+=2;
-                delta-=2;
-            } else /* delta==1 */ {
-                /* stop in the middle of a supplementary code point */
-                iter->reservedField=c;
-                ++pos;
-                break; /* delta=0; */
-            }
-        }
-        if(i==limit) {
-            if(iter->length<0 && iter->index>=0) {
-                iter->length= iter->reservedField==0 ? pos : pos+1;
-            } else if(iter->index<0 && iter->length>=0) {
-                iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
-            }
-        }
-    } else /* delta<0 */ {
-        /* go backward */
-        if(iter->reservedField!=0) {
-            iter->reservedField=0;
-            i-=4; /* we stayed behind the supplementary code point; go before it now */
-            --pos;
-            ++delta;
-        }
-        while(delta<0 && i>0) {
-            L8_PREV(s, 0, i, c);
-            if(c<0xffff) {
-                --pos;
-                ++delta;
-            } else if(delta<=-2) {
-                pos-=2;
-                delta+=2;
-            } else /* delta==-1 */ {
-                /* stop in the middle of a supplementary code point */
-                i+=4; /* back to behind this supplementary code point for consistent state */
-                iter->reservedField=c;
-                --pos;
-                break; /* delta=0; */
-            }
-        }
-    }
-
-    iter->start=i;
-    if(iter->index>=0) {
-        return iter->index=pos;
-    } else {
-        /* we started with index<0 (unknown) so pos is bogus */
-        if(i<=1) {
-            return iter->index=i; /* reached the beginning */
-        } else {
-            /* we still don't know the UTF-16 index */
-            return UITER_UNKNOWN_INDEX;
-        }
-    }
-}
-
-static UBool U_CALLCONV
-lenient8IteratorHasNext(UCharIterator *iter) {
-    return iter->reservedField!=0 || iter->start<iter->limit;
-}
-
-static UBool U_CALLCONV
-lenient8IteratorHasPrevious(UCharIterator *iter) {
-    return iter->start>0;
-}
-
-static UChar32 U_CALLCONV
-lenient8IteratorCurrent(UCharIterator *iter) {
-    if(iter->reservedField!=0) {
-        return U16_TRAIL(iter->reservedField);
-    } else if(iter->start<iter->limit) {
-        const uint8_t *s=(const uint8_t *)iter->context;
-        UChar32 c;
-        int32_t i=iter->start;
-
-        L8_NEXT(s, i, iter->limit, c);
-        if(c<0) {
-            return 0xfffd;
-        } else if(c<=0xffff) {
-            return c;
-        } else {
-            return U16_LEAD(c);
-        }
-    } else {
-        return U_SENTINEL;
-    }
-}
-
-static UChar32 U_CALLCONV
-lenient8IteratorNext(UCharIterator *iter) {
-    int32_t index;
-
-    if(iter->reservedField!=0) {
-        UChar trail=U16_TRAIL(iter->reservedField);
-        iter->reservedField=0;
-        if((index=iter->index)>=0) {
-            iter->index=index+1;
-        }
-        return trail;
-    } else if(iter->start<iter->limit) {
-        const uint8_t *s=(const uint8_t *)iter->context;
-        UChar32 c;
-
-        L8_NEXT(s, iter->start, iter->limit, c);
-        if((index=iter->index)>=0) {
-            iter->index=++index;
-            if(iter->length<0 && iter->start==iter->limit) {
-                iter->length= c<=0xffff ? index : index+1;
-            }
-        } else if(iter->start==iter->limit && iter->length>=0) {
-            iter->index= c<=0xffff ? iter->length : iter->length-1;
-        }
-        if(c<0) {
-            return 0xfffd;
-        } else if(c<=0xffff) {
-            return c;
-        } else {
-            iter->reservedField=c;
-            return U16_LEAD(c);
-        }
-    } else {
-        return U_SENTINEL;
-    }
-}
-
-static UChar32 U_CALLCONV
-lenient8IteratorPrevious(UCharIterator *iter) {
-    int32_t index;
-
-    if(iter->reservedField!=0) {
-        UChar lead=U16_LEAD(iter->reservedField);
-        iter->reservedField=0;
-        iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
-        if((index=iter->index)>0) {
-            iter->index=index-1;
-        }
-        return lead;
-    } else if(iter->start>0) {
-        const uint8_t *s=(const uint8_t *)iter->context;
-        UChar32 c;
-
-        L8_PREV(s, 0, iter->start, c);
-        if((index=iter->index)>0) {
-            iter->index=index-1;
-        } else if(iter->start<=1) {
-            iter->index= c<=0xffff ? iter->start : iter->start+1;
-        }
-        if(c<0) {
-            return 0xfffd;
-        } else if(c<=0xffff) {
-            return c;
-        } else {
-            iter->start+=4; /* back to behind this supplementary code point for consistent state */
-            iter->reservedField=c;
-            return U16_TRAIL(c);
-        }
-    } else {
-        return U_SENTINEL;
-    }
-}
-
-static uint32_t U_CALLCONV
-lenient8IteratorGetState(const UCharIterator *iter) {
-    uint32_t state=(uint32_t)(iter->start<<1);
-    if(iter->reservedField!=0) {
-        state|=1;
-    }
-    return state;
-}
-
-static void U_CALLCONV
-lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
-        /* do nothing */
-    } else if(iter==NULL) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
-    } else if(state==lenient8IteratorGetState(iter)) {
-        /* setting to the current state: no-op */
-    } else {
-        int32_t index=(int32_t)(state>>1); /* UTF-8 index */
-        state&=1; /* 1 if in surrogate pair, must be index>=4 */
-
-        if((state==0 ? index<0 : index<4) || iter->limit<index) {
-            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-        } else {
-            iter->start=index; /* restore UTF-8 byte index */
-            if(index<=1) {
-                iter->index=index;
-            } else {
-                iter->index=-1; /* unknown UTF-16 index */
-            }
-            if(state==0) {
-                iter->reservedField=0;
-            } else {
-                /* verified index>=4 above */
-                UChar32 c;
-                L8_PREV((const uint8_t *)iter->context, 0, index, c);
-                if(c<=0xffff) {
-                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-                } else {
-                    iter->reservedField=c;
-                }
-            }
-        }
-    }
-}
-
-static const UCharIterator lenient8Iterator={
-    0, 0, 0, 0, 0, 0,
-    lenient8IteratorGetIndex,
-    lenient8IteratorMove,
-    lenient8IteratorHasNext,
-    lenient8IteratorHasPrevious,
-    lenient8IteratorCurrent,
-    lenient8IteratorNext,
-    lenient8IteratorPrevious,
-    NULL,
-    lenient8IteratorGetState,
-    lenient8IteratorSetState
-};
-
-U_CAPI void U_EXPORT2
-uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
-    if(iter!=0) {
-        if(s!=0 && length>=-1) {
-            *iter=lenient8Iterator;
-            iter->context=s;
-            if(length>=0) {
-                iter->limit=length;
-            } else {
-                iter->limit=strlen(s);
-            }
-            iter->length= iter->limit<=1 ? iter->limit : -1;
-        } else {
-            /* set no-op iterator */
-            uiter_setString(iter, NULL, 0);
-        }
-    }
-}
-
 /* UCharIterator test ------------------------------------------------------- */

 /*
--- a/icu4c/source/samples/uciter8/uciter8.dsp
+++ b/icu4c/source/samples/uciter8/uciter8.dsp
@@ -89,10 +89,18 @@ LINK32=link.exe

 SOURCE=.\uciter8.c
 # End Source File
+# Begin Source File
+
+SOURCE=.\uit_len8.c
+# End Source File
 # End Group
 # Begin Group "Header Files"

 # PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\uit_len8.h
+# End Source File
 # End Group
 # Begin Group "Resource Files"

--- a/icu4c/source/samples/uciter8/uit_len8.c
+++ b/icu4c/source/samples/uciter8/uit_len8.c
@@ -0,0 +1,689 @@
+/*
+*******************************************************************************
+*
+*   Copyright (C) 2003, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  uit_len8.c
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2003feb10
+*   created by: Markus W. Scherer
+*
+*   This file contains the implementation of the "lenient UTF-8" UCharIterator
+*   as used in the uciter8 sample code.
+*   UTF-8-style macros are defined as well as the UCharIterator.
+*   The macros are incomplete (do not assemble code points from pairs of
+*   surrogates, see comment below)
+*   but sufficient for the iterator.
+*/
+
+#include <string.h>
+#include "unicode/utypes.h"
+#include "unicode/uiter.h"
+
+/* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
+
+/*
+ * This code leniently reads 8-bit Unicode strings,
+ * which could contain a mix of UTF-8 and CESU-8.
+ * More precisely:
+ * - supplementary code points may be encoded with dedicated 4-byte sequences
+ *   (UTF-8 style)
+ * - supplementary code points may be encoded with
+ *   pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
+ *   (CESU-8 style)
+ * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
+ *
+ * Limitation:
+ * Right now, the macros do not attempt to assemble code points from pairs of
+ * separately encoded surrogates.
+ * That is insufficient for processing whole code points with these macros,
+ * but sufficient for a UCharIterator, which returns only UChars anyway.
+ *
+ * The code is copied and modified from utf_impl.c and utf8.h.
+ * The "strict" argument in the implementation functions is completely removed,
+ * using the "<0" branch from the original code.
+ * Checks for surrogate code points are removed for the leniency
+ * described above.
+ */
+
+static const UChar32
+lenient8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
+
+static UChar32
+lenient8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c) {
+    int32_t i=*pi;
+    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
+    if((i)+count<=(length)) {
+        uint8_t trail, illegal=0;
+
+        U8_MASK_LEAD_BYTE((c), count);
+        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
+        switch(count) {
+        /* cases 3, 2 and 1 fall through to read the remaining trail bytes */
+        case 5:
+        case 4:
+            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
+            illegal=1;
+            break;
+        case 3:
+            trail=s[(i)++];
+            (c)=((c)<<6)|(trail&0x3f);
+            if(c<0x110) {
+                illegal|=(trail&0xc0)^0x80;
+            } else {
+                /* code point>0x10ffff, outside Unicode */
+                illegal=1;
+                break;
+            }
+        case 2:
+            trail=s[(i)++];
+            (c)=((c)<<6)|(trail&0x3f);
+            illegal|=(trail&0xc0)^0x80;
+        case 1:
+            trail=s[(i)++];
+            (c)=((c)<<6)|(trail&0x3f);
+            illegal|=(trail&0xc0)^0x80;
+            break;
+        case 0:
+            return U_SENTINEL;
+        /* no default branch to optimize switch()  - all values are covered */
+        }
+
+        /* correct sequence - all trail bytes have (b7..b6)==(10)? */
+        /* illegal is also set if count>=4 */
+        if(illegal || (c)<lenient8_minLegal[count]) {
+            /* error handling */
+            uint8_t errorCount=count;
+            /* don't go beyond this sequence */
+            i=*pi;
+            while(count>0 && U8_IS_TRAIL(s[i])) {
+                ++(i);
+                --count;
+            }
+            c=U_SENTINEL;
+        }
+    } else /* too few bytes left */ {
+        /* error handling */
+        int32_t i0=i;
+        /* don't just set (i)=(length) in case there is an illegal sequence */
+        while((i)<(length) && U8_IS_TRAIL(s[i])) {
+            ++(i);
+        }
+        c=U_SENTINEL;
+    }
+    *pi=i;
+    return c;
+}
+
+static UChar32
+lenient8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c) {
+    int32_t i=*pi;
+    uint8_t b, count=1, shift=6;
+
+    /* extract value bits from the last trail byte */
+    c&=0x3f;
+
+    for(;;) {
+        if(i<=start) {
+            /* no lead byte at all */
+            return U_SENTINEL;
+        }
+
+        /* read another previous byte */
+        b=s[--i];
+        if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
+            if(b&0x40) {
+                /* lead byte, this will always end the loop */
+                uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
+
+                if(count==shouldCount) {
+                    /* set the new position */
+                    *pi=i;
+                    U8_MASK_LEAD_BYTE(b, count);
+                    c|=(UChar32)b<<shift;
+                    if(count>=4 || c>0x10ffff || c<lenient8_minLegal[count]) {
+                        /* illegal sequence */
+                        if(count>=4) {
+                            count=3;
+                        }
+                        c=U_SENTINEL;
+                    } else {
+                        /* exit with correct c */
+                    }
+                } else {
+                    /* the lead byte does not match the number of trail bytes */
+                    /* only set the position to the lead byte if it would
+                       include the trail byte that we started with */
+                    if(count<shouldCount) {
+                        *pi=i;
+                    }
+                    c=U_SENTINEL;
+                }
+                break;
+            } else if(count<5) {
+                /* trail byte */
+                c|=(UChar32)(b&0x3f)<<shift;
+                ++count;
+                shift+=6;
+            } else {
+                /* more than 5 trail bytes is illegal */
+                c=U_SENTINEL;
+                break;
+            }
+        } else {
+            /* single-byte character precedes trailing bytes */
+            c=U_SENTINEL;
+            break;
+        }
+    }
+    return c;
+}
+
+#define L8_NEXT(s, i, length, c) { \
+    (c)=(s)[(i)++]; \
+    if((c)>=0x80) { \
+        if(U8_IS_LEAD(c)) { \
+            (c)=lenient8_nextCharSafeBody(s, &(i), (int32_t)(length), c); \
+        } else { \
+            (c)=U_SENTINEL; \
+        } \
+    } \
+}
+
+#define L8_PREV(s, start, i, c) { \
+    (c)=(s)[--(i)]; \
+    if((c)>=0x80) { \
+        if((c)<=0xbf) { \
+            (c)=lenient8_prevCharSafeBody(s, start, &(i), c); \
+        } else { \
+            (c)=U_SENTINEL; \
+        } \
+    } \
+}
+
+/* lenient-8 UCharIterator -------------------------------------------------- */
+
+/*
+ * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
+ * except that it uses the lenient-8-bit-Unicode macros above.
+ */
+
+/*
+ * Minimal implementation:
+ * Maintain a single-UChar buffer for an additional surrogate.
+ * The caller must not modify start and limit because they are used internally.
+ *
+ * Use UCharIterator fields as follows:
+ *   context        pointer to UTF-8 string
+ *   length         UTF-16 length of the string; -1 until lazy evaluation
+ *   start          current UTF-8 index
+ *   index          current UTF-16 index; may be -1="unknown" after setState()
+ *   limit          UTF-8 length of the string
+ *   reservedField  supplementary code point
+ *
+ * Since UCharIterator delivers 16-bit code units, the iteration can be
+ * currently in the middle of the byte sequence for a supplementary code point.
+ * In this case, reservedField will contain that code point and start will
+ * point to after the corresponding byte sequence. The UTF-16 index will be
+ * one less than what it would otherwise be corresponding to the UTF-8 index.
+ * Otherwise, reservedField will be 0.
+ */
+
+/*
+ * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
+ * Add implementations that do not call strlen() for iteration but check for NUL.
+ */
+
+/*
+ * getIndex() implementation for the lenient-8 UCharIterator.
+ *
+ * Returns the UTF-16 index that corresponds to the requested origin:
+ *   UITER_ZERO, UITER_START     always 0
+ *   UITER_CURRENT               the current UTF-16 index
+ *   UITER_LIMIT, UITER_LENGTH   the UTF-16 length of the whole string
+ * Any other origin value yields -1.
+ *
+ * Unknown values (iter->index<0 after setState(), iter->length<0 before the
+ * first evaluation) are computed by decoding the bytes with L8_NEXT and are
+ * then cached in the iterator, so this function may modify
+ * iter->index, iter->length and iter->start.
+ */
+static int32_t U_CALLCONV
+lenient8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
+    const uint8_t *text;
+    UChar32 cp;
+    int32_t bytePos, byteEnd, units;
+
+    if(origin==UITER_ZERO || origin==UITER_START) {
+        return 0;
+    }
+
+    if(origin==UITER_CURRENT) {
+        if(iter->index>=0) {
+            return iter->index; /* already known */
+        }
+
+        /* unknown after setState(): count UChars from byte 0 up to the UTF-8 index */
+        text=(const uint8_t *)iter->context;
+        bytePos=0;
+        units=0;
+        byteEnd=iter->start;
+        while(bytePos<byteEnd) {
+            L8_NEXT(text, bytePos, byteEnd, cp);
+            units+= cp<=0xffff ? 1 : 2;
+        }
+
+        /* setState() may not have put us on a code point boundary */
+        iter->start=bytePos;
+        if(bytePos==iter->limit) {
+            /* we counted the whole string, so the length is known as well */
+            iter->length=units;
+        }
+        if(iter->reservedField!=0) {
+            /* between the two surrogates of a supplementary code point */
+            --units;
+        }
+        iter->index=units;
+        return iter->index;
+    }
+
+    if(origin==UITER_LIMIT || origin==UITER_LENGTH) {
+        if(iter->length>=0) {
+            return iter->length; /* already known */
+        }
+
+        text=(const uint8_t *)iter->context;
+        if(iter->index>=0) {
+            /* start counting at the current position */
+            bytePos=iter->start;
+            units=iter->index;
+            if(iter->reservedField!=0) {
+                /* include the pending trail surrogate */
+                ++units;
+            }
+        } else {
+            /*
+             * Neither the index nor the length is known:
+             * count from the beginning up to the current UTF-8 index first
+             * and record the UTF-16 index on the way.
+             */
+            bytePos=0;
+            units=0;
+            byteEnd=iter->start;
+            while(bytePos<byteEnd) {
+                L8_NEXT(text, bytePos, byteEnd, cp);
+                units+= cp<=0xffff ? 1 : 2;
+            }
+
+            /* setState() may not have put us on a code point boundary */
+            iter->start=bytePos;
+            iter->index= iter->reservedField!=0 ? units-1 : units;
+        }
+
+        /* continue counting from the current position to the end of the string */
+        byteEnd=iter->limit;
+        while(bytePos<byteEnd) {
+            L8_NEXT(text, bytePos, byteEnd, cp);
+            units+= cp<=0xffff ? 1 : 2;
+        }
+        iter->length=units;
+        return iter->length;
+    }
+
+    /* not a valid origin; should never get here */
+    return -1;
+}
+
+/*
+ * move() implementation for the lenient-8 UCharIterator.
+ *
+ * Moves the iterator to the UTF-16 index (origin + delta), pinned to the
+ * beginning and end of the string.
+ *
+ * @param iter   the iterator; start, index, length and reservedField
+ *               may all be modified
+ * @param delta  signed offset in UTF-16 code units relative to origin
+ * @param origin UITER_ZERO/UITER_START, UITER_CURRENT, or
+ *               UITER_LIMIT/UITER_LENGTH
+ * @return the new UTF-16 index;
+ *         UITER_UNKNOWN_INDEX if the UTF-16 index is not known after the move
+ *         (the iterator was positioned with setState() or relative to an
+ *         uncounted end of the string);
+ *         -1 if origin is not a valid value
+ *
+ * The iteration can stop between the two surrogates of a supplementary
+ * code point. In that case reservedField holds the code point and
+ * start stays behind its byte sequence.
+ */
+static int32_t U_CALLCONV
+lenient8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
+    const uint8_t *s;
+    UChar32 c;
+    int32_t pos; /* requested UTF-16 index */
+    int32_t i; /* UTF-8 index */
+    UBool havePos;
+
+    /* calculate the requested UTF-16 index */
+    switch(origin) {
+    case UITER_ZERO:
+    case UITER_START:
+        pos=delta;
+        havePos=TRUE;
+        /* iter->index<0 (unknown) is possible */
+        break;
+    case UITER_CURRENT:
+        if(iter->index>=0) {
+            pos=iter->index+delta;
+            havePos=TRUE;
+        } else {
+            /* the current UTF-16 index is unknown after setState(), use only delta */
+            pos=0;
+            havePos=FALSE;
+        }
+        break;
+    case UITER_LIMIT:
+    case UITER_LENGTH:
+        if(iter->length>=0) {
+            pos=iter->length+delta;
+            havePos=TRUE;
+        } else {
+            /* pin to the end, avoid counting the length */
+            iter->index=-1;
+            iter->start=iter->limit;
+            iter->reservedField=0;
+            if(delta>=0) {
+                return UITER_UNKNOWN_INDEX;
+            } else {
+                /* the current UTF-16 index is unknown, use only delta */
+                pos=0;
+                havePos=FALSE;
+            }
+        }
+        break;
+    default:
+        return -1;  /* Error */
+    }
+
+    if(havePos) {
+        /* shortcuts: pinning to the edges of the string */
+        if(pos<=0) {
+            iter->index=iter->start=iter->reservedField=0;
+            return 0;
+        } else if(iter->length>=0 && pos>=iter->length) {
+            iter->index=iter->length;
+            iter->start=iter->limit;
+            iter->reservedField=0;
+            return iter->index;
+        }
+
+        /* minimize the number of L8_NEXT/PREV operations */
+        if(iter->index<0 || pos<iter->index/2) {
+            /* go forward from the start instead of backward from the current index */
+            iter->index=iter->start=iter->reservedField=0;
+        } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
+            /*
+             * if we have the UTF-16 index and length and the new position is
+             * closer to the end than the current index,
+             * then go backward from the end instead of forward from the current index
+             */
+            iter->index=iter->length;
+            iter->start=iter->limit;
+            iter->reservedField=0;
+        }
+
+        /* from here on, delta is relative to the (possibly reset) current index */
+        delta=pos-iter->index;
+        if(delta==0) {
+            return iter->index; /* nothing to do */
+        }
+    } else {
+        /* move relative to unknown UTF-16 index */
+        if(delta==0) {
+            return UITER_UNKNOWN_INDEX; /* nothing to do */
+        } else if(-delta>=iter->start) {
+            /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
+            iter->index=iter->start=iter->reservedField=0;
+            return 0;
+        } else if(delta>=(iter->limit-iter->start)) {
+            /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
+            iter->index=iter->length; /* may or may not be <0 (unknown) */
+            iter->start=iter->limit;
+            iter->reservedField=0;
+            return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
+        }
+    }
+
+    /* delta!=0 */
+
+    /* move towards the requested position, pin to the edges of the string */
+    s=(const uint8_t *)iter->context;
+    pos=iter->index; /* could be <0 (unknown) */
+    i=iter->start;
+    if(delta>0) {
+        /* go forward */
+        int32_t limit=iter->limit;
+        if(iter->reservedField!=0) {
+            /* step over the pending trail surrogate first */
+            iter->reservedField=0;
+            ++pos;
+            --delta;
+        }
+        while(delta>0 && i<limit) {
+            L8_NEXT(s, i, limit, c);
+            /*
+             * <= (not <): U+FFFF is a BMP code point and occupies a single UChar,
+             * consistent with the counting in the other iterator functions;
+             * error results (c<0) also count as one UChar
+             */
+            if(c<=0xffff) {
+                ++pos;
+                --delta;
+            } else if(delta>=2) {
+                pos+=2;
+                delta-=2;
+            } else /* delta==1 */ {
+                /* stop in the middle of a supplementary code point */
+                iter->reservedField=c;
+                ++pos;
+                break; /* delta=0; */
+            }
+        }
+        if(i==limit) {
+            /* reached the end: derive whichever of length/index was unknown from the other */
+            if(iter->length<0 && iter->index>=0) {
+                iter->length= iter->reservedField==0 ? pos : pos+1;
+            } else if(iter->index<0 && iter->length>=0) {
+                iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
+            }
+        }
+    } else /* delta<0 */ {
+        /* go backward */
+        if(iter->reservedField!=0) {
+            iter->reservedField=0;
+            i-=4; /* we stayed behind the supplementary code point; go before it now */
+            --pos;
+            ++delta;
+        }
+        while(delta<0 && i>0) {
+            L8_PREV(s, 0, i, c);
+            /* <= (not <): U+FFFF occupies a single UChar, see the forward loop */
+            if(c<=0xffff) {
+                --pos;
+                ++delta;
+            } else if(delta<=-2) {
+                pos-=2;
+                delta+=2;
+            } else /* delta==-1 */ {
+                /* stop in the middle of a supplementary code point */
+                i+=4; /* back to behind this supplementary code point for consistent state */
+                iter->reservedField=c;
+                --pos;
+                break; /* delta=0; */
+            }
+        }
+    }
+
+    iter->start=i;
+    if(iter->index>=0) {
+        return iter->index=pos;
+    } else {
+        /* we started with index<0 (unknown) so pos is bogus */
+        if(i<=1) {
+            return iter->index=i; /* reached the beginning */
+        } else {
+            /* we still don't know the UTF-16 index */
+            return UITER_UNKNOWN_INDEX;
+        }
+    }
+}
+
+/*
+ * hasNext() implementation: there is another UChar to deliver if a trail
+ * surrogate is pending in reservedField or if bytes remain before limit.
+ */
+static UBool U_CALLCONV
+lenient8IteratorHasNext(UCharIterator *iter) {
+    if(iter->reservedField!=0) {
+        /* the second half of a supplementary code point is still pending */
+        return 1;
+    }
+    return iter->limit>iter->start;
+}
+
+/*
+ * hasPrevious() implementation: true whenever the UTF-8 index is past byte 0.
+ * (While in the middle of a supplementary code point, start is behind its
+ * byte sequence and therefore also >0.)
+ */
+static UBool U_CALLCONV
+lenient8IteratorHasPrevious(UCharIterator *iter) {
+    return 0<iter->start;
+}
+
+/*
+ * current() implementation: return the UChar at the current position
+ * without moving the iterator.
+ *
+ * Returns the pending trail surrogate if the iterator is in the middle of a
+ * supplementary code point, U_SENTINEL at the end of the string,
+ * 0xfffd if the bytes at the current position do not decode,
+ * and otherwise the BMP code point or the lead surrogate of a
+ * supplementary code point.
+ */
+static UChar32 U_CALLCONV
+lenient8IteratorCurrent(UCharIterator *iter) {
+    const uint8_t *text;
+    UChar32 cp;
+    int32_t bytePos;
+
+    if(iter->reservedField!=0) {
+        return U16_TRAIL(iter->reservedField);
+    }
+    if(iter->start>=iter->limit) {
+        return U_SENTINEL;
+    }
+
+    /* decode with a local copy of the index so that the iterator stays put */
+    text=(const uint8_t *)iter->context;
+    bytePos=iter->start;
+    L8_NEXT(text, bytePos, iter->limit, cp);
+
+    if(cp<0) {
+        return 0xfffd;
+    }
+    if(cp>0xffff) {
+        return U16_LEAD(cp);
+    }
+    return cp;
+}
+
+/*
+ * next() implementation: return the UChar at the current position and
+ * move the iterator forward by one UChar.
+ *
+ * A supplementary code point is delivered in two calls: the first returns the
+ * lead surrogate and parks the code point in reservedField (start already
+ * points behind the byte sequence), the second returns the trail surrogate.
+ * Undecodable bytes are returned as 0xfffd; U_SENTINEL signals the end.
+ * The UTF-16 index is only updated while it is known (>=0); the length and
+ * index are filled in opportunistically when the end of the string is reached.
+ */
+static UChar32 U_CALLCONV
+lenient8IteratorNext(UCharIterator *iter) {
+    const uint8_t *text;
+    UChar32 cp;
+    UBool isBMP, atEnd;
+
+    if(iter->reservedField!=0) {
+        /* second half of a supplementary code point */
+        UChar trail=U16_TRAIL(iter->reservedField);
+        iter->reservedField=0;
+        if(iter->index>=0) {
+            ++iter->index;
+        }
+        return trail;
+    }
+    if(iter->start>=iter->limit) {
+        return U_SENTINEL;
+    }
+
+    text=(const uint8_t *)iter->context;
+    L8_NEXT(text, iter->start, iter->limit, cp);
+    isBMP= cp<=0xffff; /* also true for error results (cp<0) */
+    atEnd= iter->start==iter->limit;
+
+    if(iter->index>=0) {
+        ++iter->index;
+        if(atEnd && iter->length<0) {
+            /* the length becomes known; a pending trail surrogate counts too */
+            iter->length= isBMP ? iter->index : iter->index+1;
+        }
+    } else if(atEnd && iter->length>=0) {
+        /* the index was unknown but can be derived from the known length */
+        iter->index= isBMP ? iter->length : iter->length-1;
+    }
+
+    if(cp<0) {
+        return 0xfffd;
+    }
+    if(isBMP) {
+        return cp;
+    }
+    iter->reservedField=cp;
+    return U16_LEAD(cp);
+}
+
+/*
+ * previous() implementation: move the iterator backward by one UChar and
+ * return that UChar.
+ *
+ * A supplementary code point is delivered in two calls: the first returns the
+ * trail surrogate, parks the code point in reservedField and leaves start
+ * behind the byte sequence; the second returns the lead surrogate and moves
+ * start in front of it.
+ * Undecodable bytes are returned as 0xfffd; U_SENTINEL signals the beginning.
+ */
+static UChar32 U_CALLCONV
+lenient8IteratorPrevious(UCharIterator *iter) {
+    const uint8_t *text;
+    UChar32 cp;
+
+    if(iter->reservedField!=0) {
+        /* first half of a supplementary code point */
+        UChar lead=U16_LEAD(iter->reservedField);
+        iter->reservedField=0;
+        iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
+        if(iter->index>0) {
+            --iter->index;
+        }
+        return lead;
+    }
+    if(iter->start<=0) {
+        return U_SENTINEL;
+    }
+
+    text=(const uint8_t *)iter->context;
+    L8_PREV(text, 0, iter->start, cp);
+
+    if(iter->index>0) {
+        --iter->index;
+    } else if(iter->start<=1) {
+        /* close enough to the beginning that the UTF-16 index is known again */
+        iter->index= cp<=0xffff ? iter->start : iter->start+1;
+    }
+
+    if(cp<0) {
+        return 0xfffd;
+    }
+    if(cp<=0xffff) {
+        return cp;
+    }
+    iter->start+=4; /* back to behind this supplementary code point for consistent state */
+    iter->reservedField=cp;
+    return U16_TRAIL(cp);
+}
+
+/*
+ * getState() implementation: pack the iterator position into 32 bits.
+ *
+ * Bits 31..1 hold the UTF-8 byte index (iter->start);
+ * bit 0 is set if the iterator is in the middle of a supplementary
+ * code point (reservedField!=0).
+ */
+static uint32_t U_CALLCONV
+lenient8IteratorGetState(const UCharIterator *iter) {
+    /*
+     * Convert to unsigned before shifting: left-shifting a signed int32_t
+     * is undefined once the result no longer fits (start>=0x40000000).
+     */
+    uint32_t state=(uint32_t)iter->start<<1;
+    if(iter->reservedField!=0) {
+        state|=1;
+    }
+    return state;
+}
+
+/*
+ * setState() implementation: restore a position that was packed by
+ * lenient8IteratorGetState() (byte index in bits 31..1, mid-pair flag in bit 0).
+ *
+ * Does nothing if pErrorCode is NULL or already indicates a failure, or if
+ * state equals the current state.
+ * Sets U_ILLEGAL_ARGUMENT_ERROR for iter==NULL and
+ * U_INDEX_OUTOFBOUNDS_ERROR if the byte index is out of range or if the
+ * mid-pair flag is set but no supplementary code point ends at the byte index.
+ * In the latter case start and index have already been overwritten.
+ *
+ * After a successful call the UTF-16 index is unknown (-1) unless the byte
+ * index is 0 or 1.
+ */
+static void U_CALLCONV
+lenient8IteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
+    int32_t bytePos; /* UTF-8 index */
+    UBool midPair;   /* TRUE if between the surrogates of a supplementary code point */
+    UChar32 cp;
+
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return;
+    }
+    if(iter==NULL) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    if(state==lenient8IteratorGetState(iter)) {
+        return; /* setting to the current state: no-op */
+    }
+
+    bytePos=(int32_t)(state>>1);
+    midPair=(UBool)(state&1);
+
+    /* a supplementary code point needs at least 4 bytes before the index */
+    if(bytePos<(midPair ? 4 : 0) || iter->limit<bytePos) {
+        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return;
+    }
+
+    iter->start=bytePos; /* restore UTF-8 byte index */
+    iter->index= bytePos<=1 ? bytePos : -1; /* -1: unknown UTF-16 index */
+
+    if(!midPair) {
+        iter->reservedField=0;
+        return;
+    }
+
+    /* re-read the code point that ends at the byte index; only the local copy moves */
+    L8_PREV((const uint8_t *)iter->context, 0, bytePos, cp);
+    if(cp<=0xffff) {
+        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+    } else {
+        iter->reservedField=cp;
+    }
+}
+
+/*
+ * Template for a lenient-8 UCharIterator.
+ * uiter_setLenient8() copies this struct into the caller's iterator and then
+ * fills in context, limit and length; all six leading data fields start as 0.
+ * The remaining members are the function pointers implemented above.
+ * NOTE(review): the NULL entry presumably fills the reserved function slot of
+ * UCharIterator; the initializer is positional, so confirm the order against
+ * unicode/uiter.h before changing it.
+ */
+static const UCharIterator lenient8Iterator={
+    0, 0, 0, 0, 0, 0,
+    lenient8IteratorGetIndex,
+    lenient8IteratorMove,
+    lenient8IteratorHasNext,
+    lenient8IteratorHasPrevious,
+    lenient8IteratorCurrent,
+    lenient8IteratorNext,
+    lenient8IteratorPrevious,
+    NULL,
+    lenient8IteratorGetState,
+    lenient8IteratorSetState
+};
+
+/*
+ * Set up iter to iterate leniently over the 8-bit Unicode string s.
+ *
+ * iter    iterator structure to initialize; nothing happens if it is NULL
+ * s       the bytes to iterate over; they are not copied, the iterator keeps
+ *         the pointer in its context field
+ * length  number of bytes in s, or -1 if s is NUL-terminated
+ *
+ * If s is NULL or length is less than -1, iter is set to a no-op iterator
+ * via uiter_setString(iter, NULL, 0).
+ */
+U_CAPI void U_EXPORT2
+uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length) {
+    if(iter==NULL) {
+        return;
+    }
+    if(s==NULL || length<-1) {
+        /* set no-op iterator */
+        uiter_setString(iter, NULL, 0);
+        return;
+    }
+
+    *iter=lenient8Iterator;
+    iter->context=s;
+    iter->limit= length>=0 ? length : (int32_t)strlen(s);
+
+    /* with at most one byte the UTF-16 length equals the byte count; otherwise evaluate lazily */
+    iter->length= iter->limit<=1 ? iter->limit : -1;
+}
--- a/icu4c/source/samples/uciter8/uit_len8.h
+++ b/icu4c/source/samples/uciter8/uit_len8.h
@ -0,0 +1,29 @@
+/*
+*******************************************************************************
+*
+*   Copyright (C) 2003, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  uit_len8.h
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2003feb10
+*   created by: Markus W. Scherer
+*
+*   This file contains the declaration for a "lenient UTF-8" UCharIterator
+*   as used in the uciter8 sample code.
+*/
+
+#ifndef __UIT_LEN8_H__
+#define __UIT_LEN8_H__
+
+#include "unicode/utypes.h"
+#include "unicode/uiter.h"
+
+/**
+ * Set up a UCharIterator to iterate over an 8-bit Unicode string leniently,
+ * accepting a mix of UTF-8 and CESU-8.
+ * The resulting iterator does not conform to any one Unicode charset
+ * specification, and its use may lead to security risks.
+ *
+ * The string is not copied; it must remain valid while the iterator is used.
+ *
+ * @param iter   UCharIterator structure to be set; nothing is done if NULL
+ * @param s      the string bytes to iterate over; if NULL, iter is set to
+ *               a no-op iterator
+ * @param length number of bytes in s, or -1 if s is NUL-terminated;
+ *               a value below -1 results in a no-op iterator
+ */
+U_CAPI void U_EXPORT2
+uiter_setLenient8(UCharIterator *iter, const char *s, int32_t length);
+
+#endif