ICU-9662 add & test & use U8_GET_OR_FFFD(), U8_NEXT_OR_FFFD(), U8_PREV_OR_FFFD()

X-SVN-Rev: 32796
2012-11-11 06:14:18 +00:00 · 2012-11-11 06:14:18 +00:00 · bc31ae8173
commit bc31ae8173
parent cb4157921b
10 changed files with 297 additions and 413 deletions
--- a/icu4c/source/common/bmpset.cpp
+++ b/icu4c/source/common/bmpset.cpp
@ -690,16 +690,9 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon

        int32_t prev=length;
        UChar32 c;
-        if(b<0xc0) {
-            // trail byte: collect a multi-byte character
-            c=utf8_prevCharSafeBody(s, 0, &length, b, -1);
-            if(c<0) {
-                c=0xfffd;
-            }
-        } else {
-            // lead byte in last-trail position
-            c=0xfffd;
-        }
+        // trail byte: collect a multi-byte character
+        // (or lead byte in last-trail position)
+        c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
        // c is a valid code point, not ASCII, not a surrogate
        if(c<=0x7ff) {
            if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
--- a/icu4c/source/common/uiter.cpp
+++ b/icu4c/source/common/uiter.cpp
@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2002-2011, International Business Machines
+*   Copyright (C) 2002-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@ -600,12 +600,8 @@ utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
            i=index=0;
            limit=iter->start; /* count up to the UTF-8 index */
            while(i<limit) {
-                U8_NEXT(s, i, limit, c);
-                if(c<=0xffff) {
-                    ++index;
-                } else {
-                    index+=2;
-                }
+                U8_NEXT_OR_FFFD(s, i, limit, c);
+                index+=U16_LENGTH(c);
            }

            iter->start=i; /* just in case setState() did not get us to a code point boundary */
@ -636,12 +632,8 @@ utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {

                /* count from the beginning to the current index */
                while(i<limit) {
-                    U8_NEXT(s, i, limit, c);
-                    if(c<=0xffff) {
-                        ++length;
-                    } else {
-                        length+=2;
-                    }
+                    U8_NEXT_OR_FFFD(s, i, limit, c);
+                    length+=U16_LENGTH(c);
                }

                /* assume i==limit==iter->start, set the UTF-16 index */
@ -658,12 +650,8 @@ utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
            /* count from the current index to the end */
            limit=iter->limit;
            while(i<limit) {
-                U8_NEXT(s, i, limit, c);
-                if(c<=0xffff) {
-                    ++length;
-                } else {
-                    length+=2;
-                }
+                U8_NEXT_OR_FFFD(s, i, limit, c);
+                length+=U16_LENGTH(c);
            }
            iter->length=length;
        }
@ -787,8 +775,8 @@ utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin)
            --delta;
        }
        while(delta>0 && i<limit) {
-            U8_NEXT(s, i, limit, c);
-            if(c<0xffff) {
+            U8_NEXT_OR_FFFD(s, i, limit, c);
+            if(c<=0xffff) {
                ++pos;
                --delta;
            } else if(delta>=2) {
@ -817,8 +805,8 @@ utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin)
            ++delta;
        }
        while(delta<0 && i>0) {
-            U8_PREV(s, 0, i, c);
-            if(c<0xffff) {
+            U8_PREV_OR_FFFD(s, 0, i, c);
+            if(c<=0xffff) {
                --pos;
                ++delta;
            } else if(delta<=-2) {
@ -867,10 +855,8 @@ utf8IteratorCurrent(UCharIterator *iter) {
        UChar32 c;
        int32_t i=iter->start;

-        U8_NEXT(s, i, iter->limit, c);
-        if(c<0) {
-            return 0xfffd;
-        } else if(c<=0xffff) {
+        U8_NEXT_OR_FFFD(s, i, iter->limit, c);
+        if(c<=0xffff) {
            return c;
        } else {
            return U16_LEAD(c);
@ -895,7 +881,7 @@ utf8IteratorNext(UCharIterator *iter) {
        const uint8_t *s=(const uint8_t *)iter->context;
        UChar32 c;

-        U8_NEXT(s, iter->start, iter->limit, c);
+        U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
        if((index=iter->index)>=0) {
            iter->index=++index;
            if(iter->length<0 && iter->start==iter->limit) {
@ -904,9 +890,7 @@ utf8IteratorNext(UCharIterator *iter) {
        } else if(iter->start==iter->limit && iter->length>=0) {
            iter->index= c<=0xffff ? iter->length : iter->length-1;
        }
-        if(c<0) {
-            return 0xfffd;
-        } else if(c<=0xffff) {
+        if(c<=0xffff) {
            return c;
        } else {
            iter->reservedField=c;
@ -933,15 +917,13 @@ utf8IteratorPrevious(UCharIterator *iter) {
        const uint8_t *s=(const uint8_t *)iter->context;
        UChar32 c;

-        U8_PREV(s, 0, iter->start, c);
+        U8_PREV_OR_FFFD(s, 0, iter->start, c);
        if((index=iter->index)>0) {
            iter->index=index-1;
        } else if(iter->start<=1) {
            iter->index= c<=0xffff ? iter->start : iter->start+1;
        }
-        if(c<0) {
-            return 0xfffd;
-        } else if(c<=0xffff) {
+        if(c<=0xffff) {
            return c;
        } else {
            iter->start+=4; /* back to behind this supplementary code point for consistent state */
@ -991,7 +973,7 @@ utf8IteratorSetState(UCharIterator *iter,
            } else {
                /* verified index>=4 above */
                UChar32 c;
-                U8_PREV((const uint8_t *)iter->context, 0, index, c);
+                U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
                if(c<=0xffff) {
                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                } else {
--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@ -253,6 +253,37 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
    U8_NEXT(s, _u8_get_index, length, c); \
 }

+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * The offset may point to either the lead byte or one of the trail bytes
+ * for a code point, in which case the macro will read all of the bytes
+ * for the code point.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * If the offset points to an illegal UTF-8 byte sequence, then
+ * c is set to U+FFFD.
+ * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
+ *
+ * This macro does not distinguish between a real U+FFFD in the text
+ * and U+FFFD returned for an ill-formed sequence.
+ * Use U8_GET() if that distinction is important.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset
+ * @param i int32_t string offset, must be start<=i<length
+ * @param length int32_t string length
+ * @param c output UChar32 variable, set to U+FFFD in case of an error
+ * @see U8_GET
+ * @draft ICU 51
+ */
+#define U8_GET_OR_FFFD(s, start, i, length, c) { \
+    int32_t _u8_get_index=(i); \
+    U8_SET_CP_START(s, start, _u8_get_index); \
+    U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
+}
+
 /* definitions with forward iteration --------------------------------------- */

 /**
@ -328,11 +359,60 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
        ) { \
            (c)=(UChar)((((c)&0x1f)<<6)|__t1); \
            ++(i); \
-        } else if(U8_IS_LEAD(c)) { \
+        } else { \
            /* function call for "complicated" and error cases */ \
            (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \
+        } \
+    } \
+}
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * The offset may point to the lead byte of a multi-byte sequence,
+ * in which case the macro will read the whole sequence.
+ * If the offset points to a trail byte or an illegal UTF-8 sequence, then
+ * c is set to U+FFFD.
+ *
+ * This macro does not distinguish between a real U+FFFD in the text
+ * and U+FFFD returned for an ill-formed sequence.
+ * Use U8_NEXT() if that distinction is important.
+ *
+ * @param s const uint8_t * string
+ * @param i int32_t string offset, must be i<length
+ * @param length int32_t string length
+ * @param c output UChar32 variable, set to U+FFFD in case of an error
+ * @see U8_NEXT
+ * @draft ICU 51
+ */
+#define U8_NEXT_OR_FFFD(s, i, length, c) { \
+    (c)=(uint8_t)(s)[(i)++]; \
+    if((c)>=0x80) { \
+        uint8_t __t1, __t2; \
+        if( /* handle U+1000..U+CFFF inline */ \
+            (0xe0<(c) && (c)<=0xec) && \
+            (((i)+1)<(length) || (length)<0) && \
+            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
+            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
+        ) { \
+            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
+            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
+            (i)+=2; \
+        } else if( /* handle U+0080..U+07FF inline */ \
+            ((c)<0xe0 && (c)>=0xc2) && \
+            ((i)!=(length)) && \
+            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
+        ) { \
+            (c)=(UChar)((((c)&0x1f)<<6)|__t1); \
+            ++(i); \
        } else { \
-            (c)=U_SENTINEL; \
+            /* function call for "complicated" and error cases */ \
+            (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \
        } \
    } \
 }
@ -588,11 +668,38 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 #define U8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
-        if((c)<=0xbf) { \
-            (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
-        } else { \
-            (c)=U_SENTINEL; \
-        } \
+        (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
+    } \
+}
+
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a multi-byte sequence, then the macro will read
+ * the whole sequence.
+ * If the offset is behind a lead byte, then that byte alone is ill-formed
+ * and c is set to U+FFFD (the lead byte itself is not returned).
+ * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
+ *
+ * This macro does not distinguish between a real U+FFFD in the text
+ * and U+FFFD returned for an ill-formed sequence.
+ * Use U8_PREV() if that distinction is important.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param i int32_t string offset, must be start<i
+ * @param c output UChar32 variable, set to U+FFFD in case of an error
+ * @see U8_PREV
+ * @draft ICU 51
+ */
+#define U8_PREV_OR_FFFD(s, start, i, c) { \
+    (c)=(uint8_t)(s)[--(i)]; \
+    if((c)>=0x80) { \
+        (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
    } \
 }

--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 1999-2011, International Business Machines
+*   Copyright (C) 1999-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
@ -2234,10 +2234,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp
    UChar32 c;
    int32_t start=0, prev=0;
    do {
-        U8_NEXT(s, start, length, c);
-        if(c<0) {
-            c=0xfffd;
-        }
+        U8_NEXT_OR_FFFD(s, start, length, c);
        if(spanCondition!=contains(c)) {
            break;
        }
@ -2275,10 +2272,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio
    UChar32 c;
    int32_t prev=length;
    do {
-        U8_PREV(s, 0, length, c);
-        if(c<0) {
-            c=0xfffd;
-        }
+        U8_PREV_OR_FFFD(s, 0, length, c);
        if(spanCondition!=contains(c)) {
            break;
        }
--- a/icu4c/source/common/unisetspan.cpp
+++ b/icu4c/source/common/unisetspan.cpp
@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2007-2011, International Business Machines
+*   Copyright (C) 2007-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@ -503,9 +503,9 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    if((int8_t)c>=0) {
        return set.contains(c) ? 1 : -1;
    }
-    // Take advantage of non-ASCII fastpaths in U8_NEXT().
+    // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
    int32_t i=0;
-    U8_NEXT(s, i, length, c);
+    U8_NEXT_OR_FFFD(s, i, length, c);
    return set.contains(c) ? i : -i;
 }

@ -516,7 +516,7 @@ spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
        return set.contains(c) ? 1 : -1;
    }
    int32_t i=length-1;
-    c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
+    c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
    length-=i;
    return set.contains(c) ? length : -length;
 }
--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@ -1217,15 +1217,11 @@ fillForward:
                int32_t  cIx      = srcIx;
                int32_t  dIx      = destIx;
                int32_t  dIxSaved = destIx;
-                U8_NEXT(s8, srcIx, strLen, c);
+                U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
                if (c==0 && nulTerminated) {
                    srcIx--;
                    break;
                }
-                if (c<0) {
-                    // Illegal UTF-8.  Replace with sub character.
-                    c = 0x0fffd;
-                }

                U16_APPEND_UNSAFE(buf, destIx, c);
                do {
@ -1334,15 +1330,11 @@ fillReverse:
                int32_t  sIx      = srcIx;  // ix of last byte of multi-byte u8 char

                // Get the full character from the UTF8 string.
-                //   use code derived from tbe macros in utf.8
+                //   use code derived from the macros in utf8.h
                //   Leaves srcIx pointing at the first byte of the UTF-8 char.
                //
-                if (c<=0xbf) {
-                    c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -1);
-                    // leaves srcIx at first byte of the multi-byte char.
-                } else {
-                    c=0x0fffd;
-                }
+                c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3);
+                // leaves srcIx at first byte of the multi-byte char.

                // Store the character in UTF-16 buffer.
                if (c<0x10000) {
@ -1415,10 +1407,7 @@ utext_strFromUTF8(UChar *dest,
        if(ch <=0x7f){
            *pDest++=(UChar)ch;
        }else{
-            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
-            if(ch<0){
-                ch = 0xfffd;
-            }
+            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
            if(U_IS_BMP(ch)){
                *(pDest++)=(UChar)ch;
            }else{
@ -1438,10 +1427,7 @@ utext_strFromUTF8(UChar *dest,
        if(ch <= 0x7f){
            reqLength++;
        }else{
-            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
-            if(ch<0){
-                ch = 0xfffd;
-            }
+            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
            reqLength+=U16_LENGTH(ch);
        }
    }
--- a/icu4c/source/common/utf_impl.c
+++ b/icu4c/source/common/utf_impl.c
@ -86,15 +86,31 @@ utf8_errorValue[6]={
    0x3ffffff, 0x7fffffff
 };

+static UChar32
+errorValue(int32_t count, int8_t strict) {
+    if(strict>=0) {
+        return utf8_errorValue[count];
+    } else if(strict==-3) {
+        return 0xfffd;
+    } else {
+        return U_SENTINEL;
+    }
+}
+
 /*
- * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling
- * UTF8_NEXT_CHAR_SAFE().
+ * Handle the non-inline part of the U8_NEXT() and U8_NEXT_OR_FFFD() macros
+ * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
 *
 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
 *
 * The "strict" parameter controls the error behavior:
- * <0  "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative
- *     code point result.
+ * <0  "Safe" behavior of U8_NEXT():
+ *     -1: All illegal byte sequences yield U_SENTINEL=-1.
+ *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
+ *         Some implementations use this for roundtripping of
+ *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
+ *         contain unpaired surrogates.
+ *     -3: All illegal byte sequences yield U+FFFD.
 *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
 *     All illegal byte sequences yield a positive code point such that this
 *     result code point would be encoded with the same number of bytes as
@ -103,11 +119,6 @@ utf8_errorValue[6]={
 *     Same as the obsolete "safe" behavior, but non-characters are also treated
 *     like illegal sequences.
 *
- * The special negative (<0) value -2 is used for lenient treatment of surrogate
- * code points as legal. Some implementations use this for roundtripping of
- * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
- * contain unpaired surrogates.
- *
 * Note that a UBool is the same as an int8_t.
 */
 U_CAPI UChar32 U_EXPORT2
@ -165,11 +176,7 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
        ++i;
        --count;
    }
-    if(strict>=0) {
-        c=utf8_errorValue[i-*pi];
-    } else {
-        c=U_SENTINEL;
-    }
+    c=errorValue(i-*pi, strict);
    *pi=i;
    return c;
 }
@ -224,18 +231,15 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
    int32_t i=*pi;
    uint8_t b, count=1, shift=6;

+    if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
+
    /* extract value bits from the last trail byte */
    c&=0x3f;

    for(;;) {
        if(i<=start) {
            /* no lead byte at all */
-            if(strict>=0) {
-                return UTF8_ERROR_VALUE_1;
-            } else {
-                return U_SENTINEL;
-            }
-            /*break;*/
+            return errorValue(0, strict);
        }

        /* read another previous byte */
@ -255,11 +259,7 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
                        if(count>=4) {
                            count=3;
                        }
-                        if(strict>=0) {
-                            c=utf8_errorValue[count];
-                        } else {
-                            c=U_SENTINEL;
-                        }
+                        c=errorValue(count, strict);
                    } else {
                        /* exit with correct c */
                    }
@ -269,17 +269,9 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
                       include the trail byte that we started with */
                    if(count<shouldCount) {
                        *pi=i;
-                        if(strict>=0) {
-                            c=utf8_errorValue[count];
-                        } else {
-                            c=U_SENTINEL;
-                        }
+                        c=errorValue(count, strict);
                    } else {
-                        if(strict>=0) {
-                            c=UTF8_ERROR_VALUE_1;
-                        } else {
-                            c=U_SENTINEL;
-                        }
+                        c=errorValue(0, strict);
                    }
                }
                break;
@ -290,20 +282,12 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
                shift+=6;
            } else {
                /* more than 5 trail bytes is illegal */
-                if(strict>=0) {
-                    c=UTF8_ERROR_VALUE_1;
-                } else {
-                    c=U_SENTINEL;
-                }
+                c=errorValue(0, strict);
                break;
            }
        } else {
            /* single-byte character precedes trailing bytes */
-            if(strict>=0) {
-                c=UTF8_ERROR_VALUE_1;
-            } else {
-                c=U_SENTINEL;
-            }
+            c=errorValue(0, strict);
            break;
        }
    }
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -8038,163 +8038,9 @@ endOfSecLoop:
 }

 /*
-  Slightly modified version of U8_NEXT macro defined in utf8.h. U8_NEXT requires
-  the length of UTF-8 string. This version assumes that the UTF-8 string is null
-  terminated and does not require the length as input.
-
  Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
  null terminated input string takes extra amount of CPU cycles.
 */
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
-#define UTF8_ERROR_VALUE_1 0x15
-#define UTF8_ERROR_VALUE_2 0x9f
-#define UTF_ERROR_VALUE 0xffff
-
-static const UChar32
-utf8_errorValue[6]={
-    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
-    0x3ffffff, 0x7fffffff
-};
-
-static
-UChar32 utf8_nextCharSafeBodyNullTerm(const uint8_t *s, int32_t *pi, UChar32 c, UBool strict) {
-    int32_t i=*pi;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
-
-    if (c) {
-        uint8_t trail, illegal=0;
-
-        U8_MASK_LEAD_BYTE((c), count);
-        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-        switch(count) {
-        /* each branch falls through to the next one */
-        case 5:
-        case 4:
-            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-            illegal=1;
-            break;
-        case 3:
-            trail=s[(i)];
-            if (trail==0) {
-                illegal=1;
-                break;
-            }
-            (c)=((c)<<6)|(trail&0x3f);
-            if(c<0x110) {
-                illegal|=(trail&0xc0)^0x80;
-            } else {
-                /* code point>0x10ffff, outside Unicode */
-                illegal=1;
-                break;
-            }
-            ++(i);
-        case 2:
-            trail=s[(i)];
-            if (trail==0) {
-                illegal=1;
-                break;
-            }
-            (c)=((c)<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-            ++(i);
-        case 1:
-            trail=s[(i)];
-            if (trail==0) {
-                illegal=1;
-                break;
-            }
-            (c)=((c)<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-            ++(i);
-            break;
-        case 0:
-            if(strict>=0) {
-                return UTF8_ERROR_VALUE_1;
-            } else {
-                return U_SENTINEL;
-            }
-        /* no default branch to optimize switch()  - all values are covered */
-        }
-
-        /*
-         * All the error handling should return a value
-         * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
-         *
-         * Starting with Unicode 3.0.1, non-shortest forms are illegal.
-         * Starting with Unicode 3.2, surrogate code points must not be
-         * encoded in UTF-8, and there are no irregular sequences any more.
-         *
-         * U8_ macros (new in ICU 2.4) return negative values for error conditions.
-         */
-
-        /* correct sequence - all trail bytes have (b7..b6)==(10)? */
-        /* illegal is also set if count>=4 */
-        if(illegal || (c)<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2)) {
-            /* error handling */
-            uint8_t errorCount=count;
-            /* don't go beyond this sequence */
-            i=*pi;
-            while(count>0 && U8_IS_TRAIL(s[i])) {
-                ++(i);
-                --count;
-            }
-            if(strict>=0) {
-                c=utf8_errorValue[errorCount-count];
-            } else {
-                c=U_SENTINEL;
-            }
-        } else if((strict)>0 && U_IS_UNICODE_NONCHAR(c)) {
-            /* strict: forbid non-characters like U+fffe */
-            c=utf8_errorValue[count];
-        }
-    }
-    *pi=i;
-    return c;
-}
-
-#define U8_NEXT_NULLTERM(s, i, c) { \
-    (c)=(uint8_t)(s)[(i)]; \
-    if((c)>=0x80) { \
-        uint8_t __t1, __t2; \
-        if( /* handle U+1000..U+CFFF inline */ \
-            (0xe0<(c) && (c)<=0xec) && \
-            (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 && \
-            (__t2=(uint8_t)((s)[(i)+2]-0x80))<= 0x3f && __t2 != 0 \
-        ) { \
-            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
-            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
-            (i)+=3; \
-        } else if( /* handle U+0080..U+07FF inline */ \
-            ((c)<0xe0 && (c)>=0xc2) && \
-            (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 \
-        ) { \
-            (c)=(UChar)((((c)&0x1f)<<6)|__t1); \
-            (i)+=2; \
-        } else if(U8_IS_LEAD(c)) { \
-            /* function call for "complicated" and error cases */ \
-            ++(i); \
-            (c)=utf8_nextCharSafeBodyNullTerm((const uint8_t *)s, &(i), c, -1); \
-        } else { \
-            (c)=U_SENTINEL; \
-            ++(i); \
-        } \
-    } else { \
-        if ((c)) { \
-            ++(i); \
-        } \
-    } \
-}
-
-#define U8_GET_NULLTERM(s, start, i, c) { \
-    int32_t _u8_get_index=(int32_t)(i); \
-    U8_SET_CP_START(s, start, _u8_get_index); \
-    U8_NEXT_NULLTERM(s, _u8_get_index, c); \
-}
-
-
 static UCollationResult
 ucol_strcollRegularUTF8(
                    const UCollator *coll,
@ -8253,19 +8099,12 @@ ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
    UChar32 schar = 0, tchar = 0;

    for(;;) {
-        if (len == -1) {
-            U8_GET_NULLTERM((const uint8_t*)s, 0, *index, schar);
-            if (schar == 0) {
-                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
-            }
-        } else {
-            if (*index == len) {
-                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
-            }
-            U8_GET((const uint8_t*)s, 0, *index, len, schar);
+        if (*index == len) {
+            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
        }
-        if (schar == -1) {
-            schar = 0xfffd;
+        U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar);
+        if (len < 0 && schar == 0) {
+            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
        }

        while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
@ -8320,22 +8159,15 @@ ucol_strcollUseLatin1UTF8(
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
-            if (sLen==-1) {
-                U8_NEXT_NULLTERM(source, sIndex, sChar);
-                if (sChar == 0) {
-                    endOfSource = TRUE;
-                    sLen = sIndex;
-                    break;
-                }
-            } else {
-                if (sIndex == sLen) {
-                    endOfSource = TRUE;
-                    break;
-                }
-                U8_NEXT(source, sIndex, sLen ,sChar);
+            if (sIndex == sLen) {
+                endOfSource = TRUE;
+                break;
            }
-            if (sChar == -1) {
-                sChar = 0xfffd; // fallback for the bad code
+            U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
+            if (sLen < 0 && sChar == 0) {
+                endOfSource = TRUE;
+                sLen = sIndex;
+                break;
            }
            if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
@ -8360,28 +8192,21 @@ ucol_strcollUseLatin1UTF8(

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
-            if (tLen == -1) {
-                U8_NEXT_NULLTERM(target, tIndex, tChar);
-                if (tChar == 0) {
-                    if(endOfSource) {
-                        tLen = tIndex;
-                        goto endOfPrimLoopU8;
-                    } else {
-                        return UCOL_GREATER;
-                    }
+            if (tIndex == tLen) {
+                if(endOfSource) {
+                    goto endOfPrimLoopU8;
+                } else {
+                    return UCOL_GREATER;
                }
-            } else {
-                if (tIndex == tLen) {
-                    if(endOfSource) {
-                        goto endOfPrimLoopU8;
-                    } else {
-                        return UCOL_GREATER;
-                    }
-                }
-                U8_NEXT(target, tIndex, tLen, tChar);
            }
-            if (tChar == -1) {
-                tChar = 0xfffd;
+            U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
+            if (tLen < 0 && tChar == 0) {
+                if(endOfSource) {
+                    tLen = tIndex;
+                    goto endOfPrimLoopU8;
+                } else {
+                    return UCOL_GREATER;
+                }
            }
            if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
@ -8448,7 +8273,7 @@ endOfPrimLoopU8:
                        break;
                    }
                    U_ASSERT(sLen >= 0);
-                    U8_NEXT(source, sIndex, sLen, sChar);
+                    U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
@ -8465,7 +8290,7 @@ endOfPrimLoopU8:
                        }
                    }
                    U_ASSERT(tLen >= 0);
-                    U8_NEXT(target, tIndex, tLen, tChar);
+                    U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
@ -8505,7 +8330,7 @@ endOfPrimLoopU8:
                        endOfSource = TRUE;
                        break;
                    }
-                    U8_PREV(source, 0, sIndex, sChar);
+                    U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    // don't even look for contractions
@ -8519,7 +8344,7 @@ endOfPrimLoopU8:
                            return UCOL_GREATER;
                        }
                    }
-                    U8_PREV(target, 0, tIndex, tChar);
+                    U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    // don't even look for contractions
@ -8560,7 +8385,7 @@ endOfSecLoopU8:
                    break;
                }
                U_ASSERT(sLen >= 0);
-                U8_NEXT(source, sIndex, sLen, sChar);
+                U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
@ -8576,7 +8401,7 @@ endOfSecLoopU8:
                    }
                }
                U_ASSERT(tLen >= 0);
-                U8_NEXT(target, tIndex, tLen, tChar);
+                U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
@ -8963,36 +8788,18 @@ ucol_strcollUTF8(
        UChar32 uc32 = -1;

        if (!bSrcLimit) {
-            if (sourceLength >= 0) {
-                U8_GET((uint8_t*)source, 0, equalLength, sourceLength, uc32);
-            } else {
-                U8_GET_NULLTERM((uint8_t*)source, 0, equalLength, uc32);
-            }
-            if (uc32 == -1) {
-                uc32 = 0xfffd;
-                bSawNonLatin1 |= TRUE;
-            } else {
-                if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
-                    bUnsafeCP = TRUE;
-                }
-                bSawNonLatin1 |= (uc32 > 0xff);
+            U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);
+            if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
+                bUnsafeCP = TRUE;
            }
+            bSawNonLatin1 |= (uc32 > 0xff);
        }
        if (!bTargLimit) {
-            if (targetLength >= 0) {
-                U8_GET((uint8_t*)target, 0, equalLength, targetLength, uc32);
-            } else {
-                U8_GET_NULLTERM((uint8_t*)target, 0, equalLength, uc32);
-            }
-            if (uc32 == -1) {
-                uc32 = 0xfffd;
-                bSawNonLatin1 |= TRUE;
-            } else {
-                if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
-                    bUnsafeCP = TRUE;
-                }
-                bSawNonLatin1 |= (uc32 > 0xff);
+            U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);
+            if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
+                bUnsafeCP = TRUE;
            }
+            bSawNonLatin1 |= (uc32 > 0xff);
        }

        if (bUnsafeCP) {
@ -9000,7 +8807,7 @@ ucol_strcollUTF8(
                // We are stopped in the middle of a contraction.
                // Scan backwards through the == part of the string looking for the start of the contraction.
                //   It doesn't matter which string we scan, since they are the same in this region.
-                U8_PREV((uint8_t*)source, 0, equalLength, uc32);
+                U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);
                bSawNonLatin1 |= (uc32 > 0xff);
                if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
                    break;
--- a/icu4c/source/test/cintltst/utf8tst.c
+++ b/icu4c/source/test/cintltst/utf8tst.c
@ -195,7 +195,7 @@ static void TestGetChar()
        0x240,            UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
    };
    uint16_t i=0;
-    UChar32 c;
+    UChar32 c, expected;
    uint32_t offset=0;

    for(offset=0; offset<sizeof(input); offset++) {
@ -213,14 +213,22 @@ static void TestGetChar()
            }
        }

-        U8_GET(input, 0, offset, sizeof(input), c);
-        if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
-            log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
+        UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
+        expected=result[i+1];
+        if(c != expected){
+            log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
        }

-        UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
-        if(c != result[i+1]){
-            log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
+        U8_GET(input, 0, offset, sizeof(input), c);
+        if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
+        if(c != expected){
+            log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        }
+
+        U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
+        if(expected<0) { expected=0xfffd; }
+        if(c != expected){
+            log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
        }

        UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
@ -228,7 +236,7 @@ static void TestGetChar()
            log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
        }

-         i=(uint16_t)(i+3);
+        i=(uint16_t)(i+3);
    }
 }

@ -274,7 +282,7 @@ static void TestNextPrevChar() {
    };
    /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */

-    UChar32 c=0x0000;
+    UChar32 c, expected;
    uint32_t i=0;
    uint32_t offset=0;
    int32_t setOffset=0;
@ -285,9 +293,10 @@ static void TestNextPrevChar() {
             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         }
-         if(c != result[i+1]){
-             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
-         }
+        expected=result[i+1];
+        if(c != expected){
+            log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        }

         setOffset=offset;
         U8_NEXT(input, setOffset, sizeof(input), c);
@ -295,9 +304,21 @@ static void TestNextPrevChar() {
             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         }
-         if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
-             log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
-         }
+        if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
+        if(c != expected){
+            log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        }
+
+        setOffset=offset;
+        U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
+        if(setOffset != movedOffset[i+1]){
+            log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[i+1], setOffset);
+        }
+        if(expected<0) { expected=0xfffd; }
+        if(c != expected){
+            log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        }

         setOffset=offset;
         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
@ -320,9 +341,10 @@ static void TestNextPrevChar() {
             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         }
-         if(c != result[i+4]){
-             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
-         }
+        expected=result[i+4];
+        if(c != expected){
+            log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        }

         setOffset=offset;
         U8_PREV(input, 0, setOffset, c);
@ -330,9 +352,21 @@ static void TestNextPrevChar() {
             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         }
-         if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
-             log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
-         }
+        if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
+        if(c != expected){
+            log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        }
+
+        setOffset=offset;
+        U8_PREV_OR_FFFD(input, 0, setOffset, c);
+        if(setOffset != movedOffset[i+4]){
+            log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[i+4], setOffset);
+        }
+        if(expected<0) { expected=0xfffd; }
+        if(c != expected){
+            log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        }

         setOffset=offset;
         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
@ -378,14 +412,24 @@ static void TestNulTerminated() {
        0
    };

-    UChar32 c, c2;
+    UChar32 c, c2, expected;
    int32_t i0, i=0, j, k, expectedIndex;
    int32_t cpIndex=0;
    do {
        i0=i;
        U8_NEXT(input, i, -1, c);
-        if(c!=result[cpIndex]) {
-            log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, result[cpIndex]);
+        expected=result[cpIndex];
+        if(c!=expected) {
+            log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
+        }
+        j=i0;
+        U8_NEXT_OR_FFFD(input, j, -1, c);
+        if(expected<0) { expected=0xfffd; }
+        if(c!=expected) {
+            log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
+        }
+        if(j!=i) {
+            log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
        }
        j=i0;
        U8_FWD_1(input, j, -1);
@ -414,6 +458,11 @@ static void TestNulTerminated() {
            if(c2!=c) {
                log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
            }
+            U8_GET_OR_FFFD(input, 0, j, -1, c2);
+            expected= (c>=0) ? c : 0xfffd;
+            if(c2!=expected) {
+                log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
+            }
            /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
            k=j+1;
            U8_SET_CP_LIMIT(input, 0, k, -1);
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@ -2626,10 +2626,7 @@ static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s,
        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
-            U8_NEXT(s, start, length, c);
-            if(c<0) {
-                c=0xfffd;
-            }
+            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
@ -2640,10 +2637,7 @@ static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s,
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
-            U8_NEXT(s, next, length, c);
-            if(c<0) {
-                c=0xfffd;
-            }
+            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
@ -2664,10 +2658,7 @@ static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s,
        UChar32 c;
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
-            U8_NEXT(s, next, length, c);
-            if(c<0) {
-                c=0xfffd;
-            }
+            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
@ -2738,10 +2729,7 @@ static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char
        UChar32 c;
        int32_t prev=length;
        do {
-            U8_PREV(s, 0, length, c);
-            if(c<0) {
-                c=0xfffd;
-            }
+            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
@ -2752,10 +2740,7 @@ static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char
        UChar32 c;
        int32_t prev=length;
        do {
-            U8_PREV(s, 0, length, c);
-            if(c<0) {
-                c=0xfffd;
-            }
+            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
@ -2775,10 +2760,7 @@ static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char
        UChar32 c;
        int32_t prev=length, minSpanStart=length;
        do {
-            U8_PREV(s, 0, length, c);
-            if(c<0) {
-                c=0xfffd;
-            }
+            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }