ICU-13311 change illegal-UTF-8 handling in non-converter code

X-SVN-Rev: 40445
2017-09-21 23:45:08 +00:00 · 2017-09-21 23:45:08 +00:00 · 27c08578ac
commit 27c08578ac
parent 119d75dc46
19 changed files with 913 additions and 1057 deletions
--- a/icu4c/source/common/bmpset.cpp
+++ b/icu4c/source/common/bmpset.cpp
@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN

 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
        list(parentList), listLength(parentListLength) {
-    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
+    uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
    uprv_memset(table7FF, 0, sizeof(table7FF));
    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));

@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
        list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
    }
    list4kStarts[0x11]=listLength-1;
+    containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);

    initBits();
    overrideIllegal();
 }

 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
+        containsFFFD(otherBMPSet.containsFFFD),
        list(newParentList), listLength(newParentListLength) {
-    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
+    uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
@ -120,7 +122,7 @@ void BMPSet::initBits() {
    UChar32 start, limit;
    int32_t listIndex=0;

-    // Set asciiBytes[].
+    // Set latin1Contains[].
    do {
        start=list[listIndex++];
        if(listIndex<listLength) {
@ -128,13 +130,30 @@ void BMPSet::initBits() {
        } else {
            limit=0x110000;
        }
-        if(start>=0x80) {
+        if(start>=0x100) {
            break;
        }
        do {
-            asciiBytes[start++]=1;
-        } while(start<limit && start<0x80);
-    } while(limit<=0x80);
+            latin1Contains[start++]=1;
+        } while(start<limit && start<0x100);
+    } while(limit<=0x100);
+
+    // Find the first range overlapping with (or after) 80..FF again,
+    // to include them in table7FF as well.
+    for(listIndex=0;;) {
+        start=list[listIndex++];
+        if(listIndex<listLength) {
+            limit=list[listIndex++];
+        } else {
+            limit=0x110000;
+        }
+        if(limit>0x80) {
+            if(start<0x80) {
+                start=0x80;
+            }
+            break;
+        }
+    }

    // Set table7FF[].
    while(start<0x800) {
@ -204,19 +223,14 @@ void BMPSet::initBits() {
 * for faster validity checking at runtime.
 * No need to set 0 values where they were reset to 0 in the constructor
 * and not modified by initBits().
- * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
+ * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
 * Need to set 0 values for surrogates D800..DFFF.
 */
 void BMPSet::overrideIllegal() {
    uint32_t bits, mask;
    int32_t i;

-    if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
-        // contains(FFFD)==TRUE
-        for(i=0x80; i<0xc0; ++i) {
-            asciiBytes[i]=1;
-        }
-
+    if(containsFFFD) {
        bits=3;                 // Lead bytes 0xC0 and 0xC1.
        for(i=0; i<64; ++i) {
            table7FF[i]|=bits;
@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
            bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
        }
    } else {
-        // contains(FFFD)==FALSE
        mask=~(0x10001<<0xd);   // Lead byte 0xED.
        for(i=32; i<64; ++i) {  // Second half of 4k block.
            bmpBlockBits[i]&=mask;
@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {

 UBool
 BMPSet::contains(UChar32 c) const {
-    if((uint32_t)c<=0x7f) {
-        return (UBool)asciiBytes[c];
+    if((uint32_t)c<=0xff) {
+        return (UBool)latin1Contains[c];
    } else if((uint32_t)c<=0x7ff) {
        return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
    } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
        // span
        do {
            c=*s;
-            if(c<=0x7f) {
-                if(!asciiBytes[c]) {
+            if(c<=0xff) {
+                if(!latin1Contains[c]) {
                    break;
                }
            } else if(c<=0x7ff) {
@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
        // span not
        do {
            c=*s;
-            if(c<=0x7f) {
-                if(asciiBytes[c]) {
+            if(c<=0xff) {
+                if(latin1Contains[c]) {
                    break;
                }
            } else if(c<=0x7ff) {
@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
        // span
        for(;;) {
            c=*(--limit);
-            if(c<=0x7f) {
-                if(!asciiBytes[c]) {
+            if(c<=0xff) {
+                if(!latin1Contains[c]) {
                    break;
                }
            } else if(c<=0x7ff) {
@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
        // span not
        for(;;) {
            c=*(--limit);
-            if(c<=0x7f) {
-                if(asciiBytes[c]) {
+            if(c<=0xff) {
+                if(latin1Contains[c]) {
                    break;
                }
            } else if(c<=0x7ff) {
@ -497,22 +510,22 @@ const uint8_t *
 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    const uint8_t *limit=s+length;
    uint8_t b=*s;
-    if((int8_t)b>=0) {
+    if(U8_IS_SINGLE(b)) {
        // Initial all-ASCII span.
        if(spanCondition) {
            do {
-                if(!asciiBytes[b] || ++s==limit) {
+                if(!latin1Contains[b] || ++s==limit) {
                    return s;
                }
                b=*s;
-            } while((int8_t)b>=0);
+            } while(U8_IS_SINGLE(b));
        } else {
            do {
-                if(asciiBytes[b] || ++s==limit) {
+                if(latin1Contains[b] || ++s==limit) {
                    return s;
                }
                b=*s;
-            } while((int8_t)b>=0);
+            } while(U8_IS_SINGLE(b));
        }
        length=(int32_t)(limit-s);
    }
@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
            // single trail byte, check for preceding 3- or 4-byte lead byte
            if(length>=2 && (b=*(limit-2))>=0xe0) {
                limit-=2;
-                if(asciiBytes[0x80]!=spanCondition) {
+                if(containsFFFD!=spanCondition) {
                    limit0=limit;
                }
            } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
                // 4-byte lead byte with only two trail bytes
                limit-=3;
-                if(asciiBytes[0x80]!=spanCondition) {
+                if(containsFFFD!=spanCondition) {
                    limit0=limit;
                }
            }
        } else {
            // lead byte with no trail bytes
            --limit;
-            if(asciiBytes[0x80]!=spanCondition) {
+            if(containsFFFD!=spanCondition) {
                limit0=limit;
            }
        }
@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi

    while(s<limit) {
        b=*s;
-        if(b<0xc0) {
-            // ASCII; or trail bytes with the result of contains(FFFD).
+        if(U8_IS_SINGLE(b)) {
+            // ASCII
            if(spanCondition) {
                do {
-                    if(!asciiBytes[b]) {
+                    if(!latin1Contains[b]) {
                        return s;
                    } else if(++s==limit) {
                        return limit0;
                    }
                    b=*s;
-                } while(b<0xc0);
+                } while(U8_IS_SINGLE(b));
            } else {
                do {
-                    if(asciiBytes[b]) {
+                    if(latin1Contains[b]) {
                        return s;
                    } else if(++s==limit) {
                        return limit0;
                    }
                    b=*s;
-                } while(b<0xc0);
+                } while(U8_IS_SINGLE(b));
            }
        }
        ++s;  // Advance past the lead byte.
@ -619,7 +632,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
                UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
                if( (   (0x10000<=c && c<=0x10ffff) ?
                            containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
-                            asciiBytes[0x80]
+                            containsFFFD
                    ) != spanCondition
                ) {
                    return s-1;
@ -627,8 +640,9 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
                s+=3;
                continue;
            }
-        } else /* 0xc0<=b<0xe0 */ {
+        } else {
            if( /* handle U+0000..U+07FF inline */
+                b>=0xc0 &&
                (t1=(uint8_t)(*s-0x80)) <= 0x3f
            ) {
                if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
        // Give an illegal sequence the same value as the result of contains(FFFD).
        // Handle each byte of an illegal sequence separately to simplify the code;
        // no need to optimize error handling.
-        if(asciiBytes[0x80]!=spanCondition) {
+        if(containsFFFD!=spanCondition) {
            return s-1;
        }
    }
@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon

    do {
        b=s[--length];
-        if((int8_t)b>=0) {
+        if(U8_IS_SINGLE(b)) {
            // ASCII sub-span
            if(spanCondition) {
                do {
-                    if(!asciiBytes[b]) {
+                    if(!latin1Contains[b]) {
                        return length+1;
                    } else if(length==0) {
                        return 0;
                    }
                    b=s[--length];
-                } while((int8_t)b>=0);
+                } while(U8_IS_SINGLE(b));
            } else {
                do {
-                    if(asciiBytes[b]) {
+                    if(latin1Contains[b]) {
                        return length+1;
                    } else if(length==0) {
                        return 0;
                    }
                    b=s[--length];
-                } while((int8_t)b>=0);
+                } while(U8_IS_SINGLE(b));
            }
        }

--- a/icu4c/source/common/bmpset.h
+++ b/icu4c/source/common/bmpset.h
@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN
 * Helper class for frozen UnicodeSets, implements contains() and span()
 * optimized for BMP code points. Structured to be UTF-8-friendly.
 *
- * ASCII: Look up bytes.
+ * Latin-1: Look up bytes.
 * 2-byte characters: Bits organized vertically.
 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
 *                    with mixed for illegal ranges.
- * Supplementary characters: Call contains() on the parent set.
+ * Supplementary characters: Binary search over
+ * the supplementary part of the parent set's inversion list.
 */
 class BMPSet : public UMemory {
 public:
@ -96,12 +97,12 @@ private:
    inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;

    /*
-     * One byte per ASCII character, or trail byte in lead position.
-     * 0 or 1 for ASCII characters.
-     * The value for trail bytes is the result of contains(FFFD)
-     * for faster validity checking at runtime.
+     * One byte 0 or 1 per Latin-1 character.
     */
-    UBool asciiBytes[0xc0];
+    UBool latin1Contains[0x100];
+
+    /* TRUE if contains(U+FFFD). */
+    UBool containsFFFD;

    /*
     * One bit per code point from U+0000..U+07FF.
--- a/icu4c/source/common/unicode/utf.h
+++ b/icu4c/source/common/unicode/utf.h
@ -23,9 +23,6 @@
 * This file defines macros for checking whether a code point is
 * a surrogate or a non-character etc.
 *
- * The UChar and UChar32 data types for Unicode code units and code points
- * are defined in umachine.h because they can be machine-dependent.
- *
 * If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
 * and itself includes utf8.h and utf16.h after some
 * common definitions.
@ -50,11 +47,11 @@
 * but are optimized for the much more frequently occurring BMP code points.
 *
 * umachine.h defines UChar to be an unsigned 16-bit integer.
- * Where available, UChar is defined to be a char16_t
- * or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t.
+ * Since ICU 59, ICU uses char16_t in C++, UChar only in C,
+ * and defines UChar=char16_t by default. See the UChar API docs for details.
 *
 * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
- * Unicode code point (Unicode scalar value, 0..0x10ffff).
+ * Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
 * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
 * the definition of UChar. For details see the documentation for UChar32 itself.
 *
@ -63,11 +60,20 @@
 * For actual Unicode character properties see uchar.h.
 *
 * By default, string operations must be done with error checking in case
- * a string is not well-formed UTF-16.
- * The macros will detect if a surrogate code unit is unpaired
+ * a string is not well-formed UTF-16 or UTF-8.
+ *
+ * The U16_ macros detect if a surrogate code unit is unpaired
 * (lead unit without trail unit or vice versa) and just return the unit itself
 * as the code point.
 *
+ * The U8_ macros detect illegal byte sequences and return a negative value.
+ * Starting with ICU 60, the observable length of a single illegal byte sequence
+ * skipped by one of these macros follows the Unicode 6+ recommendation
+ * which is consistent with the W3C Encoding Standard.
+ *
+ * There are ..._OR_FFFD versions of both U16_ and U8_ macros
+ * that return U+FFFD for illegal code unit sequences.
+ *
 * The regular "safe" macros require that the initial, passed-in string index
 * is within bounds. They only check the index when they read more than one
 * code unit. This is usually done with code similar to the following loop:
@ -91,10 +97,7 @@
 * The performance differences are much larger here because UTF-8 provides so
 * many opportunities for malformed sequences.
 * The unsafe UTF-8 macros are entirely implemented inside the macro definitions
- * and are fast, while the safe UTF-8 macros call functions for all but the
- * trivial (ASCII) cases.
- * (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common
- * characters inline as well.)
+ * and are fast, while the safe UTF-8 macros call functions for some complicated cases.
 *
 * Unlike with UTF-16, malformed sequences cannot be expressed with distinct
 * code point values (0..U+10ffff). They are indicated with negative values instead.
@ -126,8 +129,7 @@
 */
 #define U_IS_UNICODE_NONCHAR(c) \
    ((c)>=0xfdd0 && \
-     ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
-     (uint32_t)(c)<=0x10ffff)
+     ((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff)

 /**
 * Is c a Unicode code point value (0..U+10ffff)
@ -148,9 +150,7 @@
 */
 #define U_IS_UNICODE_CHAR(c) \
    ((uint32_t)(c)<0xd800 || \
-        ((uint32_t)(c)>0xdfff && \
-         (uint32_t)(c)<=0x10ffff && \
-         !U_IS_UNICODE_NONCHAR(c)))
+        (0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))

 /**
 * Is this code point a BMP code point (U+0000..U+ffff)?
--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@ -41,34 +41,24 @@

 /* internal definitions ----------------------------------------------------- */

-
-
 /**
 * Counts the trail bytes for a UTF-8 lead byte.
- * Returns 0 for 0..0xbf as well as for 0xfe and 0xff.
+ * Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
+ * leadByte might be evaluated multiple times.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
- * Note: Beginning with ICU 50, the implementation uses a multi-condition expression
- * which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
- * leadByte is evaluated multiple times.
- *
- * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
- * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
- * leadByte was evaluated exactly once.
- *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
 #define U8_COUNT_TRAIL_BYTES(leadByte) \
-    ((uint8_t)(leadByte)<0xf0 ? \
-        ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \
-        (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0)
+    ((uint8_t)(leadByte)<=0xf4 ? \
+        ((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0) : 0)

 /**
 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
- * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
+ * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * This is internal since it is not meant to be called directly by external clients;
@ -78,7 +68,7 @@
 * @internal
 */
 #define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
-    (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0))
+    (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))

 /**
 * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
@ -89,6 +79,34 @@
 */
 #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)

+/**
+ * Internal bit vector for 3-byte UTF-8 validity check.
+ * Lead byte E0..EF bits 3..0 as byte index,
+ * first trail byte bits 7..5 as bit index into that byte.
+ * @internal
+ */
+#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
+
+/**
+ * Internal 3-byte UTF-8 validity check.
+ * @internal
+ */
+#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
+
+/**
+ * Internal bit vector for 4-byte UTF-8 validity check.
+ * First trail byte bits 7..4 as byte index,
+ * lead byte F0..F4 bits 2..0 as bit index into that byte.
+ * @internal
+ */
+#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
+
+/**
+ * Internal 4-byte UTF-8 validity check.
+ * @internal
+ */
+#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
+
 /**
 * Function for handling "next code point" with error-checking.
 *
@ -153,7 +171,8 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
-#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
+#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
+// 0x32=0xf4-0xc2

 /**
 * Is this code unit (byte) a UTF-8 trail byte?
@ -161,7 +180,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
-#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
+#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)

 /**
 * How many code units (bytes) are used for the UTF-8 encoding
@ -289,7 +308,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 */
 #define U8_NEXT_UNSAFE(s, i, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
        if((c)<0xe0) { \
            (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
        } else if((c)<0xf0) { \
@ -325,22 +344,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 */
 #define U8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
        uint8_t __t1, __t2; \
-        if( /* handle U+1000..U+CFFF inline */ \
-            (0xe0<(c) && (c)<=0xec) && \
-            (((i)+1)<(length) || (length)<0) && \
-            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
-            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
-        ) { \
-            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
-            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
+        if( /* handle U+0800..U+FFFF inline */ \
+                (0xe0<=(c) && (c)<0xf0) && \
+                (((i)+1)<(length) || (length)<0) && \
+                U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
+                (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
+            (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
-            ((c)<0xe0 && (c)>=0xc2) && \
-            ((i)!=(length)) && \
-            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
-        ) { \
+                ((c)<0xe0 && (c)>=0xc2) && \
+                ((i)!=(length)) && \
+                (__t1=(s)[i]-0x80)<=0x3f) { \
            (c)=(((c)&0x1f)<<6)|__t1; \
            ++(i); \
        } else { \
@ -376,22 +392,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 */
 #define U8_NEXT_OR_FFFD(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
        uint8_t __t1, __t2; \
-        if( /* handle U+1000..U+CFFF inline */ \
-            (0xe0<(c) && (c)<=0xec) && \
-            (((i)+1)<(length) || (length)<0) && \
-            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
-            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
-        ) { \
-            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
-            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
+        if( /* handle U+0800..U+FFFF inline */ \
+                (0xe0<=(c) && (c)<0xf0) && \
+                (((i)+1)<(length) || (length)<0) && \
+                U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
+                (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
+            (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
-            ((c)<0xe0 && (c)>=0xc2) && \
-            ((i)!=(length)) && \
-            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
-        ) { \
+                ((c)<0xe0 && (c)>=0xc2) && \
+                ((i)!=(length)) && \
+                (__t1=(s)[i]-0x80)<=0x3f) { \
            (c)=(((c)&0x1f)<<6)|__t1; \
            ++(i); \
        } else { \
@ -476,7 +489,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 * @stable ICU 2.4
 */
 #define U8_FWD_1_UNSAFE(s, i) { \
-    (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
+    (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
 }

 /**
@ -493,15 +506,24 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 * @stable ICU 2.4
 */
 #define U8_FWD_1(s, i, length) { \
-    uint8_t __b=(uint8_t)(s)[(i)++]; \
-    if(U8_IS_LEAD(__b)) { \
-        uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
-        if((i)+__count>(length) && (length)>=0) { \
-            __count=(uint8_t)((length)-(i)); \
-        } \
-        while(__count>0 && U8_IS_TRAIL((s)[i])) { \
-            ++(i); \
-            --__count; \
+    uint8_t __b=(s)[(i)++]; \
+    if(U8_IS_LEAD(__b) && (i)!=(length)) { \
+        uint8_t __t1=(s)[i]; \
+        if((0xe0<=__b && __b<0xf0)) { \
+            if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
+                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
+                ++(i); \
+            } \
+        } else if(__b<0xe0) { \
+            if(U8_IS_TRAIL(__t1)) { \
+                ++(i); \
+            } \
+        } else /* c>=0xf0 */ { \
+            if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
+                    ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
+                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
+                ++(i); \
+            } \
        } \
    } \
 }
@ -615,7 +637,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
-            __b=(uint8_t)(s)[--(i)]; \
+            __b=(s)[--(i)]; \
            if(__b>=0xc0) { \
                U8_MASK_LEAD_BYTE(__b, __count); \
                (c)|=(UChar32)__b<<__shift; \
@ -651,7 +673,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 */
 #define U8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
    } \
 }
@ -682,7 +704,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
 */
 #define U8_PREV_OR_FFFD(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
-    if((c)>=0x80) { \
+    if(!U8_IS_SINGLE(c)) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
    } \
 }
--- a/icu4c/source/common/unisetspan.cpp
+++ b/icu4c/source/common/unisetspan.cpp
@ -502,7 +502,7 @@ spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
 static inline int32_t
 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    UChar32 c=*s;
-    if((int8_t)c>=0) {
+    if(U8_IS_SINGLE(c)) {
        return set.contains(c) ? 1 : -1;
    }
    // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
@ -514,7 +514,7 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
 static inline int32_t
 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    UChar32 c=s[length-1];
-    if((int8_t)c>=0) {
+    if(U8_IS_SINGLE(c)) {
        return set.contains(c) ? 1 : -1;
    }
    int32_t i=length-1;
@ -1006,11 +1006,9 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
                    // Try to match if the increment is not listed already.
                    // Match at code point boundaries. (The UTF-8 strings were converted
                    // from UTF-16 and are guaranteed to be well-formed.)
-                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
-                        !offsets.containsOffset(inc) &&
-                        matches8(s+pos-overlap, s8, length8)
-                        
-                    ) {
+                    if(!U8_IS_TRAIL(s[pos-overlap]) &&
+                            !offsets.containsOffset(inc) &&
+                            matches8(s+pos-overlap, s8, length8)) {
                        if(inc==rest) {
                            return length;  // Reached the end of the string.
                        }
@ -1052,11 +1050,10 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
                    // Try to match if the string is longer or starts earlier.
                    // Match at code point boundaries. (The UTF-8 strings were converted
                    // from UTF-16 and are guaranteed to be well-formed.)
-                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
-                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
-                        matches8(s+pos-overlap, s8, length8)
-                        
-                    ) {
+                    if(!U8_IS_TRAIL(s[pos-overlap]) &&
+                            (overlap>maxOverlap ||
+                                /* redundant overlap==maxOverlap && */ inc>maxInc) &&
+                            matches8(s+pos-overlap, s8, length8)) {
                        maxInc=inc;  // Longest match from earliest start.
                        maxOverlap=overlap;
                        break;
--- a/icu4c/source/common/ustrtrns.cpp
+++ b/icu4c/source/common/ustrtrns.cpp
@ -256,152 +256,6 @@ u_strToUTF32(UChar32 *dest,
            pErrorCode);
 }

-/* for utf8_nextCharSafeBodyTerminated() */
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - checks for NUL termination instead of length
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
-    const uint8_t *s=*ps;
-    uint8_t trail, illegal=0;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    U_ASSERT(count<6);
-    U8_MASK_LEAD_BYTE((c), count);
-    /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-    switch(count) {
-    /* each branch falls through to the next one */
-    case 5:
-    case 4:
-        /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-        illegal=1;
-        break;
-    case 3:
-        trail=(uint8_t)(*s++ - 0x80);
-        c=(c<<6)|trail;
-        if(trail>0x3f || c>=0x110) {
-            /* not a trail byte, or code point>0x10ffff (outside Unicode) */
-            illegal=1;
-            break;
-        }
-        U_FALLTHROUGH;
-    case 2:
-        trail=(uint8_t)(*s++ - 0x80);
-        if(trail>0x3f) {
-            /* not a trail byte */
-            illegal=1;
-            break;
-        }
-        c=(c<<6)|trail;
-        U_FALLTHROUGH;
-    case 1:
-        trail=(uint8_t)(*s++ - 0x80);
-        if(trail>0x3f) {
-            /* not a trail byte */
-            illegal=1;
-        }
-        c=(c<<6)|trail;
-        break;
-    case 0:
-        return U_SENTINEL;
-    /* no default branch to optimize switch()  - all values are covered */
-    }
-
-    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
-    /* illegal is also set if count>=4 */
-    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
-        /* error handling */
-        /* don't go beyond this sequence */
-        s=*ps;
-        while(count>0 && U8_IS_TRAIL(*s)) {
-            ++s;
-            --count;
-        }
-        c=U_SENTINEL;
-    }
-    *ps=s;
-    return c;
-}
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
-    const uint8_t *s=*ps;
-    uint8_t trail, illegal=0;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    if((limit-s)>=count) {
-        U8_MASK_LEAD_BYTE((c), count);
-        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-        switch(count) {
-        /* each branch falls through to the next one */
-        case 5:
-        case 4:
-            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-            illegal=1;
-            break;
-        case 3:
-            trail=*s++;
-            c=(c<<6)|(trail&0x3f);
-            if(c<0x110) {
-                illegal|=(trail&0xc0)^0x80;
-            } else {
-                /* code point>0x10ffff, outside Unicode */
-                illegal=1;
-                break;
-            }
-            U_FALLTHROUGH;
-        case 2:
-            trail=*s++;
-            c=(c<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-            U_FALLTHROUGH;
-        case 1:
-            trail=*s++;
-            c=(c<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-            break;
-        case 0:
-            return U_SENTINEL;
-        /* no default branch to optimize switch()  - all values are covered */
-        }
-    } else {
-        illegal=1; /* too few bytes left */
-    }
-
-    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
-    /* illegal is also set if count>=4 */
-    U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
-    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
-        /* error handling */
-        /* don't go beyond this sequence */
-        s=*ps;
-        while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
-            ++s;
-            --count;
-        }
-        c=U_SENTINEL;
-    }
-    *ps=s;
-    return c;
-}
-
 U_CAPI UChar* U_EXPORT2
 u_strFromUTF8WithSub(UChar *dest,
              int32_t destCapacity,
@ -410,19 +264,10 @@ u_strFromUTF8WithSub(UChar *dest,
              int32_t srcLength,
              UChar32 subchar, int32_t *pNumSubstitutions,
              UErrorCode *pErrorCode){
-    UChar *pDest = dest;
-    UChar *pDestLimit = dest+destCapacity;
-    UChar32 ch;
-    int32_t reqLength = 0;
-    const uint8_t* pSrc = (const uint8_t*) src;
-    uint8_t t1, t2; /* trail bytes */
-    int32_t numSubstitutions;
-
    /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
-        
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
@ -434,7 +279,10 @@ u_strFromUTF8WithSub(UChar *dest,
    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
-    numSubstitutions=0;
+    UChar *pDest = dest;
+    UChar *pDestLimit = dest+destCapacity;
+    int32_t reqLength = 0;
+    int32_t numSubstitutions=0;

    /*
     * Inline processing of UTF-8 byte sequences:
@ -455,95 +303,81 @@ u_strFromUTF8WithSub(UChar *dest,
         * The code explicitly checks for NULs only in the lead byte position.
         * A NUL byte in the trail byte position fails the trail byte range check anyway.
         */
-        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
-            if(ch <= 0x7f){
-                *pDest++=(UChar)ch;
-                ++pSrc;
+        int32_t i;
+        UChar32 c;
+        for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
+            // modified copy of U8_NEXT()
+            ++i;
+            if(U8_IS_SINGLE(c)) {
+                *pDest++=(UChar)c;
            } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
-                    ) {
-                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
-                    ) {
-                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                        pSrc += 2;
-                        continue;
-                    }
-                }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                } else if(ch<=0xFFFF) {
-                    *(pDest++)=(UChar)ch;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0x1f)<<6)|__t1;
+                    ++(i);
                } else {
-                    *(pDest++)=U16_LEAD(ch);
-                    if(pDest<pDestLimit) {
-                        *(pDest++)=U16_TRAIL(ch);
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
+                    } else if(c<=0xFFFF) {
+                        *(pDest++)=(UChar)c;
                    } else {
-                        reqLength++;
-                        break;
+                        *(pDest++)=U16_LEAD(c);
+                        if(pDest<pDestLimit) {
+                            *(pDest++)=U16_TRAIL(c);
+                        } else {
+                            reqLength++;
+                            break;
+                        }
                    }
                }
            }
        }

        /* Pre-flight the rest of the string. */
-        while((ch = *pSrc) != 0) {
-            if(ch <= 0x7f){
+        while((c = (uint8_t)src[i]) != 0) {
+            // modified copy of U8_NEXT()
+            ++i;
+            if(U8_IS_SINGLE(c)) {
                ++reqLength;
-                ++pSrc;
            } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
-                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
-                    ) {
-                        ++reqLength;
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
-                    ) {
-                        ++reqLength;
-                        pSrc += 2;
-                        continue;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    ++reqLength;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    ++reqLength;
+                    ++(i);
+                } else {
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
                    }
+                    reqLength += U16_LENGTH(c);
                }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                }
-                reqLength += U16_LENGTH(ch);
            }
        }
    } else /* srcLength >= 0 */ {
-        const uint8_t *pSrcLimit = pSrc + srcLength;
-        int32_t count;
-
-        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
+        /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+        int32_t i = 0;
+        UChar32 c;
        for(;;) {
            /*
             * Each iteration of the inner loop progresses by at most 3 UTF-8
@ -551,10 +385,10 @@ u_strFromUTF8WithSub(UChar *dest,
             * For supplementary code points (4 & 2), which are rare,
             * there is an additional adjustment.
             */
-            count = (int32_t)(pDestLimit - pDest);
-            srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
-            if(count > srcLength) {
-                count = srcLength; /* min(remaining dest, remaining src/3) */
+            int32_t count = (int32_t)(pDestLimit - pDest);
+            int32_t count2 = (srcLength - i) / 3;
+            if(count > count2) {
+                count = count2; /* min(remaining dest, remaining src/3) */
            }
            if(count < 3) {
                /*
@ -565,147 +399,123 @@ u_strFromUTF8WithSub(UChar *dest,
            }

            do {
-                ch = *pSrc;
-                if(ch <= 0x7f){
-                    *pDest++=(UChar)ch;
-                    ++pSrc;
+                // modified copy of U8_NEXT()
+                c = (uint8_t)src[i++];
+                if(U8_IS_SINGLE(c)) {
+                    *pDest++=(UChar)c;
                } else {
-                    if(ch > 0xe0) {
-                        if( /* handle U+1000..U+CFFF inline */
-                            ch <= 0xec &&
-                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                            (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
-                        ) {
-                            /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-                            *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                            pSrc += 3;
-                            continue;
+                    uint8_t __t1, __t2;
+                    if( /* handle U+0800..U+FFFF inline */
+                            (0xe0<=(c) && (c)<0xf0) &&
+                            ((i)+1)<srcLength &&
+                            U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                            (__t2=src[(i)+1]-0x80)<=0x3f) {
+                        *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+                        i+=2;
+                    } else if( /* handle U+0080..U+07FF inline */
+                            ((c)<0xe0 && (c)>=0xc2) &&
+                            ((i)!=srcLength) &&
+                            (__t1=src[i]-0x80)<=0x3f) {
+                        *pDest++ = (((c)&0x1f)<<6)|__t1;
+                        ++(i);
+                    } else {
+                        if(c >= 0xf0 || subchar > 0xffff) {
+                            // We may read up to four bytes and write up to two UChars,
+                            // which we didn't account for with computing count,
+                            // so we adjust it here.
+                            if(--count == 0) {
+                                --i;  // back out byte c
+                                break;
+                            }
                        }
-                    } else if(ch < 0xe0) {
-                        if( /* handle U+0080..U+07FF inline */
-                            ch >= 0xc2 &&
-                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
-                        ) {
-                            *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                            pSrc += 2;
-                            continue;
-                        }
-                    }

-                    if(ch >= 0xf0 || subchar > 0xffff) {
-                        /*
-                         * We may read up to six bytes and write up to two UChars,
-                         * which we didn't account for with computing count,
-                         * so we adjust it here.
-                         */
-                        if(--count == 0) {
-                            break;
+                        /* function call for "complicated" and error cases */
+                        (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+                        if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                            *pErrorCode = U_INVALID_CHAR_FOUND;
+                            return NULL;
+                        } else if(c<=0xFFFF) {
+                            *(pDest++)=(UChar)c;
+                        } else {
+                            *(pDest++)=U16_LEAD(c);
+                            *(pDest++)=U16_TRAIL(c);
                        }
                    }
-
-                    /* function call for "complicated" and error cases */
-                    ++pSrc; /* continue after the lead byte */
-                    ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
-                    if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
-                        *pErrorCode = U_INVALID_CHAR_FOUND;
-                        return NULL;
-                    }else if(ch<=0xFFFF){
-                        *(pDest++)=(UChar)ch;
-                    }else{
-                        *(pDest++)=U16_LEAD(ch);
-                        *(pDest++)=U16_TRAIL(ch);
-                    }
                }
            } while(--count > 0);
        }

-        while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
-            ch = *pSrc;
-            if(ch <= 0x7f){
-                *pDest++=(UChar)ch;
-                ++pSrc;
+        while(i < srcLength && (pDest < pDestLimit)) {
+            // modified copy of U8_NEXT()
+            c = (uint8_t)src[i++];
+            if(U8_IS_SINGLE(c)) {
+                *pDest++=(UChar)c;
            } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        ((pSrcLimit - pSrc) >= 3) &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
-                    ) {
-                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        ((pSrcLimit - pSrc) >= 2) &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
-                    ) {
-                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                        pSrc += 2;
-                        continue;
-                    }
-                }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                }else if(ch<=0xFFFF){
-                    *(pDest++)=(UChar)ch;
-                }else{
-                    *(pDest++)=U16_LEAD(ch);
-                    if(pDest<pDestLimit){
-                        *(pDest++)=U16_TRAIL(ch);
-                    }else{
-                        reqLength++;
-                        break;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        ((i)+1)<srcLength &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        ((i)!=srcLength) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0x1f)<<6)|__t1;
+                    ++(i);
+                } else {
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
+                    } else if(c<=0xFFFF) {
+                        *(pDest++)=(UChar)c;
+                    } else {
+                        *(pDest++)=U16_LEAD(c);
+                        if(pDest<pDestLimit) {
+                            *(pDest++)=U16_TRAIL(c);
+                        } else {
+                            reqLength++;
+                            break;
+                        }
                    }
                }
            }
        }
-        /* do not fill the dest buffer just count the UChars needed */
-        while(pSrc < pSrcLimit){
-            ch = *pSrc;
-            if(ch <= 0x7f){
-                reqLength++;
-                ++pSrc;
-            } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        ((pSrcLimit - pSrc) >= 3) &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
-                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
-                    ) {
-                        reqLength++;
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        ((pSrcLimit - pSrc) >= 2) &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
-                    ) {
-                        reqLength++;
-                        pSrc += 2;
-                        continue;
-                    }
-                }

-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
+        /* Pre-flight the rest of the string. */
+        while(i < srcLength) {
+            // modified copy of U8_NEXT()
+            c = (uint8_t)src[i++];
+            if(U8_IS_SINGLE(c)) {
+                ++reqLength;
+            } else {
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        ((i)+1)<srcLength &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    ++reqLength;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        ((i)!=srcLength) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    ++reqLength;
+                    ++(i);
+                } else {
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
+                    }
+                    reqLength += U16_LENGTH(c);
                }
-                reqLength+=U16_LENGTH(ch);
            }
        }
    }
@ -753,7 +563,7 @@ u_strFromUTF8Lenient(UChar *dest,
    uint8_t* pSrc = (uint8_t*) src;

    /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)){
        return NULL;
    }
        
@ -994,7 +804,7 @@ u_strToUTF8WithSub(char *dest,
    int32_t numSubstitutions;

    /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)){
        return NULL;
    }
        
@ -1266,18 +1076,8 @@ u_strFromJavaModifiedUTF8WithSub(
        int32_t srcLength,
        UChar32 subchar, int32_t *pNumSubstitutions,
        UErrorCode *pErrorCode) {
-    UChar *pDest = dest;
-    UChar *pDestLimit = dest+destCapacity;
-    UChar32 ch;
-    int32_t reqLength = 0;
-    const uint8_t* pSrc = (const uint8_t*) src;
-    const uint8_t *pSrcLimit;
-    int32_t count;
-    uint8_t t1, t2; /* trail bytes */
-    int32_t numSubstitutions;
-
    /* args check */
-    if(U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
@ -1291,18 +1091,22 @@ u_strFromJavaModifiedUTF8WithSub(
    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
-    numSubstitutions=0;
+    UChar *pDest = dest;
+    UChar *pDestLimit = dest+destCapacity;
+    int32_t reqLength = 0;
+    int32_t numSubstitutions=0;

    if(srcLength < 0) {
        /*
         * Transform a NUL-terminated ASCII string.
         * Handle non-ASCII strings with slower code.
         */
-        while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
-            *pDest++=(UChar)ch;
-            ++pSrc;
+        UChar32 c;
+        while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
+            *pDest++=(UChar)c;
+            ++src;
        }
-        if(ch == 0) {
+        if(c == 0) {
            reqLength=(int32_t)(pDest - dest);
            if(pDestLength) {
                *pDestLength = reqLength;
@ -1312,33 +1116,38 @@ u_strFromJavaModifiedUTF8WithSub(
            u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
            return dest;
        }
-        srcLength = static_cast<int32_t>(uprv_strlen((const char *)pSrc));
+        srcLength = static_cast<int32_t>(uprv_strlen(src));
    }

-    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
-    pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
+    /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+    UChar32 ch;
+    uint8_t t1, t2;
+    int32_t i = 0;
    for(;;) {
-        count = (int32_t)(pDestLimit - pDest);
-        srcLength = (int32_t)(pSrcLimit - pSrc);
-        if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
+        int32_t count = (int32_t)(pDestLimit - pDest);
+        int32_t count2 = srcLength - i;
+        if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
            /* fast ASCII loop */
-            const uint8_t *prevSrc = pSrc;
-            int32_t delta;
-            while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
-                *pDest++=(UChar)ch;
-                ++pSrc;
+            int32_t start = i;
+            uint8_t b;
+            while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
+                *pDest++=b;
+                ++i;
            }
-            delta = (int32_t)(pSrc - prevSrc);
+            int32_t delta = i - start;
            count -= delta;
-            srcLength -= delta;
+            count2 -= delta;
        }
        /*
         * Each iteration of the inner loop progresses by at most 3 UTF-8
         * bytes and one UChar.
         */
-        srcLength /= 3;
-        if(count > srcLength) {
-            count = srcLength; /* min(remaining dest, remaining src/3) */
+        if(subchar > 0xFFFF) {
+            break;
+        }
+        count2 /= 3;
+        if(count > count2) {
+            count = count2; /* min(remaining dest, remaining src/3) */
        }
        if(count < 3) {
            /*
@ -1348,29 +1157,28 @@ u_strFromJavaModifiedUTF8WithSub(
            break;
        }
        do {
-            ch = *pSrc;
-            if(ch <= 0x7f){
+            ch = (uint8_t)src[i++];
+            if(U8_IS_SINGLE(ch)) {
                *pDest++=(UChar)ch;
-                ++pSrc;
            } else {
                if(ch >= 0xe0) {
                    if( /* handle U+0000..U+FFFF inline */
                        ch <= 0xef &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+                        (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
                    ) {
                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                        pSrc += 3;
+                        i += 2;
                        continue;
                    }
                } else {
                    if( /* handle U+0000..U+07FF inline */
                        ch >= 0xc0 &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
                    ) {
                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                        pSrc += 2;
+                        ++i;
                        continue;
                    }
                }
@ -1383,49 +1191,43 @@ u_strFromJavaModifiedUTF8WithSub(
                     * We need to write two UChars, adjusted count for that,
                     * and ran out of space.
                     */
+                    --i;  // back out byte ch
                    break;
                } else {
                    /* function call for error cases */
-                    ++pSrc; /* continue after the lead byte */
-                    utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                    utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                    ++numSubstitutions;
-                    if(subchar<=0xFFFF) {
-                        *(pDest++)=(UChar)subchar;
-                    } else {
-                        *(pDest++)=U16_LEAD(subchar);
-                        *(pDest++)=U16_TRAIL(subchar);
-                    }
+                    *(pDest++)=(UChar)subchar;
                }
            }
        } while(--count > 0);
    }

-    while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
-        ch = *pSrc;
-        if(ch <= 0x7f){
+    while(i < srcLength && (pDest < pDestLimit)) {
+        ch = (uint8_t)src[i++];
+        if(U8_IS_SINGLE(ch)){
            *pDest++=(UChar)ch;
-            ++pSrc;
        } else {
            if(ch >= 0xe0) {
                if( /* handle U+0000..U+FFFF inline */
                    ch <= 0xef &&
-                    ((pSrcLimit - pSrc) >= 3) &&
-                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                    (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                    (i+1) < srcLength &&
+                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+                    (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
                ) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                    *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                    pSrc += 3;
+                    i += 2;
                    continue;
                }
            } else {
                if( /* handle U+0000..U+07FF inline */
                    ch >= 0xc0 &&
-                    ((pSrcLimit - pSrc) >= 2) &&
-                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                    i < srcLength &&
+                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
                ) {
                    *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                    pSrc += 2;
+                    ++i;
                    continue;
                }
            }
@ -1435,8 +1237,7 @@ u_strFromJavaModifiedUTF8WithSub(
                return NULL;
            } else {
                /* function call for error cases */
-                ++pSrc; /* continue after the lead byte */
-                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                ++numSubstitutions;
                if(subchar<=0xFFFF) {
                    *(pDest++)=(UChar)subchar;
@ -1453,32 +1254,31 @@ u_strFromJavaModifiedUTF8WithSub(
        }
    }

-    /* do not fill the dest buffer just count the UChars needed */
-    while(pSrc < pSrcLimit){
-        ch = *pSrc;
-        if(ch <= 0x7f) {
+    /* Pre-flight the rest of the string. */
+    while(i < srcLength) {
+        ch = (uint8_t)src[i++];
+        if(U8_IS_SINGLE(ch)) {
            reqLength++;
-            ++pSrc;
        } else {
            if(ch >= 0xe0) {
                if( /* handle U+0000..U+FFFF inline */
                    ch <= 0xef &&
-                    ((pSrcLimit - pSrc) >= 3) &&
-                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
-                    (uint8_t)(pSrc[2] - 0x80) <= 0x3f
+                    (i+1) < srcLength &&
+                    (uint8_t)(src[i] - 0x80) <= 0x3f &&
+                    (uint8_t)(src[i+1] - 0x80) <= 0x3f
                ) {
                    reqLength++;
-                    pSrc += 3;
+                    i += 2;
                    continue;
                }
            } else {
                if( /* handle U+0000..U+07FF inline */
                    ch >= 0xc0 &&
-                    ((pSrcLimit - pSrc) >= 2) &&
-                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f
+                    i < srcLength &&
+                    (uint8_t)(src[i] - 0x80) <= 0x3f
                ) {
                    reqLength++;
-                    pSrc += 2;
+                    ++i;
                    continue;
                }
            }
@ -1488,8 +1288,7 @@ u_strFromJavaModifiedUTF8WithSub(
                return NULL;
            } else {
                /* function call for error cases */
-                ++pSrc; /* continue after the lead byte */
-                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                ++numSubstitutions;
                reqLength+=U16_LENGTH(ch);
            }
--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@ -847,15 +847,11 @@ U_CDECL_END
 //------------------------------------------------------------------------------

 // Chunk size.
-//     Must be less than 42  (256/6), because of byte mapping from UChar indexes to native indexes.
-//     Worst case there are six UTF-8 bytes per UChar.
-//         obsolete 6 byte form fd + 5 trails maps to fffd
-//         obsolete 5 byte form fc + 4 trails maps to fffd
-//         non-shortest 4 byte forms maps to fffd
-//         normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
-//     mapToUChars array size must allow for the worst case, 6.
-//     This could be brought down to 4, by treating fd and fc as pure illegal,
-//     rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
+//     Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
+//     Worst case is three native bytes to one UChar.  (Supplemenaries are 4 native bytes
+//     to two UChars.)
+//     The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
+//     is a three-byte sequence (truncated four-byte sequence).
 //
 enum { UTF8_TEXT_CHUNK_SIZE=32 };

@ -895,7 +891,7 @@ struct UTF8Buf {
                                                     //  Requires two extra slots,
                                                     //    one for a supplementary starting in the last normal position,
                                                     //    and one for an entry for the buffer limit position.
-    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to
+    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
                                                     //   correspoding offset in filled part of buf.
    int32_t   align;
 };
--- a/icu4c/source/common/utf_impl.cpp
+++ b/icu4c/source/common/utf_impl.cpp
@ -7,7 +7,7 @@
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
-*   file name:  utf_impl.c
+*   file name:  utf_impl.cpp
 *   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
@ -54,10 +54,6 @@
 * - SUB AX, BX (result)
 * -finish:
 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
- *
- * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
- * lead bytes above 0xf4 are illegal.
- * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
 */
 extern "C" U_EXPORT const uint8_t
 utf8_countTrailBytes[256]={
@ -76,27 +72,24 @@ utf8_countTrailBytes[256]={
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    // illegal C0 & C1
+    // 2-byte lead bytes C2..DF
+    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

+    // 3-byte lead bytes E0..EF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3,
-    3, 3, 3,    /* illegal in Unicode */
-    4, 4, 4, 4, /* illegal in Unicode */
-    5, 5,       /* illegal in Unicode */
-    0, 0        /* illegal bytes 0xfe and 0xff */
+    // 4-byte lead bytes F0..F4
+    // illegal F5..FF
+    3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };

-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
 static const UChar32
 utf8_errorValue[6]={
    // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
    // but without relying on the obsolete unicode/utf_old.h.
    0x15, 0x9f, 0xffff,
-    0x10ffff,
-    0x3ffffff, 0x7fffffff
+    0x10ffff
 };

 static UChar32
@ -136,61 +129,59 @@ errorValue(int32_t count, int8_t strict) {
 */
 U_CAPI UChar32 U_EXPORT2
 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
+    // *pi is one after byte c.
    int32_t i=*pi;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
-    if(i+count<=length || length<0) {
-        uint8_t trail;
-
-        U8_MASK_LEAD_BYTE(c, count);
-        /* support NUL-terminated strings: do not read beyond the first non-trail byte */
-        switch(count) {
-        /* each branch falls through to the next one */
-        case 0:
-            /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-        case 5:
-        case 4:
-            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-            break;
-        case 3:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
-            if(c>=0x110 || trail>0x3f) { break; }
-            U_FALLTHROUGH;
-        case 2:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            /*
-             * test for a surrogate d800..dfff unless we are lenient:
-             * before the last (c<<6), a surrogate is c=360..37f
-             */
-            if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
-            U_FALLTHROUGH;
-        case 1:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            if(trail>0x3f) { break; }
-            /* correct sequence - all trail bytes have (b7..b6)==(10) */
-            if(c>=utf8_minLegal[count] &&
-                    /* strict: forbid non-characters like U+fffe */
-                    (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
+    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
+    if(i==length || c>0xf4) {
+        // end of string, or not a lead byte
+    } else if(c>=0xf0) {
+        // Test for 4-byte sequences first because
+        // U8_NEXT() handles shorter valid sequences inline.
+        uint8_t t1=s[i], t2, t3;
+        c&=7;
+        if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
+                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
+                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
+            ++i;
+            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
+            // strict: forbid non-characters like U+fffe
+            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
                *pi=i;
                return c;
            }
-        /* no default branch to optimize switch()  - all values are covered */
        }
-    } else {
-        /* too few bytes left */
-        count=length-i;
-    }
+    } else if(c>=0xe0) {
+        c&=0xf;
+        if(strict!=-2) {
+            uint8_t t1=s[i], t2;
+            if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
+                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+                ++i;
+                c=(c<<12)|((t1&0x3f)<<6)|t2;
+                // strict: forbid non-characters like U+fffe
+                if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                    *pi=i;
+                    return c;
+                }
+            }
+        } else {
+            // strict=-2 -> lenient: allow surrogates
+            uint8_t t1=s[i]-0x80, t2;
+            if(t1<=0x3f && (c>0 || t1>=0x20) &&
+                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+                *pi=i+1;
+                return (c<<12)|(t1<<6)|t2;
+            }
+        }
+    } else if(c>=0xc2) {
+        uint8_t t1=s[i]-0x80;
+        if(t1<=0x3f) {
+            *pi=i+1;
+            return ((c-0xc0)<<6)|t1;
+        }
+    }  // else 0x80<=c<0xc2 is not a lead byte

    /* error handling */
-    i=*pi;
-    while(count>0 && U8_IS_TRAIL(s[i])) {
-        ++i;
-        --count;
-    }
    c=errorValue(i-*pi, strict);
    *pi=i;
    return c;
@ -243,99 +234,99 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool

 U_CAPI UChar32 U_EXPORT2
 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
+    // *pi is the index of byte c.
    int32_t i=*pi;
-    uint8_t b, count=1, shift=6;
-
-    if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
-
-    /* extract value bits from the last trail byte */
-    c&=0x3f;
-
-    for(;;) {
-        if(i<=start) {
-            /* no lead byte at all */
-            return errorValue(0, strict);
-        }
-
-        /* read another previous byte */
-        b=s[--i];
-        if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
-            if(b&0x40) {
-                /* lead byte, this will always end the loop */
-                uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
-
-                if(count==shouldCount) {
-                    /* set the new position */
-                    *pi=i;
-                    U8_MASK_LEAD_BYTE(b, count);
-                    c|=(UChar32)b<<shift;
-                    if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
-                        /* illegal sequence or (strict and non-character) */
-                        if(count>=4) {
-                            count=3;
+    if(U8_IS_TRAIL(c) && i>start) {
+        uint8_t b1=s[--i];
+        if(0xc2<=b1 && b1<0xe0) {
+            *pi=i;
+            return ((b1-0xc0)<<6)|(c&0x3f);
+        } else if(U8_IS_TRAIL(b1) && i>start) {
+            // Extract the value bits from the last trail byte.
+            c&=0x3f;
+            uint8_t b2=s[--i];
+            if(0xe0<=b2 && b2<0xf0) {
+                b2&=0xf;
+                if(strict!=-2) {
+                    if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                        *pi=i;
+                        c=(b2<<12)|((b1&0x3f)<<6)|c;
+                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                            return c;
+                        } else {
+                            // strict: forbid non-characters like U+fffe
+                            return errorValue(2, strict);
                        }
-                        c=errorValue(count, strict);
-                    } else {
-                        /* exit with correct c */
                    }
                } else {
-                    /* the lead byte does not match the number of trail bytes */
-                    /* only set the position to the lead byte if it would
-                       include the trail byte that we started with */
-                    if(count<shouldCount) {
+                    // strict=-2 -> lenient: allow surrogates
+                    b1-=0x80;
+                    if((b2>0 || b1>=0x20)) {
                        *pi=i;
-                        c=errorValue(count, strict);
-                    } else {
-                        c=errorValue(0, strict);
+                        return (b2<<12)|(b1<<6)|c;
                    }
                }
-                break;
-            } else if(count<5) {
-                /* trail byte */
-                c|=(UChar32)(b&0x3f)<<shift;
-                ++count;
-                shift+=6;
-            } else {
-                /* more than 5 trail bytes is illegal */
-                c=errorValue(0, strict);
-                break;
+            } else if(U8_IS_TRAIL(b2) && i>start) {
+                uint8_t b3=s[--i];
+                if(0xf0<=b3 && b3<=0xf4) {
+                    b3&=7;
+                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+                        *pi=i;
+                        c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
+                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                            return c;
+                        } else {
+                            // strict: forbid non-characters like U+fffe
+                            return errorValue(3, strict);
+                        }
+                    }
+                }
+            } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                // Truncated 4-byte sequence.
+                *pi=i;
+                return errorValue(2, strict);
            }
-        } else {
-            /* single-byte character precedes trailing bytes */
-            c=errorValue(0, strict);
-            break;
+        } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
+                ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+            // Truncated 3- or 4-byte sequence.
+            *pi=i;
+            return errorValue(1, strict);
        }
    }
-    return c;
+    return errorValue(0, strict);
 }

 U_CAPI int32_t U_EXPORT2
 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
-    /* i had been decremented once before the function call */
-    int32_t I=i, Z;
-    uint8_t b;
-
-    /* read at most the 6 bytes s[Z] to s[i], inclusively */
-    if(I-5>start) {
-        Z=I-5;
-    } else {
-        Z=start;
-    }
-
-    /* return I if the sequence starting there is long enough to include i */
-    do {
-        b=s[I];
-        if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
-            break;
-        } else if(b>=0xc0) {
-            if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
-                return I;
-            } else {
-                break;
+    // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
+    int32_t orig_i=i;
+    uint8_t c=s[i];
+    if(U8_IS_TRAIL(c) && i>start) {
+        uint8_t b1=s[--i];
+        if(0xc2<=b1 && b1<0xe0) {
+            return i;
+        } else if(U8_IS_TRAIL(b1) && i>start) {
+            uint8_t b2=s[--i];
+            if(0xe0<=b2 && b2<0xf0) {
+                if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                    return i;
+                }
+            } else if(U8_IS_TRAIL(b2) && i>start) {
+                uint8_t b3=s[--i];
+                if(0xf0<=b3 && b3<=0xf4) {
+                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+                        return i;
+                    }
+                }
+            } else if((0xf0<=b2 && b2<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                // Truncated 4-byte sequence.
+                return i;
            }
+        } else if(((0xe0<=b1 && b1<0xf0) && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
+                ((0xf0<=b1 && b1<=0xf4) && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+            // Truncated 3- or 4-byte sequence.
+            return i;
        }
-    } while(Z<=--I);
-
-    /* return i itself to be consistent with the FWD_1 macro */
-    return i;
+    }
+    return orig_i;
 }
--- a/icu4c/source/common/utrie2.h
+++ b/icu4c/source/common/utrie2.h
@ -20,6 +20,7 @@
 #define __UTRIE2_H__

 #include "unicode/utypes.h"
+#include "unicode/utf8.h"
 #include "putilimp.h"
 #include "udataswp.h"

@ -54,6 +55,8 @@ typedef struct UTrie UTrie;
 *   is truncated, omitting both the BMP portion and the high range.
 * - There is a special small index for 2-byte UTF-8, and the initial data
 *   entries are designed for fast 1/2-byte UTF-8 lookup.
+ *   Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all,
+ *   and the associated 2-byte indexes are unused.
 */

 /**
@ -933,29 +936,29 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
 /** Internal UTF-8 next-post-increment: get the next code point's data. */
 #define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) { \
    uint8_t __lead=(uint8_t)*(src)++; \
-    if(__lead<0xc0) { \
+    if(U8_IS_SINGLE(__lead)) { \
        (result)=(trie)->ascii[__lead]; \
    } else { \
        uint8_t __t1, __t2; \
-        if( /* handle U+0000..U+07FF inline */ \
-            __lead<0xe0 && (src)<(limit) && \
+        if( /* handle U+0800..U+FFFF inline */ \
+            0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \
+            U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \
+            (__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
+        ) { \
+            (src)+=2; \
+            (result)=(trie)->data[ \
+                ((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
+                                         ((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
+                <<UTRIE2_INDEX_SHIFT)+ \
+                (__t2&UTRIE2_DATA_MASK)]; \
+        } else if( /* handle U+0080..U+07FF inline */ \
+            __lead<0xe0 && __lead>=0xc2 && (src)<(limit) && \
            (__t1=(uint8_t)(*(src)-0x80))<=0x3f \
        ) { \
            ++(src); \
            (result)=(trie)->data[ \
                (trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
                __t1]; \
-        } else if( /* handle U+0000..U+CFFF inline */ \
-            __lead<0xed && ((src)+1)<(limit) && \
-            (__t1=(uint8_t)(*(src)-0x80))<=0x3f && (__lead>0xe0 || __t1>=0x20) && \
-            (__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
-        ) { \
-            (src)+=2; \
-            (result)=(trie)->data[ \
-                ((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
-                                         (__t1<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
-                <<UTRIE2_INDEX_SHIFT)+ \
-                (__t2&UTRIE2_DATA_MASK)]; \
        } else { \
            int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \
                                                                       (const uint8_t *)(limit)); \
@ -968,7 +971,7 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
 /** Internal UTF-8 pre-decrement-previous: get the previous code point's data. */
 #define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) { \
    uint8_t __b=(uint8_t)*--(src); \
-    if(__b<0x80) { \
+    if(U8_IS_SINGLE(__b)) { \
        (result)=(trie)->ascii[__b]; \
    } else { \
        int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \
--- a/icu4c/source/i18n/utf8collationiterator.cpp
+++ b/icu4c/source/i18n/utf8collationiterator.cpp
@ -49,26 +49,25 @@ UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
    }
    // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
    c = u8[pos++];
-    if(c < 0xc0) {
-        // ASCII 00..7F; trail bytes 80..BF map to error values.
+    if(U8_IS_SINGLE(c)) {
+        // ASCII 00..7F
        return trie->data32[c];
    }
    uint8_t t1, t2;
-    if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
-        // U+0080..U+07FF; 00..7F map to error values.
+    if(0xe0 <= c && c < 0xf0 &&
+            ((pos + 1) < length || length < 0) &&
+            U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
+            (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
+        // U+0800..U+FFFF except surrogates
+        c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
+        pos += 2;
+        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
+    } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
+        // U+0080..U+07FF
        uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
        c = ((c & 0x1f) << 6) | t1;
        ++pos;
        return ce32;
-    } else if(c <= 0xef &&
-              ((pos + 1) < length || length < 0) &&
-              (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
-              (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
-    ) {
-        // U+0800..U+FFFF; caller maps surrogates to error values.
-        c = (UChar)((c << 12) | (t1 << 6) | t2);
-        pos += 2;
-        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    } else {
        // Function call for supplementary code points and error cases.
        // Illegal byte sequences yield U+FFFD.
@ -158,28 +157,17 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
                return Collation::FALLBACK_CE32;
            }
            c = u8[pos++];
-            if(c < 0xc0) {
-                // ASCII 00..7F; trail bytes 80..BF map to error values.
+            if(U8_IS_SINGLE(c)) {
+                // ASCII 00..7F
                return trie->data32[c];
            }
            uint8_t t1, t2;
-            if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
-                // U+0080..U+07FF; 00..7F map to error values.
-                uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
-                c = ((c & 0x1f) << 6) | t1;
-                ++pos;
-                if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
-                    pos -= 2;
-                } else {
-                    return ce32;
-                }
-            } else if(c <= 0xef &&
-                      ((pos + 1) < length || length < 0) &&
-                      (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) &&
-                      (t2 = (u8[pos + 1] - 0x80)) <= 0x3f
-            ) {
-                // U+0800..U+FFFF; caller maps surrogates to error values.
-                c = (UChar)((c << 12) | (t1 << 6) | t2);
+            if(0xe0 <= c && c < 0xf0 &&
+                    ((pos + 1) < length || length < 0) &&
+                    U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
+                    (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
+                // U+0800..U+FFFF except surrogates
+                c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
                pos += 2;
                if(CollationFCD::hasTccc(c) &&
                        (CollationFCD::maybeTibetanCompositeVowel(c) ||
@ -188,6 +176,16 @@ FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
                } else {
                    break;  // return CE32(BMP)
                }
+            } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
+                // U+0080..U+07FF
+                uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
+                c = ((c & 0x1f) << 6) | t1;
+                ++pos;
+                if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
+                    pos -= 2;
+                } else {
+                    return ce32;
+                }
            } else {
                // Function call for supplementary code points and error cases.
                // Illegal byte sequences yield U+FFFD.
@ -237,7 +235,7 @@ UBool
 FCDUTF8CollationIterator::previousHasTccc() const {
    U_ASSERT(state == CHECK_BWD && pos != 0);
    UChar32 c = u8[pos - 1];
-    if(c < 0x80) { return FALSE; }
+    if(U8_IS_SINGLE(c)) { return FALSE; }
    int32_t i = pos;
    U8_PREV_OR_FFFD(u8, 0, i, c);
    if(c > 0xffff) { c = U16_LEAD(c); }
@ -271,7 +269,7 @@ FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
            if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
                return U_SENTINEL;
            }
-            if(c < 0x80) {
+            if(U8_IS_SINGLE(c)) {
                ++pos;
                return c;
            }
@ -309,7 +307,7 @@ FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
            if(pos == 0) {
                return U_SENTINEL;
            }
-            if((c = u8[pos - 1]) < 0x80) {
+            if(U8_IS_SINGLE(c = u8[pos - 1])) {
                --pos;
                return c;
            }
--- a/icu4c/source/test/cintltst/custrtrn.c
+++ b/icu4c/source/test/cintltst/custrtrn.c
@ -670,12 +670,13 @@ static void Test_UChar_UTF8_API(void){
    }

    /* test UTF-8 with single surrogates - illegal in Unicode 3.2 */
+    // Since ICU 60, each surrogate byte sequence is treated as 3 single-byte errors.
    {
        static const UChar
            withLead16[]={ 0x1800, 0xd89a, 0x0061 },
            withTrail16[]={ 0x1800, 0xdcba, 0x0061, 0 },
-            withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
-            withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
+            withTrail16SubFFFD[]={ 0x1800, 0xfffd, 0xfffd, 0xfffd, 0x0061, 0 }, /* sub==U+FFFD */
+            withTrail16Sub50005[]={ 0x1800, 0xd900, 0xdc05, 0xd900, 0xdc05, 0xd900, 0xdc05, 0x0061, 0 }; /* sub==U+50005 */
        static const uint8_t
            withLead8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xa2, 0x9a, 0x61 },
            withTrail8[]={ 0xe1, 0xa0, 0x80, 0xed, 0xb2, 0xba, 0x61, 0 },
@ -706,7 +707,7 @@ static void Test_UChar_UTF8_API(void){
                             &err);
        if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16Sub50005) ||
                             0!=u_memcmp(withTrail16Sub50005, out16, uDestLen+1) ||
-                             numSubstitutions!=1) {
+                             numSubstitutions!=3) {
            log_err("error: u_strFromUTF8WithSub(length) failed\n");
        }

@ -721,7 +722,7 @@ static void Test_UChar_UTF8_API(void){
                             &err);
        if(U_FAILURE(err) || uDestLen!=u_strlen(withTrail16SubFFFD) ||
                             0!=u_memcmp(withTrail16SubFFFD, out16, uDestLen+1) ||
-                             numSubstitutions!=1) {
+                             numSubstitutions!=3) {
            log_err("error: u_strFromUTF8WithSub(NUL termination) failed\n");
        }

@ -734,7 +735,7 @@ static void Test_UChar_UTF8_API(void){
                             (const char *)withTrail8, -1,
                             0x50005, &numSubstitutions,
                             &err);
-        if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=1) {
+        if(err!=U_BUFFER_OVERFLOW_ERROR || uDestLen!=u_strlen(withTrail16Sub50005) || numSubstitutions!=3) {
            log_err("error: u_strFromUTF8WithSub(preflight/NUL termination) failed\n");
        }

@ -1015,14 +1016,6 @@ Test_FromUTF8Lenient(void) {
        log_err("u_strFromUTF8Lenient(U_MEMORY_ALLOCATION_ERROR) failed\n");
    }

-    dest[0]=0x1234;
-    destLength=-1;
-    errorCode=U_MEMORY_ALLOCATION_ERROR;
-    pDest=u_strFromUTF8Lenient(dest, 1, &destLength, (const char *)bytes, -1, NULL);
-    if(dest[0]!=0x1234) {
-        log_err("u_strFromUTF8Lenient(pErrorCode=NULL) failed\n");
-    }
-
    /* test normal behavior */
    number=0; /* string number for log_err() */

--- a/icu4c/source/test/cintltst/trie2test.c
+++ b/icu4c/source/test/cintltst/trie2test.c
@ -350,6 +350,11 @@ static void
 testTrieUTF8(const char *testName,
             const UTrie2 *trie, UTrie2ValueBits valueBits,
             const CheckRange checkRanges[], int32_t countCheckRanges) {
+    // Note: The byte sequence comments refer to the original UTF-8 definition.
+    // Starting with ICU 60, any sequence that is not a prefix of a valid one
+    // is treated as multiple single-byte errors.
+    // For testing, we only rely on U8_... and UTrie2 UTF-8 macros
+    // iterating consistently.
    static const uint8_t illegal[]={
        0xc0, 0x80,                         /* non-shortest U+0000 */
        0xc1, 0xbf,                         /* non-shortest U+007f */
@ -394,15 +399,36 @@ testTrieUTF8(const char *testName,
        value=checkRanges[i].value;
        /* write three legal (or surrogate) code points */
        U8_APPEND_UNSAFE(s, length, prevCP);    /* start of the range */
-        values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
+        if(U_IS_SURROGATE(prevCP)) {
+            // A surrogate byte sequence counts as 3 single-byte errors.
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+        } else {
+            values[countValues++]=value;
+        }
        c=checkRanges[i].limit;
        prevCP=(prevCP+c)/2;                    /* middle of the range */
        U8_APPEND_UNSAFE(s, length, prevCP);
-        values[countValues++]=U_IS_SURROGATE(prevCP) ? errorValue : value;
+        if(U_IS_SURROGATE(prevCP)) {
+            // A surrogate byte sequence counts as 3 single-byte errors.
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+        } else {
+            values[countValues++]=value;
+        }
        prevCP=c;
        --c;                                    /* end of the range */
        U8_APPEND_UNSAFE(s, length, c);
-        values[countValues++]=U_IS_SURROGATE(c) ? errorValue : value;
+        if(U_IS_SURROGATE(prevCP)) {
+            // A surrogate byte sequence counts as 3 single-byte errors.
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+            values[countValues++]=errorValue;
+        } else {
+            values[countValues++]=value;
+        }
        /* write an illegal byte sequence */
        if(i8<sizeof(illegal)) {
            U8_FWD_1(illegal, i8, sizeof(illegal));
@ -435,17 +461,20 @@ testTrieUTF8(const char *testName,
        }
        bytes=0;
        if(value!=values[i] || i8!=(p-s)) {
-            while(prev8<i8) {
-                bytes=(bytes<<8)|s[prev8++];
+            int32_t k=prev8;
+            while(k<i8) {
+                bytes=(bytes<<8)|s[k++];
            }
        }
        if(value!=values[i]) {
-            log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
-                    testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
+            log_err("error: wrong value from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx) (read %d bytes): "
+                    "0x%lx instead of 0x%lx\n",
+                    testName, (int)prev8, (unsigned long)bytes, (long)c, (int)((p-s)-prev8),
+                    (long)value, (long)values[i]);
        }
        if(i8!=(p-s)) {
-            log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(%lx->U+%04lx): %ld != %ld\n",
-                    testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
+            log_err("error: wrong end index from UTRIE2_U8_NEXT(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
+                    testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
            continue;
        }
        ++i;
@ -471,12 +500,14 @@ testTrieUTF8(const char *testName,
            }
        }
        if(value!=values[i]) {
-            log_err("error: wrong value from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): 0x%lx instead of 0x%lx\n",
-                    testName, (unsigned long)bytes, (long)c, (long)value, (long)values[i]);
+            log_err("error: wrong value from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx) (read %d bytes): "
+                    ": 0x%lx instead of 0x%lx\n",
+                    testName, (int)prev8, (unsigned long)bytes, (long)c, (int)(prev8-(p-s)),
+                    (long)value, (long)values[i]);
        }
        if(i8!=(p-s)) {
-            log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(%lx->U+%04lx): %ld != %ld\n",
-                    testName, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
+            log_err("error: wrong end index from UTRIE2_U8_PREV(%s)(from %d %lx->U+%04lx): %ld != %ld\n",
+                    testName, (int)prev8, (unsigned long)bytes, (long)c, (long)(p-s), (long)i8);
            continue;
        }
    }
--- a/icu4c/source/test/cintltst/utf8tst.c
+++ b/icu4c/source/test/cintltst/utf8tst.c
@ -121,7 +121,7 @@ addUTF8Test(TestNode** root)

 static void TestCodeUnitValues()
 {
-    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
+    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};

    int16_t i;
    for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
@ -231,28 +231,31 @@ static void TestGetChar()
        0x10401,          0x10401,                    0x10401 ,
        0x10401,          0x10401,                    0x10401 ,
        0x10401,          0x10401,                    0x10401,
-        0x25,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
+        -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
        0x65,             0x65,                       0x65,
        0x31,             0x31,                       0x31,
-        0x31,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
-        0x240,            UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
+        -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
+        -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
    };
    uint16_t i=0;
    UChar32 c, expected;
    uint32_t offset=0;

    for(offset=0; offset<sizeof(input); offset++) {
-        if (offset < sizeof(input) - 1) {
+        expected = result[i];
+        if (expected >= 0 && offset < sizeof(input) - 1) {
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
            UTF8_GET_CHAR_UNSAFE(input, offset, c);
-            if(c != result[i]){
-                log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
+            if(c != expected) {
+                log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+                        offset, expected, c);

            }
 #endif
            U8_GET_UNSAFE(input, offset, c);
-            if(c != result[i]){
-                log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
+            if(c != expected) {
+                log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+                        offset, expected, c);

            }
        }
@ -285,146 +288,160 @@ static void TestGetChar()
 }

 static void TestNextPrevChar() {
-    static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
+    static const uint8_t input[]={
+        0x61,
+        0xf0, 0x90, 0x90, 0x81,
+        0xc0, 0x80,  // non-shortest form
+        0xf3, 0xbe,  // truncated
+        0xc2,  // truncated
+        0x61,
+        0x81, 0x90, 0x90, 0xf0,  // "backwards" sequence
+        0x00
+    };
    static const UChar32 result[]={
-    /*  next_unsafe    next_safe_ns        next_safe_s          prev_unsafe   prev_safe_ns        prev_safe_s */
-        0x0061,        0x0061,             0x0061,              0x0000,       0x0000,             0x0000,
-        0x10401,       0x10401,            0x10401,             0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841410,    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xa1050,      UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841,       UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x00,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x61,         0x61,               0x61,
-        0x80,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xc2,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0xfd,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x77e,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
-        0xbe,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xfd,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0xa1,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x00,         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
-        0x61,          0x61,               0x61,                0xc0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,      0x10401,            0x10401,
-        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
-        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
-        0x0840,        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
-        0x0000,        0x0000,             0x0000,              0x0061,       0x0061,             0x0061
+    /*  next_safe_ns        next_safe_s          prev_safe_ns        prev_safe_s */
+        0x0061,             0x0061,              0x0000,             0x0000,
+        0x10401,            0x10401,             UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x61,               0x61,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        0x61,               0x61,                UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,            0x10401,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
+        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
+        0x0000,             0x0000,              0x0061,             0x0061
    };
    static const int32_t movedOffset[]={
-    /*  next_unsafe   next_safe_ns next_safe_s       prev_unsafe   prev_safe_ns      prev_safe_s */
-        1,            1,           1,                15,           15,               15,
-        5,            5,           5,                14,           14 ,              14,
-        3,            3,           3,                9,            13,               13,
-        4,            4,           4,                9,            12,               12,
-        5,            5,           5,                9,            11,               11,
-        7,            7,           7,                10,           10,               10,
-        7,            7,           7,                9,            9,                9,
-        8,            9,           9,                7,            7,                7,
-        9,            9,           9,                7,            7,                7,
-        11,           10,          10,               5,            5,                5,
-        11,           11,          11,               5,            5,                5,
-        12,           12,          12,               1,            1,                1,
-        13,           13,          13,               1,            1,                1,
-        14,           14,          14,               1,            1,                1,
-        14,           15,          15,               1,            1,                1,
-        14,           16,          16,               0,            0,                0,
+    /*  next_safe    prev_safe_s */
+        1,           15,
+        5,           14,
+        3,           13,
+        4,           12,
+        5,           11,
+        6,           10,
+        7,           9,
+        9,           7,
+        9,           7,
+        10,          6,
+        11,          5,
+        12,          1,
+        13,          1,
+        14,          1,
+        15,          1,
+        16,          0,
    };
-    /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */

    UChar32 c, expected;
-    uint32_t i=0;
+    uint32_t i=0, j=0;
    uint32_t offset=0;
    int32_t setOffset=0;
    for(offset=0; offset<sizeof(input); offset++){
-        expected=result[i+1];
+        expected=result[i];  // next_safe_ns
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
-         setOffset=offset;
-         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
-         if(setOffset != movedOffset[i+1]){
-             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+1], setOffset);
-         }
-        if(c != expected){
-            log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        setOffset=offset;
+        UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
+        if(setOffset != movedOffset[j]) {
+            log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j], setOffset);
+        }
+        if(c != expected) {
+            log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
        }
 #endif
-         setOffset=offset;
-         U8_NEXT(input, setOffset, sizeof(input), c);
-         if(setOffset != movedOffset[i+1]){
-             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+1], setOffset);
-         }
+        setOffset=offset;
+        U8_NEXT(input, setOffset, sizeof(input), c);
+        if(setOffset != movedOffset[j]) {
+            log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j], setOffset);
+        }
        if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
-        if(c != expected){
-            log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        if(c != expected) {
+            log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
        }

        setOffset=offset;
        U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
-        if(setOffset != movedOffset[i+1]){
+        if(setOffset != movedOffset[j]) {
            log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                offset, movedOffset[i+1], setOffset);
+                offset, movedOffset[j], setOffset);
        }
        if(expected<0) { expected=0xfffd; }
-        if(c != expected){
-            log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        if(c != expected) {
+            log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
        }
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
-         setOffset=offset;
-         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
-         if(setOffset != movedOffset[i+1]){
-             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+2], setOffset);
-         }
-         if(c != result[i+2]){
-             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
-         }
+        setOffset=offset;
+        UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
+        if(setOffset != movedOffset[j]) {
+            log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j], setOffset);
+        }
+        expected=result[i+1];  // next_safe_s
+        if(c != expected) {
+            log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
+                    offset, expected, c);
+        }
 #endif
-         i=i+6;
+        i=i+4;
+        j=j+2;
    }

-    i=0;
+    i=j=0;
    for(offset=sizeof(input); offset > 0; --offset){
-        expected=result[i+4];
+        expected=result[i+2];  // prev_safe_ns
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
-         setOffset=offset;
-         UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
-         if(setOffset != movedOffset[i+4]){
-             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+4], setOffset);
-         }
-        if(c != expected){
-            log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        setOffset=offset;
+        UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
+        if(setOffset != movedOffset[j+1]) {
+            log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j+1], setOffset);
+        }
+        if(c != expected) {
+            log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
        }
 #endif
-         setOffset=offset;
-         U8_PREV(input, 0, setOffset, c);
-         if(setOffset != movedOffset[i+4]){
-             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+4], setOffset);
-         }
+        setOffset=offset;
+        U8_PREV(input, 0, setOffset, c);
+        if(setOffset != movedOffset[j+1]) {
+            log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j+1], setOffset);
+        }
        if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
-        if(c != expected){
-            log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        if(c != expected) {
+            log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
        }

        setOffset=offset;
        U8_PREV_OR_FFFD(input, 0, setOffset, c);
-        if(setOffset != movedOffset[i+4]){
+        if(setOffset != movedOffset[j+1]) {
            log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                offset, movedOffset[i+4], setOffset);
+                offset, movedOffset[j+1], setOffset);
        }
        if(expected<0) { expected=0xfffd; }
-        if(c != expected){
-            log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
+        if(c != expected) {
+            log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
        }
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
-         setOffset=offset;
-         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
-         if(setOffset != movedOffset[i+5]){
-             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
-                 offset, movedOffset[i+5], setOffset);
-         }
-         if(c != result[i+5]){
-             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
-         }
+        setOffset=offset;
+        UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
+        if(setOffset != movedOffset[j+1]) {
+            log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+                offset, movedOffset[j+1], setOffset);
+        }
+        expected=result[i+3];  // prev_safe_s
+        if(c != expected) {
+            log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
+                    offset, expected, c);
+        }
 #endif
-         i=i+6;
+        i=i+4;
+        j=j+2;
    }
 }

@ -433,11 +450,13 @@ static void TestNulTerminated() {
    static const uint8_t input[]={
        /*  0 */  0x61,
        /*  1 */  0xf0, 0x90, 0x90, 0x81,
-        /*  5 */  0xc0, 0x80,
+        /*  5 */  0xc0,
+        /*  6 */  0x80,
        /*  7 */  0xdf, 0x80,
        /*  9 */  0xc2,
        /* 10 */  0x62,
-        /* 11 */  0xfd, 0xbe,
+        /* 11 */  0xfd,
+        /* 12 */  0xbe,
        /* 13 */  0xe0, 0xa0, 0x80,
        /* 16 */  0xe2, 0x82, 0xac,
        /* 19 */  0xf0, 0x90, 0x90,
@ -447,14 +466,16 @@ static void TestNulTerminated() {
    static const UChar32 result[]={
        0x61,
        0x10401,
-        U_SENTINEL,
+        U_SENTINEL,  // C0 not a lead byte
+        U_SENTINEL,  // 80
        0x7c0,
-        U_SENTINEL,
+        U_SENTINEL,  // C2
        0x62,
-        U_SENTINEL,
+        U_SENTINEL,  // FD not a lead byte
+        U_SENTINEL,  // BE
        0x800,
        0x20ac,
-        U_SENTINEL,
+        U_SENTINEL,  // truncated F0 90 90
        0
    };

@ -544,6 +565,22 @@ static void TestNextPrevNonCharacters() {
            log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
        }
    }
+#if !U_HIDE_OBSOLETE_UTF_OLD_H
+    for(idx=0; idx<(int32_t)sizeof(nonChars);) {
+        UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
+        UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);
+        if(ch!=expected) {
+            log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
+        }
+    }
+    for(idx=(int32_t)sizeof(nonChars); idx>0;) {
+        UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);
+        UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
+        if(ch!=expected) {
+            log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
+        }
+    }
+#endif
 }

 static void TestNextPrevCharUnsafe() {
@ -563,58 +600,83 @@ static void TestNextPrevCharUnsafe() {
    static const UChar32 codePoints[]={
        0x61,
        0x10401,
-        0,
+        -1,
        0x20ac,
        0xa1,
        0x10ffff,
        0
    };

-    UChar32 c;
+    UChar32 c, expected;
    int32_t i;
    uint32_t offset;
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(i=0, offset=0; offset<sizeof(input); ++i) {
        UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
-        if(c != codePoints[i]){
+        expected = codePoints[i];
+        if(expected >= 0 && c != expected) {
            log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
-                    offset, codePoints[i], c);
+                    offset, expected, c);
+        }
+        if(offset==6) {
+            // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
+            // while the new one skips C0 80 together.
+            ++offset;
        }
    }
 #endif
    for(i=0, offset=0; offset<sizeof(input); ++i) {
        U8_NEXT_UNSAFE(input, offset, c);
-        if(c != codePoints[i]){
+        expected = codePoints[i];
+        if(expected >= 0 && c != expected) {
            log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
-                    offset, codePoints[i], c);
+                    offset, expected, c);
        }
    }
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
-         UTF8_PREV_CHAR_UNSAFE(input, offset, c);
-         if(c != codePoints[i]){
-             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
-                     offset, codePoints[i], c);
-         }
+        UTF8_PREV_CHAR_UNSAFE(input, offset, c);
+        expected = codePoints[i];
+        if(expected >= 0 && c != expected) {
+            log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+                    offset, expected, c);
+        }
    }
 #endif
    for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
-         U8_PREV_UNSAFE(input, offset, c);
-         if(c != codePoints[i]){
-             log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
-                     offset, codePoints[i], c);
-         }
+        U8_PREV_UNSAFE(input, offset, c);
+        expected = codePoints[i];
+        if(expected >= 0 && c != expected) {
+            log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
+                    offset, expected, c);
+        }
    }
 }

 static void TestFwdBack() {
-    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
-    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18};
-    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};
+    static const uint8_t input[]={
+        0x61,
+        0xF0, 0x90, 0x90, 0x81,
+        0xff,
+        0x62,
+        0xc0,
+        0x80,
+        0x7f,
+        0x8f,
+        0xc0,
+        0x63,
+        0x81,
+        0x90,
+        0x90,
+        0xF0,
+        0x00
+    };
+    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
+    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};

-    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
+    static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
-    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};
+    static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};

    uint32_t offsafe=0;

@ -707,7 +769,10 @@ static void TestFwdBackUnsafe() {
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
-    static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
+    // forward unsafe skips only C0
+    static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
+    // backward unsafe skips C0 80 together
+    static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t offset;
    int32_t i;
@ -726,17 +791,17 @@ static void TestFwdBackUnsafe() {
        }
    }
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
-    for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
+    for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
        UTF8_BACK_1_UNSAFE(input, offset);
-        if(offset != boundaries[i]){
-            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
+        if(offset != backBoundaries[i]){
+            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
        }
    }
 #endif
-    for(i=UPRV_LENGTHOF(boundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
+    for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
        U8_BACK_1_UNSAFE(input, offset);
-        if(offset != boundaries[i]){
-            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
+        if(offset != backBoundaries[i]){
+            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
        }
    }
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
@ -756,21 +821,21 @@ static void TestFwdBackUnsafe() {
        }
    }
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
-    for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
-        int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
+    for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
+        int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
        offset=UPRV_LENGTHOF(input);
        UTF8_BACK_N_UNSAFE(input, offset, i);
-        if(offset != boundaries[j]) {
-            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
+        if(offset != backBoundaries[j]) {
+            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
        }
    }
 #endif
-    for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
-        int32_t j=UPRV_LENGTHOF(boundaries)-1-i;
+    for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
+        int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
        offset=UPRV_LENGTHOF(input);
        U8_BACK_N_UNSAFE(input, offset, i);
-        if(offset != boundaries[j]) {
-            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
+        if(offset != backBoundaries[j]) {
+            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
        }
    }
 }
@ -1138,8 +1203,12 @@ TestSurrogates() {
            log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
        }

-        if(is!=iu || il!=iu) {
-            log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
+        // U8_NEXT() skips only the first byte of a surrogate byte sequence.
+        if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
+            log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
+        }
+        if(il!=iu) {
+            log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
        }

        ++k;    /* next code point */
@ -1175,8 +1244,12 @@ TestSurrogates() {
            log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
        }

-        if(is!=iu || il !=iu) {
-            log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
+        // U8_PREV() skips only the last byte of a surrogate byte sequence.
+        if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
+            log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
+        }
+        if(il !=iu) {
+            log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
        }

        i=iu;   /* go back by one UTF-8 sequence */
--- a/icu4c/source/test/intltest/collationtest.cpp
+++ b/icu4c/source/test/intltest/collationtest.cpp
@ -294,24 +294,22 @@ void CollationTest::TestIllegalUTF8() {
    coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);

    static const char *strings[] = {
-        // U+FFFD
-        "a\xef\xbf\xbdz",
-        // illegal byte sequences
-        "a\x80z",  // trail byte
-        "a\xc1\x81z",  // non-shortest form
-        "a\xe0\x82\x83z",  // non-shortest form
-        "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
-        "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
-        "a\xf0\x8f\xbf\xbfz",  // non-shortest form
-        "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
+        // string with U+FFFD == illegal byte sequence
+        u8"a\uFFFDz", "a\x80z",  // trail byte
+        u8"a\uFFFD\uFFFDz", "a\xc1\x81z",  // non-shortest form
+        u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z",  // non-shortest form
+        u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
+        u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
+        u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz",  // non-shortest form
+        u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
    };

-    StringPiece fffd(strings[0]);
-    for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
-        StringPiece illegal(strings[i]);
+    for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
+        StringPiece fffd(strings[i]);
+        StringPiece illegal(strings[i + 1]);
        UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
        if(order != UCOL_EQUAL) {
-            errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
+            errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
                  (int)i, order);
        }
    }
--- a/icu4c/source/test/intltest/strtest.cpp
+++ b/icu4c/source/test/intltest/strtest.cpp
@ -146,7 +146,7 @@ void
 StringTest::Test_UTF8_COUNT_TRAIL_BYTES() {
 #if !U_HIDE_OBSOLETE_UTF_OLD_H
    if(UTF8_COUNT_TRAIL_BYTES(0x7F) != 0
-            || UTF8_COUNT_TRAIL_BYTES(0xC0) != 1
+            || UTF8_COUNT_TRAIL_BYTES(0xC2) != 1
            || UTF8_COUNT_TRAIL_BYTES(0xE0) != 2
            || UTF8_COUNT_TRAIL_BYTES(0xF0) != 3) {
        errln("UTF8_COUNT_TRAIL_BYTES does not work right! See utf_old.h.");
@ -155,7 +155,7 @@ StringTest::Test_UTF8_COUNT_TRAIL_BYTES() {
    // Note: U8_COUNT_TRAIL_BYTES (current) and UTF8_COUNT_TRAIL_BYTES (deprecated)
    //       have completely different implementations.
    if (U8_COUNT_TRAIL_BYTES(0x7F) != 0
-            || U8_COUNT_TRAIL_BYTES(0xC0) != 1
+            || U8_COUNT_TRAIL_BYTES(0xC2) != 1
            || U8_COUNT_TRAIL_BYTES(0xE0) != 2
            || U8_COUNT_TRAIL_BYTES(0xF0) != 3) {
        errln("U8_COUNT_TRAIL_BYTES does not work right! See utf8.h.");
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@ -1881,9 +1881,9 @@ UnicodeStringTest::TestUTF8() {
        0xf3, 0xa0, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf
    };
    static const UChar expected_utf16[] = {
-        0x41, 0xfffd,
-        0x61, 0xfffd,
-        0xfffd, 0x5a,
+        0x41, 0xfffd, 0xfffd, 0xfffd,
+        0x61, 0xfffd, 0xfffd, 0xfffd,
+        0xfffd,  0xfffd, 0xfffd, 0xfffd,0x5a,
        0xd900, 0xdc00, 0x7a,
        0xd800, 0xdc00, 0xd840, 0xdc00,
        0xdb40, 0xdc00, 0xdbff, 0xdfff
--- a/icu4c/source/test/intltest/utxttest.cpp
+++ b/icu4c/source/test/intltest/utxttest.cpp
@ -60,7 +60,6 @@ UTextTest::runIndexedTest(int32_t index, UBool exec,
    TESTCASE_AUTO(Ticket10562);
    TESTCASE_AUTO(Ticket10983);
    TESTCASE_AUTO(Ticket12130);
-    TESTCASE_AUTO(Ticket12888);
    TESTCASE_AUTO(Ticket13344);
    TESTCASE_AUTO_END;
 }
@ -951,10 +950,14 @@ void UTextTest::ErrorTest()
        UChar buf[10];
        int n = utext_extract(ut, 0, 9, buf, 10, &status);
        TEST_SUCCESS(status);
-        TEST_ASSERT(n==5);
+        TEST_ASSERT(n==7);
+        TEST_ASSERT(buf[0] == 0x41);
        TEST_ASSERT(buf[1] == 0xfffd);
-        TEST_ASSERT(buf[3] == 0xfffd);
        TEST_ASSERT(buf[2] == 0x42);
+        TEST_ASSERT(buf[3] == 0xfffd);
+        TEST_ASSERT(buf[4] == 0xfffd);
+        TEST_ASSERT(buf[5] == 0xfffd);
+        TEST_ASSERT(buf[6] == 0x43);
        utext_close(ut);
    }

@ -1578,66 +1581,6 @@ void UTextTest::Ticket12130() {
    utext_close(&ut);
 }

-// Ticket 12888: bad handling of illegal utf-8 containing many instances of the archaic, now illegal,
-//               six byte utf-8 forms. Original implementation had an assumption that
-//               there would be at most three utf-8 bytes per UTF-16 code unit.
-//               The five and six byte sequences map to a single replacement character.
-
-void UTextTest::Ticket12888() {
-    const char *badString = 
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80"
-            "\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80\xfd\x80\x80\x80\x80\x80";
-
-    UErrorCode status = U_ZERO_ERROR;
-    LocalUTextPointer ut(utext_openUTF8(NULL, badString, -1, &status));
-    TEST_SUCCESS(status);
-    for (;;) {
-        UChar32 c = utext_next32(ut.getAlias());
-        if (c == U_SENTINEL) {
-            break;
-        }
-    }
-    int32_t endIdx = utext_getNativeIndex(ut.getAlias());
-    if (endIdx != (int32_t)strlen(badString)) {
-        errln("%s:%d expected=%d, actual=%d", __FILE__, __LINE__, strlen(badString), endIdx);
-        return;
-    }
-
-    for (int32_t prevIndex = endIdx; prevIndex>0;) {
-        UChar32 c = utext_previous32(ut.getAlias());
-        int32_t currentIndex = utext_getNativeIndex(ut.getAlias());
-        if (c != 0xfffd) {
-            errln("%s:%d (expected, actual, index) = (%d, %d, %d)\n",
-                    __FILE__, __LINE__, 0xfffd, c, currentIndex);
-            break;
-        }
-        if (currentIndex != prevIndex - 6) {
-            errln("%s:%d: wrong index. Expected, actual = %d, %d",
-                    __FILE__, __LINE__, prevIndex - 6, currentIndex);
-            break;
-        }
-        prevIndex = currentIndex;
-    }
-}
-
 // Ticket 13344 The macro form of UTEXT_SETNATIVEINDEX failed when target was a trail surrogate
 //              of a supplementary character.

--- a/icu4c/source/test/intltest/utxttest.h
+++ b/icu4c/source/test/intltest/utxttest.h
@ -38,7 +38,6 @@ public:
    void Ticket10562();
    void Ticket10983();
    void Ticket12130();
-    void Ticket12888();
    void Ticket13344();

 private:
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/BMPSet.java
@ -16,11 +16,12 @@ import com.ibm.icu.util.OutputInt;

 /**
 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
- * 
+ *
 * Latin-1: Look up bytes.
 * 2-byte characters: Bits organized vertically.
 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
- * Supplementary characters: Call contains() on the parent set.
+ * Supplementary characters: Binary search over
+ * the supplementary part of the parent set's inversion list.
 */
 public final class BMPSet {
    public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
@ -34,9 +35,8 @@ public final class BMPSet {
     * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
     * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
     * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
-     * 
-     * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
-     * runtime.
+     *
+     * Bits for 0..FF are unused (0).
     */
    private int[] table7FF;

@ -46,9 +46,8 @@ public final class BMPSet {
     * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
     * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
     * and set.contains(c) must be called.
-     * 
-     * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
-     * validity checking at runtime.
+     *
+     * Bits for 0..7FF are unused (0).
     */
    private int[] bmpBlockBits;

@ -127,7 +126,7 @@ public final class BMPSet {
    /**
     * Span the initial substring for which each character c has spanCondition==contains(c). It must be
     * spanCondition==0 or 1.
-     * 
+     *
     * @param start The start index
     * @param outCount If not null: Receives the number of code points in the span.
     * @return the limit (exclusive end) of the span
@ -232,7 +231,7 @@ public final class BMPSet {
     * Symmetrical with span().
     * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
     * limit and spanCondition==0 or 1.
-     * 
+     *
     * @return The string index which starts the span (i.e. inclusive).
     */
    public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
@ -462,10 +461,10 @@ public final class BMPSet {
    /**
     * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
     * points in a certain range.
-     * 
+     *
     * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
     * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
-     * 
+     *
     * @param c
     *            a character in a subrange of MIN_VALUE..MAX_VALUE
     * @param lo
@ -512,4 +511,3 @@ public final class BMPSet {
        return (0 != (findCodePoint(c, lo, hi) & 1));
    }
 }
-