From bf0220380bd3792f6317a419f5421f5f9ff47203 Mon Sep 17 00:00:00 2001
From: Andrew J Macheret <ajmacher@svn.icu-project.org>
Date: Fri, 8 Jun 2007 21:49:02 +0000
Subject: [PATCH] ICU-5739 dramatically increased the speed of the encoding and
 decoding of us-ascii and isolatin1.

X-SVN-Rev: 21678
---
 .../src/com/ibm/icu/charset/Charset88591.java |  59 +++++--
 .../src/com/ibm/icu/charset/CharsetASCII.java | 158 +++++++++++++-----
 2 files changed, 153 insertions(+), 64 deletions(-)

diff --git a/icu4j/src/com/ibm/icu/charset/Charset88591.java b/icu4j/src/com/ibm/icu/charset/Charset88591.java
index b8880df40f..08478debc7 100644
--- a/icu4j/src/com/ibm/icu/charset/Charset88591.java
+++ b/icu4j/src/com/ibm/icu/charset/Charset88591.java
@@ -17,8 +17,7 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;
 
 class Charset88591 extends CharsetASCII {
-    public Charset88591(String icuCanonicalName, String javaCanonicalName,
-            String[] aliases) {
+    public Charset88591(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
         super(icuCanonicalName, javaCanonicalName, aliases);
     }
 
@@ -27,19 +26,26 @@ class Charset88591 extends CharsetASCII {
             super(cs);
         }
 
-        protected CoderResult decodeLoopCoreOptimized(ByteBuffer source,
-                CharBuffer target, byte[] sourceArray, char[] targetArray,
-                int oldSource, int offset, int limit) {
+        protected CoderResult decodeLoopCoreOptimized(ByteBuffer source, CharBuffer target,
+                byte[] sourceArray, char[] targetArray, int oldSource, int offset, int limit) {
 
+            /*
+             * perform 88591 conversion from the source array to the target array. no range check is
+             * necessary.
+             */
             for (int i = oldSource; i < limit; i++)
                 targetArray[i + offset] = (char) (sourceArray[i] & 0xff);
 
             return null;
         }
 
-        protected CoderResult decodeLoopCoreUnoptimized(ByteBuffer source,
-                CharBuffer target) throws BufferUnderflowException,
-                BufferOverflowException {
+        protected CoderResult decodeLoopCoreUnoptimized(ByteBuffer source, CharBuffer target)
+                throws BufferUnderflowException, BufferOverflowException {
+
+            /*
+             * perform 88591 conversion from the source buffer to the target buffer. no range check
+             * is necessary (an exception will be generated to end the loop).
+             */
             while (true)
                 target.put((char) (source.get() & 0xff));
         }
@@ -50,30 +56,47 @@ class Charset88591 extends CharsetASCII {
             super(cs);
         }
 
-        protected CoderResult encodeLoopCoreOptimized(CharBuffer source,
-                ByteBuffer target, char[] sourceArray, byte[] targetArray,
-                int oldSource, int offset, int limit, boolean flush) {
+        protected CoderResult encodeLoopCoreOptimized(CharBuffer source, ByteBuffer target,
+                char[] sourceArray, byte[] targetArray, int oldSource, int offset, int limit,
+                boolean flush) {
             int i, ch = 0;
-            for (i = oldSource; i < limit
-                    && (((ch = (int) sourceArray[i]) & 0xff00) == 0); i++)
+
+            /*
+             * perform 88591 conversion from the source array to the target array, making sure each
+             * char in the source is within the correct range
+             */
+            for (i = oldSource; i < limit && (((ch = (int) sourceArray[i]) & 0xff00) == 0); i++)
                 targetArray[i + offset] = (byte) ch;
 
+            /*
+             * if some byte was not in the correct range, we need to deal with this byte by calling
+             * encodeMalformedOrUnmappable and move the source and target positions to reflect the
+             * early termination of the loop
+             */
             if ((ch & 0xff00) != 0) {
                 source.position(i + 1);
                 target.position(i + offset);
-                return encodeIllegal(source, ch, flush);
+                return encodeMalformedOrUnmappable(source, ch, flush);
             } else
                 return null;
         }
 
-        protected CoderResult encodeLoopCoreUnoptimized(CharBuffer source,
-                ByteBuffer target, boolean flush)
-                throws BufferUnderflowException, BufferOverflowException {
+        protected CoderResult encodeLoopCoreUnoptimized(CharBuffer source, ByteBuffer target,
+                boolean flush) throws BufferUnderflowException, BufferOverflowException {
             int ch;
+
+            /*
+             * perform 88591 conversion from the source buffer to the target buffer, making sure
+             * each char in the source is within the correct range
+             */
             while (((ch = (int) source.get()) & 0xff00) == 0)
                 target.put((byte) ch);
 
-            return encodeIllegal(source, ch, flush);
+            /*
+             * if we reach here, it's because a character was not in the correct range, and we need
+             * to deak with this by calling encodeMalformedOrUnmappable.
+             */
+            return encodeMalformedOrUnmappable(source, ch, flush);
         }
 
     }
diff --git a/icu4j/src/com/ibm/icu/charset/CharsetASCII.java b/icu4j/src/com/ibm/icu/charset/CharsetASCII.java
index b4586a4624..122d80b27a 100644
--- a/icu4j/src/com/ibm/icu/charset/CharsetASCII.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetASCII.java
@@ -23,8 +23,7 @@ import com.ibm.icu.text.UTF16;
 class CharsetASCII extends CharsetICU {
     protected byte[] fromUSubstitution = new byte[] { (byte) 0x1a };
 
-    public CharsetASCII(String icuCanonicalName, String javaCanonicalName,
-            String[] aliases) {
+    public CharsetASCII(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
         super(icuCanonicalName, javaCanonicalName, aliases);
         maxBytesPerChar = 1;
         minBytesPerChar = 1;
@@ -37,8 +36,8 @@ class CharsetASCII extends CharsetICU {
             super(cs);
         }
 
-        protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target,
-                IntBuffer offsets, boolean flush) {
+        protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,
+                boolean flush) {
             if (!source.hasRemaining() && toUnicodeStatus == 0) {
                 /* no input, nothing to do */
                 return CoderResult.UNDERFLOW;
@@ -55,17 +54,24 @@ class CharsetASCII extends CharsetICU {
             if (source.hasArray() && target.hasArray()) {
                 /* optimized loop */
 
+                /*
+                 * extract arrays from the buffers and obtain various constant values that will be
+                 * necessary in the core loop
+                 */
                 byte[] sourceArray = source.array();
                 char[] targetArray = target.array();
                 int offset = oldTarget - oldSource;
                 int sourceLength = source.limit() - oldSource;
                 int targetLength = target.limit() - oldTarget;
-                int limit = ((sourceLength < targetLength) ? sourceLength
-                        : targetLength)
+                int limit = ((sourceLength < targetLength) ? sourceLength : targetLength)
                         + oldSource;
 
-                if ((cr = decodeLoopCoreOptimized(source, target, sourceArray,
-                        targetArray, oldSource, offset, limit)) == null) {
+                /*
+                 * perform the core loop... if it returns null, it must be due to an overflow or
+                 * underflow
+                 */
+                if ((cr = decodeLoopCoreOptimized(source, target, sourceArray, targetArray,
+                        oldSource, offset, limit)) == null) {
                     if (sourceLength <= targetLength) {
                         source.position(oldSource + sourceLength);
                         target.position(oldTarget + sourceLength);
@@ -80,6 +86,10 @@ class CharsetASCII extends CharsetICU {
                 /* unoptimized loop */
 
                 try {
+                    /*
+                     * perform the core loop... if it throws an exception, it must be due to an
+                     * overflow or underflow
+                     */
                     cr = decodeLoopCoreUnoptimized(source, target);
 
                 } catch (BufferUnderflowException ex) {
@@ -102,33 +112,53 @@ class CharsetASCII extends CharsetICU {
             return cr;
         }
 
-        protected CoderResult decodeLoopCoreOptimized(ByteBuffer source,
-                CharBuffer target, byte[] sourceArray, char[] targetArray,
-                int oldSource, int offset, int limit) {
+        protected CoderResult decodeLoopCoreOptimized(ByteBuffer source, CharBuffer target,
+                byte[] sourceArray, char[] targetArray, int oldSource, int offset, int limit) {
             int i, ch = 0;
-            for (i = oldSource; i < limit
-                    && (((ch = (sourceArray[i] & 0xff)) & 0x80) == 0); i++)
+
+            /*
+             * perform ascii conversion from the source array to the target array, making sure each
+             * byte in the source is within the correct range
+             */
+            for (i = oldSource; i < limit && (((ch = (sourceArray[i] & 0xff)) & 0x80) == 0); i++)
                 targetArray[i + offset] = (char) ch;
 
+            /*
+             * if some byte was not in the correct range, we need to deal with this byte by calling
+             * decodeMalformedOrUnmappable and move the source and target positions to reflect the
+             * early termination of the loop
+             */
             if ((ch & 0x80) != 0) {
                 source.position(i + 1);
                 target.position(i + offset);
-                return decodeIllegal(ch);
+                return decodeMalformedOrUnmappable(ch);
             } else
                 return null;
         }
 
-        protected CoderResult decodeLoopCoreUnoptimized(ByteBuffer source,
-                CharBuffer target) throws BufferUnderflowException,
-                BufferOverflowException {
+        protected CoderResult decodeLoopCoreUnoptimized(ByteBuffer source, CharBuffer target)
+                throws BufferUnderflowException, BufferOverflowException {
             int ch = 0;
+
+            /*
+             * perform ascii conversion from the source buffer to the target buffer, making sure
+             * each byte in the source is within the correct range
+             */
             while (((ch = (source.get() & 0xff)) & 0x80) == 0)
                 target.put((char) ch);
 
-            return decodeIllegal(ch);
+            /*
+             * if we reach here, it's because a character was not in the correct range, and we need
+             * to deak with this by calling decodeMalformedOrUnmappable
+             */
+            return decodeMalformedOrUnmappable(ch);
         }
 
-        protected CoderResult decodeIllegal(int ch) {
+        protected CoderResult decodeMalformedOrUnmappable(int ch) {
+            /*
+             * put the guilty character into toUBytesArray and return a message saying that the
+             * character was malformed and of length 1.
+             */
             toUBytesArray[0] = (byte) ch;
             return CoderResult.malformedForLength(toULength = 1);
         }
@@ -148,8 +178,8 @@ class CharsetASCII extends CharsetICU {
             fromUnicodeStatus = NEED_TO_WRITE_BOM;
         }
 
-        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target,
-                IntBuffer offsets, boolean flush) {
+        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets,
+                boolean flush) {
             if (!source.hasRemaining()) {
                 /* no input, nothing to do */
                 return CoderResult.UNDERFLOW;
@@ -164,25 +194,34 @@ class CharsetASCII extends CharsetICU {
             int oldTarget = target.position();
 
             if (fromUChar32 != 0) {
+                /*
+                 * if we have a leading character in fromUChar32 that needs to be dealt with, we
+                 * need to check for a matching trail character and taking the appropriate action as
+                 * dictated by encodeTrail.
+                 */
                 cr = encodeTrail(source, (char) fromUChar32, flush);
             } else {
-                int ch = 0;
-
                 if (source.hasArray() && target.hasArray()) {
                     /* optimized loop */
 
+                    /*
+                     * extract arrays from the buffers and obtain various constant values that will
+                     * be necessary in the core loop
+                     */
                     char[] sourceArray = source.array();
                     byte[] targetArray = target.array();
                     int offset = oldTarget - oldSource;
                     int sourceLength = source.limit() - oldSource;
                     int targetLength = target.limit() - oldTarget;
-                    int limit = ((sourceLength < targetLength) ? sourceLength
-                            : targetLength)
+                    int limit = ((sourceLength < targetLength) ? sourceLength : targetLength)
                             + oldSource;
 
-                    if ((cr = encodeLoopCoreOptimized(source, target,
-                            sourceArray, targetArray, oldSource, offset, limit,
-                            flush)) == null) {
+                    /*
+                     * perform the core loop... if it returns null, it must be due to an overflow or
+                     * underflow
+                     */
+                    if ((cr = encodeLoopCoreOptimized(source, target, sourceArray, targetArray,
+                            oldSource, offset, limit, flush)) == null) {
                         if (sourceLength <= targetLength) {
                             source.position(oldSource + sourceLength);
                             target.position(oldTarget + sourceLength);
@@ -197,10 +236,12 @@ class CharsetASCII extends CharsetICU {
                     /* unoptimized loop */
 
                     try {
+                        /*
+                         * perform the core loop... if it throws an exception, it must be due to an
+                         * overflow or underflow
+                         */
                         cr = encodeLoopCoreUnoptimized(source, target, flush);
 
-                        cr = encodeIllegal(source, ch, flush);
-
                     } catch (BufferUnderflowException ex) {
                         cr = CoderResult.UNDERFLOW;
                     } catch (BufferOverflowException ex) {
@@ -220,40 +261,65 @@ class CharsetASCII extends CharsetICU {
             return cr;
         }
 
-        protected CoderResult encodeLoopCoreOptimized(CharBuffer source,
-                ByteBuffer target, char[] sourceArray, byte[] targetArray,
-                int oldSource, int offset, int limit, boolean flush) {
+        protected CoderResult encodeLoopCoreOptimized(CharBuffer source, ByteBuffer target,
+                char[] sourceArray, byte[] targetArray, int oldSource, int offset, int limit,
+                boolean flush) {
             int i, ch = 0;
-            for (i = oldSource; i < limit
-                    && (((ch = (int) sourceArray[i]) & 0xff80) == 0); i++)
+
+            /*
+             * perform ascii conversion from the source array to the target array, making sure each
+             * char in the source is within the correct range
+             */
+            for (i = oldSource; i < limit && (((ch = (int) sourceArray[i]) & 0xff80) == 0); i++)
                 targetArray[i + offset] = (byte) ch;
 
+            /*
+             * if some byte was not in the correct range, we need to deal with this byte by calling
+             * encodeMalformedOrUnmappable and move the source and target positions to reflect the
+             * early termination of the loop
+             */
             if ((ch & 0xff80) != 0) {
                 source.position(i + 1);
                 target.position(i + offset);
-                return encodeIllegal(source, ch, flush);
+                return encodeMalformedOrUnmappable(source, ch, flush);
             } else
                 return null;
         }
 
-        protected CoderResult encodeLoopCoreUnoptimized(CharBuffer source,
-                ByteBuffer target, boolean flush)
-                throws BufferUnderflowException, BufferOverflowException {
+        protected CoderResult encodeLoopCoreUnoptimized(CharBuffer source, ByteBuffer target,
+                boolean flush) throws BufferUnderflowException, BufferOverflowException {
             int ch;
+
+            /*
+             * perform ascii conversion from the source buffer to the target buffer, making sure
+             * each char in the source is within the correct range
+             */
             while (((ch = (int) source.get()) & 0xff80) == 0)
                 target.put((byte) ch);
 
-            return encodeIllegal(source, ch, flush);
+            /*
+             * if we reach here, it's because a character was not in the correct range, and we need
+             * to deak with this by calling encodeMalformedOrUnmappable.
+             */
+            return encodeMalformedOrUnmappable(source, ch, flush);
         }
 
-        protected CoderResult encodeIllegal(CharBuffer source, int ch,
-                boolean flush) {
-            return (UTF16.isLeadSurrogate((char) ch)) ? encodeTrail(source,
-                    (char) ch, flush) : CoderResult.unmappableForLength(1);
+        protected CoderResult encodeMalformedOrUnmappable(CharBuffer source, int ch, boolean flush) {
+            /*
+             * if the character is a lead surrogate, we need to call encodeTrail to attempt to match
+             * it up with a trail surrogate. if not, the character is unmappable.
+             */
+            return (UTF16.isLeadSurrogate((char) ch)) ? encodeTrail(source, (char) ch, flush)
+                    : CoderResult.unmappableForLength(1);
         }
 
-        protected CoderResult encodeTrail(CharBuffer source, char lead,
-                boolean flush) {
+        protected CoderResult encodeTrail(CharBuffer source, char lead, boolean flush) {
+            /*
+             * if the next character is a trail surrogate, we have an unmappable codepoint of length
+             * 2. if the next character is not a trail surrogate, we have a single malformed
+             * character. if there is no next character, we either have a malformed character or an
+             * underflow, depending on whether flush is enabled.
+             */
             if (source.hasRemaining()) {
                 char trail = source.get();
                 if (UTF16.isTrailSurrogate(trail)) {