Implement Utf8.decodeUtf8 by using String constructor (#9415)

* Implement Utf8.decodeUtf8 by using the String constructor and a search for the replacement character "\uFFFD". This greatly simplifies the implementation, speeds it up for ASCII input, and saves memory allocations for non-ASCII strings. * Remove irrelevant comment about indexOf. * Code style changes following review. * Remove TODO and remove `final` per Google style. * Delete decodeUtf8 from UnsafeProcessor, as it inherits the intended implementation from its parent. * Move the decodeUtf8 implementation from Utf8::Processor to Utf8, since it has only a single implementation, which is independent of whether the processor is safe or unsafe. * Change only the logic of UnsafeProcessor to use the String constructor. This is done because some Android versions would see a performance regression if this change were applied to them, so we make this change only for UnsafeProcessor, which is not used on Android. * Remove duplicated Javadoc. Co-authored-by: ahadadi <ahadadi@outbrain.com>
2022-03-18 19:28:17 +02:00 · 2022-03-18 19:28:17 +02:00 · 46c3651c31
commit 46c3651c31
parent 7f1acff2a4
1 changed files with 23 additions and 80 deletions
--- a/java/core/src/main/java/com/google/protobuf/Utf8.java
+++ b/java/core/src/main/java/com/google/protobuf/Utf8.java
@ -42,6 +42,7 @@ import static java.lang.Character.isSurrogatePair;
 import static java.lang.Character.toCodePoint;

 import java.nio.ByteBuffer;
+import java.util.Arrays;

 /**
 * A set of low-level, high-performance static utility methods related to the UTF-8 character
@ -1365,88 +1366,30 @@ final class Utf8 {
    }

    @Override
-    String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
-      if ((index | size | bytes.length - index - size) < 0) {
+    String decodeUtf8(byte[] bytes, int index, int size)
+            throws InvalidProtocolBufferException {
+      try {
+        String s = new String(bytes, index, size, Internal.UTF_8);
+
+        // "\uFFFD" is UTF-8 default replacement string, which illegal byte sequences get replaced with.
+        if (!s.contains("\uFFFD")) {
+          return s;
+        }
+
+        // Since s contains "\uFFFD" there are 2 options:
+        // 1) The byte array slice is invalid UTF-8.
+        // 2) The byte array slice is valid UTF-8 and contains encodings for "\uFFFD".
+        // To rule out (1), we encode s and compare it to the byte array slice.
+        // If the byte array slice was invalid UTF-8, then we would get a different sequence of bytes.
+        if (Arrays.equals(s.getBytes(Internal.UTF_8), Arrays.copyOfRange(bytes, index, index + size))) {
+          return s;
+        }
+
+        throw InvalidProtocolBufferException.invalidUtf8();
+      } catch (IndexOutOfBoundsException e) {
        throw new ArrayIndexOutOfBoundsException(
-            String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
+                String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
      }
-
-      int offset = index + unsafeEstimateConsecutiveAscii(bytes, index, size);
-      final int limit = index + size;
-
-      // get an "exact" consecutive ASCII
-      while (offset < limit) {
-        byte b = UnsafeUtil.getByte(bytes, offset);
-        if (b < 0) {
-          break;
-        }
-        offset++;
-      }
-
-      if (offset == limit) {
-        // The entire byte sequence is ASCII.  Don't bother copying to a char[], JVMs using
-        // compact strings will just turn it back into the same byte[].
-        return new String(bytes, index, size, Internal.US_ASCII);
-      }
-
-      // It's not all ASCII, at this point.  This may over-allocate, but we will truncate in the
-      // end.
-      char[] resultArr = new char[size];
-      int resultPos = 0;
-
-      // Copy over the initial run of ASCII.
-      for (int i = index; i < offset; i++) {
-        DecodeUtil.handleOneByte(UnsafeUtil.getByte(bytes, i), resultArr, resultPos++);
-      }
-
-      while (offset < limit) {
-        byte byte1 = UnsafeUtil.getByte(bytes, offset++);
-        if (DecodeUtil.isOneByte(byte1)) {
-          DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
-
-          // It's common for there to be multiple ASCII characters in a run mixed in, so add an
-          // extra optimized loop to take care of these runs.
-          while (offset < limit) {
-            byte b = UnsafeUtil.getByte(bytes, offset);
-            if (!DecodeUtil.isOneByte(b)) {
-              break;
-            }
-            offset++;
-            DecodeUtil.handleOneByte(b, resultArr, resultPos++);
-          }
-        } else if (DecodeUtil.isTwoBytes(byte1)) {
-          if (offset >= limit) {
-            throw InvalidProtocolBufferException.invalidUtf8();
-          }
-          DecodeUtil.handleTwoBytes(
-              byte1, /* byte2 */ UnsafeUtil.getByte(bytes, offset++), resultArr, resultPos++);
-        } else if (DecodeUtil.isThreeBytes(byte1)) {
-          if (offset >= limit - 1) {
-            throw InvalidProtocolBufferException.invalidUtf8();
-          }
-          DecodeUtil.handleThreeBytes(
-              byte1,
-              /* byte2 */ UnsafeUtil.getByte(bytes, offset++),
-              /* byte3 */ UnsafeUtil.getByte(bytes, offset++),
-              resultArr,
-              resultPos++);
-        } else {
-          if (offset >= limit - 2) {
-            throw InvalidProtocolBufferException.invalidUtf8();
-          }
-          DecodeUtil.handleFourBytes(
-              byte1,
-              /* byte2 */ UnsafeUtil.getByte(bytes, offset++),
-              /* byte3 */ UnsafeUtil.getByte(bytes, offset++),
-              /* byte4 */ UnsafeUtil.getByte(bytes, offset++),
-              resultArr,
-              resultPos++);
-          // 4-byte case requires two chars.
-          resultPos++;
-        }
-      }
-
-      return new String(resultArr, 0, resultPos);
    }

    @Override