Implement Utf8.decodeUtf8 by using String constructor (#9415)

* Implement Utf8.decodeUtf8 by using the String constructor and a search for the replacement string "\uFFFD". This greatly simplifies the implementation, speeds it up for ASCII, and saves memory allocations for non-ASCII strings.
* Remove irrelevant comment about indexOf.
* Code style changes following review.
* Remove TODO + remove final per Google style.
* Delete decodeUtf8 from UnsafeProcessor as it inherits the intended implementation from its parent.
* Move decodeUtf8 implementation from Utf8::Processor to Utf8 since it has only a single implementation, which is independent of whether the processor is safe or unsafe.
* Change only the logic of UnsafeProcessor to use the String constructor. This is done since some Android versions will see a performance regression if this change is applied, so we are making this change only for UnsafeProcessor, which is not used on Android.
* Remove duplicated Javadoc.

Co-authored-by: ahadadi <ahadadi@outbrain.com>
2022-03-18 19:28:17 +02:00 · 2022-03-18 19:28:17 +02:00 · 46c3651c31
commit 46c3651c31
parent 7f1acff2a4
1 changed files with 23 additions and 80 deletions
--- a/java/core/src/main/java/com/google/protobuf/Utf8.java
+++ b/java/core/src/main/java/com/google/protobuf/Utf8.java
@ -42,6 +42,7 @@ import static java.lang.Character.isSurrogatePair;
 import static java.lang.Character.toCodePoint;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 /**
 * A set of low-level, high-performance static utility methods related to the UTF-8 character
@ -1365,88 +1366,30 @@ final class Utf8 {
    }
    @Override
-    String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
+    String decodeUtf8(byte[] bytes, int index, int size)
-      if ((index | size | bytes.length - index - size) < 0) {
+            throws InvalidProtocolBufferException {
      try {
        String s = new String(bytes, index, size, Internal.UTF_8);
        // "\uFFFD" is UTF-8 default replacement string, which illegal byte sequences get replaced with.
        if (!s.contains("\uFFFD")) {
          return s;
        }
        // Since s contains "\uFFFD" there are 2 options:
        // 1) The byte array slice is invalid UTF-8.
        // 2) The byte array slice is valid UTF-8 and contains encodings for "\uFFFD".
        // To rule out (1), we encode s and compare it to the byte array slice.
        // If the byte array slice was invalid UTF-8, then we would get a different sequence of bytes.
        if (Arrays.equals(s.getBytes(Internal.UTF_8), Arrays.copyOfRange(bytes, index, index + size))) {
          return s;
        }
        throw InvalidProtocolBufferException.invalidUtf8();
      } catch (IndexOutOfBoundsException e) {
        throw new ArrayIndexOutOfBoundsException(
-            String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
+                String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
      }
      int offset = index + unsafeEstimateConsecutiveAscii(bytes, index, size);
      final int limit = index + size;
      // get an "exact" consecutive ASCII
      while (offset < limit) {
        byte b = UnsafeUtil.getByte(bytes, offset);
        if (b < 0) {
          break;
        }
        offset++;
      }
      if (offset == limit) {
        // The entire byte sequence is ASCII.  Don't bother copying to a char[], JVMs using
        // compact strings will just turn it back into the same byte[].
        return new String(bytes, index, size, Internal.US_ASCII);
      }
      // It's not all ASCII, at this point.  This may over-allocate, but we will truncate in the
      // end.
      char[] resultArr = new char[size];
      int resultPos = 0;
      // Copy over the initial run of ASCII.
      for (int i = index; i < offset; i++) {
        DecodeUtil.handleOneByte(UnsafeUtil.getByte(bytes, i), resultArr, resultPos++);
      }
      while (offset < limit) {
        byte byte1 = UnsafeUtil.getByte(bytes, offset++);
        if (DecodeUtil.isOneByte(byte1)) {
          DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
          // It's common for there to be multiple ASCII characters in a run mixed in, so add an
          // extra optimized loop to take care of these runs.
          while (offset < limit) {
            byte b = UnsafeUtil.getByte(bytes, offset);
            if (!DecodeUtil.isOneByte(b)) {
              break;
            }
            offset++;
            DecodeUtil.handleOneByte(b, resultArr, resultPos++);
          }
        } else if (DecodeUtil.isTwoBytes(byte1)) {
          if (offset >= limit) {
            throw InvalidProtocolBufferException.invalidUtf8();
          }
          DecodeUtil.handleTwoBytes(
              byte1, /* byte2 */ UnsafeUtil.getByte(bytes, offset++), resultArr, resultPos++);
        } else if (DecodeUtil.isThreeBytes(byte1)) {
          if (offset >= limit - 1) {
            throw InvalidProtocolBufferException.invalidUtf8();
          }
          DecodeUtil.handleThreeBytes(
              byte1,
              /* byte2 */ UnsafeUtil.getByte(bytes, offset++),
              /* byte3 */ UnsafeUtil.getByte(bytes, offset++),
              resultArr,
              resultPos++);
        } else {
          if (offset >= limit - 2) {
            throw InvalidProtocolBufferException.invalidUtf8();
          }
          DecodeUtil.handleFourBytes(
              byte1,
              /* byte2 */ UnsafeUtil.getByte(bytes, offset++),
              /* byte3 */ UnsafeUtil.getByte(bytes, offset++),
              /* byte4 */ UnsafeUtil.getByte(bytes, offset++),
              resultArr,
              resultPos++);
          // 4-byte case requires two chars.
          resultPos++;
        }
      }
      return new String(resultArr, 0, resultPos);
    }
    @Override