Implement Utf8.decodeUtf8 by using String constructor (#9415)

* Implement Utf8.decodeUtf8 by using String constructor and a search for the replacement string "\uFFFD". This greatly simplifies the implementation, speeds it up for ascii and saves in memory allocations for non ascii strings.

* Remove irrelevant comment about indexOf.

* Code style changes following review.

* Remove TODO + remove final per google style.

* Delete decodeUtf8 from UnsafeProcessor as it inherits the intended implementation from its parent.

* Move decodeUtf8 implementation from Utf8::Processor to Utf8 since it has only a single implementation which is independent of whether the processor is safe or unsafe.

* Change only the logic of UnsafeProcessor to use String constructor
This is done since some Android versions will see a performance regression if this change is applied.
So we are making this change only for UnsafeProcessor which is not used on Android.

* Remove duplicated Javadoc

Co-authored-by: ahadadi <ahadadi@outbrain.com>
This commit is contained in:
amirhadadi 2022-03-18 19:28:17 +02:00 committed by GitHub
parent 7f1acff2a4
commit 46c3651c31
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -42,6 +42,7 @@ import static java.lang.Character.isSurrogatePair;
import static java.lang.Character.toCodePoint;
import java.nio.ByteBuffer;
import java.util.Arrays;
/**
* A set of low-level, high-performance static utility methods related to the UTF-8 character
@ -1365,88 +1366,30 @@ final class Utf8 {
}
@Override
String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
if ((index | size | bytes.length - index - size) < 0) {
String decodeUtf8(byte[] bytes, int index, int size)
throws InvalidProtocolBufferException {
try {
String s = new String(bytes, index, size, Internal.UTF_8);
// "\uFFFD" is UTF-8 default replacement string, which illegal byte sequences get replaced with.
if (!s.contains("\uFFFD")) {
return s;
}
// Since s contains "\uFFFD" there are 2 options:
// 1) The byte array slice is invalid UTF-8.
// 2) The byte array slice is valid UTF-8 and contains encodings for "\uFFFD".
// To rule out (1), we encode s and compare it to the byte array slice.
// If the byte array slice was invalid UTF-8, then we would get a different sequence of bytes.
if (Arrays.equals(s.getBytes(Internal.UTF_8), Arrays.copyOfRange(bytes, index, index + size))) {
return s;
}
throw InvalidProtocolBufferException.invalidUtf8();
} catch (IndexOutOfBoundsException e) {
throw new ArrayIndexOutOfBoundsException(
String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
}
int offset = index + unsafeEstimateConsecutiveAscii(bytes, index, size);
final int limit = index + size;
// get an "exact" consecutive ASCII
while (offset < limit) {
byte b = UnsafeUtil.getByte(bytes, offset);
if (b < 0) {
break;
}
offset++;
}
if (offset == limit) {
// The entire byte sequence is ASCII. Don't bother copying to a char[], JVMs using
// compact strings will just turn it back into the same byte[].
return new String(bytes, index, size, Internal.US_ASCII);
}
// It's not all ASCII, at this point. This may over-allocate, but we will truncate in the
// end.
char[] resultArr = new char[size];
int resultPos = 0;
// Copy over the initial run of ASCII.
for (int i = index; i < offset; i++) {
DecodeUtil.handleOneByte(UnsafeUtil.getByte(bytes, i), resultArr, resultPos++);
}
while (offset < limit) {
byte byte1 = UnsafeUtil.getByte(bytes, offset++);
if (DecodeUtil.isOneByte(byte1)) {
DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
// It's common for there to be multiple ASCII characters in a run mixed in, so add an
// extra optimized loop to take care of these runs.
while (offset < limit) {
byte b = UnsafeUtil.getByte(bytes, offset);
if (!DecodeUtil.isOneByte(b)) {
break;
}
offset++;
DecodeUtil.handleOneByte(b, resultArr, resultPos++);
}
} else if (DecodeUtil.isTwoBytes(byte1)) {
if (offset >= limit) {
throw InvalidProtocolBufferException.invalidUtf8();
}
DecodeUtil.handleTwoBytes(
byte1, /* byte2 */ UnsafeUtil.getByte(bytes, offset++), resultArr, resultPos++);
} else if (DecodeUtil.isThreeBytes(byte1)) {
if (offset >= limit - 1) {
throw InvalidProtocolBufferException.invalidUtf8();
}
DecodeUtil.handleThreeBytes(
byte1,
/* byte2 */ UnsafeUtil.getByte(bytes, offset++),
/* byte3 */ UnsafeUtil.getByte(bytes, offset++),
resultArr,
resultPos++);
} else {
if (offset >= limit - 2) {
throw InvalidProtocolBufferException.invalidUtf8();
}
DecodeUtil.handleFourBytes(
byte1,
/* byte2 */ UnsafeUtil.getByte(bytes, offset++),
/* byte3 */ UnsafeUtil.getByte(bytes, offset++),
/* byte4 */ UnsafeUtil.getByte(bytes, offset++),
resultArr,
resultPos++);
// 4-byte case requires two chars.
resultPos++;
}
}
return new String(resultArr, 0, resultPos);
}
@Override