Implement Utf8.decodeUtf8 by using String constructor (#9415)
* Implement Utf8.decodeUtf8 by using the String constructor and a search for the replacement string "\uFFFD". This greatly simplifies the implementation, speeds it up for ASCII input, and saves memory allocations for non-ASCII strings.
* Remove irrelevant comment about indexOf.
* Code style changes following review.
* Remove TODO and remove final per Google style.
* Delete decodeUtf8 from UnsafeProcessor, as it inherits the intended implementation from its parent.
* Move the decodeUtf8 implementation from Utf8::Processor to Utf8, since it has only a single implementation, which is independent of whether the processor is safe or unsafe.
* Change only the logic of UnsafeProcessor to use the String constructor. This is done because some Android versions would see a performance regression if this change were applied to them, so the change is made only for UnsafeProcessor, which is not used on Android.
* Remove duplicated Javadoc.

Co-authored-by: ahadadi <ahadadi@outbrain.com>
This commit is contained in:
parent
7f1acff2a4
commit
46c3651c31
@ -42,6 +42,7 @@ import static java.lang.Character.isSurrogatePair;
|
||||
import static java.lang.Character.toCodePoint;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* A set of low-level, high-performance static utility methods related to the UTF-8 character
|
||||
@ -1365,88 +1366,30 @@ final class Utf8 {
|
||||
}
|
||||
|
||||
@Override
|
||||
String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
|
||||
if ((index | size | bytes.length - index - size) < 0) {
|
||||
String decodeUtf8(byte[] bytes, int index, int size)
|
||||
throws InvalidProtocolBufferException {
|
||||
try {
|
||||
String s = new String(bytes, index, size, Internal.UTF_8);
|
||||
|
||||
// "\uFFFD" is UTF-8 default replacement string, which illegal byte sequences get replaced with.
|
||||
if (!s.contains("\uFFFD")) {
|
||||
return s;
|
||||
}
|
||||
|
||||
// Since s contains "\uFFFD" there are 2 options:
|
||||
// 1) The byte array slice is invalid UTF-8.
|
||||
// 2) The byte array slice is valid UTF-8 and contains encodings for "\uFFFD".
|
||||
// To rule out (1), we encode s and compare it to the byte array slice.
|
||||
// If the byte array slice was invalid UTF-8, then we would get a different sequence of bytes.
|
||||
if (Arrays.equals(s.getBytes(Internal.UTF_8), Arrays.copyOfRange(bytes, index, index + size))) {
|
||||
return s;
|
||||
}
|
||||
|
||||
throw InvalidProtocolBufferException.invalidUtf8();
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
throw new ArrayIndexOutOfBoundsException(
|
||||
String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
|
||||
String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
|
||||
}
|
||||
|
||||
int offset = index + unsafeEstimateConsecutiveAscii(bytes, index, size);
|
||||
final int limit = index + size;
|
||||
|
||||
// get an "exact" consecutive ASCII
|
||||
while (offset < limit) {
|
||||
byte b = UnsafeUtil.getByte(bytes, offset);
|
||||
if (b < 0) {
|
||||
break;
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
|
||||
if (offset == limit) {
|
||||
// The entire byte sequence is ASCII. Don't bother copying to a char[], JVMs using
|
||||
// compact strings will just turn it back into the same byte[].
|
||||
return new String(bytes, index, size, Internal.US_ASCII);
|
||||
}
|
||||
|
||||
// It's not all ASCII, at this point. This may over-allocate, but we will truncate in the
|
||||
// end.
|
||||
char[] resultArr = new char[size];
|
||||
int resultPos = 0;
|
||||
|
||||
// Copy over the initial run of ASCII.
|
||||
for (int i = index; i < offset; i++) {
|
||||
DecodeUtil.handleOneByte(UnsafeUtil.getByte(bytes, i), resultArr, resultPos++);
|
||||
}
|
||||
|
||||
while (offset < limit) {
|
||||
byte byte1 = UnsafeUtil.getByte(bytes, offset++);
|
||||
if (DecodeUtil.isOneByte(byte1)) {
|
||||
DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
|
||||
|
||||
// It's common for there to be multiple ASCII characters in a run mixed in, so add an
|
||||
// extra optimized loop to take care of these runs.
|
||||
while (offset < limit) {
|
||||
byte b = UnsafeUtil.getByte(bytes, offset);
|
||||
if (!DecodeUtil.isOneByte(b)) {
|
||||
break;
|
||||
}
|
||||
offset++;
|
||||
DecodeUtil.handleOneByte(b, resultArr, resultPos++);
|
||||
}
|
||||
} else if (DecodeUtil.isTwoBytes(byte1)) {
|
||||
if (offset >= limit) {
|
||||
throw InvalidProtocolBufferException.invalidUtf8();
|
||||
}
|
||||
DecodeUtil.handleTwoBytes(
|
||||
byte1, /* byte2 */ UnsafeUtil.getByte(bytes, offset++), resultArr, resultPos++);
|
||||
} else if (DecodeUtil.isThreeBytes(byte1)) {
|
||||
if (offset >= limit - 1) {
|
||||
throw InvalidProtocolBufferException.invalidUtf8();
|
||||
}
|
||||
DecodeUtil.handleThreeBytes(
|
||||
byte1,
|
||||
/* byte2 */ UnsafeUtil.getByte(bytes, offset++),
|
||||
/* byte3 */ UnsafeUtil.getByte(bytes, offset++),
|
||||
resultArr,
|
||||
resultPos++);
|
||||
} else {
|
||||
if (offset >= limit - 2) {
|
||||
throw InvalidProtocolBufferException.invalidUtf8();
|
||||
}
|
||||
DecodeUtil.handleFourBytes(
|
||||
byte1,
|
||||
/* byte2 */ UnsafeUtil.getByte(bytes, offset++),
|
||||
/* byte3 */ UnsafeUtil.getByte(bytes, offset++),
|
||||
/* byte4 */ UnsafeUtil.getByte(bytes, offset++),
|
||||
resultArr,
|
||||
resultPos++);
|
||||
// 4-byte case requires two chars.
|
||||
resultPos++;
|
||||
}
|
||||
}
|
||||
|
||||
return new String(resultArr, 0, resultPos);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
Loading…
Reference in New Issue
Block a user