Implement Utf8.decodeUtf8 by using String constructor (#9415)
* Implement Utf8.decodeUtf8 by using String constructor and a search for the replacement string "\uFFFD". This greatly simplifies the implementation, speeds it up for ascii and saves in memory allocations for non ascii strings. * Remove irrelevant comment about indexOf. * Code style changes following review. * Remove TODO + remove final per google style. * Delete decodeUtf8 from UnsafeProcessor as it inherits the intended implementation from its parent. * Move decodeUtf8 implementation from Utf8::Processor to Utf8 since it has only a single implementation which is independent of whether the processor is safe or unsafe. * Change only the logic of UnsafeProcessor to use String constructor This is done since some Android versions will see a performance regression if this change is applied. So we are making this change only for UnsafeProcessor which is not used on Android. * Remove duplicated Javadoc Co-authored-by: ahadadi <ahadadi@outbrain.com>
This commit is contained in:
parent
7f1acff2a4
commit
46c3651c31
@ -42,6 +42,7 @@ import static java.lang.Character.isSurrogatePair;
|
|||||||
import static java.lang.Character.toCodePoint;
|
import static java.lang.Character.toCodePoint;
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A set of low-level, high-performance static utility methods related to the UTF-8 character
|
* A set of low-level, high-performance static utility methods related to the UTF-8 character
|
||||||
@ -1365,88 +1366,30 @@ final class Utf8 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
|
String decodeUtf8(byte[] bytes, int index, int size)
|
||||||
if ((index | size | bytes.length - index - size) < 0) {
|
throws InvalidProtocolBufferException {
|
||||||
|
try {
|
||||||
|
String s = new String(bytes, index, size, Internal.UTF_8);
|
||||||
|
|
||||||
|
// "\uFFFD" is UTF-8 default replacement string, which illegal byte sequences get replaced with.
|
||||||
|
if (!s.contains("\uFFFD")) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Since s contains "\uFFFD" there are 2 options:
|
||||||
|
// 1) The byte array slice is invalid UTF-8.
|
||||||
|
// 2) The byte array slice is valid UTF-8 and contains encodings for "\uFFFD".
|
||||||
|
// To rule out (1), we encode s and compare it to the byte array slice.
|
||||||
|
// If the byte array slice was invalid UTF-8, then we would get a different sequence of bytes.
|
||||||
|
if (Arrays.equals(s.getBytes(Internal.UTF_8), Arrays.copyOfRange(bytes, index, index + size))) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw InvalidProtocolBufferException.invalidUtf8();
|
||||||
|
} catch (IndexOutOfBoundsException e) {
|
||||||
throw new ArrayIndexOutOfBoundsException(
|
throw new ArrayIndexOutOfBoundsException(
|
||||||
String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
|
String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
|
||||||
}
|
}
|
||||||
|
|
||||||
int offset = index + unsafeEstimateConsecutiveAscii(bytes, index, size);
|
|
||||||
final int limit = index + size;
|
|
||||||
|
|
||||||
// get an "exact" consecutive ASCII
|
|
||||||
while (offset < limit) {
|
|
||||||
byte b = UnsafeUtil.getByte(bytes, offset);
|
|
||||||
if (b < 0) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
offset++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (offset == limit) {
|
|
||||||
// The entire byte sequence is ASCII. Don't bother copying to a char[], JVMs using
|
|
||||||
// compact strings will just turn it back into the same byte[].
|
|
||||||
return new String(bytes, index, size, Internal.US_ASCII);
|
|
||||||
}
|
|
||||||
|
|
||||||
// It's not all ASCII, at this point. This may over-allocate, but we will truncate in the
|
|
||||||
// end.
|
|
||||||
char[] resultArr = new char[size];
|
|
||||||
int resultPos = 0;
|
|
||||||
|
|
||||||
// Copy over the initial run of ASCII.
|
|
||||||
for (int i = index; i < offset; i++) {
|
|
||||||
DecodeUtil.handleOneByte(UnsafeUtil.getByte(bytes, i), resultArr, resultPos++);
|
|
||||||
}
|
|
||||||
|
|
||||||
while (offset < limit) {
|
|
||||||
byte byte1 = UnsafeUtil.getByte(bytes, offset++);
|
|
||||||
if (DecodeUtil.isOneByte(byte1)) {
|
|
||||||
DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
|
|
||||||
|
|
||||||
// It's common for there to be multiple ASCII characters in a run mixed in, so add an
|
|
||||||
// extra optimized loop to take care of these runs.
|
|
||||||
while (offset < limit) {
|
|
||||||
byte b = UnsafeUtil.getByte(bytes, offset);
|
|
||||||
if (!DecodeUtil.isOneByte(b)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
offset++;
|
|
||||||
DecodeUtil.handleOneByte(b, resultArr, resultPos++);
|
|
||||||
}
|
|
||||||
} else if (DecodeUtil.isTwoBytes(byte1)) {
|
|
||||||
if (offset >= limit) {
|
|
||||||
throw InvalidProtocolBufferException.invalidUtf8();
|
|
||||||
}
|
|
||||||
DecodeUtil.handleTwoBytes(
|
|
||||||
byte1, /* byte2 */ UnsafeUtil.getByte(bytes, offset++), resultArr, resultPos++);
|
|
||||||
} else if (DecodeUtil.isThreeBytes(byte1)) {
|
|
||||||
if (offset >= limit - 1) {
|
|
||||||
throw InvalidProtocolBufferException.invalidUtf8();
|
|
||||||
}
|
|
||||||
DecodeUtil.handleThreeBytes(
|
|
||||||
byte1,
|
|
||||||
/* byte2 */ UnsafeUtil.getByte(bytes, offset++),
|
|
||||||
/* byte3 */ UnsafeUtil.getByte(bytes, offset++),
|
|
||||||
resultArr,
|
|
||||||
resultPos++);
|
|
||||||
} else {
|
|
||||||
if (offset >= limit - 2) {
|
|
||||||
throw InvalidProtocolBufferException.invalidUtf8();
|
|
||||||
}
|
|
||||||
DecodeUtil.handleFourBytes(
|
|
||||||
byte1,
|
|
||||||
/* byte2 */ UnsafeUtil.getByte(bytes, offset++),
|
|
||||||
/* byte3 */ UnsafeUtil.getByte(bytes, offset++),
|
|
||||||
/* byte4 */ UnsafeUtil.getByte(bytes, offset++),
|
|
||||||
resultArr,
|
|
||||||
resultPos++);
|
|
||||||
// 4-byte case requires two chars.
|
|
||||||
resultPos++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new String(resultArr, 0, resultPos);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
Loading…
Reference in New Issue
Block a user