b48262d719
Bug: v8:12868 A slight modification to the existing DFA-based UTF-8 allocator to allow decoding surrogates, for use in decoding WTF-8. We'll need to additionally constrain the decoder to disallow surrogate pairs. Change-Id: Ifddbf08d4eeeff8f270df52a68f01769ea790eec Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3652787 Commit-Queue: Andy Wingo <wingo@igalia.com> Reviewed-by: Marja Hölttä <marja@chromium.org> Reviewed-by: Jakob Kummerow <jkummerow@chromium.org> Cr-Commit-Position: refs/heads/main@{#80654}
187 lines
6.4 KiB
C++
187 lines
6.4 KiB
C++
// Copyright 2022 the V8 project authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
#include "src/strings/unicode.h"
|
|
#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
|
|
#include "src/third_party/utf8-decoder/utf8-decoder.h"
|
|
#include "test/cctest/cctest.h"
|
|
|
|
namespace v8 {
|
|
namespace internal {
|
|
namespace wasm {
|
|
namespace test_wasm_strings {
|
|
|
|
struct Utf8Decoder {
|
|
Utf8DfaDecoder::State state = Utf8DfaDecoder::kAccept;
|
|
uint32_t codepoint = 0;
|
|
void Decode(uint8_t byte) {
|
|
DCHECK(!failure());
|
|
Utf8DfaDecoder::Decode(byte, &state, &codepoint);
|
|
}
|
|
bool success() const { return state == Utf8DfaDecoder::kAccept; }
|
|
bool failure() const { return state == Utf8DfaDecoder::kReject; }
|
|
bool incomplete() const { return !success() && !failure(); }
|
|
};
|
|
|
|
struct GeneralizedUtf8Decoder {
|
|
GeneralizedUtf8DfaDecoder::State state = GeneralizedUtf8DfaDecoder::kAccept;
|
|
uint32_t codepoint = 0;
|
|
void Decode(uint8_t byte) {
|
|
DCHECK(!failure());
|
|
GeneralizedUtf8DfaDecoder::Decode(byte, &state, &codepoint);
|
|
}
|
|
bool success() const { return state == GeneralizedUtf8DfaDecoder::kAccept; }
|
|
bool failure() const { return state == GeneralizedUtf8DfaDecoder::kReject; }
|
|
bool incomplete() const { return !success() && !failure(); }
|
|
};
|
|
|
|
struct DecodingOracle {
|
|
Utf8Decoder utf8;
|
|
GeneralizedUtf8Decoder generalized_utf8;
|
|
|
|
void Decode(uint8_t byte) {
|
|
utf8.Decode(byte);
|
|
generalized_utf8.Decode(byte);
|
|
}
|
|
|
|
void CheckSame() const {
|
|
CHECK_EQ(utf8.success(), generalized_utf8.success());
|
|
CHECK_EQ(utf8.failure(), generalized_utf8.failure());
|
|
if (utf8.success()) CHECK(utf8.codepoint == generalized_utf8.codepoint);
|
|
}
|
|
|
|
bool success() const {
|
|
CheckSame();
|
|
return utf8.success();
|
|
}
|
|
bool failure() const {
|
|
CheckSame();
|
|
return utf8.failure();
|
|
}
|
|
bool incomplete() const {
|
|
CheckSame();
|
|
return utf8.incomplete();
|
|
}
|
|
};
|
|
|
|
TEST(GeneralizedUTF8Decode) {
  // Exhaustive check that the generalized UTF-8 decoder matches the strict
  // UTF-8 decoder, except for surrogates. Every byte sequence is walked one
  // byte at a time, and after each byte the decoders must be in the expected
  // accepting, rejecting or incomplete state.

  // Continuation bytes are exactly those in [0x80, 0xBF].
  const auto is_continuation = [](uint32_t value) {
    return 0x80 <= value && value <= 0xBF;
  };

  for (uint32_t byte1 = 0; byte1 <= 0xFF; ++byte1) {
    DecodingOracle first;
    first.Decode(byte1);

    // First byte in [0x00, 0x7F]: one-byte sequence, complete already.
    if (byte1 <= 0x7F) {
      CHECK(first.success());
      continue;
    }

    // First byte in [0x80, 0xC1]: invalid as a lead byte.
    if (byte1 <= 0xC1) {
      CHECK(first.failure());
      continue;
    }

    // First byte in [0xF5, 0xFF]: invalid as a lead byte.
    if (byte1 > 0xF4) {
      CHECK(first.failure());
      continue;
    }

    // Everything left, [0xC2, 0xF4], starts a multi-byte sequence.
    CHECK(first.incomplete());

    if (byte1 <= 0xDF) {
      // First byte in [0xC2, 0xDF]: two-byte sequence. The second byte
      // completes it and must be a continuation byte.
      for (uint32_t byte2 = 0x00; byte2 <= 0xFF; ++byte2) {
        DecodingOracle second = first;
        second.Decode(byte2);
        if (is_continuation(byte2)) {
          CHECK(second.success());
        } else {
          CHECK(second.failure());
        }
      }
      continue;
    }

    if (byte1 <= 0xEF) {
      // First byte in [0xE0, 0xEF]: three-byte sequence. After 0xE0 the
      // second byte must be at least 0xA0.
      const uint32_t lowest = (byte1 == 0xE0) ? 0xA0 : 0x80;
      for (uint32_t byte2 = 0x00; byte2 <= 0xFF; ++byte2) {
        DecodingOracle second = first;
        second.Decode(byte2);

        // Second byte outside [lowest, 0xBF]: failure.
        if (byte2 < lowest || byte2 > 0xBF) {
          CHECK(second.failure());
          continue;
        }

        // Second byte in [lowest, 0xBF]: continuation.
        const bool is_surrogate = (byte1 == 0xED) && (byte2 >= 0xA0);
        if (is_surrogate) {
          // This is the one place where the decoders are expected to
          // disagree: generalized UTF-8 goes on to produce a surrogate,
          // whereas strict UTF-8 reports an error.
          CHECK(second.utf8.failure());
          CHECK(second.generalized_utf8.incomplete());
        } else {
          CHECK(second.incomplete());
        }

        // The third byte completes the sequence and must be a continuation
        // byte.
        for (uint32_t byte3 = 0x00; byte3 <= 0xFF; ++byte3) {
          DecodingOracle third = second;
          const bool valid_tail = is_continuation(byte3);

          if (!is_surrogate) {
            third.Decode(byte3);
            if (valid_tail) {
              CHECK(third.success());
            } else {
              CHECK(third.failure());
            }
            continue;
          }

          // The strict decoder has already rejected the input, so only the
          // generalized decoder is fed the third byte.
          third.generalized_utf8.Decode(byte3);
          if (!valid_tail) {
            CHECK(third.generalized_utf8.failure());
            continue;
          }
          CHECK(third.generalized_utf8.success());
          const uint32_t decoded = third.generalized_utf8.codepoint;
          CHECK(unibrow::Utf16::IsLeadSurrogate(decoded) ||
                unibrow::Utf16::IsTrailSurrogate(decoded));
        }
      }
      continue;
    }

    // First byte in [0xF0, 0xF4]: four-byte sequence. The allowed range of
    // the second byte is narrowed after 0xF0 and after 0xF4.
    const uint32_t lowest = (byte1 == 0xF0) ? 0x90 : 0x80;
    const uint32_t highest = (byte1 == 0xF4) ? 0x8F : 0xBF;
    for (uint32_t byte2 = 0x00; byte2 <= 0xFF; ++byte2) {
      DecodingOracle second = first;
      second.Decode(byte2);

      // Second byte outside [lowest, highest]: failure.
      if (byte2 < lowest || byte2 > highest) {
        CHECK(second.failure());
        continue;
      }

      // Second byte in [lowest, highest]: continuation.
      CHECK(second.incomplete());
      for (uint32_t byte3 = 0x00; byte3 <= 0xFF; ++byte3) {
        DecodingOracle third = second;
        third.Decode(byte3);

        // Third byte outside [0x80, 0xBF]: failure.
        if (!is_continuation(byte3)) {
          CHECK(third.failure());
          continue;
        }

        // Third byte in [0x80, 0xBF]: continuation.
        CHECK(third.incomplete());
        for (uint32_t byte4 = 0x00; byte4 <= 0xFF; ++byte4) {
          DecodingOracle fourth = third;
          fourth.Decode(byte4);
          // The fourth byte completes the sequence.
          if (is_continuation(byte4)) {
            CHECK(fourth.success());
          } else {
            CHECK(fourth.failure());
          }
        }
      }
    }
  }
}
|
|
|
|
} // namespace test_wasm_strings
|
|
} // namespace wasm
|
|
} // namespace internal
|
|
} // namespace v8
|