v8/test/cctest/wasm/test-wasm-strings.cc
Andy Wingo b48262d719 [stringrefs] Add generalized UTF-8 decoder / validator
Bug: v8:12868

A slight modification to the existing DFA-based UTF-8 decoder to allow
decoding surrogates, for use in decoding WTF-8.  We'll need to
additionally constrain the decoder to disallow surrogate pairs.

Change-Id: Ifddbf08d4eeeff8f270df52a68f01769ea790eec
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3652787
Commit-Queue: Andy Wingo <wingo@igalia.com>
Reviewed-by: Marja Hölttä <marja@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/main@{#80654}
2022-05-20 08:05:04 +00:00

187 lines
6.4 KiB
C++

// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/strings/unicode.h"
#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
#include "src/third_party/utf8-decoder/utf8-decoder.h"
#include "test/cctest/cctest.h"
namespace v8 {
namespace internal {
namespace wasm {
namespace test_wasm_strings {
// Stateful convenience wrapper around Utf8DfaDecoder (the strict UTF-8 DFA).
// It carries the DFA state and the codepoint accumulator between bytes so a
// test can feed one byte at a time and query the outcome after each step.
struct Utf8Decoder {
  // Current DFA state; a fresh decoder starts in the accepting state.
  Utf8DfaDecoder::State state = Utf8DfaDecoder::kAccept;
  // Codepoint accumulator updated by Utf8DfaDecoder::Decode.
  uint32_t codepoint = 0;

  // True when the decoder sits in the accepting state.
  bool success() const { return Utf8DfaDecoder::kAccept == state; }

  // True when the decoder sits in the rejecting state.
  bool failure() const { return Utf8DfaDecoder::kReject == state; }

  // True when the decoder is in neither the accepting nor the rejecting
  // state, i.e. in the middle of a sequence.
  bool incomplete() const {
    return state != Utf8DfaDecoder::kAccept &&
           state != Utf8DfaDecoder::kReject;
  }

  // Advances the DFA by one byte. Must not be called once the decoder has
  // reached the rejecting state (enforced in debug builds only).
  void Decode(uint8_t byte) {
    DCHECK(state != Utf8DfaDecoder::kReject);
    Utf8DfaDecoder::Decode(byte, &state, &codepoint);
  }
};
// Stateful convenience wrapper around GeneralizedUtf8DfaDecoder. It carries
// the DFA state and the codepoint accumulator between bytes so a test can feed
// one byte at a time and query the outcome after each step.
struct GeneralizedUtf8Decoder {
  // Current DFA state; a fresh decoder starts in the accepting state.
  GeneralizedUtf8DfaDecoder::State state = GeneralizedUtf8DfaDecoder::kAccept;
  // Codepoint accumulator updated by GeneralizedUtf8DfaDecoder::Decode.
  uint32_t codepoint = 0;

  // True when the decoder sits in the accepting state.
  bool success() const { return GeneralizedUtf8DfaDecoder::kAccept == state; }

  // True when the decoder sits in the rejecting state.
  bool failure() const { return GeneralizedUtf8DfaDecoder::kReject == state; }

  // True when the decoder is in neither the accepting nor the rejecting
  // state, i.e. in the middle of a sequence.
  bool incomplete() const {
    return state != GeneralizedUtf8DfaDecoder::kAccept &&
           state != GeneralizedUtf8DfaDecoder::kReject;
  }

  // Advances the DFA by one byte. Must not be called once the decoder has
  // reached the rejecting state (enforced in debug builds only).
  void Decode(uint8_t byte) {
    DCHECK(state != GeneralizedUtf8DfaDecoder::kReject);
    GeneralizedUtf8DfaDecoder::Decode(byte, &state, &codepoint);
  }
};
// Runs the strict and the generalized UTF-8 decoders side by side on the same
// byte stream. Every status query first verifies that the two decoders agree,
// so simply asking success()/failure()/incomplete() doubles as an equivalence
// check. Where the decoders are expected to diverge, callers inspect the
// `utf8` and `generalized_utf8` members directly instead.
struct DecodingOracle {
  Utf8Decoder utf8;
  GeneralizedUtf8Decoder generalized_utf8;

  // Feeds `byte` to both decoders. Each wrapped Decode() DCHECKs that its
  // decoder has not already failed, so once the decoders have diverged the
  // caller must drive the still-live decoder directly.
  void Decode(uint8_t byte) {
    utf8.Decode(byte);
    generalized_utf8.Decode(byte);
  }

  // Aborts unless both decoders report the same accept/reject status and,
  // when accepting, the same codepoint. CHECK_EQ is used throughout so that
  // a mismatch reports both values.
  void CheckSame() const {
    CHECK_EQ(utf8.success(), generalized_utf8.success());
    CHECK_EQ(utf8.failure(), generalized_utf8.failure());
    if (utf8.success()) {
      CHECK_EQ(utf8.codepoint, generalized_utf8.codepoint);
    }
  }

  // Returns whether both decoders accepted (after checking they agree).
  bool success() const {
    CheckSame();
    return utf8.success();
  }

  // Returns whether both decoders rejected (after checking they agree).
  bool failure() const {
    CheckSame();
    return utf8.failure();
  }

  // Returns whether both decoders are mid-sequence (after checking they
  // agree).
  bool incomplete() const {
    CheckSame();
    return utf8.incomplete();
  }
};
TEST(GeneralizedUTF8Decode) {
  // Exhaustively walks every byte sequence of up to four bytes and checks that
  // the generalized UTF-8 decoder agrees with the strict UTF-8 decoder,
  // except for encoded surrogates. Every production must leave the decoders
  // either accepting or rejecting.
  const auto is_continuation = [](uint32_t byte) {
    return 0x80 <= byte && byte <= 0xBF;
  };

  for (uint32_t lead = 0; lead <= 0xFF; lead++) {
    DecodingOracle after_lead;
    after_lead.Decode(lead);

    if (lead <= 0x7F) {
      // Lead byte in [0x00, 0x7F]: complete one-byte sequence.
      CHECK(after_lead.success());
      continue;
    }
    if (lead <= 0xC1 || lead > 0xF4) {
      // Lead byte in [0x80, 0xC1] or [0xF5, 0xFF]: never valid.
      CHECK(after_lead.failure());
      continue;
    }

    // Every remaining lead byte opens a multi-byte sequence.
    CHECK(after_lead.incomplete());

    if (lead <= 0xDF) {
      // Lead byte in [0xC2, 0xDF]: two-byte sequence. The second byte
      // completes it and only [0x80, 0xBF] is valid.
      for (uint32_t second = 0x00; second <= 0xFF; second++) {
        DecodingOracle after_second = after_lead;
        after_second.Decode(second);
        if (is_continuation(second)) {
          CHECK(after_second.success());
        } else {
          CHECK(after_second.failure());
        }
      }
      continue;
    }

    if (lead <= 0xEF) {
      // Lead byte in [0xE0, 0xEF]: three-byte sequence.
      const uint32_t min_second = lead == 0xE0 ? 0xA0 : 0x80;
      for (uint32_t second = 0x00; second <= 0xFF; second++) {
        DecodingOracle after_second = after_lead;
        after_second.Decode(second);
        if (second < min_second || second > 0xBF) {
          // Second byte out of range: failure.
          CHECK(after_second.failure());
          continue;
        }

        // Second byte in [min_second, 0xBF]: continuation.
        const bool is_surrogate = lead == 0xED && second >= 0xA0;
        if (is_surrogate) {
          // The one place the decoders are expected to differ: generalized
          // UTF-8 goes on to produce a surrogate, strict UTF-8 errors out.
          CHECK(after_second.utf8.failure());
          CHECK(after_second.generalized_utf8.incomplete());
        } else {
          CHECK(after_second.incomplete());
        }

        // The third byte completes the sequence; only [0x80, 0xBF] is valid.
        for (uint32_t third = 0x00; third <= 0xFF; third++) {
          DecodingOracle after_third = after_second;
          if (is_surrogate) {
            // The strict decoder has already failed, so only the generalized
            // decoder is driven and inspected from here on.
            after_third.generalized_utf8.Decode(third);
            if (is_continuation(third)) {
              CHECK(after_third.generalized_utf8.success());
              const uint32_t decoded = after_third.generalized_utf8.codepoint;
              CHECK(unibrow::Utf16::IsLeadSurrogate(decoded) ||
                    unibrow::Utf16::IsTrailSurrogate(decoded));
            } else {
              CHECK(after_third.generalized_utf8.failure());
            }
          } else {
            after_third.Decode(third);
            if (is_continuation(third)) {
              CHECK(after_third.success());
            } else {
              CHECK(after_third.failure());
            }
          }
        }
      }
      continue;
    }

    // Lead byte in [0xF0, 0xF4]: four-byte sequence.
    const uint32_t min_second = lead == 0xF0 ? 0x90 : 0x80;
    const uint32_t max_second = lead == 0xF4 ? 0x8F : 0xBF;
    for (uint32_t second = 0x00; second <= 0xFF; second++) {
      DecodingOracle after_second = after_lead;
      after_second.Decode(second);
      if (second < min_second || second > max_second) {
        CHECK(after_second.failure());
        continue;
      }

      // Second byte in [min_second, max_second]: continuation.
      CHECK(after_second.incomplete());
      for (uint32_t third = 0x00; third <= 0xFF; third++) {
        DecodingOracle after_third = after_second;
        after_third.Decode(third);
        if (!is_continuation(third)) {
          CHECK(after_third.failure());
          continue;
        }

        // Third byte in [0x80, 0xBF]: continuation.
        CHECK(after_third.incomplete());
        for (uint32_t fourth = 0x00; fourth <= 0xFF; fourth++) {
          DecodingOracle after_fourth = after_third;
          after_fourth.Decode(fourth);
          // The fourth byte completes the sequence.
          if (is_continuation(fourth)) {
            CHECK(after_fourth.success());
          } else {
            CHECK(after_fourth.failure());
          }
        }
      }
    }
  }
}
} // namespace test_wasm_strings
} // namespace wasm
} // namespace internal
} // namespace v8