[stringrefs] Add generalized UTF-8 decoder / validator
Bug: v8:12868 A slight modification to the existing DFA-based UTF-8 decoder to allow decoding surrogates, for use in decoding WTF-8. We'll need to additionally constrain the decoder to disallow surrogate pairs. Change-Id: Ifddbf08d4eeeff8f270df52a68f01769ea790eec Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3652787 Commit-Queue: Andy Wingo <wingo@igalia.com> Reviewed-by: Marja Hölttä <marja@chromium.org> Reviewed-by: Jakob Kummerow <jkummerow@chromium.org> Cr-Commit-Position: refs/heads/main@{#80654}
This commit is contained in:
parent
0440123e30
commit
b48262d719
@ -2449,6 +2449,7 @@ filegroup(
|
||||
"src/debug/debug-wasm-objects-inl.h",
|
||||
"src/runtime/runtime-test-wasm.cc",
|
||||
"src/runtime/runtime-wasm.cc",
|
||||
"src/third_party/utf8-decoder/generalized-utf8-decoder.h",
|
||||
"src/trap-handler/handler-inside-posix.h",
|
||||
"src/trap-handler/handler-inside.cc",
|
||||
"src/trap-handler/handler-outside.cc",
|
||||
|
1
BUILD.gn
1
BUILD.gn
@ -3542,6 +3542,7 @@ v8_header_set("v8_internal_headers") {
|
||||
"src/compiler/wasm-loop-peeling.h",
|
||||
"src/debug/debug-wasm-objects-inl.h",
|
||||
"src/debug/debug-wasm-objects.h",
|
||||
"src/third_party/utf8-decoder/generalized-utf8-decoder.h",
|
||||
"src/trap-handler/trap-handler-internal.h",
|
||||
"src/trap-handler/trap-handler.h",
|
||||
"src/wasm/assembler-buffer-cache.h",
|
||||
|
6
src/third_party/utf8-decoder/README.v8
vendored
6
src/third_party/utf8-decoder/README.v8
vendored
@ -16,3 +16,9 @@ Local modifications:
|
||||
bit mask for the incoming byte.
|
||||
- The caller must now zero out the code point buffer after successful or
|
||||
unsuccessful state transitions.
|
||||
- Specifically for generalized-utf8-decoder.h: we adapt the original
|
||||
decoder to decode and validate "generalized UTF-8", a variant of UTF-8
|
||||
used in WTF-8 that can encode surrogates. See
|
||||
https://simonsapin.github.io/wtf-8/#generalized-utf8. There is one
|
||||
fewer state and so the transition table is smaller by one in both
|
||||
dimensions.
|
||||
|
105
src/third_party/utf8-decoder/generalized-utf8-decoder.h
vendored
Normal file
105
src/third_party/utf8-decoder/generalized-utf8-decoder.h
vendored
Normal file
@ -0,0 +1,105 @@
|
||||
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ and the sibling file
|
||||
// utf8-decoder.h for details.
|
||||
//
|
||||
// This file decodes "generalized UTF-8", which is the same as UTF-8 except that
|
||||
// it allows surrogates: https://simonsapin.github.io/wtf-8/#generalized-utf8
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifndef __GENERALIZED_UTF8_DFA_DECODER_H
|
||||
#define __GENERALIZED_UTF8_DFA_DECODER_H
|
||||
|
||||
namespace GeneralizedUtf8DfaDecoder {

// States of the decoding automaton.
//
// Each value is the offset of that state's row in the `states` table inside
// Decode(): there are 11 transition types per row, hence the step of 11.
enum State : uint8_t {
  kReject = 0,            // Invalid input seen; every transition stays here.
  kAccept = 11,           // Between code points (also the start state).
  kTwoByte = 22,          // One more continuation byte (80-BF) expected.
  kThreeByte = 33,        // Two more continuation bytes expected.
  kFourByte = 44,         // Three more continuation bytes expected.
  kFourByteLow = 55,      // After F4: next byte must be in 80-8F.
  kThreeByteHigh = 66,    // After E0: next byte must be in A0-BF.
  kFourByteMidHigh = 77,  // After F0: next byte must be in 90-BF.
};

// Feeds one byte of generalized UTF-8 into the decoder.
//
// byte:   the next input byte.
// state:  in/out automaton state.  Must hold one of the State values, since
//         it is used directly as a row offset into the `states` table.  Start
//         a sequence with kAccept; kAccept after a call means a complete code
//         point has been decoded, kReject means the input is invalid.
// buffer: in/out accumulator for the code point being decoded.  Decode() only
//         ever shifts bits into it and never clears it, so the caller must
//         reset it to 0 before starting the next sequence.
static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
  // This first table maps each byte to a transition.
  //
  // The transition value takes a state to a new state, but it also determines
  // the set of bits from the current byte that contribute to the decoded
  // codepoint:
  //
  //   Transition | Bits from current byte that contribute to decoded codepoint
  //   ------------------------------------------------------------------------
  //    0, 1      | 0b01111111
  //    2, 3      | 0b00111111
  //    4, 5      | 0b00011111
  //    6, 7      | 0b00001111
  //    8, 9      | 0b00000111
  //    10        | 0b00000011
  //
  // Given the WTF-8 encoding, we therefore have the following constraints:
  //
  //   1. The transition value for 1-byte encodings should have the value 0 or
  //      1 so that we preserve all of the low 7 bits.
  //   2. Continuation bytes (0x80 to 0xBF) are of the form 0b10xxxxxx, and
  //      therefore should have transition value between 0 and 3.
  //   3. Leading bytes for 2-byte encodings are of the form 0b110yyyyy, and
  //      therefore the transition value can be between 2 and 5.
  //   4. Leading bytes for 3-byte encodings (0b1110zzzz) need transition value
  //      between 4 and 7.
  //   5. Leading bytes for 4-byte encodings (0b11110uuu) need transition value
  //      between 6 and 9.
  //   6. We need more states to impose irregular constraints.  Sometimes we
  //      can use the knowledge that e.g. some high significant bits of the
  //      xxxx in 0b1110xxxx are 0, then we can use a higher transition value.
  //   7. Transitions to invalid states can use any transition value.
  static constexpr uint8_t transitions[] = {
      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 00-0F
      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 10-1F
      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 20-2F
      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 30-3F
      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 40-4F
      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 50-5F
      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 60-6F
      0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 70-7F
      1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80-8F
      2,  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 90-9F
      3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // A0-AF
      3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // B0-BF
      8,  8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // C0-CF
      4,  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // D0-DF
      9,  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  // E0-EF
      10, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // F0-FF
  };

  // This second table maps a state to a new state when adding a transition.
  //  00-7F
  //  |   80-8F
  //  |   |   90-9F
  //  |   |   |   A0-BF
  //  |   |   |   |   C2-DF
  //  |   |   |   |   |   E1-EF
  //  |   |   |   |   |   |   F1-F3
  //  |   |   |   |   |   |   |   F4
  //  |   |   |   |   |   |   |   |   C0, C1, F5-FF
  //  |   |   |   |   |   |   |   |   |   E0
  //  |   |   |   |   |   |   |   |   |   |   F0
  static constexpr uint8_t states[] = {
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,   // REJECT = 0
      11, 0,  0,  0,  22, 33, 44, 55, 0,  66, 77,  // ACCEPT = 11
      0,  11, 11, 11, 0,  0,  0,  0,  0,  0,  0,   // 2-byte = 22
      0,  22, 22, 22, 0,  0,  0,  0,  0,  0,  0,   // 3-byte = 33
      0,  33, 33, 33, 0,  0,  0,  0,  0,  0,  0,   // 4-byte = 44
      0,  33, 0,  0,  0,  0,  0,  0,  0,  0,  0,   // 4-byte low = 55
      0,  0,  0,  22, 0,  0,  0,  0,  0,  0,  0,   // 3-byte high = 66
      0,  0,  33, 33, 0,  0,  0,  0,  0,  0,  0,   // 4-byte mid/high = 77
  };

  // The lookups below index these tables without bounds checks, so pin their
  // dimensions at compile time.
  static_assert(sizeof(transitions) == 256,
                "transitions needs exactly one entry per byte value");
  static_assert(sizeof(states) == 8 * 11,
                "states needs 8 rows (states) of 11 columns (transitions)");

  uint8_t type = transitions[byte];
  // The State value is the row offset and the transition type is the column.
  *state = static_cast<State>(states[*state + type]);
  // Keep only the payload bits selected by the transition type (see the first
  // table above) and append them below the bits gathered so far.
  *buffer = (*buffer << 6) | (byte & (0x7F >> (type >> 1)));
}

}  // namespace GeneralizedUtf8DfaDecoder
|
||||
|
||||
#endif // __GENERALIZED_UTF8_DFA_DECODER_H
|
@ -416,6 +416,7 @@ v8_source_set("cctest_sources") {
|
||||
"wasm/test-wasm-serialization.cc",
|
||||
"wasm/test-wasm-shared-engine.cc",
|
||||
"wasm/test-wasm-stack.cc",
|
||||
"wasm/test-wasm-strings.cc",
|
||||
"wasm/test-wasm-trap-position.cc",
|
||||
"wasm/wasm-atomics-utils.h",
|
||||
"wasm/wasm-run-utils.cc",
|
||||
|
186
test/cctest/wasm/test-wasm-strings.cc
Normal file
186
test/cctest/wasm/test-wasm-strings.cc
Normal file
@ -0,0 +1,186 @@
|
||||
// Copyright 2022 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "src/strings/unicode.h"
|
||||
#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
|
||||
#include "src/third_party/utf8-decoder/utf8-decoder.h"
|
||||
#include "test/cctest/cctest.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
namespace wasm {
|
||||
namespace test_wasm_strings {
|
||||
|
||||
struct Utf8Decoder {
|
||||
Utf8DfaDecoder::State state = Utf8DfaDecoder::kAccept;
|
||||
uint32_t codepoint = 0;
|
||||
void Decode(uint8_t byte) {
|
||||
DCHECK(!failure());
|
||||
Utf8DfaDecoder::Decode(byte, &state, &codepoint);
|
||||
}
|
||||
bool success() const { return state == Utf8DfaDecoder::kAccept; }
|
||||
bool failure() const { return state == Utf8DfaDecoder::kReject; }
|
||||
bool incomplete() const { return !success() && !failure(); }
|
||||
};
|
||||
|
||||
struct GeneralizedUtf8Decoder {
|
||||
GeneralizedUtf8DfaDecoder::State state = GeneralizedUtf8DfaDecoder::kAccept;
|
||||
uint32_t codepoint = 0;
|
||||
void Decode(uint8_t byte) {
|
||||
DCHECK(!failure());
|
||||
GeneralizedUtf8DfaDecoder::Decode(byte, &state, &codepoint);
|
||||
}
|
||||
bool success() const { return state == GeneralizedUtf8DfaDecoder::kAccept; }
|
||||
bool failure() const { return state == GeneralizedUtf8DfaDecoder::kReject; }
|
||||
bool incomplete() const { return !success() && !failure(); }
|
||||
};
|
||||
|
||||
struct DecodingOracle {
|
||||
Utf8Decoder utf8;
|
||||
GeneralizedUtf8Decoder generalized_utf8;
|
||||
|
||||
void Decode(uint8_t byte) {
|
||||
utf8.Decode(byte);
|
||||
generalized_utf8.Decode(byte);
|
||||
}
|
||||
|
||||
void CheckSame() const {
|
||||
CHECK_EQ(utf8.success(), generalized_utf8.success());
|
||||
CHECK_EQ(utf8.failure(), generalized_utf8.failure());
|
||||
if (utf8.success()) CHECK(utf8.codepoint == generalized_utf8.codepoint);
|
||||
}
|
||||
|
||||
bool success() const {
|
||||
CheckSame();
|
||||
return utf8.success();
|
||||
}
|
||||
bool failure() const {
|
||||
CheckSame();
|
||||
return utf8.failure();
|
||||
}
|
||||
bool incomplete() const {
|
||||
CheckSame();
|
||||
return utf8.incomplete();
|
||||
}
|
||||
};
|
||||
|
||||
TEST(GeneralizedUTF8Decode) {
  // Exhaustive check that the generalized UTF-8 decoder matches the strict
  // UTF-8 decoder, except for surrogates.  Each production should end the
  // decoders accepting or rejecting the production.
  //
  // The loop variables are uint32_t rather than uint8_t so that the
  // `<= 0xFF` loop conditions can become false.
  for (uint32_t byte1 = 0; byte1 <= 0xFF; byte1++) {
    DecodingOracle decoder1;
    decoder1.Decode(byte1);

    if (byte1 <= 0x7F) {
      // First byte in [0x00, 0x7F]: one-byte.
      CHECK(decoder1.success());
    } else if (byte1 <= 0xC1) {
      // First byte in [0x80, 0xC1]: invalid.
      CHECK(decoder1.failure());
    } else if (byte1 <= 0xDF) {
      // First byte in [0xC2, 0xDF]: two-byte.
      CHECK(decoder1.incomplete());
      // Second byte completes the sequence.  Only [0x80, 0xBF] is valid.
      for (uint32_t byte2 = 0x00; byte2 <= 0xFF; byte2++) {
        DecodingOracle decoder2 = decoder1;
        decoder2.Decode(byte2);
        if (0x80 <= byte2 && byte2 <= 0xBF) {
          CHECK(decoder2.success());
        } else {
          CHECK(decoder2.failure());
        }
      }
    } else if (byte1 <= 0xEF) {
      // First byte in [0xE0, 0xEF]: three-byte sequence.
      CHECK(decoder1.incomplete());
      uint32_t min = byte1 == 0xE0 ? 0xA0 : 0x80;
      for (uint32_t byte2 = 0x00; byte2 <= 0xFF; byte2++) {
        DecodingOracle decoder2 = decoder1;
        decoder2.Decode(byte2);
        if (min <= byte2 && byte2 <= 0xBF) {
          // Second byte in [min, 0xBF]: continuation.
          bool is_surrogate = byte1 == 0xED && byte2 >= 0xA0;
          if (is_surrogate) {
            // Here's where we expect the two decoders to differ: generalized
            // UTF-8 will get a surrogate and strict UTF-8 errors.
            CHECK(decoder2.utf8.failure());
            CHECK(decoder2.generalized_utf8.incomplete());
          } else {
            CHECK(decoder2.incomplete());
          }

          // Third byte completes the sequence.  Only [0x80, 0xBF] is valid.
          for (uint32_t byte3 = 0x00; byte3 <= 0xFF; byte3++) {
            DecodingOracle decoder3 = decoder2;
            if (is_surrogate) {
              // The strict decoder has already rejected, so only the
              // generalized decoder may be fed further bytes.
              decoder3.generalized_utf8.Decode(byte3);
              if (0x80 <= byte3 && byte3 <= 0xBF) {
                CHECK(decoder3.generalized_utf8.success());
                uint32_t codepoint = decoder3.generalized_utf8.codepoint;
                CHECK(unibrow::Utf16::IsLeadSurrogate(codepoint) ||
                      unibrow::Utf16::IsTrailSurrogate(codepoint));
              } else {
                CHECK(decoder3.generalized_utf8.failure());
              }
            } else {
              decoder3.Decode(byte3);
              if (0x80 <= byte3 && byte3 <= 0xBF) {
                CHECK(decoder3.success());
              } else {
                CHECK(decoder3.failure());
              }
            }
          }
        } else {
          // Second byte not in range: failure.
          CHECK(decoder2.failure());
        }
      }
    } else if (byte1 <= 0xF4) {
      // First byte in [0xF0, 0xF4]: four-byte sequence.
      CHECK(decoder1.incomplete());
      uint32_t min = byte1 == 0xF0 ? 0x90 : 0x80;
      uint32_t max = byte1 == 0xF4 ? 0x8F : 0xBF;
      for (uint32_t byte2 = 0x00; byte2 <= 0xFF; byte2++) {
        DecodingOracle decoder2 = decoder1;
        decoder2.Decode(byte2);
        if (min <= byte2 && byte2 <= max) {
          // Second byte in [min, max]: continuation.
          CHECK(decoder2.incomplete());
          for (uint32_t byte3 = 0x00; byte3 <= 0xFF; byte3++) {
            DecodingOracle decoder3 = decoder2;
            decoder3.Decode(byte3);
            if (0x80 <= byte3 && byte3 <= 0xBF) {
              // Third byte in [0x80, 0xBF]: continuation.
              CHECK(decoder3.incomplete());
              for (uint32_t byte4 = 0x00; byte4 <= 0xFF; byte4++) {
                DecodingOracle decoder4 = decoder3;
                decoder4.Decode(byte4);
                // Fourth byte completes the sequence.
                if (0x80 <= byte4 && byte4 <= 0xBF) {
                  CHECK(decoder4.success());
                } else {
                  CHECK(decoder4.failure());
                }
              }
            } else {
              CHECK(decoder3.failure());
            }
          }
        } else {
          CHECK(decoder2.failure());
        }
      }
    } else {
      // First byte in [0xF5, 0xFF]: failure.
      CHECK(decoder1.failure());
    }
  }
}
|
||||
|
||||
} // namespace test_wasm_strings
|
||||
} // namespace wasm
|
||||
} // namespace internal
|
||||
} // namespace v8
|
Loading…
Reference in New Issue
Block a user