diff --git a/BUILD.gn b/BUILD.gn index 855a1c6a5d..b27a323094 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -2052,6 +2052,7 @@ v8_source_set("v8_base") { "src/string-stream.h", "src/strtod.cc", "src/strtod.h", + "src/third_party/utf8-decoder/utf8-decoder.h", "src/tracing/trace-event.cc", "src/tracing/trace-event.h", "src/tracing/traced-value.cc", diff --git a/src/parsing/scanner-character-streams.cc b/src/parsing/scanner-character-streams.cc index 8f584ff715..20aa5c9f8e 100644 --- a/src/parsing/scanner-character-streams.cc +++ b/src/parsing/scanner-character-streams.cc @@ -203,7 +203,7 @@ class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream { Utf8ExternalStreamingStream( ScriptCompiler::ExternalSourceStream* source_stream, RuntimeCallStats* stats) - : current_({0, {0, 0, unibrow::Utf8::Utf8IncrementalBuffer(0)}}), + : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}), source_stream_(source_stream), stats_(stats) {} ~Utf8ExternalStreamingStream() override { @@ -223,7 +223,8 @@ class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream { struct StreamPosition { size_t bytes; size_t chars; - unibrow::Utf8::Utf8IncrementalBuffer incomplete_char; + uint32_t incomplete_char; + unibrow::Utf8::State state; }; // Position contains a StreamPosition and the index of the chunk the position @@ -268,25 +269,25 @@ bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) { const Chunk& chunk = chunks_[current_.chunk_no]; DCHECK(current_.pos.bytes >= chunk.start.bytes); - unibrow::Utf8::Utf8IncrementalBuffer incomplete_char = - chunk.start.incomplete_char; + unibrow::Utf8::State state = chunk.start.state; + uint32_t incomplete_char = chunk.start.incomplete_char; size_t it = current_.pos.bytes - chunk.start.bytes; size_t chars = chunk.start.chars; while (it < chunk.length && chars < position) { - unibrow::uchar t = - unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char); + unibrow::uchar t = unibrow::Utf8::ValueOfIncremental( + 
chunk.data[it], &it, &state, &incomplete_char); if (t == kUtf8Bom && current_.pos.chars == 0) { // BOM detected at beginning of the stream. Don't copy it. } else if (t != unibrow::Utf8::kIncomplete) { chars++; if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++; } - it++; } current_.pos.bytes += it; current_.pos.chars = chars; current_.pos.incomplete_char = incomplete_char; + current_.pos.state = state; current_.chunk_no += (it == chunk.length); return current_.pos.chars == position; @@ -304,31 +305,33 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_); DCHECK_EQ(cursor, buffer_end_); + unibrow::Utf8::State state = current_.pos.state; + uint32_t incomplete_char = current_.pos.incomplete_char; + // If the current chunk is the last (empty) chunk we'll have to process // any left-over, partial characters. if (chunk.length == 0) { - unibrow::uchar t = - unibrow::Utf8::ValueOfIncrementalFinish(¤t_.pos.incomplete_char); + unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state); if (t != unibrow::Utf8::kBufferEmpty) { - DCHECK_LT(t, unibrow::Utf16::kMaxNonSurrogateCharCode); + DCHECK_EQ(t, unibrow::Utf8::kBadChar); *cursor = static_cast(t); buffer_end_++; current_.pos.chars++; + current_.pos.incomplete_char = 0; + current_.pos.state = state; } return; } - unibrow::Utf8::Utf8IncrementalBuffer incomplete_char = - current_.pos.incomplete_char; - size_t it; - for (it = current_.pos.bytes - chunk.start.bytes; - it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize; it++) { - unibrow::uchar t = - unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char); - if (t == unibrow::Utf8::kIncomplete) continue; + size_t it = current_.pos.bytes - chunk.start.bytes; + while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) { + unibrow::uchar t = unibrow::Utf8::ValueOfIncremental( + chunk.data[it], &it, &state, &incomplete_char); if (V8_LIKELY(t < kUtf8Bom)) { 
*(cursor++) = static_cast(t); // The by most frequent case. - } else if (t == kUtf8Bom && current_.pos.bytes + it == 2) { + } else if (t == unibrow::Utf8::kIncomplete) { + continue; + } else if (t == kUtf8Bom && current_.pos.bytes + it == 3) { // BOM detected at beginning of the stream. Don't copy it. } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { *(cursor++) = static_cast(t); @@ -341,6 +344,7 @@ void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { current_.pos.bytes = chunk.start.bytes + it; current_.pos.chars += (cursor - buffer_end_); current_.pos.incomplete_char = incomplete_char; + current_.pos.state = state; current_.chunk_no += (it == chunk.length); buffer_end_ = cursor; @@ -396,16 +400,18 @@ void Utf8ExternalStreamingStream::SearchPosition(size_t position) { // checking whether the # bytes in a chunk are equal to the # chars, and if // so avoid the expensive SkipToPosition.) bool ascii_only_chunk = - chunks_[chunk_no].start.incomplete_char == - unibrow::Utf8::Utf8IncrementalBuffer(0) && + // A pending E0 or F0 lead byte leaves incomplete_char at 0, so the + // decoder state must be checked as well. + chunks_[chunk_no].start.state == unibrow::Utf8::State::kAccept && + chunks_[chunk_no].start.incomplete_char == 0 && (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) == (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars); if (ascii_only_chunk) { size_t skip = position - chunks_[chunk_no].start.chars; current_ = {chunk_no, {chunks_[chunk_no].start.bytes + skip, - chunks_[chunk_no].start.chars + skip, - unibrow::Utf8::Utf8IncrementalBuffer(0)}}; + chunks_[chunk_no].start.chars + skip, 0, + unibrow::Utf8::State::kAccept}}; } else { current_ = {chunk_no, chunks_[chunk_no].start}; SkipToPosition(position); diff --git a/src/third_party/utf8-decoder/LICENSE b/src/third_party/utf8-decoder/LICENSE new file mode 100644 index 0000000000..b59bef2fb6 --- /dev/null +++ b/src/third_party/utf8-decoder/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2008-2009 Bjoern Hoehrmann + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files
(the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/src/third_party/utf8-decoder/README.v8 b/src/third_party/utf8-decoder/README.v8 new file mode 100644 index 0000000000..e1e13ce53f --- /dev/null +++ b/src/third_party/utf8-decoder/README.v8 @@ -0,0 +1,18 @@ +Name: DFA UTF-8 Decoder +Short Name: utf8-decoder +URL: http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ +Version: 0 +License: MIT +License File: NOT_SHIPPED +Security Critical: no + +Description: +Decodes UTF-8 bytes using a fast and simple deterministic finite automaton (DFA). + +Local modifications: +- Rejection state has been mapped to row 0 (instead of row 1) of the DFA, + saving some 50 bytes and making the table easier to reason about. +- The transitions have been remapped to represent both a state transition and a + bit mask for the incoming byte. +- The caller must now zero out the code point buffer after successful or + unsuccessful state transitions.
diff --git a/src/third_party/utf8-decoder/utf8-decoder.h b/src/third_party/utf8-decoder/utf8-decoder.h new file mode 100644 index 0000000000..5668e5ad9e --- /dev/null +++ b/src/third_party/utf8-decoder/utf8-decoder.h @@ -0,0 +1,78 @@ +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. +// The remapped transition table is justified at +// https://docs.google.com/spreadsheets/d/1AZcQwuEL93HmNCljJWUwFMGqf7JAQ0puawZaUgP0E14 + +#include + +#ifndef __UTF8_DFA_DECODER_H +#define __UTF8_DFA_DECODER_H + +namespace Utf8DfaDecoder { + +enum State : uint8_t { + kReject = 0, + kAccept = 12, + kTwoByte = 24, + kThreeByte = 36, + kThreeByteLowMid = 48, + kFourByte = 60, + kFourByteLow = 72, + kThreeByteHigh = 84, + kFourByteMidHigh = 96, +}; + +static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) { + // This first table maps bytes to character to a transition. + static constexpr uint8_t transitions[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF + 9, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF + 10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, // E0-EF + 11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // F0-FF + }; + + // This second table maps a state to a new state when adding a transition. 
+ // 00-7F + // | 80-8F + // | | 90-9F + // | | | A0-BF + // | | | | C2-DF + // | | | | | E1-EC, EE, EF + // | | | | | | ED + // | | | | | | | F1-F3 + // | | | | | | | | F4 + // | | | | | | | | | C0, C1, F5-FF + // | | | | | | | | | | E0 + // | | | | | | | | | | | F0 + static constexpr uint8_t states[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // REJECT = 0 + 12, 0, 0, 0, 24, 36, 48, 60, 72, 0, 84, 96, // ACCEPT = 12 + 0, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, // 2-byte = 24 + 0, 24, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte = 36 + 0, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte low/mid = 48 + 0, 36, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte = 60 + 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte low = 72 + 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte high = 84 + 0, 0, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte mid/high = 96 + }; + + DCHECK_NE(*state, State::kReject); + uint8_t type = transitions[byte]; + *state = static_cast(states[*state + type]); + *buffer = (*buffer << 6) | (byte & (0x7F >> (type >> 1))); +} + +} // namespace Utf8DfaDecoder + +#endif /* __UTF8_DFA_DECODER_H */ diff --git a/src/unicode-inl.h b/src/unicode-inl.h index ebebfaa1bd..7c0386ce52 100644 --- a/src/unicode-inl.h +++ b/src/unicode-inl.h @@ -113,8 +113,8 @@ unsigned Utf8::Encode(char* str, uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) { if (length <= 0) return kBadChar; byte first = bytes[0]; - // Characters between 0000 and 0007F are encoded as a single character - if (first <= kMaxOneByteChar) { + // Characters between 0000 and 007F are encoded as a single character + if (V8_LIKELY(first <= kMaxOneByteChar)) { *cursor += 1; return first; } diff --git a/src/unicode.cc b/src/unicode.cc index 082334f230..4d7896ec37 100644 --- a/src/unicode.cc +++ b/src/unicode.cc @@ -193,306 +193,91 @@ static int LookupMapping(const int32_t* table, } } -static inline uint8_t NonASCIISequenceLength(byte first) { - // clang-format off - static const uint8_t lengths[256] = { - // The first 
128 entries correspond to ASCII characters. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00 - 0f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10 - 1f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20 - 2f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30 - 3f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40 - 4f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 50 - 5f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60 - 6f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70 - 7f */ - // The following 64 entries correspond to continuation bytes. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 - 8f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90 - 9f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0 - af */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0 - bf */ - // The next are two invalid overlong encodings and 30 two-byte sequences. - 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* c0-c1 + c2-cf */ - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* d0-df */ - // 16 three-byte sequences. - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* e0-ef */ - // 5 four-byte sequences, followed by sequences that could only encode - // code points outside of the Unicode range. - 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; /* f0-f4 + f5-ff */ - // clang-format on - return lengths[first]; -} - - -static inline bool IsContinuationCharacter(byte chr) { - return chr >= 0x80 && chr <= 0xBF; -} - // This method decodes an UTF-8 value according to RFC 3629 and // https://encoding.spec.whatwg.org/#utf-8-decoder . uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { + DCHECK_GT(max_length, 0); DCHECK_GT(str[0], kMaxOneByteChar); - size_t length = NonASCIISequenceLength(str[0]); + State state = State::kAccept; + Utf8IncrementalBuffer buffer = 0; + uchar t; - // Check continuation characters. 
- size_t max_count = std::min(length, max_length); - size_t count = 1; - while (count < max_count && IsContinuationCharacter(str[count])) { - count++; - } + size_t i = 0; + do { + t = ValueOfIncremental(str[i], &i, &state, &buffer); + } while (i < max_length && t == kIncomplete); - if (length >= 3 && count < 2) { - // Not enough continuation bytes to check overlong sequences. - *cursor += 1; - return kBadChar; - } - - // Check overly long sequences & other conditions. - if (length == 3) { - if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) { - // Overlong three-byte sequence? The first byte generates a kBadChar. - *cursor += 1; - return kBadChar; - } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) { - // High and low surrogate halves? The first byte generates a kBadChar. - *cursor += 1; - return kBadChar; - } - } else if (length == 4) { - if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) { - // Overlong four-byte sequence. The first byte generates a kBadChar. - *cursor += 1; - return kBadChar; - } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) { - // Code points outside of the Unicode range. The first byte generates a - // kBadChar. - *cursor += 1; - return kBadChar; - } - } - - *cursor += count; - - if (count != length) { - // Not enough continuation characters. - return kBadChar; - } - - // All errors have been handled, so we only have to assemble the result. - switch (length) { - case 2: - return ((str[0] << 6) + str[1]) - 0x00003080; - case 3: - return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; - case 4: - return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - - 0x03C82080; - } - - UNREACHABLE(); + *cursor += i; + return (state == State::kAccept) ? t : kBadChar; } -/* -Overlong sequence detection: Since Blink's TextCodecUTF8 rejects multi-byte -characters which could be expressed with less bytes, we must too. - -Each continuation byte (10xxxxxx) carries 6 bits of payload. 
The lead bytes of -1, 2, 3 and 4-byte characters are 0xxxxxxx, 110xxxxx, 1110xxxx and 11110xxx, and -carry 7, 5, 4, and 3 bits of payload, respectively. - -Thus, a two-byte character can contain 11 bits of payload, a three-byte -character 16, and a four-byte character 21. - -If we encounter a two-byte character which contains 7 bits or less, a three-byte -character which contains 11 bits or less, or a four-byte character which -contains 16 bits or less, we reject the character and generate a kBadChar for -each of the bytes. This is because Blink handles overlong sequences by rejecting -the first byte of the character (returning kBadChar); thus the rest are lonely -continuation bytes and generate a kBadChar each. -*/ - -uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { +// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they +// stream in. This **must** be followed by a call to ValueOfIncrementalFinish +// when the stream is complete, to ensure incomplete sequences are handled. +uchar Utf8::ValueOfIncremental(byte next, size_t* cursor, State* state, + Utf8IncrementalBuffer* buffer) { DCHECK_NOT_NULL(buffer); + State old_state = *state; + *cursor += 1; - // The common case: 1-byte Utf8 (and no incomplete char in the buffer) - if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) { + if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) { + DCHECK_EQ(0u, *buffer); return static_cast(next); } - if (*buffer == 0) { - // We're at the start of a new character. - uint32_t kind = NonASCIISequenceLength(next); - CHECK_LE(kind, 4); - if (kind >= 2) { - // Start of 2..4 byte character, and no buffer. + // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation + // char in that sequence. + Utf8DfaDecoder::Decode(next, state, buffer); - // The mask for the lower bits depends on the kind, and is - // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that - // with one shift. 
- uint8_t mask = 0x7F >> kind; - - // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes) - // in 2nd nibble, and the value in the bottom three. The 2nd nibble is - // intended as a counter about how many bytes are still needed. - uint32_t character_info = kind << 28 | (kind - 1) << 24; - DCHECK_EQ(character_info & mask, 0); - *buffer = character_info | (next & mask); - return kIncomplete; - } else { - // No buffer, and not the start of a 1-byte char (handled at the - // beginning), and not the start of a 2..4 byte char (or the start of an - // overlong / invalid sequence)? Bad char. + switch (*state) { + case State::kAccept: { + uchar t = *buffer; *buffer = 0; - return kBadChar; - } - } else if (*buffer <= 0xFF) { - // We have one unprocessed byte left (from the last else case in this if - // statement). - uchar previous = *buffer; - *buffer = 0; - uchar t = ValueOfIncremental(previous, buffer); - if (t == kIncomplete) { - // If we have an incomplete character, process both the previous and the - // next byte at once. - return ValueOfIncremental(next, buffer); - } else { - // Otherwise, process the previous byte and save the next byte for next - // time. - DCHECK_EQ(0u, *buffer); - *buffer = next; return t; } - } else if (IsContinuationCharacter(next)) { - // We're inside of a character, as described by buffer. - // How many bytes (excluding this one) do we still expect? - uint8_t bytes_expected = *buffer >> 28; - uint8_t bytes_left = (*buffer >> 24) & 0x0F; - - // Two-byte overlong sequence detection is handled by - // NonASCIISequenceLength, so we don't need to check anything here. - if (bytes_expected == 3 && bytes_left == 2) { - // Check that there are at least 12 bytes of payload. - uint8_t lead_payload = *buffer & (0x7F >> bytes_expected); - DCHECK_LE(lead_payload, 0xF); - if (lead_payload == 0 && next < 0xA0) { - // 0xA0 = 0b10100000 (payload: 100000). 
Overlong sequence: 0 bits from - // the first byte, at most 5 from the second byte, and at most 6 from - // the third -> in total at most 11. - - *buffer = next; - return kBadChar; - } else if (lead_payload == 0xD && next > 0x9F) { - // The resulting code point would be on a range which is reserved for - // UTF-16 surrogate halves. - *buffer = next; - return kBadChar; - } - } else if (bytes_expected == 4 && bytes_left == 3) { - // Check that there are at least 17 bytes of payload. - uint8_t lead_payload = *buffer & (0x7F >> bytes_expected); - - // If the lead byte was bigger than 0xF4 (payload: 4), it's not a start of - // any valid character, and this is detected by NonASCIISequenceLength. - DCHECK_LE(lead_payload, 0x4); - if (lead_payload == 0 && next < 0x90) { - // 0x90 = 10010000 (payload 10000). Overlong sequence: 0 bits from the - // first byte, at most 4 from the second byte, at most 12 from the third - // and fourth bytes -> in total at most 16. - *buffer = next; - return kBadChar; - } else if (lead_payload == 4 && next > 0x8F) { - // Invalid code point; value greater than 0b100001111000000000000 - // (0x10FFFF). - *buffer = next; - return kBadChar; - } - } - - bytes_left--; - // Update the value. - uint32_t value = ((*buffer & 0xFFFFFF) << 6) | (next & 0x3F); - if (bytes_left) { - *buffer = (bytes_expected << 28 | bytes_left << 24 | value); - return kIncomplete; - } else { -#ifdef DEBUG - // Check that overlong sequences were already detected. - bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) || - (bytes_expected == 3 && value < 0x800) || - (bytes_expected == 4 && value < 0x8000); - DCHECK(!sequence_was_too_long); -#endif + case State::kReject: + *state = State::kAccept; *buffer = 0; - return value; - } - } else { - // Within a character, but not a continuation character? Then the - // previous char was a bad char. But we need to save the current - // one. 
- *buffer = next; - return kBadChar; + + // If we hit a bad byte, we need to determine if we were trying to start + // a sequence or continue one. If we were trying to start a sequence, + // that means it's just an invalid lead byte and we need to continue to + // the next (which we already did above). If we were already in a + // sequence, we need to reprocess this same byte after resetting to the + // initial state. + if (old_state != State::kAccept) { + // We were trying to continue a sequence, so let's reprocess this byte + // next time. + *cursor -= 1; + } + return kBadChar; + + default: + return kIncomplete; } } -uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) { - DCHECK_NOT_NULL(buffer); - if (*buffer == 0) { +// Finishes the incremental decoding, ensuring that if an unfinished sequence +// is left that it is replaced by a replacement char. +uchar Utf8::ValueOfIncrementalFinish(State* state) { + if (*state == State::kAccept) { return kBufferEmpty; } else { - // Process left-over chars. An incomplete char at the end maps to kBadChar. - uchar t = ValueOfIncremental(0, buffer); - return (t == kIncomplete) ? kBadChar : t; + DCHECK_GT(*state, State::kAccept); + *state = State::kAccept; + return kBadChar; } } bool Utf8::ValidateEncoding(const byte* bytes, size_t length) { - const byte* cursor = bytes; - const byte* end = bytes + length; - - while (cursor < end) { - // Skip over single-byte values. - if (*cursor <= kMaxOneByteChar) { - ++cursor; - continue; - } - - // Get the length the the character. - size_t seq_length = NonASCIISequenceLength(*cursor); - // For some invalid characters NonASCIISequenceLength returns 0. - if (seq_length == 0) return false; - - const byte* char_end = cursor + seq_length; - - // Return false if we do not have enough bytes for the character. - if (char_end > end) return false; - - // Check if the bytes of the character are continuation bytes. 
- for (const byte* i = cursor + 1; i < char_end; ++i) { - if (!IsContinuationCharacter(*i)) return false; - } - - // Check overly long sequences & other conditions. - if (seq_length == 3) { - if (cursor[0] == 0xE0 && (cursor[1] < 0xA0 || cursor[1] > 0xBF)) { - // Overlong three-byte sequence? - return false; - } else if (cursor[0] == 0xED && (cursor[1] < 0x80 || cursor[1] > 0x9F)) { - // High and low surrogate halves? - return false; - } - } else if (seq_length == 4) { - if (cursor[0] == 0xF0 && (cursor[1] < 0x90 || cursor[1] > 0xBF)) { - // Overlong four-byte sequence. - return false; - } else if (cursor[0] == 0xF4 && (cursor[1] < 0x80 || cursor[1] > 0x8F)) { - // Code points outside of the Unicode range. - return false; - } - } - cursor = char_end; + State state = State::kAccept; + Utf8IncrementalBuffer throw_away = 0; + for (size_t i = 0; i < length && state != State::kReject; i++) { + Utf8DfaDecoder::Decode(bytes[i], &state, &throw_away); } - return true; + return state == State::kAccept; } // Uppercase: point.category == 'Lu' diff --git a/src/unicode.h b/src/unicode.h index 04d58f3650..c6ce9a8eb2 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -7,6 +7,7 @@ #include #include "src/globals.h" +#include "src/third_party/utf8-decoder/utf8-decoder.h" #include "src/utils.h" /** * \file @@ -129,6 +130,8 @@ class Utf16 { class V8_EXPORT_PRIVATE Utf8 { public: + using State = Utf8DfaDecoder::State; + static inline uchar Length(uchar chr, int previous); static inline unsigned EncodeOneByte(char* out, uint8_t c); static inline unsigned Encode(char* out, @@ -158,9 +161,9 @@ class V8_EXPORT_PRIVATE Utf8 { static inline uchar ValueOf(const byte* str, size_t length, size_t* cursor); typedef uint32_t Utf8IncrementalBuffer; - static uchar ValueOfIncremental(byte next_byte, + static uchar ValueOfIncremental(byte next_byte, size_t* cursor, State* state, Utf8IncrementalBuffer* buffer); - static uchar ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer); + static uchar 
ValueOfIncrementalFinish(State* state); // Excludes non-characters from the set of valid code points. static inline bool IsValidCharacter(uchar c); diff --git a/src/v8.gyp b/src/v8.gyp index 5a2461868d..753732d19b 100644 --- a/src/v8.gyp +++ b/src/v8.gyp @@ -1406,6 +1406,7 @@ 'strtod.h', 'ic/stub-cache.cc', 'ic/stub-cache.h', + 'third_party/utf8-decoder/utf8-decoder.h', 'tracing/trace-event.cc', 'tracing/trace-event.h', 'tracing/traced-value.cc', diff --git a/test/cctest/unicode-helpers.h b/test/cctest/unicode-helpers.h index 891424a1cb..ca75fb65d7 100644 --- a/test/cctest/unicode-helpers.h +++ b/test/cctest/unicode-helpers.h @@ -19,12 +19,16 @@ static int Ucs2CharLength(unibrow::uchar c) { static int Utf8LengthHelper(const char* s) { unibrow::Utf8::Utf8IncrementalBuffer buffer(unibrow::Utf8::kBufferEmpty); + unibrow::Utf8::State state = unibrow::Utf8::State::kAccept; + int length = 0; - for (; *s != '\0'; s++) { - unibrow::uchar tmp = unibrow::Utf8::ValueOfIncremental(*s, &buffer); + size_t i = 0; + while (s[i] != '\0') { + unibrow::uchar tmp = + unibrow::Utf8::ValueOfIncremental(s[i], &i, &state, &buffer); length += Ucs2CharLength(tmp); } - unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&buffer); + unibrow::uchar tmp = unibrow::Utf8::ValueOfIncrementalFinish(&state); length += Ucs2CharLength(tmp); return length; } diff --git a/test/unittests/unicode-unittest.cc b/test/unittests/unicode-unittest.cc index 06e47aedb1..e5ccaca7b1 100644 --- a/test/unittests/unicode-unittest.cc +++ b/test/unittests/unicode-unittest.cc @@ -37,13 +37,15 @@ void DecodeNormally(const std::vector& bytes, void DecodeIncrementally(const std::vector& bytes, std::vector* output) { unibrow::Utf8::Utf8IncrementalBuffer buffer = 0; - for (auto b : bytes) { - unibrow::uchar result = unibrow::Utf8::ValueOfIncremental(b, &buffer); + unibrow::Utf8::State state = unibrow::Utf8::State::kAccept; + for (size_t i = 0; i < bytes.size();) { + unibrow::uchar result = + 
unibrow::Utf8::ValueOfIncremental(bytes[i], &i, &state, &buffer); if (result != unibrow::Utf8::kIncomplete) { output->push_back(result); } } - unibrow::uchar result = unibrow::Utf8::ValueOfIncrementalFinish(&buffer); + unibrow::uchar result = unibrow::Utf8::ValueOfIncrementalFinish(&state); if (result != unibrow::Utf8::kBufferEmpty) { output->push_back(result); }