cedec225c9
This is a separation of the DFA Unicode Decoder from https://chromium-review.googlesource.com/c/v8/v8/+/789560. I attempted to make the DFA's table a bit more explicit in this CL. Still, the linter prevents me from presenting the array as a "table" in source code. For a better representation, please refer to https://docs.google.com/spreadsheets/d/1L9STtkmWs-A7HdK5ZmZ-wPZ_VBjQ3-Jj_xN9c6_hLKA - - - - - Now for a big copy-paste from 789560: Essentially, this reworks a standard FSM (imagine an array of structs) and flattens it out into a single-dimension array. Using Table 3-7 of the Unicode 10.0.0 standard (page 126 of http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf), we can nicely map all bytes into one of 12 character classes: 00. 0x00-0x7F 01. 0x80-0x8F (split from general continuation because this range is not valid after a 0xF0 leading byte) 02. 0x90-0x9F (split from general continuation because this range is not valid after a 0xE0 nor a 0xF4 leading byte) 03. 0xA0-0xBF (the rest of the continuation range) 04. 0xC0-0xC1, 0xF5-0xFF (the joined range of invalid bytes; notice this includes 255, which we use as a known bad byte during hex-to-int decoding) 05. 0xC2-0xDF (leading bytes which require any continuation byte afterwards) 06. 0xE0 (leading byte which requires a 0xA0-0xBF afterwards then any continuation byte after that) 07. 0xE1-0xEC, 0xEE-0xEF (leading bytes which require any continuation afterwards then any continuation byte after that) 08. 0xED (leading byte which requires a 0x80-0x9F afterwards then any continuation byte after that) 09. 0xF1-0xF3 (leading bytes which require any continuation byte afterwards then any continuation byte then any continuation byte) 10. 0xF0 (leading byte which requires a 0x90-0xBF afterwards then any continuation byte then any continuation byte) 11. 
0xF4 (leading byte which requires a 0x80-0x8F afterwards then any continuation byte then any continuation byte) Note that 0xF0 and 0xF1-0xF3 were swapped so that fewer bytes were needed to represent the transition state ("9, 10, 10, 10" vs. "10, 9, 9, 9"). Using these 12 classes as "transitions", we can map from one state to the next. Each state is defined as some multiple of 12, so that we're always starting at the 0th column of each row of the FSM. From each state, we add the transition and get an index of the new row the FSM is entering. If at any point we encounter a bad byte, the state + bad-byte-transition is guaranteed to map us into the first row of the FSM (which contains no valid exiting transitions). The key difference from Björn's original (or his self-modified) DFA is that the "bad" state is now mapped to 0 (or the first row of the FSM) instead of 12 (the second row). This saves ~50 bytes when gzipping, and also speeds up determining if a string is properly encoded (see his sample code at http://bjoern.hoehrmann.de/utf-8/decoder/dfa/#performance). Finally, I've replaced his ternary check with an array access, to make the algorithm branchless. This places a requirement on the caller to 0 out the code point between successful decodings, which it could always have done because it's already branching. R=marja@google.com Bug: Change-Id: I574f208a84dc5d06caba17127b0d41f7ce1a3395 Reviewed-on: https://chromium-review.googlesource.com/805357 Commit-Queue: Justin Ridgewell <jridgewell@google.com> Reviewed-by: Marja Hölttä <marja@chromium.org> Reviewed-by: Mathias Bynens <mathias@chromium.org> Cr-Commit-Position: refs/heads/master@{#50012}
149 lines
4.0 KiB
C++
149 lines
4.0 KiB
C++
// Copyright 2007-2010 the V8 project authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
#ifndef V8_UNICODE_INL_H_
|
|
#define V8_UNICODE_INL_H_
|
|
|
|
#include "src/unicode.h"
|
|
#include "src/base/logging.h"
|
|
#include "src/utils.h"
|
|
|
|
namespace unibrow {
|
|
|
|
template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
|
|
CacheEntry entry = entries_[code_point & kMask];
|
|
if (entry.code_point() == code_point) return entry.value();
|
|
return CalculateValue(code_point);
|
|
}
|
|
|
|
template <class T, int s> bool Predicate<T, s>::CalculateValue(
|
|
uchar code_point) {
|
|
bool result = T::Is(code_point);
|
|
entries_[code_point & kMask] = CacheEntry(code_point, result);
|
|
return result;
|
|
}
|
|
|
|
template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,
|
|
uchar* result) {
|
|
CacheEntry entry = entries_[c & kMask];
|
|
if (entry.code_point_ == c) {
|
|
if (entry.offset_ == 0) {
|
|
return 0;
|
|
} else {
|
|
result[0] = c + entry.offset_;
|
|
return 1;
|
|
}
|
|
} else {
|
|
return CalculateValue(c, n, result);
|
|
}
|
|
}
|
|
|
|
template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
|
|
uchar* result) {
|
|
bool allow_caching = true;
|
|
int length = T::Convert(c, n, result, &allow_caching);
|
|
if (allow_caching) {
|
|
if (length == 1) {
|
|
entries_[c & kMask] = CacheEntry(c, result[0] - c);
|
|
return 1;
|
|
} else {
|
|
entries_[c & kMask] = CacheEntry(c, 0);
|
|
return 0;
|
|
}
|
|
} else {
|
|
return length;
|
|
}
|
|
}
|
|
|
|
|
|
unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
|
|
static const int kMask = ~(1 << 6);
|
|
if (c <= kMaxOneByteChar) {
|
|
str[0] = c;
|
|
return 1;
|
|
}
|
|
str[0] = 0xC0 | (c >> 6);
|
|
str[1] = 0x80 | (c & kMask);
|
|
return 2;
|
|
}
|
|
|
|
// Encode encodes the UTF-16 code units c and previous into the given str
|
|
// buffer, and combines surrogate code units into single code points. If
|
|
// replace_invalid is set to true, orphan surrogate code units will be replaced
|
|
// with kBadChar.
|
|
unsigned Utf8::Encode(char* str,
|
|
uchar c,
|
|
int previous,
|
|
bool replace_invalid) {
|
|
static const int kMask = ~(1 << 6);
|
|
if (c <= kMaxOneByteChar) {
|
|
str[0] = c;
|
|
return 1;
|
|
} else if (c <= kMaxTwoByteChar) {
|
|
str[0] = 0xC0 | (c >> 6);
|
|
str[1] = 0x80 | (c & kMask);
|
|
return 2;
|
|
} else if (c <= kMaxThreeByteChar) {
|
|
if (Utf16::IsSurrogatePair(previous, c)) {
|
|
const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
|
|
return Encode(str - kUnmatchedSize,
|
|
Utf16::CombineSurrogatePair(previous, c),
|
|
Utf16::kNoPreviousCharacter,
|
|
replace_invalid) - kUnmatchedSize;
|
|
} else if (replace_invalid &&
|
|
(Utf16::IsLeadSurrogate(c) ||
|
|
Utf16::IsTrailSurrogate(c))) {
|
|
c = kBadChar;
|
|
}
|
|
str[0] = 0xE0 | (c >> 12);
|
|
str[1] = 0x80 | ((c >> 6) & kMask);
|
|
str[2] = 0x80 | (c & kMask);
|
|
return 3;
|
|
} else {
|
|
str[0] = 0xF0 | (c >> 18);
|
|
str[1] = 0x80 | ((c >> 12) & kMask);
|
|
str[2] = 0x80 | ((c >> 6) & kMask);
|
|
str[3] = 0x80 | (c & kMask);
|
|
return 4;
|
|
}
|
|
}
|
|
|
|
|
|
uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) {
|
|
if (length <= 0) return kBadChar;
|
|
byte first = bytes[0];
|
|
// Characters between 0000 and 007F are encoded as a single character
|
|
if (V8_LIKELY(first <= kMaxOneByteChar)) {
|
|
*cursor += 1;
|
|
return first;
|
|
}
|
|
return CalculateValue(bytes, length, cursor);
|
|
}
|
|
|
|
unsigned Utf8::Length(uchar c, int previous) {
|
|
if (c <= kMaxOneByteChar) {
|
|
return 1;
|
|
} else if (c <= kMaxTwoByteChar) {
|
|
return 2;
|
|
} else if (c <= kMaxThreeByteChar) {
|
|
if (Utf16::IsTrailSurrogate(c) &&
|
|
Utf16::IsLeadSurrogate(previous)) {
|
|
return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
|
|
}
|
|
return 3;
|
|
} else {
|
|
return 4;
|
|
}
|
|
}
|
|
|
|
bool Utf8::IsValidCharacter(uchar c) {
|
|
return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) ||
|
|
(c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu &&
|
|
c != kBadChar);
|
|
}
|
|
|
|
} // namespace unibrow
|
|
|
|
#endif // V8_UNICODE_INL_H_
|