[stringrefs] Fix WTF-8 rejection of surrogate pairs
Quite embarassingly, the test that the WTF-8 decoder rejects surrogate pairs was broken: the trailing surrogate was invalid. (The range of the second byte for leading surrogates is [A0,AF], and for trailing is [B0,BF]). Of course the actual functionality was broken, because the code that detected surrogate pairs called IsSurrogatePair with swapped arguments. Bug: v8:12868 Change-Id: Icab5e2e4e200afb3d34f478ab4f98b739ada5645 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3723497 Reviewed-by: Jakob Kummerow <jkummerow@chromium.org> Commit-Queue: Andy Wingo <wingo@igalia.com> Cr-Commit-Position: refs/heads/main@{#81376}
This commit is contained in:
parent
f07faef9c7
commit
0257b0a3d4
@ -14,16 +14,59 @@
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
namespace {
|
||||
template <class Decoder>
|
||||
struct DecoderTraits;
|
||||
|
||||
template <>
|
||||
struct DecoderTraits<Utf8Decoder> {
|
||||
static bool IsInvalidSurrogatePair(uint32_t lead, uint32_t trail) {
|
||||
// The DfaDecoder will only ever decode Unicode scalar values, and all
|
||||
// sequences of USVs are valid.
|
||||
DCHECK(!unibrow::Utf16::IsLeadSurrogate(trail));
|
||||
DCHECK(!unibrow::Utf16::IsTrailSurrogate(trail));
|
||||
return false;
|
||||
}
|
||||
static const bool kAllowIncompleteSequences = true;
|
||||
using DfaDecoder = Utf8DfaDecoder;
|
||||
};
|
||||
|
||||
#if V8_ENABLE_WEBASSEMBLY
|
||||
template <>
|
||||
struct DecoderTraits<Wtf8Decoder> {
|
||||
static bool IsInvalidSurrogatePair(uint32_t lead, uint32_t trail) {
|
||||
return unibrow::Utf16::IsSurrogatePair(lead, trail);
|
||||
}
|
||||
static const bool kAllowIncompleteSequences = false;
|
||||
using DfaDecoder = GeneralizedUtf8DfaDecoder;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct DecoderTraits<StrictUtf8Decoder> {
|
||||
static bool IsInvalidSurrogatePair(uint32_t lead, uint32_t trail) {
|
||||
// The DfaDecoder will only ever decode Unicode scalar values, and all
|
||||
// sequences of USVs are valid.
|
||||
DCHECK(!unibrow::Utf16::IsLeadSurrogate(trail));
|
||||
DCHECK(!unibrow::Utf16::IsTrailSurrogate(trail));
|
||||
return false;
|
||||
}
|
||||
static const bool kAllowIncompleteSequences = false;
|
||||
using DfaDecoder = Utf8DfaDecoder;
|
||||
};
|
||||
#endif // V8_ENABLE_WEBASSEMBLY
|
||||
} // namespace
|
||||
|
||||
template <class Decoder>
|
||||
Utf8DecoderBase<Decoder>::Utf8DecoderBase(
|
||||
const base::Vector<const uint8_t>& data)
|
||||
: encoding_(Encoding::kAscii),
|
||||
non_ascii_start_(NonAsciiStart(data.begin(), data.length())),
|
||||
utf16_length_(non_ascii_start_) {
|
||||
using Traits = DecoderTraits<Decoder>;
|
||||
if (non_ascii_start_ == data.length()) return;
|
||||
|
||||
bool is_one_byte = true;
|
||||
auto state = Decoder::DfaDecoder::kAccept;
|
||||
auto state = Traits::DfaDecoder::kAccept;
|
||||
uint32_t current = 0;
|
||||
uint32_t previous = 0;
|
||||
const uint8_t* cursor = data.begin() + non_ascii_start_;
|
||||
@ -31,11 +74,11 @@ Utf8DecoderBase<Decoder>::Utf8DecoderBase(
|
||||
|
||||
while (cursor < end) {
|
||||
auto previous_state = state;
|
||||
Decoder::DfaDecoder::Decode(*cursor, &state, ¤t);
|
||||
if (state < Decoder::DfaDecoder::kAccept) {
|
||||
DCHECK_EQ(state, Decoder::DfaDecoder::kReject);
|
||||
if (Decoder::kAllowIncompleteSequences) {
|
||||
state = Decoder::DfaDecoder::kAccept;
|
||||
Traits::DfaDecoder::Decode(*cursor, &state, ¤t);
|
||||
if (state < Traits::DfaDecoder::kAccept) {
|
||||
DCHECK_EQ(state, Traits::DfaDecoder::kReject);
|
||||
if (Traits::kAllowIncompleteSequences) {
|
||||
state = Traits::DfaDecoder::kAccept;
|
||||
static_assert(unibrow::Utf8::kBadChar > unibrow::Latin1::kMaxChar);
|
||||
is_one_byte = false;
|
||||
utf16_length_++;
|
||||
@ -43,13 +86,13 @@ Utf8DecoderBase<Decoder>::Utf8DecoderBase(
|
||||
current = 0;
|
||||
// If we were trying to continue a multibyte sequence, try this byte
|
||||
// again.
|
||||
if (previous_state != Decoder::DfaDecoder::kAccept) continue;
|
||||
if (previous_state != Traits::DfaDecoder::kAccept) continue;
|
||||
} else {
|
||||
encoding_ = Encoding::kInvalid;
|
||||
return;
|
||||
}
|
||||
} else if (state == Decoder::DfaDecoder::kAccept) {
|
||||
if (Decoder::InvalidCodePointSequence(current, previous)) {
|
||||
} else if (state == Traits::DfaDecoder::kAccept) {
|
||||
if (Traits::IsInvalidSurrogatePair(previous, current)) {
|
||||
encoding_ = Encoding::kInvalid;
|
||||
return;
|
||||
}
|
||||
@ -62,9 +105,9 @@ Utf8DecoderBase<Decoder>::Utf8DecoderBase(
|
||||
cursor++;
|
||||
}
|
||||
|
||||
if (state == Decoder::DfaDecoder::kAccept) {
|
||||
if (state == Traits::DfaDecoder::kAccept) {
|
||||
encoding_ = is_one_byte ? Encoding::kLatin1 : Encoding::kUtf16;
|
||||
} else if (Decoder::kAllowIncompleteSequences) {
|
||||
} else if (Traits::kAllowIncompleteSequences) {
|
||||
static_assert(unibrow::Utf8::kBadChar > unibrow::Latin1::kMaxChar);
|
||||
encoding_ = Encoding::kUtf16;
|
||||
utf16_length_++;
|
||||
@ -77,28 +120,29 @@ template <class Decoder>
|
||||
template <typename Char>
|
||||
void Utf8DecoderBase<Decoder>::Decode(Char* out,
|
||||
const base::Vector<const uint8_t>& data) {
|
||||
using Traits = DecoderTraits<Decoder>;
|
||||
DCHECK(!is_invalid());
|
||||
CopyChars(out, data.begin(), non_ascii_start_);
|
||||
|
||||
out += non_ascii_start_;
|
||||
|
||||
auto state = Decoder::DfaDecoder::kAccept;
|
||||
auto state = Traits::DfaDecoder::kAccept;
|
||||
uint32_t current = 0;
|
||||
const uint8_t* cursor = data.begin() + non_ascii_start_;
|
||||
const uint8_t* end = data.begin() + data.length();
|
||||
|
||||
while (cursor < end) {
|
||||
auto previous_state = state;
|
||||
Decoder::DfaDecoder::Decode(*cursor, &state, ¤t);
|
||||
if (Decoder::kAllowIncompleteSequences &&
|
||||
state < Decoder::DfaDecoder::kAccept) {
|
||||
state = Decoder::DfaDecoder::kAccept;
|
||||
Traits::DfaDecoder::Decode(*cursor, &state, ¤t);
|
||||
if (Traits::kAllowIncompleteSequences &&
|
||||
state < Traits::DfaDecoder::kAccept) {
|
||||
state = Traits::DfaDecoder::kAccept;
|
||||
*(out++) = static_cast<Char>(unibrow::Utf8::kBadChar);
|
||||
current = 0;
|
||||
// If we were trying to continue a multibyte sequence, try this byte
|
||||
// again.
|
||||
if (previous_state != Decoder::DfaDecoder::kAccept) continue;
|
||||
} else if (state == Decoder::DfaDecoder::kAccept) {
|
||||
if (previous_state != Traits::DfaDecoder::kAccept) continue;
|
||||
} else if (state == Traits::DfaDecoder::kAccept) {
|
||||
if (sizeof(Char) == 1 ||
|
||||
current <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
|
||||
*(out++) = static_cast<Char>(current);
|
||||
@ -111,42 +155,30 @@ void Utf8DecoderBase<Decoder>::Decode(Char* out,
|
||||
cursor++;
|
||||
}
|
||||
|
||||
if (Decoder::kAllowIncompleteSequences &&
|
||||
state != Decoder::DfaDecoder::kAccept) {
|
||||
if (Traits::kAllowIncompleteSequences &&
|
||||
state != Traits::DfaDecoder::kAccept) {
|
||||
*out = static_cast<Char>(unibrow::Utf8::kBadChar);
|
||||
} else {
|
||||
DCHECK_EQ(state, Decoder::DfaDecoder::kAccept);
|
||||
DCHECK_EQ(state, Traits::DfaDecoder::kAccept);
|
||||
}
|
||||
}
|
||||
|
||||
template V8_EXPORT_PRIVATE Utf8DecoderBase<Utf8Decoder>::Utf8DecoderBase(
|
||||
const base::Vector<const uint8_t>& data);
|
||||
#define DEFINE_UNICODE_DECODER(Decoder) \
|
||||
template V8_EXPORT_PRIVATE Utf8DecoderBase<Decoder>::Utf8DecoderBase( \
|
||||
const base::Vector<const uint8_t>& data); \
|
||||
template V8_EXPORT_PRIVATE void Utf8DecoderBase<Decoder>::Decode( \
|
||||
uint8_t* out, const base::Vector<const uint8_t>& data); \
|
||||
template V8_EXPORT_PRIVATE void Utf8DecoderBase<Decoder>::Decode( \
|
||||
uint16_t* out, const base::Vector<const uint8_t>& data)
|
||||
|
||||
template V8_EXPORT_PRIVATE void Utf8DecoderBase<Utf8Decoder>::Decode(
|
||||
uint8_t* out, const base::Vector<const uint8_t>& data);
|
||||
|
||||
template V8_EXPORT_PRIVATE void Utf8DecoderBase<Utf8Decoder>::Decode(
|
||||
uint16_t* out, const base::Vector<const uint8_t>& data);
|
||||
DEFINE_UNICODE_DECODER(Utf8Decoder);
|
||||
|
||||
#if V8_ENABLE_WEBASSEMBLY
|
||||
template Utf8DecoderBase<Wtf8Decoder>::Utf8DecoderBase(
|
||||
const base::Vector<const uint8_t>& data);
|
||||
|
||||
template void Utf8DecoderBase<Wtf8Decoder>::Decode(
|
||||
uint8_t* out, const base::Vector<const uint8_t>& data);
|
||||
|
||||
template void Utf8DecoderBase<Wtf8Decoder>::Decode(
|
||||
uint16_t* out, const base::Vector<const uint8_t>& data);
|
||||
|
||||
template Utf8DecoderBase<StrictUtf8Decoder>::Utf8DecoderBase(
|
||||
const base::Vector<const uint8_t>& data);
|
||||
|
||||
template void Utf8DecoderBase<StrictUtf8Decoder>::Decode(
|
||||
uint8_t* out, const base::Vector<const uint8_t>& data);
|
||||
|
||||
template void Utf8DecoderBase<StrictUtf8Decoder>::Decode(
|
||||
uint16_t* out, const base::Vector<const uint8_t>& data);
|
||||
DEFINE_UNICODE_DECODER(Wtf8Decoder);
|
||||
DEFINE_UNICODE_DECODER(StrictUtf8Decoder);
|
||||
#endif // V8_ENABLE_WEBASSEMBLY
|
||||
|
||||
#undef DEFINE_UNICODE_DECODER
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
@ -8,10 +8,6 @@
|
||||
#include "src/base/vector.h"
|
||||
#include "src/strings/unicode.h"
|
||||
|
||||
#if V8_ENABLE_WEBASSEMBLY
|
||||
#include "src/third_party/utf8-decoder/generalized-utf8-decoder.h"
|
||||
#endif
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
@ -84,16 +80,6 @@ class Utf8DecoderBase {
|
||||
class V8_EXPORT_PRIVATE Utf8Decoder final
|
||||
: public Utf8DecoderBase<Utf8Decoder> {
|
||||
public:
|
||||
static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
|
||||
// The DfaDecoder will only ever decode Unicode scalar values, and all
|
||||
// sequences of USVs are valid.
|
||||
DCHECK(!unibrow::Utf16::IsLeadSurrogate(current));
|
||||
DCHECK(!unibrow::Utf16::IsTrailSurrogate(current));
|
||||
return false;
|
||||
}
|
||||
static const bool kAllowIncompleteSequences = true;
|
||||
using DfaDecoder = Utf8DfaDecoder;
|
||||
|
||||
explicit Utf8Decoder(const base::Vector<const uint8_t>& data)
|
||||
: Utf8DecoderBase(data) {}
|
||||
|
||||
@ -111,12 +97,6 @@ class V8_EXPORT_PRIVATE Utf8Decoder final
|
||||
// isolated surrogates.
|
||||
class Wtf8Decoder : public Utf8DecoderBase<Wtf8Decoder> {
|
||||
public:
|
||||
static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
|
||||
return unibrow::Utf16::IsSurrogatePair(current, previous);
|
||||
}
|
||||
static const bool kAllowIncompleteSequences = false;
|
||||
using DfaDecoder = GeneralizedUtf8DfaDecoder;
|
||||
|
||||
explicit Wtf8Decoder(const base::Vector<const uint8_t>& data)
|
||||
: Utf8DecoderBase(data) {}
|
||||
|
||||
@ -127,16 +107,6 @@ class Wtf8Decoder : public Utf8DecoderBase<Wtf8Decoder> {
|
||||
// with U+FFFD, we have a separate Encoding::kInvalid state.
|
||||
class StrictUtf8Decoder : public Utf8DecoderBase<StrictUtf8Decoder> {
|
||||
public:
|
||||
static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
|
||||
// The DfaDecoder will only ever decode Unicode scalar values, and all
|
||||
// sequence of USVs are valid.
|
||||
DCHECK(!unibrow::Utf16::IsLeadSurrogate(current));
|
||||
DCHECK(!unibrow::Utf16::IsTrailSurrogate(current));
|
||||
return false;
|
||||
}
|
||||
static const bool kAllowIncompleteSequences = false;
|
||||
using DfaDecoder = Utf8DfaDecoder;
|
||||
|
||||
explicit StrictUtf8Decoder(const base::Vector<const uint8_t>& data)
|
||||
: Utf8DecoderBase(data) {}
|
||||
|
||||
|
@ -51,7 +51,12 @@ let interestingStrings = ['',
|
||||
'two \ucccc byte',
|
||||
'surrogate \ud800\udc000 pair',
|
||||
'isolated \ud800 leading',
|
||||
'isolated \udc00 trailing'];
|
||||
'isolated \udc00 trailing',
|
||||
'\ud800 isolated leading at beginning',
|
||||
'\udc00 isolated trailing at beginning',
|
||||
'isolated leading at end \ud800',
|
||||
'isolated trailing at end \udc00',
|
||||
'swapped surrogate \udc00\ud800 pair'];
|
||||
|
||||
function IsSurrogate(codepoint) {
|
||||
return 0xD800 <= codepoint && codepoint <= 0xDFFF
|
||||
@ -87,7 +92,8 @@ function makeWtf8TestDataSegment() {
|
||||
for (let bytes of ['trailing high byte \xa9',
|
||||
'interstitial high \xa9 byte',
|
||||
'invalid \xc0 byte',
|
||||
'surrogate \xed\xa0\x80\xed\xd0\x80 pair']) {
|
||||
'invalid three-byte \xed\xd0\x80',
|
||||
'surrogate \xed\xa0\x80\xed\xb0\x80 pair']) {
|
||||
invalid[bytes] = { offset: data.length, length: bytes.length };
|
||||
for (let i = 0; i < bytes.length; i++) {
|
||||
data.push(bytes.charCodeAt(i));
|
||||
|
Loading…
Reference in New Issue
Block a user