[wasm] Also kBadChar is a valid utf8 character
The validation of utf8 strings in WebAssembly modules used the character kBadChar = 0xFFFD to indicate a validation error. However, this character can appear in a valid utf8 string. This CL fixes this problem by duplicating some of the code in {Utf8::CalculateValue} and inlining it directly into Utf8::Validate. Note that Utf8::Validate is used only for WebAssembly. Tests for this change are in the WebAssembly spec tests, which I will update in a separate CL. R=vogelheim@chromium.org Change-Id: I8697b9299f3e98a8eafdf193bff8bdff90efd7dc Reviewed-on: https://chromium-review.googlesource.com/509534 Reviewed-by: Daniel Vogelheim <vogelheim@chromium.org> Commit-Queue: Andreas Haas <ahaas@chromium.org> Cr-Commit-Position: refs/heads/master@{#45476}
This commit is contained in:
parent
a4eb80f7a6
commit
8e0daf78da
@ -360,17 +360,51 @@ uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
|
||||
}
|
||||
}
|
||||
|
||||
bool Utf8::Validate(const byte* bytes, size_t length) {
|
||||
size_t cursor = 0;
|
||||
bool Utf8::ValidateEncoding(const byte* bytes, size_t length) {
|
||||
const byte* cursor = bytes;
|
||||
const byte* end = bytes + length;
|
||||
|
||||
// Performance optimization: Skip over single-byte values first.
|
||||
while (cursor < length && bytes[cursor] <= kMaxOneByteChar) {
|
||||
++cursor;
|
||||
}
|
||||
while (cursor < end) {
|
||||
// Skip over single-byte values.
|
||||
if (*cursor <= kMaxOneByteChar) {
|
||||
++cursor;
|
||||
continue;
|
||||
}
|
||||
|
||||
while (cursor < length) {
|
||||
uchar c = ValueOf(bytes + cursor, length - cursor, &cursor);
|
||||
if (!IsValidCharacter(c)) return false;
|
||||
// Get the length of the character.
|
||||
size_t seq_length = NonASCIISequenceLength(*cursor);
|
||||
// For some invalid characters NonASCIISequenceLength returns 0.
|
||||
if (seq_length == 0) return false;
|
||||
|
||||
const byte* char_end = cursor + seq_length;
|
||||
|
||||
// Return false if we do not have enough bytes for the character.
|
||||
if (char_end > end) return false;
|
||||
|
||||
// Check if the bytes of the character are continuation bytes.
|
||||
for (const byte* i = cursor + 1; i < char_end; ++i) {
|
||||
if (!IsContinuationCharacter(*i)) return false;
|
||||
}
|
||||
|
||||
// Check overly long sequences & other conditions.
|
||||
if (seq_length == 3) {
|
||||
if (cursor[0] == 0xE0 && (cursor[1] < 0xA0 || cursor[1] > 0xBF)) {
|
||||
// Overlong three-byte sequence?
|
||||
return false;
|
||||
} else if (cursor[0] == 0xED && (cursor[1] < 0x80 || cursor[1] > 0x9F)) {
|
||||
// High and low surrogate halves?
|
||||
return false;
|
||||
}
|
||||
} else if (seq_length == 4) {
|
||||
if (cursor[0] == 0xF0 && (cursor[1] < 0x90 || cursor[1] > 0xBF)) {
|
||||
// Overlong four-byte sequence.
|
||||
return false;
|
||||
} else if (cursor[0] == 0xF4 && (cursor[1] < 0x80 || cursor[1] > 0x8F)) {
|
||||
// Code points outside of the unicode range.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
cursor = char_end;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -166,7 +166,15 @@ class Utf8 {
|
||||
// Excludes non-characters from the set of valid code points.
|
||||
static inline bool IsValidCharacter(uchar c);
|
||||
|
||||
static bool Validate(const byte* str, size_t length);
|
||||
// Validate if the input has a valid utf-8 encoding. Unlike JS source code
|
||||
// this validation function will accept any unicode code point, including
|
||||
// kBadChar and BOMs.
|
||||
//
|
||||
// This method checks for:
|
||||
// - valid utf-8 encoding (e.g. no over-long encodings),
|
||||
// - absence of surrogates,
|
||||
// - valid code point range.
|
||||
static bool ValidateEncoding(const byte* str, size_t length);
|
||||
};
|
||||
|
||||
struct Uppercase {
|
||||
|
@ -100,7 +100,7 @@ uint32_t consume_string(Decoder& decoder, uint32_t* length, bool validate_utf8,
|
||||
if (*length > 0) {
|
||||
decoder.consume_bytes(*length, name);
|
||||
if (decoder.ok() && validate_utf8 &&
|
||||
!unibrow::Utf8::Validate(string_start, *length)) {
|
||||
!unibrow::Utf8::ValidateEncoding(string_start, *length)) {
|
||||
decoder.errorf(string_start, "%s: no valid UTF-8 string", name);
|
||||
}
|
||||
}
|
||||
@ -741,7 +741,7 @@ class ModuleDecoder : public Decoder {
|
||||
// or out-of-order indexes and non-UTF8 names. You can even assign
|
||||
// to the same function multiple times (last valid one wins).
|
||||
if (inner.ok() && function_index < module_->functions.size() &&
|
||||
unibrow::Utf8::Validate(
|
||||
unibrow::Utf8::ValidateEncoding(
|
||||
inner.start() + inner.GetBufferRelativeOffset(name_offset),
|
||||
name_length)) {
|
||||
module_->functions[function_index].name_offset = name_offset;
|
||||
|
@ -1137,7 +1137,7 @@ MaybeHandle<String> WasmCompiledModule::ExtractUtf8StringFromModuleBytes(
|
||||
DCHECK_GE(module_bytes->length(), offset);
|
||||
DCHECK_GE(module_bytes->length() - offset, size);
|
||||
// UTF8 validation happens at decode time.
|
||||
DCHECK(unibrow::Utf8::Validate(
|
||||
DCHECK(unibrow::Utf8::ValidateEncoding(
|
||||
reinterpret_cast<const byte*>(module_bytes->GetCharsAddress() + offset),
|
||||
size));
|
||||
DCHECK_GE(kMaxInt, offset);
|
||||
|
@ -118,5 +118,4 @@ checkAll(toByteArray("\xff"), true);
|
||||
checkAll(toByteArray("\xed\xa0\x8f"), true); // surrogate code points
|
||||
checkAll(toByteArray("\xe0\x82\x80"), true); // overlong sequence
|
||||
checkAll(toByteArray("\xf4\x90\x80\x80"), true); // beyond limit: U+110000
|
||||
checkAll(toByteArray("\xef\xbf\xbe"), true); // non-character; U+FFFE
|
||||
checkAll(toByteArray("with\x00null"), false);
|
||||
|
Loading…
Reference in New Issue
Block a user