Fix input and output to handle UTF16 surrogate pairs.

Review URL: https://chromiumcodereview.appspot.com/9600009

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11007 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
erik.corry@gmail.com 2012-03-12 12:35:28 +00:00
parent cd91894d2f
commit 03cfc4363b
30 changed files with 957 additions and 305 deletions

View File

@ -1430,7 +1430,7 @@ void ObjectTemplate::SetInternalFieldCount(int value) {
ScriptData* ScriptData::PreCompile(const char* input, int length) {
i::Utf8ToUC16CharacterStream stream(
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const unsigned char*>(input), length);
return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
}
@ -1439,11 +1439,11 @@ ScriptData* ScriptData::PreCompile(const char* input, int length) {
ScriptData* ScriptData::PreCompile(v8::Handle<String> source) {
i::Handle<i::String> str = Utils::OpenHandle(*source);
if (str->IsExternalTwoByteString()) {
i::ExternalTwoByteStringUC16CharacterStream stream(
i::ExternalTwoByteStringUtf16CharacterStream stream(
i::Handle<i::ExternalTwoByteString>::cast(str), 0, str->length());
return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
} else {
i::GenericStringUC16CharacterStream stream(str, 0, str->length());
i::GenericStringUtf16CharacterStream stream(str, 0, str->length());
return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
}
}
@ -3690,7 +3690,7 @@ int String::Length() const {
int String::Utf8Length() const {
i::Handle<i::String> str = Utils::OpenHandle(this);
if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;
return str->Utf8Length();
return i::Utf8Length(str);
}
@ -3736,11 +3736,13 @@ int String::WriteUtf8(char* buffer,
int i;
int pos = 0;
int nchars = 0;
int previous = unibrow::Utf16::kNoPreviousCharacter;
for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
i::uc32 c = write_input_buffer.GetNext();
int written = unibrow::Utf8::Encode(buffer + pos, c);
int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
pos += written;
nchars++;
previous = c;
}
if (i < len) {
// For the last characters we need to check the length for each one
@ -3749,16 +3751,33 @@ int String::WriteUtf8(char* buffer,
char intermediate[unibrow::Utf8::kMaxEncodedSize];
for (; i < len && pos < capacity; i++) {
i::uc32 c = write_input_buffer.GetNext();
int written = unibrow::Utf8::Encode(intermediate, c);
if (pos + written <= capacity) {
for (int j = 0; j < written; j++)
buffer[pos + j] = intermediate[j];
if (unibrow::Utf16::IsTrailSurrogate(c) &&
unibrow::Utf16::IsLeadSurrogate(previous)) {
// We can't use the intermediate buffer here because the encoding
// of surrogate pairs is done under assumption that you can step
// back and fix the UTF8 stream. Luckily we only need space for one
// more byte, so there is always space.
ASSERT(pos < capacity);
int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
ASSERT(written == 1);
pos += written;
nchars++;
} else {
// We've reached the end of the buffer
break;
int written =
unibrow::Utf8::Encode(intermediate,
c,
unibrow::Utf16::kNoPreviousCharacter);
if (pos + written <= capacity) {
for (int j = 0; j < written; j++)
buffer[pos + j] = intermediate[j];
pos += written;
nchars++;
} else {
// We've reached the end of the buffer
break;
}
}
previous = c;
}
}
if (nchars_ref != NULL) *nchars_ref = nchars;
@ -5240,7 +5259,8 @@ String::Utf8Value::Utf8Value(v8::Handle<v8::Value> obj)
TryCatch try_catch;
Handle<String> str = obj->ToString();
if (str.IsEmpty()) return;
length_ = str->Utf8Length();
i::Handle<i::String> i_str = Utils::OpenHandle(*str);
length_ = i::Utf8Length(i_str);
str_ = i::NewArray<char>(length_ + 1);
str->WriteUtf8(str_);
}

View File

@ -472,7 +472,7 @@ void RegExpMacroAssemblerARM::CheckNotCharacterAfterMinusAnd(
uc16 minus,
uc16 mask,
Label* on_not_equal) {
ASSERT(minus < String::kMaxUC16CharCode);
ASSERT(minus < String::kMaxUtf16CodeUnit);
__ sub(r0, current_character(), Operand(minus));
__ and_(r0, r0, Operand(mask));
__ cmp(r0, Operand(c));

View File

@ -372,8 +372,11 @@ bool DebuggerAgentUtil::SendMessage(const Socket* conn,
// Calculate the message size in UTF-8 encoding.
int utf8_len = 0;
int previous = unibrow::Utf16::kNoPreviousCharacter;
for (int i = 0; i < message.length(); i++) {
utf8_len += unibrow::Utf8::Length(message[i]);
uint16_t character = message[i];
utf8_len += unibrow::Utf8::Length(character, previous);
previous = character;
}
// Send the header.
@ -388,17 +391,33 @@ bool DebuggerAgentUtil::SendMessage(const Socket* conn,
// Send message body as UTF-8.
int buffer_position = 0; // Current buffer position.
previous = unibrow::Utf16::kNoPreviousCharacter;
for (int i = 0; i < message.length(); i++) {
// Write next UTF-8 encoded character to buffer.
uint16_t character = message[i];
buffer_position +=
unibrow::Utf8::Encode(buffer + buffer_position, message[i]);
unibrow::Utf8::Encode(buffer + buffer_position, character, previous);
ASSERT(buffer_position < kBufferSize);
// Send buffer if full or last character is encoded.
if (kBufferSize - buffer_position < 3 || i == message.length() - 1) {
conn->Send(buffer, buffer_position);
buffer_position = 0;
if (kBufferSize - buffer_position <
unibrow::Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit ||
i == message.length() - 1) {
if (unibrow::Utf16::IsLeadSurrogate(character)) {
const int kEncodedSurrogateLength =
unibrow::Utf16::kUtf8BytesToCodeASurrogate;
ASSERT(buffer_position >= kEncodedSurrogateLength);
conn->Send(buffer, buffer_position - kEncodedSurrogateLength);
for (int i = 0; i < kEncodedSurrogateLength; i++) {
buffer[i] = buffer[buffer_position + i];
}
buffer_position = kEncodedSurrogateLength;
} else {
conn->Send(buffer, buffer_position);
buffer_position = 0;
}
}
previous = character;
}
return true;

View File

@ -267,8 +267,9 @@ const int kBinary32ExponentShift = 23;
// other bits set.
const uint64_t kQuietNaNMask = static_cast<uint64_t>(0xfff) << 51;
// ASCII/UC16 constants
// ASCII/UTF-16 constants
// Code-point values in Unicode 4.0 are 21 bits wide.
// Code units in UTF-16 are 16 bits wide.
typedef uint16_t uc16;
typedef int32_t uc32;
const int kASCIISize = kCharSize;

View File

@ -800,4 +800,162 @@ Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table,
}
// This method determines the type of string involved and then gets the UTF8
// length of the code units in the range [from, to) of the string.  It doesn't
// flatten the string and has log(n) recursion for a string of length n.  If
// the failure flag gets set, then we have to flatten the string and retry.
// Failures are caused by surrogate pairs in deep cons strings.
//
// Single surrogate characters that are encountered in the UTF-16 character
// sequence of the input string get counted as 3 UTF-8 bytes, because that
// is the way that WriteUtf8 will encode them.  Surrogate pairs are counted and
// encoded as one 4-byte UTF-8 sequence.
//
// This function conceptually uses recursion on the two halves of cons strings.
// However, in order to avoid the recursion going too deep it recurses on the
// second string of the cons, but iterates on the first substring (by manually
// eliminating it as a tail recursion).  This means it counts the UTF-8 length
// from the end to the start, which makes no difference to the total.
//
// Surrogate pairs are recognized even if they are split across two sides of a
// cons, which complicates the implementation somewhat.  Therefore, too deep
// recursion cannot always be avoided.  This case is detected, and the failure
// flag is set, a signal to the caller that the string should be flattened and
// the operation retried.
//
// Parameters:
//   input                  String (or string part) being measured.
//   from, to               Half-open range of code unit indices in |input|.
//   followed_by_surrogate  True if the code unit right after the range is a
//                          trail surrogate, so that a lead surrogate at the
//                          end of the range forms a pair with it.
//   max_recursion          Remaining budget for recursion on the longer side
//                          of a cons string.
//   failure                Set to true when the budget is exhausted.  The
//                          return value is then 0 and must be ignored.
//   starts_with_surrogate  Set to true if the first code unit of the range is
//                          a trail surrogate and to false if the leftmost
//                          part of the range is ASCII.  It is left untouched
//                          otherwise, so callers that read it must initialize
//                          it to false.
int Utf8LengthHelper(String* input,
                     int from,
                     int to,
                     bool followed_by_surrogate,
                     int max_recursion,
                     bool* failure,
                     bool* starts_with_surrogate) {
  if (from == to) return 0;
  int total = 0;
  // Sink for starts_with_surrogate once the real leftmost part of the range
  // has already reported its answer.
  bool dummy;
  while (true) {
    if (input->IsAsciiRepresentation()) {
      // ASCII characters take one UTF-8 byte each and are never surrogates.
      *starts_with_surrogate = false;
      return total + to - from;
    }
    switch (StringShape(input).representation_tag()) {
      case kConsStringTag: {
        ConsString* str = ConsString::cast(input);
        String* first = str->first();
        String* second = str->second();
        int first_length = first->length();
        if (first_length - from > to - first_length) {
          if (first_length < to) {
            // Right hand side is shorter.  No need to check the recursion depth
            // since this can only happen log(n) times.
            bool right_starts_with_surrogate = false;
            total += Utf8LengthHelper(second,
                                      0,
                                      to - first_length,
                                      followed_by_surrogate,
                                      max_recursion - 1,
                                      failure,
                                      &right_starts_with_surrogate);
            if (*failure) return 0;
            // The left part is now followed by whatever the right part
            // starts with.
            followed_by_surrogate = right_starts_with_surrogate;
            input = first;
            to = first_length;
          } else {
            // We only need the left hand side.
            input = first;
          }
        } else {
          if (first_length > from) {
            // Left hand side is shorter.
            if (first->IsAsciiRepresentation()) {
              // The left part is counted directly.  It is the start of the
              // range, so report it now and discard later reports.
              total += first_length - from;
              *starts_with_surrogate = false;
              starts_with_surrogate = &dummy;
              input = second;
              from = 0;
              to -= first_length;
            } else if (second->IsAsciiRepresentation()) {
              // The right part is counted directly and cannot begin with a
              // trail surrogate.
              followed_by_surrogate = false;
              total += to - first_length;
              input = first;
              to = first_length;
            } else if (max_recursion > 0) {
              bool right_starts_with_surrogate = false;
              // Recursing on the long one.  This may fail.
              total += Utf8LengthHelper(second,
                                        0,
                                        to - first_length,
                                        followed_by_surrogate,
                                        max_recursion - 1,
                                        failure,
                                        &right_starts_with_surrogate);
              if (*failure) return 0;
              input = first;
              to = first_length;
              followed_by_surrogate = right_starts_with_surrogate;
            } else {
              // Out of recursion budget: ask the caller to flatten and retry.
              *failure = true;
              return 0;
            }
          } else {
            // We only need the right hand side.  Rebase both ends of the
            // range so that an offset into the right hand side is preserved.
            input = second;
            from -= first_length;
            to -= first_length;
          }
        }
        continue;
      }
      case kExternalStringTag:
      case kSeqStringTag: {
        Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
        const uc16* p = vector.start();
        int previous = unibrow::Utf16::kNoPreviousCharacter;
        for (int i = from; i < to; i++) {
          uc16 c = p[i];
          total += unibrow::Utf8::Length(c, previous);
          previous = c;
        }
        if (to - from > 0) {
          // A lead surrogate at the end of this part pairs up with the trail
          // surrogate that follows the range.
          if (unibrow::Utf16::IsLeadSurrogate(previous) &&
              followed_by_surrogate) {
            total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
          }
          if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
            *starts_with_surrogate = true;
          }
        }
        return total;
      }
      case kSlicedStringTag: {
        // Translate the range into the parent string and keep going.
        SlicedString* str = SlicedString::cast(input);
        int offset = str->offset();
        input = str->parent();
        from += offset;
        to += offset;
        continue;
      }
      default:
        break;
    }
    UNREACHABLE();
    return 0;
  }
  return 0;
}
// Returns the UTF-8 length of |str| as computed by Utf8LengthHelper over the
// whole string.  Whenever the helper reports failure, the string is flattened
// and the computation is attempted again.
int Utf8Length(Handle<String> str) {
  const int kRecursionBudget = 100;
  for (;;) {
    // The helper reports whether the string starts with a surrogate; that
    // answer is not needed here.
    bool ignored_starts_with_surrogate = false;
    bool helper_failed = false;
    int utf8_length = Utf8LengthHelper(*str,
                                       0,
                                       str->length(),
                                       false,
                                       kRecursionBudget,
                                       &helper_failed,
                                       &ignored_starts_with_surrogate);
    if (!helper_failed) return utf8_length;
    FlattenString(str);
  }
}
} } // namespace v8::internal

View File

@ -174,6 +174,8 @@ void FlattenString(Handle<String> str);
// string.
Handle<String> FlattenGetString(Handle<String> str);
int Utf8Length(Handle<String> str);
Handle<Object> SetProperty(Handle<Object> object,
Handle<Object> key,
Handle<Object> value,

View File

@ -4186,8 +4186,6 @@ MaybeObject* Heap::AllocateStringFromAscii(Vector<const char> string,
MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
PretenureFlag pretenure) {
// V8 only supports characters in the Basic Multilingual Plane.
const uc32 kMaxSupportedChar = 0xFFFF;
// Count the number of characters in the UTF-8 string and check if
// it is an ASCII string.
Access<UnicodeCache::Utf8Decoder>
@ -4195,8 +4193,12 @@ MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
decoder->Reset(string.start(), string.length());
int chars = 0;
while (decoder->has_more()) {
decoder->GetNext();
chars++;
uint32_t r = decoder->GetNext();
if (r <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
chars++;
} else {
chars += 2;
}
}
Object* result;
@ -4207,10 +4209,15 @@ MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
// Convert and copy the characters into the new object.
String* string_result = String::cast(result);
decoder->Reset(string.start(), string.length());
for (int i = 0; i < chars; i++) {
uc32 r = decoder->GetNext();
if (r > kMaxSupportedChar) { r = unibrow::Utf8::kBadChar; }
string_result->Set(i, r);
int i = 0;
while (i < chars) {
uint32_t r = decoder->GetNext();
if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
string_result->Set(i++, unibrow::Utf16::LeadSurrogate(r));
string_result->Set(i++, unibrow::Utf16::TrailSurrogate(r));
} else {
string_result->Set(i++, r);
}
}
return result;
}
@ -4267,7 +4274,7 @@ MaybeObject* Heap::AllocateInternalSymbol(unibrow::CharacterStream* buffer,
uint32_t hash_field) {
ASSERT(chars >= 0);
// Ensure the chars matches the number of characters in the buffer.
ASSERT(static_cast<unsigned>(chars) == buffer->Length());
ASSERT(static_cast<unsigned>(chars) == buffer->Utf16Length());
// Determine whether the string is ASCII.
bool is_ascii = true;
while (buffer->has_more()) {
@ -4313,8 +4320,15 @@ MaybeObject* Heap::AllocateInternalSymbol(unibrow::CharacterStream* buffer,
ASSERT_EQ(size, answer->Size());
// Fill in the characters.
for (int i = 0; i < chars; i++) {
answer->Set(i, buffer->GetNext());
int i = 0;
while (i < chars) {
uint32_t character = buffer->GetNext();
if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
answer->Set(i++, unibrow::Utf16::LeadSurrogate(character));
answer->Set(i++, unibrow::Utf16::TrailSurrogate(character));
} else {
answer->Set(i++, character);
}
}
return answer;
}

View File

@ -4284,7 +4284,7 @@ class HStringCharCodeAt: public HTemplateInstruction<3> {
virtual bool DataEquals(HValue* other) { return true; }
virtual Range* InferRange(Zone* zone) {
return new(zone) Range(0, String::kMaxUC16CharCode);
return new(zone) Range(0, String::kMaxUtf16CodeUnit);
}
};

View File

@ -523,7 +523,7 @@ void RegExpMacroAssemblerIA32::CheckNotCharacterAfterMinusAnd(
uc16 minus,
uc16 mask,
Label* on_not_equal) {
ASSERT(minus < String::kMaxUC16CharCode);
ASSERT(minus < String::kMaxUtf16CodeUnit);
__ lea(eax, Operand(current_character(), -minus));
__ and_(eax, mask);
__ cmp(eax, c);

View File

@ -1444,7 +1444,7 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
if (ascii) {
char_mask = String::kMaxAsciiCharCode;
} else {
char_mask = String::kMaxUC16CharCode;
char_mask = String::kMaxUtf16CodeUnit;
}
uc16 exor = c1 ^ c2;
// Check whether exor has only one bit set.
@ -1546,7 +1546,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
if (ascii) {
max_char = String::kMaxAsciiCharCode;
} else {
max_char = String::kMaxUC16CharCode;
max_char = String::kMaxUtf16CodeUnit;
}
Label success;
@ -1642,7 +1642,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
macro_assembler->CheckCharacterLT(from, on_failure);
}
}
if (to != String::kMaxUC16CharCode) {
if (to != String::kMaxUtf16CodeUnit) {
if (cc->is_negated()) {
macro_assembler->CheckCharacterLT(to + 1, on_failure);
} else {
@ -1835,7 +1835,7 @@ bool QuickCheckDetails::Rationalize(bool asc) {
if (asc) {
char_mask = String::kMaxAsciiCharCode;
} else {
char_mask = String::kMaxUC16CharCode;
char_mask = String::kMaxUtf16CodeUnit;
}
mask_ = 0;
value_ = 0;
@ -1887,7 +1887,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
if (compiler->ascii()) {
char_mask = String::kMaxAsciiCharCode;
} else {
char_mask = String::kMaxUC16CharCode;
char_mask = String::kMaxUtf16CodeUnit;
}
if ((mask & char_mask) == char_mask) need_mask = false;
mask &= char_mask;
@ -1939,7 +1939,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
if (compiler->ascii()) {
char_mask = String::kMaxAsciiCharCode;
} else {
char_mask = String::kMaxUC16CharCode;
char_mask = String::kMaxUtf16CodeUnit;
}
for (int k = 0; k < elms_->length(); k++) {
TextElement elm = elms_->at(k);
@ -4079,7 +4079,7 @@ static void AddClassNegated(const uc16 *elmv,
int elmc,
ZoneList<CharacterRange>* ranges) {
ASSERT(elmv[0] != 0x0000);
ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
uc16 last = 0x0000;
for (int i = 0; i < elmc; i += 2) {
ASSERT(last <= elmv[i] - 1);
@ -4087,7 +4087,7 @@ static void AddClassNegated(const uc16 *elmv,
ranges->Add(CharacterRange(last, elmv[i] - 1));
last = elmv[i + 1] + 1;
}
ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
}
@ -4633,8 +4633,8 @@ void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
from = range.to();
i++;
}
if (from < String::kMaxUC16CharCode) {
negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
if (from < String::kMaxUtf16CodeUnit) {
negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit));
}
}
@ -4797,7 +4797,7 @@ void DispatchTable::AddRange(CharacterRange full_range, int value) {
entry->AddValue(value);
// Bail out if the last interval ended at 0xFFFF since otherwise
// adding 1 will wrap around to 0.
if (entry->to() == String::kMaxUC16CharCode)
if (entry->to() == String::kMaxUtf16CodeUnit)
break;
ASSERT(entry->to() + 1 > current.from());
current.set_from(entry->to() + 1);
@ -5117,7 +5117,7 @@ int TextNode::ComputeFirstCharacterSet(int budget) {
int new_length = length + 1;
if (length > 0) {
if (ranges->at(0).from() == 0) new_length--;
if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
new_length--;
}
}
@ -5207,14 +5207,14 @@ void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
if (last < range.from())
AddRange(CharacterRange(last, range.from() - 1));
if (range.to() >= last) {
if (range.to() == String::kMaxUC16CharCode) {
if (range.to() == String::kMaxUtf16CodeUnit) {
return;
} else {
last = range.to() + 1;
}
}
}
AddRange(CharacterRange(last, String::kMaxUC16CharCode));
AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
}

View File

@ -461,18 +461,20 @@ class Logger::NameBuffer {
utf8_pos_ += utf8_length;
return;
}
int uc16_length = Min(str->length(), kUc16BufferSize);
String::WriteToFlat(str, uc16_buffer_, 0, uc16_length);
int uc16_length = Min(str->length(), kUtf16BufferSize);
String::WriteToFlat(str, utf16_buffer, 0, uc16_length);
int previous = unibrow::Utf16::kNoPreviousCharacter;
for (int i = 0; i < uc16_length && utf8_pos_ < kUtf8BufferSize; ++i) {
uc16 c = uc16_buffer_[i];
uc16 c = utf16_buffer[i];
if (c <= String::kMaxAsciiCharCodeU) {
utf8_buffer_[utf8_pos_++] = static_cast<char>(c);
} else {
int char_length = unibrow::Utf8::Length(c);
int char_length = unibrow::Utf8::Length(c, previous);
if (utf8_pos_ + char_length > kUtf8BufferSize) break;
unibrow::Utf8::Encode(utf8_buffer_ + utf8_pos_, c);
unibrow::Utf8::Encode(utf8_buffer_ + utf8_pos_, c, previous);
utf8_pos_ += char_length;
}
previous = c;
}
}
@ -504,11 +506,11 @@ class Logger::NameBuffer {
private:
static const int kUtf8BufferSize = 512;
static const int kUc16BufferSize = 128;
static const int kUtf16BufferSize = 128;
int utf8_pos_;
char utf8_buffer_[kUtf8BufferSize];
uc16 uc16_buffer_[kUc16BufferSize];
uc16 utf16_buffer[kUtf16BufferSize];
};

View File

@ -4463,7 +4463,11 @@ bool StringHasher::has_trivial_hash() {
}
void StringHasher::AddCharacter(uc32 c) {
void StringHasher::AddCharacter(uint32_t c) {
if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
AddSurrogatePair(c); // Not inlined.
return;
}
// Use the Jenkins one-at-a-time hash function to update the hash
// for the given character.
raw_running_hash_ += c;
@ -4492,8 +4496,12 @@ void StringHasher::AddCharacter(uc32 c) {
}
void StringHasher::AddCharacterNoIndex(uc32 c) {
void StringHasher::AddCharacterNoIndex(uint32_t c) {
ASSERT(!is_array_index());
if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
AddSurrogatePairNoIndex(c); // Not inlined.
return;
}
raw_running_hash_ += c;
raw_running_hash_ += (raw_running_hash_ << 10);
raw_running_hash_ ^= (raw_running_hash_ >> 6);

View File

@ -6051,9 +6051,11 @@ SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
buffer->Reset(offset, this);
int character_position = offset;
int utf8_bytes = 0;
int last = unibrow::Utf16::kNoPreviousCharacter;
while (buffer->has_more() && character_position++ < offset + length) {
uint16_t character = buffer->GetNext();
utf8_bytes += unibrow::Utf8::Length(character);
utf8_bytes += unibrow::Utf8::Length(character, last);
last = character;
}
if (length_return) {
@ -6067,13 +6069,15 @@ SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
buffer->Seek(offset);
character_position = offset;
int utf8_byte_position = 0;
last = unibrow::Utf16::kNoPreviousCharacter;
while (buffer->has_more() && character_position++ < offset + length) {
uint16_t character = buffer->GetNext();
if (allow_nulls == DISALLOW_NULLS && character == 0) {
character = ' ';
}
utf8_byte_position +=
unibrow::Utf8::Encode(result + utf8_byte_position, character);
unibrow::Utf8::Encode(result + utf8_byte_position, character, last);
last = character;
}
result[utf8_byte_position] = 0;
return SmartArrayPointer<char>(result);
@ -6387,73 +6391,6 @@ const unibrow::byte* String::ReadBlock(String* input,
}
// This method determines the type of string involved and then gets the UTF8
// length of the string. It doesn't flatten the string and has log(n) recursion
// for a string of length n.
//
// Returns the sum of unibrow::Utf8::Length() over the code units in the
// half-open range [from, to) of |input|. Every code unit is measured on its
// own. For cons strings the shorter side is handled by a recursive call and
// the longer side by another round of the loop.
int String::Utf8Length(String* input, int from, int to) {
// An empty range contributes nothing.
if (from == to) return 0;
int total = 0;
while (true) {
// ASCII strings are counted as one byte per character in the range.
if (input->IsAsciiRepresentation()) return total + to - from;
switch (StringShape(input).representation_tag()) {
case kConsStringTag: {
ConsString* str = ConsString::cast(input);
String* first = str->first();
String* second = str->second();
int first_length = first->length();
// Compare how much of the range lies in each side of the cons.
if (first_length - from < to - first_length) {
if (first_length > from) {
// Left hand side is shorter.
total += Utf8Length(first, from, first_length);
input = second;
from = 0;
to -= first_length;
} else {
// We only need the right hand side.
// Both ends of the range are rebased onto the right hand side.
input = second;
from -= first_length;
to -= first_length;
}
} else {
if (first_length <= to) {
// Right hand side is shorter.
total += Utf8Length(second, 0, to - first_length);
input = first;
to = first_length;
} else {
// We only need the left hand side.
input = first;
}
}
continue;
}
case kExternalStringTag:
case kSeqStringTag: {
// Flat two-byte content: add up the UTF-8 length of each code unit.
Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
const uc16* p = vector.start();
for (int i = from; i < to; i++) {
total += unibrow::Utf8::Length(p[i]);
}
return total;
}
case kSlicedStringTag: {
// Translate the range into the parent string and keep going.
SlicedString* str = SlicedString::cast(input);
int offset = str->offset();
input = str->parent();
from += offset;
to += offset;
continue;
}
default:
break;
}
UNREACHABLE();
return 0;
}
return 0;
}
void Relocatable::PostGarbageCollectionProcessing() {
Isolate* isolate = Isolate::Current();
Relocatable* current = isolate->relocatable_top();
@ -6847,8 +6784,10 @@ static inline bool CompareStringContents(IteratorA* ia, IteratorB* ib) {
// General slow case check. We know that the ia and ib iterators
// have the same length.
while (ia->has_more()) {
uc32 ca = ia->GetNext();
uc32 cb = ib->GetNext();
uint32_t ca = ia->GetNext();
uint32_t cb = ib->GetNext();
ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);
ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);
if (ca != cb)
return false;
}
@ -7031,8 +6970,14 @@ bool String::IsEqualTo(Vector<const char> str) {
decoder->Reset(str.start(), str.length());
int i;
for (i = 0; i < slen && decoder->has_more(); i++) {
uc32 r = decoder->GetNext();
if (Get(i) != r) return false;
uint32_t r = decoder->GetNext();
if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
if (i > slen - 1) return false;
if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;
} else {
if (Get(i) != r) return false;
}
}
return i == slen && !decoder->has_more();
}
@ -7162,6 +7107,22 @@ uint32_t StringHasher::MakeArrayIndexHash(uint32_t value, int length) {
}
// Feeds |c| to the hasher as two UTF-16 code units: the lead surrogate
// computed from |c| first, then the trail surrogate.
void StringHasher::AddSurrogatePair(uc32 c) {
  const uint16_t lead_unit = unibrow::Utf16::LeadSurrogate(c);
  const uint16_t trail_unit = unibrow::Utf16::TrailSurrogate(c);
  AddCharacter(lead_unit);
  AddCharacter(trail_unit);
}
// Same as AddSurrogatePair, but goes through AddCharacterNoIndex: the lead
// surrogate computed from |c| is added first, then the trail surrogate.
void StringHasher::AddSurrogatePairNoIndex(uc32 c) {
  const uint16_t lead_unit = unibrow::Utf16::LeadSurrogate(c);
  const uint16_t trail_unit = unibrow::Utf16::TrailSurrogate(c);
  AddCharacterNoIndex(lead_unit);
  AddCharacterNoIndex(trail_unit);
}
uint32_t StringHasher::GetHashField() {
ASSERT(is_valid());
if (length_ <= String::kMaxHashCalcLength) {
@ -10655,7 +10616,7 @@ class Utf8SymbolKey : public HashTableKey {
if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
unibrow::Utf8InputBuffer<> buffer(string_.start(),
static_cast<unsigned>(string_.length()));
chars_ = buffer.Length();
chars_ = buffer.Utf16Length();
hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);
uint32_t result = hash_field_ >> String::kHashShift;
ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.

View File

@ -6616,12 +6616,17 @@ class StringHasher {
inline bool has_trivial_hash();
// Add a character to the hash and update the array index calculation.
inline void AddCharacter(uc32 c);
inline void AddCharacter(uint32_t c);
// Adds a character to the hash but does not update the array index
// calculation. This can only be called when it has been verified
// that the input is not an array index.
inline void AddCharacterNoIndex(uc32 c);
inline void AddCharacterNoIndex(uint32_t c);
// Add a character above 0xffff as a surrogate pair. These can get into
// the hasher through the routines that take a UTF-8 string and make a symbol.
void AddSurrogatePair(uc32 c);
void AddSurrogatePairNoIndex(uc32 c);
// Returns the value to store in the hash field of a string with
// the given length and contents.
@ -6871,9 +6876,6 @@ class String: public HeapObject {
RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
int* length_output = 0);
inline int Utf8Length() { return Utf8Length(this, 0, length()); }
static int Utf8Length(String* input, int from, int to);
// Return a 16 bit Unicode representation of the string.
// The string should be nearly flat, otherwise the performance of
// this method may be very bad. Setting robustness_flag to
@ -6939,7 +6941,7 @@ class String: public HeapObject {
// Max ASCII char code.
static const int kMaxAsciiCharCode = unibrow::Utf8::kMaxOneByteChar;
static const unsigned kMaxAsciiCharCodeU = unibrow::Utf8::kMaxOneByteChar;
static const int kMaxUC16CharCode = 0xffff;
static const int kMaxUtf16CodeUnit = 0xffff;
// Mask constant for checking if a string has a computed hash code
// and if it is an array index. The least significant bit indicates

View File

@ -258,7 +258,7 @@ Handle<String> Parser::LookupSymbol(int symbol_id) {
scanner().literal_ascii_string());
} else {
return isolate()->factory()->LookupTwoByteSymbol(
scanner().literal_uc16_string());
scanner().literal_utf16_string());
}
}
return LookupCachedSymbol(symbol_id);
@ -279,7 +279,7 @@ Handle<String> Parser::LookupCachedSymbol(int symbol_id) {
scanner().literal_ascii_string());
} else {
result = isolate()->factory()->LookupTwoByteSymbol(
scanner().literal_uc16_string());
scanner().literal_utf16_string());
}
symbol_cache_.at(symbol_id) = result;
return result;
@ -576,12 +576,12 @@ FunctionLiteral* Parser::ParseProgram(CompilationInfo* info) {
// Notice that the stream is destroyed at the end of the branch block.
// The last line of the blocks can't be moved outside, even though they're
// identical calls.
ExternalTwoByteStringUC16CharacterStream stream(
ExternalTwoByteStringUtf16CharacterStream stream(
Handle<ExternalTwoByteString>::cast(source), 0, source->length());
scanner_.Initialize(&stream);
return DoParseProgram(info, source, &zone_scope);
} else {
GenericStringUC16CharacterStream stream(source, 0, source->length());
GenericStringUtf16CharacterStream stream(source, 0, source->length());
scanner_.Initialize(&stream);
return DoParseProgram(info, source, &zone_scope);
}
@ -665,16 +665,16 @@ FunctionLiteral* Parser::ParseLazy(CompilationInfo* info) {
// Initialize parser state.
source->TryFlatten();
if (source->IsExternalTwoByteString()) {
ExternalTwoByteStringUC16CharacterStream stream(
ExternalTwoByteStringUtf16CharacterStream stream(
Handle<ExternalTwoByteString>::cast(source),
shared_info->start_position(),
shared_info->end_position());
FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
return result;
} else {
GenericStringUC16CharacterStream stream(source,
shared_info->start_position(),
shared_info->end_position());
GenericStringUtf16CharacterStream stream(source,
shared_info->start_position(),
shared_info->end_position());
FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
return result;
}
@ -682,7 +682,7 @@ FunctionLiteral* Parser::ParseLazy(CompilationInfo* info) {
FunctionLiteral* Parser::ParseLazy(CompilationInfo* info,
UC16CharacterStream* source,
Utf16CharacterStream* source,
ZoneScope* zone_scope) {
Handle<SharedFunctionInfo> shared_info = info->shared_info();
scanner_.Initialize(source);
@ -4285,7 +4285,7 @@ class SingletonLogger : public ParserRecorder {
// Logs a symbol creation of a literal or identifier.
virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
// Logs an error message and marks the log as containing an error.
// Further logging will be ignored, and ExtractData will return a vector
@ -5874,7 +5874,7 @@ int ScriptDataImpl::ReadNumber(byte** source) {
// Create a Scanner for the preparser to use as input, and preparse the source.
static ScriptDataImpl* DoPreParse(UC16CharacterStream* source,
static ScriptDataImpl* DoPreParse(Utf16CharacterStream* source,
int flags,
ParserRecorder* recorder) {
Isolate* isolate = Isolate::Current();
@ -5915,17 +5915,17 @@ ScriptDataImpl* ParserApi::PartialPreParse(Handle<String> source,
PartialParserRecorder recorder;
int source_length = source->length();
if (source->IsExternalTwoByteString()) {
ExternalTwoByteStringUC16CharacterStream stream(
ExternalTwoByteStringUtf16CharacterStream stream(
Handle<ExternalTwoByteString>::cast(source), 0, source_length);
return DoPreParse(&stream, flags, &recorder);
} else {
GenericStringUC16CharacterStream stream(source, 0, source_length);
GenericStringUtf16CharacterStream stream(source, 0, source_length);
return DoPreParse(&stream, flags, &recorder);
}
}
ScriptDataImpl* ParserApi::PreParse(UC16CharacterStream* source,
ScriptDataImpl* ParserApi::PreParse(Utf16CharacterStream* source,
v8::Extension* extension,
int flags) {
Handle<Script> no_script;

View File

@ -172,7 +172,7 @@ class ParserApi {
static bool Parse(CompilationInfo* info, int flags);
// Generic preparser generating full preparse data.
static ScriptDataImpl* PreParse(UC16CharacterStream* source,
static ScriptDataImpl* PreParse(Utf16CharacterStream* source,
v8::Extension* extension,
int flags);
@ -542,7 +542,7 @@ class Parser {
FunctionLiteral* ParseLazy(CompilationInfo* info,
UC16CharacterStream* source,
Utf16CharacterStream* source,
ZoneScope* zone_scope);
Isolate* isolate() { return isolate_; }
@ -712,7 +712,7 @@ class Parser {
scanner().literal_ascii_string(), tenured);
} else {
return isolate_->factory()->NewStringFromTwoByte(
scanner().literal_uc16_string(), tenured);
scanner().literal_utf16_string(), tenured);
}
}
@ -722,7 +722,7 @@ class Parser {
scanner().next_literal_ascii_string(), tenured);
} else {
return isolate_->factory()->NewStringFromTwoByte(
scanner().next_literal_uc16_string(), tenured);
scanner().next_literal_utf16_string(), tenured);
}
}

View File

@ -53,7 +53,7 @@ class ParserRecorder {
// Logs a symbol creation of a literal or identifier.
virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
// Logs an error message and marks the log as containing an error.
// Further logging will be ignored, and ExtractData will return a vector
@ -149,7 +149,7 @@ class PartialParserRecorder : public FunctionLoggingParserRecorder {
public:
PartialParserRecorder() : FunctionLoggingParserRecorder() { }
virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
virtual ~PartialParserRecorder() { }
virtual Vector<unsigned> ExtractData();
virtual int symbol_position() { return 0; }
@ -171,7 +171,7 @@ class CompleteParserRecorder: public FunctionLoggingParserRecorder {
LogSymbol(start, hash, true, Vector<const byte>::cast(literal));
}
virtual void LogUC16Symbol(int start, Vector<const uc16> literal) {
virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) {
if (!is_recording_) return;
int hash = vector_hash(literal);
LogSymbol(start, hash, false, Vector<const byte>::cast(literal));

View File

@ -46,10 +46,10 @@ namespace v8 {
namespace internal {
// UTF16Buffer based on a v8::UnicodeInputStream.
class InputStreamUTF16Buffer : public UC16CharacterStream {
class InputStreamUtf16Buffer : public Utf16CharacterStream {
public:
/* The InputStreamUTF16Buffer maintains an internal buffer
* that is filled in chunks from the UC16CharacterStream.
/* The InputStreamUtf16Buffer maintains an internal buffer
* that is filled in chunks from the Utf16CharacterStream.
* It also maintains unlimited pushback capability, but optimized
* for small pushbacks.
* The pushback_buffer_ pointer points to the limit of pushbacks
@ -60,8 +60,8 @@ class InputStreamUTF16Buffer : public UC16CharacterStream {
* new buffer. When this buffer is read to the end again, the cursor is
* switched back to the internal buffer
*/
explicit InputStreamUTF16Buffer(v8::UnicodeInputStream* stream)
: UC16CharacterStream(),
explicit InputStreamUtf16Buffer(v8::UnicodeInputStream* stream)
: Utf16CharacterStream(),
stream_(stream),
pushback_buffer_(buffer_),
pushback_buffer_end_cache_(NULL),
@ -70,7 +70,7 @@ class InputStreamUTF16Buffer : public UC16CharacterStream {
buffer_cursor_ = buffer_end_ = buffer_ + kPushBackSize;
}
virtual ~InputStreamUTF16Buffer() {
virtual ~InputStreamUtf16Buffer() {
if (pushback_buffer_backing_ != NULL) {
DeleteArray(pushback_buffer_backing_);
}
@ -127,12 +127,18 @@ class InputStreamUTF16Buffer : public UC16CharacterStream {
uc16* buffer_start = buffer_ + kPushBackSize;
buffer_cursor_ = buffer_end_ = buffer_start;
while ((value = stream_->Next()) >= 0) {
if (value > static_cast<int32_t>(unibrow::Utf8::kMaxThreeByteChar)) {
value = unibrow::Utf8::kBadChar;
if (value >
static_cast<int32_t>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
buffer_start[buffer_end_++ - buffer_start] =
unibrow::Utf16::LeadSurrogate(value);
buffer_start[buffer_end_++ - buffer_start] =
unibrow::Utf16::TrailSurrogate(value);
} else {
// buffer_end_ is a const pointer, but buffer_ is writable.
buffer_start[buffer_end_++ - buffer_start] = static_cast<uc16>(value);
}
// buffer_end_ is a const pointer, but buffer_ is writable.
buffer_start[buffer_end_++ - buffer_start] = static_cast<uc16>(value);
if (buffer_end_ == buffer_ + kPushBackSize + kBufferSize) break;
// Stop one before the end of the buffer in case we get a surrogate pair.
if (buffer_end_ <= buffer_ + 1 + kPushBackSize + kBufferSize) break;
}
return buffer_end_ > buffer_start;
}
@ -179,7 +185,7 @@ UnicodeInputStream::~UnicodeInputStream() { }
PreParserData Preparse(UnicodeInputStream* input, size_t max_stack) {
internal::InputStreamUTF16Buffer buffer(input);
internal::InputStreamUtf16Buffer buffer(input);
uintptr_t stack_limit = reinterpret_cast<uintptr_t>(&buffer) - max_stack;
internal::UnicodeCache unicode_cache;
internal::Scanner scanner(&unicode_cache);

View File

@ -1214,7 +1214,7 @@ void PreParser::CheckDuplicate(DuplicateFinder* finder,
old_type = finder->AddAsciiSymbol(scanner_->literal_ascii_string(),
type);
} else {
old_type = finder->AddUC16Symbol(scanner_->literal_uc16_string(), type);
old_type = finder->AddUtf16Symbol(scanner_->literal_utf16_string(), type);
}
if (HasConflict(old_type, type)) {
if (IsDataDataConflict(old_type, type)) {
@ -1387,7 +1387,7 @@ PreParser::Expression PreParser::ParseFunctionLiteral(bool* ok) {
duplicate_finder.AddAsciiSymbol(scanner_->literal_ascii_string(), 1);
} else {
prev_value =
duplicate_finder.AddUC16Symbol(scanner_->literal_uc16_string(), 1);
duplicate_finder.AddUtf16Symbol(scanner_->literal_utf16_string(), 1);
}
if (prev_value != 0) {
@ -1485,7 +1485,7 @@ void PreParser::LogSymbol() {
if (scanner_->is_literal_ascii()) {
log_->LogAsciiSymbol(identifier_pos, scanner_->literal_ascii_string());
} else {
log_->LogUC16Symbol(identifier_pos, scanner_->literal_uc16_string());
log_->LogUtf16Symbol(identifier_pos, scanner_->literal_utf16_string());
}
}
@ -1657,7 +1657,7 @@ int DuplicateFinder::AddAsciiSymbol(i::Vector<const char> key, int value) {
return AddSymbol(i::Vector<const byte>::cast(key), true, value);
}
int DuplicateFinder::AddUC16Symbol(i::Vector<const uint16_t> key, int value) {
int DuplicateFinder::AddUtf16Symbol(i::Vector<const uint16_t> key, int value) {
return AddSymbol(i::Vector<const byte>::cast(key), false, value);
}

View File

@ -65,7 +65,7 @@ class DuplicateFinder {
map_(&Match) { }
int AddAsciiSymbol(i::Vector<const char> key, int value);
int AddUC16Symbol(i::Vector<const uint16_t> key, int value);
int AddUtf16Symbol(i::Vector<const uint16_t> key, int value);
// Add a number literal by converting it (if necessary)
// to the string that ToString(ToNumber(literal)) would generate.
// and then adding that string with AddAsciiSymbol.

View File

@ -36,19 +36,19 @@ namespace v8 {
namespace internal {
// ----------------------------------------------------------------------------
// BufferedUC16CharacterStreams
// BufferedUtf16CharacterStreams
BufferedUC16CharacterStream::BufferedUC16CharacterStream()
: UC16CharacterStream(),
BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
: Utf16CharacterStream(),
pushback_limit_(NULL) {
// Initialize buffer as being empty. First read will fill the buffer.
buffer_cursor_ = buffer_;
buffer_end_ = buffer_;
}
BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }
BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
void BufferedUC16CharacterStream::PushBack(uc32 character) {
void BufferedUtf16CharacterStream::PushBack(uc32 character) {
if (character == kEndOfInput) {
pos_--;
return;
@ -63,7 +63,7 @@ void BufferedUC16CharacterStream::PushBack(uc32 character) {
}
void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
// In pushback mode, the end of the buffer contains pushback,
// and the start of the buffer (from buffer start to pushback_limit_)
// contains valid data that comes just after the pushback.
@ -89,7 +89,7 @@ void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
}
bool BufferedUC16CharacterStream::ReadBlock() {
bool BufferedUtf16CharacterStream::ReadBlock() {
buffer_cursor_ = buffer_;
if (pushback_limit_ != NULL) {
// Leave pushback mode.
@ -106,7 +106,7 @@ bool BufferedUC16CharacterStream::ReadBlock() {
}
unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
// Leave pushback mode (i.e., ignore that there might be valid data
// in the buffer before the pushback_limit_ point).
pushback_limit_ = NULL;
@ -114,10 +114,10 @@ unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
}
// ----------------------------------------------------------------------------
// GenericStringUC16CharacterStream
// GenericStringUtf16CharacterStream
GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
Handle<String> data,
unsigned start_position,
unsigned end_position)
@ -130,10 +130,10 @@ GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
}
GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }
GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
unsigned old_pos = pos_;
pos_ = Min(pos_ + delta, length_);
ReadBlock();
@ -141,7 +141,7 @@ unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
}
unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos,
unsigned length) {
if (from_pos >= length_) return 0;
if (from_pos + length > length_) {
@ -153,10 +153,10 @@ unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
// ----------------------------------------------------------------------------
// Utf8ToUC16CharacterStream
Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
unsigned length)
: BufferedUC16CharacterStream(),
// Utf8ToUtf16CharacterStream
Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
unsigned length)
: BufferedUtf16CharacterStream(),
raw_data_(data),
raw_data_length_(length),
raw_data_pos_(0),
@ -165,10 +165,10 @@ Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
}
Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }
Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
unsigned old_pos = pos_;
unsigned target_pos = pos_ + delta;
SetRawPosition(target_pos);
@ -178,9 +178,9 @@ unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
}
unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
unsigned length) {
static const unibrow::uchar kMaxUC16Character = 0xffff;
unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
unsigned length) {
static const unibrow::uchar kMaxUtf16Character = 0xffff;
SetRawPosition(char_position);
if (raw_character_position_ != char_position) {
// char_position was not a valid position in the stream (hit the end
@ -188,7 +188,7 @@ unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
return 0u;
}
unsigned i = 0;
while (i < length) {
while (i < length - 1) {
if (raw_data_pos_ == raw_data_length_) break;
unibrow::uchar c = raw_data_[raw_data_pos_];
if (c <= unibrow::Utf8::kMaxOneByteChar) {
@ -197,12 +197,13 @@ unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
raw_data_length_ - raw_data_pos_,
&raw_data_pos_);
// Don't allow characters outside of the BMP.
if (c > kMaxUC16Character) {
c = unibrow::Utf8::kBadChar;
}
}
buffer_[i++] = static_cast<uc16>(c);
if (c > kMaxUtf16Character) {
buffer_[i++] = unibrow::Utf16::LeadSurrogate(c);
buffer_[i++] = unibrow::Utf16::TrailSurrogate(c);
} else {
buffer_[i++] = static_cast<uc16>(c);
}
}
raw_character_position_ = char_position + i;
return i;
@ -266,37 +267,52 @@ static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
}
void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {
// This can't set a raw position between the two code units of a surrogate
// pair, since there is no position in the UTF8 stream that corresponds to
// that. This assumes that the surrogate pair is correctly coded as a 4 byte
// UTF-8 sequence. If it is illegally coded as two 3 byte sequences then
// there is no problem here.
void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
if (raw_character_position_ > target_position) {
// Spool backwards in utf8 buffer.
do {
int old_pos = raw_data_pos_;
Utf8CharacterBack(raw_data_, &raw_data_pos_);
raw_character_position_--;
ASSERT(old_pos - raw_data_pos_ <= 4);
// Step back over both code units for surrogate pairs.
if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
} while (raw_character_position_ > target_position);
// No surrogate pair splitting.
ASSERT(raw_character_position_ == target_position);
return;
}
// Spool forwards in the utf8 buffer.
while (raw_character_position_ < target_position) {
if (raw_data_pos_ == raw_data_length_) return;
int old_pos = raw_data_pos_;
Utf8CharacterForward(raw_data_, &raw_data_pos_);
raw_character_position_++;
ASSERT(raw_data_pos_ - old_pos <= 4);
if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
}
// No surrogate pair splitting.
ASSERT(raw_character_position_ == target_position);
}
// ----------------------------------------------------------------------------
// ExternalTwoByteStringUC16CharacterStream
// ExternalTwoByteStringUtf16CharacterStream
ExternalTwoByteStringUC16CharacterStream::
~ExternalTwoByteStringUC16CharacterStream() { }
ExternalTwoByteStringUtf16CharacterStream::
~ExternalTwoByteStringUtf16CharacterStream() { }
ExternalTwoByteStringUC16CharacterStream
::ExternalTwoByteStringUC16CharacterStream(
ExternalTwoByteStringUtf16CharacterStream
::ExternalTwoByteStringUtf16CharacterStream(
Handle<ExternalTwoByteString> data,
int start_position,
int end_position)
: UC16CharacterStream(),
: Utf16CharacterStream(),
source_(data),
raw_data_(data->GetTwoByteData(start_position)) {
buffer_cursor_ = raw_data_,

View File

@ -36,10 +36,10 @@ namespace internal {
// A buffered character stream based on a random access character
// source (ReadBlock can be called with pos_ pointing to any position,
// even positions before the current).
class BufferedUC16CharacterStream: public UC16CharacterStream {
class BufferedUtf16CharacterStream: public Utf16CharacterStream {
public:
BufferedUC16CharacterStream();
virtual ~BufferedUC16CharacterStream();
BufferedUtf16CharacterStream();
virtual ~BufferedUtf16CharacterStream();
virtual void PushBack(uc32 character);
@ -60,12 +60,12 @@ class BufferedUC16CharacterStream: public UC16CharacterStream {
// Generic string stream.
class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
class GenericStringUtf16CharacterStream: public BufferedUtf16CharacterStream {
public:
GenericStringUC16CharacterStream(Handle<String> data,
unsigned start_position,
unsigned end_position);
virtual ~GenericStringUC16CharacterStream();
GenericStringUtf16CharacterStream(Handle<String> data,
unsigned start_position,
unsigned end_position);
virtual ~GenericStringUtf16CharacterStream();
protected:
virtual unsigned BufferSeekForward(unsigned delta);
@ -77,11 +77,11 @@ class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
};
// UC16 stream based on a literal UTF-8 string.
class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
// Utf16 stream based on a literal UTF-8 string.
class Utf8ToUtf16CharacterStream: public BufferedUtf16CharacterStream {
public:
Utf8ToUC16CharacterStream(const byte* data, unsigned length);
virtual ~Utf8ToUC16CharacterStream();
Utf8ToUtf16CharacterStream(const byte* data, unsigned length);
virtual ~Utf8ToUtf16CharacterStream();
protected:
virtual unsigned BufferSeekForward(unsigned delta);
@ -98,12 +98,12 @@ class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
// UTF16 buffer to read characters from an external string.
class ExternalTwoByteStringUC16CharacterStream: public UC16CharacterStream {
class ExternalTwoByteStringUtf16CharacterStream: public Utf16CharacterStream {
public:
ExternalTwoByteStringUC16CharacterStream(Handle<ExternalTwoByteString> data,
int start_position,
int end_position);
virtual ~ExternalTwoByteStringUC16CharacterStream();
ExternalTwoByteStringUtf16CharacterStream(Handle<ExternalTwoByteString> data,
int start_position,
int end_position);
virtual ~ExternalTwoByteStringUtf16CharacterStream();
virtual void PushBack(uc32 character) {
ASSERT(buffer_cursor_ > raw_data_);

View File

@ -45,7 +45,7 @@ Scanner::Scanner(UnicodeCache* unicode_cache)
harmony_modules_(false) { }
void Scanner::Initialize(UC16CharacterStream* source) {
void Scanner::Initialize(Utf16CharacterStream* source) {
source_ = source;
// Need to capture identifiers in order to recognize "get" and "set"
// in object literals.

View File

@ -73,15 +73,17 @@ inline int HexValue(uc32 c) {
// ---------------------------------------------------------------------
// Buffered stream of characters, using an internal UC16 buffer.
// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
// A code unit is a 16 bit value representing either a 16 bit code point
// or one part of a surrogate pair that makes up a single 21 bit code point.
class UC16CharacterStream {
class Utf16CharacterStream {
public:
UC16CharacterStream() : pos_(0) { }
virtual ~UC16CharacterStream() { }
Utf16CharacterStream() : pos_(0) { }
virtual ~Utf16CharacterStream() { }
// Returns and advances past the next UC16 character in the input
// stream. If there are no more characters, it returns a negative
// Returns and advances past the next UTF-16 code unit in the input
// stream. If there are no more code units, it returns a negative
// value.
inline uc32 Advance() {
if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
@ -90,47 +92,47 @@ class UC16CharacterStream {
}
// Note: currently the following increment is necessary to avoid a
// parser problem! The scanner treats the final kEndOfInput as
// a character with a position, and does math relative to that
// a code unit with a position, and does math relative to that
// position.
pos_++;
return kEndOfInput;
}
// Return the current position in the character stream.
// Return the current position in the code unit stream.
// Starts at zero.
inline unsigned pos() const { return pos_; }
// Skips forward past the next character_count UC16 characters
// Skips forward past the next code_unit_count UTF-16 code units
// in the input, or until the end of input if that comes sooner.
// Returns the number of characters actually skipped. If less
// than character_count,
inline unsigned SeekForward(unsigned character_count) {
// Returns the number of code units actually skipped. If less
// than code_unit_count, the end of the input was reached.
inline unsigned SeekForward(unsigned code_unit_count) {
unsigned buffered_chars =
static_cast<unsigned>(buffer_end_ - buffer_cursor_);
if (character_count <= buffered_chars) {
buffer_cursor_ += character_count;
pos_ += character_count;
return character_count;
if (code_unit_count <= buffered_chars) {
buffer_cursor_ += code_unit_count;
pos_ += code_unit_count;
return code_unit_count;
}
return SlowSeekForward(character_count);
return SlowSeekForward(code_unit_count);
}
// Pushes back the most recently read UC16 character (or negative
// Pushes back the most recently read UTF-16 code unit (or negative
// value if at end of input), i.e., the value returned by the most recent
// call to Advance.
// Must not be used right after calling SeekForward.
virtual void PushBack(int32_t character) = 0;
virtual void PushBack(int32_t code_unit) = 0;
protected:
static const uc32 kEndOfInput = -1;
// Ensures that the buffer_cursor_ points to the character at
// Ensures that the buffer_cursor_ points to the code_unit at
// position pos_ of the input, if possible. If the position
// is at or after the end of the input, return false. If there
// are more characters available, return true.
// are more code_units available, return true.
virtual bool ReadBlock() = 0;
virtual unsigned SlowSeekForward(unsigned character_count) = 0;
virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;
const uc16* buffer_cursor_;
const uc16* buffer_end_;
@ -178,23 +180,24 @@ class LiteralBuffer {
}
}
INLINE(void AddChar(uc16 character)) {
INLINE(void AddChar(uint32_t code_unit)) {
if (position_ >= backing_store_.length()) ExpandBuffer();
if (is_ascii_) {
if (character < kMaxAsciiCharCodeU) {
backing_store_[position_] = static_cast<byte>(character);
if (code_unit < kMaxAsciiCharCodeU) {
backing_store_[position_] = static_cast<byte>(code_unit);
position_ += kASCIISize;
return;
}
ConvertToUC16();
ConvertToUtf16();
}
*reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
ASSERT(code_unit < 0x10000u);
*reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;
position_ += kUC16Size;
}
bool is_ascii() { return is_ascii_; }
Vector<const uc16> uc16_literal() {
Vector<const uc16> utf16_literal() {
ASSERT(!is_ascii_);
ASSERT((position_ & 0x1) == 0);
return Vector<const uc16>(
@ -236,13 +239,13 @@ class LiteralBuffer {
backing_store_ = new_store;
}
void ConvertToUC16() {
void ConvertToUtf16() {
ASSERT(is_ascii_);
Vector<byte> new_store;
int new_content_size = position_ * kUC16Size;
if (new_content_size >= backing_store_.length()) {
// Ensure room for all currently read characters as UC16 as well
// as the character about to be stored.
// Ensure room for all currently read code units as UC16 as well
// as the code unit about to be stored.
new_store = Vector<byte>::New(NewCapacity(new_content_size));
} else {
new_store = backing_store_;
@ -316,7 +319,7 @@ class Scanner {
explicit Scanner(UnicodeCache* scanner_contants);
void Initialize(UC16CharacterStream* source);
void Initialize(Utf16CharacterStream* source);
// Returns the next token and advances input.
Token::Value Next();
@ -335,9 +338,9 @@ class Scanner {
ASSERT_NOT_NULL(current_.literal_chars);
return current_.literal_chars->ascii_literal();
}
Vector<const uc16> literal_uc16_string() {
Vector<const uc16> literal_utf16_string() {
ASSERT_NOT_NULL(current_.literal_chars);
return current_.literal_chars->uc16_literal();
return current_.literal_chars->utf16_literal();
}
bool is_literal_ascii() {
ASSERT_NOT_NULL(current_.literal_chars);
@ -371,9 +374,9 @@ class Scanner {
ASSERT_NOT_NULL(next_.literal_chars);
return next_.literal_chars->ascii_literal();
}
Vector<const uc16> next_literal_uc16_string() {
Vector<const uc16> next_literal_utf16_string() {
ASSERT_NOT_NULL(next_.literal_chars);
return next_.literal_chars->uc16_literal();
return next_.literal_chars->utf16_literal();
}
bool is_next_literal_ascii() {
ASSERT_NOT_NULL(next_.literal_chars);
@ -542,8 +545,8 @@ class Scanner {
TokenDesc current_; // desc for current token (as returned by Next())
TokenDesc next_; // desc for next token (one token look-ahead)
// Input stream. Must be initialized to an UC16CharacterStream.
UC16CharacterStream* source_;
// Input stream. Must be initialized to a Utf16CharacterStream.
Utf16CharacterStream* source_;
// Start position of the octal literal last scanned.

View File

@ -78,7 +78,7 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
}
unsigned Utf8::Encode(char* str, uchar c) {
unsigned Utf8::Encode(char* str, uchar c, int previous) {
static const int kMask = ~(1 << 6);
if (c <= kMaxOneByteChar) {
str[0] = c;
@ -88,6 +88,13 @@ unsigned Utf8::Encode(char* str, uchar c) {
str[1] = 0x80 | (c & kMask);
return 2;
} else if (c <= kMaxThreeByteChar) {
if (Utf16::IsTrailSurrogate(c) &&
Utf16::IsLeadSurrogate(previous)) {
const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
return Encode(str - kUnmatchedSize,
Utf16::CombineSurrogatePair(previous, c),
Utf16::kNoPreviousCharacter) - kUnmatchedSize;
}
str[0] = 0xE0 | (c >> 12);
str[1] = 0x80 | ((c >> 6) & kMask);
str[2] = 0x80 | (c & kMask);
@ -113,12 +120,16 @@ uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
return CalculateValue(bytes, length, cursor);
}
unsigned Utf8::Length(uchar c) {
unsigned Utf8::Length(uchar c, int previous) {
if (c <= kMaxOneByteChar) {
return 1;
} else if (c <= kMaxTwoByteChar) {
return 2;
} else if (c <= kMaxThreeByteChar) {
if (Utf16::IsTrailSurrogate(c) &&
Utf16::IsLeadSurrogate(previous)) {
return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
}
return 3;
} else {
return 4;

View File

@ -276,6 +276,7 @@ uchar Utf8::CalculateValue(const byte* str,
return kBadChar;
}
const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,
unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {
unsigned offset = *offset_ptr;
@ -338,6 +339,16 @@ unsigned CharacterStream::Length() {
return result;
}
unsigned CharacterStream::Utf16Length() {
unsigned result = 0;
while (has_more()) {
uchar c = GetNext();
result += c > Utf16::kMaxNonSurrogateCharCode ? 2 : 1;
}
Rewind();
return result;
}
void CharacterStream::Seek(unsigned position) {
Rewind();
for (unsigned i = 0; i < position; i++) {

View File

@ -100,7 +100,7 @@ class UnicodeData {
static const uchar kMaxCodePoint;
};
// --- U t f 8 ---
// --- U t f 8 a n d 16 ---
template <typename Data>
class Buffer {
@ -114,10 +114,46 @@ class Buffer {
unsigned length_;
};
class Utf16 {
public:
static inline bool IsLeadSurrogate(int32_t code) {
if (code == kNoPreviousCharacter) return false;
return (code & 0xfc00) == 0xd800;
}
static inline bool IsTrailSurrogate(int32_t code) {
if (code == kNoPreviousCharacter) return false;
return (code & 0xfc00) == 0xdc00;
}
static inline int32_t CombineSurrogatePair(uchar lead, uchar trail) {
return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
}
static const int32_t kNoPreviousCharacter = -1;
static const uchar kMaxNonSurrogateCharCode = 0xffff;
// Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
// of UTF-8 data. The special case where the unit is a surrogate
// trail produces 1 byte net, because the encoding of the pair is
// 4 bytes and the 3 bytes that were used to encode the lead surrogate
// can be reclaimed.
static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
// One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
// The illegality stems from the surrogate not being part of a pair.
static const int kUtf8BytesToCodeASurrogate = 3;
static inline uchar LeadSurrogate(int32_t char_code) {
return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
}
static inline uchar TrailSurrogate(int32_t char_code) {
return 0xdc00 + (char_code & 0x3ff);
}
};
class Utf8 {
public:
static inline uchar Length(uchar chr);
static inline unsigned Encode(char* out, uchar c);
static inline uchar Length(uchar chr, int previous);
static inline unsigned Encode(
char* out, uchar c, int previous);
static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
unsigned capacity, unsigned* chars_read, unsigned* offset);
static uchar CalculateValue(const byte* str,
@ -130,6 +166,11 @@ class Utf8 {
static const unsigned kMaxThreeByteChar = 0xffff;
static const unsigned kMaxFourByteChar = 0x1fffff;
// A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
// that match are coded as a 4 byte UTF-8 sequence.
static const unsigned kBytesSavedByCombiningSurrogates = 2;
static const unsigned kSizeOfUnmatchedSurrogate = 3;
private:
template <unsigned s> friend class Utf8InputBuffer;
friend class Test;
@ -147,6 +188,7 @@ class CharacterStream {
// Note that default implementation is not efficient.
virtual void Seek(unsigned);
unsigned Length();
unsigned Utf16Length();
virtual ~CharacterStream() { }
static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
unsigned& offset);
@ -156,6 +198,7 @@ class CharacterStream {
unsigned capacity, unsigned& offset);
static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
virtual void Rewind() = 0;
protected:
virtual void FillBuffer() = 0;
// The number of characters left in the current buffer

View File

@ -564,7 +564,7 @@ void RegExpMacroAssemblerX64::CheckNotCharacterAfterMinusAnd(
uc16 minus,
uc16 mask,
Label* on_not_equal) {
ASSERT(minus < String::kMaxUC16CharCode);
ASSERT(minus < String::kMaxUtf16CodeUnit);
__ lea(rax, Operand(current_character(), -minus));
__ and_(rax, Immediate(mask));
__ cmpl(rax, Immediate(c));

View File

@ -5526,6 +5526,17 @@ static int StrNCmp16(uint16_t* a, uint16_t* b, int n) {
}
// Returns str->Utf8Length(). If the first query yields a negative value, the
// underlying internal string is flattened and the length is queried again.
int GetUtf8Length(Handle<String> str) {
  const int first_attempt = str->Utf8Length();
  if (first_attempt >= 0) return first_attempt;

  // Negative result: flatten the string, then ask once more.
  i::Handle<i::String> internal_string(v8::Utils::OpenHandle(*str));
  i::FlattenString(internal_string);
  return str->Utf8Length();
}
}
THREADED_TEST(StringWrite) {
LocalContext context;
v8::HandleScope scope;
@ -5606,7 +5617,7 @@ THREADED_TEST(StringWrite) {
CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3));
memset(utf8buf, 0x1, sizeof(utf8buf));
len = left_tree->Utf8Length();
len = GetUtf8Length(left_tree);
int utf8_expected =
(0x80 + (0x800 - 0x80) * 2 + (0xd800 - 0x800) * 3) / kStride;
CHECK_EQ(utf8_expected, len);
@ -5620,7 +5631,7 @@ THREADED_TEST(StringWrite) {
CHECK_EQ(1, utf8buf[utf8_expected]);
memset(utf8buf, 0x1, sizeof(utf8buf));
len = right_tree->Utf8Length();
len = GetUtf8Length(right_tree);
CHECK_EQ(utf8_expected, len);
len = right_tree->WriteUtf8(utf8buf, utf8_expected, &charlen);
CHECK_EQ(utf8_expected, len);
@ -5745,6 +5756,217 @@ THREADED_TEST(StringWrite) {
}
// Checks the UTF-8 length of each string stored in a global array.
//   context       context whose global object holds both arrays.
//   name          name of the global array of strings.
//   lengths_name  name of the global array holding, at the same index, the
//                 expected UTF-8 length of each string.
//   len           number of leading array entries to check.
static void Utf16Helper(
    LocalContext& context,
    const char* name,
    const char* lengths_name,
    int len) {
  Local<v8::Array> strings =
      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
  Local<v8::Array> expected_lengths =
      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
  int index = 0;
  while (index < len) {
    Local<v8::String> current =
        Local<v8::String>::Cast(strings->Get(index));
    Local<v8::Number> expected_len =
        Local<v8::Number>::Cast(expected_lengths->Get(index));
    int length = GetUtf8Length(current);
    CHECK_EQ(static_cast<int>(expected_len->Value()), length);
    index++;
  }
}
// Returns the 16 bit value stored at position |index| of |str|, read through
// the internal string object behind the API handle.
static uint16_t StringGet(Handle<String> str, int index) {
  return v8::Utils::OpenHandle(String::Cast(*str))->Get(index);
}
// Exercises v8::String::WriteUtf8 on every string of a global array, for
// every buffer capacity from the expected UTF-8 length plus one down to zero.
//   context       context whose global object holds both arrays.
//   name          name of the global array of strings to encode.
//   lengths_name  name of the global array holding, at the same index, the
//                 expected UTF-8 length of each string.
//   len           number of leading array entries to check.
static void WriteUtf8Helper(
    LocalContext& context,
    const char* name,
    const char* lengths_name,
    int len) {
  Local<v8::Array> b =
      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
  Local<v8::Array> alens =
      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
  char buffer[1000];
  char buffer2[1000];
  for (int i = 0; i < len; i++) {
    Local<v8::String> string =
        Local<v8::String>::Cast(b->Get(i));
    Local<v8::Number> expected_len =
        Local<v8::Number>::Cast(alens->Get(i));
    int utf8_length = static_cast<int>(expected_len->Value());
    // j is the capacity handed to WriteUtf8; utf8_length + 1 leaves room for
    // the whole string plus a terminating zero byte.
    for (int j = utf8_length + 1; j >= 0; j--) {
      // Fill both buffers with the sentinel byte 42 so that untouched bytes
      // can be told apart from written ones.
      memset(reinterpret_cast<void*>(&buffer), 42, sizeof(buffer));
      memset(reinterpret_cast<void*>(&buffer2), 42, sizeof(buffer2));
      int nchars;
      // Write with the same capacity twice: with default options into buffer
      // and without null termination into buffer2. After the second call
      // nchars holds the count reported by that call.
      int utf8_written =
          string->WriteUtf8(buffer, j, &nchars, String::NO_OPTIONS);
      int utf8_written2 =
          string->WriteUtf8(buffer2, j, &nchars, String::NO_NULL_TERMINATION);
      CHECK_GE(utf8_length + 1, utf8_written);
      CHECK_GE(utf8_length, utf8_written2);
      // Both writes must agree on the bytes produced without a terminator.
      for (int k = 0; k < utf8_written2; k++) {
        CHECK_EQ(buffer[k], buffer2[k]);
      }
      CHECK(nchars * 3 >= utf8_written - 1);
      CHECK(nchars <= utf8_written);
      if (j == utf8_length + 1) {
        // Full capacity: everything fits, and the default write is exactly
        // one byte longer than the unterminated one.
        CHECK_EQ(utf8_written2, utf8_length);
        CHECK_EQ(utf8_written2 + 1, utf8_written);
      }
      // The byte just past the reported length must still be the sentinel.
      CHECK_EQ(buffer[utf8_written], 42);
      if (j > utf8_length) {
        // The output ends in a zero byte and must round-trip to the source.
        if (utf8_written != 0) CHECK_EQ(buffer[utf8_written - 1], 0);
        if (utf8_written > 1) CHECK_NE(buffer[utf8_written - 2], 42);
        Handle<String> roundtrip = v8_str(buffer);
        CHECK(roundtrip->Equals(string));
      } else {
        if (utf8_written != 0) CHECK_NE(buffer[utf8_written - 1], 42);
      }
      // The last byte reported as written to the unterminated buffer must
      // have been overwritten. (This used to re-check buffer[utf8_written - 1]
      // under the utf8_written2 guard, leaving buffer2 unchecked.)
      if (utf8_written2 != 0) CHECK_NE(buffer2[utf8_written2 - 1], 42);
      if (nchars >= 2) {
        uint16_t trail = StringGet(string, nchars - 1);
        uint16_t lead = StringGet(string, nchars - 2);
        if (((lead & 0xfc00) == 0xd800) &&
            ((trail & 0xfc00) == 0xdc00)) {
          // The last two code units written form a surrogate pair, so the
          // output must end in the 4 byte UTF-8 encoding of the combined
          // code point.
          unsigned char u1 = buffer2[utf8_written2 - 4];
          unsigned char u2 = buffer2[utf8_written2 - 3];
          unsigned char u3 = buffer2[utf8_written2 - 2];
          unsigned char u4 = buffer2[utf8_written2 - 1];
          CHECK_EQ((u1 & 0xf8), 0xf0);
          CHECK_EQ((u2 & 0xc0), 0x80);
          CHECK_EQ((u3 & 0xc0), 0x80);
          CHECK_EQ((u4 & 0xc0), 0x80);
          uint32_t c = 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
          CHECK_EQ((u4 & 0x3f), (c & 0x3f));
          CHECK_EQ((u3 & 0x3f), ((c >> 6) & 0x3f));
          CHECK_EQ((u2 & 0x3f), ((c >> 12) & 0x3f));
          CHECK_EQ((u1 & 0x3), c >> 18);
        }
      }
    }
  }
}
// Checks UTF-8 length computation and WriteUtf8 output for strings built from
// an ASCII run and lone or paired surrogates. The script below creates global
// arrays of test strings (a, b, a2, b2) together with arrays of their expected
// UTF-8 lengths (alens, a2lens); the helpers then verify each entry.
THREADED_TEST(Utf16) {
LocalContext context;
v8::HandleScope scope;
CompileRun(
// p holds the three building blocks: a 20 character ASCII string, a lead
// surrogate and a trail surrogate. plens holds the UTF-8 length of each.
"var pad = '01234567890123456789';"
"var p = [];"
"var plens = [20, 3, 3];"
"p.push('01234567890123456789');"
"var lead = 0xd800;"
"var trail = 0xdc00;"
"p.push(String.fromCharCode(0xd800));"
"p.push(String.fromCharCode(0xdc00));"
// a and b each receive the 9 concatenations p[i] + p[j]; alens receives the
// sum of the two parts' lengths.
"var a = [];"
"var b = [];"
"var alens = [];"
"for (var i = 0; i < 3; i++) {"
" p[1] = String.fromCharCode(lead++);"
" for (var j = 0; j < 3; j++) {"
" p[2] = String.fromCharCode(trail++);"
" a.push(p[i] + p[j]);"
" b.push(p[i] + p[j]);"
" alens.push(plens[i] + plens[j]);"
" }"
"}"
"alens[5] -= 2;" // Here the surrogate pairs match up.
// a2 and b2 each receive the 81 concatenations of two entries of a (resp. b).
"var a2 = [];"
"var b2 = [];"
"var a2lens = [];"
"for (var m = 0; m < 9; m++) {"
" for (var n = 0; n < 9; n++) {"
" a2.push(a[m] + a[n]);"
" b2.push(b[m] + b[n]);"
" var utf = alens[m] + alens[n];" // And here.
// The 'n's that start with 0xdc.. are 6-8
// The 'm's that end with 0xd8.. are 1, 4 and 7
" if ((m % 3) == 1 && n >= 6) utf -= 2;"
" a2lens.push(utf);"
" }"
"}");
// Length checks use the a arrays, WriteUtf8 checks use the b arrays.
Utf16Helper(context, "a", "alens", 9);
Utf16Helper(context, "a2", "a2lens", 81);
WriteUtf8Helper(context, "b", "alens", 9);
WriteUtf8Helper(context, "b2", "a2lens", 81);
}
// Reports whether two API string handles designate one and the same internal
// string, i.e. an identity test rather than a comparison of contents.
static bool SameSymbol(Handle<String> s1, Handle<String> s2) {
  // Unwrap both API handles into their internal counterparts.
  i::Handle<i::String> first = v8::Utils::OpenHandle(*s1);
  i::Handle<i::String> second = v8::Utils::OpenHandle(*s2);
  // The handles are the same symbol only if they dereference to equal values.
  if (*first != *second) return false;
  return true;
}
// Creates a symbol from each of the C strings |a| and |b| with
// v8::String::NewSymbol and CHECKs that SameSymbol reports the two results as
// the same symbol. Callers in this file pass two different UTF-8 byte
// sequences that are meant to denote the same characters.
static void SameSymbolHelper(const char* a, const char* b) {
Handle<String> symbol1 = v8::String::NewSymbol(a);
Handle<String> symbol2 = v8::String::NewSymbol(b);
CHECK(SameSymbol(symbol1, symbol2));
}
// Checks that symbol lookup treats a supplementary character the same way
// whether its UTF-8 input is the 4-byte encoding or two 3-byte encoded
// surrogates, both for symbols created through the API and for string
// literals that appear in compiled script source.
THREADED_TEST(Utf16Symbol) {
  LocalContext context;
  v8::HandleScope scope;

  // Baseline: the same ASCII text looked up twice yields the same symbol.
  Handle<String> symbol1 = v8::String::NewSymbol("abc");
  Handle<String> symbol2 = v8::String::NewSymbol("abc");
  CHECK(SameSymbol(symbol1, symbol2));

  // Each pair spells the same surrogate pair in both encodings, in both
  // argument orders, and with and without a leading ASCII character.
  SameSymbolHelper("\360\220\220\205",  // 4 byte encoding.
                   "\355\240\201\355\260\205");  // 2 3-byte surrogates.
  SameSymbolHelper("\355\240\201\355\260\206",  // 2 3-byte surrogates.
                   "\360\220\220\206");  // 4 byte encoding.
  SameSymbolHelper("x\360\220\220\205",  // 4 byte encoding.
                   "x\355\240\201\355\260\205");  // 2 3-byte surrogates.
  SameSymbolHelper("x\355\240\201\355\260\206",  // 2 3-byte surrogates.
                   "x\360\220\220\206");  // 4 byte encoding.

  // Define the same texts as script literals. The script also sanity-checks
  // the UTF-16 length and the trail surrogate of each literal; every check
  // throws the value it actually inspected so a failure is self-describing.
  // NOTE(review): whether an exception thrown here fails the test depends on
  // how CompileRun reports uncaught exceptions - confirm.
  CompileRun(
      "var sym0 = 'benedictus';"
      "var sym0b = 'S\303\270ren';"
      "var sym1 = '\355\240\201\355\260\207';"
      "var sym2 = '\360\220\220\210';"
      "var sym3 = 'x\355\240\201\355\260\207';"
      "var sym4 = 'x\360\220\220\210';"
      "if (sym1.length != 2) throw sym1;"
      "if (sym1.charCodeAt(1) != 0xdc07) throw sym1.charCodeAt(1);"
      "if (sym2.length != 2) throw sym2;"
      "if (sym2.charCodeAt(1) != 0xdc08) throw sym2.charCodeAt(1);"
      "if (sym3.length != 3) throw sym3;"
      "if (sym3.charCodeAt(2) != 0xdc07) throw sym3.charCodeAt(2);"
      "if (sym4.length != 3) throw sym4;"
      "if (sym4.charCodeAt(2) != 0xdc08) throw sym4.charCodeAt(2);");

  // Create the same texts as symbols through the API.
  Handle<String> sym0 = v8::String::NewSymbol("benedictus");
  Handle<String> sym0b = v8::String::NewSymbol("S\303\270ren");
  Handle<String> sym1 = v8::String::NewSymbol("\355\240\201\355\260\207");
  Handle<String> sym2 = v8::String::NewSymbol("\360\220\220\210");
  Handle<String> sym3 = v8::String::NewSymbol("x\355\240\201\355\260\207");
  Handle<String> sym4 = v8::String::NewSymbol("x\360\220\220\210");

  // Read back the strings the script stored in its global variables.
  v8::Local<v8::Object> global = context->Global();
  Local<Value> s0 = global->Get(v8_str("sym0"));
  Local<Value> s0b = global->Get(v8_str("sym0b"));
  Local<Value> s1 = global->Get(v8_str("sym1"));
  Local<Value> s2 = global->Get(v8_str("sym2"));
  Local<Value> s3 = global->Get(v8_str("sym3"));
  Local<Value> s4 = global->Get(v8_str("sym4"));

  // The script literal and the API symbol must be the very same string.
  CHECK(SameSymbol(sym0, Handle<String>(String::Cast(*s0))));
  CHECK(SameSymbol(sym0b, Handle<String>(String::Cast(*s0b))));
  CHECK(SameSymbol(sym1, Handle<String>(String::Cast(*s1))));
  CHECK(SameSymbol(sym2, Handle<String>(String::Cast(*s2))));
  CHECK(SameSymbol(sym3, Handle<String>(String::Cast(*s3))));
  CHECK(SameSymbol(sym4, Handle<String>(String::Cast(*s4))));
}
THREADED_TEST(ToArrayIndex) {
v8::HandleScope scope;
LocalContext context;

View File

@ -63,7 +63,7 @@ TEST(ScanKeywords) {
int length = i::StrLength(key_token.keyword);
CHECK(static_cast<int>(sizeof(buffer)) >= length);
{
i::Utf8ToUC16CharacterStream stream(keyword, length);
i::Utf8ToUtf16CharacterStream stream(keyword, length);
i::Scanner scanner(&unicode_cache);
// The scanner should parse Harmony keywords for this test.
scanner.SetHarmonyScoping(true);
@ -74,7 +74,7 @@ TEST(ScanKeywords) {
}
// Removing characters will make keyword matching fail.
{
i::Utf8ToUC16CharacterStream stream(keyword, length - 1);
i::Utf8ToUtf16CharacterStream stream(keyword, length - 1);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
@ -85,7 +85,7 @@ TEST(ScanKeywords) {
for (int j = 0; j < static_cast<int>(ARRAY_SIZE(chars_to_append)); ++j) {
memmove(buffer, keyword, length);
buffer[length] = chars_to_append[j];
i::Utf8ToUC16CharacterStream stream(buffer, length + 1);
i::Utf8ToUtf16CharacterStream stream(buffer, length + 1);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
@ -95,7 +95,7 @@ TEST(ScanKeywords) {
{
memmove(buffer, keyword, length);
buffer[length - 1] = '_';
i::Utf8ToUC16CharacterStream stream(buffer, length);
i::Utf8ToUtf16CharacterStream stream(buffer, length);
i::Scanner scanner(&unicode_cache);
scanner.Initialize(&stream);
CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
@ -255,7 +255,7 @@ TEST(StandAlonePreParser) {
uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
for (int i = 0; programs[i]; i++) {
const char* program = programs[i];
i::Utf8ToUC16CharacterStream stream(
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(program),
static_cast<unsigned>(strlen(program)));
i::CompleteParserRecorder log;
@ -291,7 +291,7 @@ TEST(StandAlonePreParserNoNatives) {
uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
for (int i = 0; programs[i]; i++) {
const char* program = programs[i];
i::Utf8ToUC16CharacterStream stream(
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(program),
static_cast<unsigned>(strlen(program)));
i::CompleteParserRecorder log;
@ -326,8 +326,9 @@ TEST(RegressChromium62639) {
// and then used the invalid currently scanned literal. This always
// failed in debug mode, and sometimes crashed in release mode.
i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(program),
static_cast<unsigned>(strlen(program)));
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(program),
static_cast<unsigned>(strlen(program)));
i::ScriptDataImpl* data =
i::ParserApi::PreParse(&stream, NULL, false);
CHECK(data->HasError());
@ -392,7 +393,7 @@ TEST(PreParseOverflow) {
uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
i::Utf8ToUC16CharacterStream stream(
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(*program),
static_cast<unsigned>(kProgramSize));
i::CompleteParserRecorder log;
@ -449,10 +450,10 @@ void TestCharacterStream(const char* ascii_source,
i::Handle<i::String> uc16_string(
FACTORY->NewExternalStringFromTwoByte(&resource));
i::ExternalTwoByteStringUC16CharacterStream uc16_stream(
i::ExternalTwoByteStringUtf16CharacterStream uc16_stream(
i::Handle<i::ExternalTwoByteString>::cast(uc16_string), start, end);
i::GenericStringUC16CharacterStream string_stream(ascii_string, start, end);
i::Utf8ToUC16CharacterStream utf8_stream(
i::GenericStringUtf16CharacterStream string_stream(ascii_string, start, end);
i::Utf8ToUtf16CharacterStream utf8_stream(
reinterpret_cast<const i::byte*>(ascii_source), end);
utf8_stream.SeekForward(start);
@ -575,12 +576,14 @@ TEST(Utf8CharacterStream) {
char buffer[kAllUtf8CharsSizeU];
unsigned cursor = 0;
for (int i = 0; i <= kMaxUC16Char; i++) {
cursor += unibrow::Utf8::Encode(buffer + cursor, i);
cursor += unibrow::Utf8::Encode(buffer + cursor,
i,
unibrow::Utf16::kNoPreviousCharacter);
}
ASSERT(cursor == kAllUtf8CharsSizeU);
i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
kAllUtf8CharsSizeU);
i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
kAllUtf8CharsSizeU);
for (int i = 0; i <= kMaxUC16Char; i++) {
CHECK_EQU(i, stream.pos());
int32_t c = stream.Advance();
@ -610,7 +613,7 @@ TEST(Utf8CharacterStream) {
#undef CHECK_EQU
void TestStreamScanner(i::UC16CharacterStream* stream,
void TestStreamScanner(i::Utf16CharacterStream* stream,
i::Token::Value* expected_tokens,
int skip_pos = 0, // Zero means not skipping.
int skip_to = 0) {
@ -633,8 +636,8 @@ TEST(StreamScanner) {
v8::V8::Initialize();
const char* str1 = "{ foo get for : */ <- \n\n /*foo*/ bib";
i::Utf8ToUC16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
static_cast<unsigned>(strlen(str1)));
i::Utf8ToUtf16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
static_cast<unsigned>(strlen(str1)));
i::Token::Value expectations1[] = {
i::Token::LBRACE,
i::Token::IDENTIFIER,
@ -652,8 +655,8 @@ TEST(StreamScanner) {
TestStreamScanner(&stream1, expectations1, 0, 0);
const char* str2 = "case default const {THIS\nPART\nSKIPPED} do";
i::Utf8ToUC16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
static_cast<unsigned>(strlen(str2)));
i::Utf8ToUtf16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
static_cast<unsigned>(strlen(str2)));
i::Token::Value expectations2[] = {
i::Token::CASE,
i::Token::DEFAULT,
@ -683,7 +686,7 @@ TEST(StreamScanner) {
for (int i = 0; i <= 4; i++) {
expectations3[6 - i] = i::Token::ILLEGAL;
expectations3[5 - i] = i::Token::EOS;
i::Utf8ToUC16CharacterStream stream3(
i::Utf8ToUtf16CharacterStream stream3(
reinterpret_cast<const i::byte*>(str3),
static_cast<unsigned>(strlen(str3)));
TestStreamScanner(&stream3, expectations3, 1, 1 + i);
@ -692,7 +695,7 @@ TEST(StreamScanner) {
void TestScanRegExp(const char* re_source, const char* expected) {
i::Utf8ToUC16CharacterStream stream(
i::Utf8ToUtf16CharacterStream stream(
reinterpret_cast<const i::byte*>(re_source),
static_cast<unsigned>(strlen(re_source)));
i::Scanner scanner(i::Isolate::Current()->unicode_cache());
@ -748,6 +751,67 @@ TEST(RegExpScanning) {
}
// Predicts how many UTF-16 code units the NUL-terminated UTF-8 string |s|
// decodes to, using this model:
//   - an ASCII byte is one code unit;
//   - a well-formed 2 or 3 byte sequence is one code unit;
//   - a well-formed 4 byte sequence is two code units (a surrogate pair);
//   - any other byte (stray continuation byte, 0xf8..0xff, the lead byte of
//     an overlong or broken sequence) is one code unit (kBadChar), and the
//     bytes after a rejected lead byte are then examined individually.
static int Utf8LengthHelper(const char* s) {
  const int len = static_cast<int>(strlen(s));
  // Start from one code unit per byte and subtract for every well-formed
  // multi-byte sequence.
  int character_length = len;
  for (int i = 0; i < len; i++) {
    const unsigned char c = s[i];
    // ASCII and stray continuation bytes count as one code unit each.
    if (c < 0xc0) continue;
    // 5 and 6 byte UTF-8 sequences (and 0xfe/0xff) turn into a kBadChar for
    // each UTF-8 byte; only the first byte is handled here.
    if (c >= 0xf8) continue;

    // i < len, so this reads at worst the terminating NUL.
    const unsigned char next = s[i + 1];
    int input_offset = 0;   // Continuation bytes expected after the lead.
    int output_adjust = 0;  // Bytes that do not become code units.
    if (c >= 0xf0) {
      if ((c & 7) == 0 && (next & 0x30) == 0) {
        // This 4 byte sequence could have been coded as a 3 byte sequence.
        // Record a single kBadChar for the first byte and continue.
        continue;
      }
      input_offset = 3;
      // 4 bytes of UTF-8 turn into 2 UTF-16 code units. The adjustment is
      // deferred until the continuation bytes have been validated, exactly
      // like the shorter forms; subtracting eagerly under-counted truncated
      // or broken 4 byte sequences by 2.
      output_adjust = 2;
    } else if (c >= 0xe0) {
      if ((c & 0xf) == 0 && (next & 0x20) == 0) {
        // This 3 byte sequence could have been coded as a 2 byte sequence.
        // Record a single kBadChar for the first byte and continue.
        continue;
      }
      input_offset = 2;
      // 3 bytes of UTF-8 turn into 1 UTF-16 code unit.
      output_adjust = 2;
    } else {
      if ((c & 0x1e) == 0) {
        // This 2 byte sequence could have been coded as a 1 byte sequence.
        // Record a single kBadChar for the first byte and continue.
        continue;
      }
      input_offset = 1;
      // 2 bytes of UTF-8 turn into 1 UTF-16 code unit.
      output_adjust = 1;
    }

    // Every expected continuation byte must look like 10xxxxxx. The scan
    // stops at the first mismatch, so it never reads past the NUL.
    bool bad = false;
    for (int j = 1; j <= input_offset; j++) {
      const unsigned char continuation = s[i + j];
      if ((continuation & 0xc0) != 0x80) {
        // Bad UTF-8 sequence turns the first in the sequence into kBadChar,
        // which is a single UTF-16 code unit.
        bad = true;
        break;
      }
    }
    if (!bad) {
      // Consume the whole sequence and account for the bytes it absorbed.
      i += input_offset;
      character_length -= output_adjust;
    }
  }
  return character_length;
}
TEST(ScopePositions) {
// Test the parser for correctly setting the start and end positions
// of a scope. We check the scope positions of exactly one scope
@ -835,6 +899,91 @@ TEST(ScopePositions) {
{ " for ", "(let x in {})\n"
" statement;", "\n"
" more;", i::BLOCK_SCOPE, i::EXTENDED_MODE },
// Check that 6-byte and 4-byte encodings of UTF-8 strings do not throw
// the preparser off in terms of byte offsets.
// 6 byte encoding.
{ " 'foo\355\240\201\355\260\211';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// 4 byte encoding.
{ " 'foo\360\220\220\212';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// 3 byte encoding of \u0fff.
{ " 'foo\340\277\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Broken 6 byte encoding with missing last byte.
{ " 'foo\355\240\201\355\211';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Broken 3 byte encoding of \u0fff with missing last byte.
{ " 'foo\340\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Broken 3 byte encoding of \u0fff with missing 2 last bytes.
{ " 'foo\340';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Broken 3 byte encoding of \u00ff should be a 2 byte encoding.
{ " 'foo\340\203\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Broken 3 byte encoding of \u007f should be a 2 byte encoding.
{ " 'foo\340\201\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Unpaired lead surrogate.
{ " 'foo\355\240\201';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Unpaired lead surrogate where following code point is a 3 byte sequence.
{ " 'foo\355\240\201\340\277\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Unpaired lead surrogate where following code point is a 4 byte encoding
// of a trail surrogate.
{ " 'foo\355\240\201\360\215\260\211';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Unpaired trail surrogate.
{ " 'foo\355\260\211';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// 2 byte encoding of \u00ff.
{ " 'foo\303\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Broken 2 byte encoding of \u00ff with missing last byte.
{ " 'foo\303';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Broken 2 byte encoding of \u007f should be a 1 byte encoding.
{ " 'foo\301\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Illegal 5 byte encoding.
{ " 'foo\370\277\277\277\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Illegal 6 byte encoding.
{ " 'foo\374\277\277\277\277\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Illegal 0xfe byte
{ " 'foo\376\277\277\277\277\277\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
// Illegal 0xff byte
{ " 'foo\377\277\277\277\277\277\277\277';\n"
" (function fun", "(a,b) { infunction; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
{ " 'foo';\n"
" (function fun", "(a,b) { 'bar\355\240\201\355\260\213'; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
{ " 'foo';\n"
" (function fun", "(a,b) { 'bar\360\220\220\214'; }", ")();",
i::FUNCTION_SCOPE, i::CLASSIC_MODE },
{ NULL, NULL, NULL, i::EVAL_SCOPE, i::CLASSIC_MODE }
};
@ -848,20 +997,24 @@ TEST(ScopePositions) {
i::FLAG_harmony_scoping = true;
for (int i = 0; source_data[i].outer_prefix; i++) {
int kPrefixLen = i::StrLength(source_data[i].outer_prefix);
int kInnerLen = i::StrLength(source_data[i].inner_source);
int kSuffixLen = i::StrLength(source_data[i].outer_suffix);
int kPrefixLen = Utf8LengthHelper(source_data[i].outer_prefix);
int kInnerLen = Utf8LengthHelper(source_data[i].inner_source);
int kSuffixLen = Utf8LengthHelper(source_data[i].outer_suffix);
int kPrefixByteLen = i::StrLength(source_data[i].outer_prefix);
int kInnerByteLen = i::StrLength(source_data[i].inner_source);
int kSuffixByteLen = i::StrLength(source_data[i].outer_suffix);
int kProgramSize = kPrefixLen + kInnerLen + kSuffixLen;
i::Vector<char> program = i::Vector<char>::New(kProgramSize + 1);
int length = i::OS::SNPrintF(program, "%s%s%s",
source_data[i].outer_prefix,
source_data[i].inner_source,
source_data[i].outer_suffix);
CHECK(length == kProgramSize);
int kProgramByteSize = kPrefixByteLen + kInnerByteLen + kSuffixByteLen;
i::Vector<char> program = i::Vector<char>::New(kProgramByteSize + 1);
i::OS::SNPrintF(program, "%s%s%s",
source_data[i].outer_prefix,
source_data[i].inner_source,
source_data[i].outer_suffix);
// Parse program source.
i::Handle<i::String> source(
FACTORY->NewStringFromAscii(i::CStrVector(program.start())));
FACTORY->NewStringFromUtf8(i::CStrVector(program.start())));
CHECK_EQ(source->length(), kProgramSize);
i::Handle<i::Script> script = FACTORY->NewScript(source);
i::Parser parser(script, i::kAllowLazy | i::EXTENDED_MODE, NULL, NULL);
i::CompilationInfo info(script);
@ -894,7 +1047,7 @@ void TestParserSync(i::Handle<i::String> source, int flags) {
// Preparse the data.
i::CompleteParserRecorder log;
i::Scanner scanner(i::Isolate::Current()->unicode_cache());
i::GenericStringUC16CharacterStream stream(source, 0, source->length());
i::GenericStringUtf16CharacterStream stream(source, 0, source->length());
scanner.SetHarmonyScoping(harmony_scoping);
scanner.Initialize(&stream);
v8::preparser::PreParser::PreParseResult result =