Remove Utf8InputBuffer

R=yangguo@chromium.org
BUG=

Review URL: https://chromiumcodereview.appspot.com/11649018
Patch from Dan Carney <dcarney@google.com>.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13248 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
yangguo@chromium.org 2012-12-20 09:20:37 +00:00
parent 9eccd63ccc
commit eedcaf1866
9 changed files with 235 additions and 159 deletions

View File

@ -192,21 +192,14 @@ void DebuggerAgentSession::Run() {
}
// Convert UTF-8 to UTF-16.
unibrow::Utf8InputBuffer<> buf(msg, StrLength(msg));
int len = 0;
while (buf.has_more()) {
buf.GetNext();
len++;
}
ScopedVector<int16_t> temp(len + 1);
buf.Reset(msg, StrLength(msg));
for (int i = 0; i < len; i++) {
temp[i] = buf.GetNext();
}
unibrow::Utf8Decoder<128> decoder(msg, StrLength(msg));
int utf16_length = decoder.Utf16Length();
ScopedVector<uint16_t> temp(utf16_length + 1);
decoder.WriteUtf16(temp.start(), utf16_length);
// Send the request received to the debugger.
v8::Debug::SendCommand(reinterpret_cast<const uint16_t *>(temp.start()),
len,
v8::Debug::SendCommand(temp.start(),
utf16_length,
NULL,
reinterpret_cast<v8::Isolate*>(agent_->isolate()));

View File

@ -4546,37 +4546,31 @@ MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
PretenureFlag pretenure) {
// Continue counting the number of characters in the UTF-8 string, starting
// from the first non-ascii character or word.
int chars = non_ascii_start;
Access<UnicodeCache::Utf8Decoder>
decoder(isolate_->unicode_cache()->utf8_decoder());
decoder->Reset(string.start() + non_ascii_start, string.length() - chars);
while (decoder->has_more()) {
uint32_t r = decoder->GetNext();
if (r <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
chars++;
} else {
chars += 2;
}
}
decoder->Reset(string.start() + non_ascii_start,
string.length() - non_ascii_start);
int utf16_length = decoder->Utf16Length();
ASSERT(utf16_length > 0);
// Allocate string.
Object* result;
{ MaybeObject* maybe_result = AllocateRawTwoByteString(chars, pretenure);
{
int chars = non_ascii_start + utf16_length;
MaybeObject* maybe_result = AllocateRawTwoByteString(chars, pretenure);
if (!maybe_result->ToObject(&result)) return maybe_result;
}
// Convert and copy the characters into the new object.
SeqTwoByteString* twobyte = SeqTwoByteString::cast(result);
decoder->Reset(string.start(), string.length());
int i = 0;
while (i < chars) {
uint32_t r = decoder->GetNext();
if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
twobyte->SeqTwoByteStringSet(i++, unibrow::Utf16::LeadSurrogate(r));
twobyte->SeqTwoByteStringSet(i++, unibrow::Utf16::TrailSurrogate(r));
} else {
twobyte->SeqTwoByteStringSet(i++, r);
// Copy ascii portion.
uint16_t* data = twobyte->GetChars();
if (non_ascii_start != 0) {
const char* ascii_data = string.start();
for (int i = 0; i < non_ascii_start; i++) {
*data++ = *ascii_data++;
}
}
// Now write the remainder.
decoder->WriteUtf16(data, utf16_length);
return result;
}

View File

@ -7641,14 +7641,20 @@ bool String::MarkAsUndetectable() {
bool String::IsEqualTo(Vector<const char> str) {
Isolate* isolate = GetIsolate();
int slen = length();
Access<UnicodeCache::Utf8Decoder>
decoder(isolate->unicode_cache()->utf8_decoder());
decoder->Reset(str.start(), str.length());
// Can't check exact length equality, but we can check bounds.
int str_len = str.length();
if (str_len < slen ||
str_len > slen*static_cast<int>(unibrow::Utf8::kMaxEncodedSize)) {
return false;
}
int i;
for (i = 0; i < slen && decoder->has_more(); i++) {
uint32_t r = decoder->GetNext();
unsigned remaining_in_str = static_cast<unsigned>(str_len);
const uint8_t* utf8_data = reinterpret_cast<const uint8_t*>(str.start());
for (i = 0; i < slen && remaining_in_str > 0; i++) {
unsigned cursor = 0;
uint32_t r = unibrow::Utf8::ValueOf(utf8_data, remaining_in_str, &cursor);
ASSERT(cursor > 0 && cursor <= remaining_in_str);
if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
if (i > slen - 1) return false;
if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
@ -7656,8 +7662,10 @@ bool String::IsEqualTo(Vector<const char> str) {
} else {
if (Get(i) != r) return false;
}
utf8_data += cursor;
remaining_in_str -= cursor;
}
return i == slen && !decoder->has_more();
return i == slen && remaining_in_str == 0;
}
@ -7862,46 +7870,51 @@ uint32_t StringHasher::GetHashField() {
}
uint32_t StringHasher::ComputeHashField(unibrow::CharacterStream* buffer,
int length,
uint32_t seed) {
typedef unibrow::Utf16 u;
StringHasher hasher(length, seed);
// Very long strings have a trivial hash that doesn't inspect the
// string contents.
if (hasher.has_trivial_hash()) {
return hasher.GetHashField();
uint32_t StringHasher::ComputeUtf8Hash(Vector<const char> chars,
uint32_t seed,
int* utf16_length_out) {
int vector_length = chars.length();
// Handle some edge cases
if (vector_length <= 1) {
ASSERT(vector_length == 0 ||
static_cast<uint8_t>(chars.start()[0]) <=
unibrow::Utf8::kMaxOneByteChar);
*utf16_length_out = vector_length;
return HashSequentialString(chars.start(), vector_length, seed);
}
// Do the iterative array index computation as long as there is a
// chance this is an array index.
if (hasher.is_array_index_) {
while (buffer->has_more()) {
uint32_t c = buffer->GetNext();
if (c > u::kMaxNonSurrogateCharCode) {
uint16_t c1 = u::LeadSurrogate(c);
uint16_t c2 = u::TrailSurrogate(c);
hasher.AddCharacter(c1);
hasher.AddCharacter(c2);
if (!hasher.UpdateIndex(c1)) break;
if (!hasher.UpdateIndex(c2)) break;
} else {
hasher.AddCharacter(c);
if (!hasher.UpdateIndex(c)) break;
}
}
}
// Process the remaining characters without updating the array
// index.
while (buffer->has_more()) {
ASSERT(!hasher.is_array_index_);
uint32_t c = buffer->GetNext();
if (c > u::kMaxNonSurrogateCharCode) {
hasher.AddCharacter(u::LeadSurrogate(c));
hasher.AddCharacter(u::TrailSurrogate(c));
// Start with a fake length which won't affect computation.
// It will be updated later.
StringHasher hasher(String::kMaxArrayIndexSize, seed);
unsigned remaining = static_cast<unsigned>(vector_length);
const uint8_t* stream = reinterpret_cast<const uint8_t*>(chars.start());
int utf16_length = 0;
bool is_index = true;
ASSERT(hasher.is_array_index_);
while (remaining > 0) {
unsigned consumed = 0;
uint32_t c = unibrow::Utf8::ValueOf(stream, remaining, &consumed);
ASSERT(consumed > 0 && consumed <= remaining);
stream += consumed;
remaining -= consumed;
bool is_two_characters = c > unibrow::Utf16::kMaxNonSurrogateCharCode;
utf16_length += is_two_characters ? 2 : 1;
// No need to keep hashing. But we do need to calculate utf16_length.
if (utf16_length > String::kMaxHashCalcLength) continue;
if (is_two_characters) {
uint16_t c1 = unibrow::Utf16::LeadSurrogate(c);
uint16_t c2 = unibrow::Utf16::TrailSurrogate(c);
hasher.AddCharacter(c1);
hasher.AddCharacter(c2);
if (is_index) is_index = hasher.UpdateIndex(c1);
if (is_index) is_index = hasher.UpdateIndex(c2);
} else {
hasher.AddCharacter(c);
if (is_index) is_index = hasher.UpdateIndex(c);
}
}
*utf16_length_out = static_cast<int>(utf16_length);
// Must set length here so that hash computation is correct.
hasher.length_ = utf16_length;
return hasher.GetHashField();
}
@ -11716,10 +11729,7 @@ class Utf8SymbolKey : public HashTableKey {
uint32_t Hash() {
if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
unibrow::Utf8InputBuffer<> buffer(string_.start(),
static_cast<unsigned>(string_.length()));
chars_ = buffer.Utf16Length();
hash_field_ = StringHasher::ComputeHashField(&buffer, chars_, seed_);
hash_field_ = StringHasher::ComputeUtf8Hash(string_, seed_, &chars_);
uint32_t result = hash_field_ >> String::kHashShift;
ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.
return result;

View File

@ -6934,9 +6934,10 @@ class StringHasher {
int length,
uint32_t seed);
static uint32_t ComputeHashField(unibrow::CharacterStream* buffer,
int length,
uint32_t seed);
// Reads all the data, even for long strings and computes the utf16 length.
static uint32_t ComputeUtf8Hash(Vector<const char> chars,
uint32_t seed,
int* utf16_length_out);
// Calculated hash value for a string consisting of 1 to
// String::kMaxArrayIndexSize digits with no leading zeros (except "0").

View File

@ -145,7 +145,7 @@ class UnicodeCache {
// Caching predicates used by scanners.
public:
UnicodeCache() {}
typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
typedef unibrow::Utf8Decoder<512> Utf8Decoder;
StaticResource<Utf8Decoder>* utf8_decoder() {
return &utf8_decoder_;
@ -315,8 +315,6 @@ class Scanner {
// -1 is outside of the range of any real source code.
static const int kNoOctalLocation = -1;
typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
explicit Scanner(UnicodeCache* scanner_contants);
void Initialize(Utf16CharacterStream* source);

View File

@ -240,10 +240,51 @@ void InputBuffer<R, I, s>::Seek(unsigned position) {
buffer_ = R::ReadBlock(input_, util_buffer_, s, &remaining_, &offset_);
}
template <unsigned s>
Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length)
: InputBuffer<Utf8, Buffer<const char*>, s>(Buffer<const char*>(data,
length)) {
// Default constructor: starts with no unbuffered input position, a UTF-16
// length of zero and the "last buffer slot unused" flag cleared.
Utf8DecoderBase::Utf8DecoderBase()
: unbuffered_start_(NULL),
utf16_length_(0),
last_byte_of_buffer_unused_(false) {}
// Constructs a decoder and immediately decodes |stream| by forwarding all four
// arguments to Reset(). There is no initializer list; the member fields get
// their values from that Reset() call.
Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
unsigned buffer_length,
const uint8_t* stream,
unsigned stream_length) {
Reset(buffer, buffer_length, stream, stream_length);
}
// Constructs a decoder over the |length| bytes starting at |stream|.
// The base class is handed this object's own buffer_ together with its
// capacity, kBufferSize, and the input reinterpreted as unsigned bytes.
template<unsigned kBufferSize>
Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
: Utf8DecoderBase(buffer_,
kBufferSize,
reinterpret_cast<const uint8_t*>(stream),
length) {
}
// Re-targets the decoder at a new input of |length| bytes starting at
// |stream|. Forwards to Utf8DecoderBase::Reset with this object's buffer_ and
// its capacity, kBufferSize; the input is reinterpreted as unsigned bytes.
template<unsigned kBufferSize>
void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
Utf8DecoderBase::Reset(buffer_,
kBufferSize,
reinterpret_cast<const uint8_t*>(stream),
length);
}
// Copies up to |length| UTF-16 code units of the decoded input into |data|.
//
// |length| must be non-zero and is clamped to the decoded UTF-16 length.
// Units that are cached in buffer_ are copied with memcpy; anything beyond the
// cached prefix is decoded again from unbuffered_start_ by WriteUtf16Slow.
// Returns the number of code units written (the clamped length).
template <unsigned kBufferSize>
unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
                                              unsigned length) const {
  ASSERT(length > 0);
  const unsigned to_write = length > utf16_length_ ? utf16_length_ : length;
  // The final slot of buffer_ holds no data when a surrogate pair would have
  // straddled the end of the buffer.
  const unsigned cached =
      last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
  if (to_write <= cached) {
    // Everything requested is available in the cache.
    memcpy(data, buffer_, to_write * sizeof(uint16_t));
    return to_write;
  }
  // Take the whole cached prefix, then decode the remainder the slow way.
  memcpy(data, buffer_, cached * sizeof(uint16_t));
  ASSERT(unbuffered_start_ != NULL);
  WriteUtf16Slow(unbuffered_start_, data + cached, to_write - cached);
  return to_write;
}
} // namespace unibrow

View File

@ -277,58 +277,6 @@ uchar Utf8::CalculateValue(const byte* str,
}
// Reads the next block of characters from |str|, starting at *offset_ptr.
//
// Returns NULL (with *chars_read_ptr set to 0) when the offset is already at
// the end of the string. If the next byte is ASCII, returns a pointer directly
// into the input covering the following run of ASCII bytes. Otherwise fills
// |buffer| (of size |capacity|) through the CharacterStream::Encode* helpers
// and returns |buffer|. In both non-empty cases *chars_read_ptr receives the
// number of characters in the block and *offset_ptr is advanced past the
// consumed input bytes.
const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,
unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {
unsigned offset = *offset_ptr;
// Bail out early if we've reached the end of the string.
if (offset == str.length()) {
*chars_read_ptr = 0;
return NULL;
}
const byte* data = reinterpret_cast<const byte*>(str.data());
if (data[offset] <= kMaxOneByteChar) {
// The next character is an ASCII char so we scan forward over
// the following ASCII characters and return the next pure ASCII
// substring
const byte* result = data + offset;
offset++;
while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar))
offset++;
// One byte per character here, so the byte distance is the character count.
*chars_read_ptr = offset - *offset_ptr;
*offset_ptr = offset;
return result;
} else {
// The next character is non-ASCII so we just fill the buffer
// NOTE(review): cursor is presumably the write position in |buffer| and is
// advanced by the Encode* helpers (taken by reference) -- confirm against
// their declarations.
unsigned cursor = 0;
unsigned chars_read = 0;
while (offset < str.length()) {
uchar c = data[offset];
if (c <= kMaxOneByteChar) {
// Fast case for ASCII characters
// A false return from the helper ends the block before this character is
// counted or consumed.
if (!CharacterStream::EncodeAsciiCharacter(c,
buffer,
capacity,
cursor))
break;
offset += 1;
} else {
// Decode one multi-byte character; |chars| receives the number of input
// bytes it occupied.
unsigned chars = 0;
c = Utf8::ValueOf(data + offset, str.length() - offset, &chars);
if (!CharacterStream::EncodeNonAsciiCharacter(c,
buffer,
capacity,
cursor))
break;
offset += chars;
}
chars_read++;
}
*offset_ptr = offset;
*chars_read_ptr = chars_read;
return buffer;
}
}
unsigned CharacterStream::Length() {
unsigned result = 0;
while (has_more()) {
@ -356,6 +304,75 @@ void CharacterStream::Seek(unsigned position) {
}
}
// Decodes the whole of |stream| (|stream_length| bytes) once.
//
// The total number of UTF-16 code units is stored in utf16_length_. The
// leading code units are also written to |buffer| until |buffer_length| units
// have been stored or the next character no longer fits. When the input does
// not fit entirely, unbuffered_start_ is set to the first input byte whose
// character is not in the buffer; otherwise it stays NULL.
void Utf8DecoderBase::Reset(uint16_t* buffer,
unsigned buffer_length,
const uint8_t* stream,
unsigned stream_length) {
// Assume everything will fit in the buffer and stream won't be needed.
last_byte_of_buffer_unused_ = false;
unbuffered_start_ = NULL;
bool writing_to_buffer = true;
// Loop until stream is read, writing to buffer as long as buffer has space.
unsigned utf16_length = 0;
while (stream_length != 0) {
// Decode one character; |cursor| receives the number of bytes consumed.
unsigned cursor = 0;
uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
ASSERT(cursor > 0 && cursor <= stream_length);
stream += cursor;
stream_length -= cursor;
// Characters above kMaxNonSurrogateCharCode take two UTF-16 code units
// (a lead and a trail surrogate).
bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
utf16_length += is_two_characters ? 2 : 1;
// Don't need to write to the buffer, but still need utf16_length.
if (!writing_to_buffer) continue;
// Write out the characters to the buffer.
// Must check for equality with buffer_length as we've already updated it.
if (utf16_length <= buffer_length) {
if (is_two_characters) {
*buffer++ = Utf16::LeadSurrogate(character);
*buffer++ = Utf16::TrailSurrogate(character);
} else {
*buffer++ = character;
}
if (utf16_length == buffer_length) {
// Just wrote last character of buffer
// |stream| has already been advanced, so it points at the first byte that
// is not represented in the buffer.
writing_to_buffer = false;
unbuffered_start_ = stream;
}
continue;
}
// Have gone over buffer.
// Last char of buffer is unused, set cursor back.
// Only a surrogate pair can overshoot: one slot was left but two are needed.
ASSERT(is_two_characters);
writing_to_buffer = false;
last_byte_of_buffer_unused_ = true;
// Rewind to the start of this character so it is decoded again later.
unbuffered_start_ = stream - cursor;
}
utf16_length_ = utf16_length;
}
// Decodes UTF-8 starting at |stream| and writes exactly |data_length| UTF-16
// code units to |data|.
//
// Characters above Utf16::kMaxNonSurrogateCharCode are written as a
// lead/trail surrogate pair. If only one output slot remains when such a
// character is reached, just the lead surrogate is written: the output is
// truncated to |data_length| units instead of writing past the end of |data|
// and wrapping the unsigned counter. This matches how the buffered memcpy
// path truncates a request that ends in the middle of a pair.
//
// NOTE(review): the length passed to Utf8::ValueOf is always
// Utf8::kMaxEncodedSize, not the number of input bytes actually remaining.
// This relies on the input having been validated by Reset; confirm that a
// truncated sequence at the very end of the input cannot be decoded
// differently here than it was in Reset.
void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
                                     uint16_t* data,
                                     unsigned data_length) {
  while (data_length != 0) {
    unsigned cursor = 0;
    uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);
    // There's a total lack of bounds checking for stream
    // as it was already done in Reset.
    stream += cursor;
    if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
      *data++ = Utf16::LeadSurrogate(character);
      // Only one slot was left: stop here rather than overrunning |data| and
      // underflowing |data_length|.
      if (data_length == 1) return;
      *data++ = Utf16::TrailSurrogate(character);
      data_length -= 2;
    } else {
      *data++ = character;
      data_length -= 1;
    }
  }
}
// Uppercase: point.category == 'Lu'
static const uint16_t kUppercaseTable0Size = 450;

View File

@ -29,7 +29,8 @@
#define V8_UNICODE_H_
#include <sys/types.h>
#include <stdint.h>
#include <globals.h>
/**
* \file
* Definitions and convenience functions for working with unicode.
@ -140,10 +141,10 @@ class Utf16 {
// One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes.
// The illegality stems from the surrogate not being part of a pair.
static const int kUtf8BytesToCodeASurrogate = 3;
static inline uchar LeadSurrogate(int char_code) {
static inline uint16_t LeadSurrogate(uint32_t char_code) {
return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
}
static inline uchar TrailSurrogate(int char_code) {
static inline uint16_t TrailSurrogate(uint32_t char_code) {
return 0xdc00 + (char_code & 0x3ff);
}
};
@ -154,8 +155,6 @@ class Utf8 {
static inline uchar Length(uchar chr, int previous);
static inline unsigned Encode(
char* out, uchar c, int previous);
static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
unsigned capacity, unsigned* chars_read, unsigned* offset);
static uchar CalculateValue(const byte* str,
unsigned length,
unsigned* cursor);
@ -241,17 +240,42 @@ class InputBuffer : public CharacterStream {
byte util_buffer_[kSize];
};
// --- U t f 8 I n p u t B u f f e r ---
template <unsigned s = 256>
class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
class Utf8DecoderBase {
public:
inline Utf8InputBuffer() { }
inline Utf8InputBuffer(const char* data, unsigned length);
inline void Reset(const char* data, unsigned length) {
InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
Buffer<const char*>(data, length));
}
// Initialization done in subclass.
inline Utf8DecoderBase();
inline Utf8DecoderBase(uint16_t* buffer,
unsigned buffer_length,
const uint8_t* stream,
unsigned stream_length);
inline unsigned Utf16Length() const { return utf16_length_; }
protected:
// This reads all characters and sets the utf16_length_.
// The first buffer_length utf16 chars are cached in the buffer.
void Reset(uint16_t* buffer,
unsigned buffer_length,
const uint8_t* stream,
unsigned stream_length);
static void WriteUtf16Slow(const uint8_t* stream,
uint16_t* data,
unsigned length);
const uint8_t* unbuffered_start_;
unsigned utf16_length_;
bool last_byte_of_buffer_unused_;
private:
DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
};
// UTF-8 to UTF-16 decoder that owns an inline cache of kBufferSize UTF-16
// code units and passes it to Utf8DecoderBase.
template <unsigned kBufferSize>
class Utf8Decoder : public Utf8DecoderBase {
public:
// Creates a decoder with no input; call Reset() before use.
inline Utf8Decoder() {}
// Creates a decoder over |length| bytes of UTF-8 starting at |stream|.
inline Utf8Decoder(const char* stream, unsigned length);
// Points the decoder at a new input of |length| bytes starting at |stream|.
inline void Reset(const char* stream, unsigned length);
// Writes up to |length| UTF-16 code units to |data| and returns the number
// of code units written.
inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
private:
// Cache handed to the base class together with its capacity, kBufferSize.
uint16_t buffer_[kBufferSize];
};

View File

@ -98,7 +98,6 @@ static SmartArrayPointer<const char> Parse(const char* input) {
static bool CheckSimple(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
ZoneScope zone_scope(Isolate::Current()->runtime_zone(), DELETE_ON_EXIT);
FlatStringReader reader(Isolate::Current(), CStrVector(input));
RegExpCompileData result;
@ -117,7 +116,6 @@ struct MinMaxPair {
static MinMaxPair CheckMinMaxMatch(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
ZoneScope zone_scope(Isolate::Current()->runtime_zone(), DELETE_ON_EXIT);
FlatStringReader reader(Isolate::Current(), CStrVector(input));
RegExpCompileData result;