2014-10-08 14:55:03 +00:00
|
|
|
// Copyright 2014 the V8 project authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file.
|
|
|
|
|
|
|
|
|
|
|
|
#include "src/unicode-inl.h"
|
|
|
|
#include "src/unicode-decoder.h"
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
|
|
|
namespace unibrow {
|
|
|
|
|
2015-02-05 07:54:24 +00:00
|
|
|
void Utf8DecoderBase::Reset(uint16_t* buffer, size_t buffer_length,
|
|
|
|
const uint8_t* stream, size_t stream_length) {
|
2014-10-08 14:55:03 +00:00
|
|
|
// Assume everything will fit in the buffer and stream won't be needed.
|
|
|
|
last_byte_of_buffer_unused_ = false;
|
|
|
|
unbuffered_start_ = NULL;
|
2015-07-06 11:00:05 +00:00
|
|
|
unbuffered_length_ = 0;
|
2014-10-08 14:55:03 +00:00
|
|
|
bool writing_to_buffer = true;
|
|
|
|
// Loop until stream is read, writing to buffer as long as buffer has space.
|
2015-02-05 07:54:24 +00:00
|
|
|
size_t utf16_length = 0;
|
2014-10-08 14:55:03 +00:00
|
|
|
while (stream_length != 0) {
|
2015-02-05 07:54:24 +00:00
|
|
|
size_t cursor = 0;
|
2014-10-08 14:55:03 +00:00
|
|
|
uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
|
|
|
|
DCHECK(cursor > 0 && cursor <= stream_length);
|
|
|
|
stream += cursor;
|
|
|
|
stream_length -= cursor;
|
|
|
|
bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
|
|
|
|
utf16_length += is_two_characters ? 2 : 1;
|
|
|
|
// Don't need to write to the buffer, but still need utf16_length.
|
|
|
|
if (!writing_to_buffer) continue;
|
|
|
|
// Write out the characters to the buffer.
|
|
|
|
// Must check for equality with buffer_length as we've already updated it.
|
|
|
|
if (utf16_length <= buffer_length) {
|
|
|
|
if (is_two_characters) {
|
|
|
|
*buffer++ = Utf16::LeadSurrogate(character);
|
|
|
|
*buffer++ = Utf16::TrailSurrogate(character);
|
|
|
|
} else {
|
|
|
|
*buffer++ = character;
|
|
|
|
}
|
|
|
|
if (utf16_length == buffer_length) {
|
|
|
|
// Just wrote last character of buffer
|
|
|
|
writing_to_buffer = false;
|
|
|
|
unbuffered_start_ = stream;
|
2015-07-06 11:00:05 +00:00
|
|
|
unbuffered_length_ = stream_length;
|
2014-10-08 14:55:03 +00:00
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// Have gone over buffer.
|
|
|
|
// Last char of buffer is unused, set cursor back.
|
|
|
|
DCHECK(is_two_characters);
|
|
|
|
writing_to_buffer = false;
|
|
|
|
last_byte_of_buffer_unused_ = true;
|
|
|
|
unbuffered_start_ = stream - cursor;
|
2015-07-06 11:00:05 +00:00
|
|
|
unbuffered_length_ = stream_length + cursor;
|
2014-10-08 14:55:03 +00:00
|
|
|
}
|
|
|
|
utf16_length_ = utf16_length;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2015-07-06 11:00:05 +00:00
|
|
|
void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
|
|
|
|
size_t stream_length, uint16_t* data,
|
2015-02-05 07:54:24 +00:00
|
|
|
size_t data_length) {
|
2014-10-08 14:55:03 +00:00
|
|
|
while (data_length != 0) {
|
2015-02-05 07:54:24 +00:00
|
|
|
size_t cursor = 0;
|
2015-07-06 11:00:05 +00:00
|
|
|
uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
|
2014-10-08 14:55:03 +00:00
|
|
|
// There's a total lack of bounds checking for stream
|
|
|
|
// as it was already done in Reset.
|
|
|
|
stream += cursor;
|
2015-07-06 11:00:05 +00:00
|
|
|
DCHECK(stream_length >= cursor);
|
|
|
|
stream_length -= cursor;
|
2014-10-08 14:55:03 +00:00
|
|
|
if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
|
|
|
|
*data++ = Utf16::LeadSurrogate(character);
|
|
|
|
*data++ = Utf16::TrailSurrogate(character);
|
|
|
|
DCHECK(data_length > 1);
|
|
|
|
data_length -= 2;
|
|
|
|
} else {
|
|
|
|
*data++ = character;
|
|
|
|
data_length -= 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace unibrow
|