cb242eded6
This CL moves a number of memory-related methods out of utils into its own header, since utils.h is included in many places that do not need these methods. R=clemensh@chromium.org,mstarzinger@chromium.org Change-Id: I5155baf329844784286413408c05c7108b789020 Reviewed-on: https://chromium-review.googlesource.com/c/1354889 Commit-Queue: Ben Titzer <titzer@chromium.org> Reviewed-by: Michael Starzinger <mstarzinger@chromium.org> Reviewed-by: Clemens Hammacher <clemensh@chromium.org> Reviewed-by: Marja Hölttä <marja@chromium.org> Cr-Commit-Position: refs/heads/master@{#57948}
159 lines
4.6 KiB
C++
159 lines
4.6 KiB
C++
// Copyright 2014 the V8 project authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
#ifndef V8_UNICODE_DECODER_H_
|
|
#define V8_UNICODE_DECODER_H_
|
|
|
|
#include <sys/types.h>
|
|
#include <algorithm>
|
|
#include "src/globals.h"
|
|
#include "src/memcopy.h"
|
|
#include "src/unicode.h"
|
|
#include "src/vector.h"
|
|
|
|
namespace unibrow {
|
|
|
|
class Utf8Iterator {
|
|
public:
|
|
explicit Utf8Iterator(const v8::internal::Vector<const char>& stream)
|
|
: Utf8Iterator(stream, 0, false) {}
|
|
Utf8Iterator(const v8::internal::Vector<const char>& stream, size_t offset,
|
|
bool trailing)
|
|
: stream_(stream),
|
|
cursor_(offset),
|
|
offset_(0),
|
|
char_(0),
|
|
trailing_(false) {
|
|
DCHECK_LE(offset, stream.length());
|
|
// Read the first char, setting offset_ to offset in the process.
|
|
++*this;
|
|
|
|
// This must be set after reading the first char, since the offset marks
|
|
// the start of the octet sequence that the trailing char is part of.
|
|
trailing_ = trailing;
|
|
if (trailing) {
|
|
DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode);
|
|
}
|
|
}
|
|
|
|
uint16_t operator*();
|
|
Utf8Iterator& operator++();
|
|
Utf8Iterator operator++(int);
|
|
bool Done();
|
|
bool Trailing() { return trailing_; }
|
|
size_t Offset() { return offset_; }
|
|
|
|
private:
|
|
const v8::internal::Vector<const char>& stream_;
|
|
size_t cursor_;
|
|
size_t offset_;
|
|
uint32_t char_;
|
|
bool trailing_;
|
|
};
|
|
|
|
class V8_EXPORT_PRIVATE Utf8DecoderBase {
|
|
public:
|
|
// Initialization done in subclass.
|
|
inline Utf8DecoderBase();
|
|
inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
|
|
const v8::internal::Vector<const char>& stream);
|
|
inline size_t Utf16Length() const { return utf16_length_; }
|
|
|
|
protected:
|
|
// This reads all characters and sets the utf16_length_.
|
|
// The first buffer_length utf16 chars are cached in the buffer.
|
|
void Reset(uint16_t* buffer, size_t buffer_length,
|
|
const v8::internal::Vector<const char>& vector);
|
|
static void WriteUtf16Slow(uint16_t* data, size_t length,
|
|
const v8::internal::Vector<const char>& stream,
|
|
size_t offset, bool trailing);
|
|
|
|
size_t bytes_read_;
|
|
size_t chars_written_;
|
|
size_t utf16_length_;
|
|
bool trailing_;
|
|
|
|
private:
|
|
DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
|
|
};
|
|
|
|
template <size_t kBufferSize>
|
|
class Utf8Decoder : public Utf8DecoderBase {
|
|
public:
|
|
inline Utf8Decoder() = default;
|
|
explicit inline Utf8Decoder(const v8::internal::Vector<const char>& stream);
|
|
inline void Reset(const v8::internal::Vector<const char>& stream);
|
|
inline size_t WriteUtf16(
|
|
uint16_t* data, size_t length,
|
|
const v8::internal::Vector<const char>& stream) const;
|
|
|
|
private:
|
|
uint16_t buffer_[kBufferSize];
|
|
};
|
|
|
|
Utf8DecoderBase::Utf8DecoderBase()
|
|
: bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {}
|
|
|
|
// Constructs a decoder for stream, using buffer (of buffer_length UTF-16
// chars) as the cache that Reset() fills.
//
// Delegates to the default constructor first so that every counter has a
// defined value before Reset() runs, independent of which fields Reset()
// chooses to assign.
Utf8DecoderBase::Utf8DecoderBase(
    uint16_t* buffer, size_t buffer_length,
    const v8::internal::Vector<const char>& stream)
    : Utf8DecoderBase() {
  Reset(buffer, buffer_length, stream);
}
|
|
|
|
template <size_t kBufferSize>
|
|
Utf8Decoder<kBufferSize>::Utf8Decoder(
|
|
const v8::internal::Vector<const char>& stream)
|
|
: Utf8DecoderBase(buffer_, kBufferSize, stream) {}
|
|
|
|
template <size_t kBufferSize>
|
|
void Utf8Decoder<kBufferSize>::Reset(
|
|
const v8::internal::Vector<const char>& stream) {
|
|
Utf8DecoderBase::Reset(buffer_, kBufferSize, stream);
|
|
}
|
|
|
|
template <size_t kBufferSize>
|
|
size_t Utf8Decoder<kBufferSize>::WriteUtf16(
|
|
uint16_t* data, size_t data_length,
|
|
const v8::internal::Vector<const char>& stream) const {
|
|
DCHECK_GT(data_length, 0);
|
|
data_length = std::min(data_length, utf16_length_);
|
|
|
|
// memcpy everything in buffer.
|
|
size_t memcpy_length = std::min(data_length, chars_written_);
|
|
v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
|
|
|
|
if (data_length <= chars_written_) return data_length;
|
|
|
|
// Copy the rest the slow way.
|
|
WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream,
|
|
bytes_read_, trailing_);
|
|
return data_length;
|
|
}
|
|
|
|
// Helpers for the Latin-1 (single byte) character range.
class Latin1 {
 public:
  // Largest code unit value that is still a Latin-1 character.
  static const unsigned kMaxChar = 0xff;
  // Maps a character to its Latin-1 case equivalent where one exists;
  // every other character is returned unchanged.
  static inline uint16_t TryConvertToLatin1(uint16_t);
};

uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
  // U+039C (Greek capital mu) and U+03BC (Greek small mu) are equivalent to
  // the Latin-1 micro sign U+00B5.
  if (c == 0x39c || c == 0x3bc) return 0xb5;
  // U+0178 lies outside Latin-1 but is the uppercase form of the Latin-1
  // character U+00FF.
  if (c == 0x178) return 0xff;
  return c;
}
|
|
|
|
|
|
} // namespace unibrow
|
|
|
|
#endif // V8_UNICODE_DECODER_H_
|