2014-10-08 14:55:03 +00:00
|
|
|
// Copyright 2014 the V8 project authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file.
|
|
|
|
|
|
|
|
#ifndef V8_UNICODE_DECODER_H_
|
|
|
|
#define V8_UNICODE_DECODER_H_
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include "src/globals.h"
|
2016-11-22 09:27:41 +00:00
|
|
|
#include "src/utils.h"
|
2014-10-08 14:55:03 +00:00
|
|
|
|
|
|
|
namespace unibrow {
|
|
|
|
|
2016-11-22 09:27:41 +00:00
|
|
|
class V8_EXPORT_PRIVATE Utf8DecoderBase {
|
2014-10-08 14:55:03 +00:00
|
|
|
public:
|
|
|
|
// Initialization done in subclass.
|
|
|
|
inline Utf8DecoderBase();
|
2015-02-05 07:54:24 +00:00
|
|
|
inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
|
|
|
|
const uint8_t* stream, size_t stream_length);
|
|
|
|
inline size_t Utf16Length() const { return utf16_length_; }
|
2014-10-08 14:55:03 +00:00
|
|
|
|
|
|
|
protected:
|
|
|
|
// This reads all characters and sets the utf16_length_.
|
|
|
|
// The first buffer_length utf16 chars are cached in the buffer.
|
2015-02-05 07:54:24 +00:00
|
|
|
void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream,
|
|
|
|
size_t stream_length);
|
2015-07-06 11:00:05 +00:00
|
|
|
static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length,
|
|
|
|
uint16_t* data, size_t length);
|
2014-10-08 14:55:03 +00:00
|
|
|
const uint8_t* unbuffered_start_;
|
2015-07-06 11:00:05 +00:00
|
|
|
size_t unbuffered_length_;
|
2015-02-05 07:54:24 +00:00
|
|
|
size_t utf16_length_;
|
2014-10-08 14:55:03 +00:00
|
|
|
bool last_byte_of_buffer_unused_;
|
|
|
|
|
|
|
|
private:
|
|
|
|
DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
|
|
|
|
};
|
|
|
|
|
2015-02-05 07:54:24 +00:00
|
|
|
template <size_t kBufferSize>
|
2014-10-08 14:55:03 +00:00
|
|
|
class Utf8Decoder : public Utf8DecoderBase {
|
|
|
|
public:
|
|
|
|
inline Utf8Decoder() {}
|
2015-02-05 07:54:24 +00:00
|
|
|
inline Utf8Decoder(const char* stream, size_t length);
|
|
|
|
inline void Reset(const char* stream, size_t length);
|
|
|
|
inline size_t WriteUtf16(uint16_t* data, size_t length) const;
|
2014-10-08 14:55:03 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
uint16_t buffer_[kBufferSize];
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Default constructor: no unbuffered stream, zero lengths, flag cleared.
// Initializers are listed in member declaration order.
Utf8DecoderBase::Utf8DecoderBase()
    : unbuffered_start_(NULL),
      unbuffered_length_(0),
      utf16_length_(0),
      last_byte_of_buffer_unused_(false) {
}
|
|
|
|
|
|
|
|
|
2015-02-05 07:54:24 +00:00
|
|
|
// Constructs the decoder by forwarding all four arguments to Reset().
// No member is initialized here directly.
Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
                                 const uint8_t* stream,
                                 size_t stream_length) {
  this->Reset(buffer, buffer_length, stream, stream_length);
}
|
|
|
|
|
|
|
|
|
2015-02-05 07:54:24 +00:00
|
|
|
template <size_t kBufferSize>
|
|
|
|
Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length)
|
2014-10-08 14:55:03 +00:00
|
|
|
: Utf8DecoderBase(buffer_, kBufferSize,
|
|
|
|
reinterpret_cast<const uint8_t*>(stream), length) {}
|
|
|
|
|
|
|
|
|
2015-02-05 07:54:24 +00:00
|
|
|
template <size_t kBufferSize>
|
|
|
|
void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) {
|
2014-10-08 14:55:03 +00:00
|
|
|
Utf8DecoderBase::Reset(buffer_, kBufferSize,
|
|
|
|
reinterpret_cast<const uint8_t*>(stream), length);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2015-02-05 07:54:24 +00:00
|
|
|
template <size_t kBufferSize>
|
|
|
|
size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
|
|
|
|
size_t length) const {
|
2014-10-08 14:55:03 +00:00
|
|
|
DCHECK(length > 0);
|
|
|
|
if (length > utf16_length_) length = utf16_length_;
|
|
|
|
// memcpy everything in buffer.
|
2015-02-05 07:54:24 +00:00
|
|
|
size_t buffer_length =
|
2014-10-08 14:55:03 +00:00
|
|
|
last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
|
2015-02-05 07:54:24 +00:00
|
|
|
size_t memcpy_length = length <= buffer_length ? length : buffer_length;
|
2014-10-08 14:55:03 +00:00
|
|
|
v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
|
|
|
|
if (length <= buffer_length) return length;
|
|
|
|
DCHECK(unbuffered_start_ != NULL);
|
|
|
|
// Copy the rest the slow way.
|
2015-07-06 11:00:05 +00:00
|
|
|
WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length,
|
2014-10-08 14:55:03 +00:00
|
|
|
length - buffer_length);
|
|
|
|
return length;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Constants and helpers for the Latin-1 character range.
class Latin1 {
 public:
  // Largest Latin-1 character code.
  static const unsigned kMaxChar = 0xff;

  // Returns 0 if the character does not convert to a single Latin-1
  // character, or if it does not convert back to Latin-1 via the inverse
  // operation (upper to lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
|
|
|
|
|
|
|
|
|
|
|
|
// Maps a character above Latin1::kMaxChar (DCHECKed) to its Latin-1
// counterpart, or returns 0 when there is none.
uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
  DCHECK(c > Latin1::kMaxChar);

  // These are equivalent characters in Unicode: U+039C and U+03BC both map
  // to U+00B5.
  if (c == 0x39c || c == 0x3bc) return 0xb5;

  // U+0178 is the uppercase of the Latin-1 character U+00FF, but lies
  // outside of Latin-1 itself.
  if (c == 0x178) return 0xff;

  return 0;
}
|
|
|
|
|
|
|
|
|
|
|
|
} // namespace unibrow
|
|
|
|
|
|
|
|
#endif // V8_UNICODE_DECODER_H_
|