v8/src/unicode-inl.h

// Copyright 2007-2010 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_UNICODE_INL_H_
#define V8_UNICODE_INL_H_

#include "src/unicode.h"
#include "src/base/logging.h"
#include "src/utils.h"

namespace unibrow {

// Returns the predicate value for |code_point|. The cache slot selected by
// (code_point & kMask) is consulted first; if it holds a different code point
// the value is computed (and the slot refreshed) by CalculateValue().
template <class T, int s>
bool Predicate<T, s>::get(uchar code_point) {
  CacheEntry cached = entries_[code_point & kMask];
  return cached.code_point() == code_point ? cached.value()
                                           : CalculateValue(code_point);
}

// Evaluates T::Is() for |code_point|, stores the answer in the cache slot
// selected by (code_point & kMask), replacing whatever was there, and returns
// the answer.
template <class T, int s>
bool Predicate<T, s>::CalculateValue(uchar code_point) {
  const bool is_match = T::Is(code_point);
  entries_[code_point & kMask] = CacheEntry(code_point, is_match);
  return is_match;
}

// Maps |c| (with |n| passed through to the converter) and writes the mapped
// characters to |result|. Returns the number of characters written.
//
// A cache hit is answered directly: a stored offset of zero is reported as
// 0 characters written, otherwise the single mapped character is c + offset.
// A miss is delegated to CalculateValue().
template <class T, int s>
int Mapping<T, s>::get(uchar c, uchar n, uchar* result) {
  CacheEntry cached = entries_[c & kMask];
  if (cached.code_point_ != c) return CalculateValue(c, n, result);
  if (cached.offset_ == 0) return 0;
  result[0] = c + cached.offset_;
  return 1;
}

// Runs T::Convert() for |c| and returns how many characters it produced in
// |result|. Unless the converter clears the caching flag, the outcome is also
// recorded in the cache slot selected by (c & kMask):
//   - exactly one output character: the offset (result[0] - c) is stored and
//     1 is returned;
//   - any other length: a zero offset is stored and 0 is returned.
template <class T, int s>
int Mapping<T, s>::CalculateValue(uchar c, uchar n, uchar* result) {
  bool cacheable = true;
  int length = T::Convert(c, n, result, &cacheable);

  // The converter vetoed caching: report its length untouched.
  if (!cacheable) return length;

  if (length == 1) {
    entries_[c & kMask] = CacheEntry(c, result[0] - c);
    return 1;
  }
  entries_[c & kMask] = CacheEntry(c, 0);
  return 0;
}


// Writes the UTF-8 encoding of the 8-bit character |c| to |str| and returns
// the number of bytes written (1 or 2).
unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
  // Clears bit 6 only; bit 7 is always forced on by the 0x80 OR below, so the
  // continuation byte ends up as 10xxxxxx.
  static const int kClearBit6 = ~(1 << 6);
  if (c > kMaxOneByteChar) {
    str[0] = 0xC0 | (c >> 6);
    str[1] = 0x80 | (c & kClearBit6);
    return 2;
  }
  str[0] = c;
  return 1;
}

// Writes the UTF-8 encoding of the UTF-16 code unit |c| to |str| and returns
// the number of bytes written. |previous| is the code unit that preceded |c|;
// when the two form a surrogate pair they are merged into a single code point.
// If |replace_invalid| is true, an orphan surrogate code unit is encoded as
// kBadChar instead.
unsigned Utf8::Encode(char* str, uchar c, int previous, bool replace_invalid) {
  // Clears bit 6 only. Each continuation byte is OR'ed with 0x80, and bits
  // above the low byte are dropped by the narrowing store to char, so the
  // stored byte has the form 10xxxxxx.
  static const int kClearBit6 = ~(1 << 6);

  if (c <= kMaxOneByteChar) {
    str[0] = c;
    return 1;
  }

  if (c <= kMaxTwoByteChar) {
    str[0] = 0xC0 | (c >> 6);
    str[1] = 0x80 | (c & kClearBit6);
    return 2;
  }

  if (c > kMaxThreeByteChar) {
    str[0] = 0xF0 | (c >> 18);
    str[1] = 0x80 | ((c >> 12) & kClearBit6);
    str[2] = 0x80 | ((c >> 6) & kClearBit6);
    str[3] = 0x80 | (c & kClearBit6);
    return 4;
  }

  // Three-byte range, which is where surrogate code units fall.
  if (Utf16::IsSurrogatePair(previous, c)) {
    // Re-encode the combined code point starting kUnmatchedSize bytes before
    // |str| (presumably over the already-emitted lone lead surrogate --
    // confirm against callers) and report only the net number of extra bytes.
    const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
    return Encode(str - kUnmatchedSize,
                  Utf16::CombineSurrogatePair(previous, c),
                  Utf16::kNoPreviousCharacter,
                  replace_invalid) - kUnmatchedSize;
  }

  const bool is_surrogate =
      Utf16::IsLeadSurrogate(c) || Utf16::IsTrailSurrogate(c);
  if (replace_invalid && is_surrogate) {
    c = kBadChar;
  }
  str[0] = 0xE0 | (c >> 12);
  str[1] = 0x80 | ((c >> 6) & kClearBit6);
  str[2] = 0x80 | (c & kClearBit6);
  return 3;
}


// Decodes the character at the start of |bytes| (|length| bytes available).
// Returns kBadChar without touching |cursor| when |length| is zero. A leading
// byte in the one-byte range is returned directly and |cursor| is advanced by
// one; anything else is handed to CalculateValue().
uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) {
  if (length == 0) return kBadChar;
  const byte lead = bytes[0];
  // Anything above the one-byte range takes the slow path.
  if (lead > kMaxOneByteChar) return CalculateValue(bytes, length, cursor);
  // Characters between 0000 and 007F are encoded as a single byte.
  *cursor += 1;
  return lead;
}

// Returns the number of UTF-8 bytes that |c| contributes, given that
// |previous| is the code unit before it. A trail surrogate that follows a
// lead surrogate contributes only the difference between the size of an
// unmatched surrogate and the bytes saved by combining the pair.
unsigned Utf8::Length(uchar c, int previous) {
  if (c <= kMaxOneByteChar) return 1;
  if (c <= kMaxTwoByteChar) return 2;
  if (c > kMaxThreeByteChar) return 4;

  const bool completes_pair =
      Utf16::IsTrailSurrogate(c) && Utf16::IsLeadSurrogate(previous);
  if (completes_pair) {
    return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
  }
  return 3;
}

}  // namespace unibrow

#endif  // V8_UNICODE_INL_H_
Updated unicode library. Added Nl category to letters predicate (as required for JS identifiers). Changed/simplified representation of canonicalization ranges. Truncated tables to code points in the BMP (all that is used by JS). Reformatted tables to avoid excessively long lines. Removed duplicate entries from multi-character mapping result tables. Review URL: http://codereview.chromium.org/3030026 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@5155 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2010-07-30 07:10:22 +00:00			`// Copyright 2007-2010 the V8 project authors. All rights reserved.`
Bulk update of Google copyright headers in source files. R=svenpanne@chromium.org Review URL: https://codereview.chromium.org/259183002 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@21035 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2014-04-29 06:42:26 +00:00			`// Use of this source code is governed by a BSD-style license that can be`
			`// found in the LICENSE file.`
Initial export. git-svn-id: http://v8.googlecode.com/svn/trunk@2 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2008-07-03 15:10:15 +00:00
Cleanup include guards: - Fix some typos / guards that didn't match the filename. - Fix some style inconsistencies. - Add guards to files that were missing them. - Add the directory name to the guard. Review URL: http://codereview.chromium.org/99343 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1845 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2009-05-04 13:36:43 +00:00			`#ifndef V8_UNICODE_INL_H_`
			`#define V8_UNICODE_INL_H_`
Initial export. git-svn-id: http://v8.googlecode.com/svn/trunk@2 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2008-07-03 15:10:15 +00:00
Use full include paths everywhere - this avoids using relative include paths which are forbidden by the style guide - makes the code more readable since it's clear which header is meant - allows for starting to use checkdeps BUG=none R=jkummerow@chromium.org, danno@chromium.org LOG=n Review URL: https://codereview.chromium.org/304153016 git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@21625 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2014-06-03 08:12:43 +00:00			`#include "src/unicode.h"`
Move platform abstraction to base library Also split v8-core independent methods from checks.h to base/logging.h and merge v8checks with the rest of checks. The CPU::FlushICache method is moved to CpuFeatures::FlushICache RoundUp and related methods are moved to base/macros.h Remove all layering violations from src/libplatform BUG=none R=jkummerow@chromium.org LOG=n Review URL: https://codereview.chromium.org/358363002 git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@22092 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2014-06-30 13:25:46 +00:00			`#include "src/base/logging.h"`
Use full include paths everywhere - this avoids using relative include paths which are forbidden by the style guide - makes the code more readable since it's clear which header is meant - allows for starting to use checkdeps BUG=none R=jkummerow@chromium.org, danno@chromium.org LOG=n Review URL: https://codereview.chromium.org/304153016 git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@21625 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2014-06-03 08:12:43 +00:00			`#include "src/utils.h"`
Initial export. git-svn-id: http://v8.googlecode.com/svn/trunk@2 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2008-07-03 15:10:15 +00:00
			`namespace unibrow {`

			`template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {`
			`CacheEntry entry = entries_[code_point & kMask];`
Replace C++ bitfields with our own BitFields Shave this yak from orbit, it's the only way to be sure. BUG=chromium:427616 LOG=n R=svenpanne@chromium.org Review URL: https://codereview.chromium.org/700963002 Cr-Commit-Position: refs/heads/master@{#25148} git-svn-id: https://v8.googlecode.com/svn/branches/bleeding_edge@25148 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2014-11-05 12:40:56 +00:00			`if (entry.code_point() == code_point) return entry.value();`
Initial export. git-svn-id: http://v8.googlecode.com/svn/trunk@2 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2008-07-03 15:10:15 +00:00			`return CalculateValue(code_point);`
			`}`

			`template <class T, int s> bool Predicate<T, s>::CalculateValue(`
			`uchar code_point) {`
			`bool result = T::Is(code_point);`
			`entries_[code_point & kMask] = CacheEntry(code_point, result);`
			`return result;`
			`}`

			`template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n,`
			`uchar* result) {`
			`CacheEntry entry = entries_[c & kMask];`
			`if (entry.code_point_ == c) {`
			`if (entry.offset_ == 0) {`
			`return 0;`
			`} else {`
			`result[0] = c + entry.offset_;`
			`return 1;`
			`}`
			`} else {`
			`return CalculateValue(c, n, result);`
			`}`
			`}`

			`template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,`
			`uchar* result) {`
			`bool allow_caching = true;`
			`int length = T::Convert(c, n, result, &allow_caching);`
			`if (allow_caching) {`
			`if (length == 1) {`
			`entries_[c & kMask] = CacheEntry(c, result[0] - c);`
			`return 1;`
			`} else {`
			`entries_[c & kMask] = CacheEntry(c, 0);`
			`return 0;`
			`}`
			`} else {`
			`return length;`
			`}`
			`}`


Some Utf8Length microoptimizations R=yangguo@chromium.org BUG= Review URL: https://codereview.chromium.org/12783002 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13938 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2013-03-13 19:43:45 +00:00			`unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {`
			`static const int kMask = ~(1 << 6);`
			`if (c <= kMaxOneByteChar) {`
			`str[0] = c;`
			`return 1;`
			`}`
			`str[0] = 0xC0 \| (c >> 6);`
			`str[1] = 0x80 \| (c & kMask);`
			`return 2;`
			`}`

String:WriteUtf8: Add REPLACE_INVALID_UTF8 option This patch makes String::WriteUtf8 replace invalid code points (i.e. unmatched surrogates) with the unicode replacement character when REPLACE_INVALID_UTF8 is set. This is done to avoid creating invalid UTF-8 output which can lead to compatibility issues with software requiring valid UTF-8 inputs (e.g. the WebSocket protocol requires valid UTF-8 and terminates connections when invalid UTF-8 is encountered). R=dcarney@chromium.org BUG= Review URL: https://codereview.chromium.org/121173009 Patch from Felix Geisendörfer <haimuiba@gmail.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@18683 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2014-01-20 09:52:54 +00:00			`// Encode encodes the UTF-16 code units c and previous into the given str`
			`// buffer, and combines surrogate code units into single code points. If`
			`// replace_invalid is set to true, orphan surrogate code units will be replaced`
			`// with kBadChar.`
			`unsigned Utf8::Encode(char* str,`
			`uchar c,`
			`int previous,`
			`bool replace_invalid) {`
Initial export. git-svn-id: http://v8.googlecode.com/svn/trunk@2 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2008-07-03 15:10:15 +00:00			`static const int kMask = ~(1 << 6);`
			`if (c <= kMaxOneByteChar) {`
			`str[0] = c;`
			`return 1;`
			`} else if (c <= kMaxTwoByteChar) {`
			`str[0] = 0xC0 \| (c >> 6);`
			`str[1] = 0x80 \| (c & kMask);`
			`return 2;`
			`} else if (c <= kMaxThreeByteChar) {`
String:WriteUtf8: Add REPLACE_INVALID_UTF8 option This patch makes String::WriteUtf8 replace invalid code points (i.e. unmatched surrogates) with the unicode replacement character when REPLACE_INVALID_UTF8 is set. This is done to avoid creating invalid UTF-8 output which can lead to compatibility issues with software requiring valid UTF-8 inputs (e.g. the WebSocket protocol requires valid UTF-8 and terminates connections when invalid UTF-8 is encountered). R=dcarney@chromium.org BUG= Review URL: https://codereview.chromium.org/121173009 Patch from Felix Geisendörfer <haimuiba@gmail.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@18683 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2014-01-20 09:52:54 +00:00			`if (Utf16::IsSurrogatePair(previous, c)) {`
Fix input and output to handle UTF16 surrogate pairs. Review URL: https://chromiumcodereview.appspot.com/9600009 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11007 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2012-03-12 12:35:28 +00:00			`const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;`
			`return Encode(str - kUnmatchedSize,`
			`Utf16::CombineSurrogatePair(previous, c),`
String:WriteUtf8: Add REPLACE_INVALID_UTF8 option This patch makes String::WriteUtf8 replace invalid code points (i.e. unmatched surrogates) with the unicode replacement character when REPLACE_INVALID_UTF8 is set. This is done to avoid creating invalid UTF-8 output which can lead to compatibility issues with software requiring valid UTF-8 inputs (e.g. the WebSocket protocol requires valid UTF-8 and terminates connections when invalid UTF-8 is encountered). R=dcarney@chromium.org BUG= Review URL: https://codereview.chromium.org/121173009 Patch from Felix Geisendörfer <haimuiba@gmail.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@18683 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2014-01-20 09:52:54 +00:00			`Utf16::kNoPreviousCharacter,`
			`replace_invalid) - kUnmatchedSize;`
			`} else if (replace_invalid &&`
			`(Utf16::IsLeadSurrogate(c) \|\|`
			`Utf16::IsTrailSurrogate(c))) {`
			`c = kBadChar;`
Fix input and output to handle UTF16 surrogate pairs. Review URL: https://chromiumcodereview.appspot.com/9600009 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11007 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2012-03-12 12:35:28 +00:00			`}`
Initial export. git-svn-id: http://v8.googlecode.com/svn/trunk@2 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2008-07-03 15:10:15 +00:00			`str[0] = 0xE0 \| (c >> 12);`
			`str[1] = 0x80 \| ((c >> 6) & kMask);`
			`str[2] = 0x80 \| (c & kMask);`
			`return 3;`
			`} else {`
			`str[0] = 0xF0 \| (c >> 18);`
			`str[1] = 0x80 \| ((c >> 12) & kMask);`
			`str[2] = 0x80 \| ((c >> 6) & kMask);`
			`str[3] = 0x80 \| (c & kMask);`
			`return 4;`
			`}`
			`}`


Scanner / Unicode decoding: use size_t instead of unsigned. size_t is the correct data type for this purpose. Our APIs (in particular ExternalSourceStream::GetMoreData) are already using it, and there were some static_casts to convert between them. This CL doesn't intend to fix all of V8, just the minimal sense-making part around scanner character streams. BUG= Review URL: https://codereview.chromium.org/864273005 Cr-Commit-Position: refs/heads/master@{#26449} 2015-02-05 07:54:24 +00:00			`uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) {`
Initial export. git-svn-id: http://v8.googlecode.com/svn/trunk@2 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2008-07-03 15:10:15 +00:00			`if (length <= 0) return kBadChar;`
			`byte first = bytes[0];`
			`// Characters between 0000 and 0007F are encoded as a single character`
			`if (first <= kMaxOneByteChar) {`
			`*cursor += 1;`
			`return first;`
			`}`
			`return CalculateValue(bytes, length, cursor);`
			`}`

Fix input and output to handle UTF16 surrogate pairs. Review URL: https://chromiumcodereview.appspot.com/9600009 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11007 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2012-03-12 12:35:28 +00:00			`unsigned Utf8::Length(uchar c, int previous) {`
Initial export. git-svn-id: http://v8.googlecode.com/svn/trunk@2 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2008-07-03 15:10:15 +00:00			`if (c <= kMaxOneByteChar) {`
			`return 1;`
			`} else if (c <= kMaxTwoByteChar) {`
			`return 2;`
			`} else if (c <= kMaxThreeByteChar) {`
Fix input and output to handle UTF16 surrogate pairs. Review URL: https://chromiumcodereview.appspot.com/9600009 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11007 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2012-03-12 12:35:28 +00:00			`if (Utf16::IsTrailSurrogate(c) &&`
			`Utf16::IsLeadSurrogate(previous)) {`
			`return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;`
			`}`
Initial export. git-svn-id: http://v8.googlecode.com/svn/trunk@2 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2008-07-03 15:10:15 +00:00			`return 3;`
			`} else {`
			`return 4;`
			`}`
			`}`

			`} // namespace unibrow`

Cleanup include guards: - Fix some typos / guards that didn't match the filename. - Fix some style inconsistencies. - Add guards to files that were missing them. - Add the directory name to the guard. Review URL: http://codereview.chromium.org/99343 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1845 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 2009-05-04 13:36:43 +00:00			`#endif // V8_UNICODE_INL_H_`