Fix input and output to handle UTF16 surrogate pairs.

Review URL: https://chromiumcodereview.appspot.com/9600009 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11007 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2012-03-12 12:35:28 +00:00 · 2012-03-12 12:35:28 +00:00 · 03cfc4363b
commit 03cfc4363b
parent cd91894d2f
30 changed files with 957 additions and 305 deletions
--- a/src/api.cc
+++ b/src/api.cc
@ -1430,7 +1430,7 @@ void ObjectTemplate::SetInternalFieldCount(int value) {


 ScriptData* ScriptData::PreCompile(const char* input, int length) {
-  i::Utf8ToUC16CharacterStream stream(
+  i::Utf8ToUtf16CharacterStream stream(
      reinterpret_cast<const unsigned char*>(input), length);
  return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
 }
@ -1439,11 +1439,11 @@ ScriptData* ScriptData::PreCompile(const char* input, int length) {
 ScriptData* ScriptData::PreCompile(v8::Handle<String> source) {
  i::Handle<i::String> str = Utils::OpenHandle(*source);
  if (str->IsExternalTwoByteString()) {
-    i::ExternalTwoByteStringUC16CharacterStream stream(
+    i::ExternalTwoByteStringUtf16CharacterStream stream(
      i::Handle<i::ExternalTwoByteString>::cast(str), 0, str->length());
    return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
  } else {
-    i::GenericStringUC16CharacterStream stream(str, 0, str->length());
+    i::GenericStringUtf16CharacterStream stream(str, 0, str->length());
    return i::ParserApi::PreParse(&stream, NULL, i::FLAG_harmony_scoping);
  }
 }
@ -3690,7 +3690,7 @@ int String::Length() const {
 int String::Utf8Length() const {
  i::Handle<i::String> str = Utils::OpenHandle(this);
  if (IsDeadCheck(str->GetIsolate(), "v8::String::Utf8Length()")) return 0;
-  return str->Utf8Length();
+  return i::Utf8Length(str);
 }


@ -3736,11 +3736,13 @@ int String::WriteUtf8(char* buffer,
  int i;
  int pos = 0;
  int nchars = 0;
+  int previous = unibrow::Utf16::kNoPreviousCharacter;
  for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
    i::uc32 c = write_input_buffer.GetNext();
-    int written = unibrow::Utf8::Encode(buffer + pos, c);
+    int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
    pos += written;
    nchars++;
+    previous = c;
  }
  if (i < len) {
    // For the last characters we need to check the length for each one
@ -3749,16 +3751,33 @@ int String::WriteUtf8(char* buffer,
    char intermediate[unibrow::Utf8::kMaxEncodedSize];
    for (; i < len && pos < capacity; i++) {
      i::uc32 c = write_input_buffer.GetNext();
-      int written = unibrow::Utf8::Encode(intermediate, c);
-      if (pos + written <= capacity) {
-        for (int j = 0; j < written; j++)
-          buffer[pos + j] = intermediate[j];
+      if (unibrow::Utf16::IsTrailSurrogate(c) &&
+          unibrow::Utf16::IsLeadSurrogate(previous)) {
+        // We can't use the intermediate buffer here because the encoding
+        // of surrogate pairs is done under assumption that you can step
+        // back and fix the UTF8 stream.  Luckily we only need space for one
+        // more byte, so there is always space.
+        ASSERT(pos < capacity);
+        int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
+        ASSERT(written == 1);
        pos += written;
        nchars++;
      } else {
-        // We've reached the end of the buffer
-        break;
+        int written =
+            unibrow::Utf8::Encode(intermediate,
+                                  c,
+                                  unibrow::Utf16::kNoPreviousCharacter);
+        if (pos + written <= capacity) {
+          for (int j = 0; j < written; j++)
+            buffer[pos + j] = intermediate[j];
+          pos += written;
+          nchars++;
+        } else {
+          // We've reached the end of the buffer
+          break;
+        }
      }
+      previous = c;
    }
  }
  if (nchars_ref != NULL) *nchars_ref = nchars;
@ -5240,7 +5259,8 @@ String::Utf8Value::Utf8Value(v8::Handle<v8::Value> obj)
  TryCatch try_catch;
  Handle<String> str = obj->ToString();
  if (str.IsEmpty()) return;
-  length_ = str->Utf8Length();
+  i::Handle<i::String> i_str = Utils::OpenHandle(*str);
+  length_ = i::Utf8Length(i_str);
  str_ = i::NewArray<char>(length_ + 1);
  str->WriteUtf8(str_);
 }
--- a/src/arm/regexp-macro-assembler-arm.cc
+++ b/src/arm/regexp-macro-assembler-arm.cc
@ -472,7 +472,7 @@ void RegExpMacroAssemblerARM::CheckNotCharacterAfterMinusAnd(
    uc16 minus,
    uc16 mask,
    Label* on_not_equal) {
-  ASSERT(minus < String::kMaxUC16CharCode);
+  ASSERT(minus < String::kMaxUtf16CodeUnit);
  __ sub(r0, current_character(), Operand(minus));
  __ and_(r0, r0, Operand(mask));
  __ cmp(r0, Operand(c));
--- a/src/debug-agent.cc
+++ b/src/debug-agent.cc
@ -372,8 +372,11 @@ bool DebuggerAgentUtil::SendMessage(const Socket* conn,

  // Calculate the message size in UTF-8 encoding.
  int utf8_len = 0;
+  int previous = unibrow::Utf16::kNoPreviousCharacter;
  for (int i = 0; i < message.length(); i++) {
-    utf8_len += unibrow::Utf8::Length(message[i]);
+    uint16_t character = message[i];
+    utf8_len += unibrow::Utf8::Length(character, previous);
+    previous = character;
  }

  // Send the header.
@ -388,17 +391,33 @@ bool DebuggerAgentUtil::SendMessage(const Socket* conn,

  // Send message body as UTF-8.
  int buffer_position = 0;  // Current buffer position.
+  previous = unibrow::Utf16::kNoPreviousCharacter;
  for (int i = 0; i < message.length(); i++) {
    // Write next UTF-8 encoded character to buffer.
+    uint16_t character = message[i];
    buffer_position +=
-        unibrow::Utf8::Encode(buffer + buffer_position, message[i]);
+        unibrow::Utf8::Encode(buffer + buffer_position, character, previous);
    ASSERT(buffer_position < kBufferSize);

    // Send buffer if full or last character is encoded.
-    if (kBufferSize - buffer_position < 3 || i == message.length() - 1) {
-      conn->Send(buffer, buffer_position);
-      buffer_position = 0;
+    if (kBufferSize - buffer_position <
+          unibrow::Utf16::kMaxExtraUtf8BytesForOneUtf16CodeUnit ||
+        i == message.length() - 1) {
+      if (unibrow::Utf16::IsLeadSurrogate(character)) {
+        const int kEncodedSurrogateLength =
+            unibrow::Utf16::kUtf8BytesToCodeASurrogate;
+        ASSERT(buffer_position >= kEncodedSurrogateLength);
+        conn->Send(buffer, buffer_position - kEncodedSurrogateLength);
+        for (int i = 0; i < kEncodedSurrogateLength; i++) {
+          buffer[i] = buffer[buffer_position + i];
+        }
+        buffer_position = kEncodedSurrogateLength;
+      } else {
+        conn->Send(buffer, buffer_position);
+        buffer_position = 0;
+      }
    }
+    previous = character;
  }

  return true;
--- a/src/globals.h
+++ b/src/globals.h
@ -267,8 +267,9 @@ const int kBinary32ExponentShift = 23;
 // other bits set.
 const uint64_t kQuietNaNMask = static_cast<uint64_t>(0xfff) << 51;

-// ASCII/UC16 constants
+// ASCII/UTF-16 constants
 // Code-point values in Unicode 4.0 are 21 bits wide.
+// Code units in UTF-16 are 16 bits wide.
 typedef uint16_t uc16;
 typedef int32_t uc32;
 const int kASCIISize    = kCharSize;
--- a/src/handles.cc
+++ b/src/handles.cc
@ -800,4 +800,162 @@ Handle<ObjectHashTable> PutIntoObjectHashTable(Handle<ObjectHashTable> table,
 }


+// This method determines the type of string involved and then gets the UTF8
+// length of the string.  It doesn't flatten the string and has log(n) recursion
+// for a string of length n.  If the failure flag gets set, then we have to
+// flatten the string and retry.  Failures are caused by surrogate pairs in deep
+// cons strings.
+
+// Single surrogate characters that are encountered in the UTF-16 character
+// sequence of the input string get counted as 3 UTF-8 bytes, because that
+// is the way that WriteUtf8 will encode them.  Surrogate pairs are counted and
+// encoded as one 4-byte UTF-8 sequence.
+
+// This function conceptually uses recursion on the two halves of cons strings.
+// However, in order to avoid the recursion going too deep it recurses on the
+// second string of the cons, but iterates on the first substring (by manually
+// eliminating it as a tail recursion).  This means it counts the UTF-8 length
+// from the end to the start, which makes no difference to the total.
+
+// Surrogate pairs are recognized even if they are split across two sides of a
+// cons, which complicates the implementation somewhat.  Therefore, too deep
+// recursion cannot always be avoided.  This case is detected, and the failure
+// flag is set, a signal to the caller that the string should be flattened and
+// the operation retried.
+int Utf8LengthHelper(String* input,
+                     int from,
+                     int to,
+                     bool followed_by_surrogate,
+                     int max_recursion,
+                     bool* failure,
+                     bool* starts_with_surrogate) {
+  if (from == to) return 0;
+  int total = 0;
+  bool dummy;
+  while (true) {
+    if (input->IsAsciiRepresentation()) {
+      *starts_with_surrogate = false;
+      return total + to - from;
+    }
+    switch (StringShape(input).representation_tag()) {
+      case kConsStringTag: {
+        ConsString* str = ConsString::cast(input);
+        String* first = str->first();
+        String* second = str->second();
+        int first_length = first->length();
+        if (first_length - from > to - first_length) {
+          if (first_length < to) {
+            // Right hand side is shorter.  No need to check the recursion depth
+            // since this can only happen log(n) times.
+            bool right_starts_with_surrogate = false;
+            total += Utf8LengthHelper(second,
+                                      0,
+                                      to - first_length,
+                                      followed_by_surrogate,
+                                      max_recursion - 1,
+                                      failure,
+                                      &right_starts_with_surrogate);
+            if (*failure) return 0;
+            followed_by_surrogate = right_starts_with_surrogate;
+            input = first;
+            to = first_length;
+          } else {
+            // We only need the left hand side.
+            input = first;
+          }
+        } else {
+          if (first_length > from) {
+            // Left hand side is shorter.
+            if (first->IsAsciiRepresentation()) {
+              total += first_length - from;
+              *starts_with_surrogate = false;
+              starts_with_surrogate = &dummy;
+              input = second;
+              from = 0;
+              to -= first_length;
+            } else if (second->IsAsciiRepresentation()) {
+              followed_by_surrogate = false;
+              total += to - first_length;
+              input = first;
+              to = first_length;
+            } else if (max_recursion > 0) {
+              bool right_starts_with_surrogate = false;
+              // Recursing on the long one.  This may fail.
+              total += Utf8LengthHelper(second,
+                                        0,
+                                        to - first_length,
+                                        followed_by_surrogate,
+                                        max_recursion - 1,
+                                        failure,
+                                        &right_starts_with_surrogate);
+              if (*failure) return 0;
+              input = first;
+              to = first_length;
+              followed_by_surrogate = right_starts_with_surrogate;
+            } else {
+              *failure = true;
+              return 0;
+            }
+          } else {
+            // We only need the right hand side.
+            input = second;
+            from = 0;
+            to -= first_length;
+          }
+        }
+        continue;
+      }
+      case kExternalStringTag:
+      case kSeqStringTag: {
+        Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
+        const uc16* p = vector.start();
+        int previous = unibrow::Utf16::kNoPreviousCharacter;
+        for (int i = from; i < to; i++) {
+          uc16 c = p[i];
+          total += unibrow::Utf8::Length(c, previous);
+          previous = c;
+        }
+        if (to - from > 0) {
+          if (unibrow::Utf16::IsLeadSurrogate(previous) &&
+              followed_by_surrogate) {
+            total -= unibrow::Utf8::kBytesSavedByCombiningSurrogates;
+          }
+          if (unibrow::Utf16::IsTrailSurrogate(p[from])) {
+            *starts_with_surrogate = true;
+          }
+        }
+        return total;
+      }
+      case kSlicedStringTag: {
+        SlicedString* str = SlicedString::cast(input);
+        int offset = str->offset();
+        input = str->parent();
+        from += offset;
+        to += offset;
+        continue;
+      }
+      default:
+        break;
+    }
+    UNREACHABLE();
+    return 0;
+  }
+  return 0;
+}
+
+
+int Utf8Length(Handle<String> str) {
+  bool dummy;
+  bool failure;
+  int len;
+  const int kRecursionBudget = 100;
+  do {
+    failure = false;
+    len = Utf8LengthHelper(
+        *str, 0, str->length(), false, kRecursionBudget, &failure, &dummy);
+    if (failure) FlattenString(str);
+  } while (failure);
+  return len;
+}
+
 } }  // namespace v8::internal
--- a/src/handles.h
+++ b/src/handles.h
@ -174,6 +174,8 @@ void FlattenString(Handle<String> str);
 // string.
 Handle<String> FlattenGetString(Handle<String> str);

+int Utf8Length(Handle<String> str);
+
 Handle<Object> SetProperty(Handle<Object> object,
                           Handle<Object> key,
                           Handle<Object> value,
--- a/src/heap.cc
+++ b/src/heap.cc
@ -4186,8 +4186,6 @@ MaybeObject* Heap::AllocateStringFromAscii(Vector<const char> string,

 MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
                                              PretenureFlag pretenure) {
-  // V8 only supports characters in the Basic Multilingual Plane.
-  const uc32 kMaxSupportedChar = 0xFFFF;
  // Count the number of characters in the UTF-8 string and check if
  // it is an ASCII string.
  Access<UnicodeCache::Utf8Decoder>
@ -4195,8 +4193,12 @@ MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
  decoder->Reset(string.start(), string.length());
  int chars = 0;
  while (decoder->has_more()) {
-    decoder->GetNext();
-    chars++;
+    uint32_t r = decoder->GetNext();
+    if (r <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      chars++;
+    } else {
+      chars += 2;
+    }
  }

  Object* result;
@ -4207,10 +4209,15 @@ MaybeObject* Heap::AllocateStringFromUtf8Slow(Vector<const char> string,
  // Convert and copy the characters into the new object.
  String* string_result = String::cast(result);
  decoder->Reset(string.start(), string.length());
-  for (int i = 0; i < chars; i++) {
-    uc32 r = decoder->GetNext();
-    if (r > kMaxSupportedChar) { r = unibrow::Utf8::kBadChar; }
-    string_result->Set(i, r);
+  int i = 0;
+  while (i < chars) {
+    uint32_t r = decoder->GetNext();
+    if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      string_result->Set(i++, unibrow::Utf16::LeadSurrogate(r));
+      string_result->Set(i++, unibrow::Utf16::TrailSurrogate(r));
+    } else {
+      string_result->Set(i++, r);
+    }
  }
  return result;
 }
@ -4267,7 +4274,7 @@ MaybeObject* Heap::AllocateInternalSymbol(unibrow::CharacterStream* buffer,
                                          uint32_t hash_field) {
  ASSERT(chars >= 0);
  // Ensure the chars matches the number of characters in the buffer.
-  ASSERT(static_cast<unsigned>(chars) == buffer->Length());
+  ASSERT(static_cast<unsigned>(chars) == buffer->Utf16Length());
  // Determine whether the string is ASCII.
  bool is_ascii = true;
  while (buffer->has_more()) {
@ -4313,8 +4320,15 @@ MaybeObject* Heap::AllocateInternalSymbol(unibrow::CharacterStream* buffer,
  ASSERT_EQ(size, answer->Size());

  // Fill in the characters.
-  for (int i = 0; i < chars; i++) {
-    answer->Set(i, buffer->GetNext());
+  int i = 0;
+  while (i < chars) {
+    uint32_t character = buffer->GetNext();
+    if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      answer->Set(i++, unibrow::Utf16::LeadSurrogate(character));
+      answer->Set(i++, unibrow::Utf16::TrailSurrogate(character));
+    } else {
+      answer->Set(i++, character);
+    }
  }
  return answer;
 }
--- a/src/hydrogen-instructions.h
+++ b/src/hydrogen-instructions.h
@ -4284,7 +4284,7 @@ class HStringCharCodeAt: public HTemplateInstruction<3> {
  virtual bool DataEquals(HValue* other) { return true; }

  virtual Range* InferRange(Zone* zone) {
-    return new(zone) Range(0, String::kMaxUC16CharCode);
+    return new(zone) Range(0, String::kMaxUtf16CodeUnit);
  }
 };

--- a/src/ia32/regexp-macro-assembler-ia32.cc
+++ b/src/ia32/regexp-macro-assembler-ia32.cc
@ -523,7 +523,7 @@ void RegExpMacroAssemblerIA32::CheckNotCharacterAfterMinusAnd(
    uc16 minus,
    uc16 mask,
    Label* on_not_equal) {
-  ASSERT(minus < String::kMaxUC16CharCode);
+  ASSERT(minus < String::kMaxUtf16CodeUnit);
  __ lea(eax, Operand(current_character(), -minus));
  __ and_(eax, mask);
  __ cmp(eax, c);
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@ -1444,7 +1444,7 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
  if (ascii) {
    char_mask = String::kMaxAsciiCharCode;
  } else {
-    char_mask = String::kMaxUC16CharCode;
+    char_mask = String::kMaxUtf16CodeUnit;
  }
  uc16 exor = c1 ^ c2;
  // Check whether exor has only one bit set.
@ -1546,7 +1546,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
  if (ascii) {
    max_char = String::kMaxAsciiCharCode;
  } else {
-    max_char = String::kMaxUC16CharCode;
+    max_char = String::kMaxUtf16CodeUnit;
  }

  Label success;
@ -1642,7 +1642,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
        macro_assembler->CheckCharacterLT(from, on_failure);
      }
    }
-    if (to != String::kMaxUC16CharCode) {
+    if (to != String::kMaxUtf16CodeUnit) {
      if (cc->is_negated()) {
        macro_assembler->CheckCharacterLT(to + 1, on_failure);
      } else {
@ -1835,7 +1835,7 @@ bool QuickCheckDetails::Rationalize(bool asc) {
  if (asc) {
    char_mask = String::kMaxAsciiCharCode;
  } else {
-    char_mask = String::kMaxUC16CharCode;
+    char_mask = String::kMaxUtf16CodeUnit;
  }
  mask_ = 0;
  value_ = 0;
@ -1887,7 +1887,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
    if (compiler->ascii()) {
      char_mask = String::kMaxAsciiCharCode;
    } else {
-      char_mask = String::kMaxUC16CharCode;
+      char_mask = String::kMaxUtf16CodeUnit;
    }
    if ((mask & char_mask) == char_mask) need_mask = false;
    mask &= char_mask;
@ -1939,7 +1939,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
  if (compiler->ascii()) {
    char_mask = String::kMaxAsciiCharCode;
  } else {
-    char_mask = String::kMaxUC16CharCode;
+    char_mask = String::kMaxUtf16CodeUnit;
  }
  for (int k = 0; k < elms_->length(); k++) {
    TextElement elm = elms_->at(k);
@ -4079,7 +4079,7 @@ static void AddClassNegated(const uc16 *elmv,
                            int elmc,
                            ZoneList<CharacterRange>* ranges) {
  ASSERT(elmv[0] != 0x0000);
-  ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
+  ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
  uc16 last = 0x0000;
  for (int i = 0; i < elmc; i += 2) {
    ASSERT(last <= elmv[i] - 1);
@ -4087,7 +4087,7 @@ static void AddClassNegated(const uc16 *elmv,
    ranges->Add(CharacterRange(last, elmv[i] - 1));
    last = elmv[i + 1] + 1;
  }
-  ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
+  ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
 }


@ -4633,8 +4633,8 @@ void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
    from = range.to();
    i++;
  }
-  if (from < String::kMaxUC16CharCode) {
-    negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
+  if (from < String::kMaxUtf16CodeUnit) {
+    negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit));
  }
 }

@ -4797,7 +4797,7 @@ void DispatchTable::AddRange(CharacterRange full_range, int value) {
      entry->AddValue(value);
      // Bail out if the last interval ended at 0xFFFF since otherwise
      // adding 1 will wrap around to 0.
-      if (entry->to() == String::kMaxUC16CharCode)
+      if (entry->to() == String::kMaxUtf16CodeUnit)
        break;
      ASSERT(entry->to() + 1 > current.from());
      current.set_from(entry->to() + 1);
@ -5117,7 +5117,7 @@ int TextNode::ComputeFirstCharacterSet(int budget) {
        int new_length = length + 1;
        if (length > 0) {
          if (ranges->at(0).from() == 0) new_length--;
-          if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
+          if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
            new_length--;
          }
        }
@ -5207,14 +5207,14 @@ void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
    if (last < range.from())
      AddRange(CharacterRange(last, range.from() - 1));
    if (range.to() >= last) {
-      if (range.to() == String::kMaxUC16CharCode) {
+      if (range.to() == String::kMaxUtf16CodeUnit) {
        return;
      } else {
        last = range.to() + 1;
      }
    }
  }
-  AddRange(CharacterRange(last, String::kMaxUC16CharCode));
+  AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
 }


--- a/src/log.cc
+++ b/src/log.cc
@ -461,18 +461,20 @@ class Logger::NameBuffer {
      utf8_pos_ += utf8_length;
      return;
    }
-    int uc16_length = Min(str->length(), kUc16BufferSize);
-    String::WriteToFlat(str, uc16_buffer_, 0, uc16_length);
+    int uc16_length = Min(str->length(), kUtf16BufferSize);
+    String::WriteToFlat(str, utf16_buffer, 0, uc16_length);
+    int previous = unibrow::Utf16::kNoPreviousCharacter;
    for (int i = 0; i < uc16_length && utf8_pos_ < kUtf8BufferSize; ++i) {
-      uc16 c = uc16_buffer_[i];
+      uc16 c = utf16_buffer[i];
      if (c <= String::kMaxAsciiCharCodeU) {
        utf8_buffer_[utf8_pos_++] = static_cast<char>(c);
      } else {
-        int char_length = unibrow::Utf8::Length(c);
+        int char_length = unibrow::Utf8::Length(c, previous);
        if (utf8_pos_ + char_length > kUtf8BufferSize) break;
-        unibrow::Utf8::Encode(utf8_buffer_ + utf8_pos_, c);
+        unibrow::Utf8::Encode(utf8_buffer_ + utf8_pos_, c, previous);
        utf8_pos_ += char_length;
      }
+      previous = c;
    }
  }

@ -504,11 +506,11 @@ class Logger::NameBuffer {

 private:
  static const int kUtf8BufferSize = 512;
-  static const int kUc16BufferSize = 128;
+  static const int kUtf16BufferSize = 128;

  int utf8_pos_;
  char utf8_buffer_[kUtf8BufferSize];
-  uc16 uc16_buffer_[kUc16BufferSize];
+  uc16 utf16_buffer[kUtf16BufferSize];
 };


--- a/src/objects-inl.h
+++ b/src/objects-inl.h
@ -4463,7 +4463,11 @@ bool StringHasher::has_trivial_hash() {
 }


-void StringHasher::AddCharacter(uc32 c) {
+void StringHasher::AddCharacter(uint32_t c) {
+  if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+    AddSurrogatePair(c);  // Not inlined.
+    return;
+  }
  // Use the Jenkins one-at-a-time hash function to update the hash
  // for the given character.
  raw_running_hash_ += c;
@ -4492,8 +4496,12 @@ void StringHasher::AddCharacter(uc32 c) {
 }


-void StringHasher::AddCharacterNoIndex(uc32 c) {
+void StringHasher::AddCharacterNoIndex(uint32_t c) {
  ASSERT(!is_array_index());
+  if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+    AddSurrogatePairNoIndex(c);  // Not inlined.
+    return;
+  }
  raw_running_hash_ += c;
  raw_running_hash_ += (raw_running_hash_ << 10);
  raw_running_hash_ ^= (raw_running_hash_ >> 6);
--- a/src/objects.cc
+++ b/src/objects.cc
@ -6051,9 +6051,11 @@ SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
  buffer->Reset(offset, this);
  int character_position = offset;
  int utf8_bytes = 0;
+  int last = unibrow::Utf16::kNoPreviousCharacter;
  while (buffer->has_more() && character_position++ < offset + length) {
    uint16_t character = buffer->GetNext();
-    utf8_bytes += unibrow::Utf8::Length(character);
+    utf8_bytes += unibrow::Utf8::Length(character, last);
+    last = character;
  }

  if (length_return) {
@ -6067,13 +6069,15 @@ SmartArrayPointer<char> String::ToCString(AllowNullsFlag allow_nulls,
  buffer->Seek(offset);
  character_position = offset;
  int utf8_byte_position = 0;
+  last = unibrow::Utf16::kNoPreviousCharacter;
  while (buffer->has_more() && character_position++ < offset + length) {
    uint16_t character = buffer->GetNext();
    if (allow_nulls == DISALLOW_NULLS && character == 0) {
      character = ' ';
    }
    utf8_byte_position +=
-        unibrow::Utf8::Encode(result + utf8_byte_position, character);
+        unibrow::Utf8::Encode(result + utf8_byte_position, character, last);
+    last = character;
  }
  result[utf8_byte_position] = 0;
  return SmartArrayPointer<char>(result);
@ -6387,73 +6391,6 @@ const unibrow::byte* String::ReadBlock(String* input,
 }


-// This method determines the type of string involved and then gets the UTF8
-// length of the string.  It doesn't flatten the string and has log(n) recursion
-// for a string of length n.
-int String::Utf8Length(String* input, int from, int to) {
-  if (from == to) return 0;
-  int total = 0;
-  while (true) {
-    if (input->IsAsciiRepresentation()) return total + to - from;
-    switch (StringShape(input).representation_tag()) {
-      case kConsStringTag: {
-        ConsString* str = ConsString::cast(input);
-        String* first = str->first();
-        String* second = str->second();
-        int first_length = first->length();
-        if (first_length - from < to - first_length) {
-          if (first_length > from) {
-            // Left hand side is shorter.
-            total += Utf8Length(first, from, first_length);
-            input = second;
-            from = 0;
-            to -= first_length;
-          } else {
-            // We only need the right hand side.
-            input = second;
-            from -= first_length;
-            to -= first_length;
-          }
-        } else {
-          if (first_length <= to) {
-            // Right hand side is shorter.
-            total += Utf8Length(second, 0, to - first_length);
-            input = first;
-            to = first_length;
-          } else {
-            // We only need the left hand side.
-            input = first;
-          }
-        }
-        continue;
-      }
-      case kExternalStringTag:
-      case kSeqStringTag: {
-        Vector<const uc16> vector = input->GetFlatContent().ToUC16Vector();
-        const uc16* p = vector.start();
-        for (int i = from; i < to; i++) {
-          total += unibrow::Utf8::Length(p[i]);
-        }
-        return total;
-      }
-      case kSlicedStringTag: {
-        SlicedString* str = SlicedString::cast(input);
-        int offset = str->offset();
-        input = str->parent();
-        from += offset;
-        to += offset;
-        continue;
-      }
-      default:
-        break;
-    }
-    UNREACHABLE();
-    return 0;
-  }
-  return 0;
-}
-
-
 void Relocatable::PostGarbageCollectionProcessing() {
  Isolate* isolate = Isolate::Current();
  Relocatable* current = isolate->relocatable_top();
@ -6847,8 +6784,10 @@ static inline bool CompareStringContents(IteratorA* ia, IteratorB* ib) {
  // General slow case check.  We know that the ia and ib iterators
  // have the same length.
  while (ia->has_more()) {
-    uc32 ca = ia->GetNext();
-    uc32 cb = ib->GetNext();
+    uint32_t ca = ia->GetNext();
+    uint32_t cb = ib->GetNext();
+    ASSERT(ca <= unibrow::Utf16::kMaxNonSurrogateCharCode);
+    ASSERT(cb <= unibrow::Utf16::kMaxNonSurrogateCharCode);
    if (ca != cb)
      return false;
  }
@ -7031,8 +6970,14 @@ bool String::IsEqualTo(Vector<const char> str) {
  decoder->Reset(str.start(), str.length());
  int i;
  for (i = 0; i < slen && decoder->has_more(); i++) {
-    uc32 r = decoder->GetNext();
-    if (Get(i) != r) return false;
+    uint32_t r = decoder->GetNext();
+    if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      if (i > slen - 1) return false;
+      if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
+      if (Get(i) != unibrow::Utf16::TrailSurrogate(r)) return false;
+    } else {
+      if (Get(i) != r) return false;
+    }
  }
  return i == slen && !decoder->has_more();
 }
@ -7162,6 +7107,22 @@ uint32_t StringHasher::MakeArrayIndexHash(uint32_t value, int length) {
 }


+void StringHasher::AddSurrogatePair(uc32 c) {
+  uint16_t lead = unibrow::Utf16::LeadSurrogate(c);
+  AddCharacter(lead);
+  uint16_t trail = unibrow::Utf16::TrailSurrogate(c);
+  AddCharacter(trail);
+}
+
+
+void StringHasher::AddSurrogatePairNoIndex(uc32 c) {
+  uint16_t lead = unibrow::Utf16::LeadSurrogate(c);
+  AddCharacterNoIndex(lead);
+  uint16_t trail = unibrow::Utf16::TrailSurrogate(c);
+  AddCharacterNoIndex(trail);
+}
+
+
 uint32_t StringHasher::GetHashField() {
  ASSERT(is_valid());
  if (length_ <= String::kMaxHashCalcLength) {
@ -10655,7 +10616,7 @@ class Utf8SymbolKey : public HashTableKey {
    if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
    unibrow::Utf8InputBuffer<> buffer(string_.start(),
                                      static_cast<unsigned>(string_.length()));
-    chars_ = buffer.Length();
+    chars_ = buffer.Utf16Length();
    hash_field_ = String::ComputeHashField(&buffer, chars_, seed_);
    uint32_t result = hash_field_ >> String::kHashShift;
    ASSERT(result != 0);  // Ensure that the hash value of 0 is never computed.
--- a/src/objects.h
+++ b/src/objects.h
@ -6616,12 +6616,17 @@ class StringHasher {
  inline bool has_trivial_hash();

  // Add a character to the hash and update the array index calculation.
-  inline void AddCharacter(uc32 c);
+  inline void AddCharacter(uint32_t c);

  // Adds a character to the hash but does not update the array index
  // calculation.  This can only be called when it has been verified
  // that the input is not an array index.
-  inline void AddCharacterNoIndex(uc32 c);
+  inline void AddCharacterNoIndex(uint32_t c);
+
+  // Add a character above 0xffff as a surrogate pair.  These can get into
+  // the hasher through the routines that take a UTF-8 string and make a symbol.
+  void AddSurrogatePair(uc32 c);
+  void AddSurrogatePairNoIndex(uc32 c);

  // Returns the value to store in the hash field of a string with
  // the given length and contents.
@ -6871,9 +6876,6 @@ class String: public HeapObject {
      RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
      int* length_output = 0);

-  inline int Utf8Length() { return Utf8Length(this, 0, length()); }
-  static int Utf8Length(String* input, int from, int to);
-
  // Return a 16 bit Unicode representation of the string.
  // The string should be nearly flat, otherwise the performance of
  // of this method may be very bad.  Setting robustness_flag to
@ -6939,7 +6941,7 @@ class String: public HeapObject {
  // Max ASCII char code.
  static const int kMaxAsciiCharCode = unibrow::Utf8::kMaxOneByteChar;
  static const unsigned kMaxAsciiCharCodeU = unibrow::Utf8::kMaxOneByteChar;
-  static const int kMaxUC16CharCode = 0xffff;
+  static const int kMaxUtf16CodeUnit = 0xffff;

  // Mask constant for checking if a string has a computed hash code
  // and if it is an array index.  The least significant bit indicates
--- a/src/parser.cc
+++ b/src/parser.cc
@ -258,7 +258,7 @@ Handle<String> Parser::LookupSymbol(int symbol_id) {
          scanner().literal_ascii_string());
    } else {
      return isolate()->factory()->LookupTwoByteSymbol(
-          scanner().literal_uc16_string());
+          scanner().literal_utf16_string());
    }
  }
  return LookupCachedSymbol(symbol_id);
@ -279,7 +279,7 @@ Handle<String> Parser::LookupCachedSymbol(int symbol_id) {
          scanner().literal_ascii_string());
    } else {
      result = isolate()->factory()->LookupTwoByteSymbol(
-          scanner().literal_uc16_string());
+          scanner().literal_utf16_string());
    }
    symbol_cache_.at(symbol_id) = result;
    return result;
@ -576,12 +576,12 @@ FunctionLiteral* Parser::ParseProgram(CompilationInfo* info) {
    // Notice that the stream is destroyed at the end of the branch block.
    // The last line of the blocks can't be moved outside, even though they're
    // identical calls.
-    ExternalTwoByteStringUC16CharacterStream stream(
+    ExternalTwoByteStringUtf16CharacterStream stream(
        Handle<ExternalTwoByteString>::cast(source), 0, source->length());
    scanner_.Initialize(&stream);
    return DoParseProgram(info, source, &zone_scope);
  } else {
-    GenericStringUC16CharacterStream stream(source, 0, source->length());
+    GenericStringUtf16CharacterStream stream(source, 0, source->length());
    scanner_.Initialize(&stream);
    return DoParseProgram(info, source, &zone_scope);
  }
@ -665,16 +665,16 @@ FunctionLiteral* Parser::ParseLazy(CompilationInfo* info) {
  // Initialize parser state.
  source->TryFlatten();
  if (source->IsExternalTwoByteString()) {
-    ExternalTwoByteStringUC16CharacterStream stream(
+    ExternalTwoByteStringUtf16CharacterStream stream(
        Handle<ExternalTwoByteString>::cast(source),
        shared_info->start_position(),
        shared_info->end_position());
    FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
    return result;
  } else {
-    GenericStringUC16CharacterStream stream(source,
-                                            shared_info->start_position(),
-                                            shared_info->end_position());
+    GenericStringUtf16CharacterStream stream(source,
+                                             shared_info->start_position(),
+                                             shared_info->end_position());
    FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
    return result;
  }
@ -682,7 +682,7 @@ FunctionLiteral* Parser::ParseLazy(CompilationInfo* info) {


 FunctionLiteral* Parser::ParseLazy(CompilationInfo* info,
-                                   UC16CharacterStream* source,
+                                   Utf16CharacterStream* source,
                                   ZoneScope* zone_scope) {
  Handle<SharedFunctionInfo> shared_info = info->shared_info();
  scanner_.Initialize(source);
@ -4285,7 +4285,7 @@ class SingletonLogger : public ParserRecorder {

  // Logs a symbol creation of a literal or identifier.
  virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
-  virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
+  virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }

  // Logs an error message and marks the log as containing an error.
  // Further logging will be ignored, and ExtractData will return a vector
@ -5874,7 +5874,7 @@ int ScriptDataImpl::ReadNumber(byte** source) {


 // Create a Scanner for the preparser to use as input, and preparse the source.
-static ScriptDataImpl* DoPreParse(UC16CharacterStream* source,
+static ScriptDataImpl* DoPreParse(Utf16CharacterStream* source,
                                  int flags,
                                  ParserRecorder* recorder) {
  Isolate* isolate = Isolate::Current();
@ -5915,17 +5915,17 @@ ScriptDataImpl* ParserApi::PartialPreParse(Handle<String> source,
  PartialParserRecorder recorder;
  int source_length = source->length();
  if (source->IsExternalTwoByteString()) {
-    ExternalTwoByteStringUC16CharacterStream stream(
+    ExternalTwoByteStringUtf16CharacterStream stream(
        Handle<ExternalTwoByteString>::cast(source), 0, source_length);
    return DoPreParse(&stream, flags, &recorder);
  } else {
-    GenericStringUC16CharacterStream stream(source, 0, source_length);
+    GenericStringUtf16CharacterStream stream(source, 0, source_length);
    return DoPreParse(&stream, flags, &recorder);
  }
 }


-ScriptDataImpl* ParserApi::PreParse(UC16CharacterStream* source,
+ScriptDataImpl* ParserApi::PreParse(Utf16CharacterStream* source,
                                    v8::Extension* extension,
                                    int flags) {
  Handle<Script> no_script;
--- a/src/parser.h
+++ b/src/parser.h
@ -172,7 +172,7 @@ class ParserApi {
  static bool Parse(CompilationInfo* info, int flags);

  // Generic preparser generating full preparse data.
-  static ScriptDataImpl* PreParse(UC16CharacterStream* source,
+  static ScriptDataImpl* PreParse(Utf16CharacterStream* source,
                                  v8::Extension* extension,
                                  int flags);

@ -542,7 +542,7 @@ class Parser {


  FunctionLiteral* ParseLazy(CompilationInfo* info,
-                             UC16CharacterStream* source,
+                             Utf16CharacterStream* source,
                             ZoneScope* zone_scope);

  Isolate* isolate() { return isolate_; }
@ -712,7 +712,7 @@ class Parser {
          scanner().literal_ascii_string(), tenured);
    } else {
      return isolate_->factory()->NewStringFromTwoByte(
-            scanner().literal_uc16_string(), tenured);
+            scanner().literal_utf16_string(), tenured);
    }
  }

@ -722,7 +722,7 @@ class Parser {
          scanner().next_literal_ascii_string(), tenured);
    } else {
      return isolate_->factory()->NewStringFromTwoByte(
-          scanner().next_literal_uc16_string(), tenured);
+          scanner().next_literal_utf16_string(), tenured);
    }
  }

--- a/src/preparse-data.h
+++ b/src/preparse-data.h
@ -53,7 +53,7 @@ class ParserRecorder {

  // Logs a symbol creation of a literal or identifier.
  virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
-  virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
+  virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }

  // Logs an error message and marks the log as containing an error.
  // Further logging will be ignored, and ExtractData will return a vector
@ -149,7 +149,7 @@ class PartialParserRecorder : public FunctionLoggingParserRecorder {
 public:
  PartialParserRecorder() : FunctionLoggingParserRecorder() { }
  virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
-  virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
+  virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) { }
  virtual ~PartialParserRecorder() { }
  virtual Vector<unsigned> ExtractData();
  virtual int symbol_position() { return 0; }
@ -171,7 +171,7 @@ class CompleteParserRecorder: public FunctionLoggingParserRecorder {
    LogSymbol(start, hash, true, Vector<const byte>::cast(literal));
  }

-  virtual void LogUC16Symbol(int start, Vector<const uc16> literal) {
+  virtual void LogUtf16Symbol(int start, Vector<const uc16> literal) {
    if (!is_recording_) return;
    int hash = vector_hash(literal);
    LogSymbol(start, hash, false, Vector<const byte>::cast(literal));
--- a/src/preparser-api.cc
+++ b/src/preparser-api.cc
@ -46,10 +46,10 @@ namespace v8 {
 namespace internal {

 // UTF16Buffer based on a v8::UnicodeInputStream.
-class InputStreamUTF16Buffer : public UC16CharacterStream {
+class InputStreamUtf16Buffer : public Utf16CharacterStream {
 public:
-  /* The InputStreamUTF16Buffer maintains an internal buffer
-   * that is filled in chunks from the UC16CharacterStream.
+  /* The InputStreamUtf16Buffer maintains an internal buffer
+   * that is filled in chunks from the Utf16CharacterStream.
   * It also maintains unlimited pushback capability, but optimized
   * for small pushbacks.
   * The pushback_buffer_ pointer points to the limit of pushbacks
@ -60,8 +60,8 @@ class InputStreamUTF16Buffer : public UC16CharacterStream {
   * new buffer. When this buffer is read to the end again, the cursor is
   * switched back to the internal buffer
   */
-  explicit InputStreamUTF16Buffer(v8::UnicodeInputStream* stream)
-      : UC16CharacterStream(),
+  explicit InputStreamUtf16Buffer(v8::UnicodeInputStream* stream)
+      : Utf16CharacterStream(),
        stream_(stream),
        pushback_buffer_(buffer_),
        pushback_buffer_end_cache_(NULL),
@ -70,7 +70,7 @@ class InputStreamUTF16Buffer : public UC16CharacterStream {
    buffer_cursor_ = buffer_end_ = buffer_ + kPushBackSize;
  }

-  virtual ~InputStreamUTF16Buffer() {
+  virtual ~InputStreamUtf16Buffer() {
    if (pushback_buffer_backing_ != NULL) {
      DeleteArray(pushback_buffer_backing_);
    }
@ -127,12 +127,18 @@ class InputStreamUTF16Buffer : public UC16CharacterStream {
    uc16* buffer_start = buffer_ + kPushBackSize;
    buffer_cursor_ = buffer_end_ = buffer_start;
    while ((value = stream_->Next()) >= 0) {
-      if (value > static_cast<int32_t>(unibrow::Utf8::kMaxThreeByteChar)) {
-        value = unibrow::Utf8::kBadChar;
+      if (value >
+          static_cast<int32_t>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
+        buffer_start[buffer_end_++ - buffer_start] =
+            unibrow::Utf16::LeadSurrogate(value);
+        buffer_start[buffer_end_++ - buffer_start] =
+            unibrow::Utf16::TrailSurrogate(value);
+      } else {
+        // buffer_end_ is a const pointer, but buffer_ is writable.
+        buffer_start[buffer_end_++ - buffer_start] = static_cast<uc16>(value);
      }
-      // buffer_end_ is a const pointer, but buffer_ is writable.
-      buffer_start[buffer_end_++ - buffer_start] = static_cast<uc16>(value);
-      if (buffer_end_ == buffer_ + kPushBackSize + kBufferSize) break;
+      // Stop one before the end of the buffer in case we get a surrogate pair.
+      if (buffer_end_ <= buffer_ + 1 + kPushBackSize + kBufferSize) break;
    }
    return buffer_end_ > buffer_start;
  }
@ -179,7 +185,7 @@ UnicodeInputStream::~UnicodeInputStream() { }


 PreParserData Preparse(UnicodeInputStream* input, size_t max_stack) {
-  internal::InputStreamUTF16Buffer buffer(input);
+  internal::InputStreamUtf16Buffer buffer(input);
  uintptr_t stack_limit = reinterpret_cast<uintptr_t>(&buffer) - max_stack;
  internal::UnicodeCache unicode_cache;
  internal::Scanner scanner(&unicode_cache);
--- a/src/preparser.cc
+++ b/src/preparser.cc
@ -1214,7 +1214,7 @@ void PreParser::CheckDuplicate(DuplicateFinder* finder,
    old_type = finder->AddAsciiSymbol(scanner_->literal_ascii_string(),
                                      type);
  } else {
-    old_type = finder->AddUC16Symbol(scanner_->literal_uc16_string(), type);
+    old_type = finder->AddUtf16Symbol(scanner_->literal_utf16_string(), type);
  }
  if (HasConflict(old_type, type)) {
    if (IsDataDataConflict(old_type, type)) {
@ -1387,7 +1387,7 @@ PreParser::Expression PreParser::ParseFunctionLiteral(bool* ok) {
          duplicate_finder.AddAsciiSymbol(scanner_->literal_ascii_string(), 1);
    } else {
      prev_value =
-          duplicate_finder.AddUC16Symbol(scanner_->literal_uc16_string(), 1);
+          duplicate_finder.AddUtf16Symbol(scanner_->literal_utf16_string(), 1);
    }

    if (prev_value != 0) {
@ -1485,7 +1485,7 @@ void PreParser::LogSymbol() {
  if (scanner_->is_literal_ascii()) {
    log_->LogAsciiSymbol(identifier_pos, scanner_->literal_ascii_string());
  } else {
-    log_->LogUC16Symbol(identifier_pos, scanner_->literal_uc16_string());
+    log_->LogUtf16Symbol(identifier_pos, scanner_->literal_utf16_string());
  }
 }

@ -1657,7 +1657,7 @@ int DuplicateFinder::AddAsciiSymbol(i::Vector<const char> key, int value) {
  return AddSymbol(i::Vector<const byte>::cast(key), true, value);
 }

-int DuplicateFinder::AddUC16Symbol(i::Vector<const uint16_t> key, int value) {
+int DuplicateFinder::AddUtf16Symbol(i::Vector<const uint16_t> key, int value) {
  return AddSymbol(i::Vector<const byte>::cast(key), false, value);
 }

--- a/src/preparser.h
+++ b/src/preparser.h
@ -65,7 +65,7 @@ class DuplicateFinder {
        map_(&Match) { }

  int AddAsciiSymbol(i::Vector<const char> key, int value);
-  int AddUC16Symbol(i::Vector<const uint16_t> key, int value);
+  int AddUtf16Symbol(i::Vector<const uint16_t> key, int value);
  // Add a a number literal by converting it (if necessary)
  // to the string that ToString(ToNumber(literal)) would generate.
  // and then adding that string with AddAsciiSymbol.
--- a/src/scanner-character-streams.cc
+++ b/src/scanner-character-streams.cc
@ -36,19 +36,19 @@ namespace v8 {
 namespace internal {

 // ----------------------------------------------------------------------------
-// BufferedUC16CharacterStreams
+// BufferedUtf16CharacterStreams

-BufferedUC16CharacterStream::BufferedUC16CharacterStream()
-    : UC16CharacterStream(),
+BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
+    : Utf16CharacterStream(),
      pushback_limit_(NULL) {
  // Initialize buffer as being empty. First read will fill the buffer.
  buffer_cursor_ = buffer_;
  buffer_end_ = buffer_;
 }

-BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }
+BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }

-void BufferedUC16CharacterStream::PushBack(uc32 character) {
+void BufferedUtf16CharacterStream::PushBack(uc32 character) {
  if (character == kEndOfInput) {
    pos_--;
    return;
@ -63,7 +63,7 @@ void BufferedUC16CharacterStream::PushBack(uc32 character) {
 }


-void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
+void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
  // In pushback mode, the end of the buffer contains pushback,
  // and the start of the buffer (from buffer start to pushback_limit_)
  // contains valid data that comes just after the pushback.
@ -89,7 +89,7 @@ void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
 }


-bool BufferedUC16CharacterStream::ReadBlock() {
+bool BufferedUtf16CharacterStream::ReadBlock() {
  buffer_cursor_ = buffer_;
  if (pushback_limit_ != NULL) {
    // Leave pushback mode.
@ -106,7 +106,7 @@ bool BufferedUC16CharacterStream::ReadBlock() {
 }


-unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
+unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
  // Leave pushback mode (i.e., ignore that there might be valid data
  // in the buffer before the pushback_limit_ point).
  pushback_limit_ = NULL;
@ -114,10 +114,10 @@ unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
 }

 // ----------------------------------------------------------------------------
-// GenericStringUC16CharacterStream
+// GenericStringUtf16CharacterStream


-GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
+GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
    Handle<String> data,
    unsigned start_position,
    unsigned end_position)
@ -130,10 +130,10 @@ GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
 }


-GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }
+GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }


-unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
+unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
  unsigned old_pos = pos_;
  pos_ = Min(pos_ + delta, length_);
  ReadBlock();
@ -141,7 +141,7 @@ unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
 }


-unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
+unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos,
                                                      unsigned length) {
  if (from_pos >= length_) return 0;
  if (from_pos + length > length_) {
@ -153,10 +153,10 @@ unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,


 // ----------------------------------------------------------------------------
-// Utf8ToUC16CharacterStream
-Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
-                                                     unsigned length)
-    : BufferedUC16CharacterStream(),
+// Utf8ToUtf16CharacterStream
+Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
+                                                       unsigned length)
+    : BufferedUtf16CharacterStream(),
      raw_data_(data),
      raw_data_length_(length),
      raw_data_pos_(0),
@ -165,10 +165,10 @@ Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
 }


-Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }
+Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }


-unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
+unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
  unsigned old_pos = pos_;
  unsigned target_pos = pos_ + delta;
  SetRawPosition(target_pos);
@ -178,9 +178,9 @@ unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
 }


-unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
-                                               unsigned length) {
-  static const unibrow::uchar kMaxUC16Character = 0xffff;
+unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position,
+                                                unsigned length) {
+  static const unibrow::uchar kMaxUtf16Character = 0xffff;
  SetRawPosition(char_position);
  if (raw_character_position_ != char_position) {
    // char_position was not a valid position in the stream (hit the end
@ -188,7 +188,7 @@ unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
    return 0u;
  }
  unsigned i = 0;
-  while (i < length) {
+  while (i < length - 1) {
    if (raw_data_pos_ == raw_data_length_) break;
    unibrow::uchar c = raw_data_[raw_data_pos_];
    if (c <= unibrow::Utf8::kMaxOneByteChar) {
@ -197,12 +197,13 @@ unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
      c =  unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
                                         raw_data_length_ - raw_data_pos_,
                                         &raw_data_pos_);
-      // Don't allow characters outside of the BMP.
-      if (c > kMaxUC16Character) {
-        c = unibrow::Utf8::kBadChar;
-      }
    }
-    buffer_[i++] = static_cast<uc16>(c);
+    if (c > kMaxUtf16Character) {
+      buffer_[i++] = unibrow::Utf16::LeadSurrogate(c);
+      buffer_[i++] = unibrow::Utf16::TrailSurrogate(c);
+    } else {
+      buffer_[i++] = static_cast<uc16>(c);
+    }
  }
  raw_character_position_ = char_position + i;
  return i;
@ -266,37 +267,52 @@ static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
 }


-void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {
+// This can't set a raw position between two surrogate pairs, since there
+// is no position in the UTF8 stream that corresponds to that.  This assumes
+// that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If
+// it is illegally coded as two 3 byte sequences then there is no problem here.
+void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
  if (raw_character_position_ > target_position) {
    // Spool backwards in utf8 buffer.
    do {
+      int old_pos = raw_data_pos_;
      Utf8CharacterBack(raw_data_, &raw_data_pos_);
      raw_character_position_--;
+      ASSERT(old_pos - raw_data_pos_ <= 4);
+      // Step back over both code units for surrogate pairs.
+      if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
    } while (raw_character_position_ > target_position);
+    // No surrogate pair splitting.
+    ASSERT(raw_character_position_ == target_position);
    return;
  }
  // Spool forwards in the utf8 buffer.
  while (raw_character_position_ < target_position) {
    if (raw_data_pos_ == raw_data_length_) return;
+    int old_pos = raw_data_pos_;
    Utf8CharacterForward(raw_data_, &raw_data_pos_);
    raw_character_position_++;
+    ASSERT(raw_data_pos_ - old_pos <= 4);
+    if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
  }
+  // No surrogate pair splitting.
+  ASSERT(raw_character_position_ == target_position);
 }


 // ----------------------------------------------------------------------------
-// ExternalTwoByteStringUC16CharacterStream
+// ExternalTwoByteStringUtf16CharacterStream

-ExternalTwoByteStringUC16CharacterStream::
-    ~ExternalTwoByteStringUC16CharacterStream() { }
+ExternalTwoByteStringUtf16CharacterStream::
+    ~ExternalTwoByteStringUtf16CharacterStream() { }


-ExternalTwoByteStringUC16CharacterStream
-    ::ExternalTwoByteStringUC16CharacterStream(
+ExternalTwoByteStringUtf16CharacterStream
+    ::ExternalTwoByteStringUtf16CharacterStream(
        Handle<ExternalTwoByteString> data,
        int start_position,
        int end_position)
-    : UC16CharacterStream(),
+    : Utf16CharacterStream(),
      source_(data),
      raw_data_(data->GetTwoByteData(start_position)) {
  buffer_cursor_ = raw_data_,
--- a/src/scanner-character-streams.h
+++ b/src/scanner-character-streams.h
@ -36,10 +36,10 @@ namespace internal {
 // A buffered character stream based on a random access character
 // source (ReadBlock can be called with pos_ pointing to any position,
 // even positions before the current).
-class BufferedUC16CharacterStream: public UC16CharacterStream {
+class BufferedUtf16CharacterStream: public Utf16CharacterStream {
 public:
-  BufferedUC16CharacterStream();
-  virtual ~BufferedUC16CharacterStream();
+  BufferedUtf16CharacterStream();
+  virtual ~BufferedUtf16CharacterStream();

  virtual void PushBack(uc32 character);

@ -60,12 +60,12 @@ class BufferedUC16CharacterStream: public UC16CharacterStream {


 // Generic string stream.
-class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
+class GenericStringUtf16CharacterStream: public BufferedUtf16CharacterStream {
 public:
-  GenericStringUC16CharacterStream(Handle<String> data,
-                                   unsigned start_position,
-                                   unsigned end_position);
-  virtual ~GenericStringUC16CharacterStream();
+  GenericStringUtf16CharacterStream(Handle<String> data,
+                                    unsigned start_position,
+                                    unsigned end_position);
+  virtual ~GenericStringUtf16CharacterStream();

 protected:
  virtual unsigned BufferSeekForward(unsigned delta);
@ -77,11 +77,11 @@ class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
 };


-// UC16 stream based on a literal UTF-8 string.
-class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
+// Utf16 stream based on a literal UTF-8 string.
+class Utf8ToUtf16CharacterStream: public BufferedUtf16CharacterStream {
 public:
-  Utf8ToUC16CharacterStream(const byte* data, unsigned length);
-  virtual ~Utf8ToUC16CharacterStream();
+  Utf8ToUtf16CharacterStream(const byte* data, unsigned length);
+  virtual ~Utf8ToUtf16CharacterStream();

 protected:
  virtual unsigned BufferSeekForward(unsigned delta);
@ -98,12 +98,12 @@ class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {


 // UTF16 buffer to read characters from an external string.
-class ExternalTwoByteStringUC16CharacterStream: public UC16CharacterStream {
+class ExternalTwoByteStringUtf16CharacterStream: public Utf16CharacterStream {
 public:
-  ExternalTwoByteStringUC16CharacterStream(Handle<ExternalTwoByteString> data,
-                                           int start_position,
-                                           int end_position);
-  virtual ~ExternalTwoByteStringUC16CharacterStream();
+  ExternalTwoByteStringUtf16CharacterStream(Handle<ExternalTwoByteString> data,
+                                            int start_position,
+                                            int end_position);
+  virtual ~ExternalTwoByteStringUtf16CharacterStream();

  virtual void PushBack(uc32 character) {
    ASSERT(buffer_cursor_ > raw_data_);
--- a/src/scanner.cc
+++ b/src/scanner.cc
@ -45,7 +45,7 @@ Scanner::Scanner(UnicodeCache* unicode_cache)
      harmony_modules_(false) { }


-void Scanner::Initialize(UC16CharacterStream* source) {
+void Scanner::Initialize(Utf16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
--- a/src/scanner.h
+++ b/src/scanner.h
@ -73,15 +73,17 @@ inline int HexValue(uc32 c) {


 // ---------------------------------------------------------------------
-// Buffered stream of characters, using an internal UC16 buffer.
+// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
+// A code unit is a 16 bit value representing either a 16 bit code point
+// or one part of a surrogate pair that make a single 21 bit code point.

-class UC16CharacterStream {
+class Utf16CharacterStream {
 public:
-  UC16CharacterStream() : pos_(0) { }
-  virtual ~UC16CharacterStream() { }
+  Utf16CharacterStream() : pos_(0) { }
+  virtual ~Utf16CharacterStream() { }

-  // Returns and advances past the next UC16 character in the input
-  // stream. If there are no more characters, it returns a negative
+  // Returns and advances past the next UTF-16 code unit in the input
+  // stream. If there are no more code units, it returns a negative
  // value.
  inline uc32 Advance() {
    if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
@ -90,47 +92,47 @@ class UC16CharacterStream {
    }
    // Note: currently the following increment is necessary to avoid a
    // parser problem! The scanner treats the final kEndOfInput as
-    // a character with a position, and does math relative to that
+    // a code unit with a position, and does math relative to that
    // position.
    pos_++;

    return kEndOfInput;
  }

-  // Return the current position in the character stream.
+  // Return the current position in the code unit stream.
  // Starts at zero.
  inline unsigned pos() const { return pos_; }

-  // Skips forward past the next character_count UC16 characters
+  // Skips forward past the next code_unit_count UTF-16 code units
  // in the input, or until the end of input if that comes sooner.
-  // Returns the number of characters actually skipped. If less
-  // than character_count,
-  inline unsigned SeekForward(unsigned character_count) {
+  // Returns the number of code units actually skipped. If less
+  // than code_unit_count,
+  inline unsigned SeekForward(unsigned code_unit_count) {
    unsigned buffered_chars =
        static_cast<unsigned>(buffer_end_ - buffer_cursor_);
-    if (character_count <= buffered_chars) {
-      buffer_cursor_ += character_count;
-      pos_ += character_count;
-      return character_count;
+    if (code_unit_count <= buffered_chars) {
+      buffer_cursor_ += code_unit_count;
+      pos_ += code_unit_count;
+      return code_unit_count;
    }
-    return SlowSeekForward(character_count);
+    return SlowSeekForward(code_unit_count);
  }

-  // Pushes back the most recently read UC16 character (or negative
+  // Pushes back the most recently read UTF-16 code unit (or negative
  // value if at end of input), i.e., the value returned by the most recent
  // call to Advance.
  // Must not be used right after calling SeekForward.
-  virtual void PushBack(int32_t character) = 0;
+  virtual void PushBack(int32_t code_unit) = 0;

 protected:
  static const uc32 kEndOfInput = -1;

-  // Ensures that the buffer_cursor_ points to the character at
+  // Ensures that the buffer_cursor_ points to the code_unit at
  // position pos_ of the input, if possible. If the position
  // is at or after the end of the input, return false. If there
-  // are more characters available, return true.
+  // are more code_units available, return true.
  virtual bool ReadBlock() = 0;
-  virtual unsigned SlowSeekForward(unsigned character_count) = 0;
+  virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;

  const uc16* buffer_cursor_;
  const uc16* buffer_end_;
@ -178,23 +180,24 @@ class LiteralBuffer {
    }
  }

-  INLINE(void AddChar(uc16 character)) {
+  INLINE(void AddChar(uint32_t code_unit)) {
    if (position_ >= backing_store_.length()) ExpandBuffer();
    if (is_ascii_) {
-      if (character < kMaxAsciiCharCodeU) {
-        backing_store_[position_] = static_cast<byte>(character);
+      if (code_unit < kMaxAsciiCharCodeU) {
+        backing_store_[position_] = static_cast<byte>(code_unit);
        position_ += kASCIISize;
        return;
      }
-      ConvertToUC16();
+      ConvertToUtf16();
    }
-    *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
+    ASSERT(code_unit < 0x10000u);
+    *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;
    position_ += kUC16Size;
  }

  bool is_ascii() { return is_ascii_; }

-  Vector<const uc16> uc16_literal() {
+  Vector<const uc16> utf16_literal() {
    ASSERT(!is_ascii_);
    ASSERT((position_ & 0x1) == 0);
    return Vector<const uc16>(
@ -236,13 +239,13 @@ class LiteralBuffer {
    backing_store_ = new_store;
  }

-  void ConvertToUC16() {
+  void ConvertToUtf16() {
    ASSERT(is_ascii_);
    Vector<byte> new_store;
    int new_content_size = position_ * kUC16Size;
    if (new_content_size >= backing_store_.length()) {
-      // Ensure room for all currently read characters as UC16 as well
-      // as the character about to be stored.
+      // Ensure room for all currently read code units as UC16 as well
+      // as the code unit about to be stored.
      new_store = Vector<byte>::New(NewCapacity(new_content_size));
    } else {
      new_store = backing_store_;
@ -316,7 +319,7 @@ class Scanner {

  explicit Scanner(UnicodeCache* scanner_contants);

-  void Initialize(UC16CharacterStream* source);
+  void Initialize(Utf16CharacterStream* source);

  // Returns the next token and advances input.
  Token::Value Next();
@ -335,9 +338,9 @@ class Scanner {
    ASSERT_NOT_NULL(current_.literal_chars);
    return current_.literal_chars->ascii_literal();
  }
-  Vector<const uc16> literal_uc16_string() {
+  Vector<const uc16> literal_utf16_string() {
    ASSERT_NOT_NULL(current_.literal_chars);
-    return current_.literal_chars->uc16_literal();
+    return current_.literal_chars->utf16_literal();
  }
  bool is_literal_ascii() {
    ASSERT_NOT_NULL(current_.literal_chars);
@ -371,9 +374,9 @@ class Scanner {
    ASSERT_NOT_NULL(next_.literal_chars);
    return next_.literal_chars->ascii_literal();
  }
-  Vector<const uc16> next_literal_uc16_string() {
+  Vector<const uc16> next_literal_utf16_string() {
    ASSERT_NOT_NULL(next_.literal_chars);
-    return next_.literal_chars->uc16_literal();
+    return next_.literal_chars->utf16_literal();
  }
  bool is_next_literal_ascii() {
    ASSERT_NOT_NULL(next_.literal_chars);
@ -542,8 +545,8 @@ class Scanner {
  TokenDesc current_;  // desc for current token (as returned by Next())
  TokenDesc next_;     // desc for next token (one token look-ahead)

-  // Input stream. Must be initialized to an UC16CharacterStream.
-  UC16CharacterStream* source_;
+  // Input stream. Must be initialized to an Utf16CharacterStream.
+  Utf16CharacterStream* source_;


  // Start position of the octal literal last scanned.
--- a/src/unicode-inl.h
+++ b/src/unicode-inl.h
@ -78,7 +78,7 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
 }


-unsigned Utf8::Encode(char* str, uchar c) {
+unsigned Utf8::Encode(char* str, uchar c, int previous) {
  static const int kMask = ~(1 << 6);
  if (c <= kMaxOneByteChar) {
    str[0] = c;
@ -88,6 +88,13 @@ unsigned Utf8::Encode(char* str, uchar c) {
    str[1] = 0x80 | (c & kMask);
    return 2;
  } else if (c <= kMaxThreeByteChar) {
+    if (Utf16::IsTrailSurrogate(c) &&
+        Utf16::IsLeadSurrogate(previous)) {
+      const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
+      return Encode(str - kUnmatchedSize,
+                    Utf16::CombineSurrogatePair(previous, c),
+                    Utf16::kNoPreviousCharacter) - kUnmatchedSize;
+    }
    str[0] = 0xE0 | (c >> 12);
    str[1] = 0x80 | ((c >> 6) & kMask);
    str[2] = 0x80 | (c & kMask);
@ -113,12 +120,16 @@ uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {
  return CalculateValue(bytes, length, cursor);
 }

-unsigned Utf8::Length(uchar c) {
+unsigned Utf8::Length(uchar c, int previous) {
  if (c <= kMaxOneByteChar) {
    return 1;
  } else if (c <= kMaxTwoByteChar) {
    return 2;
  } else if (c <= kMaxThreeByteChar) {
+    if (Utf16::IsTrailSurrogate(c) &&
+        Utf16::IsLeadSurrogate(previous)) {
+      return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
+    }
    return 3;
  } else {
    return 4;
--- a/src/unicode.cc
+++ b/src/unicode.cc
@ -276,6 +276,7 @@ uchar Utf8::CalculateValue(const byte* str,
  return kBadChar;
 }

+
 const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,
    unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {
  unsigned offset = *offset_ptr;
@ -338,6 +339,16 @@ unsigned CharacterStream::Length() {
  return result;
 }

+unsigned CharacterStream::Utf16Length() {
+  unsigned result = 0;
+  while (has_more()) {
+    uchar c = GetNext();
+    result += c > Utf16::kMaxNonSurrogateCharCode ? 2 : 1;
+  }
+  Rewind();
+  return result;
+}
+
 void CharacterStream::Seek(unsigned position) {
  Rewind();
  for (unsigned i = 0; i < position; i++) {
--- a/src/unicode.h
+++ b/src/unicode.h
@ -100,7 +100,7 @@ class UnicodeData {
  static const uchar kMaxCodePoint;
 };

-// --- U t f   8 ---
+// --- U t f   8   a n d   16 ---

 template <typename Data>
 class Buffer {
@ -114,10 +114,46 @@ class Buffer {
  unsigned length_;
 };

+
+class Utf16 {
+ public:
+  static inline bool IsLeadSurrogate(int32_t code) {
+    if (code == kNoPreviousCharacter) return false;
+    return (code & 0xfc00) == 0xd800;
+  }
+  static inline bool IsTrailSurrogate(int32_t code) {
+    if (code == kNoPreviousCharacter) return false;
+    return (code & 0xfc00) == 0xdc00;
+  }
+
+  static inline int32_t CombineSurrogatePair(uchar lead, uchar trail) {
+    return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
+  }
+  static const int32_t kNoPreviousCharacter = -1;
+  static const uchar kMaxNonSurrogateCharCode = 0xffff;
+  // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
+  // of UTF-8 data.  The special case where the unit is a surrogate
+  // trail produces 1 byte net, because the encoding of the pair is
+  // 4 bytes and the 3 bytes that were used to encode the lead surrogate
+  // can be reclaimed.
+  static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
+  // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
+  // The illegality stems from the surrogate not being part of a pair.
+  static const int kUtf8BytesToCodeASurrogate = 3;
+  static inline uchar LeadSurrogate(int32_t char_code) {
+    return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
+  }
+  static inline uchar TrailSurrogate(int32_t char_code) {
+    return 0xdc00 + (char_code & 0x3ff);
+  }
+};
+
+
 class Utf8 {
 public:
-  static inline uchar Length(uchar chr);
-  static inline unsigned Encode(char* out, uchar c);
+  static inline uchar Length(uchar chr, int previous);
+  static inline unsigned Encode(
+      char* out, uchar c, int previous);
  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
      unsigned capacity, unsigned* chars_read, unsigned* offset);
  static uchar CalculateValue(const byte* str,
@ -130,6 +166,11 @@ class Utf8 {
  static const unsigned kMaxThreeByteChar = 0xffff;
  static const unsigned kMaxFourByteChar  = 0x1fffff;

+  // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
+  // that match are coded as a 4 byte UTF-8 sequence.
+  static const unsigned kBytesSavedByCombiningSurrogates = 2;
+  static const unsigned kSizeOfUnmatchedSurrogate = 3;
+
 private:
  template <unsigned s> friend class Utf8InputBuffer;
  friend class Test;
@ -147,6 +188,7 @@ class CharacterStream {
  // Note that default implementation is not efficient.
  virtual void Seek(unsigned);
  unsigned Length();
+  unsigned Utf16Length();
  virtual ~CharacterStream() { }
  static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
      unsigned& offset);
@ -156,6 +198,7 @@ class CharacterStream {
      unsigned capacity, unsigned& offset);
  static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
  virtual void Rewind() = 0;
+
 protected:
  virtual void FillBuffer() = 0;
  // The number of characters left in the current buffer
--- a/src/x64/regexp-macro-assembler-x64.cc
+++ b/src/x64/regexp-macro-assembler-x64.cc
@ -564,7 +564,7 @@ void RegExpMacroAssemblerX64::CheckNotCharacterAfterMinusAnd(
    uc16 minus,
    uc16 mask,
    Label* on_not_equal) {
-  ASSERT(minus < String::kMaxUC16CharCode);
+  ASSERT(minus < String::kMaxUtf16CodeUnit);
  __ lea(rax, Operand(current_character(), -minus));
  __ and_(rax, Immediate(mask));
  __ cmpl(rax, Immediate(c));
--- a/test/cctest/test-api.cc
+++ b/test/cctest/test-api.cc
@ -5526,6 +5526,17 @@ static int StrNCmp16(uint16_t* a, uint16_t* b, int n) {
 }


+int GetUtf8Length(Handle<String> str) {
+  int len = str->Utf8Length();
+  if (len < 0) {
+    i::Handle<i::String> istr(v8::Utils::OpenHandle(*str));
+    i::FlattenString(istr);
+    len = str->Utf8Length();
+  }
+  return len;
+}
+
+
 THREADED_TEST(StringWrite) {
  LocalContext context;
  v8::HandleScope scope;
@ -5606,7 +5617,7 @@ THREADED_TEST(StringWrite) {
  CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3));

  memset(utf8buf, 0x1, sizeof(utf8buf));
-  len = left_tree->Utf8Length();
+  len = GetUtf8Length(left_tree);
  int utf8_expected =
      (0x80 + (0x800 - 0x80) * 2 + (0xd800 - 0x800) * 3) / kStride;
  CHECK_EQ(utf8_expected, len);
@ -5620,7 +5631,7 @@ THREADED_TEST(StringWrite) {
  CHECK_EQ(1, utf8buf[utf8_expected]);

  memset(utf8buf, 0x1, sizeof(utf8buf));
-  len = right_tree->Utf8Length();
+  len = GetUtf8Length(right_tree);
  CHECK_EQ(utf8_expected, len);
  len = right_tree->WriteUtf8(utf8buf, utf8_expected, &charlen);
  CHECK_EQ(utf8_expected, len);
@ -5745,6 +5756,217 @@ THREADED_TEST(StringWrite) {
 }


+static void Utf16Helper(
+    LocalContext& context,
+    const char* name,
+    const char* lengths_name,
+    int len) {
+  Local<v8::Array> a =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
+  Local<v8::Array> alens =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
+  for (int i = 0; i < len; i++) {
+    Local<v8::String> string =
+      Local<v8::String>::Cast(a->Get(i));
+    Local<v8::Number> expected_len =
+      Local<v8::Number>::Cast(alens->Get(i));
+    int length = GetUtf8Length(string);
+    CHECK_EQ(static_cast<int>(expected_len->Value()), length);
+  }
+}
+
+
+static uint16_t StringGet(Handle<String> str, int index) {
+  i::Handle<i::String> istring =
+      v8::Utils::OpenHandle(String::Cast(*str));
+  return istring->Get(index);
+}
+
+
+static void WriteUtf8Helper(
+    LocalContext& context,
+    const char* name,
+    const char* lengths_name,
+    int len) {
+  Local<v8::Array> b =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(name)));
+  Local<v8::Array> alens =
+      Local<v8::Array>::Cast(context->Global()->Get(v8_str(lengths_name)));
+  char buffer[1000];
+  char buffer2[1000];
+  for (int i = 0; i < len; i++) {
+    Local<v8::String> string =
+      Local<v8::String>::Cast(b->Get(i));
+    Local<v8::Number> expected_len =
+      Local<v8::Number>::Cast(alens->Get(i));
+    int utf8_length = static_cast<int>(expected_len->Value());
+    for (int j = utf8_length + 1; j >= 0; j--) {
+      memset(reinterpret_cast<void*>(&buffer), 42, sizeof(buffer));
+      memset(reinterpret_cast<void*>(&buffer2), 42, sizeof(buffer2));
+      int nchars;
+      int utf8_written =
+          string->WriteUtf8(buffer, j, &nchars, String::NO_OPTIONS);
+      int utf8_written2 =
+          string->WriteUtf8(buffer2, j, &nchars, String::NO_NULL_TERMINATION);
+      CHECK_GE(utf8_length + 1, utf8_written);
+      CHECK_GE(utf8_length, utf8_written2);
+      for (int k = 0; k < utf8_written2; k++) {
+        CHECK_EQ(buffer[k], buffer2[k]);
+      }
+      CHECK(nchars * 3 >= utf8_written - 1);
+      CHECK(nchars <= utf8_written);
+      if (j == utf8_length + 1) {
+        CHECK_EQ(utf8_written2, utf8_length);
+        CHECK_EQ(utf8_written2 + 1, utf8_written);
+      }
+      CHECK_EQ(buffer[utf8_written], 42);
+      if (j > utf8_length) {
+        if (utf8_written != 0) CHECK_EQ(buffer[utf8_written - 1], 0);
+        if (utf8_written > 1) CHECK_NE(buffer[utf8_written - 2], 42);
+        Handle<String> roundtrip = v8_str(buffer);
+        CHECK(roundtrip->Equals(string));
+      } else {
+        if (utf8_written != 0) CHECK_NE(buffer[utf8_written - 1], 42);
+      }
+      if (utf8_written2 != 0) CHECK_NE(buffer[utf8_written - 1], 42);
+      if (nchars >= 2) {
+        uint16_t trail = StringGet(string, nchars - 1);
+        uint16_t lead = StringGet(string, nchars - 2);
+        if (((lead & 0xfc00) == 0xd800) &&
+            ((trail & 0xfc00) == 0xdc00)) {
+          unsigned char u1 = buffer2[utf8_written2 - 4];
+          unsigned char u2 = buffer2[utf8_written2 - 3];
+          unsigned char u3 = buffer2[utf8_written2 - 2];
+          unsigned char u4 = buffer2[utf8_written2 - 1];
+          CHECK_EQ((u1 & 0xf8), 0xf0);
+          CHECK_EQ((u2 & 0xc0), 0x80);
+          CHECK_EQ((u3 & 0xc0), 0x80);
+          CHECK_EQ((u4 & 0xc0), 0x80);
+          uint32_t c = 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
+          CHECK_EQ((u4 & 0x3f), (c & 0x3f));
+          CHECK_EQ((u3 & 0x3f), ((c >> 6) & 0x3f));
+          CHECK_EQ((u2 & 0x3f), ((c >> 12) & 0x3f));
+          CHECK_EQ((u1 & 0x3), c >> 18);
+        }
+      }
+    }
+  }
+}
+
+
+THREADED_TEST(Utf16) {
+  LocalContext context;
+  v8::HandleScope scope;
+  CompileRun(
+      "var pad = '01234567890123456789';"
+      "var p = [];"
+      "var plens = [20, 3, 3];"
+      "p.push('01234567890123456789');"
+      "var lead = 0xd800;"
+      "var trail = 0xdc00;"
+      "p.push(String.fromCharCode(0xd800));"
+      "p.push(String.fromCharCode(0xdc00));"
+      "var a = [];"
+      "var b = [];"
+      "var alens = [];"
+      "for (var i = 0; i < 3; i++) {"
+      "  p[1] = String.fromCharCode(lead++);"
+      "  for (var j = 0; j < 3; j++) {"
+      "    p[2] = String.fromCharCode(trail++);"
+      "    a.push(p[i] + p[j]);"
+      "    b.push(p[i] + p[j]);"
+      "    alens.push(plens[i] + plens[j]);"
+      "  }"
+      "}"
+      "alens[5] -= 2;"  // Here the surrogate pairs match up.
+      "var a2 = [];"
+      "var b2 = [];"
+      "var a2lens = [];"
+      "for (var m = 0; m < 9; m++) {"
+      "  for (var n = 0; n < 9; n++) {"
+      "    a2.push(a[m] + a[n]);"
+      "    b2.push(b[m] + b[n]);"
+      "    var utf = alens[m] + alens[n];"  // And here.
+           // The 'n's that start with 0xdc.. are 6-8
+           // The 'm's that end with 0xd8.. are 1, 4 and 7
+      "    if ((m % 3) == 1 && n >= 6) utf -= 2;"
+      "    a2lens.push(utf);"
+      "  }"
+      "}");
+  Utf16Helper(context, "a", "alens", 9);
+  Utf16Helper(context, "a2", "a2lens", 81);
+  WriteUtf8Helper(context, "b", "alens", 9);
+  WriteUtf8Helper(context, "b2", "a2lens", 81);
+}
+
+
+static bool SameSymbol(Handle<String> s1, Handle<String> s2) {
+  i::Handle<i::String> is1(v8::Utils::OpenHandle(*s1));
+  i::Handle<i::String> is2(v8::Utils::OpenHandle(*s2));
+  return *is1 == *is2;
+}
+
+
+static void SameSymbolHelper(const char* a, const char* b) {
+  Handle<String> symbol1 = v8::String::NewSymbol(a);
+  Handle<String> symbol2 = v8::String::NewSymbol(b);
+  CHECK(SameSymbol(symbol1, symbol2));
+}
+
+
+THREADED_TEST(Utf16Symbol) {
+  LocalContext context;
+  v8::HandleScope scope;
+
+  Handle<String> symbol1 = v8::String::NewSymbol("abc");
+  Handle<String> symbol2 = v8::String::NewSymbol("abc");
+  CHECK(SameSymbol(symbol1, symbol2));
+
+  SameSymbolHelper("\360\220\220\205",  // 4 byte encoding.
+                   "\355\240\201\355\260\205");  // 2 3-byte surrogates.
+  SameSymbolHelper("\355\240\201\355\260\206",  // 2 3-byte surrogates.
+                   "\360\220\220\206");  // 4 byte encoding.
+  SameSymbolHelper("x\360\220\220\205",  // 4 byte encoding.
+                   "x\355\240\201\355\260\205");  // 2 3-byte surrogates.
+  SameSymbolHelper("x\355\240\201\355\260\206",  // 2 3-byte surrogates.
+                   "x\360\220\220\206");  // 4 byte encoding.
+  CompileRun(
+      "var sym0 = 'benedictus';"
+      "var sym0b = 'S\303\270ren';"
+      "var sym1 = '\355\240\201\355\260\207';"
+      "var sym2 = '\360\220\220\210';"
+      "var sym3 = 'x\355\240\201\355\260\207';"
+      "var sym4 = 'x\360\220\220\210';"
+      "if (sym1.length != 2) throw sym1;"
+      "if (sym1.charCodeAt(1) != 0xdc07) throw sym1.charCodeAt(1);"
+      "if (sym2.length != 2) throw sym2;"
+      "if (sym2.charCodeAt(1) != 0xdc08) throw sym2.charCodeAt(2);"
+      "if (sym3.length != 3) throw sym3;"
+      "if (sym3.charCodeAt(2) != 0xdc07) throw sym1.charCodeAt(2);"
+      "if (sym4.length != 3) throw sym4;"
+      "if (sym4.charCodeAt(2) != 0xdc08) throw sym2.charCodeAt(2);");
+  Handle<String> sym0 = v8::String::NewSymbol("benedictus");
+  Handle<String> sym0b = v8::String::NewSymbol("S\303\270ren");
+  Handle<String> sym1 = v8::String::NewSymbol("\355\240\201\355\260\207");
+  Handle<String> sym2 = v8::String::NewSymbol("\360\220\220\210");
+  Handle<String> sym3 = v8::String::NewSymbol("x\355\240\201\355\260\207");
+  Handle<String> sym4 = v8::String::NewSymbol("x\360\220\220\210");
+  v8::Local<v8::Object> global = context->Global();
+  Local<Value> s0 = global->Get(v8_str("sym0"));
+  Local<Value> s0b = global->Get(v8_str("sym0b"));
+  Local<Value> s1 = global->Get(v8_str("sym1"));
+  Local<Value> s2 = global->Get(v8_str("sym2"));
+  Local<Value> s3 = global->Get(v8_str("sym3"));
+  Local<Value> s4 = global->Get(v8_str("sym4"));
+  CHECK(SameSymbol(sym0, Handle<String>(String::Cast(*s0))));
+  CHECK(SameSymbol(sym0b, Handle<String>(String::Cast(*s0b))));
+  CHECK(SameSymbol(sym1, Handle<String>(String::Cast(*s1))));
+  CHECK(SameSymbol(sym2, Handle<String>(String::Cast(*s2))));
+  CHECK(SameSymbol(sym3, Handle<String>(String::Cast(*s3))));
+  CHECK(SameSymbol(sym4, Handle<String>(String::Cast(*s4))));
+}
+
+
 THREADED_TEST(ToArrayIndex) {
  v8::HandleScope scope;
  LocalContext context;
--- a/test/cctest/test-parsing.cc
+++ b/test/cctest/test-parsing.cc
@ -63,7 +63,7 @@ TEST(ScanKeywords) {
    int length = i::StrLength(key_token.keyword);
    CHECK(static_cast<int>(sizeof(buffer)) >= length);
    {
-      i::Utf8ToUC16CharacterStream stream(keyword, length);
+      i::Utf8ToUtf16CharacterStream stream(keyword, length);
      i::Scanner scanner(&unicode_cache);
      // The scanner should parse Harmony keywords for this test.
      scanner.SetHarmonyScoping(true);
@ -74,7 +74,7 @@ TEST(ScanKeywords) {
    }
    // Removing characters will make keyword matching fail.
    {
-      i::Utf8ToUC16CharacterStream stream(keyword, length - 1);
+      i::Utf8ToUtf16CharacterStream stream(keyword, length - 1);
      i::Scanner scanner(&unicode_cache);
      scanner.Initialize(&stream);
      CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
@ -85,7 +85,7 @@ TEST(ScanKeywords) {
    for (int j = 0; j < static_cast<int>(ARRAY_SIZE(chars_to_append)); ++j) {
      memmove(buffer, keyword, length);
      buffer[length] = chars_to_append[j];
-      i::Utf8ToUC16CharacterStream stream(buffer, length + 1);
+      i::Utf8ToUtf16CharacterStream stream(buffer, length + 1);
      i::Scanner scanner(&unicode_cache);
      scanner.Initialize(&stream);
      CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
@ -95,7 +95,7 @@ TEST(ScanKeywords) {
    {
      memmove(buffer, keyword, length);
      buffer[length - 1] = '_';
-      i::Utf8ToUC16CharacterStream stream(buffer, length);
+      i::Utf8ToUtf16CharacterStream stream(buffer, length);
      i::Scanner scanner(&unicode_cache);
      scanner.Initialize(&stream);
      CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
@ -255,7 +255,7 @@ TEST(StandAlonePreParser) {
  uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
  for (int i = 0; programs[i]; i++) {
    const char* program = programs[i];
-    i::Utf8ToUC16CharacterStream stream(
+    i::Utf8ToUtf16CharacterStream stream(
        reinterpret_cast<const i::byte*>(program),
        static_cast<unsigned>(strlen(program)));
    i::CompleteParserRecorder log;
@ -291,7 +291,7 @@ TEST(StandAlonePreParserNoNatives) {
  uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();
  for (int i = 0; programs[i]; i++) {
    const char* program = programs[i];
-    i::Utf8ToUC16CharacterStream stream(
+    i::Utf8ToUtf16CharacterStream stream(
        reinterpret_cast<const i::byte*>(program),
        static_cast<unsigned>(strlen(program)));
    i::CompleteParserRecorder log;
@ -326,8 +326,9 @@ TEST(RegressChromium62639) {
  // and then used the invalid currently scanned literal. This always
  // failed in debug mode, and sometimes crashed in release mode.

-  i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(program),
-                                      static_cast<unsigned>(strlen(program)));
+  i::Utf8ToUtf16CharacterStream stream(
+      reinterpret_cast<const i::byte*>(program),
+      static_cast<unsigned>(strlen(program)));
  i::ScriptDataImpl* data =
      i::ParserApi::PreParse(&stream, NULL, false);
  CHECK(data->HasError());
@ -392,7 +393,7 @@ TEST(PreParseOverflow) {

  uintptr_t stack_limit = i::Isolate::Current()->stack_guard()->real_climit();

-  i::Utf8ToUC16CharacterStream stream(
+  i::Utf8ToUtf16CharacterStream stream(
      reinterpret_cast<const i::byte*>(*program),
      static_cast<unsigned>(kProgramSize));
  i::CompleteParserRecorder log;
@ -449,10 +450,10 @@ void TestCharacterStream(const char* ascii_source,
  i::Handle<i::String> uc16_string(
      FACTORY->NewExternalStringFromTwoByte(&resource));

-  i::ExternalTwoByteStringUC16CharacterStream uc16_stream(
+  i::ExternalTwoByteStringUtf16CharacterStream uc16_stream(
      i::Handle<i::ExternalTwoByteString>::cast(uc16_string), start, end);
-  i::GenericStringUC16CharacterStream string_stream(ascii_string, start, end);
-  i::Utf8ToUC16CharacterStream utf8_stream(
+  i::GenericStringUtf16CharacterStream string_stream(ascii_string, start, end);
+  i::Utf8ToUtf16CharacterStream utf8_stream(
      reinterpret_cast<const i::byte*>(ascii_source), end);
  utf8_stream.SeekForward(start);

@ -575,12 +576,14 @@ TEST(Utf8CharacterStream) {
  char buffer[kAllUtf8CharsSizeU];
  unsigned cursor = 0;
  for (int i = 0; i <= kMaxUC16Char; i++) {
-    cursor += unibrow::Utf8::Encode(buffer + cursor, i);
+    cursor += unibrow::Utf8::Encode(buffer + cursor,
+                                    i,
+                                    unibrow::Utf16::kNoPreviousCharacter);
  }
  ASSERT(cursor == kAllUtf8CharsSizeU);

-  i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
-                                      kAllUtf8CharsSizeU);
+  i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
+                                       kAllUtf8CharsSizeU);
  for (int i = 0; i <= kMaxUC16Char; i++) {
    CHECK_EQU(i, stream.pos());
    int32_t c = stream.Advance();
@ -610,7 +613,7 @@ TEST(Utf8CharacterStream) {

 #undef CHECK_EQU

-void TestStreamScanner(i::UC16CharacterStream* stream,
+void TestStreamScanner(i::Utf16CharacterStream* stream,
                       i::Token::Value* expected_tokens,
                       int skip_pos = 0,  // Zero means not skipping.
                       int skip_to = 0) {
@ -633,8 +636,8 @@ TEST(StreamScanner) {
  v8::V8::Initialize();

  const char* str1 = "{ foo get for : */ <- \n\n /*foo*/ bib";
-  i::Utf8ToUC16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
-                                       static_cast<unsigned>(strlen(str1)));
+  i::Utf8ToUtf16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
+                                        static_cast<unsigned>(strlen(str1)));
  i::Token::Value expectations1[] = {
      i::Token::LBRACE,
      i::Token::IDENTIFIER,
@ -652,8 +655,8 @@ TEST(StreamScanner) {
  TestStreamScanner(&stream1, expectations1, 0, 0);

  const char* str2 = "case default const {THIS\nPART\nSKIPPED} do";
-  i::Utf8ToUC16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
-                                       static_cast<unsigned>(strlen(str2)));
+  i::Utf8ToUtf16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
+                                        static_cast<unsigned>(strlen(str2)));
  i::Token::Value expectations2[] = {
      i::Token::CASE,
      i::Token::DEFAULT,
@ -683,7 +686,7 @@ TEST(StreamScanner) {
  for (int i = 0; i <= 4; i++) {
     expectations3[6 - i] = i::Token::ILLEGAL;
     expectations3[5 - i] = i::Token::EOS;
-     i::Utf8ToUC16CharacterStream stream3(
+     i::Utf8ToUtf16CharacterStream stream3(
         reinterpret_cast<const i::byte*>(str3),
         static_cast<unsigned>(strlen(str3)));
     TestStreamScanner(&stream3, expectations3, 1, 1 + i);
@ -692,7 +695,7 @@ TEST(StreamScanner) {


 void TestScanRegExp(const char* re_source, const char* expected) {
-  i::Utf8ToUC16CharacterStream stream(
+  i::Utf8ToUtf16CharacterStream stream(
       reinterpret_cast<const i::byte*>(re_source),
       static_cast<unsigned>(strlen(re_source)));
  i::Scanner scanner(i::Isolate::Current()->unicode_cache());
@ -748,6 +751,67 @@ TEST(RegExpScanning) {
 }


+static int Utf8LengthHelper(const char* s) {
+  int len = strlen(s);
+  int character_length = len;
+  for (int i = 0; i < len; i++) {
+    unsigned char c = s[i];
+    int input_offset = 0;
+    int output_adjust = 0;
+    if (c > 0x7f) {
+      if (c < 0xc0) continue;
+      if (c >= 0xf0) {
+        if (c >= 0xf8) {
+          // 5 and 6 byte UTF-8 sequences turn into a kBadChar for each UTF-8
+          // byte.
+          continue;  // Handle first UTF-8 byte.
+        }
+        if ((c & 7) == 0 && ((s[i + 1] & 0x30) == 0)) {
+          // This 4 byte sequence could have been coded as a 3 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 3;
+        // 4 bytes of UTF-8 turn into 2 UTF-16 code units.
+        character_length -= 2;
+      } else if (c >= 0xe0) {
+        if ((c & 0xf) == 0 && ((s[i + 1] & 0x20) == 0)) {
+          // This 3 byte sequence could have been coded as a 2 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 2;
+        // 3 bytes of UTF-8 turn into 1 UTF-16 code unit.
+        output_adjust = 2;
+      } else {
+        if ((c & 0x1e) == 0) {
+          // This 2 byte sequence could have been coded as a 1 byte sequence.
+          // Record a single kBadChar for the first byte and continue.
+          continue;
+        }
+        input_offset = 1;
+        // 2 bytes of UTF-8 turn into 1 UTF-16 code unit.
+        output_adjust = 1;
+      }
+      bool bad = false;
+      for (int j = 1; j <= input_offset; j++) {
+        if ((s[i + j] & 0xc0) != 0x80) {
+          // Bad UTF-8 sequence turns the first in the sequence into kBadChar,
+          // which is a single UTF-16 code unit.
+          bad = true;
+          break;
+        }
+      }
+      if (!bad) {
+        i += input_offset;
+        character_length -= output_adjust;
+      }
+    }
+  }
+  return character_length;
+}
+
+
 TEST(ScopePositions) {
  // Test the parser for correctly setting the start and end positions
  // of a scope. We check the scope positions of exactly one scope
@ -835,6 +899,91 @@ TEST(ScopePositions) {
    { "  for ", "(let x in {})\n"
      "    statement;", "\n"
      "  more;", i::BLOCK_SCOPE, i::EXTENDED_MODE },
+    // Check that 6-byte and 4-byte encodings of UTF-8 strings do not throw
+    // the preparser off in terms of byte offsets.
+    // 6 byte encoding.
+    { "  'foo\355\240\201\355\260\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // 4 byte encoding.
+    { "  'foo\360\220\220\212';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // 3 byte encoding of \u0fff.
+    { "  'foo\340\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 6 byte encoding with missing last byte.
+    { "  'foo\355\240\201\355\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 3 byte encoding of \u0fff with missing last byte.
+    { "  'foo\340\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 3 byte encoding of \u0fff with missing 2 last bytes.
+    { "  'foo\340';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 3 byte encoding of \u00ff should be a 2 byte encoding.
+    { "  'foo\340\203\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 3 byte encoding of \u007f should be a 2 byte encoding.
+    { "  'foo\340\201\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Unpaired lead surrogate.
+    { "  'foo\355\240\201';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Unpaired lead surrogate where following code point is a 3 byte sequence.
+    { "  'foo\355\240\201\340\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Unpaired lead surrogate where following code point is a 4 byte encoding
+    // of a trail surrogate.
+    { "  'foo\355\240\201\360\215\260\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Unpaired trail surrogate.
+    { "  'foo\355\260\211';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // 2 byte encoding of \u00ff.
+    { "  'foo\303\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 2 byte encoding of \u00ff with missing last byte.
+    { "  'foo\303';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Broken 2 byte encoding of \u007f should be a 1 byte encoding.
+    { "  'foo\301\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Illegal 5 byte encoding.
+    { "  'foo\370\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Illegal 6 byte encoding.
+    { "  'foo\374\277\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Illegal 0xfe byte
+    { "  'foo\376\277\277\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    // Illegal 0xff byte
+    { "  'foo\377\277\277\277\277\277\277\277';\n"
+      "  (function fun", "(a,b) { infunction; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    { "  'foo';\n"
+      "  (function fun", "(a,b) { 'bar\355\240\201\355\260\213'; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
+    { "  'foo';\n"
+      "  (function fun", "(a,b) { 'bar\360\220\220\214'; }", ")();",
+      i::FUNCTION_SCOPE, i::CLASSIC_MODE },
    { NULL, NULL, NULL, i::EVAL_SCOPE, i::CLASSIC_MODE }
  };

@ -848,20 +997,24 @@ TEST(ScopePositions) {
  i::FLAG_harmony_scoping = true;

  for (int i = 0; source_data[i].outer_prefix; i++) {
-    int kPrefixLen = i::StrLength(source_data[i].outer_prefix);
-    int kInnerLen = i::StrLength(source_data[i].inner_source);
-    int kSuffixLen = i::StrLength(source_data[i].outer_suffix);
+    int kPrefixLen = Utf8LengthHelper(source_data[i].outer_prefix);
+    int kInnerLen = Utf8LengthHelper(source_data[i].inner_source);
+    int kSuffixLen = Utf8LengthHelper(source_data[i].outer_suffix);
+    int kPrefixByteLen = i::StrLength(source_data[i].outer_prefix);
+    int kInnerByteLen = i::StrLength(source_data[i].inner_source);
+    int kSuffixByteLen = i::StrLength(source_data[i].outer_suffix);
    int kProgramSize = kPrefixLen + kInnerLen + kSuffixLen;
-    i::Vector<char> program = i::Vector<char>::New(kProgramSize + 1);
-    int length = i::OS::SNPrintF(program, "%s%s%s",
-                                 source_data[i].outer_prefix,
-                                 source_data[i].inner_source,
-                                 source_data[i].outer_suffix);
-    CHECK(length == kProgramSize);
+    int kProgramByteSize = kPrefixByteLen + kInnerByteLen + kSuffixByteLen;
+    i::Vector<char> program = i::Vector<char>::New(kProgramByteSize + 1);
+    i::OS::SNPrintF(program, "%s%s%s",
+                             source_data[i].outer_prefix,
+                             source_data[i].inner_source,
+                             source_data[i].outer_suffix);

    // Parse program source.
    i::Handle<i::String> source(
-        FACTORY->NewStringFromAscii(i::CStrVector(program.start())));
+        FACTORY->NewStringFromUtf8(i::CStrVector(program.start())));
+    CHECK_EQ(source->length(), kProgramSize);
    i::Handle<i::Script> script = FACTORY->NewScript(source);
    i::Parser parser(script, i::kAllowLazy | i::EXTENDED_MODE, NULL, NULL);
    i::CompilationInfo info(script);
@ -894,7 +1047,7 @@ void TestParserSync(i::Handle<i::String> source, int flags) {
  // Preparse the data.
  i::CompleteParserRecorder log;
  i::Scanner scanner(i::Isolate::Current()->unicode_cache());
-  i::GenericStringUC16CharacterStream stream(source, 0, source->length());
+  i::GenericStringUtf16CharacterStream stream(source, 0, source->length());
  scanner.SetHarmonyScoping(harmony_scoping);
  scanner.Initialize(&stream);
  v8::preparser::PreParser::PreParseResult result =