[regexp] implement /ui to mirror the implementation for /i.

R=erik.corry@gmail.com, erikcorry@chromium.org Review URL: https://codereview.chromium.org/1641613002 Cr-Commit-Position: refs/heads/master@{#33655}
2016-02-01 23:07:31 -08:00 · 2016-02-01 23:07:31 -08:00 · eea1a4c003
commit eea1a4c003
parent 1f85ff077d
5 changed files with 151 additions and 136 deletions
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc
@ -1598,19 +1598,34 @@ void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,

 // Returns the number of characters in the equivalence class, omitting those
 // that cannot occur in the source string because it is Latin1.
-static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
-                                     bool one_byte_subject,
-                                     unibrow::uchar* letters) {
-  int length =
-      isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
-  // Unibrow returns 0 or 1 for characters where case independence is
-  // trivial.
-  if (length == 0) {
-    letters[0] = character;
-    length = 1;
+static int GetCaseIndependentLetters(RegExpCompiler* compiler, uc16 character,
+                                     uc32* letters) {
+  int length;
+#ifdef V8_I18N_SUPPORT
+  if (compiler->unicode()) {
+    USet* set = uset_open(character, character);
+    uset_closeOver(set, USET_CASE_INSENSITIVE);
+    uset_removeAllStrings(set);
+    length = uset_size(set);
+    for (int i = 0; i < length; i++) {
+      letters[i] = uset_charAt(set, i);
+    }
+    uset_close(set);
+  } else  // NOLINT
+// Non-unicode patterns also take this path; it is the fallback without ICU.
+#endif  // V8_I18N_SUPPORT
+  {
+    length = compiler->isolate()->jsregexp_uncanonicalize()->get(character,
+                                                                 '\0', letters);
+    // Unibrow returns 0 or 1 for characters where case independence is
+    // trivial.
+    if (length == 0) {
+      letters[0] = character;
+      length = 1;
+    }
  }

-  if (one_byte_subject) {
+  if (compiler->one_byte()) {
    int new_length = 0;
    for (int i = 0; i < length; i++) {
      if (letters[i] <= String::kMaxOneByteCharCode) {
@ -1623,14 +1638,9 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
  return length;
 }

-
-static inline bool EmitSimpleCharacter(Isolate* isolate,
-                                       RegExpCompiler* compiler,
-                                       uc16 c,
-                                       Label* on_failure,
-                                       int cp_offset,
-                                       bool check,
-                                       bool preloaded) {
+static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, uc16 c,
+                                       Label* on_failure, int cp_offset,
+                                       bool check, bool preloaded) {
  RegExpMacroAssembler* assembler = compiler->macro_assembler();
  bool bound_checked = false;
  if (!preloaded) {
@ -1647,17 +1657,12 @@ static inline bool EmitSimpleCharacter(Isolate* isolate,

 // Only emits non-letters (things that don't have case).  Only used for case
 // independent matches.
-static inline bool EmitAtomNonLetter(Isolate* isolate,
-                                     RegExpCompiler* compiler,
-                                     uc16 c,
-                                     Label* on_failure,
-                                     int cp_offset,
-                                     bool check,
-                                     bool preloaded) {
+static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, uc16 c,
+                                     Label* on_failure, int cp_offset,
+                                     bool check, bool preloaded) {
  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
-  bool one_byte = compiler->one_byte();
  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
+  int length = GetCaseIndependentLetters(compiler, c, chars);
  if (length < 1) {
    // This can't match.  Must be an one-byte subject and a non-one-byte
    // character.  We do not need to do anything since the one-byte pass
@ -1667,8 +1672,8 @@ static inline bool EmitAtomNonLetter(Isolate* isolate,
  bool checked = false;
  // We handle the length > 1 case in a later pass.
  if (length == 1) {
-    if (one_byte && c > String::kMaxOneByteCharCodeU) {
-      // Can't match - see above.
+    if (compiler->one_byte() && c > String::kMaxOneByteCharCodeU) {
+      // This cannot match.
      return false;  // Bounds not checked.
    }
    if (!preloaded) {
@ -1717,28 +1722,18 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
  return false;
 }

-
-typedef bool EmitCharacterFunction(Isolate* isolate,
-                                   RegExpCompiler* compiler,
-                                   uc16 c,
-                                   Label* on_failure,
-                                   int cp_offset,
-                                   bool check,
+typedef bool EmitCharacterFunction(RegExpCompiler* compiler, uc16 c,
+                                   Label* on_failure, int cp_offset, bool check,
                                   bool preloaded);

 // Only emits letters (things that have case).  Only used for case independent
 // matches.
-static inline bool EmitAtomLetter(Isolate* isolate,
-                                  RegExpCompiler* compiler,
-                                  uc16 c,
-                                  Label* on_failure,
-                                  int cp_offset,
-                                  bool check,
+static inline bool EmitAtomLetter(RegExpCompiler* compiler, uc16 c,
+                                  Label* on_failure, int cp_offset, bool check,
                                  bool preloaded) {
  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
-  bool one_byte = compiler->one_byte();
  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
+  int length = GetCaseIndependentLetters(compiler, c, chars);
  if (length <= 1) return false;
  // We may not need to check against the end of the input string
  // if this character lies before a character that matched.
@ -1749,8 +1744,8 @@ static inline bool EmitAtomLetter(Isolate* isolate,
  DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
  switch (length) {
    case 2: {
-      if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
-                                    chars[1], on_failure)) {
+      if (ShortCutEmitCharacterPair(macro_assembler, compiler->one_byte(),
+                                    chars[0], chars[1], on_failure)) {
      } else {
        macro_assembler->CheckCharacter(chars[0], &ok);
        macro_assembler->CheckNotCharacter(chars[1], on_failure);
@ -2287,13 +2282,12 @@ int ActionNode::EatsAtLeast(int still_to_find,
                                   not_at_start);
 }

-
-void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
+void ActionNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                              BoyerMooreLookahead* bm, bool not_at_start) {
  if (action_type_ == BEGIN_SUBMATCH) {
    bm->SetRest(offset);
  } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
-    on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
+    on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
  }
  SaveBMInfo(bm, not_at_start, offset);
 }
@ -2314,12 +2308,12 @@ int AssertionNode::EatsAtLeast(int still_to_find,
                                   not_at_start);
 }

-
-void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
-                                 BoyerMooreLookahead* bm, bool not_at_start) {
+void AssertionNode::FillInBMInfo(RegExpCompiler* compiler, int offset,
+                                 int budget, BoyerMooreLookahead* bm,
+                                 bool not_at_start) {
  // Match the behaviour of EatsAtLeast on this node.
  if (assertion_type() == AT_START && not_at_start) return;
-  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
+  on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
  SaveBMInfo(bm, not_at_start, offset);
 }

@ -2533,7 +2527,6 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
  // Do not collect any quick check details if the text node reads backward,
  // since it reads in the opposite direction than we use for quick checks.
  if (read_backward()) return;
-  Isolate* isolate = compiler->macro_assembler()->isolate();
  DCHECK(characters_filled_in < details->characters());
  int characters = details->characters();
  int char_mask;
@ -2552,8 +2545,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
        uc16 c = quarks[i];
        if (compiler->ignore_case()) {
          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-          int length = GetCaseIndependentLetters(isolate, c,
-                                                 compiler->one_byte(), chars);
+          int length = GetCaseIndependentLetters(compiler, c, chars);
          if (length == 0) {
            // This can happen because all case variants are non-Latin1, but we
            // know the input is Latin1.
@ -2758,18 +2750,17 @@ class VisitMarker {
  NodeInfo* info_;
 };

-
-RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
+RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
  DCHECK(!info()->visited);
  VisitMarker marker(info());
-  return FilterSuccessor(depth - 1, ignore_case);
+  return FilterSuccessor(depth - 1, compiler);
 }

-
-RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
-  RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth,
+                                           RegExpCompiler* compiler) {
+  RegExpNode* next = on_success_->FilterOneByte(depth - 1, compiler);
  if (next == NULL) return set_replacement(NULL);
  on_success_ = next;
  return set_replacement(this);
@ -2792,8 +2783,30 @@ static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
  return false;
 }

+// Maps |c| to a Latin1 case-equivalent, or returns 0 if there is none.
+static uc16 ConvertNonLatin1ToEquivalentLatin1(bool unicode, uc16 c) {
+#ifdef V8_I18N_SUPPORT
+  if (unicode) {
+    USet* set = uset_open(c, c);
+    uset_closeOver(set, USET_CASE_INSENSITIVE);
+    uset_removeAllStrings(set);
+    uc16 result = 0;
+    for (int i = 0, length = uset_size(set); i < length; i++) {
+      uc32 equivalent = uset_charAt(set, i);  // Do not shadow parameter |c|.
+      if (equivalent <= String::kMaxOneByteCharCode) {
+        result = static_cast<uc16>(equivalent);
+        break;
+      }
+    }
+    uset_close(set);
+    return result;
+  }
+// Non-unicode patterns, and builds without ICU, fall through to unibrow.
+#endif  // V8_I18N_SUPPORT
+  return unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
+}

-RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
+RegExpNode* TextNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
  DCHECK(!info()->visited);
@ -2804,16 +2817,17 @@ RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
    if (elm.text_type() == TextElement::ATOM) {
      Vector<const uc16> quarks = elm.atom()->data();
      for (int j = 0; j < quarks.length(); j++) {
-        uint16_t c = quarks[j];
+        uc16 c = quarks[j];
        if (c <= String::kMaxOneByteCharCode) continue;
-        if (!ignore_case) return set_replacement(NULL);
+        if (!compiler->ignore_case()) return set_replacement(NULL);
        // Here, we need to check for characters whose upper and lower cases
        // are outside the Latin-1 range.
-        uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
+        uc16 converted =
+            ConvertNonLatin1ToEquivalentLatin1(compiler->unicode(), c);
        // Character is outside Latin-1 completely
        if (converted == 0) return set_replacement(NULL);
        // Convert quark to Latin-1 in place.
-        uint16_t* copy = const_cast<uint16_t*>(quarks.start());
+        uc16* copy = const_cast<uc16*>(quarks.start());
        copy[j] = converted;
      }
    } else {
@ -2828,24 +2842,25 @@ RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
            ranges->at(0).from() == 0 &&
            ranges->at(0).to() >= String::kMaxOneByteCharCode) {
          // This will be handled in a later filter.
-          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
+          if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))
+            continue;
          return set_replacement(NULL);
        }
      } else {
        if (range_count == 0 ||
            ranges->at(0).from() > String::kMaxOneByteCharCode) {
          // This will be handled in a later filter.
-          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
+          if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))
+            continue;
          return set_replacement(NULL);
        }
      }
    }
  }
-  return FilterSuccessor(depth - 1, ignore_case);
+  return FilterSuccessor(depth - 1, compiler);
 }

-
-RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
+RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
  if (info()->visited) return this;
@ -2853,17 +2868,16 @@ RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
    VisitMarker marker(info());

    RegExpNode* continue_replacement =
-        continue_node_->FilterOneByte(depth - 1, ignore_case);
+        continue_node_->FilterOneByte(depth - 1, compiler);
    // If we can't continue after the loop then there is no sense in doing the
    // loop.
    if (continue_replacement == NULL) return set_replacement(NULL);
  }

-  return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
+  return ChoiceNode::FilterOneByte(depth - 1, compiler);
 }

-
-RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
+RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
  if (info()->visited) return this;
@ -2883,7 +2897,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
  for (int i = 0; i < choice_count; i++) {
    GuardedAlternative alternative = alternatives_->at(i);
    RegExpNode* replacement =
-        alternative.node()->FilterOneByte(depth - 1, ignore_case);
+        alternative.node()->FilterOneByte(depth - 1, compiler);
    DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
    if (replacement != NULL) {
      alternatives_->at(i).set_node(replacement);
@ -2903,7 +2917,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
      new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
  for (int i = 0; i < choice_count; i++) {
    RegExpNode* replacement =
-        alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
+        alternatives_->at(i).node()->FilterOneByte(depth - 1, compiler);
    if (replacement != NULL) {
      alternatives_->at(i).set_node(replacement);
      new_alternatives->Add(alternatives_->at(i), zone());
@ -2913,9 +2927,8 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
  return this;
 }

-
-RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
-                                                        bool ignore_case) {
+RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(
+    int depth, RegExpCompiler* compiler) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
  if (info()->visited) return this;
@ -2923,12 +2936,12 @@ RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
  // Alternative 0 is the negative lookahead, alternative 1 is what comes
  // afterwards.
  RegExpNode* node = alternatives_->at(1).node();
-  RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
+  RegExpNode* replacement = node->FilterOneByte(depth - 1, compiler);
  if (replacement == NULL) return set_replacement(NULL);
  alternatives_->at(1).set_node(replacement);

  RegExpNode* neg_node = alternatives_->at(0).node();
-  RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
+  RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, compiler);
  // If the negative lookahead is always going to fail then
  // we don't need to check it.
  if (neg_replacement == NULL) return set_replacement(replacement);
@ -2949,15 +2962,15 @@ void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
                                          not_at_start);
 }

-
-void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
-                                  BoyerMooreLookahead* bm, bool not_at_start) {
+void LoopChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,
+                                  int budget, BoyerMooreLookahead* bm,
+                                  bool not_at_start) {
  if (body_can_be_zero_length_ || budget <= 0) {
    bm->SetRest(offset);
    SaveBMInfo(bm, not_at_start, offset);
    return;
  }
-  ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
+  ChoiceNode::FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
  SaveBMInfo(bm, not_at_start, offset);
 }

@ -3049,7 +3062,6 @@ static void EmitHat(RegExpCompiler* compiler,
 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
  RegExpMacroAssembler* assembler = compiler->macro_assembler();
-  Isolate* isolate = assembler->isolate();
  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
  bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
@ -3061,7 +3073,7 @@ void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
    if (eats_at_least >= 1) {
      BoyerMooreLookahead* bm =
          new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
-      FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
+      FillInBMInfo(compiler, 0, kRecursionBudget, bm, not_at_start);
      if (bm->at(0)->is_non_word())
        next_is_word_character = Trace::FALSE_VALUE;
      if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
@ -3233,7 +3245,6 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
                            bool first_element_checked,
                            int* checked_up_to) {
  RegExpMacroAssembler* assembler = compiler->macro_assembler();
-  Isolate* isolate = assembler->isolate();
  bool one_byte = compiler->one_byte();
  Label* backtrack = trace->backtrack();
  QuickCheckDetails* quick_check = trace->quick_check_performed();
@ -3251,6 +3262,7 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
        switch (pass) {
          case NON_LATIN1_MATCH:
            DCHECK(one_byte);
+            DCHECK(!(compiler->unicode() && compiler->ignore_case()));
            if (quarks[j] > String::kMaxOneByteCharCode) {
              assembler->GoTo(backtrack);
              return;
@ -3271,8 +3283,8 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
        if (emit_function != NULL) {
          bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
          bool bound_checked =
-              emit_function(isolate, compiler, quarks[j], backtrack,
-                            cp_offset + j, bounds_check, preloaded);
+              emit_function(compiler, quarks[j], backtrack, cp_offset + j,
+                            bounds_check, preloaded);
          if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
        }
      }
@ -3355,7 +3367,13 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
    return;
  }

-  if (compiler->one_byte()) {
+  if (compiler->one_byte() &&
+      !(compiler->unicode() && compiler->ignore_case())) {
+    // If any character within the text node is outside the Latin1 range, it
+    // cannot possibly match anything in a one-byte string. This still holds
+    // for case-insensitive non-unicode regexp patterns. However, for
+    // case-insensitive unicode regexp patterns, this is no longer true, e.g.
+    // /\u212b/ui matches "\u00c5".
    int dummy = 0;
    TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
  }
@ -4107,7 +4125,6 @@ int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
  DCHECK(trace->is_trivial());

  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
-  Isolate* isolate = macro_assembler->isolate();
  // At this point we know that we are at a non-greedy loop that will eat
  // any character one at a time.  Any non-anchored regexp has such a
  // loop prepended to it in order to find where it starts.  We look for
@ -4126,7 +4143,7 @@ int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
                                           compiler,
                                           zone());
      GuardedAlternative alt0 = alternatives_->at(0);
-      alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
+      alt0.node()->FillInBMInfo(compiler, 0, kRecursionBudget, bm, false);
    }
  }
  if (bm != NULL) {
@ -6388,9 +6405,8 @@ void Analysis::VisitAssertion(AssertionNode* that) {
  EnsureAnalyzed(that->on_success());
 }

-
-void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
-                                     BoyerMooreLookahead* bm,
+void BackReferenceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,
+                                     int budget, BoyerMooreLookahead* bm,
                                     bool not_at_start) {
  // Working out the set of characters that a backreference can match is too
  // hard, so we just say that any character can match.
@ -6402,8 +6418,7 @@ void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
              RegExpMacroAssembler::kTableSize);

-
-void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
+void ChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                              BoyerMooreLookahead* bm, bool not_at_start) {
  ZoneList<GuardedAlternative>* alts = alternatives();
  budget = (budget - 1) / alts->length();
@ -6414,14 +6429,14 @@ void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
      SaveBMInfo(bm, not_at_start, offset);
      return;
    }
-    alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
+    alt.node()->FillInBMInfo(compiler, offset, budget, bm, not_at_start);
  }
  SaveBMInfo(bm, not_at_start, offset);
 }

-
-void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
-                            BoyerMooreLookahead* bm, bool not_at_start) {
+void TextNode::FillInBMInfo(RegExpCompiler* compiler, int initial_offset,
+                            int budget, BoyerMooreLookahead* bm,
+                            bool not_at_start) {
  if (initial_offset >= bm->length()) return;
  int offset = initial_offset;
  int max_char = bm->max_char();
@ -6441,9 +6456,7 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
        uc16 character = atom->data()[j];
        if (bm->compiler()->ignore_case()) {
          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-          int length = GetCaseIndependentLetters(
-              isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
-              chars);
+          int length = GetCaseIndependentLetters(compiler, character, chars);
          for (int j = 0; j < length; j++) {
            bm->Set(offset, chars[j]);
          }
@ -6472,7 +6485,7 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
    if (initial_offset == 0) set_bm_info(not_at_start, bm);
    return;
  }
-  on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
+  on_success()->FillInBMInfo(compiler, offset, budget - 1, bm,
                             true);  // Not at start after a text node.
  if (initial_offset == 0) set_bm_info(not_at_start, bm);
 }
@ -6630,7 +6643,6 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
  if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
    return IrregexpRegExpTooBig(isolate);
  }
-  bool ignore_case = flags & JSRegExp::kIgnoreCase;
  bool is_sticky = flags & JSRegExp::kSticky;
  bool is_global = flags & JSRegExp::kGlobal;
  bool is_unicode = flags & JSRegExp::kUnicode;
@ -6680,11 +6692,11 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(
    }
  }
  if (is_one_byte) {
-    node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
+    node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);
    // Do it again to propagate the new nodes to places where they were not
    // put because they had not been calculated yet.
    if (node != NULL) {
-      node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
+      node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);
    }
  } else if (compiler.unicode() && (is_global || is_sticky)) {
    node = OptionallyStepBackToLeadSurrogate(&compiler, node);
--- a/src/regexp/jsregexp.h
+++ b/src/regexp/jsregexp.h
@ -529,7 +529,7 @@ class RegExpNode: public ZoneObject {
  // the number of nodes we are willing to look at in order to create this data.
  static const int kRecursionBudget = 200;
  bool KeepRecursing(RegExpCompiler* compiler);
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start) {
    UNREACHABLE();
  }
@ -537,7 +537,7 @@ class RegExpNode: public ZoneObject {
  // If we know that the input is one-byte then there are some nodes that can
  // never match.  This method returns a node that can be substituted for
  // itself, or NULL if the node can never match.
-  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
+  virtual RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler) {
    return this;
  }
  // Helper for FilterOneByte.
@ -611,15 +611,15 @@ class SeqRegExpNode: public RegExpNode {
      : RegExpNode(on_success->zone()), on_success_(on_success) { }
  RegExpNode* on_success() { return on_success_; }
  void set_on_success(RegExpNode* node) { on_success_ = node; }
-  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler);
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start) {
-    on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
+    on_success_->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
    if (offset == 0) set_bm_info(not_at_start, bm);
  }

 protected:
-  RegExpNode* FilterSuccessor(int depth, bool ignore_case);
+  RegExpNode* FilterSuccessor(int depth, RegExpCompiler* compiler);

 private:
  RegExpNode* on_success_;
@ -665,7 +665,7 @@ class ActionNode: public SeqRegExpNode {
    return on_success()->GetQuickCheckDetails(
        details, compiler, filled_in, not_at_start);
  }
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start);
  ActionType action_type() { return action_type_; }
  // TODO(erikcorry): We should allow some action nodes in greedy loops.
@ -744,10 +744,10 @@ class TextNode: public SeqRegExpNode {
  virtual int GreedyLoopTextLength();
  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
      RegExpCompiler* compiler);
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start);
  void CalculateOffsets();
-  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
+  virtual RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler);

 private:
  enum TextEmitPassType {
@ -803,7 +803,7 @@ class AssertionNode: public SeqRegExpNode {
                                    RegExpCompiler* compiler,
                                    int filled_in,
                                    bool not_at_start);
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start);
  AssertionType assertion_type() { return assertion_type_; }

@ -841,7 +841,7 @@ class BackReferenceNode: public SeqRegExpNode {
                                    bool not_at_start) {
    return;
  }
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start);

 private:
@ -867,7 +867,7 @@ class EndNode: public RegExpNode {
    // Returning 0 from EatsAtLeast should ensure we never get here.
    UNREACHABLE();
  }
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start) {
    // Returning 0 from EatsAtLeast should ensure we never get here.
    UNREACHABLE();
@ -960,7 +960,7 @@ class ChoiceNode: public RegExpNode {
                                    RegExpCompiler* compiler,
                                    int characters_filled_in,
                                    bool not_at_start);
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start);

  bool being_calculated() { return being_calculated_; }
@ -970,7 +970,7 @@ class ChoiceNode: public RegExpNode {
  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
    return true;
  }
-  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
+  virtual RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler);
  virtual bool read_backward() { return false; }

 protected:
@ -1028,9 +1028,9 @@ class NegativeLookaroundChoiceNode : public ChoiceNode {
                                    RegExpCompiler* compiler,
                                    int characters_filled_in,
                                    bool not_at_start);
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start) {
-    alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
+    alternatives_->at(1).node()->FillInBMInfo(compiler, offset, budget - 1, bm,
                                              not_at_start);
    if (offset == 0) set_bm_info(not_at_start, bm);
  }
@ -1042,7 +1042,7 @@ class NegativeLookaroundChoiceNode : public ChoiceNode {
  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
    return !is_first;
  }
-  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
+  virtual RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler);
 };


@ -1062,14 +1062,14 @@ class LoopChoiceNode: public ChoiceNode {
                                    RegExpCompiler* compiler,
                                    int characters_filled_in,
                                    bool not_at_start);
-  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
+  virtual void FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
                            BoyerMooreLookahead* bm, bool not_at_start);
  RegExpNode* loop_node() { return loop_node_; }
  RegExpNode* continue_node() { return continue_node_; }
  bool body_can_be_zero_length() { return body_can_be_zero_length_; }
  virtual bool read_backward() { return read_backward_; }
  virtual void Accept(NodeVisitor* visitor);
-  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
+  virtual RegExpNode* FilterOneByte(int depth, RegExpCompiler* compiler);

 private:
  // AddAlternative is made private for loop nodes because alternatives
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@ -1294,7 +1294,10 @@ bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {

 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
 #ifdef V8_I18N_SUPPORT
-  if (unicode() && ignore_case()) {
+  if (unicode() && ignore_case() && c >= kNonBmpStart) {
+    // BMP characters are handled in the case-insensitive TextEmitPass.
+    // Surrogate code units do not have case equivalents.
+    // Non-BMP characters need to be desugared into two uc16 parts.
    USet* set = uset_open(c, c);
    uset_closeOver(set, USET_CASE_INSENSITIVE);
    uset_removeAllStrings(set);
--- a/src/unicode.h
+++ b/src/unicode.h
@ -15,8 +15,8 @@

 namespace unibrow {

-typedef unsigned int uchar;
-typedef unsigned char byte;
+typedef int32_t uchar;
+typedef uint8_t byte;

 /**
 * The max length of the result of converting the case of a single
@ -130,7 +130,7 @@ class Utf16 {

 class Utf8 {
 public:
-  static inline uchar Length(uchar chr, int previous);
+  static inline unsigned Length(uchar chr, int previous);
  static inline unsigned EncodeOneByte(char* out, uint8_t c);
  static inline unsigned Encode(char* out,
                                uchar c,
--- a/test/cctest/test-strings.cc
+++ b/test/cctest/test-strings.cc
@ -1382,7 +1382,7 @@ TEST(IsAscii) {

 template<typename Op, bool return_first>
 static uint16_t ConvertLatin1(uint16_t c) {
-  uint32_t result[Op::kMaxWidth];
+  uc32 result[Op::kMaxWidth];
  int chars;
  chars = Op::Convert(c, 0, result, NULL);
  if (chars == 0) return 0;