From eee6c6405e1e4847f9f83abee17f7ff4dd8459cc Mon Sep 17 00:00:00 2001 From: "lrn@chromium.org" Date: Mon, 18 Jan 2010 09:49:50 +0000 Subject: [PATCH] RegExp bitmap test for word character. Review URL: http://codereview.chromium.org/547024 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@3626 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/arm/regexp-macro-assembler-arm.cc | 96 +++++++++++-------------- src/assembler.cc | 4 ++ src/assembler.h | 4 ++ src/ia32/assembler-ia32.cc | 8 +++ src/ia32/assembler-ia32.h | 1 + src/ia32/disasm-ia32.cc | 1 + src/ia32/regexp-macro-assembler-ia32.cc | 61 +++++++--------- src/regexp-macro-assembler.cc | 24 +++++++ src/regexp-macro-assembler.h | 9 +++ src/serialize.cc | 8 ++- src/x64/assembler-x64.cc | 14 ++++ src/x64/assembler-x64.h | 1 + src/x64/regexp-macro-assembler-x64.cc | 65 +++++++---------- 13 files changed, 166 insertions(+), 130 deletions(-) diff --git a/src/arm/regexp-macro-assembler-arm.cc b/src/arm/regexp-macro-assembler-arm.cc index 5ea7751046..ed06eb26c2 100644 --- a/src/arm/regexp-macro-assembler-arm.cc +++ b/src/arm/regexp-macro-assembler-arm.cc @@ -526,64 +526,54 @@ bool RegExpMacroAssemblerARM::CheckSpecialCharacterClass(uc16 type, return true; } case 'n': { - // Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029) - __ eor(r0, current_character(), Operand(0x01)); - // See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c - __ sub(r0, r0, Operand(0x0b)); - __ cmp(r0, Operand(0x0c - 0x0b)); - if (mode_ == ASCII) { - BranchOrBacktrack(hi, on_no_match); - } else { - Label done; - __ b(ls, &done); - // Compare original value to 0x2028 and 0x2029, using the already - // computed (current_char ^ 0x01 - 0x0b). I.e., check for - // 0x201d (0x2028 - 0x0b) or 0x201e. 
- __ sub(r0, r0, Operand(0x2028 - 0x0b)); - __ cmp(r0, Operand(1)); - BranchOrBacktrack(hi, on_no_match); - __ bind(&done); - } - return true; + // Match newlines (0x0a('\n'), 0x0d('\r'), 0x2028 and 0x2029) + __ eor(r0, current_character(), Operand(0x01)); + // See if current character is '\n'^1 or '\r'^1, i.e., 0x0b or 0x0c + __ sub(r0, r0, Operand(0x0b)); + __ cmp(r0, Operand(0x0c - 0x0b)); + if (mode_ == ASCII) { + BranchOrBacktrack(hi, on_no_match); + } else { + Label done; + __ b(ls, &done); + // Compare original value to 0x2028 and 0x2029, using the already + // computed (current_char ^ 0x01 - 0x0b). I.e., check for + // 0x201d (0x2028 - 0x0b) or 0x201e. + __ sub(r0, r0, Operand(0x2028 - 0x0b)); + __ cmp(r0, Operand(1)); + BranchOrBacktrack(hi, on_no_match); + __ bind(&done); } + return true; + } case 'w': { - // Match word character (0-9, A-Z, a-z and _). - Label digits, done; - __ cmp(current_character(), Operand('9')); - __ b(ls, &digits); - __ cmp(current_character(), Operand('_')); - __ b(eq, &done); - __ orr(r0, current_character(), Operand(0x20)); - __ sub(r0, r0, Operand('a')); - __ cmp(r0, Operand('z' - 'a')); - BranchOrBacktrack(hi, on_no_match); - __ jmp(&done); - - __ bind(&digits); - __ cmp(current_character(), Operand('0')); - BranchOrBacktrack(lo, on_no_match); - __ bind(&done); - + if (mode_ != ASCII) { + // Table is 128 entries, so all ASCII characters can be tested. + __ cmp(current_character(), Operand('z')); + BranchOrBacktrack(hi, on_no_match); + } + ExternalReference map = ExternalReference::re_word_character_map(); + __ mov(r0, Operand(map)); + __ ldrb(r0, MemOperand(r0, current_character())); + __ tst(r0, Operand(r0)); + BranchOrBacktrack(eq, on_no_match); return true; } case 'W': { - // Match non-word character (not 0-9, A-Z, a-z and _). 
- Label digits, done; - __ cmp(current_character(), Operand('9')); - __ b(ls, &digits); - __ cmp(current_character(), Operand('_')); - BranchOrBacktrack(eq, on_no_match); - __ orr(r0, current_character(), Operand(0x20)); - __ sub(r0, r0, Operand('a')); - __ cmp(r0, Operand('z' - 'a')); - BranchOrBacktrack(ls, on_no_match); - __ jmp(&done); - - __ bind(&digits); - __ cmp(current_character(), Operand('0')); - BranchOrBacktrack(hs, on_no_match); - __ bind(&done); - + Label done; + if (mode_ != ASCII) { + // Table is 128 entries, so all ASCII characters can be tested. + __ cmp(current_character(), Operand('z')); + __ b(hi, &done); + } + ExternalReference map = ExternalReference::re_word_character_map(); + __ mov(r0, Operand(map)); + __ ldrb(r0, MemOperand(r0, current_character())); + __ tst(r0, Operand(r0)); + BranchOrBacktrack(ne, on_no_match); + if (mode_ != ASCII) { + __ bind(&done); + } return true; } case '*': diff --git a/src/assembler.cc b/src/assembler.cc index c1a354d4e2..fcdb14ae06 100644 --- a/src/assembler.cc +++ b/src/assembler.cc @@ -670,6 +670,10 @@ ExternalReference ExternalReference::re_case_insensitive_compare_uc16() { FUNCTION_ADDR(NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16))); } +ExternalReference ExternalReference::re_word_character_map() { + return ExternalReference( + NativeRegExpMacroAssembler::word_character_map_address()); +} ExternalReference ExternalReference::address_of_static_offsets_vector() { return ExternalReference(OffsetsVector::static_offsets_vector_address()); diff --git a/src/assembler.h b/src/assembler.h index b3077a0c93..ec47d5712b 100644 --- a/src/assembler.h +++ b/src/assembler.h @@ -462,6 +462,10 @@ class ExternalReference BASE_EMBEDDED { // Function NativeRegExpMacroAssembler::GrowStack() static ExternalReference re_grow_stack(); + + // byte NativeRegExpMacroAssembler::word_character_map + static ExternalReference re_word_character_map(); + #endif // This lets you register a function that rewrites all external 
references. diff --git a/src/ia32/assembler-ia32.cc b/src/ia32/assembler-ia32.cc index 0e9ffeaab8..2cf469aeb8 100644 --- a/src/ia32/assembler-ia32.cc +++ b/src/ia32/assembler-ia32.cc @@ -1261,6 +1261,14 @@ void Assembler::test(Register reg, const Operand& op) { } +void Assembler::test_b(Register reg, const Operand& op) { + EnsureSpace ensure_space(this); + last_pc_ = pc_; + EMIT(0x84); + emit_operand(reg, op); +} + + void Assembler::test(const Operand& op, const Immediate& imm) { EnsureSpace ensure_space(this); last_pc_ = pc_; diff --git a/src/ia32/assembler-ia32.h b/src/ia32/assembler-ia32.h index f35abd5756..d675ecf528 100644 --- a/src/ia32/assembler-ia32.h +++ b/src/ia32/assembler-ia32.h @@ -624,6 +624,7 @@ class Assembler : public Malloced { void test(Register reg, const Immediate& imm); void test(Register reg, const Operand& op); + void test_b(Register reg, const Operand& op); void test(const Operand& op, const Immediate& imm); void xor_(Register dst, int32_t imm32); diff --git a/src/ia32/disasm-ia32.cc b/src/ia32/disasm-ia32.cc index 581cdc0754..1fbaa3ce8d 100644 --- a/src/ia32/disasm-ia32.cc +++ b/src/ia32/disasm-ia32.cc @@ -63,6 +63,7 @@ static ByteMnemonic two_operands_instr[] = { {0x29, "sub", OPER_REG_OP_ORDER}, {0x2A, "subb", REG_OPER_OP_ORDER}, {0x2B, "sub", REG_OPER_OP_ORDER}, + {0x84, "test_b", REG_OPER_OP_ORDER}, {0x85, "test", REG_OPER_OP_ORDER}, {0x31, "xor", OPER_REG_OP_ORDER}, {0x33, "xor", REG_OPER_OP_ORDER}, diff --git a/src/ia32/regexp-macro-assembler-ia32.cc b/src/ia32/regexp-macro-assembler-ia32.cc index e41f9c3f0c..4af59dd6aa 100644 --- a/src/ia32/regexp-macro-assembler-ia32.cc +++ b/src/ia32/regexp-macro-assembler-ia32.cc @@ -539,46 +539,33 @@ bool RegExpMacroAssemblerIA32::CheckSpecialCharacterClass(uc16 type, return true; } case 'w': { - Label done, check_digits; - __ cmp(Operand(current_character()), Immediate('9')); - __ j(less_equal, &check_digits); - __ cmp(Operand(current_character()), Immediate('_')); - __ j(equal, &done); - // 
Convert to lower case if letter. - __ mov(Operand(eax), current_character()); - __ or_(eax, 0x20); - // check current character in range ['a'..'z'], nondestructively. - __ sub(Operand(eax), Immediate('a')); - __ cmp(Operand(eax), Immediate('z' - 'a')); - BranchOrBacktrack(above, on_no_match); - __ jmp(&done); - __ bind(&check_digits); - // Check current character in range ['0'..'9']. - __ cmp(Operand(current_character()), Immediate('0')); - BranchOrBacktrack(below, on_no_match); - __ bind(&done); - + if (mode_ != ASCII) { + // Table is 128 entries, so all ASCII characters can be tested. + __ cmp(Operand(current_character()), Immediate('z')); + BranchOrBacktrack(above, on_no_match); + } + ASSERT_EQ(0, word_character_map[0]); // Character '\0' is not a word char. + ExternalReference word_map = ExternalReference::re_word_character_map(); + __ test_b(current_character(), + Operand::StaticArray(current_character(), times_1, word_map)); + BranchOrBacktrack(zero, on_no_match); return true; } case 'W': { - Label done, check_digits; - __ cmp(Operand(current_character()), Immediate('9')); - __ j(less_equal, &check_digits); - __ cmp(Operand(current_character()), Immediate('_')); - BranchOrBacktrack(equal, on_no_match); - // Convert to lower case if letter. - __ mov(Operand(eax), current_character()); - __ or_(eax, 0x20); - // check current character in range ['a'..'z'], nondestructively. - __ sub(Operand(eax), Immediate('a')); - __ cmp(Operand(eax), Immediate('z' - 'a')); - BranchOrBacktrack(below_equal, on_no_match); - __ jmp(&done); - __ bind(&check_digits); - // Check current character in range ['0'..'9']. - __ cmp(Operand(current_character()), Immediate('0')); - BranchOrBacktrack(above_equal, on_no_match); - __ bind(&done); + Label done; + if (mode_ != ASCII) { + // Table is 128 entries, so all ASCII characters can be tested. 
+ __ cmp(Operand(current_character()), Immediate('z')); + __ j(above, &done); + } + ASSERT_EQ(0, word_character_map[0]); // Character '\0' is not a word char. + ExternalReference word_map = ExternalReference::re_word_character_map(); + __ test_b(current_character(), + Operand::StaticArray(current_character(), times_1, word_map)); + BranchOrBacktrack(not_zero, on_no_match); + if (mode_ != ASCII) { + __ bind(&done); + } return true; } // Non-standard classes (with no syntactic shorthand) used internally. diff --git a/src/regexp-macro-assembler.cc b/src/regexp-macro-assembler.cc index c73e02a8e1..3685fcd3d8 100644 --- a/src/regexp-macro-assembler.cc +++ b/src/regexp-macro-assembler.cc @@ -189,6 +189,30 @@ NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute( static unibrow::Mapping canonicalize; + +byte NativeRegExpMacroAssembler::word_character_map[] = { + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '0' - '7' + 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9' + + 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'A' - 'G' + 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'H' - 'O' + 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'P' - 'W' + 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu, // 'X' - 'Z', '_' + + 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'a' - 'g' + 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o' + 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w' + 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z' +}; + + int 
NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16( Address byte_offset1, Address byte_offset2, diff --git a/src/regexp-macro-assembler.h b/src/regexp-macro-assembler.h index 7cc95110e3..2e619bd15b 100644 --- a/src/regexp-macro-assembler.h +++ b/src/regexp-macro-assembler.h @@ -204,6 +204,15 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { static const byte* StringCharacterPosition(String* subject, int start_index); + // Byte map of ASCII characters with a 0xff if the character is a word + // character (digit, letter or underscore) and 0x00 otherwise. + // Used by generated RegExp code. + static byte word_character_map[128]; + + static Address word_character_map_address() { + return &word_character_map[0]; + } + static Result Execute(Code* code, String* input, int start_offset, diff --git a/src/serialize.cc b/src/serialize.cc index de2fb8e331..5d497a1ff8 100644 --- a/src/serialize.cc +++ b/src/serialize.cc @@ -479,15 +479,19 @@ void ExternalReferenceTable::PopulateTable() { UNCLASSIFIED, 21, "NativeRegExpMacroAssembler::GrowStack()"); + Add(ExternalReference::re_word_character_map().address(), + UNCLASSIFIED, + 22, + "NativeRegExpMacroAssembler::word_character_map"); #endif // Keyed lookup cache. Add(ExternalReference::keyed_lookup_cache_keys().address(), UNCLASSIFIED, - 22, + 23, "KeyedLookupCache::keys()"); Add(ExternalReference::keyed_lookup_cache_field_offsets().address(), UNCLASSIFIED, - 23, + 24, "KeyedLookupCache::field_offsets()"); } diff --git a/src/x64/assembler-x64.cc b/src/x64/assembler-x64.cc index 2d524eaf4b..4ac39339c9 100644 --- a/src/x64/assembler-x64.cc +++ b/src/x64/assembler-x64.cc @@ -1880,6 +1880,20 @@ void Assembler::testb(const Operand& op, Immediate mask) { } +void Assembler::testb(const Operand& op, Register reg) { + EnsureSpace ensure_space(this); + last_pc_ = pc_; + if (reg.code() > 3) { + // Register is not one of al, bl, cl, dl. Its encoding needs REX. 
+ emit_rex_32(reg, op); + } else { + emit_optional_rex_32(reg, op); + } + emit(0x84); + emit_operand(reg, op); +} + + void Assembler::testl(Register dst, Register src) { EnsureSpace ensure_space(this); last_pc_ = pc_; diff --git a/src/x64/assembler-x64.h b/src/x64/assembler-x64.h index fa7d33b1b2..28e03bcb6f 100644 --- a/src/x64/assembler-x64.h +++ b/src/x64/assembler-x64.h @@ -931,6 +931,7 @@ class Assembler : public Malloced { void testb(Register dst, Register src); void testb(Register reg, Immediate mask); void testb(const Operand& op, Immediate mask); + void testb(const Operand& op, Register reg); void testl(Register dst, Register src); void testl(Register reg, Immediate mask); void testl(const Operand& op, Immediate mask); diff --git a/src/x64/regexp-macro-assembler-x64.cc b/src/x64/regexp-macro-assembler-x64.cc index 09cb9177a5..75bbf3e2e5 100644 --- a/src/x64/regexp-macro-assembler-x64.cc +++ b/src/x64/regexp-macro-assembler-x64.cc @@ -582,49 +582,38 @@ bool RegExpMacroAssemblerX64::CheckSpecialCharacterClass(uc16 type, return true; } case 'w': { - Label done, check_digits; - __ cmpl(current_character(), Immediate('9')); - __ j(less_equal, &check_digits); - __ cmpl(current_character(), Immediate('_')); - __ j(equal, &done); - // Convert to lower case if letter. - __ movl(rax, current_character()); - __ orl(rax, Immediate(0x20)); - // check rax in range ['a'..'z']. - __ subl(rax, Immediate('a')); - __ cmpl(rax, Immediate('z' - 'a')); - BranchOrBacktrack(above, on_no_match); - __ jmp(&done); - __ bind(&check_digits); - // Check current character in range ['0'..'9']. - __ cmpl(current_character(), Immediate('0')); - BranchOrBacktrack(below, on_no_match); - __ bind(&done); - + if (mode_ != ASCII) { + // Table is 128 entries, so all ASCII characters can be tested. 
+ __ cmpl(current_character(), Immediate('z')); + BranchOrBacktrack(above, on_no_match); + } + ASSERT_EQ(0, word_character_map[0]); // Character '\0' is not a word char. + ExternalReference word_map = ExternalReference::re_word_character_map(); + __ movq(rbx, word_map); + __ testb(Operand(rbx, current_character(), times_1, 0), + current_character()); + BranchOrBacktrack(zero, on_no_match); return true; } case 'W': { - Label done, check_digits; - __ cmpl(current_character(), Immediate('9')); - __ j(less_equal, &check_digits); - __ cmpl(current_character(), Immediate('_')); - BranchOrBacktrack(equal, on_no_match); - // Convert to lower case if letter. - __ movl(rax, current_character()); - __ orl(rax, Immediate(0x20)); - // check current character in range ['a'..'z'], nondestructively. - __ subl(rax, Immediate('a')); - __ cmpl(rax, Immediate('z' - 'a')); - BranchOrBacktrack(below_equal, on_no_match); - __ jmp(&done); - __ bind(&check_digits); - // Check current character in range ['0'..'9']. - __ cmpl(current_character(), Immediate('0')); - BranchOrBacktrack(above_equal, on_no_match); - __ bind(&done); - + Label done; + if (mode_ != ASCII) { + // Table is 128 entries, so all ASCII characters can be tested. + __ cmpl(current_character(), Immediate('z')); + __ j(above, &done); + } + ASSERT_EQ(0, word_character_map[0]); // Character '\0' is not a word char. + ExternalReference word_map = ExternalReference::re_word_character_map(); + __ movq(rbx, word_map); + __ testb(Operand(rbx, current_character(), times_1, 0), + current_character()); + BranchOrBacktrack(not_zero, on_no_match); + if (mode_ != ASCII) { + __ bind(&done); + } return true; } + case '*': // Match any character. return true;