v8/src/regexp/regexp-macro-assembler-tracer.h
Jakob Gruber 8bbb44e537 [regexp] Compact codegen for large character classes
Large character classes may easily be created when unicode
properties (e.g.: /\p{L}/u and /\P{L}/u) are used - these are
expanded internally into character classes that consist of hundreds
of character ranges. Previously to this CL, we'd emit branching code
for each of these ranges, leading to very large regexp code objects.

This CL adds a new codegen mode for large character classes (where
'large' currently means > 16 ranges). Instead of emitting branching
code inline, the ranges are written into a ByteArray and we call into
the C function IsCharacterInRangeArray for the actual branching logic.
The ByteArray is smaller than emitted code and is deduplicated if the
same character class is matched repeatedly in the same pattern.

Note this mode is *not* implemented for the interpreter, since we
currently don't have a constant pool for irregexp bytecode, and thus
cannot reference ByteArrays.

Bug: v8:11069
Change-Id: I2d728e42d85114b796c637f791848731a104cd54
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3229377
Reviewed-by: Patrick Thier <pthier@chromium.org>
Auto-Submit: Jakob Gruber <jgruber@chromium.org>
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#77463}
2021-10-19 18:20:54 +00:00

92 lines
4.4 KiB
C++

// Copyright 2008 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
#include "src/base/strings.h"
#include "src/regexp/regexp-macro-assembler.h"
namespace v8 {
namespace internal {
// Decorator on a RegExpMacroAssembler that write all calls.
class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
public:
RegExpMacroAssemblerTracer(Isolate* isolate, RegExpMacroAssembler* assembler);
~RegExpMacroAssemblerTracer() override;
void AbortedCodeGeneration() override;
int stack_limit_slack() override { return assembler_->stack_limit_slack(); }
bool CanReadUnaligned() const override {
return assembler_->CanReadUnaligned();
}
void AdvanceCurrentPosition(int by) override; // Signed cp change.
void AdvanceRegister(int reg, int by) override; // r[reg] += by.
void Backtrack() override;
void Bind(Label* label) override;
void CheckCharacter(unsigned c, Label* on_equal) override;
void CheckCharacterAfterAnd(unsigned c, unsigned and_with,
Label* on_equal) override;
void CheckCharacterGT(base::uc16 limit, Label* on_greater) override;
void CheckCharacterLT(base::uc16 limit, Label* on_less) override;
void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
void CheckAtStart(int cp_offset, Label* on_at_start) override;
void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) override;
void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
bool unicode,
Label* on_no_match) override;
void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with,
Label* on_not_equal) override;
void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
base::uc16 and_with,
Label* on_not_equal) override;
void CheckCharacterInRange(base::uc16 from, base::uc16 to,
Label* on_in_range) override;
void CheckCharacterNotInRange(base::uc16 from, base::uc16 to,
Label* on_not_in_range) override;
bool CheckCharacterInRangeArray(const ZoneList<CharacterRange>* ranges,
Label* on_in_range) override;
bool CheckCharacterNotInRangeArray(const ZoneList<CharacterRange>* ranges,
Label* on_not_in_range) override;
void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
void CheckPosition(int cp_offset, Label* on_outside_input) override;
bool CheckSpecialCharacterClass(StandardCharacterSet type,
Label* on_no_match) override;
void Fail() override;
Handle<HeapObject> GetCode(Handle<String> source) override;
void GoTo(Label* label) override;
void IfRegisterGE(int reg, int comparand, Label* if_ge) override;
void IfRegisterLT(int reg, int comparand, Label* if_lt) override;
void IfRegisterEqPos(int reg, Label* if_eq) override;
IrregexpImplementation Implementation() override;
void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
bool check_bounds, int characters,
int eats_at_least) override;
void PopCurrentPosition() override;
void PopRegister(int register_index) override;
void PushBacktrack(Label* label) override;
void PushCurrentPosition() override;
void PushRegister(int register_index,
StackCheckFlag check_stack_limit) override;
void ReadCurrentPositionFromRegister(int reg) override;
void ReadStackPointerFromRegister(int reg) override;
void SetCurrentPositionFromEnd(int by) override;
void SetRegister(int register_index, int to) override;
bool Succeed() override;
void WriteCurrentPositionToRegister(int reg, int cp_offset) override;
void ClearRegisters(int reg_from, int reg_to) override;
void WriteStackPointerToRegister(int reg) override;
private:
RegExpMacroAssembler* assembler_;
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_