ICU-5427 Reduce regex static memory consumption by 35%
X-SVN-Rev: 21805
This commit is contained in:
parent
d0a1a3d877
commit
ac329166b4
@ -1175,7 +1175,7 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
break;
|
||||
}
|
||||
c = peekCharLL();
|
||||
if (RegexStaticSets::gStaticSets->fRuleDigits->contains(c) == FALSE) {
|
||||
if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
nextCharLL();
|
||||
@ -3375,7 +3375,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
int32_t startX = fNextIndex; // start and end positions of the
|
||||
int32_t endX = fNextIndex; // sequence following the '\'
|
||||
if (c.fChar == chBackSlash) {
|
||||
if (RegexStaticSets::gStaticSets->fUnescapeCharSet->contains(peekCharLL())) {
|
||||
if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
|
||||
//
|
||||
// A '\' sequence that is handled by ICU's standard unescapeAt function.
|
||||
// Includes \uxxxx, \n, \r, many others.
|
||||
|
@ -1,7 +1,7 @@
|
||||
//
|
||||
// regexst.h
|
||||
//
|
||||
// Copyright (C) 2004-2006, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2004-2007, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains class RegexStaticSets
|
||||
@ -38,13 +38,13 @@
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// Unicode Set pattern strings for all of the required constant sets.
|
||||
// Initialized with hex values for portability to EBCDIC based machines.
|
||||
// Really ugly, but there's no good way to avoid it.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// "Rule Char" Characters are those with no special meaning, and therefore do not
|
||||
// need to be escaped to appear as literals in a regexp. Expressed
|
||||
@ -99,7 +99,7 @@ static const UChar gIsWordPattern[] = {
|
||||
//
|
||||
// Unicode Set Definitions for Regular Expression \s
|
||||
//
|
||||
static const UChar gIsSpacePattern[] = {
|
||||
static const UChar gIsSpacePattern[] = {
|
||||
// [ \ p { W h i t e S p a c e } ]
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x57, 0x68, 0x69, 0x74, 0x65, 0x53, 0x70, 0x61, 0x63, 0x65, 0x7d, 0x5d, 0};
|
||||
|
||||
@ -107,7 +107,7 @@ static const UChar gIsWordPattern[] = {
|
||||
//
|
||||
// UnicodeSets used in implementation of Grapheme Cluster detection, \X
|
||||
//
|
||||
static const UChar gGC_ControlPattern[] = {
|
||||
static const UChar gGC_ControlPattern[] = {
|
||||
// [ [ : Z l : ] [ : Z p : ]
|
||||
0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
|
||||
// [ : C c : ] [ : C f : ] -
|
||||
@ -117,37 +117,37 @@ static const UChar gIsWordPattern[] = {
|
||||
// E x t e n d : ] ]
|
||||
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x3a, 0x5d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_ExtendPattern[] = {
|
||||
static const UChar gGC_ExtendPattern[] = {
|
||||
// [ \ p { G r a p h e m e _
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
|
||||
// E x t e n d } ]
|
||||
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LPattern[] = {
|
||||
static const UChar gGC_LPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_VPattern[] = {
|
||||
static const UChar gGC_VPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = V } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_TPattern[] = {
|
||||
static const UChar gGC_TPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = T } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LVPattern[] = {
|
||||
static const UChar gGC_LVPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L V } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LVTPattern[] = {
|
||||
static const UChar gGC_LVTPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L V T } ]
|
||||
@ -155,29 +155,30 @@ static const UChar gIsWordPattern[] = {
|
||||
|
||||
RegexStaticSets *RegexStaticSets::gStaticSets = NULL;
|
||||
|
||||
RegexStaticSets::RegexStaticSets(UErrorCode *status) {
|
||||
RegexStaticSets::RegexStaticSets(UErrorCode *status)
|
||||
:
|
||||
fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status),
|
||||
fRuleDigitsAlias(NULL)
|
||||
{
|
||||
// First zero out everything
|
||||
int i;
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
fPropSets[i] = NULL;
|
||||
}
|
||||
for (i=0; i<10; i++) {
|
||||
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
|
||||
fRuleSets[i] = NULL;
|
||||
}
|
||||
fUnescapeCharSet = NULL;
|
||||
fRuleDigits = NULL;
|
||||
fEmptyString = NULL;
|
||||
|
||||
// Then init the sets to their correct values.
|
||||
fPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, *status);
|
||||
fPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern, *status);
|
||||
fPropSets[URX_GC_EXTEND] = new UnicodeSet(gGC_ExtendPattern, *status);
|
||||
fPropSets[URX_GC_CONTROL] = new UnicodeSet(gGC_ControlPattern, *status);
|
||||
fPropSets[URX_GC_L] = new UnicodeSet(gGC_LPattern, *status);
|
||||
fPropSets[URX_GC_V] = new UnicodeSet(gGC_VPattern, *status);
|
||||
fPropSets[URX_GC_T] = new UnicodeSet(gGC_TPattern, *status);
|
||||
fPropSets[URX_GC_LV] = new UnicodeSet(gGC_LVPattern, *status);
|
||||
fPropSets[URX_GC_LVT] = new UnicodeSet(gGC_LVTPattern, *status);
|
||||
fPropSets[URX_ISWORD_SET] = new UnicodeSet(UnicodeString(TRUE, gIsWordPattern, -1), *status);
|
||||
fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status);
|
||||
fPropSets[URX_GC_EXTEND] = new UnicodeSet(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status);
|
||||
fPropSets[URX_GC_CONTROL] = new UnicodeSet(UnicodeString(TRUE, gGC_ControlPattern, -1), *status);
|
||||
fPropSets[URX_GC_L] = new UnicodeSet(UnicodeString(TRUE, gGC_LPattern, -1), *status);
|
||||
fPropSets[URX_GC_V] = new UnicodeSet(UnicodeString(TRUE, gGC_VPattern, -1), *status);
|
||||
fPropSets[URX_GC_T] = new UnicodeSet(UnicodeString(TRUE, gGC_TPattern, -1), *status);
|
||||
fPropSets[URX_GC_LV] = new UnicodeSet(UnicodeString(TRUE, gGC_LVPattern, -1), *status);
|
||||
fPropSets[URX_GC_LVT] = new UnicodeSet(UnicodeString(TRUE, gGC_LVTPattern, -1), *status);
|
||||
if (U_FAILURE(*status)) {
|
||||
// Bail out if we were unable to create the above sets.
|
||||
// The rest of the initialization needs them, so we cannot proceed.
|
||||
@ -187,7 +188,7 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
|
||||
|
||||
//
|
||||
// The following sets are dynamically constructed, because their
|
||||
// intialization strings would be unreasonable.
|
||||
// initialization strings would be unreasonable.
|
||||
//
|
||||
|
||||
|
||||
@ -195,8 +196,7 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
|
||||
// "Normal" is the set of characters that don't need special handling
|
||||
// when finding grapheme cluster boundaries.
|
||||
//
|
||||
fPropSets[URX_GC_NORMAL] = new UnicodeSet;
|
||||
fPropSets[URX_GC_NORMAL]->complement();
|
||||
fPropSets[URX_GC_NORMAL] = new UnicodeSet(0, UnicodeSet::MAX_VALUE);
|
||||
fPropSets[URX_GC_NORMAL]->remove(0xac00, 0xd7a4);
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_CONTROL]);
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]);
|
||||
@ -206,47 +206,46 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
|
||||
// Initialize the 8-bit fast bit sets from the parallel full
|
||||
// UnicodeSets.
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
fPropSets8[i].init(fPropSets[i]);
|
||||
if (fPropSets[i]) {
|
||||
fPropSets[i]->compact();
|
||||
fPropSets8[i].init(fPropSets[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Sets used while parsing rules, but not referenced from the parse state table
|
||||
fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(gRuleSet_rule_char_pattern, *status);
|
||||
fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(gRuleWhiteSpacePattern, *status);
|
||||
fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, *status);
|
||||
fRuleDigits = new UnicodeSet(gRuleSet_digit_char_pattern, *status);
|
||||
fUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, *status);
|
||||
|
||||
// Empty UnicodeString, for use by matchers with NULL input.
|
||||
fEmptyString = new UnicodeString;
|
||||
fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
|
||||
fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodeString(TRUE, gRuleWhiteSpacePattern, -1), *status);
|
||||
fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
|
||||
fRuleDigitsAlias = fRuleSets[kRuleSet_digit_char-128];
|
||||
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
|
||||
if (fRuleSets[i]) {
|
||||
fRuleSets[i]->compact();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
RegexStaticSets::~RegexStaticSets() {
|
||||
int i;
|
||||
int32_t i;
|
||||
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
delete fPropSets[i];
|
||||
fPropSets[i] = NULL;
|
||||
}
|
||||
for (i=0; i<10; i++) {
|
||||
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
|
||||
delete fRuleSets[i];
|
||||
fRuleSets[i] = NULL;
|
||||
}
|
||||
delete fUnescapeCharSet;
|
||||
fUnescapeCharSet = NULL;
|
||||
delete fRuleDigits;
|
||||
fRuleDigits = NULL;
|
||||
delete fEmptyString;
|
||||
fEmptyString = NULL;
|
||||
fRuleDigitsAlias = NULL;
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// regex_cleanup Memory cleanup function, free/delete all
|
||||
// cached memory. Called by ICU's u_cleanup() function.
|
||||
//
|
||||
//----------------------------------------------------------------------------------
|
||||
//------------------------------------------------------------------------------
|
||||
UBool
|
||||
RegexStaticSets::cleanup(void) {
|
||||
delete RegexStaticSets::gStaticSets;
|
||||
|
@ -1,7 +1,7 @@
|
||||
//
|
||||
// regexst.h
|
||||
//
|
||||
// Copyright (C) 2003-2004, International Business Machines Corporation and others.
|
||||
// Copyright (C) 2003-2007, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains declarations for the class RegexStaticSets
|
||||
@ -42,17 +42,16 @@ public:
|
||||
Regex8BitSet fPropSets8[URX_LAST_SET]; // Fast bitmap sets for latin-1 range for above.
|
||||
|
||||
UnicodeSet *fRuleSets[10]; // Sets used while parsing regexp patterns.
|
||||
UnicodeSet *fUnescapeCharSet; // Set of chars handled by unescape when
|
||||
// encountered with a \ in a pattern.
|
||||
UnicodeSet *fRuleDigits;
|
||||
UnicodeString *fEmptyString; // An empty string, to be used when a matcher
|
||||
// is created with no input.
|
||||
UnicodeSet fUnescapeCharSet; // Set of chars handled by unescape when
|
||||
// encountered with a \ in a pattern.
|
||||
UnicodeSet *fRuleDigitsAlias;
|
||||
UnicodeString fEmptyString; // An empty string, to be used when a matcher
|
||||
// is created with no input.
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
#endif // REGEXST_H
|
||||
|
||||
|
@ -55,7 +55,7 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
|
||||
fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
|
||||
reset(*RegexStaticSets::gStaticSets->fEmptyString);
|
||||
reset(RegexStaticSets::gStaticSets->fEmptyString);
|
||||
}
|
||||
|
||||
|
||||
@ -103,7 +103,7 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp,
|
||||
if (fStack == NULL || fData == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
reset(*RegexStaticSets::gStaticSets->fEmptyString);
|
||||
reset(RegexStaticSets::gStaticSets->fEmptyString);
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user