scuffed-code/icu4c/source/i18n/regexst.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  regexst.h
//
//  Copyright (C) 2004-2015, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains class RegexStaticSets
//
//  This class is internal to the regular expression implementation.
//  For the public Regular Expression API, see the file "unicode/regex.h"
//
//  RegexStaticSets groups together the common UnicodeSets that are needed
//   for compiling or executing RegularExpressions.  This grouping simplifies
//   the thread safe lazy creation and sharing of these sets across
//   all instances of regular expressions.
//
#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/regex.h"
#include "uprops.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
#include "ucln_in.h"
#include "umutex.h"

#include "regexcst.h"   // Contains state table for the regex pattern parser.
                        //   generated by a Perl script.
#include "regexst.h"

U_NAMESPACE_BEGIN

// "Rule Char" Characters are those with special meaning, and therefore
//    need to be escaped to appear as literals in a regexp.
constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";

//
//   The backslash escape characters that ICU's unescape() function will handle.
//
constexpr char16_t const *gUnescapeChars = u"acefnrtuUx";

//
//  Unicode Set pattern for Regular Expression  \w
//
constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";

//
//  Unicode Set Definitions for Regular Expression  \s
//
constexpr  char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]";

//
//  UnicodeSets used in implementation of Grapheme Cluster detection, \X
//
constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
constexpr char16_t const *gGC_ExtendPattern  = u"[\\p{Grapheme_Extend}]";
constexpr char16_t const *gGC_LPattern       = u"[\\p{Hangul_Syllable_Type=L}]";
constexpr char16_t const *gGC_VPattern       = u"[\\p{Hangul_Syllable_Type=V}]";
constexpr char16_t const *gGC_TPattern       = u"[\\p{Hangul_Syllable_Type=T}]";
constexpr char16_t const *gGC_LVPattern      = u"[\\p{Hangul_Syllable_Type=LV}]";
constexpr char16_t const *gGC_LVTPattern     = u"[\\p{Hangul_Syllable_Type=LVT}]";


RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;
UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;


RegexStaticSets::RegexStaticSets(UErrorCode *status) {
    // Initialize the shared static sets to their correct values.
    fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
    fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze();
    fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze();
    fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status).freeze();
    fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(TRUE, gGC_ControlPattern, -1), *status).freeze();
    fPropSets[URX_GC_L].applyPattern(UnicodeString(TRUE, gGC_LPattern, -1), *status).freeze();
    fPropSets[URX_GC_V].applyPattern(UnicodeString(TRUE, gGC_VPattern, -1), *status).freeze();
    fPropSets[URX_GC_T].applyPattern(UnicodeString(TRUE, gGC_TPattern, -1), *status).freeze();
    fPropSets[URX_GC_LV].applyPattern(UnicodeString(TRUE, gGC_LVPattern, -1), *status).freeze();
    fPropSets[URX_GC_LVT].applyPattern(UnicodeString(TRUE, gGC_LVTPattern, -1), *status).freeze();


    //
    //  "Normal" is the set of characters that don't need special handling
    //            when finding grapheme cluster boundaries.
    //
    fPropSets[URX_GC_NORMAL].complement();
    fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4);
    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]);
    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]);
    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]);
    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]);
    fPropSets[URX_GC_NORMAL].freeze();

    // Initialize the 8-bit fast bit sets from the parallel full
    //   UnicodeSets.
    //
    // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
    //       Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x"
    //       This runs in exponential time, making it easy to adjust the time for
    //       convenient measuring.
    //
    //       This 8 bit optimization dates from the early days of ICU,
    //       with a less optimized UnicodeSet. At the time, the difference
    //       was substantial.

    for (int32_t i=0; i<URX_LAST_SET; i++) {
        fPropSets8[i].init(&fPropSets[i]);
    }

    // Sets used while parsing rules, but not referenced from the parse state table
    fRuleSets[kRuleSet_rule_char-128]
            .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze();

    fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
    fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
    fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];

    // Finally, initialize an empty UText string for utility purposes
    fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);

}


RegexStaticSets::~RegexStaticSets() {
    fRuleDigitsAlias = nullptr;
    utext_close(fEmptyText);
}


//------------------------------------------------------------------------------
//
//   regex_cleanup      Memory cleanup function, free/delete all
//                      cached memory.  Called by ICU's u_cleanup() function.
//
//------------------------------------------------------------------------------

U_CDECL_BEGIN
static UBool U_CALLCONV
regex_cleanup(void) {
    delete RegexStaticSets::gStaticSets;
    RegexStaticSets::gStaticSets = nullptr;
    gStaticSetsInitOnce.reset();
    return TRUE;
}

static void U_CALLCONV initStaticSets(UErrorCode &status) {
    U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
    ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
    RegexStaticSets::gStaticSets = new RegexStaticSets(&status);
    if (U_FAILURE(status)) {
        delete RegexStaticSets::gStaticSets;
        RegexStaticSets::gStaticSets = nullptr;
    }
    if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
U_CDECL_END

void RegexStaticSets::initGlobals(UErrorCode *status) {
    umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status);
}

U_NAMESPACE_END
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS