2002-10-22 00:09:32 +00:00
|
|
|
//
|
|
|
|
// regexcmp.h
|
|
|
|
//
|
|
|
|
// Copyright (C) 2002, International Business Machines Corporation and others.
|
|
|
|
// All Rights Reserved.
|
|
|
|
//
|
2002-11-07 02:34:46 +00:00
|
|
|
// This file contains declarations for the class RegexCompile
|
|
|
|
//
|
|
|
|
// This class is internal to the regular expression implementation.
|
|
|
|
// For the public Regular Expression API, see the file "unicode/regex.h"
|
2002-10-22 00:09:32 +00:00
|
|
|
//
|
|
|
|
|
|
|
|
|
|
|
|
// Include guard.  The macro is named after this header (regexcmp.h) so that
// it cannot collide with the include guard of any other header.
#ifndef REGEXCMP_H
#define REGEXCMP_H
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
2002-11-07 02:34:46 +00:00
|
|
|
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
|
|
|
|
2002-10-22 00:09:32 +00:00
|
|
|
#include "unicode/uobject.h"
|
|
|
|
#include "unicode/uniset.h"
|
|
|
|
#include "unicode/parseerr.h"
|
|
|
|
#include "uhash.h"
|
|
|
|
#include "uvector.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
|
|
|
|
// Compile-time debug flag for the regular expression code; currently TRUE.
// NOTE(review): presumably gates debug/trace code in the implementation files -
// confirm whether it should be FALSE for production builds.
static const UBool REGEX_DEBUG = TRUE;
|
|
|
|
|
|
|
|
//--------------------------------------------------------------------------------
|
|
|
|
//
|
2002-11-07 02:34:46 +00:00
|
|
|
// class RegexCompile Contains the regular expression compiler.
|
2002-10-22 00:09:32 +00:00
|
|
|
//
|
|
|
|
//--------------------------------------------------------------------------------
|
|
|
|
// Dimension of RegexCompile::fStack.
static const int kStackSize = 100; // The size of the state stack for
|
|
|
|
// pattern parsing. Corresponds roughly
|
|
|
|
// to the depth of parentheses nesting
|
|
|
|
// that is allowed in the pattern.
|
|
|
|
|
|
|
|
// Parameter type of RegexCompile::doParseActions().
// NOTE(review): dummy01/dummy02 are placeholders only; the real action values
// are presumably supplied together with the state table - confirm.
enum EParseAction {dummy01, dummy02}; // Placeholder enum for the specifier for
|
|
|
|
// actions that are specified in the
|
|
|
|
// pattern parsing state table.
|
|
|
|
// Forward declarations.  This header only uses pointers and references to
// these types, so their full definitions are not needed here.
struct RegexTableEl;
|
|
|
|
class RegexPattern;
|
|
|
|
|
|
|
|
|
|
|
|
// RegexCompile is the regular expression compiler.  compile() takes a pattern
// string and the RegexPattern object to work on.  The class is internal to the
// regular expression implementation; everything after the "private:" label is
// scanner / parser state.
class RegexCompile : public UObject {
|
|
|
|
public:
|
|
|
|
|
|
|
|
// A character as delivered by nextChar(): the code point plus a flag.
// NOTE(review): fQuoted presumably marks a character that came from a quoted
// or escaped region of the pattern - confirm in the implementation.
struct RegexPatternChar {
|
|
|
|
UChar32 fChar;
|
|
|
|
UBool fQuoted;
|
|
|
|
};
|
|
|
|
|
|
|
|
RegexCompile(UErrorCode &e);
|
|
|
|
|
|
|
|
// Compile the pattern string 'pat', working on the RegexPattern 'rxp'.
// NOTE(review): 'pp' and 'e' are presumably the parse-error position and
// status outputs - confirm against the implementation.
void compile(RegexPattern &rxp, const UnicodeString &pat, UParseError &pp, UErrorCode &e);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual ~RegexCompile();
|
|
|
|
|
|
|
|
void nextChar(RegexPatternChar &c); // Get the next char from the input stream.
|
|
|
|
|
2002-11-07 20:06:39 +00:00
|
|
|
static void cleanup(); // Memory cleanup
|
|
|
|
|
2002-10-22 00:09:32 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
|
|
|
*
|
|
|
|
* @draft ICU 2.2
|
|
|
|
*/
|
|
|
|
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
|
|
|
|
|
|
|
|
/**
|
|
|
|
* ICU "poor man's RTTI", returns a UClassID for this class.
|
|
|
|
*
|
|
|
|
* @draft ICU 2.2
|
|
|
|
*/
|
|
|
|
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
UBool doParseActions(EParseAction a);
|
|
|
|
void error(UErrorCode e); // error reporting convenience function.
|
|
|
|
|
|
|
|
UChar32 nextCharLL();
|
2002-10-24 22:16:07 +00:00
|
|
|
UChar32 peekCharLL();
|
2002-10-22 00:09:32 +00:00
|
|
|
UnicodeSet *scanSet();
|
2002-10-29 01:20:15 +00:00
|
|
|
UnicodeSet *scanProp();
|
2002-10-22 00:09:32 +00:00
|
|
|
void handleCloseParen();
|
2002-10-28 17:18:44 +00:00
|
|
|
int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern
|
2002-10-22 00:09:32 +00:00
|
|
|
// at the top of the just completed block
|
2002-10-28 17:18:44 +00:00
|
|
|
// or operation, and optionally ensure that
|
|
|
|
// there is space to add an opcode there.
|
2002-10-29 01:20:15 +00:00
|
|
|
void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for
|
|
|
|
// a reference to a UnicodeSet.
|
2002-10-22 00:09:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
UErrorCode *fStatus;
|
|
|
|
RegexPattern *fRXPat;
|
|
|
|
UParseError *fParseErr;
|
|
|
|
|
2002-10-24 22:16:07 +00:00
|
|
|
//
|
|
|
|
// Data associated with low level character scanning
|
|
|
|
//
|
2002-10-22 00:09:32 +00:00
|
|
|
int32_t fScanIndex; // Index of current character being processed
|
|
|
|
// in the pattern input string.
|
|
|
|
int32_t fNextIndex; // Index of the next character, which
|
|
|
|
// is the first character not yet scanned.
|
|
|
|
UBool fQuoteMode; // Scan is in a quoted region
|
|
|
|
UBool fFreeForm; // Scan mode is free-form, ignore spaces.
|
|
|
|
int fLineNum; // Line number in input file.
|
|
|
|
int fCharNum; // Char position within the line.
|
|
|
|
UChar32 fLastChar; // Previous char, needed to count CR-LF
|
|
|
|
// as a single line, not two.
|
2002-10-24 22:16:07 +00:00
|
|
|
UChar32 fPeekChar; // Saved char, if we've scanned ahead.
|
|
|
|
|
2002-10-22 00:09:32 +00:00
|
|
|
|
|
|
|
RegexPatternChar fC; // Current char for parse state machine
|
|
|
|
// processing.
|
|
|
|
|
|
|
|
int32_t fStringOpStart; // While a literal string is being scanned
|
|
|
|
// holds the start index within RegexPattern.
|
|
|
|
// fLiteralText where the string is being stored.
|
|
|
|
|
|
|
|
RegexTableEl **fStateTable; // State Transition Table for regex Rule
|
|
|
|
// parsing. index by p[state][char-class]
|
|
|
|
|
|
|
|
uint16_t fStack[kStackSize]; // State stack, holds state pushes
|
|
|
|
int fStackPtr; // and pops as specified in the state
|
|
|
|
// transition rules.
|
|
|
|
|
|
|
|
int32_t fPatternLength; // Length of the input pattern string.
|
|
|
|
|
|
|
|
UStack fParenStack; // parentheses stack. Each frame consists of
|
|
|
|
// the positions of compiled pattern operations
|
|
|
|
// needing fixup, followed by a negative value. The
|
|
|
|
// first entry in each frame is the position of the
|
|
|
|
// spot reserved for use when a quantifier
|
|
|
|
// needs to add a SAVE at the start of a (block)
|
|
|
|
// The negative value (-1, -2,...) indicates
|
|
|
|
// the kind of paren that opened the frame. Some
|
|
|
|
// need special handling on close.
|
|
|
|
|
|
|
|
|
|
|
|
int32_t fMatchOpenParen; // The position in the compiled pattern
|
|
|
|
// of the slot reserved for a state save
|
|
|
|
// at the start of the most recently processed
|
|
|
|
// parenthesized block.
|
|
|
|
int32_t fMatchCloseParen; // The position in the pattern of the first
|
|
|
|
// location after the most recently processed
|
|
|
|
// parenthesized block.
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The address of this static class variable serves as this class's ID
|
|
|
|
* for ICU "poor man's RTTI".
|
|
|
|
*/
|
|
|
|
static const char fgClassID;
|
|
|
|
};
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
2002-11-07 02:34:46 +00:00
|
|
|
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
|
|
|
#endif // include guard for regexcmp.h (opened at the top of this file)
|