scuffed-code/icu4c/source/i18n/uregex.cpp
2013-10-11 20:59:39 +00:00

1952 lines
66 KiB
C++

/*
*******************************************************************************
* Copyright (C) 2004-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uregex.cpp
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/regex.h"
#include "unicode/uregex.h"
#include "unicode/unistr.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uobject.h"
#include "unicode/utf16.h"
#include "umutex.h"
#include "uassert.h"
#include "cmemory.h"
#include "regextxt.h"
#include <stdio.h>
U_NAMESPACE_BEGIN
// REMAINING_CAPACITY(idx,len): yields (len - idx) when that difference is
// positive and 0 otherwise. Both arguments are expanded more than once,
// so they should be free of side effects.
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
// RegularExpression: the concrete object behind the URegularExpression
// pointers that the uregex_ functions in this file cast to and from.
// It bundles a compiled pattern (which uregex_clone() shares between objects
// through a reference count), a matcher, and optional bookkeeping for the
// input text in UChar form.
struct RegularExpression: public UMemory {
public:
RegularExpression();
~RegularExpression();
int32_t fMagic; // REXP_MAGIC while the object is alive; zeroed by the destructor. Checked by validateRE().
RegexPattern *fPat; // Compiled pattern; shared with clones, deleted when the ref count reaches zero.
u_atomic_int32_t *fPatRefCount; // Separately allocated count of objects sharing fPat / fPatString.
UChar *fPatString; // NUL-terminated copy of the pattern source made by the open functions.
int32_t fPatStringLen; // Pattern length as recorded by the open function (uregex_open stores
// the caller's value, which may be -1).
RegexMatcher *fMatcher; // Matcher owned by this object.
const UChar *fText; // Text from setText(), or filled in lazily by uregex_getText(); may be NULL.
int32_t fTextLength; // Length provided by user with setText(), which
// may be -1.
UBool fOwnsText; // Set by uregex_setUText() and by uregex_getText() when it allocates fText;
// the destructor frees a non-NULL fText only when this is TRUE.
};
// Marker value kept in RegularExpression::fMagic for the lifetime of an object.
static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII

// Construct an empty object: the magic marker is set, every pointer is NULL,
// both lengths are zero and no text is owned.
RegularExpression::RegularExpression()
    : fMagic(REXP_MAGIC),
      fPat(NULL),
      fPatRefCount(NULL),
      fPatString(NULL),
      fPatStringLen(0),
      fMatcher(NULL),
      fText(NULL),
      fTextLength(0),
      fOwnsText(FALSE) {
}
// Destroy the matcher, drop this object's reference on the shared pattern
// data (releasing it when the count reaches zero), free the text buffer if
// this object owns one, and clear the magic marker.
RegularExpression::~RegularExpression() {
    delete fMatcher;
    fMatcher = NULL;

    if (fPatRefCount != NULL) {
        // The pattern, its string copy and the counter itself are released
        // together once the decremented count is zero.
        if (umtx_atomic_dec(fPatRefCount) == 0) {
            delete fPat;
            uprv_free(fPatString);
            uprv_free((void *)fPatRefCount);
        }
    }

    if (fText != NULL && fOwnsText) {
        uprv_free((void *)fText);
    }

    fMagic = 0;
}
U_NAMESPACE_END
U_NAMESPACE_USE
//----------------------------------------------------------------------------------------
//
// validateRE Do boilerplate style checks on API function parameters.
// Return TRUE if they look OK.
//----------------------------------------------------------------------------------------
// Shared argument check for the API functions.
//   re            object to check; must be non-NULL and carry REXP_MAGIC.
//   requiresText  when TRUE, additionally require that fText is non-NULL or
//                 that fOwnsText is set.
//   status        in/out error code; an incoming failure is left untouched.
// Returns TRUE when all checks pass. Otherwise returns FALSE, setting
// U_ILLEGAL_ARGUMENT_ERROR for a bad object or U_REGEX_INVALID_STATE for
// missing text.
static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return FALSE;
    }

    const UBool looksValid = (re != NULL && re->fMagic == REXP_MAGIC);
    if (!looksValid) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return FALSE;
    }

    if (requiresText) {
        // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
        const UBool textAvailable = (re->fText != NULL || re->fOwnsText);
        if (!textAvailable) {
            *status = U_REGEX_INVALID_STATE;
            return FALSE;
        }
    }
    return TRUE;
}
//----------------------------------------------------------------------------------------
//
// uregex_open
//
//----------------------------------------------------------------------------------------
// Compile a pattern given as UChars and return a new regular expression object.
//   pattern        pattern text; must not be NULL.
//   patternLength  number of UChars in pattern, or -1 if it is NUL-terminated
//                  (the length is then obtained with u_strlen). 0 and values
//                  below -1 are rejected with U_ILLEGAL_ARGUMENT_ERROR.
//   flags          passed through to RegexPattern::compile().
//   pe             optional; when non-NULL it is handed to the compile() overload
//                  that takes a UParseError.
//   status         in/out error code.
// Returns the new object, or NULL on any failure (with *status set).
U_CAPI URegularExpression * U_EXPORT2
uregex_open( const UChar *pattern,
             int32_t patternLength,
             uint32_t flags,
             UParseError *pe,
             UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (pattern == NULL || patternLength < -1 || patternLength == 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    int32_t actualPatLen = patternLength;
    if (actualPatLen == -1) {
        actualPatLen = u_strlen(pattern);
    }

    RegularExpression *re = new RegularExpression;
    // Size the counter from the atomic type itself; nothing guarantees that
    // u_atomic_int32_t has the same size as a plain int32_t.
    u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(u_atomic_int32_t));
    UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
    if (re == NULL || refC == NULL || patBuf == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        delete re;
        uprv_free((void *)refC);
        uprv_free(patBuf);
        return NULL;
    }
    re->fPatRefCount = refC;
    *re->fPatRefCount = 1;

    //
    // Make a copy of the pattern string, so we can return it later if asked.
    // For compiling the pattern, we will use a UText wrapper around
    // this local copy, to avoid making even more copies.
    //
    // Note: fPatStringLen keeps the caller's value, so it stays -1 when the
    //       pattern was passed as a NUL-terminated string.
    re->fPatString = patBuf;
    re->fPatStringLen = patternLength;
    u_memcpy(patBuf, pattern, actualPatLen);
    patBuf[actualPatLen] = 0;

    UText patText = UTEXT_INITIALIZER;
    utext_openUChars(&patText, patBuf, patternLength, status);

    //
    // Compile the pattern
    //
    if (pe != NULL) {
        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    } else {
        re->fPat = RegexPattern::compile(&patText, flags, *status);
    }
    utext_close(&patText);

    //
    // Create the matcher object (only if compilation succeeded)
    //
    if (U_SUCCESS(*status)) {
        re->fMatcher = re->fPat->matcher(*status);
    }

    if (U_FAILURE(*status)) {
        // The destructor releases whatever was set up so far.
        delete re;
        return NULL;
    }
    return (URegularExpression*)re;
}
//----------------------------------------------------------------------------------------
//
// uregex_openUText
//
//----------------------------------------------------------------------------------------
// Compile a pattern supplied as a UText and return a new regular expression object.
//   pattern  pattern text; must not be NULL and must have a non-zero native
//            length, otherwise U_ILLEGAL_ARGUMENT_ERROR is set.
//   flags    passed through to RegexPattern::compile().
//   pe       optional; when non-NULL it is handed to the compile() overload
//            that takes a UParseError.
//   status   in/out error code.
// Returns the new object, or NULL on any failure (with *status set).
U_CAPI URegularExpression * U_EXPORT2
uregex_openUText(UText *pattern,
                 uint32_t flags,
                 UParseError *pe,
                 UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (pattern == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    int64_t patternNativeLength = utext_nativeLength(pattern);
    if (patternNativeLength == 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    RegularExpression *re = new RegularExpression;

    // Preflight the extraction to learn the UTF-16 length. The error code this
    // sets goes into a throw-away variable so it cannot disturb *status.
    UErrorCode lengthStatus = U_ZERO_ERROR;
    int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);

    // Size the counter from the atomic type itself; nothing guarantees that
    // u_atomic_int32_t has the same size as a plain int32_t.
    u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(u_atomic_int32_t));
    UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
    if (re == NULL || refC == NULL || patBuf == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        delete re;
        uprv_free((void *)refC);
        uprv_free(patBuf);
        return NULL;
    }
    re->fPatRefCount = refC;
    *re->fPatRefCount = 1;

    //
    // Make a copy of the pattern string, so we can return it later if asked.
    // For compiling the pattern, we will use a read-only UText wrapper
    // around this local copy, to avoid making even more copies.
    //
    re->fPatString = patBuf;
    re->fPatStringLen = pattern16Length;
    utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);

    UText patText = UTEXT_INITIALIZER;
    utext_openUChars(&patText, patBuf, pattern16Length, status);

    //
    // Compile the pattern
    //
    if (pe != NULL) {
        re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    } else {
        re->fPat = RegexPattern::compile(&patText, flags, *status);
    }
    utext_close(&patText);

    //
    // Create the matcher object (only if compilation succeeded)
    //
    if (U_SUCCESS(*status)) {
        re->fMatcher = re->fPat->matcher(*status);
    }

    if (U_FAILURE(*status)) {
        // The destructor releases whatever was set up so far.
        delete re;
        return NULL;
    }
    return (URegularExpression*)re;
}
//----------------------------------------------------------------------------------------
//
// uregex_close
//
//----------------------------------------------------------------------------------------
// Destroy a regular expression object. A pointer that fails validateRE()
// (NULL, or not carrying the magic marker) is silently ignored.
U_CAPI void U_EXPORT2
uregex_close(URegularExpression *re2) {
    RegularExpression *victim = (RegularExpression*)re2;
    UErrorCode localStatus = U_ZERO_ERROR;
    if (validateRE(victim, FALSE, &localStatus)) {
        delete victim;
    }
}
//----------------------------------------------------------------------------------------
//
// uregex_clone
//
//----------------------------------------------------------------------------------------
// Create a new object that shares the source's compiled pattern and pattern
// string (bumping their reference count) but has its own matcher.
// The source's text is not carried over.
// Returns NULL with *status set on failure.
U_CAPI URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression *source2, UErrorCode *status) {
    RegularExpression *original = (RegularExpression*)source2;
    if (!validateRE(original, FALSE, status)) {
        return NULL;
    }

    RegularExpression *copy = new RegularExpression;
    if (copy == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    // Create the matcher before wiring up the shared pattern fields, so that
    // deleting the copy on failure does not touch the shared reference count.
    copy->fMatcher = original->fPat->matcher(*status);
    if (U_FAILURE(*status)) {
        delete copy;
        return NULL;
    }

    copy->fPat          = original->fPat;
    copy->fPatRefCount  = original->fPatRefCount;
    copy->fPatString    = original->fPatString;
    copy->fPatStringLen = original->fPatStringLen;
    umtx_atomic_inc(original->fPatRefCount);
    // Note: fText is not cloned.

    return (URegularExpression*)copy;
}
//------------------------------------------------------------------------------
//
// uregex_pattern
//
//------------------------------------------------------------------------------
// Return the stored copy of the pattern string.
//   patLength  optional output; when non-NULL it receives fPatStringLen.
// Returns NULL if validation fails.
U_CAPI const UChar * U_EXPORT2
uregex_pattern(const URegularExpression *regexp2,
               int32_t *patLength,
               UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return NULL;
    }
    if (patLength != NULL) {
        *patLength = re->fPatStringLen;
    }
    return re->fPatString;
}
//------------------------------------------------------------------------------
//
// uregex_patternUText
//
//------------------------------------------------------------------------------
// Return the pattern text as a UText, as provided by RegexPattern::patternText().
// Returns NULL if *status already indicates failure or if regexp2 is NULL or
// does not carry the magic marker (U_ILLEGAL_ARGUMENT_ERROR is set in that case).
U_CAPI UText * U_EXPORT2
uregex_patternUText(const URegularExpression *regexp2,
                    UErrorCode *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    // Validate like every other accessor in this file, so that a NULL or
    // stale handle is reported instead of being dereferenced.
    if (validateRE(regexp, FALSE, status) == FALSE) {
        return NULL;
    }
    return regexp->fPat->patternText(*status);
}
//------------------------------------------------------------------------------
//
// uregex_flags
//
//------------------------------------------------------------------------------
// Return the flags reported by the compiled pattern, or 0 if validation fails.
U_CAPI int32_t U_EXPORT2
uregex_flags(const URegularExpression *regexp2, UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return 0;
    }
    return re->fPat->flags();
}
//------------------------------------------------------------------------------
//
// uregex_setText
//
//------------------------------------------------------------------------------
// Set the text to be matched from a caller-supplied UChar buffer.
//   text        must not be NULL; the pointer itself is stored, not a copy.
//   textLength  length in UChars, or -1; values below -1 are rejected.
// Any text buffer previously allocated by this object is freed first.
U_CAPI void U_EXPORT2
uregex_setText(URegularExpression *regexp2,
               const UChar *text,
               int32_t textLength,
               UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return;
    }
    if (text == NULL || textLength < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (re->fOwnsText && re->fText != NULL) {
        uprv_free((void *)re->fText);
    }
    re->fText       = text;
    re->fTextLength = textLength;
    re->fOwnsText   = FALSE;

    // Hand the matcher a temporary UText view of the caller's buffer.
    UText wrapper = UTEXT_INITIALIZER;
    utext_openUChars(&wrapper, text, textLength, status);
    re->fMatcher->reset(&wrapper);
    utext_close(&wrapper); // reset() made a shallow clone, so we don't need this copy
}
//------------------------------------------------------------------------------
//
// uregex_setUText
//
//------------------------------------------------------------------------------
// Set the text to be matched from a UText.
// The UChar view (fText) is discarded and left NULL; uregex_getText()
// fills it in later if it is asked for. fOwnsText is set to TRUE.
U_CAPI void U_EXPORT2
uregex_setUText(URegularExpression *regexp2,
                UText *text,
                UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return;
    }
    if (text == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (re->fOwnsText && re->fText != NULL) {
        uprv_free((void *)re->fText);
    }
    re->fText       = NULL; // only fill it in on request
    re->fTextLength = -1;
    re->fOwnsText   = TRUE;

    re->fMatcher->reset(text);
}
//------------------------------------------------------------------------------
//
// uregex_getText
//
//------------------------------------------------------------------------------
// Return the text being matched as UChars.
//   textLength  optional output; when non-NULL it receives fTextLength.
// If no UChar view is cached (fText is NULL), one is produced from the
// matcher's input UText: either by pointing directly at the UText's chunk
// when the whole text is in that chunk, or by extracting into a newly
// allocated buffer that this object then owns.
// Returns NULL if validation fails, or with U_MEMORY_ALLOCATION_ERROR if the
// extraction buffer cannot be allocated (the object is left unchanged then).
U_CAPI const UChar * U_EXPORT2
uregex_getText(URegularExpression *regexp2,
               int32_t *textLength,
               UErrorCode *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, FALSE, status) == FALSE) {
        return NULL;
    }

    if (regexp->fText == NULL) {
        // need to fill in the text
        UText *inputText = regexp->fMatcher->inputText();
        int64_t inputNativeLength = utext_nativeLength(inputText);
        if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
            regexp->fText = inputText->chunkContents;
            regexp->fTextLength = (int32_t)inputNativeLength;
            regexp->fOwnsText = FALSE; // because the UText owns it
        } else {
            // Preflight to get the UTF-16 length. The error this sets goes into
            // a throw-away status so it does not leak to the caller.
            UErrorCode lengthStatus = U_ZERO_ERROR;
            int32_t extractedLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus);

            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(extractedLength+1));
            if (inputChars == NULL) {
                // Report the real cause and leave the cached-text fields untouched.
                *status = U_MEMORY_ALLOCATION_ERROR;
                return NULL;
            }

            utext_extract(inputText, 0, inputNativeLength, inputChars, extractedLength+1, status);
            regexp->fText = inputChars;
            regexp->fTextLength = extractedLength;
            regexp->fOwnsText = TRUE; // should already be set but just in case
        }
    }

    if (textLength != NULL) {
        *textLength = regexp->fTextLength;
    }
    return regexp->fText;
}
//------------------------------------------------------------------------------
//
// uregex_getUText
//
//------------------------------------------------------------------------------
// Obtain the matcher's input via RegexMatcher::getInput(dest, status).
// If validation fails, dest is returned unchanged.
U_CAPI UText * U_EXPORT2
uregex_getUText(URegularExpression *regexp2,
                UText *dest,
                UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return dest;
    }
    return re->fMatcher->getInput(dest, *status);
}
//------------------------------------------------------------------------------
//
// uregex_refreshUText
//
//------------------------------------------------------------------------------
// Forward to RegexMatcher::refreshInputText(text, status) after validating
// the object. Does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_refreshUText(URegularExpression *regexp2,
                    UText *text,
                    UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (validateRE(re, FALSE, status)) {
        re->fMatcher->refreshInputText(text, *status);
    }
}
//------------------------------------------------------------------------------
//
// uregex_matches
//
//------------------------------------------------------------------------------
// 32-bit index variant: widens startIndex and defers to uregex_matches64().
U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression *regexp2,
               int32_t startIndex,
               UErrorCode *status) {
    const int64_t wideStart = (int64_t)startIndex;
    return uregex_matches64(regexp2, wideStart, status);
}
// Run RegexMatcher::matches() on the current text.
//   startIndex  -1 selects the matches(status) overload; any other value is
//               passed to matches(startIndex, status).
// Requires text to have been set. Returns FALSE if validation fails.
U_CAPI UBool U_EXPORT2
uregex_matches64(URegularExpression *regexp2,
                 int64_t startIndex,
                 UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return FALSE;
    }
    if (startIndex == -1) {
        return re->fMatcher->matches(*status);
    }
    return re->fMatcher->matches(startIndex, *status);
}
//------------------------------------------------------------------------------
//
// uregex_lookingAt
//
//------------------------------------------------------------------------------
// 32-bit index variant: widens startIndex and defers to uregex_lookingAt64().
U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression *regexp2,
                 int32_t startIndex,
                 UErrorCode *status) {
    const int64_t wideStart = (int64_t)startIndex;
    return uregex_lookingAt64(regexp2, wideStart, status);
}
// Run RegexMatcher::lookingAt() on the current text.
//   startIndex  -1 selects the lookingAt(status) overload; any other value is
//               passed to lookingAt(startIndex, status).
// Requires text to have been set. Returns FALSE if validation fails.
U_CAPI UBool U_EXPORT2
uregex_lookingAt64(URegularExpression *regexp2,
                   int64_t startIndex,
                   UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return FALSE;
    }
    if (startIndex == -1) {
        return re->fMatcher->lookingAt(*status);
    }
    return re->fMatcher->lookingAt(startIndex, *status);
}
//------------------------------------------------------------------------------
//
// uregex_find
//
//------------------------------------------------------------------------------
// 32-bit index variant: widens startIndex and defers to uregex_find64().
U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression *regexp2,
            int32_t startIndex,
            UErrorCode *status) {
    const int64_t wideStart = (int64_t)startIndex;
    return uregex_find64(regexp2, wideStart, status);
}
// Search the current text for a match.
//   startIndex  -1 calls resetPreserveRegion() on the matcher and then find();
//               any other value is passed to find(startIndex, status).
// Requires text to have been set. Returns FALSE if validation fails.
U_CAPI UBool U_EXPORT2
uregex_find64(URegularExpression *regexp2,
              int64_t startIndex,
              UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return FALSE;
    }
    if (startIndex != -1) {
        return re->fMatcher->find(startIndex, *status);
    }
    re->fMatcher->resetPreserveRegion();
    return re->fMatcher->find();
}
//------------------------------------------------------------------------------
//
// uregex_findNext
//
//------------------------------------------------------------------------------
// Call RegexMatcher::find() on the matcher without resetting it.
// Requires text to have been set. Returns FALSE if validation fails.
U_CAPI UBool U_EXPORT2
uregex_findNext(URegularExpression *regexp2,
                UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return FALSE;
    }
    return re->fMatcher->find();
}
//------------------------------------------------------------------------------
//
// uregex_groupCount
//
//------------------------------------------------------------------------------
// Return RegexMatcher::groupCount() for this object, or 0 if validation fails.
U_CAPI int32_t U_EXPORT2
uregex_groupCount(URegularExpression *regexp2,
                  UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return 0;
    }
    return re->fMatcher->groupCount();
}
//------------------------------------------------------------------------------
//
// uregex_group
//
//------------------------------------------------------------------------------
// Copy the text of a capture group into a caller-supplied UChar buffer.
//   groupNum      capture group number, passed to the matcher's start()/end().
//   dest          destination buffer; may be NULL only when destCapacity is 0.
//   destCapacity  capacity of dest in UChars; must not be negative.
// Returns the full length of the group. *status is set to
// U_STRING_NOT_TERMINATED_WARNING when the group exactly fills the buffer and
// to U_BUFFER_OVERFLOW_ERROR when it does not fit (the copy is then truncated).
// Returns 0 on validation or argument errors.
U_CAPI int32_t U_EXPORT2
uregex_group(URegularExpression *regexp2,
             int32_t groupNum,
             UChar *dest,
             int32_t destCapacity,
             UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return 0;
    }
    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // When preflighting, or when the text is already available as UChars,
    // working from the match indices is a little cheaper than going through
    // uregex_groupUTextDeep().
    const UBool useIndices = (destCapacity == 0 || re->fText != NULL);
    if (!useIndices) {
        int32_t extracted = 0;
        UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
        if (U_SUCCESS(*status)) {
            extracted = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
        }
        utext_close(groupText);
        return extracted;
    }

    // Pick up the range of characters from the matcher.
    int32_t groupStart = re->fMatcher->start(groupNum, *status);
    int32_t groupLimit = re->fMatcher->end  (groupNum, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }

    // Decide how much fits and how the result is terminated / reported.
    const int32_t groupLen = groupLimit - groupStart;
    int32_t toCopy = groupLen;
    if (groupLen > destCapacity) {
        toCopy = destCapacity;
        *status = U_BUFFER_OVERFLOW_ERROR;
    } else if (groupLen == destCapacity) {
        *status = U_STRING_NOT_TERMINATED_WARNING;
    } else {
        dest[groupLen] = 0;
    }

    // Copy capture group to user's buffer.
    if (toCopy > 0) {
        u_memcpy(dest, &re->fText[groupStart], toCopy);
    }
    return groupLen;
}
//------------------------------------------------------------------------------
//
// uregex_groupUText
//
//------------------------------------------------------------------------------
// Return a capture group as a UText by forwarding to
// RegexMatcher::group(groupNum, dest, groupLength, status).
//   dest         optional UText passed through to the matcher.
//   groupLength  output for the group length reported by the matcher; may be
//                NULL if the caller does not need it.
// If validation fails, dest is returned when it is non-NULL; otherwise a newly
// opened empty UText is returned.
U_CAPI UText * U_EXPORT2
uregex_groupUText(URegularExpression *regexp2,
                  int32_t groupNum,
                  UText *dest,
                  int64_t *groupLength,
                  UErrorCode *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, TRUE, status) == FALSE) {
        UErrorCode emptyTextStatus = U_ZERO_ERROR;
        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    }

    // The matcher takes the length by reference; bind it to a local when the
    // caller passed NULL rather than dereferencing a null pointer.
    int64_t unusedLength = 0;
    int64_t &lengthOut = (groupLength != NULL) ? *groupLength : unusedLength;
    return regexp->fMatcher->group(groupNum, dest, lengthOut, *status);
}
//------------------------------------------------------------------------------
//
// uregex_groupUTextDeep
//
//------------------------------------------------------------------------------
// Return a capture group as a UText holding its own copy of the group text.
//   dest  optional; when non-NULL its entire contents are replaced with the
//         group text and it is returned. When NULL a new UText is returned.
// If the text is cached as UChars (fText), the group is taken from that
// buffer using the matcher's start()/end(); otherwise the request is handed
// to RegexMatcher::group(groupNum, dest, status).
// On validation failure or a failing start()/end(), dest is returned if
// non-NULL, else a newly opened empty UText.
U_CAPI UText * U_EXPORT2
uregex_groupUTextDeep(URegularExpression *regexp2,
                      int32_t groupNum,
                      UText *dest,
                      UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        UErrorCode emptyTextStatus = U_ZERO_ERROR;
        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    }

    if (re->fText == NULL) {
        return re->fMatcher->group(groupNum, dest, *status);
    }

    //
    // Pick up the range of characters from the matcher
    // and use our already-extracted characters
    //
    int32_t groupStart = re->fMatcher->start(groupNum, *status);
    int32_t groupLimit = re->fMatcher->end  (groupNum, *status);
    if (U_FAILURE(*status)) {
        UErrorCode emptyTextStatus = U_ZERO_ERROR;
        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    }

    const UChar *groupChars = &re->fText[groupStart];
    const int32_t groupLen = groupLimit - groupStart;

    if (dest != NULL) {
        utext_replace(dest, 0, utext_nativeLength(dest), groupChars, groupLen, status);
        return dest;
    }

    // Wrap the range in a temporary UText and deep-clone it for the caller.
    UText shallowView = UTEXT_INITIALIZER;
    utext_openUChars(&shallowView, groupChars, groupLen, status);
    UText *deepCopy = utext_clone(NULL, &shallowView, TRUE, FALSE, status);
    utext_close(&shallowView);
    return deepCopy;
}
//------------------------------------------------------------------------------
//
// uregex_start
//
//------------------------------------------------------------------------------
// 32-bit variant: calls uregex_start64() and narrows the result to int32_t.
U_CAPI int32_t U_EXPORT2
uregex_start(URegularExpression *regexp2,
             int32_t groupNum,
             UErrorCode *status) {
    const int64_t widePos = uregex_start64(regexp2, groupNum, status);
    return (int32_t)widePos;
}
// Return the start index of a capture group from the most recent match,
// as reported by RegexMatcher::start64(groupNum, status).
// Requires text to have been set. Returns 0 if validation fails.
U_CAPI int64_t U_EXPORT2
uregex_start64(URegularExpression *regexp2,
               int32_t groupNum,
               UErrorCode *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, TRUE, status) == FALSE) {
        return 0;
    }
    // Use the 64-bit accessor: going through the int32_t start() would
    // truncate positions that do not fit in 32 bits.
    int64_t result = regexp->fMatcher->start64(groupNum, *status);
    return result;
}
//------------------------------------------------------------------------------
//
// uregex_end
//
//------------------------------------------------------------------------------
// 32-bit variant: calls uregex_end64() and narrows the result to int32_t.
U_CAPI int32_t U_EXPORT2
uregex_end(URegularExpression *regexp2,
           int32_t groupNum,
           UErrorCode *status) {
    const int64_t widePos = uregex_end64(regexp2, groupNum, status);
    return (int32_t)widePos;
}
// Return the end index of a capture group from the most recent match,
// as reported by RegexMatcher::end64(groupNum, status).
// Requires text to have been set. Returns 0 if validation fails.
U_CAPI int64_t U_EXPORT2
uregex_end64(URegularExpression *regexp2,
             int32_t groupNum,
             UErrorCode *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    if (validateRE(regexp, TRUE, status) == FALSE) {
        return 0;
    }
    // Use the 64-bit accessor: going through the int32_t end() would
    // truncate positions that do not fit in 32 bits.
    int64_t result = regexp->fMatcher->end64(groupNum, *status);
    return result;
}
//------------------------------------------------------------------------------
//
// uregex_reset
//
//------------------------------------------------------------------------------
// 32-bit index variant: widens index and defers to uregex_reset64().
U_CAPI void U_EXPORT2
uregex_reset(URegularExpression *regexp2,
             int32_t index,
             UErrorCode *status) {
    const int64_t wideIndex = (int64_t)index;
    uregex_reset64(regexp2, wideIndex, status);
}
// Forward to RegexMatcher::reset(index, status).
// Requires text to have been set; does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_reset64(URegularExpression *regexp2,
               int64_t index,
               UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (validateRE(re, TRUE, status)) {
        re->fMatcher->reset(index, *status);
    }
}
//------------------------------------------------------------------------------
//
// uregex_setRegion
//
//------------------------------------------------------------------------------
// 32-bit variant: widens both bounds and defers to uregex_setRegion64().
U_CAPI void U_EXPORT2
uregex_setRegion(URegularExpression *regexp2,
                 int32_t regionStart,
                 int32_t regionLimit,
                 UErrorCode *status) {
    const int64_t wideStart = (int64_t)regionStart;
    const int64_t wideLimit = (int64_t)regionLimit;
    uregex_setRegion64(regexp2, wideStart, wideLimit, status);
}
// Forward to RegexMatcher::region(regionStart, regionLimit, status).
// Requires text to have been set; does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_setRegion64(URegularExpression *regexp2,
                   int64_t regionStart,
                   int64_t regionLimit,
                   UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (validateRE(re, TRUE, status)) {
        re->fMatcher->region(regionStart, regionLimit, *status);
    }
}
//------------------------------------------------------------------------------
//
// uregex_setRegionAndStart
//
//------------------------------------------------------------------------------
// Forward to RegexMatcher::region(regionStart, regionLimit, startIndex, status).
// Requires text to have been set; does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_setRegionAndStart(URegularExpression *regexp2,
                         int64_t regionStart,
                         int64_t regionLimit,
                         int64_t startIndex,
                         UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (validateRE(re, TRUE, status)) {
        re->fMatcher->region(regionStart, regionLimit, startIndex, *status);
    }
}
//------------------------------------------------------------------------------
//
// uregex_regionStart
//
//------------------------------------------------------------------------------
// 32-bit variant: calls uregex_regionStart64() and narrows the result to int32_t.
U_CAPI int32_t U_EXPORT2
uregex_regionStart(const URegularExpression *regexp2,
                   UErrorCode *status) {
    const int64_t wideStart = uregex_regionStart64(regexp2, status);
    return (int32_t)wideStart;
}
// Return RegexMatcher::regionStart() for this object.
// Requires text to have been set. Returns 0 if validation fails.
U_CAPI int64_t U_EXPORT2
uregex_regionStart64(const URegularExpression *regexp2,
                     UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return 0;
    }
    return re->fMatcher->regionStart();
}
//------------------------------------------------------------------------------
//
// uregex_regionEnd
//
//------------------------------------------------------------------------------
// 32-bit variant: calls uregex_regionEnd64() and narrows the result to int32_t.
U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression *regexp2,
                 UErrorCode *status) {
    const int64_t wideEnd = uregex_regionEnd64(regexp2, status);
    return (int32_t)wideEnd;
}
// Return RegexMatcher::regionEnd() for this object.
// Requires text to have been set. Returns 0 if validation fails.
U_CAPI int64_t U_EXPORT2
uregex_regionEnd64(const URegularExpression *regexp2,
                   UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return 0;
    }
    return re->fMatcher->regionEnd();
}
//------------------------------------------------------------------------------
//
// uregex_hasTransparentBounds
//
//------------------------------------------------------------------------------
// Return RegexMatcher::hasTransparentBounds(), or FALSE if validation fails.
U_CAPI UBool U_EXPORT2
uregex_hasTransparentBounds(const URegularExpression *regexp2,
                            UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return FALSE;
    }
    return re->fMatcher->hasTransparentBounds();
}
//------------------------------------------------------------------------------
//
// uregex_useTransparentBounds
//
//------------------------------------------------------------------------------
// Forward b to RegexMatcher::useTransparentBounds().
// Does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_useTransparentBounds(URegularExpression *regexp2,
                            UBool b,
                            UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (validateRE(re, FALSE, status)) {
        re->fMatcher->useTransparentBounds(b);
    }
}
//------------------------------------------------------------------------------
//
// uregex_hasAnchoringBounds
//
//------------------------------------------------------------------------------
// Return RegexMatcher::hasAnchoringBounds(), or FALSE if validation fails.
U_CAPI UBool U_EXPORT2
uregex_hasAnchoringBounds(const URegularExpression *regexp2,
                          UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return FALSE;
    }
    return re->fMatcher->hasAnchoringBounds();
}
//------------------------------------------------------------------------------
//
// uregex_useAnchoringBounds
//
//------------------------------------------------------------------------------
// Forward b to RegexMatcher::useAnchoringBounds().
// Does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression *regexp2,
                          UBool b,
                          UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (validateRE(re, FALSE, status)) {
        re->fMatcher->useAnchoringBounds(b);
    }
}
//------------------------------------------------------------------------------
//
// uregex_hitEnd
//
//------------------------------------------------------------------------------
// Return RegexMatcher::hitEnd() for this object.
// Requires text to have been set. Returns FALSE if validation fails.
U_CAPI UBool U_EXPORT2
uregex_hitEnd(const URegularExpression *regexp2,
              UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return FALSE;
    }
    return re->fMatcher->hitEnd();
}
//------------------------------------------------------------------------------
//
// uregex_requireEnd
//
//------------------------------------------------------------------------------
// Return RegexMatcher::requireEnd() for this object.
// Requires text to have been set. Returns FALSE if validation fails.
U_CAPI UBool U_EXPORT2
uregex_requireEnd(const URegularExpression *regexp2,
                  UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return FALSE;
    }
    return re->fMatcher->requireEnd();
}
//------------------------------------------------------------------------------
//
// uregex_setTimeLimit
//
//------------------------------------------------------------------------------
// Forward limit to RegexMatcher::setTimeLimit(limit, status).
// Does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_setTimeLimit(URegularExpression *regexp2,
                    int32_t limit,
                    UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return;
    }
    re->fMatcher->setTimeLimit(limit, *status);
}
//------------------------------------------------------------------------------
//
// uregex_getTimeLimit
//
//------------------------------------------------------------------------------
// Return RegexMatcher::getTimeLimit(), or 0 if validation fails.
U_CAPI int32_t U_EXPORT2
uregex_getTimeLimit(const URegularExpression *regexp2,
                    UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return 0;
    }
    return re->fMatcher->getTimeLimit();
}
//------------------------------------------------------------------------------
//
// uregex_setStackLimit
//
//------------------------------------------------------------------------------
// Forward limit to RegexMatcher::setStackLimit(limit, status).
// Does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_setStackLimit(URegularExpression *regexp2,
                     int32_t limit,
                     UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return;
    }
    re->fMatcher->setStackLimit(limit, *status);
}
//------------------------------------------------------------------------------
//
// uregex_getStackLimit
//
//------------------------------------------------------------------------------
// Return RegexMatcher::getStackLimit(), or 0 if validation fails.
U_CAPI int32_t U_EXPORT2
uregex_getStackLimit(const URegularExpression *regexp2,
                     UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return 0;
    }
    return re->fMatcher->getStackLimit();
}
//------------------------------------------------------------------------------
//
// uregex_setMatchCallback
//
//------------------------------------------------------------------------------
// Forward callback and context to RegexMatcher::setMatchCallback().
// Does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_setMatchCallback(URegularExpression *regexp2,
                        URegexMatchCallback *callback,
                        const void *context,
                        UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return;
    }
    re->fMatcher->setMatchCallback(callback, context, *status);
}
//------------------------------------------------------------------------------
//
// uregex_getMatchCallback
//
//------------------------------------------------------------------------------
// Retrieve the match callback and its context through
// RegexMatcher::getMatchCallback(), which writes into *callback and *context.
// Both output pointers are dereferenced without a NULL check.
// Does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_getMatchCallback(const URegularExpression *regexp2,
                        URegexMatchCallback **callback,
                        const void **context,
                        UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return;
    }
    re->fMatcher->getMatchCallback(*callback, *context, *status);
}
//------------------------------------------------------------------------------
//
// uregex_setFindProgressCallback
//
//------------------------------------------------------------------------------
// Forward callback and context to RegexMatcher::setFindProgressCallback().
// Does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_setFindProgressCallback(URegularExpression *regexp2,
                               URegexFindProgressCallback *callback,
                               const void *context,
                               UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return;
    }
    re->fMatcher->setFindProgressCallback(callback, context, *status);
}
//------------------------------------------------------------------------------
//
// uregex_getFindProgressCallback
//
//------------------------------------------------------------------------------
// Retrieve the find-progress callback and its context through
// RegexMatcher::getFindProgressCallback(), which writes into *callback and
// *context. Both output pointers are dereferenced without a NULL check.
// Does nothing if validation fails.
U_CAPI void U_EXPORT2
uregex_getFindProgressCallback(const URegularExpression *regexp2,
                               URegexFindProgressCallback **callback,
                               const void **context,
                               UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, FALSE, status)) {
        return;
    }
    re->fMatcher->getFindProgressCallback(*callback, *context, *status);
}
//------------------------------------------------------------------------------
//
// uregex_replaceAll
//
//------------------------------------------------------------------------------
// Replace every match in the current text, writing the result to destBuf.
//   replacementText    must not be NULL.
//   replacementLength  must be -1 or greater.
//   destBuf            may be NULL only when destCapacity is 0.
//   destCapacity       must not be negative.
// Returns the sum of the lengths reported by uregex_appendReplacement() for
// each match plus uregex_appendTail(). Returns 0 on validation or argument
// errors.
U_CAPI int32_t U_EXPORT2
uregex_replaceAll(URegularExpression *regexp2,
                  const UChar *replacementText,
                  int32_t replacementLength,
                  UChar *destBuf,
                  int32_t destCapacity,
                  UErrorCode *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return 0;
    }

    const UBool badArguments =
        replacementText == NULL || replacementLength < -1 ||
        (destBuf == NULL && destCapacity > 0) ||
        destCapacity < 0;
    if (badArguments) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t totalLength = 0;
    uregex_reset(regexp2, 0, status);

    // Note: Separate error code variables for findNext() and appendReplacement()
    //       are used so that destination buffer overflow errors
    //       in appendReplacement won't stop findNext() from working.
    //       appendReplacement() and appendTail() special case incoming buffer
    //       overflow errors, continuing to return the correct length.
    UErrorCode findStatus = *status;
    while (uregex_findNext(regexp2, &findStatus)) {
        totalLength += uregex_appendReplacement(regexp2, replacementText, replacementLength,
                                                &destBuf, &destCapacity, status);
    }
    totalLength += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);

    if (U_FAILURE(findStatus)) {
        // If anything went wrong with the findNext(), make that error trump
        // whatever may have happened with the append() operations.
        // Errors in findNext() are not expected.
        *status = findStatus;
    }
    return totalLength;
}
//------------------------------------------------------------------------------
//
// uregex_replaceAllUText
//
//------------------------------------------------------------------------------
// UText flavor of replaceAll: after validating the handle and rejecting a NULL
// replacement, the work is delegated to RegexMatcher::replaceAll() and its
// result is returned.  Returns NULL when validation fails.
U_CAPI UText * U_EXPORT2
uregex_replaceAllUText(URegularExpression *regexp2,
                       UText              *replacementText,
                       UText              *dest,
                       UErrorCode         *status)  {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return NULL;
    }
    if (replacementText == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    return re->fMatcher->replaceAll(replacementText, dest, *status);
}
//------------------------------------------------------------------------------
//
// uregex_replaceFirst
//
//------------------------------------------------------------------------------
// Replaces only the first match in the input text with the (expanded)
// replacement text, then appends the rest of the input unchanged.
//
// Argument checks are the same as for uregex_replaceAll().  Returns the length
// reported by appendReplacement() (if a match was found) plus the length
// reported by appendTail(), or 0 if validation fails.
U_CAPI int32_t U_EXPORT2
uregex_replaceFirst(URegularExpression  *regexp2,
                    const UChar         *replacementText,
                    int32_t              replacementLength,
                    UChar               *destBuf,
                    int32_t              destCapacity,
                    UErrorCode          *status)  {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return 0;
    }

    const UBool replacementInvalid = (replacementText == NULL || replacementLength < -1);
    const UBool destInvalid        = (destCapacity < 0 || (destBuf == NULL && destCapacity > 0));
    if (replacementInvalid || destInvalid) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    uregex_reset(regexp2, 0, status);

    int32_t totalLength = 0;
    if (uregex_find(regexp2, 0, status)) {
        totalLength = uregex_appendReplacement(regexp2, replacementText, replacementLength,
                                               &destBuf, &destCapacity, status);
    }
    totalLength += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
    return totalLength;
}
//------------------------------------------------------------------------------
//
// uregex_replaceFirstUText
//
//------------------------------------------------------------------------------
// UText flavor of replaceFirst: after validating the handle and rejecting a
// NULL replacement, the work is delegated to RegexMatcher::replaceFirst() and
// its result is returned.  Returns NULL when validation fails.
U_CAPI UText * U_EXPORT2
uregex_replaceFirstUText(URegularExpression *regexp2,
                         UText              *replacementText,
                         UText              *dest,
                         UErrorCode         *status)  {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return NULL;
    }
    if (replacementText == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    return re->fMatcher->replaceFirst(replacementText, dest, *status);
}
//------------------------------------------------------------------------------
//
// uregex_appendReplacement
//
//------------------------------------------------------------------------------
U_NAMESPACE_BEGIN
//
// Dummy class, because these functions need to be friends of class RegexMatcher,
// and stand-alone C functions don't work as friends
//
class RegexCImpl {
public:
// Implementation behind uregex_appendReplacement().  destBuf and destCapacity
// are in/out: they are advanced / reduced by the amount of output produced.
// Returns the length of the appended text.
inline static int32_t appendReplacement(RegularExpression *regexp,
const UChar *replacementText,
int32_t replacementLength,
UChar **destBuf,
int32_t *destCapacity,
UErrorCode *status);
// Implementation behind uregex_appendTail().  destBuf and destCapacity are
// in/out, as for appendReplacement().  Returns the length of the appended text.
inline static int32_t appendTail(RegularExpression *regexp,
UChar **destBuf,
int32_t *destCapacity,
UErrorCode *status);
// Implementation behind uregex_split().  Field text goes to destBuf, pointers
// to the start of each field go to destFields[].  Returns the number of fields.
inline static int32_t split(RegularExpression *regexp,
UChar *destBuf,
int32_t destCapacity,
int32_t *requiredCapacity,
UChar *destFields[],
int32_t destFieldsCapacity,
UErrorCode *status);
};
U_NAMESPACE_END
// Characters with special meaning in replacement text, as scanned by
// RegexCImpl::appendReplacement():
// 0x5c is '\', which introduces an escape; 0x24 is '$', which introduces a
// capture group reference.
static const UChar BACKSLASH = 0x5c;
static const UChar DOLLARSIGN = 0x24;
//
// Move a character to an output buffer, with bounds checking on the index.
// Index advances even if capacity is exceeded, for preflight size computations.
// This little sequence is used a LOT.
//
// Stores c at buf[*idx] when that position lies below bufCapacity, and always
// advances *idx by one so that callers can compute the full (preflight) size
// of the output even after the buffer is exhausted.
static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
    const int32_t pos = (*idx)++;
    if (pos < bufCapacity) {
        buf[pos] = c;
    }
}
//
// appendReplacement, the actual implementation.
//
//
//  Appends to the caller's buffer:
//    1. the input text between the end of the previous match and the start of
//       the current match, then
//    2. the replacement text, with $n capture group references substituted and
//       backslash escapes (including \uXXXX and \UXXXXXXXX) resolved.
//
//  regexp              The regular expression; its matcher must hold a current
//                      match, otherwise U_REGEX_INVALID_STATE is set.
//  replacementText     The replacement pattern.  Must not be NULL.
//  replacementLength   Length of replacementText, or -1 if it is NUL terminated.
//  destBuf             In/out.  Points at the caller's pointer to the next free
//                      position of the output buffer; advanced past the text
//                      written (but never past the end of the buffer).
//  destCapacity        In/out.  Remaining capacity at *destBuf; reduced by the
//                      amount written, bottoming out at zero.
//  status              In/out.  An incoming U_BUFFER_OVERFLOW_ERROR with zero
//                      remaining capacity does not suppress the operation, so a
//                      chain of append calls can preflight the total size.
//                      On output: U_STRING_NOT_TERMINATED_WARNING if the result
//                      exactly fills the buffer, U_BUFFER_OVERFLOW_ERROR if it
//                      does not fit, or the error raised while expanding the
//                      replacement (for example a bad group number).
//
//  Returns the length of the text this call appends, which may exceed the
//  capacity available.  Returns 0 if parameter validation fails.
//
int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
                                      const UChar           *replacementText,
                                      int32_t                replacementLength,
                                      UChar                **destBuf,
                                      int32_t               *destCapacity,
                                      UErrorCode            *status)  {

    // If we come in with a buffer overflow error, don't suppress the operation.
    //  A series of appendReplacements, appendTail need to correctly preflight
    //  the buffer size when an overflow happens somewhere in the middle.
    UBool pendingBufferOverflow = FALSE;
    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
        pendingBufferOverflow = TRUE;
        *status = U_ZERO_ERROR;
    }

    //
    // Validate all parameters
    //
    if (validateRE(regexp, TRUE, status) == FALSE) {
        return 0;
    }
    if (replacementText == NULL || replacementLength < -1 ||
        destCapacity == NULL || destBuf == NULL ||
        (*destBuf == NULL && *destCapacity > 0) ||
        *destCapacity < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    RegexMatcher *m = regexp->fMatcher;
    if (m->fMatch == FALSE) {
        *status = U_REGEX_INVALID_STATE;
        return 0;
    }

    UChar    *dest     = *destBuf;
    int32_t   capacity = *destCapacity;
    int32_t   destIdx  = 0;
    int32_t   i;

    // If it wasn't supplied by the caller, get the length of the replacement text.
    //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
    //          the fly and avoid this step.
    if (replacementLength == -1) {
        replacementLength = u_strlen(replacementText);
    }

    // Copy input string from the end of previous match to start of current match
    if (regexp->fText != NULL) {
        int32_t matchStart;
        int32_t lastMatchEnd;
        if (UTEXT_USES_U16(m->fInputText)) {
            lastMatchEnd = (int32_t)m->fLastMatchEnd;
            matchStart   = (int32_t)m->fMatchStart;
        } else {
            // !!!: Would like a better way to do this!
            // Preflight extractions are used only to obtain UTF-16 lengths; their
            // status (a local, so the caller's status is untouched) is discarded.
            UErrorCode tempStatus = U_ZERO_ERROR;
            lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
            tempStatus = U_ZERO_ERROR;
            matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
        }
        for (i=lastMatchEnd; i<matchStart; i++) {
            appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
        }
    } else {
        UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
        destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
                                 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
                                 &possibleOverflowError);
    }
    U_ASSERT(destIdx >= 0);

    // scan the replacement text, looking for substitutions ($n) and \escapes.
    int32_t  replIdx = 0;
    while (replIdx < replacementLength) {
        UChar  c = replacementText[replIdx];
        replIdx++;
        if (c != DOLLARSIGN && c != BACKSLASH) {
            // Common case, no substitution, no escaping,
            //  just copy the char to the dest buf.
            appendToBuf(c, &destIdx, dest, capacity);
            continue;
        }

        if (c == BACKSLASH) {
            // Backslash Escape.  Copy the following char out without further checks.
            //                    Note:  Surrogate pairs don't need any special handling
            //                           The second half wont be a '$' or a '\', and
            //                           will move to the dest normally on the next
            //                           loop iteration.
            if (replIdx >= replacementLength) {
                break;
            }
            c = replacementText[replIdx];

            if (c==0x55/*U*/ || c==0x75/*u*/) {
                // We have a \udddd or \Udddddddd escape sequence.
                UChar32 escapedChar =
                    u_unescapeAt(uregex_ucstr_unescape_charAt,
                       &replIdx,                   // Index is updated by unescapeAt
                       replacementLength,          // Length of replacement text
                       (void *)replacementText);

                if (escapedChar != (UChar32)0xFFFFFFFF) {
                    if (escapedChar <= 0xffff) {
                        appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
                    } else {
                        appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
                        appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
                    }
                    continue;
                }
                // Note:  if the \u escape was invalid, just fall through and
                //        treat it as a plain \<anything> escape.
            }

            // Plain backslash escape.  Just put out the escaped character.
            appendToBuf(c, &destIdx, dest, capacity);

            replIdx++;
            continue;
        }

        // We've got a $.  Pick up a capture group number if one follows.
        // Consume at most the number of digits necessary for the largest capture
        // number that is valid for this pattern.

        int32_t numDigits = 0;
        int32_t groupNum  = 0;
        UChar32 digitC;
        for (;;) {
            if (replIdx >= replacementLength) {
                break;
            }
            U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
            if (u_isdigit(digitC) == FALSE) {
                break;
            }

            U16_FWD_1(replacementText, replIdx, replacementLength);
            groupNum=groupNum*10 + u_charDigitValue(digitC);
            numDigits++;
            if (numDigits >= m->fPattern->fMaxCaptureDigits) {
                break;
            }
        }

        if (numDigits == 0) {
            // The $ didn't introduce a group number at all.
            // Treat it as just part of the substitution text.
            appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
            continue;
        }

        // Finally, append the capture group data to the destination.
        destIdx += uregex_group((URegularExpression*)regexp, groupNum,
                                dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
        if (*status == U_BUFFER_OVERFLOW_ERROR) {
            // Ignore buffer overflow when extracting the group.  We need to
            //   continue on to get full size of the untruncated result.  We will
            //   raise our own buffer overflow error at the end.
            *status = U_ZERO_ERROR;
        }

        if (U_FAILURE(*status)) {
            // Can fail if group number is out of range.
            break;
        }
    }

    //
    //  Nul Terminate the dest buffer if possible.
    //  Set the appropriate buffer overflow or not terminated error, if needed.
    //  A failure already recorded while expanding the replacement text (such as
    //  an out of range group number) must not be replaced by an overflow error
    //  or, worse, downgraded to a "not terminated" warning.
    //
    if (destIdx < capacity) {
        dest[destIdx] = 0;
    } else if (U_SUCCESS(*status)) {
        if (destIdx == *destCapacity) {
            *status = U_STRING_NOT_TERMINATED_WARNING;
        } else {
            *status = U_BUFFER_OVERFLOW_ERROR;
        }
    }

    //
    // Return an updated dest buffer and capacity to the caller.
    //
    if (destIdx > 0 &&  *destCapacity > 0) {
        if (destIdx < capacity) {
            *destBuf      += destIdx;
            *destCapacity -= destIdx;
        } else {
            *destBuf      += capacity;
            *destCapacity =  0;
        }
    }

    // If we came in with a buffer overflow, make sure we go out with one also.
    //   (A zero length match right at the end of the previous match could
    //    make this function succeed even though a previous call had overflowed the buf)
    if (pendingBufferOverflow && U_SUCCESS(*status)) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }

    return destIdx;
}
//
// appendReplacement the actual API function,
//
// Public C entry point: converts the opaque handle to the internal
// RegularExpression type and forwards every argument unchanged to
// RegexCImpl::appendReplacement(), returning its result.
U_CAPI int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression    *regexp2,
                         const UChar           *replacementText,
                         int32_t                replacementLength,
                         UChar                **destBuf,
                         int32_t               *destCapacity,
                         UErrorCode            *status) {
    return RegexCImpl::appendReplacement((RegularExpression*)regexp2,
                                         replacementText,
                                         replacementLength,
                                         destBuf,
                                         destCapacity,
                                         status);
}
//
// uregex_appendReplacementUText...can just use the normal C++ method
//
// UText flavor of appendReplacement.  Delegates to
// RegexMatcher::appendReplacement(dest, replText, status).
//
//   regexp2   The regular expression handle.
//   replText  The replacement pattern.
//   dest      The UText receiving the output.
//   status    In/out error code.
U_CAPI void U_EXPORT2
uregex_appendReplacementUText(URegularExpression    *regexp2,
                              UText                 *replText,
                              UText                 *dest,
                              UErrorCode            *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    // Check the handle before dereferencing it, as the other entry points in
    // this file do, so that a NULL or invalid handle is reported through status
    // instead of crashing.  FALSE is passed as in the progress-callback
    // functions; NOTE(review): presumably this skips the "text has been set"
    // requirement and leaves match-state errors to the matcher -- confirm
    // against validateRE().
    if (validateRE(regexp, FALSE, status) == FALSE) {
        return;
    }
    regexp->fMatcher->appendReplacement(dest, replText, *status);
}
//------------------------------------------------------------------------------
//
// uregex_appendTail
//
//------------------------------------------------------------------------------
//
//  Appends to the caller's buffer the portion of the input text that follows
//  the most recent match (or the whole input if nothing has matched).
//
//  regexp        The regular expression.
//  destBuf       In/out.  Caller's pointer to the next free output position;
//                advanced past the text written, never past the buffer end.
//  destCapacity  In/out.  Remaining capacity at *destBuf; reduced by the amount
//                written, bottoming out at zero.
//  status        In/out.  An incoming U_BUFFER_OVERFLOW_ERROR with zero
//                remaining capacity does not suppress the operation.  Set to
//                U_STRING_NOT_TERMINATED_WARNING or U_BUFFER_OVERFLOW_ERROR
//                when the output cannot be NUL terminated / does not fit.
//
//  Returns the length of the tail, which may exceed the capacity available.
//
int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
                               UChar                **destBuf,
                               int32_t               *destCapacity,
                               UErrorCode            *status)
{
    // An overflow carried in from an earlier append in the same sequence must
    // not short-circuit this call: the caller still needs the full preflight
    // length.  Note it, clear it for now, and reinstate it on the way out.
    UBool overflowOnEntry = FALSE;
    if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
        overflowOnEntry = TRUE;
        *status = U_ZERO_ERROR;
    }

    if (!validateRE(regexp, TRUE, status)) {
        return 0;
    }
    if (destCapacity == NULL || destBuf == NULL ||
        (*destBuf == NULL && *destCapacity > 0) ||
        *destCapacity < 0)
    {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    RegexMatcher  *matcher = regexp->fMatcher;
    UChar         *out     = *destBuf;
    const int32_t  outCap  = *destCapacity;
    int32_t        outLen  = 0;

    if (regexp->fText == NULL) {
        // No UChar string on hand; extract directly from the matcher's UText.
        int64_t tailStart;
        if (matcher->fMatch) {
            // The most recent find() succeeded; the tail begins where it ended.
            tailStart = matcher->fMatchEnd;
        } else {
            // The most recent find() failed; fall back to the end of the last
            // successful one, or to the start of the input if there was none.
            tailStart = matcher->fLastMatchEnd;
            if (tailStart == -1) {
                tailStart = 0;
            }
        }
        outLen = utext_extract(matcher->fInputText, tailStart, matcher->fInputLength,
                               out, outCap, status);
    } else {
        // Copy straight out of the caller's UChar string.
        const int64_t nativeStart = (matcher->fMatch ? matcher->fMatchEnd : matcher->fLastMatchEnd);
        int32_t       src;
        if (nativeStart == -1) {
            src = 0;
        } else if (UTEXT_USES_U16(matcher->fInputText)) {
            src = (int32_t)nativeStart;
        } else {
            // Convert the native index to a UTF-16 index with a preflight extract.
            UErrorCode ignoredStatus = U_ZERO_ERROR;
            src = utext_extract(matcher->fInputText, 0, nativeStart, NULL, 0, &ignoredStatus);
        }

        while (src != regexp->fTextLength) {
            U_ASSERT(outLen >= 0);
            const UChar c = regexp->fText[src];
            if (c == 0 && regexp->fTextLength == -1) {
                // Reached the terminator of a NUL terminated input; record the
                // length now that it is known.
                regexp->fTextLength = src;
                break;
            }
            if (outLen < outCap) {
                out[outLen] = c;
            } else if (regexp->fTextLength > 0) {
                // Out of room, but the input length is known: account for the
                // remainder in one step instead of scanning it.
                outLen += (regexp->fTextLength - src);
                break;
            }
            ++src;
            ++outLen;
        }
        U_ASSERT(outLen >= 0);
    }

    if (outLen < outCap) {
        // Room for the terminator: write it, then hand back the unused space.
        out[outLen]    = 0;
        *destBuf      += outLen;
        *destCapacity -= outLen;
    } else {
        *status = (outLen == outCap) ? U_STRING_NOT_TERMINATED_WARNING
                                     : U_BUFFER_OVERFLOW_ERROR;
        if (*destBuf != NULL) {
            *destBuf      += outCap;
            *destCapacity  = 0;
        }
    }

    if (overflowOnEntry && U_SUCCESS(*status)) {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }

    return outLen;
}
//
// appendTail the actual API function
//
// Public C entry point: converts the opaque handle to the internal
// RegularExpression type and forwards to RegexCImpl::appendTail(),
// returning its result.
U_CAPI int32_t U_EXPORT2
uregex_appendTail(URegularExpression    *regexp2,
                  UChar                **destBuf,
                  int32_t               *destCapacity,
                  UErrorCode            *status)  {
    return RegexCImpl::appendTail((RegularExpression*)regexp2,
                                  destBuf,
                                  destCapacity,
                                  status);
}
//
// uregex_appendTailUText...can just use the normal C++ method
//
// UText flavor of appendTail.  Delegates to RegexMatcher::appendTail(dest,
// status) and returns its result.
//
//   regexp2   The regular expression handle.
//   dest      The UText receiving the output.
//   status    In/out error code.
//
// When the handle fails validation, dest is returned unchanged.
U_CAPI UText * U_EXPORT2
uregex_appendTailUText(URegularExpression    *regexp2,
                       UText                 *dest,
                       UErrorCode            *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    // Check the handle before dereferencing it, as the other entry points in
    // this file do, so that a NULL or invalid handle is reported through status
    // instead of crashing.  NOTE(review): FALSE is presumed to skip the
    // "text has been set" requirement -- confirm against validateRE().
    if (validateRE(regexp, FALSE, status) == FALSE) {
        return dest;
    }
    return regexp->fMatcher->appendTail(dest, *status);
}
//------------------------------------------------------------------------------
//
// copyString Internal utility to copy a string to an output buffer,
// while managing buffer overflow and preflight size
// computation. NUL termination is added to destination,
// and the NUL is counted in the output size.
//
//------------------------------------------------------------------------------
// This helper is currently compiled out by the surrounding #if 0.
#if 0
static void copyString(UChar *destBuffer, // Destination buffer.
int32_t destCapacity, // Total capacity of dest buffer
int32_t *destIndex, // Index into dest buffer. Updated on return.
// Update not clipped to destCapacity.
const UChar *srcPtr, // Pointer to source string
int32_t srcLen) // Source string len.
{
int32_t si;
int32_t di = *destIndex;
UChar c;
for (si=0; si<srcLen; si++) {
c = srcPtr[si];
if (di < destCapacity) {
destBuffer[di] = c;
di++;
} else {
// Out of room: count the remaining source characters in one step so that
// the returned index still reflects the full size required.
di += srcLen - si;
break;
}
}
// NUL terminate if there is room; the terminator is counted either way.
if (di<destCapacity) {
destBuffer[di] = 0;
}
di++;
*destIndex = di;
}
#endif
//------------------------------------------------------------------------------
//
// uregex_split
//
//------------------------------------------------------------------------------
//
// Splits the matcher's input text into fields, treating each match of the
// pattern as a delimiter.  The text of each field is copied into destBuf,
// each followed by a NUL, and destFields[] receives a pointer to the start of
// each field.  Text captured by groups in the delimiter pattern is emitted as
// additional fields following the field that precedes the delimiter.
//
// regexp              The regular expression; its matcher is reset first.
// destBuf             Buffer receiving the field text.
// destCapacity        Capacity of destBuf, in UChars.
// requiredCapacity    If not NULL, receives the total number of UChars needed
//                     for all fields, counting one NUL per field.
// destFields          Array receiving a pointer to each field; unused trailing
//                     entries are set to NULL.
// destFieldsCapacity  Number of entries in destFields.
// status              Set to U_BUFFER_OVERFLOW_ERROR if the required capacity
//                     exceeds destCapacity.
//
// Returns the number of fields produced, or 0 for empty input.
//
// No argument validation is done here; uregex_split() performs it before
// calling this function.
//
int32_t RegexCImpl::split(RegularExpression *regexp,
UChar *destBuf,
int32_t destCapacity,
int32_t *requiredCapacity,
UChar *destFields[],
int32_t destFieldsCapacity,
UErrorCode *status) {
//
// Reset for the input text
//
regexp->fMatcher->reset();
UText *inputText = regexp->fMatcher->fInputText;
int64_t nextOutputStringStart = 0;
int64_t inputLen = regexp->fMatcher->fInputLength;
// Empty input: return immediately, leaving destFields[] and
// *requiredCapacity untouched.
if (inputLen == 0) {
return 0;
}
//
// Loop through the input text, searching for the delimiter pattern
//
int32_t i; // Index of the field being processed.
int32_t destIdx = 0; // Next available position in destBuf;
int32_t numCaptureGroups = regexp->fMatcher->groupCount();
UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted
for (i=0; ; i++) {
if (i>=destFieldsCapacity-1) {
// There are one or zero output strings left.
// Fill the last output string with whatever is left from the input, then exit the loop.
// ( i will be == destFieldsCapacity if we filled the output array while processing
// capture groups of the delimiter expression, in which case we will discard the
// last capture group saved in favor of the unprocessed remainder of the
// input string.)
if (inputLen > nextOutputStringStart) {
if (i != destFieldsCapacity-1) {
// No fields are left. Recycle the last one for holding the trailing part of
// the input string.
i = destFieldsCapacity-1;
// Rewind destIdx to where the recycled field began; destFields[0] is the
// start of destBuf, so the pointer difference is an index into destBuf.
destIdx = (int32_t)(destFields[i] - destFields[0]);
}
destFields[i] = &destBuf[destIdx];
// The +1 counts the NUL that terminates the field.
destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
&destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
}
break;
}
if (regexp->fMatcher->find()) {
// We found another delimiter. Move everything from where we started looking
// up until the start of the delimiter into the next output string.
destFields[i] = &destBuf[destIdx];
destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
&destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
// Overflow is swallowed here so that sizes keep accumulating; any other
// result is passed on to the caller's status.
if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
tStatus = U_ZERO_ERROR;
} else {
*status = tStatus;
}
nextOutputStringStart = regexp->fMatcher->fMatchEnd;
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
// If we've run out of output string slots, bail out.
if (i==destFieldsCapacity-1) {
break;
}
i++;
// Set up to extract the capture group contents into the dest buffer.
destFields[i] = &destBuf[destIdx];
tStatus = U_ZERO_ERROR;
int32_t t = uregex_group((URegularExpression*)regexp,
groupNum,
destFields[i],
REMAINING_CAPACITY(destIdx, destCapacity),
&tStatus);
destIdx += t + 1; // Record the space used in the output string buffer.
// +1 for the NUL that terminates the string.
if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
tStatus = U_ZERO_ERROR;
} else {
*status = tStatus;
}
}
if (nextOutputStringStart == inputLen) {
// The delimiter was at the end of the string.
// Output an empty string, and then we are done.
if (destIdx < destCapacity) {
destBuf[destIdx] = 0;
}
if (i < destFieldsCapacity-1) {
++i;
}
// NOTE(review): when destBuf has already overflowed, destFields[i] is not
// assigned on this path although the field is still counted in the return
// value -- confirm callers do not read it after U_BUFFER_OVERFLOW_ERROR.
if (destIdx < destCapacity) {
destFields[i] = destBuf + destIdx;
}
++destIdx;
break;
}
}
else
{
// We ran off the end of the input while looking for the next delimiter.
// All the remaining text goes into the current output string.
destFields[i] = &destBuf[destIdx];
destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
&destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
break;
}
}
// Zero out any unused portion of the destFields array
int j;
for (j=i+1; j<destFieldsCapacity; j++) {
destFields[j] = NULL;
}
if (requiredCapacity != NULL) {
*requiredCapacity = destIdx;
}
// destIdx counts every UChar needed, including those that did not fit.
if (destIdx > destCapacity) {
*status = U_BUFFER_OVERFLOW_ERROR;
}
return i+1;
}
//
// uregex_split The actual API function
//
// Public C entry point for split.  Validates the handle and the buffer / field
// array arguments, then forwards to RegexCImpl::split() and returns its result.
// Returns 0, with U_ILLEGAL_ARGUMENT_ERROR set for bad buffer or field
// arguments, when validation fails.
U_CAPI int32_t U_EXPORT2
uregex_split(URegularExpression      *regexp2,
             UChar                   *destBuf,
             int32_t                  destCapacity,
             int32_t                 *requiredCapacity,
             UChar                   *destFields[],
             int32_t                  destFieldsCapacity,
             UErrorCode              *status) {
    RegularExpression *re = (RegularExpression*)regexp2;
    if (!validateRE(re, TRUE, status)) {
        return 0;
    }

    const UBool bufferInvalid = (destCapacity < 0 || (destBuf == NULL && destCapacity > 0));
    const UBool fieldsInvalid = (destFields == NULL || destFieldsCapacity < 1);
    if (bufferInvalid || fieldsInvalid) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    return RegexCImpl::split(re, destBuf, destCapacity, requiredCapacity,
                             destFields, destFieldsCapacity, status);
}
//
// uregex_splitUText...can just use the normal C++ method
//
// UText flavor of split.  Delegates to RegexMatcher::split(), passing the
// matcher's own input text as the text to split, and returns its result.
//
//   regexp2             The regular expression handle.
//   destFields          Array of UText pointers receiving the fields.
//   destFieldsCapacity  Number of entries in destFields.
//   status              In/out error code.
//
// Returns 0 when the handle fails validation.
U_CAPI int32_t U_EXPORT2
uregex_splitUText(URegularExpression    *regexp2,
                  UText                 *destFields[],
                  int32_t                destFieldsCapacity,
                  UErrorCode            *status) {
    RegularExpression *regexp = (RegularExpression*)regexp2;
    // Check the handle before dereferencing it, as the other entry points in
    // this file do, so that a NULL or invalid handle is reported through status
    // instead of crashing.  NOTE(review): FALSE is presumed to skip the
    // "text has been set" requirement -- confirm against validateRE().
    if (validateRE(regexp, FALSE, status) == FALSE) {
        return 0;
    }
    return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
}
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS