75adf07067
New implementation for Unicode Boyer Moore string search. X-SVN-Rev: 5587
358 lines
12 KiB
C++
358 lines
12 KiB
C++
/*
|
|
**********************************************************************
|
|
* Copyright (C) 2001 IBM and others. All rights reserved.
|
|
**********************************************************************
|
|
* Date Name Description
|
|
* 03/22/2000 helena Creation.
|
|
**********************************************************************
|
|
*/
|
|
|
|
#include "unicode/brkiter.h"
|
|
#include "unicode/schriter.h"
|
|
#include "unicode/search.h"
|
|
#include "usrchimp.h"
|
|
#include "cmemory.h"
|
|
|
|
// public constructors and destructors -----------------------------------
|
|
|
|
/**
 * Copy constructor.
 * Duplicates the break iterator pointer, the target text and every field of
 * the USearch state block of other into a freshly allocated state block.
 * @param other iterator whose state is copied
 */
SearchIterator::SearchIterator(const SearchIterator &other) :
                                   m_breakiterator_(other.m_breakiterator_),
                                   m_text_(other.m_text_)
{
    // No "other != *this" test here: while constructing, *this has no valid
    // m_search_ yet, so comparing would read uninitialized memory, and
    // skipping the copy would leave m_search_ dangling for the destructor.
    m_search_ = (USearch *)uprv_malloc(sizeof(USearch));
    m_search_->breakIter          = other.m_search_->breakIter;
    m_search_->isCanonicalMatch   = other.m_search_->isCanonicalMatch;
    m_search_->isOverlap          = other.m_search_->isOverlap;
    // direction and reset flags are read by next()/previous(); they must be
    // copied as well or the new object would use indeterminate values.
    m_search_->isForwardSearching = other.m_search_->isForwardSearching;
    m_search_->reset              = other.m_search_->reset;
    m_search_->matchedIndex       = other.m_search_->matchedIndex;
    m_search_->matchedLength      = other.m_search_->matchedLength;
    // point at our own copy of the text, as the other constructors do,
    // instead of aliasing the buffer owned by other.
    m_search_->text               = m_text_.fArray;
    m_search_->textLength         = other.m_search_->textLength;
}
|
|
|
|
/**
 * Destructor.
 * Hands the USearch state block held in m_search_ to uprv_free; does nothing
 * when the pointer is NULL.
 */
SearchIterator::~SearchIterator()
{
    if (m_search_ == NULL) {
        return;
    }
    uprv_free(m_search_);
}
|
|
|
|
// public get and set methods ----------------------------------------
|
|
|
|
/**
 * Sets one of the search attributes.
 * USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH are switched on when value is
 * USEARCH_ON and off for any other accepted value.
 * @param attribute attribute to change
 * @param value     new attribute value
 * @param status    in/out error code. Left untouched, and nothing is
 *                  changed, if it already indicates a failure on entry.
 *                  Set to U_ILLEGAL_ARGUMENT_ERROR when value is
 *                  USEARCH_ATTRIBUTE_VALUE_COUNT or attribute is not one of
 *                  the two handled attributes.
 */
void SearchIterator::setAttribute(USearchAttribute       attribute,
                                  USearchAttributeValue  value,
                                  UErrorCode            &status)
{
    if (!U_SUCCESS(status)) {
        // never overwrite an error reported by an earlier call
        return;
    }

    // reject the sentinel value before touching any state, so that an
    // invalid call cannot silently clear an attribute.
    if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    switch (attribute)
    {
    case USEARCH_OVERLAP :
        m_search_->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
        break;
    case USEARCH_CANONICAL_MATCH :
        m_search_->isCanonicalMatch = (value == USEARCH_ON ? TRUE : FALSE);
        break;
    default:
        status = U_ILLEGAL_ARGUMENT_ERROR;
        break;
    }
}
|
|
|
|
/**
 * Reports the current value of a search attribute.
 * @param attribute attribute to query
 * @return USEARCH_ON or USEARCH_OFF for USEARCH_OVERLAP and
 *         USEARCH_CANONICAL_MATCH; USEARCH_DEFAULT for every other
 *         attribute value, including USEARCH_ATTRIBUTE_COUNT.
 */
USearchAttributeValue SearchIterator::getAttribute(
                                          USearchAttribute attribute) const
{
    if (attribute == USEARCH_OVERLAP) {
        if (m_search_->isOverlap == TRUE) {
            return USEARCH_ON;
        }
        return USEARCH_OFF;
    }

    if (attribute == USEARCH_CANONICAL_MATCH) {
        if (m_search_->isCanonicalMatch == TRUE) {
            return USEARCH_ON;
        }
        return USEARCH_OFF;
    }

    // USEARCH_ATTRIBUTE_COUNT and anything unrecognised
    return USEARCH_DEFAULT;
}
|
|
|
|
/**
 * @return the matched index currently stored in the search state
 *         (USEARCH_DONE after setMatchNotFound()).
 */
UTextOffset SearchIterator::getMatchedStart() const
{
    const UTextOffset start = m_search_->matchedIndex;
    return start;
}
|
|
|
|
/**
 * @return the matched length currently stored in the search state
 *         (0 after setMatchNotFound()).
 */
int32_t SearchIterator::getMatchedLength() const
{
    const int32_t length = m_search_->matchedLength;
    return length;
}
|
|
|
|
/**
 * Copies the currently matched text into result.
 * @param result receives matchedLength code units of the target text
 *               starting at matchedIndex; it is emptied when the matched
 *               index is USEARCH_DONE or the matched length is 0.
 */
void SearchIterator::getMatchedText(UnicodeString &result) const
{
    const UTextOffset start  = m_search_->matchedIndex;
    const int32_t     length = m_search_->matchedLength;

    if (start == USEARCH_DONE || length == 0) {
        // nothing is matched at the moment
        result.remove();
        return;
    }
    result.setTo(m_search_->text + start, length);
}
|
|
|
|
/**
 * Installs the break iterator kept by this object.
 * @param breakiter break iterator pointer to store; the pointer itself is
 *                  stored, no copy is made here
 * @param status    in/out error code; nothing happens if it already
 *                  indicates a failure
 */
void SearchIterator::setBreakIterator(BreakIterator *breakiter,
                                      UErrorCode    &status)
{
    if (!U_SUCCESS(status)) {
        return;
    }
    // The C++ break iterator may not be usable as a UBreakIterator, so it is
    // tracked in this object and the C-level slot is cleared.
    m_breakiterator_     = breakiter;
    m_search_->breakIter = NULL;
}
|
|
|
|
/**
 * @return the break iterator pointer stored in this object, possibly NULL.
 */
const BreakIterator * SearchIterator::getBreakIterator(void) const
{
    const BreakIterator *current = m_breakiterator_;
    return current;
}
|
|
|
|
void SearchIterator::setText(const UnicodeString &text, UErrorCode &status)
|
|
{
|
|
if (U_SUCCESS(status)) {
|
|
if (text.length() == 0) {
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
}
|
|
else {
|
|
m_text_ = text;
|
|
m_search_->text = m_text_.fArray;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Replaces the text to be searched with the text of a character iterator.
 * The iterator's text is extracted into m_text_ and then passed to the
 * UnicodeString overload of setText.
 * @param text   character iterator supplying the new target text
 * @param status in/out error code; nothing happens if it already indicates
 *               a failure
 */
void SearchIterator::setText(CharacterIterator &text, UErrorCode &status)
{
    if (!U_SUCCESS(status)) {
        return;
    }
    text.getText(m_text_);
    setText(m_text_, status);
}
|
|
|
|
/**
 * @return a reference to the target text stored in this object.
 */
const UnicodeString & SearchIterator::getText(void) const
{
    const UnicodeString &current = m_text_;
    return current;
}
|
|
|
|
// operator overloading ----------------------------------------------
|
|
|
|
/**
 * Equality operator.
 * Two iterators are equal when they are the same object, or when their
 * break iterator pointers, canonical-match and overlap flags, matched index
 * and length, text length, current offset and the first textLength code
 * units of their text all agree.
 * @param that iterator to compare with
 * @return TRUE if equal, FALSE otherwise
 */
UBool SearchIterator::operator==(const SearchIterator &that) const
{
    if (this == &that) {
        return TRUE;
    }

    const USearch *mine   = m_search_;
    const USearch *theirs = that.m_search_;

    // checks are made in order, cheapest first; the text comparison is last
    if (m_breakiterator_ != that.m_breakiterator_) {
        return FALSE;
    }
    if (mine->isCanonicalMatch != theirs->isCanonicalMatch) {
        return FALSE;
    }
    if (mine->isOverlap != theirs->isOverlap) {
        return FALSE;
    }
    if (mine->matchedIndex != theirs->matchedIndex) {
        return FALSE;
    }
    if (mine->matchedLength != theirs->matchedLength) {
        return FALSE;
    }
    if (mine->textLength != theirs->textLength) {
        return FALSE;
    }
    if (getOffset() != that.getOffset()) {
        return FALSE;
    }
    return uprv_memcmp(mine->text, theirs->text,
                       mine->textLength * sizeof(UChar)) == 0;
}
|
|
|
|
// public methods ----------------------------------------------------
|
|
|
|
/**
 * Moves to offset 0 and searches forward from there.
 * @param status in/out error code; if it already indicates a failure the
 *               iterator is not moved and no search is attempted
 * @return the value returned by handleNext(0, status), or USEARCH_DONE if
 *         status indicated a failure on entry
 */
UTextOffset SearchIterator::first(UErrorCode &status)
{
    // same convention as next()/previous(): a failed status short-circuits
    if (U_SUCCESS(status)) {
        setOffset(0, status);
        return handleNext(0, status);
    }
    return USEARCH_DONE;
}
|
|
|
|
/**
 * Moves to position and searches forward from there.
 * @param position offset to start searching from
 * @param status   in/out error code; if it already indicates a failure the
 *                 iterator is not moved and no search is attempted
 * @return the value returned by handleNext(position, status), or
 *         USEARCH_DONE if status indicated a failure on entry
 */
UTextOffset SearchIterator::following(UTextOffset position,
                                      UErrorCode &status)
{
    // same convention as next()/previous(): a failed status short-circuits
    if (U_SUCCESS(status)) {
        setOffset(position, status);
        return handleNext(position, status);
    }
    return USEARCH_DONE;
}
|
|
|
|
/**
 * Moves to the end of the text (textLength) and searches backward from
 * there.
 * @param status in/out error code; if it already indicates a failure the
 *               iterator is not moved and no search is attempted
 * @return the value returned by handlePrev(textLength, status), or
 *         USEARCH_DONE if status indicated a failure on entry
 */
UTextOffset SearchIterator::last(UErrorCode &status)
{
    // same convention as next()/previous(): a failed status short-circuits
    if (U_SUCCESS(status)) {
        setOffset(m_search_->textLength, status);
        return handlePrev(m_search_->textLength, status);
    }
    return USEARCH_DONE;
}
|
|
|
|
/**
 * Moves to position and searches backward from there.
 * @param position offset to start searching from
 * @param status   in/out error code; if it already indicates a failure the
 *                 iterator is not moved and no search is attempted
 * @return the value returned by handlePrev(position, status), or
 *         USEARCH_DONE if status indicated a failure on entry
 */
UTextOffset SearchIterator::preceding(UTextOffset position,
                                      UErrorCode &status)
{
    // same convention as next()/previous(): a failed status short-circuits
    if (U_SUCCESS(status)) {
        setOffset(position, status);
        return handlePrev(position, status);
    }
    return USEARCH_DONE;
}
|
|
|
|
/**
 * Advances to the next match in the forward direction.
 * @param status in/out error code; USEARCH_DONE is returned immediately if
 *               it already indicates a failure
 * @return - USEARCH_DONE, after setMatchNotFound(), when already searching
 *           forward and the offset or the previous match reaches the end of
 *           the text;
 *         - the previous match index when the direction has just been
 *           switched from backward to forward and a match is recorded;
 *         - otherwise the result of handleNext(), started just past the
 *           previous match if there is one, else at the current offset.
 */
UTextOffset SearchIterator::next(UErrorCode &status)
{
    if (!U_SUCCESS(status)) {
        return USEARCH_DONE;
    }

    const UTextOffset current    = getOffset();
    const UTextOffset lastStart  = m_search_->matchedIndex;
    const int32_t     lastLength = m_search_->matchedLength;
    const UBool       hasMatch   = (lastStart != USEARCH_DONE);
    m_search_->reset = FALSE;

    if (m_search_->isForwardSearching != TRUE) {
        // Switching direction.
        // No recorded match (USEARCH_DONE) means that either setOffset was
        // called or previous ran off the text string; in the latter case the
        // iterator would have been set to offset 0.
        m_search_->isForwardSearching = TRUE;
        if (hasMatch) {
            // no need to reposition anything here, the following call to
            // next will set the offset.
            return lastStart;
        }
    }
    else {
        const int32_t limit = m_search_->textLength;
        const UBool exhausted = (current == limit || lastStart == limit ||
                                 (hasMatch &&
                                  lastStart + lastLength >= limit));
        if (exhausted) {
            // not enough characters left to match
            setMatchNotFound();
            return USEARCH_DONE;
        }
    }

    // resume right after the previous match when there is one
    return handleNext(hasMatch ? lastStart + lastLength : current, status);
}
|
|
|
|
/**
 * Moves to the previous match in the backward direction.
 * @param status in/out error code; USEARCH_DONE is returned immediately if
 *               it already indicates a failure
 * @return - the previous match index when the direction has just been
 *           switched from forward to backward and a match is recorded;
 *         - USEARCH_DONE, after setMatchNotFound(), when already searching
 *           backward and the offset or the previous match index is 0;
 *         - otherwise the result of handlePrev(), started at the previous
 *           match index if there is one, else at the starting offset.
 */
UTextOffset SearchIterator::previous(UErrorCode &status)
{
    if (!U_SUCCESS(status)) {
        return USEARCH_DONE;
    }

    UTextOffset start;
    if (!m_search_->reset) {
        start = getOffset();
    }
    else {
        // first move after a reset: search backward from the end of text
        start                         = m_search_->textLength;
        m_search_->isForwardSearching = FALSE;
        m_search_->reset              = FALSE;
    }

    const UTextOffset lastStart = m_search_->matchedIndex;
    const UBool       hasMatch  = (lastStart != USEARCH_DONE);

    if (m_search_->isForwardSearching == TRUE) {
        // Switching direction.
        // No recorded match (USEARCH_DONE) means that either setOffset was
        // called or next ran off the text string; in the latter case the
        // iterator would have been set to offset textLength.
        m_search_->isForwardSearching = FALSE;
        if (hasMatch) {
            return lastStart;
        }
    }
    else if (start == 0 || lastStart == 0) {
        // not enough characters left to match
        setMatchNotFound();
        return USEARCH_DONE;
    }

    return handlePrev(hasMatch ? lastStart : start, status);
}
|
|
|
|
void SearchIterator::reset()
|
|
{
|
|
setMatchNotFound();
|
|
m_search_->isOverlap = FALSE;
|
|
m_search_->isCanonicalMatch = FALSE;
|
|
m_search_->isForwardSearching = TRUE;
|
|
m_search_->reset = TRUE;
|
|
}
|
|
|
|
// protected constructors and destructors -----------------------------
|
|
|
|
/**
 * Default constructor.
 * Allocates the USearch state block and puts it in the initial state: no
 * break iterator, no text (NULL, length 0), no match recorded, overlap and
 * canonical matching off, forward direction, reset flag raised.
 */
SearchIterator::SearchIterator() : m_breakiterator_(NULL)
{
    USearch *state = (USearch *)uprv_malloc(sizeof(USearch));

    // target text
    state->text               = NULL;
    state->textLength         = 0;
    state->breakIter          = NULL;
    // attributes
    state->isOverlap          = FALSE;
    state->isCanonicalMatch   = FALSE;
    // iteration state
    state->isForwardSearching = TRUE;
    state->reset              = TRUE;
    state->matchedIndex       = USEARCH_DONE;
    state->matchedLength      = 0;

    m_search_ = state;
}
|
|
|
|
/**
 * Constructor taking the target text as a UnicodeString.
 * Stores a copy of text in m_text_, points the search state at that copy,
 * and puts the remaining state in its initial values: no match recorded,
 * overlap and canonical matching off, forward direction, reset flag raised.
 * @param text      text to be searched
 * @param breakiter break iterator pointer to store, may be NULL
 */
SearchIterator::SearchIterator(const UnicodeString &text,
                                     BreakIterator *breakiter) :
                                     m_breakiterator_(breakiter),
                                     m_text_(text)
{
    USearch *state = (USearch *)uprv_malloc(sizeof(USearch));

    // target text: refer to the copy owned by this object
    state->text               = m_text_.fArray;
    state->textLength         = m_text_.length();
    state->breakIter          = NULL;
    // attributes
    state->isOverlap          = FALSE;
    state->isCanonicalMatch   = FALSE;
    // iteration state
    state->isForwardSearching = TRUE;
    state->reset              = TRUE;
    state->matchedIndex       = USEARCH_DONE;
    state->matchedLength      = 0;

    m_search_ = state;
}
|
|
|
|
/**
 * Constructor taking the target text from a character iterator.
 * Extracts the iterator's text into m_text_, points the search state at it,
 * and puts the remaining state in its initial values: no match recorded,
 * overlap and canonical matching off, forward direction, reset flag raised.
 * @param text      character iterator supplying the text to be searched
 * @param breakiter break iterator pointer to store, may be NULL
 */
SearchIterator::SearchIterator(CharacterIterator &text,
                               BreakIterator     *breakiter) :
                               m_breakiterator_(breakiter)
{
    // pull the text out first so the state block can refer to it
    text.getText(m_text_);

    USearch *state = (USearch *)uprv_malloc(sizeof(USearch));

    // target text
    state->text               = m_text_.fArray;
    state->textLength         = m_text_.length();
    state->breakIter          = NULL;
    // attributes
    state->isOverlap          = FALSE;
    state->isCanonicalMatch   = FALSE;
    // iteration state
    state->isForwardSearching = TRUE;
    state->reset              = TRUE;
    state->matchedIndex       = USEARCH_DONE;
    state->matchedLength      = 0;

    m_search_ = state;
}
|
|
|
|
// protected methods ------------------------------------------------------
|
|
|
|
/**
 * Records the length of the current match in the search state.
 * @param length new matched length
 */
void SearchIterator::setMatchLength(int32_t length)
{
    USearch &state = *m_search_;
    state.matchedLength = length;
}
|
|
|
|
/**
 * Records the start index of the current match in the search state.
 * @param position new matched index
 */
void SearchIterator::setMatchStart(UTextOffset position)
{
    USearch &state = *m_search_;
    state.matchedIndex = position;
}
|
|
|
|
void SearchIterator::setMatchNotFound()
|
|
{
|
|
setMatchStart(USEARCH_DONE);
|
|
setMatchLength(0);
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
// by default no errors should be returned here since offsets are within
|
|
// range.
|
|
if (m_search_->isForwardSearching) {
|
|
setOffset(m_search_->textLength, status);
|
|
}
|
|
else {
|
|
setOffset(0, status);
|
|
}
|
|
}
|
|
|
|
|