scuffed-code/icu4c/source/i18n/search.cpp
Syn Wee Quek 75adf07067 ICU-1030
New implementation for Unicode Boyer Moore string search.

X-SVN-Rev: 5587
2001-08-25 02:03:53 +00:00

358 lines
12 KiB
C++

/*
**********************************************************************
* Copyright (C) 2001 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 03/22/2000 helena Creation.
**********************************************************************
*/
#include "unicode/brkiter.h"
#include "unicode/schriter.h"
#include "unicode/search.h"
#include "usrchimp.h"
#include "cmemory.h"
// public constructors and destructors -----------------------------------
/**
 * Copy constructor.
 * Copies the break iterator pointer and the text of other, and duplicates
 * the whole search state of other into a newly allocated USearch struct.
 * @param other search iterator to copy from
 */
SearchIterator::SearchIterator(const SearchIterator &other) :
                                   m_breakiterator_(other.m_breakiterator_),
                                   m_text_(other.m_text_)
{
    // a constructor always builds a brand new object, hence there is no
    // self copy to guard against. comparing other with *this here would
    // read the still uninitialized m_search_ of this object.
    m_search_ = (USearch *)uprv_malloc(sizeof(USearch));
    m_search_->breakIter          = other.m_search_->breakIter;
    m_search_->isCanonicalMatch   = other.m_search_->isCanonicalMatch;
    m_search_->isOverlap          = other.m_search_->isOverlap;
    // the direction and reset flags are read by next() and previous(), so
    // they have to be carried over as well.
    m_search_->isForwardSearching = other.m_search_->isForwardSearching;
    m_search_->reset              = other.m_search_->reset;
    m_search_->matchedIndex       = other.m_search_->matchedIndex;
    m_search_->matchedLength      = other.m_search_->matchedLength;
    // refer to the buffer of our own copy of the text, the same way the
    // other constructors do, instead of borrowing the buffer of other.
    m_search_->text               = m_text_.fArray;
    m_search_->textLength         = other.m_search_->textLength;
}
/**
 * Destructor.
 * Releases the search state struct allocated by the constructors, if any.
 */
SearchIterator::~SearchIterator()
{
    if (m_search_ == NULL) {
        // nothing was allocated, nothing to release
        return;
    }
    uprv_free(m_search_);
}
// public get and set methods ----------------------------------------
/**
 * Sets a search attribute.
 * Nothing is done when status already holds a failure on entry; the
 * incoming error code is left as it is.
 * @param attribute attribute to set, USEARCH_OVERLAP or
 *                  USEARCH_CANONICAL_MATCH
 * @param value     new value of the attribute. USEARCH_ON switches the
 *                  attribute on, any other valid value switches it off.
 * @param status    set to U_ILLEGAL_ARGUMENT_ERROR when attribute is not one
 *                  of the attributes above or when value is
 *                  USEARCH_ATTRIBUTE_VALUE_COUNT
 */
void SearchIterator::setAttribute(USearchAttribute attribute,
                                  USearchAttributeValue value,
                                  UErrorCode &status)
{
    if (U_SUCCESS(status)) {
        // validate the value before touching any state, so that an illegal
        // value does not leave the attribute modified behind an error.
        if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        switch (attribute)
        {
        case USEARCH_OVERLAP :
            m_search_->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
            break;
        case USEARCH_CANONICAL_MATCH :
            m_search_->isCanonicalMatch = (value == USEARCH_ON ? TRUE : FALSE);
            break;
        default:
            status = U_ILLEGAL_ARGUMENT_ERROR;
            break;
        }
    }
}
/**
 * Gets the current value of a search attribute.
 * @param attribute attribute to query
 * @return USEARCH_ON or USEARCH_OFF for USEARCH_OVERLAP and
 *         USEARCH_CANONICAL_MATCH, USEARCH_DEFAULT for anything else
 */
USearchAttributeValue SearchIterator::getAttribute(
                                             USearchAttribute attribute) const
{
    if (attribute == USEARCH_OVERLAP) {
        if (m_search_->isOverlap == TRUE) {
            return USEARCH_ON;
        }
        return USEARCH_OFF;
    }
    if (attribute == USEARCH_CANONICAL_MATCH) {
        if (m_search_->isCanonicalMatch == TRUE) {
            return USEARCH_ON;
        }
        return USEARCH_OFF;
    }
    // USEARCH_ATTRIBUTE_COUNT and every unknown attribute end up here
    return USEARCH_DEFAULT;
}
/**
 * Gets the start offset of the current match.
 * @return the matched index kept in the search state; setMatchNotFound()
 *         sets it to USEARCH_DONE
 */
UTextOffset SearchIterator::getMatchedStart() const
{
    const UTextOffset start = m_search_->matchedIndex;
    return start;
}
/**
 * Gets the length of the current match.
 * @return the matched length kept in the search state; setMatchNotFound()
 *         sets it to 0
 */
int32_t SearchIterator::getMatchedLength() const
{
    const int32_t length = m_search_->matchedLength;
    return length;
}
/**
 * Copies the text of the current match into result.
 * @param result receives the matched text; emptied when the matched index
 *               is USEARCH_DONE or the matched length is 0
 */
void SearchIterator::getMatchedText(UnicodeString &result) const
{
    const UTextOffset start  = m_search_->matchedIndex;
    const int32_t     length = m_search_->matchedLength;
    if (start == USEARCH_DONE || length == 0) {
        // no match available
        result.remove();
        return;
    }
    result.setTo(m_search_->text + start, length);
}
/**
 * Sets the break iterator kept by this search iterator.
 * Nothing is done when status holds a failure on entry.
 * @param breakiter break iterator to keep; only the pointer is stored here
 * @param status    error code, not modified by this method
 */
void SearchIterator::setBreakIterator(BreakIterator *breakiter,
                                      UErrorCode &status)
{
    if (!U_SUCCESS(status)) {
        return;
    }
    // the c++ breakiterator may not make use of ubreakiterator, so it is
    // tracked in this object and the pointer in the search state is cleared.
    m_breakiterator_     = breakiter;
    m_search_->breakIter = NULL;
}
/**
 * Gets the break iterator kept by this search iterator.
 * @return the pointer stored by the constructors or setBreakIterator(),
 *         possibly NULL
 */
const BreakIterator * SearchIterator::getBreakIterator(void) const
{
    const BreakIterator *current = m_breakiterator_;
    return current;
}
/**
 * Sets the text to be searched.
 * Nothing is done when status holds a failure on entry.
 * @param text   new text to search in, must not be empty
 * @param status set to U_ILLEGAL_ARGUMENT_ERROR when text is empty, in
 *               which case the current text is kept
 */
void SearchIterator::setText(const UnicodeString &text, UErrorCode &status)
{
    if (U_SUCCESS(status)) {
        if (text.length() == 0) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        else {
            m_text_ = text;
            m_search_->text = m_text_.fArray;
            // keep the cached length in sync with the new text. last(),
            // next() and operator== all rely on textLength.
            m_search_->textLength = m_text_.length();
        }
    }
}
/**
 * Sets the text to be searched from a character iterator.
 * The text of the iterator is extracted and handed to
 * setText(const UnicodeString &, UErrorCode &).
 * Nothing is done when status holds a failure on entry.
 * @param text   character iterator supplying the new text
 * @param status error code, set by the UnicodeString overload of setText
 */
void SearchIterator::setText(CharacterIterator &text, UErrorCode &status)
{
    if (U_SUCCESS(status)) {
        // extract into a temporary first. writing straight into m_text_
        // would destroy the current text before the new one is validated,
        // leaving m_search_->text pointing at the old buffer when the new
        // text is rejected.
        UnicodeString extracted;
        text.getText(extracted);
        setText(extracted, status);
    }
}
/**
 * Gets the text being searched.
 * @return reference to the text kept by this search iterator
 */
const UnicodeString & SearchIterator::getText(void) const
{
    const UnicodeString &current = m_text_;
    return current;
}
// operator overloading ----------------------------------------------
/**
 * Equality operator.
 * Two search iterators are equal when they are the same object, or when
 * they have the same break iterator pointer, the same canonical match and
 * overlap settings, the same match index and length, the same text length,
 * the same offset and the same text contents.
 * @param that search iterator to compare with
 * @return TRUE if both iterators are equal, FALSE otherwise
 */
UBool SearchIterator::operator==(const SearchIterator &that) const
{
    if (this == &that) {
        return TRUE;
    }
    if (m_breakiterator_ != that.m_breakiterator_) {
        return FALSE;
    }
    const USearch *mine   = m_search_;
    const USearch *theirs = that.m_search_;
    if (mine->isCanonicalMatch != theirs->isCanonicalMatch) {
        return FALSE;
    }
    if (mine->isOverlap != theirs->isOverlap) {
        return FALSE;
    }
    if (mine->matchedIndex != theirs->matchedIndex) {
        return FALSE;
    }
    if (mine->matchedLength != theirs->matchedLength) {
        return FALSE;
    }
    if (mine->textLength != theirs->textLength) {
        return FALSE;
    }
    if (getOffset() != that.getOffset()) {
        return FALSE;
    }
    // the lengths are known to be equal at this point
    return (uprv_memcmp(mine->text, theirs->text,
                        mine->textLength * sizeof(UChar)) == 0);
}
// public methods ----------------------------------------------------
/**
 * Moves the offset to 0 and searches forwards from there using
 * handleNext().
 * @param status error code; when it holds a failure on entry nothing is
 *               done, the same way next() and previous() behave
 * @return the result of handleNext(), or USEARCH_DONE when status holds a
 *         failure on entry
 */
UTextOffset SearchIterator::first(UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return USEARCH_DONE;
    }
    setOffset(0, status);
    return handleNext(0, status);
}
/**
 * Moves the offset to position and searches forwards from there using
 * handleNext().
 * @param position offset to start searching from
 * @param status   error code; when it holds a failure on entry nothing is
 *                 done, the same way next() and previous() behave
 * @return the result of handleNext(), or USEARCH_DONE when status holds a
 *         failure on entry
 */
UTextOffset SearchIterator::following(UTextOffset position,
                                      UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return USEARCH_DONE;
    }
    setOffset(position, status);
    return handleNext(position, status);
}
/**
 * Moves the offset to the cached text length and searches backwards from
 * there using handlePrev().
 * @param status error code; when it holds a failure on entry nothing is
 *               done, the same way next() and previous() behave
 * @return the result of handlePrev(), or USEARCH_DONE when status holds a
 *         failure on entry
 */
UTextOffset SearchIterator::last(UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return USEARCH_DONE;
    }
    setOffset(m_search_->textLength, status);
    return handlePrev(m_search_->textLength, status);
}
/**
 * Moves the offset to position and searches backwards from there using
 * handlePrev().
 * @param position offset to start searching from
 * @param status   error code; when it holds a failure on entry nothing is
 *                 done, the same way next() and previous() behave
 * @return the result of handlePrev(), or USEARCH_DONE when status holds a
 *         failure on entry
 */
UTextOffset SearchIterator::preceding(UTextOffset position,
                                      UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return USEARCH_DONE;
    }
    setOffset(position, status);
    return handlePrev(position, status);
}
/**
 * Searches forwards for the next match.
 * @param status error code; nothing is done when it holds a failure on
 *               entry
 * @return USEARCH_DONE when status holds a failure or when the end of the
 *         text has been reached, the previous match index when the
 *         direction has just been switched and a match exists, otherwise
 *         the result of handleNext()
 */
UTextOffset SearchIterator::next(UErrorCode &status)
{
    if (!U_SUCCESS(status)) {
        return USEARCH_DONE;
    }

    const UTextOffset currentOffset   = getOffset();
    const UTextOffset lastMatchStart  = m_search_->matchedIndex;
    const int32_t     lastMatchLength = m_search_->matchedLength;
    m_search_->reset = FALSE;

    if (m_search_->isForwardSearching != TRUE) {
        // switching direction.
        // a matched index of USEARCH_DONE means that either setOffset has
        // been called or that previous ran off the text string. the
        // iterator would have been set to offset 0 if a match is not found.
        m_search_->isForwardSearching = TRUE;
        if (lastMatchStart != USEARCH_DONE) {
            // there's no need to set the collation element iterator,
            // the next call to next will set the offset.
            return lastMatchStart;
        }
    }
    else {
        const int32_t textEnd = m_search_->textLength;
        const UBool   atEnd   = (currentOffset == textEnd ||
                                 lastMatchStart == textEnd);
        if (atEnd || (lastMatchStart != USEARCH_DONE &&
                      lastMatchStart + lastMatchLength >= textEnd)) {
            // not enough characters to match
            setMatchNotFound();
            return USEARCH_DONE;
        }
    }

    // continue right after the previous match if there is one, otherwise
    // start at the current offset
    const UTextOffset startFrom = (lastMatchStart != USEARCH_DONE)
                                  ? lastMatchStart + lastMatchLength
                                  : currentOffset;
    return handleNext(startFrom, status);
}
/**
 * Searches backwards for the previous match.
 * @param status error code; nothing is done when it holds a failure on
 *               entry
 * @return USEARCH_DONE when status holds a failure or when the start of the
 *         text has been reached, the previous match index when the
 *         direction has just been switched and a match exists, otherwise
 *         the result of handlePrev()
 */
UTextOffset SearchIterator::previous(UErrorCode &status)
{
    if (!U_SUCCESS(status)) {
        return USEARCH_DONE;
    }

    UTextOffset startOffset;
    if (!m_search_->reset) {
        startOffset = getOffset();
    }
    else {
        // first move after a reset, start at the end of the text
        startOffset = m_search_->textLength;
        m_search_->isForwardSearching = FALSE;
        m_search_->reset = FALSE;
    }

    const UTextOffset lastMatchStart = m_search_->matchedIndex;
    if (m_search_->isForwardSearching == TRUE) {
        // switching direction.
        // a matched index of USEARCH_DONE means that either setOffset has
        // been called or that next ran off the text string. the iterator
        // would have been set to offset textLength if a match is not found.
        m_search_->isForwardSearching = FALSE;
        if (lastMatchStart != USEARCH_DONE) {
            return lastMatchStart;
        }
    }
    else if (startOffset == 0 || lastMatchStart == 0) {
        // not enough characters to match
        setMatchNotFound();
        return USEARCH_DONE;
    }

    // continue from the previous match if there is one, otherwise start at
    // the offset determined above
    const UTextOffset startFrom = (lastMatchStart != USEARCH_DONE)
                                  ? lastMatchStart
                                  : startOffset;
    return handlePrev(startFrom, status);
}
void SearchIterator::reset()
{
setMatchNotFound();
m_search_->isOverlap = FALSE;
m_search_->isCanonicalMatch = FALSE;
m_search_->isForwardSearching = TRUE;
m_search_->reset = TRUE;
}
// protected constructors and destructors -----------------------------
/**
 * Default constructor.
 * Creates a search iterator without break iterator and without text. The
 * search state starts forward searching, freshly reset, with no match and
 * with the overlap and canonical match attributes switched off.
 */
SearchIterator::SearchIterator() : m_breakiterator_(NULL)
{
    USearch *state = (USearch *)uprv_malloc(sizeof(USearch));
    // no text yet
    state->text               = NULL;
    state->textLength         = 0;
    // no match yet
    state->matchedIndex       = USEARCH_DONE;
    state->matchedLength      = 0;
    // default attributes and direction
    state->breakIter          = NULL;
    state->isOverlap          = FALSE;
    state->isCanonicalMatch   = FALSE;
    state->isForwardSearching = TRUE;
    state->reset              = TRUE;
    m_search_ = state;
}
/**
 * Constructor taking the text to search in as a string.
 * The text is copied into this object and the search state refers to the
 * buffer of that copy. The search state starts forward searching, freshly
 * reset, with no match and with the overlap and canonical match attributes
 * switched off.
 * @param text      text to search in
 * @param breakiter break iterator to keep; only the pointer is stored
 */
SearchIterator::SearchIterator(const UnicodeString &text,
                               BreakIterator *breakiter) :
                               m_breakiterator_(breakiter),
                               m_text_(text)
{
    USearch *state = (USearch *)uprv_malloc(sizeof(USearch));
    // text, taken from our own copy
    state->text               = m_text_.fArray;
    state->textLength         = text.length();
    // no match yet
    state->matchedIndex       = USEARCH_DONE;
    state->matchedLength      = 0;
    // default attributes and direction
    state->breakIter          = NULL;
    state->isOverlap          = FALSE;
    state->isCanonicalMatch   = FALSE;
    state->isForwardSearching = TRUE;
    state->reset              = TRUE;
    m_search_ = state;
}
/**
 * Constructor taking the text to search in from a character iterator.
 * The text of the iterator is extracted into this object and the search
 * state refers to the buffer of that copy. The search state starts forward
 * searching, freshly reset, with no match and with the overlap and
 * canonical match attributes switched off.
 * @param text      character iterator supplying the text to search in
 * @param breakiter break iterator to keep; only the pointer is stored
 */
SearchIterator::SearchIterator(CharacterIterator &text,
                               BreakIterator *breakiter) :
                               m_breakiterator_(breakiter)
{
    // the text has to be extracted before its buffer and length are taken
    text.getText(m_text_);

    USearch *state = (USearch *)uprv_malloc(sizeof(USearch));
    state->text               = m_text_.fArray;
    state->textLength         = m_text_.length();
    // no match yet
    state->matchedIndex       = USEARCH_DONE;
    state->matchedLength      = 0;
    // default attributes and direction
    state->breakIter          = NULL;
    state->isOverlap          = FALSE;
    state->isCanonicalMatch   = FALSE;
    state->isForwardSearching = TRUE;
    state->reset              = TRUE;
    m_search_ = state;
}
// protected methods ------------------------------------------------------
/**
 * Records the length of the current match in the search state.
 * @param length new match length
 */
void SearchIterator::setMatchLength(int32_t length)
{
    USearch *state = m_search_;
    state->matchedLength = length;
}
/**
 * Records the start offset of the current match in the search state.
 * @param position new match start offset
 */
void SearchIterator::setMatchStart(UTextOffset position)
{
    USearch *state = m_search_;
    state->matchedIndex = position;
}
void SearchIterator::setMatchNotFound()
{
setMatchStart(USEARCH_DONE);
setMatchLength(0);
UErrorCode status = U_ZERO_ERROR;
// by default no errors should be returned here since offsets are within
// range.
if (m_search_->isForwardSearching) {
setOffset(m_search_->textLength, status);
}
else {
setOffset(0, status);
}
}