ICU-4009 Port Any-BreakInternal transliterator from Java
X-SVN-Rev: 23902
This commit is contained in:
parent
eecf0b1bae
commit
dbadbd711a
185
icu4c/source/i18n/brktrans.cpp
Normal file
185
icu4c/source/i18n/brktrans.cpp
Normal file
@ -0,0 +1,185 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2008, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 05/11/2008 Andy Heninger Port from Java
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_TRANSLITERATION
|
||||
|
||||
#include "unicode/unifilt.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "brktrans.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "cmemory.h"
|
||||
#include "uprops.h"
|
||||
#include "uinvchar.h"
|
||||
#include "util.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)

// U+0020 SPACE
static const UChar SPACE = 0x0020;
|
||||
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default delimiters '{' and
|
||||
* '}'.
|
||||
*/
|
||||
BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter) {
|
||||
bi = NULL;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
boundaries = new UVector32(status);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
BreakTransliterator::~BreakTransliterator() {
|
||||
delete bi;
|
||||
bi = NULL;
|
||||
delete boundaries;
|
||||
boundaries = NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
|
||||
Transliterator(o) {
|
||||
bi = NULL;
|
||||
if (o.bi != NULL) {
|
||||
bi = o.bi->clone();
|
||||
}
|
||||
fInsertion = o.fInsertion;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
boundaries = new UVector32(status);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* BreakTransliterator::clone(void) const {
|
||||
return new BreakTransliterator(*this);
|
||||
};
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
|
||||
UBool isIncremental ) const {
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
boundaries->removeAllElements();
|
||||
BreakTransliterator *nonConstThis = (BreakTransliterator *)this;
|
||||
nonConstThis->getBreakIterator(); // Lazy-create it if necessary
|
||||
UnicodeString sText = replaceableAsString(text);
|
||||
bi->setText(sText);
|
||||
bi->preceding(offsets.start);
|
||||
|
||||
// To make things much easier, we will stack the boundaries, and then insert at the end.
|
||||
// generally, we won't need too many, since we will be filtered.
|
||||
|
||||
int32_t boundary;
|
||||
for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
|
||||
if (boundary == 0) continue;
|
||||
// HACK: Check to see that preceeding item was a letter
|
||||
|
||||
UChar32 cp = sText.char32At(boundary-1);
|
||||
int type = u_charType(cp);
|
||||
//System.out.println(Integer.toString(cp,16) + " (before): " + type);
|
||||
if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
|
||||
|
||||
cp = sText.char32At(boundary);
|
||||
type = u_charType(cp);
|
||||
//System.out.println(Integer.toString(cp,16) + " (after): " + type);
|
||||
if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
|
||||
|
||||
boundaries->addElement(boundary, status);
|
||||
//System.out.println(boundary);
|
||||
}
|
||||
|
||||
int delta = 0;
|
||||
int lastBoundary = 0;
|
||||
|
||||
if (boundaries->size() != 0) { // if we found something, adjust
|
||||
delta = boundaries->size() * fInsertion.length();
|
||||
lastBoundary = boundaries->lastElementi();
|
||||
|
||||
// we do this from the end backwards, so that we don't have to keep updating.
|
||||
|
||||
while (boundaries->size() > 0) {
|
||||
boundary = boundaries->popi();
|
||||
text.handleReplaceBetween(boundary, boundary, fInsertion);
|
||||
}
|
||||
}
|
||||
|
||||
// Now fix up the return values
|
||||
offsets.contextLimit += delta;
|
||||
offsets.limit += delta;
|
||||
offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
|
||||
|
||||
// TODO: do something with U_FAILURE(status);
|
||||
// (need to look at transliterators overall, not just here.)
|
||||
}
|
||||
|
||||
//
|
||||
// getInsertion()
|
||||
//
|
||||
const UnicodeString &BreakTransliterator::getInsertion() const {
|
||||
return fInsertion;
|
||||
}
|
||||
|
||||
//
|
||||
// setInsertion()
|
||||
//
|
||||
void BreakTransliterator::setInsertsion(const UnicodeString &insertion) {
|
||||
this->fInsertion = insertion;
|
||||
}
|
||||
|
||||
//
|
||||
// getBreakIterator Lazily create the break iterator if it does
|
||||
// not already exist. Copied from Java, probably
|
||||
// better to just create it in the constructor.
|
||||
//
|
||||
BreakIterator *BreakTransliterator::getBreakIterator() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
if (bi == NULL) {
|
||||
// Note: Thai breaking behavior is universal, it is not
|
||||
// tied to the Thai locale.
|
||||
bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
|
||||
}
|
||||
return bi;
|
||||
}
|
||||
|
||||
//
|
||||
// replaceableAsString Hack to let break iterators work
|
||||
// on the replaceable text from transliterators.
|
||||
// In practice, the only real Replaceable type that we
|
||||
// will be seeing is UnicodeString, so this function
|
||||
// will normally be efficient.
|
||||
//
|
||||
UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
|
||||
if (r.getDynamicClassID() == UnicodeString::getStaticClassID()) {
|
||||
return (UnicodeString &) r;
|
||||
}
|
||||
UnicodeString s;
|
||||
r.extractBetween(0, r.length(), s);
|
||||
return s;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
|
113
icu4c/source/i18n/brktrans.h
Normal file
113
icu4c/source/i18n/brktrans.h
Normal file
@ -0,0 +1,113 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2008, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 05/11/2008 Andy Heninger Ported from Java
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef BRKTRANS_H
|
||||
#define BRKTRANS_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_TRANSLITERATION
|
||||
|
||||
#include "unicode/translit.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UVector32;
|
||||
|
||||
/**
|
||||
* A transliterator that pInserts the specified characters at word breaks.
|
||||
* To restrict it to particular characters, use a filter.
|
||||
* TODO: this is an internal class, and only temporary.
|
||||
* Remove it once we have \b notation in Transliterator.
|
||||
*/
|
||||
class BreakTransliterator : public Transliterator {
|
||||
public:
|
||||
|
||||
BreakTransliterator(const UnicodeString &ID,
|
||||
UnicodeFilter *adoptedFilter,
|
||||
BreakIterator *bi,
|
||||
const UnicodeString &insertion);
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
* @param adoptedFilter the filter for this transliterator.
|
||||
*/
|
||||
BreakTransliterator(UnicodeFilter* adoptedFilter = 0);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~BreakTransliterator();
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
BreakTransliterator(const BreakTransliterator&);
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
* @return A copy of the object.
|
||||
*/
|
||||
virtual Transliterator* clone(void) const;
|
||||
|
||||
virtual const UnicodeString &getInsertion() const;
|
||||
|
||||
virtual void setInsertsion(const UnicodeString &insertion);
|
||||
|
||||
/**
|
||||
* Return the break iterator used by this transliterator.
|
||||
* Caution, this is the live break iterator; it must not be used while
|
||||
* there is any possibility that this transliterator is using it.
|
||||
*/
|
||||
virtual BreakIterator *getBreakIterator();
|
||||
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
*/
|
||||
U_I18N_API static UClassID U_EXPORT2 getStaticClassID();
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param offset the start and limit of the text, the position
|
||||
* of the cursor, and the start and limit of transliteration.
|
||||
* @param incremental if true, assume more text may be coming after
|
||||
* pos.contextLimit. Otherwise, assume the text is complete.
|
||||
*/
|
||||
virtual void handleTransliterate(Replaceable& text, UTransPosition& offset,
|
||||
UBool isIncremental) const;
|
||||
|
||||
private:
|
||||
BreakIterator *bi;
|
||||
UnicodeString fInsertion;
|
||||
UVector32 *boundaries;
|
||||
UnicodeString sText; // text from handleTransliterate().
|
||||
|
||||
static UnicodeString replaceableAsString(Replaceable &r);
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
BreakTransliterator& operator=(const BreakTransliterator&);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user