From dbadbd711adb5aa63af5af8509bdeee2e77317c4 Mon Sep 17 00:00:00 2001
From: Andy Heninger
Date: Thu, 15 May 2008 04:54:19 +0000
Subject: [PATCH] ICU-4009 Port Any-BreakInternal transliterator from Java

X-SVN-Rev: 23902
---
 icu4c/source/i18n/brktrans.cpp | 185 +++++++++++++++++++++++++++++++++
 icu4c/source/i18n/brktrans.h   | 113 ++++++++++++++++++++
 2 files changed, 298 insertions(+)
 create mode 100644 icu4c/source/i18n/brktrans.cpp
 create mode 100644 icu4c/source/i18n/brktrans.h

diff --git a/icu4c/source/i18n/brktrans.cpp b/icu4c/source/i18n/brktrans.cpp
new file mode 100644
index 0000000000..65f025089a
--- /dev/null
+++ b/icu4c/source/i18n/brktrans.cpp
@@ -0,0 +1,185 @@
+/*
+**********************************************************************
+* Copyright (C) 2008, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+* Date Name Description
+* 05/11/2008 Andy Heninger Port from Java
+**********************************************************************
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_TRANSLITERATION
+
+#include "unicode/unifilt.h"
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+#include "unicode/brkiter.h"
+#include "brktrans.h"
+#include "unicode/uchar.h"
+#include "cmemory.h"
+#include "uprops.h"
+#include "uinvchar.h"
+#include "util.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
+
+static const UChar SPACE = 32;  // ' '
+
+
+/**
+ * Constructs a transliterator whose insertion string defaults to a single
+ * space (SPACE); the break iterator is created lazily by getBreakIterator().
+ */
+BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
+    Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
+    bi(NULL), fInsertion(SPACE) {
+        UErrorCode status = U_ZERO_ERROR;  // not inspected after the allocation below
+        boundaries = new UVector32(status);
+    }
+
+
+/**
+ * Destructor.
+ */
+BreakTransliterator::~BreakTransliterator() {
+    delete bi;
+    bi = NULL;
+    delete boundaries;
+    boundaries = NULL;
+}
+
+/**
+ * Copy constructor.
+ */
+BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
+        Transliterator(o) {
+    bi = NULL;
+    if (o.bi != NULL) {
+        bi = o.bi->clone();
+    }
+    fInsertion = o.fInsertion;
+    UErrorCode status = U_ZERO_ERROR;
+    boundaries = new UVector32(status);  // fresh scratch vector; o.boundaries is not copied
+}
+
+
+/**
+ * Transliterator API.
+ */
+Transliterator* BreakTransliterator::clone(void) const {
+    return new BreakTransliterator(*this);
+}
+
+/**
+ * Implements {@link Transliterator#handleTransliterate}.
+ * Inserts fInsertion at word boundaries that lie between letters/marks.
+ */
+void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
+                                                    UBool isIncremental ) const {
+        UErrorCode status = U_ZERO_ERROR;
+        BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
+        nonConstThis->getBreakIterator(); // Lazy-create it if necessary
+        if (bi == NULL || boundaries == NULL) {
+            // Allocation or break iterator creation failed: insert nothing.
+            if (!isIncremental) offsets.start = offsets.limit;
+            return;
+        }
+        boundaries->removeAllElements();
+        UnicodeString sText = replaceableAsString(text);
+        bi->setText(sText);
+        bi->preceding(offsets.start);
+
+        // To make things much easier, we will stack the boundaries, and then insert at the end.
+        // generally, we won't need too many, since we will be filtered.
+
+        int32_t boundary;
+        for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
+            if (boundary == 0) continue;
+            // HACK: Check to see that preceding item was a letter
+            UChar32 cp = sText.char32At(boundary-1);
+            int type = u_charType(cp);
+            if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
+
+            cp = sText.char32At(boundary);
+            type = u_charType(cp);
+            if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
+
+            boundaries->addElement(boundary, status);
+        }
+
+        int32_t delta = 0;
+        int32_t lastBoundary = 0;
+
+        if (boundaries->size() != 0) { // if we found something, adjust
+            delta = boundaries->size() * fInsertion.length();
+            lastBoundary = boundaries->lastElementi();
+
+            // we do this from the end backwards, so that we don't have to keep updating.
+            while (boundaries->size() > 0) {
+                boundary = boundaries->popi();
+                text.handleReplaceBetween(boundary, boundary, fInsertion);
+            }
+        }
+
+        // Now fix up the return values
+        offsets.contextLimit += delta;
+        offsets.limit += delta;
+        offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
+
+        // TODO: do something with U_FAILURE(status);
+        // (need to look at transliterators overall, not just here.)
+}
+
+//
+//  getInsertion()
+//
+const UnicodeString &BreakTransliterator::getInsertion() const {
+    return fInsertion;
+}
+
+//
+//  setInsertion()
+//
+void BreakTransliterator::setInsertsion(const UnicodeString &insertion) {
+    this->fInsertion = insertion;
+}
+
+//
+//  getBreakIterator     Lazily create the break iterator if it does
+//                       not already exist.  Copied from Java, probably
+//                       better to just create it in the constructor.
+//
+BreakIterator *BreakTransliterator::getBreakIterator() {
+    UErrorCode status = U_ZERO_ERROR;
+    if (bi == NULL) {
+        // Note:  Thai breaking behavior is universal, it is not
+        //        tied to the Thai locale.
+        bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
+    }
+    return bi;  // status is not reported; callers must cope with a NULL result
+}
+
+//
+//   replaceableAsString   Hack to let break iterators work
+//                         on the replaceable text from transliterators.
+//                         In practice, the only real Replaceable type that we
+//                         will be seeing is UnicodeString, so this function
+//                         will normally be efficient.
+//
+UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
+    if (r.getDynamicClassID() == UnicodeString::getStaticClassID()) {
+        return static_cast<UnicodeString &>(r);
+    }
+    UnicodeString s;
+    r.extractBetween(0, r.length(), s);
+    return s;
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_TRANSLITERATION */
diff --git a/icu4c/source/i18n/brktrans.h b/icu4c/source/i18n/brktrans.h
new file mode 100644
index 0000000000..b5db373ce0
--- /dev/null
+++ b/icu4c/source/i18n/brktrans.h
@@ -0,0 +1,113 @@
+/*
+**********************************************************************
+* Copyright (C) 2008, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+* Date Name Description
+* 05/11/2008 Andy Heninger Ported from Java
+**********************************************************************
+*/
+#ifndef BRKTRANS_H
+#define BRKTRANS_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_TRANSLITERATION
+
+#include "unicode/translit.h"
+
+U_NAMESPACE_BEGIN
+
+class BreakIterator;
+class UVector32;
+
+/**
+ * A transliterator that inserts the specified characters at word breaks.
+ * To restrict it to particular characters, use a filter.
+ * TODO: this is an internal class, and only temporary.
+ * Remove it once we have \b notation in Transliterator.
+ */
+class BreakTransliterator : public Transliterator {
+public:
+    /** NOTE(review): no definition of this constructor appears in brktrans.cpp. */
+    BreakTransliterator(const UnicodeString &ID,
+                        UnicodeFilter *adoptedFilter,
+                        BreakIterator *bi,
+                        const UnicodeString &insertion);
+    /**
+     * Constructs a transliterator.
+     * @param adoptedFilter    the filter for this transliterator.
+     */
+    BreakTransliterator(UnicodeFilter* adoptedFilter = 0);
+
+    /**
+     * Destructor.
+     */
+    virtual ~BreakTransliterator();
+
+    /**
+     * Copy constructor.
+     */
+    BreakTransliterator(const BreakTransliterator&);
+
+    /**
+     * Transliterator API.
+     * @return    A copy of the object.
+     */
+    virtual Transliterator* clone(void) const;
+
+    /** Returns the string that handleTransliterate() inserts at the selected word breaks. */
+    virtual const UnicodeString &getInsertion() const;
+    /** Stores a copy of insertion as the string to insert (name spelled as declared). */
+    virtual void setInsertsion(const UnicodeString &insertion);
+
+    /**
+     * Return the break iterator used by this transliterator.
+     * Caution, this is the live break iterator; it must not be used while
+     * there is any possibility that this transliterator is using it.
+     */
+    virtual BreakIterator *getBreakIterator();
+
+    /**
+     * ICU "poor man's RTTI", returns a UClassID for the actual class.
+     */
+    virtual UClassID getDynamicClassID() const;
+
+    /**
+     * ICU "poor man's RTTI", returns a UClassID for this class.
+     */
+    U_I18N_API static UClassID U_EXPORT2 getStaticClassID();
+
+ protected:
+
+    /**
+     * Implements {@link Transliterator#handleTransliterate}.
+     * @param text          the buffer holding transliterated and
+     *                      untransliterated text
+     * @param offset        the start and limit of the text, the position
+     *                      of the cursor, and the start and limit of transliteration.
+     * @param isIncremental if true, assume more text may be coming after
+     *                      offset.contextLimit. Otherwise, assume the text is complete.
+     */
+    virtual void handleTransliterate(Replaceable& text, UTransPosition& offset,
+                                     UBool isIncremental) const;
+
+ private:
+    BreakIterator *bi;          // created lazily by getBreakIterator(); deleted by the destructor
+    UnicodeString fInsertion;   // string inserted at each selected boundary
+    UVector32 *boundaries;      // scratch stack of boundary offsets for handleTransliterate()
+    UnicodeString sText;  // NOTE(review): never assigned; handleTransliterate() uses a local sText.
+
+    static UnicodeString replaceableAsString(Replaceable &r);
+
+    /**
+     * Assignment operator; private, and no definition appears in brktrans.cpp.
+     */
+    BreakTransliterator& operator=(const BreakTransliterator&);
+};
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_TRANSLITERATION */
+
+#endif