ICU-4009 Port Any-BreakInternal transliterator from Java

X-SVN-Rev: 23902
2008-05-15 04:54:19 +00:00 · 2008-05-15 04:54:19 +00:00 · dbadbd711a
commit dbadbd711a
parent eecf0b1bae
2 changed files with 298 additions and 0 deletions
--- a/icu4c/source/i18n/brktrans.cpp
+++ b/icu4c/source/i18n/brktrans.cpp
@ -0,0 +1,185 @@
+/*
+**********************************************************************
+*   Copyright (C) 2008, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   05/11/2008  Andy Heninger  Port from Java
+**********************************************************************
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_TRANSLITERATION
+
+#include "unicode/unifilt.h"
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+#include "unicode/brkiter.h"
+#include "brktrans.h"
+#include "unicode/uchar.h"
+#include "cmemory.h"
+#include "uprops.h"
+#include "uinvchar.h"
+#include "util.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+// ICU RTTI boilerplate: defines getStaticClassID() and getDynamicClassID().
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
+
+static const UChar SPACE       = 32;  // ' ' (U+0020 SPACE)
+
+/**
+ * Constructs "Any-BreakInternal" with a single space as the default insertion;
+ * the break iterator is created lazily. adoptedFilter goes to the base class.
+ */
+BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
+    Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
+    bi(NULL), fInsertion(SPACE), boundaries(NULL) {
+    UErrorCode status = U_ZERO_ERROR;   // not inspected: a constructor has no way to report it
+    boundaries = new UVector32(status);
+}
+
+
+/**
+ * Destructor. Releases the owned break iterator and boundary stack.
+ */
+BreakTransliterator::~BreakTransliterator() {
+    delete boundaries;
+    boundaries = NULL;
+    delete bi;
+    bi = NULL;
+}
+
+/**
+ * Copy constructor. Clones the source's break iterator when it has one,
+ * copies the insertion string, and allocates a new boundary stack of its
+ * own (the source's stack contents are not copied).
+ */
+BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
+    Transliterator(o),
+    bi(o.bi != NULL ? o.bi->clone() : NULL),
+    fInsertion(o.fInsertion),
+    boundaries(NULL) {
+    UErrorCode ec = U_ZERO_ERROR;
+    boundaries = new UVector32(ec);
+}
+
+
+/**
+ * Transliterator API: returns a new copy of this object made by the copy constructor.
+ */
+Transliterator* BreakTransliterator::clone(void) const {
+    return new BreakTransliterator(*this);
+}
+
+/**
+ * Implements {@link Transliterator#handleTransliterate}.
+ * Inserts fInsertion at every word boundary found between the boundary preceding
+ * offsets.start and offsets.limit (exclusive) that has a letter or mark on both
+ * sides, then moves offsets.limit/contextLimit/start past the inserted text.
+ * Although const, this mutates the break iterator and boundary stack owned by
+ * the object, so concurrent calls on one object look unsafe.
+ */
+void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
+                                                    UBool isIncremental ) const {
+        UErrorCode status = U_ZERO_ERROR;
+        int32_t delta = 0;          // combined length of everything inserted
+        int32_t lastBoundary = 0;   // last (largest) boundary that was stacked
+
+        // Lazy-create the break iterator if necessary; that needs a non-const this.
+        const_cast<BreakTransliterator *>(this)->getBreakIterator();
+
+        // If the iterator or the boundary stack could not be created there is nothing
+        // to insert: skip the scan rather than dereference NULL, and fall through
+        // to the offset fix-up exactly as if no boundary had been found.
+        if (bi != NULL && boundaries != NULL) {
+            boundaries->removeAllElements();
+            UnicodeString sText = replaceableAsString(text);
+            bi->setText(sText);
+            bi->preceding(offsets.start);
+
+            // To make things much easier, we will stack the boundaries, and then insert at the end.
+            // generally, we won't need too many, since we will be filtered.
+            int32_t boundary;
+            for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
+                if (boundary == 0) continue;   // no preceding character to inspect
+                // HACK: Check to see that the preceding item was a letter or mark
+                UChar32 cp = sText.char32At(boundary-1);
+                if ((U_MASK(u_charType(cp)) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
+                // ... and that the following item is one as well
+                cp = sText.char32At(boundary);
+                if ((U_MASK(u_charType(cp)) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
+                boundaries->addElement(boundary, status);
+            }
+
+            if (boundaries->size() != 0) { // if we found something, adjust
+                delta = boundaries->size() * fInsertion.length();
+                lastBoundary = boundaries->lastElementi();
+                // we do this from the end backwards, so that we don't have to keep updating.
+                while (boundaries->size() > 0) {
+                    boundary = boundaries->popi();
+                    text.handleReplaceBetween(boundary, boundary, fInsertion);
+                }
+            }
+        }
+
+        // Now fix up the return values
+        offsets.contextLimit += delta;
+        offsets.limit += delta;
+        // NOTE(review): with isIncremental and no insertions this sets start to 0 - confirm intended.
+        offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
+        // TODO:  do something with U_FAILURE(status);
+        //        (need to look at transliterators overall, not just here.)
+}
+
+//
+//  getInsertion()    Read-only access to the string inserted at each break.
+//
+const UnicodeString &BreakTransliterator::getInsertion() const {
+    return this->fInsertion;
+}
+
+//
+//  setInsertsion()   Sets the string to insert (name spelled as declared in brktrans.h).
+//
+void BreakTransliterator::setInsertsion(const UnicodeString &insertion) {
+    fInsertion = insertion;
+}
+
+//
+//  getBreakIterator     Lazily create the word break iterator (idea copied
+//                       from Java; creating it in the constructor is probably
+//                       better).  Returns NULL if creation fails.
+//
+BreakIterator *BreakTransliterator::getBreakIterator() {
+    if (bi == NULL) {
+        UErrorCode status = U_ZERO_ERROR;
+        // Note:  Thai breaking behavior is universal, it is not tied to the Thai locale.
+        bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
+        if (U_FAILURE(status)) { delete bi; bi = NULL; }   // never keep a half-built iterator
+    }
+    return bi;
+}
+
+//
+//   replaceableAsString   Hack to let break iterators work on the replaceable
+//                         text from transliterators: hands back the text as a
+//                         UnicodeString.  In practice the text is expected to
+//                         be a UnicodeString already, which is copied directly;
+//                         any other Replaceable is extracted over its whole length.
+//
+UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
+    if (r.getDynamicClassID() != UnicodeString::getStaticClassID()) {
+        UnicodeString extracted;
+        r.extractBetween(0, r.length(), extracted);
+        return extracted;
+    }
+    return static_cast<UnicodeString &>(r);
+}
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_TRANSLITERATION */
--- a/icu4c/source/i18n/brktrans.h
+++ b/icu4c/source/i18n/brktrans.h
@ -0,0 +1,113 @@
+/*
+**********************************************************************
+*   Copyright (C) 2008, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   05/11/2008  Andy Heninger  Ported from Java
+**********************************************************************
+*/
+#ifndef BRKTRANS_H
+#define BRKTRANS_H
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_TRANSLITERATION
+
+#include "unicode/translit.h"
+
+
+U_NAMESPACE_BEGIN
+
+class BreakIterator;
+class UVector32;
+
+/**
+ * A transliterator that inserts the specified characters at word breaks.
+ * Only breaks with a letter or mark on both sides receive the insertion.
+ * To restrict it to particular characters, use a filter.
+ * TODO: this is an internal class, and only temporary.
+ * Remove it once we have \b notation in Transliterator.
+ */
+class BreakTransliterator : public Transliterator {
+public:
+    /** NOTE(review): declared here but not defined in brktrans.cpp; using it will not link. */
+    BreakTransliterator(const UnicodeString &ID,
+                        UnicodeFilter *adoptedFilter,
+                        BreakIterator *bi,
+                        const UnicodeString &insertion);
+    /**
+     * Constructs a transliterator.
+     * @param adoptedFilter    the filter for this transliterator.
+     */
+    BreakTransliterator(UnicodeFilter* adoptedFilter = 0);
+
+    /** Destructor. */
+    virtual ~BreakTransliterator();
+
+    /** Copy constructor. */
+    BreakTransliterator(const BreakTransliterator&);
+
+    /**
+     * Transliterator API.
+     * @return    A copy of the object.
+     */
+    virtual Transliterator* clone(void) const;
+
+    /** Returns the string that is inserted at each qualifying break. */
+    virtual const UnicodeString &getInsertion() const;
+
+    /** Sets the string to insert. (Name is misspelled; it matches the definition in brktrans.cpp.) */
+    virtual void setInsertsion(const UnicodeString &insertion);
+
+    /**
+      *  Return the break iterator used by this transliterator.
+      *  Caution, this is the live break iterator; it must not be used while
+      *     there is any possibility that this transliterator is using it.
+      */
+    virtual BreakIterator *getBreakIterator();
+
+    /**
+     * ICU "poor man's RTTI", returns a UClassID for the actual class.
+     */
+    virtual UClassID getDynamicClassID() const;
+
+    /**
+     * ICU "poor man's RTTI", returns a UClassID for this class.
+     */
+    U_I18N_API static UClassID U_EXPORT2 getStaticClassID();
+
+ protected:
+
+    /**
+     * Implements {@link Transliterator#handleTransliterate}.
+     * @param text          the buffer holding transliterated and
+     *                      untransliterated text
+     * @param offset        the start and limit of the text, the position
+     *                      of the cursor, and the start and limit of transliteration.
+     * @param isIncremental if true, assume more text may be coming after
+     *                      offset.contextLimit. Otherwise, assume the text is complete.
+     */
+    virtual void handleTransliterate(Replaceable& text, UTransPosition& offset,
+                                     UBool isIncremental) const;
+
+ private:
+     BreakIterator     *bi;          // owned; created lazily by getBreakIterator()
+     UnicodeString      fInsertion;  // text inserted at each qualifying boundary
+     UVector32         *boundaries;  // owned; scratch stack used by handleTransliterate()
+     UnicodeString      sText;  // NOTE(review): unused in brktrans.cpp, which uses a local of this name
+
+     /** Returns the content of r as a UnicodeString. */
+     static UnicodeString replaceableAsString(Replaceable &r);
+
+    /**
+     * Assignment operator. Private and not defined in brktrans.cpp, so objects cannot be assigned.
+     */
+    BreakTransliterator& operator=(const BreakTransliterator&);
+};
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_TRANSLITERATION */
+
+#endif