scuffed-code/icu4c/source/i18n/titletrn.cpp

/*
**********************************************************************
*   Copyright (C) 2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   05/24/01    aliu        Creation.
**********************************************************************
*/

#include "unicode/uchar.h"
#include "titletrn.h"
#include "unicode/uniset.h"
#include "mutex.h"
#include "ucln_in.h"
#include "unicode/ustring.h"
#include "ustr_imp.h"
#include "cpputils.h"

U_NAMESPACE_BEGIN

/**
 * ID for this transliterator ("Any-Title").  The locale-taking
 * constructor in this file passes it to the Transliterator base class.
 */
const char TitlecaseTransliterator::_ID[] = "Any-Title";

/**
 * Mutex for statics IN THIS FILE.  It is held while SKIP and CASED
 * are lazily created in handleTransliterate() and is destroyed in
 * cleanup().
 */
static UMTX MUTEX = 0;

/**
 * The set of characters we skip.  These are neither cased nor
 * non-cased, to us; we copy them verbatim.  Created on first use in
 * handleTransliterate() and deleted in cleanup(); NULL until then.
 */
static UnicodeSet* SKIP = NULL;

/**
 * The set of characters that cause the next non-SKIP character
 * to be lowercased.  Created together with SKIP in
 * handleTransliterate() and deleted in cleanup(); NULL until then.
 */
static UnicodeSet* CASED = NULL;

/**
 * Constructs a titlecase transliterator for the given locale.
 * Allocates the scratch buffer used for case mappings, sized by
 * u_getMaxCaseExpansion().
 * @param theLoc the locale whose name is handed to the case-mapping
 * functions.
 */
TitlecaseTransliterator::TitlecaseTransliterator(const Locale& theLoc) :
    Transliterator(_ID, 0),
    loc(theLoc),
    buffer(new UChar[u_getMaxCaseExpansion()]) {
    // Two characters of preceding context are needed so that the
    // 't' in "can't" can see the 'n' before the apostrophe.
    setMaximumContextLength(2);
}

/**
 * Destructor.  Releases the case-mapping scratch buffer that the
 * constructors allocated with new[].
 */
TitlecaseTransliterator::~TitlecaseTransliterator()
{
    delete[] buffer;
}

/**
 * Copy constructor.  The new object gets its own scratch buffer of
 * u_getMaxCaseExpansion() UChars, filled with a copy of the source
 * object's buffer contents; the locale is copied as well.
 * @param o the transliterator to copy.
 */
TitlecaseTransliterator::TitlecaseTransliterator(const TitlecaseTransliterator& o) :
    Transliterator(o),
    loc(o.loc),
    buffer(new UChar[u_getMaxCaseExpansion()]) {
    const int32_t capacity = u_getMaxCaseExpansion();
    uprv_arrayCopy(o.buffer, 0, buffer, 0, capacity);
}

/**
 * Assignment operator.  Copies the base-class state, the locale and
 * the contents of the scratch buffer from o.  The buffer itself is
 * reused, not reallocated; both objects size it with
 * u_getMaxCaseExpansion().
 * @param o the transliterator to assign from.
 * @return a reference to this object.
 */
TitlecaseTransliterator& TitlecaseTransliterator::operator=(
                             const TitlecaseTransliterator& o) {
    // Self-assignment must be a no-op: copying the buffer onto itself
    // would hand identical source and destination ranges to
    // uprv_arrayCopy, which is not safe if that helper is built on a
    // non-overlapping copy such as memcpy.
    if (this != &o) {
        Transliterator::operator=(o);
        loc = o.loc;
        uprv_arrayCopy(o.buffer, 0, this->buffer, 0, u_getMaxCaseExpansion());
    }
    return *this;
}

/**
 * Transliterator API.  Produces a heap-allocated copy of this object
 * through the copy constructor.
 * @return the newly allocated copy.
 */
Transliterator* TitlecaseTransliterator::clone(void) const {
    Transliterator* duplicate = new TitlecaseTransliterator(*this);
    return duplicate;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Walks the code points in [offsets.start, offsets.limit).  Characters
 * in SKIP are left alone and do not change the mode.  Every other
 * character is titlecased when the mode is "title" and lowercased
 * otherwise; the mode for the following character becomes "lower" if
 * the current character is in CASED and "title" if it is not.
 *
 * @param text the text being modified in place.
 * @param offsets the positions to work on.  On a normal return
 * offsets.limit and offsets.contextLimit have been adjusted by any
 * change in length and offsets.start equals offsets.limit.  If
 * offsets.start >= offsets.limit on entry, or the static sets cannot
 * be created, nothing is modified.
 * @param isIncremental not used by this implementation.
 */
void TitlecaseTransliterator::handleTransliterate(
                                  Replaceable& text, UTransPosition& offsets,
                                  UBool isIncremental) const {
    if (SKIP == NULL) {
        Mutex lock(&MUTEX);
        if (SKIP == NULL) {
            UErrorCode ec = U_ZERO_ERROR;
            // Build both sets in locals first so that a half-built
            // pair is never visible through the file statics.
            UnicodeSet* skipSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00AD \\u2019 \\' [:Mn:] [:Me:] [:Cf:] [:Lm:] [:Sk:]]"), ec);
            UnicodeSet* casedSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[:Lu:] [:Ll:] [:Lt:]]"), ec);
            if (skipSet == NULL || casedSet == NULL || U_FAILURE(ec)) {
                // Without both sets nothing below can work; leave the
                // text and offsets untouched rather than dereference
                // a NULL or unusable set.
                delete skipSet;
                delete casedSet;
                return;
            }
            // Publish SKIP last: the unlocked test above looks only at
            // SKIP, so CASED must already be in place by the time SKIP
            // becomes non-NULL.
            CASED = casedSet;
            SKIP = skipSet;
            ucln_i18n_registerCleanup();
        }
    }

    // Our mode; we are either converting letter toTitle or
    // toLower.
    UBool doTitle = TRUE;
    
    // Determine if there is a preceding context of CASED SKIP*,
    // in which case we want to start in toLower mode.  If the
    // prior context is anything else (including empty) then start
    // in toTitle mode.
    UChar32 c;
    int32_t start;
    for (start = offsets.start - 1; start >= offsets.contextStart; start -= UTF_CHAR_LENGTH(c)) {
        c = text.char32At(start);
        if (SKIP->contains(c)) {
            continue;
        }
        // First non-SKIP character before the start decides the mode.
        doTitle = !CASED->contains(c);
        break;
    }
    
    // Convert things after a CASED character toLower; things
    // after a non-CASED, non-SKIP character toTitle.  SKIP
    // characters are copied directly and do not change the mode.
    int32_t textPos = offsets.start;
    if (textPos >= offsets.limit) return;

    // get string for context
    // TODO: add convenience method to do this, since we do it all over

    int32_t loop = 0;
    UnicodeString original;
    /* UChar *original = new UChar[offsets.contextLimit - offsets.contextStart+1]; */// get whole context
    /* Extract the characters from Replaceable */
    for (loop = offsets.contextStart; loop < offsets.contextLimit; loop++) {
        original.append(text.charAt(loop));
    }
    // Walk through original string
    // If there is a case change, modify corresponding position in replaceable

    // i and limit index into the snapshot `original` (relative to
    // contextStart); textPos indexes into the live `text`, which may
    // drift from i once a replacement changes the length.
    int32_t i = textPos - offsets.contextStart;
    int32_t limit = offsets.limit - offsets.contextStart;
    UChar32 cp, bufferCH;
    int32_t oldLen;
    int32_t newLen;

    for (; i < limit; ) {
        UErrorCode status = U_ZERO_ERROR;
        int32_t s = i;

        UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
        oldLen = UTF_CHAR_LENGTH(cp);
        i += oldLen;
        if (!SKIP->contains(cp)) {
            if (doTitle) {
                // NOTE(review): presumably returns the number of UChars
                // written to buffer -- confirm against ustr_imp.h.
                newLen = u_internalTitleCase(cp, buffer, u_getMaxCaseExpansion(), loc.getName());
            } else {
                int32_t len = u_strToLower(buffer, u_getMaxCaseExpansion(), original.getBuffer()+s, i-s, loc.getName(), &status);
                UTF_GET_CHAR(buffer, 0, 0, len, bufferCH);
                // -1 means "unchanged": the first code point of the
                // lowercased result equals the original code point, so
                // no replacement is made.
                newLen = (bufferCH == original.char32At(s) ? -1 : len);
            }
            doTitle = !CASED->contains(cp);
            if (newLen >= 0) {
                UnicodeString temp(buffer, newLen);
                text.handleReplaceBetween(textPos, textPos + oldLen, temp);
                if (newLen != oldLen) {
                    // Length changed: advance past the replacement and
                    // shift the limits by the difference.
                    textPos += newLen;
                    offsets.limit += newLen - oldLen;
                    offsets.contextLimit += newLen - oldLen;
                    continue;
                }
            }
        }
        textPos += oldLen;
    }
    offsets.start = offsets.limit;
}

/**
 * Static memory cleanup function.  If the static sets were created,
 * deletes SKIP and CASED, resets both pointers to NULL and destroys
 * the file-level mutex; otherwise does nothing.
 */
void TitlecaseTransliterator::cleanup() {
    if (SKIP == NULL) {
        // The statics were never created; nothing to release.
        return;
    }
    delete SKIP;
    SKIP = NULL;
    delete CASED;
    CASED = NULL;
    umtx_destroy(&MUTEX);
}

U_NAMESPACE_END
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00			`/*`
			`**********************************************************************`
			`* Copyright (C) 2001, International Business Machines`
			`* Corporation and others. All Rights Reserved.`
			`**********************************************************************`
			`* Date Name Description`
			`* 05/24/01 aliu Creation.`
			`**********************************************************************`
			`*/`

			`#include "unicode/uchar.h"`
ICU-1533 Moved new Transliterator subclasses here to make them private. X-SVN-Rev: 6964 2001-11-16 23:51:15 +00:00			`#include "titletrn.h"`
ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00			`#include "unicode/uniset.h"`
			`#include "mutex.h"`
ICU-770 fix memory leaks X-SVN-Rev: 6855 2001-11-13 23:47:11 +00:00			`#include "ucln_in.h"`
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`#include "unicode/ustring.h"`
			`#include "ustr_imp.h"`
			`#include "cpputils.h"`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00
ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00			`U_NAMESPACE_BEGIN`

ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00			`/**`
			`* ID for this transliterator.`
			`*/`
ICU-1099 Make some data a bit more const X-SVN-Rev: 6203 2001-10-11 23:54:55 +00:00			`const char TitlecaseTransliterator::_ID[] = "Any-Title";`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00
ICU-1575 more mutex and string initialization cleanup X-SVN-Rev: 7301 2001-12-04 00:08:31 +00:00			`/**`
			`* Mutex for statics IN THIS FILE`
			`*/`
			`static UMTX MUTEX = 0;`

ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00			`/**`
			`* The set of characters we skip. These are neither cased nor`
			`* non-cased, to us; we copy them verbatim.`
			`*/`
ICU-770 Fix for AIX. Can't delete const data. X-SVN-Rev: 6883 2001-11-14 17:23:01 +00:00			`static UnicodeSet* SKIP = NULL;`
ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00
			`/**`
			`* The set of characters that cause the next non-SKIP character`
			`* to be lowercased.`
			`*/`
ICU-770 Fix for AIX. Can't delete const data. X-SVN-Rev: 6883 2001-11-14 17:23:01 +00:00			`static UnicodeSet* CASED = NULL;`
ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`TitlecaseTransliterator::TitlecaseTransliterator(const Locale& theLoc) :`
			`Transliterator(_ID, 0),`
			`loc(theLoc),`
			`buffer(0) {`
			`buffer = new UChar[u_getMaxCaseExpansion()];`
ICU-965 in Any-Title make can't -> Can't, not Can'T X-SVN-Rev: 5144 2001-06-29 21:19:49 +00:00			`// Need to look back 2 characters in the case of "can't"`
			`setMaximumContextLength(2);`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00			`}`

			`/**`
			`* Destructor.`
			`*/`
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`TitlecaseTransliterator::~TitlecaseTransliterator() {`
			`delete [] buffer;`
			`}`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00
			`/**`
			`* Copy constructor.`
			`*/`
			`TitlecaseTransliterator::TitlecaseTransliterator(const TitlecaseTransliterator& o) :`
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`Transliterator(o),`
			`loc(o.loc),`
			`buffer(0) {`
			`buffer = new UChar[u_getMaxCaseExpansion()];`
			`uprv_arrayCopy(o.buffer, 0, this->buffer, 0, u_getMaxCaseExpansion());`
			`}`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00
			`/**`
			`* Assignment operator.`
			`*/`
			`TitlecaseTransliterator& TitlecaseTransliterator::operator=(`
			`const TitlecaseTransliterator& o) {`
			`Transliterator::operator=(o);`
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`loc = o.loc;`
			`uprv_arrayCopy(o.buffer, 0, this->buffer, 0, u_getMaxCaseExpansion());`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00			`return *this;`
			`}`

			`/**`
			`* Transliterator API.`
			`*/`
			`Transliterator* TitlecaseTransliterator::clone(void) const {`
			`return new TitlecaseTransliterator(*this);`
			`}`

			`/**`
			`* Implements {@link Transliterator#handleTransliterate}.`
			`*/`
			`void TitlecaseTransliterator::handleTransliterate(`
			`Replaceable& text, UTransPosition& offsets,`
			`UBool isIncremental) const {`
ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00			`if (SKIP == NULL) {`
ICU-1575 more mutex and string initialization cleanup X-SVN-Rev: 7301 2001-12-04 00:08:31 +00:00			`Mutex lock(&MUTEX);`
ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00			`if (SKIP == NULL) {`
			`UErrorCode ec = U_ZERO_ERROR;`
ICU-1629 Simplify UnicodeString creation X-SVN-Rev: 7471 2002-01-22 00:27:49 +00:00			`SKIP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00AD \\u2019 \\' [:Mn:] [:Me:] [:Cf:] [:Lm:] [:Sk:]]"), ec);`
			`CASED = new UnicodeSet(UNICODE_STRING_SIMPLE("[[:Lu:] [:Ll:] [:Lt:]]"), ec);`
ICU-770 fix memory leaks X-SVN-Rev: 6855 2001-11-13 23:47:11 +00:00			`ucln_i18n_registerCleanup();`
ICU-965 in Any-Title make can't -> Can't, not Can'T X-SVN-Rev: 5144 2001-06-29 21:19:49 +00:00			`}`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00			`}`

ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00			`// Our mode; we are either converting letter toTitle or`
			`// toLower.`
			`UBool doTitle = TRUE;`

			`// Determine if there is a preceding context of CASED SKIP*,`
			`// in which case we want to start in toLower mode. If the`
			`// prior context is anything else (including empty) then start`
			`// in toTitle mode.`
ICU-1501 fix to handle surrogates X-SVN-Rev: 7258 2001-12-01 22:37:34 +00:00			`UChar32 c;`
			`int32_t start;`
			`for (start = offsets.start - 1; start >= offsets.contextStart; start -= UTF_CHAR_LENGTH(c)) {`
			`c = text.char32At(start);`
ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00			`if (SKIP->contains(c)) {`
			`continue;`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00			`}`
ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00			`doTitle = !CASED->contains(c);`
			`break;`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00			`}`
ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00
			`// Convert things after a CASED character toLower; things`
			`// after a non-CASED, non-SKIP character toTitle. SKIP`
			`// characters are copied directly and do not change the mode.`
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`int32_t textPos = offsets.start;`
			`if (textPos >= offsets.limit) return;`

			`// get string for context`
			`// TODO: add convenience method to do this, since we do it all over`

			`int32_t loop = 0;`
			`UnicodeString original;`
			`/* UChar *original = new UChar[offsets.contextLimit - offsets.contextStart+1]; */// get whole context`
			`/* Extract the characters from Replaceable */`
			`for (loop = offsets.contextStart; loop < offsets.contextLimit; loop++) {`
			`original.append(text.charAt(loop));`
			`}`
			`// Walk through original string`
			`// If there is a case change, modify corresponding position in replaceable`

			`int32_t i = textPos - offsets.contextStart;`
			`int32_t limit = offsets.limit - offsets.contextStart;`
ICU-1501 Updated Upper/Lower/TitlecaseTransliterator and checked in new casing tests for surrogates from Java. X-SVN-Rev: 7235 2001-11-30 23:53:55 +00:00			`UChar32 cp, bufferCH;`
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`int32_t oldLen;`
			`int32_t newLen;`

			`for (; i < limit; ) {`
			`UErrorCode status = U_ZERO_ERROR;`
			`int32_t s = i;`

			`UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);`
			`oldLen = UTF_CHAR_LENGTH(cp);`
			`i += oldLen;`
			`if (!SKIP->contains(cp)) {`
			`if (doTitle) {`
			`newLen = u_internalTitleCase(cp, buffer, u_getMaxCaseExpansion(), loc.getName());`
			`} else {`
ICU-1211 Don't go beyond the end of the array (fix for Solaris, Windows, HP/UX) X-SVN-Rev: 7304 2001-12-04 02:16:58 +00:00			`int32_t len = u_strToLower(buffer, u_getMaxCaseExpansion(), original.getBuffer()+s, i-s, loc.getName(), &status);`
			`UTF_GET_CHAR(buffer, 0, 0, len, bufferCH);`
			`newLen = (bufferCH == original.char32At(s) ? -1 : len);`
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`}`
			`doTitle = !CASED->contains(cp);`
			`if (newLen >= 0) {`
			`UnicodeString temp(buffer, newLen);`
			`text.handleReplaceBetween(textPos, textPos + oldLen, temp);`
			`if (newLen != oldLen) {`
			`textPos += newLen;`
			`offsets.limit += newLen - oldLen;`
			`offsets.contextLimit += newLen - oldLen;`
			`continue;`
			`}`
			`}`
ICU-1135 rewrite Any-Title to be smarter about accents X-SVN-Rev: 6545 2001-11-01 04:37:27 +00:00			`}`
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`textPos += oldLen;`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00			`}`
ICU-1501 Ported back the Upper/Lower/TitlecaseTransliterator changes from Java. X-SVN-Rev: 7192 2001-11-30 00:57:29 +00:00			`offsets.start = offsets.limit;`
ICU-965 create Any-Lower, Any-Upper, and Any-Title transliterators X-SVN-Rev: 4941 2001-06-11 19:51:46 +00:00			`}`
ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00
ICU-770 fix memory leaks X-SVN-Rev: 6855 2001-11-13 23:47:11 +00:00			`/**`
			`* Static memory cleanup function.`
			`*/`
			`void TitlecaseTransliterator::cleanup() {`
			`if (SKIP != NULL) {`
			`delete SKIP; SKIP = NULL;`
			`delete CASED; CASED = NULL;`
ICU-1575 more mutex and string initialization cleanup X-SVN-Rev: 7301 2001-12-04 00:08:31 +00:00			`umtx_destroy(&MUTEX);`
ICU-770 fix memory leaks X-SVN-Rev: 6855 2001-11-13 23:47:11 +00:00			`}`
			`}`

ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00			`U_NAMESPACE_END`