ICU-5766 Extended Grapheme Clusters for ICU4C

X-SVN-Rev: 21933
This commit is contained in:
Andy Heninger 2007-07-10 01:25:26 +00:00
parent abbc83a287
commit ca5d005978
8 changed files with 72 additions and 4 deletions

View File

@ -189,6 +189,15 @@ BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
// -------------------------------------
// Factory for Extended Grapheme Cluster (XGC) break iterators.
// Forwards to createInstance() with the UBRK_X_GRAPHEME_CLUSTER kind;
// any failure is reported through status by that call.
BreakIterator* U_EXPORT2
BreakIterator::createXGraphemeClusterInstance(const Locale& key, UErrorCode& status)
{
    BreakIterator *xgcIterator = createInstance(key, UBRK_X_GRAPHEME_CLUSTER, status);
    return xgcIterator;
}
// -------------------------------------
// Gets all the available locales that have localized text boundary data.
const Locale* U_EXPORT2
BreakIterator::getAvailableLocales(int32_t& count)
@ -424,6 +433,9 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
case UBRK_TITLE:
result = BreakIterator::buildInstance(loc, "title", kind, status);
break;
case UBRK_X_GRAPHEME_CLUSTER:
result = BreakIterator::buildInstance(loc, "xgc", kind, status);
break;
default:
status = U_ILLEGAL_ARGUMENT_ERROR;
}

View File

@ -397,6 +397,22 @@ public:
static BreakIterator* U_EXPORT2
createTitleInstance(const Locale& where, UErrorCode& status);
/**
* Create a BreakIterator for Extended Grapheme Clusters using the specified locale.
* Returns an instance of a BreakIterator for locating XGC boundaries.
* Extended Grapheme Clusters are combining character sequences and other
* sequences that should remain unbroken when iterating over
* "characters" from a user perspective.
* @param loc the locale.
* @param status Receives information regarding any errors or warnings that
* occurred in creating the break iterator.
* @return A BreakIterator for Extended Grapheme Clusters.
* The caller owns the returned object and is responsible for deleting it.
* @draft ICU 3.8
*/
static BreakIterator* U_EXPORT2
createXGraphemeClusterInstance(const Locale& loc, UErrorCode& status);
/**
* Get the set of Locales for which TextBoundaries are installed.
* <p><b>Note:</b> this will not return locales added through the register

View File

@ -105,7 +105,9 @@ typedef enum UBreakIteratorType {
*/
UBRK_TITLE = 4,
#endif /* U_HIDE_DEPRECATED_API */
UBRK_COUNT = 5
/**
 * Extended Grapheme Cluster breaks.
 * Uses 5, the next value after UBRK_TITLE (4), so that it stays strictly
 * below UBRK_COUNT instead of colliding with it.
 * @draft ICU 3.8
 */
UBRK_X_GRAPHEME_CLUSTER = 5,
/* Number of break iterator types; must remain one past the largest value above. */
UBRK_COUNT = 6
} UBreakIteratorType;
/** Value indicating all text boundaries have been returned.

View File

@ -1,4 +1,4 @@
// ***************************************************************************
// ***************************************************************************
// *
// * Copyright (C) 2007 International Business Machines
// * Corporation and others. All Rights Reserved.
@ -14,6 +14,7 @@ root{
sentence:process(dependency){"sent.brk"}
title:process(dependency){"title.brk"}
word:process(dependency){"word.brk"}
xgc:process(dependency){"xgc.brk"}
}
dictionaries{
Thai:process(dependency){"thaidict.ctd"}

View File

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Copyright (c) 2006 International Business Machines Corporation and others. All rights reserved.
Copyright (c) 2007 International Business Machines Corporation and others. All rights reserved.
-->
<!DOCTYPE ldml SYSTEM "http://www.unicode.org/cldr/dtd/1.4/ldml.dtd"
[
@ -22,6 +22,7 @@
<icu:line icu:dependency="line.brk"/>
<icu:sentence icu:dependency="sent.brk"/>
<icu:title icu:dependency="title.brk"/>
<icu:xgc icu:dependency="xgc.brk"/>
</icu:boundaries>
<icu:dictionaries>
<icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>

View File

@ -411,6 +411,13 @@ void RBBIAPITest::TestIteration()
}
delete bi;
status=U_ZERO_ERROR;
bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::
createXGraphemeClusterInstance(Locale::getDefault(), status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(bi != NULL);
delete bi;
status=U_ZERO_ERROR;
bi = (RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status);
if (U_FAILURE(status) || bi == NULL) {

View File

@ -1502,6 +1502,13 @@ void RBBITest::TestExtended() {
charIdx += 6;
break;
}
if (testString.compare(charIdx-1, 5, "<xgc>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createXGraphemeClusterInstance(locale, status);
charIdx += 4;
break;
}
// <locale loc_name>
localeMatcher.reset(testString);
if (localeMatcher.lookingAt(charIdx-1, status)) {

View File

@ -1,4 +1,4 @@
# Copyright (c) 2001-2006 International Business Machines
# Copyright (c) 2001-2006 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
@ -91,6 +91,28 @@
# Treat Japanese Half Width voicing marks as combining
<data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data>
########################################################################################
#
#
# Extended G r a p h e m e C l u s t e r T e s t s
#
#
##########################################################################################
<xgc>
# Plain Vanilla grapheme clusters
<data>•a•b•c•</data>
<data>•a\u0301\u0302• •b\u0303\u0304•</data>
# Assorted Hindi combining marks
<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data>
# Thai Clusters
# $Prepend $Extend* $PrependBase $Extend*;
#
<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>
########################################################################################
#
#