ICU-13177 Merging trunk to branch

X-SVN-Rev: 40350
This commit is contained in:
Shane Carr 2017-08-24 05:56:16 +00:00
commit 8b625eda51
89 changed files with 6032 additions and 1679 deletions

View File

@ -195,7 +195,7 @@ BreakIterator::getAvailableLocales(int32_t& count)
// ------------------------------------------
//
// Default constructor and destructor
// Constructors, destructor and assignment operator
//
//-------------------------------------------
@ -204,6 +204,19 @@ BreakIterator::BreakIterator()
*validLocale = *actualLocale = 0;
}
// Copy constructor: duplicates the valid and actual locale IDs of the source iterator.
BreakIterator::BreakIterator(const BreakIterator &other)
        : UObject(other) {
    // Source and destination are same-sized member char arrays;
    // copy at most the destination's capacity.
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
}
// Assignment operator: copies the valid and actual locale IDs from the source.
// Assigning an object to itself leaves it untouched.
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
    if (this == &other) {
        return *this;
    }
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    return *this;
}
BreakIterator::~BreakIterator()
{
}
@ -265,7 +278,7 @@ ICUBreakIteratorService::~ICUBreakIteratorService() {}
// defined in ucln_cmn.h
U_NAMESPACE_END
static icu::UInitOnce gInitOnce;
static icu::UInitOnce gInitOnceBrkiter;
static icu::ICULocaleService* gService = NULL;
@ -280,7 +293,7 @@ static UBool U_CALLCONV breakiterator_cleanup(void) {
delete gService;
gService = NULL;
}
gInitOnce.reset();
gInitOnceBrkiter.reset();
#endif
return TRUE;
}
@ -296,7 +309,7 @@ initService(void) {
static ICULocaleService*
getService(void)
{
umtx_initOnce(gInitOnce, &initService);
umtx_initOnce(gInitOnceBrkiter, &initService);
return gService;
}
@ -306,7 +319,7 @@ getService(void)
static inline UBool
hasService(void)
{
return !gInitOnce.isReset() && getService() != NULL;
return !gInitOnceBrkiter.isReset() && getService() != NULL;
}
// -------------------------------------

View File

@ -33,20 +33,85 @@ const int32_t LENGTH_IN_2TRAIL = 62;
} // namespace
Edits::~Edits() {
if(array != stackArray) {
// Frees the edit array unless it is the built-in stackArray.
// Does not modify array or capacity; callers reassign them afterwards.
void Edits::releaseArray() U_NOEXCEPT {
    if (array == stackArray) {
        return;
    }
    uprv_free(array);
}
void Edits::reset() {
// Copies other's array contents into this object.
// Expects length (and errorCode_) to have been set from other already.
// On a stored error or an allocation failure the counters are zeroed;
// an allocation failure also sets U_MEMORY_ALLOCATION_ERROR.
Edits &Edits::copyArray(const Edits &other) {
    if (U_FAILURE(errorCode_)) {
        length = 0;
        delta = 0;
        numChanges = 0;
        return *this;
    }
    // Each array unit is a uint16_t, i.e. two bytes.
    const size_t byteCount = static_cast<size_t>(length) * 2;
    if (length > capacity) {
        // The current buffer is too small: allocate one that fits exactly.
        uint16_t *bigger = static_cast<uint16_t *>(uprv_malloc(byteCount));
        if (bigger == nullptr) {
            length = 0;
            delta = 0;
            numChanges = 0;
            errorCode_ = U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
        releaseArray();
        array = bigger;
        capacity = length;
    }
    if (length > 0) {
        uprv_memcpy(array, other.array, byteCount);
    }
    return *this;
}
// Transfers src's array contents into this object.
// Expects length (and errorCode_) to have been set from src already.
// Contents that fit into the stack buffer are copied and src is left as is;
// longer contents are taken over by stealing src's array, after which
// src is pointed back at its own stack buffer and reset.
Edits &Edits::moveArray(Edits &src) U_NOEXCEPT {
    if (U_FAILURE(errorCode_)) {
        length = 0;
        delta = 0;
        numChanges = 0;
        return *this;
    }
    releaseArray();
    if (length <= STACK_CAPACITY) {
        // Small enough for the built-in buffer: copy the units.
        array = stackArray;
        capacity = STACK_CAPACITY;
        if (length > 0) {
            uprv_memcpy(array, src.array, (size_t)length * 2);
        }
    } else {
        // Take ownership of src's array and leave src empty.
        array = src.array;
        capacity = src.capacity;
        src.array = src.stackArray;
        src.capacity = STACK_CAPACITY;
        src.reset();
    }
    return *this;
}
/**
 * Copy assignment operator.
 * Copies the counters and the stored error code from other, then copies
 * the edit array via copyArray().
 *
 * @param other source edits
 * @return *this
 */
Edits &Edits::operator=(const Edits &other) {
    // Self-assignment must be a no-op: without this guard copyArray() would
    // call uprv_memcpy() with identical source and destination buffers,
    // which is undefined behavior for memcpy.
    if (this == &other) {
        return *this;
    }
    length = other.length;
    delta = other.delta;
    numChanges = other.numChanges;
    errorCode_ = other.errorCode_;
    return copyArray(other);
}
// Move assignment operator.
// Takes over the stored error code and counters from src, then lets
// moveArray() either copy or steal the edit array.
Edits &Edits::operator=(Edits &&src) U_NOEXCEPT {
    errorCode_ = src.errorCode_;
    numChanges = src.numChanges;
    delta = src.delta;
    length = src.length;
    return moveArray(src);
}
// Destructor: frees the edit array if it is not the built-in stackArray.
Edits::~Edits() {
releaseArray();
}
// Clears the recorded edits and the stored error code.
// The array and its capacity are left untouched, so no memory is released.
void Edits::reset() U_NOEXCEPT {
    errorCode_ = U_ZERO_ERROR;
    numChanges = 0;
    delta = 0;
    length = 0;
}
void Edits::addUnchanged(int32_t unchangedLength) {
if(U_FAILURE(errorCode) || unchangedLength == 0) { return; }
if(U_FAILURE(errorCode_) || unchangedLength == 0) { return; }
if(unchangedLength < 0) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
// Merge into previous unchanged-text record, if any.
@ -72,7 +137,7 @@ void Edits::addUnchanged(int32_t unchangedLength) {
}
void Edits::addReplace(int32_t oldLength, int32_t newLength) {
if(U_FAILURE(errorCode)) { return; }
if(U_FAILURE(errorCode_)) { return; }
if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
// Replacement of short oldLength text units by same-length new text.
// Merge into previous short-replacement record, if any.
@ -88,7 +153,7 @@ void Edits::addReplace(int32_t oldLength, int32_t newLength) {
}
if(oldLength < 0 || newLength < 0) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (oldLength == 0 && newLength == 0) {
@ -100,7 +165,7 @@ void Edits::addReplace(int32_t oldLength, int32_t newLength) {
if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) ||
(newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) {
// Integer overflow or underflow.
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
delta += newDelta;
@ -151,7 +216,7 @@ UBool Edits::growArray() {
} else if (capacity == INT32_MAX) {
// Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API
// with a result-string-buffer overflow.
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
} else if (capacity >= (INT32_MAX / 2)) {
newCapacity = INT32_MAX;
@ -160,18 +225,16 @@ UBool Edits::growArray() {
}
// Grow by at least 5 units so that a maximal change record will fit.
if ((newCapacity - capacity) < 5) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2);
if (newArray == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
errorCode_ = U_MEMORY_ALLOCATION_ERROR;
return FALSE;
}
uprv_memcpy(newArray, array, (size_t)length * 2);
if (array != stackArray) {
uprv_free(array);
}
releaseArray();
array = newArray;
capacity = newCapacity;
return TRUE;
@ -179,11 +242,157 @@ UBool Edits::growArray() {
UBool Edits::copyErrorTo(UErrorCode &outErrorCode) {
if (U_FAILURE(outErrorCode)) { return TRUE; }
if (U_SUCCESS(errorCode)) { return FALSE; }
outErrorCode = errorCode;
if (U_SUCCESS(errorCode_)) { return FALSE; }
outErrorCode = errorCode_;
return TRUE;
}
/**
 * Merges the two input Edits and appends the result to this object.
 *
 * ab describes how string a maps to intermediate string b, bc describes how
 * b maps to string c; the appended edits describe how a maps to c.
 * Both inputs are walked in parallel with fine-grained iterators. Changes whose
 * intermediate-string boundaries do not line up are accumulated in the
 * "pending" lengths and emitted as one larger replacement.
 *
 * @param ab edits from string a to string b
 * @param bc edits from string b to string c
 * @param errorCode in/out ICU error code. The function returns immediately if it
 *        already indicates a failure or if this object has a stored error.
 *        Set to U_ILLEGAL_ARGUMENT_ERROR if the output length of ab and the
 *        input length of bc do not match (unless this object has a stored
 *        error, which is reported instead).
 * @return *this
 */
Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) {
if (copyErrorTo(errorCode)) { return *this; }
// Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
// Parallel iteration over both Edits.
Iterator abIter = ab.getFineIterator();
Iterator bcIter = bc.getFineIterator();
UBool abHasNext = TRUE, bcHasNext = TRUE;
// Copy iterator state into local variables, so that we can modify and subdivide spans.
// ab old & new length, bc old & new length
int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
// When we have different-intermediate-length changes, we accumulate a larger change.
int32_t pending_aLength = 0, pending_cLength = 0;
for (;;) {
// At this point, for each of the two iterators:
// Either we are done with the locally cached current edit,
// and its intermediate-string length has been reset,
// or we will continue to work with a truncated remainder of this edit.
//
// If the current edit is done, and the iterator has not yet reached the end,
// then we fetch the next edit. This is true for at least one of the iterators.
//
// Normally it does not matter whether we fetch from ab and then bc or vice versa.
// However, the result is observably different when
// ab deletions meet bc insertions at the same intermediate-string index.
// Some users expect the bc insertions to come first, so we fetch from bc first.
if (bc_bLength == 0) {
if (bcHasNext && (bcHasNext = bcIter.next(errorCode))) {
bc_bLength = bcIter.oldLength();
cLength = bcIter.newLength();
if (bc_bLength == 0) {
// insertion
if (ab_bLength == 0 || !abIter.hasChange()) {
addReplace(pending_aLength, pending_cLength + cLength);
pending_aLength = pending_cLength = 0;
} else {
pending_cLength += cLength;
}
continue;
}
}
// else see if the other iterator is done, too.
}
if (ab_bLength == 0) {
if (abHasNext && (abHasNext = abIter.next(errorCode))) {
aLength = abIter.oldLength();
ab_bLength = abIter.newLength();
if (ab_bLength == 0) {
// deletion
if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
addReplace(pending_aLength + aLength, pending_cLength);
pending_aLength = pending_cLength = 0;
} else {
pending_aLength += aLength;
}
continue;
}
} else if (bc_bLength == 0) {
// Both iterators are done at the same time:
// The intermediate-string lengths match.
break;
} else {
// The ab output string is shorter than the bc input string.
// A stored error of this object takes precedence over the argument error.
if (!copyErrorTo(errorCode)) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
}
return *this;
}
}
if (bc_bLength == 0) {
// The bc input string is shorter than the ab output string.
// A stored error of this object takes precedence over the argument error.
if (!copyErrorTo(errorCode)) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
}
return *this;
}
// Done fetching: ab_bLength > 0 && bc_bLength > 0
// The current state has two parts:
// - Past: We accumulate a longer ac edit in the "pending" variables.
// - Current: We have copies of the current ab/bc edits in local variables.
// At least one side is newly fetched.
// One side might be a truncated remainder of an edit we fetched earlier.
if (!abIter.hasChange() && !bcIter.hasChange()) {
// An unchanged span all the way from string a to string c.
if (pending_aLength != 0 || pending_cLength != 0) {
addReplace(pending_aLength, pending_cLength);
pending_aLength = pending_cLength = 0;
}
int32_t unchangedLength = aLength <= cLength ? aLength : cLength;
addUnchanged(unchangedLength);
ab_bLength = aLength -= unchangedLength;
bc_bLength = cLength -= unchangedLength;
// At least one of the unchanged spans is now empty.
continue;
}
if (!abIter.hasChange() && bcIter.hasChange()) {
// Unchanged a->b but changed b->c.
if (ab_bLength >= bc_bLength) {
// Split the longer unchanged span into change + remainder.
addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
pending_aLength = pending_cLength = 0;
aLength = ab_bLength -= bc_bLength;
bc_bLength = 0;
continue;
}
// Handle the shorter unchanged span below like a change.
} else if (abIter.hasChange() && !bcIter.hasChange()) {
// Changed a->b and then unchanged b->c.
if (ab_bLength <= bc_bLength) {
// Split the longer unchanged span into change + remainder.
addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
pending_aLength = pending_cLength = 0;
cLength = bc_bLength -= ab_bLength;
ab_bLength = 0;
continue;
}
// Handle the shorter unchanged span below like a change.
} else { // both abIter.hasChange() && bcIter.hasChange()
if (ab_bLength == bc_bLength) {
// Changes on both sides up to the same position. Emit & reset.
addReplace(pending_aLength + aLength, pending_cLength + cLength);
pending_aLength = pending_cLength = 0;
ab_bLength = bc_bLength = 0;
continue;
}
}
// Accumulate the a->c change, reset the shorter side,
// keep a remainder of the longer one.
pending_aLength += aLength;
pending_cLength += cLength;
if (ab_bLength < bc_bLength) {
bc_bLength -= ab_bLength;
cLength = ab_bLength = 0;
} else { // ab_bLength > bc_bLength
ab_bLength -= bc_bLength;
aLength = bc_bLength = 0;
}
}
// Both inputs are exhausted: flush any change that is still being accumulated.
if (pending_aLength != 0 || pending_cLength != 0) {
addReplace(pending_aLength, pending_cLength);
}
// Report an error stored by addReplace()/addUnchanged() during the merge, if any.
copyErrorTo(errorCode);
return *this;
}
Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
array(a), index(0), length(len), remaining(0),
onlyChanges_(oc), coarse(crs),
@ -308,12 +517,7 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro
spanStart = destIndex;
spanLength = newLength_;
}
// If we are at the start or limit of an empty span, then we search from
// the start of the string so that we always return
// the first of several consecutive empty spans, for consistent results.
// We do not currently track the properties of the previous span,
// so for now we always reset if we are at the start of the current span.
if (i <= spanStart) {
if (i < spanStart) {
// Reset the iterator to the start.
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
} else if (i < (spanStart + spanLength)) {
@ -328,8 +532,8 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro
spanStart = destIndex;
spanLength = newLength_;
}
if (i == spanStart || i < (spanStart + spanLength)) {
// The index is in the current span, or at an empty one.
if (i < (spanStart + spanLength)) {
// The index is in the current span.
return 0;
}
if (remaining > 0) {

View File

@ -35,7 +35,7 @@ U_NAMESPACE_BEGIN
static icu::Locale* availableLocaleList = NULL;
static int32_t availableLocaleListCount;
static icu::UInitOnce gInitOnce = U_INITONCE_INITIALIZER;
static icu::UInitOnce gInitOnceLocale = U_INITONCE_INITIALIZER;
U_NAMESPACE_END
@ -50,7 +50,7 @@ static UBool U_CALLCONV locale_available_cleanup(void)
availableLocaleList = NULL;
}
availableLocaleListCount = 0;
gInitOnce.reset();
gInitOnceLocale.reset();
return TRUE;
}
@ -81,7 +81,7 @@ void U_CALLCONV locale_available_init() {
const Locale* U_EXPORT2
Locale::getAvailableLocales(int32_t& count)
{
umtx_initOnce(gInitOnce, &locale_available_init);
umtx_initOnce(gInitOnceLocale, &locale_available_init);
count = availableLocaleListCount;
return availableLocaleList;
}

View File

@ -1069,7 +1069,7 @@ uprv_getWindowsTimeZone()
U_CAPI const char* U_EXPORT2
uprv_tzname(int n)
{
n; // Avoid unreferenced parameter warning.
(void)n; // Avoid unreferenced parameter warning.
const char *tzid = NULL;
#if U_PLATFORM_USES_ONLY_WIN32_API
#if U_PLATFORM_HAS_WINUWP_API > 0

View File

@ -72,15 +72,6 @@
typedef size_t uintptr_t;
#endif
/**
* \def U_HAVE_MSVC_2003_OR_EARLIER
* Flag for workaround of MSVC 2003 optimization bugs
* @internal
*/
#if !defined(U_HAVE_MSVC_2003_OR_EARLIER) && defined(_MSC_VER) && (_MSC_VER < 1400)
#define U_HAVE_MSVC_2003_OR_EARLIER
#endif
/*===========================================================================*/
/** @{ Information about POSIX support */
/*===========================================================================*/
@ -120,15 +111,15 @@ typedef size_t uintptr_t;
/* Use the predefined value. */
#elif U_PLATFORM == U_PF_ANDROID
# define U_TIMEZONE timezone
#elif defined(__UCLIBC__)
// uClibc does not have __timezone or _timezone.
#elif defined(_NEWLIB_VERSION)
# define U_TIMEZONE _timezone
#elif defined(__GLIBC__)
// glibc
# define U_TIMEZONE __timezone
#elif U_PLATFORM_IS_LINUX_BASED
# if defined(__UCLIBC__)
/* uClibc does not have __timezone or _timezone. */
# elif defined(_NEWLIB_VERSION)
# define U_TIMEZONE _timezone
# elif defined(__GLIBC__)
/* glibc */
# define U_TIMEZONE __timezone
# endif
// not defined
#elif U_PLATFORM_USES_ONLY_WIN32_API
# define U_TIMEZONE _timezone
#elif U_PLATFORM == U_PF_BSD && !defined(__NetBSD__)
@ -214,7 +205,7 @@ typedef size_t uintptr_t;
/**
* \def U_HAVE_STD_ATOMICS
* Defines whether the standard C++11 <atomic> is available.
* ICU will use this when avialable,
* ICU will use this when available,
* otherwise will fall back to compiler or platform specific alternatives.
* @internal
*/
@ -239,7 +230,7 @@ typedef size_t uintptr_t;
/**
* \def U_HAVE_CLANG_ATOMICS
* Defines whether Clang c11 style built-in atomics are avaialable.
* Defines whether Clang c11 style built-in atomics are available.
* These are used in preference to gcc atomics when both are available.
*/
#ifdef U_HAVE_CLANG_ATOMICS
@ -277,7 +268,7 @@ typedef size_t uintptr_t;
/**
* Platform utilities isolates the platform dependencies of the
* libarary. For each platform which this code is ported to, these
* library. For each platform which this code is ported to, these
* functions may have to be re-implemented.
*/
@ -425,7 +416,7 @@ U_INTERNAL const char* U_EXPORT2 uprv_getDefaultCodepage(void);
/**
* Please use uloc_getDefault() instead.
* Return the default locale ID string by querying ths system, or
* Return the default locale ID string by querying the system, or
* zero if one cannot be found.
* This function can call setlocale() on Unix platforms. Please read the
* platform documentation on setlocale() before calling this function.

View File

@ -213,6 +213,8 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
if (this == &that) {
return *this;
}
BreakIterator::operator=(that);
reset(); // Delete break cache information
fBreakType = that.fBreakType;
if (fLanguageBreakEngines != NULL) {
@ -311,16 +313,19 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
return FALSE;
}
// The base class BreakIterator carries no state that participates in equality,
// and does not implement an equality function that would otherwise be
// checked at this point.
const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
if (!utext_equals(fText, that2.fText)) {
// The two break iterators are operating on different text,
// or have a different interation position.
// or have a different iteration position.
// Note that fText's position is always the same as the break iterator's position.
return FALSE;
};
// TODO: need a check for when in a dictionary region at different offsets.
if (that2.fData == fData ||
(fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
// The two break iterators are using the same rules.

View File

@ -287,7 +287,7 @@ UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UCha
UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
: LinearMatchNode(len, nextNode), s(units) {
hash=hash*37+ustr_hashUCharsN(units, len);
hash=hash*37u+ustr_hashUCharsN(units, len);
}
UBool

View File

@ -250,7 +250,7 @@ public:
virtual int32_t next(void) = 0;
/**
* Return character index of the current interator position within the text.
* Return character index of the current iterator position within the text.
* @return The boundary most recently returned.
* @stable ICU 2.0
*/
@ -277,7 +277,7 @@ public:
virtual int32_t preceding(int32_t offset) = 0;
/**
* Return true if the specfied position is a boundary position.
* Return true if the specified position is a boundary position.
* As a side effect, the current position of the iterator is set
* to the first boundary position at or following the specified offset.
* @param offset the offset to check.
@ -331,7 +331,7 @@ public:
* @param fillInVec an array to be filled in with the status values.
* @param capacity the length of the supplied vector. A length of zero causes
* the function to return the number of status values, in the
* normal way, without attemtping to store any values.
* normal way, without attempting to store any values.
* @param status receives error codes.
* @return The number of rule status values from rules that determined
* the most recent boundary returned by the break iterator.
@ -469,7 +469,7 @@ public:
static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
/**
* Get name of the object for the desired Locale, in the desired langauge.
* Get name of the object for the desired Locale, in the desired language.
* @param objectLocale must be from getAvailableLocales.
* @param displayLocale specifies the desired locale for output.
* @param name the fill-in parameter of the return value
@ -482,7 +482,7 @@ public:
UnicodeString& name);
/**
* Get name of the object for the desired Locale, in the langauge of the
* Get name of the object for the desired Locale, in the language of the
* default locale.
* @param objectLocale must be from getMatchingLocales
* @param name the fill-in parameter of the return value
@ -629,10 +629,12 @@ protected:
/** @internal */
BreakIterator();
/** @internal */
BreakIterator (const BreakIterator &other) : UObject(other) {}
BreakIterator (const BreakIterator &other);
#ifndef U_HIDE_INTERNAL_API
/** @internal */
BreakIterator (const Locale& valid, const Locale& actual);
BreakIterator (const Locale& valid, const Locale &actual);
/** @internal. Assignment Operator, used by RuleBasedBreakIterator. */
BreakIterator &operator = (const BreakIterator &other);
#endif /* U_HIDE_INTERNAL_API */
private:
@ -640,12 +642,6 @@ private:
/** @internal */
char actualLocale[ULOC_FULLNAME_CAPACITY];
char validLocale[ULOC_FULLNAME_CAPACITY];
/**
* The assignment operator has no real implementation.
* It's provided to make the compiler happy. Do not call.
*/
BreakIterator& operator=(const BreakIterator&);
};
#ifndef U_HIDE_DEPRECATED_API
@ -661,5 +657,5 @@ U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif // _BRKITER
#endif // BRKITER_H
//eof

View File

@ -95,45 +95,45 @@ private:
return reinterpret_cast<char16_t *>(t);
}
char16_t *p;
char16_t *p_;
#else
union {
char16_t *cp;
uint16_t *up;
wchar_t *wp;
} u;
} u_;
#endif
};
#ifdef U_ALIASING_BARRIER
Char16Ptr::Char16Ptr(char16_t *p) : p(p) {}
Char16Ptr::Char16Ptr(char16_t *p) : p_(p) {}
#if !U_CHAR16_IS_TYPEDEF
Char16Ptr::Char16Ptr(uint16_t *p) : p(cast(p)) {}
Char16Ptr::Char16Ptr(uint16_t *p) : p_(cast(p)) {}
#endif
#if U_SIZEOF_WCHAR_T==2
Char16Ptr::Char16Ptr(wchar_t *p) : p(cast(p)) {}
Char16Ptr::Char16Ptr(wchar_t *p) : p_(cast(p)) {}
#endif
Char16Ptr::Char16Ptr(std::nullptr_t p) : p(p) {}
Char16Ptr::Char16Ptr(std::nullptr_t p) : p_(p) {}
Char16Ptr::~Char16Ptr() {
U_ALIASING_BARRIER(p);
U_ALIASING_BARRIER(p_);
}
char16_t *Char16Ptr::get() const { return p; }
char16_t *Char16Ptr::get() const { return p_; }
#else
Char16Ptr::Char16Ptr(char16_t *p) { u.cp = p; }
Char16Ptr::Char16Ptr(char16_t *p) { u_.cp = p; }
#if !U_CHAR16_IS_TYPEDEF
Char16Ptr::Char16Ptr(uint16_t *p) { u.up = p; }
Char16Ptr::Char16Ptr(uint16_t *p) { u_.up = p; }
#endif
#if U_SIZEOF_WCHAR_T==2
Char16Ptr::Char16Ptr(wchar_t *p) { u.wp = p; }
Char16Ptr::Char16Ptr(wchar_t *p) { u_.wp = p; }
#endif
Char16Ptr::Char16Ptr(std::nullptr_t p) { u.cp = p; }
Char16Ptr::Char16Ptr(std::nullptr_t p) { u_.cp = p; }
Char16Ptr::~Char16Ptr() {}
char16_t *Char16Ptr::get() const { return u.cp; }
char16_t *Char16Ptr::get() const { return u_.cp; }
#endif
@ -203,45 +203,45 @@ private:
return reinterpret_cast<const char16_t *>(t);
}
const char16_t *p;
const char16_t *p_;
#else
union {
const char16_t *cp;
const uint16_t *up;
const wchar_t *wp;
} u;
} u_;
#endif
};
#ifdef U_ALIASING_BARRIER
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) : p(p) {}
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) : p_(p) {}
#if !U_CHAR16_IS_TYPEDEF
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) : p(cast(p)) {}
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) : p_(cast(p)) {}
#endif
#if U_SIZEOF_WCHAR_T==2
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) : p(cast(p)) {}
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) : p_(cast(p)) {}
#endif
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) : p(p) {}
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) : p_(p) {}
ConstChar16Ptr::~ConstChar16Ptr() {
U_ALIASING_BARRIER(p);
U_ALIASING_BARRIER(p_);
}
const char16_t *ConstChar16Ptr::get() const { return p; }
const char16_t *ConstChar16Ptr::get() const { return p_; }
#else
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) { u.cp = p; }
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) { u_.cp = p; }
#if !U_CHAR16_IS_TYPEDEF
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) { u.up = p; }
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) { u_.up = p; }
#endif
#if U_SIZEOF_WCHAR_T==2
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) { u.wp = p; }
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) { u_.wp = p; }
#endif
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) { u.cp = p; }
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) { u_.cp = p; }
ConstChar16Ptr::~ConstChar16Ptr() {}
const char16_t *ConstChar16Ptr::get() const { return u.cp; }
const char16_t *ConstChar16Ptr::get() const { return u_.cp; }
#endif

View File

@ -37,18 +37,60 @@ public:
*/
Edits() :
array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),
errorCode(U_ZERO_ERROR) {}
errorCode_(U_ZERO_ERROR) {}
/**
* Copy constructor.
* @param other source edits
* @draft ICU 60
*/
Edits(const Edits &other) :
array(stackArray), capacity(STACK_CAPACITY), length(other.length),
delta(other.delta), numChanges(other.numChanges),
errorCode_(other.errorCode_) {
copyArray(other);
}
/**
* Move constructor, might leave src empty.
* This object will have the same contents that the source object had.
* @param src source edits
* @draft ICU 60
*/
Edits(Edits &&src) U_NOEXCEPT :
array(stackArray), capacity(STACK_CAPACITY), length(src.length),
delta(src.delta), numChanges(src.numChanges),
errorCode_(src.errorCode_) {
moveArray(src);
}
/**
* Destructor.
* @draft ICU 59
*/
~Edits();
/**
* Assignment operator.
* @param other source edits
* @return *this
* @draft ICU 60
*/
Edits &operator=(const Edits &other);
/**
* Move assignment operator, might leave src empty.
* This object will have the same contents that the source object had.
* The behavior is undefined if *this and src are the same object.
* @param src source edits
* @return *this
* @draft ICU 60
*/
Edits &operator=(Edits &&src) U_NOEXCEPT;
/**
* Resets the data but may not release memory.
* @draft ICU 59
*/
void reset();
void reset() U_NOEXCEPT;
/**
* Adds a record for an unchanged segment of text.
@ -99,6 +141,15 @@ public:
* @draft ICU 59
*/
struct U_COMMON_API Iterator U_FINAL : public UMemory {
/**
* Default constructor, empty iterator.
* @draft ICU 60
*/
Iterator() :
array(nullptr), index(0), length(0),
remaining(0), onlyChanges_(FALSE), coarse(FALSE),
changed(FALSE), oldLength_(0), newLength_(0),
srcIndex(0), replIndex(0), destIndex(0) {}
/**
* Copy constructor.
* @draft ICU 59
@ -309,9 +360,39 @@ public:
return Iterator(array, length, FALSE, FALSE);
}
/**
* Merges the two input Edits and appends the result to this object.
*
* Consider two string transformations (for example, normalization and case mapping)
* where each records Edits in addition to writing an output string.<br>
* Edits ab reflect how substrings of input string a
* map to substrings of intermediate string b.<br>
* Edits bc reflect how substrings of intermediate string b
* map to substrings of output string c.<br>
* This function merges ab and bc such that the additional edits
* recorded in this object reflect how substrings of input string a
* map to substrings of output string c.
*
* If unrelated Edits are passed in where the output string of the first
* has a different length than the input string of the second,
* then a U_ILLEGAL_ARGUMENT_ERROR is reported.
*
* @param ab reflects how substrings of input string a
* map to substrings of intermediate string b.
* @param bc reflects how substrings of intermediate string b
* map to substrings of output string c.
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return *this, with the merged edits appended
* @draft ICU 60
*/
Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);
private:
Edits(const Edits &) = delete;
Edits &operator=(const Edits &) = delete;
void releaseArray() U_NOEXCEPT;
Edits &copyArray(const Edits &other);
Edits &moveArray(Edits &src) U_NOEXCEPT;
void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }
int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
@ -325,7 +406,7 @@ private:
int32_t length;
int32_t delta;
int32_t numChanges;
UErrorCode errorCode;
UErrorCode errorCode_;
uint16_t stackArray[STACK_CAPACITY];
};

View File

@ -256,7 +256,7 @@ protected:
/** @internal */
class FinalValueNode : public Node {
public:
FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
virtual UBool operator==(const Node &other) const;
virtual void write(StringTrieBuilder &builder);
protected:
@ -276,7 +276,7 @@ protected:
void setValue(int32_t v) {
hasValue=TRUE;
value=v;
hash=hash*37+v;
hash=hash*37u+v;
}
protected:
UBool hasValue;
@ -290,7 +290,7 @@ protected:
class IntermediateValueNode : public ValueNode {
public:
IntermediateValueNode(int32_t v, Node *nextNode)
: ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
: ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
virtual void write(StringTrieBuilder &builder);
@ -307,7 +307,7 @@ protected:
class LinearMatchNode : public ValueNode {
public:
LinearMatchNode(int32_t len, Node *nextNode)
: ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
: ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
length(len), next(nextNode) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
@ -342,7 +342,7 @@ protected:
equal[length]=NULL;
values[length]=value;
++length;
hash=(hash*37+c)*37+value;
hash=(hash*37u+c)*37u+value;
}
// Adds a unit which leads to another match node.
void add(int32_t c, Node *node) {
@ -350,7 +350,7 @@ protected:
equal[length]=node;
values[length]=0;
++length;
hash=(hash*37+c)*37+hashCode(node);
hash=(hash*37u+c)*37u+hashCode(node);
}
protected:
Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
@ -365,8 +365,8 @@ protected:
class SplitBranchNode : public BranchNode {
public:
SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
: BranchNode(((0x555555*37+middleUnit)*37+
hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
: BranchNode(((0x555555u*37u+middleUnit)*37u+
hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
@ -382,7 +382,7 @@ protected:
class BranchHeadNode : public ValueNode {
public:
BranchHeadNode(int32_t len, Node *subNode)
: ValueNode((0x666666*37+len)*37+hashCode(subNode)),
: ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
length(len), next(subNode) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);

View File

@ -987,7 +987,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
UProperty p;
int32_t v;
UBool mustNotBeEmpty = FALSE, invert = FALSE;
UBool invert = FALSE;
if (value.length() > 0) {
p = u_getPropertyEnum(pname.data());
@ -1009,14 +1009,15 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
char* end;
double value = uprv_strtod(vname.data(), &end);
v = (int32_t) value;
if (v != value || v < 0 || *end != 0) {
// non-integral or negative value, or trailing junk
// Anything between 0 and 255 is valid even if unused.
// Cast double->int only after range check.
// We catch NaN here because comparing it with both 0 and 255 will be false
// (as are all comparisons with NaN).
if (*end != 0 || !(0 <= value && value <= 255) ||
(v = (int32_t)value) != value) {
// non-integral value or outside 0..255, or trailing junk
FAIL(ec);
}
// If the resultant set is empty then the numeric value
// was invalid.
mustNotBeEmpty = TRUE;
} else {
FAIL(ec);
}
@ -1115,12 +1116,6 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
complement();
}
if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
// mustNotBeEmpty is set to true if an empty set indicates
// invalid input.
ec = U_ILLEGAL_ARGUMENT_ERROR;
}
if (isBogus() && U_SUCCESS(ec)) {
// We likely ran out of memory. AHHH!
ec = U_MEMORY_ALLOCATION_ERROR;

View File

@ -980,11 +980,4 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
U_CDECL_END
/**
* Work around MSVC 2003 optimization bugs.
*/
#if defined (U_HAVE_MSVC_2003_OR_EARLIER)
#pragma optimize("", off)
#endif
#endif

View File

@ -496,8 +496,8 @@ $CAN_CM $CM* $QU; # QU x .
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CM $CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
# LB21a Don't break after Hebrew + Hyphen.
([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL;
# LB21b (reverse)
$HL $CM* $SY;

View File

@ -427,6 +427,7 @@ tzdbNames{
ss{"LINT"}
}
"meta:Lord_Howe"{
sd{"LHDT"}
ss{"LHST"}
}
"meta:Macau"{

View File

@ -226,7 +226,7 @@ AffixPattern::append(const AffixPattern &other) {
addLiteral(literal.getBuffer(), 0, literal.length());
break;
case kCurrency:
addCurrency(iter.getTokenLength());
addCurrency(static_cast<uint8_t>(iter.getTokenLength()));
break;
default:
add(iter.getTokenType());
@ -481,7 +481,7 @@ AffixPattern::parseUserAffixString(
break;
case 0xA4:
appender.flush();
appendTo.add(kCurrency, tokenSize);
appendTo.add(kCurrency, static_cast<uint8_t>(tokenSize));
break;
default:
appender.append(token);

View File

@ -28,6 +28,21 @@ class SkippedState;
class UCharsTrie;
class UVector32;
/* Large enough for CEs of most short strings. */
#define CEBUFFER_INITIAL_CAPACITY 40
// Export an explicit template instantiation of the MaybeStackArray that
// is used as a data member of CEBuffer.
//
// MSVC requires this, even though it should not be necessary.
// No direct access to the MaybeStackArray leaks out of the i18n library.
//
// See digitlst.h, pluralaffix.h, datefmt.h, and others for similar examples.
//
#if defined (_MSC_VER)
template class U_I18N_API MaybeStackArray<int64_t, CEBUFFER_INITIAL_CAPACITY>;
#endif
/**
* Collation element iterator and abstract character iterator.
*
@ -36,10 +51,10 @@ class UVector32;
*/
class U_I18N_API CollationIterator : public UObject {
private:
class CEBuffer {
class U_I18N_API CEBuffer {
private:
/** Large enough for CEs of most short strings. */
static const int32_t INITIAL_CAPACITY = 40;
static const int32_t INITIAL_CAPACITY = CEBUFFER_INITIAL_CAPACITY;
public:
CEBuffer() : length(0) {}
~CEBuffer();

View File

@ -97,9 +97,7 @@ static const char *gNumberElementKeys[DecimalFormatSymbols::kFormatSymbolCount]
// Initializes this with the decimal format symbols in the default locale.
DecimalFormatSymbols::DecimalFormatSymbols(UErrorCode& status)
: UObject(),
locale()
{
: UObject(), locale() {
initialize(locale, status, TRUE);
}
@ -107,16 +105,17 @@ DecimalFormatSymbols::DecimalFormatSymbols(UErrorCode& status)
// Initializes this with the decimal format symbols in the desired locale.
DecimalFormatSymbols::DecimalFormatSymbols(const Locale& loc, UErrorCode& status)
: UObject(),
locale(loc)
{
: UObject(), locale(loc) {
initialize(locale, status);
}
// Initializes this with the decimal format symbols in the desired locale,
// handing the caller's numbering system to initialize() so that it is used
// in place of the one initialize() would otherwise create from the locale.
DecimalFormatSymbols::DecimalFormatSymbols(const Locale& loc, const NumberingSystem& ns, UErrorCode& status)
: UObject(), locale(loc) {
initialize(locale, status, FALSE, &ns);
}
DecimalFormatSymbols::DecimalFormatSymbols()
: UObject(),
locale(Locale::getRoot()),
currPattern(NULL) {
: UObject(), locale(Locale::getRoot()), currPattern(NULL) {
*validLocale = *actualLocale = 0;
initialize();
}
@ -342,7 +341,8 @@ CurrencySpacingSink::~CurrencySpacingSink() {}
} // namespace
void
DecimalFormatSymbols::initialize(const Locale& loc, UErrorCode& status, UBool useLastResortData)
DecimalFormatSymbols::initialize(const Locale& loc, UErrorCode& status,
UBool useLastResortData, const NumberingSystem* ns)
{
if (U_FAILURE(status)) { return; }
*validLocale = *actualLocale = 0;
@ -355,7 +355,13 @@ DecimalFormatSymbols::initialize(const Locale& loc, UErrorCode& status, UBool us
// Next get the numbering system for this locale and set zero digit
// and the digit string based on the numbering system for the locale
//
LocalPointer<NumberingSystem> ns(NumberingSystem::createInstance(loc, status));
LocalPointer<NumberingSystem> nsLocal;
if (ns == nullptr) {
// Use the numbering system according to the locale.
// Save it into a LocalPointer so it gets cleaned up.
nsLocal.adoptInstead(NumberingSystem::createInstance(loc, status));
ns = nsLocal.getAlias();
}
const char *nsName;
if (U_SUCCESS(status) && ns->getRadix() == 10 && !ns->isAlgorithmic()) {
nsName = ns->getName();

View File

@ -111,7 +111,7 @@ public:
return newRuleValue * divisor;
}
virtual double calcUpperBound(double /*oldUpperBound*/) const { return divisor; }
virtual double calcUpperBound(double /*oldUpperBound*/) const { return static_cast<double>(divisor); }
virtual UChar tokenChar() const { return (UChar)0x003c; } // '<'
@ -148,7 +148,7 @@ public:
virtual void doSubstitution(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const;
virtual int64_t transformNumber(int64_t number) const { return number % divisor; }
virtual double transformNumber(double number) const { return uprv_fmod(number, divisor); }
virtual double transformNumber(double number) const { return uprv_fmod(number, static_cast<double>(divisor)); }
virtual UBool doParse(const UnicodeString& text,
ParsePosition& parsePosition,
@ -158,10 +158,10 @@ public:
Formattable& result) const;
virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const {
return oldRuleValue - uprv_fmod(oldRuleValue, divisor) + newRuleValue;
return oldRuleValue - uprv_fmod(oldRuleValue, static_cast<double>(divisor)) + newRuleValue;
}
virtual double calcUpperBound(double /*oldUpperBound*/) const { return divisor; }
virtual double calcUpperBound(double /*oldUpperBound*/) const { return static_cast<double>(divisor); }
virtual UBool isModulusSubstitution() const { return TRUE; }

View File

@ -1509,6 +1509,24 @@ NumberFormat::makeInstance(const Locale& desiredLocale,
return f;
}
/**
 * Reports the rounding mode of this formatter.
 * The base-class implementation has no rounding support and therefore
 * always answers kRoundUnnecessary.
 * @return A rounding mode
 */
NumberFormat::ERoundingMode NumberFormat::getRoundingMode() const {
    // Fixed default: ICU4J throws an exception here, and we can't change
    // this API to do the same.
    return kRoundUnnecessary;
}
/**
* Set the rounding mode. This base-class implementation ignores the
* argument and changes nothing.
* @param roundingMode A rounding mode
*/
void NumberFormat::setRoundingMode(NumberFormat::ERoundingMode /*roundingMode*/) {
// No-op. ICU4J throws an exception here, and we can't change this API.
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_FORMATTING */

View File

@ -239,10 +239,10 @@ FixedPrecision::initVisibleDigits(
}
}
// Try fast path
if (n >= 0 && initVisibleDigits(scaled, -n, digits, status)) {
if (n >= 0 && initVisibleDigits(static_cast<int64_t>(scaled), -n, digits, status)) {
digits.fAbsDoubleValue = fabs(value);
digits.fAbsDoubleValueSet = U_SUCCESS(status) && !digits.isOverMaxDigits();
// Adjust for negative 0 becuase when we cast to an int64,
// Adjust for negative 0 because when we cast to an int64,
// negative 0 becomes positive 0.
if (scaled == 0.0 && uprv_isNegative(scaled)) {
digits.setNegative();

View File

@ -687,6 +687,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
, decimalFormatSymbols(NULL)
, defaultInfinityRule(NULL)
, defaultNaNRule(NULL)
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
, lenient(FALSE)
, lenientParseRules(NULL)
, localizations(NULL)
@ -711,6 +712,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
, decimalFormatSymbols(NULL)
, defaultInfinityRule(NULL)
, defaultNaNRule(NULL)
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
, lenient(FALSE)
, lenientParseRules(NULL)
, localizations(NULL)
@ -735,6 +737,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
, decimalFormatSymbols(NULL)
, defaultInfinityRule(NULL)
, defaultNaNRule(NULL)
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
, lenient(FALSE)
, lenientParseRules(NULL)
, localizations(NULL)
@ -758,6 +761,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
, decimalFormatSymbols(NULL)
, defaultInfinityRule(NULL)
, defaultNaNRule(NULL)
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
, lenient(FALSE)
, lenientParseRules(NULL)
, localizations(NULL)
@ -782,6 +786,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
, decimalFormatSymbols(NULL)
, defaultInfinityRule(NULL)
, defaultNaNRule(NULL)
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
, lenient(FALSE)
, lenientParseRules(NULL)
, localizations(NULL)
@ -803,6 +808,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(URBNFRuleSetTag tag, const Locale&
, decimalFormatSymbols(NULL)
, defaultInfinityRule(NULL)
, defaultNaNRule(NULL)
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
, lenient(FALSE)
, lenientParseRules(NULL)
, localizations(NULL)
@ -869,6 +875,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const RuleBasedNumberFormat& rhs)
, decimalFormatSymbols(NULL)
, defaultInfinityRule(NULL)
, defaultNaNRule(NULL)
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
, lenient(FALSE)
, lenientParseRules(NULL)
, localizations(NULL)
@ -898,6 +905,7 @@ RuleBasedNumberFormat::operator=(const RuleBasedNumberFormat& rhs)
setDecimalFormatSymbols(*rhs.getDecimalFormatSymbols());
init(rhs.originalDescription, rhs.localizations ? rhs.localizations->ref() : NULL, perror, status);
setDefaultRuleSet(rhs.getDefaultRuleSetName(), status);
setRoundingMode(rhs.getRoundingMode());
capitalizationInfoSet = rhs.capitalizationInfoSet;
capitalizationForUIListMenu = rhs.capitalizationForUIListMenu;
@ -1195,7 +1203,7 @@ RuleBasedNumberFormat::format(double number,
int32_t startPos = toAppendTo.length();
UErrorCode status = U_ZERO_ERROR;
if (defaultRuleSet) {
defaultRuleSet->format(number, toAppendTo, toAppendTo.length(), 0, status);
format(number, *defaultRuleSet, toAppendTo, status);
}
return adjustForCapitalizationContext(startPos, toAppendTo, status);
}
@ -1248,15 +1256,31 @@ RuleBasedNumberFormat::format(double number,
} else {
NFRuleSet *rs = findRuleSet(ruleSetName, status);
if (rs) {
int32_t startPos = toAppendTo.length();
rs->format(number, toAppendTo, toAppendTo.length(), 0, status);
adjustForCapitalizationContext(startPos, toAppendTo, status);
format(number, *rs, toAppendTo, status);
}
}
}
return toAppendTo;
}
void
RuleBasedNumberFormat::format(double number,
NFRuleSet& rs,
UnicodeString& toAppendTo,
UErrorCode& status) const
{
int32_t startPos = toAppendTo.length();
if (getRoundingMode() != DecimalFormat::ERoundingMode::kRoundUnnecessary && !uprv_isNaN(number) && !uprv_isInfinite(number)) {
DigitList digitList;
digitList.set(number);
digitList.setRoundingMode(getRoundingMode());
digitList.roundFixedPoint(getMaximumFractionDigits());
number = digitList.getDouble();
}
rs.format(number, toAppendTo, toAppendTo.length(), 0, status);
adjustForCapitalizationContext(startPos, toAppendTo, status);
}
/**
* Bottleneck through which all the public format() methods
* that take a long pass. By the time we get here, we know
@ -1959,6 +1983,23 @@ RuleBasedNumberFormat::createPluralFormat(UPluralType pluralType,
return new PluralFormat(locale, pluralType, pattern, status);
}
/**
 * Returns the rounding mode currently stored in this formatter.
 * @return A rounding mode
 */
DecimalFormat::ERoundingMode RuleBasedNumberFormat::getRoundingMode() const {
    return this->roundingMode;
}
/**
* Set the rounding mode. A mode other than kRoundUnnecessary makes
* format(double, NFRuleSet&, ...) round finite numbers to
* getMaximumFractionDigits() fraction digits before the rules are applied.
* @param roundingMode A rounding mode
*/
void RuleBasedNumberFormat::setRoundingMode(DecimalFormat::ERoundingMode roundingMode) {
// The parameter shadows the data member, hence the explicit this->.
this->roundingMode = roundingMode;
}
U_NAMESPACE_END
/* U_HAVE_RBNF */

View File

@ -2430,7 +2430,7 @@ TimeZoneFormat::parseOffsetPattern(const UnicodeString& pattern, OffsetFields re
isPrevQuote = TRUE;
if (itemType != GMTOffsetField::TEXT) {
if (GMTOffsetField::isValid(itemType, itemLength)) {
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, (uint8_t)itemLength, status);
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast<uint8_t>(itemLength), status);
result->addElement(fld, status);
if (U_FAILURE(status)) {
break;
@ -2465,7 +2465,7 @@ TimeZoneFormat::parseOffsetPattern(const UnicodeString& pattern, OffsetFields re
}
} else {
if (GMTOffsetField::isValid(itemType, itemLength)) {
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, itemLength, status);
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast<uint8_t>(itemLength), status);
result->addElement(fld, status);
if (U_FAILURE(status)) {
break;
@ -2483,7 +2483,7 @@ TimeZoneFormat::parseOffsetPattern(const UnicodeString& pattern, OffsetFields re
// a string literal
if (itemType != GMTOffsetField::TEXT) {
if (GMTOffsetField::isValid(itemType, itemLength)) {
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, itemLength, status);
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast<uint8_t>(itemLength), status);
result->addElement(fld, status);
if (U_FAILURE(status)) {
break;
@ -2508,7 +2508,7 @@ TimeZoneFormat::parseOffsetPattern(const UnicodeString& pattern, OffsetFields re
}
} else {
if (GMTOffsetField::isValid(itemType, itemLength)) {
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, itemLength, status);
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast<uint8_t>(itemLength), status);
result->addElement(fld, status);
} else {
status = U_ILLEGAL_ARGUMENT_ERROR;

View File

@ -62,7 +62,7 @@ namespace {
static const UChar *rootRules = NULL;
static int32_t rootRulesLength = 0;
static UResourceBundle *rootBundle = NULL;
static UInitOnce gInitOnce = U_INITONCE_INITIALIZER;
static UInitOnce gInitOnceUcolRes = U_INITONCE_INITIALIZER;
} // namespace
@ -74,7 +74,7 @@ ucol_res_cleanup() {
rootRulesLength = 0;
ures_close(rootBundle);
rootBundle = NULL;
gInitOnce.reset();
gInitOnceUcolRes.reset();
return TRUE;
}
@ -97,7 +97,7 @@ U_CDECL_END
void
CollationLoader::appendRootRules(UnicodeString &s) {
UErrorCode errorCode = U_ZERO_ERROR;
umtx_initOnce(gInitOnce, CollationLoader::loadRootRules, errorCode);
umtx_initOnce(gInitOnceUcolRes, CollationLoader::loadRootRules, errorCode);
if(U_SUCCESS(errorCode)) {
s.append(rootRules, rootRulesLength);
}

View File

@ -34,6 +34,7 @@
#include "unicode/uobject.h"
#include "unicode/locid.h"
#include "unicode/numsys.h"
#include "unicode/unum.h"
#include "unicode/unistr.h"
@ -184,6 +185,24 @@ public:
*/
DecimalFormatSymbols(const Locale& locale, UErrorCode& status);
/**
* Creates a DecimalFormatSymbols instance for the given locale with digits and symbols
* corresponding to the given NumberingSystem.
*
* This constructor behaves equivalently to the normal constructor called with a locale having a
* "numbers=xxxx" keyword specifying the numbering system by name.
*
* In this constructor, the NumberingSystem argument will be used even if the locale has its own
* "numbers=xxxx" keyword.
*
* @param locale The locale to get symbols for.
* @param ns The numbering system.
* @param status Input/output parameter, set to success or
* failure code upon return.
* @draft ICU 60
*/
DecimalFormatSymbols(const Locale& locale, const NumberingSystem& ns, UErrorCode& status);
/**
* Create a DecimalFormatSymbols object for the default locale.
* This constructor will not fail. If the resource file data is
@ -346,8 +365,11 @@ private:
* @param success Input/output parameter, set to success or
* failure code upon return.
* @param useLastResortData determine if use last resort data
* @param ns The NumberingSystem to use; otherwise, fall
* back to the locale.
*/
void initialize(const Locale& locale, UErrorCode& success, UBool useLastResortData = FALSE);
void initialize(const Locale& locale, UErrorCode& success,
UBool useLastResortData = FALSE, const NumberingSystem* ns = nullptr);
/**
* Initialize the symbols with default values.

View File

@ -668,28 +668,6 @@ template class U_I18N_API EnumSet<UNumberFormatAttribute,
*/
class U_I18N_API DecimalFormat: public NumberFormat {
public:
/**
* Rounding mode.
* @stable ICU 2.4
*/
enum ERoundingMode {
kRoundCeiling, /**< Round towards positive infinity */
kRoundFloor, /**< Round towards negative infinity */
kRoundDown, /**< Round towards zero */
kRoundUp, /**< Round away from zero */
kRoundHalfEven, /**< Round towards the nearest integer, or
towards the nearest even integer if equidistant */
kRoundHalfDown, /**< Round towards the nearest integer, or
towards zero if equidistant */
kRoundHalfUp, /**< Round towards the nearest integer, or
away from zero if equidistant */
/**
* Return U_FORMAT_INEXACT_ERROR if number does not format exactly.
* @stable ICU 4.8
*/
kRoundUnnecessary
};
/**
* Pad position.
* @stable ICU 2.4

View File

@ -168,6 +168,28 @@ class StringEnumeration;
*/
class U_I18N_API NumberFormat : public Format {
public:
/**
* Rounding mode.
* DecimalFormat derives from NumberFormat, so the name
* DecimalFormat::ERoundingMode refers to this same enum.
* @stable ICU 2.4
*/
enum ERoundingMode {
kRoundCeiling, /**< Round towards positive infinity */
kRoundFloor, /**< Round towards negative infinity */
kRoundDown, /**< Round towards zero */
kRoundUp, /**< Round away from zero */
kRoundHalfEven, /**< Round towards the nearest integer, or
towards the nearest even integer if equidistant */
kRoundHalfDown, /**< Round towards the nearest integer, or
towards zero if equidistant */
kRoundHalfUp, /**< Round towards the nearest integer, or
away from zero if equidistant */
/**
* Return U_FORMAT_INEXACT_ERROR if number does not format exactly.
* @stable ICU 4.8
*/
kRoundUnnecessary
};
/**
* Alignment Field constants used to construct a FieldPosition object.
* Signifies that the position of the integer part or fraction part of
@ -965,6 +987,21 @@ public:
*/
virtual UDisplayContext getContext(UDisplayContextType type, UErrorCode& status) const;
/**
* Get the rounding mode. This will always return NumberFormat::ERoundingMode::kRoundUnnecessary
* if the subclass does not support rounding.
* @return A rounding mode
* @draft ICU 60
*/
virtual ERoundingMode getRoundingMode(void) const;
/**
* Set the rounding mode. If a subclass does not support rounding, this will do nothing.
* @param roundingMode A rounding mode
* @draft ICU 60
*/
virtual void setRoundingMode(ERoundingMode roundingMode);
public:
/**

View File

@ -30,6 +30,7 @@
#define U_HAVE_RBNF 1
#include "unicode/dcfmtsym.h"
#include "unicode/decimfmt.h"
#include "unicode/fmtable.h"
#include "unicode/locid.h"
#include "unicode/numfmt.h"
@ -1010,6 +1011,20 @@ public:
*/
virtual void setContext(UDisplayContext value, UErrorCode& status);
/**
* Get the rounding mode.
* @return A rounding mode
* @draft ICU 60
*/
virtual DecimalFormat::ERoundingMode getRoundingMode(void) const;
/**
* Set the rounding mode.
* @param roundingMode A rounding mode
* @draft ICU 60
*/
virtual void setRoundingMode(DecimalFormat::ERoundingMode roundingMode);
public:
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
@ -1059,7 +1074,6 @@ private:
void dispose();
void stripWhitespace(UnicodeString& src);
void initDefaultRuleSet();
void format(double number, NFRuleSet& ruleSet);
NFRuleSet* findRuleSet(const UnicodeString& name, UErrorCode& status) const;
/* friend access */
@ -1079,6 +1093,7 @@ private:
PluralFormat *createPluralFormat(UPluralType pluralType, const UnicodeString &pattern, UErrorCode& status) const;
UnicodeString& adjustForCapitalizationContext(int32_t startPos, UnicodeString& currentResult, UErrorCode& status) const;
UnicodeString& format(int64_t number, NFRuleSet *ruleSet, UnicodeString& toAppendTo, UErrorCode& status) const;
void format(double number, NFRuleSet& rs, UnicodeString& toAppendTo, UErrorCode& status) const;
private:
NFRuleSet **ruleSets;
@ -1090,6 +1105,7 @@ private:
DecimalFormatSymbols* decimalFormatSymbols;
NFRule *defaultInfinityRule;
NFRule *defaultNaNRule;
DecimalFormat::ERoundingMode roundingMode;
UBool lenient;
UnicodeString* lenientParseRules;
LocalizationInfo* localizations;

View File

@ -507,20 +507,43 @@ U_CAPI int32_t U_EXPORT2
unum_getAttribute(const UNumberFormat* fmt,
UNumberFormatAttribute attr)
{
const NumberFormat* nf = reinterpret_cast<const NumberFormat*>(fmt);
if ( attr == UNUM_LENIENT_PARSE ) {
// Supported for all subclasses
return nf->isLenient();
}
const NumberFormat* nf = reinterpret_cast<const NumberFormat*>(fmt);
if (attr == UNUM_LENIENT_PARSE) {
// Supported for all subclasses
return nf->isLenient();
}
else if (attr == UNUM_MAX_INTEGER_DIGITS) {
return nf->getMaximumIntegerDigits();
}
else if (attr == UNUM_MIN_INTEGER_DIGITS) {
return nf->getMinimumIntegerDigits();
}
else if (attr == UNUM_INTEGER_DIGITS) {
// TODO: what should this return?
return nf->getMinimumIntegerDigits();
}
else if (attr == UNUM_MAX_FRACTION_DIGITS) {
return nf->getMaximumFractionDigits();
}
else if (attr == UNUM_MIN_FRACTION_DIGITS) {
return nf->getMinimumFractionDigits();
}
else if (attr == UNUM_FRACTION_DIGITS) {
// TODO: what should this return?
return nf->getMinimumFractionDigits();
}
else if (attr == UNUM_ROUNDING_MODE) {
return nf->getRoundingMode();
}
// The remaining attributea are only supported for DecimalFormat
const DecimalFormat* df = dynamic_cast<const DecimalFormat*>(nf);
if (df != NULL) {
UErrorCode ignoredStatus = U_ZERO_ERROR;
return df->getAttribute( attr, ignoredStatus );
}
// The remaining attributes are only supported for DecimalFormat
const DecimalFormat* df = dynamic_cast<const DecimalFormat*>(nf);
if (df != NULL) {
UErrorCode ignoredStatus = U_ZERO_ERROR;
return df->getAttribute(attr, ignoredStatus);
}
return -1;
return -1;
}
U_CAPI void U_EXPORT2
@ -528,18 +551,42 @@ unum_setAttribute( UNumberFormat* fmt,
UNumberFormatAttribute attr,
int32_t newValue)
{
NumberFormat* nf = reinterpret_cast<NumberFormat*>(fmt);
if ( attr == UNUM_LENIENT_PARSE ) {
// Supported for all subclasses
// keep this here as the class may not be a DecimalFormat
return nf->setLenient(newValue != 0);
}
// The remaining attributea are only supported for DecimalFormat
DecimalFormat* df = dynamic_cast<DecimalFormat*>(nf);
if (df != NULL) {
UErrorCode ignoredStatus = U_ZERO_ERROR;
df->setAttribute(attr, newValue, ignoredStatus);
}
NumberFormat* nf = reinterpret_cast<NumberFormat*>(fmt);
if (attr == UNUM_LENIENT_PARSE) {
// Supported for all subclasses
// keep this here as the class may not be a DecimalFormat
return nf->setLenient(newValue != 0);
}
else if (attr == UNUM_MAX_INTEGER_DIGITS) {
return nf->setMaximumIntegerDigits(newValue);
}
else if (attr == UNUM_MIN_INTEGER_DIGITS) {
return nf->setMinimumIntegerDigits(newValue);
}
else if (attr == UNUM_INTEGER_DIGITS) {
nf->setMinimumIntegerDigits(newValue);
return nf->setMaximumIntegerDigits(newValue);
}
else if (attr == UNUM_MAX_FRACTION_DIGITS) {
return nf->setMaximumFractionDigits(newValue);
}
else if (attr == UNUM_MIN_FRACTION_DIGITS) {
return nf->setMinimumFractionDigits(newValue);
}
else if (attr == UNUM_FRACTION_DIGITS) {
nf->setMinimumFractionDigits(newValue);
return nf->setMaximumFractionDigits(newValue);
}
else if (attr == UNUM_ROUNDING_MODE) {
return nf->setRoundingMode((NumberFormat::ERoundingMode)newValue);
}
// The remaining attributes are only supported for DecimalFormat
DecimalFormat* df = dynamic_cast<DecimalFormat*>(nf);
if (df != NULL) {
UErrorCode ignoredStatus = U_ZERO_ERROR;
df->setAttribute(attr, newValue, ignoredStatus);
}
}
U_CAPI double U_EXPORT2

View File

@ -690,7 +690,6 @@ ZoneMeta::createMetazoneMappings(const UnicodeString &tzid) {
mzMappings = new UVector(deleteOlsonToMetaMappingEntry, NULL, status);
if (U_FAILURE(status)) {
delete mzMappings;
deleteOlsonToMetaMappingEntry(entry);
uprv_free(entry);
break;
}

View File

@ -64,6 +64,7 @@ static void TestCurrFmtNegSameAsPositive(void);
static void TestVariousStylesAndAttributes(void);
static void TestParseCurrPatternWithDecStyle(void);
static void TestFormatForFields(void);
static void TestRBNFRounding(void);
#define TESTCASE(x) addTest(root, &x, "tsformat/cnumtst/" #x)
@ -79,6 +80,7 @@ void addNumForTest(TestNode** root)
TESTCASE(TestCurrencyRegression);
TESTCASE(TestTextAttributeCrash);
TESTCASE(TestRBNFFormat);
TESTCASE(TestRBNFRounding);
TESTCASE(TestNBSPInPattern);
TESTCASE(TestInt64Parse);
TESTCASE(TestParseZero);
@ -1791,6 +1793,48 @@ static void TestRBNFFormat() {
}
}
/*
 * Verifies that a spellout (rule-based) formatter opened through the C API
 * accepts UNUM_MAX_FRACTION_DIGITS and UNUM_ROUNDING_MODE, and that setting a
 * rounding mode shortens the spelled-out fraction accordingly.
 *
 * A failed unum_formatDouble() leaves fmtbuf without a usable result, so the
 * string comparison is skipped in that case, and the error code is cleared so
 * that the later checks are not short-circuited by a stale failure.
 */
static void TestRBNFRounding(void) {
    UChar fmtbuf[FORMAT_BUF_CAPACITY];
    UChar expectedBuf[FORMAT_BUF_CAPACITY];
    UErrorCode status = U_ZERO_ERROR;
    UNumberFormat* fmt = unum_open(UNUM_SPELLOUT, NULL, 0, "en_US", NULL, &status);
    if (U_FAILURE(status)) {
        log_err_status(status, "unable to open spellout -> %s\n", u_errorName(status));
        return;
    }

    /* No rounding mode set yet: every fraction digit is spelled out. */
    unum_formatDouble(fmt, 10.123456789, fmtbuf, FORMAT_BUF_CAPACITY, NULL, &status);
    if (U_FAILURE(status)) {
        log_err_status(status, "unum_formatDouble 10.123456789 failed with %s\n", u_errorName(status));
        /* Let the remaining checks run with a clean error code. */
        status = U_ZERO_ERROR;
    } else {
        u_uastrcpy(expectedBuf, "ten point one two three four five six seven eight nine");
        if (u_strcmp(expectedBuf, fmtbuf) != 0) {
            log_err("Wrong result for unrounded value\n");
        }
    }

    /* The fraction-digit limit must be stored even though no rounding mode is set. */
    unum_setAttribute(fmt, UNUM_MAX_FRACTION_DIGITS, 3);
    if (unum_getAttribute(fmt, UNUM_MAX_FRACTION_DIGITS) != 3) {
        log_err("UNUM_MAX_FRACTION_DIGITS was incorrectly ignored -> %d\n", unum_getAttribute(fmt, UNUM_MAX_FRACTION_DIGITS));
    }
    if (unum_getAttribute(fmt, UNUM_ROUNDING_MODE) != UNUM_ROUND_UNNECESSARY) {
        log_err("UNUM_ROUNDING_MODE was set -> %d\n", unum_getAttribute(fmt, UNUM_ROUNDING_MODE));
    }

    unum_setAttribute(fmt, UNUM_ROUNDING_MODE, UNUM_ROUND_HALFUP);
    if (unum_getAttribute(fmt, UNUM_ROUNDING_MODE) != UNUM_ROUND_HALFUP) {
        log_err("UNUM_ROUNDING_MODE was not set -> %d\n", unum_getAttribute(fmt, UNUM_ROUNDING_MODE));
    }

    /* With a rounding mode in place, only three fraction digits remain. */
    unum_formatDouble(fmt, 10.123456789, fmtbuf, FORMAT_BUF_CAPACITY, NULL, &status);
    if (U_FAILURE(status)) {
        log_err_status(status, "unum_formatDouble 10.123456789 failed with %s\n", u_errorName(status));
    } else {
        u_uastrcpy(expectedBuf, "ten point one two three");
        if (u_strcmp(expectedBuf, fmtbuf) != 0) {
            char temp[512];
            u_austrcpy(temp, fmtbuf);
            log_err("Wrong result for rounded value. Got: %s\n", temp);
        }
    }

    unum_close(fmt);
}
static void TestCurrencyRegression(void) {
/*
I've found a case where unum_parseDoubleCurrency is not doing what I

View File

@ -68,11 +68,11 @@ void RBBIAPITest::TestCloneEquals()
b |= *bi1 == *bi2;
b |= *bi1 == *bi3;
if (b) {
errln((UnicodeString)"ERROR:1 RBBI's == and != operator failed.");
errln("%s:%d ERROR:1 RBBI's == and != operator failed.", __FILE__, __LINE__);
}
if(*bi2 == *biequal || *bi2 == *bi1 || *biequal == *bi3)
errln((UnicodeString)"ERROR:2 RBBI's == and != operator failed.");
errln("%s:%d ERROR:2 RBBI's == and != operator failed.", __FILE__, __LINE__);
// Quick test of RulesBasedBreakIterator assignment -
@ -90,15 +90,15 @@ void RBBIAPITest::TestCloneEquals()
RuleBasedBreakIterator biDefault, biDefault2;
if(U_FAILURE(status)){
errln((UnicodeString)"FAIL : in construction of default iterator");
errln("%s:%d FAIL : in construction of default iterator", __FILE__, __LINE__);
return;
}
if (biDefault == *bix) {
errln((UnicodeString)"ERROR: iterators should not compare ==");
errln("%s:%d ERROR: iterators should not compare ==", __FILE__, __LINE__);
return;
}
if (biDefault != biDefault2) {
errln((UnicodeString)"ERROR: iterators should compare ==");
errln("%s:%d ERROR: iterators should compare ==", __FILE__, __LINE__);
return;
}
@ -106,41 +106,41 @@ void RBBIAPITest::TestCloneEquals()
UnicodeString HelloString("Hello Kitty");
bix->setText(HelloString);
if (*bix == *bi2) {
errln(UnicodeString("ERROR: strings should not be equal before assignment."));
errln("%s:%d ERROR: strings should not be equal before assignment.", __FILE__, __LINE__);
}
*bix = *bi2;
if (*bix != *bi2) {
errln(UnicodeString("ERROR: strings should be equal before assignment."));
errln("%s:%d ERROR: strings should be equal before assignment.", __FILE__, __LINE__);
}
int bixnext = bix->next();
int bi2next = bi2->next();
if (! (bixnext == bi2next && bixnext == 7)) {
errln(UnicodeString("ERROR: iterators behaved differently after assignment."));
errln("%s:%d ERROR: iterators behaved differently after assignment.", __FILE__, __LINE__);
}
delete bix;
if (bi2->next() != 8) {
errln(UnicodeString("ERROR: iterator.next() failed after deleting copy."));
errln("%s:%d ERROR: iterator.next() failed after deleting copy.", __FILE__, __LINE__);
}
logln((UnicodeString)"Testing clone()");
RuleBasedBreakIterator* bi1clone=(RuleBasedBreakIterator*)bi1->clone();
RuleBasedBreakIterator* bi2clone=(RuleBasedBreakIterator*)bi2->clone();
RuleBasedBreakIterator* bi1clone = dynamic_cast<RuleBasedBreakIterator *>(bi1->clone());
RuleBasedBreakIterator* bi2clone = dynamic_cast<RuleBasedBreakIterator *>(bi2->clone());
if(*bi1clone != *bi1 || *bi1clone != *biequal ||
*bi1clone == *bi3 || *bi1clone == *bi2)
errln((UnicodeString)"ERROR:1 RBBI's clone() method failed");
errln("%s:%d ERROR:1 RBBI's clone() method failed", __FILE__, __LINE__);
if(*bi2clone == *bi1 || *bi2clone == *biequal ||
*bi2clone == *bi3 || *bi2clone != *bi2)
errln((UnicodeString)"ERROR:2 RBBI's clone() method failed");
errln("%s:%d ERROR:2 RBBI's clone() method failed", __FILE__, __LINE__);
if(bi1->getText() != bi1clone->getText() ||
bi2clone->getText() != bi2->getText() ||
*bi2clone == *bi1clone )
errln((UnicodeString)"ERROR: RBBI's clone() method failed");
errln("%s:%d ERROR: RBBI's clone() method failed", __FILE__, __LINE__);
delete bi1clone;
delete bi2clone;
@ -427,12 +427,12 @@ void RBBIAPITest::TestIteration()
int32_t i;
i = bi->first();
if (i != 0) {
errln("Incorrect value from bi->first(). Expected 0, got %d.", i);
errln("%s:%d Incorrect value from bi->first(). Expected 0, got %d.", __FILE__, __LINE__, i);
}
i = bi->last();
if (i != 10) {
errln("Incorrect value from bi->last(). Expected 10, got %d", i);
errln("%s:%d Incorrect value from bi->last(). Expected 10, got %d", __FILE__, __LINE__, i);
}
//
@ -441,14 +441,14 @@ void RBBIAPITest::TestIteration()
bi->last();
i = bi->previous();
if (i != 9) {
errln("Incorrect value from bi->last() at line %d. Expected 9, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->last(). Expected 9, got %d", __FILE__, __LINE__, i);
}
bi->first();
i = bi->previous();
if (i != BreakIterator::DONE) {
errln("Incorrect value from bi->previous() at line %d. Expected DONE, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->previous(). Expected DONE, got %d", __FILE__, __LINE__, i);
}
//
@ -457,13 +457,13 @@ void RBBIAPITest::TestIteration()
bi->first();
i = bi->next();
if (i != 1) {
errln("Incorrect value from bi->next() at line %d. Expected 1, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->next(). Expected 1, got %d", __FILE__, __LINE__, i);
}
bi->last();
i = bi->next();
if (i != BreakIterator::DONE) {
errln("Incorrect value from bi->next() at line %d. Expected DONE, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->next(). Expected DONE, got %d", __FILE__, __LINE__, i);
}
@ -473,27 +473,27 @@ void RBBIAPITest::TestIteration()
bi->first();
i = bi->current();
if (i != 0) {
errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->previous(). Expected 0, got %d", __FILE__, __LINE__, i);
}
bi->next();
i = bi->current();
if (i != 1) {
errln("Incorrect value from bi->previous() at line %d. Expected 1, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->previous(). Expected 1, got %d", __FILE__, __LINE__, i);
}
bi->last();
bi->next();
i = bi->current();
if (i != 10) {
errln("Incorrect value from bi->previous() at line %d. Expected 10, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->previous(). Expected 10, got %d", __FILE__, __LINE__, i);
}
bi->first();
bi->previous();
i = bi->current();
if (i != 0) {
errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->previous(). Expected 0, got %d", __FILE__, __LINE__, i);
}
@ -502,17 +502,17 @@ void RBBIAPITest::TestIteration()
//
i = bi->following(4);
if (i != 5) {
errln("Incorrect value from bi->following() at line %d. Expected 5, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->following(). Expected 5, got %d", __FILE__, __LINE__, i);
}
i = bi->following(9);
if (i != 10) {
errln("Incorrect value from bi->following() at line %d. Expected 10, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->following(). Expected 10, got %d", __FILE__, __LINE__, i);
}
i = bi->following(10);
if (i != BreakIterator::DONE) {
errln("Incorrect value from bi->following() at line %d. Expected DONE, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->following(). Expected DONE, got %d", __FILE__, __LINE__, i);
}
@ -521,22 +521,22 @@ void RBBIAPITest::TestIteration()
//
i = bi->preceding(4);
if (i != 3) {
errln("Incorrect value from bi->preceding() at line %d. Expected 3, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->preceding(). Expected 3, got %d", __FILE__, __LINE__, i);
}
i = bi->preceding(10);
if (i != 9) {
errln("Incorrect value from bi->preceding() at line %d. Expected 9, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->preceding(). Expected 9, got %d", __FILE__, __LINE__, i);
}
i = bi->preceding(1);
if (i != 0) {
errln("Incorrect value from bi->preceding() at line %d. Expected 0, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->preceding(). Expected 0, got %d", __FILE__, __LINE__, i);
}
i = bi->preceding(0);
if (i != BreakIterator::DONE) {
errln("Incorrect value from bi->preceding() at line %d. Expected DONE, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->preceding(). Expected DONE, got %d", __FILE__, __LINE__, i);
}
@ -545,20 +545,20 @@ void RBBIAPITest::TestIteration()
//
bi->first();
if (bi->isBoundary(3) != TRUE) {
errln("Incorrect value from bi->isBoudary() at line %d. Expected TRUE, got FALSE", __LINE__, i);
errln("%s:%d Incorrect value from bi->isBoudary(). Expected TRUE, got FALSE", __FILE__, __LINE__, i);
}
i = bi->current();
if (i != 3) {
errln("Incorrect value from bi->current() at line %d. Expected 3, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->current(). Expected 3, got %d", __FILE__, __LINE__, i);
}
if (bi->isBoundary(11) != FALSE) {
errln("Incorrect value from bi->isBoudary() at line %d. Expected FALSE, got TRUE", __LINE__, i);
errln("%s:%d Incorrect value from bi->isBoudary(). Expected FALSE, got TRUE", __FILE__, __LINE__, i);
}
i = bi->current();
if (i != 10) {
errln("Incorrect value from bi->current() at line %d. Expected 10, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->current(). Expected 10, got %d", __FILE__, __LINE__, i);
}
//
@ -567,18 +567,18 @@ void RBBIAPITest::TestIteration()
bi->first();
i = bi->next(4);
if (i != 4) {
errln("Incorrect value from bi->next() at line %d. Expected 4, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->next(). Expected 4, got %d", __FILE__, __LINE__, i);
}
i = bi->next(6);
if (i != 10) {
errln("Incorrect value from bi->next() at line %d. Expected 10, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->next(). Expected 10, got %d", __FILE__, __LINE__, i);
}
bi->first();
i = bi->next(11);
if (i != BreakIterator::DONE) {
errln("Incorrect value from bi->next() at line %d. Expected BreakIterator::DONE, got %d", __LINE__, i);
errln("%s:%d Incorrect value from bi->next(). Expected BreakIterator::DONE, got %d", __FILE__, __LINE__, i);
}
delete bi;
@ -666,7 +666,7 @@ void RBBIAPITest::TestRuleStatus() {
BreakIterator *bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
if(U_FAILURE(status)) {
errcheckln(status, "Fail : in construction - %s", u_errorName(status));
errcheckln(status, "%s:%d Fail in construction - %s", __FILE__, __LINE__, u_errorName(status));
} else {
bi->setText(testString1);
// First test that the breaks are in the right spots.
@ -677,12 +677,12 @@ void RBBIAPITest::TestRuleStatus() {
int32_t pos, tag;
for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
if (pos != bounds1[i]) {
errln("FAIL: unexpected word break at postion %d", pos);
errln("%s:%d FAIL: unexpected word break at postion %d", __FILE__, __LINE__, pos);
break;
}
tag = bi->getRuleStatus();
if (tag < tag_lo[i] || tag >= tag_hi[i]) {
errln("FAIL: incorrect tag value %d at position %d", tag, pos);
errln("%s:%d FAIL: incorrect tag value %d at position %d", __FILE__, __LINE__, tag, pos);
break;
}
@ -703,7 +703,7 @@ void RBBIAPITest::TestRuleStatus() {
bi = BreakIterator::createLineInstance(Locale::getEnglish(), status);
if(U_FAILURE(status)) {
errcheckln(status, "failed to create word break iterator. - %s", u_errorName(status));
errcheckln(status, "%s:%d failed to create line break iterator. - %s", __FILE__, __LINE__, u_errorName(status));
} else {
int32_t i = 0;
int32_t pos, tag;
@ -724,8 +724,8 @@ void RBBIAPITest::TestRuleStatus() {
success = FALSE; break;
}
if (success == FALSE) {
errln("Fail: incorrect word break status or position. i=%d, pos=%d, tag=%d",
i, pos, tag);
errln("%s:%d: incorrect line break status or position. i=%d, pos=%d, tag=%d",
__FILE__, __LINE__, i, pos, tag);
break;
}
pos = bi->next();
@ -734,7 +734,7 @@ void RBBIAPITest::TestRuleStatus() {
if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT ||
UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT ||
(UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT)) {
errln("UBRK_LINE_* constants from header are inconsistent.");
errln("%s:%d UBRK_LINE_* constants from header are inconsistent.", __FILE__, __LINE__);
}
}
delete bi;

View File

@ -73,7 +73,7 @@ BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
fCharClassList.adoptInstead(new UVector(status));
fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
"(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative lookbehind for '{' or '=' or '[:'
"(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:'
// (the identifier is a unicode property name or value)
"(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
0, status));
@ -86,7 +86,7 @@ BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
"\\R$" // new-line at end of line.
), 0, status));
// Match (initial parse) of a character class defintion line.
// Match (initial parse) of a character class definition line.
fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
"[ \\t]*" // leading white space
"(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
@ -129,7 +129,7 @@ CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeStri
}
fSetRefsMatcher->appendTail(expandedDef);
// Verify that the expanded set defintion is valid.
// Verify that the expanded set definition is valid.
if (fMonkeyImpl->fDumpExpansions) {
printf("epandedDef: %s\n", CStr(expandedDef)());
@ -149,7 +149,7 @@ CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeStri
if (previousClass != NULL) {
// Duplicate class def.
// These are legitimate, they are adustments of an existing class.
// These are legitimate, they are adjustments of an existing class.
// TODO: will need to keep the old around when we handle tailorings.
IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
delete previousClass;

View File

@ -53,7 +53,6 @@
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
@ -74,7 +73,6 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
#if !UCONFIG_NO_FILE_IO
TESTCASE_AUTO(TestBug4153072);
#endif
TESTCASE_AUTO(TestStatusReturn);
#if !UCONFIG_NO_FILE_IO
TESTCASE_AUTO(TestUnicodeFiles);
TESTCASE_AUTO(TestEmptyString);
@ -107,6 +105,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestBug12918);
TESTCASE_AUTO(TestBug12932);
TESTCASE_AUTO(TestEmoji);
TESTCASE_AUTO(TestBug12519);
TESTCASE_AUTO_END;
}
@ -266,51 +265,6 @@ RBBITest::RBBITest() {
RBBITest::~RBBITest() {
}
//-----------------------------------------------------------------------------------
//
// Test for status {tag} return value from break rules.
// TODO: a more thorough test.
//
//-----------------------------------------------------------------------------------
void RBBITest::TestStatusReturn() {
UnicodeString rulesString1("$Letters = [:L:];\n"
"$Numbers = [:N:];\n"
"$Letters+{1};\n"
"$Numbers+{2};\n"
"Help\\ /me\\!{4};\n"
"[^$Letters $Numbers];\n"
"!.*;\n", -1, US_INV);
UnicodeString testString1 = "abc123..abc Help me Help me!";
// 01234567890123456789012345678
int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
UErrorCode status=U_ZERO_ERROR;
UParseError parseError;
LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
if(U_FAILURE(status)) {
dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
return;
}
int32_t pos;
int32_t i = 0;
bi->setText(testString1);
for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
if (pos != bounds1[i]) {
errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
break;
}
int tag = bi->getRuleStatus();
if (tag != brkStatus[i]) {
errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
break;
}
i++;
}
}
static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
UErrorCode status = U_ZERO_ERROR;
@ -1131,34 +1085,27 @@ void RBBITest::TestExtended() {
UErrorCode status = U_ZERO_ERROR;
Locale locale("");
UnicodeString rules;
TestParams tp(status);
RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
if (U_FAILURE(status)) {
dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
}
//
// Open and read the test data file.
//
const char *testDataDirectory = IntlTest::getSourceTestData(status);
char testFileName[1000];
if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
errln("Can't open test data. Path too long.");
return;
}
strcpy(testFileName, testDataDirectory);
strcat(testFileName, "rbbitst.txt");
CharString testFileName(testDataDirectory, -1, status);
testFileName.append("rbbitst.txt", -1, status);
int len;
UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
if (U_FAILURE(status)) {
return; /* something went wrong, error already output */
errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
return;
}
bool skipTest = false; // Skip this test?
//
@ -1170,7 +1117,8 @@ void RBBITest::TestExtended() {
PARSE_COMMENT,
PARSE_TAG,
PARSE_DATA,
PARSE_NUM
PARSE_NUM,
PARSE_RULES
}
parseState = PARSE_TAG;
@ -1181,7 +1129,10 @@ void RBBITest::TestExtended() {
int32_t column = 0;
int32_t charIdx = 0;
int32_t tagValue = 0; // The numeric value of a <nnn> tag.
int32_t tagValue = 0; // The numeric value of a <nnn> tag.
UnicodeString rules; // Holds rules from a <rules> ... </rules> block
int32_t rulesFirstLine; // Line number of the start of current <rules> block
for (charIdx = 0; charIdx < len; ) {
status = U_ZERO_ERROR;
@ -1215,41 +1166,50 @@ void RBBITest::TestExtended() {
if (u_isUWhiteSpace(c)) {
break;
}
if (testString.compare(charIdx-1, 6, "<word>") == 0) {
if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createWordInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
if (testString.compare(charIdx-1, 6, "<char>") == 0) {
if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createCharacterInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
if (testString.compare(charIdx-1, 6, "<line>") == 0) {
if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createLineInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createSentenceInstance(locale, status);
skipTest = false;
charIdx += 5;
break;
}
if (testString.compare(charIdx-1, 7, "<title>") == 0) {
if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createTitleInstance(locale, status);
charIdx += 6;
break;
}
if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
charIdx = testString.indexOf(u'>', charIdx) + 1;
parseState = PARSE_RULES;
rules.remove();
rulesFirstLine = lineNum;
break;
}
// <locale loc_name>
localeMatcher.reset(testString);
if (localeMatcher.lookingAt(charIdx-1, status)) {
@ -1261,7 +1221,7 @@ void RBBITest::TestExtended() {
TEST_ASSERT_SUCCESS(status);
break;
}
if (testString.compare(charIdx-1, 6, "<data>") == 0) {
if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
parseState = PARSE_DATA;
charIdx += 5;
tp.dataToBreak = "";
@ -1278,6 +1238,33 @@ void RBBITest::TestExtended() {
}
break;
case PARSE_RULES:
if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
charIdx += 7;
parseState = PARSE_TAG;
delete tp.bi;
UParseError pe;
tp.bi = new RuleBasedBreakIterator(rules, pe, status);
skipTest = U_FAILURE(status);
if (U_FAILURE(status)) {
errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
rulesFirstLine + pe.line - 1, u_errorName(status));
}
} else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
charIdx += 10;
parseState = PARSE_TAG;
UErrorCode ec = U_ZERO_ERROR;
UParseError pe;
RuleBasedBreakIterator bi(rules, pe, ec);
if (U_SUCCESS(ec)) {
errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
rulesFirstLine + pe.line - 1);
}
} else {
rules.append(c);
}
break;
case PARSE_DATA:
if (c == u'') {
int32_t breakIdx = tp.dataToBreak.length();
@ -1290,7 +1277,7 @@ void RBBITest::TestExtended() {
break;
}
if (testString.compare(charIdx-1, 7, "</data>") == 0) {
if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
// Add final entry to mappings from break location to source file position.
// Need one extra because last break position returned is after the
// last char in the data, not at the last char.
@ -1316,7 +1303,7 @@ void RBBITest::TestExtended() {
break;
}
if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
// Named character, e.g. \N{COMBINING GRAVE ACCENT}
// Get the code point from the name and insert it into the test data.
// (Damn, no API takes names in Unicode !!!
@ -1355,8 +1342,7 @@ void RBBITest::TestExtended() {
if (testString.compare(charIdx-1, 2, "<>") == 0) {
if (testString.compare(charIdx-1, 2, u"<>") == 0) {
charIdx++;
int32_t breakIdx = tp.dataToBreak.length();
tp.expectedBreaks->setSize(breakIdx+1);
@ -1469,7 +1455,7 @@ void RBBITest::TestExtended() {
if (U_FAILURE(status)) {
dataerrln("ICU Error %s while parsing test file at line %d.",
errln("ICU Error %s while parsing test file at line %d.",
u_errorName(status), lineNum);
status = U_ZERO_ERROR;
goto end_test; // Stop the test
@ -1477,6 +1463,17 @@ void RBBITest::TestExtended() {
}
// Reached end of test file. Raise an error if parseState indicates that we are
// within a block that should have been terminated.
if (parseState == PARSE_RULES) {
errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
lineNum, rulesFirstLine);
}
if (parseState == PARSE_DATA) {
errln("rbbitst.txt:%d <data> block not closed.", lineNum);
}
end_test:
delete [] testFile;
#endif
@ -3762,16 +3759,16 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count] = i;
if (count < expectedcount && expected[count] != i) {
test->errln("break forward test failed: expected %d but got %d",
expected[count], i);
test->errln("%s:%d break forward test failed: expected %d but got %d",
__FILE__, __LINE__, expected[count], i);
break;
}
count ++;
}
if (count != expectedcount) {
printStringBreaks(ustr, expected, expectedcount);
test->errln("break forward test failed: missed %d match",
expectedcount - count);
test->errln("%s:%d break forward test failed: missed %d match",
__FILE__, __LINE__, expectedcount - count);
return;
}
// testing boundaries
@ -3779,13 +3776,15 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
int j = expected[i - 1];
if (!bi->isBoundary(j)) {
printStringBreaks(ustr, expected, expectedcount);
test->errln("isBoundary() failed. Expected boundary at position %d", j);
test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
__FILE__, __LINE__, j);
return;
}
for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
if (bi->isBoundary(j)) {
printStringBreaks(ustr, expected, expectedcount);
test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
__FILE__, __LINE__, j);
return;
}
}
@ -3795,8 +3794,8 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
count --;
if (forward[count] != i) {
printStringBreaks(ustr, expected, expectedcount);
test->errln("happy break test previous() failed: expected %d but got %d",
forward[count], i);
test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
__FILE__, __LINE__, forward[count], i);
break;
}
}
@ -3811,9 +3810,12 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
// int j = expected[i] + 1;
int j = ustr.moveIndex32(expected[i], 1);
for (; j <= expected[i + 1]; j ++) {
if (bi->preceding(j) != expected[i]) {
int32_t expectedPreceding = expected[i];
int32_t actualPreceding = bi->preceding(j);
if (actualPreceding != expectedPreceding) {
printStringBreaks(ustr, expected, expectedcount);
test->errln("preceding(): Not expecting boundary at position %d", j);
test->errln("%s:%d preceding(%d): expected %d, got %d",
__FILE__, __LINE__, j, expectedPreceding, actualPreceding);
return;
}
}
@ -3905,7 +3907,12 @@ void RBBITest::TestWordBoundary(void)
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
if (U_FAILURE(status)) {
errcheckln(status, "%s:%d Creation of break iterator failed %s",
__FILE__, __LINE__, u_errorName(status));
return;
}
UChar str[50];
static const char *strlist[] =
{
@ -3940,43 +3947,44 @@ void RBBITest::TestWordBoundary(void)
"\\u003b\\u0027\\u00b7\\u47a3",
};
int loop;
if (U_FAILURE(status)) {
errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
return;
}
for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
// printf("looping %d\n", loop);
u_unescape(strlist[loop], str, 20);
u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
UnicodeString ustr(str);
int forward[50];
int count = 0;
bi->setText(ustr);
int prev = 0;
int i;
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count ++] = i;
if (i > prev) {
int j;
for (j = prev + 1; j < i; j ++) {
if (bi->isBoundary(j)) {
printStringBreaks(ustr, forward, count);
errln("happy boundary test failed: expected %d not a boundary",
j);
return;
}
}
}
if (!bi->isBoundary(i)) {
printStringBreaks(ustr, forward, count);
errln("happy boundary test failed: expected %d a boundary",
i);
int prev = -1;
for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
++count;
if (count >= UPRV_LENGTHOF(forward)) {
errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
__FILE__, __LINE__, loop, count, boundary);
return;
}
prev = i;
forward[count] = boundary;
if (boundary <= prev) {
errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
__FILE__, __LINE__, loop, prev, boundary);
break;
}
for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
if (bi->isBoundary(nonBoundary)) {
printStringBreaks(ustr, forward, count);
errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
__FILE__, __LINE__, loop, prev, nonBoundary, boundary);
return;
}
}
if (!bi->isBoundary(boundary)) {
printStringBreaks(ustr, forward, count);
errln("%s:%d happy boundary test failed: expected %d a boundary",
__FILE__, __LINE__, boundary);
return;
}
prev = boundary;
}
}
delete bi;
}
void RBBITest::TestLineBreaks(void)
@ -4792,6 +4800,40 @@ void RBBITest::TestEmoji() {
}
// TestBug12519 - Correct handling of Locales by assignment / copy / clone
// WHERE Macro yields a literal string of the form "source_file_name:line number "
// TODO: propose something equivalent as a test framework addition.
#define WHERE __FILE__ ":" XLINE(__LINE__) " "
#define XLINE(s) LINE(s)
#define LINE(s) #s
void RBBITest::TestBug12519() {
UErrorCode status = U_ZERO_ERROR;
LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
assertSuccess(WHERE, status);
assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
assertTrue(WHERE, *biEn == *cloneEn);
assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
assertTrue(WHERE, *biFr == *cloneFr);
assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
UnicodeString text("Hallo Welt");
biDe->setText(text);
assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
*biDe = *biFr;
assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
}
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked

View File

@ -41,11 +41,6 @@ public:
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
/**
* Tests rule status return values
**/
void TestStatusReturn();
void TestEmptyString();
void TestGetAvailableLocales();
void TestGetDisplayName();
@ -79,6 +74,7 @@ public:
void TestBug12918();
void TestBug12932();
void TestEmoji();
void TestBug12519();
void TestDebug();
void TestProperties();

View File

@ -57,6 +57,8 @@ public:
void TestMalformedUTF8();
void TestBufferOverflow();
void TestEdits();
void TestCopyMoveEdits();
void TestMergeEdits();
void TestCaseMapWithEdits();
void TestCaseMapUTF8WithEdits();
void TestLongUnicodeString();
@ -94,6 +96,8 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
TESTCASE_AUTO(TestMalformedUTF8);
TESTCASE_AUTO(TestBufferOverflow);
TESTCASE_AUTO(TestEdits);
TESTCASE_AUTO(TestCopyMoveEdits);
TESTCASE_AUTO(TestMergeEdits);
TESTCASE_AUTO(TestCaseMapWithEdits);
TESTCASE_AUTO(TestCaseMapUTF8WithEdits);
TESTCASE_AUTO(TestLongUnicodeString);
@ -966,6 +970,225 @@ void StringCaseTest::TestEdits() {
assertFalse("reset then iterator", ei.next(errorCode));
}
// Exercises the Edits copy constructor, copy assignment, move constructor and
// move assignment, followed by a default-constructed Edits::Iterator.
// Statement order matters: objects are deliberately inspected after being moved from.
void StringCaseTest::TestCopyMoveEdits() {
IcuTestErrorCode errorCode(*this, "TestCopyMoveEdits");
// Exceed the stack array capacity.
Edits a;
// NOTE(review): presumably addReplace(oldLength, newLength); each of the 250
// replacements is then one unit longer than what it replaces, matching the
// length delta of 250 asserted below.
for (int32_t i = 0; i < 250; ++i) {
a.addReplace(i % 10, (i % 10) + 1);
}
assertEquals("a: many edits, length delta", 250, a.lengthDelta());
// copy
Edits b(a);
assertEquals("b: copy of many edits, length delta", 250, b.lengthDelta());
assertEquals("a remains: many edits, length delta", 250, a.lengthDelta());
TestUtility::checkEqualEdits(*this, u"b copy of a", a, b, errorCode);
// assign
Edits c;
// Give c some unrelated contents first so the assignment has something to replace.
c.addUnchanged(99);
c.addReplace(88, 77);
c = b;
assertEquals("c: assigned many edits, length delta", 250, c.lengthDelta());
assertEquals("b remains: many edits, length delta", 250, b.lengthDelta());
TestUtility::checkEqualEdits(*this, u"c = b", b, c, errorCode);
// move constructor empties object with heap array
Edits d(std::move(a));
assertEquals("d: move-constructed many edits, length delta", 250, d.lengthDelta());
// a is intentionally used after the move: it is expected to look like an empty Edits.
assertFalse("a moved away: no more hasChanges", a.hasChanges());
TestUtility::checkEqualEdits(*this, u"d() <- a", d, b, errorCode);
Edits empty;
TestUtility::checkEqualEdits(*this, u"a moved away", empty, a, errorCode);
// move assignment empties object with heap array
Edits e;
e.addReplace(0, 1000);
e = std::move(b);
assertEquals("e: move-assigned many edits, length delta", 250, e.lengthDelta());
// b is intentionally used after the move, as with a above.
assertFalse("b moved away: no more hasChanges", b.hasChanges());
TestUtility::checkEqualEdits(*this, u"e <- b", e, c, errorCode);
TestUtility::checkEqualEdits(*this, u"b moved away", empty, b, errorCode);
// Edits::Iterator default constructor.
Edits::Iterator iter;
assertFalse("Edits::Iterator().next()", iter.next(errorCode));
assertSuccess("Edits::Iterator().next()", errorCode);
// Assigning a real iterator to the default-constructed one must make it usable.
iter = e.getFineChangesIterator();
assertTrue("iter.next()", iter.next(errorCode));
assertSuccess("iter.next()", errorCode);
assertTrue("iter.hasChange()", iter.hasChange());
assertEquals("iter.newLength()", 1, iter.newLength());
}
// Tests Edits::mergeAndAppend().
// Three edit lists are built in lock step: ab and bc are the two inputs and
// expected_ac is the result the merge is expected to produce.
// The merged output ac is compared against expected_ac with
// TestUtility::checkEqualEdits(); the function returns early on the first mismatch.
void StringCaseTest::TestMergeEdits() {
// For debugging, set -v to see matching edits up to a failure.
IcuTestErrorCode errorCode(*this, "TestMergeEdits");
Edits ab, bc, ac, expected_ac;
// Simple: Two parallel non-changes.
ab.addUnchanged(2);
bc.addUnchanged(2);
expected_ac.addUnchanged(2);
// Simple: Two aligned changes.
ab.addReplace(3, 2);
bc.addReplace(2, 1);
expected_ac.addReplace(3, 1);
// Unequal non-changes.
ab.addUnchanged(5);
bc.addUnchanged(3);
expected_ac.addUnchanged(3);
// ab ahead by 2
// Overlapping changes accumulate until they share a boundary.
ab.addReplace(4, 3);
bc.addReplace(3, 2);
ab.addReplace(4, 3);
bc.addReplace(3, 2);
ab.addReplace(4, 3);
bc.addReplace(3, 2);
bc.addUnchanged(4);
expected_ac.addReplace(14, 8);
// bc ahead by 2
// Balance out intermediate-string lengths.
ab.addUnchanged(2);
expected_ac.addUnchanged(2);
// Insert something and delete it: Should disappear.
ab.addReplace(0, 5);
ab.addReplace(0, 2);
bc.addReplace(7, 0);
// Parallel change to make a new boundary.
ab.addReplace(1, 2);
bc.addReplace(2, 3);
expected_ac.addReplace(1, 3);
// Multiple ab deletions should remain separate at the boundary.
ab.addReplace(1, 0);
ab.addReplace(2, 0);
ab.addReplace(3, 0);
expected_ac.addReplace(1, 0);
expected_ac.addReplace(2, 0);
expected_ac.addReplace(3, 0);
// Unequal non-changes can be split for another boundary.
ab.addUnchanged(2);
bc.addUnchanged(1);
expected_ac.addUnchanged(1);
// ab ahead by 1
// Multiple bc insertions should create a boundary and remain separate.
bc.addReplace(0, 4);
bc.addReplace(0, 5);
bc.addReplace(0, 6);
expected_ac.addReplace(0, 4);
expected_ac.addReplace(0, 5);
expected_ac.addReplace(0, 6);
// ab ahead by 1
// Multiple ab deletions in the middle of a bc change are merged.
bc.addReplace(2, 2);
// bc ahead by 1
ab.addReplace(1, 0);
ab.addReplace(2, 0);
ab.addReplace(3, 0);
ab.addReplace(4, 1);
expected_ac.addReplace(11, 2);
// Multiple bc insertions in the middle of an ab change are merged.
ab.addReplace(5, 6);
bc.addReplace(3, 3);
// ab ahead by 3
bc.addReplace(0, 4);
bc.addReplace(0, 5);
bc.addReplace(0, 6);
bc.addReplace(3, 7);
expected_ac.addReplace(5, 25);
// Delete around a deletion.
ab.addReplace(4, 4);
ab.addReplace(3, 0);
ab.addUnchanged(2);
bc.addReplace(2, 2);
bc.addReplace(4, 0);
expected_ac.addReplace(9, 2);
// Insert into an insertion.
ab.addReplace(0, 2);
bc.addReplace(1, 1);
bc.addReplace(0, 8);
bc.addUnchanged(4);
expected_ac.addReplace(0, 10);
// bc ahead by 3
// Balance out intermediate-string lengths.
ab.addUnchanged(3);
expected_ac.addUnchanged(3);
// Deletions meet insertions.
// Output order is arbitrary in principle, but we expect insertions first
// and want to keep it that way.
ab.addReplace(2, 0);
ab.addReplace(4, 0);
ab.addReplace(6, 0);
bc.addReplace(0, 1);
bc.addReplace(0, 3);
bc.addReplace(0, 5);
expected_ac.addReplace(0, 1);
expected_ac.addReplace(0, 3);
expected_ac.addReplace(0, 5);
expected_ac.addReplace(2, 0);
expected_ac.addReplace(4, 0);
expected_ac.addReplace(6, 0);
// End with a non-change, so that further edits are never reordered.
ab.addUnchanged(1);
bc.addUnchanged(1);
expected_ac.addUnchanged(1);
// First merge: everything built above, into the still-empty ac.
ac.mergeAndAppend(ab, bc, errorCode);
assertSuccess("ab+bc", errorCode);
if (!TestUtility::checkEqualEdits(*this, u"ab+bc", expected_ac, ac, errorCode)) {
return;
}
// Append more Edits.
Edits ab2, bc2;
ab2.addUnchanged(5);
bc2.addReplace(1, 2);
bc2.addUnchanged(4);
expected_ac.addReplace(1, 2);
expected_ac.addUnchanged(4);
ac.mergeAndAppend(ab2, bc2, errorCode);
assertSuccess("ab2+bc2", errorCode);
if (!TestUtility::checkEqualEdits(*this, u"ab2+bc2", expected_ac, ac, errorCode)) {
return;
}
// Append empty edits.
Edits empty;
ac.mergeAndAppend(empty, empty, errorCode);
assertSuccess("empty+empty", errorCode);
if (!TestUtility::checkEqualEdits(*this, u"empty+empty", expected_ac, ac, errorCode)) {
return;
}
// Error: Append more edits with mismatched intermediate-string lengths.
Edits mismatch;
mismatch.addReplace(1, 1);
ac.mergeAndAppend(ab2, mismatch, errorCode);
assertEquals("ab2+mismatch", U_ILLEGAL_ARGUMENT_ERROR, errorCode.get());
// Clear the expected error so the next call starts from a clean error code.
errorCode.reset();
ac.mergeAndAppend(mismatch, bc2, errorCode);
assertEquals("mismatch+bc2", U_ILLEGAL_ARGUMENT_ERROR, errorCode.get());
errorCode.reset();
}
void StringCaseTest::TestCaseMapWithEdits() {
IcuTestErrorCode errorCode(*this, "TestEdits");
UChar dest[20];

View File

@ -10,6 +10,8 @@
**********************************************************************
*/
#include <algorithm>
#include <vector>
#include "unicode/utypes.h"
#include "unicode/edits.h"
#include "unicode/unistr.h"
@ -65,6 +67,100 @@ UnicodeString TestUtility::hex(const uint8_t* bytes, int32_t len) {
return buf;
}
namespace {
// Formats the iterator's current edit as "old->new" for a change
// or "old=new" for an unchanged span.
UnicodeString printOneEdit(const Edits::Iterator &ei) {
    const UnicodeString separator(ei.hasChange() ? u"->" : u"=");
    return UnicodeString() + ei.oldLength() + separator + ei.newLength();
}
/**
* Maps indexes according to the expected edits.
* A destination index can occur multiple times when there are source deletions.
* Map according to the last occurrence, normally in a non-empty destination span.
* Simplest is to search from the back.
*/
int32_t srcIndexFromDest(const EditChange expected[], int32_t expLength,
int32_t srcLength, int32_t destLength, int32_t index) {
int32_t srcIndex = srcLength;
int32_t destIndex = destLength;
int32_t i = expLength;
while (index < destIndex && i > 0) {
--i;
int32_t prevSrcIndex = srcIndex - expected[i].oldLength;
int32_t prevDestIndex = destIndex - expected[i].newLength;
if (index == prevDestIndex) {
return prevSrcIndex;
} else if (index > prevDestIndex) {
if (expected[i].change) {
// In a change span, map to its end.
return srcIndex;
} else {
// In an unchanged span, offset within it.
return prevSrcIndex + (index - prevDestIndex);
}
}
srcIndex = prevSrcIndex;
destIndex = prevDestIndex;
}
// index is outside the string.
return srcIndex;
}
int32_t destIndexFromSrc(const EditChange expected[], int32_t expLength,
int32_t srcLength, int32_t destLength, int32_t index) {
int32_t srcIndex = srcLength;
int32_t destIndex = destLength;
int32_t i = expLength;
while (index < srcIndex && i > 0) {
--i;
int32_t prevSrcIndex = srcIndex - expected[i].oldLength;
int32_t prevDestIndex = destIndex - expected[i].newLength;
if (index == prevSrcIndex) {
return prevDestIndex;
} else if (index > prevSrcIndex) {
if (expected[i].change) {
// In a change span, map to its end.
return destIndex;
} else {
// In an unchanged span, offset within it.
return prevDestIndex + (index - prevSrcIndex);
}
}
srcIndex = prevSrcIndex;
destIndex = prevDestIndex;
}
// index is outside the string.
return destIndex;
}
} // namespace
/**
 * Steps through the fine-grained iterators of two Edits objects in parallel and
 * asserts that they yield the same sequence of edits.
 * For debugging, set -v to see matching edits up to a failure.
 *
 * @return TRUE if every assertion passed
 */
UBool TestUtility::checkEqualEdits(IntlTest &test, const UnicodeString &name,
                                   const Edits &e1, const Edits &e2, UErrorCode &errorCode) {
    Edits::Iterator iter1 = e1.getFineIterator();
    Edits::Iterator iter2 = e2.getFineIterator();
    UBool ok = TRUE;
    int32_t i = 0;
    // Stop at the first failed assertion, or when either iterator is exhausted.
    while (ok) {
        const UBool hasNext1 = iter1.next(errorCode);
        const UBool hasNext2 = iter2.next(errorCode);
        ok &= test.assertEquals(name + u" next()[" + i + u"]" + __LINE__,
                                hasNext1, hasNext2);
        ok &= test.assertSuccess(name + u" errorCode[" + i + u"]" + __LINE__, errorCode);
        ok &= test.assertEquals(name + u" edit[" + i + u"]" + __LINE__,
                                printOneEdit(iter1), printOneEdit(iter2));
        if (!(hasNext1 && hasNext2)) {
            break;
        }
        test.logln();
        ++i;
    }
    return ok;
}
void TestUtility::checkEditsIter(
IntlTest &test,
const UnicodeString &name,
@ -77,8 +173,6 @@ void TestUtility::checkEditsIter(
int32_t expSrcIndex = 0;
int32_t expDestIndex = 0;
int32_t expReplIndex = 0;
int32_t expSrcIndexFromDest = 0; // for sourceIndexFromDestinationIndex()
int32_t expDestIndexFromSrc = 0; // for destinationIndexFromSourceIndex()
for (int32_t expIndex = 0; expIndex < expLength; ++expIndex) {
const EditChange &expect = expected[expIndex];
UnicodeString msg = UnicodeString(name).append(u' ') + expIndex;
@ -92,7 +186,7 @@ void TestUtility::checkEditsIter(
test.assertEquals(msg + u":" + __LINE__, expReplIndex, ei1.replacementIndex());
}
if (expect.oldLength > 0 && expDestIndex == expDestIndexFromSrc) {
if (expect.oldLength > 0) {
test.assertTrue(msg + u":" + __LINE__, ei2.findSourceIndex(expSrcIndex, errorCode));
test.assertEquals(msg + u":" + __LINE__, expect.change, ei2.hasChange());
test.assertEquals(msg + u":" + __LINE__, expect.oldLength, ei2.oldLength());
@ -108,7 +202,7 @@ void TestUtility::checkEditsIter(
}
}
if (expect.newLength > 0 && expSrcIndex == expSrcIndexFromDest) {
if (expect.newLength > 0) {
test.assertTrue(msg + u":" + __LINE__, ei2.findDestinationIndex(expDestIndex, errorCode));
test.assertEquals(msg + u":" + __LINE__, expect.change, ei2.hasChange());
test.assertEquals(msg + u":" + __LINE__, expect.oldLength, ei2.oldLength());
@ -124,45 +218,11 @@ void TestUtility::checkEditsIter(
}
}
// Span starts.
test.assertEquals(name + u":" + __LINE__, expDestIndexFromSrc,
ei2.destinationIndexFromSourceIndex(expSrcIndex, errorCode));
test.assertEquals(name + u":" + __LINE__, expSrcIndexFromDest,
ei2.sourceIndexFromDestinationIndex(expDestIndex, errorCode));
// Inside unchanged span map offsets 1:1.
if (!expect.change && expect.oldLength >= 2) {
test.assertEquals(name + u":" + __LINE__, expDestIndex + 1,
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1, errorCode));
test.assertEquals(name + u":" + __LINE__, expSrcIndex + 1,
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1, errorCode));
}
// Inside change span map to the span limit.
int32_t expSrcLimit = expSrcIndex + expect.oldLength;
int32_t expDestLimit = expDestIndex + expect.newLength;
if (expect.change) {
if (expect.oldLength >= 2) {
test.assertEquals(name + u":" + __LINE__, expDestLimit,
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1, errorCode));
}
if (expect.newLength >= 2) {
test.assertEquals(name + u":" + __LINE__, expSrcLimit,
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1, errorCode));
}
}
expSrcIndex = expSrcLimit;
expDestIndex = expDestLimit;
expSrcIndex += expect.oldLength;
expDestIndex += expect.newLength;
if (expect.change) {
expReplIndex += expect.newLength;
}
if (expect.newLength > 0) {
expSrcIndexFromDest = expSrcIndex;
}
if (expect.oldLength > 0) {
expDestIndexFromSrc = expDestIndex;
}
}
UnicodeString msg = UnicodeString(name).append(u" end");
test.assertFalse(msg + u":" + __LINE__, ei1.next(errorCode));
@ -175,8 +235,47 @@ void TestUtility::checkEditsIter(
test.assertFalse(name + u":" + __LINE__, ei2.findSourceIndex(expSrcIndex, errorCode));
test.assertFalse(name + u":" + __LINE__, ei2.findDestinationIndex(expDestIndex, errorCode));
test.assertEquals(name + u":" + __LINE__, expDestIndex,
ei2.destinationIndexFromSourceIndex(expSrcIndex, errorCode));
test.assertEquals(name + u":" + __LINE__, expSrcIndex,
ei2.sourceIndexFromDestinationIndex(expDestIndex, errorCode));
// Check mapping of all indexes against a simple implementation
// that works on the expected changes.
// Iterate once forward, once backward, to cover more runtime conditions.
int32_t srcLength = expSrcIndex;
int32_t destLength = expDestIndex;
std::vector<int32_t> srcIndexes;
std::vector<int32_t> destIndexes;
srcIndexes.push_back(-1);
destIndexes.push_back(-1);
int32_t srcIndex = 0;
int32_t destIndex = 0;
for (int32_t i = 0; i < expLength; ++i) {
    // For every non-empty source span, test its start index and,
    // if the span is longer than one unit, the first index inside it.
    if (expected[i].oldLength > 0) {
        srcIndexes.push_back(srcIndex);
        if (expected[i].oldLength > 1) {
            srcIndexes.push_back(srcIndex + 1);
        }
    }
    // Same for the destination side.
    // The inner check must be "> 1" like the source side: repeating "> 0" made it
    // always true, so destIndex + 1 was added even for one-unit spans,
    // where it is not inside the span at all.
    if (expected[i].newLength > 0) {
        destIndexes.push_back(destIndex);
        if (expected[i].newLength > 1) {
            destIndexes.push_back(destIndex + 1);
        }
    }
    srcIndex += expected[i].oldLength;
    destIndex += expected[i].newLength;
}
srcIndexes.push_back(srcLength);
destIndexes.push_back(destLength);
srcIndexes.push_back(srcLength + 1);
destIndexes.push_back(destLength + 1);
std::reverse(destIndexes.begin(), destIndexes.end());
for (int32_t i : srcIndexes) {
test.assertEquals(name + u" destIndexFromSrc(" + i + u"):" + __LINE__,
destIndexFromSrc(expected, expLength, srcLength, destLength, i),
ei2.destinationIndexFromSourceIndex(i, errorCode));
}
for (int32_t i : destIndexes) {
test.assertEquals(name + u" srcIndexFromDest(" + i + u"):" + __LINE__,
srcIndexFromDest(expected, expLength, srcLength, destLength, i),
ei2.sourceIndexFromDestinationIndex(i, errorCode));
}
}

View File

@ -37,6 +37,9 @@ public:
static UnicodeString hex(const uint8_t* bytes, int32_t len);
static UBool checkEqualEdits(IntlTest &test, const UnicodeString &name,
const Edits &e1, const Edits &e2, UErrorCode &errorCode);
static void checkEditsIter(
IntlTest &test, const UnicodeString &name,
Edits::Iterator ei1, Edits::Iterator ei2, // two equal iterators

View File

@ -23,6 +23,7 @@ void IntlTestDecimalFormatSymbols::runIndexedTest( int32_t index, UBool exec, co
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(testSymbols);
TESTCASE_AUTO(testLastResortData);
TESTCASE_AUTO(testNumberingSystem);
TESTCASE_AUTO_END;
}
@ -248,6 +249,49 @@ void IntlTestDecimalFormatSymbols::testLastResortData() {
Verify(1234567.25, "#,##0.##", *lastResort, "1,234,567.25");
}
void IntlTestDecimalFormatSymbols::testNumberingSystem() {
IcuTestErrorCode errorCode(*this, "testNumberingSystem");
struct testcase {
const char* locid;
const char* nsname;
const char16_t* expected1; // Expected number format string
const char16_t* expected2; // Expected pattern separator
};
static const testcase cases[9] = {
{"en", "latn", u"1,234.56", u";"},
{"en", "arab", u"١٬٢٣٤٫٥٦", u"؛"},
{"en", "mathsanb", u"𝟭,𝟮𝟯𝟰.𝟱𝟲", u";"},
{"en", "mymr", u"၁,၂၃၄.၅၆", u";"},
{"my", "latn", u"1,234.56", u";"},
{"my", "arab", u"١٬٢٣٤٫٥٦", u"؛"},
{"my", "mathsanb", u"𝟭,𝟮𝟯𝟰.𝟱𝟲", u";"},
{"my", "mymr", u"၁,၂၃၄.၅၆", u""},
{"en@numbers=thai", "mymr", u"၁,၂၃၄.၅၆", u";"}, // conflicting numbering system
};
for (int i=0; i<8; i++) {
testcase cas = cases[i];
Locale loc(cas.locid);
LocalPointer<NumberingSystem> ns(NumberingSystem::createInstanceByName(cas.nsname, errorCode));
if (errorCode.logDataIfFailureAndReset("NumberingSystem failed")) {
return;
}
UnicodeString expected1(cas.expected1);
UnicodeString expected2(cas.expected2);
DecimalFormatSymbols dfs(loc, *ns, errorCode);
if (errorCode.logDataIfFailureAndReset("DecimalFormatSymbols failed")) {
return;
}
Verify(1234.56, "#,##0.##", dfs, expected1);
// The pattern separator is something that differs by numbering system in my@numbers=mymr.
UnicodeString actual2 = dfs.getSymbol(DecimalFormatSymbols::kPatternSeparatorSymbol);
if (expected2 != actual2) {
errln((UnicodeString)"ERROR: DecimalFormatSymbols returned pattern separator " + actual2
+ " but we expected " + expected2);
}
}
}
void IntlTestDecimalFormatSymbols::Verify(double value, const UnicodeString& pattern,
const DecimalFormatSymbols &sym, const UnicodeString& expected){
UErrorCode status = U_ZERO_ERROR;

View File

@ -28,6 +28,7 @@ private:
*/
void testSymbols(/*char *par*/);
void testLastResortData();
void testNumberingSystem();
/** helper functions**/
void Verify(double value, const UnicodeString& pattern,

View File

@ -67,35 +67,38 @@ UnicodeSetTest::~UnicodeSetTest() {
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
const char* &name, char* /*par*/) {
// if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
switch (index) {
CASE(0,TestPatterns);
CASE(1,TestAddRemove);
CASE(2,TestCategories);
CASE(3,TestCloneEqualHash);
CASE(4,TestMinimalRep);
CASE(5,TestAPI);
CASE(6,TestScriptSet);
CASE(7,TestPropertySet);
CASE(8,TestClone);
CASE(9,TestExhaustive);
CASE(10,TestToPattern);
CASE(11,TestIndexOf);
CASE(12,TestStrings);
CASE(13,Testj2268);
CASE(14,TestCloseOver);
CASE(15,TestEscapePattern);
CASE(16,TestInvalidCodePoint);
CASE(17,TestSymbolTable);
CASE(18,TestSurrogate);
CASE(19,TestPosixClasses);
CASE(20,TestIteration);
CASE(21,TestFreezable);
CASE(22,TestSpan);
CASE(23,TestStringSpan);
CASE(24,TestUCAUnsafeBackwards);
default: name = ""; break;
if (exec) {
logln(u"TestSuite UnicodeSetTest");
}
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(TestPatterns);
TESTCASE_AUTO(TestAddRemove);
TESTCASE_AUTO(TestCategories);
TESTCASE_AUTO(TestCloneEqualHash);
TESTCASE_AUTO(TestMinimalRep);
TESTCASE_AUTO(TestAPI);
TESTCASE_AUTO(TestScriptSet);
TESTCASE_AUTO(TestPropertySet);
TESTCASE_AUTO(TestClone);
TESTCASE_AUTO(TestExhaustive);
TESTCASE_AUTO(TestToPattern);
TESTCASE_AUTO(TestIndexOf);
TESTCASE_AUTO(TestStrings);
TESTCASE_AUTO(Testj2268);
TESTCASE_AUTO(TestCloseOver);
TESTCASE_AUTO(TestEscapePattern);
TESTCASE_AUTO(TestInvalidCodePoint);
TESTCASE_AUTO(TestSymbolTable);
TESTCASE_AUTO(TestSurrogate);
TESTCASE_AUTO(TestPosixClasses);
TESTCASE_AUTO(TestIteration);
TESTCASE_AUTO(TestFreezable);
TESTCASE_AUTO(TestSpan);
TESTCASE_AUTO(TestStringSpan);
TESTCASE_AUTO(TestUCAUnsafeBackwards);
TESTCASE_AUTO(TestIntOverflow);
TESTCASE_AUTO(TestUnusedCcc);
TESTCASE_AUTO_END;
}
static const char NOT[] = "%%%%";
@ -3925,3 +3928,41 @@ void UnicodeSetTest::TestUCAUnsafeBackwards() {
}
#endif
}
void UnicodeSetTest::TestIntOverflow() {
// This test triggers undefined double->int conversion behavior
// if the implementation is not careful.
IcuTestErrorCode errorCode(*this, "TestIntOverflow");
UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
assertEquals("[:ccc=int_overflow:] -> illegal argument",
U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
}
void UnicodeSetTest::TestUnusedCcc() {
// All numeric ccc values 0..255 are valid, but many are unused.
IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
assertSuccess("[:ccc=2:]", errorCode);
assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
assertSuccess("[:ccc=255:]", errorCode);
assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
// Non-integer values and values outside 0..255 are invalid.
UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
assertEquals("[:ccc=-1:] -> illegal argument",
U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
assertEquals("[:ccc=256:] -> illegal argument",
U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
assertEquals("[:ccc=1.1:] -> illegal argument",
U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
}

View File

@ -91,6 +91,8 @@ private:
void TestStringSpan();
void TestUCAUnsafeBackwards();
void TestIntOverflow();
void TestUnusedCcc();
private:

View File

@ -116,7 +116,7 @@ LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14.
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]

View File

@ -5,23 +5,27 @@ License & terms of use: http://www.unicode.org/copyright.html#License
Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved.
This directory contains the break iterator reference rule files used by intltest rbbi/RBBIMonkeyTest/testMonkey.
The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpretted
The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpreted
to provide an expected set of boundary positions to compare with the results from ICU break iteration.
ICU4J also includes copies of the test reference rules, located in the directory
main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/
The copies should be kept synchronized; there should be no differences.
Each set of reference break rules lives in a separate file.
The list of rule files to run by default is hardcoded into the test code, in rbbimonkeytest.cpp.
The list of rule files to run by default is hard coded into the test code, in rbbimonkeytest.cpp.
Each test file includes
- The type of ICU break interator to create (word, line, sentence, etc.)
- The type of ICU break iterator to create (word, line, sentence, etc.)
- The locale to use
- Character Class definitions
- Rule definitions
To Do
- Syntax for tailoring.
- Extend the syntax to support rule tailoring.
Character Class Definition:
Character Class Definition:
name = set_regular_expression;
Rule Definition:
@ -35,7 +39,7 @@ set_regular_expression:
(They are mostly the same)
May include previously defined set names, which are logically expanded in-place.
rule_regular_expresson:
rule_regular_expression:
An ICU Regular Expression.
May include set names, which are logically expanded in-place.
May include a '÷', which defines a boundary position.
@ -52,7 +56,7 @@ Application of the rules:
return the position of the '÷' within the match.
else
position = last character of the rule match.
break from the rule loop, continue the outer loop.
break from the inner rule loop, continue the outer loop.
This differs from the Unicode UAX algorithm in that each position in the text is
not tested separately. Instead, when a rule match is found, rule application restarts with the last
@ -66,7 +70,7 @@ Application of the rules:
are with the Unicode UAX rules. With the main ICU break rules, all are applied in parallel.
Word Dictionaries
The monkey test does not test dictionary based breaking. The set named 'dicitionary' is special,
The monkey test does not test dictionary based breaking. The set named 'dictionary' is special,
as it is in the main ICU rules. For the monkey test, no characters from the dictionary set are
included in the randomly-generated test data.

View File

@ -39,7 +39,7 @@ EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
EBG = [\p{Word_Break = EBG}];
#define dicitionary, with the effect being that those characters don't appear in test data.
#define dictionary, with the effect being that those characters don't appear in test data.
Han = [:Han:];
Hiragana = [:Hiragana:];
@ -51,12 +51,7 @@ KanaKanji = [Han Hiragana Katakana];
dictionaryCJK = [KanaKanji HangulSyllable];
dictionary = [ComplexContext dictionaryCJK];
# leave CJK scripts out of ALetterPlus
# Tricky. Redfine a set.
# For tailorings, if it modifies itself, do at end of sets ????
# Tweak redefine to mean replace existing definition at its original location.
# Insert defs without redefine just after last pre-existing def of that name.
# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
# leave dictionary scripts out of ALetter
ALetter = [ALetter - dictionary];

View File

@ -38,7 +38,7 @@ EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
EBG = [\p{Word_Break = EBG}];
#define dicitionary, with the effect being that those characters don't appear in test data.
#define dictionary, with the effect being that those characters don't appear in test data.
Han = [:Han:];
Hiragana = [:Hiragana:];
@ -50,12 +50,7 @@ KanaKanji = [Han Hiragana Katakana];
dictionaryCJK = [KanaKanji HangulSyllable];
dictionary = [ComplexContext dictionaryCJK];
# leave CJK scripts out of ALetterPlus
# Tricky. Redfine a set.
# For tailorings, if it modifies itself, do at end of sets ????
# Tweak redefine to mean replace existing definition at its original location.
# Insert defs without redefine just after last pre-existing def of that name.
# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
# leave dictionary scripts out of ALetter
ALetter = [ALetter - dictionary];

View File

@ -358,15 +358,14 @@ minIntegerDigits maxIntegerDigits minFractionDigits maxFractionDigits output bre
0 0 1 0 2.99792458E8 KS
// JDK and S give .2998E9
0 0 0 4 2.998E8 KSQ
// S correctly formats this as 29.979246E7.
// JDK uses 8 + 6 for significant digits instead of 2 + 6
// J and C return 2.9979246E8.
// TODO: Merge trunk
2 8 1 6 29.979246E7 CJKQ
// Context: #13289
2 8 1 6 2.9979246E8 K
// Treat max int digits > 8 as being the same as min int digits.
// This behavior is not spelled out in the specification.
// JDK fails here because it tries to use 9 + 6 = 15 sig digits.
2 9 1 6 29.979246E7 K
// C and J get 29.979246E7
2 9 1 6 2.9979246E8 CJK
test significant digits scientific
set locale en

View File

@ -14,7 +14,9 @@
# <sent> any following data is for sentence break testing
# <line> any following data is for line break testing
# <char> any following data is for char break testing
# <locale local_name> Switch to the named locale at the next occurence of <word>, <sent>, etc.
# <rules> rules ... </rules> following data is tested against these rules.
# Applies until a following occurrence of <word>, <sent>, etc. or another <rules>
# <locale locale_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
# <data> ... </data> test data. May span multiple lines.
# <> Break position, status == 0
# • Break position, status == 0 (Bullet, \u2022)
@ -37,8 +39,17 @@
# Temp debugging tests
<locale en>
<word>
<data><0>1•2•3•4•</data>
# <data><0>ク<400>ライアン<400>トサーバー<400></data>
<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
。<0></data>
#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
## FILTERED BREAK TESTS
@ -1308,3 +1319,48 @@ Bangkok)•</data>
<data>•\U0001F468\u200D\u2695\uFE0F•\U0001F468\u200D\u2695•\U0001F468\U0001F3FD\u200D\u2695\uFE0F•\U0001F468\U0001F3FD\u200D\u2695\u0020•</data>
# woman astronaut, woman astronaut / fitz4
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
####################################################################################
#
# Test rule status values
#
####################################################################################
<rules> $Letters = [:L:];
$Numbers = [:N:];
$Letters+{1};
$Numbers+{2};
Help\ me\!{4};
[^$Letters $Numbers];
!.*;
</rules>
<data>•abc<1>123<2>.•.•abc<1> •Help<1> •me<1> •Help me!<4></data>
# Test option to prohibit unquoted literals.
<rules>
!!forward;
Hello\ World;
!!reverse;
.*;
</rules>
<data>•Hello World•</data>
<badrules>
!!quoted_literals_only;
!!forward;
Hello\ World;
!!reverse;
.*;
</badrules>
<rules>
#TODO: uncomment this line when quoted_literals_only is implemented.
#!!quoted_literals_only;
!!forward;
'Hello World';
!!reverse;
.*;
</rules>
<data>•Hello World•</data>

View File

@ -61,6 +61,7 @@ enum {
OUTPUT_FILENAME,
UNICODE_VERSION,
WRITE_C_SOURCE,
WRITE_COMBINED_DATA,
OPT_FAST
};
@ -73,6 +74,7 @@ static UOption options[]={
UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
};
@ -96,17 +98,22 @@ main(int argc, char* argv[]) {
if( argc<2 ||
options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
) {
/*
* Broken into chunks because the C89 standard says the minimum
* required supported string length is 509 bytes.
*/
fprintf(stderr,
"Usage: %s [-options] infiles+ -o outputfilename\n"
"\n"
"Reads the infiles with normalization data and\n"
"creates a binary or C source file (outputfilename) with the data.\n"
"creates a binary file, or a C source file (--csource), with the data,\n"
"or writes a data file with the combined data (--combined).\n"
"See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
"\n"
"Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
"\n"
"Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
"in input-file syntax to the outputfilename.\n"
"It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
"(Useful for computing minimal incremental mapping data files.)\n"
"\n",
argv[0]);
argv[0], argv[0]);
fprintf(stderr,
"Options:\n"
"\t-h or -? or --help this usage text\n"
@ -116,7 +123,9 @@ main(int argc, char* argv[]) {
fprintf(stderr,
"\t-s or --sourcedir source directory, followed by the path\n"
"\t-o or --output output filename\n"
"\t --csource writes a C source file with initializers\n");
"\t --csource writes a C source file with initializers\n"
"\t --combined writes a .txt file (input-file syntax) with the\n"
"\t combined data from all of the input files\n");
fprintf(stderr,
"\t --fast optimize the data for fast normalization,\n"
"\t which might increase its size (Writes fully decomposed\n"
@ -144,7 +153,10 @@ main(int argc, char* argv[]) {
#else
LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
LocalPointer<Normalizer2DataBuilder> b2;
LocalPointer<Normalizer2DataBuilder> diff;
Normalizer2DataBuilder *builder = b1.getAlias();
errorCode.assertSuccess();
if(options[UNICODE_VERSION].doesOccur) {
@ -166,8 +178,29 @@ main(int argc, char* argv[]) {
pathLength=filename.length();
}
bool doMinus = false;
for(int i=1; i<argc; ++i) {
printf("gennorm2: processing %s\n", argv[i]);
if(strcmp(argv[i], "minus") == 0) {
if(doMinus) {
fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
exit(U_ILLEGAL_ARGUMENT_ERROR);
}
// Data from previous input files has been collected in b1.
// Collect data from further input files in b2.
b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
errorCode.assertSuccess();
builder = b2.getAlias();
if(options[UNICODE_VERSION].doesOccur) {
builder->setUnicodeVersion(options[UNICODE_VERSION].value);
}
if(options[OPT_FAST].doesOccur) {
builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
}
doMinus = true;
continue;
}
filename.append(argv[i], errorCode);
LocalStdioFilePointer f(fopen(filename.data(), "r"));
if(f==NULL) {
@ -179,7 +212,12 @@ main(int argc, char* argv[]) {
filename.truncate(pathLength);
}
if(options[WRITE_C_SOURCE].doesOccur) {
if(doMinus) {
Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
} else if(options[WRITE_COMBINED_DATA].doesOccur) {
builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
} else if(options[WRITE_C_SOURCE].doesOccur) {
builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
} else {
builder->writeBinaryFile(options[OUTPUT_FILENAME].value);

View File

@ -30,7 +30,9 @@
#include "unicode/localpointer.h"
#include "unicode/putil.h"
#include "unicode/udata.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"
#include "charstr.h"
#include "extradata.h"
@ -146,6 +148,7 @@ void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
/** Sets the combining class of c and records c in the set of code points with ccc data. */
void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
    Norm *norm = norms.createNorm(c);
    norm->cc = cc;
    norms.ccSet.add(c);
}
static UBool isWellFormed(const UnicodeString &s) {
@ -166,6 +169,7 @@ void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m)
p->mapping=new UnicodeString(m);
p->mappingType=Norm::ONE_WAY;
p->setMappingCP();
norms.mappingSet.add(c);
}
void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
@ -195,12 +199,14 @@ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString
p->mapping=new UnicodeString(m);
p->mappingType=Norm::ROUND_TRIP;
p->mappingCP=U_SENTINEL;
norms.mappingSet.add(c);
}
/** Marks the mapping of c as removed and records c in the set of code points with mapping data. */
void Normalizer2DataBuilder::removeMapping(UChar32 c) {
    // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
    Norm *created = norms.createNorm(c);
    Norm *checked = checkNormForMapping(created, c);
    checked->mappingType = Norm::REMOVED;
    norms.mappingSet.add(c);
}
UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
@ -832,6 +838,198 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
fclose(f);
}
namespace {
/**
 * Null-tolerant string comparison.
 * Two null pointers are equal, a null and a non-null pointer are not,
 * and two non-null pointers compare by string contents.
 */
bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
    if(s1 == nullptr || s2 == nullptr) {
        // At least one is null: equal only if both are.
        return s1 == s2;
    }
    return *s1 == *s2;
}
// Separator characters written after a code point (range) in the data file.
// writeDataFile() indexes this string with a Norm::MappingType value.
const char *typeChars = "?-=>";
/**
 * Writes the code points of the mapping string as space-separated
 * uppercase hex numbers (at least 4 digits each), followed by a newline.
 * A null or empty mapping produces just the newline.
 */
void writeMapping(FILE *f, const UnicodeString *m) {
    if(m != nullptr) {
        const int32_t length = m->length();
        UChar32 c;
        // Advance by whole code points, not code units.
        for(int32_t i = 0; i < length; i += U16_LENGTH(c)) {
            c = m->char32At(i);
            if(i == 0) {
                fprintf(f, "%04lX", (long)c);
            } else {
                fprintf(f, " %04lX", (long)c);
            }
        }
    }
    fputs("\n", f);
}
} // namespace
/**
 * Writes the collected data, unprocessed, as a text file in the tool's input-file syntax.
 *
 * Output, in order:
 * - a "* Unicode <version>" line if any field of unicodeVersion is non-zero,
 * - one line per run of adjacent code points from norms.ccSet that share a non-zero cc,
 * - one line per run of adjacent code points from norms.mappingSet that share
 *   the same mapping type and mapping string.
 *
 * Exits the process with U_FILE_ACCESS_ERROR if the file cannot be created.
 *
 * @param filename     path of the output file
 * @param writeRemoved if true, every run whose type is not Norm::NONE is written;
 *                     if false, only runs whose type compares greater than Norm::REMOVED
 */
void
Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
// Do not processData() before writing the input-syntax data file.
FILE *f = fopen(filename, "w");
if(f == nullptr) {
fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
filename);
exit(U_FILE_ACCESS_ERROR);
return;
}
// Header line, only when a Unicode version has been set.
if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
char uv[U_MAX_VERSION_STRING_LENGTH];
u_versionToString(unicodeVersion, uv);
fprintf(f, "* Unicode %s\n\n", uv);
}
// Part 1: combining classes, run-length encoded into start..end ranges.
UnicodeSetIterator ccIter(norms.ccSet);
// [start, end] is the pending run; prevCC is its cc value.
UChar32 start = U_SENTINEL;
UChar32 end = U_SENTINEL;
uint8_t prevCC = 0;
bool done = false;
bool didWrite = false;
do {
UChar32 c;
uint8_t cc;
if(ccIter.next() && !ccIter.isString()) {
c = ccIter.getCodepoint();
cc = norms.getCC(c);
} else {
// End of the set: use a value past the last code point with cc=0
// so that the pending run is flushed below.
c = 0x110000;
cc = 0;
done = true;
}
if(cc == prevCC && c == (end + 1)) {
// Same value and adjacent: extend the pending run.
end = c;
} else {
// Flush the pending run (runs with cc==0 are not written), then start a new one.
if(prevCC != 0) {
if(start == end) {
fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
} else {
fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
}
didWrite = true;
}
start = end = c;
prevCC = cc;
}
} while(!done);
// Blank line between the cc section and the mappings, only if the cc section is non-empty.
if(didWrite) {
fputs("\n", f);
}
// Part 2: mappings, run-length encoded the same way.
UnicodeSetIterator mIter(norms.mappingSet);
start = U_SENTINEL;
end = U_SENTINEL;
const UnicodeString *prevMapping = nullptr;
Norm::MappingType prevType = Norm::NONE;
done = false;
do {
UChar32 c;
const Norm *norm;
if(mIter.next() && !mIter.isString()) {
c = mIter.getCodepoint();
norm = norms.getNorm(c);
} else {
// End of the set: sentinel entry that flushes the pending run below.
c = 0x110000;
norm = nullptr;
done = true;
}
const UnicodeString *mapping;
Norm::MappingType type;
if(norm == nullptr) {
mapping = nullptr;
type = Norm::NONE;
} else {
type = norm->mappingType;
if(type == Norm::NONE) {
mapping = nullptr;
} else {
mapping = norm->mapping;
}
}
if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
// Same type, same mapping text and adjacent: extend the pending run.
end = c;
} else {
// Flush the pending run if its type is selected by writeRemoved, then start a new one.
if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
if(start == end) {
fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
} else {
fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
}
writeMapping(f, prevMapping);
}
start = end = c;
prevMapping = mapping;
prevType = type;
}
} while(!done);
fclose(f);
}
void
Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
const Normalizer2DataBuilder &b2,
Normalizer2DataBuilder &diff) {
// Compute diff = b1 - b2
// so that we should be able to get b1 = b2 + diff.
if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
}
UnicodeSet ccSet(b1.norms.ccSet);
ccSet.addAll(b2.norms.ccSet);
UnicodeSetIterator ccIter(ccSet);
while(ccIter.next() && !ccIter.isString()) {
UChar32 c = ccIter.getCodepoint();
uint8_t cc1 = b1.norms.getCC(c);
uint8_t cc2 = b2.norms.getCC(c);
if(cc1 != cc2) {
diff.setCC(c, cc1);
}
}
UnicodeSet mSet(b1.norms.mappingSet);
mSet.addAll(b2.norms.mappingSet);
UnicodeSetIterator mIter(mSet);
while(mIter.next() && !mIter.isString()) {
UChar32 c = mIter.getCodepoint();
const Norm *norm1 = b1.norms.getNorm(c);
const Norm *norm2 = b2.norms.getNorm(c);
const UnicodeString *mapping1;
Norm::MappingType type1;
if(norm1 == nullptr || !norm1->hasMapping()) {
mapping1 = nullptr;
type1 = Norm::NONE;
} else {
mapping1 = norm1->mapping;
type1 = norm1->mappingType;
}
const UnicodeString *mapping2;
Norm::MappingType type2;
if(norm2 == nullptr || !norm2->hasMapping()) {
mapping2 = nullptr;
type2 = Norm::NONE;
} else {
mapping2 = norm2->mapping;
type2 = norm2->mappingType;
}
if(type1 == type2 && equalStrings(mapping1, mapping2)) {
// Nothing to do.
} else if(type1 == Norm::NONE) {
diff.removeMapping(c);
} else if(type1 == Norm::ROUND_TRIP) {
diff.setRoundTripMapping(c, *mapping1);
} else if(type1 == Norm::ONE_WAY) {
diff.setOneWayMapping(c, *mapping1);
}
}
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View File

@ -63,6 +63,11 @@ public:
void writeBinaryFile(const char *filename);
void writeCSourceFile(const char *filename);
// Writes the data in the tool's input-file syntax to filename.
// writeRemoved selects whether removed mappings are written as well.
void writeDataFile(const char *filename, bool writeRemoved) const;
// Computes diff = b1 - b2, so that b1 should be recoverable as b2 + diff.
static void computeDiff(const Normalizer2DataBuilder &b1,
const Normalizer2DataBuilder &b2,
Normalizer2DataBuilder &diff);
private:
friend class Norm16Writer;

View File

@ -15,6 +15,7 @@
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/errorcode.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/utf16.h"
#include "normalizer2impl.h"
@ -183,6 +184,8 @@ public:
void enumRanges(Enumerator &e);
UnicodeSet ccSet, mappingSet;
private:
Norms(const Norms &other) = delete;
Norms &operator=(const Norms &other) = delete;

View File

@ -33,6 +33,7 @@
#include "uhash.h"
#include "uresimp.h"
#include "unicode/ustring.h"
#include "unicode/utf8.h"
void res_write_java(struct SResource *res,UErrorCode *status);
@ -244,7 +245,8 @@ str_write_java(const UChar *src, int32_t srcLen, UBool printEndLine, UErrorCode
memset(buf,0,length);
bufLen = uCharsToChars(buf,length,src,srcLen,status);
// buflen accounts for extra bytes added due to multi byte encoding of
// non ASCII characters
if(printEndLine)
write_tabs(out);
@ -284,10 +286,22 @@ str_write_java(const UChar *src, int32_t srcLen, UBool printEndLine, UErrorCode
}
}
T_FileStream_write(out,"\"",1);
uint32_t byteIndex = 0;
uint32_t trailBytes = 0;
if(len+add<bufLen){
// check the trail bytes to be added to the output line
while (byteIndex < add) {
if (U8_IS_LEAD(*(current + byteIndex))) {
trailBytes = U8_COUNT_TRAIL_BYTES(*(current + byteIndex));
add += trailBytes;
}
byteIndex++;
}
T_FileStream_write(out,current,add);
T_FileStream_write(out,"\" +\n",4);
write_tabs(out);
if (len + add < bufLen) {
T_FileStream_write(out,"\" +\n",4);
write_tabs(out);
}
}else{
T_FileStream_write(out,current,bufLen-len);
}
@ -437,9 +451,7 @@ bytes_write_java(const BinaryResource *res, UErrorCode * /*status*/) {
char byteBuffer[100] = { 0 };
uint8_t* byteArray = NULL;
int byteIterator = 0;
int32_t srcLen=res->fLength;
if(srcLen>0 )
{
byteArray = res->fData;

View File

@ -149,6 +149,11 @@ public class ScientificFormat extends Format.BeforeFormat implements Rounder.Mul
// (see #13118). Note that the bound 8 on integer digits is historic.
int _maxInt = properties.getMaximumIntegerDigits();
int _minInt = properties.getMinimumIntegerDigits();
// Bug #13289: if maxInt > minInt > 1, then minInt should be 1 for the
// purposes of engineering notation.
if (_maxInt > _minInt && _minInt > 1) {
_minInt = 1;
}
minInt = _minInt < 0 ? 0 : _minInt >= 8 ? 1 : _minInt;
maxInt = _maxInt < _minInt ? _minInt : _maxInt >= 8 ? _minInt : _maxInt;
assert 0 <= minInt && minInt <= maxInt && maxInt < 8;

View File

@ -53,7 +53,7 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
* @stable ICU 2.0
*/
public DecimalFormatSymbols() {
initialize(ULocale.getDefault(Category.FORMAT));
this(ULocale.getDefault(Category.FORMAT));
}
/**
@ -62,7 +62,7 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
* @stable ICU 2.0
*/
public DecimalFormatSymbols(Locale locale) {
initialize(ULocale.forLocale(locale));
this(ULocale.forLocale(locale));
}
/**
@ -71,7 +71,15 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
* @stable ICU 3.2
*/
public DecimalFormatSymbols(ULocale locale) {
initialize(locale);
initialize(locale, null);
}
// Converts the JDK Locale to a ULocale and delegates to the
// (ULocale, NumberingSystem) constructor.
private DecimalFormatSymbols(Locale locale, NumberingSystem ns) {
this(ULocale.forLocale(locale), ns);
}
// Initializes the symbols for the locale and numbering system.
// ns may be null; initialize() then uses the locale by itself.
private DecimalFormatSymbols(ULocale locale, NumberingSystem ns) {
initialize(locale, ns);
}
/**
@ -123,6 +131,46 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
return new DecimalFormatSymbols(locale);
}
/**
* {@icu} Returns a DecimalFormatSymbols instance for the given locale with digits and symbols
* corresponding to the given {@link NumberingSystem}.
*
* <p>This method behaves equivalently to {@link #getInstance} called with a locale having a
* "numbers=xxxx" keyword specifying the numbering system by name.
*
* <p>In this method, the NumberingSystem argument will be used even if the locale has its own
* "numbers=xxxx" keyword.
*
* <p>This is the JDK {@link java.util.Locale} overload; the locale is converted
* to a {@link ULocale} before the symbols are loaded.
*
* @param locale the locale.
* @param ns the numbering system whose digits and symbols are used.
* @return A DecimalFormatSymbols instance.
* @provisional This API might change or be removed in a future release.
* @draft ICU 60
*/
public static DecimalFormatSymbols forNumberingSystem(Locale locale, NumberingSystem ns) {
return new DecimalFormatSymbols(locale, ns);
}
/**
* {@icu} Returns a DecimalFormatSymbols instance for the given locale with digits and symbols
* corresponding to the given {@link NumberingSystem}.
*
* <p>This method behaves equivalently to {@link #getInstance} called with a locale having a
* "numbers=xxxx" keyword specifying the numbering system by name.
*
* <p>In this method, the NumberingSystem argument will be used even if the locale has its own
* "numbers=xxxx" keyword.
*
* @param locale the locale.
* @param ns the numbering system whose digits and symbols are used;
*     it takes precedence over a "numbers" keyword in the locale.
* @return A DecimalFormatSymbols instance.
* @provisional This API might change or be removed in a future release.
* @draft ICU 60
*/
public static DecimalFormatSymbols forNumberingSystem(ULocale locale, NumberingSystem ns) {
return new DecimalFormatSymbols(locale, ns);
}
/**
* Returns an array of all locales for which the <code>getInstance</code> methods of
* this class can return localized instances.
@ -1336,10 +1384,16 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
/**
* Initializes the symbols from the locale data.
*/
private void initialize( ULocale locale ) {
private void initialize(ULocale locale, NumberingSystem ns) {
this.requestedLocale = locale.toLocale();
this.ulocale = locale;
CacheData data = cachedLocaleData.getInstance(locale, null /* unused */);
// TODO: The cache requires a single key, so we just save the NumberingSystem into the
// locale string. NumberingSystem is then decoded again in the loadData() method. It would
// be more efficient if we didn't have to serialize and deserialize the NumberingSystem.
ULocale keyLocale = (ns == null) ? locale : locale.setKeywordValue("numbers", ns.getName());
CacheData data = cachedLocaleData.getInstance(keyLocale, null /* unused */);
setLocale(data.validLocale, data.validLocale);
setDigitStrings(data.digits);
String[] numberElements = data.numberElements;

View File

@ -409,12 +409,7 @@ public final class Edits {
spanStart = destIndex;
spanLength = newLength_;
}
// If we are at the start or limit of an empty span, then we search from
// the start of the string so that we always return
// the first of several consecutive empty spans, for consistent results.
// We do not currently track the properties of the previous span,
// so for now we always reset if we are at the start of the current span.
if (i <= spanStart) {
if (i < spanStart) {
// Reset the iterator to the start.
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
} else if (i < (spanStart + spanLength)) {
@ -429,8 +424,8 @@ public final class Edits {
spanStart = destIndex;
spanLength = newLength_;
}
if (i == spanStart || i < (spanStart + spanLength)) {
// The index is in the current span, or at an empty one.
if (i < (spanStart + spanLength)) {
// The index is in the current span.
return 0;
}
if (remaining > 0) {
@ -615,4 +610,167 @@ public final class Edits {
public Iterator getFineIterator() {
return new Iterator(array, length, false, false);
}
/**
* Merges the two input Edits and appends the result to this object.
*
* <p>Consider two string transformations (for example, normalization and case mapping)
* where each records Edits in addition to writing an output string.<br>
* Edits ab reflect how substrings of input string a
* map to substrings of intermediate string b.<br>
* Edits bc reflect how substrings of intermediate string b
* map to substrings of output string c.<br>
* This function merges ab and bc such that the additional edits
* recorded in this object reflect how substrings of input string a
* map to substrings of output string c.
*
* <p>If unrelated Edits are passed in where the output string of the first
* has a different length than the input string of the second,
* then an IllegalArgumentException is thrown.
*
* @param ab reflects how substrings of input string a
* map to substrings of intermediate string b.
* @param bc reflects how substrings of intermediate string b
* map to substrings of output string c.
* @return this, with the merged edits appended
* @throws IllegalArgumentException if the output length of ab
* differs from the input length of bc.
* @draft ICU 60
* @provisional This API might change or be removed in a future release.
*/
public Edits mergeAndAppend(Edits ab, Edits bc) {
// Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
// Parallel iteration over both Edits.
Iterator abIter = ab.getFineIterator();
Iterator bcIter = bc.getFineIterator();
boolean abHasNext = true, bcHasNext = true;
// Copy iterator state into local variables, so that we can modify and subdivide spans.
// ab old & new length, bc old & new length
int aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
// When we have different-intermediate-length changes, we accumulate a larger change.
int pending_aLength = 0, pending_cLength = 0;
for (;;) {
// At this point, for each of the two iterators:
// Either we are done with the locally cached current edit,
// and its intermediate-string length has been reset,
// or we will continue to work with a truncated remainder of this edit.
//
// If the current edit is done, and the iterator has not yet reached the end,
// then we fetch the next edit. This is true for at least one of the iterators.
//
// Normally it does not matter whether we fetch from ab and then bc or vice versa.
// However, the result is observably different when
// ab deletions meet bc insertions at the same intermediate-string index.
// Some users expect the bc insertions to come first, so we fetch from bc first.
if (bc_bLength == 0) {
if (bcHasNext && (bcHasNext = bcIter.next())) {
bc_bLength = bcIter.oldLength();
cLength = bcIter.newLength();
if (bc_bLength == 0) {
// insertion
// Emit it right away unless an ab change is still partly unconsumed;
// in that case fold it into the pending a->c change.
if (ab_bLength == 0 || !abIter.hasChange()) {
addReplace(pending_aLength, pending_cLength + cLength);
pending_aLength = pending_cLength = 0;
} else {
pending_cLength += cLength;
}
continue;
}
}
// else see if the other iterator is done, too.
}
if (ab_bLength == 0) {
if (abHasNext && (abHasNext = abIter.next())) {
aLength = abIter.oldLength();
ab_bLength = abIter.newLength();
if (ab_bLength == 0) {
// deletion
// Emit it right away if the current bc edit is still whole
// (bc_bLength still equals its original length) or is not a change;
// otherwise fold it into the pending a->c change.
if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
addReplace(pending_aLength + aLength, pending_cLength);
pending_aLength = pending_cLength = 0;
} else {
pending_aLength += aLength;
}
continue;
}
} else if (bc_bLength == 0) {
// Both iterators are done at the same time:
// The intermediate-string lengths match.
break;
} else {
throw new IllegalArgumentException(
"The ab output string is shorter than the bc input string.");
}
}
if (bc_bLength == 0) {
throw new IllegalArgumentException(
"The bc input string is shorter than the ab output string.");
}
// Done fetching: ab_bLength > 0 && bc_bLength > 0
// The current state has two parts:
// - Past: We accumulate a longer ac edit in the "pending" variables.
// - Current: We have copies of the current ab/bc edits in local variables.
// At least one side is newly fetched.
// One side might be a truncated remainder of an edit we fetched earlier.
if (!abIter.hasChange() && !bcIter.hasChange()) {
// An unchanged span all the way from string a to string c.
if (pending_aLength != 0 || pending_cLength != 0) {
addReplace(pending_aLength, pending_cLength);
pending_aLength = pending_cLength = 0;
}
int unchangedLength = aLength <= cLength ? aLength : cLength;
addUnchanged(unchangedLength);
ab_bLength = aLength -= unchangedLength;
bc_bLength = cLength -= unchangedLength;
// At least one of the unchanged spans is now empty.
continue;
}
if (!abIter.hasChange() && bcIter.hasChange()) {
// Unchanged a->b but changed b->c.
if (ab_bLength >= bc_bLength) {
// Split the longer unchanged span into change + remainder.
addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
pending_aLength = pending_cLength = 0;
aLength = ab_bLength -= bc_bLength;
bc_bLength = 0;
continue;
}
// Handle the shorter unchanged span below like a change.
} else if (abIter.hasChange() && !bcIter.hasChange()) {
// Changed a->b and then unchanged b->c.
if (ab_bLength <= bc_bLength) {
// Split the longer unchanged span into change + remainder.
addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
pending_aLength = pending_cLength = 0;
cLength = bc_bLength -= ab_bLength;
ab_bLength = 0;
continue;
}
// Handle the shorter unchanged span below like a change.
} else { // both abIter.hasChange() && bcIter.hasChange()
if (ab_bLength == bc_bLength) {
// Changes on both sides up to the same position. Emit & reset.
addReplace(pending_aLength + aLength, pending_cLength + cLength);
pending_aLength = pending_cLength = 0;
ab_bLength = bc_bLength = 0;
continue;
}
}
// Accumulate the a->c change, reset the shorter side,
// keep a remainder of the longer one.
pending_aLength += aLength;
pending_cLength += cLength;
if (ab_bLength < bc_bLength) {
bc_bLength -= ab_bLength;
cLength = ab_bLength = 0;
} else { // ab_bLength > bc_bLength
ab_bLength -= bc_bLength;
aLength = bc_bLength = 0;
}
}
// Flush a change that is still pending after both iterators are exhausted.
if (pending_aLength != 0 || pending_cLength != 0) {
addReplace(pending_aLength, pending_cLength);
}
return this;
}
}

View File

@ -1952,7 +1952,7 @@ public class RuleBasedNumberFormat extends NumberFormat {
// position of 0 and the number being formatted) to the rule set
// for formatting
StringBuilder result = new StringBuilder();
if (getRoundingMode() != BigDecimal.ROUND_UNNECESSARY) {
if (getRoundingMode() != BigDecimal.ROUND_UNNECESSARY && !Double.isNaN(number) && !Double.isInfinite(number)) {
// We convert to a string because BigDecimal insists on excessive precision.
number = new BigDecimal(Double.toString(number)).setScale(getMaximumFractionDigits(), roundingMode).doubleValue();
}

View File

@ -3443,7 +3443,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
checkFrozen();
int p;
int v;
boolean mustNotBeEmpty = false, invert = false;
boolean invert = false;
if (symbols != null
&& (symbols instanceof XSymbolTable)
@ -3476,10 +3476,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias));
// If the resultant set is empty then the numeric value
// was invalid.
//mustNotBeEmpty = true;
// old code was wrong; anything between 0 and 255 is valid even if unused.
// Anything between 0 and 255 is valid even if unused.
if (v < 0 || v > 255) throw e;
} else {
throw e;
@ -3580,12 +3577,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
complement();
}
if (mustNotBeEmpty && isEmpty()) {
// mustNotBeEmpty is set to true if an empty set indicates
// invalid input.
throw new IllegalArgumentException("Invalid property value");
}
return this;
}

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f0d65ed59329e1eaae1813db0fa8e1236a3b58ddfa5e7e1ff33d4bea7eef3c31
size 12226292
oid sha256:193787da8cd2caebf1901892beccad07f8e7f3c714ef482681784bc583be5c60
size 12226288

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79b0c13215014e21a95869ccbac72d191485436cae6f26a2f96622a4268c1a82
oid sha256:ca79a3355cea5666551889ce8ff3703987162a35937a292d80284519f2b68286
size 92486

View File

@ -358,15 +358,14 @@ minIntegerDigits maxIntegerDigits minFractionDigits maxFractionDigits output bre
0 0 1 0 2.99792458E8 KS
// JDK and S give .2998E9
0 0 0 4 2.998E8 KSQ
// S correctly formats this as 29.979246E7.
// JDK uses 8 + 6 for significant digits instead of 2 + 6
// J and C return 2.9979246E8.
// TODO: Merge trunk
2 8 1 6 29.979246E7 CJKQ
// According to the spec, if maxInt>minInt and minInt>1, then set
// minInt to 1 for the purposes of engineering notation.
// Context: #13289
2 8 1 6 2.9979246E8 K
// Treat max int digits > 8 as being the same as min int digits.
// This behavior is not spelled out in the specification.
// JDK fails here because it tries to use 9 + 6 = 15 sig digits.
2 9 1 6 29.979246E7 K
// C and J get 29.979246E7
2 9 1 6 2.9979246E8 CJK
test significant digits scientific
set locale en

View File

@ -26,6 +26,7 @@ import org.junit.Test;
import com.ibm.icu.text.DecimalFormat;
import com.ibm.icu.text.DecimalFormatSymbols;
import com.ibm.icu.text.NumberingSystem;
import com.ibm.icu.util.Currency;
import com.ibm.icu.util.ULocale;
@ -323,4 +324,40 @@ public class IntlTestDecimalFormatSymbols extends com.ibm.icu.dev.test.TestFmwk
errln("ERROR: Code point zero be ASCII 0");
}
}
@Test
public void testNumberingSystem() {
    // Each row: locale ID, numbering system name,
    // expected formatting of 1234.56, expected pattern separator.
    Object[][] cases = {
            {"en", "latn", "1,234.56", ';'},
            {"en", "arab", "١٬٢٣٤٫٥٦", '؛'},
            {"en", "mathsanb", "𝟭,𝟮𝟯𝟰.𝟱𝟲", ';'},
            {"en", "mymr", "၁,၂၃၄.၅၆", ';'},
            {"my", "latn", "1,234.56", ';'},
            {"my", "arab", "١٬٢٣٤٫٥٦", '؛'},
            {"my", "mathsanb", "𝟭,𝟮𝟯𝟰.𝟱𝟲", ';'},
            {"my", "mymr", "၁,၂၃၄.၅၆", '၊'},
            {"en@numbers=thai", "mymr", "၁,၂၃၄.၅၆", ';'}, // conflicting numbering system
    };

    for (int row = 0; row < cases.length; row++) {
        Object[] testCase = cases[row];
        ULocale loc = new ULocale((String) testCase[0]);
        NumberingSystem ns = NumberingSystem.getInstanceByName((String) testCase[1]);
        String expectedFormatted = (String) testCase[2];
        char expectedSeparator = (Character) testCase[3];
        String label = loc + " and " + ns.getName();

        DecimalFormatSymbols dfs = DecimalFormatSymbols.forNumberingSystem(loc, ns);
        DecimalFormat df = new DecimalFormat("#,##0.##", dfs);
        String formatted = df.format(1234.56);
        assertEquals("1234.56 with " + label, expectedFormatted, formatted);

        // In my@numbers=mymr the pattern separator depends on the numbering system.
        char separator = dfs.getPatternSeparator();
        assertEquals("Pattern separator with " + label, expectedSeparator, separator);

        // Also exercise the overload that takes a JDK Locale.
        DecimalFormatSymbols fromJdkLocale = DecimalFormatSymbols.forNumberingSystem(loc.toLocale(), ns);
        assertEquals("JDK Locale and ICU Locale should produce the same object", dfs, fromJdkLocale);
    }
}
}

View File

@ -832,6 +832,9 @@ public class NumberFormatDataDrivenTest {
@Test
@Ignore
public void TestDataDrivenJDK() {
    // On Android, java.text.DecimalFormat is implemented with ICU4J (ticket #13322),
    // so the JDK data-driven suite is not run there.
    boolean onAndroid = TestUtil.getJavaVendor() == TestUtil.JavaVendor.Android;
    if (!onAndroid) {
        DataDrivenNumberFormatTestUtility.runFormatSuiteIncludingKnownFailures(
                "numberformattestspecification.txt", JDK);
    }
}

View File

@ -5236,6 +5236,13 @@ public class NumberFormatTest extends TestFmwk {
assertEquals("Should parse to 300000 using non-monetary separators: " + ppos, 300000L, number);
}
@Test
public void Test13289() {
    // Ticket #13289: scientific pattern whose integer part allows more digits than it requires.
    String formatted = new DecimalFormat("#00.0#E0").format(0.00123);
    assertEquals("Should ignore scientific minInt if maxInt>minInt", "1.23E-3", formatted);
}
@Test
public void testPercentZero() {
DecimalFormat df = (DecimalFormat) NumberFormat.getPercentInstance();

View File

@ -1705,4 +1705,21 @@ public class RbnfTest extends TestFmwk {
};
doTest(rbnf, enTestFullData, false);
}
// Reports an error through errln() when result differs from expected.
private void assertEquals(String expected, String result) {
    if (expected.equals(result)) {
        return;
    }
    errln("Expected: " + expected + " Got: " + result);
}
@Test
public void testRoundingUnrealNumbers() {
    RuleBasedNumberFormat spellout = new RuleBasedNumberFormat(ULocale.US, RuleBasedNumberFormat.SPELLOUT);
    spellout.setRoundingMode(BigDecimal.ROUND_HALF_UP);
    spellout.setMaximumFractionDigits(3);

    // Two ordinary values, followed by infinity and NaN, with rounding enabled.
    double[] inputs = { 0.1, 0.0005, Double.POSITIVE_INFINITY, Double.NaN };
    String[] expected = { "zero point one", "zero point zero zero one", "infinity", "not a number" };
    for (int i = 0; i < inputs.length; i++) {
        assertEquals(expected[i], spellout.format(inputs[i]));
    }
}
}

View File

@ -0,0 +1,376 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.dev.test.lang;
import java.util.Arrays;
import java.util.Collection;
import java.util.Locale;
import org.junit.Test;
import org.junit.experimental.runners.Enclosed;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.ULocale;
@RunWith(Enclosed.class)
public class DataDrivenUScriptTest extends TestFmwk {
// Renders script codes as their short names separated by single spaces;
// a null array is rendered as "null".
private static String scriptsToString(int[] scripts) {
    if (scripts == null) {
        return "null";
    }
    StringBuilder names = new StringBuilder();
    for (int i = 0; i < scripts.length; i++) {
        // Separator only once something has been written.
        if (names.length() > 0) {
            names.append(' ');
        }
        names.append(UScript.getShortName(scripts[i]));
    }
    return names.toString();
}
// Compares two script-code arrays by their short-name renderings,
// so that a failure message shows readable script names.
private static void assertEqualScripts(String msg, int[] expectedScripts, int[] actualScripts) {
    String expectedNames = scriptsToString(expectedScripts);
    String actualNames = scriptsToString(actualScripts);
    assertEquals(msg, expectedNames, actualNames);
}
/**
 * Parameterized test of UScript.getCode(ULocale): each parameter row is a locale
 * and the script code expected as the first element of the result.
 */
@RunWith(Parameterized.class)
public static class LocaleGetCodeTest {
    private ULocale testLocaleName;
    private int expected;

    public LocaleGetCodeTest(ULocale testLocaleName, int expected) {
        this.testLocaleName = testLocaleName;
        this.expected = expected;
    }

    @Parameterized.Parameters
    public static Collection testData() {
        return Arrays.asList(new Object[][] { { new ULocale("en"), UScript.LATIN },
                { new ULocale("en_US"), UScript.LATIN },
                { new ULocale("sr"), UScript.CYRILLIC },
                { new ULocale("ta"), UScript.TAMIL },
                { new ULocale("te_IN"), UScript.TELUGU },
                { new ULocale("hi"), UScript.DEVANAGARI },
                { new ULocale("he"), UScript.HEBREW },
                { new ULocale("ar"), UScript.ARABIC },
                { new ULocale("abcde"), UScript.INVALID_CODE },
                { new ULocale("abcde_cdef"), UScript.INVALID_CODE },
                { new ULocale("iw"), UScript.HEBREW }
        });
    }

    @Test
    public void TestLocaleGetCode() {
        int[] code = UScript.getCode(testLocaleName);
        if (code == null) {
            // A null result is acceptable only when no script is expected.
            if (expected != UScript.INVALID_CODE) {
                errln("Error testing UScript.getCode(). Got: null" + " Expected: " + expected + " for locale "
                        + testLocaleName);
            }
        } else if ((code[0] != expected)) {
            errln("Error testing UScript.getCode(). Got: " + code[0] + " Expected: " + expected + " for locale "
                    + testLocaleName);
        }

        // Check Esperanto while it is the default locale.
        // The default locale is process-wide state, so restore it in a finally block:
        // a failure inside must not leave eo_DE as the default for later tests.
        ULocale defaultLoc = ULocale.getDefault();
        ULocale esperanto = new ULocale("eo_DE");
        ULocale.setDefault(esperanto);
        try {
            code = UScript.getCode(esperanto);
            if (code != null) {
                if (code[0] != UScript.LATIN) {
                    errln("Did not get the expected script code for Esperanto");
                }
            } else {
                warnln("Could not load the locale data.");
            }
        } finally {
            ULocale.setDefault(defaultLoc);
        }

        // Should work regardless of whether we have locale data for the language.
        assertEqualScripts("tg script: Cyrl", // Tajik
                new int[] { UScript.CYRILLIC }, UScript.getCode(new ULocale("tg")));
        assertEqualScripts("xsr script: Deva", // Sherpa
                new int[] { UScript.DEVANAGARI }, UScript.getCode(new ULocale("xsr")));

        // Multi-script languages.
        assertEqualScripts("ja scripts: Kana Hira Hani",
                new int[] { UScript.KATAKANA, UScript.HIRAGANA, UScript.HAN }, UScript.getCode(ULocale.JAPANESE));
        assertEqualScripts("ko scripts: Hang Hani", new int[] { UScript.HANGUL, UScript.HAN },
                UScript.getCode(ULocale.KOREAN));
        assertEqualScripts("zh script: Hani", new int[] { UScript.HAN }, UScript.getCode(ULocale.CHINESE));
        assertEqualScripts("zh-Hant scripts: Hani Bopo", new int[] { UScript.HAN, UScript.BOPOMOFO },
                UScript.getCode(ULocale.TRADITIONAL_CHINESE));
        assertEqualScripts("zh-TW scripts: Hani Bopo", new int[] { UScript.HAN, UScript.BOPOMOFO },
                UScript.getCode(ULocale.TAIWAN));

        // Ambiguous API, but this probably wants to return Latin rather than Rongorongo (Roro).
        // Note: this call passes a String, not a ULocale.
        assertEqualScripts("ro-RO script: Latn", new int[] { UScript.LATIN }, UScript.getCode("ro-RO"));
    }
}
@RunWith(Parameterized.class)
public static class TestMultipleUScript extends TestFmwk {
private String testLocaleName;
private Locale testLocale;
private int[] expected;
public TestMultipleUScript(String testLocaleName, int[] expected, Locale testLocale) {
this.testLocaleName = testLocaleName;
this.testLocale = testLocale;
this.expected = expected;
}
@Parameterized.Parameters
public static Collection testData() {
return Arrays.asList(new Object[][] {
{ "ja", new int[] { UScript.KATAKANA, UScript.HIRAGANA, UScript.HAN }, Locale.JAPANESE },
{ "ko_KR", new int[] { UScript.HANGUL, UScript.HAN }, Locale.KOREA },
{ "zh", new int[] { UScript.HAN }, Locale.CHINESE },
{ "zh_TW", new int[] { UScript.HAN, UScript.BOPOMOFO }, Locale.TAIWAN }
});
}
@Test
public void TestMultipleCodes() {
int[] code = UScript.getCode(testLocaleName);
if (code != null) {
for (int j = 0; j < code.length; j++) {
if (code[j] != expected[j]) {
errln("Error testing UScript.getCode(). Got: " + code[j] + " Expected: " + expected[j]
+ " for locale " + testLocaleName);
}
}
} else {
errln("Error testing UScript.getCode() for locale " + testLocaleName);
}
logln(" Testing UScript.getCode(Locale) with locale: " + testLocale.getDisplayName());
code = UScript.getCode(testLocale);
if (code != null) {
for (int j = 0; j < code.length; j++) {
if (code[j] != expected[j]) {
errln("Error testing UScript.getCode(). Got: " + code[j] + " Expected: " + expected[j]
+ " for locale " + testLocaleName);
}
}
} else {
errln("Error testing UScript.getCode() for locale " + testLocaleName);
}
}
}
/**
 * Parameterized test of UScript.getCode(String): each parameter row is a locale ID,
 * script abbreviation or script name, and the script code expected as the
 * first element of the result.
 */
@RunWith(Parameterized.class)
public static class GetCodeTest extends TestFmwk {
    private String testName;
    private int expected;

    public GetCodeTest(String testName, int expected) {
        this.testName = testName;
        this.expected = expected;
    }

    @Parameterized.Parameters
    public static Collection testData() {
        return Arrays.asList(new Object[][] {
                /* test locale */
                { "en", UScript.LATIN },
                { "en_US", UScript.LATIN },
                { "sr", UScript.CYRILLIC },
                { "ta", UScript.TAMIL },
                { "gu", UScript.GUJARATI },
                { "te_IN", UScript.TELUGU },
                { "hi", UScript.DEVANAGARI },
                { "he", UScript.HEBREW },
                { "ar", UScript.ARABIC },
                { "abcde", UScript.INVALID_CODE },
                { "abscde_cdef", UScript.INVALID_CODE },
                { "iw", UScript.HEBREW },
                /* test abbr */
                { "Hani", UScript.HAN },
                { "Hang", UScript.HANGUL },
                { "Hebr", UScript.HEBREW },
                { "Hira", UScript.HIRAGANA },
                { "Knda", UScript.KANNADA },
                { "Kana", UScript.KATAKANA },
                { "Khmr", UScript.KHMER },
                { "Lao", UScript.LAO },
                { "Latn", UScript.LATIN }, /* "Latf","Latg", */
                { "Mlym", UScript.MALAYALAM },
                { "Mong", UScript.MONGOLIAN },
                /* test names */
                { "CYRILLIC", UScript.CYRILLIC },
                { "DESERET", UScript.DESERET },
                { "DEVANAGARI", UScript.DEVANAGARI },
                { "ETHIOPIC", UScript.ETHIOPIC },
                { "GEORGIAN", UScript.GEORGIAN },
                { "GOTHIC", UScript.GOTHIC },
                { "GREEK", UScript.GREEK },
                { "GUJARATI", UScript.GUJARATI },
                { "COMMON", UScript.COMMON },
                { "INHERITED", UScript.INHERITED },
                /* test lower case names */
                { "malayalam", UScript.MALAYALAM },
                { "mongolian", UScript.MONGOLIAN },
                { "myanmar", UScript.MYANMAR },
                { "ogham", UScript.OGHAM },
                { "old-italic", UScript.OLD_ITALIC },
                { "oriya", UScript.ORIYA },
                { "runic", UScript.RUNIC },
                { "sinhala", UScript.SINHALA },
                { "syriac", UScript.SYRIAC },
                { "tamil", UScript.TAMIL },
                { "telugu", UScript.TELUGU },
                { "thaana", UScript.THAANA },
                { "thai", UScript.THAI },
                { "tibetan", UScript.TIBETAN },
                /* test the bounds */
                { "Cans", UScript.CANADIAN_ABORIGINAL },
                { "arabic", UScript.ARABIC },
                { "Yi", UScript.YI },
                { "Zyyy", UScript.COMMON }
        });
    }

    @Test
    public void TestGetCode() {
        int[] code = UScript.getCode(testName);
        if (code == null) {
            if (expected != UScript.INVALID_CODE) {
                // getCode returns null if the code could not be found
                errln("Error testing UScript.getCode(). Got: null" + " Expected: " + expected + " for locale "
                        + testName);
            }
        } else if ((code[0] != expected)) {
            // Report the script code that was compared, not the array object itself.
            errln("Error testing UScript.getCode(). Got: " + code[0] + " Expected: " + expected + " for locale "
                    + testName);
        }
    }
}
@RunWith(Parameterized.class)
public static class GetNameTest {
private int testCode;
private String expected;
public GetNameTest(int testCode, String expected) {
this.testCode = testCode;
this.expected = expected;
}
@Parameterized.Parameters
public static Collection testData() {
return Arrays.asList(new Object[][] {
{ UScript.CYRILLIC, "Cyrillic" },
{ UScript.DESERET, "Deseret" },
{ UScript.DEVANAGARI, "Devanagari" },
{ UScript.ETHIOPIC, "Ethiopic" },
{ UScript.GEORGIAN, "Georgian" },
{ UScript.GOTHIC, "Gothic" },
{ UScript.GREEK, "Greek" },
{ UScript.GUJARATI, "Gujarati" }
});
}
@Test
public void TestGetName() {
String scriptName = UScript.getName(testCode);
if (!expected.equals(scriptName)) {
errln("Error testing UScript.getName(). Got: " + scriptName + " Expected: " + expected);
}
}
}
/**
 * Parameterized check that {@code UScript.getShortName(int)} returns the
 * expected abbreviation for each script code listed in {@link #testData()}.
 */
@RunWith(Parameterized.class)
public static class GetShortNameTest {
    private int testCode;
    private String expected;

    /**
     * @param testCode script code passed to {@code UScript.getShortName}
     * @param expected abbreviation that call is expected to return
     */
    public GetShortNameTest(int testCode, String expected) {
        this.testCode = testCode;
        this.expected = expected;
    }

    /** Rows of {script code, expected abbreviation}. */
    @Parameterized.Parameters
    public static Collection testData() {
        Object[][] rows = {
            { UScript.HAN, "Hani" },
            { UScript.HANGUL, "Hang" },
            { UScript.HEBREW, "Hebr" },
            { UScript.HIRAGANA, "Hira" },
            { UScript.KANNADA, "Knda" },
            { UScript.KATAKANA, "Kana" },
            { UScript.KHMER, "Khmr" },
            { UScript.LAO, "Laoo" },
            { UScript.LATIN, "Latn" },
            { UScript.MALAYALAM, "Mlym" },
            { UScript.MONGOLIAN, "Mong" },
        };
        return Arrays.asList(rows);
    }

    @Test
    public void TestGetShortName() {
        String shortName = UScript.getShortName(testCode);
        if (expected.equals(shortName)) {
            return;
        }
        errln("Error testing UScript.getShortName(). Got: " + shortName + " Expected: " + expected);
    }
}
/**
 * Parameterized check that {@code UScript.getScript(int)} returns the expected
 * script code for each code point listed in {@link #testData()}.
 */
@RunWith(Parameterized.class)
public static class GetScriptTest {
    private int codepoint;
    private int expected;

    /**
     * @param codepoint one data row: element 0 is the code point to look up,
     *                  element 1 is the script code expected for it
     */
    public GetScriptTest(int[] codepoint) {
        this.codepoint = codepoint[0];
        this.expected = codepoint[1];
    }

    /** Rows of {code point, expected script code}. */
    @Parameterized.Parameters
    public static Collection testData() {
        return Arrays.asList(new int[][] {
            { 0x0000FF9D, UScript.KATAKANA },
            { 0x0000FFBE, UScript.HANGUL },
            { 0x0000FFC7, UScript.HANGUL },
            { 0x0000FFCF, UScript.HANGUL },
            { 0x0000FFD7, UScript.HANGUL },
            { 0x0000FFDC, UScript.HANGUL },
            { 0x00010300, UScript.OLD_ITALIC },
            { 0x00010330, UScript.GOTHIC },
            { 0x0001034A, UScript.GOTHIC },
            { 0x00010400, UScript.DESERET },
            { 0x00010428, UScript.DESERET },
            { 0x0001D167, UScript.INHERITED },
            { 0x0001D17B, UScript.INHERITED },
            { 0x0001D185, UScript.INHERITED },
            { 0x0001D1AA, UScript.INHERITED },
            { 0x00020000, UScript.HAN },
            { 0x00000D02, UScript.MALAYALAM },
            { 0x00050005, UScript.UNKNOWN }, // new Zzzz value in Unicode 5.0
            { 0x00000000, UScript.COMMON },
            { 0x0001D169, UScript.INHERITED },
            { 0x0001D182, UScript.INHERITED },
            { 0x0001D18B, UScript.INHERITED },
            { 0x0001D1AD, UScript.INHERITED },
        });
    }

    @Test
    public void TestGetScript() {
        int code = UScript.getScript(codepoint);
        if (code != expected) {
            // The code point must be concatenated outside the string literal,
            // otherwise the message never says which code point failed.
            errln("Error testing UScript.getScript(). Got: " + code + " Expected: " + expected
                    + " for codepoint 0x" + Integer.toHexString(codepoint) + ".");
        }
    }
}
}

View File

@ -10,7 +10,6 @@
package com.ibm.icu.dev.test.lang;
import java.util.BitSet;
import java.util.Locale;
import org.junit.Test;
@ -19,7 +18,6 @@ import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.lang.UScript.ScriptUsage;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
public class TestUScript extends TestFmwk {
@ -30,350 +28,6 @@ public class TestUScript extends TestFmwk {
{
}
/**
 * Renders script codes as their short names separated by single spaces.
 *
 * @param scripts script codes, or {@code null}
 * @return the joined short names, or the literal string "null" for a null array
 */
private static String scriptsToString(int[] scripts) {
    if (scripts == null) {
        return "null";
    }
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < scripts.length; ++i) {
        // Separate entries only once something has already been written.
        if (joined.length() > 0) {
            joined.append(' ');
        }
        joined.append(UScript.getShortName(scripts[i]));
    }
    return joined.toString();
}
/**
 * Asserts that two script-code arrays are equal by comparing their
 * short-name renderings, so a failure message shows readable script names.
 */
private void assertEqualScripts(String msg, int[] expectedScripts, int[] actualScripts) {
    String expectedText = scriptsToString(expectedScripts);
    String actualText = scriptsToString(actualScripts);
    assertEquals(msg, expectedText, actualText);
}
/**
 * Checks {@code UScript.getCode(ULocale)} for a set of locales, including
 * unknown ones (expected to yield {@code null}), multi-script languages, and
 * one lookup performed while the default locale is temporarily changed.
 */
@Test
public void TestLocaleGetCode(){
    final ULocale[] testNames={
        /* test locale */
        new ULocale("en"), new ULocale("en_US"),
        new ULocale("sr"), new ULocale("ta") ,
        new ULocale("te_IN"),
        new ULocale("hi"),
        new ULocale("he"), new ULocale("ar"),
        new ULocale("abcde"),
        new ULocale("abcde_cdef"),
        new ULocale("iw")
    };
    final int[] expected ={
        /* locales should return */
        UScript.LATIN, UScript.LATIN,
        UScript.CYRILLIC, UScript.TAMIL,
        UScript.TELUGU,UScript.DEVANAGARI,
        UScript.HEBREW, UScript.ARABIC,
        UScript.INVALID_CODE,UScript.INVALID_CODE,
        UScript.HEBREW
    };
    int numErrors = 0;
    for (int i = 0; i < testNames.length; i++) {
        int[] code = UScript.getCode(testNames[i]);
        if (code == null) {
            if (expected[i] != UScript.INVALID_CODE) {
                logln("Error getting script code Got: null" + " Expected: " +expected[i] +" for name "+testNames[i]);
                numErrors++;
            }
            // getCode returns null if the code could not be found
            continue;
        }
        if (code[0] != expected[i]) {
            logln("Error getting script code Got: " +code[0] + " Expected: " +expected[i] +" for name "+testNames[i]);
            numErrors++;
        }
    }
    reportDataErrors(numErrors);

    // Look up a locale while it is also the default locale. The original
    // default is restored in a finally block so that a failure or exception
    // here cannot leak the changed default into other tests.
    ULocale defaultLoc = ULocale.getDefault();
    ULocale esperanto = new ULocale("eo_DE");
    ULocale.setDefault(esperanto);
    try {
        int[] code = UScript.getCode(esperanto);
        if (code != null) {
            if (code[0] != UScript.LATIN) {
                errln("Did not get the expected script code for Esperanto");
            }
        } else {
            warnln("Could not load the locale data.");
        }
    } finally {
        ULocale.setDefault(defaultLoc);
    }

    // Should work regardless of whether we have locale data for the language.
    assertEqualScripts("tg script: Cyrl",  // Tajik
            new int[] { UScript.CYRILLIC },
            UScript.getCode(new ULocale("tg")));
    assertEqualScripts("xsr script: Deva",  // Sherpa
            new int[] { UScript.DEVANAGARI },
            UScript.getCode(new ULocale("xsr")));

    // Multi-script languages.
    assertEqualScripts("ja scripts: Kana Hira Hani",
            new int[] { UScript.KATAKANA, UScript.HIRAGANA, UScript.HAN },
            UScript.getCode(ULocale.JAPANESE));
    assertEqualScripts("ko scripts: Hang Hani",
            new int[] { UScript.HANGUL, UScript.HAN },
            UScript.getCode(ULocale.KOREAN));
    assertEqualScripts("zh script: Hani",
            new int[] { UScript.HAN },
            UScript.getCode(ULocale.CHINESE));
    assertEqualScripts("zh-Hant scripts: Hani Bopo",
            new int[] { UScript.HAN, UScript.BOPOMOFO },
            UScript.getCode(ULocale.TRADITIONAL_CHINESE));
    assertEqualScripts("zh-TW scripts: Hani Bopo",
            new int[] { UScript.HAN, UScript.BOPOMOFO },
            UScript.getCode(ULocale.TAIWAN));

    // Ambiguous API, but this probably wants to return Latin rather than Rongorongo (Roro).
    assertEqualScripts("ro-RO script: Latn",
            new int[] { UScript.LATIN },
            UScript.getCode("ro-RO"));  // String not ULocale
}
// TODO(junit): remove this and convert the tests that use this to be parameterized
/**
 * Reports one error via errln when the caller counted at least one failure;
 * does nothing when the count is zero or negative.
 *
 * @param numErrors number of failures the calling test counted
 */
private void reportDataErrors(int numErrors) {
if (numErrors >0) {
// NOTE(review): this was previously described as "missing locale data, so not
// an error, just a warning", yet the call below is errln rather than warnln.
// Confirm which severity is intended.
errln("encountered " + numErrors + " errors.");
}
}
/**
 * Checks that locales written in several scripts return all expected script
 * codes in order, through both {@code UScript.getCode(String)} and
 * {@code UScript.getCode(Locale)}.
 */
@Test
public void TestMultipleCode(){
    final String[] testNames = { "ja" ,"ko_KR","zh","zh_TW"};
    final int[][] expected = {
        {UScript.KATAKANA,UScript.HIRAGANA,UScript.HAN},
        {UScript.HANGUL, UScript.HAN},
        {UScript.HAN},
        {UScript.HAN,UScript.BOPOMOFO}
    };

    int numErrors = 0;
    for (int i = 0; i < testNames.length; i++) {
        int[] code = UScript.getCode(testNames[i]);
        int[] expt = expected[i];
        if (code == null) {
            numErrors++;
            logln("Error getting script code for name "+testNames[i]);
        } else if (code.length != expt.length) {
            // A length mismatch is a failure in its own right. Comparing element
            // by element would either run past the shorter array or silently
            // accept a result with missing scripts.
            numErrors++;
            logln("Error getting script code Got: " + code.length + " scripts Expected: "
                    + expt.length + " scripts for name " + testNames[i]);
        } else {
            for (int j = 0; j < code.length; j++) {
                if (code[j] != expt[j]) {
                    numErrors++;
                    logln("Error getting script code Got: " +code[j] + " Expected: " +expt[j] +" for name "+testNames[i]);
                }
            }
        }
    }
    reportDataErrors(numErrors);

    //cover UScript.getCode(Locale)
    Locale[] testLocales = new Locale[] {
        Locale.JAPANESE,
        Locale.KOREA,
        Locale.CHINESE,
        Locale.TAIWAN };
    logln("Testing UScript.getCode(Locale) ...");
    numErrors = 0;
    // Bound the loop by the array that is actually being iterated.
    for (int i = 0; i < testLocales.length; i++) {
        logln(" Testing locale: " + testLocales[i].getDisplayName());
        int[] code = UScript.getCode(testLocales[i]);
        int[] expt = expected[i];
        if (code == null) {
            numErrors++;
            logln(" Error getting script code for name "+testNames[i]);
        } else if (code.length != expt.length) {
            numErrors++;
            logln(" Error getting script code Got: " + code.length + " scripts Expected: "
                    + expt.length + " scripts for name " + testNames[i]);
        } else {
            for (int j = 0; j < code.length; j++) {
                if (code[j] != expt[j]) {
                    numErrors++;
                    logln(" Error getting script code Got: " +code[j] + " Expected: " +expt[j] +" for name "+testNames[i]);
                }
            }
        }
    }
    reportDataErrors(numErrors);
}
/**
 * Checks UScript.getCode(String) against parallel tables: testNames[i] is
 * looked up and the first returned code is compared with expected[i].
 * The inputs mix locale IDs, four-letter abbreviations, upper-case and
 * lower-case script names, and a few boundary entries.
 * Mismatches are logged and counted, then reported once via reportDataErrors.
 */
@Test
public void TestGetCode(){
final String[] testNames={
/* test locale */
"en", "en_US", "sr", "ta", "gu", "te_IN",
"hi", "he", "ar",
/* test abbr */
"Hani", "Hang","Hebr","Hira",
"Knda","Kana","Khmr","Lao",
"Latn",/*"Latf","Latg",*/
"Mlym", "Mong",
/* test names */
"CYRILLIC","DESERET","DEVANAGARI","ETHIOPIC","GEORGIAN",
"GOTHIC", "GREEK", "GUJARATI", "COMMON", "INHERITED",
/* test lower case names */
"malayalam", "mongolian", "myanmar", "ogham", "old-italic",
"oriya", "runic", "sinhala", "syriac","tamil",
"telugu", "thaana", "thai", "tibetan",
/* test the bounds*/
"Cans", "arabic","Yi","Zyyy"
};
// Must stay index-aligned with testNames above.
final int[] expected ={
/* locales should return */
UScript.LATIN, UScript.LATIN,
UScript.CYRILLIC, UScript.TAMIL, UScript.GUJARATI,
UScript.TELUGU,UScript.DEVANAGARI,
UScript.HEBREW, UScript.ARABIC,
/* abbr should return */
UScript.HAN, UScript.HANGUL, UScript.HEBREW, UScript.HIRAGANA,
UScript.KANNADA, UScript.KATAKANA, UScript.KHMER, UScript.LAO,
UScript.LATIN,/* UScript.LATIN, UScript.LATIN,*/
UScript.MALAYALAM, UScript.MONGOLIAN,
/* names should return */
UScript.CYRILLIC, UScript.DESERET, UScript.DEVANAGARI, UScript.ETHIOPIC, UScript.GEORGIAN,
UScript.GOTHIC, UScript.GREEK, UScript.GUJARATI, UScript.COMMON, UScript.INHERITED,
/* lower case names should return */
UScript.MALAYALAM, UScript.MONGOLIAN, UScript.MYANMAR, UScript.OGHAM, UScript.OLD_ITALIC,
UScript.ORIYA, UScript.RUNIC, UScript.SINHALA, UScript.SYRIAC, UScript.TAMIL,
UScript.TELUGU, UScript.THAANA, UScript.THAI, UScript.TIBETAN,
/* bounds */
UScript.CANADIAN_ABORIGINAL, UScript.ARABIC, UScript.YI, UScript.COMMON
};
int i =0;
int numErrors =0;
for( ; i<testNames.length; i++){
int[] code = UScript.getCode(testNames[i]);
if(code == null){
if(expected[i]==UScript.INVALID_CODE){
// getCode returns null if the code could not be found
continue;
}
// A null result for a name that should resolve is logged and counted.
// NOTE(review): an older comment here referred to jitterbug#2678 and said
// this was "currently commented out", but the check below is active.
logln("Error getting script code Got: null" + " Expected: " +expected[i] +" for name "+testNames[i]);
numErrors++;
continue;
}
// Only the first returned script code is compared.
if((code[0] != expected[i])){
logln("Error getting script code Got: " +code[0] + " Expected: " +expected[i] +" for name "+testNames[i]);
numErrors++;
}
}
reportDataErrors(numErrors);
}
/**
 * Checks that {@code UScript.getName(int)} returns the expected name for a
 * handful of script codes. Mismatches are logged and counted, then reported
 * once as a warning.
 */
@Test
public void TestGetName(){
    final int[] testCodes={
        /* names should return */
        UScript.CYRILLIC, UScript.DESERET, UScript.DEVANAGARI, UScript.ETHIOPIC, UScript.GEORGIAN,
        UScript.GOTHIC, UScript.GREEK, UScript.GUJARATI,
    };
    // Must stay index-aligned with testCodes above.
    final String[] expectedNames={
        /* test names */
        "Cyrillic","Deseret","Devanagari","Ethiopic","Georgian",
        "Gothic", "Greek", "Gujarati",
    };
    int numErrors = 0;
    for (int i = 0; i < testCodes.length; i++) {
        String scriptName = UScript.getName(testCodes[i]);
        if (!expectedNames[i].equals(scriptName)) {
            // This test covers getName, so the log line says "name" rather than
            // "abbreviations".
            logln("Error getting name Got: " +scriptName +" Expected: "+expectedNames[i]);
            numErrors++;
        }
    }
    if (numErrors > 0) {
        warnln("encountered " + numErrors + " errors in UScript.getName()");
    }
}
/**
 * Checks that {@code UScript.getShortName(int)} returns the expected
 * abbreviation for a handful of script codes. Mismatches are logged and
 * counted, then reported once as a warning.
 */
@Test
public void TestGetShortName(){
    final int[] testCodes={
        /* abbr should return */
        UScript.HAN, UScript.HANGUL, UScript.HEBREW, UScript.HIRAGANA,
        UScript.KANNADA, UScript.KATAKANA, UScript.KHMER, UScript.LAO,
        UScript.LATIN,
        UScript.MALAYALAM, UScript.MONGOLIAN,
    };
    // Must stay index-aligned with testCodes above.
    final String[] expectedAbbr={
        /* test abbr */
        "Hani", "Hang","Hebr","Hira",
        "Knda","Kana","Khmr","Laoo",
        "Latn",
        "Mlym", "Mong",
    };
    int mismatches = 0;
    for (int idx = 0; idx < testCodes.length; ++idx) {
        String shortName = UScript.getShortName(testCodes[idx]);
        if (expectedAbbr[idx].equals(shortName)) {
            continue;
        }
        logln("Error getting abbreviations Got: " +shortName+ " Expected: " +expectedAbbr[idx]);
        mismatches++;
    }
    if (mismatches > 0) {
        warnln("encountered " + mismatches + " errors in UScript.getShortName()");
    }
}
/**
 * Checks {@code UScript.getScript(int)} for a table of {code point, expected
 * script} pairs. Each mismatch is logged; one error is raised at the end if
 * any pair failed.
 */
@Test
public void TestGetScript(){
    final int[][] codepoints = {
        {0x0000FF9D, UScript.KATAKANA },
        {0x0000FFBE, UScript.HANGUL },
        {0x0000FFC7, UScript.HANGUL },
        {0x0000FFCF, UScript.HANGUL },
        {0x0000FFD7, UScript.HANGUL},
        {0x0000FFDC, UScript.HANGUL},
        {0x00010300, UScript.OLD_ITALIC},
        {0x00010330, UScript.GOTHIC},
        {0x0001034A, UScript.GOTHIC},
        {0x00010400, UScript.DESERET},
        {0x00010428, UScript.DESERET},
        {0x0001D167, UScript.INHERITED},
        {0x0001D17B, UScript.INHERITED},
        {0x0001D185, UScript.INHERITED},
        {0x0001D1AA, UScript.INHERITED},
        {0x00020000, UScript.HAN},
        {0x00000D02, UScript.MALAYALAM},
        {0x00050005, UScript.UNKNOWN}, // new Zzzz value in Unicode 5.0
        {0x00000000, UScript.COMMON},
        {0x0001D169, UScript.INHERITED },
        {0x0001D182, UScript.INHERITED },
        {0x0001D18B, UScript.INHERITED },
        {0x0001D1AD, UScript.INHERITED },
    };
    boolean anyFailed = false;
    for (int[] pair : codepoints) {
        int actualScript = UScript.getScript(pair[0]);
        if (actualScript != pair[1]) {
            logln("UScript.getScript for codepoint 0x"+ hex(pair[0])+" failed");
            anyFailed = true;
        }
    }
    if (anyFailed) {
        errln("UScript.getScript failed.");
    }
}
@Test
public void TestGetScriptOfCharsWithScriptExtensions() {
/* test characters which have Script_Extensions */

View File

@ -13,6 +13,7 @@ package com.ibm.icu.dev.test.lang;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
@ -777,6 +778,88 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
/**
 * Formats the iterator's current edit as "old->new" for a change,
 * or "old=new" for an unchanged span.
 */
private static String printOneEdit(Edits.Iterator ei) {
    String separator = ei.hasChange() ? "->" : "=";
    return "" + ei.oldLength() + separator + ei.newLength();
}
/**
 * Reference mapping from a destination index to a source index, computed
 * directly from the expected edits.
 * Source deletions make a destination index occur more than once; the last
 * occurrence is used (normally the one in a non-empty destination span),
 * so the spans are walked from the back.
 */
private static int srcIndexFromDest(
        EditChange expected[], int srcLength, int destLength, int index) {
    // Limits of the span currently being examined; they shrink toward 0.
    int srcLimit = srcLength;
    int destLimit = destLength;
    for (int i = expected.length - 1; i >= 0 && index < destLimit; --i) {
        EditChange span = expected[i];
        int srcStart = srcLimit - span.oldLength;
        int destStart = destLimit - span.newLength;
        if (index >= destStart) {
            if (index == destStart) {
                return srcStart;
            }
            // Strictly inside the span: a change maps to its end,
            // an unchanged span maps offset for offset.
            return span.change ? srcLimit : srcStart + (index - destStart);
        }
        srcLimit = srcStart;
        destLimit = destStart;
    }
    // index is outside the string.
    return srcLimit;
}
/**
 * Reference mapping from a source index to a destination index, computed
 * directly from the expected edits by walking the spans from the back.
 */
private static int destIndexFromSrc(
        EditChange expected[], int srcLength, int destLength, int index) {
    // Limits of the span currently being examined; they shrink toward 0.
    int srcLimit = srcLength;
    int destLimit = destLength;
    for (int i = expected.length - 1; i >= 0 && index < srcLimit; --i) {
        EditChange span = expected[i];
        int srcStart = srcLimit - span.oldLength;
        int destStart = destLimit - span.newLength;
        if (index >= srcStart) {
            if (index == srcStart) {
                return destStart;
            }
            // Strictly inside the span: a change maps to its end,
            // an unchanged span maps offset for offset.
            return span.change ? destLimit : destStart + (index - srcStart);
        }
        srcLimit = srcStart;
        destLimit = destStart;
    }
    // index is outside the string.
    return destLimit;
}
/**
 * Steps two fine-grained edit iterators in lockstep and asserts that they
 * report the same sequence of edits and finish at the same step.
 */
private void checkEqualEdits(String name, Edits e1, Edits e2) {
    Edits.Iterator first = e1.getFineIterator();
    Edits.Iterator second = e2.getFineIterator();
    int step = 0;
    boolean bothHaveMore;
    do {
        boolean firstHasNext = first.next();
        boolean secondHasNext = second.next();
        assertEquals(name + " next()[" + step + "]", firstHasNext, secondHasNext);
        assertEquals(name + " edit[" + step + "]", printOneEdit(first), printOneEdit(second));
        // Stop as soon as either iterator is exhausted.
        bothHaveMore = firstHasNext && secondHasNext;
        ++step;
    } while (bothHaveMore);
}
private static void checkEditsIter(
String name, Edits.Iterator ei1, Edits.Iterator ei2, // two equal iterators
EditChange[] expected, boolean withUnchanged) {
@ -786,8 +869,6 @@ public final class UCharacterCaseTest extends TestFmwk
int expSrcIndex = 0;
int expDestIndex = 0;
int expReplIndex = 0;
int expSrcIndexFromDest = 0; // for sourceIndexFromDestinationIndex()
int expDestIndexFromSrc = 0; // for destinationIndexFromSourceIndex()
for (int expIndex = 0; expIndex < expected.length; ++expIndex) {
EditChange expect = expected[expIndex];
String msg = name + ' ' + expIndex;
@ -801,7 +882,7 @@ public final class UCharacterCaseTest extends TestFmwk
assertEquals(msg, expReplIndex, ei1.replacementIndex());
}
if (expect.oldLength > 0 && expDestIndex == expDestIndexFromSrc) {
if (expect.oldLength > 0) {
assertTrue(msg, ei2.findSourceIndex(expSrcIndex));
assertEquals(msg, expect.change, ei2.hasChange());
assertEquals(msg, expect.oldLength, ei2.oldLength());
@ -817,7 +898,7 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
if (expect.newLength > 0 && expSrcIndex == expSrcIndexFromDest) {
if (expect.newLength > 0) {
assertTrue(msg, ei2.findDestinationIndex(expDestIndex));
assertEquals(msg, expect.change, ei2.hasChange());
assertEquals(msg, expect.oldLength, ei2.oldLength());
@ -833,45 +914,11 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
// Span starts.
assertEquals(name, expDestIndexFromSrc,
ei2.destinationIndexFromSourceIndex(expSrcIndex));
assertEquals(name, expSrcIndexFromDest,
ei2.sourceIndexFromDestinationIndex(expDestIndex));
// Inside unchanged span map offsets 1:1.
if (!expect.change && expect.oldLength >= 2) {
assertEquals(name, expDestIndex + 1,
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
assertEquals(name, expSrcIndex + 1,
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
}
// Inside change span map to the span limit.
int expSrcLimit = expSrcIndex + expect.oldLength;
int expDestLimit = expDestIndex + expect.newLength;
if (expect.change) {
if (expect.oldLength >= 2) {
assertEquals(name, expDestLimit,
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
}
if (expect.newLength >= 2) {
assertEquals(name, expSrcLimit,
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
}
}
expSrcIndex = expSrcLimit;
expDestIndex = expDestLimit;
expSrcIndex += expect.oldLength;
expDestIndex += expect.newLength;
if (expect.change) {
expReplIndex += expect.newLength;
}
if (expect.newLength > 0) {
expSrcIndexFromDest = expSrcIndex;
}
if (expect.oldLength > 0) {
expDestIndexFromSrc = expDestIndex;
}
}
String msg = name + " end";
assertFalse(msg, ei1.next());
@ -884,8 +931,49 @@ public final class UCharacterCaseTest extends TestFmwk
assertFalse(name, ei2.findSourceIndex(expSrcIndex));
assertFalse(name, ei2.findDestinationIndex(expDestIndex));
assertEquals(name, expDestIndex, ei2.destinationIndexFromSourceIndex(expSrcIndex));
assertEquals(name, expSrcIndex, ei2.sourceIndexFromDestinationIndex(expDestIndex));
// Check mapping of all indexes against a simple implementation
// that works on the expected changes.
// Iterate once forward, once backward, to cover more runtime conditions.
int srcLength = expSrcIndex;
int destLength = expDestIndex;
List<Integer> srcIndexes = new ArrayList<Integer>();
List<Integer> destIndexes = new ArrayList<Integer>();
srcIndexes.add(-1);
destIndexes.add(-1);
int srcIndex = 0;
int destIndex = 0;
// Collect the start index of every non-empty span and, for spans longer than
// one unit, one index inside the span.
for (int i = 0; i < expected.length; ++i) {
    if (expected[i].oldLength > 0) {
        srcIndexes.add(srcIndex);
        if (expected[i].oldLength > 1) {
            srcIndexes.add(srcIndex + 1);
        }
    }
    if (expected[i].newLength > 0) {
        destIndexes.add(destIndex);
        // Same rule as the source side: a one-unit span has no interior index.
        if (expected[i].newLength > 1) {
            destIndexes.add(destIndex + 1);
        }
    }
    srcIndex += expected[i].oldLength;
    destIndex += expected[i].newLength;
}
srcIndexes.add(srcLength);
destIndexes.add(destLength);
srcIndexes.add(srcLength + 1);
destIndexes.add(destLength + 1);
Collections.reverse(destIndexes);
for (int i : srcIndexes) {
assertEquals(name + " destIndexFromSrc(" + i + "):",
destIndexFromSrc(expected, srcLength, destLength, i),
ei2.destinationIndexFromSourceIndex(i));
}
for (int i : destIndexes) {
assertEquals(name + " srcIndexFromDest(" + i + "):",
srcIndexFromDest(expected, srcLength, destLength, i),
ei2.sourceIndexFromDestinationIndex(i));
}
}
@Test
@ -949,6 +1037,167 @@ public final class UCharacterCaseTest extends TestFmwk
assertFalse("reset then iterator", ei.next());
}
/**
 * Exercises Edits.mergeAndAppend().
 * Three edit lists are built side by side: ab, bc, and expected_ac, the
 * result expected from merging ab with bc. After the merge, the result is
 * compared with expected_ac via checkEqualEdits. The test then appends a
 * second pair of edits, appends a pair of empty edits, and finally checks that
 * edits with mismatched intermediate-string lengths are rejected with
 * IllegalArgumentException.
 * The "ahead by N" comments track how far one list has run ahead of the other
 * in the intermediate string.
 */
@Test
public void TestMergeEdits() {
Edits ab = new Edits(), bc = new Edits(), ac = new Edits(), expected_ac = new Edits();
// Simple: Two parallel non-changes.
ab.addUnchanged(2);
bc.addUnchanged(2);
expected_ac.addUnchanged(2);
// Simple: Two aligned changes.
ab.addReplace(3, 2);
bc.addReplace(2, 1);
expected_ac.addReplace(3, 1);
// Unequal non-changes.
ab.addUnchanged(5);
bc.addUnchanged(3);
expected_ac.addUnchanged(3);
// ab ahead by 2
// Overlapping changes accumulate until they share a boundary.
ab.addReplace(4, 3);
bc.addReplace(3, 2);
ab.addReplace(4, 3);
bc.addReplace(3, 2);
ab.addReplace(4, 3);
bc.addReplace(3, 2);
bc.addUnchanged(4);
expected_ac.addReplace(14, 8);
// bc ahead by 2
// Balance out intermediate-string lengths.
ab.addUnchanged(2);
expected_ac.addUnchanged(2);
// Insert something and delete it: Should disappear.
ab.addReplace(0, 5);
ab.addReplace(0, 2);
bc.addReplace(7, 0);
// Parallel change to make a new boundary.
ab.addReplace(1, 2);
bc.addReplace(2, 3);
expected_ac.addReplace(1, 3);
// Multiple ab deletions should remain separate at the boundary.
ab.addReplace(1, 0);
ab.addReplace(2, 0);
ab.addReplace(3, 0);
expected_ac.addReplace(1, 0);
expected_ac.addReplace(2, 0);
expected_ac.addReplace(3, 0);
// Unequal non-changes can be split for another boundary.
ab.addUnchanged(2);
bc.addUnchanged(1);
expected_ac.addUnchanged(1);
// ab ahead by 1
// Multiple bc insertions should create a boundary and remain separate.
bc.addReplace(0, 4);
bc.addReplace(0, 5);
bc.addReplace(0, 6);
expected_ac.addReplace(0, 4);
expected_ac.addReplace(0, 5);
expected_ac.addReplace(0, 6);
// ab ahead by 1
// Multiple ab deletions in the middle of a bc change are merged.
bc.addReplace(2, 2);
// bc ahead by 1
ab.addReplace(1, 0);
ab.addReplace(2, 0);
ab.addReplace(3, 0);
ab.addReplace(4, 1);
expected_ac.addReplace(11, 2);
// Multiple bc insertions in the middle of an ab change are merged.
ab.addReplace(5, 6);
bc.addReplace(3, 3);
// ab ahead by 3
bc.addReplace(0, 4);
bc.addReplace(0, 5);
bc.addReplace(0, 6);
bc.addReplace(3, 7);
expected_ac.addReplace(5, 25);
// Delete around a deletion.
ab.addReplace(4, 4);
ab.addReplace(3, 0);
ab.addUnchanged(2);
bc.addReplace(2, 2);
bc.addReplace(4, 0);
expected_ac.addReplace(9, 2);
// Insert into an insertion.
ab.addReplace(0, 2);
bc.addReplace(1, 1);
bc.addReplace(0, 8);
bc.addUnchanged(4);
expected_ac.addReplace(0, 10);
// bc ahead by 3
// Balance out intermediate-string lengths.
ab.addUnchanged(3);
expected_ac.addUnchanged(3);
// Deletions meet insertions.
// Output order is arbitrary in principle, but we expect insertions first
// and want to keep it that way.
ab.addReplace(2, 0);
ab.addReplace(4, 0);
ab.addReplace(6, 0);
bc.addReplace(0, 1);
bc.addReplace(0, 3);
bc.addReplace(0, 5);
expected_ac.addReplace(0, 1);
expected_ac.addReplace(0, 3);
expected_ac.addReplace(0, 5);
expected_ac.addReplace(2, 0);
expected_ac.addReplace(4, 0);
expected_ac.addReplace(6, 0);
// End with a non-change, so that further edits are never reordered.
ab.addUnchanged(1);
bc.addUnchanged(1);
expected_ac.addUnchanged(1);
// Merge everything built so far and compare with the expected result.
ac.mergeAndAppend(ab, bc);
checkEqualEdits("ab+bc", expected_ac, ac);
// Append more Edits.
Edits ab2 = new Edits(), bc2 = new Edits();
ab2.addUnchanged(5);
bc2.addReplace(1, 2);
bc2.addUnchanged(4);
expected_ac.addReplace(1, 2);
expected_ac.addUnchanged(4);
ac.mergeAndAppend(ab2, bc2);
checkEqualEdits("ab2+bc2", expected_ac, ac);
// Append empty edits.
Edits empty = new Edits();
ac.mergeAndAppend(empty, empty);
checkEqualEdits("empty+empty", expected_ac, ac);
// Error: Append more edits with mismatched intermediate-string lengths.
Edits mismatch = new Edits();
mismatch.addReplace(1, 1);
try {
ac.mergeAndAppend(ab2, mismatch);
fail("ab2+mismatch did not yield IllegalArgumentException");
} catch (IllegalArgumentException expected) {
// Expected: the exception is the behavior under test, so nothing to do.
}
try {
ac.mergeAndAppend(mismatch, bc2);
fail("mismatch+bc2 did not yield IllegalArgumentException");
} catch (IllegalArgumentException expected) {
// Expected: the exception is the behavior under test, so nothing to do.
}
}
@Test
public void TestCaseMapWithEdits() {
StringBuilder sb = new StringBuilder();

View File

@ -84,7 +84,7 @@ public class UnicodeSetTest extends TestFmwk {
@Test
public void TestPropertyAccess() {
int count = 0;
int count = 0;
// test to see that all of the names work
for (int propNum = UProperty.BINARY_START; propNum < UProperty.INT_LIMIT; ++propNum) {
count++;
@ -130,7 +130,7 @@ public class UnicodeSetTest extends TestFmwk {
}
} catch (RuntimeException e1) {
errln("Can't get property value name for: "
+ "Property (" + propNum + "): " + propName + ", "
+ "Property (" + propNum + "): " + propName + ", "
+ "Value (" + valueNum + ") "
+ ", NameChoice: " + nameChoice + ", "
+ e1.getClass().getName());
@ -142,7 +142,7 @@ public class UnicodeSetTest extends TestFmwk {
testSet = new UnicodeSet("[:" + propName + "=" + valueName + ":]");
} catch (RuntimeException e) {
errln("Can't create UnicodeSet for: "
+ "Property (" + propNum + "): " + propName + ", "
+ "Property (" + propNum + "): " + propName + ", "
+ "Value (" + valueNum + "): " + valueName + ", "
+ e.getClass().getName());
continue;
@ -155,13 +155,13 @@ public class UnicodeSetTest extends TestFmwk {
}
}
if (collectedErrors.size() != 0) {
errln("Property Value Differs: "
+ "Property (" + propNum + "): " + propName + ", "
errln("Property Value Differs: "
+ "Property (" + propNum + "): " + propName + ", "
+ "Value (" + valueNum + "): " + valueName + ", "
+ "Differing values: " + collectedErrors.toPattern(true));
}
}
}
}
}
}
@ -183,7 +183,7 @@ public class UnicodeSetTest extends TestFmwk {
if (!toPatternAux(0, i)) continue;
if (!toPatternAux(i, 0xFFFF)) continue;
}
}
}
// Test pattern behavior of multicharacter strings.
UnicodeSet s = new UnicodeSet("[a-z {aa} {ab}]");
@ -211,7 +211,7 @@ public class UnicodeSetTest extends TestFmwk {
new String[] {"abc", NOT, "ab"});
// JB#3400: For 2 character ranges prefer [ab] to [a-b]
s.clear();
s.clear();
s.add('a', 'b');
expectToPattern(s, "[ab]", null);
@ -244,7 +244,7 @@ public class UnicodeSetTest extends TestFmwk {
}
static String[] OTHER_TOPATTERN_TESTS = {
"[[:latin:]&[:greek:]]",
"[[:latin:]&[:greek:]]",
"[[:latin:]-[:greek:]]",
"[:nonspacing mark:]"
};
@ -456,7 +456,7 @@ public class UnicodeSetTest extends TestFmwk {
for (int i=0; i<0x200; ++i) {
boolean l = UCharacter.isLetter(i);
if (l != set.contains((char)i)) {
errln("FAIL: L contains " + (char)i + " = " +
errln("FAIL: L contains " + (char)i + " = " +
set.contains((char)i));
if (++failures == 10) break;
}
@ -466,7 +466,7 @@ public class UnicodeSetTest extends TestFmwk {
for (int i=0; i<0x200; ++i) {
boolean lu = (UCharacter.getType(i) == ECharacterCategory.UPPERCASE_LETTER);
if (lu != set.contains((char)i)) {
errln("FAIL: Lu contains " + (char)i + " = " +
errln("FAIL: Lu contains " + (char)i + " = " +
set.contains((char)i));
if (++failures == 20) break;
}
@ -653,7 +653,7 @@ public class UnicodeSetTest extends TestFmwk {
logln("bitsToSet(setToBits(c)): " + c);
} else {
errln("FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
}
}
// Additional tests for coverage JB#2118
//UnicodeSet::complement(class UnicodeString const &)
@ -744,10 +744,10 @@ public class UnicodeSetTest extends TestFmwk {
}
{
//Cover addAll(Collection) and addAllTo(Collection)
//Cover addAll(Collection) and addAllTo(Collection)
// Seems that there is a bug in addAll(Collection) operation
// Ram also add a similar test to UtilityTest.java
logln("Testing addAll(Collection) ... ");
logln("Testing addAll(Collection) ... ");
String[] array = {"a", "b", "c", "de"};
List list = Arrays.asList(array);
Set aset = new HashSet(list);
@ -783,20 +783,20 @@ public class UnicodeSetTest extends TestFmwk {
// Object[][] testList = {
// {I_EQUALS, UnicodeSet.fromAll("abc"),
// new UnicodeSet("[a-c]")},
//
//
// {I_EQUALS, UnicodeSet.from("ch").add('a','z').add("ll"),
// new UnicodeSet("[{ll}{ch}a-z]")},
//
// {I_EQUALS, UnicodeSet.from("ab}c"),
//
// {I_EQUALS, UnicodeSet.from("ab}c"),
// new UnicodeSet("[{ab\\}c}]")},
//
// {I_EQUALS, new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
//
// {I_EQUALS, new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
// new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")},
// };
//
//
// for (int i = 0; i < testList.length; ++i) {
// expectRelation(testList[i][0], testList[i][1], testList[i][2], "(" + i + ")");
// }
// }
UnicodeSet[][] testList = {
{UnicodeSet.fromAll("abc"),
@ -805,10 +805,10 @@ public class UnicodeSetTest extends TestFmwk {
{UnicodeSet.from("ch").add('a','z').add("ll"),
new UnicodeSet("[{ll}{ch}a-z]")},
{UnicodeSet.from("ab}c"),
{UnicodeSet.from("ab}c"),
new UnicodeSet("[{ab\\}c}]")},
{new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
{new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")},
};
@ -816,10 +816,10 @@ public class UnicodeSetTest extends TestFmwk {
if (!testList[i][0].equals(testList[i][1])) {
errln("FAIL: sets unequal; see source code (" + i + ")");
}
}
}
}
static final Integer
static final Integer
I_ANY = new Integer(SortedSetRelation.ANY),
I_CONTAINS = new Integer(SortedSetRelation.CONTAINS),
I_DISJOINT = new Integer(SortedSetRelation.DISJOINT),
@ -875,12 +875,12 @@ public class UnicodeSetTest extends TestFmwk {
iset.add(new Integer(size + 1)); // add odd value in middle
CheckSpeed(iset, jset, "when a contains b", iterations);
CheckSpeed(iset, jset, "when a contains b", iterations);
CheckSpeed(jset, iset, "when b contains a", iterations);
jset.add(new Integer(size - 1)); // add different odd value in middle
CheckSpeed(jset, iset, "when a, b are disjoint", iterations);
CheckSpeed(jset, iset, "when a, b are disjoint", iterations);
}
void CheckSpeed(SortedSet iset, SortedSet jset, String message, int iterations) {
@ -952,28 +952,28 @@ public class UnicodeSetTest extends TestFmwk {
public static final String[] RELATION_NAME = {
"both-are-null",
"a-is-null",
"equals",
"a-is-null",
"equals",
"is-contained-in",
"b-is-null",
"is-disjoint_with",
"contains",
"contains",
"any", };
boolean dumbHasRelation(Collection A, int filter, Collection B) {
Collection ab = new TreeSet(A);
ab.retainAll(B);
if (ab.size() > 0 && (filter & SortedSetRelation.A_AND_B) == 0) return false;
if (ab.size() > 0 && (filter & SortedSetRelation.A_AND_B) == 0) return false;
// A - B size == A.size - A&B.size
if (A.size() > ab.size() && (filter & SortedSetRelation.A_NOT_B) == 0) return false;
if (A.size() > ab.size() && (filter & SortedSetRelation.A_NOT_B) == 0) return false;
// B - A size == B.size - A&B.size
if (B.size() > ab.size() && (filter & SortedSetRelation.B_NOT_A) == 0) return false;
if (B.size() > ab.size() && (filter & SortedSetRelation.B_NOT_A) == 0) return false;
return true;
}
}
void checkSetRelation(SortedSet a, SortedSet b, String message) {
for (int i = 0; i < 8; ++i) {
@ -984,7 +984,7 @@ public class UnicodeSetTest extends TestFmwk {
logln(message + " " + hasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b);
if (hasRelation != dumbHasRelation) {
errln("FAIL: " +
errln("FAIL: " +
message + " " + dumbHasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b);
}
}
@ -1077,9 +1077,9 @@ public class UnicodeSetTest extends TestFmwk {
"\u03D6", // 1.1
"\u03D8\u03D9", // 3.2
"[:Age=3.1:]",
"\\u1800\\u3400\\U0002f800",
"\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
"[:Age=3.1:]",
"\\u1800\\u3400\\U0002f800",
"\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
// JB#2350: Case_Sensitive
"[:Case Sensitive:]",
@ -1168,7 +1168,7 @@ public class UnicodeSetTest extends TestFmwk {
"\\uFDF2"
};
for (int i=0; i<DATA.length; i+=3) {
for (int i=0; i<DATA.length; i+=3) {
expectContainment(DATA[i], DATA[i+1], DATA[i+2]);
}
}
@ -1319,7 +1319,7 @@ public class UnicodeSetTest extends TestFmwk {
CASE,
"[{F\uFB01}]",
"[\uFB03{ffi}]",
"[\uFB03{ffi}]",
CASE,
"[a-z]","[A-Za-z\u017F\u212A]",
@ -1615,6 +1615,7 @@ public class UnicodeSetTest extends TestFmwk {
assertEquals("compareTo-shorter-first", goalShortest, sorted);
TreeSet<UnicodeSet> sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){
@Override
public int compare(UnicodeSet o1, UnicodeSet o2) {
// TODO Auto-generated method stub
return o1.compareTo(o2, ComparisonStyle.LONGER_FIRST);
@ -1625,6 +1626,7 @@ public class UnicodeSetTest extends TestFmwk {
assertEquals("compareTo-longer-first", goalLongest, sorted);
sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){
@Override
public int compare(UnicodeSet o1, UnicodeSet o2) {
// TODO Auto-generated method stub
return o1.compareTo(o2, ComparisonStyle.LEXICOGRAPHIC);
@ -1931,6 +1933,7 @@ public class UnicodeSetTest extends TestFmwk {
/* (non-Javadoc)
* @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
*/
@Override
public char[] lookup(String s) {
logln("TokenSymbolTable: lookup \"" + s + "\" => \"" +
new String((char[]) contents.get(s)) + "\"");
@ -1940,6 +1943,7 @@ public class UnicodeSetTest extends TestFmwk {
/* (non-Javadoc)
* @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
*/
@Override
public UnicodeMatcher lookupMatcher(int ch) {
return null;
}
@ -1948,6 +1952,7 @@ public class UnicodeSetTest extends TestFmwk {
* @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String,
java.text.ParsePosition, int)
*/
@Override
public String parseReference(String text, ParsePosition pos, int
limit) {
int cp;
@ -1982,7 +1987,7 @@ public class UnicodeSetTest extends TestFmwk {
CharsToUnicodeString("abc\\U00010000"),
"\uD800;\uDC00"); // split apart surrogate-pair
if (set.size() != 4) {
errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " +
errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " +
set.size() + ", expected 4"));
}
}
@ -2385,11 +2390,11 @@ public class UnicodeSetTest extends TestFmwk {
}
boolean contained = set.contains(expStrings[i]);
if (contained == in) {
logln("Ok: " + expPat +
logln("Ok: " + expPat +
(contained ? " contains {" : " does not contain {") +
Utility.escape(expStrings[i]) + "}");
} else {
errln("FAIL: " + expPat +
errln("FAIL: " + expPat +
(contained ? " contains {" : " does not contain {") +
Utility.escape(expStrings[i]) + "}");
}
@ -2442,10 +2447,10 @@ public class UnicodeSetTest extends TestFmwk {
assertEquals("", "M, a-c", CollectionUtilities.join(us1.ranges(), ", "));
// Sample code
for (@SuppressWarnings("unused") EntryRange range : us1.ranges()) {
// do something with code points between range.codepoint and range.codepointEnd;
for (@SuppressWarnings("unused") EntryRange range : us1.ranges()) {
// do something with code points between range.codepoint and range.codepointEnd;
}
for (@SuppressWarnings("unused") String s : us1.strings()) {
for (@SuppressWarnings("unused") String s : us1.strings()) {
// do something with each string;
}
@ -2479,7 +2484,7 @@ public class UnicodeSetTest extends TestFmwk {
UnicodeSetSpanner m;
m = new UnicodeSetSpanner(new UnicodeSet("[._]"));
assertEquals("", "abc", m.deleteFrom("_._a_._b_._c_._"));
assertEquals("", "abc", m.deleteFrom("_._a_._b_._c_._"));
assertEquals("", "_.__.__.__._", m.deleteFrom("_._a_._b_._c_._", SpanCondition.NOT_CONTAINED));
assertEquals("", "a_._b_._c", m.trim("_._a_._b_._c_._"));
@ -2511,11 +2516,11 @@ public class UnicodeSetTest extends TestFmwk {
checkCodePoints("👦", "👧", CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, null, 1);
}
private void checkCodePoints(String a, String b, CountMethod quantifier, SpanCondition spanCondition,
private void checkCodePoints(String a, String b, CountMethod quantifier, SpanCondition spanCondition,
String expectedReplaced, int expectedCount) {
final String ab = a+b;
UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[{" + a + "}]"));
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")",
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")",
expectedCount,
callCountIn(m, ab, quantifier, spanCondition)
);
@ -2523,7 +2528,7 @@ public class UnicodeSetTest extends TestFmwk {
if (expectedReplaced == null) {
expectedReplaced = "-" + b;
}
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").replaceFrom(\"" + ab + "\", \"-\")",
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").replaceFrom(\"" + ab + "\", \"-\")",
expectedReplaced, m.replaceFrom(ab, "-", quantifier));
}
@ -2586,9 +2591,6 @@ public class UnicodeSetTest extends TestFmwk {
/**
* Check that there are no gaps, when we alternate spanning. That is, there
* should only be a zero length span at the very start.
* @param longString
* @param us
* @param simple
*/
private int checkSpan(String longString, UnicodeSet us, SpanCondition spanCondition) {
int start = 0;
@ -2657,7 +2659,7 @@ public class UnicodeSetTest extends TestFmwk {
assertEquals("CharSequence complementAll", new UnicodeSet("[ABbc]"), new UnicodeSet("[a-cA]").complementAll(new StringBuilder("aB")) );
// containment
assertEquals("CharSequence contains", true, new UnicodeSet("[a-cA{ab}]"). contains(new StringBuilder("ab")) );
assertEquals("CharSequence contains", true, new UnicodeSet("[a-cA{ab}]"). contains(new StringBuilder("ab")) );
assertEquals("CharSequence containsNone", false, new UnicodeSet("[a-cA]"). containsNone(new StringBuilder("ab")) );
assertEquals("CharSequence containsSome", true, new UnicodeSet("[a-cA{ab}]"). containsSome(new StringBuilder("ab")) );
@ -2726,7 +2728,7 @@ public class UnicodeSetTest extends TestFmwk {
0, UnicodeSet.fromAll("a").compareTo(Collections.singleton("a")));
// Longer is bigger
assertTrue("UnicodeSet is empty",
assertTrue("UnicodeSet is empty",
UnicodeSet.ALL_CODE_POINTS.compareTo(test_set) > 0);
assertTrue("UnicodeSet not empty",
UnicodeSet.EMPTY.compareTo(Collections.singleton("a")) < 0);
@ -2739,4 +2741,33 @@ public class UnicodeSetTest extends TestFmwk {
assertTrue("UnicodeSet comparison wrong",
UnicodeSet.fromAll("b").compareTo(Collections.singleton("a")) > 0);
}
@Test
public void TestUnusedCcc() {
    // Each row pairs a property pattern with the message reported on failure.

    // All numeric ccc values 0..255 are valid, but many are unused:
    // these patterns must parse and must produce an empty set.
    final String[][] validButUnused = {
        {"[:ccc=2:]", "[:ccc=2:] -> empty set"},
        {"[:ccc=255:]", "[:ccc=255:] -> empty set"},
    };
    for (String[] row : validButUnused) {
        UnicodeSet parsed = new UnicodeSet(row[0]);
        assertTrue(row[1], parsed.isEmpty());
    }

    // Non-integer values and values outside 0..255 are invalid:
    // constructing the set must throw IllegalArgumentException.
    final String[][] invalid = {
        {"[:ccc=-1:]", "[:ccc=-1:] -> illegal argument"},
        {"[:ccc=256:]", "[:ccc=256:] -> illegal argument"},
        {"[:ccc=1.1:]", "[:ccc=1.1:] -> illegal argument"},
    };
    for (String[] row : invalid) {
        try {
            new UnicodeSet(row[0]);
            // Reaching this line means the constructor accepted a bad value.
            fail(row[1]);
        } catch (IllegalArgumentException expected) {
            // This is the outcome the test wants; nothing to do.
        }
    }
}
}

View File

@ -63,7 +63,7 @@ public class BreakIteratorTest extends TestFmwk
List<String> previousResults = _testLastAndPrevious(bi, text);
logln("comparing forward and backward...");
//TODO(junit) - needs to be rewritten
// TODO(#13318): As part of clean-up, permanently remove the error count check.
//int errs = getErrorCount();
compareFragmentLists("forward iteration", "backward iteration", nextResults,
previousResults);

File diff suppressed because it is too large Load Diff

View File

@ -279,7 +279,7 @@ public class RBBITest extends TestFmwk {
List<String> previousResults = _testLastAndPrevious(rbbi, text);
logln("comparing forward and backward...");
//TODO(junit) - needs to be rewritten
// TODO(#13318): As part of clean-up, permanently remove the error count check.
//int errs = getErrorCount();
compareFragmentLists("forward iteration", "backward iteration", nextResults, previousResults);
//if (getErrorCount() == errs) {
@ -957,4 +957,20 @@ public class RBBITest extends TestFmwk {
}
}
@Test
public void TestBug12519() {
    // Word iterators requested for two different locales.
    final RuleBasedBreakIterator english =
            (RuleBasedBreakIterator) BreakIterator.getWordInstance(ULocale.ENGLISH);
    final RuleBasedBreakIterator french =
            (RuleBasedBreakIterator) BreakIterator.getWordInstance(ULocale.FRANCE);

    // Each iterator reports its own valid locale, yet the two compare equal.
    assertEquals("", ULocale.ENGLISH, english.getLocale(ULocale.VALID_LOCALE));
    assertEquals("", ULocale.FRENCH, french.getLocale(ULocale.VALID_LOCALE));
    assertEquals("Locales do not participate in BreakIterator equality.", english, french);

    // A clone must equal its source and report the same valid locale as the source.
    final RuleBasedBreakIterator englishCopy = (RuleBasedBreakIterator) english.clone();
    assertEquals("", english, englishCopy);
    assertEquals("", ULocale.ENGLISH, englishCopy.getLocale(ULocale.VALID_LOCALE));

    final RuleBasedBreakIterator frenchCopy = (RuleBasedBreakIterator) french.clone();
    assertEquals("", french, frenchCopy);
    assertEquals("", ULocale.FRENCH, frenchCopy.getLocale(ULocale.VALID_LOCALE));
}
}

View File

@ -20,7 +20,7 @@ import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;
@ -39,7 +39,7 @@ public RBBITestExtended() {
static class TestParams {
BreakIterator bi;
StringBuffer dataToBreak = new StringBuffer();
StringBuilder dataToBreak = new StringBuilder();
int[] expectedBreaks = new int[1000];
int[] srcLine = new int[1000];
int[] srcCol = new int[1000];
@ -55,7 +55,7 @@ public void TestExtended() {
//
// Open and read the test data file.
//
StringBuffer testFileBuf = new StringBuffer();
StringBuilder testFileBuf = new StringBuilder();
InputStream is = null;
try {
is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
@ -78,7 +78,7 @@ public void TestExtended() {
continue;
}
UTF16.append(testFileBuf, c);
testFileBuf.appendCodePoint(c);
}
} finally {
isr.close();
@ -99,20 +99,12 @@ public void TestExtended() {
final int PARSE_TAG = 2;
final int PARSE_DATA = 3;
final int PARSE_NUM = 4;
final int PARSE_RULES = 5;
int parseState = PARSE_TAG;
int savedState = PARSE_TAG;
final char CH_LF = 0x0a;
final char CH_CR = 0x0d;
final char CH_HASH = 0x23;
/*static const UChar CH_PERIOD = 0x2e;*/
final char CH_LT = 0x3c;
final char CH_GT = 0x3e;
final char CH_BACKSLASH = 0x5c;
final char CH_BULLET = 0x2022;
int lineNum = 1;
int colStart = 0;
int column = 0;
@ -120,17 +112,21 @@ public void TestExtended() {
int i;
int tagValue = 0; // The numeric value of a <nnn> tag.
StringBuilder rules = new StringBuilder(); // Holds rules from a <rules> ... </rules> block
int rulesFirstLine = 0; // Line number of the start of current <rules> block
int len = testString.length();
for (charIdx = 0; charIdx < len; ) {
int c = UTF16.charAt(testString, charIdx);
int c = testString.codePointAt(charIdx);
charIdx++;
if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
// treat CRLF as a unit
c = CH_LF;
c = '\n';
charIdx++;
}
if (c == CH_LF || c == CH_CR) {
if (c == '\n' || c == '\r') {
lineNum++;
colStart = charIdx;
}
@ -145,7 +141,7 @@ public void TestExtended() {
case PARSE_TAG:
{
if (c == CH_HASH) {
if (c == '#') {
parseState = PARSE_COMMENT;
savedState = PARSE_TAG;
break;
@ -178,6 +174,15 @@ public void TestExtended() {
charIdx += 6;
break;
}
if (testString.startsWith("<rules>", charIdx-1) ||
testString.startsWith("<badrules>", charIdx-1)) {
charIdx = testString.indexOf('>', charIdx) + 1;
parseState = PARSE_RULES;
rules.setLength(0);
rulesFirstLine = lineNum;
break;
}
if (testString.startsWith("<locale ", charIdx-1)) {
int closeIndex = testString.indexOf(">", charIdx);
if (closeIndex < 0) {
@ -206,8 +211,36 @@ public void TestExtended() {
//savedState = PARSE_DATA;
}
case PARSE_RULES:
if (testString.startsWith("</rules>", charIdx-1)) {
charIdx += 7;
parseState = PARSE_TAG;
try {
tp.bi = new RuleBasedBreakIterator(rules.toString());
} catch (IllegalArgumentException e) {
errln(String.format("rbbitst.txt:%d Error creating break iterator from rules. %s", lineNum, e));
}
} else if (testString.startsWith("</badrules>", charIdx-1)) {
charIdx += 10;
parseState = PARSE_TAG;
boolean goodRules = true;
try {
new RuleBasedBreakIterator(rules.toString());
} catch (IllegalArgumentException e) {
goodRules = false;
}
if (goodRules) {
errln(String.format(
"rbbitst.txt:%d Expected, but did not get, a failure creating break iterator from rules.",
lineNum));
}
} else {
rules.appendCodePoint(c);
}
break;
case PARSE_DATA:
if (c == CH_BULLET) {
if (c == '•') {
int breakIdx = tp.dataToBreak.length();
tp.expectedBreaks[breakIdx] = -1;
tp.srcLine[breakIdx] = lineNum;
@ -247,7 +280,7 @@ public void TestExtended() {
} else {
// Named code point was recognized. Insert it
// into the test data.
UTF16.append(tp.dataToBreak, c);
tp.dataToBreak.appendCodePoint(c);
for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
tp.srcLine[i] = lineNum;
tp.srcCol[i] = column;
@ -269,28 +302,28 @@ public void TestExtended() {
break;
}
if (c == CH_LT) {
if (c == '<') {
tagValue = 0;
parseState = PARSE_NUM;
break;
}
if (c == CH_HASH && column==3) { // TODO: why is column off so far?
if (c == '#' && column==3) { // TODO: why is column off so far?
parseState = PARSE_COMMENT;
savedState = PARSE_DATA;
break;
}
if (c == CH_BACKSLASH) {
if (c == '\\') {
// Check for \ at end of line, a line continuation.
// Advance over (discard) the newline
int cp = UTF16.charAt(testString, charIdx);
if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
int cp = testString.codePointAt(charIdx);
if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
// We have a CR LF
// Need an extra increment of the input ptr to move over both of them
charIdx++;
}
if (cp == CH_LF || cp == CH_CR) {
if (cp == '\n' || cp == '\r') {
lineNum++;
column = 0;
charIdx++;
@ -306,7 +339,7 @@ public void TestExtended() {
// Escape sequence was recognized. Insert the char
// into the test data.
charIdx = charIdxAr[0];
UTF16.append(tp.dataToBreak, cp);
tp.dataToBreak.appendCodePoint(cp);
for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
tp.srcLine[i] = lineNum;
tp.srcCol[i] = column;
@ -319,12 +352,12 @@ public void TestExtended() {
// Not a recognized backslash escape sequence.
// Take the next char as a literal.
// TODO: Should this be an error?
c = UTF16.charAt(testString,charIdx);
charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
c = testString.codePointAt(charIdx);
charIdx = testString.offsetByCodePoints(charIdx, 1);
}
// Normal, non-escaped data char.
UTF16.append(tp.dataToBreak, c);
tp.dataToBreak.appendCodePoint(c);
// Save the mapping from offset in the data to line/column numbers in
// the original input file. Will be used for better error messages only.
@ -344,7 +377,7 @@ public void TestExtended() {
break;
}
if (c == CH_GT) {
if (c == '>') {
// Finished the number. Add the info to the expected break data,
// and switch parse state back to doing plain data.
parseState = PARSE_DATA;
@ -363,15 +396,19 @@ public void TestExtended() {
break;
}
errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
return;
// parseState = PARSE_COMMENT; // TODO: unreachable. Don't stop on errors.
// break;
}
}
// Reached end of test file. Raise an error if parseState indicates that we are
// within a block that should have been terminated.
if (parseState == PARSE_RULES) {
errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
lineNum, rulesFirstLine));
}
if (parseState == PARSE_DATA) {
errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
}
}

View File

@ -9,7 +9,10 @@
package com.ibm.icu.dev.test.rbbi;
// Monkey testing of RuleBasedBreakIterator
// Monkey testing of RuleBasedBreakIterator.
// The old, original monkey test. TODO: remove
// The new monkey test is class RBBIMonkeyTest.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

View File

@ -0,0 +1,69 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: grapheme.txt
#
# Reference Grapheme Break rules for intltest rbbi/RBBIMonkeyTest
#
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
type = grapheme; # one of grapheme | word | line | sentence
locale = en;
CR = [\p{Grapheme_Cluster_Break = CR}];
LF = [\p{Grapheme_Cluster_Break = LF}];
Control = [[\p{Grapheme_Cluster_Break = Control}]];
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
#
# Korean Syllable Definitions
#
L = [\p{Grapheme_Cluster_Break = L}];
V = [\p{Grapheme_Cluster_Break = V}];
T = [\p{Grapheme_Cluster_Break = T}];
LV = [\p{Grapheme_Cluster_Break = LV}];
LVT = [\p{Grapheme_Cluster_Break = LVT}];
# Emoji definitions
EmojiNRK = [[\p{Emoji}] - [Regional_Indicator\u002a\u00230-9©®™〰〽]];
E_Base = [\p{Grapheme_Cluster_Break = EB}];
E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
GB3: CR LF;
GB4: (Control | CR | LF) ÷;
GB5: . ÷ (Control | CR | LF);
GB6: L (L | V | LV | LVT);
GB7: (LV | V) (V | T);
GB8: (LVT | T) T;
GB10: (E_Base | E_Base_GAZ) Extend* E_Modifier;
GB11: (Extended_Pict | EmojiNRK) Extend* ZWJ (Extended_Pict | EmojiNRK);
GB9: . (Extend | ZWJ);
GB9a: . SpacingMark;
GB9b: Prepend .;
# Regional Indicators, split into pairs.
# Note that a pair of RIs that is not followed by a third RI will fall into
# the normal rules for Extend, etc.
#
GB12: Regional_Indicator Regional_Indicator ÷ Regional_Indicator;
GB13: Regional_Indicator Regional_Indicator;
GB999: . ÷;

View File

@ -0,0 +1,200 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
type = line;
locale = en;
AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BA = [:LineBreak = Break_After:];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
EM = [:LineBreak = EM:];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [:LineBreak = Ideographic:];
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NS = [[:LineBreak = Nonstarter:] CJ]; # CSS Strict tailoring: CJ resolves to NS.
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
PO = [:LineBreak = Postfix_Numeric:];
PR = [:LineBreak = Prefix_Numeric:];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
LB23a.1: PR CM* (ID | EB | EM);
LB23a.2: (ID | EB | EM) CM* PO;
LB24.2: (PR | PO) CM* (AL | HL);
LB24.3: (AL | HL | CM) CM* (PR | PO);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;

View File

@ -0,0 +1,208 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_loose.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# This tailors the line break behavior to correspond to CSS
# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
# Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN
type = line;
locale = en@lb=loose;
AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BA = [:LineBreak = Break_After:];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
EM = [:LineBreak = EM:];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [[:LineBreak = Ideographic:] CJ]; # CSS Normal tailoring: CJ resolves to ID
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
NS = [[:LineBreak = Nonstarter:] - NSX];
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
PO = [:LineBreak = Postfix_Numeric:];
PR = [:LineBreak = Prefix_Numeric:];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
# LB13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
LB22.5: NU CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
LB23a.1: PR CM* (ID | EB | EM);
LB23a.2: (ID | EB | EM) CM* PO;
LB24.2: (PR | PO) CM* (AL | HL);
LB24.3: (AL | HL | CM) CM* (PR | PO);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a Keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;

View File

@ -0,0 +1,229 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_loose_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# This tailors the line break behavior to correspond to CSS
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
# FF65 (all NS) and FF01, FF1F (both EX).
# * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
type = line;
locale = ja@lb=loose;
AI = [:LineBreak = Ambiguous:];
AL = [[:LineBreak = Alphabetic:]];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
EM = [:LineBreak = EM:];
EXX = [\uFF01 \uFF1F];
EX = [[:LineBreak = Exclamation:] - EXX];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [[:LineBreak = Ideographic:] CJ]; # CSS Loose tailoring: CJ resolves to ID
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
NS = [[:LineBreak = Nonstarter:] - NSX];
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
PO = [[:LineBreak = Postfix_Numeric:] - POX];
PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
PR = [[:LineBreak = Prefix_Numeric:] - PRX];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1.
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA BAX HY] CM* GL;
# LB13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
LB22.5: NU CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
LB23a.1: PR CM* (ID | EB | EM);
LB23a.2: (ID | EB | EM) CM* PO;
LB24.2: (PR | PO | POX) CM* (AL | HL);
LB24.3: (AL | HL | CM) CM* (PR | PO | POX);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
# Loose_cj tailoring: do not include $PRX at the beginning or $POX at the end.
LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PRX | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a Keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;

View File

@ -0,0 +1,214 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_normal.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
#
# This tailors the line break behavior to correspond to CSS
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
# Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
type = line;
locale = en@lb=normal;
AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BA = [:LineBreak = Break_After:];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
EM = [:LineBreak = EM:];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [[:LineBreak = Ideographic:] CJ]; # CSS Normal tailoring: CJ resolves to ID
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NS = [:LineBreak = Nonstarter:];
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
PO = [:LineBreak = Postfix_Numeric:];
PR = [:LineBreak = Prefix_Numeric:];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1.
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;
# LB13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
LB21a: HL CM* (HY | BA) CM* [^CM CB];
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
LB23a.1: PR CM* (ID | EB | EM);
LB23a.2: (ID | EB | EM) CM* PO;
LB24.2: (PR | PO) CM* (AL | HL);
LB24.3: (AL | HL | CM) CM* (PR | PO);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a Keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;

View File

@ -0,0 +1,223 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
#
# file: line_normal_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
#
# This tailors the line break behavior to correspond to CSS
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
type = line;
locale = ja@lb=normal;
AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
EM = [:LineBreak = EM:];
EX = [:LineBreak = Exclamation:];
GL = [:LineBreak = Glue:];
HL = [:LineBreak = Hebrew_Letter:];
HY = [:LineBreak = Hyphen:];
H2 = [:LineBreak = H2:];
H3 = [:LineBreak = H3:];
ID = [[:LineBreak = Ideographic:] CJ]; # CSS Normal tailoring: CJ resolves to ID
IN = [:LineBreak = Inseperable:];
IS = [:LineBreak = Infix_Numeric:];
JL = [:LineBreak = JL:];
JV = [:LineBreak = JV:];
JT = [:LineBreak = JT:];
LF = [:LineBreak = Line_Feed:];
NL = [:LineBreak = Next_Line:];
NSX = [\u301C \u30A0];
NS = [[:LineBreak = Nonstarter:] - NSX];
NU = [:LineBreak = Numeric:];
OP = [:LineBreak = Open_Punctuation:];
PO = [:LineBreak = Postfix_Numeric:];
PR = [:LineBreak = Prefix_Numeric:];
QU = [:LineBreak = Quotation:];
RI = [:LineBreak = Regional_Indicator:];
SA = [:LineBreak = Complex_Context:];
SG = [:LineBreak = Surrogate:];
SP = [:LineBreak = Space:];
SY = [:LineBreak = Break_Symbols:];
WJ = [:LineBreak = Word_Joiner:];
XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
LB4: BK ÷;
LB5: CR LF;
LB5.1: CR ÷;
LB5.2: LF ÷;
LB5.3: NL ÷;
LB6: . (BK | CR | LF | NL);
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
# Rules LB14 - LB17.
# Moved before LB7, because they can match a longer sequence that would also match LB7,
# for example, the sequence "OP CM SP AL" matches LB14, while only the prefix of it,
# "OP CM SP", matches LB7.1.
LB14: OP CM* SP* .;
LB15: QU CM* SP* OP;
# Do not break between closing punctuation and $NS, even with intervening spaces
# But DO allow a break between closing punctuation and $NSX, don't include it here
LB16: (CL | CP)CM* SP* NS;
LB17: B2 CM* SP* B2;
LB7.1: [^ZW SP] CM* [SP ZW];
LB7.2: [ZW SP] [SP ZW];
# LB8, ICU differs from UAX-14,
# ICU: ZW ÷;
# UAX 14: ZW SP* ÷;
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# LB9: X CM -> X
# LB10: Unattached CM -> AL
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
LB12: GL CM* [^CM];
LB12a: [^SP BA BAX HY] CM* GL;
# LB13 ICU Tailoring, matches tailoring example 8 from UAX 14.
#
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
# LB13.2 SP CM* [CL CP EX IS SY]
LB13.1: [^NU SP] CM* [CL CP IS SY];
LB13.2: [^SP] CM* EX;
LB13.2: SP [CL CP EX IS SY];
# LB 14-17 are moved above LB 7.
LB18: SP ÷;
LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
# should "HL BAX" not break when followed by a CB? That's what the current
# rules do, which is why "[^CM CB]?" includes the ?.
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21b: SY CM* HL;
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
LB22.2: EX CM* IN;
LB22.3: (ID | EB | EM) CM* IN;
LB22.4: IN CM* IN;
LB22.5: NU CM* IN;
LB23.1: (AL | HL | CM) CM* NU;
LB23.2: NU CM* (AL | HL);
LB23a.1: PR CM* (ID | EB | EM);
LB23a.2: (ID | EB | EM) CM* PO;
LB24.2: (PR | PO) CM* (AL | HL);
LB24.3: (AL | HL | CM) CM* (PR | PO);
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
LB26.1: JL CM* (JL | JV | H2 | H3);
LB26.2: (JV | H2) CM* (JV | JT);
LB26.3: (JT | H3) CM* JT;
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
# LB28 Do not break between Alphabetics.
# Unattached (leading) CM treated as AL.
LB28: (AL | HL | CM)CM* (AL | HL);
LB29: IS CM* (AL | HL);
# LB30 is adjusted for unattached leading CM being treated as AL.
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB30a Keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.2: . CM* ÷;

View File

@ -0,0 +1,10 @@
file: main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/readme.txt
Copyright (C) 2016 and later: Unicode, Inc. and others.
License & terms of use: http://www.unicode.org/copyright.html#License
Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved.
This directory contains the break iterator reference rule files used by the test RBBIMonkeyTest.
The rule files are copied from ICU4C, from source/test/testdata/break_rules/*
See the readme.txt located there for additional information.

View File

@ -0,0 +1,50 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: sentence.txt
type = sentence; # one of grapheme | word | line | sentence
locale = en;
CR = [\p{Sentence_Break = CR}];
LF = [\p{Sentence_Break = LF}];
Extend = [\p{Sentence_Break = Extend}];
Sep = [\p{Sentence_Break = Sep}];
Format = [\p{Sentence_Break = Format}];
Sp = [\p{Sentence_Break = Sp}];
Lower = [\p{Sentence_Break = Lower}];
Upper = [\p{Sentence_Break = Upper}];
OLetter = [\p{Sentence_Break = OLetter}];
Numeric = [\p{Sentence_Break = Numeric}];
ATerm = [\p{Sentence_Break = ATerm}];
SContinue = [\p{Sentence_Break = SContinue}];
STerm = [\p{Sentence_Break = STerm}];
Close = [\p{Sentence_Break = Close}];
ParaSep = [Sep CR LF];
SATerm = [STerm ATerm];
ExtFmt = [Extend Format];
# SB2: ÷ eot
# Conventional regular expression matching for '$' as end-of-text also matches
# at a line separator just preceding the physical end of text.
# Instead, use a look-ahead assertion that there is no following character.
SB2: . ÷ (?!.);
SB3: CR LF;
SB4: ParaSep ÷;
# SB5: ignore Format and Extend characters.
SB6: ATerm ExtFmt* Numeric;
SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);
SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
# Also covers SB10, SB11.
SB12: . ExtFmt* [^ExtFmt]?;

View File

@ -0,0 +1,97 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: word.txt
#
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
type = word; # one of grapheme | word | line | sentence
locale = en;
CR = [\p{Word_Break = CR}];
LF = [\p{Word_Break = LF}];
Newline = [\p{Word_Break = Newline}];
Extend = [\p{Word_Break = Extend}];
ZWJ = [\p{Word_Break = ZWJ}];
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
Format = [\p{Word_Break = Format}];
Katakana = [\p{Word_Break = Katakana}];
Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
ALetter = [\p{Word_Break = ALetter}];
Single_Quote = [\p{Word_Break = Single_Quote}];
Double_Quote = [\p{Word_Break = Double_Quote}];
MidNumLet = [\p{Word_Break = MidNumLet}];
MidLetter = [\p{Word_Break = MidLetter}];
MidNum = [\p{Word_Break = MidNum}];
Numeric = [\p{Word_Break = Numeric}];
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
E_Base = [\p{Word_Break = EB}];
E_Modifier = [\p{Word_Break = EM}];
EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
EBG = [\p{Word_Break = EBG}];
# Define dictionary, with the effect being that those characters don't appear in test data.
Han = [:Han:];
Hiragana = [:Hiragana:];
Control = [\p{Grapheme_Cluster_Break = Control}];
HangulSyllable = [\uac00-\ud7a3];
ComplexContext = [:LineBreak = Complex_Context:];
KanaKanji = [Han Hiragana Katakana];
dictionaryCJK = [KanaKanji HangulSyllable];
dictionary = [ComplexContext dictionaryCJK];
# leave dictionary scripts out of ALetter
ALetter = [ALetter - dictionary];
AHLetter = [ALetter Hebrew_Letter];
MidNumLetQ = [MidNumLet Single_Quote];
ExtFmt = [Extend Format ZWJ];
WB3: CR LF;
WB3a: (Newline | CR | LF) ÷;
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
# (but needed with UAX treat-as scheme.)
WB3c: ZWJ (Extended_Pict | EmojiNRK);
WB5: AHLetter ExtFmt* AHLetter;
# includes both WB6 and WB7
WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
WB7a: Hebrew_Letter ExtFmt* Single_Quote;
WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
WB8: Numeric ExtFmt* Numeric;
WB9: AHLetter ExtFmt* Numeric;
WB10: Numeric ExtFmt* AHLetter;
WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
WB13: Katakana ExtFmt* Katakana;
WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
# Interacts with WB3c.
WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
WB14: (E_Base | EBG) ExtFmt* E_Modifier;
# Rule WB 999 Any ÷ Any
# Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EmojiNRK).
WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
WB999.2: . ExtFmt* ÷;

View File

@ -0,0 +1,96 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: word_POSIX.txt
#
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
type = word; # one of grapheme | word | line | sentence
locale = en_US_POSIX;
CR = [\p{Word_Break = CR}];
LF = [\p{Word_Break = LF}];
Newline = [\p{Word_Break = Newline}];
Extend = [\p{Word_Break = Extend}];
ZWJ = [\p{Word_Break = ZWJ}];
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
Format = [\p{Word_Break = Format}];
Katakana = [\p{Word_Break = Katakana}];
Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
ALetter = [\p{Word_Break = ALetter}];
Single_Quote = [\p{Word_Break = Single_Quote}];
Double_Quote = [\p{Word_Break = Double_Quote}];
MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
MidLetter = [\p{Word_Break = MidLetter} - [\:]];
MidNum = [\p{Word_Break = MidNum} [.]];
Numeric = [\p{Word_Break = Numeric}];
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
E_Base = [\p{Word_Break = EB}];
E_Modifier = [\p{Word_Break = EM}];
EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
EBG = [\p{Word_Break = EBG}];
# Define dictionary, with the effect being that those characters don't appear in test data.
Han = [:Han:];
Hiragana = [:Hiragana:];
Control = [\p{Grapheme_Cluster_Break = Control}];
HangulSyllable = [\uac00-\ud7a3];
ComplexContext = [:LineBreak = Complex_Context:];
KanaKanji = [Han Hiragana Katakana];
dictionaryCJK = [KanaKanji HangulSyllable];
dictionary = [ComplexContext dictionaryCJK];
# leave dictionary scripts out of ALetter
ALetter = [ALetter - dictionary];
AHLetter = [ALetter Hebrew_Letter];
MidNumLetQ = [MidNumLet Single_Quote];
ExtFmt = [Extend Format ZWJ];
WB3: CR LF;
WB3a: (Newline | CR | LF) ÷;
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
# (but needed with UAX treat-as scheme.)
WB3c: ZWJ (Extended_Pict | EmojiNRK);
WB5: AHLetter ExtFmt* AHLetter;
# includes both WB6 and WB7
WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
WB7a: Hebrew_Letter ExtFmt* Single_Quote;
WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
WB8: Numeric ExtFmt* Numeric;
WB9: AHLetter ExtFmt* Numeric;
WB10: Numeric ExtFmt* AHLetter;
WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
WB13: Katakana ExtFmt* Katakana;
WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
# Interacts with WB3c.
WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
WB14: (E_Base | EBG) ExtFmt* E_Modifier;
# Rule WB 999 Any ÷ Any
# Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EmojiNRK).
WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
WB999.2: . ExtFmt* ÷;

View File

@ -14,7 +14,9 @@
# <sent> any following data is for sentence break testing
# <line> any following data is for line break testing
# <char> any following data is for char break testing
# <locale local_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
# <rules> rules ... </rules> following data is tested against these rules.
# Applies until a following occurrence of <word>, <sent>, etc. or another <rules>
# <locale locale_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
# <data> ... </data> test data. May span multiple lines.
# <> Break position, status == 0
# • Break position, status == 0 (Bullet, \u2022)
@ -37,8 +39,17 @@
# Temp debugging tests
<locale en>
<word>
<data><0>ク<400>ライアン<400>ト<400>サーバー<400></data>
# <data><0>ク<400>ライアン<400>トサーバー<400></data>
<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
。<0></data>
#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
## FILTERED BREAK TESTS
@ -1308,3 +1319,48 @@ Bangkok)•</data>
<data>•\U0001F468\u200D\u2695\uFE0F•\U0001F468\u200D\u2695•\U0001F468\U0001F3FD\u200D\u2695\uFE0F•\U0001F468\U0001F3FD\u200D\u2695\u0020•</data>
# woman astronaut, woman astronaut / fitz4
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
####################################################################################
#
# Test rule status values
#
####################################################################################
<rules> $Letters = [:L:];
$Numbers = [:N:];
$Letters+{1};
$Numbers+{2};
Help\ me\!{4};
[^$Letters $Numbers];
!.*;
</rules>
<data>•abc<1>123<2>.•.•abc<1> •Help<1> •me<1> •Help me!<4></data>
# Test option to prohibit unquoted literals.
<rules>
!!forward;
Hello\ World;
!!reverse;
.*;
</rules>
<data>•Hello World•</data>
<badrules>
!!quoted_literals_only;
!!forward;
Hello\ World;
!!reverse;
.*;
</badrules>
<rules>
#TODO: uncomment this line when quoted_literals_only is implemented.
#!!quoted_literals_only;
!!forward;
'Hello World';
!!reverse;
.*;
</rules>
<data>•Hello World•</data>

View File

@ -10,13 +10,11 @@
package com.ibm.icu.dev.test.shaping;
import java.lang.reflect.Method;
import java.util.MissingResourceException;
import org.junit.Test;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.ArabicShaping;
import com.ibm.icu.text.ArabicShapingException;
/**
* Regression test for Arabic shaping.
@ -48,509 +46,6 @@ public class ArabicShapingRegTest extends TestFmwk {
public static final int DIGIT_TYPE_AN = 0;
public static final int DIGIT_TYPE_AN_EXTENDED = 0x100;
/**
 * One table-driven case for the Arabic shaping tests.
 *
 * <p>Instances are built only through the factories {@code standard},
 * {@code preflight} and {@code error}; each factory fills in the fields that
 * matter for its kind and leaves the others at {@code null} or {@code 0}.
 */
public static class TestData {
    /** Kind of case: {@code STANDARD}, {@code PREFLIGHT} or {@code ERROR}. */
    public int type;
    /** Input text for the case; may be {@code null}. */
    public String source;
    /** Option flags for the case. */
    public int flags;
    /** Expected output string (set by {@code standard} only). */
    public String result;
    /** Expected length (set by {@code preflight} only). */
    public int length;
    /** Expected exception class (set by {@code error} only). */
    public Class<?> error;

    public static final int STANDARD = 0;
    public static final int PREFLIGHT = 1;
    public static final int ERROR = 2;

    /** Display names for {@link #type}, indexed by the constants above. */
    private static final String[] typenames = { "standard", "preflight", "error" };

    /** Creates a case that carries an expected result string. */
    public static TestData standard(String source, int flags, String result) {
        return new TestData(STANDARD, source, flags, result, 0, null);
    }

    /** Creates a case that carries an expected length. */
    public static TestData preflight(String source, int flags, int length) {
        return new TestData(PREFLIGHT, source, flags, null, length, null);
    }

    /** Creates a case that carries an expected exception class. */
    public static TestData error(String source, int flags, Class<?> error) {
        return new TestData(ERROR, source, flags, null, 0, error);
    }

    private TestData(int type, String source, int flags, String result, int length, Class<?> error) {
        this.type = type;
        this.source = source;
        this.flags = flags;
        this.result = result;
        this.length = length;
        this.error = error;
    }

    /**
     * Appends {@code text} escaped and wrapped in double quotes, or the
     * literal word "null" when {@code text} is {@code null}.
     */
    private static void appendQuoted(StringBuilder buf, String text) {
        if (text == null) {
            buf.append("null");
        } else {
            buf.append('"').append(escapedString(text)).append('"');
        }
    }

    /**
     * Returns a multi-line dump of every field: type name, escaped source,
     * flags in hex, escaped result, length and error class, each separated
     * by a comma and a newline.
     */
    @Override
    public String toString() {
        // A local buffer needs no synchronization, so StringBuilder suffices.
        StringBuilder buf = new StringBuilder(super.toString());
        buf.append("[\n").append(typenames[type]).append(",\n");
        appendQuoted(buf, source);
        buf.append(",\n").append(Integer.toHexString(flags)).append(",\n");
        appendQuoted(buf, result);
        buf.append(",\n").append(length).append(",\n").append(error).append(']');
        return buf.toString();
    }
}
// Source text for the "lam alef special visual ltr" and "TASHKEEL" groups of standardTests.
private static final String lamAlefSpecialVLTR =
"\u0020\u0646\u0622\u0644\u0627\u0020" +
"\u0646\u0623\u064E\u0644\u0627\u0020" +
"\u0646\u0627\u0670\u0644\u0627\u0020" +
"\u0646\u0622\u0653\u0644\u0627\u0020" +
"\u0646\u0625\u0655\u0644\u0627\u0020" +
"\u0646\u0622\u0654\u0644\u0627\u0020" +
"\uFEFC\u0639";
// Source text for the "tashkeel special visual ltr" group of standardTests.
private static final String tashkeelSpecialVLTR =
"\u064A\u0628\u0631\u0639\u0020" +
"\u064A\u0628\u0651\u0631\u064E\u0639\u0020" +
"\u064C\u064A\u0628\u0631\u064F\u0639\u0020" +
"\u0628\u0670\u0631\u0670\u0639\u0020" +
"\u0628\u0653\u0631\u0653\u0639\u0020" +
"\u0628\u0654\u0631\u0654\u0639\u0020" +
"\u0628\u0655\u0631\u0655\u0639\u0020";
// Five-character source used by the standardTests entries that pass TEXT_DIRECTION_VISUAL_RTL.
private static final String tashkeelShaddaRTL=
"\u0634\u0651\u0645\u0652\u0633";
// The same five code units as tashkeelShaddaRTL in reverse order; used with TEXT_DIRECTION_VISUAL_LTR.
private static final String tashkeelShaddaLTR=
"\u0633\u0652\u0645\u0651\u0634";
// The ArMathSym* strings below are built from surrogate pairs (high surrogate D83B), i.e.
// supplementary code points in the U+1EE00..U+1EEFF range, separated by spaces.
// NOTE(review): presumably the Arabic Mathematical Alphabetic Symbols block - confirm.
// In standardTests the expected output for each of them (except ArMathSymStretched)
// is the same string as the input.
private static final String ArMathSym =
"\uD83B\uDE00\uD83B\uDE01\uD83B\uDE02\uD83B\uDE03\u0020" +
"\uD83B\uDE24\uD83B\uDE05\uD83B\uDE06\u0020" +
"\uD83B\uDE07\uD83B\uDE08\uD83B\uDE09\u0020" +
"\uD83B\uDE0A\uD83B\uDE0B\uD83B\uDE0C\uD83B\uDE0D\u0020" +
"\uD83B\uDE0E\uD83B\uDE0F\uD83B\uDE10\uD83B\uDE11\u0020" +
"\uD83B\uDE12\uD83B\uDE13\uD83B\uDE14\uD83B\uDE15\u0020" +
"\uD83B\uDE16\uD83B\uDE17\uD83B\uDE18\u0020" +
"\uD83B\uDE19\uD83B\uDE1A\uD83B\uDE1B";
private static final String ArMathSymLooped =
"\uD83B\uDE80\uD83B\uDE81\uD83B\uDE82\uD83B\uDE83\u0020" +
"\uD83B\uDE84\uD83B\uDE85\uD83B\uDE86\u0020" +
"\uD83B\uDE87\uD83B\uDE88\uD83B\uDE89\u0020" +
"\uD83B\uDE8B\uD83B\uDE8C\uD83B\uDE8D\u0020" +
"\uD83B\uDE8E\uD83B\uDE8F\uD83B\uDE90\uD83B\uDE91\u0020" +
"\uD83B\uDE92\uD83B\uDE93\uD83B\uDE94\uD83B\uDE95\u0020" +
"\uD83B\uDE96\uD83B\uDE97\uD83B\uDE98\u0020" +
"\uD83B\uDE99\uD83B\uDE9A\uD83B\uDE9B";
private static final String ArMathSymDoubleStruck =
"\uD83B\uDEA1\uD83B\uDEA2\uD83B\uDEA3\u0020" +
"\uD83B\uDEA5\uD83B\uDEA6\u0020" +
"\uD83B\uDEA7\uD83B\uDEA8\uD83B\uDEA9\u0020" +
"\uD83B\uDEAB\uD83B\uDEAC\uD83B\uDEAD\u0020" +
"\uD83B\uDEAE\uD83B\uDEAF\uD83B\uDEB0\uD83B\uDEB1\u0020" +
"\uD83B\uDEB2\uD83B\uDEB3\uD83B\uDEB4\uD83B\uDEB5\u0020" +
"\uD83B\uDEB6\uD83B\uDEB7\uD83B\uDEB8\u0020" +
"\uD83B\uDEB9\uD83B\uDEBA\uD83B\uDEBB";
private static final String ArMathSymInitial =
"\uD83B\uDE21\uD83B\uDE22\u0020" +
"\uD83B\uDE27\uD83B\uDE29\u0020" +
"\uD83B\uDE2A\uD83B\uDE2B\uD83B\uDE2C\uD83B\uDE2D\u0020" +
"\uD83B\uDE2E\uD83B\uDE2F\uD83B\uDE30\uD83B\uDE31\u0020" +
"\uD83B\uDE32\uD83B\uDE34\uD83B\uDE35\u0020" +
"\uD83B\uDE36\uD83B\uDE37\u0020" +
"\uD83B\uDE39\uD83B\uDE3B";
private static final String ArMathSymTailed =
"\uD83B\uDE42\uD83B\uDE47\uD83B\uDE49\uD83B\uDE4B\u0020" +
"\uD83B\uDE4D\uD83B\uDE4E\uD83B\uDE4F\u0020" +
"\uD83B\uDE51\uD83B\uDE52\uD83B\uDE54\uD83B\uDE57\u0020" +
"\uD83B\uDE59\uD83B\uDE5B\uD83B\uDE5D\uD83B\uDE5F";
// Mixes two surrogate pairs with two BMP Arabic letters; standardTests expects only
// the BMP letters to change in the output.
private static final String ArMathSymStretched =
"\uD83B\uDE21\u0633\uD83B\uDE62\u0647";
// Source text for the "logical unshape" group of standardTests (used with LETTERS_UNSHAPE);
// begins with three spaces and ends with four.
private static final String logicalUnshape =
"\u0020\u0020\u0020\uFE8D\uFEF5\u0020\uFEE5\u0020\uFE8D\uFEF7\u0020" +
"\uFED7\uFEFC\u0020\uFEE1\u0020\uFE8D\uFEDF\uFECC\uFEAE\uFE91\uFEF4" +
"\uFE94\u0020\uFE8D\uFEDF\uFEA4\uFEAE\uFE93\u0020\u0020\u0020\u0020";
// Source text for the "numbers" and "no-op" groups of standardTests: a mix of
// digits and letters, labelled per code unit below.
private static final String numSource =
"\u0031" + /* en:1 */
"\u0627" + /* arabic:alef */
"\u0032" + /* en:2 */
"\u06f3" + /* an:3 */
"\u0061" + /* latin:a */
"\u0034"; /* en:4 */
// Table consumed by TestStandard. Each entry is TestData.standard(source, flags, expected):
// the test builds an ArabicShaping from the flags, shapes the source, and reports a
// failure when the output differs from the expected string.
private static final TestData[] standardTests = {
/* lam alef special visual ltr */
TestData.standard(lamAlefSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
"\u0020\ufee5\u0020\ufef5\ufe8d\u0020" +
"\ufee5\u0020\ufe76\ufef7\ufe8d\u0020" +
"\ufee5\u0020\u0670\ufefb\ufe8d\u0020" +
"\ufee5\u0020\u0653\ufef5\ufe8d\u0020" +
"\ufee5\u0020\u0655\ufef9\ufe8d\u0020" +
"\ufee5\u0020\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"),
TestData.standard(lamAlefSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_END,
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb\u0020\u0020\u0020\u0020" +
"\u0020\u0020"),
TestData.standard(lamAlefSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_BEGINNING,
"\u0020\u0020\u0020\u0020\u0020\u0020" +
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"),
TestData.standard(lamAlefSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_GROW_SHRINK,
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"),
/* TASHKEEL */
TestData.standard(lamAlefSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
"\u0020\ufee5\u0020\ufef5\ufe8d\u0020" +
"\ufee5\u0020\ufe76\ufef7\ufe8d\u0020" +
"\ufee5\u0020\u0670\ufefb\ufe8d\u0020" +
"\ufee5\u0020\u0653\ufef5\ufe8d\u0020" +
"\ufee5\u0020\u0655\ufef9\ufe8d\u0020" +
"\ufee5\u0020\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"),
TestData.standard(lamAlefSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_END,
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb\u0020\u0020\u0020\u0020" +
"\u0020\u0020"),
TestData.standard(lamAlefSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_BEGINNING,
"\u0020\u0020\u0020\u0020\u0020\u0020" +
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"),
TestData.standard(lamAlefSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_GROW_SHRINK,
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"),
/* tashkeel special visual ltr */
TestData.standard(tashkeelSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
"\ufef2\ufe91\ufeae\ufecb\u0020" +
"\ufef2\ufe91\ufe7c\ufeae\ufe77\ufecb\u0020" +
"\ufe72\ufef2\ufe91\ufeae\ufe79\ufecb\u0020" +
"\ufe8f\u0670\ufeae\u0670\ufecb\u0020" +
"\ufe8f\u0653\ufeae\u0653\ufecb\u0020" +
"\ufe8f\u0654\ufeae\u0654\ufecb\u0020" +
"\ufe8f\u0655\ufeae\u0655\ufecb\u0020"),
TestData.standard(tashkeelSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
"\ufef2\ufe91\ufeae\ufecb\u0020" +
"\ufef2\ufe91\ufe7c\ufeae\ufe76\ufecb\u0020" +
"\ufe72\ufef2\ufe91\ufeae\ufe78\ufecb\u0020" +
"\ufe8f\u0670\ufeae\u0670\ufecb\u0020" +
"\ufe8f\u0653\ufeae\u0653\ufecb\u0020" +
"\ufe8f\u0654\ufeae\u0654\ufecb\u0020" +
"\ufe8f\u0655\ufeae\u0655\ufecb\u0020"),
/* tashkeel shadda: TASHKEEL_BEGIN / END / RESIZE / REPLACE_BY_TATWEEL, visual rtl then visual ltr */
TestData.standard(tashkeelShaddaRTL,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_BEGIN |ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
"\u0020\ufeb7\ufe7d\ufee4\ufeb2"),
TestData.standard(tashkeelShaddaRTL,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_END|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
"\ufeb7\ufe7d\ufee4\ufeb2\u0020"),
TestData.standard(tashkeelShaddaRTL,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
"\ufeb7\ufe7d\ufee4\ufeb2"),
TestData.standard(tashkeelShaddaRTL,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_REPLACE_BY_TATWEEL|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
"\ufeb7\ufe7d\ufee4\u0640\ufeb2"),
TestData.standard(tashkeelShaddaLTR,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_BEGIN |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
"\u0020\ufeb2\ufee4\ufe7d\ufeb7"),
TestData.standard(tashkeelShaddaLTR,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_END |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
"\ufeb2\ufee4\ufe7d\ufeb7\u0020"),
TestData.standard(tashkeelShaddaLTR,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
"\ufeb2\ufee4\ufe7d\ufeb7"),
TestData.standard(tashkeelShaddaLTR,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_REPLACE_BY_TATWEEL |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
"\ufeb2\u0640\ufee4\ufe7d\ufeb7"),
/* ArMathSym* inputs: the expected string repeats the input, except for ArMathSymStretched */
TestData.standard(ArMathSym,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_BEGIN |ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
"\uD83B\uDE00\uD83B\uDE01\uD83B\uDE02\uD83B\uDE03\u0020" +
"\uD83B\uDE24\uD83B\uDE05\uD83B\uDE06\u0020" +
"\uD83B\uDE07\uD83B\uDE08\uD83B\uDE09\u0020" +
"\uD83B\uDE0A\uD83B\uDE0B\uD83B\uDE0C\uD83B\uDE0D\u0020" +
"\uD83B\uDE0E\uD83B\uDE0F\uD83B\uDE10\uD83B\uDE11\u0020" +
"\uD83B\uDE12\uD83B\uDE13\uD83B\uDE14\uD83B\uDE15\u0020" +
"\uD83B\uDE16\uD83B\uDE17\uD83B\uDE18\u0020" +
"\uD83B\uDE19\uD83B\uDE1A\uD83B\uDE1B"),
TestData.standard(ArMathSymLooped,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_END|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
"\uD83B\uDE80\uD83B\uDE81\uD83B\uDE82\uD83B\uDE83\u0020" +
"\uD83B\uDE84\uD83B\uDE85\uD83B\uDE86\u0020" +
"\uD83B\uDE87\uD83B\uDE88\uD83B\uDE89\u0020" +
"\uD83B\uDE8B\uD83B\uDE8C\uD83B\uDE8D\u0020" +
"\uD83B\uDE8E\uD83B\uDE8F\uD83B\uDE90\uD83B\uDE91\u0020" +
"\uD83B\uDE92\uD83B\uDE93\uD83B\uDE94\uD83B\uDE95\u0020" +
"\uD83B\uDE96\uD83B\uDE97\uD83B\uDE98\u0020" +
"\uD83B\uDE99\uD83B\uDE9A\uD83B\uDE9B"),
TestData.standard(ArMathSymDoubleStruck,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
"\uD83B\uDEA1\uD83B\uDEA2\uD83B\uDEA3\u0020" +
"\uD83B\uDEA5\uD83B\uDEA6\u0020" +
"\uD83B\uDEA7\uD83B\uDEA8\uD83B\uDEA9\u0020" +
"\uD83B\uDEAB\uD83B\uDEAC\uD83B\uDEAD\u0020" +
"\uD83B\uDEAE\uD83B\uDEAF\uD83B\uDEB0\uD83B\uDEB1\u0020" +
"\uD83B\uDEB2\uD83B\uDEB3\uD83B\uDEB4\uD83B\uDEB5\u0020" +
"\uD83B\uDEB6\uD83B\uDEB7\uD83B\uDEB8\u0020" +
"\uD83B\uDEB9\uD83B\uDEBA\uD83B\uDEBB"),
TestData.standard(ArMathSymInitial,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_BEGIN |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
"\uD83B\uDE21\uD83B\uDE22\u0020" +
"\uD83B\uDE27\uD83B\uDE29\u0020" +
"\uD83B\uDE2A\uD83B\uDE2B\uD83B\uDE2C\uD83B\uDE2D\u0020" +
"\uD83B\uDE2E\uD83B\uDE2F\uD83B\uDE30\uD83B\uDE31\u0020" +
"\uD83B\uDE32\uD83B\uDE34\uD83B\uDE35\u0020" +
"\uD83B\uDE36\uD83B\uDE37\u0020" +
"\uD83B\uDE39\uD83B\uDE3B"),
TestData.standard(ArMathSymTailed,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_END |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
"\uD83B\uDE42\uD83B\uDE47\uD83B\uDE49\uD83B\uDE4B\u0020" +
"\uD83B\uDE4D\uD83B\uDE4E\uD83B\uDE4F\u0020" +
"\uD83B\uDE51\uD83B\uDE52\uD83B\uDE54\uD83B\uDE57\u0020" +
"\uD83B\uDE59\uD83B\uDE5B\uD83B\uDE5D\uD83B\uDE5F"),
TestData.standard(ArMathSymStretched,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
"\uD83B\uDE21\uFEB1\uD83B\uDE62\uFEE9"),
/* logical unshape */
TestData.standard(logicalUnshape,
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_NEAR,
"\u0020\u0020\u0020\u0627\u0644\u0622\u0646\u0020\u0627\u0644\u0623\u0642\u0644\u0627" +
"\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020\u0627\u0644\u062d\u0631" +
"\u0629\u0020\u0020\u0020\u0020"),
TestData.standard(logicalUnshape,
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_AT_END,
"\u0020\u0020\u0020\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623\u0020\u0642" +
"\u0644\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020\u0627" +
"\u0644\u062d\u0631\u0629\u0020"),
TestData.standard(logicalUnshape,
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_AT_BEGINNING,
"\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623\u0020\u0642\u0644\u0627\u0020" +
"\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020\u0627\u0644\u062d\u0631" +
"\u0629\u0020\u0020\u0020\u0020"),
TestData.standard(logicalUnshape,
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_GROW_SHRINK,
"\u0020\u0020\u0020\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623\u0020\u0642" +
"\u0644\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020\u0627" +
"\u0644\u062d\u0631\u0629\u0020\u0020\u0020\u0020"),
/* numbers */
TestData.standard(numSource,
DIGITS_EN2AN | DIGIT_TYPE_AN,
"\u0661\u0627\u0662\u06f3\u0061\u0664"),
TestData.standard(numSource,
DIGITS_AN2EN | DIGIT_TYPE_AN_EXTENDED,
"\u0031\u0627\u0032\u0033\u0061\u0034"),
TestData.standard(numSource,
DIGITS_EN2AN_INIT_LR | DIGIT_TYPE_AN,
"\u0031\u0627\u0662\u06f3\u0061\u0034"),
TestData.standard(numSource,
DIGITS_EN2AN_INIT_AL | DIGIT_TYPE_AN_EXTENDED,
"\u06f1\u0627\u06f2\u06f3\u0061\u0034"),
TestData.standard(numSource,
DIGITS_EN2AN_INIT_LR | DIGIT_TYPE_AN | TEXT_DIRECTION_VISUAL_LTR,
"\u0661\u0627\u0032\u06f3\u0061\u0034"),
TestData.standard(numSource,
DIGITS_EN2AN_INIT_AL | DIGIT_TYPE_AN_EXTENDED | TEXT_DIRECTION_VISUAL_LTR,
"\u06f1\u0627\u0032\u06f3\u0061\u06f4"),
/* no-op */
TestData.standard(numSource,
0,
numSource),
};
// Table consumed by TestPreflight. Each entry is TestData.preflight(source, flags, length):
// the test calls shape(src, 0, src.length, null, 0, 0) with a null destination and
// reports a failure when the returned value differs from the given length.
private static final TestData[] preflightTests = {
/* preflight */
TestData.preflight("\u0644\u0627",
LETTERS_SHAPE | LENGTH_GROW_SHRINK,
1),
TestData.preflight("\u0644\u0627\u0031",
DIGITS_EN2AN | DIGIT_TYPE_AN_EXTENDED | LENGTH_GROW_SHRINK,
3),
TestData.preflight("\u0644\u0644",
LETTERS_SHAPE | LENGTH_GROW_SHRINK,
2),
TestData.preflight("\ufef7",
LETTERS_UNSHAPE | LENGTH_GROW_SHRINK,
2),
};
// Table consumed by TestError. Each entry is TestData.error(source, flags, exception class):
// the test constructs the shaper and calls the in-place shape(src, 0, len), and reports a
// failure unless the exception it caught is an instance of the given class.
private static final TestData[] errorTests = {
/* bad data */
TestData.error("\u0020\ufef7\u0644\u0020",
LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_NEAR,
ArabicShapingException.class),
TestData.error("\u0020\ufef7",
LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_END,
ArabicShapingException.class),
TestData.error("\ufef7\u0020",
LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_BEGINNING,
ArabicShapingException.class),
/* bad options */
TestData.error("\ufef7",
0xffffffff,
IllegalArgumentException.class),
TestData.error("\ufef7",
LETTERS_UNSHAPE | LENGTH_GROW_SHRINK,
ArabicShapingException.class),
// null source: TestError passes a null array with length 0 for this entry
TestData.error(null,
LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_END,
IllegalArgumentException.class),
};
/**
 * Shapes the source of every standardTests entry with that entry's flags and
 * reports a failure whenever the output differs from the expected result.
 * A MissingResourceException is rethrown; an IllegalStateException is logged
 * as a warning and ends the whole test early.
 */
@Test
public void TestStandard() {
    final int caseCount = standardTests.length;
    for (int caseIndex = 0; caseIndex < caseCount; caseIndex++) {
        final TestData testCase = standardTests[caseIndex];
        ArabicShaping shaper = null;
        String shaped = null;
        Exception failure = null;
        try {
            shaper = new ArabicShaping(testCase.flags);
            shaped = shaper.shape(testCase.source);
        } catch (MissingResourceException missing) {
            throw missing;
        } catch (IllegalStateException ise) {
            warnln("IllegalStateException: " + ise.toString());
            return;
        } catch (Exception other) {
            // Remembered so it can be included in the failure report below.
            failure = other;
        }
        if (testCase.result.equals(shaped)) {
            continue;
        }
        reportTestFailure(caseIndex, testCase, shaper, shaped, failure);
    }
}
/**
 * For every preflightTests entry, calls the array form of shape with a null
 * destination and reports a failure when the returned length differs from
 * the expected one. Any exception is captured and passed to the report.
 */
@Test
public void TestPreflight() {
    for (int caseIndex = 0; caseIndex < preflightTests.length; caseIndex++) {
        final TestData testCase = preflightTests[caseIndex];
        final char[] chars = (testCase.source == null) ? null : testCase.source.toCharArray();
        ArabicShaping shaper = null;
        int actualLength = 0;
        Exception failure = null;
        try {
            shaper = new ArabicShaping(testCase.flags);
            actualLength = shaper.shape(chars, 0, chars.length, null, 0, 0);
        } catch (Exception e) {
            failure = e;
        }
        if (actualLength != testCase.length) {
            reportTestFailure(caseIndex, testCase, shaper, testCase.source, failure);
        }
    }
}
/**
 * For every errorTests entry, constructs the shaper and shapes the source in
 * place, then reports a failure unless the exception that was caught is an
 * instance of the entry's expected class. A null source is passed through as
 * a null array with length 0.
 */
@Test
public void TestError() {
    for (int caseIndex = 0; caseIndex < errorTests.length; caseIndex++) {
        final TestData testCase = errorTests[caseIndex];
        final char[] chars = (testCase.source != null) ? testCase.source.toCharArray() : null;
        final int charCount = (chars != null) ? chars.length : 0;
        ArabicShaping shaper = null;
        Exception failure = null;
        try {
            shaper = new ArabicShaping(testCase.flags);
            shaper.shape(chars, 0, charCount);
        } catch (Exception e) {
            failure = e;
        }
        final boolean sawExpectedError = testCase.error.isInstance(failure);
        if (!sawExpectedError) {
            reportTestFailure(caseIndex, testCase, shaper, testCase.source, failure);
        }
    }
}
@Test
public void TestEquals()
@ -572,64 +67,6 @@ public class ArabicShapingRegTest extends TestFmwk {
}
}
// TODO(junit): remove this and convert callers to parameterized tests
/**
 * Builds a multi-line failure report and passes it to err().
 *
 * The report lists the case index, the test data, the shaper, the escaped
 * result and the exception. When both the expected and the actual text are
 * present and differ, a per-position comparison follows, with "***" marking
 * each position that does not match; positions past the end of the shorter
 * string are shown as U+FFFF.
 *
 * If the exception is a MissingResourceException its message is also
 * logged as a warning first.
 */
private void reportTestFailure(int index, TestData test, ArabicShaping shaper, String result, Exception error) {
    // instanceof is false for null, so no separate null check is needed.
    if (error instanceof MissingResourceException) {
        warnln(error.getMessage());
    }

    StringBuilder report = new StringBuilder("*** test failure ***\n");
    report.append("index: ").append(index).append('\n');
    report.append("test: ").append(test).append('\n');
    report.append("shaper: ").append(shaper).append('\n');
    report.append("result: ").append(escapedString(result)).append('\n');
    report.append("error: ").append(error).append('\n');

    final String wanted = test.result;
    final boolean comparable = result != null && wanted != null;
    if (comparable && !wanted.equals(result)) {
        final int rows = Math.max(wanted.length(), result.length());
        for (int pos = 0; pos < rows; pos++) {
            // Single-digit positions are padded to two columns.
            String label = (pos < 10) ? " " + pos : Integer.toString(pos);
            char trg = (pos < wanted.length()) ? wanted.charAt(pos) : '\uffff';
            char res = (pos < result.length()) ? result.charAt(pos) : '\uffff';

            report.append('[').append(label).append("] ");
            report.append(escapedString(String.valueOf(trg))).append(' ');
            report.append(escapedString(String.valueOf(res))).append(' ');
            if (trg != res) {
                report.append("***");
            }
            report.append('\n');
        }
    }
    err(report.toString());
}
/**
 * Renders each UTF-16 code unit of str as a backslash, the letter u and
 * exactly four lowercase hexadecimal digits.
 *
 * @param str text to escape; may be null
 * @return the escaped text, or null when str is null
 */
private static String escapedString(String str) {
    if (str == null) {
        return null;
    }
    final String hexDigits = "0123456789abcdef";
    StringBuilder out = new StringBuilder(str.length() * 6);
    for (char unit : str.toCharArray()) {
        out.append('\\').append('u');
        // Emit the four nibbles from most to least significant.
        for (int shift = 12; shift >= 0; shift -= 4) {
            out.append(hexDigits.charAt((unit >> shift) & 0xf));
        }
    }
    return out.toString();
}
/* Tests the method
* public int shape(char[] source, int sourceStart, int sourceLength,
* char[] dest, int destStart, int destSize) throws ArabicShapingException)
@ -643,8 +80,8 @@ public class ArabicShapingRegTest extends TestFmwk {
char[] source = {'d','u','m','m','y'};
char[] dest = {'d','u','m','m','y'};
int[] negNum = {-1,-2,-5,-10,-100};
for(int i=0; i<negNum.length; i++){
try{
// Checks when "sourceStart < 0"
@ -652,7 +89,7 @@ public class ArabicShapingRegTest extends TestFmwk {
errln("ArabicShaping.shape(char[],int,int,char[],int,int) was " +
"suppose to return an exception when 'sourceStart < 0'.");
} catch(Exception e){}
try{
// Checks when "sourceLength < 0"
as.shape(source, 0, negNum[i], dest, 0, 0);
@ -660,7 +97,7 @@ public class ArabicShapingRegTest extends TestFmwk {
"suppose to return an exception when 'sourceLength < 0'.");
} catch(Exception e){}
}
// Checks when "sourceStart + sourceLength > source.length"
try{
as.shape(source, 3, 3, dest, 0, 0);
@ -682,14 +119,14 @@ public class ArabicShapingRegTest extends TestFmwk {
errln("ArabicShaping.shape(char[],int,int,char[],int,int) was " +
"suppose to return an exception when 'sourceStart + sourceLength > source.length'.");
} catch(Exception e){}
// Checks when "if (dest == null && destSize != 0)" is true
try{
as.shape(source, 2, 2, null, 0, 1);
errln("ArabicShaping.shape(char[],int,int,char[],int,int) was " +
"suppose to return an exception when 'dest == null && destSize != 0'.");
} catch(Exception e){}
// Checks when
// if ((destSize != 0) && (destStart < 0 || destSize < 0 || destStart + destSize > dest.length))
for(int i=0; i<negNum.length; i++){
@ -699,7 +136,7 @@ public class ArabicShapingRegTest extends TestFmwk {
"suppose to return an exception when " +
"(destSize != 0) && (destStart < 0 || destSize < 0 || destStart + destSize > dest.length).");
} catch(Exception e){}
try{
as.shape(source, 2, 2, dest, 0, negNum[i]);
errln("ArabicShaping.shape(char[],int,int,char[],int,int) was " +
@ -707,7 +144,7 @@ public class ArabicShapingRegTest extends TestFmwk {
"(destSize != 0) && (destStart < 0 || destSize < 0 || destStart + destSize > dest.length).");
} catch(Exception e){}
}
// Checks when "destStart + destSize > dest.length"
try{
as.shape(source, 2, 2, dest, 3, 3);
@ -733,9 +170,9 @@ public class ArabicShapingRegTest extends TestFmwk {
"suppose to return an exception when " +
"(destSize != 0) && (destStart < 0 || destSize < 0 || destStart + destSize > dest.length).");
} catch(Exception e){}
// Tests when "throw new IllegalArgumentException("Wrong Tashkeel argument")"
int[] invalid_Tashkeel = {-1000, -500, -100};
int[] invalid_Tashkeel = {-1000, -500, -100};
for(int i=0; i < invalid_Tashkeel.length; i++){
ArabicShaping arabicShape = new ArabicShaping(invalid_Tashkeel[i]);
try {

View File

@ -0,0 +1,487 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package com.ibm.icu.dev.test.shaping;
import java.util.Arrays;
import java.util.Collection;
import java.util.MissingResourceException;
import org.junit.Test;
import org.junit.experimental.runners.Enclosed;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.ArabicShaping;
import com.ibm.icu.text.ArabicShapingException;
/**
* Regression test for Arabic shaping.
*/
@RunWith(Enclosed.class)
public class DataDrivenArabicShapingRegTest extends TestFmwk {
/* constants copied from ArabicShaping for convenience */
// NOTE(review): these are hand-copied integer values, combined with | to form
// the flags argument of the ArabicShaping constructor in the tests below;
// confirm they still match the ArabicShaping constants of the same names.

// LENGTH_* options.
public static final int LENGTH_GROW_SHRINK = 0;
public static final int LENGTH_FIXED_SPACES_NEAR = 1;
public static final int LENGTH_FIXED_SPACES_AT_END = 2;
public static final int LENGTH_FIXED_SPACES_AT_BEGINNING = 3;
// TEXT_DIRECTION_* options.
public static final int TEXT_DIRECTION_LOGICAL = 0;
public static final int TEXT_DIRECTION_VISUAL_LTR = 4;
// LETTERS_* options.
public static final int LETTERS_NOOP = 0;
public static final int LETTERS_SHAPE = 8;
public static final int LETTERS_SHAPE_TASHKEEL_ISOLATED = 0x18;
public static final int LETTERS_UNSHAPE = 0x10;
// DIGITS_* options.
public static final int DIGITS_NOOP = 0;
public static final int DIGITS_EN2AN = 0x20;
public static final int DIGITS_AN2EN = 0x40;
public static final int DIGITS_EN2AN_INIT_LR = 0x60;
public static final int DIGITS_EN2AN_INIT_AL = 0x80;
// private static final int DIGITS_RESERVED = 0xa0;
// DIGIT_TYPE_* options.
public static final int DIGIT_TYPE_AN = 0;
public static final int DIGIT_TYPE_AN_EXTENDED = 0x100;
@RunWith(Parameterized.class)
public static class StandardDataTest extends TestFmwk {
/** Text handed to ArabicShaping.shape(String). */
private final String source;
/** Option bits used to construct the ArabicShaping under test. */
private final int flags;
/** Text the shaper is expected to return for {@link #source}. */
private final String expected;

/**
 * Creates one parameterized case. The fields are set once here and never
 * reassigned, so they are declared final.
 *
 * @param source   text to shape
 * @param flags    ArabicShaping option bits
 * @param expected expected shaped text
 */
public StandardDataTest(String source, int flags, String expected) {
    this.source = source;
    this.flags = flags;
    this.expected = expected;
}
/**
 * Supplies the parameter rows for this class. Every row has three elements,
 * {source, flags, expected}, in the same order as the constructor arguments.
 */
@Parameterized.Parameters
public static Collection testData() {
// Source strings shared by several rows of the table below.
String lamAlefSpecialVLTR =
"\u0020\u0646\u0622\u0644\u0627\u0020\u0646\u0623\u064E\u0644\u0627\u0020" +
"\u0646\u0627\u0670\u0644\u0627\u0020\u0646\u0622\u0653\u0644\u0627\u0020" +
"\u0646\u0625\u0655\u0644\u0627\u0020\u0646\u0622\u0654\u0644\u0627\u0020" +
"\uFEFC\u0639";
String tashkeelSpecialVLTR =
"\u064A\u0628\u0631\u0639\u0020\u064A\u0628\u0651\u0631\u064E\u0639\u0020" +
"\u064C\u064A\u0628\u0631\u064F\u0639\u0020\u0628\u0670\u0631\u0670\u0639" +
"\u0020\u0628\u0653\u0631\u0653\u0639\u0020\u0628\u0654\u0631\u0654\u0639" +
"\u0020\u0628\u0655\u0631\u0655\u0639\u0020";
String tashkeelShaddaRTL=
"\u0634\u0651\u0645\u0652\u0633";
String tashkeelShaddaLTR=
"\u0633\u0652\u0645\u0651\u0634";
// The ArMathSym* strings are made of surrogate pairs separated by spaces
// (presumably Arabic mathematical alphabetic symbols - confirm).
String ArMathSym =
"\uD83B\uDE00\uD83B\uDE01\uD83B\uDE02\uD83B\uDE03\u0020\uD83B\uDE24\uD83B" +
"\uDE05\uD83B\uDE06\u0020\uD83B\uDE07\uD83B\uDE08\uD83B\uDE09\u0020\uD83B" +
"\uDE0A\uD83B\uDE0B\uD83B\uDE0C\uD83B\uDE0D\u0020\uD83B\uDE0E\uD83B\uDE0F" +
"\uD83B\uDE10\uD83B\uDE11\u0020\uD83B\uDE12\uD83B\uDE13\uD83B\uDE14\uD83B" +
"\uDE15\u0020\uD83B\uDE16\uD83B\uDE17\uD83B\uDE18\u0020\uD83B\uDE19\uD83B" +
"\uDE1A\uD83B\uDE1B";
String ArMathSymLooped =
"\uD83B\uDE80\uD83B\uDE81\uD83B\uDE82\uD83B\uDE83\u0020\uD83B\uDE84\uD83B" +
"\uDE85\uD83B\uDE86\u0020\uD83B\uDE87\uD83B\uDE88\uD83B\uDE89\u0020\uD83B" +
"\uDE8B\uD83B\uDE8C\uD83B\uDE8D\u0020\uD83B\uDE8E\uD83B\uDE8F\uD83B\uDE90" +
"\uD83B\uDE91\u0020\uD83B\uDE92\uD83B\uDE93\uD83B\uDE94\uD83B\uDE95\u0020" +
"\uD83B\uDE96\uD83B\uDE97\uD83B\uDE98\u0020\uD83B\uDE99\uD83B\uDE9A\uD83B" +
"\uDE9B";
String ArMathSymDoubleStruck =
"\uD83B\uDEA1\uD83B\uDEA2\uD83B\uDEA3\u0020\uD83B\uDEA5\uD83B\uDEA6\u0020" +
"\uD83B\uDEA7\uD83B\uDEA8\uD83B\uDEA9\u0020\uD83B\uDEAB\uD83B\uDEAC\uD83B" +
"\uDEAD\u0020\uD83B\uDEAE\uD83B\uDEAF\uD83B\uDEB0\uD83B\uDEB1\u0020\uD83B" +
"\uDEB2\uD83B\uDEB3\uD83B\uDEB4\uD83B\uDEB5\u0020\uD83B\uDEB6\uD83B\uDEB7" +
"\uD83B\uDEB8\u0020\uD83B\uDEB9\uD83B\uDEBA\uD83B\uDEBB";
String ArMathSymInitial =
"\uD83B\uDE21\uD83B\uDE22\u0020\uD83B\uDE27\uD83B\uDE29\u0020\uD83B\uDE2A" +
"\uD83B\uDE2B\uD83B\uDE2C\uD83B\uDE2D\u0020\uD83B\uDE2E\uD83B\uDE2F\uD83B" +
"\uDE30\uD83B\uDE31\u0020\uD83B\uDE32\uD83B\uDE34\uD83B\uDE35\u0020\uD83B" +
"\uDE36\uD83B\uDE37\u0020\uD83B\uDE39\uD83B\uDE3B";
String ArMathSymTailed =
"\uD83B\uDE42\uD83B\uDE47\uD83B\uDE49\uD83B\uDE4B\u0020\uD83B\uDE4D\uD83B" +
"\uDE4E\uD83B\uDE4F\u0020\uD83B\uDE51\uD83B\uDE52\uD83B\uDE54\uD83B\uDE57" +
"\u0020\uD83B\uDE59\uD83B\uDE5B\uD83B\uDE5D\uD83B\uDE5F";
String ArMathSymStretched =
"\uD83B\uDE21\u0633\uD83B\uDE62\u0647";
String logicalUnshape =
"\u0020\u0020\u0020\uFE8D\uFEF5\u0020\uFEE5\u0020\uFE8D\uFEF7\u0020\uFED7" +
"\uFEFC\u0020\uFEE1\u0020\uFE8D\uFEDF\uFECC\uFEAE\uFE91\uFEF4\uFE94\u0020" +
"\uFE8D\uFEDF\uFEA4\uFEAE\uFE93\u0020\u0020\u0020\u0020";
String numSource =
"\u0031" + /* en:1 */
"\u0627" + /* arabic:alef */
"\u0032" + /* en:2 */
"\u06f3" + /* an:3 */
"\u0061" + /* latin:a */
"\u0034"; /* en:4 */
return Arrays.asList(new Object[][] {
/* lam alef special visual ltr */
{lamAlefSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
"\u0020\ufee5\u0020\ufef5\ufe8d\u0020\ufee5\u0020\ufe76\ufef7\ufe8d\u0020" +
"\ufee5\u0020\u0670\ufefb\ufe8d\u0020\ufee5\u0020\u0653\ufef5\ufe8d\u0020" +
"\ufee5\u0020\u0655\ufef9\ufe8d\u0020\ufee5\u0020\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"},
{lamAlefSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_END,
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020\ufefc\ufecb\u0020\u0020\u0020\u0020" +
"\u0020\u0020"},
{lamAlefSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_BEGINNING,
"\u0020\u0020\u0020\u0020\u0020\u0020\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"},
{lamAlefSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_GROW_SHRINK,
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020\ufefc\ufecb"},
/* TASHKEEL */
{lamAlefSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
LENGTH_FIXED_SPACES_NEAR,
"\u0020\ufee5\u0020\ufef5\ufe8d\u0020\ufee5\u0020\ufe76\ufef7\ufe8d\u0020" +
"\ufee5\u0020\u0670\ufefb\ufe8d\u0020\ufee5\u0020\u0653\ufef5\ufe8d\u0020" +
"\ufee5\u0020\u0655\ufef9\ufe8d\u0020\ufee5\u0020\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"},
{lamAlefSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
LENGTH_FIXED_SPACES_AT_END,
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020\ufefc\ufecb\u0020\u0020\u0020\u0020" +
"\u0020\u0020"},
{lamAlefSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
LENGTH_FIXED_SPACES_AT_BEGINNING,
"\u0020\u0020\u0020\u0020\u0020\u0020\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
"\ufefc\ufecb"},
{lamAlefSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
LENGTH_GROW_SHRINK,
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020\ufefc\ufecb"},
/* tashkeel special visual ltr */
{tashkeelSpecialVLTR,
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
"\ufef2\ufe91\ufeae\ufecb\u0020\ufef2\ufe91\ufe7c\ufeae\ufe77\ufecb\u0020" +
"\ufe72\ufef2\ufe91\ufeae\ufe79\ufecb\u0020\ufe8f\u0670\ufeae\u0670\ufecb" +
"\u0020\ufe8f\u0653\ufeae\u0653\ufecb\u0020\ufe8f\u0654\ufeae\u0654\ufecb" +
"\u0020\ufe8f\u0655\ufeae\u0655\ufecb\u0020"},
{tashkeelSpecialVLTR,
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
LENGTH_FIXED_SPACES_NEAR,
"\ufef2\ufe91\ufeae\ufecb\u0020\ufef2\ufe91\ufe7c\ufeae\ufe76\ufecb\u0020" +
"\ufe72\ufef2\ufe91\ufeae\ufe78\ufecb\u0020\ufe8f\u0670\ufeae\u0670\ufecb" +
"\u0020\ufe8f\u0653\ufeae\u0653\ufecb\u0020\ufe8f\u0654\ufeae\u0654\ufecb" +
"\u0020\ufe8f\u0655\ufeae\u0655\ufecb\u0020"},
// Shadda/tashkeel handling: the four TASHKEEL_* modes, first with the
// visual RTL source, then with the visual LTR source.
{tashkeelShaddaRTL,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_BEGIN |
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
"\u0020\ufeb7\ufe7d\ufee4\ufeb2"},
{tashkeelShaddaRTL,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_END |
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
"\ufeb7\ufe7d\ufee4\ufeb2\u0020"},
{tashkeelShaddaRTL,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_RESIZE |
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
"\ufeb7\ufe7d\ufee4\ufeb2"},
{tashkeelShaddaRTL,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_REPLACE_BY_TATWEEL |
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
"\ufeb7\ufe7d\ufee4\u0640\ufeb2"},
{tashkeelShaddaLTR,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_BEGIN |
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
"\u0020\ufeb2\ufee4\ufe7d\ufeb7"},
{tashkeelShaddaLTR,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_END |
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
"\ufeb2\ufee4\ufe7d\ufeb7\u0020"},
{tashkeelShaddaLTR,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_RESIZE |
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
"\ufeb2\ufee4\ufe7d\ufeb7"},
{tashkeelShaddaLTR,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_REPLACE_BY_TATWEEL |
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
"\ufeb2\u0640\ufee4\ufe7d\ufeb7"},
// ArMathSym* rows: the expected text repeats the source unchanged for every
// row except ArMathSymStretched, where the two BMP letters are replaced.
{ArMathSym,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_BEGIN |
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
"\uD83B\uDE00\uD83B\uDE01\uD83B\uDE02\uD83B\uDE03\u0020\uD83B\uDE24\uD83B" +
"\uDE05\uD83B\uDE06\u0020\uD83B\uDE07\uD83B\uDE08\uD83B\uDE09\u0020\uD83B" +
"\uDE0A\uD83B\uDE0B\uD83B\uDE0C\uD83B\uDE0D\u0020\uD83B\uDE0E\uD83B\uDE0F" +
"\uD83B\uDE10\uD83B\uDE11\u0020\uD83B\uDE12\uD83B\uDE13\uD83B\uDE14\uD83B" +
"\uDE15\u0020\uD83B\uDE16\uD83B\uDE17\uD83B\uDE18\u0020\uD83B\uDE19\uD83B" +
"\uDE1A\uD83B\uDE1B"},
{ArMathSymLooped,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_END |
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
"\uD83B\uDE80\uD83B\uDE81\uD83B\uDE82\uD83B\uDE83\u0020\uD83B\uDE84\uD83B" +
"\uDE85\uD83B\uDE86\u0020\uD83B\uDE87\uD83B\uDE88\uD83B\uDE89\u0020\uD83B" +
"\uDE8B\uD83B\uDE8C\uD83B\uDE8D\u0020\uD83B\uDE8E\uD83B\uDE8F\uD83B\uDE90" +
"\uD83B\uDE91\u0020\uD83B\uDE92\uD83B\uDE93\uD83B\uDE94\uD83B\uDE95\u0020" +
"\uD83B\uDE96\uD83B\uDE97\uD83B\uDE98\u0020\uD83B\uDE99\uD83B\uDE9A\uD83B" +
"\uDE9B"},
{ArMathSymDoubleStruck,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_RESIZE|
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
"\uD83B\uDEA1\uD83B\uDEA2\uD83B\uDEA3\u0020\uD83B\uDEA5\uD83B\uDEA6\u0020" +
"\uD83B\uDEA7\uD83B\uDEA8\uD83B\uDEA9\u0020\uD83B\uDEAB\uD83B\uDEAC\uD83B" +
"\uDEAD\u0020\uD83B\uDEAE\uD83B\uDEAF\uD83B\uDEB0\uD83B\uDEB1\u0020\uD83B" +
"\uDEB2\uD83B\uDEB3\uD83B\uDEB4\uD83B\uDEB5\u0020\uD83B\uDEB6\uD83B\uDEB7" +
"\uD83B\uDEB8\u0020\uD83B\uDEB9\uD83B\uDEBA\uD83B\uDEBB"},
{ArMathSymInitial,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_BEGIN |
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
"\uD83B\uDE21\uD83B\uDE22\u0020\uD83B\uDE27\uD83B\uDE29\u0020\uD83B\uDE2A" +
"\uD83B\uDE2B\uD83B\uDE2C\uD83B\uDE2D\u0020\uD83B\uDE2E\uD83B\uDE2F\uD83B" +
"\uDE30\uD83B\uDE31\u0020\uD83B\uDE32\uD83B\uDE34\uD83B\uDE35\u0020\uD83B" +
"\uDE36\uD83B\uDE37\u0020\uD83B\uDE39\uD83B\uDE3B"},
{ArMathSymTailed,
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_END |
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
"\uD83B\uDE42\uD83B\uDE47\uD83B\uDE49\uD83B\uDE4B\u0020\uD83B\uDE4D\uD83B" +
"\uDE4E\uD83B\uDE4F\u0020\uD83B\uDE51\uD83B\uDE52\uD83B\uDE54\uD83B\uDE57" +
"\u0020\uD83B\uDE59\uD83B\uDE5B\uD83B\uDE5D\uD83B\uDE5F"},
{ArMathSymStretched,
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE |
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
"\uD83B\uDE21\uFEB1\uD83B\uDE62\uFEE9"},
/* logical unshape */
{logicalUnshape,
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_NEAR,
"\u0020\u0020\u0020\u0627\u0644\u0622\u0646\u0020\u0627\u0644\u0623\u0642" +
"\u0644\u0627\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020" +
"\u0627\u0644\u062d\u0631\u0629\u0020\u0020\u0020\u0020"},
{logicalUnshape,
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_AT_END,
"\u0020\u0020\u0020\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623" +
"\u0020\u0642\u0644\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628" +
"\u064a\u0629\u0020\u0627\u0644\u062d\u0631\u0629\u0020"},
{logicalUnshape,
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_AT_BEGINNING,
"\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623\u0020\u0642\u0644" +
"\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020" +
"\u0627\u0644\u062d\u0631\u0629\u0020\u0020\u0020\u0020"},
{logicalUnshape,
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_GROW_SHRINK,
"\u0020\u0020\u0020\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623" +
"\u0020\u0642\u0644\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628" +
"\u064a\u0629\u0020\u0627\u0644\u062d\u0631\u0629\u0020\u0020\u0020\u0020"},
/* numbers */
{numSource,
DIGITS_EN2AN | DIGIT_TYPE_AN,
"\u0661\u0627\u0662\u06f3\u0061\u0664"},
{numSource,
DIGITS_AN2EN | DIGIT_TYPE_AN_EXTENDED,
"\u0031\u0627\u0032\u0033\u0061\u0034"},
{numSource,
DIGITS_EN2AN_INIT_LR | DIGIT_TYPE_AN,
"\u0031\u0627\u0662\u06f3\u0061\u0034" },
{numSource,
DIGITS_EN2AN_INIT_AL | DIGIT_TYPE_AN_EXTENDED,
"\u06f1\u0627\u06f2\u06f3\u0061\u0034"},
{numSource,
DIGITS_EN2AN_INIT_LR | DIGIT_TYPE_AN | TEXT_DIRECTION_VISUAL_LTR,
"\u0661\u0627\u0032\u06f3\u0061\u0034"},
{numSource,
DIGITS_EN2AN_INIT_AL | DIGIT_TYPE_AN_EXTENDED | TEXT_DIRECTION_VISUAL_LTR,
"\u06f1\u0627\u0032\u06f3\u0061\u06f4"},
/* no-op */
{numSource, 0, numSource}
});
}
/**
 * Shapes {@code source} with an ArabicShaping built from {@code flags} and
 * reports an error when shaping throws or the result differs from
 * {@code expected}.
 *
 * A MissingResourceException is propagated to the caller; an
 * IllegalStateException is logged as a warning and ends the test without
 * an error. On a mismatch the report ends with a per-position comparison
 * in which "***" marks each differing position.
 */
@Test
public void TestStandard() {
    Exception ex = null;
    String actual = null;
    ArabicShaping shaper = null;
    try {
        shaper = new ArabicShaping(flags);
        actual = shaper.shape(source);
    } catch (MissingResourceException e) {
        // Must not be swallowed by the generic handler below.
        throw e;
    } catch (IllegalStateException ie) {
        warnln("IllegalStateException: " + ie.toString());
        return;
    } catch (Exception e) {
        ex = e;
    }

    if (ex != null) {
        // The closing quote after the input was missing in the message.
        err("Error: Shaper " + shaper + "\n throws exception '" + ex + "'\n for input '" + source + "'");
        return;
    }
    if (expected.equals(actual)) {
        return;
    }

    StringBuilder buf = new StringBuilder();
    buf.append("Error: Shaper: " + shaper + "\n Input: " + source + "\n Actual: " + actual +
            "\n Expected: " + expected + "\n");
    // Guard the comparison so a null result is reported rather than
    // failing with a NullPointerException while building the report.
    if (actual != null) {
        int rows = Math.max(expected.length(), actual.length());
        for (int i = 0; i < rows; ++i) {
            String temp = Integer.toString(i);
            if (temp.length() < 2) {
                temp = " ".concat(temp);
            }
            // Positions past the end of the shorter string show as U+FFFF.
            char trg = i < expected.length() ? expected.charAt(i) : '\uffff';
            char res = i < actual.length() ? actual.charAt(i) : '\uffff';
            buf.append("[" + temp + "] ");
            buf.append(escapedString("" + trg) + " ");
            buf.append(escapedString("" + res) + " ");
            if (trg != res) {
                buf.append("***");
            }
            buf.append("\n");
        }
    }
    err(buf.toString());
}
/**
 * Converts str into a sequence of escapes, one per UTF-16 code unit: a
 * backslash, the letter u, and the code unit as four lowercase hex digits.
 *
 * @param str text to escape; may be null
 * @return the escaped text, or null when str is null
 */
private static String escapedString(String str) {
    if (str == null) {
        return null;
    }
    StringBuilder escaped = new StringBuilder(str.length() * 6);
    for (int idx = 0, count = str.length(); idx < count; idx++) {
        // toHexString yields 1 to 4 digits for a char; left-pad to 4.
        String hex = Integer.toHexString(str.charAt(idx));
        escaped.append("\\u").append("0000".substring(hex.length())).append(hex);
    }
    return escaped.toString();
}
}
/**
 * Parameterized preflight test: asks ArabicShaping for the output length
 * of each source/flags pair without supplying a destination buffer.
 */
@RunWith(Parameterized.class)
public static class PreflightDataTest extends TestFmwk {
    /** Text to preflight. */
    private final String source;
    /** Option bits used to construct the ArabicShaping under test. */
    private final int flags;
    /** Length the preflight call is expected to return. */
    private final int length;

    public PreflightDataTest(String source, int flags, int length) {
        this.source = source;
        this.flags = flags;
        this.length = length;
    }

    /**
     * Supplies the parameter rows, each {source, flags, expected length},
     * in the same order as the constructor arguments.
     */
    @Parameterized.Parameters
    public static Collection<Object[]> testData() {
        return Arrays.asList(new Object[][] {
            {"\u0644\u0627", LETTERS_SHAPE | LENGTH_GROW_SHRINK, 1},
            {"\u0644\u0627\u0031",
                DIGITS_EN2AN | DIGIT_TYPE_AN_EXTENDED | LENGTH_GROW_SHRINK, 3},
            {"\u0644\u0644", LETTERS_SHAPE | LENGTH_GROW_SHRINK, 2},
            {"\ufef7", LETTERS_UNSHAPE | LENGTH_GROW_SHRINK, 2}
        });
    }

    /**
     * Calls shape() with a null destination of size 0 and reports an error
     * when it throws or returns a length other than the expected one.
     */
    @Test
    public void TestPreflight() {
        Exception ex = null;
        char[] src = null;
        int len = 0;
        ArabicShaping shaper = null;
        if (source != null) {
            src = source.toCharArray();
        }
        try {
            shaper = new ArabicShaping(flags);
            // A null source fails here with a NullPointerException, which
            // is caught and reported through the exception branch below.
            len = shaper.shape(src, 0, src.length, null, 0, 0);
        } catch (Exception e) {
            ex = e;
        }
        if (ex != null) {
            // The closing quote after the input was missing in the message.
            err("Error: Shaper " + shaper + "\n throws exception '" + ex + "'\n for input '" + source + "'");
        } else if (length != len) {
            err("Error: Shaper " + shaper + "\n returns " + len + " characters for input '" +
                    source + "'\n Expected were " + length + " characters");
        }
    }
}
/**
 * Parameterized error test: shapes each source in place and checks that
 * the exception raised is an instance of the expected class.
 */
@RunWith(Parameterized.class)
public static class ErrorDataTest extends TestFmwk {
    /** Text to shape; may be null. */
    private final String source;
    /** Option bits used to construct the ArabicShaping under test. */
    private final int flags;
    /** Exception class the constructor or the shape call must raise. */
    private final Class<?> error;

    public ErrorDataTest(String source, int flags, Class<?> error) {
        this.source = source;
        this.flags = flags;
        this.error = error;
    }

    /**
     * Supplies the parameter rows, each {source, flags, expected exception
     * class}, in the same order as the constructor arguments.
     */
    @Parameterized.Parameters
    public static Collection<Object[]> testData() {
        return Arrays.asList(new Object[][] {
            /* bad data */
            {"\u0020\ufef7\u0644\u0020", LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_NEAR,
                ArabicShapingException.class},
            {"\u0020\ufef7", LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_END,
                ArabicShapingException.class},
            {"\ufef7\u0020", LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_BEGINNING,
                ArabicShapingException.class},
            /* bad options */
            {"\ufef7", 0xffffffff, IllegalArgumentException.class},
            {"\ufef7", LETTERS_UNSHAPE | LENGTH_GROW_SHRINK, ArabicShapingException.class},
            {null, LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_END,
                IllegalArgumentException.class}
        });
    }

    /**
     * Builds the shaper and shapes the source in place; a null source is
     * passed as a null array with length 0. Reports an error when no
     * exception, or one of an unexpected type, was raised.
     */
    @Test
    public void TestError() {
        Exception ex = null;
        char[] src = null;
        int len = 0;
        ArabicShaping shaper = null;
        if (source != null) {
            src = source.toCharArray();
            len = src.length;
        }
        try {
            shaper = new ArabicShaping(flags);
            shaper.shape(src, 0, len);
        } catch (Exception e) {
            ex = e;
        }
        // isInstance(null) is false, so "nothing thrown" is also a failure.
        if (!error.isInstance(ex)) {
            // Say so plainly when nothing was thrown, instead of printing
            // "throws exception 'null'".
            String outcome = (ex == null) ? "throws no exception" : "throws exception '" + ex + "'";
            err("Error: Shaper " + shaper + "\n " + outcome + "\n for input '" +
                    source + "'\n Expected exception: " + error);
        }
    }
}
}

View File

@ -33,24 +33,24 @@ public class TestIDNA extends TestFmwk {
// test StringBuffer toUnicode
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.DEFAULT, null);
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.ALLOW_UNASSIGNED, null);
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES, null);
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES|IDNA.ALLOW_UNASSIGNED, null);
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES, null);
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES|IDNA.ALLOW_UNASSIGNED, null);
}
}
@Test
public void TestToASCII() throws Exception{
for(int i=0; i<TestData.asciiIn.length; i++){
// test StringBuffer toUnicode
doTestToASCII(new String(TestData.unicodeIn[i]),TestData.asciiIn[i],IDNA.DEFAULT, null);
doTestToASCII(new String(TestData.unicodeIn[i]),TestData.asciiIn[i],IDNA.ALLOW_UNASSIGNED, null);
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES, null);
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES|IDNA.ALLOW_UNASSIGNED, null);
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES, null);
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES|IDNA.ALLOW_UNASSIGNED, null);
}
}
@Test
public void TestIDNToASCII() throws Exception{
for(int i=0; i<TestData.domainNames.length; i++){
@ -59,7 +59,7 @@ public class TestIDNA extends TestFmwk {
doTestIDNToASCII(TestData.domainNames[i],TestData.domainNames[i],IDNA.USE_STD3_RULES, null);
doTestIDNToASCII(TestData.domainNames[i],TestData.domainNames[i],IDNA.ALLOW_UNASSIGNED|IDNA.USE_STD3_RULES, null);
}
for(int i=0; i<TestData.domainNames1Uni.length; i++){
doTestIDNToASCII(TestData.domainNames1Uni[i],TestData.domainNamesToASCIIOut[i],IDNA.DEFAULT, null);
doTestIDNToASCII(TestData.domainNames1Uni[i],TestData.domainNamesToASCIIOut[i],IDNA.ALLOW_UNASSIGNED, null);
@ -78,16 +78,16 @@ public class TestIDNA extends TestFmwk {
doTestIDNToUnicode(TestData.domainNamesToASCIIOut[i],TestData.domainNamesToUnicodeOut[i],IDNA.ALLOW_UNASSIGNED, null);
}
}
private void doTestToUnicode(String src, String expected, int options, Object expectedException)
private void doTestToUnicode(String src, String expected, int options, Object expectedException)
throws Exception{
StringBuffer inBuf = new StringBuffer(src);
UCharacterIterator inIter = UCharacterIterator.getInstance(src);
try{
StringBuffer out = IDNA.convertToUnicode(src,options);
if(expected!=null && out != null && !out.toString().equals(expected)){
errln("convertToUnicode did not return expected result with options : "+ options +
errln("convertToUnicode did not return expected result with options : "+ options +
" Expected: " + prettify(expected)+" Got: "+prettify(out));
}
if(expectedException!=null && !unassignedException.equals(expectedException)){
@ -99,10 +99,10 @@ public class TestIDNA extends TestFmwk {
}
}
try{
StringBuffer out = IDNA.convertToUnicode(inBuf,options);
if(expected!=null && out != null && !out.toString().equals(expected)){
errln("convertToUnicode did not return expected result with options : "+ options +
errln("convertToUnicode did not return expected result with options : "+ options +
" Expected: " + prettify(expected)+" Got: "+out);
}
if(expectedException!=null && !unassignedException.equals(expectedException)){
@ -113,7 +113,7 @@ public class TestIDNA extends TestFmwk {
errln("convertToUnicode did not get the expected exception for source: " + prettify(src) +" Got: "+ ex.toString());
}
}
try{
StringBuffer out = IDNA.convertToUnicode(inIter,options);
if(expected!=null && out != null && !out.toString().equals(expected)){
@ -129,16 +129,16 @@ public class TestIDNA extends TestFmwk {
}
}
}
private void doTestIDNToUnicode(String src, String expected, int options, Object expectedException)
private void doTestIDNToUnicode(String src, String expected, int options, Object expectedException)
throws Exception{
StringBuffer inBuf = new StringBuffer(src);
UCharacterIterator inIter = UCharacterIterator.getInstance(src);
try{
StringBuffer out = IDNA.convertIDNToUnicode(src,options);
if(expected!=null && out != null && !out.toString().equals(expected)){
errln("convertToUnicode did not return expected result with options : "+ options +
errln("convertToUnicode did not return expected result with options : "+ options +
" Expected: " + prettify(expected)+" Got: "+prettify(out));
}
if(expectedException!=null && !unassignedException.equals(expectedException)){
@ -152,7 +152,7 @@ public class TestIDNA extends TestFmwk {
try{
StringBuffer out = IDNA.convertIDNToUnicode(inBuf,options);
if(expected!=null && out != null && !out.toString().equals(expected)){
errln("convertToUnicode did not return expected result with options : "+ options +
errln("convertToUnicode did not return expected result with options : "+ options +
" Expected: " + prettify(expected)+" Got: "+out);
}
if(expectedException!=null && !unassignedException.equals(expectedException)){
@ -163,7 +163,7 @@ public class TestIDNA extends TestFmwk {
errln("convertToUnicode did not get the expected exception for source: " +src +" Got: "+ ex.toString());
}
}
try{
StringBuffer out = IDNA.convertIDNToUnicode(inIter,options);
if(expected!=null && out != null && !out.toString().equals(expected)){
@ -179,17 +179,17 @@ public class TestIDNA extends TestFmwk {
}
}
}
private void doTestToASCII(String src, String expected, int options, Object expectedException)
private void doTestToASCII(String src, String expected, int options, Object expectedException)
throws Exception{
StringBuffer inBuf = new StringBuffer(src);
UCharacterIterator inIter = UCharacterIterator.getInstance(src);
try{
StringBuffer out = IDNA.convertToASCII(src,options);
if(!unassignedException.equals(expectedException) && expected!=null && out != null && expected!=null && out != null && !out.toString().equals(expected.toLowerCase())){
errln("convertToASCII did not return expected result with options : "+ options +
errln("convertToASCII did not return expected result with options : "+ options +
" Expected: " + expected+" Got: "+out);
}
}
if(expectedException!=null && !unassignedException.equals(expectedException)){
errln("convertToASCII did not get the expected exception. The operation succeeded!");
}
@ -198,11 +198,11 @@ public class TestIDNA extends TestFmwk {
errln("convertToASCII did not get the expected exception for source: " +src +"\n Got: "+ ex.toString() +"\n Expected: " +ex.toString());
}
}
try{
try{
StringBuffer out = IDNA.convertToASCII(inBuf,options);
if(!unassignedException.equals(expectedException) && expected!=null && out != null && expected!=null && out != null && !out.toString().equals(expected.toLowerCase())){
errln("convertToASCII did not return expected result with options : "+ options +
errln("convertToASCII did not return expected result with options : "+ options +
" Expected: " + expected+" Got: "+out);
}
if(expectedException!=null && !unassignedException.equals(expectedException)){
@ -213,7 +213,7 @@ public class TestIDNA extends TestFmwk {
errln("convertToASCII did not get the expected exception for source: " +src +" Got: "+ ex.toString());
}
}
try{
StringBuffer out = IDNA.convertToASCII(inIter,options);
if(!unassignedException.equals(expectedException) && expected!=null && out != null && expected!=null && out != null && !out.toString().equals(expected.toLowerCase())){
@ -229,15 +229,15 @@ public class TestIDNA extends TestFmwk {
}
}
}
private void doTestIDNToASCII(String src, String expected, int options, Object expectedException)
private void doTestIDNToASCII(String src, String expected, int options, Object expectedException)
throws Exception{
StringBuffer inBuf = new StringBuffer(src);
UCharacterIterator inIter = UCharacterIterator.getInstance(src);
try{
StringBuffer out = IDNA.convertIDNToASCII(src,options);
if(expected!=null && out != null && !out.toString().equals(expected)){
errln("convertToIDNASCII did not return expected result with options : "+ options +
errln("convertToIDNASCII did not return expected result with options : "+ options +
" Expected: " + expected+" Got: "+out);
}
if(expectedException!=null && !unassignedException.equals(expectedException)){
@ -251,9 +251,9 @@ public class TestIDNA extends TestFmwk {
try{
StringBuffer out = IDNA.convertIDNToASCII(inBuf,options);
if(expected!=null && out != null && !out.toString().equals(expected)){
errln("convertToIDNASCII did not return expected result with options : "+ options +
errln("convertToIDNASCII did not return expected result with options : "+ options +
" Expected: " + expected+" Got: "+out);
}
}
if(expectedException!=null && !unassignedException.equals(expectedException)){
errln("convertToIDNASCII did not get the expected exception. The operation succeeded!");
}
@ -262,14 +262,14 @@ public class TestIDNA extends TestFmwk {
errln("convertToIDNASCII did not get the expected exception for source: " +src +" Got: "+ ex.toString());
}
}
try{
StringBuffer out = IDNA.convertIDNToASCII(inIter,options);
if(expected!=null && out != null && !out.toString().equals(expected)){
errln("convertIDNToASCII did not return expected result with options : "+ options +
" Expected: " + expected+" Got: "+ out);
}
if(expectedException!=null && !unassignedException.equals(expectedException)){
errln("convertIDNToASCII did not get the expected exception. The operation succeeded!");
}
@ -282,7 +282,7 @@ public class TestIDNA extends TestFmwk {
@Test
public void TestConformance()throws Exception{
for(int i=0; i<TestData.conformanceTestCases.length;i++){
TestData.ConformanceTestCase testCase = TestData.conformanceTestCases[i];
if(testCase.expected != null){
//Test toASCII
@ -313,7 +313,7 @@ public class TestIDNA extends TestFmwk {
errln("Did not get the expected exception for source: " +testCase.input +" Got: "+ ex.toString());
}
}
try{
iter.setToStart();
StringBuffer output = namePrep.prepare(iter,StringPrep.ALLOW_UNASSIGNED);
@ -330,7 +330,7 @@ public class TestIDNA extends TestFmwk {
}
}
}
}
@Test
public void TestErrorCases() throws Exception{
@ -345,11 +345,11 @@ public class TestIDNA extends TestFmwk {
}
}
if(errCase.useSTD3ASCIIRules!=true){
// Test IDNToASCII
doTestIDNToASCII(new String(errCase.unicode),errCase.ascii,IDNA.DEFAULT,errCase.expected);
doTestIDNToASCII(new String(errCase.unicode),errCase.ascii,IDNA.ALLOW_UNASSIGNED,errCase.expected);
}else{
doTestIDNToASCII(new String(errCase.unicode),errCase.ascii,IDNA.USE_STD3_RULES,errCase.expected);
}
@ -359,7 +359,7 @@ public class TestIDNA extends TestFmwk {
// Test IDNToUnicode
doTestIDNToUnicode(errCase.ascii,new String(errCase.unicode),IDNA.DEFAULT,errCase.expected);
doTestIDNToUnicode(errCase.ascii,new String(errCase.unicode),IDNA.ALLOW_UNASSIGNED,errCase.expected);
}else{
doTestIDNToUnicode(errCase.ascii,new String(errCase.unicode),IDNA.USE_STD3_RULES,errCase.expected);
}
@ -370,38 +370,38 @@ public class TestIDNA extends TestFmwk {
try{
int retVal = IDNA.compare(s1,s2,IDNA.DEFAULT);
if(isEqual==true && retVal != 0){
errln("Did not get the expected result for s1: "+ prettify(s1)+
errln("Did not get the expected result for s1: "+ prettify(s1)+
" s2: "+prettify(s2));
}
retVal = IDNA.compare(new StringBuffer(s1), new StringBuffer(s2), IDNA.DEFAULT);
if(isEqual==true && retVal != 0){
errln("Did not get the expected result for s1: "+ prettify(s1)+
errln("Did not get the expected result for s1: "+ prettify(s1)+
" s2: "+prettify(s2));
}
retVal = IDNA.compare(UCharacterIterator.getInstance(s1), UCharacterIterator.getInstance(s2), IDNA.DEFAULT);
if(isEqual==true && retVal != 0){
errln("Did not get the expected result for s1: "+ prettify(s1)+
errln("Did not get the expected result for s1: "+ prettify(s1)+
" s2: "+prettify(s2));
}
}catch(Exception e){
e.printStackTrace();
errln("Unexpected exception thrown by IDNA.compare");
}
try{
int retVal = IDNA.compare(s1,s2,IDNA.ALLOW_UNASSIGNED);
if(isEqual==true && retVal != 0){
errln("Did not get the expected result for s1: "+ prettify(s1)+
errln("Did not get the expected result for s1: "+ prettify(s1)+
" s2: "+prettify(s2));
}
retVal = IDNA.compare(new StringBuffer(s1), new StringBuffer(s2), IDNA.ALLOW_UNASSIGNED);
if(isEqual==true && retVal != 0){
errln("Did not get the expected result for s1: "+ prettify(s1)+
errln("Did not get the expected result for s1: "+ prettify(s1)+
" s2: "+prettify(s2));
}
retVal = IDNA.compare(UCharacterIterator.getInstance(s1), UCharacterIterator.getInstance(s2), IDNA.ALLOW_UNASSIGNED);
if(isEqual==true && retVal != 0){
errln("Did not get the expected result for s1: "+ prettify(s1)+
errln("Did not get the expected result for s1: "+ prettify(s1)+
" s2: "+prettify(s2));
}
}catch(Exception e){
@ -437,13 +437,13 @@ public class TestIDNA extends TestFmwk {
source.setLength(4);
source.append(TestData.unicodeIn[i]);
source.append(com);
// a) compare it with itself
doTestCompare(source.toString(),source.toString(),true);
// b) compare it with asciiIn equivalent
doTestCompare(source.toString(),www+TestData.asciiIn[i]+com,true);
// c) compare it with unicodeIn not equivalent
if(i==0){
doTestCompare(source.toString(), uni1.toString(), false);
@ -463,9 +463,9 @@ public class TestIDNA extends TestFmwk {
// test and ascertain
// func(func(func(src))) == func(src)
private void doTestChainingToASCII(String source) throws Exception {
StringBuffer expected;
StringBuffer expected;
StringBuffer chained;
// test convertIDNToASCII
expected = IDNA.convertIDNToASCII(source,IDNA.DEFAULT);
chained = expected;
@ -483,15 +483,15 @@ public class TestIDNA extends TestFmwk {
}
if(!expected.toString().equals(chained.toString())){
errln("Chaining test failed for convertToASCII");
}
}
}
// test and ascertain
// func(func(func(src))) == func(src)
private void doTestChainingToUnicode(String source) throws Exception {
StringBuffer expected;
StringBuffer expected;
StringBuffer chained;
// test convertIDNToUnicode
expected = IDNA.convertIDNToUnicode(source,IDNA.DEFAULT);
chained = expected;
@ -509,7 +509,7 @@ public class TestIDNA extends TestFmwk {
}
if(!expected.toString().equals(chained.toString())){
errln("Chaining test failed for convertToUnicode");
}
}
}
@Test
public void TestChaining() throws Exception{
@ -520,7 +520,7 @@ public class TestIDNA extends TestFmwk {
doTestChainingToASCII(new String(TestData.unicodeIn[i]));
}
}
/* IDNA RFC Says:
A label is an individual part of a domain name. Labels are usually
@ -559,13 +559,13 @@ public class TestIDNA extends TestFmwk {
source.setLength(4);
source.append(TestData.unicodeIn[i]);
source.append(com);
// a) compare it with itself
doTestCompare(source.toString(),source.toString(),true);
// b) compare it with asciiIn equivalent
doTestCompare(source.toString(),www+TestData.asciiIn[i]+com,true);
// c) compare it with unicodeIn not equivalent
if(i==0){
doTestCompare(source.toString(), uni1.toString(), false);
@ -582,13 +582,13 @@ public class TestIDNA extends TestFmwk {
}
}
private static final int loopCount = 100;
private static final int maxCharCount = 15;
// private static final int maxCodePoint = 0x10ffff;
private Random random = null;
/**
* Return a random integer i where 0 <= i < n.
* A special function that gets random codepoints from planes 0,1,2 and 14
@ -622,30 +622,30 @@ public class TestIDNA extends TestFmwk {
i++;
}
return fillIn;
}
// TODO(junit): turned off because not running before
// TODO(#13294): turned off because monkey test fails approx 1 in 3 times.
@Ignore
@Test
public void MonkeyTest() throws Exception{
StringBuffer source = new StringBuffer();
/* do the monkey test */
/* do the monkey test */
for(int i=0; i<loopCount; i++){
source.setLength(0);
getTestSource(source);
doTestCompareReferenceImpl(source);
}
// test string with embedded null
// test string with embedded null
source.append( "\\u0000\\u2109\\u3E1B\\U000E65CA\\U0001CAC5" );
source = new StringBuffer(Utility.unescape(source.toString()));
doTestCompareReferenceImpl(source);
//StringBuffer src = new StringBuffer(Utility.unescape("\\uDEE8\\U000E228C\\U0002EE8E\\U000E6350\\U00024DD9\u4049\\U000E0DE4\\U000E448C\\U0001869B\\U000E3380\\U00016A8E\\U000172D5\\U0001C408\\U000E9FB5"));
//doTestCompareReferenceImpl(src);
//test deletion of code points
source = new StringBuffer(Utility.unescape("\\u043f\\u00AD\\u034f\\u043e\\u0447\\u0435\\u043c\\u0443\\u0436\\u0435\\u043e\\u043d\\u0438\\u043d\\u0435\\u0433\\u043e\\u0432\\u043e\\u0440\\u044f\\u0442\\u043f\\u043e\\u0440\\u0443\\u0441\\u0441\\u043a\\u0438"));
StringBuffer expected = new StringBuffer("xn--b1abfaaepdrnnbgefbadotcwatmq2g4l");
@ -704,8 +704,6 @@ public class TestIDNA extends TestFmwk {
private void doTestCompareReferenceImpl(StringBuffer src) throws Exception{
// test toASCII
src.setLength(0);
src.append("[");
StringBuffer asciiLabel = _doTestCompareReferenceImpl(src, true, IDNA.ALLOW_UNASSIGNED);
_doTestCompareReferenceImpl(src, true, IDNA.DEFAULT);
_doTestCompareReferenceImpl(src, true, IDNA.USE_STD3_RULES);
@ -720,6 +718,8 @@ public class TestIDNA extends TestFmwk {
}
}
// TODO(#13324): test turned off because it has dependency on translit.
@Ignore
@Test
public void TestCompareRefImpl() throws Exception {
for (int i = 65; i < 0x10FFFF; i++) {
@ -742,7 +742,7 @@ public class TestIDNA extends TestFmwk {
"\u00F5\u00dE\u00dF\u00dD",
"\uFB00\uFB01"
};
for ( int i=0; i< in.length; i++){
for ( int i=0; i< in.length; i++){
try{
String ascii = IDNA.convertToASCII(in[i],IDNA.DEFAULT).toString();
try{
@ -763,7 +763,7 @@ public class TestIDNA extends TestFmwk {
"test"
};
for ( int i=0; i< in.length; i++){
try{
String ascii = IDNA.convertToASCII(in[i],IDNA.DEFAULT).toString();
if(!ascii.equals(in[i])){
@ -773,11 +773,11 @@ public class TestIDNA extends TestFmwk {
errln("Unexpected exception: " + ex.getMessage());
}
}
}
@Test
public void TestDebug(){
public void TestDebug(){
try{
String src = "\u00ED4dn";
String uni = IDNA.convertToUnicode(src,IDNA.DEFAULT).toString();
@ -808,7 +808,7 @@ public class TestIDNA extends TestFmwk {
} catch (ArrayIndexOutOfBoundsException ex) {
errln("Got an ArrayIndexOutOfBoundsException calling convertIDNToUnicode(\"" + INVALID_DOMAIN_NAME + "\")");
}
String domain = "xn--m\u00FCller.de";
try{
IDNA.convertIDNToUnicode(domain, IDNA.DEFAULT);
@ -840,12 +840,12 @@ public class TestIDNA extends TestFmwk {
errln("ToUnicode operation failed! "+ex.getMessage());
}
}
@Test
public void TestLength(){
String ul = "my_very_very_very_very_very_very_very_very_very_very_very_very_very_long_and_incredibly_uncreative_domain_label";
/* this unicode string is longer than MAX_LABEL_BUFFER_SIZE and produces an
/* this unicode string is longer than MAX_LABEL_BUFFER_SIZE and produces an
IDNA prepared string (including xn--)that is exactly 63 bytes long */
String ul1 ="\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"+
"\uD55C\uAD6D\uC5B4\uB97C\uC774\u00AD\u034F\u1806\u180B"+
@ -887,7 +887,7 @@ public class TestIDNA extends TestFmwk {
}catch (StringPrepParseException ex){
errln("IDNA.convertToASCII failed with error: "+ex.toString());
}
String idn = "my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.ibm.com";
try{
IDNA.convertIDNToASCII(idn, IDNA.DEFAULT);
@ -901,7 +901,7 @@ public class TestIDNA extends TestFmwk {
}
try{
IDNA.convertIDNToUnicode(idn, IDNA.DEFAULT);
errln("IDNA.convertToUnicode did not fail!");
errln("IDNA.convertToUnicode did not fail!");
}catch (StringPrepParseException ex){
if(ex.getError()!= StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR){
errln("IDNA.convertToUnicode failed with error: "+ex.toString());
@ -910,7 +910,7 @@ public class TestIDNA extends TestFmwk {
}
}
}
/* Tests the method public static StringBuffer convertToASCII(String src, int options) */
@Test
public void TestConvertToASCII() {

View File

@ -117,6 +117,38 @@ abstract public class TestFmwk extends AbstractTestLog {
return new Random(getParams().getSeed());
}
/**
 * Small integer pseudo-random number generator that yields positive int values.
 * Uses the same algorithm and constants as C++ std::minstd_rand
 * (multiplier 48271, modulus 2147483647), for compatibility with ICU4C.
 * The current state can be read back with getSeed() and restored with seed(),
 * which makes monkey tests reproducible.
 */
protected class ICU_Rand {
    /** Current generator state; also the most recently returned value. */
    private int fLast;

    /**
     * Creates a generator whose initial state is derived from the given seed.
     * @param seed the starting value; normalized exactly as by {@link #seed(int)}
     */
    public ICU_Rand(int seed) {
        seed(seed);
    }

    /**
     * Advances the generator by one step.
     * @return the new state, which is also the next pseudo-random value
     */
    public int next() {
        // Multiply in long arithmetic so the product cannot overflow an int.
        final long product = fLast * 48271L;
        fLast = (int) (product % 2147483647L);
        return fLast;
    }

    /**
     * Resets the generator state.
     * @param seed the new state; non-positive values, and values that reduce to
     *             zero modulo 2147483647, are replaced by 1
     */
    public void seed(int seed) {
        // 2147483647 = 0x7FFFFFFF
        final int reduced = (seed <= 0) ? 1 : seed % 2147483647;
        fLast = (reduced > 0) ? reduced : 1;
    }

    /**
     * @return the current generator state, suitable for passing back to
     *         {@link #seed(int)} to replay the sequence from this point
     */
    public int getSeed() {
        return fLast;
    }
}
static final String ICU_TRAC_URL = "http://bugs.icu-project.org/trac/ticket/";
static final String CLDR_TRAC_URL = "http://unicode.org/cldr/trac/ticket/";
static final String CLDR_TICKET_PREFIX = "cldrbug:";