ICU-13177 Merging trunk to branch
X-SVN-Rev: 40350
This commit is contained in:
commit
8b625eda51
@ -195,7 +195,7 @@ BreakIterator::getAvailableLocales(int32_t& count)
|
||||
|
||||
// ------------------------------------------
|
||||
//
|
||||
// Default constructor and destructor
|
||||
// Constructors, destructor and assignment operator
|
||||
//
|
||||
//-------------------------------------------
|
||||
|
||||
@ -204,6 +204,19 @@ BreakIterator::BreakIterator()
|
||||
*validLocale = *actualLocale = 0;
|
||||
}
|
||||
|
||||
/**
 * Copy constructor. Copies the two locale ID buffers from `other`;
 * the UObject base is copy-constructed from `other` as well.
 */
BreakIterator::BreakIterator(const BreakIterator &other)
        : UObject(other)
{
    // Both members are fixed-size char arrays of the same capacity in every
    // BreakIterator, so copying up to sizeof(buffer) bytes cannot overrun.
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
}
|
||||
|
||||
/**
 * Assignment operator. Copies the locale ID buffers from `other`.
 * Assigning an object to itself leaves it unchanged.
 *
 * @param other the object to copy from
 * @return *this
 */
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
    // Guard clause: nothing to copy (and no overlapping strncpy) on self-assignment.
    if (this == &other) {
        return *this;
    }
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    return *this;
}
|
||||
|
||||
/** Destructor. The body is empty; there is nothing to release here. */
BreakIterator::~BreakIterator() {}
|
||||
@ -265,7 +278,7 @@ ICUBreakIteratorService::~ICUBreakIteratorService() {}
|
||||
// defined in ucln_cmn.h
|
||||
U_NAMESPACE_END
|
||||
|
||||
static icu::UInitOnce gInitOnce;
|
||||
static icu::UInitOnce gInitOnceBrkiter;
|
||||
static icu::ICULocaleService* gService = NULL;
|
||||
|
||||
|
||||
@ -280,7 +293,7 @@ static UBool U_CALLCONV breakiterator_cleanup(void) {
|
||||
delete gService;
|
||||
gService = NULL;
|
||||
}
|
||||
gInitOnce.reset();
|
||||
gInitOnceBrkiter.reset();
|
||||
#endif
|
||||
return TRUE;
|
||||
}
|
||||
@ -296,7 +309,7 @@ initService(void) {
|
||||
static ICULocaleService*
|
||||
getService(void)
|
||||
{
|
||||
umtx_initOnce(gInitOnce, &initService);
|
||||
umtx_initOnce(gInitOnceBrkiter, &initService);
|
||||
return gService;
|
||||
}
|
||||
|
||||
@ -306,7 +319,7 @@ getService(void)
|
||||
static inline UBool
|
||||
hasService(void)
|
||||
{
|
||||
return !gInitOnce.isReset() && getService() != NULL;
|
||||
return !gInitOnceBrkiter.isReset() && getService() != NULL;
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
|
@ -33,20 +33,85 @@ const int32_t LENGTH_IN_2TRAIL = 62;
|
||||
|
||||
} // namespace
|
||||
|
||||
Edits::~Edits() {
|
||||
if(array != stackArray) {
|
||||
/**
 * Frees the current array if it is not the object's own stackArray.
 * Does not modify array, capacity or length; callers reassign those.
 */
void Edits::releaseArray() U_NOEXCEPT {
    if (array == stackArray) {
        // The built-in buffer is part of this object and must not be freed.
        return;
    }
    uprv_free(array);
}
|
||||
|
||||
void Edits::reset() {
|
||||
/**
 * Copies the first `length` 16-bit units of other.array into this object's
 * array, allocating a larger array first if `length` exceeds the capacity.
 *
 * Expects the caller to have already set length, delta, numChanges and
 * errorCode_ from `other`. If errorCode_ is a failure, or if the allocation
 * fails, the counters are zeroed and nothing is copied; an allocation failure
 * additionally sets errorCode_ to U_MEMORY_ALLOCATION_ERROR.
 *
 * @param other the object whose array is copied
 * @return *this
 */
Edits &Edits::copyArray(const Edits &other) {
    if (U_FAILURE(errorCode_)) {
        length = delta = numChanges = 0;
        return *this;
    }
    // Each unit is a uint16_t, hence two bytes per unit.
    const size_t byteCount = static_cast<size_t>(length) * 2;
    if (capacity < length) {
        uint16_t *grown = static_cast<uint16_t *>(uprv_malloc(byteCount));
        if (grown == nullptr) {
            errorCode_ = U_MEMORY_ALLOCATION_ERROR;
            length = delta = numChanges = 0;
            return *this;
        }
        // Swap in the new array only after the allocation succeeded.
        releaseArray();
        array = grown;
        capacity = length;
    }
    if (length > 0) {
        uprv_memcpy(array, other.array, byteCount);
    }
    return *this;
}
|
||||
|
||||
/**
 * Transfers the edit records of `src` into this object.
 *
 * Expects the caller to have already set length, delta, numChanges and
 * errorCode_ from `src`. If errorCode_ is a failure, the counters are zeroed
 * and `src` is left untouched. Otherwise the current array is released and:
 *  - when length fits into STACK_CAPACITY, the units are copied into
 *    stackArray and `src` keeps its contents;
 *  - otherwise this object takes over src.array, and `src` is switched back
 *    to its own stackArray and reset.
 *
 * @param src the object to move from
 * @return *this
 */
Edits &Edits::moveArray(Edits &src) U_NOEXCEPT {
    if (U_FAILURE(errorCode_)) {
        length = delta = numChanges = 0;
        return *this;
    }
    releaseArray();
    if (length <= STACK_CAPACITY) {
        // Small enough for the built-in buffer: copy rather than take over.
        array = stackArray;
        capacity = STACK_CAPACITY;
        if (length > 0) {
            uprv_memcpy(array, src.array, (size_t)length * 2);
        }
    } else {
        // Take over src's array and leave src empty on its own stack buffer.
        array = src.array;
        capacity = src.capacity;
        src.array = src.stackArray;
        src.capacity = STACK_CAPACITY;
        src.reset();
    }
    return *this;
}
|
||||
|
||||
/**
 * Copy assignment: makes this object hold the same counters, error state and
 * edit records as `other`.
 *
 * If `other` carries a failure code, this object takes over that code and
 * copyArray() zeroes the counters instead of copying any records.
 *
 * @param other the object to copy from
 * @return *this
 */
Edits &Edits::operator=(const Edits &other) {
    // Self-assignment is a no-op. Without this guard copyArray() would call
    // uprv_memcpy() with identical source and destination buffers, and an
    // object in a failure state would zero its own counters.
    if (this == &other) {
        return *this;
    }
    length = other.length;
    delta = other.delta;
    numChanges = other.numChanges;
    errorCode_ = other.errorCode_;
    // copyArray() relies on length and errorCode_ having been set above.
    return copyArray(other);
}
|
||||
|
||||
/**
 * Move assignment: takes over the counters, error state and edit records
 * of `src`. The record transfer itself is done by moveArray().
 *
 * @param src the object to move from
 * @return *this
 */
Edits &Edits::operator=(Edits &&src) U_NOEXCEPT {
    // moveArray() relies on these fields already mirroring src.
    errorCode_ = src.errorCode_;
    numChanges = src.numChanges;
    delta = src.delta;
    length = src.length;
    return moveArray(src);
}
|
||||
|
||||
/** Destructor: releases the array unless it is the built-in stackArray. */
Edits::~Edits() { releaseArray(); }
|
||||
|
||||
/**
 * Clears the recorded edits and the error state.
 * The array and its capacity are left as they are.
 */
void Edits::reset() U_NOEXCEPT {
    errorCode_ = U_ZERO_ERROR;
    numChanges = 0;
    delta = 0;
    length = 0;
}
|
||||
|
||||
void Edits::addUnchanged(int32_t unchangedLength) {
|
||||
if(U_FAILURE(errorCode) || unchangedLength == 0) { return; }
|
||||
if(U_FAILURE(errorCode_) || unchangedLength == 0) { return; }
|
||||
if(unchangedLength < 0) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
// Merge into previous unchanged-text record, if any.
|
||||
@ -72,7 +137,7 @@ void Edits::addUnchanged(int32_t unchangedLength) {
|
||||
}
|
||||
|
||||
void Edits::addReplace(int32_t oldLength, int32_t newLength) {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
if(U_FAILURE(errorCode_)) { return; }
|
||||
if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
|
||||
// Replacement of short oldLength text units by same-length new text.
|
||||
// Merge into previous short-replacement record, if any.
|
||||
@ -88,7 +153,7 @@ void Edits::addReplace(int32_t oldLength, int32_t newLength) {
|
||||
}
|
||||
|
||||
if(oldLength < 0 || newLength < 0) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
if (oldLength == 0 && newLength == 0) {
|
||||
@ -100,7 +165,7 @@ void Edits::addReplace(int32_t oldLength, int32_t newLength) {
|
||||
if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) ||
|
||||
(newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) {
|
||||
// Integer overflow or underflow.
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
delta += newDelta;
|
||||
@ -151,7 +216,7 @@ UBool Edits::growArray() {
|
||||
} else if (capacity == INT32_MAX) {
|
||||
// Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API
|
||||
// with a result-string-buffer overflow.
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return FALSE;
|
||||
} else if (capacity >= (INT32_MAX / 2)) {
|
||||
newCapacity = INT32_MAX;
|
||||
@ -160,18 +225,16 @@ UBool Edits::growArray() {
|
||||
}
|
||||
// Grow by at least 5 units so that a maximal change record will fit.
|
||||
if ((newCapacity - capacity) < 5) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2);
|
||||
if (newArray == NULL) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
errorCode_ = U_MEMORY_ALLOCATION_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
uprv_memcpy(newArray, array, (size_t)length * 2);
|
||||
if (array != stackArray) {
|
||||
uprv_free(array);
|
||||
}
|
||||
releaseArray();
|
||||
array = newArray;
|
||||
capacity = newCapacity;
|
||||
return TRUE;
|
||||
@ -179,11 +242,157 @@ UBool Edits::growArray() {
|
||||
|
||||
UBool Edits::copyErrorTo(UErrorCode &outErrorCode) {
|
||||
if (U_FAILURE(outErrorCode)) { return TRUE; }
|
||||
if (U_SUCCESS(errorCode)) { return FALSE; }
|
||||
outErrorCode = errorCode;
|
||||
if (U_SUCCESS(errorCode_)) { return FALSE; }
|
||||
outErrorCode = errorCode_;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// Appends to this object the composition of the edits ab (string a -> b)
// and bc (string b -> c), so that the added records describe a -> c.
// Returns *this immediately, adding nothing, if errorCode or this object
// already carries a failure. Sets errorCode to U_ILLEGAL_ARGUMENT_ERROR when
// one iterator runs out while the other still has intermediate-string (b)
// text left, unless this object has recorded a failure of its own, in which
// case that failure is reported instead.
Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) {
|
||||
if (copyErrorTo(errorCode)) { return *this; }
|
||||
// Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
|
||||
// Parallel iteration over both Edits.
|
||||
Iterator abIter = ab.getFineIterator();
|
||||
Iterator bcIter = bc.getFineIterator();
|
||||
UBool abHasNext = TRUE, bcHasNext = TRUE;
|
||||
// Copy iterator state into local variables, so that we can modify and subdivide spans.
|
||||
// ab old & new length, bc old & new length
|
||||
int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
|
||||
// When we have different-intermediate-length changes, we accumulate a larger change.
|
||||
int32_t pending_aLength = 0, pending_cLength = 0;
|
||||
for (;;) {
|
||||
// At this point, for each of the two iterators:
|
||||
// Either we are done with the locally cached current edit,
|
||||
// and its intermediate-string length has been reset,
|
||||
// or we will continue to work with a truncated remainder of this edit.
|
||||
//
|
||||
// If the current edit is done, and the iterator has not yet reached the end,
|
||||
// then we fetch the next edit. This is true for at least one of the iterators.
|
||||
//
|
||||
// Normally it does not matter whether we fetch from ab and then bc or vice versa.
|
||||
// However, the result is observably different when
|
||||
// ab deletions meet bc insertions at the same intermediate-string index.
|
||||
// Some users expect the bc insertions to come first, so we fetch from bc first.
|
||||
if (bc_bLength == 0) {
|
||||
if (bcHasNext && (bcHasNext = bcIter.next(errorCode))) {
|
||||
bc_bLength = bcIter.oldLength();
|
||||
cLength = bcIter.newLength();
|
||||
if (bc_bLength == 0) {
|
||||
// insertion
|
||||
if (ab_bLength == 0 || !abIter.hasChange()) {
|
||||
addReplace(pending_aLength, pending_cLength + cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
} else {
|
||||
pending_cLength += cLength;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// else see if the other iterator is done, too.
|
||||
}
|
||||
if (ab_bLength == 0) {
|
||||
if (abHasNext && (abHasNext = abIter.next(errorCode))) {
|
||||
aLength = abIter.oldLength();
|
||||
ab_bLength = abIter.newLength();
|
||||
if (ab_bLength == 0) {
|
||||
// deletion
|
||||
if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
|
||||
addReplace(pending_aLength + aLength, pending_cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
} else {
|
||||
pending_aLength += aLength;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
} else if (bc_bLength == 0) {
|
||||
// Both iterators are done at the same time:
|
||||
// The intermediate-string lengths match.
|
||||
break;
|
||||
} else {
|
||||
// The ab output string is shorter than the bc input string.
|
||||
if (!copyErrorTo(errorCode)) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
if (bc_bLength == 0) {
|
||||
// The bc input string is shorter than the ab output string.
|
||||
if (!copyErrorTo(errorCode)) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
// Done fetching: ab_bLength > 0 && bc_bLength > 0
|
||||
|
||||
// The current state has two parts:
|
||||
// - Past: We accumulate a longer ac edit in the "pending" variables.
|
||||
// - Current: We have copies of the current ab/bc edits in local variables.
|
||||
// At least one side is newly fetched.
|
||||
// One side might be a truncated remainder of an edit we fetched earlier.
|
||||
|
||||
if (!abIter.hasChange() && !bcIter.hasChange()) {
|
||||
// An unchanged span all the way from string a to string c.
|
||||
if (pending_aLength != 0 || pending_cLength != 0) {
|
||||
addReplace(pending_aLength, pending_cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
}
|
||||
int32_t unchangedLength = aLength <= cLength ? aLength : cLength;
|
||||
addUnchanged(unchangedLength);
|
||||
ab_bLength = aLength -= unchangedLength;
|
||||
bc_bLength = cLength -= unchangedLength;
|
||||
// At least one of the unchanged spans is now empty.
|
||||
continue;
|
||||
}
|
||||
if (!abIter.hasChange() && bcIter.hasChange()) {
|
||||
// Unchanged a->b but changed b->c.
|
||||
if (ab_bLength >= bc_bLength) {
|
||||
// Split the longer unchanged span into change + remainder.
|
||||
addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
aLength = ab_bLength -= bc_bLength;
|
||||
bc_bLength = 0;
|
||||
continue;
|
||||
}
|
||||
// Handle the shorter unchanged span below like a change.
|
||||
} else if (abIter.hasChange() && !bcIter.hasChange()) {
|
||||
// Changed a->b and then unchanged b->c.
|
||||
if (ab_bLength <= bc_bLength) {
|
||||
// Split the longer unchanged span into change + remainder.
|
||||
addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
cLength = bc_bLength -= ab_bLength;
|
||||
ab_bLength = 0;
|
||||
continue;
|
||||
}
|
||||
// Handle the shorter unchanged span below like a change.
|
||||
} else { // both abIter.hasChange() && bcIter.hasChange()
|
||||
if (ab_bLength == bc_bLength) {
|
||||
// Changes on both sides up to the same position. Emit & reset.
|
||||
addReplace(pending_aLength + aLength, pending_cLength + cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
ab_bLength = bc_bLength = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Accumulate the a->c change, reset the shorter side,
|
||||
// keep a remainder of the longer one.
|
||||
pending_aLength += aLength;
|
||||
pending_cLength += cLength;
|
||||
if (ab_bLength < bc_bLength) {
|
||||
bc_bLength -= ab_bLength;
|
||||
cLength = ab_bLength = 0;
|
||||
} else { // ab_bLength > bc_bLength
|
||||
ab_bLength -= bc_bLength;
|
||||
aLength = bc_bLength = 0;
|
||||
}
|
||||
}
|
||||
// Flush a change that was still being accumulated when both iterators ended.
if (pending_aLength != 0 || pending_cLength != 0) {
|
||||
addReplace(pending_aLength, pending_cLength);
|
||||
}
|
||||
// Report to the caller any failure that addReplace()/addUnchanged() stored in this object.
copyErrorTo(errorCode);
|
||||
return *this;
|
||||
}
|
||||
|
||||
Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
|
||||
array(a), index(0), length(len), remaining(0),
|
||||
onlyChanges_(oc), coarse(crs),
|
||||
@ -308,12 +517,7 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro
|
||||
spanStart = destIndex;
|
||||
spanLength = newLength_;
|
||||
}
|
||||
// If we are at the start or limit of an empty span, then we search from
|
||||
// the start of the string so that we always return
|
||||
// the first of several consecutive empty spans, for consistent results.
|
||||
// We do not currently track the properties of the previous span,
|
||||
// so for now we always reset if we are at the start of the current span.
|
||||
if (i <= spanStart) {
|
||||
if (i < spanStart) {
|
||||
// Reset the iterator to the start.
|
||||
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
|
||||
} else if (i < (spanStart + spanLength)) {
|
||||
@ -328,8 +532,8 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro
|
||||
spanStart = destIndex;
|
||||
spanLength = newLength_;
|
||||
}
|
||||
if (i == spanStart || i < (spanStart + spanLength)) {
|
||||
// The index is in the current span, or at an empty one.
|
||||
if (i < (spanStart + spanLength)) {
|
||||
// The index is in the current span.
|
||||
return 0;
|
||||
}
|
||||
if (remaining > 0) {
|
||||
|
@ -35,7 +35,7 @@ U_NAMESPACE_BEGIN
|
||||
|
||||
static icu::Locale* availableLocaleList = NULL;
|
||||
static int32_t availableLocaleListCount;
|
||||
static icu::UInitOnce gInitOnce = U_INITONCE_INITIALIZER;
|
||||
static icu::UInitOnce gInitOnceLocale = U_INITONCE_INITIALIZER;
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
@ -50,7 +50,7 @@ static UBool U_CALLCONV locale_available_cleanup(void)
|
||||
availableLocaleList = NULL;
|
||||
}
|
||||
availableLocaleListCount = 0;
|
||||
gInitOnce.reset();
|
||||
gInitOnceLocale.reset();
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
@ -81,7 +81,7 @@ void U_CALLCONV locale_available_init() {
|
||||
const Locale* U_EXPORT2
|
||||
Locale::getAvailableLocales(int32_t& count)
|
||||
{
|
||||
umtx_initOnce(gInitOnce, &locale_available_init);
|
||||
umtx_initOnce(gInitOnceLocale, &locale_available_init);
|
||||
count = availableLocaleListCount;
|
||||
return availableLocaleList;
|
||||
}
|
||||
|
@ -1069,7 +1069,7 @@ uprv_getWindowsTimeZone()
|
||||
U_CAPI const char* U_EXPORT2
|
||||
uprv_tzname(int n)
|
||||
{
|
||||
n; // Avoid unreferenced parameter warning.
|
||||
(void)n; // Avoid unreferenced parameter warning.
|
||||
const char *tzid = NULL;
|
||||
#if U_PLATFORM_USES_ONLY_WIN32_API
|
||||
#if U_PLATFORM_HAS_WINUWP_API > 0
|
||||
|
@ -72,15 +72,6 @@
|
||||
typedef size_t uintptr_t;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \def U_HAVE_MSVC_2003_OR_EARLIER
|
||||
* Flag for workaround of MSVC 2003 optimization bugs
|
||||
* @internal
|
||||
*/
|
||||
#if !defined(U_HAVE_MSVC_2003_OR_EARLIER) && defined(_MSC_VER) && (_MSC_VER < 1400)
|
||||
#define U_HAVE_MSVC_2003_OR_EARLIER
|
||||
#endif
|
||||
|
||||
/*===========================================================================*/
|
||||
/** @{ Information about POSIX support */
|
||||
/*===========================================================================*/
|
||||
@ -120,15 +111,15 @@ typedef size_t uintptr_t;
|
||||
/* Use the predefined value. */
|
||||
#elif U_PLATFORM == U_PF_ANDROID
|
||||
# define U_TIMEZONE timezone
|
||||
#elif defined(__UCLIBC__)
|
||||
// uClibc does not have __timezone or _timezone.
|
||||
#elif defined(_NEWLIB_VERSION)
|
||||
# define U_TIMEZONE _timezone
|
||||
#elif defined(__GLIBC__)
|
||||
// glibc
|
||||
# define U_TIMEZONE __timezone
|
||||
#elif U_PLATFORM_IS_LINUX_BASED
|
||||
# if defined(__UCLIBC__)
|
||||
/* uClibc does not have __timezone or _timezone. */
|
||||
# elif defined(_NEWLIB_VERSION)
|
||||
# define U_TIMEZONE _timezone
|
||||
# elif defined(__GLIBC__)
|
||||
/* glibc */
|
||||
# define U_TIMEZONE __timezone
|
||||
# endif
|
||||
// not defined
|
||||
#elif U_PLATFORM_USES_ONLY_WIN32_API
|
||||
# define U_TIMEZONE _timezone
|
||||
#elif U_PLATFORM == U_PF_BSD && !defined(__NetBSD__)
|
||||
@ -214,7 +205,7 @@ typedef size_t uintptr_t;
|
||||
/**
|
||||
* \def U_HAVE_STD_ATOMICS
|
||||
* Defines whether the standard C++11 <atomic> is available.
|
||||
* ICU will use this when avialable,
|
||||
* ICU will use this when available,
|
||||
* otherwise will fall back to compiler or platform specific alternatives.
|
||||
* @internal
|
||||
*/
|
||||
@ -239,7 +230,7 @@ typedef size_t uintptr_t;
|
||||
|
||||
/**
|
||||
* \def U_HAVE_CLANG_ATOMICS
|
||||
* Defines whether Clang c11 style built-in atomics are avaialable.
|
||||
* Defines whether Clang c11 style built-in atomics are available.
|
||||
* These are used in preference to gcc atomics when both are available.
|
||||
*/
|
||||
#ifdef U_HAVE_CLANG_ATOMICS
|
||||
@ -277,7 +268,7 @@ typedef size_t uintptr_t;
|
||||
|
||||
/**
|
||||
* Platform utilities isolates the platform dependencies of the
|
||||
* libarary. For each platform which this code is ported to, these
|
||||
* library. For each platform which this code is ported to, these
|
||||
* functions may have to be re-implemented.
|
||||
*/
|
||||
|
||||
@ -425,7 +416,7 @@ U_INTERNAL const char* U_EXPORT2 uprv_getDefaultCodepage(void);
|
||||
|
||||
/**
|
||||
* Please use uloc_getDefault() instead.
|
||||
* Return the default locale ID string by querying ths system, or
|
||||
* Return the default locale ID string by querying the system, or
|
||||
* zero if one cannot be found.
|
||||
* This function can call setlocale() on Unix platforms. Please read the
|
||||
* platform documentation on setlocale() before calling this function.
|
||||
|
@ -213,6 +213,8 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
||||
if (this == &that) {
|
||||
return *this;
|
||||
}
|
||||
BreakIterator::operator=(that);
|
||||
|
||||
reset(); // Delete break cache information
|
||||
fBreakType = that.fBreakType;
|
||||
if (fLanguageBreakEngines != NULL) {
|
||||
@ -311,16 +313,19 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// The base class BreakIterator carries no state that participates in equality,
|
||||
// and does not implement an equality function that would otherwise be
|
||||
// checked at this point.
|
||||
|
||||
const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
|
||||
|
||||
if (!utext_equals(fText, that2.fText)) {
|
||||
// The two break iterators are operating on different text,
|
||||
// or have a different interation position.
|
||||
// or have a different iteration position.
|
||||
// Note that fText's position is always the same as the break iterator's position.
|
||||
return FALSE;
|
||||
};
|
||||
|
||||
// TODO: need a check for when in a dictionary region at different offsets.
|
||||
|
||||
if (that2.fData == fData ||
|
||||
(fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
|
||||
// The two break iterators are using the same rules.
|
||||
|
@ -287,7 +287,7 @@ UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UCha
|
||||
|
||||
UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
|
||||
: LinearMatchNode(len, nextNode), s(units) {
|
||||
hash=hash*37+ustr_hashUCharsN(units, len);
|
||||
hash=hash*37u+ustr_hashUCharsN(units, len);
|
||||
}
|
||||
|
||||
UBool
|
||||
|
@ -250,7 +250,7 @@ public:
|
||||
virtual int32_t next(void) = 0;
|
||||
|
||||
/**
|
||||
* Return character index of the current interator position within the text.
|
||||
* Return character index of the current iterator position within the text.
|
||||
* @return The boundary most recently returned.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
@ -277,7 +277,7 @@ public:
|
||||
virtual int32_t preceding(int32_t offset) = 0;
|
||||
|
||||
/**
|
||||
* Return true if the specfied position is a boundary position.
|
||||
* Return true if the specified position is a boundary position.
|
||||
* As a side effect, the current position of the iterator is set
|
||||
* to the first boundary position at or following the specified offset.
|
||||
* @param offset the offset to check.
|
||||
@ -331,7 +331,7 @@ public:
|
||||
* @param fillInVec an array to be filled in with the status values.
|
||||
* @param capacity the length of the supplied vector. A length of zero causes
|
||||
* the function to return the number of status values, in the
|
||||
* normal way, without attemtping to store any values.
|
||||
* normal way, without attempting to store any values.
|
||||
* @param status receives error codes.
|
||||
* @return The number of rule status values from rules that determined
|
||||
* the most recent boundary returned by the break iterator.
|
||||
@ -469,7 +469,7 @@ public:
|
||||
static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
|
||||
|
||||
/**
|
||||
* Get name of the object for the desired Locale, in the desired langauge.
|
||||
* Get name of the object for the desired Locale, in the desired language.
|
||||
* @param objectLocale must be from getAvailableLocales.
|
||||
* @param displayLocale specifies the desired locale for output.
|
||||
* @param name the fill-in parameter of the return value
|
||||
@ -482,7 +482,7 @@ public:
|
||||
UnicodeString& name);
|
||||
|
||||
/**
|
||||
* Get name of the object for the desired Locale, in the langauge of the
|
||||
* Get name of the object for the desired Locale, in the language of the
|
||||
* default locale.
|
||||
* @param objectLocale must be from getMatchingLocales
|
||||
* @param name the fill-in parameter of the return value
|
||||
@ -629,10 +629,12 @@ protected:
|
||||
/** @internal */
|
||||
BreakIterator();
|
||||
/** @internal */
|
||||
BreakIterator (const BreakIterator &other) : UObject(other) {}
|
||||
BreakIterator (const BreakIterator &other);
|
||||
#ifndef U_HIDE_INTERNAL_API
|
||||
/** @internal */
|
||||
BreakIterator (const Locale& valid, const Locale& actual);
|
||||
BreakIterator (const Locale& valid, const Locale &actual);
|
||||
/** @internal. Assignment Operator, used by RuleBasedBreakIterator. */
|
||||
BreakIterator &operator = (const BreakIterator &other);
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
|
||||
private:
|
||||
@ -640,12 +642,6 @@ private:
|
||||
/** @internal */
|
||||
char actualLocale[ULOC_FULLNAME_CAPACITY];
|
||||
char validLocale[ULOC_FULLNAME_CAPACITY];
|
||||
|
||||
/**
|
||||
* The assignment operator has no real implementation.
|
||||
* It's provided to make the compiler happy. Do not call.
|
||||
*/
|
||||
BreakIterator& operator=(const BreakIterator&);
|
||||
};
|
||||
|
||||
#ifndef U_HIDE_DEPRECATED_API
|
||||
@ -661,5 +657,5 @@ U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
#endif // _BRKITER
|
||||
#endif // BRKITER_H
|
||||
//eof
|
||||
|
@ -95,45 +95,45 @@ private:
|
||||
return reinterpret_cast<char16_t *>(t);
|
||||
}
|
||||
|
||||
char16_t *p;
|
||||
char16_t *p_;
|
||||
#else
|
||||
union {
|
||||
char16_t *cp;
|
||||
uint16_t *up;
|
||||
wchar_t *wp;
|
||||
} u;
|
||||
} u_;
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef U_ALIASING_BARRIER
|
||||
|
||||
Char16Ptr::Char16Ptr(char16_t *p) : p(p) {}
|
||||
Char16Ptr::Char16Ptr(char16_t *p) : p_(p) {}
|
||||
#if !U_CHAR16_IS_TYPEDEF
|
||||
Char16Ptr::Char16Ptr(uint16_t *p) : p(cast(p)) {}
|
||||
Char16Ptr::Char16Ptr(uint16_t *p) : p_(cast(p)) {}
|
||||
#endif
|
||||
#if U_SIZEOF_WCHAR_T==2
|
||||
Char16Ptr::Char16Ptr(wchar_t *p) : p(cast(p)) {}
|
||||
Char16Ptr::Char16Ptr(wchar_t *p) : p_(cast(p)) {}
|
||||
#endif
|
||||
Char16Ptr::Char16Ptr(std::nullptr_t p) : p(p) {}
|
||||
Char16Ptr::Char16Ptr(std::nullptr_t p) : p_(p) {}
|
||||
Char16Ptr::~Char16Ptr() {
|
||||
U_ALIASING_BARRIER(p);
|
||||
U_ALIASING_BARRIER(p_);
|
||||
}
|
||||
|
||||
char16_t *Char16Ptr::get() const { return p; }
|
||||
char16_t *Char16Ptr::get() const { return p_; }
|
||||
|
||||
#else
|
||||
|
||||
Char16Ptr::Char16Ptr(char16_t *p) { u.cp = p; }
|
||||
Char16Ptr::Char16Ptr(char16_t *p) { u_.cp = p; }
|
||||
#if !U_CHAR16_IS_TYPEDEF
|
||||
Char16Ptr::Char16Ptr(uint16_t *p) { u.up = p; }
|
||||
Char16Ptr::Char16Ptr(uint16_t *p) { u_.up = p; }
|
||||
#endif
|
||||
#if U_SIZEOF_WCHAR_T==2
|
||||
Char16Ptr::Char16Ptr(wchar_t *p) { u.wp = p; }
|
||||
Char16Ptr::Char16Ptr(wchar_t *p) { u_.wp = p; }
|
||||
#endif
|
||||
Char16Ptr::Char16Ptr(std::nullptr_t p) { u.cp = p; }
|
||||
Char16Ptr::Char16Ptr(std::nullptr_t p) { u_.cp = p; }
|
||||
Char16Ptr::~Char16Ptr() {}
|
||||
|
||||
char16_t *Char16Ptr::get() const { return u.cp; }
|
||||
char16_t *Char16Ptr::get() const { return u_.cp; }
|
||||
|
||||
#endif
|
||||
|
||||
@ -203,45 +203,45 @@ private:
|
||||
return reinterpret_cast<const char16_t *>(t);
|
||||
}
|
||||
|
||||
const char16_t *p;
|
||||
const char16_t *p_;
|
||||
#else
|
||||
union {
|
||||
const char16_t *cp;
|
||||
const uint16_t *up;
|
||||
const wchar_t *wp;
|
||||
} u;
|
||||
} u_;
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef U_ALIASING_BARRIER
|
||||
|
||||
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) : p(p) {}
|
||||
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) : p_(p) {}
|
||||
#if !U_CHAR16_IS_TYPEDEF
|
||||
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) : p(cast(p)) {}
|
||||
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) : p_(cast(p)) {}
|
||||
#endif
|
||||
#if U_SIZEOF_WCHAR_T==2
|
||||
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) : p(cast(p)) {}
|
||||
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) : p_(cast(p)) {}
|
||||
#endif
|
||||
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) : p(p) {}
|
||||
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) : p_(p) {}
|
||||
ConstChar16Ptr::~ConstChar16Ptr() {
|
||||
U_ALIASING_BARRIER(p);
|
||||
U_ALIASING_BARRIER(p_);
|
||||
}
|
||||
|
||||
const char16_t *ConstChar16Ptr::get() const { return p; }
|
||||
const char16_t *ConstChar16Ptr::get() const { return p_; }
|
||||
|
||||
#else
|
||||
|
||||
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) { u.cp = p; }
|
||||
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) { u_.cp = p; }
|
||||
#if !U_CHAR16_IS_TYPEDEF
|
||||
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) { u.up = p; }
|
||||
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) { u_.up = p; }
|
||||
#endif
|
||||
#if U_SIZEOF_WCHAR_T==2
|
||||
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) { u.wp = p; }
|
||||
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) { u_.wp = p; }
|
||||
#endif
|
||||
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) { u.cp = p; }
|
||||
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) { u_.cp = p; }
|
||||
ConstChar16Ptr::~ConstChar16Ptr() {}
|
||||
|
||||
const char16_t *ConstChar16Ptr::get() const { return u.cp; }
|
||||
const char16_t *ConstChar16Ptr::get() const { return u_.cp; }
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -37,18 +37,60 @@ public:
|
||||
*/
|
||||
Edits() :
|
||||
array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),
|
||||
errorCode(U_ZERO_ERROR) {}
|
||||
errorCode_(U_ZERO_ERROR) {}
|
||||
/**
|
||||
* Copy constructor.
|
||||
* @param other source edits
|
||||
* @draft ICU 60
|
||||
*/
|
||||
Edits(const Edits &other) :
|
||||
array(stackArray), capacity(STACK_CAPACITY), length(other.length),
|
||||
delta(other.delta), numChanges(other.numChanges),
|
||||
errorCode_(other.errorCode_) {
|
||||
copyArray(other);
|
||||
}
|
||||
/**
|
||||
* Move constructor, might leave src empty.
|
||||
* This object will have the same contents that the source object had.
|
||||
* @param src source edits
|
||||
* @draft ICU 60
|
||||
*/
|
||||
Edits(Edits &&src) U_NOEXCEPT
        : array(stackArray),
          capacity(STACK_CAPACITY),
          length(src.length),
          delta(src.delta),
          numChanges(src.numChanges),
          errorCode_(src.errorCode_) {
    // Start out on the built-in stack buffer; moveArray() either copies the
    // records into it or takes over src's larger array.
    moveArray(src);
}
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
~Edits();
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
* @param other source edits
|
||||
* @return *this
|
||||
* @draft ICU 60
|
||||
*/
|
||||
Edits &operator=(const Edits &other);
|
||||
|
||||
/**
|
||||
* Move assignment operator, might leave src empty.
|
||||
* This object will have the same contents that the source object had.
|
||||
* The behavior is undefined if *this and src are the same object.
|
||||
* @param src source edits
|
||||
* @return *this
|
||||
* @draft ICU 60
|
||||
*/
|
||||
Edits &operator=(Edits &&src) U_NOEXCEPT;
|
||||
|
||||
/**
|
||||
* Resets the data but may not release memory.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
void reset();
|
||||
void reset() U_NOEXCEPT;
|
||||
|
||||
/**
|
||||
* Adds a record for an unchanged segment of text.
|
||||
@ -99,6 +141,15 @@ public:
|
||||
* @draft ICU 59
|
||||
*/
|
||||
struct U_COMMON_API Iterator U_FINAL : public UMemory {
|
||||
/**
|
||||
* Default constructor, empty iterator.
|
||||
* @draft ICU 60
|
||||
*/
|
||||
Iterator() :
|
||||
array(nullptr), index(0), length(0),
|
||||
remaining(0), onlyChanges_(FALSE), coarse(FALSE),
|
||||
changed(FALSE), oldLength_(0), newLength_(0),
|
||||
srcIndex(0), replIndex(0), destIndex(0) {}
|
||||
/**
|
||||
* Copy constructor.
|
||||
* @draft ICU 59
|
||||
@ -309,9 +360,39 @@ public:
|
||||
return Iterator(array, length, FALSE, FALSE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges the two input Edits and appends the result to this object.
|
||||
*
|
||||
* Consider two string transformations (for example, normalization and case mapping)
|
||||
* where each records Edits in addition to writing an output string.<br>
|
||||
* Edits ab reflect how substrings of input string a
|
||||
* map to substrings of intermediate string b.<br>
|
||||
* Edits bc reflect how substrings of intermediate string b
|
||||
* map to substrings of output string c.<br>
|
||||
* This function merges ab and bc such that the additional edits
|
||||
* recorded in this object reflect how substrings of input string a
|
||||
* map to substrings of output string c.
|
||||
*
|
||||
* If unrelated Edits are passed in where the output string of the first
|
||||
* has a different length than the input string of the second,
|
||||
* then a U_ILLEGAL_ARGUMENT_ERROR is reported.
|
||||
*
|
||||
* @param ab reflects how substrings of input string a
|
||||
* map to substrings of intermediate string b.
|
||||
* @param bc reflects how substrings of intermediate string b
|
||||
* map to substrings of output string c.
|
||||
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
|
||||
* or else the function returns immediately. Check for U_FAILURE()
|
||||
* on output or use with function chaining. (See User Guide for details.)
|
||||
* @return *this, with the merged edits appended
|
||||
* @draft ICU 60
|
||||
*/
|
||||
Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);
|
||||
|
||||
private:
|
||||
Edits(const Edits &) = delete;
|
||||
Edits &operator=(const Edits &) = delete;
|
||||
void releaseArray() U_NOEXCEPT;
|
||||
Edits ©Array(const Edits &other);
|
||||
Edits &moveArray(Edits &src) U_NOEXCEPT;
|
||||
|
||||
void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }
|
||||
int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
|
||||
@ -325,7 +406,7 @@ private:
|
||||
int32_t length;
|
||||
int32_t delta;
|
||||
int32_t numChanges;
|
||||
UErrorCode errorCode;
|
||||
UErrorCode errorCode_;
|
||||
uint16_t stackArray[STACK_CAPACITY];
|
||||
};
|
||||
|
||||
|
@ -256,7 +256,7 @@ protected:
|
||||
/** @internal */
|
||||
class FinalValueNode : public Node {
|
||||
public:
|
||||
FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
|
||||
FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual void write(StringTrieBuilder &builder);
|
||||
protected:
|
||||
@ -276,7 +276,7 @@ protected:
|
||||
void setValue(int32_t v) {
|
||||
hasValue=TRUE;
|
||||
value=v;
|
||||
hash=hash*37+v;
|
||||
hash=hash*37u+v;
|
||||
}
|
||||
protected:
|
||||
UBool hasValue;
|
||||
@ -290,7 +290,7 @@ protected:
|
||||
class IntermediateValueNode : public ValueNode {
|
||||
public:
|
||||
IntermediateValueNode(int32_t v, Node *nextNode)
|
||||
: ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
|
||||
: ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
|
||||
virtual void write(StringTrieBuilder &builder);
|
||||
@ -307,7 +307,7 @@ protected:
|
||||
class LinearMatchNode : public ValueNode {
|
||||
public:
|
||||
LinearMatchNode(int32_t len, Node *nextNode)
|
||||
: ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
|
||||
: ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
|
||||
length(len), next(nextNode) {}
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
|
||||
@ -342,7 +342,7 @@ protected:
|
||||
equal[length]=NULL;
|
||||
values[length]=value;
|
||||
++length;
|
||||
hash=(hash*37+c)*37+value;
|
||||
hash=(hash*37u+c)*37u+value;
|
||||
}
|
||||
// Adds a unit which leads to another match node.
|
||||
void add(int32_t c, Node *node) {
|
||||
@ -350,7 +350,7 @@ protected:
|
||||
equal[length]=node;
|
||||
values[length]=0;
|
||||
++length;
|
||||
hash=(hash*37+c)*37+hashCode(node);
|
||||
hash=(hash*37u+c)*37u+hashCode(node);
|
||||
}
|
||||
protected:
|
||||
Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
|
||||
@ -365,8 +365,8 @@ protected:
|
||||
class SplitBranchNode : public BranchNode {
|
||||
public:
|
||||
SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
|
||||
: BranchNode(((0x555555*37+middleUnit)*37+
|
||||
hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
|
||||
: BranchNode(((0x555555u*37u+middleUnit)*37u+
|
||||
hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
|
||||
unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
|
||||
@ -382,7 +382,7 @@ protected:
|
||||
class BranchHeadNode : public ValueNode {
|
||||
public:
|
||||
BranchHeadNode(int32_t len, Node *subNode)
|
||||
: ValueNode((0x666666*37+len)*37+hashCode(subNode)),
|
||||
: ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
|
||||
length(len), next(subNode) {}
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
|
||||
|
@ -987,7 +987,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
|
||||
|
||||
UProperty p;
|
||||
int32_t v;
|
||||
UBool mustNotBeEmpty = FALSE, invert = FALSE;
|
||||
UBool invert = FALSE;
|
||||
|
||||
if (value.length() > 0) {
|
||||
p = u_getPropertyEnum(pname.data());
|
||||
@ -1009,14 +1009,15 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
|
||||
p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
|
||||
char* end;
|
||||
double value = uprv_strtod(vname.data(), &end);
|
||||
v = (int32_t) value;
|
||||
if (v != value || v < 0 || *end != 0) {
|
||||
// non-integral or negative value, or trailing junk
|
||||
// Anything between 0 and 255 is valid even if unused.
|
||||
// Cast double->int only after range check.
|
||||
// We catch NaN here because comparing it with both 0 and 255 will be false
|
||||
// (as are all comparisons with NaN).
|
||||
if (*end != 0 || !(0 <= value && value <= 255) ||
|
||||
(v = (int32_t)value) != value) {
|
||||
// non-integral value or outside 0..255, or trailing junk
|
||||
FAIL(ec);
|
||||
}
|
||||
// If the resultant set is empty then the numeric value
|
||||
// was invalid.
|
||||
mustNotBeEmpty = TRUE;
|
||||
} else {
|
||||
FAIL(ec);
|
||||
}
|
||||
@ -1115,12 +1116,6 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
|
||||
complement();
|
||||
}
|
||||
|
||||
if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
|
||||
// mustNotBeEmpty is set to true if an empty set indicates
|
||||
// invalid input.
|
||||
ec = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
|
||||
if (isBogus() && U_SUCCESS(ec)) {
|
||||
// We likely ran out of memory. AHHH!
|
||||
ec = U_MEMORY_ALLOCATION_ERROR;
|
||||
|
@ -980,11 +980,4 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
/**
|
||||
* Work around MSVC 2003 optimization bugs.
|
||||
*/
|
||||
#if defined (U_HAVE_MSVC_2003_OR_EARLIER)
|
||||
#pragma optimize("", off)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -496,8 +496,8 @@ $CAN_CM $CM* $QU; # QU x .
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CM $CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
|
||||
# LB21a Don't break after Hebrew + Hyphen.
|
||||
([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
|
@ -427,6 +427,7 @@ tzdbNames{
|
||||
ss{"LINT"}
|
||||
}
|
||||
"meta:Lord_Howe"{
|
||||
sd{"LHDT"}
|
||||
ss{"LHST"}
|
||||
}
|
||||
"meta:Macau"{
|
||||
|
@ -226,7 +226,7 @@ AffixPattern::append(const AffixPattern &other) {
|
||||
addLiteral(literal.getBuffer(), 0, literal.length());
|
||||
break;
|
||||
case kCurrency:
|
||||
addCurrency(iter.getTokenLength());
|
||||
addCurrency(static_cast<uint8_t>(iter.getTokenLength()));
|
||||
break;
|
||||
default:
|
||||
add(iter.getTokenType());
|
||||
@ -481,7 +481,7 @@ AffixPattern::parseUserAffixString(
|
||||
break;
|
||||
case 0xA4:
|
||||
appender.flush();
|
||||
appendTo.add(kCurrency, tokenSize);
|
||||
appendTo.add(kCurrency, static_cast<uint8_t>(tokenSize));
|
||||
break;
|
||||
default:
|
||||
appender.append(token);
|
||||
|
@ -28,6 +28,21 @@ class SkippedState;
|
||||
class UCharsTrie;
|
||||
class UVector32;
|
||||
|
||||
/* Large enough for CEs of most short strings. */
|
||||
#define CEBUFFER_INITIAL_CAPACITY 40
|
||||
|
||||
// Export an explicit template instantiation of the MaybeStackArray that
|
||||
// is used as a data member of CEBuffer.
|
||||
//
|
||||
// MSVC requires this, even though it should not be necessary.
|
||||
// No direct access to the MaybeStackArray leaks out of the i18n library.
|
||||
//
|
||||
// See digitlst.h, pluralaffix.h, datefmt.h, and others for similar examples.
|
||||
//
|
||||
#if defined (_MSC_VER)
|
||||
template class U_I18N_API MaybeStackArray<int64_t, CEBUFFER_INITIAL_CAPACITY>;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Collation element iterator and abstract character iterator.
|
||||
*
|
||||
@ -36,10 +51,10 @@ class UVector32;
|
||||
*/
|
||||
class U_I18N_API CollationIterator : public UObject {
|
||||
private:
|
||||
class CEBuffer {
|
||||
class U_I18N_API CEBuffer {
|
||||
private:
|
||||
/** Large enough for CEs of most short strings. */
|
||||
static const int32_t INITIAL_CAPACITY = 40;
|
||||
static const int32_t INITIAL_CAPACITY = CEBUFFER_INITIAL_CAPACITY;
|
||||
public:
|
||||
CEBuffer() : length(0) {}
|
||||
~CEBuffer();
|
||||
|
@ -97,9 +97,7 @@ static const char *gNumberElementKeys[DecimalFormatSymbols::kFormatSymbolCount]
|
||||
// Initializes this with the decimal format symbols in the default locale.
|
||||
|
||||
DecimalFormatSymbols::DecimalFormatSymbols(UErrorCode& status)
|
||||
: UObject(),
|
||||
locale()
|
||||
{
|
||||
: UObject(), locale() {
|
||||
initialize(locale, status, TRUE);
|
||||
}
|
||||
|
||||
@ -107,16 +105,17 @@ DecimalFormatSymbols::DecimalFormatSymbols(UErrorCode& status)
|
||||
// Initializes this with the decimal format symbols in the desired locale.
|
||||
|
||||
DecimalFormatSymbols::DecimalFormatSymbols(const Locale& loc, UErrorCode& status)
|
||||
: UObject(),
|
||||
locale(loc)
|
||||
{
|
||||
: UObject(), locale(loc) {
|
||||
initialize(locale, status);
|
||||
}
|
||||
|
||||
DecimalFormatSymbols::DecimalFormatSymbols(const Locale& loc, const NumberingSystem& ns, UErrorCode& status)
|
||||
: UObject(), locale(loc) {
|
||||
initialize(locale, status, FALSE, &ns);
|
||||
}
|
||||
|
||||
DecimalFormatSymbols::DecimalFormatSymbols()
|
||||
: UObject(),
|
||||
locale(Locale::getRoot()),
|
||||
currPattern(NULL) {
|
||||
: UObject(), locale(Locale::getRoot()), currPattern(NULL) {
|
||||
*validLocale = *actualLocale = 0;
|
||||
initialize();
|
||||
}
|
||||
@ -342,7 +341,8 @@ CurrencySpacingSink::~CurrencySpacingSink() {}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
DecimalFormatSymbols::initialize(const Locale& loc, UErrorCode& status, UBool useLastResortData)
|
||||
DecimalFormatSymbols::initialize(const Locale& loc, UErrorCode& status,
|
||||
UBool useLastResortData, const NumberingSystem* ns)
|
||||
{
|
||||
if (U_FAILURE(status)) { return; }
|
||||
*validLocale = *actualLocale = 0;
|
||||
@ -355,7 +355,13 @@ DecimalFormatSymbols::initialize(const Locale& loc, UErrorCode& status, UBool us
|
||||
// Next get the numbering system for this locale and set zero digit
|
||||
// and the digit string based on the numbering system for the locale
|
||||
//
|
||||
LocalPointer<NumberingSystem> ns(NumberingSystem::createInstance(loc, status));
|
||||
LocalPointer<NumberingSystem> nsLocal;
|
||||
if (ns == nullptr) {
|
||||
// Use the numbering system according to the locale.
|
||||
// Save it into a LocalPointer so it gets cleaned up.
|
||||
nsLocal.adoptInstead(NumberingSystem::createInstance(loc, status));
|
||||
ns = nsLocal.getAlias();
|
||||
}
|
||||
const char *nsName;
|
||||
if (U_SUCCESS(status) && ns->getRadix() == 10 && !ns->isAlgorithmic()) {
|
||||
nsName = ns->getName();
|
||||
|
@ -111,7 +111,7 @@ public:
|
||||
return newRuleValue * divisor;
|
||||
}
|
||||
|
||||
virtual double calcUpperBound(double /*oldUpperBound*/) const { return divisor; }
|
||||
virtual double calcUpperBound(double /*oldUpperBound*/) const { return static_cast<double>(divisor); }
|
||||
|
||||
virtual UChar tokenChar() const { return (UChar)0x003c; } // '<'
|
||||
|
||||
@ -148,7 +148,7 @@ public:
|
||||
virtual void doSubstitution(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const;
|
||||
|
||||
virtual int64_t transformNumber(int64_t number) const { return number % divisor; }
|
||||
virtual double transformNumber(double number) const { return uprv_fmod(number, divisor); }
|
||||
virtual double transformNumber(double number) const { return uprv_fmod(number, static_cast<double>(divisor)); }
|
||||
|
||||
virtual UBool doParse(const UnicodeString& text,
|
||||
ParsePosition& parsePosition,
|
||||
@ -158,10 +158,10 @@ public:
|
||||
Formattable& result) const;
|
||||
|
||||
virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const {
|
||||
return oldRuleValue - uprv_fmod(oldRuleValue, divisor) + newRuleValue;
|
||||
return oldRuleValue - uprv_fmod(oldRuleValue, static_cast<double>(divisor)) + newRuleValue;
|
||||
}
|
||||
|
||||
virtual double calcUpperBound(double /*oldUpperBound*/) const { return divisor; }
|
||||
virtual double calcUpperBound(double /*oldUpperBound*/) const { return static_cast<double>(divisor); }
|
||||
|
||||
virtual UBool isModulusSubstitution() const { return TRUE; }
|
||||
|
||||
|
@ -1509,6 +1509,24 @@ NumberFormat::makeInstance(const Locale& desiredLocale,
|
||||
return f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the rounding mode.
|
||||
* @return A rounding mode
|
||||
*/
|
||||
NumberFormat::ERoundingMode NumberFormat::getRoundingMode() const {
|
||||
// Default value. ICU4J throws an exception and we can't change this API.
|
||||
return NumberFormat::ERoundingMode::kRoundUnnecessary;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the rounding mode. This has no effect unless the rounding
|
||||
* increment is greater than zero.
|
||||
* @param roundingMode A rounding mode
|
||||
*/
|
||||
void NumberFormat::setRoundingMode(NumberFormat::ERoundingMode /*roundingMode*/) {
|
||||
// No-op ICU4J throws an exception, and we can't change this API.
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_FORMATTING */
|
||||
|
@ -239,10 +239,10 @@ FixedPrecision::initVisibleDigits(
|
||||
}
|
||||
}
|
||||
// Try fast path
|
||||
if (n >= 0 && initVisibleDigits(scaled, -n, digits, status)) {
|
||||
if (n >= 0 && initVisibleDigits(static_cast<int64_t>(scaled), -n, digits, status)) {
|
||||
digits.fAbsDoubleValue = fabs(value);
|
||||
digits.fAbsDoubleValueSet = U_SUCCESS(status) && !digits.isOverMaxDigits();
|
||||
// Adjust for negative 0 becuase when we cast to an int64,
|
||||
// Adjust for negative 0 because when we cast to an int64,
|
||||
// negative 0 becomes positive 0.
|
||||
if (scaled == 0.0 && uprv_isNegative(scaled)) {
|
||||
digits.setNegative();
|
||||
|
@ -687,6 +687,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
|
||||
, decimalFormatSymbols(NULL)
|
||||
, defaultInfinityRule(NULL)
|
||||
, defaultNaNRule(NULL)
|
||||
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
|
||||
, lenient(FALSE)
|
||||
, lenientParseRules(NULL)
|
||||
, localizations(NULL)
|
||||
@ -711,6 +712,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
|
||||
, decimalFormatSymbols(NULL)
|
||||
, defaultInfinityRule(NULL)
|
||||
, defaultNaNRule(NULL)
|
||||
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
|
||||
, lenient(FALSE)
|
||||
, lenientParseRules(NULL)
|
||||
, localizations(NULL)
|
||||
@ -735,6 +737,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
|
||||
, decimalFormatSymbols(NULL)
|
||||
, defaultInfinityRule(NULL)
|
||||
, defaultNaNRule(NULL)
|
||||
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
|
||||
, lenient(FALSE)
|
||||
, lenientParseRules(NULL)
|
||||
, localizations(NULL)
|
||||
@ -758,6 +761,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
|
||||
, decimalFormatSymbols(NULL)
|
||||
, defaultInfinityRule(NULL)
|
||||
, defaultNaNRule(NULL)
|
||||
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
|
||||
, lenient(FALSE)
|
||||
, lenientParseRules(NULL)
|
||||
, localizations(NULL)
|
||||
@ -782,6 +786,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
|
||||
, decimalFormatSymbols(NULL)
|
||||
, defaultInfinityRule(NULL)
|
||||
, defaultNaNRule(NULL)
|
||||
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
|
||||
, lenient(FALSE)
|
||||
, lenientParseRules(NULL)
|
||||
, localizations(NULL)
|
||||
@ -803,6 +808,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(URBNFRuleSetTag tag, const Locale&
|
||||
, decimalFormatSymbols(NULL)
|
||||
, defaultInfinityRule(NULL)
|
||||
, defaultNaNRule(NULL)
|
||||
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
|
||||
, lenient(FALSE)
|
||||
, lenientParseRules(NULL)
|
||||
, localizations(NULL)
|
||||
@ -869,6 +875,7 @@ RuleBasedNumberFormat::RuleBasedNumberFormat(const RuleBasedNumberFormat& rhs)
|
||||
, decimalFormatSymbols(NULL)
|
||||
, defaultInfinityRule(NULL)
|
||||
, defaultNaNRule(NULL)
|
||||
, roundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
|
||||
, lenient(FALSE)
|
||||
, lenientParseRules(NULL)
|
||||
, localizations(NULL)
|
||||
@ -898,6 +905,7 @@ RuleBasedNumberFormat::operator=(const RuleBasedNumberFormat& rhs)
|
||||
setDecimalFormatSymbols(*rhs.getDecimalFormatSymbols());
|
||||
init(rhs.originalDescription, rhs.localizations ? rhs.localizations->ref() : NULL, perror, status);
|
||||
setDefaultRuleSet(rhs.getDefaultRuleSetName(), status);
|
||||
setRoundingMode(rhs.getRoundingMode());
|
||||
|
||||
capitalizationInfoSet = rhs.capitalizationInfoSet;
|
||||
capitalizationForUIListMenu = rhs.capitalizationForUIListMenu;
|
||||
@ -1195,7 +1203,7 @@ RuleBasedNumberFormat::format(double number,
|
||||
int32_t startPos = toAppendTo.length();
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
if (defaultRuleSet) {
|
||||
defaultRuleSet->format(number, toAppendTo, toAppendTo.length(), 0, status);
|
||||
format(number, *defaultRuleSet, toAppendTo, status);
|
||||
}
|
||||
return adjustForCapitalizationContext(startPos, toAppendTo, status);
|
||||
}
|
||||
@ -1248,15 +1256,31 @@ RuleBasedNumberFormat::format(double number,
|
||||
} else {
|
||||
NFRuleSet *rs = findRuleSet(ruleSetName, status);
|
||||
if (rs) {
|
||||
int32_t startPos = toAppendTo.length();
|
||||
rs->format(number, toAppendTo, toAppendTo.length(), 0, status);
|
||||
adjustForCapitalizationContext(startPos, toAppendTo, status);
|
||||
format(number, *rs, toAppendTo, status);
|
||||
}
|
||||
}
|
||||
}
|
||||
return toAppendTo;
|
||||
}
|
||||
|
||||
void
|
||||
RuleBasedNumberFormat::format(double number,
|
||||
NFRuleSet& rs,
|
||||
UnicodeString& toAppendTo,
|
||||
UErrorCode& status) const
|
||||
{
|
||||
int32_t startPos = toAppendTo.length();
|
||||
if (getRoundingMode() != DecimalFormat::ERoundingMode::kRoundUnnecessary && !uprv_isNaN(number) && !uprv_isInfinite(number)) {
|
||||
DigitList digitList;
|
||||
digitList.set(number);
|
||||
digitList.setRoundingMode(getRoundingMode());
|
||||
digitList.roundFixedPoint(getMaximumFractionDigits());
|
||||
number = digitList.getDouble();
|
||||
}
|
||||
rs.format(number, toAppendTo, toAppendTo.length(), 0, status);
|
||||
adjustForCapitalizationContext(startPos, toAppendTo, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Bottleneck through which all the public format() methods
|
||||
* that take a long pass. By the time we get here, we know
|
||||
@ -1959,6 +1983,23 @@ RuleBasedNumberFormat::createPluralFormat(UPluralType pluralType,
|
||||
return new PluralFormat(locale, pluralType, pattern, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the rounding mode.
|
||||
* @return A rounding mode
|
||||
*/
|
||||
DecimalFormat::ERoundingMode RuleBasedNumberFormat::getRoundingMode() const {
|
||||
return roundingMode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the rounding mode. This has no effect unless the rounding
|
||||
* increment is greater than zero.
|
||||
* @param roundingMode A rounding mode
|
||||
*/
|
||||
void RuleBasedNumberFormat::setRoundingMode(DecimalFormat::ERoundingMode roundingMode) {
|
||||
this->roundingMode = roundingMode;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* U_HAVE_RBNF */
|
||||
|
@ -2430,7 +2430,7 @@ TimeZoneFormat::parseOffsetPattern(const UnicodeString& pattern, OffsetFields re
|
||||
isPrevQuote = TRUE;
|
||||
if (itemType != GMTOffsetField::TEXT) {
|
||||
if (GMTOffsetField::isValid(itemType, itemLength)) {
|
||||
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, (uint8_t)itemLength, status);
|
||||
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast<uint8_t>(itemLength), status);
|
||||
result->addElement(fld, status);
|
||||
if (U_FAILURE(status)) {
|
||||
break;
|
||||
@ -2465,7 +2465,7 @@ TimeZoneFormat::parseOffsetPattern(const UnicodeString& pattern, OffsetFields re
|
||||
}
|
||||
} else {
|
||||
if (GMTOffsetField::isValid(itemType, itemLength)) {
|
||||
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, itemLength, status);
|
||||
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast<uint8_t>(itemLength), status);
|
||||
result->addElement(fld, status);
|
||||
if (U_FAILURE(status)) {
|
||||
break;
|
||||
@ -2483,7 +2483,7 @@ TimeZoneFormat::parseOffsetPattern(const UnicodeString& pattern, OffsetFields re
|
||||
// a string literal
|
||||
if (itemType != GMTOffsetField::TEXT) {
|
||||
if (GMTOffsetField::isValid(itemType, itemLength)) {
|
||||
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, itemLength, status);
|
||||
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast<uint8_t>(itemLength), status);
|
||||
result->addElement(fld, status);
|
||||
if (U_FAILURE(status)) {
|
||||
break;
|
||||
@ -2508,7 +2508,7 @@ TimeZoneFormat::parseOffsetPattern(const UnicodeString& pattern, OffsetFields re
|
||||
}
|
||||
} else {
|
||||
if (GMTOffsetField::isValid(itemType, itemLength)) {
|
||||
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, itemLength, status);
|
||||
GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast<uint8_t>(itemLength), status);
|
||||
result->addElement(fld, status);
|
||||
} else {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -62,7 +62,7 @@ namespace {
|
||||
static const UChar *rootRules = NULL;
|
||||
static int32_t rootRulesLength = 0;
|
||||
static UResourceBundle *rootBundle = NULL;
|
||||
static UInitOnce gInitOnce = U_INITONCE_INITIALIZER;
|
||||
static UInitOnce gInitOnceUcolRes = U_INITONCE_INITIALIZER;
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -74,7 +74,7 @@ ucol_res_cleanup() {
|
||||
rootRulesLength = 0;
|
||||
ures_close(rootBundle);
|
||||
rootBundle = NULL;
|
||||
gInitOnce.reset();
|
||||
gInitOnceUcolRes.reset();
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@ -97,7 +97,7 @@ U_CDECL_END
|
||||
void
|
||||
CollationLoader::appendRootRules(UnicodeString &s) {
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
umtx_initOnce(gInitOnce, CollationLoader::loadRootRules, errorCode);
|
||||
umtx_initOnce(gInitOnceUcolRes, CollationLoader::loadRootRules, errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
s.append(rootRules, rootRulesLength);
|
||||
}
|
||||
|
@ -34,6 +34,7 @@
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/numsys.h"
|
||||
#include "unicode/unum.h"
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
@ -184,6 +185,24 @@ public:
|
||||
*/
|
||||
DecimalFormatSymbols(const Locale& locale, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Creates a DecimalFormatSymbols instance for the given locale with digits and symbols
|
||||
* corresponding to the given NumberingSystem.
|
||||
*
|
||||
* This constructor behaves equivalently to the normal constructor called with a locale having a
|
||||
* "numbers=xxxx" keyword specifying the numbering system by name.
|
||||
*
|
||||
* In this constructor, the NumberingSystem argument will be used even if the locale has its own
|
||||
* "numbers=xxxx" keyword.
|
||||
*
|
||||
* @param locale The locale to get symbols for.
|
||||
* @param ns The numbering system.
|
||||
* @param status Input/output parameter, set to success or
|
||||
* failure code upon return.
|
||||
* @draft ICU 60
|
||||
*/
|
||||
DecimalFormatSymbols(const Locale& locale, const NumberingSystem& ns, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Create a DecimalFormatSymbols object for the default locale.
|
||||
* This constructor will not fail. If the resource file data is
|
||||
@ -346,8 +365,11 @@ private:
|
||||
* @param success Input/output parameter, set to success or
|
||||
* failure code upon return.
|
||||
* @param useLastResortData determine if use last resort data
|
||||
* @param ns The NumberingSystem to use; otherwise, fall
|
||||
* back to the locale.
|
||||
*/
|
||||
void initialize(const Locale& locale, UErrorCode& success, UBool useLastResortData = FALSE);
|
||||
void initialize(const Locale& locale, UErrorCode& success,
|
||||
UBool useLastResortData = FALSE, const NumberingSystem* ns = nullptr);
|
||||
|
||||
/**
|
||||
* Initialize the symbols with default values.
|
||||
|
@ -668,28 +668,6 @@ template class U_I18N_API EnumSet<UNumberFormatAttribute,
|
||||
*/
|
||||
class U_I18N_API DecimalFormat: public NumberFormat {
|
||||
public:
|
||||
/**
|
||||
* Rounding mode.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
enum ERoundingMode {
|
||||
kRoundCeiling, /**< Round towards positive infinity */
|
||||
kRoundFloor, /**< Round towards negative infinity */
|
||||
kRoundDown, /**< Round towards zero */
|
||||
kRoundUp, /**< Round away from zero */
|
||||
kRoundHalfEven, /**< Round towards the nearest integer, or
|
||||
towards the nearest even integer if equidistant */
|
||||
kRoundHalfDown, /**< Round towards the nearest integer, or
|
||||
towards zero if equidistant */
|
||||
kRoundHalfUp, /**< Round towards the nearest integer, or
|
||||
away from zero if equidistant */
|
||||
/**
|
||||
* Return U_FORMAT_INEXACT_ERROR if number does not format exactly.
|
||||
* @stable ICU 4.8
|
||||
*/
|
||||
kRoundUnnecessary
|
||||
};
|
||||
|
||||
/**
|
||||
* Pad position.
|
||||
* @stable ICU 2.4
|
||||
|
@ -168,6 +168,28 @@ class StringEnumeration;
|
||||
*/
|
||||
class U_I18N_API NumberFormat : public Format {
|
||||
public:
|
||||
/**
|
||||
* Rounding mode.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
enum ERoundingMode {
|
||||
kRoundCeiling, /**< Round towards positive infinity */
|
||||
kRoundFloor, /**< Round towards negative infinity */
|
||||
kRoundDown, /**< Round towards zero */
|
||||
kRoundUp, /**< Round away from zero */
|
||||
kRoundHalfEven, /**< Round towards the nearest integer, or
|
||||
towards the nearest even integer if equidistant */
|
||||
kRoundHalfDown, /**< Round towards the nearest integer, or
|
||||
towards zero if equidistant */
|
||||
kRoundHalfUp, /**< Round towards the nearest integer, or
|
||||
away from zero if equidistant */
|
||||
/**
|
||||
* Return U_FORMAT_INEXACT_ERROR if number does not format exactly.
|
||||
* @stable ICU 4.8
|
||||
*/
|
||||
kRoundUnnecessary
|
||||
};
|
||||
|
||||
/**
|
||||
* Alignment Field constants used to construct a FieldPosition object.
|
||||
* Signifies that the position of the integer part or fraction part of
|
||||
@ -965,6 +987,21 @@ public:
|
||||
*/
|
||||
virtual UDisplayContext getContext(UDisplayContextType type, UErrorCode& status) const;
|
||||
|
||||
/**
|
||||
* Get the rounding mode. This will always return NumberFormat::ERoundingMode::kRoundUnnecessary
|
||||
* if the subclass does not support rounding.
|
||||
* @return A rounding mode
|
||||
* @draft ICU 60
|
||||
*/
|
||||
virtual ERoundingMode getRoundingMode(void) const;
|
||||
|
||||
/**
|
||||
* Set the rounding mode. If a subclass does not support rounding, this will do nothing.
|
||||
* @param roundingMode A rounding mode
|
||||
* @draft ICU 60
|
||||
*/
|
||||
virtual void setRoundingMode(ERoundingMode roundingMode);
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
|
@ -30,6 +30,7 @@
|
||||
#define U_HAVE_RBNF 1
|
||||
|
||||
#include "unicode/dcfmtsym.h"
|
||||
#include "unicode/decimfmt.h"
|
||||
#include "unicode/fmtable.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/numfmt.h"
|
||||
@ -1010,6 +1011,20 @@ public:
|
||||
*/
|
||||
virtual void setContext(UDisplayContext value, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Get the rounding mode.
|
||||
* @return A rounding mode
|
||||
* @draft ICU 60
|
||||
*/
|
||||
virtual DecimalFormat::ERoundingMode getRoundingMode(void) const;
|
||||
|
||||
/**
|
||||
* Set the rounding mode.
|
||||
* @param roundingMode A rounding mode
|
||||
* @draft ICU 60
|
||||
*/
|
||||
virtual void setRoundingMode(DecimalFormat::ERoundingMode roundingMode);
|
||||
|
||||
public:
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
@ -1059,7 +1074,6 @@ private:
|
||||
void dispose();
|
||||
void stripWhitespace(UnicodeString& src);
|
||||
void initDefaultRuleSet();
|
||||
void format(double number, NFRuleSet& ruleSet);
|
||||
NFRuleSet* findRuleSet(const UnicodeString& name, UErrorCode& status) const;
|
||||
|
||||
/* friend access */
|
||||
@ -1079,6 +1093,7 @@ private:
|
||||
PluralFormat *createPluralFormat(UPluralType pluralType, const UnicodeString &pattern, UErrorCode& status) const;
|
||||
UnicodeString& adjustForCapitalizationContext(int32_t startPos, UnicodeString& currentResult, UErrorCode& status) const;
|
||||
UnicodeString& format(int64_t number, NFRuleSet *ruleSet, UnicodeString& toAppendTo, UErrorCode& status) const;
|
||||
void format(double number, NFRuleSet& rs, UnicodeString& toAppendTo, UErrorCode& status) const;
|
||||
|
||||
private:
|
||||
NFRuleSet **ruleSets;
|
||||
@ -1090,6 +1105,7 @@ private:
|
||||
DecimalFormatSymbols* decimalFormatSymbols;
|
||||
NFRule *defaultInfinityRule;
|
||||
NFRule *defaultNaNRule;
|
||||
DecimalFormat::ERoundingMode roundingMode;
|
||||
UBool lenient;
|
||||
UnicodeString* lenientParseRules;
|
||||
LocalizationInfo* localizations;
|
||||
|
@ -507,20 +507,43 @@ U_CAPI int32_t U_EXPORT2
|
||||
unum_getAttribute(const UNumberFormat* fmt,
|
||||
UNumberFormatAttribute attr)
|
||||
{
|
||||
const NumberFormat* nf = reinterpret_cast<const NumberFormat*>(fmt);
|
||||
if ( attr == UNUM_LENIENT_PARSE ) {
|
||||
// Supported for all subclasses
|
||||
return nf->isLenient();
|
||||
}
|
||||
const NumberFormat* nf = reinterpret_cast<const NumberFormat*>(fmt);
|
||||
if (attr == UNUM_LENIENT_PARSE) {
|
||||
// Supported for all subclasses
|
||||
return nf->isLenient();
|
||||
}
|
||||
else if (attr == UNUM_MAX_INTEGER_DIGITS) {
|
||||
return nf->getMaximumIntegerDigits();
|
||||
}
|
||||
else if (attr == UNUM_MIN_INTEGER_DIGITS) {
|
||||
return nf->getMinimumIntegerDigits();
|
||||
}
|
||||
else if (attr == UNUM_INTEGER_DIGITS) {
|
||||
// TODO: what should this return?
|
||||
return nf->getMinimumIntegerDigits();
|
||||
}
|
||||
else if (attr == UNUM_MAX_FRACTION_DIGITS) {
|
||||
return nf->getMaximumFractionDigits();
|
||||
}
|
||||
else if (attr == UNUM_MIN_FRACTION_DIGITS) {
|
||||
return nf->getMinimumFractionDigits();
|
||||
}
|
||||
else if (attr == UNUM_FRACTION_DIGITS) {
|
||||
// TODO: what should this return?
|
||||
return nf->getMinimumFractionDigits();
|
||||
}
|
||||
else if (attr == UNUM_ROUNDING_MODE) {
|
||||
return nf->getRoundingMode();
|
||||
}
|
||||
|
||||
// The remaining attributea are only supported for DecimalFormat
|
||||
const DecimalFormat* df = dynamic_cast<const DecimalFormat*>(nf);
|
||||
if (df != NULL) {
|
||||
UErrorCode ignoredStatus = U_ZERO_ERROR;
|
||||
return df->getAttribute( attr, ignoredStatus );
|
||||
}
|
||||
// The remaining attributes are only supported for DecimalFormat
|
||||
const DecimalFormat* df = dynamic_cast<const DecimalFormat*>(nf);
|
||||
if (df != NULL) {
|
||||
UErrorCode ignoredStatus = U_ZERO_ERROR;
|
||||
return df->getAttribute(attr, ignoredStatus);
|
||||
}
|
||||
|
||||
return -1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
@ -528,18 +551,42 @@ unum_setAttribute( UNumberFormat* fmt,
|
||||
UNumberFormatAttribute attr,
|
||||
int32_t newValue)
|
||||
{
|
||||
NumberFormat* nf = reinterpret_cast<NumberFormat*>(fmt);
|
||||
if ( attr == UNUM_LENIENT_PARSE ) {
|
||||
// Supported for all subclasses
|
||||
// keep this here as the class may not be a DecimalFormat
|
||||
return nf->setLenient(newValue != 0);
|
||||
}
|
||||
// The remaining attributea are only supported for DecimalFormat
|
||||
DecimalFormat* df = dynamic_cast<DecimalFormat*>(nf);
|
||||
if (df != NULL) {
|
||||
UErrorCode ignoredStatus = U_ZERO_ERROR;
|
||||
df->setAttribute(attr, newValue, ignoredStatus);
|
||||
}
|
||||
NumberFormat* nf = reinterpret_cast<NumberFormat*>(fmt);
|
||||
if (attr == UNUM_LENIENT_PARSE) {
|
||||
// Supported for all subclasses
|
||||
// keep this here as the class may not be a DecimalFormat
|
||||
return nf->setLenient(newValue != 0);
|
||||
}
|
||||
else if (attr == UNUM_MAX_INTEGER_DIGITS) {
|
||||
return nf->setMaximumIntegerDigits(newValue);
|
||||
}
|
||||
else if (attr == UNUM_MIN_INTEGER_DIGITS) {
|
||||
return nf->setMinimumIntegerDigits(newValue);
|
||||
}
|
||||
else if (attr == UNUM_INTEGER_DIGITS) {
|
||||
nf->setMinimumIntegerDigits(newValue);
|
||||
return nf->setMaximumIntegerDigits(newValue);
|
||||
}
|
||||
else if (attr == UNUM_MAX_FRACTION_DIGITS) {
|
||||
return nf->setMaximumFractionDigits(newValue);
|
||||
}
|
||||
else if (attr == UNUM_MIN_FRACTION_DIGITS) {
|
||||
return nf->setMinimumFractionDigits(newValue);
|
||||
}
|
||||
else if (attr == UNUM_FRACTION_DIGITS) {
|
||||
nf->setMinimumFractionDigits(newValue);
|
||||
return nf->setMaximumFractionDigits(newValue);
|
||||
}
|
||||
else if (attr == UNUM_ROUNDING_MODE) {
|
||||
return nf->setRoundingMode((NumberFormat::ERoundingMode)newValue);
|
||||
}
|
||||
|
||||
// The remaining attributes are only supported for DecimalFormat
|
||||
DecimalFormat* df = dynamic_cast<DecimalFormat*>(nf);
|
||||
if (df != NULL) {
|
||||
UErrorCode ignoredStatus = U_ZERO_ERROR;
|
||||
df->setAttribute(attr, newValue, ignoredStatus);
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI double U_EXPORT2
|
||||
|
@ -690,7 +690,6 @@ ZoneMeta::createMetazoneMappings(const UnicodeString &tzid) {
|
||||
mzMappings = new UVector(deleteOlsonToMetaMappingEntry, NULL, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete mzMappings;
|
||||
deleteOlsonToMetaMappingEntry(entry);
|
||||
uprv_free(entry);
|
||||
break;
|
||||
}
|
||||
|
@ -64,6 +64,7 @@ static void TestCurrFmtNegSameAsPositive(void);
|
||||
static void TestVariousStylesAndAttributes(void);
|
||||
static void TestParseCurrPatternWithDecStyle(void);
|
||||
static void TestFormatForFields(void);
|
||||
static void TestRBNFRounding(void);
|
||||
|
||||
#define TESTCASE(x) addTest(root, &x, "tsformat/cnumtst/" #x)
|
||||
|
||||
@ -79,6 +80,7 @@ void addNumForTest(TestNode** root)
|
||||
TESTCASE(TestCurrencyRegression);
|
||||
TESTCASE(TestTextAttributeCrash);
|
||||
TESTCASE(TestRBNFFormat);
|
||||
TESTCASE(TestRBNFRounding);
|
||||
TESTCASE(TestNBSPInPattern);
|
||||
TESTCASE(TestInt64Parse);
|
||||
TESTCASE(TestParseZero);
|
||||
@ -1791,6 +1793,48 @@ static void TestRBNFFormat() {
|
||||
}
|
||||
}
|
||||
|
||||
static void TestRBNFRounding() {
|
||||
UChar fmtbuf[FORMAT_BUF_CAPACITY];
|
||||
UChar expectedBuf[FORMAT_BUF_CAPACITY];
|
||||
int32_t len;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UNumberFormat* fmt = unum_open(UNUM_SPELLOUT, NULL, 0, "en_US", NULL, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err_status(status, "unable to open spellout -> %s\n", u_errorName(status));
|
||||
return;
|
||||
}
|
||||
len = unum_formatDouble(fmt, 10.123456789, fmtbuf, FORMAT_BUF_CAPACITY, NULL, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err_status(status, "unum_formatDouble 10.123456789 failed with %s\n", u_errorName(status));
|
||||
}
|
||||
u_uastrcpy(expectedBuf, "ten point one two three four five six seven eight nine");
|
||||
if (u_strcmp(expectedBuf, fmtbuf) != 0) {
|
||||
log_err("Wrong result for unrounded value\n");
|
||||
}
|
||||
unum_setAttribute(fmt, UNUM_MAX_FRACTION_DIGITS, 3);
|
||||
if (unum_getAttribute(fmt, UNUM_MAX_FRACTION_DIGITS) != 3) {
|
||||
log_err("UNUM_MAX_FRACTION_DIGITS was incorrectly ignored -> %d\n", unum_getAttribute(fmt, UNUM_MAX_FRACTION_DIGITS));
|
||||
}
|
||||
if (unum_getAttribute(fmt, UNUM_ROUNDING_MODE) != UNUM_ROUND_UNNECESSARY) {
|
||||
log_err("UNUM_ROUNDING_MODE was set -> %d\n", unum_getAttribute(fmt, UNUM_ROUNDING_MODE));
|
||||
}
|
||||
unum_setAttribute(fmt, UNUM_ROUNDING_MODE, UNUM_ROUND_HALFUP);
|
||||
if (unum_getAttribute(fmt, UNUM_ROUNDING_MODE) != UNUM_ROUND_HALFUP) {
|
||||
log_err("UNUM_ROUNDING_MODE was not set -> %d\n", unum_getAttribute(fmt, UNUM_ROUNDING_MODE));
|
||||
}
|
||||
len = unum_formatDouble(fmt, 10.123456789, fmtbuf, FORMAT_BUF_CAPACITY, NULL, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err_status(status, "unum_formatDouble 10.123456789 failed with %s\n", u_errorName(status));
|
||||
}
|
||||
u_uastrcpy(expectedBuf, "ten point one two three");
|
||||
if (u_strcmp(expectedBuf, fmtbuf) != 0) {
|
||||
char temp[512];
|
||||
u_austrcpy(temp, fmtbuf);
|
||||
log_err("Wrong result for rounded value. Got: %s\n", temp);
|
||||
}
|
||||
unum_close(fmt);
|
||||
}
|
||||
|
||||
static void TestCurrencyRegression(void) {
|
||||
/*
|
||||
I've found a case where unum_parseDoubleCurrency is not doing what I
|
||||
|
@ -68,11 +68,11 @@ void RBBIAPITest::TestCloneEquals()
|
||||
b |= *bi1 == *bi2;
|
||||
b |= *bi1 == *bi3;
|
||||
if (b) {
|
||||
errln((UnicodeString)"ERROR:1 RBBI's == and != operator failed.");
|
||||
errln("%s:%d ERROR:1 RBBI's == and != operator failed.", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
if(*bi2 == *biequal || *bi2 == *bi1 || *biequal == *bi3)
|
||||
errln((UnicodeString)"ERROR:2 RBBI's == and != operator failed.");
|
||||
errln("%s:%d ERROR:2 RBBI's == and != operator failed.", __FILE__, __LINE__);
|
||||
|
||||
|
||||
// Quick test of RulesBasedBreakIterator assignment -
|
||||
@ -90,15 +90,15 @@ void RBBIAPITest::TestCloneEquals()
|
||||
|
||||
RuleBasedBreakIterator biDefault, biDefault2;
|
||||
if(U_FAILURE(status)){
|
||||
errln((UnicodeString)"FAIL : in construction of default iterator");
|
||||
errln("%s:%d FAIL : in construction of default iterator", __FILE__, __LINE__);
|
||||
return;
|
||||
}
|
||||
if (biDefault == *bix) {
|
||||
errln((UnicodeString)"ERROR: iterators should not compare ==");
|
||||
errln("%s:%d ERROR: iterators should not compare ==", __FILE__, __LINE__);
|
||||
return;
|
||||
}
|
||||
if (biDefault != biDefault2) {
|
||||
errln((UnicodeString)"ERROR: iterators should compare ==");
|
||||
errln("%s:%d ERROR: iterators should compare ==", __FILE__, __LINE__);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -106,41 +106,41 @@ void RBBIAPITest::TestCloneEquals()
|
||||
UnicodeString HelloString("Hello Kitty");
|
||||
bix->setText(HelloString);
|
||||
if (*bix == *bi2) {
|
||||
errln(UnicodeString("ERROR: strings should not be equal before assignment."));
|
||||
errln("%s:%d ERROR: strings should not be equal before assignment.", __FILE__, __LINE__);
|
||||
}
|
||||
*bix = *bi2;
|
||||
if (*bix != *bi2) {
|
||||
errln(UnicodeString("ERROR: strings should be equal before assignment."));
|
||||
errln("%s:%d ERROR: strings should be equal before assignment.", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
int bixnext = bix->next();
|
||||
int bi2next = bi2->next();
|
||||
if (! (bixnext == bi2next && bixnext == 7)) {
|
||||
errln(UnicodeString("ERROR: iterators behaved differently after assignment."));
|
||||
errln("%s:%d ERROR: iterators behaved differently after assignment.", __FILE__, __LINE__);
|
||||
}
|
||||
delete bix;
|
||||
if (bi2->next() != 8) {
|
||||
errln(UnicodeString("ERROR: iterator.next() failed after deleting copy."));
|
||||
errln("%s:%d ERROR: iterator.next() failed after deleting copy.", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
|
||||
|
||||
logln((UnicodeString)"Testing clone()");
|
||||
RuleBasedBreakIterator* bi1clone=(RuleBasedBreakIterator*)bi1->clone();
|
||||
RuleBasedBreakIterator* bi2clone=(RuleBasedBreakIterator*)bi2->clone();
|
||||
RuleBasedBreakIterator* bi1clone = dynamic_cast<RuleBasedBreakIterator *>(bi1->clone());
|
||||
RuleBasedBreakIterator* bi2clone = dynamic_cast<RuleBasedBreakIterator *>(bi2->clone());
|
||||
|
||||
if(*bi1clone != *bi1 || *bi1clone != *biequal ||
|
||||
*bi1clone == *bi3 || *bi1clone == *bi2)
|
||||
errln((UnicodeString)"ERROR:1 RBBI's clone() method failed");
|
||||
errln("%s:%d ERROR:1 RBBI's clone() method failed", __FILE__, __LINE__);
|
||||
|
||||
if(*bi2clone == *bi1 || *bi2clone == *biequal ||
|
||||
*bi2clone == *bi3 || *bi2clone != *bi2)
|
||||
errln((UnicodeString)"ERROR:2 RBBI's clone() method failed");
|
||||
errln("%s:%d ERROR:2 RBBI's clone() method failed", __FILE__, __LINE__);
|
||||
|
||||
if(bi1->getText() != bi1clone->getText() ||
|
||||
bi2clone->getText() != bi2->getText() ||
|
||||
*bi2clone == *bi1clone )
|
||||
errln((UnicodeString)"ERROR: RBBI's clone() method failed");
|
||||
errln("%s:%d ERROR: RBBI's clone() method failed", __FILE__, __LINE__);
|
||||
|
||||
delete bi1clone;
|
||||
delete bi2clone;
|
||||
@ -427,12 +427,12 @@ void RBBIAPITest::TestIteration()
|
||||
int32_t i;
|
||||
i = bi->first();
|
||||
if (i != 0) {
|
||||
errln("Incorrect value from bi->first(). Expected 0, got %d.", i);
|
||||
errln("%s:%d Incorrect value from bi->first(). Expected 0, got %d.", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
i = bi->last();
|
||||
if (i != 10) {
|
||||
errln("Incorrect value from bi->last(). Expected 10, got %d", i);
|
||||
errln("%s:%d Incorrect value from bi->last(). Expected 10, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
//
|
||||
@ -441,14 +441,14 @@ void RBBIAPITest::TestIteration()
|
||||
bi->last();
|
||||
i = bi->previous();
|
||||
if (i != 9) {
|
||||
errln("Incorrect value from bi->last() at line %d. Expected 9, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->last(). Expected 9, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
|
||||
bi->first();
|
||||
i = bi->previous();
|
||||
if (i != BreakIterator::DONE) {
|
||||
errln("Incorrect value from bi->previous() at line %d. Expected DONE, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->previous(). Expected DONE, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
//
|
||||
@ -457,13 +457,13 @@ void RBBIAPITest::TestIteration()
|
||||
bi->first();
|
||||
i = bi->next();
|
||||
if (i != 1) {
|
||||
errln("Incorrect value from bi->next() at line %d. Expected 1, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->next(). Expected 1, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
bi->last();
|
||||
i = bi->next();
|
||||
if (i != BreakIterator::DONE) {
|
||||
errln("Incorrect value from bi->next() at line %d. Expected DONE, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->next(). Expected DONE, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
|
||||
@ -473,27 +473,27 @@ void RBBIAPITest::TestIteration()
|
||||
bi->first();
|
||||
i = bi->current();
|
||||
if (i != 0) {
|
||||
errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->previous(). Expected 0, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
bi->next();
|
||||
i = bi->current();
|
||||
if (i != 1) {
|
||||
errln("Incorrect value from bi->previous() at line %d. Expected 1, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->previous(). Expected 1, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
bi->last();
|
||||
bi->next();
|
||||
i = bi->current();
|
||||
if (i != 10) {
|
||||
errln("Incorrect value from bi->previous() at line %d. Expected 10, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->previous(). Expected 10, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
bi->first();
|
||||
bi->previous();
|
||||
i = bi->current();
|
||||
if (i != 0) {
|
||||
errln("Incorrect value from bi->previous() at line %d. Expected 0, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->previous(). Expected 0, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
|
||||
@ -502,17 +502,17 @@ void RBBIAPITest::TestIteration()
|
||||
//
|
||||
i = bi->following(4);
|
||||
if (i != 5) {
|
||||
errln("Incorrect value from bi->following() at line %d. Expected 5, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->following(). Expected 5, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
i = bi->following(9);
|
||||
if (i != 10) {
|
||||
errln("Incorrect value from bi->following() at line %d. Expected 10, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->following(). Expected 10, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
i = bi->following(10);
|
||||
if (i != BreakIterator::DONE) {
|
||||
errln("Incorrect value from bi->following() at line %d. Expected DONE, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->following(). Expected DONE, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
|
||||
@ -521,22 +521,22 @@ void RBBIAPITest::TestIteration()
|
||||
//
|
||||
i = bi->preceding(4);
|
||||
if (i != 3) {
|
||||
errln("Incorrect value from bi->preceding() at line %d. Expected 3, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->preceding(). Expected 3, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
i = bi->preceding(10);
|
||||
if (i != 9) {
|
||||
errln("Incorrect value from bi->preceding() at line %d. Expected 9, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->preceding(). Expected 9, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
i = bi->preceding(1);
|
||||
if (i != 0) {
|
||||
errln("Incorrect value from bi->preceding() at line %d. Expected 0, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->preceding(). Expected 0, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
i = bi->preceding(0);
|
||||
if (i != BreakIterator::DONE) {
|
||||
errln("Incorrect value from bi->preceding() at line %d. Expected DONE, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->preceding(). Expected DONE, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
|
||||
@ -545,20 +545,20 @@ void RBBIAPITest::TestIteration()
|
||||
//
|
||||
bi->first();
|
||||
if (bi->isBoundary(3) != TRUE) {
|
||||
errln("Incorrect value from bi->isBoudary() at line %d. Expected TRUE, got FALSE", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->isBoudary(). Expected TRUE, got FALSE", __FILE__, __LINE__, i);
|
||||
}
|
||||
i = bi->current();
|
||||
if (i != 3) {
|
||||
errln("Incorrect value from bi->current() at line %d. Expected 3, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->current(). Expected 3, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
|
||||
if (bi->isBoundary(11) != FALSE) {
|
||||
errln("Incorrect value from bi->isBoudary() at line %d. Expected FALSE, got TRUE", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->isBoudary(). Expected FALSE, got TRUE", __FILE__, __LINE__, i);
|
||||
}
|
||||
i = bi->current();
|
||||
if (i != 10) {
|
||||
errln("Incorrect value from bi->current() at line %d. Expected 10, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->current(). Expected 10, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
//
|
||||
@ -567,18 +567,18 @@ void RBBIAPITest::TestIteration()
|
||||
bi->first();
|
||||
i = bi->next(4);
|
||||
if (i != 4) {
|
||||
errln("Incorrect value from bi->next() at line %d. Expected 4, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->next(). Expected 4, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
i = bi->next(6);
|
||||
if (i != 10) {
|
||||
errln("Incorrect value from bi->next() at line %d. Expected 10, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->next(). Expected 10, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
bi->first();
|
||||
i = bi->next(11);
|
||||
if (i != BreakIterator::DONE) {
|
||||
errln("Incorrect value from bi->next() at line %d. Expected BreakIterator::DONE, got %d", __LINE__, i);
|
||||
errln("%s:%d Incorrect value from bi->next(). Expected BreakIterator::DONE, got %d", __FILE__, __LINE__, i);
|
||||
}
|
||||
|
||||
delete bi;
|
||||
@ -666,7 +666,7 @@ void RBBIAPITest::TestRuleStatus() {
|
||||
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
|
||||
if(U_FAILURE(status)) {
|
||||
errcheckln(status, "Fail : in construction - %s", u_errorName(status));
|
||||
errcheckln(status, "%s:%d Fail in construction - %s", __FILE__, __LINE__, u_errorName(status));
|
||||
} else {
|
||||
bi->setText(testString1);
|
||||
// First test that the breaks are in the right spots.
|
||||
@ -677,12 +677,12 @@ void RBBIAPITest::TestRuleStatus() {
|
||||
int32_t pos, tag;
|
||||
for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
|
||||
if (pos != bounds1[i]) {
|
||||
errln("FAIL: unexpected word break at postion %d", pos);
|
||||
errln("%s:%d FAIL: unexpected word break at postion %d", __FILE__, __LINE__, pos);
|
||||
break;
|
||||
}
|
||||
tag = bi->getRuleStatus();
|
||||
if (tag < tag_lo[i] || tag >= tag_hi[i]) {
|
||||
errln("FAIL: incorrect tag value %d at position %d", tag, pos);
|
||||
errln("%s:%d FAIL: incorrect tag value %d at position %d", __FILE__, __LINE__, tag, pos);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -703,7 +703,7 @@ void RBBIAPITest::TestRuleStatus() {
|
||||
|
||||
bi = BreakIterator::createLineInstance(Locale::getEnglish(), status);
|
||||
if(U_FAILURE(status)) {
|
||||
errcheckln(status, "failed to create word break iterator. - %s", u_errorName(status));
|
||||
errcheckln(status, "%s:%d failed to create line break iterator. - %s", __FILE__, __LINE__, u_errorName(status));
|
||||
} else {
|
||||
int32_t i = 0;
|
||||
int32_t pos, tag;
|
||||
@ -724,8 +724,8 @@ void RBBIAPITest::TestRuleStatus() {
|
||||
success = FALSE; break;
|
||||
}
|
||||
if (success == FALSE) {
|
||||
errln("Fail: incorrect word break status or position. i=%d, pos=%d, tag=%d",
|
||||
i, pos, tag);
|
||||
errln("%s:%d: incorrect line break status or position. i=%d, pos=%d, tag=%d",
|
||||
__FILE__, __LINE__, i, pos, tag);
|
||||
break;
|
||||
}
|
||||
pos = bi->next();
|
||||
@ -734,7 +734,7 @@ void RBBIAPITest::TestRuleStatus() {
|
||||
if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT ||
|
||||
UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT ||
|
||||
(UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT)) {
|
||||
errln("UBRK_LINE_* constants from header are inconsistent.");
|
||||
errln("%s:%d UBRK_LINE_* constants from header are inconsistent.", __FILE__, __LINE__);
|
||||
}
|
||||
}
|
||||
delete bi;
|
||||
|
@ -73,7 +73,7 @@ BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
|
||||
fCharClassList.adoptInstead(new UVector(status));
|
||||
|
||||
fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
|
||||
"(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative lookbehind for '{' or '=' or '[:'
|
||||
"(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:'
|
||||
// (the identifier is a unicode property name or value)
|
||||
"(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name
|
||||
0, status));
|
||||
@ -86,7 +86,7 @@ BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) :
|
||||
"\\R$" // new-line at end of line.
|
||||
), 0, status));
|
||||
|
||||
// Match (initial parse) of a character class defintion line.
|
||||
// Match (initial parse) of a character class definition line.
|
||||
fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
|
||||
"[ \\t]*" // leading white space
|
||||
"(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name
|
||||
@ -129,7 +129,7 @@ CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeStri
|
||||
}
|
||||
fSetRefsMatcher->appendTail(expandedDef);
|
||||
|
||||
// Verify that the expanded set defintion is valid.
|
||||
// Verify that the expanded set definition is valid.
|
||||
|
||||
if (fMonkeyImpl->fDumpExpansions) {
|
||||
printf("epandedDef: %s\n", CStr(expandedDef)());
|
||||
@ -149,7 +149,7 @@ CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeStri
|
||||
|
||||
if (previousClass != NULL) {
|
||||
// Duplicate class def.
|
||||
// These are legitimate, they are adustments of an existing class.
|
||||
// These are legitimate, they are adjustments of an existing class.
|
||||
// TODO: will need to keep the old around when we handle tailorings.
|
||||
IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
|
||||
delete previousClass;
|
||||
|
@ -53,7 +53,6 @@
|
||||
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
|
||||
errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
|
||||
|
||||
|
||||
//---------------------------------------------
|
||||
// runIndexedTest
|
||||
//---------------------------------------------
|
||||
@ -74,7 +73,6 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
#if !UCONFIG_NO_FILE_IO
|
||||
TESTCASE_AUTO(TestBug4153072);
|
||||
#endif
|
||||
TESTCASE_AUTO(TestStatusReturn);
|
||||
#if !UCONFIG_NO_FILE_IO
|
||||
TESTCASE_AUTO(TestUnicodeFiles);
|
||||
TESTCASE_AUTO(TestEmptyString);
|
||||
@ -107,6 +105,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
TESTCASE_AUTO(TestBug12918);
|
||||
TESTCASE_AUTO(TestBug12932);
|
||||
TESTCASE_AUTO(TestEmoji);
|
||||
TESTCASE_AUTO(TestBug12519);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
@ -266,51 +265,6 @@ RBBITest::RBBITest() {
|
||||
RBBITest::~RBBITest() {
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// Test for status {tag} return value from break rules.
|
||||
// TODO: a more thorough test.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
void RBBITest::TestStatusReturn() {
|
||||
UnicodeString rulesString1("$Letters = [:L:];\n"
|
||||
"$Numbers = [:N:];\n"
|
||||
"$Letters+{1};\n"
|
||||
"$Numbers+{2};\n"
|
||||
"Help\\ /me\\!{4};\n"
|
||||
"[^$Letters $Numbers];\n"
|
||||
"!.*;\n", -1, US_INV);
|
||||
UnicodeString testString1 = "abc123..abc Help me Help me!";
|
||||
// 01234567890123456789012345678
|
||||
int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
|
||||
int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
UParseError parseError;
|
||||
|
||||
LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
|
||||
if(U_FAILURE(status)) {
|
||||
dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
int32_t pos;
|
||||
int32_t i = 0;
|
||||
bi->setText(testString1);
|
||||
for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
|
||||
if (pos != bounds1[i]) {
|
||||
errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
|
||||
break;
|
||||
}
|
||||
|
||||
int tag = bi->getRuleStatus();
|
||||
if (tag != brkStatus[i]) {
|
||||
errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
@ -1131,34 +1085,27 @@ void RBBITest::TestExtended() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Locale locale("");
|
||||
|
||||
UnicodeString rules;
|
||||
TestParams tp(status);
|
||||
|
||||
RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
|
||||
RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
|
||||
if (U_FAILURE(status)) {
|
||||
dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Open and read the test data file.
|
||||
//
|
||||
const char *testDataDirectory = IntlTest::getSourceTestData(status);
|
||||
char testFileName[1000];
|
||||
if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
|
||||
errln("Can't open test data. Path too long.");
|
||||
return;
|
||||
}
|
||||
strcpy(testFileName, testDataDirectory);
|
||||
strcat(testFileName, "rbbitst.txt");
|
||||
CharString testFileName(testDataDirectory, -1, status);
|
||||
testFileName.append("rbbitst.txt", -1, status);
|
||||
|
||||
int len;
|
||||
UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
|
||||
UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
|
||||
if (U_FAILURE(status)) {
|
||||
return; /* something went wrong, error already output */
|
||||
errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
bool skipTest = false; // Skip this test?
|
||||
|
||||
//
|
||||
@ -1170,7 +1117,8 @@ void RBBITest::TestExtended() {
|
||||
PARSE_COMMENT,
|
||||
PARSE_TAG,
|
||||
PARSE_DATA,
|
||||
PARSE_NUM
|
||||
PARSE_NUM,
|
||||
PARSE_RULES
|
||||
}
|
||||
parseState = PARSE_TAG;
|
||||
|
||||
@ -1181,7 +1129,10 @@ void RBBITest::TestExtended() {
|
||||
int32_t column = 0;
|
||||
int32_t charIdx = 0;
|
||||
|
||||
int32_t tagValue = 0; // The numeric value of a <nnn> tag.
|
||||
int32_t tagValue = 0; // The numeric value of a <nnn> tag.
|
||||
|
||||
UnicodeString rules; // Holds rules from a <rules> ... </rules> block
|
||||
int32_t rulesFirstLine; // Line number of the start of current <rules> block
|
||||
|
||||
for (charIdx = 0; charIdx < len; ) {
|
||||
status = U_ZERO_ERROR;
|
||||
@ -1215,41 +1166,50 @@ void RBBITest::TestExtended() {
|
||||
if (u_isUWhiteSpace(c)) {
|
||||
break;
|
||||
}
|
||||
if (testString.compare(charIdx-1, 6, "<word>") == 0) {
|
||||
if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
|
||||
delete tp.bi;
|
||||
tp.bi = BreakIterator::createWordInstance(locale, status);
|
||||
skipTest = false;
|
||||
charIdx += 5;
|
||||
break;
|
||||
}
|
||||
if (testString.compare(charIdx-1, 6, "<char>") == 0) {
|
||||
if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
|
||||
delete tp.bi;
|
||||
tp.bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
skipTest = false;
|
||||
charIdx += 5;
|
||||
break;
|
||||
}
|
||||
if (testString.compare(charIdx-1, 6, "<line>") == 0) {
|
||||
if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
|
||||
delete tp.bi;
|
||||
tp.bi = BreakIterator::createLineInstance(locale, status);
|
||||
skipTest = false;
|
||||
charIdx += 5;
|
||||
break;
|
||||
}
|
||||
if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
|
||||
if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
|
||||
delete tp.bi;
|
||||
tp.bi = BreakIterator::createSentenceInstance(locale, status);
|
||||
skipTest = false;
|
||||
charIdx += 5;
|
||||
break;
|
||||
}
|
||||
if (testString.compare(charIdx-1, 7, "<title>") == 0) {
|
||||
if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
|
||||
delete tp.bi;
|
||||
tp.bi = BreakIterator::createTitleInstance(locale, status);
|
||||
charIdx += 6;
|
||||
break;
|
||||
}
|
||||
|
||||
if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
|
||||
testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
|
||||
charIdx = testString.indexOf(u'>', charIdx) + 1;
|
||||
parseState = PARSE_RULES;
|
||||
rules.remove();
|
||||
rulesFirstLine = lineNum;
|
||||
break;
|
||||
}
|
||||
|
||||
// <locale loc_name>
|
||||
localeMatcher.reset(testString);
|
||||
if (localeMatcher.lookingAt(charIdx-1, status)) {
|
||||
@ -1261,7 +1221,7 @@ void RBBITest::TestExtended() {
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
break;
|
||||
}
|
||||
if (testString.compare(charIdx-1, 6, "<data>") == 0) {
|
||||
if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
|
||||
parseState = PARSE_DATA;
|
||||
charIdx += 5;
|
||||
tp.dataToBreak = "";
|
||||
@ -1278,6 +1238,33 @@ void RBBITest::TestExtended() {
|
||||
}
|
||||
break;
|
||||
|
||||
case PARSE_RULES:
|
||||
if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
|
||||
charIdx += 7;
|
||||
parseState = PARSE_TAG;
|
||||
delete tp.bi;
|
||||
UParseError pe;
|
||||
tp.bi = new RuleBasedBreakIterator(rules, pe, status);
|
||||
skipTest = U_FAILURE(status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
|
||||
rulesFirstLine + pe.line - 1, u_errorName(status));
|
||||
}
|
||||
} else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
|
||||
charIdx += 10;
|
||||
parseState = PARSE_TAG;
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UParseError pe;
|
||||
RuleBasedBreakIterator bi(rules, pe, ec);
|
||||
if (U_SUCCESS(ec)) {
|
||||
errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
|
||||
rulesFirstLine + pe.line - 1);
|
||||
}
|
||||
} else {
|
||||
rules.append(c);
|
||||
}
|
||||
break;
|
||||
|
||||
case PARSE_DATA:
|
||||
if (c == u'•') {
|
||||
int32_t breakIdx = tp.dataToBreak.length();
|
||||
@ -1290,7 +1277,7 @@ void RBBITest::TestExtended() {
|
||||
break;
|
||||
}
|
||||
|
||||
if (testString.compare(charIdx-1, 7, "</data>") == 0) {
|
||||
if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
|
||||
// Add final entry to mappings from break location to source file position.
|
||||
// Need one extra because last break position returned is after the
|
||||
// last char in the data, not at the last char.
|
||||
@ -1316,7 +1303,7 @@ void RBBITest::TestExtended() {
|
||||
break;
|
||||
}
|
||||
|
||||
if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
|
||||
if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
|
||||
// Named character, e.g. \N{COMBINING GRAVE ACCENT}
|
||||
// Get the code point from the name and insert it into the test data.
|
||||
// (Damn, no API takes names in Unicode !!!
|
||||
@ -1355,8 +1342,7 @@ void RBBITest::TestExtended() {
|
||||
|
||||
|
||||
|
||||
|
||||
if (testString.compare(charIdx-1, 2, "<>") == 0) {
|
||||
if (testString.compare(charIdx-1, 2, u"<>") == 0) {
|
||||
charIdx++;
|
||||
int32_t breakIdx = tp.dataToBreak.length();
|
||||
tp.expectedBreaks->setSize(breakIdx+1);
|
||||
@ -1469,7 +1455,7 @@ void RBBITest::TestExtended() {
|
||||
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
dataerrln("ICU Error %s while parsing test file at line %d.",
|
||||
errln("ICU Error %s while parsing test file at line %d.",
|
||||
u_errorName(status), lineNum);
|
||||
status = U_ZERO_ERROR;
|
||||
goto end_test; // Stop the test
|
||||
@ -1477,6 +1463,17 @@ void RBBITest::TestExtended() {
|
||||
|
||||
}
|
||||
|
||||
// Reached end of test file. Raise an error if parseState indicates that we are
|
||||
// within a block that should have been terminated.
|
||||
if (parseState == PARSE_RULES) {
|
||||
errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
|
||||
lineNum, rulesFirstLine);
|
||||
}
|
||||
if (parseState == PARSE_DATA) {
|
||||
errln("rbbitst.txt:%d <data> block not closed.", lineNum);
|
||||
}
|
||||
|
||||
|
||||
end_test:
|
||||
delete [] testFile;
|
||||
#endif
|
||||
@ -3762,16 +3759,16 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
||||
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count] = i;
|
||||
if (count < expectedcount && expected[count] != i) {
|
||||
test->errln("break forward test failed: expected %d but got %d",
|
||||
expected[count], i);
|
||||
test->errln("%s:%d break forward test failed: expected %d but got %d",
|
||||
__FILE__, __LINE__, expected[count], i);
|
||||
break;
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
if (count != expectedcount) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
test->errln("break forward test failed: missed %d match",
|
||||
expectedcount - count);
|
||||
test->errln("%s:%d break forward test failed: missed %d match",
|
||||
__FILE__, __LINE__, expectedcount - count);
|
||||
return;
|
||||
}
|
||||
// testing boundaries
|
||||
@ -3779,13 +3776,15 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
||||
int j = expected[i - 1];
|
||||
if (!bi->isBoundary(j)) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
test->errln("isBoundary() failed. Expected boundary at position %d", j);
|
||||
test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
|
||||
__FILE__, __LINE__, j);
|
||||
return;
|
||||
}
|
||||
for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
|
||||
if (bi->isBoundary(j)) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
|
||||
test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
|
||||
__FILE__, __LINE__, j);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -3795,8 +3794,8 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
||||
count --;
|
||||
if (forward[count] != i) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
test->errln("happy break test previous() failed: expected %d but got %d",
|
||||
forward[count], i);
|
||||
test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
|
||||
__FILE__, __LINE__, forward[count], i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -3811,9 +3810,12 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
||||
// int j = expected[i] + 1;
|
||||
int j = ustr.moveIndex32(expected[i], 1);
|
||||
for (; j <= expected[i + 1]; j ++) {
|
||||
if (bi->preceding(j) != expected[i]) {
|
||||
int32_t expectedPreceding = expected[i];
|
||||
int32_t actualPreceding = bi->preceding(j);
|
||||
if (actualPreceding != expectedPreceding) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
test->errln("preceding(): Not expecting boundary at position %d", j);
|
||||
test->errln("%s:%d preceding(%d): expected %d, got %d",
|
||||
__FILE__, __LINE__, j, expectedPreceding, actualPreceding);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -3905,7 +3907,12 @@ void RBBITest::TestWordBoundary(void)
|
||||
Locale locale("en");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
|
||||
if (U_FAILURE(status)) {
|
||||
errcheckln(status, "%s:%d Creation of break iterator failed %s",
|
||||
__FILE__, __LINE__, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
UChar str[50];
|
||||
static const char *strlist[] =
|
||||
{
|
||||
@ -3940,43 +3947,44 @@ void RBBITest::TestWordBoundary(void)
|
||||
"\\u003b\\u0027\\u00b7\\u47a3",
|
||||
};
|
||||
int loop;
|
||||
if (U_FAILURE(status)) {
|
||||
errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
|
||||
return;
|
||||
}
|
||||
for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
|
||||
// printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 20);
|
||||
u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
|
||||
UnicodeString ustr(str);
|
||||
int forward[50];
|
||||
int count = 0;
|
||||
|
||||
bi->setText(ustr);
|
||||
int prev = 0;
|
||||
int i;
|
||||
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count ++] = i;
|
||||
if (i > prev) {
|
||||
int j;
|
||||
for (j = prev + 1; j < i; j ++) {
|
||||
if (bi->isBoundary(j)) {
|
||||
printStringBreaks(ustr, forward, count);
|
||||
errln("happy boundary test failed: expected %d not a boundary",
|
||||
j);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!bi->isBoundary(i)) {
|
||||
printStringBreaks(ustr, forward, count);
|
||||
errln("happy boundary test failed: expected %d a boundary",
|
||||
i);
|
||||
int prev = -1;
|
||||
for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
|
||||
++count;
|
||||
if (count >= UPRV_LENGTHOF(forward)) {
|
||||
errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
|
||||
__FILE__, __LINE__, loop, count, boundary);
|
||||
return;
|
||||
}
|
||||
prev = i;
|
||||
forward[count] = boundary;
|
||||
if (boundary <= prev) {
|
||||
errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
|
||||
__FILE__, __LINE__, loop, prev, boundary);
|
||||
break;
|
||||
}
|
||||
for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
|
||||
if (bi->isBoundary(nonBoundary)) {
|
||||
printStringBreaks(ustr, forward, count);
|
||||
errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
|
||||
__FILE__, __LINE__, loop, prev, nonBoundary, boundary);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (!bi->isBoundary(boundary)) {
|
||||
printStringBreaks(ustr, forward, count);
|
||||
errln("%s:%d happy boundary test failed: expected %d a boundary",
|
||||
__FILE__, __LINE__, boundary);
|
||||
return;
|
||||
}
|
||||
prev = boundary;
|
||||
}
|
||||
}
|
||||
delete bi;
|
||||
}
|
||||
|
||||
void RBBITest::TestLineBreaks(void)
|
||||
@ -4792,6 +4800,40 @@ void RBBITest::TestEmoji() {
|
||||
}
|
||||
|
||||
|
||||
// TestBug12519 - Correct handling of Locales by assignment / copy / clone
|
||||
|
||||
// WHERE expands to a string literal of the form "source_file_name:line_number "
// (note the trailing space), assembled by adjacent string literal concatenation.
//   LINE(x)  stringizes its argument exactly as written.
//   XLINE(x) adds one level of macro expansion, so that __LINE__ has been
//            replaced by its number before LINE stringizes it.
// TODO: propose something equivalent as a test framework addition.

#define LINE(text) #text
#define XLINE(text) LINE(text)
#define WHERE __FILE__ ":" XLINE(__LINE__) " "
|
||||
|
||||
void RBBITest::TestBug12519() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
|
||||
LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
|
||||
assertSuccess(WHERE, status);
|
||||
assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
|
||||
assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
|
||||
assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
|
||||
|
||||
LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
|
||||
assertTrue(WHERE, *biEn == *cloneEn);
|
||||
assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
|
||||
|
||||
LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
|
||||
assertTrue(WHERE, *biFr == *cloneFr);
|
||||
assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
|
||||
|
||||
LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
|
||||
UnicodeString text("Hallo Welt");
|
||||
biDe->setText(text);
|
||||
assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
|
||||
*biDe = *biFr;
|
||||
assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
|
||||
}
|
||||
|
||||
//
|
||||
// TestDebug - A place-holder test for debugging purposes.
|
||||
// For putting in fragments of other tests that can be invoked
|
||||
|
@ -41,11 +41,6 @@ public:
|
||||
|
||||
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
|
||||
|
||||
/**
|
||||
* Tests rule status return values
|
||||
**/
|
||||
void TestStatusReturn();
|
||||
|
||||
void TestEmptyString();
|
||||
void TestGetAvailableLocales();
|
||||
void TestGetDisplayName();
|
||||
@ -79,6 +74,7 @@ public:
|
||||
void TestBug12918();
|
||||
void TestBug12932();
|
||||
void TestEmoji();
|
||||
void TestBug12519();
|
||||
|
||||
void TestDebug();
|
||||
void TestProperties();
|
||||
|
@ -57,6 +57,8 @@ public:
|
||||
void TestMalformedUTF8();
|
||||
void TestBufferOverflow();
|
||||
void TestEdits();
|
||||
void TestCopyMoveEdits();
|
||||
void TestMergeEdits();
|
||||
void TestCaseMapWithEdits();
|
||||
void TestCaseMapUTF8WithEdits();
|
||||
void TestLongUnicodeString();
|
||||
@ -94,6 +96,8 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
|
||||
TESTCASE_AUTO(TestMalformedUTF8);
|
||||
TESTCASE_AUTO(TestBufferOverflow);
|
||||
TESTCASE_AUTO(TestEdits);
|
||||
TESTCASE_AUTO(TestCopyMoveEdits);
|
||||
TESTCASE_AUTO(TestMergeEdits);
|
||||
TESTCASE_AUTO(TestCaseMapWithEdits);
|
||||
TESTCASE_AUTO(TestCaseMapUTF8WithEdits);
|
||||
TESTCASE_AUTO(TestLongUnicodeString);
|
||||
@ -966,6 +970,225 @@ void StringCaseTest::TestEdits() {
|
||||
assertFalse("reset then iterator", ei.next(errorCode));
|
||||
}
|
||||
|
||||
void StringCaseTest::TestCopyMoveEdits() {
|
||||
IcuTestErrorCode errorCode(*this, "TestCopyMoveEdits");
|
||||
// Exceed the stack array capacity.
|
||||
Edits a;
|
||||
for (int32_t i = 0; i < 250; ++i) {
|
||||
a.addReplace(i % 10, (i % 10) + 1);
|
||||
}
|
||||
assertEquals("a: many edits, length delta", 250, a.lengthDelta());
|
||||
|
||||
// copy
|
||||
Edits b(a);
|
||||
assertEquals("b: copy of many edits, length delta", 250, b.lengthDelta());
|
||||
assertEquals("a remains: many edits, length delta", 250, a.lengthDelta());
|
||||
TestUtility::checkEqualEdits(*this, u"b copy of a", a, b, errorCode);
|
||||
|
||||
// assign
|
||||
Edits c;
|
||||
c.addUnchanged(99);
|
||||
c.addReplace(88, 77);
|
||||
c = b;
|
||||
assertEquals("c: assigned many edits, length delta", 250, c.lengthDelta());
|
||||
assertEquals("b remains: many edits, length delta", 250, b.lengthDelta());
|
||||
TestUtility::checkEqualEdits(*this, u"c = b", b, c, errorCode);
|
||||
|
||||
// move constructor empties object with heap array
|
||||
Edits d(std::move(a));
|
||||
assertEquals("d: move-constructed many edits, length delta", 250, d.lengthDelta());
|
||||
assertFalse("a moved away: no more hasChanges", a.hasChanges());
|
||||
TestUtility::checkEqualEdits(*this, u"d() <- a", d, b, errorCode);
|
||||
Edits empty;
|
||||
TestUtility::checkEqualEdits(*this, u"a moved away", empty, a, errorCode);
|
||||
|
||||
// move assignment empties object with heap array
|
||||
Edits e;
|
||||
e.addReplace(0, 1000);
|
||||
e = std::move(b);
|
||||
assertEquals("e: move-assigned many edits, length delta", 250, e.lengthDelta());
|
||||
assertFalse("b moved away: no more hasChanges", b.hasChanges());
|
||||
TestUtility::checkEqualEdits(*this, u"e <- b", e, c, errorCode);
|
||||
TestUtility::checkEqualEdits(*this, u"b moved away", empty, b, errorCode);
|
||||
|
||||
// Edits::Iterator default constructor.
|
||||
Edits::Iterator iter;
|
||||
assertFalse("Edits::Iterator().next()", iter.next(errorCode));
|
||||
assertSuccess("Edits::Iterator().next()", errorCode);
|
||||
iter = e.getFineChangesIterator();
|
||||
assertTrue("iter.next()", iter.next(errorCode));
|
||||
assertSuccess("iter.next()", errorCode);
|
||||
assertTrue("iter.hasChange()", iter.hasChange());
|
||||
assertEquals("iter.newLength()", 1, iter.newLength());
|
||||
}
|
||||
|
||||
void StringCaseTest::TestMergeEdits() {
|
||||
// For debugging, set -v to see matching edits up to a failure.
|
||||
IcuTestErrorCode errorCode(*this, "TestMergeEdits");
|
||||
Edits ab, bc, ac, expected_ac;
|
||||
|
||||
// Simple: Two parallel non-changes.
|
||||
ab.addUnchanged(2);
|
||||
bc.addUnchanged(2);
|
||||
expected_ac.addUnchanged(2);
|
||||
|
||||
// Simple: Two aligned changes.
|
||||
ab.addReplace(3, 2);
|
||||
bc.addReplace(2, 1);
|
||||
expected_ac.addReplace(3, 1);
|
||||
|
||||
// Unequal non-changes.
|
||||
ab.addUnchanged(5);
|
||||
bc.addUnchanged(3);
|
||||
expected_ac.addUnchanged(3);
|
||||
// ab ahead by 2
|
||||
|
||||
// Overlapping changes accumulate until they share a boundary.
|
||||
ab.addReplace(4, 3);
|
||||
bc.addReplace(3, 2);
|
||||
ab.addReplace(4, 3);
|
||||
bc.addReplace(3, 2);
|
||||
ab.addReplace(4, 3);
|
||||
bc.addReplace(3, 2);
|
||||
bc.addUnchanged(4);
|
||||
expected_ac.addReplace(14, 8);
|
||||
// bc ahead by 2
|
||||
|
||||
// Balance out intermediate-string lengths.
|
||||
ab.addUnchanged(2);
|
||||
expected_ac.addUnchanged(2);
|
||||
|
||||
// Insert something and delete it: Should disappear.
|
||||
ab.addReplace(0, 5);
|
||||
ab.addReplace(0, 2);
|
||||
bc.addReplace(7, 0);
|
||||
|
||||
// Parallel change to make a new boundary.
|
||||
ab.addReplace(1, 2);
|
||||
bc.addReplace(2, 3);
|
||||
expected_ac.addReplace(1, 3);
|
||||
|
||||
// Multiple ab deletions should remain separate at the boundary.
|
||||
ab.addReplace(1, 0);
|
||||
ab.addReplace(2, 0);
|
||||
ab.addReplace(3, 0);
|
||||
expected_ac.addReplace(1, 0);
|
||||
expected_ac.addReplace(2, 0);
|
||||
expected_ac.addReplace(3, 0);
|
||||
|
||||
// Unequal non-changes can be split for another boundary.
|
||||
ab.addUnchanged(2);
|
||||
bc.addUnchanged(1);
|
||||
expected_ac.addUnchanged(1);
|
||||
// ab ahead by 1
|
||||
|
||||
// Multiple bc insertions should create a boundary and remain separate.
|
||||
bc.addReplace(0, 4);
|
||||
bc.addReplace(0, 5);
|
||||
bc.addReplace(0, 6);
|
||||
expected_ac.addReplace(0, 4);
|
||||
expected_ac.addReplace(0, 5);
|
||||
expected_ac.addReplace(0, 6);
|
||||
// ab ahead by 1
|
||||
|
||||
// Multiple ab deletions in the middle of a bc change are merged.
|
||||
bc.addReplace(2, 2);
|
||||
// bc ahead by 1
|
||||
ab.addReplace(1, 0);
|
||||
ab.addReplace(2, 0);
|
||||
ab.addReplace(3, 0);
|
||||
ab.addReplace(4, 1);
|
||||
expected_ac.addReplace(11, 2);
|
||||
|
||||
// Multiple bc insertions in the middle of an ab change are merged.
|
||||
ab.addReplace(5, 6);
|
||||
bc.addReplace(3, 3);
|
||||
// ab ahead by 3
|
||||
bc.addReplace(0, 4);
|
||||
bc.addReplace(0, 5);
|
||||
bc.addReplace(0, 6);
|
||||
bc.addReplace(3, 7);
|
||||
expected_ac.addReplace(5, 25);
|
||||
|
||||
// Delete around a deletion.
|
||||
ab.addReplace(4, 4);
|
||||
ab.addReplace(3, 0);
|
||||
ab.addUnchanged(2);
|
||||
bc.addReplace(2, 2);
|
||||
bc.addReplace(4, 0);
|
||||
expected_ac.addReplace(9, 2);
|
||||
|
||||
// Insert into an insertion.
|
||||
ab.addReplace(0, 2);
|
||||
bc.addReplace(1, 1);
|
||||
bc.addReplace(0, 8);
|
||||
bc.addUnchanged(4);
|
||||
expected_ac.addReplace(0, 10);
|
||||
// bc ahead by 3
|
||||
|
||||
// Balance out intermediate-string lengths.
|
||||
ab.addUnchanged(3);
|
||||
expected_ac.addUnchanged(3);
|
||||
|
||||
// Deletions meet insertions.
|
||||
// Output order is arbitrary in principle, but we expect insertions first
|
||||
// and want to keep it that way.
|
||||
ab.addReplace(2, 0);
|
||||
ab.addReplace(4, 0);
|
||||
ab.addReplace(6, 0);
|
||||
bc.addReplace(0, 1);
|
||||
bc.addReplace(0, 3);
|
||||
bc.addReplace(0, 5);
|
||||
expected_ac.addReplace(0, 1);
|
||||
expected_ac.addReplace(0, 3);
|
||||
expected_ac.addReplace(0, 5);
|
||||
expected_ac.addReplace(2, 0);
|
||||
expected_ac.addReplace(4, 0);
|
||||
expected_ac.addReplace(6, 0);
|
||||
|
||||
// End with a non-change, so that further edits are never reordered.
|
||||
ab.addUnchanged(1);
|
||||
bc.addUnchanged(1);
|
||||
expected_ac.addUnchanged(1);
|
||||
|
||||
ac.mergeAndAppend(ab, bc, errorCode);
|
||||
assertSuccess("ab+bc", errorCode);
|
||||
if (!TestUtility::checkEqualEdits(*this, u"ab+bc", expected_ac, ac, errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Append more Edits.
|
||||
Edits ab2, bc2;
|
||||
ab2.addUnchanged(5);
|
||||
bc2.addReplace(1, 2);
|
||||
bc2.addUnchanged(4);
|
||||
expected_ac.addReplace(1, 2);
|
||||
expected_ac.addUnchanged(4);
|
||||
ac.mergeAndAppend(ab2, bc2, errorCode);
|
||||
assertSuccess("ab2+bc2", errorCode);
|
||||
if (!TestUtility::checkEqualEdits(*this, u"ab2+bc2", expected_ac, ac, errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Append empty edits.
|
||||
Edits empty;
|
||||
ac.mergeAndAppend(empty, empty, errorCode);
|
||||
assertSuccess("empty+empty", errorCode);
|
||||
if (!TestUtility::checkEqualEdits(*this, u"empty+empty", expected_ac, ac, errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Error: Append more edits with mismatched intermediate-string lengths.
|
||||
Edits mismatch;
|
||||
mismatch.addReplace(1, 1);
|
||||
ac.mergeAndAppend(ab2, mismatch, errorCode);
|
||||
assertEquals("ab2+mismatch", U_ILLEGAL_ARGUMENT_ERROR, errorCode.get());
|
||||
errorCode.reset();
|
||||
ac.mergeAndAppend(mismatch, bc2, errorCode);
|
||||
assertEquals("mismatch+bc2", U_ILLEGAL_ARGUMENT_ERROR, errorCode.get());
|
||||
errorCode.reset();
|
||||
}
|
||||
|
||||
void StringCaseTest::TestCaseMapWithEdits() {
|
||||
IcuTestErrorCode errorCode(*this, "TestEdits");
|
||||
UChar dest[20];
|
||||
|
@ -10,6 +10,8 @@
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/unistr.h"
|
||||
@ -65,6 +67,100 @@ UnicodeString TestUtility::hex(const uint8_t* bytes, int32_t len) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/**
 * Formats the edit the iterator currently points at as "old->new" when it is
 * a change, or "old=new" when it is not, where old and new are the span lengths.
 */
UnicodeString printOneEdit(const Edits::Iterator &ei) {
    const char16_t *relation = ei.hasChange() ? u"->" : u"=";
    return UnicodeString() + ei.oldLength() + relation + ei.newLength();
}
|
||||
|
||||
/**
|
||||
* Maps indexes according to the expected edits.
|
||||
* A destination index can occur multiple times when there are source deletions.
|
||||
* Map according to the last occurrence, normally in a non-empty destination span.
|
||||
* Simplest is to search from the back.
|
||||
*/
|
||||
int32_t srcIndexFromDest(const EditChange expected[], int32_t expLength,
|
||||
int32_t srcLength, int32_t destLength, int32_t index) {
|
||||
int32_t srcIndex = srcLength;
|
||||
int32_t destIndex = destLength;
|
||||
int32_t i = expLength;
|
||||
while (index < destIndex && i > 0) {
|
||||
--i;
|
||||
int32_t prevSrcIndex = srcIndex - expected[i].oldLength;
|
||||
int32_t prevDestIndex = destIndex - expected[i].newLength;
|
||||
if (index == prevDestIndex) {
|
||||
return prevSrcIndex;
|
||||
} else if (index > prevDestIndex) {
|
||||
if (expected[i].change) {
|
||||
// In a change span, map to its end.
|
||||
return srcIndex;
|
||||
} else {
|
||||
// In an unchanged span, offset within it.
|
||||
return prevSrcIndex + (index - prevDestIndex);
|
||||
}
|
||||
}
|
||||
srcIndex = prevSrcIndex;
|
||||
destIndex = prevDestIndex;
|
||||
}
|
||||
// index is outside the string.
|
||||
return srcIndex;
|
||||
}
|
||||
|
||||
/**
 * Reference implementation: maps a source index to a destination index
 * according to the expected edits, searching the spans from the back.
 *
 * @param expected   array of expected edit spans
 * @param expLength  number of entries in expected
 * @param srcLength  total source length
 * @param destLength total destination length
 * @param index      source index to map
 * @return the corresponding destination index
 */
int32_t destIndexFromSrc(const EditChange expected[], int32_t expLength,
                         int32_t srcLength, int32_t destLength, int32_t index) {
    int32_t srcLimit = srcLength;
    int32_t destLimit = destLength;
    for (int32_t k = expLength - 1; k >= 0 && index < srcLimit; --k) {
        const EditChange &span = expected[k];
        const int32_t srcStart = srcLimit - span.oldLength;
        const int32_t destStart = destLimit - span.newLength;
        if (index == srcStart) {
            // Exactly at the start of this span.
            return destStart;
        }
        if (index > srcStart) {
            // Strictly inside this span: a change span maps to its end,
            // an unchanged span maps 1:1 by offset.
            return span.change ? destLimit : destStart + (index - srcStart);
        }
        srcLimit = srcStart;
        destLimit = destStart;
    }
    // index is outside the string.
    return destLimit;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// For debugging, set -v to see matching edits up to a failure.
|
||||
UBool TestUtility::checkEqualEdits(IntlTest &test, const UnicodeString &name,
|
||||
const Edits &e1, const Edits &e2, UErrorCode &errorCode) {
|
||||
Edits::Iterator ei1 = e1.getFineIterator();
|
||||
Edits::Iterator ei2 = e2.getFineIterator();
|
||||
UBool ok = TRUE;
|
||||
for (int32_t i = 0; ok; ++i) {
|
||||
UBool ei1HasNext = ei1.next(errorCode);
|
||||
UBool ei2HasNext = ei2.next(errorCode);
|
||||
ok &= test.assertEquals(name + u" next()[" + i + u"]" + __LINE__,
|
||||
ei1HasNext, ei2HasNext);
|
||||
ok &= test.assertSuccess(name + u" errorCode[" + i + u"]" + __LINE__, errorCode);
|
||||
ok &= test.assertEquals(name + u" edit[" + i + u"]" + __LINE__,
|
||||
printOneEdit(ei1), printOneEdit(ei2));
|
||||
if (!ei1HasNext || !ei2HasNext) {
|
||||
break;
|
||||
}
|
||||
test.logln();
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
void TestUtility::checkEditsIter(
|
||||
IntlTest &test,
|
||||
const UnicodeString &name,
|
||||
@ -77,8 +173,6 @@ void TestUtility::checkEditsIter(
|
||||
int32_t expSrcIndex = 0;
|
||||
int32_t expDestIndex = 0;
|
||||
int32_t expReplIndex = 0;
|
||||
int32_t expSrcIndexFromDest = 0; // for sourceIndexFromDestinationIndex()
|
||||
int32_t expDestIndexFromSrc = 0; // for destinationIndexFromSourceIndex()
|
||||
for (int32_t expIndex = 0; expIndex < expLength; ++expIndex) {
|
||||
const EditChange &expect = expected[expIndex];
|
||||
UnicodeString msg = UnicodeString(name).append(u' ') + expIndex;
|
||||
@ -92,7 +186,7 @@ void TestUtility::checkEditsIter(
|
||||
test.assertEquals(msg + u":" + __LINE__, expReplIndex, ei1.replacementIndex());
|
||||
}
|
||||
|
||||
if (expect.oldLength > 0 && expDestIndex == expDestIndexFromSrc) {
|
||||
if (expect.oldLength > 0) {
|
||||
test.assertTrue(msg + u":" + __LINE__, ei2.findSourceIndex(expSrcIndex, errorCode));
|
||||
test.assertEquals(msg + u":" + __LINE__, expect.change, ei2.hasChange());
|
||||
test.assertEquals(msg + u":" + __LINE__, expect.oldLength, ei2.oldLength());
|
||||
@ -108,7 +202,7 @@ void TestUtility::checkEditsIter(
|
||||
}
|
||||
}
|
||||
|
||||
if (expect.newLength > 0 && expSrcIndex == expSrcIndexFromDest) {
|
||||
if (expect.newLength > 0) {
|
||||
test.assertTrue(msg + u":" + __LINE__, ei2.findDestinationIndex(expDestIndex, errorCode));
|
||||
test.assertEquals(msg + u":" + __LINE__, expect.change, ei2.hasChange());
|
||||
test.assertEquals(msg + u":" + __LINE__, expect.oldLength, ei2.oldLength());
|
||||
@ -124,45 +218,11 @@ void TestUtility::checkEditsIter(
|
||||
}
|
||||
}
|
||||
|
||||
// Span starts.
|
||||
test.assertEquals(name + u":" + __LINE__, expDestIndexFromSrc,
|
||||
ei2.destinationIndexFromSourceIndex(expSrcIndex, errorCode));
|
||||
test.assertEquals(name + u":" + __LINE__, expSrcIndexFromDest,
|
||||
ei2.sourceIndexFromDestinationIndex(expDestIndex, errorCode));
|
||||
|
||||
// Inside unchanged span map offsets 1:1.
|
||||
if (!expect.change && expect.oldLength >= 2) {
|
||||
test.assertEquals(name + u":" + __LINE__, expDestIndex + 1,
|
||||
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1, errorCode));
|
||||
test.assertEquals(name + u":" + __LINE__, expSrcIndex + 1,
|
||||
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1, errorCode));
|
||||
}
|
||||
|
||||
// Inside change span map to the span limit.
|
||||
int32_t expSrcLimit = expSrcIndex + expect.oldLength;
|
||||
int32_t expDestLimit = expDestIndex + expect.newLength;
|
||||
if (expect.change) {
|
||||
if (expect.oldLength >= 2) {
|
||||
test.assertEquals(name + u":" + __LINE__, expDestLimit,
|
||||
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1, errorCode));
|
||||
}
|
||||
if (expect.newLength >= 2) {
|
||||
test.assertEquals(name + u":" + __LINE__, expSrcLimit,
|
||||
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1, errorCode));
|
||||
}
|
||||
}
|
||||
|
||||
expSrcIndex = expSrcLimit;
|
||||
expDestIndex = expDestLimit;
|
||||
expSrcIndex += expect.oldLength;
|
||||
expDestIndex += expect.newLength;
|
||||
if (expect.change) {
|
||||
expReplIndex += expect.newLength;
|
||||
}
|
||||
if (expect.newLength > 0) {
|
||||
expSrcIndexFromDest = expSrcIndex;
|
||||
}
|
||||
if (expect.oldLength > 0) {
|
||||
expDestIndexFromSrc = expDestIndex;
|
||||
}
|
||||
}
|
||||
UnicodeString msg = UnicodeString(name).append(u" end");
|
||||
test.assertFalse(msg + u":" + __LINE__, ei1.next(errorCode));
|
||||
@ -175,8 +235,47 @@ void TestUtility::checkEditsIter(
|
||||
|
||||
test.assertFalse(name + u":" + __LINE__, ei2.findSourceIndex(expSrcIndex, errorCode));
|
||||
test.assertFalse(name + u":" + __LINE__, ei2.findDestinationIndex(expDestIndex, errorCode));
|
||||
test.assertEquals(name + u":" + __LINE__, expDestIndex,
|
||||
ei2.destinationIndexFromSourceIndex(expSrcIndex, errorCode));
|
||||
test.assertEquals(name + u":" + __LINE__, expSrcIndex,
|
||||
ei2.sourceIndexFromDestinationIndex(expDestIndex, errorCode));
|
||||
|
||||
// Check mapping of all indexes against a simple implementation
|
||||
// that works on the expected changes.
|
||||
// Iterate once forward, once backward, to cover more runtime conditions.
|
||||
int32_t srcLength = expSrcIndex;
|
||||
int32_t destLength = expDestIndex;
|
||||
std::vector<int32_t> srcIndexes;
|
||||
std::vector<int32_t> destIndexes;
|
||||
srcIndexes.push_back(-1);
|
||||
destIndexes.push_back(-1);
|
||||
int32_t srcIndex = 0;
|
||||
int32_t destIndex = 0;
|
||||
for (int32_t i = 0; i < expLength; ++i) {
|
||||
if (expected[i].oldLength > 0) {
|
||||
srcIndexes.push_back(srcIndex);
|
||||
if (expected[i].oldLength > 1) {
|
||||
srcIndexes.push_back(srcIndex + 1);
|
||||
}
|
||||
}
|
||||
if (expected[i].newLength > 0) {
|
||||
destIndexes.push_back(destIndex);
|
||||
if (expected[i].newLength > 0) {
|
||||
destIndexes.push_back(destIndex + 1);
|
||||
}
|
||||
}
|
||||
srcIndex += expected[i].oldLength;
|
||||
destIndex += expected[i].newLength;
|
||||
}
|
||||
srcIndexes.push_back(srcLength);
|
||||
destIndexes.push_back(destLength);
|
||||
srcIndexes.push_back(srcLength + 1);
|
||||
destIndexes.push_back(destLength + 1);
|
||||
std::reverse(destIndexes.begin(), destIndexes.end());
|
||||
for (int32_t i : srcIndexes) {
|
||||
test.assertEquals(name + u" destIndexFromSrc(" + i + u"):" + __LINE__,
|
||||
destIndexFromSrc(expected, expLength, srcLength, destLength, i),
|
||||
ei2.destinationIndexFromSourceIndex(i, errorCode));
|
||||
}
|
||||
for (int32_t i : destIndexes) {
|
||||
test.assertEquals(name + u" srcIndexFromDest(" + i + u"):" + __LINE__,
|
||||
srcIndexFromDest(expected, expLength, srcLength, destLength, i),
|
||||
ei2.sourceIndexFromDestinationIndex(i, errorCode));
|
||||
}
|
||||
}
|
||||
|
@ -37,6 +37,9 @@ public:
|
||||
|
||||
static UnicodeString hex(const uint8_t* bytes, int32_t len);
|
||||
|
||||
static UBool checkEqualEdits(IntlTest &test, const UnicodeString &name,
|
||||
const Edits &e1, const Edits &e2, UErrorCode &errorCode);
|
||||
|
||||
static void checkEditsIter(
|
||||
IntlTest &test, const UnicodeString &name,
|
||||
Edits::Iterator ei1, Edits::Iterator ei2, // two equal iterators
|
||||
|
@ -23,6 +23,7 @@ void IntlTestDecimalFormatSymbols::runIndexedTest( int32_t index, UBool exec, co
|
||||
TESTCASE_AUTO_BEGIN;
|
||||
TESTCASE_AUTO(testSymbols);
|
||||
TESTCASE_AUTO(testLastResortData);
|
||||
TESTCASE_AUTO(testNumberingSystem);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
@ -248,6 +249,49 @@ void IntlTestDecimalFormatSymbols::testLastResortData() {
|
||||
Verify(1234567.25, "#,##0.##", *lastResort, "1,234,567.25");
|
||||
}
|
||||
|
||||
void IntlTestDecimalFormatSymbols::testNumberingSystem() {
|
||||
IcuTestErrorCode errorCode(*this, "testNumberingSystem");
|
||||
struct testcase {
|
||||
const char* locid;
|
||||
const char* nsname;
|
||||
const char16_t* expected1; // Expected number format string
|
||||
const char16_t* expected2; // Expected pattern separator
|
||||
};
|
||||
static const testcase cases[9] = {
|
||||
{"en", "latn", u"1,234.56", u";"},
|
||||
{"en", "arab", u"١٬٢٣٤٫٥٦", u"؛"},
|
||||
{"en", "mathsanb", u"𝟭,𝟮𝟯𝟰.𝟱𝟲", u";"},
|
||||
{"en", "mymr", u"၁,၂၃၄.၅၆", u";"},
|
||||
{"my", "latn", u"1,234.56", u";"},
|
||||
{"my", "arab", u"١٬٢٣٤٫٥٦", u"؛"},
|
||||
{"my", "mathsanb", u"𝟭,𝟮𝟯𝟰.𝟱𝟲", u";"},
|
||||
{"my", "mymr", u"၁,၂၃၄.၅၆", u"၊"},
|
||||
{"en@numbers=thai", "mymr", u"၁,၂၃၄.၅၆", u";"}, // conflicting numbering system
|
||||
};
|
||||
|
||||
for (int i=0; i<8; i++) {
|
||||
testcase cas = cases[i];
|
||||
Locale loc(cas.locid);
|
||||
LocalPointer<NumberingSystem> ns(NumberingSystem::createInstanceByName(cas.nsname, errorCode));
|
||||
if (errorCode.logDataIfFailureAndReset("NumberingSystem failed")) {
|
||||
return;
|
||||
}
|
||||
UnicodeString expected1(cas.expected1);
|
||||
UnicodeString expected2(cas.expected2);
|
||||
DecimalFormatSymbols dfs(loc, *ns, errorCode);
|
||||
if (errorCode.logDataIfFailureAndReset("DecimalFormatSymbols failed")) {
|
||||
return;
|
||||
}
|
||||
Verify(1234.56, "#,##0.##", dfs, expected1);
|
||||
// The pattern separator is something that differs by numbering system in my@numbers=mymr.
|
||||
UnicodeString actual2 = dfs.getSymbol(DecimalFormatSymbols::kPatternSeparatorSymbol);
|
||||
if (expected2 != actual2) {
|
||||
errln((UnicodeString)"ERROR: DecimalFormatSymbols returned pattern separator " + actual2
|
||||
+ " but we expected " + expected2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void IntlTestDecimalFormatSymbols::Verify(double value, const UnicodeString& pattern,
|
||||
const DecimalFormatSymbols &sym, const UnicodeString& expected){
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
@ -28,6 +28,7 @@ private:
|
||||
*/
|
||||
void testSymbols(/*char *par*/);
|
||||
void testLastResortData();
|
||||
void testNumberingSystem();
|
||||
|
||||
/** helper functions**/
|
||||
void Verify(double value, const UnicodeString& pattern,
|
||||
|
@ -67,35 +67,38 @@ UnicodeSetTest::~UnicodeSetTest() {
|
||||
void
|
||||
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
|
||||
const char* &name, char* /*par*/) {
|
||||
// if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
|
||||
switch (index) {
|
||||
CASE(0,TestPatterns);
|
||||
CASE(1,TestAddRemove);
|
||||
CASE(2,TestCategories);
|
||||
CASE(3,TestCloneEqualHash);
|
||||
CASE(4,TestMinimalRep);
|
||||
CASE(5,TestAPI);
|
||||
CASE(6,TestScriptSet);
|
||||
CASE(7,TestPropertySet);
|
||||
CASE(8,TestClone);
|
||||
CASE(9,TestExhaustive);
|
||||
CASE(10,TestToPattern);
|
||||
CASE(11,TestIndexOf);
|
||||
CASE(12,TestStrings);
|
||||
CASE(13,Testj2268);
|
||||
CASE(14,TestCloseOver);
|
||||
CASE(15,TestEscapePattern);
|
||||
CASE(16,TestInvalidCodePoint);
|
||||
CASE(17,TestSymbolTable);
|
||||
CASE(18,TestSurrogate);
|
||||
CASE(19,TestPosixClasses);
|
||||
CASE(20,TestIteration);
|
||||
CASE(21,TestFreezable);
|
||||
CASE(22,TestSpan);
|
||||
CASE(23,TestStringSpan);
|
||||
CASE(24,TestUCAUnsafeBackwards);
|
||||
default: name = ""; break;
|
||||
if (exec) {
|
||||
logln(u"TestSuite UnicodeSetTest");
|
||||
}
|
||||
TESTCASE_AUTO_BEGIN;
|
||||
TESTCASE_AUTO(TestPatterns);
|
||||
TESTCASE_AUTO(TestAddRemove);
|
||||
TESTCASE_AUTO(TestCategories);
|
||||
TESTCASE_AUTO(TestCloneEqualHash);
|
||||
TESTCASE_AUTO(TestMinimalRep);
|
||||
TESTCASE_AUTO(TestAPI);
|
||||
TESTCASE_AUTO(TestScriptSet);
|
||||
TESTCASE_AUTO(TestPropertySet);
|
||||
TESTCASE_AUTO(TestClone);
|
||||
TESTCASE_AUTO(TestExhaustive);
|
||||
TESTCASE_AUTO(TestToPattern);
|
||||
TESTCASE_AUTO(TestIndexOf);
|
||||
TESTCASE_AUTO(TestStrings);
|
||||
TESTCASE_AUTO(Testj2268);
|
||||
TESTCASE_AUTO(TestCloseOver);
|
||||
TESTCASE_AUTO(TestEscapePattern);
|
||||
TESTCASE_AUTO(TestInvalidCodePoint);
|
||||
TESTCASE_AUTO(TestSymbolTable);
|
||||
TESTCASE_AUTO(TestSurrogate);
|
||||
TESTCASE_AUTO(TestPosixClasses);
|
||||
TESTCASE_AUTO(TestIteration);
|
||||
TESTCASE_AUTO(TestFreezable);
|
||||
TESTCASE_AUTO(TestSpan);
|
||||
TESTCASE_AUTO(TestStringSpan);
|
||||
TESTCASE_AUTO(TestUCAUnsafeBackwards);
|
||||
TESTCASE_AUTO(TestIntOverflow);
|
||||
TESTCASE_AUTO(TestUnusedCcc);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
static const char NOT[] = "%%%%";
|
||||
@ -3925,3 +3928,41 @@ void UnicodeSetTest::TestUCAUnsafeBackwards() {
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestIntOverflow() {
|
||||
// This test triggers undefined double->int conversion behavior
|
||||
// if the implementation is not careful.
|
||||
IcuTestErrorCode errorCode(*this, "TestIntOverflow");
|
||||
UnicodeSet set(u"[:ccc=2222222222222222222:]", errorCode);
|
||||
assertTrue("[:ccc=int_overflow:] -> empty set", set.isEmpty());
|
||||
assertEquals("[:ccc=int_overflow:] -> illegal argument",
|
||||
U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestUnusedCcc() {
|
||||
// All numeric ccc values 0..255 are valid, but many are unused.
|
||||
IcuTestErrorCode errorCode(*this, "TestUnusedCcc");
|
||||
UnicodeSet ccc2(u"[:ccc=2:]", errorCode);
|
||||
assertSuccess("[:ccc=2:]", errorCode);
|
||||
assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
|
||||
|
||||
UnicodeSet ccc255(u"[:ccc=255:]", errorCode);
|
||||
assertSuccess("[:ccc=255:]", errorCode);
|
||||
assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
|
||||
|
||||
// Non-integer values and values outside 0..255 are invalid.
|
||||
UnicodeSet ccc_1(u"[:ccc=-1:]", errorCode);
|
||||
assertEquals("[:ccc=-1:] -> illegal argument",
|
||||
U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
|
||||
assertTrue("[:ccc=-1:] -> empty set", ccc_1.isEmpty());
|
||||
|
||||
UnicodeSet ccc256(u"[:ccc=256:]", errorCode);
|
||||
assertEquals("[:ccc=256:] -> illegal argument",
|
||||
U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
|
||||
assertTrue("[:ccc=256:] -> empty set", ccc256.isEmpty());
|
||||
|
||||
UnicodeSet ccc1_1(u"[:ccc=1.1:]", errorCode);
|
||||
assertEquals("[:ccc=1.1:] -> illegal argument",
|
||||
U_ILLEGAL_ARGUMENT_ERROR, errorCode.reset());
|
||||
assertTrue("[:ccc=1.1:] -> empty set", ccc1_1.isEmpty());
|
||||
}
|
||||
|
@ -91,6 +91,8 @@ private:
|
||||
void TestStringSpan();
|
||||
|
||||
void TestUCAUnsafeBackwards();
|
||||
void TestIntOverflow();
|
||||
void TestUnusedCcc();
|
||||
|
||||
private:
|
||||
|
||||
|
@ -116,7 +116,7 @@ LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14.
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
@ -5,23 +5,27 @@ License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
This directory contains the break iterator reference rule files used by intltest rbbi/RBBIMonkeyTest/testMonkey.
|
||||
The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpretted
|
||||
The rules in this directory track the boundary rules from Unicode UAX 14 and 29. They are interpreted
|
||||
to provide an expected set of boundary positions to compare with the results from ICU break iteration.
|
||||
|
||||
ICU4J also includes copies of the test reference rules, located in the directory
|
||||
main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/
|
||||
The copies should be kept synchronized; there should be no differences.
|
||||
|
||||
Each set of reference break rules lives in a separate file.
|
||||
The list of rule files to run by default is hardcoded into the test code, in rbbimonkeytest.cpp.
|
||||
The list of rule files to run by default is hard coded into the test code, in rbbimonkeytest.cpp.
|
||||
|
||||
Each test file includes
|
||||
- The type of ICU break interator to create (word, line, sentence, etc.)
|
||||
- The type of ICU break iterator to create (word, line, sentence, etc.)
|
||||
- The locale to use
|
||||
- Character Class definitions
|
||||
- Rule definitions
|
||||
|
||||
To Do
|
||||
- Syntax for tailoring.
|
||||
- Extend the syntax to support rule tailoring.
|
||||
|
||||
|
||||
Character Class Definition:
|
||||
Character Class Definition:
|
||||
name = set_regular_expression;
|
||||
|
||||
Rule Definition:
|
||||
@ -35,7 +39,7 @@ set_regular_expression:
|
||||
(They are mostly the same)
|
||||
May include previously defined set names, which are logically expanded in-place.
|
||||
|
||||
rule_regular_expresson:
|
||||
rule_regular_expression:
|
||||
An ICU Regular Expression.
|
||||
May include set names, which are logically expanded in-place.
|
||||
May include a '÷', which defines a boundary position.
|
||||
@ -52,7 +56,7 @@ Application of the rules:
|
||||
return the position of the '÷' within the match.
|
||||
else
|
||||
position = last character of the rule match.
|
||||
break from the rule loop, continue the outer loop.
|
||||
break from the inner rule loop, continue the outer loop.
|
||||
|
||||
This differs from the Unicode UAX algorithm in that each position in the text is
|
||||
not tested separately. Instead, when a rule match is found, rule application restarts with the last
|
||||
@ -66,7 +70,7 @@ Application of the rules:
|
||||
are with the Unicode UAX rules. With the main ICU break rules, all are applied in parallel.
|
||||
|
||||
Word Dictionaries
|
||||
The monkey test does not test dictionary based breaking. The set named 'dicitionary' is special,
|
||||
The monkey test does not test dictionary based breaking. The set named 'dictionary' is special,
|
||||
as it is in the main ICU rules. For the monkey test, no characters from the dictionary set are
|
||||
included in the randomly-generated test data.
|
||||
|
||||
|
@ -39,7 +39,7 @@ EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
EBG = [\p{Word_Break = EBG}];
|
||||
|
||||
#define dicitionary, with the effect being that those characters don't appear in test data.
|
||||
#define dictionary, with the effect being that those characters don't appear in test data.
|
||||
|
||||
Han = [:Han:];
|
||||
Hiragana = [:Hiragana:];
|
||||
@ -51,12 +51,7 @@ KanaKanji = [Han Hiragana Katakana];
|
||||
dictionaryCJK = [KanaKanji HangulSyllable];
|
||||
dictionary = [ComplexContext dictionaryCJK];
|
||||
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
# Tricky. Redfine a set.
|
||||
# For tailorings, if it modifies itself, do at end of sets ????
|
||||
# Tweak redefine to mean replace existing definition at its original location.
|
||||
# Insert defs without redefine just after last pre-existing def of that name.
|
||||
# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
|
||||
# leave dictionary scripts out of ALetter
|
||||
|
||||
ALetter = [ALetter - dictionary];
|
||||
|
||||
|
@ -38,7 +38,7 @@ EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
EBG = [\p{Word_Break = EBG}];
|
||||
|
||||
#define dicitionary, with the effect being that those characters don't appear in test data.
|
||||
#define dictionary, with the effect being that those characters don't appear in test data.
|
||||
|
||||
Han = [:Han:];
|
||||
Hiragana = [:Hiragana:];
|
||||
@ -50,12 +50,7 @@ KanaKanji = [Han Hiragana Katakana];
|
||||
dictionaryCJK = [KanaKanji HangulSyllable];
|
||||
dictionary = [ComplexContext dictionaryCJK];
|
||||
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
# Tricky. Redfine a set.
|
||||
# For tailorings, if it modifies itself, do at end of sets ????
|
||||
# Tweak redefine to mean replace existing definition at its original location.
|
||||
# Insert defs without redefine just after last pre-existing def of that name.
|
||||
# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
|
||||
# leave dictionary scripts out of ALetter
|
||||
|
||||
ALetter = [ALetter - dictionary];
|
||||
|
||||
|
@ -358,15 +358,14 @@ minIntegerDigits maxIntegerDigits minFractionDigits maxFractionDigits output bre
|
||||
0 0 1 0 2.99792458E8 KS
|
||||
// JDK and S give .2998E9
|
||||
0 0 0 4 2.998E8 KSQ
|
||||
// S correctly formats this as 29.979246E7.
|
||||
// JDK uses 8 + 6 for significant digits instead of 2 + 6
|
||||
// J and C return 2.9979246E8.
|
||||
// TODO: Merge trunk
|
||||
2 8 1 6 29.979246E7 CJKQ
|
||||
// Context: #13289
|
||||
2 8 1 6 2.9979246E8 K
|
||||
// Treat max int digits > 8 as being the same as min int digits.
|
||||
// This behavior is not spelled out in the specification.
|
||||
// JDK fails here because it tries to use 9 + 6 = 15 sig digits.
|
||||
2 9 1 6 29.979246E7 K
|
||||
// C and J get 29.979246E7
|
||||
2 9 1 6 2.9979246E8 CJK
|
||||
|
||||
test significant digits scientific
|
||||
set locale en
|
||||
|
62
icu4c/source/test/testdata/rbbitst.txt
vendored
62
icu4c/source/test/testdata/rbbitst.txt
vendored
@ -14,7 +14,9 @@
|
||||
# <sent> any following data is for sentence break testing
|
||||
# <line> any following data is for line break testing
|
||||
# <char> any following data is for char break testing
|
||||
# <locale local_name> Switch to the named locale at the next occurence of <word>, <sent>, etc.
|
||||
# <rules> rules ... </rules> following data is tested against these rules.
|
||||
# Applies until a following occurrence of <word>, <sent>, etc. or another <rules>
|
||||
# <locale locale_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
|
||||
# <data> ... </data> test data. May span multiple lines.
|
||||
# <> Break position, status == 0
|
||||
# • Break position, status == 0 (Bullet, \u2022)
|
||||
@ -37,8 +39,17 @@
|
||||
# Temp debugging tests
|
||||
<locale en>
|
||||
<word>
|
||||
<data><0>1•2•3•4•</data>
|
||||
# <data><0>ク<400>ライアン<400>トサーバー<400></data>
|
||||
<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
|
||||
コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
|
||||
よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
|
||||
何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
|
||||
んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
|
||||
すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
|
||||
が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
|
||||
の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
|
||||
。<0></data>
|
||||
|
||||
#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
|
||||
|
||||
## FILTERED BREAK TESTS
|
||||
|
||||
@ -1308,3 +1319,48 @@ Bangkok)•</data>
|
||||
<data>•\U0001F468\u200D\u2695\uFE0F•\U0001F468\u200D\u2695•\U0001F468\U0001F3FD\u200D\u2695\uFE0F•\U0001F468\U0001F3FD\u200D\u2695\u0020•</data>
|
||||
# woman astronaut, woman astronaut / fitz4
|
||||
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Test rule status values
|
||||
#
|
||||
####################################################################################
|
||||
<rules> $Letters = [:L:];
|
||||
$Numbers = [:N:];
|
||||
$Letters+{1};
|
||||
$Numbers+{2};
|
||||
Help\ me\!{4};
|
||||
[^$Letters $Numbers];
|
||||
!.*;
|
||||
</rules>
|
||||
<data>•abc<1>123<2>.•.•abc<1> •Help<1> •me<1> •Help me!<4></data>
|
||||
|
||||
# Test option to prohibit unquoted literals.
|
||||
|
||||
<rules>
|
||||
!!forward;
|
||||
Hello\ World;
|
||||
!!reverse;
|
||||
.*;
|
||||
</rules>
|
||||
<data>•Hello World•</data>
|
||||
|
||||
<badrules>
|
||||
!!quoted_literals_only;
|
||||
!!forward;
|
||||
Hello\ World;
|
||||
!!reverse;
|
||||
.*;
|
||||
</badrules>
|
||||
|
||||
<rules>
|
||||
#TODO: uncomment this line when quoted_literals_only is implemented.
|
||||
#!!quoted_literals_only;
|
||||
!!forward;
|
||||
'Hello World';
|
||||
!!reverse;
|
||||
.*;
|
||||
</rules>
|
||||
<data>•Hello World•</data>
|
||||
|
||||
|
@ -61,6 +61,7 @@ enum {
|
||||
OUTPUT_FILENAME,
|
||||
UNICODE_VERSION,
|
||||
WRITE_C_SOURCE,
|
||||
WRITE_COMBINED_DATA,
|
||||
OPT_FAST
|
||||
};
|
||||
|
||||
@ -73,6 +74,7 @@ static UOption options[]={
|
||||
UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
|
||||
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
|
||||
UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
|
||||
UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
|
||||
UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
|
||||
};
|
||||
|
||||
@ -96,17 +98,22 @@ main(int argc, char* argv[]) {
|
||||
if( argc<2 ||
|
||||
options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
|
||||
) {
|
||||
/*
|
||||
* Broken into chunks because the C89 standard says the minimum
|
||||
* required supported string length is 509 bytes.
|
||||
*/
|
||||
fprintf(stderr,
|
||||
"Usage: %s [-options] infiles+ -o outputfilename\n"
|
||||
"\n"
|
||||
"Reads the infiles with normalization data and\n"
|
||||
"creates a binary or C source file (outputfilename) with the data.\n"
|
||||
"creates a binary file, or a C source file (--csource), with the data,\n"
|
||||
"or writes a data file with the combined data (--combined).\n"
|
||||
"See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
|
||||
"\n"
|
||||
"Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
|
||||
"\n"
|
||||
"Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
|
||||
"in input-file syntax to the outputfilename.\n"
|
||||
"It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
|
||||
"(Useful for computing minimal incremental mapping data files.)\n"
|
||||
"\n",
|
||||
argv[0]);
|
||||
argv[0], argv[0]);
|
||||
fprintf(stderr,
|
||||
"Options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
@ -116,7 +123,9 @@ main(int argc, char* argv[]) {
|
||||
fprintf(stderr,
|
||||
"\t-s or --sourcedir source directory, followed by the path\n"
|
||||
"\t-o or --output output filename\n"
|
||||
"\t --csource writes a C source file with initializers\n");
|
||||
"\t --csource writes a C source file with initializers\n"
|
||||
"\t --combined writes a .txt file (input-file syntax) with the\n"
|
||||
"\t combined data from all of the input files\n");
|
||||
fprintf(stderr,
|
||||
"\t --fast optimize the data for fast normalization,\n"
|
||||
"\t which might increase its size (Writes fully decomposed\n"
|
||||
@ -144,7 +153,10 @@ main(int argc, char* argv[]) {
|
||||
|
||||
#else
|
||||
|
||||
LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
|
||||
LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
|
||||
LocalPointer<Normalizer2DataBuilder> b2;
|
||||
LocalPointer<Normalizer2DataBuilder> diff;
|
||||
Normalizer2DataBuilder *builder = b1.getAlias();
|
||||
errorCode.assertSuccess();
|
||||
|
||||
if(options[UNICODE_VERSION].doesOccur) {
|
||||
@ -166,8 +178,29 @@ main(int argc, char* argv[]) {
|
||||
pathLength=filename.length();
|
||||
}
|
||||
|
||||
bool doMinus = false;
|
||||
for(int i=1; i<argc; ++i) {
|
||||
printf("gennorm2: processing %s\n", argv[i]);
|
||||
if(strcmp(argv[i], "minus") == 0) {
|
||||
if(doMinus) {
|
||||
fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
|
||||
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
// Data from previous input files has been collected in b1.
|
||||
// Collect data from further input files in b2.
|
||||
b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
|
||||
diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
|
||||
errorCode.assertSuccess();
|
||||
builder = b2.getAlias();
|
||||
if(options[UNICODE_VERSION].doesOccur) {
|
||||
builder->setUnicodeVersion(options[UNICODE_VERSION].value);
|
||||
}
|
||||
if(options[OPT_FAST].doesOccur) {
|
||||
builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
|
||||
}
|
||||
doMinus = true;
|
||||
continue;
|
||||
}
|
||||
filename.append(argv[i], errorCode);
|
||||
LocalStdioFilePointer f(fopen(filename.data(), "r"));
|
||||
if(f==NULL) {
|
||||
@ -179,7 +212,12 @@ main(int argc, char* argv[]) {
|
||||
filename.truncate(pathLength);
|
||||
}
|
||||
|
||||
if(options[WRITE_C_SOURCE].doesOccur) {
|
||||
if(doMinus) {
|
||||
Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
|
||||
diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
|
||||
} else if(options[WRITE_COMBINED_DATA].doesOccur) {
|
||||
builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
|
||||
} else if(options[WRITE_C_SOURCE].doesOccur) {
|
||||
builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
|
||||
} else {
|
||||
builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
|
||||
|
@ -30,7 +30,9 @@
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/usetiter.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "charstr.h"
|
||||
#include "extradata.h"
|
||||
@ -146,6 +148,7 @@ void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
|
||||
|
||||
// Stores cc on the Norm record that norms.createNorm(c) returns,
// and adds c to norms.ccSet.
void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
    Norm *norm = norms.createNorm(c);
    norm->cc = cc;
    norms.ccSet.add(c);
}
|
||||
|
||||
static UBool isWellFormed(const UnicodeString &s) {
|
||||
@ -166,6 +169,7 @@ void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m)
|
||||
p->mapping=new UnicodeString(m);
|
||||
p->mappingType=Norm::ONE_WAY;
|
||||
p->setMappingCP();
|
||||
norms.mappingSet.add(c);
|
||||
}
|
||||
|
||||
void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
|
||||
@ -195,12 +199,14 @@ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString
|
||||
p->mapping=new UnicodeString(m);
|
||||
p->mappingType=Norm::ROUND_TRIP;
|
||||
p->mappingCP=U_SENTINEL;
|
||||
norms.mappingSet.add(c);
|
||||
}
|
||||
|
||||
// Marks c as having its mapping removed (Norm::REMOVED)
// and adds c to norms.mappingSet.
void Normalizer2DataBuilder::removeMapping(UChar32 c) {
    // Use createNorm(c) rather than getNorm(c): the non-mapping has to be
    // recorded, and conflicting data has to be detected.
    Norm *norm = norms.createNorm(c);
    norm = checkNormForMapping(norm, c);
    norm->mappingType = Norm::REMOVED;
    norms.mappingSet.add(c);
}
|
||||
|
||||
UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
|
||||
@ -832,6 +838,198 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Null-tolerant string comparison: true if both pointers are null,
// or if both are non-null and the pointed-to strings compare equal.
bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
    if(s1 == nullptr || s2 == nullptr) {
        // At least one side is missing; equal only if both are.
        return s1 == s2;
    }
    return *s1 == *s2;
}
|
||||
|
||||
const char *typeChars = "?-=>";
|
||||
|
||||
void writeMapping(FILE *f, const UnicodeString *m) {
|
||||
if(m != nullptr && !m->isEmpty()) {
|
||||
int32_t i = 0;
|
||||
UChar32 c = m->char32At(i);
|
||||
fprintf(f, "%04lX", (long)c);
|
||||
while((i += U16_LENGTH(c)) < m->length()) {
|
||||
c = m->char32At(i);
|
||||
fprintf(f, " %04lX", (long)c);
|
||||
}
|
||||
}
|
||||
fputs("\n", f);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void
|
||||
Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
|
||||
// Do not processData() before writing the input-syntax data file.
|
||||
FILE *f = fopen(filename, "w");
|
||||
if(f == nullptr) {
|
||||
fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
|
||||
filename);
|
||||
exit(U_FILE_ACCESS_ERROR);
|
||||
return;
|
||||
}
|
||||
|
||||
if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
|
||||
unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
|
||||
char uv[U_MAX_VERSION_STRING_LENGTH];
|
||||
u_versionToString(unicodeVersion, uv);
|
||||
fprintf(f, "* Unicode %s\n\n", uv);
|
||||
}
|
||||
|
||||
UnicodeSetIterator ccIter(norms.ccSet);
|
||||
UChar32 start = U_SENTINEL;
|
||||
UChar32 end = U_SENTINEL;
|
||||
uint8_t prevCC = 0;
|
||||
bool done = false;
|
||||
bool didWrite = false;
|
||||
do {
|
||||
UChar32 c;
|
||||
uint8_t cc;
|
||||
if(ccIter.next() && !ccIter.isString()) {
|
||||
c = ccIter.getCodepoint();
|
||||
cc = norms.getCC(c);
|
||||
} else {
|
||||
c = 0x110000;
|
||||
cc = 0;
|
||||
done = true;
|
||||
}
|
||||
if(cc == prevCC && c == (end + 1)) {
|
||||
end = c;
|
||||
} else {
|
||||
if(prevCC != 0) {
|
||||
if(start == end) {
|
||||
fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
|
||||
} else {
|
||||
fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
|
||||
}
|
||||
didWrite = true;
|
||||
}
|
||||
start = end = c;
|
||||
prevCC = cc;
|
||||
}
|
||||
} while(!done);
|
||||
if(didWrite) {
|
||||
fputs("\n", f);
|
||||
}
|
||||
|
||||
UnicodeSetIterator mIter(norms.mappingSet);
|
||||
start = U_SENTINEL;
|
||||
end = U_SENTINEL;
|
||||
const UnicodeString *prevMapping = nullptr;
|
||||
Norm::MappingType prevType = Norm::NONE;
|
||||
done = false;
|
||||
do {
|
||||
UChar32 c;
|
||||
const Norm *norm;
|
||||
if(mIter.next() && !mIter.isString()) {
|
||||
c = mIter.getCodepoint();
|
||||
norm = norms.getNorm(c);
|
||||
} else {
|
||||
c = 0x110000;
|
||||
norm = nullptr;
|
||||
done = true;
|
||||
}
|
||||
const UnicodeString *mapping;
|
||||
Norm::MappingType type;
|
||||
if(norm == nullptr) {
|
||||
mapping = nullptr;
|
||||
type = Norm::NONE;
|
||||
} else {
|
||||
type = norm->mappingType;
|
||||
if(type == Norm::NONE) {
|
||||
mapping = nullptr;
|
||||
} else {
|
||||
mapping = norm->mapping;
|
||||
}
|
||||
}
|
||||
if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
|
||||
end = c;
|
||||
} else {
|
||||
if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
|
||||
if(start == end) {
|
||||
fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
|
||||
} else {
|
||||
fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
|
||||
}
|
||||
writeMapping(f, prevMapping);
|
||||
}
|
||||
start = end = c;
|
||||
prevMapping = mapping;
|
||||
prevType = type;
|
||||
}
|
||||
} while(!done);
|
||||
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
void
|
||||
Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
|
||||
const Normalizer2DataBuilder &b2,
|
||||
Normalizer2DataBuilder &diff) {
|
||||
// Compute diff = b1 - b2
|
||||
// so that we should be able to get b1 = b2 + diff.
|
||||
if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
|
||||
memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
|
||||
}
|
||||
|
||||
UnicodeSet ccSet(b1.norms.ccSet);
|
||||
ccSet.addAll(b2.norms.ccSet);
|
||||
UnicodeSetIterator ccIter(ccSet);
|
||||
while(ccIter.next() && !ccIter.isString()) {
|
||||
UChar32 c = ccIter.getCodepoint();
|
||||
uint8_t cc1 = b1.norms.getCC(c);
|
||||
uint8_t cc2 = b2.norms.getCC(c);
|
||||
if(cc1 != cc2) {
|
||||
diff.setCC(c, cc1);
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeSet mSet(b1.norms.mappingSet);
|
||||
mSet.addAll(b2.norms.mappingSet);
|
||||
UnicodeSetIterator mIter(mSet);
|
||||
while(mIter.next() && !mIter.isString()) {
|
||||
UChar32 c = mIter.getCodepoint();
|
||||
const Norm *norm1 = b1.norms.getNorm(c);
|
||||
const Norm *norm2 = b2.norms.getNorm(c);
|
||||
const UnicodeString *mapping1;
|
||||
Norm::MappingType type1;
|
||||
if(norm1 == nullptr || !norm1->hasMapping()) {
|
||||
mapping1 = nullptr;
|
||||
type1 = Norm::NONE;
|
||||
} else {
|
||||
mapping1 = norm1->mapping;
|
||||
type1 = norm1->mappingType;
|
||||
}
|
||||
const UnicodeString *mapping2;
|
||||
Norm::MappingType type2;
|
||||
if(norm2 == nullptr || !norm2->hasMapping()) {
|
||||
mapping2 = nullptr;
|
||||
type2 = Norm::NONE;
|
||||
} else {
|
||||
mapping2 = norm2->mapping;
|
||||
type2 = norm2->mappingType;
|
||||
}
|
||||
if(type1 == type2 && equalStrings(mapping1, mapping2)) {
|
||||
// Nothing to do.
|
||||
} else if(type1 == Norm::NONE) {
|
||||
diff.removeMapping(c);
|
||||
} else if(type1 == Norm::ROUND_TRIP) {
|
||||
diff.setRoundTripMapping(c, *mapping1);
|
||||
} else if(type1 == Norm::ONE_WAY) {
|
||||
diff.setOneWayMapping(c, *mapping1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
@ -63,6 +63,11 @@ public:
|
||||
|
||||
void writeBinaryFile(const char *filename);
|
||||
void writeCSourceFile(const char *filename);
|
||||
void writeDataFile(const char *filename, bool writeRemoved) const;
|
||||
|
||||
static void computeDiff(const Normalizer2DataBuilder &b1,
|
||||
const Normalizer2DataBuilder &b2,
|
||||
Normalizer2DataBuilder &diff);
|
||||
|
||||
private:
|
||||
friend class Norm16Writer;
|
||||
|
@ -15,6 +15,7 @@
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "normalizer2impl.h"
|
||||
@ -183,6 +184,8 @@ public:
|
||||
|
||||
void enumRanges(Enumerator &e);
|
||||
|
||||
UnicodeSet ccSet, mappingSet;
|
||||
|
||||
private:
|
||||
Norms(const Norms &other) = delete;
|
||||
Norms &operator=(const Norms &other) = delete;
|
||||
|
@ -33,6 +33,7 @@
|
||||
#include "uhash.h"
|
||||
#include "uresimp.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf8.h"
|
||||
|
||||
void res_write_java(struct SResource *res,UErrorCode *status);
|
||||
|
||||
@ -244,7 +245,8 @@ str_write_java(const UChar *src, int32_t srcLen, UBool printEndLine, UErrorCode
|
||||
memset(buf,0,length);
|
||||
|
||||
bufLen = uCharsToChars(buf,length,src,srcLen,status);
|
||||
|
||||
// buflen accounts for extra bytes added due to multi byte encoding of
|
||||
// non ASCII characters
|
||||
if(printEndLine)
|
||||
write_tabs(out);
|
||||
|
||||
@ -284,10 +286,22 @@ str_write_java(const UChar *src, int32_t srcLen, UBool printEndLine, UErrorCode
|
||||
}
|
||||
}
|
||||
T_FileStream_write(out,"\"",1);
|
||||
uint32_t byteIndex = 0;
|
||||
uint32_t trailBytes = 0;
|
||||
if(len+add<bufLen){
|
||||
// check the trail bytes to be added to the output line
|
||||
while (byteIndex < add) {
|
||||
if (U8_IS_LEAD(*(current + byteIndex))) {
|
||||
trailBytes = U8_COUNT_TRAIL_BYTES(*(current + byteIndex));
|
||||
add += trailBytes;
|
||||
}
|
||||
byteIndex++;
|
||||
}
|
||||
T_FileStream_write(out,current,add);
|
||||
T_FileStream_write(out,"\" +\n",4);
|
||||
write_tabs(out);
|
||||
if (len + add < bufLen) {
|
||||
T_FileStream_write(out,"\" +\n",4);
|
||||
write_tabs(out);
|
||||
}
|
||||
}else{
|
||||
T_FileStream_write(out,current,bufLen-len);
|
||||
}
|
||||
@ -437,9 +451,7 @@ bytes_write_java(const BinaryResource *res, UErrorCode * /*status*/) {
|
||||
char byteBuffer[100] = { 0 };
|
||||
uint8_t* byteArray = NULL;
|
||||
int byteIterator = 0;
|
||||
|
||||
int32_t srcLen=res->fLength;
|
||||
|
||||
if(srcLen>0 )
|
||||
{
|
||||
byteArray = res->fData;
|
||||
|
@ -149,6 +149,11 @@ public class ScientificFormat extends Format.BeforeFormat implements Rounder.Mul
|
||||
// (see #13118). Note that the bound 8 on integer digits is historic.
|
||||
int _maxInt = properties.getMaximumIntegerDigits();
|
||||
int _minInt = properties.getMinimumIntegerDigits();
|
||||
// Bug #13289: if maxInt > minInt > 1, then minInt should be 1 for the
|
||||
// purposes of engineering notation.
|
||||
if (_maxInt > _minInt && _minInt > 1) {
|
||||
_minInt = 1;
|
||||
}
|
||||
minInt = _minInt < 0 ? 0 : _minInt >= 8 ? 1 : _minInt;
|
||||
maxInt = _maxInt < _minInt ? _minInt : _maxInt >= 8 ? _minInt : _maxInt;
|
||||
assert 0 <= minInt && minInt <= maxInt && maxInt < 8;
|
||||
|
@ -53,7 +53,7 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public DecimalFormatSymbols() {
|
||||
initialize(ULocale.getDefault(Category.FORMAT));
|
||||
this(ULocale.getDefault(Category.FORMAT));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -62,7 +62,7 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public DecimalFormatSymbols(Locale locale) {
|
||||
initialize(ULocale.forLocale(locale));
|
||||
this(ULocale.forLocale(locale));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -71,7 +71,15 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
|
||||
* @stable ICU 3.2
|
||||
*/
|
||||
public DecimalFormatSymbols(ULocale locale) {
|
||||
initialize(locale);
|
||||
initialize(locale, null);
|
||||
}
|
||||
|
||||
/**
 * Builds symbols for a JDK {@link Locale} together with an explicit numbering system.
 * The JDK locale is converted with {@code ULocale.forLocale} and the work is handed
 * to the {@code (ULocale, NumberingSystem)} constructor.
 *
 * @param locale the JDK locale.
 * @param ns the numbering system, forwarded unchanged.
 */
private DecimalFormatSymbols(Locale locale, NumberingSystem ns) {
    this(ULocale.forLocale(locale), ns);
}
|
||||
|
||||
/**
 * Builds symbols for an ICU locale together with an explicit numbering system
 * by passing both arguments straight to {@code initialize}.
 *
 * @param locale the locale.
 * @param ns the numbering system, forwarded unchanged.
 */
private DecimalFormatSymbols(ULocale locale, NumberingSystem ns) {
    initialize(locale, ns);
}
|
||||
|
||||
/**
|
||||
@ -123,6 +131,46 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
|
||||
return new DecimalFormatSymbols(locale);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@icu} Returns a DecimalFormatSymbols instance for the given locale with digits and symbols
|
||||
* corresponding to the given {@link NumberingSystem}.
|
||||
*
|
||||
* <p>This method behaves equivalently to {@link #getInstance} called with a locale having a
|
||||
* "numbers=xxxx" keyword specifying the numbering system by name.
|
||||
*
|
||||
* <p>In this method, the NumberingSystem argument will be used even if the locale has its own
|
||||
* "numbers=xxxx" keyword.
|
||||
*
|
||||
* @param locale the locale.
|
||||
* @param ns the numbering system.
|
||||
* @return A DecimalFormatSymbols instance.
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
* @draft ICU 60
|
||||
*/
|
||||
public static DecimalFormatSymbols forNumberingSystem(Locale locale, NumberingSystem ns) {
|
||||
return new DecimalFormatSymbols(locale, ns);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@icu} Returns a DecimalFormatSymbols instance for the given locale with digits and symbols
|
||||
* corresponding to the given {@link NumberingSystem}.
|
||||
*
|
||||
* <p>This method behaves equivalently to {@link #getInstance} called with a locale having a
|
||||
* "numbers=xxxx" keyword specifying the numbering system by name.
|
||||
*
|
||||
* <p>In this method, the NumberingSystem argument will be used even if the locale has its own
|
||||
* "numbers=xxxx" keyword.
|
||||
*
|
||||
* @param locale the locale.
|
||||
* @param ns the numbering system.
|
||||
* @return A DecimalFormatSymbols instance.
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
* @draft ICU 60
|
||||
*/
|
||||
public static DecimalFormatSymbols forNumberingSystem(ULocale locale, NumberingSystem ns) {
|
||||
return new DecimalFormatSymbols(locale, ns);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of all locales for which the <code>getInstance</code> methods of
|
||||
* this class can return localized instances.
|
||||
@ -1336,10 +1384,16 @@ public class DecimalFormatSymbols implements Cloneable, Serializable {
|
||||
/**
|
||||
* Initializes the symbols from the locale data.
|
||||
*/
|
||||
private void initialize( ULocale locale ) {
|
||||
private void initialize(ULocale locale, NumberingSystem ns) {
|
||||
this.requestedLocale = locale.toLocale();
|
||||
this.ulocale = locale;
|
||||
CacheData data = cachedLocaleData.getInstance(locale, null /* unused */);
|
||||
|
||||
// TODO: The cache requires a single key, so we just save the NumberingSystem into the
|
||||
// locale string. NumberingSystem is then decoded again in the loadData() method. It would
|
||||
// be more efficient if we didn't have to serialize and deserialize the NumberingSystem.
|
||||
ULocale keyLocale = (ns == null) ? locale : locale.setKeywordValue("numbers", ns.getName());
|
||||
CacheData data = cachedLocaleData.getInstance(keyLocale, null /* unused */);
|
||||
|
||||
setLocale(data.validLocale, data.validLocale);
|
||||
setDigitStrings(data.digits);
|
||||
String[] numberElements = data.numberElements;
|
||||
|
@ -409,12 +409,7 @@ public final class Edits {
|
||||
spanStart = destIndex;
|
||||
spanLength = newLength_;
|
||||
}
|
||||
// If we are at the start or limit of an empty span, then we search from
|
||||
// the start of the string so that we always return
|
||||
// the first of several consecutive empty spans, for consistent results.
|
||||
// We do not currently track the properties of the previous span,
|
||||
// so for now we always reset if we are at the start of the current span.
|
||||
if (i <= spanStart) {
|
||||
if (i < spanStart) {
|
||||
// Reset the iterator to the start.
|
||||
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
|
||||
} else if (i < (spanStart + spanLength)) {
|
||||
@ -429,8 +424,8 @@ public final class Edits {
|
||||
spanStart = destIndex;
|
||||
spanLength = newLength_;
|
||||
}
|
||||
if (i == spanStart || i < (spanStart + spanLength)) {
|
||||
// The index is in the current span, or at an empty one.
|
||||
if (i < (spanStart + spanLength)) {
|
||||
// The index is in the current span.
|
||||
return 0;
|
||||
}
|
||||
if (remaining > 0) {
|
||||
@ -615,4 +610,167 @@ public final class Edits {
|
||||
public Iterator getFineIterator() {
|
||||
return new Iterator(array, length, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges the two input Edits and appends the result to this object.
|
||||
*
|
||||
* <p>Consider two string transformations (for example, normalization and case mapping)
|
||||
* where each records Edits in addition to writing an output string.<br>
|
||||
* Edits ab reflect how substrings of input string a
|
||||
* map to substrings of intermediate string b.<br>
|
||||
* Edits bc reflect how substrings of intermediate string b
|
||||
* map to substrings of output string c.<br>
|
||||
* This function merges ab and bc such that the additional edits
|
||||
* recorded in this object reflect how substrings of input string a
|
||||
* map to substrings of output string c.
|
||||
*
|
||||
* <p>If unrelated Edits are passed in where the output string of the first
|
||||
* has a different length than the input string of the second,
|
||||
* then an IllegalArgumentException is thrown.
|
||||
*
|
||||
* @param ab reflects how substrings of input string a
|
||||
* map to substrings of intermediate string b.
|
||||
* @param bc reflects how substrings of intermediate string b
|
||||
* map to substrings of output string c.
|
||||
* @return this, with the merged edits appended
|
||||
* @draft ICU 60
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Edits mergeAndAppend(Edits ab, Edits bc) {
|
||||
// Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
|
||||
// Parallel iteration over both Edits.
|
||||
Iterator abIter = ab.getFineIterator();
|
||||
Iterator bcIter = bc.getFineIterator();
|
||||
boolean abHasNext = true, bcHasNext = true;
|
||||
// Copy iterator state into local variables, so that we can modify and subdivide spans.
|
||||
// ab old & new length, bc old & new length
|
||||
int aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
|
||||
// When we have different-intermediate-length changes, we accumulate a larger change.
|
||||
int pending_aLength = 0, pending_cLength = 0;
|
||||
for (;;) {
|
||||
// At this point, for each of the two iterators:
|
||||
// Either we are done with the locally cached current edit,
|
||||
// and its intermediate-string length has been reset,
|
||||
// or we will continue to work with a truncated remainder of this edit.
|
||||
//
|
||||
// If the current edit is done, and the iterator has not yet reached the end,
|
||||
// then we fetch the next edit. This is true for at least one of the iterators.
|
||||
//
|
||||
// Normally it does not matter whether we fetch from ab and then bc or vice versa.
|
||||
// However, the result is observably different when
|
||||
// ab deletions meet bc insertions at the same intermediate-string index.
|
||||
// Some users expect the bc insertions to come first, so we fetch from bc first.
|
||||
if (bc_bLength == 0) {
|
||||
if (bcHasNext && (bcHasNext = bcIter.next())) {
|
||||
bc_bLength = bcIter.oldLength();
|
||||
cLength = bcIter.newLength();
|
||||
if (bc_bLength == 0) {
|
||||
// insertion
|
||||
if (ab_bLength == 0 || !abIter.hasChange()) {
|
||||
addReplace(pending_aLength, pending_cLength + cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
} else {
|
||||
pending_cLength += cLength;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// else see if the other iterator is done, too.
|
||||
}
|
||||
if (ab_bLength == 0) {
|
||||
if (abHasNext && (abHasNext = abIter.next())) {
|
||||
aLength = abIter.oldLength();
|
||||
ab_bLength = abIter.newLength();
|
||||
if (ab_bLength == 0) {
|
||||
// deletion
|
||||
if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
|
||||
addReplace(pending_aLength + aLength, pending_cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
} else {
|
||||
pending_aLength += aLength;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
} else if (bc_bLength == 0) {
|
||||
// Both iterators are done at the same time:
|
||||
// The intermediate-string lengths match.
|
||||
break;
|
||||
} else {
|
||||
throw new IllegalArgumentException(
|
||||
"The ab output string is shorter than the bc input string.");
|
||||
}
|
||||
}
|
||||
if (bc_bLength == 0) {
|
||||
throw new IllegalArgumentException(
|
||||
"The bc input string is shorter than the ab output string.");
|
||||
}
|
||||
// Done fetching: ab_bLength > 0 && bc_bLength > 0
|
||||
|
||||
// The current state has two parts:
|
||||
// - Past: We accumulate a longer ac edit in the "pending" variables.
|
||||
// - Current: We have copies of the current ab/bc edits in local variables.
|
||||
// At least one side is newly fetched.
|
||||
// One side might be a truncated remainder of an edit we fetched earlier.
|
||||
|
||||
if (!abIter.hasChange() && !bcIter.hasChange()) {
|
||||
// An unchanged span all the way from string a to string c.
|
||||
if (pending_aLength != 0 || pending_cLength != 0) {
|
||||
addReplace(pending_aLength, pending_cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
}
|
||||
int unchangedLength = aLength <= cLength ? aLength : cLength;
|
||||
addUnchanged(unchangedLength);
|
||||
ab_bLength = aLength -= unchangedLength;
|
||||
bc_bLength = cLength -= unchangedLength;
|
||||
// At least one of the unchanged spans is now empty.
|
||||
continue;
|
||||
}
|
||||
if (!abIter.hasChange() && bcIter.hasChange()) {
|
||||
// Unchanged a->b but changed b->c.
|
||||
if (ab_bLength >= bc_bLength) {
|
||||
// Split the longer unchanged span into change + remainder.
|
||||
addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
aLength = ab_bLength -= bc_bLength;
|
||||
bc_bLength = 0;
|
||||
continue;
|
||||
}
|
||||
// Handle the shorter unchanged span below like a change.
|
||||
} else if (abIter.hasChange() && !bcIter.hasChange()) {
|
||||
// Changed a->b and then unchanged b->c.
|
||||
if (ab_bLength <= bc_bLength) {
|
||||
// Split the longer unchanged span into change + remainder.
|
||||
addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
cLength = bc_bLength -= ab_bLength;
|
||||
ab_bLength = 0;
|
||||
continue;
|
||||
}
|
||||
// Handle the shorter unchanged span below like a change.
|
||||
} else { // both abIter.hasChange() && bcIter.hasChange()
|
||||
if (ab_bLength == bc_bLength) {
|
||||
// Changes on both sides up to the same position. Emit & reset.
|
||||
addReplace(pending_aLength + aLength, pending_cLength + cLength);
|
||||
pending_aLength = pending_cLength = 0;
|
||||
ab_bLength = bc_bLength = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Accumulate the a->c change, reset the shorter side,
|
||||
// keep a remainder of the longer one.
|
||||
pending_aLength += aLength;
|
||||
pending_cLength += cLength;
|
||||
if (ab_bLength < bc_bLength) {
|
||||
bc_bLength -= ab_bLength;
|
||||
cLength = ab_bLength = 0;
|
||||
} else { // ab_bLength > bc_bLength
|
||||
ab_bLength -= bc_bLength;
|
||||
aLength = bc_bLength = 0;
|
||||
}
|
||||
}
|
||||
if (pending_aLength != 0 || pending_cLength != 0) {
|
||||
addReplace(pending_aLength, pending_cLength);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
@ -1952,7 +1952,7 @@ public class RuleBasedNumberFormat extends NumberFormat {
|
||||
// position of 0 and the number being formatted) to the rule set
|
||||
// for formatting
|
||||
StringBuilder result = new StringBuilder();
|
||||
if (getRoundingMode() != BigDecimal.ROUND_UNNECESSARY) {
|
||||
if (getRoundingMode() != BigDecimal.ROUND_UNNECESSARY && !Double.isNaN(number) && !Double.isInfinite(number)) {
|
||||
// We convert to a string because BigDecimal insists on excessive precision.
|
||||
number = new BigDecimal(Double.toString(number)).setScale(getMaximumFractionDigits(), roundingMode).doubleValue();
|
||||
}
|
||||
|
@ -3443,7 +3443,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
checkFrozen();
|
||||
int p;
|
||||
int v;
|
||||
boolean mustNotBeEmpty = false, invert = false;
|
||||
boolean invert = false;
|
||||
|
||||
if (symbols != null
|
||||
&& (symbols instanceof XSymbolTable)
|
||||
@ -3476,10 +3476,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
|
||||
p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
|
||||
v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias));
|
||||
// If the resultant set is empty then the numeric value
|
||||
// was invalid.
|
||||
//mustNotBeEmpty = true;
|
||||
// old code was wrong; anything between 0 and 255 is valid even if unused.
|
||||
// Anything between 0 and 255 is valid even if unused.
|
||||
if (v < 0 || v > 255) throw e;
|
||||
} else {
|
||||
throw e;
|
||||
@ -3580,12 +3577,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
complement();
|
||||
}
|
||||
|
||||
if (mustNotBeEmpty && isEmpty()) {
|
||||
// mustNotBeEmpty is set to true if an empty set indicates
|
||||
// invalid input.
|
||||
throw new IllegalArgumentException("Invalid property value");
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f0d65ed59329e1eaae1813db0fa8e1236a3b58ddfa5e7e1ff33d4bea7eef3c31
|
||||
size 12226292
|
||||
oid sha256:193787da8cd2caebf1901892beccad07f8e7f3c714ef482681784bc583be5c60
|
||||
size 12226288
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:79b0c13215014e21a95869ccbac72d191485436cae6f26a2f96622a4268c1a82
|
||||
oid sha256:ca79a3355cea5666551889ce8ff3703987162a35937a292d80284519f2b68286
|
||||
size 92486
|
||||
|
@ -358,15 +358,14 @@ minIntegerDigits maxIntegerDigits minFractionDigits maxFractionDigits output bre
|
||||
0 0 1 0 2.99792458E8 KS
|
||||
// JDK and S give .2998E9
|
||||
0 0 0 4 2.998E8 KSQ
|
||||
// S correctly formats this as 29.979246E7.
|
||||
// JDK uses 8 + 6 for significant digits instead of 2 + 6
|
||||
// J and C return 2.9979246E8.
|
||||
// TODO: Merge trunk
|
||||
2 8 1 6 29.979246E7 CJKQ
|
||||
// According to the spec, if maxInt>minInt and minInt>1, then set minInt to 1.
|
||||
// Context: #13289
|
||||
2 8 1 6 2.9979246E8 K
|
||||
// Treat max int digits > 8 as being the same as min int digits.
|
||||
// This behavior is not spelled out in the specification.
|
||||
// JDK fails here because it tries to use 9 + 6 = 15 sig digits.
|
||||
2 9 1 6 29.979246E7 K
|
||||
// C and J get 29.979246E7
|
||||
2 9 1 6 2.9979246E8 CJK
|
||||
|
||||
test significant digits scientific
|
||||
set locale en
|
||||
|
@ -26,6 +26,7 @@ import org.junit.Test;
|
||||
|
||||
import com.ibm.icu.text.DecimalFormat;
|
||||
import com.ibm.icu.text.DecimalFormatSymbols;
|
||||
import com.ibm.icu.text.NumberingSystem;
|
||||
import com.ibm.icu.util.Currency;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
@ -323,4 +324,40 @@ public class IntlTestDecimalFormatSymbols extends com.ibm.icu.dev.test.TestFmwk
|
||||
errln("ERROR: Code point zero be ASCII 0");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNumberingSystem() {
|
||||
Object[][] cases = {
|
||||
{"en", "latn", "1,234.56", ';'},
|
||||
{"en", "arab", "١٬٢٣٤٫٥٦", '؛'},
|
||||
{"en", "mathsanb", "𝟭,𝟮𝟯𝟰.𝟱𝟲", ';'},
|
||||
{"en", "mymr", "၁,၂၃၄.၅၆", ';'},
|
||||
{"my", "latn", "1,234.56", ';'},
|
||||
{"my", "arab", "١٬٢٣٤٫٥٦", '؛'},
|
||||
{"my", "mathsanb", "𝟭,𝟮𝟯𝟰.𝟱𝟲", ';'},
|
||||
{"my", "mymr", "၁,၂၃၄.၅၆", '၊'},
|
||||
{"en@numbers=thai", "mymr", "၁,၂၃၄.၅၆", ';'}, // conflicting numbering system
|
||||
};
|
||||
|
||||
for (Object[] cas : cases) {
|
||||
ULocale loc = new ULocale((String) cas[0]);
|
||||
NumberingSystem ns = NumberingSystem.getInstanceByName((String) cas[1]);
|
||||
String expectedFormattedNumberString = (String) cas[2];
|
||||
char expectedPatternSeparator = (Character) cas[3];
|
||||
|
||||
DecimalFormatSymbols dfs = DecimalFormatSymbols.forNumberingSystem(loc, ns);
|
||||
DecimalFormat df = new DecimalFormat("#,##0.##", dfs);
|
||||
String actual1 = df.format(1234.56);
|
||||
assertEquals("1234.56 with " + loc + " and " + ns.getName(),
|
||||
expectedFormattedNumberString, actual1);
|
||||
// The pattern separator is something that differs by numbering system in my@numbers=mymr.
|
||||
char actual2 = dfs.getPatternSeparator();
|
||||
assertEquals("Pattern separator with " + loc + " and " + ns.getName(),
|
||||
expectedPatternSeparator, actual2);
|
||||
|
||||
// Coverage for JDK Locale overload
|
||||
DecimalFormatSymbols dfs2 = DecimalFormatSymbols.forNumberingSystem(loc.toLocale(), ns);
|
||||
assertEquals("JDK Locale and ICU Locale should produce the same object", dfs, dfs2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -832,6 +832,9 @@ public class NumberFormatDataDrivenTest {
|
||||
@Test
|
||||
@Ignore
|
||||
public void TestDataDrivenJDK() {
|
||||
// Android implements java.text.DecimalFormat with ICU4J (ticket #13322).
|
||||
if (TestUtil.getJavaVendor() == TestUtil.JavaVendor.Android) return;
|
||||
|
||||
DataDrivenNumberFormatTestUtility.runFormatSuiteIncludingKnownFailures(
|
||||
"numberformattestspecification.txt", JDK);
|
||||
}
|
||||
|
@ -5236,6 +5236,13 @@ public class NumberFormatTest extends TestFmwk {
|
||||
assertEquals("Should parse to 300000 using non-monetary separators: " + ppos, 300000L, number);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void Test13289() {
|
||||
DecimalFormat df = new DecimalFormat("#00.0#E0");
|
||||
String result = df.format(0.00123);
|
||||
assertEquals("Should ignore scientific minInt if maxInt>minInt", "1.23E-3", result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPercentZero() {
|
||||
DecimalFormat df = (DecimalFormat) NumberFormat.getPercentInstance();
|
||||
|
@ -1705,4 +1705,21 @@ public class RbnfTest extends TestFmwk {
|
||||
};
|
||||
doTest(rbnf, enTestFullData, false);
|
||||
}
|
||||
|
||||
private void assertEquals(String expected, String result) {
|
||||
if (!expected.equals(result)) {
|
||||
errln("Expected: " + expected + " Got: " + result);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRoundingUnrealNumbers() {
|
||||
RuleBasedNumberFormat rbnf = new RuleBasedNumberFormat(ULocale.US, RuleBasedNumberFormat.SPELLOUT);
|
||||
rbnf.setRoundingMode(BigDecimal.ROUND_HALF_UP);
|
||||
rbnf.setMaximumFractionDigits(3);
|
||||
assertEquals("zero point one", rbnf.format(0.1));
|
||||
assertEquals("zero point zero zero one", rbnf.format(0.0005));
|
||||
assertEquals("infinity", rbnf.format(Double.POSITIVE_INFINITY));
|
||||
assertEquals("not a number", rbnf.format(Double.NaN));
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,376 @@
|
||||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
|
||||
package com.ibm.icu.dev.test.lang;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.runners.Enclosed;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.Parameterized;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
@RunWith(Enclosed.class)
|
||||
public class DataDrivenUScriptTest extends TestFmwk {
|
||||
|
||||
private static String scriptsToString(int[] scripts) {
|
||||
if (scripts == null) {
|
||||
return "null";
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int script : scripts) {
|
||||
if (sb.length() > 0) {
|
||||
sb.append(' ');
|
||||
}
|
||||
sb.append(UScript.getShortName(script));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static void assertEqualScripts(String msg, int[] expectedScripts, int[] actualScripts) {
|
||||
assertEquals(msg, scriptsToString(expectedScripts), scriptsToString(actualScripts));
|
||||
}
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public static class LocaleGetCodeTest {
|
||||
private ULocale testLocaleName;
|
||||
private int expected;
|
||||
|
||||
public LocaleGetCodeTest(ULocale testLocaleName, int expected) {
|
||||
this.testLocaleName = testLocaleName;
|
||||
this.expected = expected;
|
||||
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection testData() {
|
||||
return Arrays.asList(new Object[][] { { new ULocale("en"), UScript.LATIN },
|
||||
{ new ULocale("en_US"), UScript.LATIN },
|
||||
{ new ULocale("sr"), UScript.CYRILLIC },
|
||||
{ new ULocale("ta"), UScript.TAMIL },
|
||||
{ new ULocale("te_IN"), UScript.TELUGU },
|
||||
{ new ULocale("hi"), UScript.DEVANAGARI },
|
||||
{ new ULocale("he"), UScript.HEBREW },
|
||||
{ new ULocale("ar"), UScript.ARABIC },
|
||||
{ new ULocale("abcde"), UScript.INVALID_CODE },
|
||||
{ new ULocale("abcde_cdef"), UScript.INVALID_CODE },
|
||||
{ new ULocale("iw"), UScript.HEBREW }
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestLocaleGetCode() {
|
||||
int[] code = UScript.getCode(testLocaleName);
|
||||
if (code == null) {
|
||||
if (expected != UScript.INVALID_CODE) {
|
||||
errln("Error testing UScript.getCode(). Got: null" + " Expected: " + expected + " for locale "
|
||||
+ testLocaleName);
|
||||
}
|
||||
} else if ((code[0] != expected)) {
|
||||
errln("Error testing UScript.getCode(). Got: " + code[0] + " Expected: " + expected + " for locale "
|
||||
+ testLocaleName);
|
||||
}
|
||||
|
||||
ULocale defaultLoc = ULocale.getDefault();
|
||||
ULocale esperanto = new ULocale("eo_DE");
|
||||
ULocale.setDefault(esperanto);
|
||||
code = UScript.getCode(esperanto);
|
||||
if (code != null) {
|
||||
if (code[0] != UScript.LATIN) {
|
||||
errln("Did not get the expected script code for Esperanto");
|
||||
}
|
||||
} else {
|
||||
warnln("Could not load the locale data.");
|
||||
}
|
||||
ULocale.setDefault(defaultLoc);
|
||||
|
||||
// Should work regardless of whether we have locale data for the language.
|
||||
assertEqualScripts("tg script: Cyrl", // Tajik
|
||||
new int[] { UScript.CYRILLIC }, UScript.getCode(new ULocale("tg")));
|
||||
assertEqualScripts("xsr script: Deva", // Sherpa
|
||||
new int[] { UScript.DEVANAGARI }, UScript.getCode(new ULocale("xsr")));
|
||||
|
||||
// Multi-script languages.
|
||||
assertEqualScripts("ja scripts: Kana Hira Hani",
|
||||
new int[] { UScript.KATAKANA, UScript.HIRAGANA, UScript.HAN }, UScript.getCode(ULocale.JAPANESE));
|
||||
assertEqualScripts("ko scripts: Hang Hani", new int[] { UScript.HANGUL, UScript.HAN },
|
||||
UScript.getCode(ULocale.KOREAN));
|
||||
assertEqualScripts("zh script: Hani", new int[] { UScript.HAN }, UScript.getCode(ULocale.CHINESE));
|
||||
assertEqualScripts("zh-Hant scripts: Hani Bopo", new int[] { UScript.HAN, UScript.BOPOMOFO },
|
||||
UScript.getCode(ULocale.TRADITIONAL_CHINESE));
|
||||
assertEqualScripts("zh-TW scripts: Hani Bopo", new int[] { UScript.HAN, UScript.BOPOMOFO },
|
||||
UScript.getCode(ULocale.TAIWAN));
|
||||
|
||||
// Ambiguous API, but this probably wants to return Latin rather than Rongorongo (Roro).
|
||||
assertEqualScripts("ro-RO script: Latn", new int[] { UScript.LATIN }, UScript.getCode("ro-RO")); // String
|
||||
// not
|
||||
// ULocale
|
||||
}
|
||||
}
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public static class TestMultipleUScript extends TestFmwk {
|
||||
private String testLocaleName;
|
||||
private Locale testLocale;
|
||||
private int[] expected;
|
||||
|
||||
public TestMultipleUScript(String testLocaleName, int[] expected, Locale testLocale) {
|
||||
this.testLocaleName = testLocaleName;
|
||||
this.testLocale = testLocale;
|
||||
this.expected = expected;
|
||||
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection testData() {
|
||||
return Arrays.asList(new Object[][] {
|
||||
{ "ja", new int[] { UScript.KATAKANA, UScript.HIRAGANA, UScript.HAN }, Locale.JAPANESE },
|
||||
{ "ko_KR", new int[] { UScript.HANGUL, UScript.HAN }, Locale.KOREA },
|
||||
{ "zh", new int[] { UScript.HAN }, Locale.CHINESE },
|
||||
{ "zh_TW", new int[] { UScript.HAN, UScript.BOPOMOFO }, Locale.TAIWAN }
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestMultipleCodes() {
|
||||
int[] code = UScript.getCode(testLocaleName);
|
||||
if (code != null) {
|
||||
for (int j = 0; j < code.length; j++) {
|
||||
if (code[j] != expected[j]) {
|
||||
errln("Error testing UScript.getCode(). Got: " + code[j] + " Expected: " + expected[j]
|
||||
+ " for locale " + testLocaleName);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
errln("Error testing UScript.getCode() for locale " + testLocaleName);
|
||||
}
|
||||
|
||||
logln(" Testing UScript.getCode(Locale) with locale: " + testLocale.getDisplayName());
|
||||
code = UScript.getCode(testLocale);
|
||||
if (code != null) {
|
||||
for (int j = 0; j < code.length; j++) {
|
||||
if (code[j] != expected[j]) {
|
||||
errln("Error testing UScript.getCode(). Got: " + code[j] + " Expected: " + expected[j]
|
||||
+ " for locale " + testLocaleName);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
errln("Error testing UScript.getCode() for locale " + testLocaleName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public static class GetCodeTest extends TestFmwk {
|
||||
private String testName;
|
||||
private int expected;
|
||||
|
||||
public GetCodeTest(String testName, int expected) {
|
||||
this.testName = testName;
|
||||
this.expected = expected;
|
||||
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection testData() {
|
||||
return Arrays.asList(new Object[][] {
|
||||
/* test locale */
|
||||
{ "en", UScript.LATIN },
|
||||
{ "en_US", UScript.LATIN },
|
||||
{ "sr", UScript.CYRILLIC },
|
||||
{ "ta", UScript.TAMIL },
|
||||
{ "gu", UScript.GUJARATI },
|
||||
{ "te_IN", UScript.TELUGU },
|
||||
{ "hi", UScript.DEVANAGARI },
|
||||
{ "he", UScript.HEBREW },
|
||||
{ "ar", UScript.ARABIC },
|
||||
{ "abcde", UScript.INVALID_CODE },
|
||||
{ "abscde_cdef", UScript.INVALID_CODE },
|
||||
{ "iw", UScript.HEBREW },
|
||||
/* test abbr */
|
||||
{ "Hani", UScript.HAN },
|
||||
{ "Hang", UScript.HANGUL },
|
||||
{ "Hebr", UScript.HEBREW },
|
||||
{ "Hira", UScript.HIRAGANA },
|
||||
{ "Knda", UScript.KANNADA },
|
||||
{ "Kana", UScript.KATAKANA },
|
||||
{ "Khmr", UScript.KHMER },
|
||||
{ "Lao", UScript.LAO },
|
||||
{ "Latn", UScript.LATIN }, /* "Latf","Latg", */
|
||||
{ "Mlym", UScript.MALAYALAM },
|
||||
{ "Mong", UScript.MONGOLIAN },
|
||||
/* test names */
|
||||
{ "CYRILLIC", UScript.CYRILLIC },
|
||||
{ "DESERET", UScript.DESERET },
|
||||
{ "DEVANAGARI", UScript.DEVANAGARI },
|
||||
{ "ETHIOPIC", UScript.ETHIOPIC },
|
||||
{ "GEORGIAN", UScript.GEORGIAN },
|
||||
{ "GOTHIC", UScript.GOTHIC },
|
||||
{ "GREEK", UScript.GREEK },
|
||||
{ "GUJARATI", UScript.GUJARATI },
|
||||
{ "COMMON", UScript.COMMON },
|
||||
{ "INHERITED", UScript.INHERITED },
|
||||
/* test lower case names */
|
||||
{ "malayalam", UScript.MALAYALAM },
|
||||
{ "mongolian", UScript.MONGOLIAN },
|
||||
{ "myanmar", UScript.MYANMAR },
|
||||
{ "ogham", UScript.OGHAM },
|
||||
{ "old-italic", UScript.OLD_ITALIC },
|
||||
{ "oriya", UScript.ORIYA },
|
||||
{ "runic", UScript.RUNIC },
|
||||
{ "sinhala", UScript.SINHALA },
|
||||
{ "syriac", UScript.SYRIAC },
|
||||
{ "tamil", UScript.TAMIL },
|
||||
{ "telugu", UScript.TELUGU },
|
||||
{ "thaana", UScript.THAANA },
|
||||
{ "thai", UScript.THAI },
|
||||
{ "tibetan", UScript.TIBETAN },
|
||||
/* test the bounds */
|
||||
{ "Cans", UScript.CANADIAN_ABORIGINAL },
|
||||
{ "arabic", UScript.ARABIC },
|
||||
{ "Yi", UScript.YI },
|
||||
{ "Zyyy", UScript.COMMON }
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestGetCode() {
|
||||
int[] code = UScript.getCode(testName);
|
||||
if (code == null) {
|
||||
if (expected != UScript.INVALID_CODE) {
|
||||
// getCode returns null if the code could not be found
|
||||
errln("Error testing UScript.getCode(). Got: null" + " Expected: " + expected + " for locale "
|
||||
+ testName);
|
||||
}
|
||||
} else if ((code[0] != expected)) {
|
||||
errln("Error testing UScript.getCode(). Got: " + code + " Expected: " + expected + " for locale "
|
||||
+ testName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public static class GetNameTest {
|
||||
private int testCode;
|
||||
private String expected;
|
||||
|
||||
public GetNameTest(int testCode, String expected) {
|
||||
this.testCode = testCode;
|
||||
this.expected = expected;
|
||||
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection testData() {
|
||||
return Arrays.asList(new Object[][] {
|
||||
{ UScript.CYRILLIC, "Cyrillic" },
|
||||
{ UScript.DESERET, "Deseret" },
|
||||
{ UScript.DEVANAGARI, "Devanagari" },
|
||||
{ UScript.ETHIOPIC, "Ethiopic" },
|
||||
{ UScript.GEORGIAN, "Georgian" },
|
||||
{ UScript.GOTHIC, "Gothic" },
|
||||
{ UScript.GREEK, "Greek" },
|
||||
{ UScript.GUJARATI, "Gujarati" }
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestGetName() {
|
||||
String scriptName = UScript.getName(testCode);
|
||||
if (!expected.equals(scriptName)) {
|
||||
errln("Error testing UScript.getName(). Got: " + scriptName + " Expected: " + expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public static class GetShortNameTest {
|
||||
private int testCode;
|
||||
private String expected;
|
||||
|
||||
public GetShortNameTest(int testCode, String expected) {
|
||||
this.testCode = testCode;
|
||||
this.expected = expected;
|
||||
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection testData() {
|
||||
return Arrays.asList(new Object[][] {
|
||||
{ UScript.HAN, "Hani" },
|
||||
{ UScript.HANGUL, "Hang" },
|
||||
{ UScript.HEBREW, "Hebr" },
|
||||
{ UScript.HIRAGANA, "Hira" },
|
||||
{ UScript.KANNADA, "Knda" },
|
||||
{ UScript.KATAKANA, "Kana" },
|
||||
{ UScript.KHMER, "Khmr" },
|
||||
{ UScript.LAO, "Laoo" },
|
||||
{ UScript.LATIN, "Latn" },
|
||||
{ UScript.MALAYALAM, "Mlym" },
|
||||
{ UScript.MONGOLIAN, "Mong" },
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestGetShortName() {
|
||||
String shortName = UScript.getShortName(testCode);
|
||||
if (!expected.equals(shortName)) {
|
||||
errln("Error testing UScript.getShortName(). Got: " + shortName + " Expected: " + expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public static class GetScriptTest {
|
||||
private int codepoint;
|
||||
private int expected;
|
||||
|
||||
public GetScriptTest(int[] codepoint) {
|
||||
this.codepoint = codepoint[0];
|
||||
this.expected = codepoint[1];
|
||||
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection testData() {
|
||||
return Arrays.asList(new int[][] {
|
||||
{ 0x0000FF9D, UScript.KATAKANA },
|
||||
{ 0x0000FFBE, UScript.HANGUL },
|
||||
{ 0x0000FFC7, UScript.HANGUL },
|
||||
{ 0x0000FFCF, UScript.HANGUL },
|
||||
{ 0x0000FFD7, UScript.HANGUL },
|
||||
{ 0x0000FFDC, UScript.HANGUL },
|
||||
{ 0x00010300, UScript.OLD_ITALIC },
|
||||
{ 0x00010330, UScript.GOTHIC },
|
||||
{ 0x0001034A, UScript.GOTHIC },
|
||||
{ 0x00010400, UScript.DESERET },
|
||||
{ 0x00010428, UScript.DESERET },
|
||||
{ 0x0001D167, UScript.INHERITED },
|
||||
{ 0x0001D17B, UScript.INHERITED },
|
||||
{ 0x0001D185, UScript.INHERITED },
|
||||
{ 0x0001D1AA, UScript.INHERITED },
|
||||
{ 0x00020000, UScript.HAN },
|
||||
{ 0x00000D02, UScript.MALAYALAM },
|
||||
{ 0x00050005, UScript.UNKNOWN }, // new Zzzz value in Unicode 5.0
|
||||
{ 0x00000000, UScript.COMMON },
|
||||
{ 0x0001D169, UScript.INHERITED },
|
||||
{ 0x0001D182, UScript.INHERITED },
|
||||
{ 0x0001D18B, UScript.INHERITED },
|
||||
{ 0x0001D1AD, UScript.INHERITED },
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestGetScript() {
|
||||
|
||||
int code = UScript.INVALID_CODE;
|
||||
|
||||
code = UScript.getScript(codepoint);
|
||||
|
||||
if (code != expected) {
|
||||
errln("Error testing UScript.getScript(). Got: " + code + " Expected: " + expected
|
||||
+ " for codepoint 0x + hex(codepoint).");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -10,7 +10,6 @@
|
||||
package com.ibm.icu.dev.test.lang;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
@ -19,7 +18,6 @@ import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.lang.UScript.ScriptUsage;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
public class TestUScript extends TestFmwk {
|
||||
|
||||
@ -30,350 +28,6 @@ public class TestUScript extends TestFmwk {
|
||||
{
|
||||
}
|
||||
|
||||
private static String scriptsToString(int[] scripts) {
|
||||
if(scripts == null) {
|
||||
return "null";
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for(int script : scripts) {
|
||||
if(sb.length() > 0) {
|
||||
sb.append(' ');
|
||||
}
|
||||
sb.append(UScript.getShortName(script));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private void assertEqualScripts(String msg, int[] expectedScripts, int[] actualScripts) {
|
||||
assertEquals(msg, scriptsToString(expectedScripts), scriptsToString(actualScripts));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestLocaleGetCode(){
|
||||
final ULocale[] testNames={
|
||||
/* test locale */
|
||||
new ULocale("en"), new ULocale("en_US"),
|
||||
new ULocale("sr"), new ULocale("ta") ,
|
||||
new ULocale("te_IN"),
|
||||
new ULocale("hi"),
|
||||
new ULocale("he"), new ULocale("ar"),
|
||||
new ULocale("abcde"),
|
||||
new ULocale("abcde_cdef"),
|
||||
new ULocale("iw")
|
||||
};
|
||||
final int[] expected ={
|
||||
/* locales should return */
|
||||
UScript.LATIN, UScript.LATIN,
|
||||
UScript.CYRILLIC, UScript.TAMIL,
|
||||
UScript.TELUGU,UScript.DEVANAGARI,
|
||||
UScript.HEBREW, UScript.ARABIC,
|
||||
UScript.INVALID_CODE,UScript.INVALID_CODE,
|
||||
UScript.HEBREW
|
||||
};
|
||||
int i =0;
|
||||
int numErrors =0;
|
||||
|
||||
for( ; i<testNames.length; i++){
|
||||
int[] code = UScript.getCode(testNames[i]);
|
||||
|
||||
if(code==null){
|
||||
if(expected[i]!=UScript.INVALID_CODE){
|
||||
logln("Error getting script code Got: null" + " Expected: " +expected[i] +" for name "+testNames[i]);
|
||||
numErrors++;
|
||||
}
|
||||
// getCode returns null if the code could not be found
|
||||
continue;
|
||||
}
|
||||
if((code[0] != expected[i])){
|
||||
logln("Error getting script code Got: " +code[0] + " Expected: " +expected[i] +" for name "+testNames[i]);
|
||||
numErrors++;
|
||||
}
|
||||
}
|
||||
reportDataErrors(numErrors);
|
||||
|
||||
//
|
||||
ULocale defaultLoc = ULocale.getDefault();
|
||||
ULocale esperanto = new ULocale("eo_DE");
|
||||
ULocale.setDefault(esperanto);
|
||||
int[] code = UScript.getCode(esperanto);
|
||||
if(code != null){
|
||||
if( code[0] != UScript.LATIN){
|
||||
errln("Did not get the expected script code for Esperanto");
|
||||
}
|
||||
}else{
|
||||
warnln("Could not load the locale data.");
|
||||
}
|
||||
ULocale.setDefault(defaultLoc);
|
||||
|
||||
// Should work regardless of whether we have locale data for the language.
|
||||
assertEqualScripts("tg script: Cyrl", // Tajik
|
||||
new int[] { UScript.CYRILLIC },
|
||||
UScript.getCode(new ULocale("tg")));
|
||||
assertEqualScripts("xsr script: Deva", // Sherpa
|
||||
new int[] { UScript.DEVANAGARI },
|
||||
UScript.getCode(new ULocale("xsr")));
|
||||
|
||||
// Multi-script languages.
|
||||
assertEqualScripts("ja scripts: Kana Hira Hani",
|
||||
new int[] { UScript.KATAKANA, UScript.HIRAGANA, UScript.HAN },
|
||||
UScript.getCode(ULocale.JAPANESE));
|
||||
assertEqualScripts("ko scripts: Hang Hani",
|
||||
new int[] { UScript.HANGUL, UScript.HAN },
|
||||
UScript.getCode(ULocale.KOREAN));
|
||||
assertEqualScripts("zh script: Hani",
|
||||
new int[] { UScript.HAN },
|
||||
UScript.getCode(ULocale.CHINESE));
|
||||
assertEqualScripts("zh-Hant scripts: Hani Bopo",
|
||||
new int[] { UScript.HAN, UScript.BOPOMOFO },
|
||||
UScript.getCode(ULocale.TRADITIONAL_CHINESE));
|
||||
assertEqualScripts("zh-TW scripts: Hani Bopo",
|
||||
new int[] { UScript.HAN, UScript.BOPOMOFO },
|
||||
UScript.getCode(ULocale.TAIWAN));
|
||||
|
||||
// Ambiguous API, but this probably wants to return Latin rather than Rongorongo (Roro).
|
||||
assertEqualScripts("ro-RO script: Latn",
|
||||
new int[] { UScript.LATIN },
|
||||
UScript.getCode("ro-RO")); // String not ULocale
|
||||
}
|
||||
|
||||
// TODO(junit): remove this and convert the tests that use this to be parameterized
|
||||
private void reportDataErrors(int numErrors) {
|
||||
if (numErrors >0) {
|
||||
// assume missing locale data, so not an error, just a warning
|
||||
errln("encountered " + numErrors + " errors.");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestMultipleCode(){
|
||||
final String[] testNames = { "ja" ,"ko_KR","zh","zh_TW"};
|
||||
final int[][] expected = {
|
||||
{UScript.KATAKANA,UScript.HIRAGANA,UScript.HAN},
|
||||
{UScript.HANGUL, UScript.HAN},
|
||||
{UScript.HAN},
|
||||
{UScript.HAN,UScript.BOPOMOFO}
|
||||
};
|
||||
|
||||
int numErrors = 0;
|
||||
for(int i=0; i<testNames.length;i++){
|
||||
int[] code = UScript.getCode(testNames[i]);
|
||||
int[] expt = expected[i];
|
||||
if(code!=null){
|
||||
for(int j =0; j< code.length;j++){
|
||||
if(code[j]!=expt[j]){
|
||||
numErrors++;
|
||||
logln("Error getting script code Got: " +code[j] + " Expected: " +expt[j] +" for name "+testNames[i]);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
numErrors++;
|
||||
logln("Error getting script code for name "+testNames[i]);
|
||||
}
|
||||
}
|
||||
reportDataErrors(numErrors);
|
||||
|
||||
//cover UScript.getCode(Locale)
|
||||
Locale[] testLocales = new Locale[] {
|
||||
Locale.JAPANESE,
|
||||
Locale.KOREA,
|
||||
Locale.CHINESE,
|
||||
Locale.TAIWAN };
|
||||
logln("Testing UScript.getCode(Locale) ...");
|
||||
numErrors = 0;
|
||||
for(int i=0; i<testNames.length;i++){
|
||||
logln(" Testing locale: " + testLocales[i].getDisplayName());
|
||||
int[] code = UScript.getCode(testLocales[i]);
|
||||
int[] expt = expected[i];
|
||||
if(code!=null){
|
||||
for(int j =0; j< code.length;j++){
|
||||
if(code[j]!=expt[j]){
|
||||
numErrors++;
|
||||
logln(" Error getting script code Got: " +code[j] + " Expected: " +expt[j] +" for name "+testNames[i]);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
numErrors++;
|
||||
logln(" Error getting script code for name "+testNames[i]);
|
||||
}
|
||||
}
|
||||
reportDataErrors(numErrors);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestGetCode(){
|
||||
|
||||
final String[] testNames={
|
||||
/* test locale */
|
||||
"en", "en_US", "sr", "ta", "gu", "te_IN",
|
||||
"hi", "he", "ar",
|
||||
/* test abbr */
|
||||
"Hani", "Hang","Hebr","Hira",
|
||||
"Knda","Kana","Khmr","Lao",
|
||||
"Latn",/*"Latf","Latg",*/
|
||||
"Mlym", "Mong",
|
||||
|
||||
/* test names */
|
||||
"CYRILLIC","DESERET","DEVANAGARI","ETHIOPIC","GEORGIAN",
|
||||
"GOTHIC", "GREEK", "GUJARATI", "COMMON", "INHERITED",
|
||||
/* test lower case names */
|
||||
"malayalam", "mongolian", "myanmar", "ogham", "old-italic",
|
||||
"oriya", "runic", "sinhala", "syriac","tamil",
|
||||
"telugu", "thaana", "thai", "tibetan",
|
||||
/* test the bounds*/
|
||||
"Cans", "arabic","Yi","Zyyy"
|
||||
};
|
||||
final int[] expected ={
|
||||
/* locales should return */
|
||||
UScript.LATIN, UScript.LATIN,
|
||||
UScript.CYRILLIC, UScript.TAMIL, UScript.GUJARATI,
|
||||
UScript.TELUGU,UScript.DEVANAGARI,
|
||||
UScript.HEBREW, UScript.ARABIC,
|
||||
/* abbr should return */
|
||||
UScript.HAN, UScript.HANGUL, UScript.HEBREW, UScript.HIRAGANA,
|
||||
UScript.KANNADA, UScript.KATAKANA, UScript.KHMER, UScript.LAO,
|
||||
UScript.LATIN,/* UScript.LATIN, UScript.LATIN,*/
|
||||
UScript.MALAYALAM, UScript.MONGOLIAN,
|
||||
/* names should return */
|
||||
UScript.CYRILLIC, UScript.DESERET, UScript.DEVANAGARI, UScript.ETHIOPIC, UScript.GEORGIAN,
|
||||
UScript.GOTHIC, UScript.GREEK, UScript.GUJARATI, UScript.COMMON, UScript.INHERITED,
|
||||
/* lower case names should return */
|
||||
UScript.MALAYALAM, UScript.MONGOLIAN, UScript.MYANMAR, UScript.OGHAM, UScript.OLD_ITALIC,
|
||||
UScript.ORIYA, UScript.RUNIC, UScript.SINHALA, UScript.SYRIAC, UScript.TAMIL,
|
||||
UScript.TELUGU, UScript.THAANA, UScript.THAI, UScript.TIBETAN,
|
||||
/* bounds */
|
||||
UScript.CANADIAN_ABORIGINAL, UScript.ARABIC, UScript.YI, UScript.COMMON
|
||||
};
|
||||
int i =0;
|
||||
int numErrors =0;
|
||||
|
||||
for( ; i<testNames.length; i++){
|
||||
int[] code = UScript.getCode(testNames[i]);
|
||||
if(code == null){
|
||||
if(expected[i]==UScript.INVALID_CODE){
|
||||
// getCode returns null if the code could not be found
|
||||
continue;
|
||||
}
|
||||
// currently commented out until jitterbug#2678 is fixed
|
||||
logln("Error getting script code Got: null" + " Expected: " +expected[i] +" for name "+testNames[i]);
|
||||
numErrors++;
|
||||
continue;
|
||||
}
|
||||
if((code[0] != expected[i])){
|
||||
logln("Error getting script code Got: " +code[0] + " Expected: " +expected[i] +" for name "+testNames[i]);
|
||||
numErrors++;
|
||||
}
|
||||
}
|
||||
reportDataErrors(numErrors);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestGetName(){
|
||||
|
||||
final int[] testCodes={
|
||||
/* names should return */
|
||||
UScript.CYRILLIC, UScript.DESERET, UScript.DEVANAGARI, UScript.ETHIOPIC, UScript.GEORGIAN,
|
||||
UScript.GOTHIC, UScript.GREEK, UScript.GUJARATI,
|
||||
};
|
||||
|
||||
final String[] expectedNames={
|
||||
|
||||
/* test names */
|
||||
"Cyrillic","Deseret","Devanagari","Ethiopic","Georgian",
|
||||
"Gothic", "Greek", "Gujarati",
|
||||
};
|
||||
int i =0;
|
||||
int numErrors=0;
|
||||
while(i< testCodes.length){
|
||||
String scriptName = UScript.getName(testCodes[i]);
|
||||
if(!expectedNames[i].equals(scriptName)){
|
||||
logln("Error getting abbreviations Got: " +scriptName +" Expected: "+expectedNames[i]);
|
||||
numErrors++;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
if(numErrors >0 ){
|
||||
warnln("encountered " + numErrors + " errors in UScript.getName()");
|
||||
}
|
||||
|
||||
}
|
||||
@Test
|
||||
public void TestGetShortName(){
|
||||
final int[] testCodes={
|
||||
/* abbr should return */
|
||||
UScript.HAN, UScript.HANGUL, UScript.HEBREW, UScript.HIRAGANA,
|
||||
UScript.KANNADA, UScript.KATAKANA, UScript.KHMER, UScript.LAO,
|
||||
UScript.LATIN,
|
||||
UScript.MALAYALAM, UScript.MONGOLIAN,
|
||||
};
|
||||
|
||||
final String[] expectedAbbr={
|
||||
/* test abbr */
|
||||
"Hani", "Hang","Hebr","Hira",
|
||||
"Knda","Kana","Khmr","Laoo",
|
||||
"Latn",
|
||||
"Mlym", "Mong",
|
||||
};
|
||||
int i=0;
|
||||
int numErrors=0;
|
||||
while(i<testCodes.length){
|
||||
String shortName = UScript.getShortName(testCodes[i]);
|
||||
if(!expectedAbbr[i].equals(shortName)){
|
||||
logln("Error getting abbreviations Got: " +shortName+ " Expected: " +expectedAbbr[i]);
|
||||
numErrors++;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
if(numErrors >0 ){
|
||||
warnln("encountered " + numErrors + " errors in UScript.getShortName()");
|
||||
}
|
||||
}
|
||||
@Test
|
||||
public void TestGetScript(){
|
||||
int codepoints[][] = new int[][] {
|
||||
{0x0000FF9D, UScript.KATAKANA },
|
||||
{0x0000FFBE, UScript.HANGUL },
|
||||
{0x0000FFC7, UScript.HANGUL },
|
||||
{0x0000FFCF, UScript.HANGUL },
|
||||
{0x0000FFD7, UScript.HANGUL},
|
||||
{0x0000FFDC, UScript.HANGUL},
|
||||
{0x00010300, UScript.OLD_ITALIC},
|
||||
{0x00010330, UScript.GOTHIC},
|
||||
{0x0001034A, UScript.GOTHIC},
|
||||
{0x00010400, UScript.DESERET},
|
||||
{0x00010428, UScript.DESERET},
|
||||
{0x0001D167, UScript.INHERITED},
|
||||
{0x0001D17B, UScript.INHERITED},
|
||||
{0x0001D185, UScript.INHERITED},
|
||||
{0x0001D1AA, UScript.INHERITED},
|
||||
{0x00020000, UScript.HAN},
|
||||
{0x00000D02, UScript.MALAYALAM},
|
||||
{0x00050005, UScript.UNKNOWN}, // new Zzzz value in Unicode 5.0
|
||||
{0x00000000, UScript.COMMON},
|
||||
{0x0001D169, UScript.INHERITED },
|
||||
{0x0001D182, UScript.INHERITED },
|
||||
{0x0001D18B, UScript.INHERITED },
|
||||
{0x0001D1AD, UScript.INHERITED },
|
||||
};
|
||||
|
||||
int i =0;
|
||||
int code = UScript.INVALID_CODE;
|
||||
boolean passed = true;
|
||||
|
||||
while(i< codepoints.length){
|
||||
code = UScript.getScript(codepoints[i][0]);
|
||||
|
||||
if(code != codepoints[i][1]){
|
||||
logln("UScript.getScript for codepoint 0x"+ hex(codepoints[i][0])+" failed");
|
||||
passed = false;
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
if(!passed){
|
||||
errln("UScript.getScript failed.");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestGetScriptOfCharsWithScriptExtensions() {
|
||||
/* test characters which have Script_Extensions */
|
||||
|
@ -13,6 +13,7 @@ package com.ibm.icu.dev.test.lang;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
@ -777,6 +778,88 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
}
|
||||
}
|
||||
|
||||
private static String printOneEdit(Edits.Iterator ei) {
|
||||
if (ei.hasChange()) {
|
||||
return "" + ei.oldLength() + "->" + ei.newLength();
|
||||
} else {
|
||||
return "" + ei.oldLength() + "=" + ei.newLength();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Maps indexes according to the expected edits.
|
||||
* A destination index can occur multiple times when there are source deletions.
|
||||
* Map according to the last occurrence, normally in a non-empty destination span.
|
||||
* Simplest is to search from the back.
|
||||
*/
|
||||
private static int srcIndexFromDest(
|
||||
EditChange expected[], int srcLength, int destLength, int index) {
|
||||
int srcIndex = srcLength;
|
||||
int destIndex = destLength;
|
||||
int i = expected.length;
|
||||
while (index < destIndex && i > 0) {
|
||||
--i;
|
||||
int prevSrcIndex = srcIndex - expected[i].oldLength;
|
||||
int prevDestIndex = destIndex - expected[i].newLength;
|
||||
if (index == prevDestIndex) {
|
||||
return prevSrcIndex;
|
||||
} else if (index > prevDestIndex) {
|
||||
if (expected[i].change) {
|
||||
// In a change span, map to its end.
|
||||
return srcIndex;
|
||||
} else {
|
||||
// In an unchanged span, offset within it.
|
||||
return prevSrcIndex + (index - prevDestIndex);
|
||||
}
|
||||
}
|
||||
srcIndex = prevSrcIndex;
|
||||
destIndex = prevDestIndex;
|
||||
}
|
||||
// index is outside the string.
|
||||
return srcIndex;
|
||||
}
|
||||
|
||||
private static int destIndexFromSrc(
|
||||
EditChange expected[], int srcLength, int destLength, int index) {
|
||||
int srcIndex = srcLength;
|
||||
int destIndex = destLength;
|
||||
int i = expected.length;
|
||||
while (index < srcIndex && i > 0) {
|
||||
--i;
|
||||
int prevSrcIndex = srcIndex - expected[i].oldLength;
|
||||
int prevDestIndex = destIndex - expected[i].newLength;
|
||||
if (index == prevSrcIndex) {
|
||||
return prevDestIndex;
|
||||
} else if (index > prevSrcIndex) {
|
||||
if (expected[i].change) {
|
||||
// In a change span, map to its end.
|
||||
return destIndex;
|
||||
} else {
|
||||
// In an unchanged span, offset within it.
|
||||
return prevDestIndex + (index - prevSrcIndex);
|
||||
}
|
||||
}
|
||||
srcIndex = prevSrcIndex;
|
||||
destIndex = prevDestIndex;
|
||||
}
|
||||
// index is outside the string.
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
private void checkEqualEdits(String name, Edits e1, Edits e2) {
|
||||
Edits.Iterator ei1 = e1.getFineIterator();
|
||||
Edits.Iterator ei2 = e2.getFineIterator();
|
||||
for (int i = 0;; ++i) {
|
||||
boolean ei1HasNext = ei1.next();
|
||||
boolean ei2HasNext = ei2.next();
|
||||
assertEquals(name + " next()[" + i + "]", ei1HasNext, ei2HasNext);
|
||||
assertEquals(name + " edit[" + i + "]", printOneEdit(ei1), printOneEdit(ei2));
|
||||
if (!ei1HasNext || !ei2HasNext) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void checkEditsIter(
|
||||
String name, Edits.Iterator ei1, Edits.Iterator ei2, // two equal iterators
|
||||
EditChange[] expected, boolean withUnchanged) {
|
||||
@ -786,8 +869,6 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
int expSrcIndex = 0;
|
||||
int expDestIndex = 0;
|
||||
int expReplIndex = 0;
|
||||
int expSrcIndexFromDest = 0; // for sourceIndexFromDestinationIndex()
|
||||
int expDestIndexFromSrc = 0; // for destinationIndexFromSourceIndex()
|
||||
for (int expIndex = 0; expIndex < expected.length; ++expIndex) {
|
||||
EditChange expect = expected[expIndex];
|
||||
String msg = name + ' ' + expIndex;
|
||||
@ -801,7 +882,7 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
assertEquals(msg, expReplIndex, ei1.replacementIndex());
|
||||
}
|
||||
|
||||
if (expect.oldLength > 0 && expDestIndex == expDestIndexFromSrc) {
|
||||
if (expect.oldLength > 0) {
|
||||
assertTrue(msg, ei2.findSourceIndex(expSrcIndex));
|
||||
assertEquals(msg, expect.change, ei2.hasChange());
|
||||
assertEquals(msg, expect.oldLength, ei2.oldLength());
|
||||
@ -817,7 +898,7 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
}
|
||||
}
|
||||
|
||||
if (expect.newLength > 0 && expSrcIndex == expSrcIndexFromDest) {
|
||||
if (expect.newLength > 0) {
|
||||
assertTrue(msg, ei2.findDestinationIndex(expDestIndex));
|
||||
assertEquals(msg, expect.change, ei2.hasChange());
|
||||
assertEquals(msg, expect.oldLength, ei2.oldLength());
|
||||
@ -833,45 +914,11 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
}
|
||||
}
|
||||
|
||||
// Span starts.
|
||||
assertEquals(name, expDestIndexFromSrc,
|
||||
ei2.destinationIndexFromSourceIndex(expSrcIndex));
|
||||
assertEquals(name, expSrcIndexFromDest,
|
||||
ei2.sourceIndexFromDestinationIndex(expDestIndex));
|
||||
|
||||
// Inside unchanged span map offsets 1:1.
|
||||
if (!expect.change && expect.oldLength >= 2) {
|
||||
assertEquals(name, expDestIndex + 1,
|
||||
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
|
||||
assertEquals(name, expSrcIndex + 1,
|
||||
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
|
||||
}
|
||||
|
||||
// Inside change span map to the span limit.
|
||||
int expSrcLimit = expSrcIndex + expect.oldLength;
|
||||
int expDestLimit = expDestIndex + expect.newLength;
|
||||
if (expect.change) {
|
||||
if (expect.oldLength >= 2) {
|
||||
assertEquals(name, expDestLimit,
|
||||
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
|
||||
}
|
||||
if (expect.newLength >= 2) {
|
||||
assertEquals(name, expSrcLimit,
|
||||
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
|
||||
}
|
||||
}
|
||||
|
||||
expSrcIndex = expSrcLimit;
|
||||
expDestIndex = expDestLimit;
|
||||
expSrcIndex += expect.oldLength;
|
||||
expDestIndex += expect.newLength;
|
||||
if (expect.change) {
|
||||
expReplIndex += expect.newLength;
|
||||
}
|
||||
if (expect.newLength > 0) {
|
||||
expSrcIndexFromDest = expSrcIndex;
|
||||
}
|
||||
if (expect.oldLength > 0) {
|
||||
expDestIndexFromSrc = expDestIndex;
|
||||
}
|
||||
}
|
||||
String msg = name + " end";
|
||||
assertFalse(msg, ei1.next());
|
||||
@ -884,8 +931,49 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
|
||||
assertFalse(name, ei2.findSourceIndex(expSrcIndex));
|
||||
assertFalse(name, ei2.findDestinationIndex(expDestIndex));
|
||||
assertEquals(name, expDestIndex, ei2.destinationIndexFromSourceIndex(expSrcIndex));
|
||||
assertEquals(name, expSrcIndex, ei2.sourceIndexFromDestinationIndex(expDestIndex));
|
||||
|
||||
// Check mapping of all indexes against a simple implementation
|
||||
// that works on the expected changes.
|
||||
// Iterate once forward, once backward, to cover more runtime conditions.
|
||||
int srcLength = expSrcIndex;
|
||||
int destLength = expDestIndex;
|
||||
List<Integer> srcIndexes = new ArrayList<Integer>();
|
||||
List<Integer> destIndexes = new ArrayList<Integer>();
|
||||
srcIndexes.add(-1);
|
||||
destIndexes.add(-1);
|
||||
int srcIndex = 0;
|
||||
int destIndex = 0;
|
||||
for (int i = 0; i < expected.length; ++i) {
|
||||
if (expected[i].oldLength > 0) {
|
||||
srcIndexes.add(srcIndex);
|
||||
if (expected[i].oldLength > 1) {
|
||||
srcIndexes.add(srcIndex + 1);
|
||||
}
|
||||
}
|
||||
if (expected[i].newLength > 0) {
|
||||
destIndexes.add(destIndex);
|
||||
if (expected[i].newLength > 0) {
|
||||
destIndexes.add(destIndex + 1);
|
||||
}
|
||||
}
|
||||
srcIndex += expected[i].oldLength;
|
||||
destIndex += expected[i].newLength;
|
||||
}
|
||||
srcIndexes.add(srcLength);
|
||||
destIndexes.add(destLength);
|
||||
srcIndexes.add(srcLength + 1);
|
||||
destIndexes.add(destLength + 1);
|
||||
Collections.reverse(destIndexes);
|
||||
for (int i : srcIndexes) {
|
||||
assertEquals(name + " destIndexFromSrc(" + i + "):",
|
||||
destIndexFromSrc(expected, srcLength, destLength, i),
|
||||
ei2.destinationIndexFromSourceIndex(i));
|
||||
}
|
||||
for (int i : destIndexes) {
|
||||
assertEquals(name + " srcIndexFromDest(" + i + "):",
|
||||
srcIndexFromDest(expected, srcLength, destLength, i),
|
||||
ei2.sourceIndexFromDestinationIndex(i));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -949,6 +1037,167 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
assertFalse("reset then iterator", ei.next());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestMergeEdits() {
|
||||
Edits ab = new Edits(), bc = new Edits(), ac = new Edits(), expected_ac = new Edits();
|
||||
|
||||
// Simple: Two parallel non-changes.
|
||||
ab.addUnchanged(2);
|
||||
bc.addUnchanged(2);
|
||||
expected_ac.addUnchanged(2);
|
||||
|
||||
// Simple: Two aligned changes.
|
||||
ab.addReplace(3, 2);
|
||||
bc.addReplace(2, 1);
|
||||
expected_ac.addReplace(3, 1);
|
||||
|
||||
// Unequal non-changes.
|
||||
ab.addUnchanged(5);
|
||||
bc.addUnchanged(3);
|
||||
expected_ac.addUnchanged(3);
|
||||
// ab ahead by 2
|
||||
|
||||
// Overlapping changes accumulate until they share a boundary.
|
||||
ab.addReplace(4, 3);
|
||||
bc.addReplace(3, 2);
|
||||
ab.addReplace(4, 3);
|
||||
bc.addReplace(3, 2);
|
||||
ab.addReplace(4, 3);
|
||||
bc.addReplace(3, 2);
|
||||
bc.addUnchanged(4);
|
||||
expected_ac.addReplace(14, 8);
|
||||
// bc ahead by 2
|
||||
|
||||
// Balance out intermediate-string lengths.
|
||||
ab.addUnchanged(2);
|
||||
expected_ac.addUnchanged(2);
|
||||
|
||||
// Insert something and delete it: Should disappear.
|
||||
ab.addReplace(0, 5);
|
||||
ab.addReplace(0, 2);
|
||||
bc.addReplace(7, 0);
|
||||
|
||||
// Parallel change to make a new boundary.
|
||||
ab.addReplace(1, 2);
|
||||
bc.addReplace(2, 3);
|
||||
expected_ac.addReplace(1, 3);
|
||||
|
||||
// Multiple ab deletions should remain separate at the boundary.
|
||||
ab.addReplace(1, 0);
|
||||
ab.addReplace(2, 0);
|
||||
ab.addReplace(3, 0);
|
||||
expected_ac.addReplace(1, 0);
|
||||
expected_ac.addReplace(2, 0);
|
||||
expected_ac.addReplace(3, 0);
|
||||
|
||||
// Unequal non-changes can be split for another boundary.
|
||||
ab.addUnchanged(2);
|
||||
bc.addUnchanged(1);
|
||||
expected_ac.addUnchanged(1);
|
||||
// ab ahead by 1
|
||||
|
||||
// Multiple bc insertions should create a boundary and remain separate.
|
||||
bc.addReplace(0, 4);
|
||||
bc.addReplace(0, 5);
|
||||
bc.addReplace(0, 6);
|
||||
expected_ac.addReplace(0, 4);
|
||||
expected_ac.addReplace(0, 5);
|
||||
expected_ac.addReplace(0, 6);
|
||||
// ab ahead by 1
|
||||
|
||||
// Multiple ab deletions in the middle of a bc change are merged.
|
||||
bc.addReplace(2, 2);
|
||||
// bc ahead by 1
|
||||
ab.addReplace(1, 0);
|
||||
ab.addReplace(2, 0);
|
||||
ab.addReplace(3, 0);
|
||||
ab.addReplace(4, 1);
|
||||
expected_ac.addReplace(11, 2);
|
||||
|
||||
// Multiple bc insertions in the middle of an ab change are merged.
|
||||
ab.addReplace(5, 6);
|
||||
bc.addReplace(3, 3);
|
||||
// ab ahead by 3
|
||||
bc.addReplace(0, 4);
|
||||
bc.addReplace(0, 5);
|
||||
bc.addReplace(0, 6);
|
||||
bc.addReplace(3, 7);
|
||||
expected_ac.addReplace(5, 25);
|
||||
|
||||
// Delete around a deletion.
|
||||
ab.addReplace(4, 4);
|
||||
ab.addReplace(3, 0);
|
||||
ab.addUnchanged(2);
|
||||
bc.addReplace(2, 2);
|
||||
bc.addReplace(4, 0);
|
||||
expected_ac.addReplace(9, 2);
|
||||
|
||||
// Insert into an insertion.
|
||||
ab.addReplace(0, 2);
|
||||
bc.addReplace(1, 1);
|
||||
bc.addReplace(0, 8);
|
||||
bc.addUnchanged(4);
|
||||
expected_ac.addReplace(0, 10);
|
||||
// bc ahead by 3
|
||||
|
||||
// Balance out intermediate-string lengths.
|
||||
ab.addUnchanged(3);
|
||||
expected_ac.addUnchanged(3);
|
||||
|
||||
// Deletions meet insertions.
|
||||
// Output order is arbitrary in principle, but we expect insertions first
|
||||
// and want to keep it that way.
|
||||
ab.addReplace(2, 0);
|
||||
ab.addReplace(4, 0);
|
||||
ab.addReplace(6, 0);
|
||||
bc.addReplace(0, 1);
|
||||
bc.addReplace(0, 3);
|
||||
bc.addReplace(0, 5);
|
||||
expected_ac.addReplace(0, 1);
|
||||
expected_ac.addReplace(0, 3);
|
||||
expected_ac.addReplace(0, 5);
|
||||
expected_ac.addReplace(2, 0);
|
||||
expected_ac.addReplace(4, 0);
|
||||
expected_ac.addReplace(6, 0);
|
||||
|
||||
// End with a non-change, so that further edits are never reordered.
|
||||
ab.addUnchanged(1);
|
||||
bc.addUnchanged(1);
|
||||
expected_ac.addUnchanged(1);
|
||||
|
||||
ac.mergeAndAppend(ab, bc);
|
||||
checkEqualEdits("ab+bc", expected_ac, ac);
|
||||
|
||||
// Append more Edits.
|
||||
Edits ab2 = new Edits(), bc2 = new Edits();
|
||||
ab2.addUnchanged(5);
|
||||
bc2.addReplace(1, 2);
|
||||
bc2.addUnchanged(4);
|
||||
expected_ac.addReplace(1, 2);
|
||||
expected_ac.addUnchanged(4);
|
||||
ac.mergeAndAppend(ab2, bc2);
|
||||
checkEqualEdits("ab2+bc2", expected_ac, ac);
|
||||
|
||||
// Append empty edits.
|
||||
Edits empty = new Edits();
|
||||
ac.mergeAndAppend(empty, empty);
|
||||
checkEqualEdits("empty+empty", expected_ac, ac);
|
||||
|
||||
// Error: Append more edits with mismatched intermediate-string lengths.
|
||||
Edits mismatch = new Edits();
|
||||
mismatch.addReplace(1, 1);
|
||||
try {
|
||||
ac.mergeAndAppend(ab2, mismatch);
|
||||
fail("ab2+mismatch did not yield IllegalArgumentException");
|
||||
} catch (IllegalArgumentException expected) {
|
||||
}
|
||||
try {
|
||||
ac.mergeAndAppend(mismatch, bc2);
|
||||
fail("mismatch+bc2 did not yield IllegalArgumentException");
|
||||
} catch (IllegalArgumentException expected) {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestCaseMapWithEdits() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
@ -84,7 +84,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
|
||||
@Test
|
||||
public void TestPropertyAccess() {
|
||||
int count = 0;
|
||||
int count = 0;
|
||||
// test to see that all of the names work
|
||||
for (int propNum = UProperty.BINARY_START; propNum < UProperty.INT_LIMIT; ++propNum) {
|
||||
count++;
|
||||
@ -130,7 +130,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
}
|
||||
} catch (RuntimeException e1) {
|
||||
errln("Can't get property value name for: "
|
||||
+ "Property (" + propNum + "): " + propName + ", "
|
||||
+ "Property (" + propNum + "): " + propName + ", "
|
||||
+ "Value (" + valueNum + ") "
|
||||
+ ", NameChoice: " + nameChoice + ", "
|
||||
+ e1.getClass().getName());
|
||||
@ -142,7 +142,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
testSet = new UnicodeSet("[:" + propName + "=" + valueName + ":]");
|
||||
} catch (RuntimeException e) {
|
||||
errln("Can't create UnicodeSet for: "
|
||||
+ "Property (" + propNum + "): " + propName + ", "
|
||||
+ "Property (" + propNum + "): " + propName + ", "
|
||||
+ "Value (" + valueNum + "): " + valueName + ", "
|
||||
+ e.getClass().getName());
|
||||
continue;
|
||||
@ -155,13 +155,13 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
if (collectedErrors.size() != 0) {
|
||||
errln("Property Value Differs: "
|
||||
+ "Property (" + propNum + "): " + propName + ", "
|
||||
errln("Property Value Differs: "
|
||||
+ "Property (" + propNum + "): " + propName + ", "
|
||||
+ "Value (" + valueNum + "): " + valueName + ", "
|
||||
+ "Differing values: " + collectedErrors.toPattern(true));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -183,7 +183,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
if (!toPatternAux(0, i)) continue;
|
||||
if (!toPatternAux(i, 0xFFFF)) continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test pattern behavior of multicharacter strings.
|
||||
UnicodeSet s = new UnicodeSet("[a-z {aa} {ab}]");
|
||||
@ -211,7 +211,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
new String[] {"abc", NOT, "ab"});
|
||||
|
||||
// JB#3400: For 2 character ranges prefer [ab] to [a-b]
|
||||
s.clear();
|
||||
s.clear();
|
||||
s.add('a', 'b');
|
||||
expectToPattern(s, "[ab]", null);
|
||||
|
||||
@ -244,7 +244,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
}
|
||||
|
||||
static String[] OTHER_TOPATTERN_TESTS = {
|
||||
"[[:latin:]&[:greek:]]",
|
||||
"[[:latin:]&[:greek:]]",
|
||||
"[[:latin:]-[:greek:]]",
|
||||
"[:nonspacing mark:]"
|
||||
};
|
||||
@ -456,7 +456,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
for (int i=0; i<0x200; ++i) {
|
||||
boolean l = UCharacter.isLetter(i);
|
||||
if (l != set.contains((char)i)) {
|
||||
errln("FAIL: L contains " + (char)i + " = " +
|
||||
errln("FAIL: L contains " + (char)i + " = " +
|
||||
set.contains((char)i));
|
||||
if (++failures == 10) break;
|
||||
}
|
||||
@ -466,7 +466,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
for (int i=0; i<0x200; ++i) {
|
||||
boolean lu = (UCharacter.getType(i) == ECharacterCategory.UPPERCASE_LETTER);
|
||||
if (lu != set.contains((char)i)) {
|
||||
errln("FAIL: Lu contains " + (char)i + " = " +
|
||||
errln("FAIL: Lu contains " + (char)i + " = " +
|
||||
set.contains((char)i));
|
||||
if (++failures == 20) break;
|
||||
}
|
||||
@ -653,7 +653,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
logln("bitsToSet(setToBits(c)): " + c);
|
||||
} else {
|
||||
errln("FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
// Additional tests for coverage JB#2118
|
||||
//UnicodeSet::complement(class UnicodeString const &)
|
||||
@ -744,10 +744,10 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
}
|
||||
|
||||
{
|
||||
//Cover addAll(Collection) and addAllTo(Collection)
|
||||
//Cover addAll(Collection) and addAllTo(Collection)
|
||||
// Seems that there is a bug in addAll(Collection) operation
|
||||
// Ram also added a similar test to UtilityTest.java
|
||||
logln("Testing addAll(Collection) ... ");
|
||||
logln("Testing addAll(Collection) ... ");
|
||||
String[] array = {"a", "b", "c", "de"};
|
||||
List list = Arrays.asList(array);
|
||||
Set aset = new HashSet(list);
|
||||
@ -783,20 +783,20 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
// Object[][] testList = {
|
||||
// {I_EQUALS, UnicodeSet.fromAll("abc"),
|
||||
// new UnicodeSet("[a-c]")},
|
||||
//
|
||||
//
|
||||
// {I_EQUALS, UnicodeSet.from("ch").add('a','z').add("ll"),
|
||||
// new UnicodeSet("[{ll}{ch}a-z]")},
|
||||
//
|
||||
// {I_EQUALS, UnicodeSet.from("ab}c"),
|
||||
//
|
||||
// {I_EQUALS, UnicodeSet.from("ab}c"),
|
||||
// new UnicodeSet("[{ab\\}c}]")},
|
||||
//
|
||||
// {I_EQUALS, new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
|
||||
//
|
||||
// {I_EQUALS, new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
|
||||
// new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")},
|
||||
// };
|
||||
//
|
||||
//
|
||||
// for (int i = 0; i < testList.length; ++i) {
|
||||
// expectRelation(testList[i][0], testList[i][1], testList[i][2], "(" + i + ")");
|
||||
// }
|
||||
// }
|
||||
|
||||
UnicodeSet[][] testList = {
|
||||
{UnicodeSet.fromAll("abc"),
|
||||
@ -805,10 +805,10 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
{UnicodeSet.from("ch").add('a','z').add("ll"),
|
||||
new UnicodeSet("[{ll}{ch}a-z]")},
|
||||
|
||||
{UnicodeSet.from("ab}c"),
|
||||
{UnicodeSet.from("ab}c"),
|
||||
new UnicodeSet("[{ab\\}c}]")},
|
||||
|
||||
{new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
|
||||
{new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
|
||||
new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")},
|
||||
};
|
||||
|
||||
@ -816,10 +816,10 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
if (!testList[i][0].equals(testList[i][1])) {
|
||||
errln("FAIL: sets unequal; see source code (" + i + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static final Integer
|
||||
static final Integer
|
||||
I_ANY = new Integer(SortedSetRelation.ANY),
|
||||
I_CONTAINS = new Integer(SortedSetRelation.CONTAINS),
|
||||
I_DISJOINT = new Integer(SortedSetRelation.DISJOINT),
|
||||
@ -875,12 +875,12 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
|
||||
iset.add(new Integer(size + 1)); // add odd value in middle
|
||||
|
||||
CheckSpeed(iset, jset, "when a contains b", iterations);
|
||||
CheckSpeed(iset, jset, "when a contains b", iterations);
|
||||
CheckSpeed(jset, iset, "when b contains a", iterations);
|
||||
|
||||
jset.add(new Integer(size - 1)); // add different odd value in middle
|
||||
|
||||
CheckSpeed(jset, iset, "when a, b are disjoint", iterations);
|
||||
CheckSpeed(jset, iset, "when a, b are disjoint", iterations);
|
||||
}
|
||||
|
||||
void CheckSpeed(SortedSet iset, SortedSet jset, String message, int iterations) {
|
||||
@ -952,28 +952,28 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
|
||||
public static final String[] RELATION_NAME = {
|
||||
"both-are-null",
|
||||
"a-is-null",
|
||||
"equals",
|
||||
"a-is-null",
|
||||
"equals",
|
||||
"is-contained-in",
|
||||
"b-is-null",
|
||||
"is-disjoint_with",
|
||||
"contains",
|
||||
"contains",
|
||||
"any", };
|
||||
|
||||
boolean dumbHasRelation(Collection A, int filter, Collection B) {
|
||||
Collection ab = new TreeSet(A);
|
||||
ab.retainAll(B);
|
||||
if (ab.size() > 0 && (filter & SortedSetRelation.A_AND_B) == 0) return false;
|
||||
if (ab.size() > 0 && (filter & SortedSetRelation.A_AND_B) == 0) return false;
|
||||
|
||||
// A - B size == A.size - A&B.size
|
||||
if (A.size() > ab.size() && (filter & SortedSetRelation.A_NOT_B) == 0) return false;
|
||||
if (A.size() > ab.size() && (filter & SortedSetRelation.A_NOT_B) == 0) return false;
|
||||
|
||||
// B - A size == B.size - A&B.size
|
||||
if (B.size() > ab.size() && (filter & SortedSetRelation.B_NOT_A) == 0) return false;
|
||||
if (B.size() > ab.size() && (filter & SortedSetRelation.B_NOT_A) == 0) return false;
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
void checkSetRelation(SortedSet a, SortedSet b, String message) {
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
@ -984,7 +984,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
logln(message + " " + hasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b);
|
||||
|
||||
if (hasRelation != dumbHasRelation) {
|
||||
errln("FAIL: " +
|
||||
errln("FAIL: " +
|
||||
message + " " + dumbHasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b);
|
||||
}
|
||||
}
|
||||
@ -1077,9 +1077,9 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
"\u03D6", // 1.1
|
||||
"\u03D8\u03D9", // 3.2
|
||||
|
||||
"[:Age=3.1:]",
|
||||
"\\u1800\\u3400\\U0002f800",
|
||||
"\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
|
||||
"[:Age=3.1:]",
|
||||
"\\u1800\\u3400\\U0002f800",
|
||||
"\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
|
||||
|
||||
// JB#2350: Case_Sensitive
|
||||
"[:Case Sensitive:]",
|
||||
@ -1168,7 +1168,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
"\\uFDF2"
|
||||
};
|
||||
|
||||
for (int i=0; i<DATA.length; i+=3) {
|
||||
for (int i=0; i<DATA.length; i+=3) {
|
||||
expectContainment(DATA[i], DATA[i+1], DATA[i+2]);
|
||||
}
|
||||
}
|
||||
@ -1319,7 +1319,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
|
||||
CASE,
|
||||
"[{F\uFB01}]",
|
||||
"[\uFB03{ffi}]",
|
||||
"[\uFB03{ffi}]",
|
||||
|
||||
CASE,
|
||||
"[a-z]","[A-Za-z\u017F\u212A]",
|
||||
@ -1615,6 +1615,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
assertEquals("compareTo-shorter-first", goalShortest, sorted);
|
||||
|
||||
TreeSet<UnicodeSet> sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){
|
||||
@Override
|
||||
public int compare(UnicodeSet o1, UnicodeSet o2) {
|
||||
// TODO Auto-generated method stub
|
||||
return o1.compareTo(o2, ComparisonStyle.LONGER_FIRST);
|
||||
@ -1625,6 +1626,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
assertEquals("compareTo-longer-first", goalLongest, sorted);
|
||||
|
||||
sorted1 = new TreeSet<UnicodeSet>(new Comparator<UnicodeSet>(){
|
||||
@Override
|
||||
public int compare(UnicodeSet o1, UnicodeSet o2) {
|
||||
// TODO Auto-generated method stub
|
||||
return o1.compareTo(o2, ComparisonStyle.LEXICOGRAPHIC);
|
||||
@ -1931,6 +1933,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
|
||||
*/
|
||||
@Override
|
||||
public char[] lookup(String s) {
|
||||
logln("TokenSymbolTable: lookup \"" + s + "\" => \"" +
|
||||
new String((char[]) contents.get(s)) + "\"");
|
||||
@ -1940,6 +1943,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
|
||||
*/
|
||||
@Override
|
||||
public UnicodeMatcher lookupMatcher(int ch) {
|
||||
// This implementation ignores ch and unconditionally returns null.
return null;
|
||||
}
|
||||
@ -1948,6 +1952,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
* @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String,
|
||||
java.text.ParsePosition, int)
|
||||
*/
|
||||
@Override
|
||||
public String parseReference(String text, ParsePosition pos, int
|
||||
limit) {
|
||||
int cp;
|
||||
@ -1982,7 +1987,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
CharsToUnicodeString("abc\\U00010000"),
|
||||
"\uD800;\uDC00"); // split apart surrogate-pair
|
||||
if (set.size() != 4) {
|
||||
errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " +
|
||||
errln(Utility.escape("FAIL: " + DATA[i] + ".size() == " +
|
||||
set.size() + ", expected 4"));
|
||||
}
|
||||
}
|
||||
@ -2385,11 +2390,11 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
}
|
||||
boolean contained = set.contains(expStrings[i]);
|
||||
if (contained == in) {
|
||||
logln("Ok: " + expPat +
|
||||
logln("Ok: " + expPat +
|
||||
(contained ? " contains {" : " does not contain {") +
|
||||
Utility.escape(expStrings[i]) + "}");
|
||||
} else {
|
||||
errln("FAIL: " + expPat +
|
||||
errln("FAIL: " + expPat +
|
||||
(contained ? " contains {" : " does not contain {") +
|
||||
Utility.escape(expStrings[i]) + "}");
|
||||
}
|
||||
@ -2442,10 +2447,10 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
assertEquals("", "M, a-c", CollectionUtilities.join(us1.ranges(), ", "));
|
||||
|
||||
// Sample code
|
||||
for (@SuppressWarnings("unused") EntryRange range : us1.ranges()) {
|
||||
// do something with code points between range.codepoint and range.codepointEnd;
|
||||
for (@SuppressWarnings("unused") EntryRange range : us1.ranges()) {
|
||||
// do something with code points between range.codepoint and range.codepointEnd;
|
||||
}
|
||||
for (@SuppressWarnings("unused") String s : us1.strings()) {
|
||||
for (@SuppressWarnings("unused") String s : us1.strings()) {
|
||||
// do something with each string;
|
||||
}
|
||||
|
||||
@ -2479,7 +2484,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
UnicodeSetSpanner m;
|
||||
|
||||
m = new UnicodeSetSpanner(new UnicodeSet("[._]"));
|
||||
assertEquals("", "abc", m.deleteFrom("_._a_._b_._c_._"));
|
||||
assertEquals("", "abc", m.deleteFrom("_._a_._b_._c_._"));
|
||||
assertEquals("", "_.__.__.__._", m.deleteFrom("_._a_._b_._c_._", SpanCondition.NOT_CONTAINED));
|
||||
|
||||
assertEquals("", "a_._b_._c", m.trim("_._a_._b_._c_._"));
|
||||
@ -2511,11 +2516,11 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
checkCodePoints("👦", "👧", CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE, null, 1);
|
||||
}
|
||||
|
||||
private void checkCodePoints(String a, String b, CountMethod quantifier, SpanCondition spanCondition,
|
||||
private void checkCodePoints(String a, String b, CountMethod quantifier, SpanCondition spanCondition,
|
||||
String expectedReplaced, int expectedCount) {
|
||||
final String ab = a+b;
|
||||
UnicodeSetSpanner m = new UnicodeSetSpanner(new UnicodeSet("[{" + a + "}]"));
|
||||
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")",
|
||||
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").countIn(\"" + ab + "\")",
|
||||
expectedCount,
|
||||
callCountIn(m, ab, quantifier, spanCondition)
|
||||
);
|
||||
@ -2523,7 +2528,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
if (expectedReplaced == null) {
|
||||
expectedReplaced = "-" + b;
|
||||
}
|
||||
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").replaceFrom(\"" + ab + "\", \"-\")",
|
||||
assertEquals("new UnicodeSetSpanner(\"[{" + a + "}]\").replaceFrom(\"" + ab + "\", \"-\")",
|
||||
expectedReplaced, m.replaceFrom(ab, "-", quantifier));
|
||||
}
|
||||
|
||||
@ -2586,9 +2591,6 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
/**
|
||||
* Check that there are no gaps, when we alternate spanning. That is, there
|
||||
* should only be a zero length span at the very start.
|
||||
* @param longString
|
||||
* @param us
|
||||
* @param simple
|
||||
*/
|
||||
private int checkSpan(String longString, UnicodeSet us, SpanCondition spanCondition) {
|
||||
int start = 0;
|
||||
@ -2657,7 +2659,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
assertEquals("CharSequence complementAll", new UnicodeSet("[ABbc]"), new UnicodeSet("[a-cA]").complementAll(new StringBuilder("aB")) );
|
||||
|
||||
// containment
|
||||
assertEquals("CharSequence contains", true, new UnicodeSet("[a-cA{ab}]"). contains(new StringBuilder("ab")) );
|
||||
assertEquals("CharSequence contains", true, new UnicodeSet("[a-cA{ab}]"). contains(new StringBuilder("ab")) );
|
||||
assertEquals("CharSequence containsNone", false, new UnicodeSet("[a-cA]"). containsNone(new StringBuilder("ab")) );
|
||||
assertEquals("CharSequence containsSome", true, new UnicodeSet("[a-cA{ab}]"). containsSome(new StringBuilder("ab")) );
|
||||
|
||||
@ -2726,7 +2728,7 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
0, UnicodeSet.fromAll("a").compareTo(Collections.singleton("a")));
|
||||
|
||||
// Longer is bigger
|
||||
assertTrue("UnicodeSet is empty",
|
||||
assertTrue("UnicodeSet is empty",
|
||||
UnicodeSet.ALL_CODE_POINTS.compareTo(test_set) > 0);
|
||||
assertTrue("UnicodeSet not empty",
|
||||
UnicodeSet.EMPTY.compareTo(Collections.singleton("a")) < 0);
|
||||
@ -2739,4 +2741,33 @@ public class UnicodeSetTest extends TestFmwk {
|
||||
assertTrue("UnicodeSet comparison wrong",
|
||||
UnicodeSet.fromAll("b").compareTo(Collections.singleton("a")) > 0);
|
||||
}
|
||||
|
||||
/**
 * Checks UnicodeSet parsing of canonical combining class (ccc) patterns.
 * "[:ccc=2:]" and "[:ccc=255:]" must parse and produce empty sets, while
 * "[:ccc=-1:]", "[:ccc=256:]" and "[:ccc=1.1:]" must each throw
 * IllegalArgumentException.
 */
@Test
|
||||
public void TestUnusedCcc() {
|
||||
// All numeric ccc values 0..255 are valid, but many are unused.
|
||||
UnicodeSet ccc2 = new UnicodeSet("[:ccc=2:]");
|
||||
assertTrue("[:ccc=2:] -> empty set", ccc2.isEmpty());
|
||||
|
||||
UnicodeSet ccc255 = new UnicodeSet("[:ccc=255:]");
|
||||
assertTrue("[:ccc=255:] -> empty set", ccc255.isEmpty());
|
||||
|
||||
// Non-integer values and values outside 0..255 are invalid.
// Each case below fails the test if the constructor returns normally;
// the empty catch blocks swallow the exception that is expected.
|
||||
try {
|
||||
new UnicodeSet("[:ccc=-1:]");
|
||||
fail("[:ccc=-1:] -> illegal argument");
|
||||
} catch (IllegalArgumentException expected) {
|
||||
}
|
||||
|
||||
try {
|
||||
new UnicodeSet("[:ccc=256:]");
|
||||
fail("[:ccc=256:] -> illegal argument");
|
||||
} catch (IllegalArgumentException expected) {
|
||||
}
|
||||
|
||||
try {
|
||||
new UnicodeSet("[:ccc=1.1:]");
|
||||
fail("[:ccc=1.1:] -> illegal argument");
|
||||
} catch (IllegalArgumentException expected) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -63,7 +63,7 @@ public class BreakIteratorTest extends TestFmwk
|
||||
List<String> previousResults = _testLastAndPrevious(bi, text);
|
||||
|
||||
logln("comparing forward and backward...");
|
||||
//TODO(junit) - needs to be rewritten
|
||||
// TODO(#13318): As part of clean-up, permanently remove the error count check.
|
||||
//int errs = getErrorCount();
|
||||
compareFragmentLists("forward iteration", "backward iteration", nextResults,
|
||||
previousResults);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -279,7 +279,7 @@ public class RBBITest extends TestFmwk {
|
||||
List<String> previousResults = _testLastAndPrevious(rbbi, text);
|
||||
|
||||
logln("comparing forward and backward...");
|
||||
//TODO(junit) - needs to be rewritten
|
||||
// TODO(#13318): As part of clean-up, permanently remove the error count check.
|
||||
//int errs = getErrorCount();
|
||||
compareFragmentLists("forward iteration", "backward iteration", nextResults, previousResults);
|
||||
//if (getErrorCount() == errs) {
|
||||
@ -957,4 +957,20 @@ public class RBBITest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Checks locale reporting, equality and cloning of word break iterators.
 * Iterators requested for ULocale.ENGLISH and ULocale.FRANCE must report
 * ENGLISH and FRENCH as their VALID_LOCALE, and must still compare equal
 * to each other. A clone of each must equal its source and report the
 * same VALID_LOCALE.
 */
@Test
|
||||
public void TestBug12519() {
|
||||
RuleBasedBreakIterator biEn = (RuleBasedBreakIterator)BreakIterator.getWordInstance(ULocale.ENGLISH);
|
||||
RuleBasedBreakIterator biFr = (RuleBasedBreakIterator)BreakIterator.getWordInstance(ULocale.FRANCE);
|
||||
assertEquals("", ULocale.ENGLISH, biEn.getLocale(ULocale.VALID_LOCALE));
|
||||
// Requested FRANCE, but the valid locale is expected to be FRENCH.
assertEquals("", ULocale.FRENCH, biFr.getLocale(ULocale.VALID_LOCALE));
|
||||
assertEquals("Locales do not participate in BreakIterator equality.", biEn, biFr);
|
||||
|
||||
// A clone must be equal to its source and keep the source's valid locale.
RuleBasedBreakIterator cloneEn = (RuleBasedBreakIterator)biEn.clone();
|
||||
assertEquals("", biEn, cloneEn);
|
||||
assertEquals("", ULocale.ENGLISH, cloneEn.getLocale(ULocale.VALID_LOCALE));
|
||||
|
||||
RuleBasedBreakIterator cloneFr = (RuleBasedBreakIterator)biFr.clone();
|
||||
assertEquals("", biFr, cloneFr);
|
||||
assertEquals("", ULocale.FRENCH, cloneFr.getLocale(ULocale.VALID_LOCALE));
|
||||
}
|
||||
}
|
||||
|
@ -20,7 +20,7 @@ import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
|
||||
@ -39,7 +39,7 @@ public RBBITestExtended() {
|
||||
|
||||
static class TestParams {
|
||||
BreakIterator bi;
|
||||
StringBuffer dataToBreak = new StringBuffer();
|
||||
StringBuilder dataToBreak = new StringBuilder();
|
||||
int[] expectedBreaks = new int[1000];
|
||||
int[] srcLine = new int[1000];
|
||||
int[] srcCol = new int[1000];
|
||||
@ -55,7 +55,7 @@ public void TestExtended() {
|
||||
//
|
||||
// Open and read the test data file.
|
||||
//
|
||||
StringBuffer testFileBuf = new StringBuffer();
|
||||
StringBuilder testFileBuf = new StringBuilder();
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
|
||||
@ -78,7 +78,7 @@ public void TestExtended() {
|
||||
continue;
|
||||
}
|
||||
|
||||
UTF16.append(testFileBuf, c);
|
||||
testFileBuf.appendCodePoint(c);
|
||||
}
|
||||
} finally {
|
||||
isr.close();
|
||||
@ -99,20 +99,12 @@ public void TestExtended() {
|
||||
final int PARSE_TAG = 2;
|
||||
final int PARSE_DATA = 3;
|
||||
final int PARSE_NUM = 4;
|
||||
final int PARSE_RULES = 5;
|
||||
|
||||
int parseState = PARSE_TAG;
|
||||
|
||||
int savedState = PARSE_TAG;
|
||||
|
||||
final char CH_LF = 0x0a;
|
||||
final char CH_CR = 0x0d;
|
||||
final char CH_HASH = 0x23;
|
||||
/*static const UChar CH_PERIOD = 0x2e;*/
|
||||
final char CH_LT = 0x3c;
|
||||
final char CH_GT = 0x3e;
|
||||
final char CH_BACKSLASH = 0x5c;
|
||||
final char CH_BULLET = 0x2022;
|
||||
|
||||
int lineNum = 1;
|
||||
int colStart = 0;
|
||||
int column = 0;
|
||||
@ -120,17 +112,21 @@ public void TestExtended() {
|
||||
int i;
|
||||
|
||||
int tagValue = 0; // The numeric value of a <nnn> tag.
|
||||
|
||||
StringBuilder rules = new StringBuilder(); // Holds rules from a <rules> ... </rules> block
|
||||
int rulesFirstLine = 0; // Line number of the start of current <rules> block
|
||||
|
||||
int len = testString.length();
|
||||
|
||||
for (charIdx = 0; charIdx < len; ) {
|
||||
int c = UTF16.charAt(testString, charIdx);
|
||||
int c = testString.codePointAt(charIdx);
|
||||
charIdx++;
|
||||
if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
|
||||
if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
|
||||
// treat CRLF as a unit
|
||||
c = CH_LF;
|
||||
c = '\n';
|
||||
charIdx++;
|
||||
}
|
||||
if (c == CH_LF || c == CH_CR) {
|
||||
if (c == '\n' || c == '\r') {
|
||||
lineNum++;
|
||||
colStart = charIdx;
|
||||
}
|
||||
@ -145,7 +141,7 @@ public void TestExtended() {
|
||||
|
||||
case PARSE_TAG:
|
||||
{
|
||||
if (c == CH_HASH) {
|
||||
if (c == '#') {
|
||||
parseState = PARSE_COMMENT;
|
||||
savedState = PARSE_TAG;
|
||||
break;
|
||||
@ -178,6 +174,15 @@ public void TestExtended() {
|
||||
charIdx += 6;
|
||||
break;
|
||||
}
|
||||
if (testString.startsWith("<rules>", charIdx-1) ||
|
||||
testString.startsWith("<badrules>", charIdx-1)) {
|
||||
charIdx = testString.indexOf('>', charIdx) + 1;
|
||||
parseState = PARSE_RULES;
|
||||
rules.setLength(0);
|
||||
rulesFirstLine = lineNum;
|
||||
break;
|
||||
}
|
||||
|
||||
if (testString.startsWith("<locale ", charIdx-1)) {
|
||||
int closeIndex = testString.indexOf(">", charIdx);
|
||||
if (closeIndex < 0) {
|
||||
@ -206,8 +211,36 @@ public void TestExtended() {
|
||||
//savedState = PARSE_DATA;
|
||||
}
|
||||
|
||||
case PARSE_RULES:
|
||||
if (testString.startsWith("</rules>", charIdx-1)) {
|
||||
charIdx += 7;
|
||||
parseState = PARSE_TAG;
|
||||
try {
|
||||
tp.bi = new RuleBasedBreakIterator(rules.toString());
|
||||
} catch (IllegalArgumentException e) {
|
||||
errln(String.format("rbbitst.txt:%d Error creating break iterator from rules. %s", lineNum, e));
|
||||
}
|
||||
} else if (testString.startsWith("</badrules>", charIdx-1)) {
|
||||
charIdx += 10;
|
||||
parseState = PARSE_TAG;
|
||||
boolean goodRules = true;
|
||||
try {
|
||||
new RuleBasedBreakIterator(rules.toString());
|
||||
} catch (IllegalArgumentException e) {
|
||||
goodRules = false;
|
||||
}
|
||||
if (goodRules) {
|
||||
errln(String.format(
|
||||
"rbbitst.txt:%d Expected, but did not get, a failure creating break iterator from rules.",
|
||||
lineNum));
|
||||
}
|
||||
} else {
|
||||
rules.appendCodePoint(c);
|
||||
}
|
||||
break;
|
||||
|
||||
case PARSE_DATA:
|
||||
if (c == CH_BULLET) {
|
||||
if (c == '•') {
|
||||
int breakIdx = tp.dataToBreak.length();
|
||||
tp.expectedBreaks[breakIdx] = -1;
|
||||
tp.srcLine[breakIdx] = lineNum;
|
||||
@ -247,7 +280,7 @@ public void TestExtended() {
|
||||
} else {
|
||||
// Named code point was recognized. Insert it
|
||||
// into the test data.
|
||||
UTF16.append(tp.dataToBreak, c);
|
||||
tp.dataToBreak.appendCodePoint(c);
|
||||
for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
|
||||
tp.srcLine[i] = lineNum;
|
||||
tp.srcCol[i] = column;
|
||||
@ -269,28 +302,28 @@ public void TestExtended() {
|
||||
break;
|
||||
}
|
||||
|
||||
if (c == CH_LT) {
|
||||
if (c == '<') {
|
||||
tagValue = 0;
|
||||
parseState = PARSE_NUM;
|
||||
break;
|
||||
}
|
||||
|
||||
if (c == CH_HASH && column==3) { // TODO: why is column off so far?
|
||||
if (c == '#' && column==3) { // TODO: why is column off so far?
|
||||
parseState = PARSE_COMMENT;
|
||||
savedState = PARSE_DATA;
|
||||
break;
|
||||
}
|
||||
|
||||
if (c == CH_BACKSLASH) {
|
||||
if (c == '\\') {
|
||||
// Check for \ at end of line, a line continuation.
|
||||
// Advance over (discard) the newline
|
||||
int cp = UTF16.charAt(testString, charIdx);
|
||||
if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
|
||||
int cp = testString.codePointAt(charIdx);
|
||||
if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
|
||||
// We have a CR LF
|
||||
// Need an extra increment of the input ptr to move over both of them
|
||||
charIdx++;
|
||||
}
|
||||
if (cp == CH_LF || cp == CH_CR) {
|
||||
if (cp == '\n' || cp == '\r') {
|
||||
lineNum++;
|
||||
column = 0;
|
||||
charIdx++;
|
||||
@ -306,7 +339,7 @@ public void TestExtended() {
|
||||
// Escape sequence was recognized. Insert the char
|
||||
// into the test data.
|
||||
charIdx = charIdxAr[0];
|
||||
UTF16.append(tp.dataToBreak, cp);
|
||||
tp.dataToBreak.appendCodePoint(cp);
|
||||
for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
|
||||
tp.srcLine[i] = lineNum;
|
||||
tp.srcCol[i] = column;
|
||||
@ -319,12 +352,12 @@ public void TestExtended() {
|
||||
// Not a recognized backslash escape sequence.
|
||||
// Take the next char as a literal.
|
||||
// TODO: Should this be an error?
|
||||
c = UTF16.charAt(testString,charIdx);
|
||||
charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
|
||||
c = testString.codePointAt(charIdx);
|
||||
charIdx = testString.offsetByCodePoints(charIdx, 1);
|
||||
}
|
||||
|
||||
// Normal, non-escaped data char.
|
||||
UTF16.append(tp.dataToBreak, c);
|
||||
tp.dataToBreak.appendCodePoint(c);
|
||||
|
||||
// Save the mapping from offset in the data to line/column numbers in
|
||||
// the original input file. Will be used for better error messages only.
|
||||
@ -344,7 +377,7 @@ public void TestExtended() {
|
||||
break;
|
||||
}
|
||||
|
||||
if (c == CH_GT) {
|
||||
if (c == '>') {
|
||||
// Finished the number. Add the info to the expected break data,
|
||||
// and switch parse state back to doing plain data.
|
||||
parseState = PARSE_DATA;
|
||||
@ -363,15 +396,19 @@ public void TestExtended() {
|
||||
break;
|
||||
}
|
||||
|
||||
errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
|
||||
errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
|
||||
return;
|
||||
|
||||
// parseState = PARSE_COMMENT; // TODO: unreachable. Don't stop on errors.
|
||||
// break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Reached end of test file. Raise an error if parseState indicates that we are
|
||||
// within a block that should have been terminated.
|
||||
if (parseState == PARSE_RULES) {
|
||||
errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
|
||||
lineNum, rulesFirstLine));
|
||||
}
|
||||
if (parseState == PARSE_DATA) {
|
||||
errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -9,7 +9,10 @@
|
||||
package com.ibm.icu.dev.test.rbbi;
|
||||
|
||||
|
||||
// Monkey testing of RuleBasedBreakIterator
|
||||
// Monkey testing of RuleBasedBreakIterator.
|
||||
// The old, original monkey test. TODO: remove
|
||||
// The new monkey test is class RBBIMonkeyTest.
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
@ -0,0 +1,69 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: grapheme.txt
|
||||
#
|
||||
# Reference Grapheme Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
||||
type = grapheme; # one of grapheme | word | line | sentence
|
||||
locale = en;
|
||||
|
||||
CR = [\p{Grapheme_Cluster_Break = CR}];
|
||||
LF = [\p{Grapheme_Cluster_Break = LF}];
|
||||
|
||||
Control = [[\p{Grapheme_Cluster_Break = Control}]];
|
||||
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
|
||||
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
|
||||
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
|
||||
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
|
||||
SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
|
||||
|
||||
#
|
||||
# Korean Syllable Definitions
|
||||
#
|
||||
L = [\p{Grapheme_Cluster_Break = L}];
|
||||
V = [\p{Grapheme_Cluster_Break = V}];
|
||||
T = [\p{Grapheme_Cluster_Break = T}];
|
||||
LV = [\p{Grapheme_Cluster_Break = LV}];
|
||||
LVT = [\p{Grapheme_Cluster_Break = LVT}];
|
||||
|
||||
# Emoji definitions
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [Regional_Indicator\u002a\u00230-9©®™〰〽]];
|
||||
E_Base = [\p{Grapheme_Cluster_Break = EB}];
|
||||
E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
|
||||
E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
|
||||
|
||||
GB3: CR LF;
|
||||
GB4: (Control | CR | LF) ÷;
|
||||
GB5: . ÷ (Control | CR | LF);
|
||||
|
||||
GB6: L (L | V | LV | LVT);
|
||||
GB7: (LV | V) (V | T);
|
||||
GB8: (LVT | T) T;
|
||||
|
||||
GB10: (E_Base | E_Base_GAZ) Extend* E_Modifier;
|
||||
GB11: (Extended_Pict | EmojiNRK) Extend* ZWJ (Extended_Pict | EmojiNRK);
|
||||
GB9: . (Extend | ZWJ);
|
||||
|
||||
GB9a: . SpacingMark;
|
||||
GB9b: Prepend .;
|
||||
|
||||
# Regional Indicators, split into pairs.
|
||||
# Note that a pair of RIs that is not followed by a third RI will fall into
|
||||
# the normal rules for Extend, etc.
|
||||
#
|
||||
GB12: Regional_Indicator Regional_Indicator ÷ Regional_Indicator;
|
||||
GB13: Regional_Indicator Regional_Indicator;
|
||||
|
||||
GB999: . ÷;
|
@ -0,0 +1,200 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: line.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
||||
|
||||
type = line;
|
||||
locale = en;
|
||||
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EM = [:LineBreak = EM:];
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [:LineBreak = Ideographic:];
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NS = [[:LineBreak = Nonstarter:] CJ]; # CSS Strict tailoring: CJ resolves to NS.
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14, while only its prefix
|
||||
# "OP CM SP" matches LB7.1.
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
@ -0,0 +1,208 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
|
||||
# Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN
|
||||
|
||||
type = line;
|
||||
locale = en@lb=loose;
|
||||
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EM = [:LineBreak = EM:];
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [[:LineBreak = Ideographic:] CJ]; # CSS Loose tailoring: CJ resolves to ID
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
|
||||
NS = [[:LineBreak = Nonstarter:] - NSX];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14, while only its prefix
|
||||
# "OP CM SP" matches LB7.1.
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
@ -0,0 +1,229 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN such as 2026
|
||||
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
|
||||
# FF65 (all NS) and FF01, FF1F (both EX).
|
||||
# * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
|
||||
# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
|
||||
# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
|
||||
# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
|
||||
|
||||
|
||||
type = line;
|
||||
locale = ja@lb=loose;
|
||||
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [[:LineBreak = Alphabetic:]];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EM = [:LineBreak = EM:];
|
||||
EXX = [\uFF01 \uFF1F];
|
||||
EX = [[:LineBreak = Exclamation:] - EXX];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [[:LineBreak = Ideographic:] CJ]; # CSS Loose tailoring: CJ resolves to ID
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
|
||||
NS = [[:LineBreak = Nonstarter:] - NSX];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
|
||||
PO = [[:LineBreak = Postfix_Numeric:] - POX];
|
||||
PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
|
||||
PR = [[:LineBreak = Prefix_Numeric:] - PRX];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14, while only its prefix
|
||||
# "OP CM SP" matches LB7.1.
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA BAX HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
|
||||
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
# LB22.4: IN CM* IN; # delete this rule for CSS loose.
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO | POX) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO | POX);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
# Loose_cj tailoring: do not include $PRX at the beginning or $POX at the end.
|
||||
LB25: ((PR | PO | POX)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PRX | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
@ -0,0 +1,214 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
|
||||
# Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
|
||||
|
||||
type = line;
|
||||
locale = en@lb=normal;
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EM = [:LineBreak = EM:];
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [[:LineBreak = Ideographic:] CJ]; # CSS Normal tailoring: CJ resolves to ID
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NS = [:LineBreak = Nonstarter:];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it,
|
||||
# "OP CM SP", matches only LB7.1.
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
@ -0,0 +1,223 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal_cj.txt
|
||||
#
|
||||
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 34 for Unicode 8.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
|
||||
type = line;
|
||||
locale = ja@lb=normal;
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
CB = [:LineBreak = Contingent_Break:];
|
||||
CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
CL = [:LineBreak = Close_Punctuation:];
|
||||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EM = [:LineBreak = EM:];
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
HL = [:LineBreak = Hebrew_Letter:];
|
||||
HY = [:LineBreak = Hyphen:];
|
||||
H2 = [:LineBreak = H2:];
|
||||
H3 = [:LineBreak = H3:];
|
||||
ID = [[:LineBreak = Ideographic:] CJ]; # CSS Normal tailoring: CJ resolves to ID
|
||||
IN = [:LineBreak = Inseperable:];
|
||||
IS = [:LineBreak = Infix_Numeric:];
|
||||
JL = [:LineBreak = JL:];
|
||||
JV = [:LineBreak = JV:];
|
||||
JT = [:LineBreak = JT:];
|
||||
LF = [:LineBreak = Line_Feed:];
|
||||
NL = [:LineBreak = Next_Line:];
|
||||
NSX = [\u301C \u30A0];
|
||||
NS = [[:LineBreak = Nonstarter:] - NSX];
|
||||
NU = [:LineBreak = Numeric:];
|
||||
OP = [:LineBreak = Open_Punctuation:];
|
||||
PO = [:LineBreak = Postfix_Numeric:];
|
||||
PR = [:LineBreak = Prefix_Numeric:];
|
||||
QU = [:LineBreak = Quotation:];
|
||||
RI = [:LineBreak = Regional_Indicator:];
|
||||
SA = [:LineBreak = Complex_Context:];
|
||||
SG = [:LineBreak = Surrogate:];
|
||||
SP = [:LineBreak = Space:];
|
||||
SY = [:LineBreak = Break_Symbols:];
|
||||
WJ = [:LineBreak = Word_Joiner:];
|
||||
XX = [:LineBreak = Unknown:];
|
||||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
CM = [CM ZWJ];
|
||||
|
||||
LB4: BK ÷;
|
||||
LB5: CR LF;
|
||||
LB5.1: CR ÷;
|
||||
LB5.2: LF ÷;
|
||||
LB5.3: NL ÷;
|
||||
|
||||
LB6: . (BK | CR | LF | NL);
|
||||
LB6.1: [^BK CR LF NL SP ZW] CM* (BK | CR | LF | NL);
|
||||
|
||||
# Rules LB14 - LB17.
|
||||
# Moved before LB7, because they can match a longer sequence that would also match LB7,
|
||||
# for example, the sequence "OP CM SP AL" matches LB14 while the prefix of it,
|
||||
# "OP CM SP", matches only LB7.1.
|
||||
LB14: OP CM* SP* .;
|
||||
LB15: QU CM* SP* OP;
|
||||
|
||||
# Do not break between closing punctuation and $NS, even with intervening spaces
|
||||
# But DO allow a break between closing punctuation and $NSX, don't include it here
|
||||
LB16: (CL | CP)CM* SP* NS;
|
||||
LB17: B2 CM* SP* B2;
|
||||
|
||||
LB7.1: [^ZW SP] CM* [SP ZW];
|
||||
LB7.2: [ZW SP] [SP ZW];
|
||||
|
||||
# LB8, ICU differs from UAX-14,
|
||||
# ICU: ZW ÷;
|
||||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
# LB10: Unattached CM -> AL
|
||||
|
||||
#LB11: × WJ;
|
||||
# WJ ×
|
||||
|
||||
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
|
||||
LB11.2: SP WJ;
|
||||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA BAX HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
||||
LB13.1: [^NU SP] CM* [CL CP IS SY];
|
||||
LB13.2: [^SP] CM* EX;
|
||||
LB13.2: SP [CL CP EX IS SY];
|
||||
|
||||
|
||||
# LB 14-17 are moved above LB 7.
|
||||
|
||||
LB18: SP ÷;
|
||||
|
||||
LB19: . CM* QU;
|
||||
LB19.1: QU CM* [^CM];
|
||||
|
||||
# LB 20 Break before and after CB.
|
||||
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
|
||||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
|
||||
# should "HL BAX" not break when followed by a CB? That's what the current
|
||||
# rules do, which is why "[^CM CB]?" includes the ?.
|
||||
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
|
||||
|
||||
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
|
||||
LB21.1: . CM* [BA HY NS];
|
||||
LB21.2: BB CM* [^CM CB];
|
||||
|
||||
LB21b: SY CM* HL;
|
||||
|
||||
LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL.
|
||||
LB22.2: EX CM* IN;
|
||||
LB22.3: (ID | EB | EM) CM* IN;
|
||||
LB22.4: IN CM* IN;
|
||||
LB22.5: NU CM* IN;
|
||||
|
||||
LB23.1: (AL | HL | CM) CM* NU;
|
||||
LB23.2: NU CM* (AL | HL);
|
||||
|
||||
LB23a.1: PR CM* (ID | EB | EM);
|
||||
LB23a.2: (ID | EB | EM) CM* PO;
|
||||
|
||||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
LB26.2: (JV | H2) CM* (JV | JT);
|
||||
LB26.3: (JT | H3) CM* JT;
|
||||
|
||||
LB27.1: (JL | JV | JT | H2 | H3) CM* IN;
|
||||
LB27.2: (JL | JV | JT | H2 | H3) CM* PO;
|
||||
LB27.3: PR CM* (JL | JV | JT | H2 | H3);
|
||||
|
||||
# LB28 Do not break between Alphabetics.
|
||||
# Unattached (leading) CM treated as AL.
|
||||
LB28: (AL | HL | CM)CM* (AL | HL);
|
||||
|
||||
LB29: IS CM* (AL | HL);
|
||||
|
||||
# LB30 is adjusted for unattached leading CM being treated as AL.
|
||||
LB30.1: (AL | CM | HL | NU) CM* OP;
|
||||
LB30.2: CP CM* (AL | HL | NU);
|
||||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
LB30b: EB CM* EM;
|
||||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
@ -0,0 +1,10 @@
|
||||
file: main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/readme.txt
|
||||
Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
|
||||
Copyright (c) 2015-2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
This directory contains the break iterator reference rule files used by the test RBBIMonkeyTest.
|
||||
|
||||
The rule files are copied from ICU4C, from source/test/testdata/break_rules/*
|
||||
See the readme.txt located there for additional information.
|
@ -0,0 +1,50 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
# file: sentence.txt
|
||||
|
||||
type = sentence; # one of grapheme | word | line | sentence
|
||||
locale = en;
|
||||
|
||||
CR = [\p{Sentence_Break = CR}];
|
||||
LF = [\p{Sentence_Break = LF}];
|
||||
Extend = [\p{Sentence_Break = Extend}];
|
||||
Sep = [\p{Sentence_Break = Sep}];
|
||||
Format = [\p{Sentence_Break = Format}];
|
||||
Sp = [\p{Sentence_Break = Sp}];
|
||||
Lower = [\p{Sentence_Break = Lower}];
|
||||
Upper = [\p{Sentence_Break = Upper}];
|
||||
OLetter = [\p{Sentence_Break = OLetter}];
|
||||
Numeric = [\p{Sentence_Break = Numeric}];
|
||||
ATerm = [\p{Sentence_Break = ATerm}];
|
||||
SContinue = [\p{Sentence_Break = SContinue}];
|
||||
STerm = [\p{Sentence_Break = STerm}];
|
||||
Close = [\p{Sentence_Break = Close}];
|
||||
|
||||
ParaSep = [Sep CR LF];
|
||||
SATerm = [STerm ATerm];
|
||||
ExtFmt = [Extend Format];
|
||||
|
||||
# SB2: ÷ eot
|
||||
# Conventional regular expression matching for '$' as end-of-text also matches
|
||||
# at a line separator just preceding the physical end of text.
|
||||
# Instead, use a look-ahead assertion that there is no following character.
|
||||
SB2: . ÷ (?!.);
|
||||
|
||||
SB3: CR LF;
|
||||
SB4: ParaSep ÷;
|
||||
|
||||
# SB5: ignore Format and Extend characters.
|
||||
|
||||
SB6: ATerm ExtFmt* Numeric;
|
||||
SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
|
||||
SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
|
||||
SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);
|
||||
|
||||
SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
|
||||
# Also covers SB10, SB11.
|
||||
|
||||
SB12: . ExtFmt* [^ExtFmt]?;
|
||||
|
@ -0,0 +1,97 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: word.txt
|
||||
#
|
||||
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
||||
|
||||
type = word; # one of grapheme | word | line | sentence
|
||||
locale = en;
|
||||
|
||||
|
||||
CR = [\p{Word_Break = CR}];
|
||||
LF = [\p{Word_Break = LF}];
|
||||
Newline = [\p{Word_Break = Newline}];
|
||||
Extend = [\p{Word_Break = Extend}];
|
||||
ZWJ = [\p{Word_Break = ZWJ}];
|
||||
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
Format = [\p{Word_Break = Format}];
|
||||
Katakana = [\p{Word_Break = Katakana}];
|
||||
Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
ALetter = [\p{Word_Break = ALetter}];
|
||||
Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
MidLetter = [\p{Word_Break = MidLetter}];
|
||||
MidNum = [\p{Word_Break = MidNum}];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
E_Base = [\p{Word_Break = EB}];
|
||||
E_Modifier = [\p{Word_Break = EM}];
|
||||
EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
EBG = [\p{Word_Break = EBG}];
|
||||
|
||||
# Define dictionary, with the effect being that those characters don't appear in test data.
|
||||
|
||||
Han = [:Han:];
|
||||
Hiragana = [:Hiragana:];
|
||||
|
||||
Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
HangulSyllable = [\uac00-\ud7a3];
|
||||
ComplexContext = [:LineBreak = Complex_Context:];
|
||||
KanaKanji = [Han Hiragana Katakana];
|
||||
dictionaryCJK = [KanaKanji HangulSyllable];
|
||||
dictionary = [ComplexContext dictionaryCJK];
|
||||
|
||||
# leave dictionary scripts out of ALetter
|
||||
|
||||
ALetter = [ALetter - dictionary];
|
||||
|
||||
AHLetter = [ALetter Hebrew_Letter];
|
||||
MidNumLetQ = [MidNumLet Single_Quote];
|
||||
ExtFmt = [Extend Format ZWJ];
|
||||
|
||||
WB3: CR LF;
|
||||
WB3a: (Newline | CR | LF) ÷;
|
||||
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
|
||||
# (but needed with UAX treat-as scheme.)
|
||||
WB3c: ZWJ (Extended_Pict | EmojiNRK);
|
||||
|
||||
WB5: AHLetter ExtFmt* AHLetter;
|
||||
|
||||
# includes both WB6 and WB7
|
||||
WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
|
||||
|
||||
WB7a: Hebrew_Letter ExtFmt* Single_Quote;
|
||||
WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
|
||||
|
||||
WB8: Numeric ExtFmt* Numeric;
|
||||
WB9: AHLetter ExtFmt* Numeric;
|
||||
WB10: Numeric ExtFmt* AHLetter;
|
||||
|
||||
WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
|
||||
WB13: Katakana ExtFmt* Katakana;
|
||||
|
||||
WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
|
||||
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
|
||||
|
||||
# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
|
||||
# Interacts with WB3c.
|
||||
WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
|
||||
WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
|
||||
|
||||
WB14: (E_Base | EBG) ExtFmt* E_Modifier;
|
||||
|
||||
# Rule WB 999 Any ÷ Any
|
||||
# Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EmojiNRK).
|
||||
WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
|
||||
WB999.2: . ExtFmt* ÷;
|
||||
|
@ -0,0 +1,96 @@
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: word_POSIX.txt
|
||||
#
|
||||
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
|
||||
#
|
||||
# Note: Rule syntax and the monkey test itself are still a work in progress.
|
||||
# They are expected to change with review and the addition of support for rule tailoring.
|
||||
|
||||
type = word; # one of grapheme | word | line | sentence
|
||||
locale = en_US_POSIX;
|
||||
|
||||
|
||||
CR = [\p{Word_Break = CR}];
|
||||
LF = [\p{Word_Break = LF}];
|
||||
Newline = [\p{Word_Break = Newline}];
|
||||
Extend = [\p{Word_Break = Extend}];
|
||||
ZWJ = [\p{Word_Break = ZWJ}];
|
||||
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
Format = [\p{Word_Break = Format}];
|
||||
Katakana = [\p{Word_Break = Katakana}];
|
||||
Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
ALetter = [\p{Word_Break = ALetter}];
|
||||
Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
|
||||
MidLetter = [\p{Word_Break = MidLetter} - [\:]];
|
||||
MidNum = [\p{Word_Break = MidNum} [.]];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
E_Base = [\p{Word_Break = EB}];
|
||||
E_Modifier = [\p{Word_Break = EM}];
|
||||
EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
EBG = [\p{Word_Break = EBG}];
|
||||
|
||||
#define dictionary, with the effect being that those characters don't appear in test data.
|
||||
|
||||
Han = [:Han:];
|
||||
Hiragana = [:Hiragana:];
|
||||
|
||||
Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
HangulSyllable = [\uac00-\ud7a3];
|
||||
ComplexContext = [:LineBreak = Complex_Context:];
|
||||
KanaKanji = [Han Hiragana Katakana];
|
||||
dictionaryCJK = [KanaKanji HangulSyllable];
|
||||
dictionary = [ComplexContext dictionaryCJK];
|
||||
|
||||
# leave dictionary scripts out of ALetter
|
||||
|
||||
ALetter = [ALetter - dictionary];
|
||||
|
||||
AHLetter = [ALetter Hebrew_Letter];
|
||||
MidNumLetQ = [MidNumLet Single_Quote];
|
||||
ExtFmt = [Extend Format ZWJ];
|
||||
|
||||
WB3: CR LF;
|
||||
WB3a: (Newline | CR | LF) ÷;
|
||||
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
|
||||
# (but needed with UAX treat-as scheme.)
|
||||
WB3c: ZWJ (Extended_Pict | EmojiNRK);
|
||||
|
||||
WB5: AHLetter ExtFmt* AHLetter;
|
||||
|
||||
# includes both WB6 and WB7
|
||||
WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;
|
||||
|
||||
WB7a: Hebrew_Letter ExtFmt* Single_Quote;
|
||||
WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
|
||||
|
||||
WB8: Numeric ExtFmt* Numeric;
|
||||
WB9: AHLetter ExtFmt* Numeric;
|
||||
WB10: Numeric ExtFmt* AHLetter;
|
||||
|
||||
WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
|
||||
WB13: Katakana ExtFmt* Katakana;
|
||||
|
||||
WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
|
||||
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
|
||||
|
||||
# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
|
||||
# Interacts with WB3c.
|
||||
WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
|
||||
WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
|
||||
|
||||
WB14: (E_Base | EBG) ExtFmt* E_Modifier;
|
||||
|
||||
# Rule WB 999 Any ÷ Any
|
||||
# Interacts with WB3c: do not break between ZWJ and (Extended_Pict | EmojiNRK).
|
||||
WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
|
||||
WB999.2: . ExtFmt* ÷;
|
||||
|
@ -14,7 +14,9 @@
|
||||
# <sent> any following data is for sentence break testing
|
||||
# <line> any following data is for line break testing
|
||||
# <char> any following data is for char break testing
|
||||
# <locale local_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
|
||||
# <rules> rules ... </rules> following data is tested against these rules.
|
||||
# Applies until a following occurrence of <word>, <sent>, etc. or another <rules>
|
||||
# <locale locale_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
|
||||
# <data> ... </data> test data. May span multiple lines.
|
||||
# <> Break position, status == 0
|
||||
# • Break position, status == 0 (Bullet, \u2022)
|
||||
@ -37,8 +39,17 @@
|
||||
# Temp debugging tests
|
||||
<locale en>
|
||||
<word>
|
||||
<data><0>ク<400>ライアン<400>ト<400>サーバー<400></data>
|
||||
# <data><0>ク<400>ライアン<400>トサーバー<400></data>
|
||||
<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
|
||||
コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
|
||||
よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
|
||||
何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
|
||||
んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
|
||||
すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
|
||||
が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
|
||||
の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
|
||||
。<0></data>
|
||||
|
||||
#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
|
||||
|
||||
## FILTERED BREAK TESTS
|
||||
|
||||
@ -1308,3 +1319,48 @@ Bangkok)•</data>
|
||||
<data>•\U0001F468\u200D\u2695\uFE0F•\U0001F468\u200D\u2695•\U0001F468\U0001F3FD\u200D\u2695\uFE0F•\U0001F468\U0001F3FD\u200D\u2695\u0020•</data>
|
||||
# woman astronaut, woman astronaut / fitz4
|
||||
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Test rule status values
|
||||
#
|
||||
####################################################################################
|
||||
<rules> $Letters = [:L:];
|
||||
$Numbers = [:N:];
|
||||
$Letters+{1};
|
||||
$Numbers+{2};
|
||||
Help\ me\!{4};
|
||||
[^$Letters $Numbers];
|
||||
!.*;
|
||||
</rules>
|
||||
<data>•abc<1>123<2>.•.•abc<1> •Help<1> •me<1> •Help me!<4></data>
|
||||
|
||||
# Test option to prohibit unquoted literals.
|
||||
|
||||
<rules>
|
||||
!!forward;
|
||||
Hello\ World;
|
||||
!!reverse;
|
||||
.*;
|
||||
</rules>
|
||||
<data>•Hello World•</data>
|
||||
|
||||
<badrules>
|
||||
!!quoted_literals_only;
|
||||
!!forward;
|
||||
Hello\ World;
|
||||
!!reverse;
|
||||
.*;
|
||||
</badrules>
|
||||
|
||||
<rules>
|
||||
#TODO: uncomment this line when quoted_literals_only is implemented.
|
||||
#!!quoted_literals_only;
|
||||
!!forward;
|
||||
'Hello World';
|
||||
!!reverse;
|
||||
.*;
|
||||
</rules>
|
||||
<data>•Hello World•</data>
|
||||
|
||||
|
@ -10,13 +10,11 @@
|
||||
package com.ibm.icu.dev.test.shaping;
|
||||
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.MissingResourceException;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.text.ArabicShaping;
|
||||
import com.ibm.icu.text.ArabicShapingException;
|
||||
|
||||
/**
|
||||
* Regression test for Arabic shaping.
|
||||
@ -48,509 +46,6 @@ public class ArabicShapingRegTest extends TestFmwk {
|
||||
public static final int DIGIT_TYPE_AN = 0;
|
||||
public static final int DIGIT_TYPE_AN_EXTENDED = 0x100;
|
||||
|
||||
public static class TestData {
|
||||
public int type;
|
||||
public String source;
|
||||
public int flags;
|
||||
public String result;
|
||||
public int length;
|
||||
public Class error;
|
||||
|
||||
public static final int STANDARD = 0;
|
||||
public static final int PREFLIGHT = 1;
|
||||
public static final int ERROR = 2;
|
||||
|
||||
public static TestData standard(String source, int flags, String result) {
|
||||
return new TestData(STANDARD, source, flags, result, 0, null);
|
||||
}
|
||||
|
||||
public static TestData preflight(String source, int flags, int length) {
|
||||
return new TestData(PREFLIGHT, source, flags, null, length, null);
|
||||
}
|
||||
|
||||
public static TestData error(String source, int flags, Class error) {
|
||||
return new TestData(ERROR, source, flags, null, 0, error);
|
||||
}
|
||||
|
||||
private TestData(int type, String source, int flags, String result, int length, Class error) {
|
||||
this.type = type;
|
||||
this.source = source;
|
||||
this.flags = flags;
|
||||
this.result = result;
|
||||
this.length = length;
|
||||
this.error = error;
|
||||
}
|
||||
|
||||
private static final String[] typenames = { "standard", "preflight", "error" };
|
||||
|
||||
public String toString() {
|
||||
StringBuffer buf = new StringBuffer(super.toString());
|
||||
buf.append("[\n");
|
||||
buf.append(typenames[type]);
|
||||
buf.append(",\n");
|
||||
if (source == null) {
|
||||
buf.append("null");
|
||||
} else {
|
||||
buf.append('"');
|
||||
buf.append(escapedString(source));
|
||||
buf.append('"');
|
||||
}
|
||||
buf.append(",\n");
|
||||
buf.append(Integer.toHexString(flags));
|
||||
buf.append(",\n");
|
||||
if (result == null) {
|
||||
buf.append("null");
|
||||
} else {
|
||||
buf.append('"');
|
||||
buf.append(escapedString(result));
|
||||
buf.append('"');
|
||||
}
|
||||
buf.append(",\n");
|
||||
buf.append(length);
|
||||
buf.append(",\n");
|
||||
buf.append(error);
|
||||
buf.append(']');
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static final String lamAlefSpecialVLTR =
|
||||
"\u0020\u0646\u0622\u0644\u0627\u0020" +
|
||||
"\u0646\u0623\u064E\u0644\u0627\u0020" +
|
||||
"\u0646\u0627\u0670\u0644\u0627\u0020" +
|
||||
"\u0646\u0622\u0653\u0644\u0627\u0020" +
|
||||
"\u0646\u0625\u0655\u0644\u0627\u0020" +
|
||||
"\u0646\u0622\u0654\u0644\u0627\u0020" +
|
||||
"\uFEFC\u0639";
|
||||
|
||||
private static final String tashkeelSpecialVLTR =
|
||||
"\u064A\u0628\u0631\u0639\u0020" +
|
||||
"\u064A\u0628\u0651\u0631\u064E\u0639\u0020" +
|
||||
"\u064C\u064A\u0628\u0631\u064F\u0639\u0020" +
|
||||
"\u0628\u0670\u0631\u0670\u0639\u0020" +
|
||||
"\u0628\u0653\u0631\u0653\u0639\u0020" +
|
||||
"\u0628\u0654\u0631\u0654\u0639\u0020" +
|
||||
"\u0628\u0655\u0631\u0655\u0639\u0020";
|
||||
|
||||
private static final String tashkeelShaddaRTL=
|
||||
"\u0634\u0651\u0645\u0652\u0633";
|
||||
private static final String tashkeelShaddaLTR=
|
||||
"\u0633\u0652\u0645\u0651\u0634";
|
||||
|
||||
private static final String ArMathSym =
|
||||
"\uD83B\uDE00\uD83B\uDE01\uD83B\uDE02\uD83B\uDE03\u0020" +
|
||||
"\uD83B\uDE24\uD83B\uDE05\uD83B\uDE06\u0020" +
|
||||
"\uD83B\uDE07\uD83B\uDE08\uD83B\uDE09\u0020" +
|
||||
"\uD83B\uDE0A\uD83B\uDE0B\uD83B\uDE0C\uD83B\uDE0D\u0020" +
|
||||
"\uD83B\uDE0E\uD83B\uDE0F\uD83B\uDE10\uD83B\uDE11\u0020" +
|
||||
"\uD83B\uDE12\uD83B\uDE13\uD83B\uDE14\uD83B\uDE15\u0020" +
|
||||
"\uD83B\uDE16\uD83B\uDE17\uD83B\uDE18\u0020" +
|
||||
"\uD83B\uDE19\uD83B\uDE1A\uD83B\uDE1B";
|
||||
|
||||
private static final String ArMathSymLooped =
|
||||
"\uD83B\uDE80\uD83B\uDE81\uD83B\uDE82\uD83B\uDE83\u0020" +
|
||||
"\uD83B\uDE84\uD83B\uDE85\uD83B\uDE86\u0020" +
|
||||
"\uD83B\uDE87\uD83B\uDE88\uD83B\uDE89\u0020" +
|
||||
"\uD83B\uDE8B\uD83B\uDE8C\uD83B\uDE8D\u0020" +
|
||||
"\uD83B\uDE8E\uD83B\uDE8F\uD83B\uDE90\uD83B\uDE91\u0020" +
|
||||
"\uD83B\uDE92\uD83B\uDE93\uD83B\uDE94\uD83B\uDE95\u0020" +
|
||||
"\uD83B\uDE96\uD83B\uDE97\uD83B\uDE98\u0020" +
|
||||
"\uD83B\uDE99\uD83B\uDE9A\uD83B\uDE9B";
|
||||
|
||||
private static final String ArMathSymDoubleStruck =
|
||||
"\uD83B\uDEA1\uD83B\uDEA2\uD83B\uDEA3\u0020" +
|
||||
"\uD83B\uDEA5\uD83B\uDEA6\u0020" +
|
||||
"\uD83B\uDEA7\uD83B\uDEA8\uD83B\uDEA9\u0020" +
|
||||
"\uD83B\uDEAB\uD83B\uDEAC\uD83B\uDEAD\u0020" +
|
||||
"\uD83B\uDEAE\uD83B\uDEAF\uD83B\uDEB0\uD83B\uDEB1\u0020" +
|
||||
"\uD83B\uDEB2\uD83B\uDEB3\uD83B\uDEB4\uD83B\uDEB5\u0020" +
|
||||
"\uD83B\uDEB6\uD83B\uDEB7\uD83B\uDEB8\u0020" +
|
||||
"\uD83B\uDEB9\uD83B\uDEBA\uD83B\uDEBB";
|
||||
|
||||
private static final String ArMathSymInitial =
|
||||
"\uD83B\uDE21\uD83B\uDE22\u0020" +
|
||||
"\uD83B\uDE27\uD83B\uDE29\u0020" +
|
||||
"\uD83B\uDE2A\uD83B\uDE2B\uD83B\uDE2C\uD83B\uDE2D\u0020" +
|
||||
"\uD83B\uDE2E\uD83B\uDE2F\uD83B\uDE30\uD83B\uDE31\u0020" +
|
||||
"\uD83B\uDE32\uD83B\uDE34\uD83B\uDE35\u0020" +
|
||||
"\uD83B\uDE36\uD83B\uDE37\u0020" +
|
||||
"\uD83B\uDE39\uD83B\uDE3B";
|
||||
|
||||
private static final String ArMathSymTailed =
|
||||
"\uD83B\uDE42\uD83B\uDE47\uD83B\uDE49\uD83B\uDE4B\u0020" +
|
||||
"\uD83B\uDE4D\uD83B\uDE4E\uD83B\uDE4F\u0020" +
|
||||
"\uD83B\uDE51\uD83B\uDE52\uD83B\uDE54\uD83B\uDE57\u0020" +
|
||||
"\uD83B\uDE59\uD83B\uDE5B\uD83B\uDE5D\uD83B\uDE5F";
|
||||
|
||||
private static final String ArMathSymStretched =
|
||||
"\uD83B\uDE21\u0633\uD83B\uDE62\u0647";
|
||||
|
||||
private static final String logicalUnshape =
|
||||
"\u0020\u0020\u0020\uFE8D\uFEF5\u0020\uFEE5\u0020\uFE8D\uFEF7\u0020" +
|
||||
"\uFED7\uFEFC\u0020\uFEE1\u0020\uFE8D\uFEDF\uFECC\uFEAE\uFE91\uFEF4" +
|
||||
"\uFE94\u0020\uFE8D\uFEDF\uFEA4\uFEAE\uFE93\u0020\u0020\u0020\u0020";
|
||||
|
||||
private static final String numSource =
|
||||
"\u0031" + /* en:1 */
|
||||
"\u0627" + /* arabic:alef */
|
||||
"\u0032" + /* en:2 */
|
||||
"\u06f3" + /* an:3 */
|
||||
"\u0061" + /* latin:a */
|
||||
"\u0034"; /* en:4 */
|
||||
|
||||
private static final TestData[] standardTests = {
|
||||
/* lam alef special visual ltr */
|
||||
TestData.standard(lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
|
||||
"\u0020\ufee5\u0020\ufef5\ufe8d\u0020" +
|
||||
"\ufee5\u0020\ufe76\ufef7\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0670\ufefb\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0653\ufef5\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0655\ufef9\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"),
|
||||
TestData.standard(lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_END,
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
|
||||
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
|
||||
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb\u0020\u0020\u0020\u0020" +
|
||||
"\u0020\u0020"),
|
||||
TestData.standard(lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_BEGINNING,
|
||||
"\u0020\u0020\u0020\u0020\u0020\u0020" +
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
|
||||
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
|
||||
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"),
|
||||
TestData.standard(lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_GROW_SHRINK,
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
|
||||
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
|
||||
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"),
|
||||
|
||||
/* TASHKEEL */
|
||||
TestData.standard(lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
|
||||
"\u0020\ufee5\u0020\ufef5\ufe8d\u0020" +
|
||||
"\ufee5\u0020\ufe76\ufef7\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0670\ufefb\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0653\ufef5\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0655\ufef9\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"),
|
||||
TestData.standard(lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_END,
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
|
||||
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
|
||||
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb\u0020\u0020\u0020\u0020" +
|
||||
"\u0020\u0020"),
|
||||
TestData.standard(lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_BEGINNING,
|
||||
"\u0020\u0020\u0020\u0020\u0020\u0020" +
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
|
||||
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
|
||||
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"),
|
||||
TestData.standard(lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_GROW_SHRINK,
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
|
||||
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
|
||||
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"),
|
||||
|
||||
/* tashkeel special visual ltr */
|
||||
TestData.standard(tashkeelSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
|
||||
"\ufef2\ufe91\ufeae\ufecb\u0020" +
|
||||
"\ufef2\ufe91\ufe7c\ufeae\ufe77\ufecb\u0020" +
|
||||
"\ufe72\ufef2\ufe91\ufeae\ufe79\ufecb\u0020" +
|
||||
"\ufe8f\u0670\ufeae\u0670\ufecb\u0020" +
|
||||
"\ufe8f\u0653\ufeae\u0653\ufecb\u0020" +
|
||||
"\ufe8f\u0654\ufeae\u0654\ufecb\u0020" +
|
||||
"\ufe8f\u0655\ufeae\u0655\ufecb\u0020"),
|
||||
|
||||
TestData.standard(tashkeelSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
|
||||
"\ufef2\ufe91\ufeae\ufecb\u0020" +
|
||||
"\ufef2\ufe91\ufe7c\ufeae\ufe76\ufecb\u0020" +
|
||||
"\ufe72\ufef2\ufe91\ufeae\ufe78\ufecb\u0020" +
|
||||
"\ufe8f\u0670\ufeae\u0670\ufecb\u0020" +
|
||||
"\ufe8f\u0653\ufeae\u0653\ufecb\u0020" +
|
||||
"\ufe8f\u0654\ufeae\u0654\ufecb\u0020" +
|
||||
"\ufe8f\u0655\ufeae\u0655\ufecb\u0020"),
|
||||
|
||||
TestData.standard(tashkeelShaddaRTL,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_BEGIN |ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
|
||||
"\u0020\ufeb7\ufe7d\ufee4\ufeb2"),
|
||||
TestData.standard(tashkeelShaddaRTL,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_END|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
|
||||
"\ufeb7\ufe7d\ufee4\ufeb2\u0020"),
|
||||
TestData.standard(tashkeelShaddaRTL,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
|
||||
"\ufeb7\ufe7d\ufee4\ufeb2"),
|
||||
TestData.standard(tashkeelShaddaRTL,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_REPLACE_BY_TATWEEL|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
|
||||
"\ufeb7\ufe7d\ufee4\u0640\ufeb2"),
|
||||
|
||||
TestData.standard(tashkeelShaddaLTR,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_BEGIN |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
|
||||
"\u0020\ufeb2\ufee4\ufe7d\ufeb7"),
|
||||
TestData.standard(tashkeelShaddaLTR,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_END |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
|
||||
"\ufeb2\ufee4\ufe7d\ufeb7\u0020"),
|
||||
TestData.standard(tashkeelShaddaLTR,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
|
||||
"\ufeb2\ufee4\ufe7d\ufeb7"),
|
||||
TestData.standard(tashkeelShaddaLTR,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_REPLACE_BY_TATWEEL |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
|
||||
"\ufeb2\u0640\ufee4\ufe7d\ufeb7"),
|
||||
|
||||
TestData.standard(ArMathSym,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_BEGIN |ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
|
||||
"\uD83B\uDE00\uD83B\uDE01\uD83B\uDE02\uD83B\uDE03\u0020" +
|
||||
"\uD83B\uDE24\uD83B\uDE05\uD83B\uDE06\u0020" +
|
||||
"\uD83B\uDE07\uD83B\uDE08\uD83B\uDE09\u0020" +
|
||||
"\uD83B\uDE0A\uD83B\uDE0B\uD83B\uDE0C\uD83B\uDE0D\u0020" +
|
||||
"\uD83B\uDE0E\uD83B\uDE0F\uD83B\uDE10\uD83B\uDE11\u0020" +
|
||||
"\uD83B\uDE12\uD83B\uDE13\uD83B\uDE14\uD83B\uDE15\u0020" +
|
||||
"\uD83B\uDE16\uD83B\uDE17\uD83B\uDE18\u0020" +
|
||||
"\uD83B\uDE19\uD83B\uDE1A\uD83B\uDE1B"),
|
||||
TestData.standard(ArMathSymLooped,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_END|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
|
||||
"\uD83B\uDE80\uD83B\uDE81\uD83B\uDE82\uD83B\uDE83\u0020" +
|
||||
"\uD83B\uDE84\uD83B\uDE85\uD83B\uDE86\u0020" +
|
||||
"\uD83B\uDE87\uD83B\uDE88\uD83B\uDE89\u0020" +
|
||||
"\uD83B\uDE8B\uD83B\uDE8C\uD83B\uDE8D\u0020" +
|
||||
"\uD83B\uDE8E\uD83B\uDE8F\uD83B\uDE90\uD83B\uDE91\u0020" +
|
||||
"\uD83B\uDE92\uD83B\uDE93\uD83B\uDE94\uD83B\uDE95\u0020" +
|
||||
"\uD83B\uDE96\uD83B\uDE97\uD83B\uDE98\u0020" +
|
||||
"\uD83B\uDE99\uD83B\uDE9A\uD83B\uDE9B"),
|
||||
TestData.standard(ArMathSymDoubleStruck,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE|ArabicShaping.TEXT_DIRECTION_VISUAL_RTL ,
|
||||
"\uD83B\uDEA1\uD83B\uDEA2\uD83B\uDEA3\u0020" +
|
||||
"\uD83B\uDEA5\uD83B\uDEA6\u0020" +
|
||||
"\uD83B\uDEA7\uD83B\uDEA8\uD83B\uDEA9\u0020" +
|
||||
"\uD83B\uDEAB\uD83B\uDEAC\uD83B\uDEAD\u0020" +
|
||||
"\uD83B\uDEAE\uD83B\uDEAF\uD83B\uDEB0\uD83B\uDEB1\u0020" +
|
||||
"\uD83B\uDEB2\uD83B\uDEB3\uD83B\uDEB4\uD83B\uDEB5\u0020" +
|
||||
"\uD83B\uDEB6\uD83B\uDEB7\uD83B\uDEB8\u0020" +
|
||||
"\uD83B\uDEB9\uD83B\uDEBA\uD83B\uDEBB"),
|
||||
|
||||
TestData.standard(ArMathSymInitial,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_BEGIN |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
|
||||
"\uD83B\uDE21\uD83B\uDE22\u0020" +
|
||||
"\uD83B\uDE27\uD83B\uDE29\u0020" +
|
||||
"\uD83B\uDE2A\uD83B\uDE2B\uD83B\uDE2C\uD83B\uDE2D\u0020" +
|
||||
"\uD83B\uDE2E\uD83B\uDE2F\uD83B\uDE30\uD83B\uDE31\u0020" +
|
||||
"\uD83B\uDE32\uD83B\uDE34\uD83B\uDE35\u0020" +
|
||||
"\uD83B\uDE36\uD83B\uDE37\u0020" +
|
||||
"\uD83B\uDE39\uD83B\uDE3B"),
|
||||
TestData.standard(ArMathSymTailed,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_END |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
|
||||
"\uD83B\uDE42\uD83B\uDE47\uD83B\uDE49\uD83B\uDE4B\u0020" +
|
||||
"\uD83B\uDE4D\uD83B\uDE4E\uD83B\uDE4F\u0020" +
|
||||
"\uD83B\uDE51\uD83B\uDE52\uD83B\uDE54\uD83B\uDE57\u0020" +
|
||||
"\uD83B\uDE59\uD83B\uDE5B\uD83B\uDE5D\uD83B\uDE5F"),
|
||||
TestData.standard(ArMathSymStretched,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE |ArabicShaping.TEXT_DIRECTION_VISUAL_LTR ,
|
||||
"\uD83B\uDE21\uFEB1\uD83B\uDE62\uFEE9"),
|
||||
|
||||
/* logical unshape */
|
||||
TestData.standard(logicalUnshape,
|
||||
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_NEAR,
|
||||
"\u0020\u0020\u0020\u0627\u0644\u0622\u0646\u0020\u0627\u0644\u0623\u0642\u0644\u0627" +
|
||||
"\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020\u0627\u0644\u062d\u0631" +
|
||||
"\u0629\u0020\u0020\u0020\u0020"),
|
||||
TestData.standard(logicalUnshape,
|
||||
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_AT_END,
|
||||
"\u0020\u0020\u0020\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623\u0020\u0642" +
|
||||
"\u0644\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020\u0627" +
|
||||
"\u0644\u062d\u0631\u0629\u0020"),
|
||||
TestData.standard(logicalUnshape,
|
||||
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_AT_BEGINNING,
|
||||
"\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623\u0020\u0642\u0644\u0627\u0020" +
|
||||
"\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020\u0627\u0644\u062d\u0631" +
|
||||
"\u0629\u0020\u0020\u0020\u0020"),
|
||||
TestData.standard(logicalUnshape,
|
||||
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_GROW_SHRINK,
|
||||
"\u0020\u0020\u0020\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623\u0020\u0642" +
|
||||
"\u0644\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020\u0627" +
|
||||
"\u0644\u062d\u0631\u0629\u0020\u0020\u0020\u0020"),
|
||||
|
||||
/* numbers */
|
||||
TestData.standard(numSource,
|
||||
DIGITS_EN2AN | DIGIT_TYPE_AN,
|
||||
"\u0661\u0627\u0662\u06f3\u0061\u0664"),
|
||||
TestData.standard(numSource,
|
||||
DIGITS_AN2EN | DIGIT_TYPE_AN_EXTENDED,
|
||||
"\u0031\u0627\u0032\u0033\u0061\u0034"),
|
||||
TestData.standard(numSource,
|
||||
DIGITS_EN2AN_INIT_LR | DIGIT_TYPE_AN,
|
||||
"\u0031\u0627\u0662\u06f3\u0061\u0034"),
|
||||
TestData.standard(numSource,
|
||||
DIGITS_EN2AN_INIT_AL | DIGIT_TYPE_AN_EXTENDED,
|
||||
"\u06f1\u0627\u06f2\u06f3\u0061\u0034"),
|
||||
TestData.standard(numSource,
|
||||
DIGITS_EN2AN_INIT_LR | DIGIT_TYPE_AN | TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\u0661\u0627\u0032\u06f3\u0061\u0034"),
|
||||
TestData.standard(numSource,
|
||||
DIGITS_EN2AN_INIT_AL | DIGIT_TYPE_AN_EXTENDED | TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\u06f1\u0627\u0032\u06f3\u0061\u06f4"),
|
||||
|
||||
/* no-op */
|
||||
TestData.standard(numSource,
|
||||
0,
|
||||
numSource),
|
||||
};
|
||||
|
||||
private static final TestData[] preflightTests = {
|
||||
/* preflight */
|
||||
TestData.preflight("\u0644\u0627",
|
||||
LETTERS_SHAPE | LENGTH_GROW_SHRINK,
|
||||
1),
|
||||
|
||||
TestData.preflight("\u0644\u0627\u0031",
|
||||
DIGITS_EN2AN | DIGIT_TYPE_AN_EXTENDED | LENGTH_GROW_SHRINK,
|
||||
3),
|
||||
|
||||
TestData.preflight("\u0644\u0644",
|
||||
LETTERS_SHAPE | LENGTH_GROW_SHRINK,
|
||||
2),
|
||||
|
||||
TestData.preflight("\ufef7",
|
||||
LETTERS_UNSHAPE | LENGTH_GROW_SHRINK,
|
||||
2),
|
||||
};
|
||||
|
||||
private static final TestData[] errorTests = {
|
||||
/* bad data */
|
||||
TestData.error("\u0020\ufef7\u0644\u0020",
|
||||
LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_NEAR,
|
||||
ArabicShapingException.class),
|
||||
|
||||
TestData.error("\u0020\ufef7",
|
||||
LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_END,
|
||||
ArabicShapingException.class),
|
||||
|
||||
TestData.error("\ufef7\u0020",
|
||||
LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_BEGINNING,
|
||||
ArabicShapingException.class),
|
||||
|
||||
/* bad options */
|
||||
TestData.error("\ufef7",
|
||||
0xffffffff,
|
||||
IllegalArgumentException.class),
|
||||
|
||||
TestData.error("\ufef7",
|
||||
LETTERS_UNSHAPE | LENGTH_GROW_SHRINK,
|
||||
ArabicShapingException.class),
|
||||
|
||||
TestData.error(null,
|
||||
LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_END,
|
||||
IllegalArgumentException.class),
|
||||
};
|
||||
|
||||
@Test
|
||||
public void TestStandard() {
|
||||
for (int i = 0; i < standardTests.length; ++i) {
|
||||
TestData test = standardTests[i];
|
||||
|
||||
Exception ex = null;
|
||||
String result = null;
|
||||
ArabicShaping shaper = null;
|
||||
|
||||
try {
|
||||
shaper = new ArabicShaping(test.flags);
|
||||
result = shaper.shape(test.source);
|
||||
}
|
||||
catch(MissingResourceException e){
|
||||
throw e;
|
||||
}
|
||||
catch (IllegalStateException ie){
|
||||
warnln("IllegalStateException: "+ie.toString());
|
||||
return;
|
||||
}
|
||||
catch (Exception e) {
|
||||
ex = e;
|
||||
}
|
||||
|
||||
if (!test.result.equals(result)) {
|
||||
reportTestFailure(i, test, shaper, result, ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestPreflight() {
|
||||
for (int i = 0; i < preflightTests.length; ++i) {
|
||||
TestData test = preflightTests[i];
|
||||
|
||||
Exception ex = null;
|
||||
char src[] = null;
|
||||
int len = 0;
|
||||
ArabicShaping shaper = null;
|
||||
|
||||
if (test.source != null) {
|
||||
src = test.source.toCharArray();
|
||||
}
|
||||
|
||||
try {
|
||||
shaper = new ArabicShaping(test.flags);
|
||||
len = shaper.shape(src, 0, src.length, null, 0, 0);
|
||||
}
|
||||
catch (Exception e) {
|
||||
ex = e;
|
||||
}
|
||||
|
||||
if (test.length != len) {
|
||||
reportTestFailure(i, test, shaper, test.source, ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestError() {
|
||||
for (int i = 0; i < errorTests.length; ++i) {
|
||||
TestData test = errorTests[i];
|
||||
|
||||
Exception ex = null;
|
||||
char src[] = null;
|
||||
int len = 0;
|
||||
ArabicShaping shaper = null;
|
||||
|
||||
if (test.source != null) {
|
||||
src = test.source.toCharArray();
|
||||
len = src.length;
|
||||
}
|
||||
|
||||
try {
|
||||
shaper = new ArabicShaping(test.flags);
|
||||
shaper.shape(src, 0, len);
|
||||
}
|
||||
catch (Exception e) {
|
||||
ex = e;
|
||||
}
|
||||
|
||||
if (!test.error.isInstance(ex)) {
|
||||
reportTestFailure(i, test, shaper, test.source, ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestEquals()
|
||||
@ -572,64 +67,6 @@ public class ArabicShapingRegTest extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(junit): remove this and convert callers to parameterized tests
|
||||
private void reportTestFailure(int index, TestData test, ArabicShaping shaper, String result, Exception error) {
|
||||
if (error != null && error instanceof MissingResourceException ) {
|
||||
warnln(error.getMessage());
|
||||
}
|
||||
|
||||
StringBuffer buf = new StringBuffer();
|
||||
buf.append("*** test failure ***\n");
|
||||
buf.append("index: " + index + "\n");
|
||||
buf.append("test: " + test + "\n");
|
||||
buf.append("shaper: " + shaper + "\n");
|
||||
buf.append("result: " + escapedString(result) + "\n");
|
||||
buf.append("error: " + error + "\n");
|
||||
|
||||
if (result != null && test.result != null && !test.result.equals(result)) {
|
||||
for (int i = 0; i < Math.max(test.result.length(), result.length()); ++i) {
|
||||
String temp = Integer.toString(i);
|
||||
if (temp.length() < 2) {
|
||||
temp = " ".concat(temp);
|
||||
}
|
||||
char trg = i < test.result.length() ? test.result.charAt(i) : '\uffff';
|
||||
char res = i < result.length() ? result.charAt(i) : '\uffff';
|
||||
|
||||
buf.append("[" + temp + "] ");
|
||||
buf.append(escapedString("" + trg) + " ");
|
||||
buf.append(escapedString("" + res) + " ");
|
||||
if (trg != res) {
|
||||
buf.append("***");
|
||||
}
|
||||
buf.append("\n");
|
||||
}
|
||||
}
|
||||
err(buf.toString());
|
||||
}
|
||||
|
||||
private static String escapedString(String str) {
|
||||
if (str == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
StringBuffer buf = new StringBuffer(str.length() * 6);
|
||||
for (int i = 0; i < str.length(); ++i) {
|
||||
char ch = str.charAt(i);
|
||||
buf.append("\\u");
|
||||
if (ch < 0x1000) {
|
||||
buf.append('0');
|
||||
}
|
||||
if (ch < 0x0100) {
|
||||
buf.append('0');
|
||||
}
|
||||
if (ch < 0x0010) {
|
||||
buf.append('0');
|
||||
}
|
||||
buf.append(Integer.toHexString(ch));
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/* Tests the method
|
||||
* public int shape(char[] source, int sourceStart, int sourceLength,
|
||||
* char[] dest, int destStart, int destSize) throws ArabicShapingException
|
||||
@ -643,8 +80,8 @@ public class ArabicShapingRegTest extends TestFmwk {
|
||||
char[] source = {'d','u','m','m','y'};
|
||||
char[] dest = {'d','u','m','m','y'};
|
||||
int[] negNum = {-1,-2,-5,-10,-100};
|
||||
|
||||
|
||||
|
||||
|
||||
for(int i=0; i<negNum.length; i++){
|
||||
try{
|
||||
// Checks when "sourceStart < 0"
|
||||
@ -652,7 +89,7 @@ public class ArabicShapingRegTest extends TestFmwk {
|
||||
errln("ArabicShaping.shape(char[],int,int,char[],int,int) was " +
|
||||
"suppose to return an exception when 'sourceStart < 0'.");
|
||||
} catch(Exception e){}
|
||||
|
||||
|
||||
try{
|
||||
// Checks when "sourceLength < 0"
|
||||
as.shape(source, 0, negNum[i], dest, 0, 0);
|
||||
@ -660,7 +97,7 @@ public class ArabicShapingRegTest extends TestFmwk {
|
||||
"suppose to return an exception when 'sourceLength < 0'.");
|
||||
} catch(Exception e){}
|
||||
}
|
||||
|
||||
|
||||
// Checks when "sourceStart + sourceLength > source.length"
|
||||
try{
|
||||
as.shape(source, 3, 3, dest, 0, 0);
|
||||
@ -682,14 +119,14 @@ public class ArabicShapingRegTest extends TestFmwk {
|
||||
errln("ArabicShaping.shape(char[],int,int,char[],int,int) was " +
|
||||
"suppose to return an exception when 'sourceStart + sourceLength > source.length'.");
|
||||
} catch(Exception e){}
|
||||
|
||||
|
||||
// Checks when "if (dest == null && destSize != 0)" is true
|
||||
try{
|
||||
as.shape(source, 2, 2, null, 0, 1);
|
||||
errln("ArabicShaping.shape(char[],int,int,char[],int,int) was " +
|
||||
"suppose to return an exception when 'dest == null && destSize != 0'.");
|
||||
} catch(Exception e){}
|
||||
|
||||
|
||||
// Checks when
|
||||
// if ((destSize != 0) && (destStart < 0 || destSize < 0 || destStart + destSize > dest.length))
|
||||
for(int i=0; i<negNum.length; i++){
|
||||
@ -699,7 +136,7 @@ public class ArabicShapingRegTest extends TestFmwk {
|
||||
"suppose to return an exception when " +
|
||||
"(destSize != 0) && (destStart < 0 || destSize < 0 || destStart + destSize > dest.length).");
|
||||
} catch(Exception e){}
|
||||
|
||||
|
||||
try{
|
||||
as.shape(source, 2, 2, dest, 0, negNum[i]);
|
||||
errln("ArabicShaping.shape(char[],int,int,char[],int,int) was " +
|
||||
@ -707,7 +144,7 @@ public class ArabicShapingRegTest extends TestFmwk {
|
||||
"(destSize != 0) && (destStart < 0 || destSize < 0 || destStart + destSize > dest.length).");
|
||||
} catch(Exception e){}
|
||||
}
|
||||
|
||||
|
||||
// Checks when "destStart + destSize > dest.length"
|
||||
try{
|
||||
as.shape(source, 2, 2, dest, 3, 3);
|
||||
@ -733,9 +170,9 @@ public class ArabicShapingRegTest extends TestFmwk {
|
||||
"suppose to return an exception when " +
|
||||
"(destSize != 0) && (destStart < 0 || destSize < 0 || destStart + destSize > dest.length).");
|
||||
} catch(Exception e){}
|
||||
|
||||
|
||||
// Tests when "throw new IllegalArgumentException("Wrong Tashkeel argument")"
|
||||
int[] invalid_Tashkeel = {-1000, -500, -100};
|
||||
int[] invalid_Tashkeel = {-1000, -500, -100};
|
||||
for(int i=0; i < invalid_Tashkeel.length; i++){
|
||||
ArabicShaping arabicShape = new ArabicShaping(invalid_Tashkeel[i]);
|
||||
try {
|
||||
|
@ -0,0 +1,487 @@
|
||||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
|
||||
package com.ibm.icu.dev.test.shaping;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.MissingResourceException;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.runners.Enclosed;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.Parameterized;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.text.ArabicShaping;
|
||||
import com.ibm.icu.text.ArabicShapingException;
|
||||
|
||||
/**
|
||||
* Regression test for Arabic shaping.
|
||||
*/
|
||||
@RunWith(Enclosed.class)
|
||||
public class DataDrivenArabicShapingRegTest extends TestFmwk {
|
||||
|
||||
/* constants copied from ArabicShaping for convenience */
|
||||
|
||||
public static final int LENGTH_GROW_SHRINK = 0;
|
||||
public static final int LENGTH_FIXED_SPACES_NEAR = 1;
|
||||
public static final int LENGTH_FIXED_SPACES_AT_END = 2;
|
||||
public static final int LENGTH_FIXED_SPACES_AT_BEGINNING = 3;
|
||||
|
||||
public static final int TEXT_DIRECTION_LOGICAL = 0;
|
||||
public static final int TEXT_DIRECTION_VISUAL_LTR = 4;
|
||||
|
||||
public static final int LETTERS_NOOP = 0;
|
||||
public static final int LETTERS_SHAPE = 8;
|
||||
public static final int LETTERS_SHAPE_TASHKEEL_ISOLATED = 0x18;
|
||||
public static final int LETTERS_UNSHAPE = 0x10;
|
||||
|
||||
public static final int DIGITS_NOOP = 0;
|
||||
public static final int DIGITS_EN2AN = 0x20;
|
||||
public static final int DIGITS_AN2EN = 0x40;
|
||||
public static final int DIGITS_EN2AN_INIT_LR = 0x60;
|
||||
public static final int DIGITS_EN2AN_INIT_AL = 0x80;
|
||||
// private static final int DIGITS_RESERVED = 0xa0;
|
||||
|
||||
public static final int DIGIT_TYPE_AN = 0;
|
||||
public static final int DIGIT_TYPE_AN_EXTENDED = 0x100;
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public static class StandardDataTest extends TestFmwk {
|
||||
private String source;
|
||||
private int flags;
|
||||
private String expected;
|
||||
|
||||
/**
 * Stores one parameter row supplied by the Parameterized runner.
 *
 * @param source   text handed to the shaper by the test method
 * @param flags    option bits used to construct the ArabicShaping instance
 * @param expected string the shaped result is compared against
 */
public StandardDataTest(String source, int flags, String expected) {
    this.expected = expected;
    this.flags = flags;
    this.source = source;
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection testData() {
|
||||
String lamAlefSpecialVLTR =
|
||||
"\u0020\u0646\u0622\u0644\u0627\u0020\u0646\u0623\u064E\u0644\u0627\u0020" +
|
||||
"\u0646\u0627\u0670\u0644\u0627\u0020\u0646\u0622\u0653\u0644\u0627\u0020" +
|
||||
"\u0646\u0625\u0655\u0644\u0627\u0020\u0646\u0622\u0654\u0644\u0627\u0020" +
|
||||
"\uFEFC\u0639";
|
||||
String tashkeelSpecialVLTR =
|
||||
"\u064A\u0628\u0631\u0639\u0020\u064A\u0628\u0651\u0631\u064E\u0639\u0020" +
|
||||
"\u064C\u064A\u0628\u0631\u064F\u0639\u0020\u0628\u0670\u0631\u0670\u0639" +
|
||||
"\u0020\u0628\u0653\u0631\u0653\u0639\u0020\u0628\u0654\u0631\u0654\u0639" +
|
||||
"\u0020\u0628\u0655\u0631\u0655\u0639\u0020";
|
||||
String tashkeelShaddaRTL=
|
||||
"\u0634\u0651\u0645\u0652\u0633";
|
||||
String tashkeelShaddaLTR=
|
||||
"\u0633\u0652\u0645\u0651\u0634";
|
||||
String ArMathSym =
|
||||
"\uD83B\uDE00\uD83B\uDE01\uD83B\uDE02\uD83B\uDE03\u0020\uD83B\uDE24\uD83B" +
|
||||
"\uDE05\uD83B\uDE06\u0020\uD83B\uDE07\uD83B\uDE08\uD83B\uDE09\u0020\uD83B" +
|
||||
"\uDE0A\uD83B\uDE0B\uD83B\uDE0C\uD83B\uDE0D\u0020\uD83B\uDE0E\uD83B\uDE0F" +
|
||||
"\uD83B\uDE10\uD83B\uDE11\u0020\uD83B\uDE12\uD83B\uDE13\uD83B\uDE14\uD83B" +
|
||||
"\uDE15\u0020\uD83B\uDE16\uD83B\uDE17\uD83B\uDE18\u0020\uD83B\uDE19\uD83B" +
|
||||
"\uDE1A\uD83B\uDE1B";
|
||||
String ArMathSymLooped =
|
||||
"\uD83B\uDE80\uD83B\uDE81\uD83B\uDE82\uD83B\uDE83\u0020\uD83B\uDE84\uD83B" +
|
||||
"\uDE85\uD83B\uDE86\u0020\uD83B\uDE87\uD83B\uDE88\uD83B\uDE89\u0020\uD83B" +
|
||||
"\uDE8B\uD83B\uDE8C\uD83B\uDE8D\u0020\uD83B\uDE8E\uD83B\uDE8F\uD83B\uDE90" +
|
||||
"\uD83B\uDE91\u0020\uD83B\uDE92\uD83B\uDE93\uD83B\uDE94\uD83B\uDE95\u0020" +
|
||||
"\uD83B\uDE96\uD83B\uDE97\uD83B\uDE98\u0020\uD83B\uDE99\uD83B\uDE9A\uD83B" +
|
||||
"\uDE9B";
|
||||
String ArMathSymDoubleStruck =
|
||||
"\uD83B\uDEA1\uD83B\uDEA2\uD83B\uDEA3\u0020\uD83B\uDEA5\uD83B\uDEA6\u0020" +
|
||||
"\uD83B\uDEA7\uD83B\uDEA8\uD83B\uDEA9\u0020\uD83B\uDEAB\uD83B\uDEAC\uD83B" +
|
||||
"\uDEAD\u0020\uD83B\uDEAE\uD83B\uDEAF\uD83B\uDEB0\uD83B\uDEB1\u0020\uD83B" +
|
||||
"\uDEB2\uD83B\uDEB3\uD83B\uDEB4\uD83B\uDEB5\u0020\uD83B\uDEB6\uD83B\uDEB7" +
|
||||
"\uD83B\uDEB8\u0020\uD83B\uDEB9\uD83B\uDEBA\uD83B\uDEBB";
|
||||
String ArMathSymInitial =
|
||||
"\uD83B\uDE21\uD83B\uDE22\u0020\uD83B\uDE27\uD83B\uDE29\u0020\uD83B\uDE2A" +
|
||||
"\uD83B\uDE2B\uD83B\uDE2C\uD83B\uDE2D\u0020\uD83B\uDE2E\uD83B\uDE2F\uD83B" +
|
||||
"\uDE30\uD83B\uDE31\u0020\uD83B\uDE32\uD83B\uDE34\uD83B\uDE35\u0020\uD83B" +
|
||||
"\uDE36\uD83B\uDE37\u0020\uD83B\uDE39\uD83B\uDE3B";
|
||||
String ArMathSymTailed =
|
||||
"\uD83B\uDE42\uD83B\uDE47\uD83B\uDE49\uD83B\uDE4B\u0020\uD83B\uDE4D\uD83B" +
|
||||
"\uDE4E\uD83B\uDE4F\u0020\uD83B\uDE51\uD83B\uDE52\uD83B\uDE54\uD83B\uDE57" +
|
||||
"\u0020\uD83B\uDE59\uD83B\uDE5B\uD83B\uDE5D\uD83B\uDE5F";
|
||||
String ArMathSymStretched =
|
||||
"\uD83B\uDE21\u0633\uD83B\uDE62\u0647";
|
||||
String logicalUnshape =
|
||||
"\u0020\u0020\u0020\uFE8D\uFEF5\u0020\uFEE5\u0020\uFE8D\uFEF7\u0020\uFED7" +
|
||||
"\uFEFC\u0020\uFEE1\u0020\uFE8D\uFEDF\uFECC\uFEAE\uFE91\uFEF4\uFE94\u0020" +
|
||||
"\uFE8D\uFEDF\uFEA4\uFEAE\uFE93\u0020\u0020\u0020\u0020";
|
||||
String numSource =
|
||||
"\u0031" + /* en:1 */
|
||||
"\u0627" + /* arabic:alef */
|
||||
"\u0032" + /* en:2 */
|
||||
"\u06f3" + /* an:3 */
|
||||
"\u0061" + /* latin:a */
|
||||
"\u0034"; /* en:4 */
|
||||
|
||||
return Arrays.asList(new Object[][] {
|
||||
/* lam alef special visual ltr */
|
||||
{lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
|
||||
"\u0020\ufee5\u0020\ufef5\ufe8d\u0020\ufee5\u0020\ufe76\ufef7\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0670\ufefb\ufe8d\u0020\ufee5\u0020\u0653\ufef5\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0655\ufef9\ufe8d\u0020\ufee5\u0020\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"},
|
||||
{lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_END,
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020\ufefc\ufecb\u0020\u0020\u0020\u0020" +
|
||||
"\u0020\u0020"},
|
||||
{lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_AT_BEGINNING,
|
||||
"\u0020\u0020\u0020\u0020\u0020\u0020\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
|
||||
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
|
||||
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"},
|
||||
{lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_GROW_SHRINK,
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020\ufefc\ufecb"},
|
||||
/* TASHKEEL */
|
||||
{lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
|
||||
LENGTH_FIXED_SPACES_NEAR,
|
||||
"\u0020\ufee5\u0020\ufef5\ufe8d\u0020\ufee5\u0020\ufe76\ufef7\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0670\ufefb\ufe8d\u0020\ufee5\u0020\u0653\ufef5\ufe8d\u0020" +
|
||||
"\ufee5\u0020\u0655\ufef9\ufe8d\u0020\ufee5\u0020\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"},
|
||||
{lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
|
||||
LENGTH_FIXED_SPACES_AT_END,
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020\ufefc\ufecb\u0020\u0020\u0020\u0020" +
|
||||
"\u0020\u0020"},
|
||||
{lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
|
||||
LENGTH_FIXED_SPACES_AT_BEGINNING,
|
||||
"\u0020\u0020\u0020\u0020\u0020\u0020\u0020\ufee5\ufef5\ufe8d\u0020\ufee5" +
|
||||
"\ufe76\ufef7\ufe8d\u0020\ufee5\u0670\ufefb\ufe8d\u0020\ufee5\u0653\ufef5" +
|
||||
"\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d\u0020\ufee5\u0654\ufef5\ufe8d\u0020" +
|
||||
"\ufefc\ufecb"},
|
||||
{lamAlefSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
|
||||
LENGTH_GROW_SHRINK,
|
||||
"\u0020\ufee5\ufef5\ufe8d\u0020\ufee5\ufe76\ufef7\ufe8d\u0020\ufee5\u0670" +
|
||||
"\ufefb\ufe8d\u0020\ufee5\u0653\ufef5\ufe8d\u0020\ufee5\u0655\ufef9\ufe8d" +
|
||||
"\u0020\ufee5\u0654\ufef5\ufe8d\u0020\ufefc\ufecb"},
|
||||
/* tashkeel special visual ltr */
|
||||
{tashkeelSpecialVLTR,
|
||||
LETTERS_SHAPE | TEXT_DIRECTION_VISUAL_LTR | LENGTH_FIXED_SPACES_NEAR,
|
||||
"\ufef2\ufe91\ufeae\ufecb\u0020\ufef2\ufe91\ufe7c\ufeae\ufe77\ufecb\u0020" +
|
||||
"\ufe72\ufef2\ufe91\ufeae\ufe79\ufecb\u0020\ufe8f\u0670\ufeae\u0670\ufecb" +
|
||||
"\u0020\ufe8f\u0653\ufeae\u0653\ufecb\u0020\ufe8f\u0654\ufeae\u0654\ufecb" +
|
||||
"\u0020\ufe8f\u0655\ufeae\u0655\ufecb\u0020"},
|
||||
{tashkeelSpecialVLTR,
|
||||
LETTERS_SHAPE_TASHKEEL_ISOLATED | TEXT_DIRECTION_VISUAL_LTR |
|
||||
LENGTH_FIXED_SPACES_NEAR,
|
||||
"\ufef2\ufe91\ufeae\ufecb\u0020\ufef2\ufe91\ufe7c\ufeae\ufe76\ufecb\u0020" +
|
||||
"\ufe72\ufef2\ufe91\ufeae\ufe78\ufecb\u0020\ufe8f\u0670\ufeae\u0670\ufecb" +
|
||||
"\u0020\ufe8f\u0653\ufeae\u0653\ufecb\u0020\ufe8f\u0654\ufeae\u0654\ufecb" +
|
||||
"\u0020\ufe8f\u0655\ufeae\u0655\ufecb\u0020"},
|
||||
{tashkeelShaddaRTL,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_BEGIN |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
|
||||
"\u0020\ufeb7\ufe7d\ufee4\ufeb2"},
|
||||
{tashkeelShaddaRTL,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_END |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
|
||||
"\ufeb7\ufe7d\ufee4\ufeb2\u0020"},
|
||||
{tashkeelShaddaRTL,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_RESIZE |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
|
||||
"\ufeb7\ufe7d\ufee4\ufeb2"},
|
||||
{tashkeelShaddaRTL,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_REPLACE_BY_TATWEEL |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
|
||||
"\ufeb7\ufe7d\ufee4\u0640\ufeb2"},
|
||||
{tashkeelShaddaLTR,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_BEGIN |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\u0020\ufeb2\ufee4\ufe7d\ufeb7"},
|
||||
{tashkeelShaddaLTR,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_END |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\ufeb2\ufee4\ufe7d\ufeb7\u0020"},
|
||||
{tashkeelShaddaLTR,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_RESIZE |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\ufeb2\ufee4\ufe7d\ufeb7"},
|
||||
{tashkeelShaddaLTR,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_REPLACE_BY_TATWEEL |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\ufeb2\u0640\ufee4\ufe7d\ufeb7"},
|
||||
{ArMathSym,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_BEGIN |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
|
||||
"\uD83B\uDE00\uD83B\uDE01\uD83B\uDE02\uD83B\uDE03\u0020\uD83B\uDE24\uD83B" +
|
||||
"\uDE05\uD83B\uDE06\u0020\uD83B\uDE07\uD83B\uDE08\uD83B\uDE09\u0020\uD83B" +
|
||||
"\uDE0A\uD83B\uDE0B\uD83B\uDE0C\uD83B\uDE0D\u0020\uD83B\uDE0E\uD83B\uDE0F" +
|
||||
"\uD83B\uDE10\uD83B\uDE11\u0020\uD83B\uDE12\uD83B\uDE13\uD83B\uDE14\uD83B" +
|
||||
"\uDE15\u0020\uD83B\uDE16\uD83B\uDE17\uD83B\uDE18\u0020\uD83B\uDE19\uD83B" +
|
||||
"\uDE1A\uD83B\uDE1B"},
|
||||
{ArMathSymLooped,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_END |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
|
||||
"\uD83B\uDE80\uD83B\uDE81\uD83B\uDE82\uD83B\uDE83\u0020\uD83B\uDE84\uD83B" +
|
||||
"\uDE85\uD83B\uDE86\u0020\uD83B\uDE87\uD83B\uDE88\uD83B\uDE89\u0020\uD83B" +
|
||||
"\uDE8B\uD83B\uDE8C\uD83B\uDE8D\u0020\uD83B\uDE8E\uD83B\uDE8F\uD83B\uDE90" +
|
||||
"\uD83B\uDE91\u0020\uD83B\uDE92\uD83B\uDE93\uD83B\uDE94\uD83B\uDE95\u0020" +
|
||||
"\uD83B\uDE96\uD83B\uDE97\uD83B\uDE98\u0020\uD83B\uDE99\uD83B\uDE9A\uD83B" +
|
||||
"\uDE9B"},
|
||||
{ArMathSymDoubleStruck,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_RESIZE|
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_RTL,
|
||||
"\uD83B\uDEA1\uD83B\uDEA2\uD83B\uDEA3\u0020\uD83B\uDEA5\uD83B\uDEA6\u0020" +
|
||||
"\uD83B\uDEA7\uD83B\uDEA8\uD83B\uDEA9\u0020\uD83B\uDEAB\uD83B\uDEAC\uD83B" +
|
||||
"\uDEAD\u0020\uD83B\uDEAE\uD83B\uDEAF\uD83B\uDEB0\uD83B\uDEB1\u0020\uD83B" +
|
||||
"\uDEB2\uD83B\uDEB3\uD83B\uDEB4\uD83B\uDEB5\u0020\uD83B\uDEB6\uD83B\uDEB7" +
|
||||
"\uD83B\uDEB8\u0020\uD83B\uDEB9\uD83B\uDEBA\uD83B\uDEBB"},
|
||||
{ArMathSymInitial,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_BEGIN |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\uD83B\uDE21\uD83B\uDE22\u0020\uD83B\uDE27\uD83B\uDE29\u0020\uD83B\uDE2A" +
|
||||
"\uD83B\uDE2B\uD83B\uDE2C\uD83B\uDE2D\u0020\uD83B\uDE2E\uD83B\uDE2F\uD83B" +
|
||||
"\uDE30\uD83B\uDE31\u0020\uD83B\uDE32\uD83B\uDE34\uD83B\uDE35\u0020\uD83B" +
|
||||
"\uDE36\uD83B\uDE37\u0020\uD83B\uDE39\uD83B\uDE3B"},
|
||||
{ArMathSymTailed,
|
||||
ArabicShaping.LETTERS_SHAPE | ArabicShaping.TASHKEEL_END |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\uD83B\uDE42\uD83B\uDE47\uD83B\uDE49\uD83B\uDE4B\u0020\uD83B\uDE4D\uD83B" +
|
||||
"\uDE4E\uD83B\uDE4F\u0020\uD83B\uDE51\uD83B\uDE52\uD83B\uDE54\uD83B\uDE57" +
|
||||
"\u0020\uD83B\uDE59\uD83B\uDE5B\uD83B\uDE5D\uD83B\uDE5F"},
|
||||
{ArMathSymStretched,
|
||||
ArabicShaping.LETTERS_SHAPE|ArabicShaping.TASHKEEL_RESIZE |
|
||||
ArabicShaping.TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\uD83B\uDE21\uFEB1\uD83B\uDE62\uFEE9"},
|
||||
/* logical unshape */
|
||||
{logicalUnshape,
|
||||
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_NEAR,
|
||||
"\u0020\u0020\u0020\u0627\u0644\u0622\u0646\u0020\u0627\u0644\u0623\u0642" +
|
||||
"\u0644\u0627\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020" +
|
||||
"\u0627\u0644\u062d\u0631\u0629\u0020\u0020\u0020\u0020"},
|
||||
{logicalUnshape,
|
||||
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_AT_END,
|
||||
"\u0020\u0020\u0020\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623" +
|
||||
"\u0020\u0642\u0644\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628" +
|
||||
"\u064a\u0629\u0020\u0627\u0644\u062d\u0631\u0629\u0020"},
|
||||
{logicalUnshape,
|
||||
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_FIXED_SPACES_AT_BEGINNING,
|
||||
"\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623\u0020\u0642\u0644" +
|
||||
"\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628\u064a\u0629\u0020" +
|
||||
"\u0627\u0644\u062d\u0631\u0629\u0020\u0020\u0020\u0020"},
|
||||
{logicalUnshape,
|
||||
LETTERS_UNSHAPE | TEXT_DIRECTION_LOGICAL | LENGTH_GROW_SHRINK,
|
||||
"\u0020\u0020\u0020\u0627\u0644\u0622\u0020\u0646\u0020\u0627\u0644\u0623" +
|
||||
"\u0020\u0642\u0644\u0627\u0020\u0645\u0020\u0627\u0644\u0639\u0631\u0628" +
|
||||
"\u064a\u0629\u0020\u0627\u0644\u062d\u0631\u0629\u0020\u0020\u0020\u0020"},
|
||||
/* numbers */
|
||||
{numSource,
|
||||
DIGITS_EN2AN | DIGIT_TYPE_AN,
|
||||
"\u0661\u0627\u0662\u06f3\u0061\u0664"},
|
||||
{numSource,
|
||||
DIGITS_AN2EN | DIGIT_TYPE_AN_EXTENDED,
|
||||
"\u0031\u0627\u0032\u0033\u0061\u0034"},
|
||||
{numSource,
|
||||
DIGITS_EN2AN_INIT_LR | DIGIT_TYPE_AN,
|
||||
"\u0031\u0627\u0662\u06f3\u0061\u0034" },
|
||||
{numSource,
|
||||
DIGITS_EN2AN_INIT_AL | DIGIT_TYPE_AN_EXTENDED,
|
||||
"\u06f1\u0627\u06f2\u06f3\u0061\u0034"},
|
||||
{numSource,
|
||||
DIGITS_EN2AN_INIT_LR | DIGIT_TYPE_AN | TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\u0661\u0627\u0032\u06f3\u0061\u0034"},
|
||||
{numSource,
|
||||
DIGITS_EN2AN_INIT_AL | DIGIT_TYPE_AN_EXTENDED | TEXT_DIRECTION_VISUAL_LTR,
|
||||
"\u06f1\u0627\u0032\u06f3\u0061\u06f4"},
|
||||
/* no-op */
|
||||
{numSource, 0, numSource}
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestStandard() {
|
||||
Exception ex = null;
|
||||
String actual = null;
|
||||
ArabicShaping shaper = null;
|
||||
|
||||
try {
|
||||
shaper = new ArabicShaping(flags);
|
||||
actual = shaper.shape(source);
|
||||
}
|
||||
catch(MissingResourceException e){
|
||||
throw e;
|
||||
}
|
||||
catch (IllegalStateException ie){
|
||||
warnln("IllegalStateException: "+ ie.toString());
|
||||
return;
|
||||
}
|
||||
catch (Exception e) {
|
||||
ex = e;
|
||||
}
|
||||
|
||||
if (ex != null) {
|
||||
err("Error: Shaper " + shaper + "\n throws exception '" + ex + "'\n for input '" + source);
|
||||
} else if (!expected.equals(actual)) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
buf.append("Error: Shaper: " + shaper + "\n Input: " + source + "\n Actual: " + actual +
|
||||
"\n Expected: " + expected + "\n");
|
||||
|
||||
for (int i = 0; i < Math.max(expected.length(), actual.length()); ++i) {
|
||||
String temp = Integer.toString(i);
|
||||
if (temp.length() < 2) {
|
||||
temp = " ".concat(temp);
|
||||
}
|
||||
char trg = i < expected.length() ? expected.charAt(i) : '\uffff';
|
||||
char res = i < actual.length() ? actual.charAt(i) : '\uffff';
|
||||
|
||||
buf.append("[" + temp + "] ");
|
||||
buf.append(escapedString("" + trg) + " ");
|
||||
buf.append(escapedString("" + res) + " ");
|
||||
if (trg != res) {
|
||||
buf.append("***");
|
||||
}
|
||||
buf.append("\n");
|
||||
}
|
||||
err(buf.toString());
|
||||
}
|
||||
}
|
||||
|
||||
private static String escapedString(String str) {
|
||||
if (str == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
StringBuffer buf = new StringBuffer(str.length() * 6);
|
||||
for (int i = 0; i < str.length(); ++i) {
|
||||
char ch = str.charAt(i);
|
||||
buf.append("\\u");
|
||||
if (ch < 0x1000) {
|
||||
buf.append('0');
|
||||
}
|
||||
if (ch < 0x0100) {
|
||||
buf.append('0');
|
||||
}
|
||||
if (ch < 0x0010) {
|
||||
buf.append('0');
|
||||
}
|
||||
buf.append(Integer.toHexString(ch));
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public static class PreflightDataTest extends TestFmwk {
|
||||
private String source;
|
||||
private int flags;
|
||||
private int length;
|
||||
|
||||
public PreflightDataTest(String source, int flags, int length) {
|
||||
this.source = source;
|
||||
this.flags = flags;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection testData() {
|
||||
return Arrays.asList(new Object[][] {
|
||||
{"\u0644\u0627", LETTERS_SHAPE | LENGTH_GROW_SHRINK, 1},
|
||||
{"\u0644\u0627\u0031",
|
||||
DIGITS_EN2AN | DIGIT_TYPE_AN_EXTENDED | LENGTH_GROW_SHRINK, 3},
|
||||
{"\u0644\u0644", LETTERS_SHAPE | LENGTH_GROW_SHRINK, 2},
|
||||
{"\ufef7", LETTERS_UNSHAPE | LENGTH_GROW_SHRINK, 2}
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestPreflight() {
|
||||
Exception ex = null;
|
||||
char src[] = null;
|
||||
int len = 0;
|
||||
ArabicShaping shaper = null;
|
||||
|
||||
if (source != null) {
|
||||
src = source.toCharArray();
|
||||
}
|
||||
|
||||
try {
|
||||
shaper = new ArabicShaping(flags);
|
||||
len = shaper.shape(src, 0, src.length, null, 0, 0);
|
||||
}
|
||||
catch (Exception e) {
|
||||
ex = e;
|
||||
}
|
||||
|
||||
if (ex != null) {
|
||||
err("Error: Shaper " + shaper + "\n throws exception '" + ex + "'\n for input '" + source);
|
||||
} else if (length != len) {
|
||||
err("Error: Shaper " + shaper + "\n returns " + len + " characters for input '" +
|
||||
source + "'\n Expected were " + length + " characters");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public static class ErrorDataTest extends TestFmwk {
|
||||
private String source;
|
||||
private int flags;
|
||||
private Class error;
|
||||
|
||||
public ErrorDataTest(String source, int flags, Class error) {
|
||||
this.source = source;
|
||||
this.flags = flags;
|
||||
this.error = error;
|
||||
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static Collection testData() {
|
||||
return Arrays.asList(new Object[][] {
|
||||
/* bad data */
|
||||
{"\u0020\ufef7\u0644\u0020", LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_NEAR,
|
||||
ArabicShapingException.class},
|
||||
{"\u0020\ufef7", LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_END,
|
||||
ArabicShapingException.class},
|
||||
{"\ufef7\u0020", LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_BEGINNING,
|
||||
ArabicShapingException.class},
|
||||
/* bad options */
|
||||
{"\ufef7", 0xffffffff, IllegalArgumentException.class},
|
||||
{"\ufef7", LETTERS_UNSHAPE | LENGTH_GROW_SHRINK, ArabicShapingException.class},
|
||||
{null, LETTERS_UNSHAPE | LENGTH_FIXED_SPACES_AT_END,
|
||||
IllegalArgumentException.class}
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestError() {
|
||||
Exception ex = null;
|
||||
char src[] = null;
|
||||
int len = 0;
|
||||
ArabicShaping shaper = null;
|
||||
|
||||
if (source != null) {
|
||||
src = source.toCharArray();
|
||||
len = src.length;
|
||||
}
|
||||
|
||||
try {
|
||||
shaper = new ArabicShaping(flags);
|
||||
shaper.shape(src, 0, len);
|
||||
}
|
||||
catch (Exception e) {
|
||||
ex = e;
|
||||
}
|
||||
|
||||
if (!error.isInstance(ex)) {
|
||||
err("Error: Shaper " + shaper + "\n throws exception '" + ex + "'\n for input '" +
|
||||
source + "'\n Expected exception: " + error);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -33,24 +33,24 @@ public class TestIDNA extends TestFmwk {
|
||||
// test StringBuffer toUnicode
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.DEFAULT, null);
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.ALLOW_UNASSIGNED, null);
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES, null);
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES|IDNA.ALLOW_UNASSIGNED, null);
|
||||
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES, null);
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES|IDNA.ALLOW_UNASSIGNED, null);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void TestToASCII() throws Exception{
|
||||
for(int i=0; i<TestData.asciiIn.length; i++){
|
||||
// test StringBuffer toUnicode
|
||||
doTestToASCII(new String(TestData.unicodeIn[i]),TestData.asciiIn[i],IDNA.DEFAULT, null);
|
||||
doTestToASCII(new String(TestData.unicodeIn[i]),TestData.asciiIn[i],IDNA.ALLOW_UNASSIGNED, null);
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES, null);
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES|IDNA.ALLOW_UNASSIGNED, null);
|
||||
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES, null);
|
||||
doTestToUnicode(TestData.asciiIn[i],new String(TestData.unicodeIn[i]),IDNA.USE_STD3_RULES|IDNA.ALLOW_UNASSIGNED, null);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void TestIDNToASCII() throws Exception{
|
||||
for(int i=0; i<TestData.domainNames.length; i++){
|
||||
@ -59,7 +59,7 @@ public class TestIDNA extends TestFmwk {
|
||||
doTestIDNToASCII(TestData.domainNames[i],TestData.domainNames[i],IDNA.USE_STD3_RULES, null);
|
||||
doTestIDNToASCII(TestData.domainNames[i],TestData.domainNames[i],IDNA.ALLOW_UNASSIGNED|IDNA.USE_STD3_RULES, null);
|
||||
}
|
||||
|
||||
|
||||
for(int i=0; i<TestData.domainNames1Uni.length; i++){
|
||||
doTestIDNToASCII(TestData.domainNames1Uni[i],TestData.domainNamesToASCIIOut[i],IDNA.DEFAULT, null);
|
||||
doTestIDNToASCII(TestData.domainNames1Uni[i],TestData.domainNamesToASCIIOut[i],IDNA.ALLOW_UNASSIGNED, null);
|
||||
@ -78,16 +78,16 @@ public class TestIDNA extends TestFmwk {
|
||||
doTestIDNToUnicode(TestData.domainNamesToASCIIOut[i],TestData.domainNamesToUnicodeOut[i],IDNA.ALLOW_UNASSIGNED, null);
|
||||
}
|
||||
}
|
||||
|
||||
private void doTestToUnicode(String src, String expected, int options, Object expectedException)
|
||||
|
||||
private void doTestToUnicode(String src, String expected, int options, Object expectedException)
|
||||
throws Exception{
|
||||
StringBuffer inBuf = new StringBuffer(src);
|
||||
UCharacterIterator inIter = UCharacterIterator.getInstance(src);
|
||||
try{
|
||||
|
||||
|
||||
StringBuffer out = IDNA.convertToUnicode(src,options);
|
||||
if(expected!=null && out != null && !out.toString().equals(expected)){
|
||||
errln("convertToUnicode did not return expected result with options : "+ options +
|
||||
errln("convertToUnicode did not return expected result with options : "+ options +
|
||||
" Expected: " + prettify(expected)+" Got: "+prettify(out));
|
||||
}
|
||||
if(expectedException!=null && !unassignedException.equals(expectedException)){
|
||||
@ -99,10 +99,10 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
}
|
||||
try{
|
||||
|
||||
|
||||
StringBuffer out = IDNA.convertToUnicode(inBuf,options);
|
||||
if(expected!=null && out != null && !out.toString().equals(expected)){
|
||||
errln("convertToUnicode did not return expected result with options : "+ options +
|
||||
errln("convertToUnicode did not return expected result with options : "+ options +
|
||||
" Expected: " + prettify(expected)+" Got: "+out);
|
||||
}
|
||||
if(expectedException!=null && !unassignedException.equals(expectedException)){
|
||||
@ -113,7 +113,7 @@ public class TestIDNA extends TestFmwk {
|
||||
errln("convertToUnicode did not get the expected exception for source: " + prettify(src) +" Got: "+ ex.toString());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
try{
|
||||
StringBuffer out = IDNA.convertToUnicode(inIter,options);
|
||||
if(expected!=null && out != null && !out.toString().equals(expected)){
|
||||
@ -129,16 +129,16 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void doTestIDNToUnicode(String src, String expected, int options, Object expectedException)
|
||||
|
||||
private void doTestIDNToUnicode(String src, String expected, int options, Object expectedException)
|
||||
throws Exception{
|
||||
StringBuffer inBuf = new StringBuffer(src);
|
||||
UCharacterIterator inIter = UCharacterIterator.getInstance(src);
|
||||
try{
|
||||
|
||||
|
||||
StringBuffer out = IDNA.convertIDNToUnicode(src,options);
|
||||
if(expected!=null && out != null && !out.toString().equals(expected)){
|
||||
errln("convertToUnicode did not return expected result with options : "+ options +
|
||||
errln("convertToUnicode did not return expected result with options : "+ options +
|
||||
" Expected: " + prettify(expected)+" Got: "+prettify(out));
|
||||
}
|
||||
if(expectedException!=null && !unassignedException.equals(expectedException)){
|
||||
@ -152,7 +152,7 @@ public class TestIDNA extends TestFmwk {
|
||||
try{
|
||||
StringBuffer out = IDNA.convertIDNToUnicode(inBuf,options);
|
||||
if(expected!=null && out != null && !out.toString().equals(expected)){
|
||||
errln("convertToUnicode did not return expected result with options : "+ options +
|
||||
errln("convertToUnicode did not return expected result with options : "+ options +
|
||||
" Expected: " + prettify(expected)+" Got: "+out);
|
||||
}
|
||||
if(expectedException!=null && !unassignedException.equals(expectedException)){
|
||||
@ -163,7 +163,7 @@ public class TestIDNA extends TestFmwk {
|
||||
errln("convertToUnicode did not get the expected exception for source: " +src +" Got: "+ ex.toString());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
try{
|
||||
StringBuffer out = IDNA.convertIDNToUnicode(inIter,options);
|
||||
if(expected!=null && out != null && !out.toString().equals(expected)){
|
||||
@ -179,17 +179,17 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
}
|
||||
}
|
||||
private void doTestToASCII(String src, String expected, int options, Object expectedException)
|
||||
private void doTestToASCII(String src, String expected, int options, Object expectedException)
|
||||
throws Exception{
|
||||
StringBuffer inBuf = new StringBuffer(src);
|
||||
UCharacterIterator inIter = UCharacterIterator.getInstance(src);
|
||||
try{
|
||||
|
||||
|
||||
StringBuffer out = IDNA.convertToASCII(src,options);
|
||||
if(!unassignedException.equals(expectedException) && expected!=null && out != null && expected!=null && out != null && !out.toString().equals(expected.toLowerCase())){
|
||||
errln("convertToASCII did not return expected result with options : "+ options +
|
||||
errln("convertToASCII did not return expected result with options : "+ options +
|
||||
" Expected: " + expected+" Got: "+out);
|
||||
}
|
||||
}
|
||||
if(expectedException!=null && !unassignedException.equals(expectedException)){
|
||||
errln("convertToASCII did not get the expected exception. The operation succeeded!");
|
||||
}
|
||||
@ -198,11 +198,11 @@ public class TestIDNA extends TestFmwk {
|
||||
errln("convertToASCII did not get the expected exception for source: " +src +"\n Got: "+ ex.toString() +"\n Expected: " +ex.toString());
|
||||
}
|
||||
}
|
||||
|
||||
try{
|
||||
|
||||
try{
|
||||
StringBuffer out = IDNA.convertToASCII(inBuf,options);
|
||||
if(!unassignedException.equals(expectedException) && expected!=null && out != null && expected!=null && out != null && !out.toString().equals(expected.toLowerCase())){
|
||||
errln("convertToASCII did not return expected result with options : "+ options +
|
||||
errln("convertToASCII did not return expected result with options : "+ options +
|
||||
" Expected: " + expected+" Got: "+out);
|
||||
}
|
||||
if(expectedException!=null && !unassignedException.equals(expectedException)){
|
||||
@ -213,7 +213,7 @@ public class TestIDNA extends TestFmwk {
|
||||
errln("convertToASCII did not get the expected exception for source: " +src +" Got: "+ ex.toString());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
try{
|
||||
StringBuffer out = IDNA.convertToASCII(inIter,options);
|
||||
if(!unassignedException.equals(expectedException) && expected!=null && out != null && expected!=null && out != null && !out.toString().equals(expected.toLowerCase())){
|
||||
@ -229,15 +229,15 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
}
|
||||
}
|
||||
private void doTestIDNToASCII(String src, String expected, int options, Object expectedException)
|
||||
private void doTestIDNToASCII(String src, String expected, int options, Object expectedException)
|
||||
throws Exception{
|
||||
StringBuffer inBuf = new StringBuffer(src);
|
||||
UCharacterIterator inIter = UCharacterIterator.getInstance(src);
|
||||
try{
|
||||
|
||||
|
||||
StringBuffer out = IDNA.convertIDNToASCII(src,options);
|
||||
if(expected!=null && out != null && !out.toString().equals(expected)){
|
||||
errln("convertToIDNASCII did not return expected result with options : "+ options +
|
||||
errln("convertToIDNASCII did not return expected result with options : "+ options +
|
||||
" Expected: " + expected+" Got: "+out);
|
||||
}
|
||||
if(expectedException!=null && !unassignedException.equals(expectedException)){
|
||||
@ -251,9 +251,9 @@ public class TestIDNA extends TestFmwk {
|
||||
try{
|
||||
StringBuffer out = IDNA.convertIDNToASCII(inBuf,options);
|
||||
if(expected!=null && out != null && !out.toString().equals(expected)){
|
||||
errln("convertToIDNASCII did not return expected result with options : "+ options +
|
||||
errln("convertToIDNASCII did not return expected result with options : "+ options +
|
||||
" Expected: " + expected+" Got: "+out);
|
||||
}
|
||||
}
|
||||
if(expectedException!=null && !unassignedException.equals(expectedException)){
|
||||
errln("convertToIDNASCII did not get the expected exception. The operation succeeded!");
|
||||
}
|
||||
@ -262,14 +262,14 @@ public class TestIDNA extends TestFmwk {
|
||||
errln("convertToIDNASCII did not get the expected exception for source: " +src +" Got: "+ ex.toString());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
try{
|
||||
StringBuffer out = IDNA.convertIDNToASCII(inIter,options);
|
||||
if(expected!=null && out != null && !out.toString().equals(expected)){
|
||||
errln("convertIDNToASCII did not return expected result with options : "+ options +
|
||||
" Expected: " + expected+" Got: "+ out);
|
||||
}
|
||||
|
||||
|
||||
if(expectedException!=null && !unassignedException.equals(expectedException)){
|
||||
errln("convertIDNToASCII did not get the expected exception. The operation succeeded!");
|
||||
}
|
||||
@ -282,7 +282,7 @@ public class TestIDNA extends TestFmwk {
|
||||
@Test
|
||||
public void TestConformance()throws Exception{
|
||||
for(int i=0; i<TestData.conformanceTestCases.length;i++){
|
||||
|
||||
|
||||
TestData.ConformanceTestCase testCase = TestData.conformanceTestCases[i];
|
||||
if(testCase.expected != null){
|
||||
//Test toASCII
|
||||
@ -313,7 +313,7 @@ public class TestIDNA extends TestFmwk {
|
||||
errln("Did not get the expected exception for source: " +testCase.input +" Got: "+ ex.toString());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
try{
|
||||
iter.setToStart();
|
||||
StringBuffer output = namePrep.prepare(iter,StringPrep.ALLOW_UNASSIGNED);
|
||||
@ -330,7 +330,7 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@Test
|
||||
public void TestErrorCases() throws Exception{
|
||||
@ -345,11 +345,11 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
}
|
||||
if(errCase.useSTD3ASCIIRules!=true){
|
||||
|
||||
|
||||
// Test IDNToASCII
|
||||
doTestIDNToASCII(new String(errCase.unicode),errCase.ascii,IDNA.DEFAULT,errCase.expected);
|
||||
doTestIDNToASCII(new String(errCase.unicode),errCase.ascii,IDNA.ALLOW_UNASSIGNED,errCase.expected);
|
||||
|
||||
|
||||
}else{
|
||||
doTestIDNToASCII(new String(errCase.unicode),errCase.ascii,IDNA.USE_STD3_RULES,errCase.expected);
|
||||
}
|
||||
@ -359,7 +359,7 @@ public class TestIDNA extends TestFmwk {
|
||||
// Test IDNToUnicode
|
||||
doTestIDNToUnicode(errCase.ascii,new String(errCase.unicode),IDNA.DEFAULT,errCase.expected);
|
||||
doTestIDNToUnicode(errCase.ascii,new String(errCase.unicode),IDNA.ALLOW_UNASSIGNED,errCase.expected);
|
||||
|
||||
|
||||
}else{
|
||||
doTestIDNToUnicode(errCase.ascii,new String(errCase.unicode),IDNA.USE_STD3_RULES,errCase.expected);
|
||||
}
|
||||
@ -370,38 +370,38 @@ public class TestIDNA extends TestFmwk {
|
||||
try{
|
||||
int retVal = IDNA.compare(s1,s2,IDNA.DEFAULT);
|
||||
if(isEqual==true && retVal != 0){
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
" s2: "+prettify(s2));
|
||||
}
|
||||
retVal = IDNA.compare(new StringBuffer(s1), new StringBuffer(s2), IDNA.DEFAULT);
|
||||
if(isEqual==true && retVal != 0){
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
" s2: "+prettify(s2));
|
||||
}
|
||||
retVal = IDNA.compare(UCharacterIterator.getInstance(s1), UCharacterIterator.getInstance(s2), IDNA.DEFAULT);
|
||||
if(isEqual==true && retVal != 0){
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
" s2: "+prettify(s2));
|
||||
}
|
||||
}catch(Exception e){
|
||||
e.printStackTrace();
|
||||
errln("Unexpected exception thrown by IDNA.compare");
|
||||
}
|
||||
|
||||
|
||||
try{
|
||||
int retVal = IDNA.compare(s1,s2,IDNA.ALLOW_UNASSIGNED);
|
||||
if(isEqual==true && retVal != 0){
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
" s2: "+prettify(s2));
|
||||
}
|
||||
retVal = IDNA.compare(new StringBuffer(s1), new StringBuffer(s2), IDNA.ALLOW_UNASSIGNED);
|
||||
if(isEqual==true && retVal != 0){
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
" s2: "+prettify(s2));
|
||||
}
|
||||
retVal = IDNA.compare(UCharacterIterator.getInstance(s1), UCharacterIterator.getInstance(s2), IDNA.ALLOW_UNASSIGNED);
|
||||
if(isEqual==true && retVal != 0){
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
errln("Did not get the expected result for s1: "+ prettify(s1)+
|
||||
" s2: "+prettify(s2));
|
||||
}
|
||||
}catch(Exception e){
|
||||
@ -437,13 +437,13 @@ public class TestIDNA extends TestFmwk {
|
||||
source.setLength(4);
|
||||
source.append(TestData.unicodeIn[i]);
|
||||
source.append(com);
|
||||
|
||||
|
||||
// a) compare it with itself
|
||||
doTestCompare(source.toString(),source.toString(),true);
|
||||
|
||||
|
||||
// b) compare it with asciiIn equivalent
|
||||
doTestCompare(source.toString(),www+TestData.asciiIn[i]+com,true);
|
||||
|
||||
|
||||
// c) compare it with unicodeIn not equivalent
|
||||
if(i==0){
|
||||
doTestCompare(source.toString(), uni1.toString(), false);
|
||||
@ -463,9 +463,9 @@ public class TestIDNA extends TestFmwk {
|
||||
// test and ascertain
|
||||
// func(func(func(src))) == func(src)
|
||||
private void doTestChainingToASCII(String source) throws Exception {
|
||||
StringBuffer expected;
|
||||
StringBuffer expected;
|
||||
StringBuffer chained;
|
||||
|
||||
|
||||
// test convertIDNToASCII
|
||||
expected = IDNA.convertIDNToASCII(source,IDNA.DEFAULT);
|
||||
chained = expected;
|
||||
@ -483,15 +483,15 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
if(!expected.toString().equals(chained.toString())){
|
||||
errln("Chaining test failed for convertToASCII");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// test and ascertain
|
||||
// func(func(func(src))) == func(src)
|
||||
private void doTestChainingToUnicode(String source) throws Exception {
|
||||
StringBuffer expected;
|
||||
StringBuffer expected;
|
||||
StringBuffer chained;
|
||||
|
||||
|
||||
// test convertIDNToUnicode
|
||||
expected = IDNA.convertIDNToUnicode(source,IDNA.DEFAULT);
|
||||
chained = expected;
|
||||
@ -509,7 +509,7 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
if(!expected.toString().equals(chained.toString())){
|
||||
errln("Chaining test failed for convertToUnicode");
|
||||
}
|
||||
}
|
||||
}
|
||||
@Test
|
||||
public void TestChaining() throws Exception{
|
||||
@ -520,7 +520,7 @@ public class TestIDNA extends TestFmwk {
|
||||
doTestChainingToASCII(new String(TestData.unicodeIn[i]));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* IDNA RFC Says:
|
||||
A label is an individual part of a domain name. Labels are usually
|
||||
@ -559,13 +559,13 @@ public class TestIDNA extends TestFmwk {
|
||||
source.setLength(4);
|
||||
source.append(TestData.unicodeIn[i]);
|
||||
source.append(com);
|
||||
|
||||
|
||||
// a) compare it with itself
|
||||
doTestCompare(source.toString(),source.toString(),true);
|
||||
|
||||
|
||||
// b) compare it with asciiIn equivalent
|
||||
doTestCompare(source.toString(),www+TestData.asciiIn[i]+com,true);
|
||||
|
||||
|
||||
// c) compare it with unicodeIn not equivalent
|
||||
if(i==0){
|
||||
doTestCompare(source.toString(), uni1.toString(), false);
|
||||
@ -582,13 +582,13 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
private static final int loopCount = 100;
|
||||
private static final int maxCharCount = 15;
|
||||
// private static final int maxCodePoint = 0x10ffff;
|
||||
private Random random = null;
|
||||
|
||||
|
||||
/**
|
||||
* Return a random integer i where 0 <= i < n.
|
||||
* A special function that gets random codepoints from planes 0,1,2 and 14
|
||||
@ -622,30 +622,30 @@ public class TestIDNA extends TestFmwk {
|
||||
i++;
|
||||
}
|
||||
return fillIn;
|
||||
|
||||
|
||||
}
|
||||
|
||||
// TODO(junit): turned off because not running before
|
||||
|
||||
// TODO(#13294): turned off because monkey test fails approx 1 in 3 times.
|
||||
@Ignore
|
||||
@Test
|
||||
public void MonkeyTest() throws Exception{
|
||||
StringBuffer source = new StringBuffer();
|
||||
/* do the monkey test */
|
||||
/* do the monkey test */
|
||||
for(int i=0; i<loopCount; i++){
|
||||
source.setLength(0);
|
||||
getTestSource(source);
|
||||
doTestCompareReferenceImpl(source);
|
||||
}
|
||||
|
||||
// test string with embedded null
|
||||
|
||||
// test string with embedded null
|
||||
source.append( "\\u0000\\u2109\\u3E1B\\U000E65CA\\U0001CAC5" );
|
||||
|
||||
|
||||
source = new StringBuffer(Utility.unescape(source.toString()));
|
||||
doTestCompareReferenceImpl(source);
|
||||
|
||||
|
||||
//StringBuffer src = new StringBuffer(Utility.unescape("\\uDEE8\\U000E228C\\U0002EE8E\\U000E6350\\U00024DD9\u4049\\U000E0DE4\\U000E448C\\U0001869B\\U000E3380\\U00016A8E\\U000172D5\\U0001C408\\U000E9FB5"));
|
||||
//doTestCompareReferenceImpl(src);
|
||||
|
||||
|
||||
//test deletion of code points
|
||||
source = new StringBuffer(Utility.unescape("\\u043f\\u00AD\\u034f\\u043e\\u0447\\u0435\\u043c\\u0443\\u0436\\u0435\\u043e\\u043d\\u0438\\u043d\\u0435\\u0433\\u043e\\u0432\\u043e\\u0440\\u044f\\u0442\\u043f\\u043e\\u0440\\u0443\\u0441\\u0441\\u043a\\u0438"));
|
||||
StringBuffer expected = new StringBuffer("xn--b1abfaaepdrnnbgefbadotcwatmq2g4l");
|
||||
@ -704,8 +704,6 @@ public class TestIDNA extends TestFmwk {
|
||||
|
||||
private void doTestCompareReferenceImpl(StringBuffer src) throws Exception{
|
||||
// test toASCII
|
||||
src.setLength(0);
|
||||
src.append("[");
|
||||
StringBuffer asciiLabel = _doTestCompareReferenceImpl(src, true, IDNA.ALLOW_UNASSIGNED);
|
||||
_doTestCompareReferenceImpl(src, true, IDNA.DEFAULT);
|
||||
_doTestCompareReferenceImpl(src, true, IDNA.USE_STD3_RULES);
|
||||
@ -720,6 +718,8 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(#13324): test turned off because it has dependency on translit.
|
||||
@Ignore
|
||||
@Test
|
||||
public void TestCompareRefImpl() throws Exception {
|
||||
for (int i = 65; i < 0x10FFFF; i++) {
|
||||
@ -742,7 +742,7 @@ public class TestIDNA extends TestFmwk {
|
||||
"\u00F5\u00dE\u00dF\u00dD",
|
||||
"\uFB00\uFB01"
|
||||
};
|
||||
for ( int i=0; i< in.length; i++){
|
||||
for ( int i=0; i< in.length; i++){
|
||||
try{
|
||||
String ascii = IDNA.convertToASCII(in[i],IDNA.DEFAULT).toString();
|
||||
try{
|
||||
@ -763,7 +763,7 @@ public class TestIDNA extends TestFmwk {
|
||||
"test"
|
||||
};
|
||||
for ( int i=0; i< in.length; i++){
|
||||
|
||||
|
||||
try{
|
||||
String ascii = IDNA.convertToASCII(in[i],IDNA.DEFAULT).toString();
|
||||
if(!ascii.equals(in[i])){
|
||||
@ -773,11 +773,11 @@ public class TestIDNA extends TestFmwk {
|
||||
errln("Unexpected exception: " + ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void TestDebug(){
|
||||
public void TestDebug(){
|
||||
try{
|
||||
String src = "\u00ED4dn";
|
||||
String uni = IDNA.convertToUnicode(src,IDNA.DEFAULT).toString();
|
||||
@ -808,7 +808,7 @@ public class TestIDNA extends TestFmwk {
|
||||
} catch (ArrayIndexOutOfBoundsException ex) {
|
||||
errln("Got an ArrayIndexOutOfBoundsException calling convertIDNToUnicode(\"" + INVALID_DOMAIN_NAME + "\")");
|
||||
}
|
||||
|
||||
|
||||
String domain = "xn--m\u00FCller.de";
|
||||
try{
|
||||
IDNA.convertIDNToUnicode(domain, IDNA.DEFAULT);
|
||||
@ -840,12 +840,12 @@ public class TestIDNA extends TestFmwk {
|
||||
errln("ToUnicode operation failed! "+ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void TestLength(){
|
||||
String ul = "my_very_very_very_very_very_very_very_very_very_very_very_very_very_long_and_incredibly_uncreative_domain_label";
|
||||
|
||||
/* this unicode string is longer than MAX_LABEL_BUFFER_SIZE and produces an
|
||||
/* this unicode string is longer than MAX_LABEL_BUFFER_SIZE and produces an
|
||||
IDNA prepared string (including xn--)that is exactly 63 bytes long */
|
||||
String ul1 ="\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"+
|
||||
"\uD55C\uAD6D\uC5B4\uB97C\uC774\u00AD\u034F\u1806\u180B"+
|
||||
@ -887,7 +887,7 @@ public class TestIDNA extends TestFmwk {
|
||||
}catch (StringPrepParseException ex){
|
||||
errln("IDNA.convertToASCII failed with error: "+ex.toString());
|
||||
}
|
||||
|
||||
|
||||
String idn = "my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.my_very_very_long_and_incredibly_uncreative_domain_label.ibm.com";
|
||||
try{
|
||||
IDNA.convertIDNToASCII(idn, IDNA.DEFAULT);
|
||||
@ -901,7 +901,7 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
try{
|
||||
IDNA.convertIDNToUnicode(idn, IDNA.DEFAULT);
|
||||
errln("IDNA.convertToUnicode did not fail!");
|
||||
errln("IDNA.convertToUnicode did not fail!");
|
||||
}catch (StringPrepParseException ex){
|
||||
if(ex.getError()!= StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR){
|
||||
errln("IDNA.convertToUnicode failed with error: "+ex.toString());
|
||||
@ -910,7 +910,7 @@ public class TestIDNA extends TestFmwk {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Tests the method public static StringBuffer convertToASCII(String src, int options) */
|
||||
@Test
|
||||
public void TestConvertToASCII() {
|
||||
|
@ -117,6 +117,38 @@ abstract public class TestFmwk extends AbstractTestLog {
|
||||
return new Random(getParams().getSeed());
|
||||
}
|
||||
|
||||
/**
|
||||
* Integer Random number generator, produces positive int values.
|
||||
* Similar to C++ std::minstd_rand, with the same algorithm & constants.
|
||||
* Provided for compatibility with ICU4C.
|
||||
* Get & set of the seed allows for reproducible monkey tests.
|
||||
*/
|
||||
protected class ICU_Rand {
|
||||
private int fLast;
|
||||
|
||||
public ICU_Rand(int seed) {
|
||||
seed(seed);
|
||||
}
|
||||
|
||||
public int next() {
|
||||
fLast = (int)((fLast * 48271L) % 2147483647L);
|
||||
return fLast;
|
||||
}
|
||||
|
||||
public void seed(int seed) {
|
||||
if (seed <= 0) {
|
||||
seed = 1;
|
||||
}
|
||||
seed %= 2147483647; // = 0x7FFFFFFF
|
||||
fLast = seed > 0 ? seed : 1;
|
||||
}
|
||||
|
||||
public int getSeed() {
|
||||
return fLast;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static final String ICU_TRAC_URL = "http://bugs.icu-project.org/trac/ticket/";
|
||||
static final String CLDR_TRAC_URL = "http://unicode.org/cldr/trac/ticket/";
|
||||
static final String CLDR_TICKET_PREFIX = "cldrbug:";
|
||||
|
Loading…
Reference in New Issue
Block a user