ICU-8490 Add BreakIterator::refreshInputText()

X-SVN-Rev: 30203
This commit is contained in:
Andy Heninger 2011-06-09 22:49:40 +00:00
parent b90dc9ad48
commit b8d330e9a7
8 changed files with 231 additions and 4 deletions

View File

@ -486,6 +486,37 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
}
/**
 * Re-point this iterator at a relocated copy of its input text.
 *
 * The supplied UText must give access to text whose contents are identical to
 * the text the iterator is already working on. The intended clients are
 * bridges to garbage-collected environments such as Java, where the storage
 * behind a string may be moved in memory at arbitrary times.
 *
 * @param input   UText for the text at its new location. A NULL value is
 *                rejected with U_ILLEGAL_ARGUMENT_ERROR.
 * @param status  In/out error code. Nothing is done when it already holds a
 *                failure on entry. Set to U_ILLEGAL_ARGUMENT_ERROR when the
 *                previous native index cannot be re-established in the new text.
 * @return *this
 */
RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
    if (!U_FAILURE(status)) {
        if (input == NULL) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
        } else {
            // Remember the current position before fText is re-targeted.
            const int64_t savedIndex = utext_getNativeIndex(fText);

            // Shallow, read-only clone of the caller's UText into the UText
            // this iterator already holds.
            fText = utext_clone(fText, input, FALSE, TRUE, &status);

            if (!U_FAILURE(status)) {
                utext_setNativeIndex(fText, savedIndex);
                if (utext_getNativeIndex(fText) != savedIndex) {
                    // Sanity check only: text that really has the same contents
                    // must accept the same position. The storage behind the old
                    // UText may already be invalid, so the two texts cannot
                    // safely be compared directly.
                    status = U_ILLEGAL_ARGUMENT_ERROR;
                }
            }
        }
    }
    return *this;
}
/**
* Sets the current iteration position to the beginning of the text.

View File

@ -1,6 +1,6 @@
/*
********************************************************************************
* Copyright (C) 1996-2008, International Business Machines
* Copyright (C) 1996-2011, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*/
@ -290,4 +290,14 @@ ubrk_getLocaleByType(const UBreakIterator *bi,
}
/*
 * C API wrapper: forwards to BreakIterator::refreshInputText() on the C++
 * object behind the UBreakIterator handle. The returned reference is not
 * needed by C callers and is discarded. Both bi and status are dereferenced
 * without a NULL check.
 */
void ubrk_refreshUText(UBreakIterator *bi,
                       UText          *text,
                       UErrorCode     *status)
{
    reinterpret_cast<BreakIterator *>(bi)->refreshInputText(text, *status);
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View File

@ -1,6 +1,6 @@
/*
********************************************************************************
* Copyright (C) 1997-2010, International Business Machines
* Copyright (C) 1997-2011, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*
@ -514,6 +514,33 @@ public:
*/
const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
/**
* Set the subject text string upon which the break iterator is operating
* without changing any other aspect of the iteration state.
* The new and previous text strings must have the same content.
*
* This function is intended for use in environments where ICU is operating on
* strings that may move around in memory. It provides a mechanism for notifying
* ICU that the string has been relocated, and providing a new UText to access the
* string in its new position.
*
* Note that the break iterator implementation never copies the underlying text
* of a string being processed, but always operates directly on the original text
* provided by the user. Refreshing simply drops the references to the old text
* and replaces them with references to the new.
*
* Caution: this function is normally used only by very specialized,
* system-level code. One example use case is with garbage collection that moves
* the text in memory.
*
* This function is pure virtual; every concrete break iterator class must
* provide its own implementation.
*
* @param input The new (moved) text string. Its content must be identical
* to that of the text currently being iterated over.
* @param status Receives errors detected by this function.
* @return *this
*
* @draft ICU 5.0
*/
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
private:
static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);

View File

@ -633,6 +633,33 @@ public:
*/
virtual const uint8_t *getBinaryRules(uint32_t &length);
/**
* Set the subject text string upon which the break iterator is operating
* without changing any other aspect of the iteration state.
* The new and previous text strings must have the same content.
*
* This function is intended for use in environments where ICU is operating on
* strings that may move around in memory. It provides a mechanism for notifying
* ICU that the string has been relocated, and providing a new UText to access the
* string in its new position.
*
* Note that the break iterator implementation never copies the underlying text
* of a string being processed, but always operates directly on the original text
* provided by the user. Refreshing simply drops the references to the old text
* and replaces them with references to the new.
*
* Caution: this function is normally used only by very specialized,
* system-level code. One example use case is with garbage collection that moves
* the text in memory.
*
* @param input The new (moved) text string. The iterator takes a shallow,
* read-only clone of this UText. NULL is rejected with
* U_ILLEGAL_ARGUMENT_ERROR.
* @param status Receives errors detected by this function. If it already
* indicates a failure on entry, the function does nothing.
* U_ILLEGAL_ARGUMENT_ERROR is also reported when the iterator's
* current position cannot be restored in the new text, which
* means the new text does not match the old.
* @return *this
*
* @draft ICU 5.0
*/
virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
protected:
//=======================================================================

View File

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and others.
* Copyright (C) 1996-2011, International Business Machines Corporation and others.
* All Rights Reserved.
******************************************************************************
*/
@ -496,6 +496,37 @@ U_STABLE const char* U_EXPORT2
ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
/**
* Set the subject text string upon which the break iterator is operating
* without changing any other aspect of the state.
* The new and previous text strings must have the same content.
*
* This function is intended for use in environments where ICU is operating on
* strings that may move around in memory. It provides a mechanism for notifying
* ICU that the string has been relocated, and providing a new UText to access the
* string in its new position.
*
* Note that the break iterator never copies the underlying text
* of a string being processed, but always operates directly on the original text
* provided by the user. Refreshing simply drops the references to the old text
* and replaces them with references to the new.
*
* Caution: this function is normally used only by very specialized
* system-level code. One example use case is with garbage collection
* that moves the text in memory.
*
* @param bi The break iterator. Must not be NULL; it is used without
* a NULL check.
* @param text The new (moved) text string. Its content must be identical
* to that of the text currently being iterated over.
* @param status Receives errors detected by this function. Must not be NULL;
* it is dereferenced without a NULL check.
*
* @draft ICU 5.0
*/
U_DRAFT void U_EXPORT2
ubrk_refreshUText(UBreakIterator *bi,
UText *text,
UErrorCode *status);
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif

View File

@ -44,6 +44,7 @@ static void TestBreakIteratorRuleError(void);
static void TestBreakIteratorStatusVec(void);
static void TestBreakIteratorUText(void);
static void TestBreakIteratorTailoring(void);
static void TestBreakIteratorRefresh(void);
void addBrkIterAPITest(TestNode** root);
@ -58,6 +59,7 @@ void addBrkIterAPITest(TestNode** root)
addTest(root, &TestBreakIteratorRuleError, "tstxtbd/cbiapts/TestBreakIteratorRuleError");
addTest(root, &TestBreakIteratorStatusVec, "tstxtbd/cbiapts/TestBreakIteratorStatusVec");
addTest(root, &TestBreakIteratorTailoring, "tstxtbd/cbiapts/TestBreakIteratorTailoring");
addTest(root, &TestBreakIteratorRefresh, "tstxtbd/cbiapts/TestBreakIteratorRefresh");
}
#define CLONETEST_ITERATOR_COUNT 2
@ -823,4 +825,52 @@ static void TestBreakIteratorTailoring(void) {
}
}
/*
 * Test ubrk_refreshUText().
 *
 * RefreshInput changes out the input of a Break Iterator without
 * changing anything else in the iterator's state. Used with Java JNI,
 * when Java moves the underlying string storage. This test
 * runs ubrk_next() repeatedly, moving the text in the middle of the sequence.
 * The right set of boundaries should still be found.
 */
static void TestBreakIteratorRefresh(void) {
    UChar testStr[]  = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0};  /* = " A B C D"  */
    UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,  0};
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator *bi;
    UText ut1 = UTEXT_INITIALIZER;
    UText ut2 = UTEXT_INITIALIZER;

    bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
    TEST_ASSERT_SUCCESS(status);
    if (U_FAILURE(status)) {
        /*
         * The failure has been reported above. Without a usable iterator the
         * ubrk_next() calls below have nothing valid to operate on, so stop
         * here. Neither UText has been opened yet, so there is nothing to close.
         */
        return;
    }

    utext_openUChars(&ut1, testStr, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    ubrk_setUText(bi, &ut1, &status);
    TEST_ASSERT_SUCCESS(status);

    /* Line boundaries will occur before each letter in the original string */
    TEST_ASSERT(1 == ubrk_next(bi));
    TEST_ASSERT(3 == ubrk_next(bi));

    /* Move the string, kill the original string. */
    u_strcpy(movedStr, testStr);
    u_memset(testStr, 0x20, u_strlen(testStr));
    utext_openUChars(&ut2, movedStr, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    ubrk_refreshUText(bi, &ut2, &status);
    TEST_ASSERT_SUCCESS(status);

    /* Find the following matches, now working in the moved string. */
    TEST_ASSERT(5 == ubrk_next(bi));
    TEST_ASSERT(7 == ubrk_next(bi));
    TEST_ASSERT(8 == ubrk_next(bi));
    TEST_ASSERT(UBRK_DONE == ubrk_next(bi));
    TEST_ASSERT_SUCCESS(status);

    ubrk_close(bi);
    utext_close(&ut1);
    utext_close(&ut2);
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View File

@ -1122,6 +1122,54 @@ void RBBIAPITest::TestCreateFromRBBIData() {
}
}
void RBBIAPITest::TestRefreshInputText() {
/*
* RefreshInput changes out the input of a Break Iterator without
* changing anything else in the iterator's state. Used with Java JNI,
* when Java moves the underlying string storage. This test
* runs BreakIterator::next() repeatedly, moving the text in the middle of the sequence.
* The right set of boundaries should still be found.
*/
UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */
UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0};
UErrorCode status = U_ZERO_ERROR;
UText ut1 = UTEXT_INITIALIZER;
UText ut2 = UTEXT_INITIALIZER;
RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
TEST_ASSERT_SUCCESS(status);
utext_openUChars(&ut1, testStr, -1, &status);
TEST_ASSERT_SUCCESS(status);
bi->setText(&ut1, status);
TEST_ASSERT_SUCCESS(status);
/* Line boundaries will occur before each letter in the original string */
TEST_ASSERT(1 == bi->next());
TEST_ASSERT(3 == bi->next());
/* Move the string, kill the original string. */
u_strcpy(movedStr, testStr);
u_memset(testStr, 0x20, u_strlen(testStr));
utext_openUChars(&ut2, movedStr, -1, &status);
TEST_ASSERT_SUCCESS(status);
RuleBasedBreakIterator *returnedBI = &bi->refreshInputText(&ut2, status);
TEST_ASSERT_SUCCESS(status);
TEST_ASSERT(bi == returnedBI);
/* Find the following matches, now working in the moved string. */
TEST_ASSERT(5 == bi->next());
TEST_ASSERT(7 == bi->next());
TEST_ASSERT(8 == bi->next());
TEST_ASSERT(UBRK_DONE == bi->next());
delete bi;
utext_close(&ut1);
utext_close(&ut2);
}
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
@ -1153,6 +1201,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
#else
case 9: case 10: case 11: case 12: case 13: name = "skip"; break;
#endif
case 14: name = "TestRefreshInputText"; if (exec) TestRefreshInputText(); break;
default: name = ""; break; // needed to end loop
}

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1999-2004,2008 International Business Machines Corporation and
* Copyright (c) 1999-2011 International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/************************************************************************
@ -86,6 +86,8 @@ public:
void TestRegistration();
void TestRefreshInputText();
/**
*Internal subroutines
**/