ICU-8079 rewrite/simplify sort key buffer/memory management, fixes overflow & length counting bugs

X-SVN-Rev: 29968
This commit is contained in:
Markus Scherer 2011-05-03 00:29:45 +00:00
parent b4653c9f9f
commit 2021d92a62
10 changed files with 788 additions and 1230 deletions

View File

@ -1,9 +1,9 @@
/*
*******************************************************************************
* Copyright (C) 2001-2003, International Business Machines
* Copyright (C) 2001-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bocsu.c
* file name: bocsu.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
@ -16,6 +16,7 @@
#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#if !UCONFIG_NO_COLLATION
@ -76,30 +77,6 @@ u_writeDiff(int32_t diff, uint8_t *p) {
return p;
}
/*
 * Number of bytes that writeDiff() would emit for the given difference.
 * Differences inside the single-byte reach take 1 byte; each wider reach
 * (positive or negative) adds one byte, up to a maximum of 4.
 */
static int32_t
lengthOfDiff(int32_t diff) {
    if(diff<SLOPE_REACH_NEG_1) {
        /* below the single-byte range: pick 2, 3 or 4 bytes by negative reach */
        if(diff<SLOPE_REACH_NEG_3) {
            return 4;
        }
        return (diff<SLOPE_REACH_NEG_2) ? 3 : 2;
    }

    /* diff>=SLOPE_REACH_NEG_1: pick 1..4 bytes by positive reach */
    if(diff<=SLOPE_REACH_POS_1) {
        return 1;
    }
    if(diff<=SLOPE_REACH_POS_2) {
        return 2;
    }
    return (diff<=SLOPE_REACH_POS_3) ? 3 : 4;
}
/*
* Encode the code points of a string as
* a sequence of byte-encoded differences (slope detection),
@ -117,16 +94,26 @@ lengthOfDiff(int32_t diff) {
* Note that the identical-level run in a sort key is generated from
* NFD text - there are never Hangul characters included.
*/
U_CFUNC int32_t
u_writeIdenticalLevelRun(const UChar *s, int32_t length, uint8_t *p) {
uint8_t *p0;
int32_t c, prev;
int32_t i;
U_CFUNC void
u_writeIdenticalLevelRun(const UChar *s, int32_t length, U_NAMESPACE_QUALIFIER ByteSink &sink) {
char scratch[64];
int32_t capacity;
prev=0;
p0=p;
i=0;
UChar32 prev=0;
int32_t i=0;
while(i<length) {
char *buffer=sink.GetAppendBuffer(1, length*2, scratch, (int32_t)sizeof(scratch), &capacity);
uint8_t *p;
// We must have capacity>=SLOPE_MAX_BYTES in case u_writeDiff() writes that much,
// but we do not want to force the sink.GetAppendBuffer() to allocate
// for a large min_capacity because we might actually only write one byte.
if(capacity<16) {
buffer=scratch;
capacity=(int32_t)sizeof(scratch);
}
p=reinterpret_cast<uint8_t *>(buffer);
uint8_t *lastSafe=p+capacity-SLOPE_MAX_BYTES;
while(i<length && p<=lastSafe) {
if(prev<0x4e00 || prev>=0xa000) {
prev=(prev&~0x7f)-SLOPE_REACH_NEG_1;
} else {
@ -137,11 +124,13 @@ u_writeIdenticalLevelRun(const UChar *s, int32_t length, uint8_t *p) {
prev=0x9fff-SLOPE_REACH_POS_2;
}
UTF_NEXT_CHAR(s, i, length, c);
UChar32 c;
U16_NEXT(s, i, length, c);
p=u_writeDiff(c-prev, p);
prev=c;
}
return (int32_t)(p-p0);
sink.Append(buffer, (int32_t)(p-reinterpret_cast<uint8_t *>(buffer)));
}
}
U_CFUNC int32_t
@ -161,31 +150,4 @@ u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p) {
return (int32_t)(p-p0);
}
/*
 * Count the bytes that writeIdenticalLevelRun() would write for s[0..length[
 * without writing anything: walks the code points exactly like the writer
 * and sums lengthOfDiff() for each difference.
 */
U_CFUNC int32_t
u_lengthOfIdenticalLevelRun(const UChar *s, int32_t length) {
    int32_t total=0;   /* accumulated byte count */
    int32_t base=0;    /* previous code point, then the adjusted reference value */
    int32_t cp;        /* current code point */
    int32_t pos;       /* read index into s, advanced by UTF_NEXT_CHAR */

    for(pos=0; pos<length;) {
        if(0x4e00<=base && base<0xa000) {
            /*
             * Unihan U+4e00..U+9fa5:
             * double-bytes down from the upper end
             */
            base=0x9fff-SLOPE_REACH_POS_2;
        } else {
            base=(base&~0x7f)-SLOPE_REACH_NEG_1;
        }

        UTF_NEXT_CHAR(s, pos, length, cp);
        total+=lengthOfDiff(cp-base);
        base=cp;
    }
    return total;
}
#endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2001-2003, International Business Machines
* Copyright (C) 2001-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bocsu.c
@ -21,6 +21,12 @@
#if !UCONFIG_NO_COLLATION
U_NAMESPACE_BEGIN
class ByteSink;
U_NAMESPACE_END
/*
* "BOCSU"
* Binary Ordered Compression Scheme for Unicode
@ -145,15 +151,12 @@
} \
}
U_CFUNC int32_t
u_writeIdenticalLevelRun(const UChar *s, int32_t length, uint8_t *p);
U_CFUNC void
u_writeIdenticalLevelRun(const UChar *s, int32_t length, U_NAMESPACE_QUALIFIER ByteSink &sink);
U_CFUNC int32_t
u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p);
U_CFUNC int32_t
u_lengthOfIdenticalLevelRun(const UChar *s, int32_t length);
U_CFUNC uint8_t *
u_writeDiff(int32_t diff, uint8_t *p);

View File

@ -247,7 +247,7 @@
<ClCompile Include="alphaindex.cpp" />
<ClCompile Include="bms.cpp" />
<ClCompile Include="bmsearch.cpp" />
<ClCompile Include="bocsu.c" />
<ClCompile Include="bocsu.cpp" />
<ClCompile Include="coleitr.cpp" />
<ClCompile Include="coll.cpp" />
<ClCompile Include="colldata.cpp" />

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2006, International Business Machines Corporation and *
* Copyright (C) 1996-2011, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -96,14 +96,18 @@ CollationKey::~CollationKey()
uprv_free(fBytes);
}
void CollationKey::adopt(uint8_t *values, int32_t count) {
void CollationKey::adopt(uint8_t *values, int32_t capacity, int32_t count) {
if(fBytes != NULL) {
uprv_free(fBytes);
}
fBogus = FALSE;
fBytes = values;
fCount = count;
fCapacity = count;
fCapacity = capacity;
setLength(count);
}
// Records a new sort key length for bytes already stored in fBytes.
// The bytes themselves and fCapacity are left untouched.
void CollationKey::setLength(int32_t newLength) {
    // Reset the stored hash code to the "invalid" marker value.
    fHashCode = kInvalidHashCode;
    // Remember how many bytes of fBytes are now in use.
    fCount = newLength;
    // A key that has just been given a length is not bogus.
    fBogus = FALSE;
}

View File

@ -453,21 +453,46 @@ CollationKey& RuleBasedCollator::getCollationKey(const UChar* source,
CollationKey& sortkey,
UErrorCode& status) const
{
if (U_FAILURE(status))
{
if (U_FAILURE(status)) {
return sortkey.setToBogus();
}
if (sourceLen < -1 || (source == NULL && sourceLen != 0)) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return sortkey.setToBogus();
}
if ((!source) || (sourceLen == 0)) {
if (sourceLen < 0) {
sourceLen = u_strlen(source);
}
if (sourceLen == 0) {
return sortkey.reset();
}
uint8_t *result;
int32_t resultLen = ucol_getSortKeyWithAllocation(ucollator,
source, sourceLen,
&result,
&status);
sortkey.adopt(result, resultLen);
int32_t resultCapacity;
if (sortkey.fCapacity >= (sourceLen * 3)) {
// Try to reuse the CollationKey.fBytes.
result = sortkey.fBytes;
resultCapacity = sortkey.fCapacity;
} else {
result = NULL;
resultCapacity = 0;
}
int32_t resultLen = ucol_getSortKeyWithAllocation(ucollator, source, sourceLen,
result, resultCapacity, &status);
if (U_SUCCESS(status)) {
if (result == sortkey.fBytes) {
sortkey.setLength(resultLen);
} else {
sortkey.adopt(result, resultCapacity, resultLen);
}
} else {
if (result != sortkey.fBytes) {
uprv_free(result);
}
sortkey.setToBogus();
}
return sortkey;
}

File diff suppressed because it is too large Load Diff

View File

@ -545,42 +545,55 @@ U_CAPI uint32_t U_EXPORT2 ucol_getNextCE(const UCollator *coll,
U_CFUNC uint32_t U_EXPORT2 ucol_getPrevCE(const UCollator *coll,
U_NAMESPACE_QUALIFIER collIterate *collationSource,
UErrorCode *status);
/* get some memory */
void *ucol_getABuffer(const UCollator *coll, uint32_t size);
#ifdef XP_CPLUSPLUS
class SortKeyByteSink;
/* function used by C++ getCollationKey to prevent restarting the calculation */
U_CFUNC int32_t
ucol_getSortKeyWithAllocation(const UCollator *coll,
const UChar *source, int32_t sourceLength,
uint8_t **pResult,
uint8_t *&result, int32_t &resultCapacity,
UErrorCode *pErrorCode);
/* get some memory */
void *ucol_getABuffer(const UCollator *coll, uint32_t size);
typedef void U_CALLCONV
SortKeyGenerator(const UCollator *coll,
const UChar *source,
int32_t sourceLength,
SortKeyByteSink &result,
UErrorCode *status);
/* worker function for generating sortkeys */
U_CFUNC
int32_t U_CALLCONV
void U_CALLCONV
ucol_calcSortKey(const UCollator *coll,
const UChar *source,
int32_t sourceLength,
uint8_t **result,
uint32_t resultLength,
UBool allocatePrimary,
SortKeyByteSink &result,
UErrorCode *status);
U_CFUNC
int32_t U_CALLCONV
void U_CALLCONV
ucol_calcSortKeySimpleTertiary(const UCollator *coll,
const UChar *source,
int32_t sourceLength,
uint8_t **result,
uint32_t resultLength,
UBool allocatePrimary,
SortKeyByteSink &result,
UErrorCode *status);
U_CFUNC
int32_t
ucol_getSortKeySize(const UCollator *coll, U_NAMESPACE_QUALIFIER collIterate *s,
int32_t currentSize, UColAttributeValue strength,
int32_t len);
#else
typedef void U_CALLCONV
SortKeyGenerator(const UCollator *coll,
const UChar *source,
int32_t sourceLength,
void *result,
UErrorCode *status);
#endif
/**
* Makes a copy of the Collator's rule data. The format is
* that of .col files.
@ -958,15 +971,6 @@ typedef struct {
uint8_t padding[8];
} InverseUCATableHeader;
typedef int32_t U_CALLCONV
SortKeyGenerator(const UCollator *coll,
const UChar *source,
int32_t sourceLength,
uint8_t **result,
uint32_t resultLength,
UBool allocatePrimary,
UErrorCode *status);
typedef void U_CALLCONV
ResourceCleaner(UCollator *coll);

View File

@ -1,6 +1,6 @@
/*
*****************************************************************************
* Copyright (C) 1996-2006, International Business Machines Corporation and others.
* Copyright (C) 1996-2011, International Business Machines Corporation and others.
* All Rights Reserved.
*****************************************************************************
*
@ -243,10 +243,15 @@ private:
* Adopts the given array of sort key bytes. This CollationKey takes
* ownership of the storage; any bytes it previously held are released
* with uprv_free().
* @param values The sort key bytes to adopt.
* @param capacity Allocated size of the values array, in bytes.
* @param count Number of sort key bytes in values that are in use.
*/
void adopt(uint8_t *values, int32_t count);
void adopt(uint8_t *values, int32_t capacity, int32_t count);
/**
* Set a new length for a new sort key in the existing fBytes.
*/
void setLength(int32_t newLength);
/*
* Creates a collation key with a string.

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2010, International Business Machines Corporation and
* Copyright (c) 1997-2011, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
//===============================================================================
@ -46,6 +46,8 @@
#include "cmemory.h"
#include <stdlib.h>
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
void
CollationAPITest::doAssert(UBool condition, const char *message)
{
@ -1245,6 +1247,55 @@ void CollationAPITest::TestSortKey()
delete col;
}
// Checks that sort key generation neither writes beyond the caller's buffer
// nor reports a wrong length when the buffer is too small (getSortKey()),
// and that getCollationKey() yields the same bytes as getSortKey() for
// inputs of increasing length.
void CollationAPITest::TestSortKeyOverflow() {
    IcuTestErrorCode errorCode(*this, "TestSortKeyOverflow()");
    LocalPointer<Collator> col(Collator::createInstance(Locale::getEnglish(), errorCode));
    if (errorCode.logDataIfFailureAndReset("Collator::createInstance(English) failed")) {
        return;
    }
    col->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, errorCode);
    UChar i_and_phi[] = { 0x438, 0x3c6 };  // Cyrillic small i & Greek small phi.
    // The sort key should be 6 bytes:
    // 2 bytes for the Cyrillic i, 1 byte for the primary-compression terminator,
    // 2 bytes for the Greek phi, and 1 byte for the NUL terminator.
    uint8_t sortKey[12];
    int32_t length = col->getSortKey(i_and_phi, 2, sortKey, LENGTHOF(sortKey));
    uint8_t sortKey2[12];
    // The loop below passes `capacity` as the buffer size of sortKey2 and then
    // inspects sortKey2[capacity] and sortKey2[capacity + 1]. Both are only valid
    // while capacity + 1 < LENGTHOF(sortKey2), so do not trust an unexpectedly
    // large `length` as the loop bound.
    int32_t capacityLimit = length;
    if (length >= LENGTHOF(sortKey2)) {
        errln("getSortKey(i_and_phi) returned unexpected length %d", length);
        capacityLimit = LENGTHOF(sortKey2) - 1;
    }
    for (int32_t capacity = 0; capacity < capacityLimit; ++capacity) {
        // Fill with a marker value so that any write beyond `capacity` is detectable.
        uprv_memset(sortKey2, 2, LENGTHOF(sortKey2));
        int32_t length2 = col->getSortKey(i_and_phi, 2, sortKey2, capacity);
        if (length2 != length || 0 != uprv_memcmp(sortKey, sortKey2, capacity)) {
            errln("getSortKey(i_and_phi, capacity=%d) failed to write proper prefix", capacity);
        } else if (sortKey2[capacity] != 2 || sortKey2[capacity + 1] != 2) {
            errln("getSortKey(i_and_phi, capacity=%d) wrote beyond capacity", capacity);
        }
    }

    // Now try to break getCollationKey().
    // Internally, it always starts with a large stack buffer.
    // Since we cannot control the initial capacity, we throw an increasing number
    // of characters at it, with the problematic part at the end.
    const int32_t longCapacity = 2000;
    // Each 'a' in the prefix should result in one primary sort key byte.
    // For i_and_phi we expect 6 bytes, then the NUL terminator.
    // NOTE(review): the breakdown above already counts the NUL terminator among
    // the 6 bytes, so the extra "- 1" looks like one byte of slack -- confirm.
    const int32_t maxPrefixLength = longCapacity - 6 - 1;
    LocalArray<uint8_t> longSortKey(new uint8_t[longCapacity]);
    UnicodeString s(FALSE, i_and_phi, 2);
    for (int32_t prefixLength = 0; prefixLength < maxPrefixLength; ++prefixLength) {
        // Reference result: the sort key written into a buffer that is known to be large enough.
        length = col->getSortKey(s, longSortKey.getAlias(), longCapacity);
        CollationKey collKey;
        col->getCollationKey(s, collKey, errorCode);
        int32_t collKeyLength;
        const uint8_t *collSortKey = collKey.getByteArray(collKeyLength);
        if (collKeyLength != length || 0 != uprv_memcmp(longSortKey.getAlias(), collSortKey, length)) {
            errln("getCollationKey(prefix[%d]+i_and_phi) failed to write proper sort key", prefixLength);
        }

        // Insert an 'a' to match ++prefixLength.
        s.insert(prefixLength, (UChar)0x61);
    }
}
void CollationAPITest::TestMaxExpansion()
{
UErrorCode status = U_ZERO_ERROR;
@ -2261,33 +2312,33 @@ void CollationAPITest::TestClone() {
void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
{
if (exec) logln("TestSuite CollationAPITest: ");
switch (index) {
case 0: name = "TestProperty"; if (exec) TestProperty(/* par */); break;
case 1: name = "TestOperators"; if (exec) TestOperators(/* par */); break;
case 2: name = "TestDuplicate"; if (exec) TestDuplicate(/* par */); break;
case 3: name = "TestCompare"; if (exec) TestCompare(/* par */); break;
case 4: name = "TestHashCode"; if (exec) TestHashCode(/* par */); break;
case 5: name = "TestCollationKey"; if (exec) TestCollationKey(/* par */); break;
case 6: name = "TestElemIter"; if (exec) TestElemIter(/* par */); break;
case 7: name = "TestGetAll"; if (exec) TestGetAll(/* par */); break;
case 8: name = "TestRuleBasedColl"; if (exec) TestRuleBasedColl(/* par */); break;
case 9: name = "TestDecomposition"; if (exec) TestDecomposition(/* par */); break;
case 10: name = "TestSafeClone"; if (exec) TestSafeClone(/* par */); break;
case 11: name = "TestSortKey"; if (exec) TestSortKey(); break;
case 12: name = "TestMaxExpansion"; if (exec) TestMaxExpansion(); break;
case 13: name = "TestDisplayName"; if (exec) TestDisplayName(); break;
case 14: name = "TestAttribute"; if (exec) TestAttribute(); break;
case 15: name = "TestVariableTopSetting"; if (exec) TestVariableTopSetting(); break;
case 16: name = "TestRules"; if (exec) TestRules(); break;
case 17: name = "TestGetLocale"; if (exec) TestGetLocale(); break;
case 18: name = "TestBounds"; if (exec) TestBounds(); break;
case 19: name = "TestGetTailoredSet"; if (exec) TestGetTailoredSet(); break;
case 20: name = "TestUClassID"; if (exec) TestUClassID(); break;
case 21: name = "TestSubclass"; if (exec) TestSubclass(); break;
case 22: name = "TestNULLCharTailoring"; if (exec) TestNULLCharTailoring(); break;
case 23: name = "TestClone"; if (exec) TestClone(); break;
default: name = ""; break;
}
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(TestProperty);
TESTCASE_AUTO(TestOperators);
TESTCASE_AUTO(TestDuplicate);
TESTCASE_AUTO(TestCompare);
TESTCASE_AUTO(TestHashCode);
TESTCASE_AUTO(TestCollationKey);
TESTCASE_AUTO(TestElemIter);
TESTCASE_AUTO(TestGetAll);
TESTCASE_AUTO(TestRuleBasedColl);
TESTCASE_AUTO(TestDecomposition);
TESTCASE_AUTO(TestSafeClone);
TESTCASE_AUTO(TestSortKey);
TESTCASE_AUTO(TestSortKeyOverflow);
TESTCASE_AUTO(TestMaxExpansion);
TESTCASE_AUTO(TestDisplayName);
TESTCASE_AUTO(TestAttribute);
TESTCASE_AUTO(TestVariableTopSetting);
TESTCASE_AUTO(TestRules);
TESTCASE_AUTO(TestGetLocale);
TESTCASE_AUTO(TestBounds);
TESTCASE_AUTO(TestGetTailoredSet);
TESTCASE_AUTO(TestUClassID);
TESTCASE_AUTO(TestSubclass);
TESTCASE_AUTO(TestNULLCharTailoring);
TESTCASE_AUTO(TestClone);
TESTCASE_AUTO_END;
}
#endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -1,6 +1,6 @@
/***********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2006, International Business Machines Corporation
* Copyright (c) 1997-2011, International Business Machines Corporation
* and others. All Rights Reserved.
***********************************************************************/
@ -113,6 +113,7 @@ public:
* This tests the sort keys generated by collator
*/
void TestSortKey();
void TestSortKeyOverflow();
/**
* This tests getMaxExpansion