/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdata.cpp
*
* created on: 2012jul28
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ucol.h"
#include "unicode/udata.h"
#include "unicode/uscript.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "uassert.h"
#include "utrie2.h"

U_NAMESPACE_BEGIN

// Layout of the scripts[] array as it is read by the functions in this file:
// a sequence of groups, each of the form
//   scripts[i]                  head word: first primary lead byte in bits 15..8,
//                               last primary lead byte in bits 7..0
//   scripts[i + 1]              number n of script/reorder codes in the group
//   scripts[i + 2 .. i + 1 + n] the n script/reorder codes
// The next group starts at index i + 2 + n.

/**
 * Resolves one level of indirection for a special CE32.
 *
 * - DIGIT_TAG: returns the entry of ce32s[] selected by the index in ce32
 *   (the non-numeric-collation CE32).
 * - LEAD_SURROGATE_TAG: returns Collation::UNASSIGNED_CE32.
 * - U0000_TAG: returns ce32s[0] (the normal CE32 for U+0000).
 * - Any other tag: returns ce32 unchanged.
 *
 * @param ce32 a special CE32 (asserted)
 * @return the resolved CE32
 */
uint32_t
CollationData::getIndirectCE32(uint32_t ce32) const {
    U_ASSERT(Collation::isSpecialCE32(ce32));
    int32_t tag = Collation::tagFromCE32(ce32);
    if(tag == Collation::DIGIT_TAG) {
        // Fetch the non-numeric-collation CE32.
        ce32 = ce32s[Collation::indexFromCE32(ce32)];
    } else if(tag == Collation::LEAD_SURROGATE_TAG) {
        ce32 = Collation::UNASSIGNED_CE32;
    } else if(tag == Collation::U0000_TAG) {
        // Fetch the normal ce32 for U+0000.
        ce32 = ce32s[0];
    }
    return ce32;
}

/**
 * Like getIndirectCE32() but accepts any CE32:
 * a non-special CE32 is returned unchanged.
 *
 * @param ce32 any CE32
 * @return ce32 itself, or getIndirectCE32(ce32) if ce32 is special
 */
uint32_t
CollationData::getFinalCE32(uint32_t ce32) const {
    if(Collation::isSpecialCE32(ce32)) {
        ce32 = getIndirectCE32(ce32);
    }
    return ce32;
}

/**
 * Returns the first primary weight of the reordering group that contains script.
 *
 * @param script a script or reorder code
 * @return the group's first lead byte in bits 31..24 with all lower bits 0,
 *         or 0 if findScript() does not find the script
 */
uint32_t
CollationData::getFirstPrimaryForGroup(int32_t script) const {
    int32_t index = findScript(script);
    if(index < 0) {
        return 0;
    }
    uint32_t head = scripts[index];
    // Move the first lead byte from bits 15..8 to bits 31..24.
    return (head & 0xff00) << 16;
}

/**
 * Returns the last primary weight of the reordering group that contains script.
 *
 * @param script a script or reorder code
 * @return the group's last lead byte in bits 31..24 with all lower bits set,
 *         or 0 if findScript() does not find the script
 */
uint32_t
CollationData::getLastPrimaryForGroup(int32_t script) const {
    int32_t index = findScript(script);
    if(index < 0) {
        return 0;
    }
    uint32_t head = scripts[index];
    uint32_t lastByte = head & 0xff;
    // Unsigned arithmetic: for lastByte == 0xff the shift wraps to 0
    // and the result is 0xffffffff.
    return ((lastByte + 1) << 24) - 1;
}

/**
 * Finds the reordering group for a primary weight.
 *
 * @param p a primary weight; only its lead byte (bits 31..24) is used
 * @return the first script/reorder code stored for the first group whose
 *         last lead byte is at least p's lead byte, or -1 if there is none
 */
int32_t
CollationData::getGroupForPrimary(uint32_t p) const {
    p >>= 24;  // Reordering groups are distinguished by primary lead bytes.
    for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) {
        uint32_t lastByte = scripts[i] & 0xff;
        if(p <= lastByte) {
            return scripts[i + 2];
        }
    }
    return -1;
}

/**
 * Finds the reordering group that lists the given script or reorder code.
 *
 * @param script a script or reorder code; values outside 0..0xffff are never found
 * @return the index in scripts[] of the group's head word, or -1 if not found
 */
int32_t
CollationData::findScript(int32_t script) const {
    if(script < 0 || 0xffff < script) { return -1; }
    for(int32_t i = 0; i < scriptsLength;) {
        // limit is the index just past this group's codes == start of the next group.
        int32_t limit = i + 2 + scripts[i + 1];
        for(int32_t j = i + 2; j < limit; ++j) {
            if(script == scripts[j]) { return i; }
        }
        i = limit;
    }
    return -1;
}

/**
 * Writes all codes of the reordering group that contains script into dest,
 * sorted in ascending order.
 *
 * @param script    a script or reorder code
 * @param dest      output array; written only if the group fits into capacity
 * @param capacity  number of elements available in dest
 * @param errorCode in/out error code; set to U_BUFFER_OVERFLOW_ERROR
 *                  if the group has more codes than capacity
 * @return the number of codes in the group (also returned on overflow);
 *         0 if errorCode indicates a failure on input or if script is not found
 */
int32_t
CollationData::getEquivalentScripts(int32_t script,
                                    int32_t dest[], int32_t capacity,
                                    UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    int32_t i = findScript(script);
    if(i < 0) { return 0; }
    int32_t length = scripts[i + 1];
    U_ASSERT(length != 0);
    if(length > capacity) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return length;
    }
    i += 2;
    dest[0] = scripts[i++];
    for(int32_t j = 1; j < length; ++j) {
        script = scripts[i++];
        // Sorted insertion.
        for(int32_t k = j;; --k) {
            // Invariant: dest[k] is free to receive either script or dest[k - 1].
            if(k > 0 && script < dest[k - 1]) {
                dest[k] = dest[k - 1];
            } else {
                dest[k] = script;
                break;
            }
        }
    }
    return length;
}

/**
 * Builds a 256-entry table that maps each primary lead byte to its
 * reordered lead byte, according to a list of script/reorder codes.
 *
 * Lead bytes 0..Collation::MERGE_SEPARATOR_BYTE and
 * Collation::TRAIL_WEIGHT_BYTE..0xff always map to themselves.
 * The remaining byte range is filled as follows:
 * 1. Leading single-code groups with special reorder codes
 *    (code >= UCOL_REORDER_CODE_FIRST) that are not named in reorder
 *    are assigned first, from the bottom of the range.
 * 2. The groups of the codes in reorder follow, in list order.
 *    If USCRIPT_UNKNOWN occurs, the codes after it are assigned from the
 *    top of the range downward, starting with the last list entry.
 * 3. All lead bytes that are still unassigned fill the middle in ascending order.
 * Codes for which findScript() finds no group are skipped.
 *
 * @param reorder   list of script and reorder codes
 * @param length    number of codes in reorder
 * @param table     output: table[leadByte] is the reordered lead byte
 * @param errorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR if
 *                  UCOL_REORDER_CODE_DEFAULT occurs in the list,
 *                  USCRIPT_UNKNOWN occurs again after its first occurrence,
 *                  or a group is selected more than once
 *                  (duplicate or equivalent script);
 *                  the table contents are then incomplete
 */
void
CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
                                uint8_t table[256], UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return; }

    // Initialize the table.
    // Never reorder special low and high primary lead bytes.
    int32_t lowByte;
    for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) {
        table[lowByte] = lowByte;
    }
    // lowByte == 03

    int32_t highByte;
    for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) {
        table[highByte] = highByte;
    }
    // highByte == FE

    // Set intermediate bytes to 0 to indicate that they have not been set yet.
    for(int32_t i = lowByte; i <= highByte; ++i) {
        table[i] = 0;
    }

    // Get the set of special reorder codes in the input list.
    // This supports up to 32 special reorder codes;
    // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
    uint32_t specials = 0;
    for(int32_t i = 0; i < length; ++i) {
        int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
        if(0 <= reorderCode && reorderCode <= 31) {
            specials |= (uint32_t)1 << reorderCode;
        }
    }

    // Start the reordering with the special low reorder codes that do not occur in the input.
    // Each single-code group occupies 3 array units (head, count == 1, code).
    // Stop at the end of the data so that we never read beyond scriptsLength,
    // even if the data consists only of single-code special groups.
    for(int32_t i = 0; i + 2 < scriptsLength; i += 3) {
        if(scripts[i + 1] != 1) { break; }  // Went beyond special single-code reorder codes.
        int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST;
        if(reorderCode < 0) { break; }  // Went beyond special reorder codes.
        // The specials bit set only tracks codes 0..31.
        // Treat higher codes as absent from the input rather than
        // shifting by 32 or more bits, which would be undefined behavior.
        if(reorderCode > 31 || (specials & ((uint32_t)1 << reorderCode)) == 0) {
            int32_t head = scripts[i];
            int32_t firstByte = head >> 8;
            int32_t lastByte = head & 0xff;
            do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
        }
    }

    // Reorder according to the input scripts, continuing from the bottom of the bytes range.
    for(int32_t i = 0; i < length;) {
        int32_t script = reorder[i++];
        if(script == USCRIPT_UNKNOWN) {
            // Put the remaining scripts at the top.
            // Walk the rest of the list backward so that the last entry
            // receives the highest lead bytes.
            while(i < length) {
                script = reorder[--length];
                if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
                        script == UCOL_REORDER_CODE_DEFAULT) {
                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                    return;
                }
                int32_t index = findScript(script);
                if(index < 0) { continue; }
                int32_t head = scripts[index];
                int32_t firstByte = head >> 8;
                int32_t lastByte = head & 0xff;
                if(table[firstByte] != 0) {  // Duplicate or equivalent script.
                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                    return;
                }
                do { table[lastByte--] = highByte--; } while(firstByte <= lastByte);
            }
            break;
        }
        if(script == UCOL_REORDER_CODE_DEFAULT) {
            // The default code must be the only one in the list, and that is handled by the caller.
            // Otherwise it must not be used.
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        int32_t index = findScript(script);
        if(index < 0) { continue; }
        int32_t head = scripts[index];
        int32_t firstByte = head >> 8;
        int32_t lastByte = head & 0xff;
        if(table[firstByte] != 0) {  // Duplicate or equivalent script.
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
    }

    // Put all remaining scripts into the middle.
    // Avoid table[0] which must remain 0.
    for(int32_t i = 1; i <= 0xff; ++i) {
        if(table[i] == 0) { table[i] = lowByte++; }
    }
    U_ASSERT(lowByte == highByte + 1);
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION