/* ****************************************************************************** * Copyright (c) 1996-2001, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * File unorm.cpp * * Created by: Vladimir Weinstein 12052000 * * Modification history : * * Date Name Description * 02/01/01 synwee Added normalization quickcheck enum and method. * 02/12/01 synwee Commented out quickcheck util api has been approved * Added private method for doing FCD checks * 02/23/01 synwee Modified quickcheck and checkFCE to run through * string for codepoints < 0x300 for the normalization * mode NFC. */ #include "unicode/unorm.h" #include "unicode/normlzr.h" #include "unicode/ustring.h" #include "unicode/udata.h" #include "cpputils.h" #include "ustr_imp.h" #include "umutex.h" /* added by synwee */ #include "unicode/uchar.h" #include "unicode/utf16.h" /* added by synwee for trie manipulation*/ #define STAGE_1_SHIFT_ 10 #define STAGE_2_SHIFT_ 4 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F #define STAGE_3_MASK_ 0xF #define LAST_BYTE_MASK_ 0xFF #define SECOND_LAST_BYTE_SHIFT_ 8 /* added by synwee for fast route in quickcheck and fcd */ #define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300 /* * for a description of the file format, * see icu/source/tools/genqchk/genqchk.c */ #define QCHK_DATA_NAME "qchk" #define FCHK_DATA_NAME "fchk" #define DATA_TYPE "dat" static UDataMemory *quickcheckData = NULL; static UDataMemory *fcdcheckData = NULL; /** * Authentication values */ static const uint8_t QCHK_DATA_FORMAT_[] = {0x71, 0x63, 0x68, 0x6b}; static const uint8_t FCHK_DATA_FORMAT_[] = {0x66, 0x63, 0x68, 0x6b}; static const uint8_t QCHK_FORMAT_VERSION_[] = {1, 0, 0, 0}; static const uint8_t FCHK_FORMAT_VERSION_[] = {1, 0, 0, 0}; /** * index values loaded from qchk.dat. * static uint16_t indexes[8]; */ enum { QCHK_INDEX_STAGE_2_BITS, QCHK_INDEX_STAGE_3_BITS, QCHK_INDEX_MIN_VALUES_SIZE, QCHK_INDEX_STAGE_1_INDEX, QCHK_INDEX_STAGE_2_INDEX, QCHK_INDEX_STAGE_3_INDEX }; /** * index values loaded from qchk.dat. * static uint16_t indexes[8]; */ enum { FCHK_INDEX_STAGE_2_BITS, FCHK_INDEX_STAGE_3_BITS, FCHK_INDEX_STAGE_1_INDEX, FCHK_INDEX_STAGE_2_INDEX, FCHK_INDEX_STAGE_3_INDEX }; /** * Array of mask for determining normalization quick check values. * Indexes follows the values in UNormalizationMode */ static const uint8_t QCHK_MASK_[] = {0, 0, 0x11, 0x22, 0x44, 0x88}; /** * Array of minimum codepoints that has UNORM_MAYBE or UNORM_NO quick check * values. Indexes follows the values in UNormalizationMode. * Generated values! Edit at your own risk. */ static const UChar32 *QCHK_MIN_VALUES_; /** * Flag to indicate if data has been loaded */ static UBool isQuickCheckLoaded = FALSE; static UBool isFCDCheckLoaded = FALSE; /** * Minimum value to determine if quickcheck value contains a MAYBE */ static const uint8_t MIN_UNORM_MAYBE_ = 0x10; /** * Array of normalization form corresponding to the index code point. * Hence codepoint 0xABCD will have normalization form QUICK_CHECK_DATA[0xABCD]. * UQUICK_CHECK_DATA[0xABCD] is a byte containing 2 sets of 4 bits information * representing UNORM_MAYBE and UNORM_YES.
* bits 1 2 3 4 5678
* NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES
* ie if UQUICK_CHECK_DATA[0xABCD] = 10000001, this means that 0xABCD is in * NFD form and maybe in NFKC form */ static const uint16_t *QCHK_STAGE_1_; static const uint16_t *QCHK_STAGE_2_; static const uint8_t *QCHK_STAGE_3_; /** * Trie data for FCD. * Each index corresponds to each code point. * Trie value is the combining class of the first and the last character of the * NFD of the codepoint. * size uint16_t for the first 2 stages instead of uint32_t to reduce size. */ static const uint16_t *FCHK_STAGE_1_; static const uint16_t *FCHK_STAGE_2_; static const uint16_t *FCHK_STAGE_3_; U_CAPI int32_t unorm_normalize(const UChar* source, int32_t sourceLength, UNormalizationMode mode, int32_t option, UChar* result, int32_t resultLength, UErrorCode* status) { if(U_FAILURE(*status)) return -1; /* synwee : removed hard coded conversion */ Normalizer::EMode normMode = Normalizer::getNormalizerEMode(mode, *status); if (U_FAILURE(*status)) return -1; int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); const UnicodeString src(sourceLength == -1, source, len); UnicodeString dst(result, 0, resultLength); /* synwee : note quickcheck is added in C ++ normalize method */ if ((option & UNORM_IGNORE_HANGUL) != 0) option = Normalizer::IGNORE_HANGUL; Normalizer::normalize(src, normMode, option, dst, *status); return uprv_fillOutputString(dst, result, resultLength, status); } static UBool U_CALLCONV isQuickCheckAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo) { if (pInfo->size >= 20 && pInfo->isBigEndian == U_IS_BIG_ENDIAN && pInfo->charsetFamily == U_CHARSET_FAMILY && (uprv_memcmp(pInfo->dataFormat, QCHK_DATA_FORMAT_, sizeof(QCHK_DATA_FORMAT_)) == 0) && /* pInfo->dataFormat[0] == 0x71 && pInfo->dataFormat[1] == 0x63 && pInfo->dataFormat[2] == 0x68 && pInfo->dataFormat[3] == 0x6b && pInfo->formatVersion[0] == 1 */ (uprv_memcmp(pInfo->formatVersion, QCHK_FORMAT_VERSION_, sizeof(QCHK_FORMAT_VERSION_)) == 0)) { return TRUE; } else { context = NULL; type = NULL; name = NULL; return FALSE; } } static UBool loadQuickCheckData(UErrorCode *error) { /* load quickcheck data from file if necessary */ if (!isQuickCheckLoaded && U_SUCCESS(*error)) { UDataMemory *data; /* open the data outside the mutex block */ data = udata_openChoice(NULL, DATA_TYPE, QCHK_DATA_NAME, isQuickCheckAcceptable, NULL, error); if (U_FAILURE(*error)) { return isQuickCheckLoaded = FALSE; } /* in the mutex block, set the data for this process */ umtx_lock(NULL); if (quickcheckData == NULL) { const uint16_t *temp = (const uint16_t *)udata_getMemory(data); const uint16_t *indexes = temp; quickcheckData = data; temp += 8; QCHK_MIN_VALUES_ = (const UChar32 *)temp; QCHK_STAGE_1_ = temp + indexes[QCHK_INDEX_STAGE_1_INDEX]; QCHK_STAGE_2_ = temp + indexes[QCHK_INDEX_STAGE_2_INDEX]; QCHK_STAGE_3_ = (const uint8_t *)(temp + indexes[QCHK_INDEX_STAGE_3_INDEX]); data = NULL; } umtx_unlock(NULL); isQuickCheckLoaded = TRUE; /* if a different thread set it first, then close the extra data */ if (data != NULL) { udata_close(data); /* NULL if it was set correctly */ } } return isQuickCheckLoaded; } /** * Performing quick check on a string, to quickly determine if the string is * in a particular normalization format. * Three types of result can be returned UNORM_YES, UNORM_NO or * UNORM_MAYBE. Result UNORM_YES indicates that the argument * string is in the desired normalized format, UNORM_NO determines that * argument string is not in the desired normalized format. A * UNORM_MAYBE result indicates that a more thorough check is required, * the user may have to put the string in its normalized form and compare the * results. * @param source string for determining if it is in a normalized format * @param sourcelength length of source to test * @param mode normalization format from the enum UNormalizationMode * @param status A pointer to an UErrorCode to receive any errors * @return UNORM_YES, UNORM_NO or UNORM_MAYBE */ U_CAPI UNormalizationCheckResult unorm_quickCheck(const UChar *source, int32_t sourcelength, UNormalizationMode mode, UErrorCode* status) { uint8_t oldcombiningclass = 0; uint8_t combiningclass; uint8_t quickcheckvalue; uint8_t mask = QCHK_MASK_[mode]; UChar32 min; UChar32 codepoint; UNormalizationCheckResult result = UNORM_YES; const UChar *psource; const UChar *pend = 0; if (!loadQuickCheckData(status) || U_FAILURE(*status)) { return UNORM_MAYBE; } min = QCHK_MIN_VALUES_[mode]; /* checking argument*/ if (mode >= UNORM_MODE_COUNT || mode < UNORM_NONE) { *status = U_ILLEGAL_ARGUMENT_ERROR; return UNORM_MAYBE; } if (sourcelength >= 0) { psource = source; pend = source + sourcelength; for (;;) { if (psource >= pend) { return UNORM_YES; } /* fast route : since codepoints < min has combining class 0 and YES looking at the minimum values, surrogates are not a problem */ if (*psource >= min) { break; } psource ++; } } else { psource = source; for (;;) { if (*psource == 0) { return UNORM_YES; } /* fast route : since codepoints < min has combining class 0 and YES looking at the minimum values, surrogates are not a problem */ if (*psource >= min) { break; } psource ++; } } if (sourcelength >= 0) { for (;;) { int count = 0; if (psource >= pend) { break; } UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); combiningclass = u_getCombiningClass(codepoint); /* not in canonical order */ if (oldcombiningclass > combiningclass && combiningclass != 0) { return UNORM_NO; } oldcombiningclass = combiningclass; /* trie access */ quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[ QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)] & mask); /* value is a byte containing 2 sets of 4 bits information. bits 1 2 3 4 5678
NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES
ie if quick[0xABCD] = 10000001, this means that 0xABCD is in NFD form and maybe in NFKC form. */ if (quickcheckvalue == 0) { return UNORM_NO; } if (quickcheckvalue >= MIN_UNORM_MAYBE_) { result = UNORM_MAYBE; } psource += count; } } else { for (;;) { int count = 0; UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); if (codepoint == 0) { break; } combiningclass = u_getCombiningClass(codepoint); /* not in canonical order */ if (oldcombiningclass > combiningclass && combiningclass != 0) { return UNORM_NO; } oldcombiningclass = combiningclass; /* trie access */ quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[ QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)] & mask); /* value is a byte containing 2 sets of 4 bits information. bits 1 2 3 4 5678
NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES
ie if quick[0xABCD] = 10000001, this means that 0xABCD is in NFD form and maybe in NFKC form. */ if (quickcheckvalue == 0) { return UNORM_NO; } if (quickcheckvalue >= MIN_UNORM_MAYBE_) { result = UNORM_MAYBE; } psource += count; } } return result; } /* private methods ---------------------------------------------------------- */ static UBool U_CALLCONV isFCDCheckAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo) { if( pInfo->size >= 20 && pInfo->isBigEndian == U_IS_BIG_ENDIAN && pInfo->charsetFamily == U_CHARSET_FAMILY && (uprv_memcmp(pInfo->dataFormat, FCHK_DATA_FORMAT_, sizeof(FCHK_DATA_FORMAT_)) == 0) && /* pInfo->dataFormat[0] == 0x71 && pInfo->dataFormat[1] == 0x63 && pInfo->dataFormat[2] == 0x68 && pInfo->dataFormat[3] == 0x6b && pInfo->formatVersion[0] == 1 */ (uprv_memcmp(pInfo->formatVersion, FCHK_FORMAT_VERSION_, sizeof(FCHK_FORMAT_VERSION_)) == 0)) { return TRUE; } else { context = NULL; type = NULL; name = NULL; return FALSE; } } static UBool loadFCDCheckData(UErrorCode *error) { /* load fcdcheck data from file if necessary */ if (!isFCDCheckLoaded && U_SUCCESS(*error)) { UDataMemory *data; /* open the data outside the mutex block */ data = udata_openChoice(NULL, DATA_TYPE, FCHK_DATA_NAME, isFCDCheckAcceptable, NULL, error); if (U_FAILURE(*error)) { return isFCDCheckLoaded = FALSE; } /* in the mutex block, set the data for this process */ umtx_lock(NULL); if (fcdcheckData == NULL) { const uint16_t *temp = (const uint16_t *)udata_getMemory(data); const uint16_t *indexes = temp; fcdcheckData = data; temp += 8; FCHK_STAGE_1_ = temp + indexes[FCHK_INDEX_STAGE_1_INDEX]; FCHK_STAGE_2_ = temp + indexes[FCHK_INDEX_STAGE_2_INDEX]; FCHK_STAGE_3_ = (const uint16_t *)(temp + indexes[FCHK_INDEX_STAGE_3_INDEX]); data = NULL; } umtx_unlock(NULL); isFCDCheckLoaded = TRUE; /* if a different thread set it first, then close the extra data */ if (data != NULL) { udata_close(data); /* NULL if it was set correctly */ } } return isFCDCheckLoaded; } /** * Gets the stage 1 data for checkFCD. * @param error status * @return checkFCD data stage 1, null if data can not be loaded */ U_CAPI const uint16_t * getFCHK_STAGE_1_(UErrorCode *error) { if (loadFCDCheckData(error)) { return FCHK_STAGE_1_; } return NULL; } /** * Gets the stage 2 data for checkFCD. * @param error status * @return checkFCD data stage 2, null if data can not be loaded */ U_CAPI const uint16_t * getFCHK_STAGE_2_(UErrorCode *error) { if (loadFCDCheckData(error)) { return FCHK_STAGE_2_; } return NULL; } /** * Gets the stage 3 data for checkFCD. * @param error status * @return checkFCD data stage 3, null if data can not be loaded */ U_CAPI const uint16_t * getFCHK_STAGE_3_(UErrorCode *error) { if (loadFCDCheckData(error)) { return FCHK_STAGE_3_; } return NULL; } /** * Private method which performs a quick FCD check on a string, to quickly * determine if a string is in a required FCD format. * FCD is the set of strings such that for each character in the string, * decomposition without any canonical reordering will produce a NFD. * @param source string for determining if it is in a normalized format * @param sourcelength length of source to test * @paran mode normalization format from the enum UNormalizationMode * @param status A pointer to an UErrorCode to receive any errors * @return TRUE if source is in FCD format, FALSE otherwise */ U_CAPI UBool checkFCD(const UChar* source, int32_t sourcelength, UErrorCode* status) { UChar32 codepoint; const UChar *psource; const UChar *pend = 0; uint8_t oldfcdtrail = 0; uint16_t fcd = 0; if (!loadFCDCheckData(status) || U_FAILURE(*status)) { return FALSE; } if (sourcelength >= 0) { psource = source; pend = source + sourcelength; for (;;) { if (psource >= pend) { return TRUE; } /* fast route : since codepoints < NFC_ZER_CC_BLOCK_LIMIT_ has combining class 0. looking at the minimum values, surrogates are not a problem */ if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) { break; } psource ++; } } else { psource = source; for (;;) { if (*psource == 0) { return TRUE; } /* fast route : since codepoints < min has combining class 0 and YES looking at the minimum values, surrogates are not a problem */ if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) { break; } psource ++; } } /* not end of string and yet failed simple compare safe to shift back one char because the previous char has to be < 0x300 or the start of a string */ if (psource == source) { oldfcdtrail = 0; } else { codepoint = *(psource - 1); oldfcdtrail = (uint8_t)(FCHK_STAGE_3_[ FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)] & LAST_BYTE_MASK_); } if (sourcelength >= 0) { for (;;) { int count = 0; uint8_t lead; if (psource >= pend) { return TRUE; } UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); /* trie access */ fcd = FCHK_STAGE_3_[ FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)]; lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); if (lead != 0 && oldfcdtrail > lead) { return FALSE; } oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_); psource += count; } } else { for (;;) { int count = 0; uint8_t lead; UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); if (codepoint == 0) { return TRUE; } /* trie access */ fcd = FCHK_STAGE_3_[ FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + (codepoint & STAGE_3_MASK_)]; lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); if (lead != 0 && oldfcdtrail > lead) { return FALSE; } oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_); psource += count; } } return TRUE; }