/* ********************************************************************** * Copyright (C) 2005-2006, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "csrmbcs.h" #include U_NAMESPACE_BEGIN #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) #define min(x,y) (((x)<(y))?(x):(y)) const int32_t commonChars_sjis [] = { // TODO: This set of data comes from the character frequency- // of-occurence analysis tool. The data needs to be moved // into a resource and loaded from there. 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; const int32_t commonChars_euc_jp[] = { // TODO: This set of data comes from the character frequency- // of-occurence analysis tool. The data needs to be moved // into a resource and loaded from there. 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; const int32_t commonChars_euc_kr[] = { // TODO: This set of data comes from the character frequency- // of-occurence analysis tool. The data needs to be moved // into a resource and loaded from there. 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; const int32_t commonChars_big5[] = { // TODO: This set of data comes from the character frequency- // of-occurence analysis tool. The data needs to be moved // into a resource and loaded from there. 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; const int32_t commonChars_gb_18030[] = { // TODO: This set of data comes from the character frequency- // of-occurence analysis tool. The data needs to be moved // into a resource and loaded from there. 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; static int32_t binarySearch(const int32_t *array, int32_t len, int32_t value) { int32_t start = 0, end = len-1; int32_t mid = (start+end)/2; while(start <= end) { if(array[mid] == value) { return mid; } if(array[mid] < value){ start = mid+1; } else { end = mid-1; } mid = (start+end)/2; } return -1; } iteratedChar::iteratedChar():charValue(0), index(0), nextIndex(0), error(FALSE), done(FALSE) { // nothing else to do. } void iteratedChar::reset() { charValue = 0; index = -1; nextIndex = 0; error = FALSE; done = FALSE; } int32_t iteratedChar::nextByte(InputText *det) { if (nextIndex >= det->fRawLength) { done = TRUE; return -1; } return det->fRawInput[nextIndex++]; } CharsetRecog_mbcs::~CharsetRecog_mbcs() { // nothing to do. } int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const int32_t commonChars[], int32_t commonCharsLen) { int singleByteCharCount = 0; int doubleByteCharCount = 0; int commonCharCount = 0; int badCharCount = 0; int totalCharCount = 0; int confidence = 0; iteratedChar *iter = new iteratedChar(); // { for (iter->reset(); nextChar(iter, det);) { totalCharCount += 1; if (iter->error) { badCharCount += 1; } else { if (iter->charValue <= 0xFF) { singleByteCharCount += 1; } else { doubleByteCharCount += 1; if (commonChars != 0) { if (binarySearch(commonChars, commonCharsLen, iter->charValue) >= 0){ commonCharCount += 1; } } } } if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { // Bail out early if the byte data is not matching the encoding scheme. // break detectBlock; delete iter; return confidence; } } delete iter; if (doubleByteCharCount <= 10 && badCharCount == 0) { // Not many multi-byte chars. // ASCII or ISO file? It's probably not our encoding, // but is not incompatible with our encoding, so don't give it a zero. confidence = 10; return confidence; } // // No match if there are too many characters that don't fit the encoding scheme. // (should we have zero tolerance for these?) // if (doubleByteCharCount < 20*badCharCount) { confidence = 0; return confidence; } if (commonChars == 0) { // We have no statistics on frequently occuring characters. // Assess confidence purely on having a reasonable number of // multi-byte characters (the more the better) confidence = 30 + doubleByteCharCount - 20*badCharCount; if (confidence > 100) { confidence = 100; } } else { // // Frequency of occurence statistics exist. // double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/ double scaleFactor = 90.0 / maxVal; confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0); confidence = min(confidence, 100); } if (confidence < 0) { confidence = 0; } return confidence; } CharsetRecog_sjis::~CharsetRecog_sjis() { // nothing to do } UBool CharsetRecog_sjis::nextChar(iteratedChar* it, InputText* det) { it->index = it->nextIndex; it->error = FALSE; int32_t firstByte = it->charValue = it->nextByte(det); if (firstByte < 0) { return FALSE; } if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { return TRUE; } int32_t secondByte = it->nextByte(det); if (secondByte < 0) { return FALSE; } it->charValue = (firstByte << 8) | secondByte; if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { // Illegal second byte value. it->error = TRUE; } return TRUE; } int32_t CharsetRecog_sjis::match(InputText* det) { return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis)); } const char *CharsetRecog_sjis::getName() const { return "Shift_JIS"; } const char *CharsetRecog_sjis::getLanguage() const { return "jp"; } CharsetRecog_euc::~CharsetRecog_euc() { // nothing to do } UBool CharsetRecog_euc::nextChar(iteratedChar* it, InputText* det) { int32_t firstByte = 0; int32_t secondByte = 0; int32_t thirdByte = 0; // int32_t fourthByte = 0; it->index = it->nextIndex; it->error = FALSE; firstByte = it->charValue = it->nextByte(det); if (firstByte < 0) { // Ran off the end of the input data it->done = TRUE; return (! it->done); } if (firstByte <= 0x8D) { // single byte char return (! it->done); } secondByte = it->nextByte(det); it->charValue = (it->charValue << 8) | secondByte; if (firstByte >= 0xA1 && firstByte <= 0xFE) { // Two byte Char if (secondByte < 0xA1) { it->error = TRUE; } return (! it->done); } if (firstByte == 0x8E) { // Code Set 2. // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. // We don't know which we've got. // Treat it like EUC-JP. If the data really was EUC-TW, the following two // bytes will look like a well formed 2 byte char. if (secondByte < 0xA1) { it->error = TRUE; } return (! it->done); } if (firstByte == 0x8F) { // Code set 3. // Three byte total char size, two bytes of actual char value. thirdByte = it->nextByte(det); it->charValue = (it->charValue << 8) | thirdByte; if (thirdByte < 0xa1) { it->error = TRUE; } } return (! it->done); } CharsetRecog_euc_jp::~CharsetRecog_euc_jp() { // nothing to do } const char *CharsetRecog_euc_jp::getName() const { return "EUC-JP"; } const char *CharsetRecog_euc_jp::getLanguage() const { return "jp"; } int32_t CharsetRecog_euc_jp::match(InputText *det) { return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp)); } CharsetRecog_euc_kr::~CharsetRecog_euc_kr() { // nothing to do } const char *CharsetRecog_euc_kr::getName() const { return "EUC-KR"; } const char *CharsetRecog_euc_kr::getLanguage() const { return "kr"; } int32_t CharsetRecog_euc_kr::match(InputText *det) { return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr)); } CharsetRecog_big5::~CharsetRecog_big5() { // nothing to do } UBool CharsetRecog_big5::nextChar(iteratedChar* it, InputText* det) { int32_t firstByte; it->index = it->nextIndex; it->error = FALSE; firstByte = it->charValue = it->nextByte(det); if (firstByte < 0) { return FALSE; } if (firstByte <= 0x7F || firstByte == 0xFF) { // single byte character. return TRUE; } int32_t secondByte = it->nextByte(det); if (secondByte < 0) { return FALSE; } it->charValue = (it->charValue << 8) | secondByte; if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { it->error = TRUE; } return TRUE; } const char *CharsetRecog_big5::getName() const { return "Big5"; } const char *CharsetRecog_big5::getLanguage() const { return "zh"; } int32_t CharsetRecog_big5::match(InputText *det) { return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5)); } CharsetRecog_gb_18030::~CharsetRecog_gb_18030() { // nothing to do } UBool CharsetRecog_gb_18030::nextChar(iteratedChar* it, InputText* det) { int32_t firstByte = 0; int32_t secondByte = 0; int32_t thirdByte = 0; int32_t fourthByte = 0; it->index = it->nextIndex; it->error = FALSE; firstByte = it->charValue = it->nextByte(det); if (firstByte < 0) { // Ran off the end of the input data it->done = TRUE; return (! it->done); } if (firstByte <= 0x80) { // single byte char return (! it->done); } secondByte = it->nextByte(det); it->charValue = (it->charValue << 8) | secondByte; if (firstByte >= 0x81 && firstByte <= 0xFE) { // Two byte Char if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { return (! it->done); } // Four byte char if (secondByte >= 0x30 && secondByte <= 0x39) { thirdByte = it->nextByte(det); if (thirdByte >= 0x81 && thirdByte <= 0xFE) { fourthByte = it->nextByte(det); if (fourthByte >= 0x30 && fourthByte <= 0x39) { it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; return (! it->done); } } } it->error = TRUE; return (! it->done); } return (! it->done); } const char *CharsetRecog_gb_18030::getName() const { return "GB18030"; } const char *CharsetRecog_gb_18030::getLanguage() const { return "zh"; } int32_t CharsetRecog_gb_18030::match(InputText *det) { return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030)); } U_NAMESPACE_END #endif