2006-02-06 18:03:11 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
2011-07-06 04:03:35 +00:00
|
|
|
* Copyright (C) 2005-2011, International Business Machines
|
2006-02-06 18:03:11 +00:00
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
2006-05-09 18:06:10 +00:00
|
|
|
|
|
|
|
#if !UCONFIG_NO_CONVERSION
|
|
|
|
|
2006-02-07 07:50:53 +00:00
|
|
|
#include "unicode/ucsdet.h"
|
2006-02-06 18:03:11 +00:00
|
|
|
|
|
|
|
#include "csdetect.h"
|
|
|
|
#include "csmatch.h"
|
2006-02-07 07:50:53 +00:00
|
|
|
#include "uenumimp.h"
|
2006-02-06 18:03:11 +00:00
|
|
|
|
|
|
|
#include "cmemory.h"
|
2006-02-07 07:50:53 +00:00
|
|
|
#include "cstring.h"
|
2006-02-06 18:03:11 +00:00
|
|
|
#include "umutex.h"
|
|
|
|
#include "ucln_in.h"
|
2006-08-21 23:35:23 +00:00
|
|
|
#include "uarrsort.h"
|
2006-02-06 18:03:11 +00:00
|
|
|
#include "inputext.h"
|
|
|
|
#include "csrsbcs.h"
|
|
|
|
#include "csrmbcs.h"
|
|
|
|
#include "csrutf8.h"
|
|
|
|
#include "csrucode.h"
|
|
|
|
#include "csr2022.h"
|
|
|
|
|
|
|
|
/* Element count of a true array (not a pointer). The argument is parenthesized
 * so that any array-valued expression can be passed safely. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocate uninitialized storage for `count` objects of `type` via uprv_malloc().
 * The result may be NULL; no constructors are run. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

/* Release storage obtained from NEW_ARRAY via uprv_free(); no destructors are run. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
|
|
|
|
|
|
|
U_CDECL_BEGIN
|
2011-07-06 04:03:35 +00:00
|
|
|
/* Shared table of recognizer instances. It stays NULL until
 * CharsetDetector::setRecognizers() installs it, and is released by csdet_cleanup(). */
static icu::CharsetRecognizer **fCSRecognizers = NULL;
|
2006-02-07 07:50:53 +00:00
|
|
|
|
|
|
|
/* Number of entries in fCSRecognizers; 0 while no table is installed. */
static int32_t fCSRecognizers_size = 0;
|
|
|
|
|
2006-02-06 18:03:11 +00:00
|
|
|
/**
 * Cleanup callback (registered through ucln_i18n_registerCleanup): destroys
 * every recognizer held in the shared table, frees the table itself and
 * resets the bookkeeping globals so the table can be rebuilt later.
 *
 * @return always TRUE.
 */
static UBool U_CALLCONV csdet_cleanup(void)
{
    // Nothing was ever installed: there is nothing to release.
    if (fCSRecognizers == NULL) {
        return TRUE;
    }

    int32_t idx = 0;

    while (idx < fCSRecognizers_size) {
        delete fCSRecognizers[idx];
        fCSRecognizers[idx] = NULL;
        ++idx;
    }

    DELETE_ARRAY(fCSRecognizers);
    fCSRecognizers = NULL;
    fCSRecognizers_size = 0;

    return TRUE;
}
|
2006-08-22 08:31:57 +00:00
|
|
|
|
|
|
|
/**
 * Comparison callback used when sorting an array of CharsetMatch pointers.
 *
 * @param left   address of one array element (a CharsetMatch pointer).
 * @param right  address of another array element (a CharsetMatch pointer).
 * @return a positive value when the right match has the higher confidence,
 *         a negative value when the left one does, and 0 when they are equal.
 */
static int32_t U_CALLCONV
charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
{
    U_NAMESPACE_USE

    const CharsetMatch *lhs = *(const CharsetMatch * const *) left;
    const CharsetMatch *rhs = *(const CharsetMatch * const *) right;

    // Deliberately "right minus left": the resulting order is descending,
    // i.e. the most confident match ends up first.
    return rhs->getConfidence() - lhs->getConfidence();
}
|
|
|
|
|
2006-02-06 18:03:11 +00:00
|
|
|
U_CDECL_END
|
|
|
|
|
2006-02-07 07:50:53 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
2006-02-06 18:03:11 +00:00
|
|
|
|
2006-08-21 23:35:23 +00:00
|
|
|
/**
 * Lazily builds the shared recognizer table (fCSRecognizers).
 *
 * The table is constructed outside the global mutex and installed under it.
 * If another thread installed a table first, or if any allocation failed,
 * the locally built table and all of its recognizers are destroyed again.
 *
 * @param status  in/out error code. Nothing is done when it already signals
 *                failure on entry; set to U_MEMORY_ALLOCATION_ERROR when the
 *                table or any recognizer could not be allocated.
 */
void CharsetDetector::setRecognizers(UErrorCode &status)
{
    UBool needsInit;
    CharsetRecognizer **recognizers;

    if (U_FAILURE(status)) {
        return;
    }

    UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);

    if (needsInit) {
        CharsetRecognizer *tempArray[] = {
            new CharsetRecog_UTF8(),

            new CharsetRecog_UTF_16_BE(),
            new CharsetRecog_UTF_16_LE(),
            new CharsetRecog_UTF_32_BE(),
            new CharsetRecog_UTF_32_LE(),

            new CharsetRecog_8859_1_en(),
            new CharsetRecog_8859_1_da(),
            new CharsetRecog_8859_1_de(),
            new CharsetRecog_8859_1_es(),
            new CharsetRecog_8859_1_fr(),
            new CharsetRecog_8859_1_it(),
            new CharsetRecog_8859_1_nl(),
            new CharsetRecog_8859_1_no(),
            new CharsetRecog_8859_1_pt(),
            new CharsetRecog_8859_1_sv(),
            new CharsetRecog_8859_2_cs(),
            new CharsetRecog_8859_2_hu(),
            new CharsetRecog_8859_2_pl(),
            new CharsetRecog_8859_2_ro(),
            new CharsetRecog_8859_5_ru(),
            new CharsetRecog_8859_6_ar(),
            new CharsetRecog_8859_7_el(),
            new CharsetRecog_8859_8_I_he(),
            new CharsetRecog_8859_8_he(),
            new CharsetRecog_windows_1251(),
            new CharsetRecog_windows_1256(),
            new CharsetRecog_KOI8_R(),
            new CharsetRecog_8859_9_tr(),
            new CharsetRecog_sjis(),
            new CharsetRecog_gb_18030(),
            new CharsetRecog_euc_jp(),
            new CharsetRecog_euc_kr(),
            new CharsetRecog_big5(),

            new CharsetRecog_2022JP(),
            new CharsetRecog_2022KR(),
            new CharsetRecog_2022CN(),

            new CharsetRecog_IBM424_he_rtl(),
            new CharsetRecog_IBM424_he_ltr(),
            new CharsetRecog_IBM420_ar_rtl(),
            new CharsetRecog_IBM420_ar_ltr()
        };
        int32_t rCount = ARRAY_SIZE(tempArray);
        int32_t r;
        UBool adopted = FALSE;

        recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);

        if (recognizers == NULL) {
            // The table could not be allocated: the recognizers created above
            // have no owner yet, so destroy them here instead of leaking them.
            for (r = 0; r < rCount; r += 1) {
                delete tempArray[r];
            }

            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        // Copy every slot, even after a failed allocation has been seen, so the
        // table never holds uninitialized pointers and the cleanup loop below
        // can safely delete all of its entries.
        for (r = 0; r < rCount; r += 1) {
            recognizers[r] = tempArray[r];

            if (recognizers[r] == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
        }

        if (U_SUCCESS(status)) {
            umtx_lock(NULL);
            if (fCSRecognizers == NULL) {
                fCSRecognizers_size = rCount;
                fCSRecognizers = recognizers;
                // Ownership decision is recorded while the lock is held.
                adopted = TRUE;
            }
            umtx_unlock(NULL);
        }

        if (!adopted) {
            // Either an allocation failed or another thread installed its
            // table first: discard the table built by this call.
            for (r = 0; r < rCount; r += 1) {
                delete recognizers[r];
                recognizers[r] = NULL;
            }

            DELETE_ARRAY(recognizers);
        }

        recognizers = NULL;
        ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
    }
}
|
|
|
|
|
2006-08-21 23:35:23 +00:00
|
|
|
/**
 * Creates a detector with no input text, tag stripping disabled and one
 * preallocated CharsetMatch slot per known recognizer.
 *
 * @param status  in/out error code. Set to U_MEMORY_ALLOCATION_ERROR when the
 *                input holder, the result array or any result slot cannot be
 *                allocated; the object must not be used for detection then.
 */
CharsetDetector::CharsetDetector(UErrorCode &status)
  : textIn(new InputText(status)), resultArray(NULL),
    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
{
    if (U_FAILURE(status)) {
        return;
    }

    // Every other method dereferences textIn unconditionally.
    if (textIn == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    setRecognizers(status);

    if (U_FAILURE(status)) {
        return;
    }

    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);

    if (resultArray == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Clear all slots first: if an allocation below fails part-way through,
    // the remaining entries are NULL and therefore safe to delete.
    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
        resultArray[i] = NULL;
    }

    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
        resultArray[i] = new CharsetMatch();

        if (resultArray[i] == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            break;
        }
    }
}
|
|
|
|
|
2006-02-06 20:45:30 +00:00
|
|
|
/**
 * Releases the input holder and every preallocated CharsetMatch slot.
 */
CharsetDetector::~CharsetDetector()
{
    delete textIn;

    // The constructor can return early on error and leave resultArray NULL;
    // indexing it in that state would dereference a null pointer.
    if (resultArray != NULL) {
        for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
            delete resultArray[i];
        }

        uprv_free(resultArray);
    }
}
|
|
|
|
|
2006-02-06 20:45:30 +00:00
|
|
|
void CharsetDetector::setText(const char *in, int32_t len)
|
2006-02-06 18:03:11 +00:00
|
|
|
{
|
|
|
|
textIn->setText(in, len);
|
|
|
|
fFreshTextSet = TRUE;
|
|
|
|
}
|
|
|
|
|
2006-02-06 20:45:30 +00:00
|
|
|
/**
 * Changes the strip-tags setting (passed to InputText::MungeInput() by
 * detectAll()) and flags the detector so that the next detectAll() call
 * re-runs detection.
 *
 * @param flag  the new setting.
 * @return the setting that was in effect before this call.
 */
UBool CharsetDetector::setStripTagsFlag(UBool flag)
{
    const UBool previous = fStripTags;

    fStripTags    = flag;
    fFreshTextSet = TRUE;

    return previous;
}
|
|
|
|
|
2006-02-06 20:45:30 +00:00
|
|
|
/**
 * @return the current strip-tags setting.
 */
UBool CharsetDetector::getStripTagsFlag() const
{
    return this->fStripTags;
}
|
|
|
|
|
2006-02-06 20:45:30 +00:00
|
|
|
void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
|
2006-02-06 18:03:11 +00:00
|
|
|
{
|
|
|
|
textIn->setDeclaredEncoding(encoding,len);
|
|
|
|
}
|
|
|
|
|
2006-02-06 20:45:30 +00:00
|
|
|
/**
 * Makes sure the shared recognizer table has been built and reports its size.
 *
 * @return the number of entries in the recognizer table. Any error raised
 *         while building the table is discarded; the value returned is
 *         whatever the size global holds afterwards.
 */
int32_t CharsetDetector::getDetectableCount()
{
    UErrorCode localStatus = U_ZERO_ERROR;

    // Called for its side effect only; localStatus is intentionally not inspected.
    setRecognizers(localStatus);

    return fCSRecognizers_size;
}
|
|
|
|
|
2006-02-06 20:45:30 +00:00
|
|
|
/**
 * Runs detection and returns the best match only.
 *
 * @param status  in/out error code, passed through to detectAll().
 * @return the first entry of the sorted result array, or NULL when
 *         detectAll() reported no matches.
 */
const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
{
    int32_t matchCount = 0;

    detectAll(matchCount, status);

    // matchCount stays 0 when detectAll() produced nothing.
    return (matchCount > 0) ? resultArray[0] : NULL;
}
|
|
|
|
|
2006-02-06 20:45:30 +00:00
|
|
|
/**
 * Runs every recognizer against the current input and returns the matches,
 * ordered from highest to lowest confidence, with at most one entry per
 * charset name.
 *
 * Detection is only re-run when the text or the strip-tags flag changed since
 * the previous call; otherwise the cached results are returned.
 *
 * @param maxMatchesFound  receives the number of valid entries at the front
 *                         of the returned array (0 when no text has been set).
 * @param status           in/out error code. Set to U_MISSING_RESOURCE_ERROR
 *                         when no input text has been set; may also be set by
 *                         the sort routine.
 * @return the detector-owned result array, or NULL when no text has been set.
 */
const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
{
    if(!textIn->isSet()) {
        status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set

        // Give the out-parameter a defined value on the error path as well.
        maxMatchesFound = 0;

        return NULL;
    } else if(fFreshTextSet) {
        CharsetRecognizer *csr;
        int32_t confidence;
        int32_t i;

        textIn->MungeInput(fStripTags);

        // Iterate over all possible charsets, remember all that
        // give a match quality > 0.
        resultCount = 0;
        for (i = 0; i < fCSRecognizers_size; i += 1) {
            csr = fCSRecognizers[i];
            confidence = csr->match(textIn);

            if (confidence > 0) {
                resultArray[resultCount++]->set(textIn, csr, confidence);
            }
        }

        // Reset the unused slots so they do not keep stale results.
        for(i = resultCount; i < fCSRecognizers_size; i += 1) {
            resultArray[i]->set(textIn, 0, 0);
        }

        uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);

        // Remove duplicate charsets from the results.
        // Simple minded, brute force approach - check each entry against all that follow.
        // The first entry of any duplicated set is the one that should be kept because it will
        // be the one with the highest confidence rating.
        // (Duplicate matches have different languages, only the charset is the same)
        // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
        // deleted, just reordered, with the unwanted duplicates placed after the good results.
        int32_t j, k;
        for (i=0; i<resultCount; i++) {
            const char *charSetName = resultArray[i]->getName();
            for (j=i+1; j<resultCount; ) {
                if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
                    // Not a duplicate.
                    j++;
                } else {
                    // Duplicate entry at index j: shift the tail down by one and
                    // park the duplicate just past the end of the valid results.
                    // j is not advanced because a new entry now sits at index j.
                    CharsetMatch *duplicate = resultArray[j];
                    for (k=j; k<resultCount-1; k++) {
                        resultArray[k] = resultArray[k+1];
                    }
                    resultCount--;
                    resultArray[resultCount] = duplicate;
                }
            }
        }

        fFreshTextSet = FALSE;
    }

    maxMatchesFound = resultCount;

    return resultArray;
}
|
|
|
|
|
2006-11-16 20:32:23 +00:00
|
|
|
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
|
2006-02-06 18:03:11 +00:00
|
|
|
{
|
|
|
|
if( index > fCSRecognizers_size-1 || index < 0) {
|
|
|
|
status = U_INDEX_OUTOFBOUNDS_ERROR;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
return fCSRecognizers[index]->getName();
|
|
|
|
}
|
2006-11-16 20:32:23 +00:00
|
|
|
}*/
|
2006-02-06 18:03:11 +00:00
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
2006-02-07 07:50:53 +00:00
|
|
|
U_CDECL_BEGIN
|
|
|
|
/* Per-enumeration cursor: index into fCSRecognizers of the next name
 * that enumNext() will return. */
typedef struct Context {
    int32_t currIndex;
} Context;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void U_CALLCONV
|
|
|
|
enumClose(UEnumeration *en) {
|
|
|
|
if(en->context != NULL) {
|
|
|
|
DELETE_ARRAY(en->context);
|
|
|
|
}
|
|
|
|
|
|
|
|
DELETE_ARRAY(en);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Count callback: the enumeration always spans the whole recognizer table,
 * so both arguments are ignored.
 *
 * @return the current number of entries in fCSRecognizers.
 */
static int32_t U_CALLCONV
enumCount(UEnumeration * /*en*/, UErrorCode * /*status*/) {
    return fCSRecognizers_size;
}
|
|
|
|
|
|
|
|
static const char* U_CALLCONV
|
2007-03-17 06:13:14 +00:00
|
|
|
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
|
2006-02-07 07:50:53 +00:00
|
|
|
if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
|
2006-08-06 22:38:31 +00:00
|
|
|
if(resultLength != NULL) {
|
|
|
|
*resultLength = 0;
|
|
|
|
}
|
2006-02-07 07:50:53 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
|
2006-08-06 22:38:31 +00:00
|
|
|
if(resultLength != NULL) {
|
|
|
|
*resultLength = (int32_t)uprv_strlen(currName);
|
|
|
|
}
|
2006-02-07 07:50:53 +00:00
|
|
|
((Context *)en->context)->currIndex++;
|
|
|
|
|
|
|
|
return currName;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void U_CALLCONV
|
2006-03-22 09:48:15 +00:00
|
|
|
enumReset(UEnumeration *en, UErrorCode *) {
|
2006-02-07 07:50:53 +00:00
|
|
|
((Context *)en->context)->currIndex = 0;
|
|
|
|
}
|
|
|
|
|
2006-02-24 19:57:04 +00:00
|
|
|
/* Template copied into every enumeration created by
 * ucsdet_getAllDetectableCharsets(). The entries are positional; the two
 * leading NULLs are presumably the context pointers of UEnumeration
 * (NOTE(review): confirm the field order against uenumimp.h). The `context`
 * field is assigned per instance after the copy. */
static const UEnumeration gCSDetEnumeration = {
    NULL,
    NULL,
    enumClose,           /* frees the cursor and the enumeration */
    enumCount,           /* size of the recognizer table */
    uenum_unextDefault,
    enumNext,            /* returns the next recognizer name */
    enumReset            /* rewinds the cursor */
};
|
|
|
|
|
2006-03-22 09:48:15 +00:00
|
|
|
/**
 * Creates an enumeration over the names of all recognizers in the shared
 * table. The detector argument is unused.
 *
 * @param status  in/out error code; must not be NULL. Nothing is done when it
 *                already signals failure. Set to U_MEMORY_ALLOCATION_ERROR
 *                when the enumeration or its cursor cannot be allocated.
 * @return a newly allocated enumeration (released through its close
 *         callback, enumClose), or 0 on failure.
 */
U_CAPI UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
{
    U_NAMESPACE_USE

    if(U_FAILURE(*status)) {
        return 0;
    }

    /* Initialize recognized charsets. */
    CharsetDetector::getDetectableCount();

    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);

    if (en == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));

    Context *cursor = NEW_ARRAY(Context, 1);

    if (cursor == NULL) {
        /* Do not hand out an enumeration without a cursor: the callbacks
         * dereference en->context unconditionally. */
        DELETE_ARRAY(en);
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    uprv_memset(cursor, 0, sizeof(Context));
    en->context = (void *) cursor;

    return en;
}
|
|
|
|
U_CDECL_END
|
2006-05-09 18:06:10 +00:00
|
|
|
|
|
|
|
#endif
|
2007-03-17 06:13:14 +00:00
|
|
|
|