scuffed-code/icu4c/source/i18n/csdetect.cpp
2006-02-07 21:59:16 +00:00

343 lines
8.7 KiB
C++

/*
**********************************************************************
* Copyright (C) 2005-2006, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/ucsdet.h"
#include "csdetect.h"
#include "csmatch.h"
#include "uenumimp.h"
#include "cmemory.h"
#include "cstring.h"
#include "umutex.h"
#include "ucln_in.h"
#include "inputext.h"
#include "csrsbcs.h"
#include "csrmbcs.h"
#include "csrutf8.h"
#include "csrucode.h"
#include "csr2022.h"
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
U_CDECL_BEGIN
static CharsetRecognizer **fCSRecognizers = NULL;
static int32_t fCSRecognizers_size = 0;
static UBool U_CALLCONV csdet_cleanup(void)
{
if (fCSRecognizers != NULL) {
for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
delete fCSRecognizers[r];
fCSRecognizers[r] = NULL;
}
DELETE_ARRAY(fCSRecognizers);
fCSRecognizers = NULL;
fCSRecognizers_size = 0;
}
return TRUE;
}
U_CDECL_END
U_NAMESPACE_BEGIN
void CharsetDetector::setRecognizers()
{
UBool needsInit;
CharsetRecognizer **recognizers;
umtx_lock(NULL);
needsInit = (UBool) (fCSRecognizers == NULL);
umtx_unlock(NULL);
if (needsInit) {
CharsetRecognizer *tempArray[] = {
new CharsetRecog_UTF8(),
new CharsetRecog_UTF_16_BE(),
new CharsetRecog_UTF_16_LE(),
new CharsetRecog_UTF_32_BE(),
new CharsetRecog_UTF_32_LE(),
new CharsetRecog_8859_1_en(),
new CharsetRecog_8859_1_da(),
new CharsetRecog_8859_1_de(),
new CharsetRecog_8859_1_es(),
new CharsetRecog_8859_1_fr(),
new CharsetRecog_8859_1_it(),
new CharsetRecog_8859_1_nl(),
new CharsetRecog_8859_1_no(),
new CharsetRecog_8859_1_pt(),
new CharsetRecog_8859_1_sv(),
new CharsetRecog_8859_2_cs(),
new CharsetRecog_8859_2_hu(),
new CharsetRecog_8859_2_pl(),
new CharsetRecog_8859_2_ro(),
new CharsetRecog_8859_5_ru(),
new CharsetRecog_8859_6_ar(),
new CharsetRecog_8859_7_el(),
new CharsetRecog_8859_8_I_he(),
new CharsetRecog_8859_8_he(),
new CharsetRecog_windows_1251(),
new CharsetRecog_windows_1256(),
new CharsetRecog_KOI8_R(),
new CharsetRecog_8859_9_tr(),
new CharsetRecog_sjis(),
new CharsetRecog_gb_18030(),
new CharsetRecog_euc_jp(),
new CharsetRecog_euc_kr(),
new CharsetRecog_2022JP(),
new CharsetRecog_2022KR(),
new CharsetRecog_2022CN()
};
int32_t rCount = ARRAY_SIZE(tempArray);
int32_t r;
recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
for (r = 0; r < rCount; r += 1) {
recognizers[r] = tempArray[r];
}
umtx_lock(NULL);
if (fCSRecognizers == NULL) {
fCSRecognizers = recognizers;
fCSRecognizers_size = rCount;
}
umtx_unlock(NULL);
if (fCSRecognizers != recognizers) {
for (r = 0; r < rCount; r += 1) {
delete recognizers[r];
recognizers[r] = NULL;
}
DELETE_ARRAY(recognizers);
}
recognizers = NULL;
ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
}
}
CharsetDetector::CharsetDetector()
: textIn(new InputText()), resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
{
setRecognizers();
resultArray = new CharsetMatch *[fCSRecognizers_size];
for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
resultArray[i] = new CharsetMatch();
}
}
CharsetDetector::~CharsetDetector()
{
delete textIn;
for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
delete resultArray[i];
}
delete [] resultArray;
}
void CharsetDetector::setText(const char *in, int32_t len)
{
textIn->setText(in, len);
fFreshTextSet = TRUE;
}
UBool CharsetDetector::setStripTagsFlag(UBool flag)
{
UBool temp = fStripTags;
fStripTags = flag;
fFreshTextSet = TRUE;
return temp;
}
UBool CharsetDetector::getStripTagsFlag() const
{
return fStripTags;
}
void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
{
textIn->setDeclaredEncoding(encoding,len);
}
int32_t CharsetDetector::getDetectableCount()
{
setRecognizers();
return fCSRecognizers_size;
}
const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
{
int32_t maxMatchesFound = 0;
detectAll(maxMatchesFound, status);
if(maxMatchesFound > 0) {
return resultArray[0];
} else {
return NULL;
}
}
// this sort of conversion is explicitly mentioned in the C++ standard
// in section 4.4:
// [Note: if a program could assign a pointer of type T** to a pointer of type
// const T** (that is, if line //1 below was allowed), a program could
// inadvertently modify a const object (as it is done on line //2). For
// example,
// int32_t main() {
// const char c = 'c';
// char* pc;
// const char** pcc = &pc;//1: not allowed
// *pcc = &c;
// *pc = 'C';//2: modifies a const object
// }
const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
{
if(!textIn->isSet()) {
status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
return NULL;
} else if(fFreshTextSet) {
CharsetRecognizer *csr;
int32_t detectResults;
int32_t confidence;
textIn->MungeInput(fStripTags);
// Iterate over all possible charsets, remember all that
// give a match quality > 0.
resultCount = 0;
for (int32_t i = 0; i < fCSRecognizers_size; i += 1) {
csr = fCSRecognizers[i];
detectResults = csr->match(textIn);
confidence = detectResults;
if (confidence > 0) {
resultArray[resultCount++]->set(textIn, csr, confidence);
}
}
for(int32_t i = resultCount; i < fCSRecognizers_size; i += 1) {
resultArray[i]->set(textIn,0,0);
}
//Bubble sort
for(int32_t i = resultCount; i > 1; i -= 1) {
for(int32_t j = 0; j < i-1; j += 1) {
if(resultArray[j]->getConfidence() < resultArray[j+1]->getConfidence()) {
CharsetMatch *temp = resultArray[j];
resultArray[j]= resultArray[j+1];
resultArray[j+1]=temp;
}
}
}
fFreshTextSet = FALSE;
}
maxMatchesFound = resultCount;
return resultArray;
}
const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
{
if( index > fCSRecognizers_size-1 || index < 0) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
} else {
return fCSRecognizers[index]->getName();
}
}
U_NAMESPACE_END
U_CDECL_BEGIN
typedef struct {
int32_t currIndex;
} Context;
static void U_CALLCONV
enumClose(UEnumeration *en) {
if(en->context != NULL) {
DELETE_ARRAY(en->context);
}
DELETE_ARRAY(en);
}
static int32_t U_CALLCONV
enumCount(UEnumeration *en, UErrorCode *status) {
return fCSRecognizers_size;
}
static const char* U_CALLCONV
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode *status) {
if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
return NULL;
}
const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
*resultLength = (int32_t)uprv_strlen(currName);
((Context *)en->context)->currIndex++;
return currName;
}
static void U_CALLCONV
enumReset(UEnumeration *en, UErrorCode *status) {
((Context *)en->context)->currIndex = 0;
}
UEnumeration enumeration = {
NULL,
NULL,
enumClose,
enumCount,
uenum_unextDefault,
enumNext,
enumReset
};
U_DRAFT UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status)
{
if(U_FAILURE(*status)) {
return 0;
}
/* Initialize recognized charsets. */
CharsetDetector::getDetectableCount();
UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
memcpy(en, &enumeration, sizeof(UEnumeration));
en->context = (void*)NEW_ARRAY(Context, 1);
uprv_memset(en->context, 0, sizeof(Context));
return en;
}
U_CDECL_END