2006-02-06 18:03:11 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
|
|
|
* Copyright (C) 2005-2006, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
2006-05-09 18:06:10 +00:00
|
|
|
#if !UCONFIG_NO_CONVERSION
|
|
|
|
|
2006-02-06 18:03:11 +00:00
|
|
|
#include "inputext.h"
|
|
|
|
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "cstring.h"
|
|
|
|
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2006-02-09 21:13:01 +00:00
|
|
|
// Capacity, in bytes, of the fInputBytes working buffer; at most this many
// bytes of the raw input are ever examined.
#define BUFFER_SIZE 8192

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that any array-valued expression may be passed.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocate / release an uninitialized array of 'count' objects of 'type'
// through uprv_malloc / uprv_free. NEW_ARRAY yields whatever uprv_malloc
// returns, cast to (type *). The whole expansion is parenthesized so the
// result can be used safely inside a larger expression.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
|
|
|
|
|
|
|
/**
 * Construct an InputText with no raw input attached.
 *
 * Allocates the two working buffers and puts every other field into a
 * defined "nothing set" state, so the object can be inspected safely
 * before setText() has been called.
 *
 * NOTE(review): the results of the two allocations are not checked here,
 * and this constructor has no way to report a failure to its caller.
 */
InputText::InputText()
    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
                                                    //   removed if appropriate.
      fByteStats(NEW_ARRAY(int16_t, 256)),          // Per-byte-value occurrence counts for the bytes
                                                    //   in fInputBytes (absolute counts, filled in
                                                    //   by MungeInput).
      fDeclaredEncoding(0),
      fRawInput(0),
      fRawLength(0)
{
    // Assigned in the body rather than in the initializer list so that this
    // does not depend on the member declaration order in the class.
    fInputLen = 0;
    fC1Bytes  = FALSE;
}
|
|
|
|
|
|
|
|
/**
 * Release the heap storage owned by this object: the working byte buffer,
 * the byte statistics table and the copy of the declared encoding name.
 * The raw input (fRawInput) is not freed here.
 */
InputText::~InputText()
{
    uprv_free((void *) fInputBytes);
    uprv_free((void *) fByteStats);
    uprv_free((void *) fDeclaredEncoding);
}
|
|
|
|
|
|
|
|
void InputText::setText(const char *in, int32_t len)
|
|
|
|
{
|
|
|
|
fInputLen = 0;
|
|
|
|
fC1Bytes = FALSE;
|
|
|
|
fRawInput = (const uint8_t *) in;
|
2006-08-21 23:35:23 +00:00
|
|
|
fRawLength = len == -1? uprv_strlen(in) : len;
|
2006-02-06 18:03:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
|
|
|
|
{
|
|
|
|
if(encoding) {
|
2006-08-21 23:35:23 +00:00
|
|
|
if (len == -1) {
|
|
|
|
len = uprv_strlen(encoding);
|
|
|
|
}
|
|
|
|
|
2006-02-06 18:03:11 +00:00
|
|
|
len += 1; // to make place for the \0 at the end.
|
2006-08-15 06:45:05 +00:00
|
|
|
uprv_free(fDeclaredEncoding);
|
2006-02-06 18:03:11 +00:00
|
|
|
fDeclaredEncoding = NEW_ARRAY(char, len);
|
|
|
|
uprv_strncpy(fDeclaredEncoding, encoding, len);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Report whether raw input is currently attached to this object.
 *
 * @return TRUE if the raw input pointer is non-NULL, FALSE otherwise.
 */
UBool InputText::isSet() const
{
    const UBool hasRawInput = (NULL != fRawInput);

    return hasRawInput;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* MungeInput - after getting a set of raw input data to be analyzed, preprocess
|
|
|
|
* it by removing what appears to be html markup.
|
|
|
|
*
|
|
|
|
* @internal
|
|
|
|
*/
|
|
|
|
void InputText::MungeInput(UBool fStripTags) {
|
|
|
|
int srci = 0;
|
|
|
|
int dsti = 0;
|
|
|
|
uint8_t b;
|
|
|
|
bool inMarkup = FALSE;
|
|
|
|
int32_t openTags = 0;
|
|
|
|
int32_t badTags = 0;
|
|
|
|
|
|
|
|
//
|
|
|
|
// html / xml markup stripping.
|
|
|
|
// quick and dirty, not 100% accurate, but hopefully good enough, statistically.
|
|
|
|
// discard everything within < brackets >
|
|
|
|
// Count how many total '<' and illegal (nested) '<' occur, so we can make some
|
|
|
|
// guess as to whether the input was actually marked up at all.
|
2006-08-01 15:48:07 +00:00
|
|
|
// TODO: Think about how this interacts with EBCDIC charsets that are detected.
|
2006-02-06 18:03:11 +00:00
|
|
|
if (fStripTags) {
|
2006-02-09 21:13:01 +00:00
|
|
|
for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
|
2006-02-06 18:03:11 +00:00
|
|
|
b = fRawInput[srci];
|
|
|
|
|
2006-08-01 15:48:07 +00:00
|
|
|
if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
|
2006-02-06 18:03:11 +00:00
|
|
|
if (inMarkup) {
|
|
|
|
badTags += 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
inMarkup = TRUE;
|
|
|
|
openTags += 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (! inMarkup) {
|
|
|
|
fInputBytes[dsti++] = b;
|
|
|
|
}
|
|
|
|
|
2006-08-01 15:48:07 +00:00
|
|
|
if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
|
2006-02-06 18:03:11 +00:00
|
|
|
inMarkup = FALSE;
|
2006-08-01 15:48:07 +00:00
|
|
|
}
|
2006-02-06 18:03:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
fInputLen = dsti;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// If it looks like this input wasn't marked up, or if it looks like it's
|
|
|
|
// essentially nothing but markup abandon the markup stripping.
|
|
|
|
// Detection will have to work on the unstripped input.
|
|
|
|
//
|
|
|
|
if (openTags<5 || openTags/5 < badTags ||
|
2006-08-01 15:48:07 +00:00
|
|
|
(fInputLen < 100 && fRawLength>600))
|
|
|
|
{
|
|
|
|
int32_t limit = fRawLength;
|
2006-02-06 18:03:11 +00:00
|
|
|
|
2006-08-01 15:48:07 +00:00
|
|
|
if (limit > BUFFER_SIZE) {
|
|
|
|
limit = BUFFER_SIZE;
|
|
|
|
}
|
2006-02-06 18:03:11 +00:00
|
|
|
|
2006-08-01 15:48:07 +00:00
|
|
|
for (srci=0; srci<limit; srci++) {
|
|
|
|
fInputBytes[srci] = fRawInput[srci];
|
2006-02-06 18:03:11 +00:00
|
|
|
}
|
|
|
|
|
2006-08-01 15:48:07 +00:00
|
|
|
fInputLen = srci;
|
|
|
|
}
|
2006-02-06 18:03:11 +00:00
|
|
|
|
2006-08-01 15:48:07 +00:00
|
|
|
//
|
|
|
|
// Tally up the byte occurence statistics.
|
|
|
|
// These are available for use by the various detectors.
|
|
|
|
//
|
2006-02-06 18:03:11 +00:00
|
|
|
|
2006-08-01 15:48:07 +00:00
|
|
|
uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
|
2006-02-06 18:03:11 +00:00
|
|
|
|
2006-08-01 15:48:07 +00:00
|
|
|
for (srci = 0; srci < fInputLen; srci += 1) {
|
|
|
|
fByteStats[fInputBytes[srci]] += 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int32_t i = 0x80; i <= 0x9F; i += 1) {
|
|
|
|
if (fByteStats[i] != 0) {
|
|
|
|
fC1Bytes = TRUE;
|
|
|
|
break;
|
2006-02-06 18:03:11 +00:00
|
|
|
}
|
2006-08-01 15:48:07 +00:00
|
|
|
}
|
2006-02-06 18:03:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
2006-05-09 18:06:10 +00:00
|
|
|
#endif
|
|
|
|
|