scuffed-code/icu4c/source/i18n/inputext.cpp

/*
 **********************************************************************
 *   Copyright (C) 2005-2006, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "inputext.h"

#include "cmemory.h"
#include "cstring.h"

#include <string.h>

U_NAMESPACE_BEGIN

// Capacity, in bytes, of the fInputBytes working buffer allocated by the
// InputText constructor.
#define BUFFER_SIZE 8192

// Number of elements in a true array (not a pointer).  The argument is
// parenthesized so that expressions such as ARRAY_SIZE(*p) parse as intended.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocate raw, uninitialized storage for `count` objects of `type` with
// uprv_malloc().  The whole expansion is parenthesized so the macro can be
// used safely inside a larger expression.  The result is not checked here.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
// Release storage with uprv_free().
#define DELETE_ARRAY(array) uprv_free((void *) (array))

/**
 * Construct an InputText with no text and no declared encoding.
 *
 * Allocates the BUFFER_SIZE-byte working buffer and the 256-entry byte
 * statistics table.  The allocation results are not checked here.
 */
InputText::InputText()
    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // Working copy of the text to be checked.  Markup
                                                 //   will have been removed if appropriate.
      fByteStats(NEW_ARRAY(int16_t, 256)),       // Per-byte-value occurrence counts for the text in
                                                 //   fInputBytes, tallied by MungeInput().
      fDeclaredEncoding(0),
      fRawInput(0),
      fRawLength(0)
{
    // These two fields are otherwise first written by setText().  Give them
    // defined values so an object on which setText() was never called does
    // not carry indeterminate state.  They are assigned in the body because
    // the member declaration order is not visible from this file.
    fInputLen = 0;
    fC1Bytes  = FALSE;
}

/**
 * Destructor: hands the working buffer, the byte statistics table and the
 * declared-encoding copy back to uprv_free().
 */
InputText::~InputText()
{
    uprv_free((void *) (fInputBytes));
    uprv_free((void *) (fByteStats));
    uprv_free((void *) (fDeclaredEncoding));
}

/**
 * Record the raw text to be analyzed.  The bytes are not copied; only the
 * pointer and length are stored.
 *
 * @param in   the raw input bytes; may be NULL, in which case the recorded
 *             length is forced to 0 so later processing never reads through
 *             a NULL pointer.
 * @param len  number of bytes at `in`, or -1 if `in` is NUL-terminated and
 *             its length should be measured.
 */
void InputText::setText(const char *in, int32_t len)
{
    // Reset the state derived from any previous text.
    fInputLen  = 0;
    fC1Bytes   = FALSE;
    fRawInput  = (const uint8_t *) in;

    if (in == NULL) {
        // Never call uprv_strlen() on NULL, and never pair NULL with a
        // non-zero length.
        fRawLength = 0;
    } else if (len == -1) {
        fRawLength = (int32_t) uprv_strlen(in);
    } else {
        fRawLength = len;
    }
}

/**
 * Store a private, NUL-terminated copy of the encoding name declared for the
 * input.  Any previously stored name is released.
 *
 * @param encoding  the declared encoding name; if NULL the call is a no-op
 *                  and the previously stored name is kept.
 * @param len       number of chars of `encoding` to use, or -1 if `encoding`
 *                  is NUL-terminated and its length should be measured.
 *                  Other negative values are ignored (no-op).
 */
void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
{
    if (encoding == NULL) {
        return;
    }

    if (len == -1) {
        len = (int32_t) uprv_strlen(encoding);
    }

    if (len < 0) {
        return;
    }

    // One extra char for the terminating \0.
    char *copy = NEW_ARRAY(char, len + 1);

    if (copy == NULL) {
        // Allocation failed: keep whatever was stored before.
        return;
    }

    // Copy at most `len` chars and terminate explicitly.  strncpy() alone
    // does not write a \0 when the source is at least `len` chars long,
    // which is the case when the caller passes an explicit length.
    uprv_strncpy(copy, encoding, len);
    copy[len] = 0;

    uprv_free(fDeclaredEncoding);
    fDeclaredEncoding = copy;
}

/**
 * Report whether a raw input pointer is currently recorded.
 *
 * @return TRUE if fRawInput is non-NULL, FALSE otherwise.
 */
UBool InputText::isSet() const
{
    return (fRawInput == NULL) ? FALSE : TRUE;
}

/**
*  MungeInput - after getting a set of raw input data to be analyzed, preprocess
*               it by removing what appears to be html markup, then tally the
*               byte statistics for the resulting text.
*
*  On return fInputBytes / fInputLen hold at most BUFFER_SIZE bytes of text to
*  be examined, fByteStats holds the occurrence count of each byte value in
*  that text, and fC1Bytes tells whether any byte in 0x80..0x9F occurred.
*
*  @param fStripTags  TRUE to attempt markup stripping; FALSE to use the raw
*                     input as it is.
*
* @internal
*/
void InputText::MungeInput(UBool fStripTags) {
    int32_t srci     = 0;
    int32_t dsti     = 0;
    uint8_t b;
    UBool   inMarkup = FALSE;
    int32_t openTags = 0;
    int32_t badTags  = 0;

    //
    //  html / xml markup stripping.
    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
    //     discard everything within < brackets >
    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
    //     guess as to whether the input was actually marked up at all.
    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
    if (fStripTags) {
        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
            b = fRawInput[srci];

            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
                if (inMarkup) {
                    // A '<' while already inside a tag: count it as malformed.
                    badTags += 1;
                }

                inMarkup = TRUE;
                openTags += 1;
            }

            if (! inMarkup) {
                fInputBytes[dsti++] = b;
            }

            // Tested after the copy so that the closing '>' itself is discarded.
            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
                inMarkup = FALSE;
            }
        }

        fInputLen = dsti;
    }

    //
    //  If it looks like this input wasn't marked up, or if it looks like it's
    //    essentially nothing but markup abandon the markup stripping.
    //    Detection will have to work on the unstripped input.
    //  When fStripTags is FALSE, openTags is 0 and this branch is always taken.
    //
    if (openTags<5 || openTags/5 < badTags ||
        (fInputLen < 100 && fRawLength>600))
    {
        int32_t limit = fRawLength;

        if (limit > BUFFER_SIZE) {
            limit = BUFFER_SIZE;
        }

        for (srci=0; srci<limit; srci++) {
            fInputBytes[srci] = fRawInput[srci];
        }

        fInputLen = srci;
    }

    //
    // Tally up the byte occurrence statistics.
    // These are available for use by the various detectors.
    //

    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);

    for (srci = 0; srci < fInputLen; srci += 1) {
        fByteStats[fInputBytes[srci]] += 1;
    }

    // Recompute the C1 flag from scratch so that a repeated call (for example
    // with a different fStripTags value on the same text) cannot leave a stale
    // TRUE behind from an earlier pass.
    fC1Bytes = FALSE;

    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
        if (fByteStats[i] != 0) {
            fC1Bytes = TRUE;
            break;
        }
    }
}

U_NAMESPACE_END
#endif
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`/*`
			`**********************************************************************`
			`* Copyright (C) 2005-2006, International Business Machines`
			`* Corporation and others. All Rights Reserved.`
			`**********************************************************************`
			`*/`

			`#include "unicode/utypes.h"`

ICU-5198 Disable charset detection when UCONFIG_NO_CONVERSION is 1. X-SVN-Rev: 19622 2006-05-09 18:06:10 +00:00			`#if !UCONFIG_NO_CONVERSION`

ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`#include "inputext.h"`

			`#include "cmemory.h"`
			`#include "cstring.h"`

			`#include <string.h>`

			`U_NAMESPACE_BEGIN`

ICU-4639 Remove static constants from headers, fix overflow in confidence calculation in match_mbcs. X-SVN-Rev: 19122 2006-02-09 21:13:01 +00:00			`#define BUFFER_SIZE 8192`

ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])`

			`#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))`
			`#define DELETE_ARRAY(array) uprv_free((void *) (array))`

			`InputText::InputText()`
ICU-4639 Remove static constants from headers, fix overflow in confidence calculation in match_mbcs. X-SVN-Rev: 19122 2006-02-09 21:13:01 +00:00			`: fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`// removed if appropriate.`
			`fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.`
			`// Value is percent, not absolute.`
			`fDeclaredEncoding(0),`
			`fRawInput(0),`
			`fRawLength(0)`
			`{`

			`}`

			`InputText::~InputText()`
			`{`
			`DELETE_ARRAY(fDeclaredEncoding);`
			`DELETE_ARRAY(fByteStats);`
			`DELETE_ARRAY(fInputBytes);`
			`}`

			`void InputText::setText(const char *in, int32_t len)`
			`{`
			`fInputLen = 0;`
			`fC1Bytes = FALSE;`
			`fRawInput = (const uint8_t *) in;`
ICU-4639 code review comments. X-SVN-Rev: 20125 2006-08-21 23:35:23 +00:00			`fRawLength = len == -1? uprv_strlen(in) : len;`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`}`

			`void InputText::setDeclaredEncoding(const char* encoding, int32_t len)`
			`{`
			`if(encoding) {`
ICU-4639 code review comments. X-SVN-Rev: 20125 2006-08-21 23:35:23 +00:00			`if (len == -1) {`
			`len = uprv_strlen(encoding);`
			`}`

ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`len += 1; // to make place for the \0 at the end.`
ICU-5320 Don't use global new and delete X-SVN-Rev: 20063 2006-08-15 06:45:05 +00:00			`uprv_free(fDeclaredEncoding);`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`fDeclaredEncoding = NEW_ARRAY(char, len);`
			`uprv_strncpy(fDeclaredEncoding, encoding, len);`
			`}`
			`}`

			`UBool InputText::isSet() const`
			`{`
ICU-4639 code review comments. X-SVN-Rev: 20125 2006-08-21 23:35:23 +00:00			`return fRawInput != NULL;`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`}`

			`/**`
			`* MungeInput - after getting a set of raw input data to be analyzed, preprocess`
			`* it by removing what appears to be html markup.`
			`*`
			`* @internal`
			`*/`
			`void InputText::MungeInput(UBool fStripTags) {`
			`int srci = 0;`
			`int dsti = 0;`
			`uint8_t b;`
			`bool inMarkup = FALSE;`
			`int32_t openTags = 0;`
			`int32_t badTags = 0;`

			`//`
			`// html / xml markup stripping.`
			`// quick and dirty, not 100% accurate, but hopefully good enough, statistically.`
			`// discard everything within < brackets >`
			`// Count how many total '<' and illegal (nested) '<' occur, so we can make some`
			`// guess as to whether the input was actually marked up at all.`
ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`// TODO: Think about how this interacts with EBCDIC charsets that are detected.`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`if (fStripTags) {`
ICU-4639 Remove static constants from headers, fix overflow in confidence calculation in match_mbcs. X-SVN-Rev: 19122 2006-02-09 21:13:01 +00:00			`for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`b = fRawInput[srci];`

ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`if (inMarkup) {`
			`badTags += 1;`
			`}`

			`inMarkup = TRUE;`
			`openTags += 1;`
			`}`

			`if (! inMarkup) {`
			`fInputBytes[dsti++] = b;`
			`}`

ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`inMarkup = FALSE;`
ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`}`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`}`

			`fInputLen = dsti;`
			`}`

			`//`
			`// If it looks like this input wasn't marked up, or if it looks like it's`
			`// essentially nothing but markup abandon the markup stripping.`
			`// Detection will have to work on the unstripped input.`
			`//`
			`if (openTags<5 || openTags/5 < badTags ||`
ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`(fInputLen < 100 && fRawLength>600))`
			`{`
			`int32_t limit = fRawLength;`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00
ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`if (limit > BUFFER_SIZE) {`
			`limit = BUFFER_SIZE;`
			`}`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00
ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`for (srci=0; srci<limit; srci++) {`
			`fInputBytes[srci] = fRawInput[srci];`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`}`

ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`fInputLen = srci;`
			`}`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00
ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`//`
			`// Tally up the byte occurrence statistics.`
			`// These are available for use by the various detectors.`
			`//`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00
ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00
ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`for (srci = 0; srci < fInputLen; srci += 1) {`
			`fByteStats[fInputBytes[srci]] += 1;`
			`}`

			`for (int32_t i = 0x80; i <= 0x9F; i += 1) {`
			`if (fByteStats[i] != 0) {`
			`fC1Bytes = TRUE;`
			`break;`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`}`
ICU-4869 Try to fix an EBCDIC issue. X-SVN-Rev: 19942 2006-08-01 15:48:07 +00:00			`}`
ICU-4639 Initial checkin of C port of CharsetDetection. X-SVN-Rev: 19069 2006-02-06 18:03:11 +00:00			`}`

			`U_NAMESPACE_END`
ICU-5198 Disable charset detection when UCONFIG_NO_CONVERSION is 1. X-SVN-Rev: 19622 2006-05-09 18:06:10 +00:00			`#endif`