2006-02-06 18:03:11 +00:00
/*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright ( C ) 2005 - 2006 , International Business Machines
* Corporation and others . All Rights Reserved .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
# include "unicode/utypes.h"
# include "unicode/ucsdet.h"
# include "unicode/ucnv.h"
# include "unicode/unistr.h"
# include "unicode/putil.h"
# include "intltest.h"
# include "csdetest.h"
# include "xmlparser.h"
# include <stdlib.h>
# include <string.h>
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
# include <stdio.h>
# endif
2006-02-06 18:03:11 +00:00
/* Number of elements in a true array. Not valid for pointers or array parameters. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/*
 * Heap helpers built on malloc()/free().
 * NEW_ARRAY does not check for allocation failure: callers must test for NULL.
 * Memory obtained with NEW_ARRAY must be released with DELETE_ARRAY.
 * The whole NEW_ARRAY expansion is parenthesized so it can be used safely
 * inside a larger expression.
 */
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/ malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/ free((void *) (array))

/* Separator characters handed to split() by the tests in this file. */
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
//---------------------------------------------------------------------------
//
// Test class boilerplate
//
//---------------------------------------------------------------------------
// Constructor: the body is empty, no setup is performed here.
CharsetDetectionTest : : CharsetDetectionTest ( )
{
}
// Destructor: the body is empty, nothing is released here.
CharsetDetectionTest : : ~ CharsetDetectionTest ( )
{
}
void CharsetDetectionTest : : runIndexedTest ( int32_t index , UBool exec , const char * & name , char * /*par*/ )
{
if ( exec ) logln ( " TestSuite CharsetDetectionTest: " ) ;
switch ( index ) {
case 0 : name = " ConstructionTest " ;
if ( exec ) ConstructionTest ( ) ;
break ;
case 1 : name = " UTF8Test " ;
if ( exec ) UTF8Test ( ) ;
break ;
case 2 : name = " UTF16Test " ;
if ( exec ) UTF16Test ( ) ;
break ;
case 3 : name = " C1BytesTest " ;
if ( exec ) C1BytesTest ( ) ;
break ;
case 4 : name = " InputFilterTest " ;
if ( exec ) InputFilterTest ( ) ;
break ;
case 5 : name = " DetectionTest " ;
if ( exec ) DetectionTest ( ) ;
break ;
default : name = " " ;
break ; //needed to end loop
}
}
/**
 * Break a string into the pieces found between occurrences of a separator.
 *
 * @param src     the string to split
 * @param ch      the separator character
 * @param splits  receives the number of pieces (separator count + 1)
 * @return an array of "splits" strings allocated with new[];
 *         the caller releases it with delete[]
 */
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    // First pass: count the separators so the result array can be sized.
    splits = 1;
    for (int32_t at = src.indexOf(ch, 0); at >= 0; at = src.indexOf(ch, at + 1)) {
        splits += 1;
    }

    UnicodeString *pieces = new UnicodeString[splits];

    // Second pass: copy out each piece that ends at a separator...
    int32_t from = 0;
    int32_t slot = 0;
    for (int32_t to = src.indexOf(ch, from); to >= 0; to = src.indexOf(ch, from)) {
        src.extractBetween(from, to, pieces[slot]);
        slot += 1;
        from = to + 1;
    }

    // ...and finally the piece that runs to the end of the string.
    src.extractBetween(from, src.length(), pieces[slot]);

    return pieces;
}
static char * extractBytes ( const UnicodeString & source , const char * codepage , int32_t & length )
{
int32_t sLength = source . length ( ) ;
2006-07-28 22:58:29 +00:00
char * bytes = NULL ;
2006-02-06 18:03:11 +00:00
length = source . extract ( 0 , sLength , NULL , codepage ) ;
2006-07-28 22:58:29 +00:00
if ( length > 0 ) {
bytes = NEW_ARRAY ( char , length + 1 ) ;
source . extract ( 0 , sLength , bytes , codepage ) ;
}
2006-02-06 18:03:11 +00:00
return bytes ;
}
// Release a buffer returned by extractBytes().
// Goes through DELETE_ARRAY so that it stays paired with the NEW_ARRAY
// allocation made in extractBytes().
static void freeBytes ( char * bytes )
{
DELETE_ARRAY ( bytes ) ;
}
void CharsetDetectionTest : : checkEncoding ( const UnicodeString & testString , const UnicodeString & encoding , const UnicodeString & id )
{
int32_t splits = 0 ;
int32_t testLength = testString . length ( ) ;
UnicodeString * eSplit = split ( encoding , CH_SLASH , splits ) ;
UErrorCode status = U_ZERO_ERROR ;
int32_t cpLength = eSplit [ 0 ] . length ( ) ;
char codepage [ 64 ] ;
u_UCharsToChars ( eSplit [ 0 ] . getBuffer ( ) , codepage , cpLength ) ;
codepage [ cpLength ] = ' \0 ' ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
int32_t byteLength = 0 ;
char * bytes = extractBytes ( testString , codepage , byteLength ) ;
2006-07-28 22:58:29 +00:00
if ( bytes = = NULL ) {
# if !UCONFIG_NO_LEGACY_CONVERSION
errln ( " Can't open a " + encoding + " converter for " + id ) ;
# endif
return ;
}
2006-02-06 18:03:11 +00:00
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
2006-02-13 20:47:36 +00:00
int32_t matchCount = 0 ;
const UCharsetMatch * * matches = ucsdet_detectAll ( csd , & matchCount , & status ) ;
2006-02-07 21:59:16 +00:00
2006-02-13 20:47:36 +00:00
UnicodeString name ( ucsdet_getName ( matches [ 0 ] , & status ) ) ;
UnicodeString lang ( ucsdet_getLanguage ( matches [ 0 ] , & status ) ) ;
2006-02-06 18:03:11 +00:00
UChar * decoded = NULL ;
int32_t dLength = 0 ;
2006-02-13 20:47:36 +00:00
if ( matchCount = = 0 ) {
2006-02-07 21:59:16 +00:00
errln ( " Encoding detection failure for " + id + " : expected " + eSplit [ 0 ] + " , got no matches " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
if ( name . compare ( eSplit [ 0 ] ) ! = 0 ) {
errln ( " Encoding detection failure for " + id + " : expected " + eSplit [ 0 ] + " , got " + name ) ;
2006-02-07 19:12:43 +00:00
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
2006-02-07 19:12:43 +00:00
for ( int32_t m = 0 ; m < matchCount ; m + = 1 ) {
const char * name = ucsdet_getName ( matches [ m ] , & status ) ;
const char * lang = ucsdet_getLanguage ( matches [ m ] , & status ) ;
int32_t confidence = ucsdet_getConfidence ( matches [ m ] , & status ) ;
2006-02-09 21:13:01 +00:00
printf ( " %s (%s) %d \n " , name , lang , confidence ) ;
2006-02-07 19:12:43 +00:00
}
2006-02-09 21:13:01 +00:00
# endif
2006-02-06 18:03:11 +00:00
goto bail ;
}
if ( splits > 1 & & lang . compare ( eSplit [ 1 ] ) ! = 0 ) {
errln ( " Language detection failure for " + id + " , " + eSplit [ 0 ] + " : expected " + eSplit [ 1 ] + " , got " + lang ) ;
goto bail ;
}
decoded = NEW_ARRAY ( UChar , testLength ) ;
2006-02-13 20:47:36 +00:00
dLength = ucsdet_getUChars ( matches [ 0 ] , decoded , testLength , & status ) ;
2006-02-06 18:03:11 +00:00
if ( testString . compare ( decoded , dLength ) ! = 0 ) {
errln ( " Round-trip error for " + id + " , " + eSplit [ 0 ] + " : getUChars() didn't yeild the original string. " ) ;
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
for ( int32_t i = 0 ; i < testLength ; i + = 1 ) {
if ( testString [ i ] ! = decoded [ i ] ) {
printf ( " Strings differ at byte %d \n " , i ) ;
break ;
}
}
# endif
2006-02-06 18:03:11 +00:00
}
DELETE_ARRAY ( decoded ) ;
bail :
freeBytes ( bytes ) ;
ucsdet_close ( csd ) ;
delete [ ] eSplit ;
}
const char * CharsetDetectionTest : : getPath ( char buffer [ 2048 ] , const char * filename ) {
UErrorCode status = U_ZERO_ERROR ;
const char * testDataDirectory = IntlTest : : getSourceTestData ( status ) ;
if ( U_FAILURE ( status ) ) {
errln ( " ERROR: getPath() failed - %s " , u_errorName ( status ) ) ;
return NULL ;
}
strcpy ( buffer , testDataDirectory ) ;
strcat ( buffer , filename ) ;
return buffer ;
}
void CharsetDetectionTest : : ConstructionTest ( )
{
UErrorCode status = U_ZERO_ERROR ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
UEnumeration * e = ucsdet_getAllDetectableCharsets ( csd , & status ) ;
int32_t count = uenum_count ( e , & status ) ;
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
printf ( " There are %d recognizers. \n " , count ) ;
# endif
2006-02-06 18:03:11 +00:00
for ( int32_t i = 0 ; i < count ; i + = 1 ) {
int32_t length ;
const char * name = uenum_next ( e , & length , & status ) ;
if ( name = = NULL | | length < = 0 ) {
2006-02-09 21:13:01 +00:00
errln ( " ucsdet_getAllDetectableCharsets() returned a null or empty name! " ) ;
2006-02-06 18:03:11 +00:00
}
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
printf ( " %s \n " , name ) ;
# endif
2006-02-06 18:03:11 +00:00
}
uenum_close ( e ) ;
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : UTF8Test ( )
{
UErrorCode status = U_ZERO_ERROR ;
UnicodeString ss = " This is a string with some non-ascii characters that will "
" be converted to UTF-8, then shoved through the detection process. "
" \\ u0391 \\ u0392 \\ u0393 \\ u0394 \\ u0395 "
" Sure would be nice if our source could contain Unicode directly! " ;
UnicodeString s = ss . unescape ( ) ;
int32_t byteLength = 0 , sLength = s . length ( ) ;
char * bytes = extractBytes ( s , " UTF-8 " , byteLength ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
UChar * detected = NEW_ARRAY ( UChar , sLength ) ;
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Detection failure for UTF-8: got no matches. " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
ucsdet_getUChars ( match , detected , sLength , & status ) ;
if ( s . compare ( detected , sLength ) ! = 0 ) {
errln ( " Round-trip test failed! " ) ;
}
2006-02-07 21:59:16 +00:00
ucsdet_setDeclaredEncoding ( csd , " UTF-8 " , 5 , & status ) ; /* for coverage */
bail :
2006-02-06 18:03:11 +00:00
DELETE_ARRAY ( detected ) ;
freeBytes ( bytes ) ;
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : UTF16Test ( )
{
UErrorCode status = U_ZERO_ERROR ;
/* Notice the BOM on the start of this string */
UChar chars [ ] = {
0xFEFF , 0x0623 , 0x0648 , 0x0631 , 0x0648 , 0x0628 , 0x0627 , 0x002C ,
0x0020 , 0x0628 , 0x0631 , 0x0645 , 0x062c , 0x064a , 0x0627 , 0x062a ,
0x0020 , 0x0627 , 0x0644 , 0x062d , 0x0627 , 0x0633 , 0x0648 , 0x0628 ,
0x0020 , 0x002b , 0x0020 , 0x0627 , 0x0646 , 0x062a , 0x0631 , 0x0646 ,
0x064a , 0x062a , 0x0000 } ;
UnicodeString s ( chars ) ;
int32_t beLength = 0 , leLength = 0 ;
char * beBytes = extractBytes ( s , " UTF-16BE " , beLength ) ;
char * leBytes = extractBytes ( s , " UTF-16LE " , leLength ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
const char * name ;
2006-02-07 21:59:16 +00:00
int32_t conf ;
2006-02-06 18:03:11 +00:00
ucsdet_setText ( csd , beBytes , beLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Encoding detection failure for UTF-16BE: got no matches. " ) ;
goto try_le ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
2006-02-07 21:59:16 +00:00
conf = ucsdet_getConfidence ( match , & status ) ;
2006-02-06 18:03:11 +00:00
if ( strcmp ( name , " UTF-16BE " ) ! = 0 ) {
errln ( " Encoding detection failure for UTF-16BE: got %s " , name ) ;
}
2006-02-07 21:59:16 +00:00
if ( conf ! = 100 ) {
errln ( " Did not get 100%% confidence for UTF-16BE: got %d " , conf ) ;
}
try_le :
2006-02-06 18:03:11 +00:00
ucsdet_setText ( csd , leBytes , leLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Encoding detection failure for UTF-16LE: got no matches. " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
2006-02-07 21:59:16 +00:00
conf = ucsdet_getConfidence ( match , & status ) ;
2006-02-06 18:03:11 +00:00
if ( strcmp ( name , " UTF-16LE " ) ! = 0 ) {
errln ( " Enconding detection failure for UTF-16LE: got %s " , name ) ;
}
2006-02-07 21:59:16 +00:00
if ( conf ! = 100 ) {
errln ( " Did not get 100%% confidence for UTF-16LE: got %d " , conf ) ;
}
bail :
2006-02-06 18:03:11 +00:00
freeBytes ( leBytes ) ;
freeBytes ( beBytes ) ;
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : InputFilterTest ( )
{
UErrorCode status = U_ZERO_ERROR ;
UnicodeString ss = " <a> <lot> <of> <English> <inside> <the> <markup> Un tr \\ u00E8s petit peu de Fran \\ u00E7ais. <to> <confuse> <the> <detector> " ;
UnicodeString s = ss . unescape ( ) ;
int32_t byteLength = 0 ;
char * bytes = extractBytes ( s , " ISO-8859-1 " , byteLength ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
2006-02-09 21:13:01 +00:00
const char * lang , * name ;
2006-02-06 18:03:11 +00:00
ucsdet_enableInputFilter ( csd , TRUE ) ;
if ( ! ucsdet_isInputFilterEnabled ( csd ) ) {
errln ( " ucsdet_enableInputFilter(csd, TRUE) did not enable input filter! " ) ;
}
2006-02-09 21:13:01 +00:00
2006-02-06 18:03:11 +00:00
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Turning on the input filter resulted in no matches. " ) ;
goto turn_off ;
}
2006-02-09 21:13:01 +00:00
name = ucsdet_getName ( match , & status ) ;
if ( name = = NULL | | strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
2006-02-10 23:49:09 +00:00
errln ( " Turning on the input filter resulted in %s rather than ISO-8859-1. " , name ) ;
2006-02-09 21:13:01 +00:00
} else {
lang = ucsdet_getLanguage ( match , & status ) ;
2006-02-06 18:03:11 +00:00
2006-02-09 21:13:01 +00:00
if ( lang = = NULL | | strcmp ( lang , " fr " ) ! = 0 ) {
errln ( " Input filter did not strip markup! " ) ;
}
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
turn_off :
2006-02-06 18:03:11 +00:00
ucsdet_enableInputFilter ( csd , FALSE ) ;
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Turning off the input filter resulted in no matches. " ) ;
goto bail ;
}
2006-02-09 21:13:01 +00:00
name = ucsdet_getName ( match , & status ) ;
2006-02-06 18:03:11 +00:00
2006-02-09 21:13:01 +00:00
if ( name = = NULL | | strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
2006-02-10 23:49:09 +00:00
errln ( " Turning off the input filter resulted in %s rather than ISO-8859-1. " , name ) ;
2006-02-09 21:13:01 +00:00
} else {
lang = ucsdet_getLanguage ( match , & status ) ;
if ( lang = = NULL | | strcmp ( lang , " en " ) ! = 0 ) {
errln ( " Unfiltered input did not detect as English! " ) ;
}
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
bail :
2006-02-06 18:03:11 +00:00
freeBytes ( bytes ) ;
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : C1BytesTest ( )
{
2006-07-28 22:58:29 +00:00
# if !UCONFIG_NO_LEGACY_CONVERSION
2006-02-06 18:03:11 +00:00
UErrorCode status = U_ZERO_ERROR ;
UnicodeString sISO = " This is a small sample of some English text. Just enough to be sure that it detects correctly. " ;
UnicodeString ssWindows = " This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\ u201CC1 \\ u201D bytes. " ;
UnicodeString sWindows = ssWindows . unescape ( ) ;
int32_t lISO = 0 , lWindows = 0 ;
char * bISO = extractBytes ( sISO , " ISO-8859-1 " , lISO ) ;
char * bWindows = extractBytes ( sWindows , " windows-1252 " , lWindows ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
const char * name ;
ucsdet_setText ( csd , bWindows , lWindows , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " English test with C1 bytes got no matches. " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " windows-1252 " ) ! = 0 ) {
2006-02-07 19:12:43 +00:00
errln ( " English text with C1 bytes does not detect as windows-1252, but as %s " , name ) ;
2006-02-06 18:03:11 +00:00
}
ucsdet_setText ( csd , bISO , lISO , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " English text without C1 bytes got no matches. " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
2006-02-07 19:12:43 +00:00
errln ( " English text without C1 bytes does not detect as ISO-8859-1, but as %s " , name ) ;
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
bail :
2006-02-06 18:03:11 +00:00
freeBytes ( bWindows ) ;
freeBytes ( bISO ) ;
ucsdet_close ( csd ) ;
2006-07-28 22:58:29 +00:00
# endif
2006-02-06 18:03:11 +00:00
}
void CharsetDetectionTest : : DetectionTest ( )
{
2006-08-02 22:29:50 +00:00
# if !UCONFIG_NO_REGULAR_EXPRESSIONS
2006-02-06 18:03:11 +00:00
UErrorCode status = U_ZERO_ERROR ;
char path [ 2048 ] ;
const char * testFilePath = getPath ( path , " csdetest.xml " ) ;
if ( testFilePath = = NULL ) {
return ; /* Couldn't get path: error message already output. */
}
UXMLParser * parser = UXMLParser : : createParser ( status ) ;
2006-07-21 22:01:55 +00:00
if ( ! assertSuccess ( " UXMLParser::createParser " , status ) ) return ;
2006-02-06 18:03:11 +00:00
UXMLElement * root = parser - > parseFile ( testFilePath , status ) ;
2006-07-21 22:01:55 +00:00
if ( ! assertSuccess ( " parseFile " , status ) ) return ;
2006-02-06 18:03:11 +00:00
UnicodeString test_case = UNICODE_STRING_SIMPLE ( " test-case " ) ;
UnicodeString id_attr = UNICODE_STRING_SIMPLE ( " id " ) ;
UnicodeString enc_attr = UNICODE_STRING_SIMPLE ( " encodings " ) ;
const UXMLElement * testCase ;
int32_t tc = 0 ;
while ( ( testCase = root - > nextChildElement ( tc ) ) ! = NULL ) {
if ( testCase - > getTagName ( ) . compare ( test_case ) = = 0 ) {
const UnicodeString * id = testCase - > getAttribute ( id_attr ) ;
const UnicodeString * encodings = testCase - > getAttribute ( enc_attr ) ;
const UnicodeString text = testCase - > getText ( TRUE ) ;
int32_t encodingCount ;
UnicodeString * encodingList = split ( * encodings , CH_SPACE , encodingCount ) ;
for ( int32_t e = 0 ; e < encodingCount ; e + = 1 ) {
checkEncoding ( text , encodingList [ e ] , * id ) ;
}
delete [ ] encodingList ;
}
}
delete root ;
delete parser ;
2006-07-28 22:58:29 +00:00
# endif
2006-02-06 18:03:11 +00:00
}