2006-02-06 18:03:11 +00:00
/*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2011-03-03 19:29:57 +00:00
* Copyright ( C ) 2005 - 2011 , International Business Machines
2006-02-06 18:03:11 +00:00
* Corporation and others . All Rights Reserved .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
# include "unicode/utypes.h"
# include "unicode/ucsdet.h"
# include "unicode/ucnv.h"
# include "unicode/unistr.h"
# include "unicode/putil.h"
2009-04-24 22:24:27 +00:00
# include "unicode/uniset.h"
2006-02-06 18:03:11 +00:00
# include "intltest.h"
# include "csdetest.h"
# include "xmlparser.h"
# include <stdlib.h>
# include <string.h>
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
# include <stdio.h>
# endif
2006-02-06 18:03:11 +00:00
# define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
# define NEW_ARRAY(type,count) (type *) /*uprv_*/ malloc((count) * sizeof(type))
# define DELETE_ARRAY(array) /*uprv_*/ free((void *) (array))
# define CH_SPACE 0x0020
# define CH_SLASH 0x002F
//---------------------------------------------------------------------------
//
// Test class boilerplate
//
//---------------------------------------------------------------------------
CharsetDetectionTest : : CharsetDetectionTest ( )
{
}
CharsetDetectionTest : : ~ CharsetDetectionTest ( )
{
}
void CharsetDetectionTest : : runIndexedTest ( int32_t index , UBool exec , const char * & name , char * /*par*/ )
{
if ( exec ) logln ( " TestSuite CharsetDetectionTest: " ) ;
switch ( index ) {
case 0 : name = " ConstructionTest " ;
if ( exec ) ConstructionTest ( ) ;
break ;
case 1 : name = " UTF8Test " ;
if ( exec ) UTF8Test ( ) ;
break ;
case 2 : name = " UTF16Test " ;
if ( exec ) UTF16Test ( ) ;
break ;
case 3 : name = " C1BytesTest " ;
if ( exec ) C1BytesTest ( ) ;
break ;
case 4 : name = " InputFilterTest " ;
if ( exec ) InputFilterTest ( ) ;
break ;
case 5 : name = " DetectionTest " ;
if ( exec ) DetectionTest ( ) ;
break ;
2009-08-04 21:09:17 +00:00
# if !UCONFIG_NO_LEGACY_CONVERSION
2009-03-31 15:39:00 +00:00
case 6 : name = " IBM424Test " ;
if ( exec ) IBM424Test ( ) ;
break ;
case 7 : name = " IBM420Test " ;
if ( exec ) IBM420Test ( ) ;
break ;
2009-08-04 21:09:17 +00:00
# else
case 6 :
case 7 : name = " skip " ; break ;
# endif
2009-04-24 22:24:27 +00:00
case 8 : name = " Ticket6394Test " ;
if ( exec ) Ticket6394Test ( ) ;
break ;
2006-02-06 18:03:11 +00:00
default : name = " " ;
break ; //needed to end loop
}
}
static UnicodeString * split ( const UnicodeString & src , UChar ch , int32_t & splits )
{
int32_t offset = - 1 ;
splits = 1 ;
while ( ( offset = src . indexOf ( ch , offset + 1 ) ) > = 0 ) {
splits + = 1 ;
}
UnicodeString * result = new UnicodeString [ splits ] ;
int32_t start = 0 ;
int32_t split = 0 ;
int32_t end ;
while ( ( end = src . indexOf ( ch , start ) ) > = 0 ) {
src . extractBetween ( start , end , result [ split + + ] ) ;
start = end + 1 ;
}
src . extractBetween ( start , src . length ( ) , result [ split ] ) ;
return result ;
}
static char * extractBytes ( const UnicodeString & source , const char * codepage , int32_t & length )
{
2007-08-31 18:06:46 +00:00
int32_t sLength = source . length ( ) ;
2006-07-28 22:58:29 +00:00
char * bytes = NULL ;
2006-02-06 18:03:11 +00:00
2007-08-31 18:06:46 +00:00
length = source . extract ( 0 , sLength , NULL , codepage ) ;
2007-08-24 19:04:53 +00:00
2007-08-31 18:06:46 +00:00
if ( length > 0 ) {
bytes = NEW_ARRAY ( char , length + 1 ) ;
source . extract ( 0 , sLength , bytes , codepage ) ;
2006-07-28 22:58:29 +00:00
}
2006-02-06 18:03:11 +00:00
return bytes ;
}
static void freeBytes ( char * bytes )
{
DELETE_ARRAY ( bytes ) ;
}
void CharsetDetectionTest : : checkEncoding ( const UnicodeString & testString , const UnicodeString & encoding , const UnicodeString & id )
{
int32_t splits = 0 ;
int32_t testLength = testString . length ( ) ;
UnicodeString * eSplit = split ( encoding , CH_SLASH , splits ) ;
UErrorCode status = U_ZERO_ERROR ;
int32_t cpLength = eSplit [ 0 ] . length ( ) ;
char codepage [ 64 ] ;
u_UCharsToChars ( eSplit [ 0 ] . getBuffer ( ) , codepage , cpLength ) ;
codepage [ cpLength ] = ' \0 ' ;
2009-11-20 06:28:25 +00:00
LocalUCharsetDetectorPointer csd ( ucsdet_open ( & status ) ) ;
2006-02-06 18:03:11 +00:00
int32_t byteLength = 0 ;
char * bytes = extractBytes ( testString , codepage , byteLength ) ;
2006-07-28 22:58:29 +00:00
if ( bytes = = NULL ) {
# if !UCONFIG_NO_LEGACY_CONVERSION
2011-03-03 19:29:57 +00:00
dataerrln ( " Can't open a " + encoding + " converter for " + id ) ;
2006-07-28 22:58:29 +00:00
# endif
return ;
}
2009-11-20 06:28:25 +00:00
ucsdet_setText ( csd . getAlias ( ) , bytes , byteLength , & status ) ;
2006-02-06 18:03:11 +00:00
2006-02-13 20:47:36 +00:00
int32_t matchCount = 0 ;
2009-11-20 06:28:25 +00:00
const UCharsetMatch * * matches = ucsdet_detectAll ( csd . getAlias ( ) , & matchCount , & status ) ;
2006-02-07 21:59:16 +00:00
2006-02-13 20:47:36 +00:00
UnicodeString name ( ucsdet_getName ( matches [ 0 ] , & status ) ) ;
UnicodeString lang ( ucsdet_getLanguage ( matches [ 0 ] , & status ) ) ;
2006-02-06 18:03:11 +00:00
UChar * decoded = NULL ;
int32_t dLength = 0 ;
2006-02-13 20:47:36 +00:00
if ( matchCount = = 0 ) {
2006-02-07 21:59:16 +00:00
errln ( " Encoding detection failure for " + id + " : expected " + eSplit [ 0 ] + " , got no matches " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
if ( name . compare ( eSplit [ 0 ] ) ! = 0 ) {
errln ( " Encoding detection failure for " + id + " : expected " + eSplit [ 0 ] + " , got " + name ) ;
2006-02-07 19:12:43 +00:00
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
2006-02-07 19:12:43 +00:00
for ( int32_t m = 0 ; m < matchCount ; m + = 1 ) {
const char * name = ucsdet_getName ( matches [ m ] , & status ) ;
const char * lang = ucsdet_getLanguage ( matches [ m ] , & status ) ;
int32_t confidence = ucsdet_getConfidence ( matches [ m ] , & status ) ;
2006-02-09 21:13:01 +00:00
printf ( " %s (%s) %d \n " , name , lang , confidence ) ;
2006-02-07 19:12:43 +00:00
}
2006-02-09 21:13:01 +00:00
# endif
2006-02-06 18:03:11 +00:00
goto bail ;
}
if ( splits > 1 & & lang . compare ( eSplit [ 1 ] ) ! = 0 ) {
errln ( " Language detection failure for " + id + " , " + eSplit [ 0 ] + " : expected " + eSplit [ 1 ] + " , got " + lang ) ;
goto bail ;
}
decoded = NEW_ARRAY ( UChar , testLength ) ;
2006-02-13 20:47:36 +00:00
dLength = ucsdet_getUChars ( matches [ 0 ] , decoded , testLength , & status ) ;
2006-02-06 18:03:11 +00:00
if ( testString . compare ( decoded , dLength ) ! = 0 ) {
errln ( " Round-trip error for " + id + " , " + eSplit [ 0 ] + " : getUChars() didn't yeild the original string. " ) ;
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
for ( int32_t i = 0 ; i < testLength ; i + = 1 ) {
if ( testString [ i ] ! = decoded [ i ] ) {
printf ( " Strings differ at byte %d \n " , i ) ;
break ;
}
}
# endif
2006-02-06 18:03:11 +00:00
}
DELETE_ARRAY ( decoded ) ;
bail :
freeBytes ( bytes ) ;
delete [ ] eSplit ;
}
const char * CharsetDetectionTest : : getPath ( char buffer [ 2048 ] , const char * filename ) {
UErrorCode status = U_ZERO_ERROR ;
const char * testDataDirectory = IntlTest : : getSourceTestData ( status ) ;
if ( U_FAILURE ( status ) ) {
errln ( " ERROR: getPath() failed - %s " , u_errorName ( status ) ) ;
return NULL ;
}
strcpy ( buffer , testDataDirectory ) ;
strcat ( buffer , filename ) ;
return buffer ;
}
void CharsetDetectionTest : : ConstructionTest ( )
{
2009-11-20 06:28:25 +00:00
IcuTestErrorCode status ( * this , " ConstructionTest " ) ;
LocalUCharsetDetectorPointer csd ( ucsdet_open ( status ) ) ;
LocalUEnumerationPointer e ( ucsdet_getAllDetectableCharsets ( csd . getAlias ( ) , status ) ) ;
int32_t count = uenum_count ( e . getAlias ( ) , status ) ;
2006-02-06 18:03:11 +00:00
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
printf ( " There are %d recognizers. \n " , count ) ;
# endif
2006-02-06 18:03:11 +00:00
for ( int32_t i = 0 ; i < count ; i + = 1 ) {
int32_t length ;
2009-11-20 06:28:25 +00:00
const char * name = uenum_next ( e . getAlias ( ) , & length , status ) ;
2006-02-06 18:03:11 +00:00
if ( name = = NULL | | length < = 0 ) {
2006-02-09 21:13:01 +00:00
errln ( " ucsdet_getAllDetectableCharsets() returned a null or empty name! " ) ;
2006-02-06 18:03:11 +00:00
}
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
printf ( " %s \n " , name ) ;
# endif
2006-02-06 18:03:11 +00:00
}
}
void CharsetDetectionTest : : UTF8Test ( )
{
UErrorCode status = U_ZERO_ERROR ;
UnicodeString ss = " This is a string with some non-ascii characters that will "
" be converted to UTF-8, then shoved through the detection process. "
" \\ u0391 \\ u0392 \\ u0393 \\ u0394 \\ u0395 "
" Sure would be nice if our source could contain Unicode directly! " ;
UnicodeString s = ss . unescape ( ) ;
int32_t byteLength = 0 , sLength = s . length ( ) ;
char * bytes = extractBytes ( s , " UTF-8 " , byteLength ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
UChar * detected = NEW_ARRAY ( UChar , sLength ) ;
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Detection failure for UTF-8: got no matches. " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
ucsdet_getUChars ( match , detected , sLength , & status ) ;
if ( s . compare ( detected , sLength ) ! = 0 ) {
errln ( " Round-trip test failed! " ) ;
}
2006-02-07 21:59:16 +00:00
ucsdet_setDeclaredEncoding ( csd , " UTF-8 " , 5 , & status ) ; /* for coverage */
bail :
2006-02-06 18:03:11 +00:00
DELETE_ARRAY ( detected ) ;
freeBytes ( bytes ) ;
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : UTF16Test ( )
{
UErrorCode status = U_ZERO_ERROR ;
/* Notice the BOM on the start of this string */
UChar chars [ ] = {
0xFEFF , 0x0623 , 0x0648 , 0x0631 , 0x0648 , 0x0628 , 0x0627 , 0x002C ,
0x0020 , 0x0628 , 0x0631 , 0x0645 , 0x062c , 0x064a , 0x0627 , 0x062a ,
0x0020 , 0x0627 , 0x0644 , 0x062d , 0x0627 , 0x0633 , 0x0648 , 0x0628 ,
0x0020 , 0x002b , 0x0020 , 0x0627 , 0x0646 , 0x062a , 0x0631 , 0x0646 ,
0x064a , 0x062a , 0x0000 } ;
UnicodeString s ( chars ) ;
int32_t beLength = 0 , leLength = 0 ;
char * beBytes = extractBytes ( s , " UTF-16BE " , beLength ) ;
char * leBytes = extractBytes ( s , " UTF-16LE " , leLength ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
const char * name ;
2006-02-07 21:59:16 +00:00
int32_t conf ;
2006-02-06 18:03:11 +00:00
ucsdet_setText ( csd , beBytes , beLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Encoding detection failure for UTF-16BE: got no matches. " ) ;
goto try_le ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
2006-02-07 21:59:16 +00:00
conf = ucsdet_getConfidence ( match , & status ) ;
2006-02-06 18:03:11 +00:00
if ( strcmp ( name , " UTF-16BE " ) ! = 0 ) {
errln ( " Encoding detection failure for UTF-16BE: got %s " , name ) ;
2006-08-11 18:55:15 +00:00
goto try_le ; // no point in looking at confidence if we got the wrong character set.
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
if ( conf ! = 100 ) {
errln ( " Did not get 100%% confidence for UTF-16BE: got %d " , conf ) ;
}
try_le :
2006-02-06 18:03:11 +00:00
ucsdet_setText ( csd , leBytes , leLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Encoding detection failure for UTF-16LE: got no matches. " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
2006-02-07 21:59:16 +00:00
conf = ucsdet_getConfidence ( match , & status ) ;
2006-02-06 18:03:11 +00:00
if ( strcmp ( name , " UTF-16LE " ) ! = 0 ) {
errln ( " Enconding detection failure for UTF-16LE: got %s " , name ) ;
2006-08-11 18:55:15 +00:00
goto bail ; // no point in looking at confidence if we got the wrong character set.
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
if ( conf ! = 100 ) {
errln ( " Did not get 100%% confidence for UTF-16LE: got %d " , conf ) ;
}
bail :
2006-02-06 18:03:11 +00:00
freeBytes ( leBytes ) ;
freeBytes ( beBytes ) ;
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : InputFilterTest ( )
{
UErrorCode status = U_ZERO_ERROR ;
UnicodeString ss = " <a> <lot> <of> <English> <inside> <the> <markup> Un tr \\ u00E8s petit peu de Fran \\ u00E7ais. <to> <confuse> <the> <detector> " ;
UnicodeString s = ss . unescape ( ) ;
int32_t byteLength = 0 ;
char * bytes = extractBytes ( s , " ISO-8859-1 " , byteLength ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
2006-02-09 21:13:01 +00:00
const char * lang , * name ;
2006-02-06 18:03:11 +00:00
ucsdet_enableInputFilter ( csd , TRUE ) ;
if ( ! ucsdet_isInputFilterEnabled ( csd ) ) {
errln ( " ucsdet_enableInputFilter(csd, TRUE) did not enable input filter! " ) ;
}
2006-02-09 21:13:01 +00:00
2006-02-06 18:03:11 +00:00
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Turning on the input filter resulted in no matches. " ) ;
goto turn_off ;
}
2006-02-09 21:13:01 +00:00
name = ucsdet_getName ( match , & status ) ;
if ( name = = NULL | | strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
2006-02-10 23:49:09 +00:00
errln ( " Turning on the input filter resulted in %s rather than ISO-8859-1. " , name ) ;
2006-02-09 21:13:01 +00:00
} else {
lang = ucsdet_getLanguage ( match , & status ) ;
2006-02-06 18:03:11 +00:00
2006-02-09 21:13:01 +00:00
if ( lang = = NULL | | strcmp ( lang , " fr " ) ! = 0 ) {
errln ( " Input filter did not strip markup! " ) ;
}
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
turn_off :
2006-02-06 18:03:11 +00:00
ucsdet_enableInputFilter ( csd , FALSE ) ;
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Turning off the input filter resulted in no matches. " ) ;
goto bail ;
}
2006-02-09 21:13:01 +00:00
name = ucsdet_getName ( match , & status ) ;
2006-02-06 18:03:11 +00:00
2006-02-09 21:13:01 +00:00
if ( name = = NULL | | strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
2006-02-10 23:49:09 +00:00
errln ( " Turning off the input filter resulted in %s rather than ISO-8859-1. " , name ) ;
2006-02-09 21:13:01 +00:00
} else {
lang = ucsdet_getLanguage ( match , & status ) ;
if ( lang = = NULL | | strcmp ( lang , " en " ) ! = 0 ) {
errln ( " Unfiltered input did not detect as English! " ) ;
}
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
bail :
2006-02-06 18:03:11 +00:00
freeBytes ( bytes ) ;
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : C1BytesTest ( )
{
2006-07-28 22:58:29 +00:00
# if !UCONFIG_NO_LEGACY_CONVERSION
2006-02-06 18:03:11 +00:00
UErrorCode status = U_ZERO_ERROR ;
UnicodeString sISO = " This is a small sample of some English text. Just enough to be sure that it detects correctly. " ;
2008-06-17 00:55:35 +00:00
UnicodeString ssWindows ( " This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\ u201CC1 \\ u201D bytes. " , - 1 , US_INV ) ;
2006-02-06 18:03:11 +00:00
UnicodeString sWindows = ssWindows . unescape ( ) ;
int32_t lISO = 0 , lWindows = 0 ;
char * bISO = extractBytes ( sISO , " ISO-8859-1 " , lISO ) ;
char * bWindows = extractBytes ( sWindows , " windows-1252 " , lWindows ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
const char * name ;
ucsdet_setText ( csd , bWindows , lWindows , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
2009-06-12 19:34:21 +00:00
errcheckln ( status , " English test with C1 bytes got no matches. - %s " , u_errorName ( status ) ) ;
2006-02-07 21:59:16 +00:00
goto bail ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " windows-1252 " ) ! = 0 ) {
2006-02-07 19:12:43 +00:00
errln ( " English text with C1 bytes does not detect as windows-1252, but as %s " , name ) ;
2006-02-06 18:03:11 +00:00
}
ucsdet_setText ( csd , bISO , lISO , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " English text without C1 bytes got no matches. " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
2006-02-07 19:12:43 +00:00
errln ( " English text without C1 bytes does not detect as ISO-8859-1, but as %s " , name ) ;
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
bail :
2006-02-06 18:03:11 +00:00
freeBytes ( bWindows ) ;
freeBytes ( bISO ) ;
ucsdet_close ( csd ) ;
2006-07-28 22:58:29 +00:00
# endif
2006-02-06 18:03:11 +00:00
}
void CharsetDetectionTest : : DetectionTest ( )
{
2006-08-02 22:29:50 +00:00
# if !UCONFIG_NO_REGULAR_EXPRESSIONS
2006-02-06 18:03:11 +00:00
UErrorCode status = U_ZERO_ERROR ;
char path [ 2048 ] ;
const char * testFilePath = getPath ( path , " csdetest.xml " ) ;
if ( testFilePath = = NULL ) {
return ; /* Couldn't get path: error message already output. */
}
UXMLParser * parser = UXMLParser : : createParser ( status ) ;
2009-06-12 19:34:21 +00:00
if ( U_FAILURE ( status ) ) {
dataerrln ( " FAIL: UXMLParser::createParser (%s) " , u_errorName ( status ) ) ;
return ;
}
2006-02-06 18:03:11 +00:00
UXMLElement * root = parser - > parseFile ( testFilePath , status ) ;
2006-07-21 22:01:55 +00:00
if ( ! assertSuccess ( " parseFile " , status ) ) return ;
2006-02-06 18:03:11 +00:00
UnicodeString test_case = UNICODE_STRING_SIMPLE ( " test-case " ) ;
UnicodeString id_attr = UNICODE_STRING_SIMPLE ( " id " ) ;
UnicodeString enc_attr = UNICODE_STRING_SIMPLE ( " encodings " ) ;
const UXMLElement * testCase ;
int32_t tc = 0 ;
while ( ( testCase = root - > nextChildElement ( tc ) ) ! = NULL ) {
if ( testCase - > getTagName ( ) . compare ( test_case ) = = 0 ) {
const UnicodeString * id = testCase - > getAttribute ( id_attr ) ;
const UnicodeString * encodings = testCase - > getAttribute ( enc_attr ) ;
const UnicodeString text = testCase - > getText ( TRUE ) ;
int32_t encodingCount ;
UnicodeString * encodingList = split ( * encodings , CH_SPACE , encodingCount ) ;
for ( int32_t e = 0 ; e < encodingCount ; e + = 1 ) {
checkEncoding ( text , encodingList [ e ] , * id ) ;
}
delete [ ] encodingList ;
}
}
delete root ;
delete parser ;
2006-07-28 22:58:29 +00:00
# endif
2006-02-06 18:03:11 +00:00
}
2009-03-31 15:39:00 +00:00
void CharsetDetectionTest : : IBM424Test ( )
{
UErrorCode status = U_ZERO_ERROR ;
static const UChar chars [ ] = {
0x05D4 , 0x05E4 , 0x05E8 , 0x05E7 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05D4 , 0x05E6 , 0x05D1 , 0x05D0 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E8 ,
0x05D0 , 0x05E9 , 0x05D9 , 0x002C , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x05D0 , 0x05DC , 0x05D5 , 0x05E3 , 0x0020 , 0x05D0 , 0x05D1 , 0x05D9 ,
0x05D7 , 0x05D9 , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC , 0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x002C , 0x0020 , 0x05D4 , 0x05D5 , 0x05E8 ,
0x05D4 , 0x0020 , 0x05E2 , 0x05DC , 0x0020 , 0x05E4 , 0x05EA , 0x05D9 , 0x05D7 , 0x05EA , 0x0020 , 0x05D7 , 0x05E7 , 0x05D9 , 0x05E8 , 0x05EA ,
0x0020 , 0x05DE , 0x05E6 , 0x0022 , 0x05D7 , 0x0020 , 0x05D1 , 0x05E2 , 0x05E7 , 0x05D1 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D3 , 0x05D5 ,
0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC , 0x05D9 , 0x0020 , 0x05E6 , 0x05D4 , 0x0022 , 0x05DC , 0x0020 , 0x05DE ,
0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 , 0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 ,
0x0020 , 0x05D1 , 0x002B , 0x0020 , 0x05E8 , 0x05E6 , 0x05D5 , 0x05E2 , 0x05EA , 0x0020 , 0x05E2 , 0x05D6 , 0x05D4 , 0x002E , 0x0020 , 0x05DC ,
0x05D3 , 0x05D1 , 0x05E8 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E4 , 0x05E6 , 0x0022 , 0x05E8 , 0x002C , 0x0020 , 0x05DE , 0x05D4 , 0x05E2 , 0x05D3 ,
0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D5 , 0x05DC , 0x05D4 , 0x0020 , 0x05EA , 0x05DE , 0x05D5 , 0x05E0 , 0x05D4 , 0x0020 ,
0x05E9 , 0x05DC , 0x0020 , 0x0022 , 0x05D4 , 0x05EA , 0x05E0 , 0x05D4 , 0x05D2 , 0x05D5 , 0x05EA , 0x0020 , 0x05E4 , 0x05E1 , 0x05D5 , 0x05DC ,
0x05D4 , 0x0020 , 0x05DC , 0x05DB , 0x05D0 , 0x05D5 , 0x05E8 , 0x05D4 , 0x0020 , 0x05E9 , 0x05DC , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC ,
0x05D9 , 0x05DD , 0x0020 , 0x05D1 , 0x05DE , 0x05D4 , 0x05DC , 0x05DA , 0x0020 , 0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 ,
0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 , 0x0022 , 0x002E , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC ,
0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05E7 , 0x05D9 , 0x05D1 , 0x05DC , 0x0020 , 0x05D0 , 0x05EA , 0x0020 , 0x05D4 , 0x05D7 , 0x05DC ,
0x05D8 , 0x05EA , 0x05D5 , 0x0020 , 0x05DC , 0x05D0 , 0x05D7 , 0x05E8 , 0x0020 , 0x05E9 , 0x05E2 , 0x05D9 , 0x05D9 , 0x05DF , 0x0020 , 0x05D1 ,
0x05EA , 0x05DE , 0x05DC , 0x05D9 , 0x05DC , 0x0020 , 0x05D4 , 0x05E2 , 0x05D3 , 0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0000
} ;
2009-04-14 20:12:54 +00:00
static const UChar chars_reverse [ ] = {
0x05EA , 0x05D5 , 0x05D9 , 0x05D5 , 0x05D3 , 0x05E2 , 0x05D4 , 0x0020 , 0x05DC , 0x05D9 , 0x05DC , 0x05DE , 0x05EA ,
0x05D1 , 0x0020 , 0x05DF , 0x05D9 , 0x05D9 , 0x05E2 , 0x05E9 , 0x0020 , 0x05E8 , 0x05D7 , 0x05D0 , 0x05DC , 0x0020 , 0x05D5 , 0x05EA , 0x05D8 ,
0x05DC , 0x05D7 , 0x05D4 , 0x0020 , 0x05EA , 0x05D0 , 0x0020 , 0x05DC , 0x05D1 , 0x05D9 , 0x05E7 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 ,
0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x002E , 0x0022 , 0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 ,
0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE , 0x0020 , 0x05DA , 0x05DC , 0x05D4 , 0x05DE , 0x05D1 , 0x0020 , 0x05DD , 0x05D9 ,
0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05DC , 0x05E9 , 0x0020 , 0x05D4 , 0x05E8 , 0x05D5 , 0x05D0 , 0x05DB , 0x05DC , 0x0020 , 0x05D4 ,
0x05DC , 0x05D5 , 0x05E1 , 0x05E4 , 0x0020 , 0x05EA , 0x05D5 , 0x05D2 , 0x05D4 , 0x05E0 , 0x05EA , 0x05D4 , 0x0022 , 0x0020 , 0x05DC , 0x05E9 ,
0x0020 , 0x05D4 , 0x05E0 , 0x05D5 , 0x05DE , 0x05EA , 0x0020 , 0x05D4 , 0x05DC , 0x05D5 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 , 0x05D5 ,
0x05D3 , 0x05E2 , 0x05D4 , 0x05DE , 0x0020 , 0x002C , 0x05E8 , 0x0022 , 0x05E6 , 0x05E4 , 0x05D4 , 0x0020 , 0x05D9 , 0x05E8 , 0x05D1 , 0x05D3 ,
0x05DC , 0x0020 , 0x002E , 0x05D4 , 0x05D6 , 0x05E2 , 0x0020 , 0x05EA , 0x05E2 , 0x05D5 , 0x05E6 , 0x05E8 , 0x0020 , 0x002B , 0x05D1 , 0x0020 ,
0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 , 0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE ,
0x05DE , 0x0020 , 0x05DC , 0x0022 , 0x05D4 , 0x05E6 , 0x0020 , 0x05D9 , 0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 ,
0x05D5 , 0x05D3 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D1 , 0x05E7 , 0x05E2 , 0x05D1 , 0x0020 , 0x05D7 , 0x0022 , 0x05E6 , 0x05DE , 0x0020 ,
0x05EA , 0x05E8 , 0x05D9 , 0x05E7 , 0x05D7 , 0x0020 , 0x05EA , 0x05D7 , 0x05D9 , 0x05EA , 0x05E4 , 0x0020 , 0x05DC , 0x05E2 , 0x0020 , 0x05D4 ,
0x05E8 , 0x05D5 , 0x05D4 , 0x0020 , 0x002C , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 , 0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x05D9 , 0x05D7 ,
0x05D9 , 0x05D1 , 0x05D0 , 0x0020 , 0x05E3 , 0x05D5 , 0x05DC , 0x05D0 , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x002C , 0x05D9 , 0x05E9 , 0x05D0 ,
0x05E8 , 0x05D4 , 0x0020 , 0x05D9 , 0x05D0 , 0x05D1 , 0x05E6 , 0x05D4 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05E7 , 0x05E8 , 0x05E4 , 0x05D4 ,
0x0000
} ;
int32_t bLength = 0 , brLength = 0 ;
UnicodeString s1 ( chars ) ;
UnicodeString s2 ( chars_reverse ) ;
char * bytes = extractBytes ( s1 , " IBM424 " , bLength ) ;
char * bytes_r = extractBytes ( s2 , " IBM424 " , brLength ) ;
2009-03-31 15:39:00 +00:00
UCharsetDetector * csd = ucsdet_open ( & status ) ;
2009-06-12 19:34:21 +00:00
if ( U_FAILURE ( status ) ) {
errln ( " Error opening charset detector. - %s " , u_errorName ( status ) ) ;
}
2009-03-31 15:39:00 +00:00
const UCharsetMatch * match ;
const char * name ;
ucsdet_setText ( csd , bytes , bLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
2009-06-12 19:34:21 +00:00
errcheckln ( status , " Encoding detection failure for IBM424_rtl: got no matches. - %s " , u_errorName ( status ) ) ;
2009-04-14 20:12:54 +00:00
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " IBM424_rtl " ) ! = 0 ) {
2009-06-12 19:34:21 +00:00
errln ( " Encoding detection failure for IBM424_rtl: got %s " , name ) ;
2009-04-14 20:12:54 +00:00
}
ucsdet_setText ( csd , bytes_r , brLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
2009-06-12 19:34:21 +00:00
errln ( " Encoding detection failure for IBM424_ltr: got no matches. " ) ;
2009-03-31 15:39:00 +00:00
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
2009-04-14 20:12:54 +00:00
if ( strcmp ( name , " IBM424_ltr " ) ! = 0 ) {
2009-06-12 19:34:21 +00:00
errln ( " Encoding detection failure for IBM424_ltr: got %s " , name ) ;
2009-03-31 15:39:00 +00:00
}
bail :
freeBytes ( bytes ) ;
2009-04-14 20:12:54 +00:00
freeBytes ( bytes_r ) ;
2009-03-31 15:39:00 +00:00
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : IBM420Test ( )
{
UErrorCode status = U_ZERO_ERROR ;
static const UChar chars [ ] = {
0x0648 , 0x064F , 0x0636 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x0646 , 0x064F , 0x0641 , 0x0630 , 0x062A , 0x0020 , 0x0628 , 0x0631 , 0x0627 ,
0x0645 , 0x062C , 0x0020 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 , 0x0639 , 0x062F , 0x064A , 0x062F , 0x0629 , 0x0020 , 0x0641 ,
0x064A , 0x0020 , 0x0645 , 0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0627 , 0x0644 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 ,
0x0627 , 0x0644 , 0x0648 , 0x0637 , 0x0646 , 0x064A , 0x002C , 0x0020 , 0x0645 , 0x0639 , 0x0020 , 0x0645 , 0x0644 , 0x0627 , 0x0626 , 0x0645 ,
0x062A , 0x0647 , 0x0627 , 0x0020 , 0x062F , 0x0627 , 0x0626 , 0x0645 , 0x0627 , 0x064B , 0x0020 , 0x0644 , 0x0644 , 0x0627 , 0x062D , 0x062A ,
0x064A , 0x0627 , 0x062C , 0x0627 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062A , 0x063A , 0x064A , 0x0631 , 0x0629 , 0x0020 , 0x0644 ,
0x0644 , 0x0645 , 0x062C , 0x062A , 0x0645 , 0x0639 , 0x0020 , 0x0648 , 0x0644 , 0x0644 , 0x062F , 0x0648 , 0x0644 , 0x0629 , 0x002E , 0x0020 ,
0x062A , 0x0648 , 0x0633 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x062A , 0x0637 , 0x0648 , 0x0631 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 ,
0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0628 , 0x0647 , 0x062F , 0x0641 , 0x0020 , 0x0636 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0634 ,
0x0628 , 0x0643 , 0x0629 , 0x0020 , 0x0623 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0644 , 0x0633 , 0x0643 , 0x0627 , 0x0646 , 0x0020 , 0x062F ,
0x0648 , 0x0644 , 0x0629 , 0x0020 , 0x0627 , 0x0633 , 0x0631 , 0x0627 , 0x0626 , 0x064A , 0x0644 , 0x0020 , 0x0628 , 0x0648 , 0x062C , 0x0647 ,
0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062E , 0x0627 , 0x0637 , 0x0631 , 0x0020 , 0x0627 , 0x0644 , 0x0627 , 0x0642 , 0x062A , 0x0635 , 0x0627 ,
0x062F , 0x064A , 0x0629 , 0x0020 , 0x0648 , 0x0627 , 0x0644 , 0x0627 , 0x062C , 0x062A , 0x0645 , 0x0627 , 0x0639 , 0x064A , 0x0629 , 0x002E ,
0x0000
} ;
2009-04-14 20:12:54 +00:00
static const UChar chars_reverse [ ] = {
0x002E , 0x0629 , 0x064A , 0x0639 , 0x0627 , 0x0645 , 0x062A , 0x062C , 0x0627 , 0x0644 , 0x0627 , 0x0648 , 0x0020 , 0x0629 , 0x064A , 0x062F ,
0x0627 , 0x0635 , 0x062A , 0x0642 , 0x0627 , 0x0644 , 0x0627 , 0x0020 , 0x0631 , 0x0637 , 0x0627 , 0x062E , 0x0645 , 0x0644 , 0x0627 , 0x0020 ,
0x0647 , 0x062C , 0x0648 , 0x0628 , 0x0020 , 0x0644 , 0x064A , 0x0626 , 0x0627 , 0x0631 , 0x0633 , 0x0627 , 0x0020 , 0x0629 , 0x0644 , 0x0648 ,
0x062F , 0x0020 , 0x0646 , 0x0627 , 0x0643 , 0x0633 , 0x0644 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0623 , 0x0020 , 0x0629 , 0x0643 , 0x0628 ,
0x0634 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0636 , 0x0020 , 0x0641 , 0x062F , 0x0647 , 0x0628 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 ,
0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0631 , 0x0648 , 0x0637 , 0x062A , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0633 , 0x0648 , 0x062A ,
0x0020 , 0x002E , 0x0629 , 0x0644 , 0x0648 , 0x062F , 0x0644 , 0x0644 , 0x0648 , 0x0020 , 0x0639 , 0x0645 , 0x062A , 0x062C , 0x0645 , 0x0644 ,
0x0644 , 0x0020 , 0x0629 , 0x0631 , 0x064A , 0x063A , 0x062A , 0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0627 , 0x062C , 0x0627 , 0x064A ,
0x062A , 0x062D , 0x0627 , 0x0644 , 0x0644 , 0x0020 , 0x064B , 0x0627 , 0x0645 , 0x0626 , 0x0627 , 0x062F , 0x0020 , 0x0627 , 0x0647 , 0x062A ,
0x0645 , 0x0626 , 0x0627 , 0x0644 , 0x0645 , 0x0020 , 0x0639 , 0x0645 , 0x0020 , 0x002C , 0x064A , 0x0646 , 0x0637 , 0x0648 , 0x0644 , 0x0627 ,
0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0644 , 0x0627 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 , 0x0645 , 0x0020 , 0x064A ,
0x0641 , 0x0020 , 0x0629 , 0x062F , 0x064A , 0x062F , 0x0639 , 0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0020 , 0x062C , 0x0645 ,
0x0627 , 0x0631 , 0x0628 , 0x0020 , 0x062A , 0x0630 , 0x0641 , 0x064F , 0x0646 , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0636 , 0x064F , 0x0648 ,
0x0000 ,
} ;
int32_t bLength = 0 , brLength = 0 ;
UnicodeString s1 ( chars ) ;
UnicodeString s2 ( chars_reverse ) ;
char * bytes = extractBytes ( s1 , " IBM420 " , bLength ) ;
char * bytes_r = extractBytes ( s2 , " IBM420 " , brLength ) ;
2009-03-31 15:39:00 +00:00
UCharsetDetector * csd = ucsdet_open ( & status ) ;
2009-06-12 19:34:21 +00:00
if ( U_FAILURE ( status ) ) {
errln ( " Error opening charset detector. - %s " , u_errorName ( status ) ) ;
}
2009-03-31 15:39:00 +00:00
const UCharsetMatch * match ;
const char * name ;
ucsdet_setText ( csd , bytes , bLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-06 18:03:11 +00:00
2009-03-31 15:39:00 +00:00
if ( match = = NULL ) {
2009-06-12 19:34:21 +00:00
errcheckln ( status , " Encoding detection failure for IBM420_rtl: got no matches. - %s " , u_errorName ( status ) ) ;
2009-04-14 20:12:54 +00:00
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " IBM420_rtl " ) ! = 0 ) {
errln ( " Encoding detection failure for IBM420_rtl: got %s \n " , name ) ;
}
ucsdet_setText ( csd , bytes_r , brLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
errln ( " Encoding detection failure for IBM420_ltr: got no matches. \n " ) ;
2009-03-31 15:39:00 +00:00
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
2009-04-14 20:12:54 +00:00
if ( strcmp ( name , " IBM420_ltr " ) ! = 0 ) {
errln ( " Encoding detection failure for IBM420_ltr: got %s \n " , name ) ;
2009-03-31 15:39:00 +00:00
}
bail :
freeBytes ( bytes ) ;
2009-04-14 20:12:54 +00:00
freeBytes ( bytes_r ) ;
2009-03-31 15:39:00 +00:00
ucsdet_close ( csd ) ;
}
2009-04-24 22:24:27 +00:00
void CharsetDetectionTest : : Ticket6394Test ( ) {
# if !UCONFIG_NO_CONVERSION
const char charText [ ] = " Here is some random English text that should be detected as ISO-8859-1. "
" Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
" encodings more than once. The hop through UnicodeString is for platforms "
" where this char * string is be EBCDIC and needs conversion to Latin1. " ;
char latin1Text [ sizeof ( charText ) ] ;
UnicodeString ( charText ) . extract ( 0 , sizeof ( charText ) - 2 , latin1Text , sizeof ( latin1Text ) , " ISO-8859-1 " ) ;
UErrorCode status = U_ZERO_ERROR ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
ucsdet_setText ( csd , latin1Text , - 1 , & status ) ;
if ( U_FAILURE ( status ) ) {
errln ( " Fail at file %s, line %d. status = %s " , __FILE__ , __LINE__ , u_errorName ( status ) ) ;
return ;
}
int32_t matchCount = 0 ;
const UCharsetMatch * * matches = ucsdet_detectAll ( csd , & matchCount , & status ) ;
if ( U_FAILURE ( status ) ) {
errln ( " Fail at file %s, line %d. status = %s " , __FILE__ , __LINE__ , u_errorName ( status ) ) ;
return ;
}
UnicodeSet setOfCharsetNames ; // UnicodSets can hold strings.
int32_t i ;
for ( i = 0 ; i < matchCount ; i + + ) {
UnicodeString charSetName ( ucsdet_getName ( matches [ i ] , & status ) ) ;
if ( U_FAILURE ( status ) ) {
errln ( " Fail at file %s, line %d. status = %s; i=%d " , __FILE__ , __LINE__ , u_errorName ( status ) , i ) ;
status = U_ZERO_ERROR ;
}
if ( setOfCharsetNames . contains ( charSetName ) ) {
errln ( " Fail at file %s, line %d " , __FILE__ , __LINE__ ) ;
errln ( UnicodeString ( " Duplicate charset name = " ) + charSetName ) ;
}
setOfCharsetNames . add ( charSetName ) ;
}
ucsdet_close ( csd ) ;
# endif
}