2006-02-10 23:49:09 +00:00
/*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2008-02-01 07:08:13 +00:00
* Copyright ( c ) 2005 - 2008 , International Business Machines Corporation and *
2006-02-10 23:49:09 +00:00
* others . All Rights Reserved . *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
# include "unicode/utypes.h"
# include "unicode/ucsdet.h"
# include "unicode/ucnv.h"
# include "unicode/ustring.h"
# include "cintltst.h"
# include <stdlib.h>
# include <string.h>
2008-02-01 07:08:13 +00:00
/* Number of elements in a true array; must not be applied to a pointer. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/*
 * Allocates uninitialized storage for `count` objects of `type` with malloc().
 * The whole expansion is parenthesized so that the result can be used inside
 * a larger expression (e.g. followed by a subscript). May yield NULL.
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))

/* Releases storage obtained from NEW_ARRAY. */
#define DELETE_ARRAY(array) free(array)
2006-02-10 23:49:09 +00:00
/* Forward declarations of the test cases that addUCsdetTest() registers. */
static void TestConstruction ( void ) ;
static void TestUTF8 ( void ) ;
static void TestUTF16 ( void ) ;
static void TestC1Bytes ( void ) ;
static void TestInputFilter ( void ) ;
2006-03-24 23:29:08 +00:00
static void TestChaining ( void ) ;
2008-02-01 07:08:13 +00:00
static void TestBufferOverflow ( void ) ;
2006-02-10 23:49:09 +00:00
void addUCsdetTest(TestNode **root);

/*
 * Registers every charset-detection test case of this file with the test
 * tree rooted at `root`. Note that TestChaining is registered under the
 * path name "TestErrorChaining".
 */
void addUCsdetTest(TestNode **root)
{
    addTest(root, TestConstruction,   " ucsdetst/TestConstruction ");
    addTest(root, TestUTF8,           " ucsdetst/TestUTF8 ");
    addTest(root, TestUTF16,          " ucsdetst/TestUTF16 ");
    addTest(root, TestC1Bytes,        " ucsdetst/TestC1Bytes ");
    addTest(root, TestInputFilter,    " ucsdetst/TestInputFilter ");
    addTest(root, TestChaining,       " ucsdetst/TestErrorChaining ");
    addTest(root, TestBufferOverflow, " ucsdetst/TestBufferOverflow ");
}
/*
 * Counts how many bytes `cnv` produces for the `length` UChars at `src`.
 *
 * The text is converted into a fixed 1024-byte scratch buffer over and over;
 * each pass adds the bytes it produced to the total. The loop ends as soon as
 * a pass finishes with anything other than U_BUFFER_OVERFLOW_ERROR, so any
 * other error simply stops the count at whatever was produced so far.
 *
 * Returns the accumulated byte count.
 */
static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
{
    char scratch[1024];
    char *const scratchEnd = scratch + sizeof(scratch);
    const UChar *const srcEnd = src + length;
    int32_t total = 0;

    for (;;) {
        UErrorCode err = U_ZERO_ERROR;
        char *out = scratch;

        /* `src` is advanced by the converter, so each pass resumes where the last one stopped. */
        ucnv_fromUnicode(cnv, &out, scratchEnd, &src, srcEnd, 0, TRUE, &err);
        total += (int32_t)(out - scratch);

        if (err != U_BUFFER_OVERFLOW_ERROR) {
            break;
        }
    }

    return total;
}
/*
 * Converts the `length` UChars at `src` to the charset named by `codepage`.
 *
 * Returns a buffer allocated with NEW_ARRAY that the caller must release with
 * freeBytes(); the buffer holds `*byteLength` converted bytes followed by one
 * NUL byte. If the converter cannot be opened the buffer is empty and
 * `*byteLength` is 0. If allocation fails, NULL is returned and `*byteLength`
 * is 0.
 */
static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *cnv = ucnv_open(codepage, &status);
    const UChar *srcLimit = src + length;
    int32_t byteCount = 0;
    char *bytes;
    char *dest;

    /* Only measure the output when there is a usable converter. */
    if (U_SUCCESS(status)) {
        byteCount = preflight(src, length, cnv);
    }

    /* One extra byte so the result can always be NUL-terminated. */
    bytes = NEW_ARRAY(char, byteCount + 1);
    if (bytes == NULL) {
        ucnv_close(cnv);
        *byteLength = 0;
        return NULL;
    }

    if (U_SUCCESS(status)) {
        dest = bytes;
        ucnv_fromUnicode(cnv, &dest, bytes + byteCount + 1, &src, srcLimit, 0, TRUE, &status);
    }

    ucnv_close(cnv);

    bytes[byteCount] = 0;
    *byteLength = byteCount;

    return bytes;
}
/* Releases a buffer handed out by extractBytes(). */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
static void TestConstruction ( void )
{
UErrorCode status = U_ZERO_ERROR ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
UEnumeration * e = ucsdet_getAllDetectableCharsets ( csd , & status ) ;
2006-08-06 22:38:31 +00:00
const char * name ;
2006-02-10 23:49:09 +00:00
int32_t count = uenum_count ( e , & status ) ;
2006-08-06 22:38:31 +00:00
int32_t i , length ;
2006-02-10 23:49:09 +00:00
for ( i = 0 ; i < count ; i + = 1 ) {
2006-08-06 22:38:31 +00:00
name = uenum_next ( e , & length , & status ) ;
2006-02-10 23:49:09 +00:00
if ( name = = NULL | | length < = 0 ) {
log_err ( " ucsdet_getAllDetectableCharsets() returned a null or empty name! \n " ) ;
}
}
2006-08-06 22:38:31 +00:00
/* one past the list of all names must return NULL */
name = uenum_next ( e , & length , & status ) ;
if ( name ! = NULL | | length ! = 0 | | U_FAILURE ( status ) ) {
log_err ( " ucsdet_getAllDetectableCharsets(past the list) returned a non-null name! \n " ) ;
}
2006-02-10 23:49:09 +00:00
uenum_close ( e ) ;
ucsdet_close ( csd ) ;
}
static void TestUTF8 ( void )
{
UErrorCode status = U_ZERO_ERROR ;
2006-09-29 03:38:06 +00:00
static const char ss [ ] = " This is a string with some non-ascii characters that will "
2006-02-10 23:49:09 +00:00
" be converted to UTF-8, then shoved through the detection process. "
" \\ u0391 \\ u0392 \\ u0393 \\ u0394 \\ u0395 "
" Sure would be nice if our source could contain Unicode directly! " ;
int32_t byteLength = 0 , sLength = 0 , dLength = 0 ;
2006-09-29 03:38:06 +00:00
UChar s [ sizeof ( ss ) ] ;
char * bytes ;
2006-02-10 23:49:09 +00:00
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
2006-09-29 03:38:06 +00:00
UChar detected [ sizeof ( ss ) ] ;
sLength = u_unescape ( ss , s , sizeof ( ss ) ) ;
bytes = extractBytes ( s , sLength , " UTF-8 " , & byteLength ) ;
2006-02-10 23:49:09 +00:00
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
2008-02-08 09:10:22 +00:00
if ( U_FAILURE ( status ) ) {
log_err ( " status is %s \n " , u_errorName ( status ) ) ;
goto bail ;
}
2006-02-10 23:49:09 +00:00
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
log_err ( " Detection failure for UTF-8: got no matches. \n " ) ;
goto bail ;
}
dLength = ucsdet_getUChars ( match , detected , sLength , & status ) ;
if ( u_strCompare ( detected , dLength , s , sLength , FALSE ) ! = 0 ) {
log_err ( " Round-trip test failed! \n " ) ;
}
ucsdet_setDeclaredEncoding ( csd , " UTF-8 " , 5 , & status ) ; /* for coverage */
bail :
freeBytes ( bytes ) ;
ucsdet_close ( csd ) ;
}
static void TestUTF16 ( void )
{
UErrorCode status = U_ZERO_ERROR ;
/* Notice the BOM on the start of this string */
2006-09-29 03:38:06 +00:00
static const UChar chars [ ] = {
2006-02-10 23:49:09 +00:00
0xFEFF , 0x0623 , 0x0648 , 0x0631 , 0x0648 , 0x0628 , 0x0627 , 0x002C ,
0x0020 , 0x0628 , 0x0631 , 0x0645 , 0x062c , 0x064a , 0x0627 , 0x062a ,
0x0020 , 0x0627 , 0x0644 , 0x062d , 0x0627 , 0x0633 , 0x0648 , 0x0628 ,
0x0020 , 0x002b , 0x0020 , 0x0627 , 0x0646 , 0x062a , 0x0631 , 0x0646 ,
0x064a , 0x062a , 0x0000 } ;
int32_t beLength = 0 , leLength = 0 , cLength = ARRAY_SIZE ( chars ) ;
char * beBytes = extractBytes ( chars , cLength , " UTF-16BE " , & beLength ) ;
char * leBytes = extractBytes ( chars , cLength , " UTF-16LE " , & leLength ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
const char * name ;
int32_t conf ;
ucsdet_setText ( csd , beBytes , beLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
log_err ( " Encoding detection failure for UTF-16BE: got no matches. \n " ) ;
goto try_le ;
}
name = ucsdet_getName ( match , & status ) ;
conf = ucsdet_getConfidence ( match , & status ) ;
if ( strcmp ( name , " UTF-16BE " ) ! = 0 ) {
log_err ( " Encoding detection failure for UTF-16BE: got %s \n " , name ) ;
}
if ( conf ! = 100 ) {
log_err ( " Did not get 100%% confidence for UTF-16BE: got %d \n " , conf ) ;
}
try_le :
ucsdet_setText ( csd , leBytes , leLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
log_err ( " Encoding detection failure for UTF-16LE: got no matches. \n " ) ;
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
conf = ucsdet_getConfidence ( match , & status ) ;
if ( strcmp ( name , " UTF-16LE " ) ! = 0 ) {
log_err ( " Enconding detection failure for UTF-16LE: got %s \n " , name ) ;
}
if ( conf ! = 100 ) {
log_err ( " Did not get 100%% confidence for UTF-16LE: got %d \n " , conf ) ;
}
bail :
freeBytes ( leBytes ) ;
freeBytes ( beBytes ) ;
ucsdet_close ( csd ) ;
}
static void TestC1Bytes ( void )
{
2006-07-28 22:58:29 +00:00
# if !UCONFIG_NO_LEGACY_CONVERSION
2006-02-10 23:49:09 +00:00
UErrorCode status = U_ZERO_ERROR ;
2006-09-29 03:38:06 +00:00
static const char ssISO [ ] = " This is a small sample of some English text. Just enough to be sure that it detects correctly. " ;
static const char ssWindows [ ] = " This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\ u201CC1 \\ u201D bytes. " ;
2006-02-10 23:49:09 +00:00
int32_t sISOLength = 0 , sWindowsLength = 0 ;
2006-09-29 03:38:06 +00:00
UChar sISO [ sizeof ( ssISO ) ] ;
UChar sWindows [ sizeof ( ssWindows ) ] ;
2006-02-10 23:49:09 +00:00
int32_t lISO = 0 , lWindows = 0 ;
2006-09-29 03:38:06 +00:00
char * bISO ;
char * bWindows ;
2006-02-10 23:49:09 +00:00
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
const char * name ;
2006-09-29 03:38:06 +00:00
sISOLength = u_unescape ( ssISO , sISO , sizeof ( ssISO ) ) ;
sWindowsLength = u_unescape ( ssWindows , sWindows , sizeof ( ssWindows ) ) ;
bISO = extractBytes ( sISO , sISOLength , " ISO-8859-1 " , & lISO ) ;
bWindows = extractBytes ( sWindows , sWindowsLength , " windows-1252 " , & lWindows ) ;
2006-02-10 23:49:09 +00:00
ucsdet_setText ( csd , bWindows , lWindows , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
log_err ( " English test with C1 bytes got no matches. \n " ) ;
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " windows-1252 " ) ! = 0 ) {
log_err ( " English text with C1 bytes does not detect as windows-1252, but as %s \n " , name ) ;
}
ucsdet_setText ( csd , bISO , lISO , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
log_err ( " English text without C1 bytes got no matches. \n " ) ;
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
log_err ( " English text without C1 bytes does not detect as ISO-8859-1, but as %s \n " , name ) ;
}
bail :
freeBytes ( bWindows ) ;
freeBytes ( bISO ) ;
ucsdet_close ( csd ) ;
2006-07-28 22:58:29 +00:00
# endif
2006-02-10 23:49:09 +00:00
}
static void TestInputFilter ( void )
{
UErrorCode status = U_ZERO_ERROR ;
2006-09-29 03:38:06 +00:00
static const char ss [ ] = " <a> <lot> <of> <English> <inside> <the> <markup> Un tr \\ u00E8s petit peu de Fran \\ u00E7ais. <to> <confuse> <the> <detector> " ;
2006-02-10 23:49:09 +00:00
int32_t sLength = 0 ;
2006-09-29 03:38:06 +00:00
UChar s [ sizeof ( ss ) ] ;
2006-02-10 23:49:09 +00:00
int32_t byteLength = 0 ;
2006-09-29 03:38:06 +00:00
char * bytes ;
2006-02-10 23:49:09 +00:00
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
const char * lang , * name ;
2006-09-29 03:38:06 +00:00
sLength = u_unescape ( ss , s , sizeof ( ss ) ) ;
bytes = extractBytes ( s , sLength , " ISO-8859-1 " , & byteLength ) ;
2006-02-10 23:49:09 +00:00
ucsdet_enableInputFilter ( csd , TRUE ) ;
if ( ! ucsdet_isInputFilterEnabled ( csd ) ) {
log_err ( " ucsdet_enableInputFilter(csd, TRUE) did not enable input filter! \n " ) ;
}
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
log_err ( " Turning on the input filter resulted in no matches. \n " ) ;
goto turn_off ;
}
name = ucsdet_getName ( match , & status ) ;
if ( name = = NULL | | strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
log_err ( " Turning on the input filter resulted in %s rather than ISO-8859-1 \n " , name ) ;
} else {
lang = ucsdet_getLanguage ( match , & status ) ;
if ( lang = = NULL | | strcmp ( lang , " fr " ) ! = 0 ) {
log_err ( " Input filter did not strip markup! \n " ) ;
}
}
turn_off :
ucsdet_enableInputFilter ( csd , FALSE ) ;
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
log_err ( " Turning off the input filter resulted in no matches. \n " ) ;
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
if ( name = = NULL | | strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
log_err ( " Turning off the input filter resulted in %s rather than ISO-8859-1 \n " , name ) ;
} else {
lang = ucsdet_getLanguage ( match , & status ) ;
if ( lang = = NULL | | strcmp ( lang , " en " ) ! = 0 ) {
log_err ( " Unfiltered input did not detect as English! \n " ) ;
}
}
bail :
freeBytes ( bytes ) ;
ucsdet_close ( csd ) ;
}
2006-03-24 23:29:08 +00:00
/*
 * Calls the ucsdet entry points with NULL arguments while the status already
 * holds U_USELESS_COLLATOR_ERROR, then verifies that none of the calls
 * replaced that status value.
 */
static void TestChaining(void) {
    const UErrorCode sentinel = U_USELESS_COLLATOR_ERROR;
    UErrorCode status = sentinel;

    ucsdet_open(&status);
    ucsdet_setText(NULL, NULL, 0, &status);
    ucsdet_getName(NULL, &status);
    ucsdet_getConfidence(NULL, &status);
    ucsdet_getLanguage(NULL, &status);
    ucsdet_detect(NULL, &status);
    ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
    ucsdet_detectAll(NULL, NULL, &status);
    ucsdet_getUChars(NULL, NULL, 0, &status);
    ucsdet_getUChars(NULL, NULL, 0, &status);
    ucsdet_close(NULL);

    /* All of this code should have done nothing. */
    if (status == sentinel) {
        return;
    }

    log_err(" Status got changed to %s \n ", u_errorName(status));
}
2008-02-01 07:08:13 +00:00
/*
 * Feeds short byte strings that end in partial or complete escape/lead-byte
 * sequences to a detector whose declared encoding was set to " ISO-2022-JP ",
 * and compares the best match against testResults[]. A NULL entry in
 * testResults[] means "no match expected". The first mismatch ends the test.
 *
 * testStrings[] and testResults[] are parallel arrays and must keep the same
 * number of entries.
 *
 * NOTE(review): all literals are reproduced exactly as found, including the
 * blanks between the \x escapes and around charset names -- confirm.
 */
static void TestBufferOverflow(void) {
    UErrorCode status = U_ZERO_ERROR;
    static const char *testStrings[] = {
        " \x80 \x20 \x54 \x68 \x69 \x73 \x20 \x69 \x73 \x20 \x45 \x6E \x67 \x6C \x69 \x73 \x68 \x20 \x1b ", /* A partial ISO-2022 shift state at the end */
        " \x80 \x20 \x54 \x68 \x69 \x73 \x20 \x69 \x73 \x20 \x45 \x6E \x67 \x6C \x69 \x73 \x68 \x20 \x1b \x24 ", /* A partial ISO-2022 shift state at the end */
        " \x80 \x20 \x54 \x68 \x69 \x73 \x20 \x69 \x73 \x20 \x45 \x6E \x67 \x6C \x69 \x73 \x68 \x20 \x1b \x24 \x28 ", /* A partial ISO-2022 shift state at the end */
        " \x80 \x20 \x54 \x68 \x69 \x73 \x20 \x69 \x73 \x20 \x45 \x6E \x67 \x6C \x69 \x73 \x68 \x20 \x1b \x24 \x28 \x44 ", /* A complete ISO-2022 shift state at the end with a bad one at the start */
        " \x1b \x24 \x28 \x44 ", /* A complete ISO-2022 shift state at the end */
        " \xa1 ", /* Could be a single byte shift-jis at the end */
        " \x74 \x68 \xa1 ", /* Could be a single byte shift-jis at the end */
        " \x74 \x68 \x65 \xa1 " /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
    };
    static const char *testResults[] = {
        " windows-1252 ",
        " windows-1252 ",
        " windows-1252 ",
        " windows-1252 ",
        " ISO-2022-JP ",
        NULL,
        NULL,
        " ISO-8859-1 "
    };
    int32_t idx = 0;
    UCharsetDetector *csd = ucsdet_open(&status);
    const UCharsetMatch *match;
    const char *name;

    ucsdet_setDeclaredEncoding(csd, " ISO-2022-JP ", -1, &status);

    if (U_FAILURE(status)) {
        log_err(" Couldn't open detector. %s \n ", u_errorName(status));
        goto bail;
    }

    /* Cast keeps the comparison signed; ARRAY_SIZE yields an unsigned size. */
    for (idx = 0; idx < (int32_t)ARRAY_SIZE(testStrings); idx++) {
        ucsdet_setText(csd, testStrings[idx], -1, &status);
        match = ucsdet_detect(csd, &status);

        if (match == NULL) {
            if (testResults[idx] != NULL) {
                log_err(" Unexpectedly got no results at index %d. \n ", idx);
            }
            else {
                log_verbose(" Got no result as expected at index %d. \n ", idx);
            }
            continue;
        }

        name = ucsdet_getName(match, &status);

        if (testResults[idx] == NULL || name == NULL || strcmp(name, testResults[idx]) != 0) {
            /* Either string may be NULL here; never pass NULL to a %s conversion. */
            log_err(" Unexpectedly got %s instead of %s at index %d with confidence %d. \n ",
                name != NULL ? name : "(null)",
                testResults[idx] != NULL ? testResults[idx] : "(none)",
                idx, ucsdet_getConfidence(match, &status));
            goto bail;
        }
    }

bail:
    ucsdet_close(csd);
}