2017-01-20 00:20:31 +00:00
// © 2016 and later: Unicode, Inc. and others.
2016-06-15 18:58:17 +00:00
// License & terms of use: http://www.unicode.org/copyright.html
2006-02-06 18:03:11 +00:00
/*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2016-05-31 21:45:07 +00:00
* Copyright ( C ) 2005 - 2016 , International Business Machines
* Corporation and others . All Rights Reserved .
2006-02-06 18:03:11 +00:00
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*/
# include "unicode/utypes.h"
# include "unicode/ucsdet.h"
# include "unicode/ucnv.h"
# include "unicode/unistr.h"
# include "unicode/putil.h"
2009-04-24 22:24:27 +00:00
# include "unicode/uniset.h"
2006-02-06 18:03:11 +00:00
# include "intltest.h"
# include "csdetest.h"
# include "xmlparser.h"
2018-01-03 04:45:29 +00:00
# include <memory>
2006-02-06 18:03:11 +00:00
# include <stdlib.h>
# include <string.h>
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
# include <stdio.h>
# endif
2006-02-06 18:03:11 +00:00
// Code point U+0020; passed to split() in this file to break an "encodings"
// attribute value into individual entries.
# define CH_SPACE 0x0020
// Code point U+002F; passed to split() in this file to break one entry into
// its encoding part and its optional language part.
# define CH_SLASH 0x002F
2019-08-14 21:06:44 +00:00
// TEST_ASSERT(x): when the expression x is false, reports a failure through
// errln() tagged with the current file and line. Execution continues after
// the report.
# define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
if ( ! ( x ) ) { \
errln ( " Failure in file %s, line %d " , __FILE__ , __LINE__ ) ; \
} \
} UPRV_BLOCK_MACRO_END
// TEST_ASSERT_SUCCESS(errcode): when errcode is a failing UErrorCode, reports
// it through errcheckln() with file, line and the error's name, then returns
// from the enclosing function. The expansion contains a bare `return;`, so it
// is only usable inside functions that return void.
# define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
if ( U_FAILURE ( errcode ) ) { \
errcheckln ( errcode , " Failure in file %s, line %d, status = \" %s \" " , __FILE__ , __LINE__ , u_errorName ( errcode ) ) ; \
return ; \
} \
} UPRV_BLOCK_MACRO_END
2012-05-26 00:19:47 +00:00
2006-02-06 18:03:11 +00:00
//---------------------------------------------------------------------------
//
// Test class boilerplate
//
//---------------------------------------------------------------------------
// Default constructor; performs no initialization of its own.
CharsetDetectionTest::CharsetDetectionTest() {}
// Destructor; has no cleanup of its own to perform.
CharsetDetectionTest::~CharsetDetectionTest() {}
void CharsetDetectionTest : : runIndexedTest ( int32_t index , UBool exec , const char * & name , char * /*par*/ )
{
if ( exec ) logln ( " TestSuite CharsetDetectionTest: " ) ;
switch ( index ) {
case 0 : name = " ConstructionTest " ;
if ( exec ) ConstructionTest ( ) ;
break ;
case 1 : name = " UTF8Test " ;
if ( exec ) UTF8Test ( ) ;
break ;
case 2 : name = " UTF16Test " ;
if ( exec ) UTF16Test ( ) ;
break ;
case 3 : name = " C1BytesTest " ;
if ( exec ) C1BytesTest ( ) ;
break ;
case 4 : name = " InputFilterTest " ;
if ( exec ) InputFilterTest ( ) ;
break ;
case 5 : name = " DetectionTest " ;
if ( exec ) DetectionTest ( ) ;
break ;
2009-08-04 21:09:17 +00:00
# if !UCONFIG_NO_LEGACY_CONVERSION
2009-03-31 15:39:00 +00:00
case 6 : name = " IBM424Test " ;
if ( exec ) IBM424Test ( ) ;
break ;
case 7 : name = " IBM420Test " ;
if ( exec ) IBM420Test ( ) ;
break ;
2009-08-04 21:09:17 +00:00
# else
case 6 :
case 7 : name = " skip " ; break ;
# endif
2009-04-24 22:24:27 +00:00
case 8 : name = " Ticket6394Test " ;
if ( exec ) Ticket6394Test ( ) ;
break ;
2012-05-26 00:19:47 +00:00
case 9 : name = " Ticket6954Test " ;
if ( exec ) Ticket6954Test ( ) ;
break ;
2006-02-06 18:03:11 +00:00
default : name = " " ;
break ; //needed to end loop
}
}
// Splits `src` at every occurrence of `ch`.
//
// Returns a new[]-allocated array of the pieces (separators excluded) and
// stores the number of pieces in `splits`; that count is always at least 1.
// The caller owns the array and must release it with delete[].
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    // First pass: one piece, plus one more per separator found.
    splits = 1;
    for (int32_t pos = src.indexOf(ch, 0); pos >= 0; pos = src.indexOf(ch, pos + 1)) {
        ++splits;
    }

    UnicodeString *pieces = new UnicodeString[splits];

    // Second pass: copy out the text between consecutive separators.
    int32_t pieceStart = 0;
    int32_t pieceIndex = 0;
    for (int32_t sep = src.indexOf(ch, pieceStart); sep >= 0; sep = src.indexOf(ch, pieceStart)) {
        src.extractBetween(pieceStart, sep, pieces[pieceIndex]);
        ++pieceIndex;
        pieceStart = sep + 1;
    }

    // Whatever follows the last separator (possibly nothing) is the final piece.
    src.extractBetween(pieceStart, src.length(), pieces[pieceIndex]);

    return pieces;
}
static char * extractBytes ( const UnicodeString & source , const char * codepage , int32_t & length )
{
2007-08-31 18:06:46 +00:00
int32_t sLength = source . length ( ) ;
2006-07-28 22:58:29 +00:00
char * bytes = NULL ;
2006-02-06 18:03:11 +00:00
2007-08-31 18:06:46 +00:00
length = source . extract ( 0 , sLength , NULL , codepage ) ;
2007-08-24 19:04:53 +00:00
2007-08-31 18:06:46 +00:00
if ( length > 0 ) {
2018-01-03 04:45:29 +00:00
bytes = new char [ length + 1 ] ;
2007-08-31 18:06:46 +00:00
source . extract ( 0 , sLength , bytes , codepage ) ;
2006-07-28 22:58:29 +00:00
}
2006-02-06 18:03:11 +00:00
return bytes ;
}
void CharsetDetectionTest : : checkEncoding ( const UnicodeString & testString , const UnicodeString & encoding , const UnicodeString & id )
{
int32_t splits = 0 ;
int32_t testLength = testString . length ( ) ;
2018-01-03 04:45:29 +00:00
std : : unique_ptr < UnicodeString [ ] > eSplit ( split ( encoding , CH_SLASH , splits ) ) ;
2006-02-06 18:03:11 +00:00
UErrorCode status = U_ZERO_ERROR ;
int32_t cpLength = eSplit [ 0 ] . length ( ) ;
char codepage [ 64 ] ;
u_UCharsToChars ( eSplit [ 0 ] . getBuffer ( ) , codepage , cpLength ) ;
codepage [ cpLength ] = ' \0 ' ;
2009-11-20 06:28:25 +00:00
LocalUCharsetDetectorPointer csd ( ucsdet_open ( & status ) ) ;
2006-02-06 18:03:11 +00:00
int32_t byteLength = 0 ;
2018-01-03 04:45:29 +00:00
std : : unique_ptr < char [ ] > bytes ( extractBytes ( testString , codepage , byteLength ) ) ;
2006-02-06 18:03:11 +00:00
2018-01-03 04:45:29 +00:00
if ( ! bytes ) {
2006-07-28 22:58:29 +00:00
# if !UCONFIG_NO_LEGACY_CONVERSION
2011-03-03 19:29:57 +00:00
dataerrln ( " Can't open a " + encoding + " converter for " + id ) ;
2006-07-28 22:58:29 +00:00
# endif
return ;
}
2018-01-03 04:45:29 +00:00
ucsdet_setText ( csd . getAlias ( ) , bytes . get ( ) , byteLength , & status ) ;
2006-02-06 18:03:11 +00:00
2006-02-13 20:47:36 +00:00
int32_t matchCount = 0 ;
2009-11-20 06:28:25 +00:00
const UCharsetMatch * * matches = ucsdet_detectAll ( csd . getAlias ( ) , & matchCount , & status ) ;
2006-02-07 21:59:16 +00:00
2006-02-13 20:47:36 +00:00
UnicodeString name ( ucsdet_getName ( matches [ 0 ] , & status ) ) ;
UnicodeString lang ( ucsdet_getLanguage ( matches [ 0 ] , & status ) ) ;
2006-02-06 18:03:11 +00:00
UChar * decoded = NULL ;
int32_t dLength = 0 ;
2006-02-13 20:47:36 +00:00
if ( matchCount = = 0 ) {
2006-02-07 21:59:16 +00:00
errln ( " Encoding detection failure for " + id + " : expected " + eSplit [ 0 ] + " , got no matches " ) ;
2018-01-03 04:45:29 +00:00
return ;
2006-02-07 21:59:16 +00:00
}
2006-02-06 18:03:11 +00:00
if ( name . compare ( eSplit [ 0 ] ) ! = 0 ) {
errln ( " Encoding detection failure for " + id + " : expected " + eSplit [ 0 ] + " , got " + name ) ;
2006-02-07 19:12:43 +00:00
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
2006-02-07 19:12:43 +00:00
for ( int32_t m = 0 ; m < matchCount ; m + = 1 ) {
const char * name = ucsdet_getName ( matches [ m ] , & status ) ;
const char * lang = ucsdet_getLanguage ( matches [ m ] , & status ) ;
int32_t confidence = ucsdet_getConfidence ( matches [ m ] , & status ) ;
2006-02-09 21:13:01 +00:00
printf ( " %s (%s) %d \n " , name , lang , confidence ) ;
2006-02-07 19:12:43 +00:00
}
2006-02-09 21:13:01 +00:00
# endif
2018-01-03 04:45:29 +00:00
return ;
2006-02-06 18:03:11 +00:00
}
if ( splits > 1 & & lang . compare ( eSplit [ 1 ] ) ! = 0 ) {
errln ( " Language detection failure for " + id + " , " + eSplit [ 0 ] + " : expected " + eSplit [ 1 ] + " , got " + lang ) ;
2018-01-03 04:45:29 +00:00
return ;
2006-02-06 18:03:11 +00:00
}
2018-01-03 04:45:29 +00:00
decoded = new UChar [ testLength ] ;
2006-02-13 20:47:36 +00:00
dLength = ucsdet_getUChars ( matches [ 0 ] , decoded , testLength , & status ) ;
2006-02-06 18:03:11 +00:00
if ( testString . compare ( decoded , dLength ) ! = 0 ) {
errln ( " Round-trip error for " + id + " , " + eSplit [ 0 ] + " : getUChars() didn't yeild the original string. " ) ;
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
for ( int32_t i = 0 ; i < testLength ; i + = 1 ) {
if ( testString [ i ] ! = decoded [ i ] ) {
printf ( " Strings differ at byte %d \n " , i ) ;
break ;
}
}
# endif
2006-02-06 18:03:11 +00:00
}
2018-01-03 04:45:29 +00:00
delete [ ] decoded ;
2006-02-06 18:03:11 +00:00
}
const char * CharsetDetectionTest : : getPath ( char buffer [ 2048 ] , const char * filename ) {
UErrorCode status = U_ZERO_ERROR ;
const char * testDataDirectory = IntlTest : : getSourceTestData ( status ) ;
if ( U_FAILURE ( status ) ) {
errln ( " ERROR: getPath() failed - %s " , u_errorName ( status ) ) ;
return NULL ;
}
strcpy ( buffer , testDataDirectory ) ;
strcat ( buffer , filename ) ;
return buffer ;
}
void CharsetDetectionTest : : ConstructionTest ( )
{
2009-11-20 06:28:25 +00:00
IcuTestErrorCode status ( * this , " ConstructionTest " ) ;
LocalUCharsetDetectorPointer csd ( ucsdet_open ( status ) ) ;
LocalUEnumerationPointer e ( ucsdet_getAllDetectableCharsets ( csd . getAlias ( ) , status ) ) ;
int32_t count = uenum_count ( e . getAlias ( ) , status ) ;
2006-02-06 18:03:11 +00:00
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
printf ( " There are %d recognizers. \n " , count ) ;
# endif
2006-02-06 18:03:11 +00:00
for ( int32_t i = 0 ; i < count ; i + = 1 ) {
int32_t length ;
2009-11-20 06:28:25 +00:00
const char * name = uenum_next ( e . getAlias ( ) , & length , status ) ;
2006-02-06 18:03:11 +00:00
if ( name = = NULL | | length < = 0 ) {
2006-02-09 21:13:01 +00:00
errln ( " ucsdet_getAllDetectableCharsets() returned a null or empty name! " ) ;
2006-02-06 18:03:11 +00:00
}
2006-02-09 21:13:01 +00:00
# ifdef DEBUG_DETECT
printf ( " %s \n " , name ) ;
# endif
2006-02-06 18:03:11 +00:00
}
2013-09-17 06:57:53 +00:00
const char * defDisabled [ ] = {
" IBM420_rtl " , " IBM420_ltr " ,
" IBM424_rtl " , " IBM424_ltr " ,
0
} ;
LocalUEnumerationPointer eActive ( ucsdet_getDetectableCharsets ( csd . getAlias ( ) , status ) ) ;
const char * activeName = NULL ;
2013-09-18 20:08:25 +00:00
while ( ( activeName = uenum_next ( eActive . getAlias ( ) , NULL , status ) ) ) {
2013-09-17 06:57:53 +00:00
// the charset must be included in all list
UBool found = FALSE ;
const char * name = NULL ;
uenum_reset ( e . getAlias ( ) , status ) ;
2013-09-18 20:08:25 +00:00
while ( ( name = uenum_next ( e . getAlias ( ) , NULL , status ) ) ) {
2013-09-17 06:57:53 +00:00
if ( strcmp ( activeName , name ) = = 0 ) {
found = TRUE ;
break ;
}
}
if ( ! found ) {
errln ( UnicodeString ( activeName ) + " is not included in the all charset list. " ) ;
}
// some charsets are disabled by default
found = FALSE ;
for ( int32_t i = 0 ; defDisabled [ i ] ! = 0 ; i + + ) {
if ( strcmp ( activeName , defDisabled [ i ] ) = = 0 ) {
found = TRUE ;
break ;
}
}
if ( found ) {
errln ( UnicodeString ( activeName ) + " should not be included in the default charset list. " ) ;
}
}
2006-02-06 18:03:11 +00:00
}
void CharsetDetectionTest : : UTF8Test ( )
{
UErrorCode status = U_ZERO_ERROR ;
UnicodeString ss = " This is a string with some non-ascii characters that will "
" be converted to UTF-8, then shoved through the detection process. "
" \\ u0391 \\ u0392 \\ u0393 \\ u0394 \\ u0395 "
" Sure would be nice if our source could contain Unicode directly! " ;
UnicodeString s = ss . unescape ( ) ;
int32_t byteLength = 0 , sLength = s . length ( ) ;
char * bytes = extractBytes ( s , " UTF-8 " , byteLength ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
2018-01-03 04:45:29 +00:00
UChar * detected = new UChar [ sLength ] ;
2006-02-06 18:03:11 +00:00
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Detection failure for UTF-8: got no matches. " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
ucsdet_getUChars ( match , detected , sLength , & status ) ;
if ( s . compare ( detected , sLength ) ! = 0 ) {
errln ( " Round-trip test failed! " ) ;
}
2006-02-07 21:59:16 +00:00
ucsdet_setDeclaredEncoding ( csd , " UTF-8 " , 5 , & status ) ; /* for coverage */
bail :
2018-01-03 04:45:29 +00:00
delete [ ] detected ;
delete [ ] bytes ;
2006-02-06 18:03:11 +00:00
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : UTF16Test ( )
{
UErrorCode status = U_ZERO_ERROR ;
/* Notice the BOM on the start of this string */
UChar chars [ ] = {
0xFEFF , 0x0623 , 0x0648 , 0x0631 , 0x0648 , 0x0628 , 0x0627 , 0x002C ,
0x0020 , 0x0628 , 0x0631 , 0x0645 , 0x062c , 0x064a , 0x0627 , 0x062a ,
0x0020 , 0x0627 , 0x0644 , 0x062d , 0x0627 , 0x0633 , 0x0648 , 0x0628 ,
0x0020 , 0x002b , 0x0020 , 0x0627 , 0x0646 , 0x062a , 0x0631 , 0x0646 ,
0x064a , 0x062a , 0x0000 } ;
UnicodeString s ( chars ) ;
int32_t beLength = 0 , leLength = 0 ;
2018-01-03 04:45:29 +00:00
std : : unique_ptr < char [ ] > beBytes ( extractBytes ( s , " UTF-16BE " , beLength ) ) ;
std : : unique_ptr < char [ ] > leBytes ( extractBytes ( s , " UTF-16LE " , leLength ) ) ;
LocalUCharsetDetectorPointer csd ( ucsdet_open ( & status ) ) ;
2006-02-06 18:03:11 +00:00
const UCharsetMatch * match ;
const char * name ;
2006-02-07 21:59:16 +00:00
int32_t conf ;
2006-02-06 18:03:11 +00:00
2018-01-03 04:45:29 +00:00
ucsdet_setText ( csd . getAlias ( ) , beBytes . get ( ) , beLength , & status ) ;
match = ucsdet_detect ( csd . getAlias ( ) , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Encoding detection failure for UTF-16BE: got no matches. " ) ;
2018-01-03 04:45:29 +00:00
} else {
2006-02-06 18:03:11 +00:00
2018-01-03 04:45:29 +00:00
name = ucsdet_getName ( match , & status ) ;
conf = ucsdet_getConfidence ( match , & status ) ;
2006-02-06 18:03:11 +00:00
2018-01-03 04:45:29 +00:00
if ( strcmp ( name , " UTF-16BE " ) ! = 0 ) {
errln ( " Encoding detection failure for UTF-16BE: got %s " , name ) ;
} else if ( conf ! = 100 ) {
errln ( " Did not get 100%% confidence for UTF-16BE: got %d " , conf ) ;
}
2006-02-07 21:59:16 +00:00
}
2018-01-03 04:45:29 +00:00
ucsdet_setText ( csd . getAlias ( ) , leBytes . get ( ) , leLength , & status ) ;
match = ucsdet_detect ( csd . getAlias ( ) , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Encoding detection failure for UTF-16LE: got no matches. " ) ;
2018-01-03 04:45:29 +00:00
return ;
2006-02-07 21:59:16 +00:00
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
2006-02-07 21:59:16 +00:00
conf = ucsdet_getConfidence ( match , & status ) ;
2006-02-06 18:03:11 +00:00
if ( strcmp ( name , " UTF-16LE " ) ! = 0 ) {
errln ( " Enconding detection failure for UTF-16LE: got %s " , name ) ;
2018-01-03 04:45:29 +00:00
return ;
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
if ( conf ! = 100 ) {
errln ( " Did not get 100%% confidence for UTF-16LE: got %d " , conf ) ;
}
2006-02-06 18:03:11 +00:00
}
void CharsetDetectionTest : : InputFilterTest ( )
{
UErrorCode status = U_ZERO_ERROR ;
2018-01-03 04:45:29 +00:00
UnicodeString s ( u " <a> <lot> <of> <English> <inside> <the> <markup> Un très petit peu de Français. <to> <confuse> <the> <detector> " ) ;
2006-02-06 18:03:11 +00:00
int32_t byteLength = 0 ;
char * bytes = extractBytes ( s , " ISO-8859-1 " , byteLength ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
2006-02-09 21:13:01 +00:00
const char * lang , * name ;
2006-02-06 18:03:11 +00:00
ucsdet_enableInputFilter ( csd , TRUE ) ;
if ( ! ucsdet_isInputFilterEnabled ( csd ) ) {
errln ( " ucsdet_enableInputFilter(csd, TRUE) did not enable input filter! " ) ;
}
2006-02-09 21:13:01 +00:00
2006-02-06 18:03:11 +00:00
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Turning on the input filter resulted in no matches. " ) ;
goto turn_off ;
}
2006-02-09 21:13:01 +00:00
name = ucsdet_getName ( match , & status ) ;
if ( name = = NULL | | strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
2006-02-10 23:49:09 +00:00
errln ( " Turning on the input filter resulted in %s rather than ISO-8859-1. " , name ) ;
2006-02-09 21:13:01 +00:00
} else {
lang = ucsdet_getLanguage ( match , & status ) ;
2006-02-06 18:03:11 +00:00
2006-02-09 21:13:01 +00:00
if ( lang = = NULL | | strcmp ( lang , " fr " ) ! = 0 ) {
errln ( " Input filter did not strip markup! " ) ;
}
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
turn_off :
2006-02-06 18:03:11 +00:00
ucsdet_enableInputFilter ( csd , FALSE ) ;
ucsdet_setText ( csd , bytes , byteLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " Turning off the input filter resulted in no matches. " ) ;
goto bail ;
}
2006-02-09 21:13:01 +00:00
name = ucsdet_getName ( match , & status ) ;
2006-02-06 18:03:11 +00:00
2006-02-09 21:13:01 +00:00
if ( name = = NULL | | strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
2006-02-10 23:49:09 +00:00
errln ( " Turning off the input filter resulted in %s rather than ISO-8859-1. " , name ) ;
2006-02-09 21:13:01 +00:00
} else {
lang = ucsdet_getLanguage ( match , & status ) ;
if ( lang = = NULL | | strcmp ( lang , " en " ) ! = 0 ) {
errln ( " Unfiltered input did not detect as English! " ) ;
}
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
bail :
2018-01-03 04:45:29 +00:00
delete [ ] bytes ;
2006-02-06 18:03:11 +00:00
ucsdet_close ( csd ) ;
}
void CharsetDetectionTest : : C1BytesTest ( )
{
2006-07-28 22:58:29 +00:00
# if !UCONFIG_NO_LEGACY_CONVERSION
2006-02-06 18:03:11 +00:00
UErrorCode status = U_ZERO_ERROR ;
UnicodeString sISO = " This is a small sample of some English text. Just enough to be sure that it detects correctly. " ;
2008-06-17 00:55:35 +00:00
UnicodeString ssWindows ( " This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\ u201CC1 \\ u201D bytes. " , - 1 , US_INV ) ;
2006-02-06 18:03:11 +00:00
UnicodeString sWindows = ssWindows . unescape ( ) ;
int32_t lISO = 0 , lWindows = 0 ;
char * bISO = extractBytes ( sISO , " ISO-8859-1 " , lISO ) ;
char * bWindows = extractBytes ( sWindows , " windows-1252 " , lWindows ) ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
const UCharsetMatch * match ;
const char * name ;
ucsdet_setText ( csd , bWindows , lWindows , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
2009-06-12 19:34:21 +00:00
errcheckln ( status , " English test with C1 bytes got no matches. - %s " , u_errorName ( status ) ) ;
2006-02-07 21:59:16 +00:00
goto bail ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " windows-1252 " ) ! = 0 ) {
2006-02-07 19:12:43 +00:00
errln ( " English text with C1 bytes does not detect as windows-1252, but as %s " , name ) ;
2006-02-06 18:03:11 +00:00
}
ucsdet_setText ( csd , bISO , lISO , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-07 21:59:16 +00:00
if ( match = = NULL ) {
errln ( " English text without C1 bytes got no matches. " ) ;
goto bail ;
}
2006-02-06 18:03:11 +00:00
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " ISO-8859-1 " ) ! = 0 ) {
2006-02-07 19:12:43 +00:00
errln ( " English text without C1 bytes does not detect as ISO-8859-1, but as %s " , name ) ;
2006-02-06 18:03:11 +00:00
}
2006-02-07 21:59:16 +00:00
bail :
2018-01-03 04:45:29 +00:00
delete [ ] bWindows ;
delete [ ] bISO ;
2006-02-06 18:03:11 +00:00
ucsdet_close ( csd ) ;
2006-07-28 22:58:29 +00:00
# endif
2006-02-06 18:03:11 +00:00
}
void CharsetDetectionTest : : DetectionTest ( )
{
2006-08-02 22:29:50 +00:00
# if !UCONFIG_NO_REGULAR_EXPRESSIONS
2006-02-06 18:03:11 +00:00
UErrorCode status = U_ZERO_ERROR ;
char path [ 2048 ] ;
const char * testFilePath = getPath ( path , " csdetest.xml " ) ;
if ( testFilePath = = NULL ) {
return ; /* Couldn't get path: error message already output. */
}
UXMLParser * parser = UXMLParser : : createParser ( status ) ;
2009-06-12 19:34:21 +00:00
if ( U_FAILURE ( status ) ) {
dataerrln ( " FAIL: UXMLParser::createParser (%s) " , u_errorName ( status ) ) ;
return ;
}
2006-02-06 18:03:11 +00:00
UXMLElement * root = parser - > parseFile ( testFilePath , status ) ;
2006-07-21 22:01:55 +00:00
if ( ! assertSuccess ( " parseFile " , status ) ) return ;
2006-02-06 18:03:11 +00:00
UnicodeString test_case = UNICODE_STRING_SIMPLE ( " test-case " ) ;
UnicodeString id_attr = UNICODE_STRING_SIMPLE ( " id " ) ;
UnicodeString enc_attr = UNICODE_STRING_SIMPLE ( " encodings " ) ;
const UXMLElement * testCase ;
int32_t tc = 0 ;
while ( ( testCase = root - > nextChildElement ( tc ) ) ! = NULL ) {
if ( testCase - > getTagName ( ) . compare ( test_case ) = = 0 ) {
const UnicodeString * id = testCase - > getAttribute ( id_attr ) ;
const UnicodeString * encodings = testCase - > getAttribute ( enc_attr ) ;
const UnicodeString text = testCase - > getText ( TRUE ) ;
int32_t encodingCount ;
UnicodeString * encodingList = split ( * encodings , CH_SPACE , encodingCount ) ;
for ( int32_t e = 0 ; e < encodingCount ; e + = 1 ) {
checkEncoding ( text , encodingList [ e ] , * id ) ;
}
delete [ ] encodingList ;
}
}
delete root ;
delete parser ;
2006-07-28 22:58:29 +00:00
# endif
2006-02-06 18:03:11 +00:00
}
2009-03-31 15:39:00 +00:00
// Converts two sample strings of U+05xx code points to bytes with
// extractBytes() using the IBM424 codepage name, enables the four
// IBM424/IBM420 detectors on a fresh charset detector, and checks that the
// first sample is detected as IBM424_rtl and the second as IBM424_ltr.
void CharsetDetectionTest : : IBM424Test ( )
{
2015-02-27 02:24:05 +00:00
// The whole body is compiled out when UCONFIG_ONLY_HTML_CONVERSION is set.
# if !UCONFIG_ONLY_HTML_CONVERSION
2009-03-31 15:39:00 +00:00
UErrorCode status = U_ZERO_ERROR ;
// First sample, zero-terminated; expected below to be detected as IBM424_rtl.
static const UChar chars [ ] = {
0x05D4 , 0x05E4 , 0x05E8 , 0x05E7 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05D4 , 0x05E6 , 0x05D1 , 0x05D0 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E8 ,
0x05D0 , 0x05E9 , 0x05D9 , 0x002C , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x05D0 , 0x05DC , 0x05D5 , 0x05E3 , 0x0020 , 0x05D0 , 0x05D1 , 0x05D9 ,
0x05D7 , 0x05D9 , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC , 0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x002C , 0x0020 , 0x05D4 , 0x05D5 , 0x05E8 ,
0x05D4 , 0x0020 , 0x05E2 , 0x05DC , 0x0020 , 0x05E4 , 0x05EA , 0x05D9 , 0x05D7 , 0x05EA , 0x0020 , 0x05D7 , 0x05E7 , 0x05D9 , 0x05E8 , 0x05EA ,
0x0020 , 0x05DE , 0x05E6 , 0x0022 , 0x05D7 , 0x0020 , 0x05D1 , 0x05E2 , 0x05E7 , 0x05D1 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D3 , 0x05D5 ,
0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC , 0x05D9 , 0x0020 , 0x05E6 , 0x05D4 , 0x0022 , 0x05DC , 0x0020 , 0x05DE ,
0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 , 0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 ,
0x0020 , 0x05D1 , 0x002B , 0x0020 , 0x05E8 , 0x05E6 , 0x05D5 , 0x05E2 , 0x05EA , 0x0020 , 0x05E2 , 0x05D6 , 0x05D4 , 0x002E , 0x0020 , 0x05DC ,
0x05D3 , 0x05D1 , 0x05E8 , 0x05D9 , 0x0020 , 0x05D4 , 0x05E4 , 0x05E6 , 0x0022 , 0x05E8 , 0x002C , 0x0020 , 0x05DE , 0x05D4 , 0x05E2 , 0x05D3 ,
0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0020 , 0x05E2 , 0x05D5 , 0x05DC , 0x05D4 , 0x0020 , 0x05EA , 0x05DE , 0x05D5 , 0x05E0 , 0x05D4 , 0x0020 ,
0x05E9 , 0x05DC , 0x0020 , 0x0022 , 0x05D4 , 0x05EA , 0x05E0 , 0x05D4 , 0x05D2 , 0x05D5 , 0x05EA , 0x0020 , 0x05E4 , 0x05E1 , 0x05D5 , 0x05DC ,
0x05D4 , 0x0020 , 0x05DC , 0x05DB , 0x05D0 , 0x05D5 , 0x05E8 , 0x05D4 , 0x0020 , 0x05E9 , 0x05DC , 0x0020 , 0x05D7 , 0x05D9 , 0x05D9 , 0x05DC ,
0x05D9 , 0x05DD , 0x0020 , 0x05D1 , 0x05DE , 0x05D4 , 0x05DC , 0x05DA , 0x0020 , 0x05DE , 0x05D1 , 0x05E6 , 0x05E2 , 0x0020 , 0x05E2 , 0x05D5 ,
0x05E4 , 0x05E8 , 0x05EA , 0x0020 , 0x05D9 , 0x05E6 , 0x05D5 , 0x05E7 , 0x05D4 , 0x0022 , 0x002E , 0x0020 , 0x05DE , 0x05E0 , 0x05D3 , 0x05DC ,
0x05D1 , 0x05DC , 0x05D9 , 0x05D8 , 0x0020 , 0x05E7 , 0x05D9 , 0x05D1 , 0x05DC , 0x0020 , 0x05D0 , 0x05EA , 0x0020 , 0x05D4 , 0x05D7 , 0x05DC ,
0x05D8 , 0x05EA , 0x05D5 , 0x0020 , 0x05DC , 0x05D0 , 0x05D7 , 0x05E8 , 0x0020 , 0x05E9 , 0x05E2 , 0x05D9 , 0x05D9 , 0x05DF , 0x0020 , 0x05D1 ,
0x05EA , 0x05DE , 0x05DC , 0x05D9 , 0x05DC , 0x0020 , 0x05D4 , 0x05E2 , 0x05D3 , 0x05D5 , 0x05D9 , 0x05D5 , 0x05EA , 0x0000
} ;
2009-04-14 20:12:54 +00:00
// Second sample, zero-terminated; expected below to be detected as IBM424_ltr.
// NOTE(review): appears to be the first sample in reverse order - confirm.
static const UChar chars_reverse [ ] = {
0x05EA , 0x05D5 , 0x05D9 , 0x05D5 , 0x05D3 , 0x05E2 , 0x05D4 , 0x0020 , 0x05DC , 0x05D9 , 0x05DC , 0x05DE , 0x05EA ,
0x05D1 , 0x0020 , 0x05DF , 0x05D9 , 0x05D9 , 0x05E2 , 0x05E9 , 0x0020 , 0x05E8 , 0x05D7 , 0x05D0 , 0x05DC , 0x0020 , 0x05D5 , 0x05EA , 0x05D8 ,
0x05DC , 0x05D7 , 0x05D4 , 0x0020 , 0x05EA , 0x05D0 , 0x0020 , 0x05DC , 0x05D1 , 0x05D9 , 0x05E7 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 ,
0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x002E , 0x0022 , 0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 ,
0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE , 0x0020 , 0x05DA , 0x05DC , 0x05D4 , 0x05DE , 0x05D1 , 0x0020 , 0x05DD , 0x05D9 ,
0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05DC , 0x05E9 , 0x0020 , 0x05D4 , 0x05E8 , 0x05D5 , 0x05D0 , 0x05DB , 0x05DC , 0x0020 , 0x05D4 ,
0x05DC , 0x05D5 , 0x05E1 , 0x05E4 , 0x0020 , 0x05EA , 0x05D5 , 0x05D2 , 0x05D4 , 0x05E0 , 0x05EA , 0x05D4 , 0x0022 , 0x0020 , 0x05DC , 0x05E9 ,
0x0020 , 0x05D4 , 0x05E0 , 0x05D5 , 0x05DE , 0x05EA , 0x0020 , 0x05D4 , 0x05DC , 0x05D5 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 , 0x05D5 ,
0x05D3 , 0x05E2 , 0x05D4 , 0x05DE , 0x0020 , 0x002C , 0x05E8 , 0x0022 , 0x05E6 , 0x05E4 , 0x05D4 , 0x0020 , 0x05D9 , 0x05E8 , 0x05D1 , 0x05D3 ,
0x05DC , 0x0020 , 0x002E , 0x05D4 , 0x05D6 , 0x05E2 , 0x0020 , 0x05EA , 0x05E2 , 0x05D5 , 0x05E6 , 0x05E8 , 0x0020 , 0x002B , 0x05D1 , 0x0020 ,
0x05D4 , 0x05E7 , 0x05D5 , 0x05E6 , 0x05D9 , 0x0020 , 0x05EA , 0x05E8 , 0x05E4 , 0x05D5 , 0x05E2 , 0x0020 , 0x05E2 , 0x05E6 , 0x05D1 , 0x05DE ,
0x05DE , 0x0020 , 0x05DC , 0x0022 , 0x05D4 , 0x05E6 , 0x0020 , 0x05D9 , 0x05DC , 0x05D9 , 0x05D9 , 0x05D7 , 0x0020 , 0x05EA , 0x05D5 , 0x05D9 ,
0x05D5 , 0x05D3 , 0x05E2 , 0x0020 , 0x05EA , 0x05D5 , 0x05D1 , 0x05E7 , 0x05E2 , 0x05D1 , 0x0020 , 0x05D7 , 0x0022 , 0x05E6 , 0x05DE , 0x0020 ,
0x05EA , 0x05E8 , 0x05D9 , 0x05E7 , 0x05D7 , 0x0020 , 0x05EA , 0x05D7 , 0x05D9 , 0x05EA , 0x05E4 , 0x0020 , 0x05DC , 0x05E2 , 0x0020 , 0x05D4 ,
0x05E8 , 0x05D5 , 0x05D4 , 0x0020 , 0x002C , 0x05D8 , 0x05D9 , 0x05DC , 0x05D1 , 0x05DC , 0x05D3 , 0x05E0 , 0x05DE , 0x0020 , 0x05D9 , 0x05D7 ,
0x05D9 , 0x05D1 , 0x05D0 , 0x0020 , 0x05E3 , 0x05D5 , 0x05DC , 0x05D0 , 0x0020 , 0x05EA , 0x05EA , 0x0020 , 0x002C , 0x05D9 , 0x05E9 , 0x05D0 ,
0x05E8 , 0x05D4 , 0x0020 , 0x05D9 , 0x05D0 , 0x05D1 , 0x05E6 , 0x05D4 , 0x0020 , 0x05D8 , 0x05D9 , 0x05DC , 0x05E7 , 0x05E8 , 0x05E4 , 0x05D4 ,
0x0000
} ;
int32_t bLength = 0 , brLength = 0 ;
UnicodeString s1 ( chars ) ;
UnicodeString s2 ( chars_reverse ) ;
// Both buffers come from extractBytes() and are released with delete[] at `bail`.
char * bytes = extractBytes ( s1 , " IBM424 " , bLength ) ;
char * bytes_r = extractBytes ( s2 , " IBM424 " , brLength ) ;
2009-03-31 15:39:00 +00:00
UCharsetDetector * csd = ucsdet_open ( & status ) ;
2013-09-17 06:57:53 +00:00
// Explicitly enable the IBM424 and IBM420 detectors on this detector instance.
ucsdet_setDetectableCharset ( csd , " IBM424_rtl " , TRUE , & status ) ;
ucsdet_setDetectableCharset ( csd , " IBM424_ltr " , TRUE , & status ) ;
ucsdet_setDetectableCharset ( csd , " IBM420_rtl " , TRUE , & status ) ;
ucsdet_setDetectableCharset ( csd , " IBM420_ltr " , TRUE , & status ) ;
2009-06-12 19:34:21 +00:00
// A failure from the open or any of the enable calls is reported here, but
// the function does not return; the detection calls below still run.
if ( U_FAILURE ( status ) ) {
errln ( " Error opening charset detector. - %s " , u_errorName ( status ) ) ;
}
2009-03-31 15:39:00 +00:00
const UCharsetMatch * match ;
const char * name ;
// First sample.
ucsdet_setText ( csd , bytes , bLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
2009-06-12 19:34:21 +00:00
errcheckln ( status , " Encoding detection failure for IBM424_rtl: got no matches. - %s " , u_errorName ( status ) ) ;
2009-04-14 20:12:54 +00:00
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " IBM424_rtl " ) ! = 0 ) {
2009-06-12 19:34:21 +00:00
errln ( " Encoding detection failure for IBM424_rtl: got %s " , name ) ;
2009-04-14 20:12:54 +00:00
}
// Second sample, using the same detector.
ucsdet_setText ( csd , bytes_r , brLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
2009-06-12 19:34:21 +00:00
errln ( " Encoding detection failure for IBM424_ltr: got no matches. " ) ;
2009-03-31 15:39:00 +00:00
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
2009-04-14 20:12:54 +00:00
if ( strcmp ( name , " IBM424_ltr " ) ! = 0 ) {
2009-06-12 19:34:21 +00:00
errln ( " Encoding detection failure for IBM424_ltr: got %s " , name ) ;
2009-03-31 15:39:00 +00:00
}
// Common exit: free both byte buffers and close the detector.
bail :
2018-01-03 04:45:29 +00:00
delete [ ] bytes ;
delete [ ] bytes_r ;
2009-03-31 15:39:00 +00:00
ucsdet_close ( csd ) ;
2015-02-27 02:24:05 +00:00
# endif
2009-03-31 15:39:00 +00:00
}
// Converts two sample strings of U+06xx code points to bytes with
// extractBytes() using the IBM420 codepage name, enables the four
// IBM424/IBM420 detectors on a fresh charset detector, and checks that the
// first sample is detected as IBM420_rtl and the second as IBM420_ltr.
void CharsetDetectionTest : : IBM420Test ( )
{
2015-02-27 02:24:05 +00:00
// The whole body is compiled out when UCONFIG_ONLY_HTML_CONVERSION is set.
# if !UCONFIG_ONLY_HTML_CONVERSION
2009-03-31 15:39:00 +00:00
UErrorCode status = U_ZERO_ERROR ;
// First sample, zero-terminated; expected below to be detected as IBM420_rtl.
static const UChar chars [ ] = {
0x0648 , 0x064F , 0x0636 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x0646 , 0x064F , 0x0641 , 0x0630 , 0x062A , 0x0020 , 0x0628 , 0x0631 , 0x0627 ,
0x0645 , 0x062C , 0x0020 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 , 0x0639 , 0x062F , 0x064A , 0x062F , 0x0629 , 0x0020 , 0x0641 ,
0x064A , 0x0020 , 0x0645 , 0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0627 , 0x0644 , 0x062A , 0x0623 , 0x0645 , 0x064A , 0x0646 , 0x0020 ,
0x0627 , 0x0644 , 0x0648 , 0x0637 , 0x0646 , 0x064A , 0x002C , 0x0020 , 0x0645 , 0x0639 , 0x0020 , 0x0645 , 0x0644 , 0x0627 , 0x0626 , 0x0645 ,
0x062A , 0x0647 , 0x0627 , 0x0020 , 0x062F , 0x0627 , 0x0626 , 0x0645 , 0x0627 , 0x064B , 0x0020 , 0x0644 , 0x0644 , 0x0627 , 0x062D , 0x062A ,
0x064A , 0x0627 , 0x062C , 0x0627 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062A , 0x063A , 0x064A , 0x0631 , 0x0629 , 0x0020 , 0x0644 ,
0x0644 , 0x0645 , 0x062C , 0x062A , 0x0645 , 0x0639 , 0x0020 , 0x0648 , 0x0644 , 0x0644 , 0x062F , 0x0648 , 0x0644 , 0x0629 , 0x002E , 0x0020 ,
0x062A , 0x0648 , 0x0633 , 0x0639 , 0x062A , 0x0020 , 0x0648 , 0x062A , 0x0637 , 0x0648 , 0x0631 , 0x062A , 0x0020 , 0x0627 , 0x0644 , 0x0645 ,
0x0624 , 0x0633 , 0x0633 , 0x0629 , 0x0020 , 0x0628 , 0x0647 , 0x062F , 0x0641 , 0x0020 , 0x0636 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0634 ,
0x0628 , 0x0643 , 0x0629 , 0x0020 , 0x0623 , 0x0645 , 0x0627 , 0x0646 , 0x0020 , 0x0644 , 0x0633 , 0x0643 , 0x0627 , 0x0646 , 0x0020 , 0x062F ,
0x0648 , 0x0644 , 0x0629 , 0x0020 , 0x0627 , 0x0633 , 0x0631 , 0x0627 , 0x0626 , 0x064A , 0x0644 , 0x0020 , 0x0628 , 0x0648 , 0x062C , 0x0647 ,
0x0020 , 0x0627 , 0x0644 , 0x0645 , 0x062E , 0x0627 , 0x0637 , 0x0631 , 0x0020 , 0x0627 , 0x0644 , 0x0627 , 0x0642 , 0x062A , 0x0635 , 0x0627 ,
0x062F , 0x064A , 0x0629 , 0x0020 , 0x0648 , 0x0627 , 0x0644 , 0x0627 , 0x062C , 0x062A , 0x0645 , 0x0627 , 0x0639 , 0x064A , 0x0629 , 0x002E ,
0x0000
} ;
2009-04-14 20:12:54 +00:00
// Second sample, zero-terminated; expected below to be detected as IBM420_ltr.
// NOTE(review): appears to be the first sample in reverse order - confirm.
static const UChar chars_reverse [ ] = {
0x002E , 0x0629 , 0x064A , 0x0639 , 0x0627 , 0x0645 , 0x062A , 0x062C , 0x0627 , 0x0644 , 0x0627 , 0x0648 , 0x0020 , 0x0629 , 0x064A , 0x062F ,
0x0627 , 0x0635 , 0x062A , 0x0642 , 0x0627 , 0x0644 , 0x0627 , 0x0020 , 0x0631 , 0x0637 , 0x0627 , 0x062E , 0x0645 , 0x0644 , 0x0627 , 0x0020 ,
0x0647 , 0x062C , 0x0648 , 0x0628 , 0x0020 , 0x0644 , 0x064A , 0x0626 , 0x0627 , 0x0631 , 0x0633 , 0x0627 , 0x0020 , 0x0629 , 0x0644 , 0x0648 ,
0x062F , 0x0020 , 0x0646 , 0x0627 , 0x0643 , 0x0633 , 0x0644 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0623 , 0x0020 , 0x0629 , 0x0643 , 0x0628 ,
0x0634 , 0x0020 , 0x0646 , 0x0627 , 0x0645 , 0x0636 , 0x0020 , 0x0641 , 0x062F , 0x0647 , 0x0628 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 ,
0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0631 , 0x0648 , 0x0637 , 0x062A , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0633 , 0x0648 , 0x062A ,
0x0020 , 0x002E , 0x0629 , 0x0644 , 0x0648 , 0x062F , 0x0644 , 0x0644 , 0x0648 , 0x0020 , 0x0639 , 0x0645 , 0x062A , 0x062C , 0x0645 , 0x0644 ,
0x0644 , 0x0020 , 0x0629 , 0x0631 , 0x064A , 0x063A , 0x062A , 0x0645 , 0x0644 , 0x0627 , 0x0020 , 0x062A , 0x0627 , 0x062C , 0x0627 , 0x064A ,
0x062A , 0x062D , 0x0627 , 0x0644 , 0x0644 , 0x0020 , 0x064B , 0x0627 , 0x0645 , 0x0626 , 0x0627 , 0x062F , 0x0020 , 0x0627 , 0x0647 , 0x062A ,
0x0645 , 0x0626 , 0x0627 , 0x0644 , 0x0645 , 0x0020 , 0x0639 , 0x0645 , 0x0020 , 0x002C , 0x064A , 0x0646 , 0x0637 , 0x0648 , 0x0644 , 0x0627 ,
0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0644 , 0x0627 , 0x0020 , 0x0629 , 0x0633 , 0x0633 , 0x0624 , 0x0645 , 0x0020 , 0x064A ,
0x0641 , 0x0020 , 0x0629 , 0x062F , 0x064A , 0x062F , 0x0639 , 0x0020 , 0x0646 , 0x064A , 0x0645 , 0x0623 , 0x062A , 0x0020 , 0x062C , 0x0645 ,
0x0627 , 0x0631 , 0x0628 , 0x0020 , 0x062A , 0x0630 , 0x0641 , 0x064F , 0x0646 , 0x0648 , 0x0020 , 0x062A , 0x0639 , 0x0636 , 0x064F , 0x0648 ,
0x0000 ,
} ;
int32_t bLength = 0 , brLength = 0 ;
UnicodeString s1 ( chars ) ;
UnicodeString s2 ( chars_reverse ) ;
// Both buffers come from extractBytes() and are released with delete[] at `bail`.
char * bytes = extractBytes ( s1 , " IBM420 " , bLength ) ;
char * bytes_r = extractBytes ( s2 , " IBM420 " , brLength ) ;
2009-03-31 15:39:00 +00:00
UCharsetDetector * csd = ucsdet_open ( & status ) ;
2009-06-12 19:34:21 +00:00
// A failure from the open call is reported here, but the function does not
// return; the calls below still run.
if ( U_FAILURE ( status ) ) {
errln ( " Error opening charset detector. - %s " , u_errorName ( status ) ) ;
}
2013-09-17 06:57:53 +00:00
// Explicitly enable the IBM424 and IBM420 detectors on this detector instance.
ucsdet_setDetectableCharset ( csd , " IBM424_rtl " , TRUE , & status ) ;
ucsdet_setDetectableCharset ( csd , " IBM424_ltr " , TRUE , & status ) ;
ucsdet_setDetectableCharset ( csd , " IBM420_rtl " , TRUE , & status ) ;
ucsdet_setDetectableCharset ( csd , " IBM420_ltr " , TRUE , & status ) ;
2009-03-31 15:39:00 +00:00
const UCharsetMatch * match ;
const char * name ;
// First sample.
ucsdet_setText ( csd , bytes , bLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
2006-02-06 18:03:11 +00:00
2009-03-31 15:39:00 +00:00
if ( match = = NULL ) {
2009-06-12 19:34:21 +00:00
errcheckln ( status , " Encoding detection failure for IBM420_rtl: got no matches. - %s " , u_errorName ( status ) ) ;
2009-04-14 20:12:54 +00:00
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
if ( strcmp ( name , " IBM420_rtl " ) ! = 0 ) {
errln ( " Encoding detection failure for IBM420_rtl: got %s \n " , name ) ;
}
// Second sample, using the same detector.
ucsdet_setText ( csd , bytes_r , brLength , & status ) ;
match = ucsdet_detect ( csd , & status ) ;
if ( match = = NULL ) {
errln ( " Encoding detection failure for IBM420_ltr: got no matches. \n " ) ;
2009-03-31 15:39:00 +00:00
goto bail ;
}
name = ucsdet_getName ( match , & status ) ;
2009-04-14 20:12:54 +00:00
if ( strcmp ( name , " IBM420_ltr " ) ! = 0 ) {
errln ( " Encoding detection failure for IBM420_ltr: got %s \n " , name ) ;
2009-03-31 15:39:00 +00:00
}
// Common exit: free both byte buffers and close the detector.
bail :
2018-01-03 04:45:29 +00:00
delete [ ] bytes ;
delete [ ] bytes_r ;
2009-03-31 15:39:00 +00:00
ucsdet_close ( csd ) ;
2015-02-27 02:24:05 +00:00
# endif
2009-03-31 15:39:00 +00:00
}
2009-04-24 22:24:27 +00:00
void CharsetDetectionTest : : Ticket6394Test ( ) {
# if !UCONFIG_NO_CONVERSION
const char charText [ ] = " Here is some random English text that should be detected as ISO-8859-1. "
" Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
" encodings more than once. The hop through UnicodeString is for platforms "
" where this char * string is be EBCDIC and needs conversion to Latin1. " ;
char latin1Text [ sizeof ( charText ) ] ;
UnicodeString ( charText ) . extract ( 0 , sizeof ( charText ) - 2 , latin1Text , sizeof ( latin1Text ) , " ISO-8859-1 " ) ;
UErrorCode status = U_ZERO_ERROR ;
UCharsetDetector * csd = ucsdet_open ( & status ) ;
ucsdet_setText ( csd , latin1Text , - 1 , & status ) ;
if ( U_FAILURE ( status ) ) {
errln ( " Fail at file %s, line %d. status = %s " , __FILE__ , __LINE__ , u_errorName ( status ) ) ;
return ;
}
int32_t matchCount = 0 ;
const UCharsetMatch * * matches = ucsdet_detectAll ( csd , & matchCount , & status ) ;
if ( U_FAILURE ( status ) ) {
errln ( " Fail at file %s, line %d. status = %s " , __FILE__ , __LINE__ , u_errorName ( status ) ) ;
return ;
}
UnicodeSet setOfCharsetNames ; // UnicodSets can hold strings.
int32_t i ;
for ( i = 0 ; i < matchCount ; i + + ) {
UnicodeString charSetName ( ucsdet_getName ( matches [ i ] , & status ) ) ;
if ( U_FAILURE ( status ) ) {
errln ( " Fail at file %s, line %d. status = %s; i=%d " , __FILE__ , __LINE__ , u_errorName ( status ) , i ) ;
status = U_ZERO_ERROR ;
}
if ( setOfCharsetNames . contains ( charSetName ) ) {
errln ( " Fail at file %s, line %d " , __FILE__ , __LINE__ ) ;
errln ( UnicodeString ( " Duplicate charset name = " ) + charSetName ) ;
}
setOfCharsetNames . add ( charSetName ) ;
}
ucsdet_close ( csd ) ;
# endif
}
2012-05-26 00:19:47 +00:00
// Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
// similar Windows and non-Windows SBCS encodings. State was kept in the shared
// Charset Recognizer objects, and could be overwritten.
void CharsetDetectionTest : : Ticket6954Test ( ) {
2017-03-27 16:05:50 +00:00
# if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
2012-05-26 00:19:47 +00:00
UErrorCode status = U_ZERO_ERROR ;
UnicodeString sISO = " This is a small sample of some English text. Just enough to be sure that it detects correctly. " ;
UnicodeString ssWindows ( " This is another small sample of some English text. Just enough to be sure that it detects correctly. "
" It also includes some \\ u201CC1 \\ u201D bytes. " , - 1 , US_INV ) ;
UnicodeString sWindows = ssWindows . unescape ( ) ;
int32_t lISO = 0 , lWindows = 0 ;
2018-01-03 04:45:29 +00:00
std : : unique_ptr < char [ ] > bISO ( extractBytes ( sISO , " ISO-8859-1 " , lISO ) ) ;
std : : unique_ptr < char [ ] > bWindows ( extractBytes ( sWindows , " windows-1252 " , lWindows ) ) ;
2012-05-26 00:19:47 +00:00
// First do a plain vanilla detect of 1252 text
2018-01-03 04:45:29 +00:00
LocalUCharsetDetectorPointer csd1 ( ucsdet_open ( & status ) ) ;
ucsdet_setText ( csd1 . getAlias ( ) , bWindows . get ( ) , lWindows , & status ) ;
const UCharsetMatch * match1 = ucsdet_detect ( csd1 . getAlias ( ) , & status ) ;
2012-05-26 00:19:47 +00:00
const char * name1 = ucsdet_getName ( match1 , & status ) ;
TEST_ASSERT_SUCCESS ( status ) ;
TEST_ASSERT ( strcmp ( name1 , " windows-1252 " ) = = 0 ) ;
// Next, using a completely separate detector, detect some 8859-1 text
2018-01-03 04:45:29 +00:00
LocalUCharsetDetectorPointer csd2 ( ucsdet_open ( & status ) ) ;
ucsdet_setText ( csd2 . getAlias ( ) , bISO . get ( ) , lISO , & status ) ;
const UCharsetMatch * match2 = ucsdet_detect ( csd2 . getAlias ( ) , & status ) ;
2012-05-26 00:19:47 +00:00
const char * name2 = ucsdet_getName ( match2 , & status ) ;
TEST_ASSERT_SUCCESS ( status ) ;
TEST_ASSERT ( strcmp ( name2 , " ISO-8859-1 " ) = = 0 ) ;
// Recheck the 1252 results from the first detector, which should not have been
// altered by the use of a different detector.
name1 = ucsdet_getName ( match1 , & status ) ;
TEST_ASSERT_SUCCESS ( status ) ;
2012-06-01 20:40:48 +00:00
TEST_ASSERT ( strcmp ( name1 , " windows-1252 " ) = = 0 ) ;
2012-05-26 00:19:47 +00:00
# endif
}