2003-02-28 21:37:55 +00:00
/*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*
2013-03-21 01:42:01 +00:00
* Copyright ( C ) 2003 - 2013 , International Business Machines
2003-02-28 21:37:55 +00:00
* Corporation and others . All Rights Reserved .
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2003-05-06 01:37:52 +00:00
* file name : testidn . cpp
2003-02-28 21:37:55 +00:00
* encoding : US - ASCII
* tab size : 8 ( not used )
* indentation : 4
*
* created on : 2003 - 02 - 06
* created by : Ram Viswanadha
*
* This test reads the RFC 3491 Nameprep source data file ( rfc3491.txt ) ,
* parses it , and checks every entry against the compiled StringPrep
* profile data loaded through usprep_openByType ( ) .
*/
# include "unicode/utypes.h"
2003-05-06 01:37:52 +00:00
2003-05-29 01:15:29 +00:00
# if !UCONFIG_NO_IDNA && !UCONFIG_NO_TRANSLITERATION
2003-05-06 01:37:52 +00:00
2003-09-02 23:26:01 +00:00
# define USPREP_TYPE_NAMES_ARRAY
2003-02-28 21:37:55 +00:00
# include "unicode/uchar.h"
# include "unicode/putil.h"
# include "cmemory.h"
# include "cstring.h"
# include "unicode/udata.h"
2011-07-27 05:53:56 +00:00
# include "unicode/utf16.h"
2003-02-28 21:37:55 +00:00
# include "unewdata.h"
# include "uoptions.h"
# include "uparse.h"
# include "utrie.h"
# include "umutex.h"
# include "sprpimpl.h"
# include "testidna.h"
2003-07-24 23:23:19 +00:00
# include "punyref.h"
2006-06-26 16:47:37 +00:00
# include <stdlib.h>
2003-02-28 21:37:55 +00:00
UBool beVerbose = FALSE , haveCopyright = TRUE ;
/* prototypes --------------------------------------------------------------- */
static void
2003-07-24 23:23:19 +00:00
parseMappings ( const char * filename , UBool reportError , TestIDNA & test , UErrorCode * pErrorCode ) ;
2003-02-28 21:37:55 +00:00
static void
compareMapping ( uint32_t codepoint , uint32_t * mapping , int32_t mapLength ,
2003-07-24 23:23:19 +00:00
UStringPrepType option ) ;
2003-02-28 21:37:55 +00:00
static void
2003-07-24 23:23:19 +00:00
compareFlagsForRange ( uint32_t start , uint32_t end , UStringPrepType option ) ;
2003-02-28 21:37:55 +00:00
static void
testAllCodepoints ( TestIDNA & test ) ;
/* Test object that the comparison helpers report failures to; set only while testData() runs. */
static TestIDNA* pTestIDNA = NULL;

/*
 * Data files checked by testData(). The name is appended verbatim to the
 * data directory path, so it must not carry any leading or trailing blanks.
 */
static const char* fileNames[] = {
    "rfc3491.txt"
};

/* Views into the StringPrep profile opened by testData(). */
static const UTrie* idnTrie = NULL;
static const int32_t* indexes = NULL;
static const uint16_t* mappingData = NULL;
2003-02-28 21:37:55 +00:00
/* -------------------------------------------------------------------------- */
/* file definitions */
# define DATA_TYPE "icu"
2009-02-02 16:16:07 +00:00
# define SPREP_DIR "sprep"
2003-02-28 21:37:55 +00:00
/**
 * Opens the RFC 3491 Nameprep profile, parses the matching source data file
 * and compares every entry of the file with the data stored in the profile.
 *
 * The file path is built as <data dir>/<SPREP_DIR>/<fileNames[0]>; when the
 * data directory contains no file separator at all it is prefixed with "./".
 *
 * @param test  Test object that receives all error and log output.
 * @return The final UErrorCode value (U_ZERO_ERROR on success), returned as int.
 */
extern int
testData(TestIDNA& test) {
    UErrorCode errorCode = U_ZERO_ERROR;
    LocalUStringPrepProfilePointer profile(usprep_openByType(USPREP_RFC3491_NAMEPREP, &errorCode));
    if (U_FAILURE(errorCode)) {
        test.errcheckln(errorCode, " Failed to load IDNA data file. " + UnicodeString(u_errorName(errorCode)));
        return errorCode;
    }

    //TODO get the srcDir dynamically
    const char* srcDir = IntlTest::pathToDataDirectory();
    const char* dataFile = fileNames[0];

    /*
     * Worst case: "./" + srcDir + separator + SPREP_DIR + separator + file name + NUL.
     * Sizing the buffer from the pieces keeps it valid even for an empty srcDir.
     */
    size_t capacity = 2 + uprv_strlen(srcDir) + 1 +
                      uprv_strlen(SPREP_DIR) + 1 +
                      uprv_strlen(dataFile) + 1;
    char* filename = (char*) malloc(capacity);
    if (filename == NULL) {
        test.errln("Could not allocate memory for the data file name\n");
        return U_MEMORY_ALLOCATION_ERROR;
    }

    //initialize
    idnTrie = &profile->sprepTrie;
    indexes = profile->indexes;
    mappingData = profile->mappingData;
    pTestIDNA = &test;

    /* prepare the filename beginning with the source dir */
    if (uprv_strchr(srcDir, U_FILE_SEP_CHAR) == NULL) {
        filename[0] = 0x2E;
        filename[1] = U_FILE_SEP_CHAR;
        uprv_strcpy(filename + 2, srcDir);
    } else {
        uprv_strcpy(filename, srcDir);
    }

    /* make sure the directory part ends with exactly one separator */
    char* basename = filename + uprv_strlen(filename);
    if (basename > filename && *(basename - 1) != U_FILE_SEP_CHAR) {
        *basename++ = U_FILE_SEP_CHAR;
    }

    /* append the sprep sub-directory; uprv_strcpy re-terminates the string */
    uprv_strcpy(basename, SPREP_DIR);
    basename += uprv_strlen(SPREP_DIR);
    *basename++ = U_FILE_SEP_CHAR;

    /* append the data file name */
    uprv_strcpy(basename, dataFile);

    parseMappings(filename, TRUE, test, &errorCode);
    if (U_FAILURE(errorCode)) {
        test.errln(" Could not open file %s for reading \n ", filename);
    } else {
        testAllCodepoints(test);
    }

    /*
     * Common cleanup for success and failure: the profile is closed when
     * `profile` goes out of scope, so none of the globals may keep pointing
     * into it, and the path buffer must be released on every path.
     */
    pTestIDNA = NULL;
    idnTrie = NULL;
    indexes = NULL;
    mappingData = NULL;
    free(filename);
    return errorCode;
}
2003-03-20 01:11:51 +00:00
U_CDECL_BEGIN
2003-07-24 23:23:19 +00:00
2003-02-28 21:37:55 +00:00
static void U_CALLCONV
2003-12-02 18:53:27 +00:00
strprepProfileLineFn ( void * /*context*/ ,
2003-07-24 23:23:19 +00:00
char * fields [ ] [ 2 ] , int32_t fieldCount ,
2003-02-28 21:37:55 +00:00
UErrorCode * pErrorCode ) {
uint32_t mapping [ 40 ] ;
2003-07-24 23:23:19 +00:00
char * end , * map ;
2003-02-28 21:37:55 +00:00
uint32_t code ;
int32_t length ;
2003-07-24 23:23:19 +00:00
/*UBool* mapWithNorm = (UBool*) context;*/
const char * typeName ;
uint32_t rangeStart = 0 , rangeEnd = 0 ;
2009-02-02 16:16:07 +00:00
const char * s ;
s = u_skipWhitespace ( fields [ 0 ] [ 0 ] ) ;
if ( * s = = ' @ ' ) {
/* a special directive introduced in 4.2 */
return ;
}
2003-12-02 03:15:53 +00:00
if ( fieldCount ! = 3 ) {
* pErrorCode = U_INVALID_FORMAT_ERROR ;
return ;
}
2003-07-24 23:23:19 +00:00
typeName = fields [ 2 ] [ 0 ] ;
map = fields [ 1 ] [ 0 ] ;
if ( uprv_strstr ( typeName , usprepTypeNames [ USPREP_UNASSIGNED ] ) ! = NULL ) {
2003-02-28 21:37:55 +00:00
2009-02-02 16:16:07 +00:00
u_parseCodePointRange ( s , & rangeStart , & rangeEnd , pErrorCode ) ;
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
/* store the range */
compareFlagsForRange ( rangeStart , rangeEnd , USPREP_UNASSIGNED ) ;
} else if ( uprv_strstr ( typeName , usprepTypeNames [ USPREP_PROHIBITED ] ) ! = NULL ) {
2009-02-02 16:16:07 +00:00
u_parseCodePointRange ( s , & rangeStart , & rangeEnd , pErrorCode ) ;
2003-07-24 23:23:19 +00:00
/* store the range */
compareFlagsForRange ( rangeStart , rangeEnd , USPREP_PROHIBITED ) ;
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
} else if ( uprv_strstr ( typeName , usprepTypeNames [ USPREP_MAP ] ) ! = NULL ) {
/* get the character code, field 0 */
2009-02-02 16:16:07 +00:00
code = ( uint32_t ) uprv_strtoul ( s , & end , 16 ) ;
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
/* parse the mapping string */
length = u_parseCodePoints ( map , mapping , sizeof ( mapping ) / 4 , pErrorCode ) ;
/* store the mapping */
compareMapping ( code , mapping , length , USPREP_MAP ) ;
} else {
* pErrorCode = U_INVALID_FORMAT_ERROR ;
}
2003-02-28 21:37:55 +00:00
}
2003-07-24 23:23:19 +00:00
2003-03-20 01:11:51 +00:00
U_CDECL_END
2003-02-28 21:37:55 +00:00
static void
2003-07-24 23:23:19 +00:00
parseMappings ( const char * filename , UBool reportError , TestIDNA & test , UErrorCode * pErrorCode ) {
2003-02-28 21:37:55 +00:00
char * fields [ 3 ] [ 2 ] ;
if ( pErrorCode = = NULL | | U_FAILURE ( * pErrorCode ) ) {
return ;
}
2003-07-24 23:23:19 +00:00
u_parseDelimitedFile ( filename , ' ; ' , fields , 3 , strprepProfileLineFn , ( void * ) filename , pErrorCode ) ;
2003-02-28 21:37:55 +00:00
//fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);
if ( U_FAILURE ( * pErrorCode ) & & ( reportError | | * pErrorCode ! = U_FILE_ACCESS_ERROR ) ) {
2003-04-17 20:35:47 +00:00
test . errln ( " testidn error: u_parseDelimitedFile( \" %s \" ) failed - %s \n " , filename , u_errorName ( * pErrorCode ) ) ;
2003-02-28 21:37:55 +00:00
}
}
2003-07-24 23:23:19 +00:00
/**
 * Decodes a 16-bit trie result into a type plus mapping information.
 *
 * - result == 0: no value stored; returns USPREP_TYPE_LIMIT.
 * - result >= _SPREP_TYPE_THRESHOLD: the type is the distance above the threshold.
 * - (result >> 2) == _SPREP_MAX_INDEX_VALUE: returns USPREP_DELETE.
 * - otherwise USPREP_MAP; bit 0x02 selects whether the remaining bits are
 *   an index (unsigned) or a delta (signed 16-bit value shifted right by 2).
 *
 * @param result   Raw value read from the trie.
 * @param value    Receives the index or delta; 0 for every non-MAP result.
 * @param isIndex  Receives TRUE only when value is an index.
 * @return The decoded type.
 */
static inline UStringPrepType
getValues(uint32_t result, int32_t& value, UBool& isIndex) {
    /* every outcome except an indexed or delta mapping reports these */
    isIndex = FALSE;
    value = 0;

    if (result == 0) {
        /*
         * Initial value stored in the mapping table:
         * USPREP_TYPE_LIMIT tells the caller that the source
         * code point is copied to the destination unchanged.
         */
        return USPREP_TYPE_LIMIT;
    }
    if (result >= _SPREP_TYPE_THRESHOLD) {
        return (UStringPrepType)(result - _SPREP_TYPE_THRESHOLD);
    }
    if ((result >> 2) == _SPREP_MAX_INDEX_VALUE) {
        return USPREP_DELETE;
    }

    /* a mapping: decide between index and delta */
    if (result & 0x02) {
        isIndex = TRUE;
        value = result >> 2;  // drop the two flag bits
    } else {
        /* go through int16_t first so that a negative delta keeps its sign */
        value = ((int32_t)(int16_t) result) >> 2;
    }
    return USPREP_MAP;
}
2003-07-24 23:23:19 +00:00
2003-03-20 01:11:51 +00:00
2003-02-28 21:37:55 +00:00
static void
2003-07-24 23:23:19 +00:00
testAllCodepoints ( TestIDNA & test ) {
/*
{
UChar str [ 19 ] = {
0xC138 , 0xACC4 , 0xC758 , 0xBAA8 , 0xB4E0 , 0xC0AC , 0xB78C , 0xB4E4 , 0xC774 ,
0x070F , //prohibited
0xD55C , 0xAD6D , 0xC5B4 , 0xB97C , 0xC774 , 0xD574 , 0xD55C , 0xB2E4 , 0xBA74
} ;
uint32_t in [ 19 ] = { 0 } ;
UErrorCode status = U_ZERO_ERROR ;
int32_t inLength = 0 , outLength = 100 ;
char output [ 100 ] = { 0 } ;
punycode_status error ;
u_strToUTF32 ( ( UChar32 * ) in , 19 , & inLength , str , 19 , & status ) ;
error = punycode_encode ( inLength , in , NULL , ( uint32_t * ) & outLength , output ) ;
printf ( output ) ;
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
}
*/
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
uint32_t i = 0 ;
int32_t unassigned = 0 ;
int32_t prohibited = 0 ;
int32_t mappedWithNorm = 0 ;
int32_t mapped = 0 ;
int32_t noValueInTrie = 0 ;
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
UStringPrepType type ;
int32_t value ;
UBool isIndex = FALSE ;
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
for ( i = 0 ; i < = 0x10FFFF ; i + + ) {
uint32_t result = 0 ;
UTRIE_GET16 ( idnTrie , i , result ) ;
type = getValues ( result , value , isIndex ) ;
if ( type ! = USPREP_TYPE_LIMIT ) {
if ( type = = USPREP_UNASSIGNED ) {
unassigned + + ;
}
if ( type = = USPREP_PROHIBITED ) {
prohibited + + ;
}
if ( type = = USPREP_MAP ) {
mapped + + ;
}
} else {
noValueInTrie + + ;
if ( result > 0 ) {
test . errln ( " The return value for 0x%06X is wrong. %i \n " , i , result ) ;
2003-02-28 21:37:55 +00:00
}
}
}
2003-07-24 23:23:19 +00:00
test . logln ( " Number of Unassinged code points : %i \n " , unassigned ) ;
test . logln ( " Number of Prohibited code points : %i \n " , prohibited ) ;
test . logln ( " Number of Mapped code points : %i \n " , mapped ) ;
test . logln ( " Number of Mapped with NFKC code points : %i \n " , mappedWithNorm ) ;
test . logln ( " Number of code points that have no value in Trie: %i \n " , noValueInTrie ) ;
2003-02-28 21:37:55 +00:00
}
static void
compareMapping ( uint32_t codepoint , uint32_t * mapping , int32_t mapLength ,
2003-07-24 23:23:19 +00:00
UStringPrepType type ) {
uint32_t result = 0 ;
UTRIE_GET16 ( idnTrie , codepoint , result ) ;
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
int32_t length = 0 ;
UBool isIndex ;
UStringPrepType retType ;
int32_t value , index = 0 , delta = 0 ;
retType = getValues ( result , value , isIndex ) ;
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
if ( type ! = retType & & retType ! = USPREP_DELETE ) {
pTestIDNA - > errln ( " Did not get the assigned type for codepoint 0x%08X. Expected: %i Got: %i \n " , codepoint , USPREP_MAP , type ) ;
}
if ( isIndex ) {
index = value ;
if ( index > = indexes [ _SPREP_ONE_UCHAR_MAPPING_INDEX_START ] & &
index < indexes [ _SPREP_TWO_UCHARS_MAPPING_INDEX_START ] ) {
length = 1 ;
} else if ( index > = indexes [ _SPREP_TWO_UCHARS_MAPPING_INDEX_START ] & &
index < indexes [ _SPREP_THREE_UCHARS_MAPPING_INDEX_START ] ) {
length = 2 ;
} else if ( index > = indexes [ _SPREP_THREE_UCHARS_MAPPING_INDEX_START ] & &
index < indexes [ _SPREP_FOUR_UCHARS_MAPPING_INDEX_START ] ) {
length = 3 ;
2003-02-28 21:37:55 +00:00
} else {
2003-07-24 23:23:19 +00:00
length = mappingData [ index + + ] ;
2003-05-19 21:07:32 +00:00
}
2003-07-24 23:23:19 +00:00
} else {
delta = value ;
length = ( retType = = USPREP_DELETE ) ? 0 : 1 ;
}
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
int32_t realLength = 0 ;
/* figure out the real length */
for ( int32_t j = 0 ; j < mapLength ; j + + ) {
if ( mapping [ j ] > 0xFFFF ) {
realLength + = 2 ;
} else {
realLength + + ;
}
}
2003-02-28 21:37:55 +00:00
2003-07-24 23:23:19 +00:00
if ( realLength ! = length ) {
pTestIDNA - > errln ( " Did not get the expected length. Expected: %i Got: %i \n " , mapLength , length ) ;
}
if ( isIndex ) {
2003-02-28 21:37:55 +00:00
for ( int8_t i = 0 ; i < mapLength ; i + + ) {
if ( mapping [ i ] < = 0xFFFF ) {
if ( mappingData [ index + i ] ! = ( uint16_t ) mapping [ i ] ) {
pTestIDNA - > errln ( " Did not get the expected result. Expected: 0x%04X Got: 0x%04X \n " , mapping [ i ] , mappingData [ index + i ] ) ;
}
} else {
2011-07-27 05:53:56 +00:00
UChar lead = U16_LEAD ( mapping [ i ] ) ;
UChar trail = U16_TRAIL ( mapping [ i ] ) ;
2003-02-28 21:37:55 +00:00
if ( mappingData [ index + i ] ! = lead | |
mappingData [ index + i + 1 ] ! = trail ) {
pTestIDNA - > errln ( " Did not get the expected result. Expected: 0x%04X 0x%04X Got: 0x%04X 0x%04X " , lead , trail , mappingData [ index + i ] , mappingData [ index + i + 1 ] ) ;
}
}
}
2003-07-24 23:23:19 +00:00
} else {
if ( retType ! = USPREP_DELETE & & ( codepoint - delta ) ! = ( uint16_t ) mapping [ 0 ] ) {
pTestIDNA - > errln ( " Did not get the expected result. Expected: 0x%04X Got: 0x%04X \n " , mapping [ 0 ] , ( codepoint - delta ) ) ;
}
2003-02-28 21:37:55 +00:00
}
}
static void
compareFlagsForRange ( uint32_t start , uint32_t end ,
2003-07-24 23:23:19 +00:00
UStringPrepType type ) {
uint32_t result = 0 ;
UStringPrepType retType ;
UBool isIndex = FALSE ;
int32_t value = 0 ;
2011-07-27 05:53:56 +00:00
/*
2003-07-24 23:23:19 +00:00
// supplementary code point
2011-07-27 05:53:56 +00:00
UChar __lead16 = U16_LEAD ( 0x2323E ) ;
2003-07-24 23:23:19 +00:00
int32_t __offset ;
// get data for lead surrogate
( result ) = _UTRIE_GET_RAW ( ( & idnTrie ) , index , 0 , ( __lead16 ) ) ;
__offset = ( & idnTrie ) - > getFoldingOffset ( result ) ;
// get the real data from the folded lead/trail units
if ( __offset > 0 ) {
( result ) = _UTRIE_GET_RAW ( ( & idnTrie ) , index , __offset , ( 0x2323E ) & 0x3ff ) ;
2003-02-28 21:37:55 +00:00
} else {
2003-07-24 23:23:19 +00:00
( result ) = ( uint32_t ) ( ( & idnTrie ) - > initialValue ) ;
2003-02-28 21:37:55 +00:00
}
2003-07-24 23:23:19 +00:00
UTRIE_GET16 ( & idnTrie , 0x2323E , result ) ;
*/
while ( start < end + 1 ) {
UTRIE_GET16 ( idnTrie , start , result ) ;
retType = getValues ( result , value , isIndex ) ;
if ( result > _SPREP_TYPE_THRESHOLD ) {
if ( retType ! = type ) {
pTestIDNA - > errln ( " FAIL: Did not get the expected type for 0x%06X. Expected: %s Got: %s \n " , start , usprepTypeNames [ type ] , usprepTypeNames [ retType ] ) ;
}
} else {
if ( type = = USPREP_PROHIBITED & & ( ( result & 0x01 ) ! = 0x01 ) ) {
pTestIDNA - > errln ( " FAIL: Did not get the expected type for 0x%06X. Expected: %s Got: %s \n " , start , usprepTypeNames [ type ] , usprepTypeNames [ retType ] ) ;
}
2003-02-28 21:37:55 +00:00
}
2003-07-24 23:23:19 +00:00
start + + ;
2003-02-28 21:37:55 +00:00
}
2003-07-24 23:23:19 +00:00
2003-02-28 21:37:55 +00:00
}
2003-07-24 23:23:19 +00:00
2003-05-06 01:37:52 +00:00
# endif /* #if !UCONFIG_NO_IDNA && !UCONFIG_NO_TRANSLITERATION */
2003-02-28 21:37:55 +00:00
/*
* Hey , Emacs , please set the following :
*
* Local Variables :
* indent - tabs - mode : nil
* End :
*
*/