2003-02-28 21:37:55 +00:00
/*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*
2003-05-06 01:37:52 +00:00
* Copyright ( C ) 2003 , International Business Machines
2003-02-28 21:37:55 +00:00
* Corporation and others . All Rights Reserved .
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2003-05-06 01:37:52 +00:00
* file name : testidn . cpp
2003-02-28 21:37:55 +00:00
* encoding : US - ASCII
* tab size : 8 ( not used )
* indentation : 4
*
* created on : 2003 - 02 - 06
* created by : Ram Viswanadha
*
* This test reads the rfc3454_ * . txt files , parses them ,
* and compares every entry with the IDNA data ( trie and mapping table )
* that is loaded through udata , to verify that the binary data
* agrees with the Nameprep tables .
*/
# include <stdio.h>
# include <stdlib.h>
# include "unicode/utypes.h"
2003-05-06 01:37:52 +00:00
2003-05-29 01:15:29 +00:00
# if !UCONFIG_NO_IDNA && !UCONFIG_NO_TRANSLITERATION
2003-05-06 01:37:52 +00:00
2003-02-28 21:37:55 +00:00
# include "unicode/uchar.h"
# include "unicode/putil.h"
# include "cmemory.h"
# include "cstring.h"
# include "unicode/udata.h"
# include "unewdata.h"
# include "uoptions.h"
# include "uparse.h"
# include "utrie.h"
# include "umutex.h"
# include "sprpimpl.h"
# include "testidna.h"
# ifdef WIN32
# pragma warning(disable: 4100)
# endif
/* Tool-style flags; not read by any code visible in this file. */
UBool beVerbose = FALSE , haveCopyright = TRUE ;
/* state of the loaded IDNA data -------------------------------------------- */
/* TRUE after loadIDNData() has set up idnTrie and mappingData; reset by cleanup(). */
static UBool isDataLoaded = FALSE ;
/* Trie unserialized from the data file; queried with UTRIE_GET16. */
static UTrie idnTrie = { 0 , 0 , 0 , 0 , 0 , 0 , 0 } ;
/* Handle returned by udata_openChoice(); closed in cleanup(). */
static UDataMemory * idnData = NULL ;
/* Error code recorded by the most recent loadIDNData() attempt. */
static UErrorCode dataErrorCode = U_ZERO_ERROR ;
/* 16-bit mapping table; located directly after the serialized trie in the data. */
static const uint16_t * mappingData = NULL ;
/* Copy of the first _IDNA_INDEX_TOP int32_t values of the data file. */
static int32_t indexes [ _IDNA_INDEX_TOP ] = { 0 } ;
/* prototypes --------------------------------------------------------------- */
static void
parseMappings ( const char * filename , UBool withNorm , UBool reportError , TestIDNA & test , UErrorCode * pErrorCode ) ;
static void
parseTable ( const char * filename , UBool isUnassigned , TestIDNA & test , UErrorCode * pErrorCode ) ;
static UBool loadIDNData ( UErrorCode & errorCode ) ;
static UBool cleanup ( ) ;
static void
compareMapping ( uint32_t codepoint , uint32_t * mapping , int32_t mapLength ,
UBool withNorm , UErrorCode * status ) ;
static void
compareFlagsForRange ( uint32_t start , uint32_t end ,
UBool isUnassigned , UErrorCode * status ) ;
static void
testAllCodepoints ( TestIDNA & test ) ;
/* Test object the compare functions report errors through; set by testData(). */
static TestIDNA * pTestIDNA = NULL ;
/*
 * Input tables, looked up in the MISC_DIR sub-directory of the data directory.
 * testData() addresses the entries by position (0..3), so the order matters.
 */
static const char * fileNames [ ] = {
" rfc3454_A_1.txt " , /* contains unassigned code points */
" rfc3454_C_X.txt " , /* contains code points that are prohibited */
" rfc3454_B_1.txt " , /* contains case mappings when normalization is turned off */
" rfc3454_B_2.txt " , /* contains case mappings when normalization is turned on */
/* "NormalizationCorrections.txt",contains NFKC case mappings which are not included in UTR 21 */
} ;
/* -------------------------------------------------------------------------- */
/*
 * Command-line option table in the uoptions.h format.
 * NOTE(review): nothing in the visible part of this file reads this table;
 * it looks like a leftover from a command-line tool - confirm before removing.
 */
static UOption options [ ] = {
UOPTION_HELP_H ,
UOPTION_HELP_QUESTION_MARK ,
UOPTION_VERBOSE ,
UOPTION_COPYRIGHT ,
UOPTION_DESTDIR ,
UOPTION_SOURCEDIR ,
{ " unicode " , NULL , NULL , NULL , ' u ' , UOPT_REQUIRES_ARG , 0 }
} ;
/* file definitions */
/* DATA_NAME and DATA_TYPE are passed to udata_openChoice() in loadIDNData(). */
# define DATA_NAME "uidna"
# define DATA_TYPE "icu"
/* Sub-directory of the data directory in which testData() looks for the rfc3454 tables. */
# define MISC_DIR "misc"
/**
 * Checks the loaded IDNA data against the RFC 3454 tables.
 *
 * Loads the data with loadIDNData(), builds "<data dir>/<MISC_DIR>/<file>"
 * for each entry of fileNames, runs parseTable() on the unassigned and the
 * prohibited table and parseMappings() on the two mapping tables, and finally
 * walks all code points with testAllCodepoints().
 *
 * The path buffer is sized from the actual path components, and the buffer,
 * the loaded data and pTestIDNA are released on every exit path once they
 * have been set up.
 *
 * @param test  Test object used for error and log output. It is also
 *              published through pTestIDNA for the compare functions while
 *              this function runs.
 * @return U_ZERO_ERROR on success, otherwise the UErrorCode of the first
 *         step that failed.
 */
extern int
testData(TestIDNA& test) {
    //TODO get the srcDir dynamically
    const char *srcDir = IntlTest::pathToDataDirectory();
    UErrorCode errorCode = U_ZERO_ERROR;

    loadIDNData(errorCode);
    if (U_FAILURE(dataErrorCode)) {
        test.errln(" Could not load data. Error: %s \n ", u_errorName(dataErrorCode));
        return dataErrorCode;
    }

    /* Size the buffer for the longest path built below:
       "./" + srcDir + separator + MISC_DIR + separator + file name + NUL. */
    size_t longestName = 0;
    for (size_t i = 0; i < sizeof(fileNames) / sizeof(fileNames[0]); ++i) {
        size_t nameLength = uprv_strlen(fileNames[i]);
        if (nameLength > longestName) {
            longestName = nameLength;
        }
    }
    size_t capacity = 2 + uprv_strlen(srcDir) + 1 + uprv_strlen(MISC_DIR) + 1 + longestName + 1;
    char *filename = (char *)malloc(capacity);
    if (filename == NULL) {
        test.errln("Could not allocate %d bytes for the file name buffer\n", (int)capacity);
        cleanup();
        return U_MEMORY_ALLOCATION_ERROR;
    }

    //initialize
    pTestIDNA = &test;

    /* prepare the filename beginning with the source dir */
    if (uprv_strchr(srcDir, U_FILE_SEP_CHAR) == NULL) {
        filename[0] = 0x2E;
        filename[1] = U_FILE_SEP_CHAR;
        uprv_strcpy(filename + 2, srcDir);
    } else {
        uprv_strcpy(filename, srcDir);
    }

    /* make sure the directory part ends with exactly one separator */
    char *basename = filename + uprv_strlen(filename);
    if (basename > filename && *(basename - 1) != U_FILE_SEP_CHAR) {
        *basename++ = U_FILE_SEP_CHAR;
    }

    /* append the misc directory; basename then points at the file name slot */
    uprv_strcpy(basename, MISC_DIR);
    basename += uprv_strlen(MISC_DIR);
    *basename++ = U_FILE_SEP_CHAR;

    /* process unassigned */
    uprv_strcpy(basename, fileNames[0]);
    parseTable(filename, TRUE, test, &errorCode);

    /* process prohibited */
    if (U_SUCCESS(errorCode)) {
        uprv_strcpy(basename, fileNames[1]);
        parseTable(filename, FALSE, test, &errorCode);
    }

    /* process mappings, first without and then with normalization */
    if (U_SUCCESS(errorCode)) {
        uprv_strcpy(basename, fileNames[2]);
        parseMappings(filename, FALSE, FALSE, test, &errorCode);
    }
    if (U_SUCCESS(errorCode)) {
        uprv_strcpy(basename, fileNames[3]);
        parseMappings(filename, TRUE, FALSE, test, &errorCode);
    }

    if (U_FAILURE(errorCode)) {
        /* filename still holds the path of the step that failed,
           because every later step was skipped */
        test.errln(" Could not open file %s for reading \n ", filename);
    } else {
        testAllCodepoints(test);
    }

    /* common exit: release everything that was set up above */
    cleanup();
    pTestIDNA = NULL;
    free(filename);
    return errorCode;
}
2003-03-20 01:11:51 +00:00
U_CDECL_BEGIN
2003-02-28 21:37:55 +00:00
/**
 * Line callback for u_parseDelimitedFile() used by parseMappings().
 *
 * Field 0 holds the source code point in hex, field 1 the list of code points
 * it maps to. The parsed mapping is handed to compareMapping().
 *
 * @param context     Points to a UBool: TRUE if the file being parsed holds
 *                    the mappings used with normalization.
 * @param fields      Start/limit pointers of the fields of the current line.
 * @param fieldCount  Number of fields (unused).
 * @param pErrorCode  Set to U_PARSE_ERROR if field 0 is not a single hex
 *                    number; also receives errors from u_parseCodePoints().
 */
static void U_CALLCONV
caseMapLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    char *end = NULL;
    const UBool *mapWithNorm = (const UBool *)context;

    /* get the character code, field 0 */
    uint32_t code = (uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    if (end <= fields[0][0] || end != fields[0][1]) {
        *pErrorCode = U_PARSE_ERROR;
        /* do not compare a mapping for a code point that failed to parse */
        return;
    }

    /* parse the mapping string, field 1 */
    int32_t length = u_parseCodePoints(fields[1][0], mapping,
                                       (int32_t)(sizeof(mapping) / sizeof(mapping[0])),
                                       pErrorCode);
    if (U_FAILURE(*pErrorCode)) {
        /* mapping[] is not valid here; comparing it would only add noise */
        return;
    }

    /* check the mapping against the loaded data */
    compareMapping(code, mapping, length, *mapWithNorm, pErrorCode);
}
2003-03-20 01:11:51 +00:00
U_CDECL_END
2003-02-28 21:37:55 +00:00
static void
parseMappings ( const char * filename , UBool withNorm , UBool reportError , TestIDNA & test , UErrorCode * pErrorCode ) {
char * fields [ 3 ] [ 2 ] ;
if ( pErrorCode = = NULL | | U_FAILURE ( * pErrorCode ) ) {
return ;
}
u_parseDelimitedFile ( filename , ' ; ' , fields , 3 , caseMapLineFn , & withNorm , pErrorCode ) ;
//fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);
if ( U_FAILURE ( * pErrorCode ) & & ( reportError | | * pErrorCode ! = U_FILE_ACCESS_ERROR ) ) {
2003-04-17 20:35:47 +00:00
test . errln ( " testidn error: u_parseDelimitedFile( \" %s \" ) failed - %s \n " , filename , u_errorName ( * pErrorCode ) ) ;
2003-02-28 21:37:55 +00:00
}
}
/* parser for the RFC 3454 code point tables (unassigned / prohibited) ------ */
2003-03-20 01:11:51 +00:00
U_CDECL_BEGIN
2003-02-28 21:37:55 +00:00
/**
 * Line callback for u_parseDelimitedFile() used by parseTable().
 *
 * Field 0 holds a code point or a code point range. The range is handed to
 * compareFlagsForRange() together with the UBool that context points to.
 * Any failure of the range parser is reported as U_PARSE_ERROR.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    uint32_t first = 0;
    uint32_t last = 0;

    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);

    if (U_SUCCESS(*pErrorCode)) {
        UBool *expectUnassigned = (UBool *)context;
        compareFlagsForRange(first, last, *expectUnassigned, pErrorCode);
    } else {
        *pErrorCode = U_PARSE_ERROR;
    }
}
2003-03-20 01:11:51 +00:00
U_CDECL_END
2003-02-28 21:37:55 +00:00
static void
parseTable ( const char * filename , UBool isUnassigned , TestIDNA & test , UErrorCode * pErrorCode ) {
2003-03-14 21:39:33 +00:00
char * fields [ 2 ] [ 2 ] ;
2003-02-28 21:37:55 +00:00
int32_t len = 0 ;
if ( pErrorCode = = NULL | | U_FAILURE ( * pErrorCode ) ) {
return ;
}
u_parseDelimitedFile ( filename , ' ; ' , fields , 1 , unicodeDataLineFn , & isUnassigned , pErrorCode ) ;
if ( U_FAILURE ( * pErrorCode ) ) {
2003-04-17 20:35:47 +00:00
test . errln ( " testidn error: u_parseDelimitedFile( \" %s \" ) failed - %s \n " , filename , u_errorName ( * pErrorCode ) ) ;
2003-02-28 21:37:55 +00:00
}
}
/**
 * Looks up every code point from 0 to 0x10FFFF in idnTrie and logs how many
 * of them carry each kind of value. Does nothing if the data is not loaded.
 *
 * @param test  Test object used for log and error output.
 */
static void
testAllCodepoints(TestIDNA& test) {
    if (!isDataLoaded) {
        return;
    }

    int32_t unassignedCount = 0;
    int32_t prohibitedCount = 0;
    int32_t nfkcCount = 0;
    int32_t mapToNothingCount = 0;
    int32_t noValueCount = 0;

    for (uint32_t cp = 0; cp <= 0x10FFFF; ++cp) {
        uint32_t value = 0;
        UTRIE_GET16(&idnTrie, cp, value);

        if (value == UIDNA_NO_VALUE) {
            ++noValueCount;
            if (value > 0) {
                test.errln(" The return value for 0x%06X is wrong. %i \n ", cp, value);
            }
            continue;
        }

        /* the low three bits hold the flag, the bits above bit 4 the index */
        const uint32_t flagBits = value & 0x07;
        if (flagBits == UIDNA_UNASSIGNED) {
            ++unassignedCount;
        }
        if (flagBits == UIDNA_PROHIBITED) {
            ++prohibitedCount;
        }
        if ((value >> 5) == _IDNA_MAP_TO_NOTHING) {
            ++mapToNothingCount;
        }
        if (flagBits == UIDNA_MAP_NFKC) {
            ++nfkcCount;
        }
    }

    test.logln(" Number of Unassinged code points : %i \n ", unassignedCount);
    test.logln(" Number of Prohibited code points : %i \n ", prohibitedCount);
    test.logln(" Number of Mapped code points : %i \n ", mapToNothingCount);
    test.logln(" Number of Mapped with NFKC code points : %i \n ", nfkcCount);
    test.logln(" Number of code points that have no value in Trie: %i \n ", noValueCount);
}
/**
 * Splits a trie value into its three parts.
 *
 * @param result  Value read from the trie.
 * @param flag    Receives bits 0..2.
 * @param length  Receives bits 3..4.
 * @param index   Receives all bits from bit 5 upwards.
 */
static inline void getValues(uint32_t result, int8_t& flag,
                             int8_t& length, int32_t& index) {
    const uint32_t flagBits = result & 0x07;
    const uint32_t lengthBits = (result >> 3) & 0x03;
    const uint32_t indexBits = result >> 5;

    flag = (int8_t)flagBits;
    length = (int8_t)lengthBits;
    index = (int32_t)indexBits;
}
static void
compareMapping ( uint32_t codepoint , uint32_t * mapping , int32_t mapLength ,
UBool withNorm , UErrorCode * status ) {
if ( isDataLoaded ) {
uint32_t result = 0 ;
UTRIE_GET16 ( & idnTrie , codepoint , result ) ;
int8_t flag , length ;
int32_t index ;
getValues ( result , flag , length , index ) ;
if ( withNorm ) {
if ( flag ! = UIDNA_MAP_NFKC ) {
pTestIDNA - > errln ( " Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i \n " , codepoint , UIDNA_MAP_NFKC , flag ) ;
}
} else {
2003-03-12 23:16:57 +00:00
if ( flag = = UIDNA_NO_VALUE | | flag = = UIDNA_PROHIBITED ) {
2003-02-28 21:37:55 +00:00
if ( index ! = _IDNA_MAP_TO_NOTHING ) {
pTestIDNA - > errln ( " Did not get the assigned flag for codepoint 0x%08X. Expected: %i Got: %i \n " , codepoint , _IDNA_MAP_TO_NOTHING , index ) ;
}
}
}
if ( length = = _IDNA_LENGTH_IN_MAPPING_TABLE ) {
length = ( int8_t ) mappingData [ index ] ;
index + + ;
}
2003-05-19 21:07:32 +00:00
int32_t realLength = 0 ;
/* figure out the real length */
2003-05-20 03:03:26 +00:00
for ( int32_t j = 0 ; j < mapLength ; j + + ) {
if ( mapping [ j ] > 0xFFFF ) {
2003-05-19 21:07:32 +00:00
realLength + = 2 ;
} else {
realLength + + ;
}
}
2003-02-28 21:37:55 +00:00
2003-05-19 21:07:32 +00:00
if ( realLength ! = length ) {
2003-02-28 21:37:55 +00:00
pTestIDNA - > errln ( " Did not get the expected length. Expected: %i Got: %i \n " , mapLength , length ) ;
}
for ( int8_t i = 0 ; i < mapLength ; i + + ) {
if ( mapping [ i ] < = 0xFFFF ) {
if ( mappingData [ index + i ] ! = ( uint16_t ) mapping [ i ] ) {
pTestIDNA - > errln ( " Did not get the expected result. Expected: 0x%04X Got: 0x%04X \n " , mapping [ i ] , mappingData [ index + i ] ) ;
}
} else {
UChar lead = UTF16_LEAD ( mapping [ i ] ) ;
UChar trail = UTF16_TRAIL ( mapping [ i ] ) ;
if ( mappingData [ index + i ] ! = lead | |
mappingData [ index + i + 1 ] ! = trail ) {
pTestIDNA - > errln ( " Did not get the expected result. Expected: 0x%04X 0x%04X Got: 0x%04X 0x%04X " , lead , trail , mappingData [ index + i ] , mappingData [ index + i + 1 ] ) ;
}
}
}
}
}
/**
 * Checks the trie value of every code point in [start, end].
 *
 * For the unassigned table the whole trie value must equal UIDNA_UNASSIGNED;
 * for the prohibited table only the flag bits are compared with
 * UIDNA_PROHIBITED. Does nothing if the data is not loaded.
 *
 * @param start         First code point of the range.
 * @param end           Last code point of the range (inclusive).
 * @param isUnassigned  TRUE to check for unassigned, FALSE for prohibited.
 * @param status        Unused.
 */
static void
compareFlagsForRange(uint32_t start, uint32_t end,
                     UBool isUnassigned, UErrorCode *status) {
    if (!isDataLoaded) {
        return;
    }

    uint32_t result = 0;
    while (start < end + 1) {
        UTRIE_GET16(&idnTrie, start, result);
        if (isUnassigned) {
            if (result != UIDNA_UNASSIGNED) {
                pTestIDNA->errln(" UIDNA_UASSIGNED flag failed for 0x%06X. Expected: %04X Got: %04X \n ", start, UIDNA_UNASSIGNED, result);
            }
        } else {
            /* the flag occupies the low three bits, as in getValues() */
            if ((result & 0x07) != UIDNA_PROHIBITED) {
                pTestIDNA->errln(" UIDNA_PROHIBITED flag failed for 0x%06X. Expected: %04X Got: %04X \n \n ", start, UIDNA_PROHIBITED, result);
            }
        }
        start++;
    }
}
/**
 * Releases the loaded data and resets the load state so that the next
 * loadIDNData() call opens the data again.
 *
 * @return Always TRUE.
 */
UBool
cleanup() {
    UDataMemory *loaded = idnData;

    idnData = NULL;
    if (loaded != NULL) {
        udata_close(loaded);
    }

    isDataLoaded = FALSE;
    dataErrorCode = U_ZERO_ERROR;
    return TRUE;
}
2003-03-20 01:11:51 +00:00
U_CDECL_BEGIN
2003-02-28 21:37:55 +00:00
/**
 * Callback for udata_openChoice(): accepts a data item only if its header
 * has size >= 20, matches this platform's endianness and charset family,
 * carries the data format "IDNA", and has format version 2 with the
 * UTRIE_SHIFT / UTRIE_INDEX_SHIFT values this code was compiled with.
 *
 * @param pInfo  Header information of the candidate data item.
 * @return TRUE if the item can be used, FALSE otherwise.
 */
static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */, const char * /* name */,
             const UDataInfo *pInfo) {
    /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */
    static const uint8_t expectedFormat[4] = { 0x49, 0x44, 0x4e, 0x41 };

    if (pInfo->size < 20) {
        return FALSE;
    }
    if (pInfo->isBigEndian != U_IS_BIG_ENDIAN ||
        pInfo->charsetFamily != U_CHARSET_FAMILY) {
        return FALSE;
    }
    for (int32_t i = 0; i < 4; ++i) {
        if (pInfo->dataFormat[i] != expectedFormat[i]) {
            return FALSE;
        }
    }
    if (pInfo->formatVersion[0] != 2 ||
        pInfo->formatVersion[2] != UTRIE_SHIFT ||
        pInfo->formatVersion[3] != UTRIE_INDEX_SHIFT) {
        return FALSE;
    }
    return TRUE;
}
/*
 * idnTrie folding-offset callback: if bit 15 of the trie value is set,
 * the offset is the low 15 bits of the value; otherwise it is 0.
 */
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    const uint32_t hasOffset = data & 0x8000;
    return hasOffset ? (int32_t)(data & 0x7fff) : 0;
}
2003-03-20 01:11:51 +00:00
U_CDECL_END
2003-02-28 21:37:55 +00:00
/**
 * Loads the IDNA data (DATA_NAME, DATA_TYPE) and sets up the file-level
 * state: idnData, indexes, idnTrie, mappingData and isDataLoaded.
 * The result of the load attempt is also recorded in dataErrorCode.
 * Returns immediately if the data is already loaded.
 *
 * @param errorCode  In/out error code; if it already indicates a failure,
 *                   nothing is loaded.
 * @return TRUE if the data is loaded, FALSE otherwise.
 */
static UBool
loadIDNData(UErrorCode &errorCode) {
    if (isDataLoaded) {
        return isDataLoaded;
    }

    /* errorCode is a reference, so there is no NULL case to test for */
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }

    UTrie localTrie = { 0, 0, 0, 0, 0, 0, 0 };

    /* open the data outside the mutex block */
    UDataMemory *data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
    dataErrorCode = errorCode;
    if (U_FAILURE(errorCode)) {
        isDataLoaded = FALSE;
        return FALSE;
    }

    /* the data starts with _IDNA_INDEX_TOP int32_t indexes, followed by the trie */
    const int32_t *p = (const int32_t *)udata_getMemory(data);
    const uint8_t *pb = (const uint8_t *)(p + _IDNA_INDEX_TOP);
    utrie_unserialize(&localTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode);
    localTrie.getFoldingOffset = getFoldingOffset;

    if (U_FAILURE(errorCode)) {
        dataErrorCode = errorCode;
        udata_close(data);
        isDataLoaded = FALSE;
        return FALSE;
    }

    /* in the mutex block, set the data for this process */
    umtx_lock(NULL);
    if (idnData == NULL) {
        idnData = data;
        data = NULL;
        uprv_memcpy(&indexes, p, sizeof(indexes));
        uprv_memcpy(&idnTrie, &localTrie, sizeof(UTrie));
    } else {
        /* the data was set before; continue with the copy that is in use */
        p = (const int32_t *)udata_getMemory(idnData);
    }
    umtx_unlock(NULL);

    /* the mapping table follows the serialized trie */
    mappingData = (const uint16_t *)((const uint8_t *)(p + _IDNA_INDEX_TOP) + indexes[_IDNA_INDEX_TRIE_SIZE]);
    isDataLoaded = TRUE;

    /* if a different thread set it first, then close the extra data */
    if (data != NULL) {
        udata_close(data); /* NULL if it was set correctly */
    }

    return isDataLoaded;
}
2003-05-06 01:37:52 +00:00
# endif /* #if !UCONFIG_NO_IDNA */
2003-02-28 21:37:55 +00:00
/*
* Hey , Emacs , please set the following :
*
* Local Variables :
* indent - tabs - mode : nil
* End :
*
*/