ICU-868 Allow multiline parsing of aliases, and a list of standard tags

X-SVN-Rev: 8858
This commit is contained in:
George Rhoten 2002-06-12 17:29:40 +00:00
parent 5cb27bd920
commit dce8ee6d71
3 changed files with 186 additions and 49 deletions

View File

@ -63,6 +63,21 @@
# or names of algorithmic converters, and their case must not
# be changed - or else code and/or file names must also be changed.
# List of supported standard tags
{ IANA MIME
#ICU # Can also use ICU_FEATURE ICU_CANONICAL
#IBM AIX DB2
#WINDOWS MSIE # MSIE is Internet Explorer, which is different from Windows
#GLIBC
#JAVA
#SOLARIS
#APPLE
#HPUX
#ZOS ZOS_USS # Could be OS390 and OS390_USS instead
}
# Fully algorithmic converters
UTF-8 { IANA MIME } ibm-1208 cp1208
@ -84,7 +99,6 @@ UTF-32LE { IANA } UTF32_LittleEndian
UTF32_PlatformEndian
UTF32_OppositeEndian
UTF-7 { IANA MIME }
# On UTF-7:
# RFC 2152 (http://www.imc.org/rfc2152) allows some US-ASCII characters
# to be encoded either directly or in base64. In particular, the characters in set O
@ -94,10 +108,15 @@ UTF-7 { IANA MIME }
# By choosing the option "version=1", set O will be escaped instead.
# For example:
# utf7Converter=ucnv_open("UTF-7,version=1");
UTF-7 { IANA MIME }
SCSU { IANA }
BOCU-1
# See http://www.unicode.org/unicode/reports/tr26 for this Compatibility Encoding Scheme for UTF-16
# The Unicode Consortium does not encourage the use of CESU-8
CESU-8 { IANA }
ISO-8859-1 { MIME } LATIN_1 ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983 819 #!!!!! There's whole lot of names for this
US-ASCII { MIME } ascii ascii-7 ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6 cp367
@ -140,9 +159,6 @@ ISCII,version=8 iscii-mlm x-iscii-ma
ibm-367
# Special mapping for S/390 new line characters
ebcdic-xml-us
# Interchange codepages
ibm-912 iso-8859-2 { MIME } latin2 cp912 8859-2 csisolatin2 iso-ir-101 ISO_8859-2:1987 { IANA } l2 912 # Central Europe
ibm-913 iso-8859-3 { MIME } latin3 cp913 8859-3 csisolatin3 iso-ir-109 ISO_8859-3:1988 { IANA } l3 913 # Maltese Esperanto
@ -343,6 +359,8 @@ ibm-16804 cpibm16804 ebcdic-ar # EBCDIC Arabic
# EBCDIC codepages for S/390, with LF and NL codes swapped
ebcdic-xml-us
# without Euro
ibm-37-s390 ibm037-s390 # EBCDIC US
ibm-1047-s390 # EBCDIC for S/390 Open Edition

View File

@ -22,9 +22,6 @@
* and a 2.1 reader will be able to read 2.0.
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "unicode/ucnv.h" /* ucnv_compareNames() */
@ -34,6 +31,10 @@
#include "unewdata.h"
#include "uoptions.h"
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
/* TODO: Need to specify the maximum alias name length in a header (see ucnv_io.c::findalias()) */
#define STRING_STORE_SIZE 100000
@ -42,6 +43,8 @@
#define TAG_STORE_SIZE 20000
#define MAX_TAG_COUNT 200
#define MAX_LINE_SIZE 32767
#define DATA_NAME "cnvalias"
#define DATA_TYPE "dat"
@ -96,11 +99,23 @@ typedef struct {
static Tag tags[MAX_TAG_COUNT];
static uint16_t tagCount = 0;
/* Were the standard tags declared before the aliases? */
UBool standardTagsUsed = FALSE;
/* prototypes --------------------------------------------------------------- */
static void
parseLine(const char *line);
static void
parseFile(FileStream *in);
static int32_t
chomp(char *line);
static void
addOfficialTaggedStandards(char *line, int32_t lineLen);
static uint16_t
addAlias(const char *alias, uint16_t converter);
@ -131,11 +146,10 @@ static UOption options[]={
extern int
main(int argc, char* argv[]) {
char line[512];
char pathBuf[512];
const char *path;
FileStream *in;
UNewDataMemory *out;
char *s;
UErrorCode errorCode=U_ZERO_ERROR;
int i;
uint16_t tagOffset, stringOffset;
@ -170,14 +184,13 @@ main(int argc, char* argv[]) {
} else {
path=options[4].value;
if(path!=NULL && *path!=0) {
uprv_strcpy(line, path);
path=line+uprv_strlen(line);
if(*(path-1)!=U_FILE_SEP_CHAR) {
*((char *)path)=U_FILE_SEP_CHAR;
++path;
char *end = pathBuf+uprv_strlen(pathBuf);
uprv_strcpy(pathBuf, path);
if(*(end-1)!=U_FILE_SEP_CHAR) {
*(end++)=U_FILE_SEP_CHAR;
}
uprv_strcpy((char *)path, "convrtrs.txt");
path=line;
uprv_strcpy(end, "convrtrs.txt");
path=pathBuf;
} else {
path = "convrtrs.txt";
}
@ -188,21 +201,8 @@ main(int argc, char* argv[]) {
exit(U_FILE_ACCESS_ERROR);
}
/* read the list of aliases */
while(T_FileStream_readLine(in, line, sizeof(line))!=NULL) {
/* remove trailing newline characters */
s=line;
while(*s!=0) {
if(*s=='\r' || *s=='\n') {
*s=0;
break;
}
++s;
}
parseLine(line);
}
parseFile(in);
T_FileStream_close(in);
/* sort the aliases */
@ -268,6 +268,83 @@ main(int argc, char* argv[]) {
return 0;
}
/*
 * Read the alias table from `in` and process it one logical entry at a time.
 *
 * A logical entry is a line that does not start with whitespace, followed by
 * any number of continuation lines, which do start with whitespace.  Every
 * physical line is passed through chomp() first.  An entry ends at the next
 * line that is empty after chomp() or that does not start with whitespace,
 * or at the end of the input.
 *
 * An entry starting with '{' is handed to addOfficialTaggedStandards();
 * every other entry is handed to parseLine().  An entry that starts with
 * whitespace is an error and terminates the program.
 */
static void
parseFile(FileStream *in) {
    char line[MAX_LINE_SIZE];       /* the logical entry being assembled */
    char lastLine[MAX_LINE_SIZE];   /* the physical line most recently read */
    int32_t lineSize = 0;
    int32_t lastLineSize = 0;
    UBool validParse = TRUE;
    int32_t lineNum = 1;

    line[0] = 0;

    /* read the list of aliases */
    while (validParse) {
        /* validParse is set only when a following line terminated the entry. */
        validParse = FALSE;

        /* Read non-empty lines that don't start with a space character. */
        while (T_FileStream_readLine(in, lastLine, MAX_LINE_SIZE) != NULL) {
            lineNum++;
            lastLineSize = chomp(lastLine);
            if (lineSize == 0 || (lastLineSize > 0 && isspace((unsigned char)*lastLine))) {
                /* Start of a new entry, or a continuation of the current one. */
                if (lastLineSize >= MAX_LINE_SIZE - lineSize) {
                    fprintf(stderr, "error: line %d: alias entry is longer than %d characters\n",
                            (int)(lineNum - 1), (int)(MAX_LINE_SIZE - 1));
                    exit(U_BUFFER_OVERFLOW_ERROR);
                }
                uprv_strcpy(line + lineSize, lastLine);
                lineSize += lastLineSize;
            } else if (lineSize > 0) {
                /* lastLine belongs to the next entry (or is empty). */
                validParse = TRUE;
                break;
            }
        }

        /*
         * Process the entry if it was terminated by a following line, or if
         * the input ended while an entry was still pending.  Without the
         * second condition the last entry of a file that has no trailing
         * empty line would be silently dropped.
         */
        if (validParse || lineSize > 0) {
            if (isspace((unsigned char)*line)) {
                fprintf(stderr, "error: line %d: cannot start an alias with a space\n", lineNum-2);
                exit(1);
            } else if (line[0] == '{') {
                if (!standardTagsUsed && line[lineSize - 1] != '}') {
                    fprintf(stderr, "error: line %d: alias needs to start with a converter name\n", lineNum);
                    exit(1);
                }
                addOfficialTaggedStandards(line, lineSize);
                standardTagsUsed = TRUE;
            } else {
                parseLine(line);
            }

            /*
             * Carry the terminating line over as the start of the next entry.
             * At the end of the input lastLine has already been consumed, so
             * there is nothing to carry over.
             */
            if (validParse && lastLineSize > 0) {
                uprv_strcpy(line, lastLine);
                lineSize = lastLineSize;
            }
            else {
                lineSize = 0;
            }
        }
    }
}
/*
 * This works almost like the Perl chomp.
 *
 * The line is truncated in place at the first '\r', '\n' or '#', so newlines
 * and comments are removed.  Whitespace that trails the last non-whitespace
 * character is removed as well; preceding whitespace is kept.
 *
 * A line that contains nothing but whitespace before the truncation point is
 * left as it is, so its leading whitespace is still visible to the caller.
 *
 * @param line  NUL-terminated buffer, modified in place.
 * @return      the length of the resulting string.
 */
static int32_t
chomp(char *line) {
    char *s = line;
    char *end = NULL;   /* one past the last non-whitespace character seen so far */

    while (*s != 0) {
        /* truncate at a newline or a comment */
        if (*s == '\r' || *s == '\n' || *s == '#') {
            *s = 0;
            break;
        }
        /* The cast is required: passing a negative char to isspace() is undefined. */
        if (!isspace((unsigned char)*s)) {
            end = s + 1;
        }
        ++s;
    }

    /*
     * Cut off the trailing whitespace.  Tracking "one past" instead of the
     * character itself also handles a lone non-whitespace character at the
     * very start of the line.
     */
    if (end != NULL) {
        *end = 0;
        s = end;
    }
    return (int32_t)(s - line);
}
static void
parseLine(const char *line) {
uint16_t pos=0, start, limit, length, cnv;
@ -278,14 +355,14 @@ parseLine(const char *line) {
++pos;
}
/* is there only a comment on this line? */
if(line[pos]==0 || line[pos]=='#') {
/* is there nothing on this line? */
if(line[pos]==0) {
return;
}
/* get the converter name */
start=pos;
while(line[pos]!=0 && line[pos]!='#' && !isspace((unsigned char)line[pos])) {
while(line[pos]!=0 && !isspace((unsigned char)line[pos])) {
++pos;
}
limit=pos;
@ -408,12 +485,18 @@ static uint16_t
getTagNumber(const char *tag, uint16_t tagLen) {
char *atag;
uint16_t t;
UBool preferredName = (tag[tagLen - 1] == '*');
if (tagCount >= MAX_TAG_COUNT) {
fprintf(stderr, "gencnval: too many tags\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
if (preferredName) {
/* puts(tag);*/
tagLen--;
}
for (t = 0; t < tagCount; ++t) {
if (uprv_strlen(tags[t].tag) == tagLen && !uprv_strnicmp(tags[t].tag, tag, tagLen)) {
return t;
@ -421,18 +504,24 @@ getTagNumber(const char *tag, uint16_t tagLen) {
}
/* we need to add this tag */
if (tagCount >= MAX_TAG_COUNT) {
fprintf(stderr, "gencnval: too many tags\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
/* allocate a new entry in the tag table */
atag = allocString(&tagBlock, tagLen + 1);
uprv_memcpy(atag, tag, tagLen);
atag[tagLen] = 0;
if (standardTagsUsed) {
fprintf(stderr, "error: Tag \"%s\" is not declared at the beginning of the alias table.\n", atag);
exit(1);
}
else {
fprintf(stderr, "warning: Tag \"%s\" was added to the list of standards because it was not declared at beginning of the alias table.\n", atag);
}
/* add the tag to the tag table */
tags[tagCount].tag = atag;
/* Set the array of pointers to NULL */
@ -446,6 +535,40 @@ addTaggedAlias(uint16_t tag, const char *alias, uint16_t converter) {
tags[tag].aliases[converter] = alias;
}
/*
 * Register the standard tags listed between '{' and '}' in `line`.
 *
 * The tags are separated by spaces or tabs.  Each tag is copied into the tag
 * string block and appended to the tag table with an empty alias array.
 * `line` is modified in place: the closing '}' is overwritten and strtok()
 * splits the list.
 *
 * The program terminates if the braces are missing or if the tag table is
 * full.
 *
 * @param line     the entry holding the tag list, modified in place.
 * @param lineLen  length of `line`; currently unused.
 */
static void
addOfficialTaggedStandards(char *line, int32_t lineLen) {
    static const char WHITESPACE[] = " \t";
    char *atag;
    char *tag;
    char *listEnd;
    uint16_t tagSize;

    (void)lineLen;

    /* Locate the list; a missing brace would otherwise be a NULL dereference. */
    tag = strchr(line, '{');
    listEnd = (tag != NULL) ? strchr(tag, '}') : NULL;
    if (listEnd == NULL) {
        fprintf(stderr, "error: the list of standard tags must be enclosed in { and }\n");
        exit(1);
    }
    ++tag;
    *listEnd = 0;

    tag = strtok(tag, WHITESPACE);
    while (tag != NULL) {
        /* printf("Adding original tag \"%s\"\n", tag);*/

        /* Check the capacity for every tag, not only once for the whole list. */
        if (tagCount >= MAX_TAG_COUNT) {
            fprintf(stderr, "gencnval: too many tags\n");
            exit(U_BUFFER_OVERFLOW_ERROR);
        }

        /* the size includes the terminating NUL */
        tagSize = (uint16_t)(strlen(tag) + 1);

        /* allocate a new entry in the tag table */
        atag = allocString(&tagBlock, tagSize);
        uprv_memcpy(atag, tag, tagSize);

        /* add the tag to the tag table */
        tags[tagCount].tag = atag;

        /* Set the array of pointers to NULL */
        uprv_memset((void *)&tags[tagCount].aliases, 0, sizeof(tags[tagCount].aliases));
        tagCount++;

        /* Get next tag */
        tag = strtok(NULL, WHITESPACE);
    }
}
static uint16_t
addAlias(const char *alias, uint16_t converter) {
if(aliasCount>=MAX_ALIAS_COUNT) {

View File

@ -43,8 +43,8 @@ RSC=rc.exe
# PROP Intermediate_Dir "Release"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
# ADD CPP /nologo /MD /Za /W3 /GX /O2 /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
# ADD CPP /nologo /MD /Za /W3 /GX /O2 /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
# ADD BASE RSC /l 0x409 /d "NDEBUG"
# ADD RSC /l 0x409 /d "NDEBUG"
BSC32=bscmake.exe
@ -60,7 +60,7 @@ InputName=gencnval
SOURCE="$(InputPath)"
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(TargetPath) ..\..\..\bin
copy $(TargetPath) ..\..\..\bin
# End Custom Build
@ -77,8 +77,8 @@ SOURCE="$(InputPath)"
# PROP Intermediate_Dir "Debug"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FR /FD /GZ /c
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FR /FD /GZ /c
# ADD BASE RSC /l 0x409 /d "_DEBUG"
# ADD RSC /l 0x409 /d "_DEBUG"
BSC32=bscmake.exe
@ -94,7 +94,7 @@ InputName=gencnval
SOURCE="$(InputPath)"
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(TargetPath) ..\..\..\bin
copy $(TargetPath) ..\..\..\bin
# End Custom Build
@ -111,7 +111,7 @@ SOURCE="$(InputPath)"
# PROP Intermediate_Dir "Release"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN64" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN64" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
# ADD CPP /nologo /MD /Za /W3 /I "..\..\common" /I "..\toolutil" /D"WIN64" /D"NDEBUG" /D"_CONSOLE" /D"_MBCS" /FD /c /O2 /GX /Op /QIA64_fmaopt /D"_IA64_" /Zi /D"WIN64" /D"WIN32" /D"_AFX_NO_DAO_SUPPORT" /Wp64 /Zm600
# ADD BASE RSC /l 0x409 /d "NDEBUG"
# ADD RSC /l 0x409 /d "NDEBUG"
@ -128,7 +128,7 @@ InputName=gencnval
SOURCE="$(InputPath)"
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(TargetPath) ..\..\..\bin
copy $(TargetPath) ..\..\..\bin
# End Custom Build
@ -145,7 +145,7 @@ SOURCE="$(InputPath)"
# PROP Intermediate_Dir "Debug"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN64" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN64" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
# ADD CPP /nologo /MDd /Za /W3 /Gm /I "..\..\common" /I "..\toolutil" /D"WIN64" /D"_DEBUG" /D"_CONSOLE" /D"_MBCS" /FR /FD /GZ /c /Od /GX /Op /QIA64_fmaopt /D"_IA64_" /Zi /D"WIN64" /D"WIN32" /D"_AFX_NO_DAO_SUPPORT" /Wp64 /Zm600
# ADD BASE RSC /l 0x409 /d "_DEBUG"
# ADD RSC /l 0x409 /d "_DEBUG"
@ -162,7 +162,7 @@ InputName=gencnval
SOURCE="$(InputPath)"
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(TargetPath) ..\..\..\bin
copy $(TargetPath) ..\..\..\bin
# End Custom Build
@ -181,10 +181,6 @@ SOURCE="$(InputPath)"
SOURCE=.\gencnval.c
# End Source File
# Begin Source File
SOURCE=..\..\common\ucnv_io.c
# End Source File
# End Group
# Begin Group "Header Files"