ICU-868 Allow multiline parsing of aliases, and a list of standard tags
X-SVN-Rev: 8858
This commit is contained in:
parent
5cb27bd920
commit
dce8ee6d71
@ -63,6 +63,21 @@
|
||||
# or names of algorithmic converters, and their case must not
|
||||
# be changed - or else code and/or file names must also be changed.
|
||||
|
||||
# List of supported standard tags
|
||||
{ IANA MIME
|
||||
#ICU # Can also use ICU_FEATURE ICU_CANONICAL
|
||||
#IBM AIX DB2
|
||||
#WINDOWS MSIE # MSIE is Internet Explorer, which is different from Windows
|
||||
#GLIBC
|
||||
#JAVA
|
||||
#SOLARIS
|
||||
#APPLE
|
||||
#HPUX
|
||||
#ZOS ZOS_USS # Could be OS390 and OS390_USS instead
|
||||
}
|
||||
|
||||
|
||||
|
||||
# Fully algorithmic converters
|
||||
|
||||
UTF-8 { IANA MIME } ibm-1208 cp1208
|
||||
@ -84,7 +99,6 @@ UTF-32LE { IANA } UTF32_LittleEndian
|
||||
UTF32_PlatformEndian
|
||||
UTF32_OppositeEndian
|
||||
|
||||
UTF-7 { IANA MIME }
|
||||
# On UTF-7:
|
||||
# RFC 2152 (http://www.imc.org/rfc2152) allows to encode some US-ASCII
|
||||
# characters directly or in base64. Especially, the characters in set O
|
||||
@ -94,10 +108,15 @@ UTF-7 { IANA MIME }
|
||||
# By choosing the option "version=1", set O will be escaped instead.
|
||||
# For example:
|
||||
# utf7Converter=ucnv_open("UTF-7,version=1");
|
||||
UTF-7 { IANA MIME }
|
||||
|
||||
SCSU { IANA }
|
||||
BOCU-1
|
||||
|
||||
# See http://www.unicode.org/unicode/reports/tr26 for this Compatibility Encoding Scheme for UTF-16
|
||||
# The Unicode Consortium does not encourage the use of CESU-8
|
||||
CESU-8 { IANA }
|
||||
|
||||
ISO-8859-1 { MIME } LATIN_1 ibm-819 cp819 latin1 8859-1 csisolatin1 iso-ir-100 ISO_8859-1:1987 { IANA } l1 ANSI_X3.110-1983 819 #!!!!! There's a whole lot of names for this
|
||||
US-ASCII { MIME } ascii ascii-7 ANSI_X3.4-1968 { IANA } ANSI_X3.4-1986 ISO_646.irv:1991 iso646-us us csASCII 646 iso-ir-6 cp367
|
||||
|
||||
@ -140,9 +159,6 @@ ISCII,version=8 iscii-mlm x-iscii-ma
|
||||
|
||||
ibm-367
|
||||
|
||||
# Special mapping for S/390 new line characters
|
||||
ebcdic-xml-us
|
||||
|
||||
# Interchange codepages
|
||||
ibm-912 iso-8859-2 { MIME } latin2 cp912 8859-2 csisolatin2 iso-ir-101 ISO_8859-2:1987 { IANA } l2 912 # Central Europe
|
||||
ibm-913 iso-8859-3 { MIME } latin3 cp913 8859-3 csisolatin3 iso-ir-109 ISO_8859-3:1988 { IANA } l3 913 # Maltese Esperanto
|
||||
@ -343,6 +359,8 @@ ibm-16804 cpibm16804 ebcdic-ar # EBCDIC Arabic
|
||||
|
||||
# EBCDIC codepages for S/390, with LF and NL codes swapped
|
||||
|
||||
ebcdic-xml-us
|
||||
|
||||
# without Euro
|
||||
ibm-37-s390 ibm037-s390 # EBCDIC US
|
||||
ibm-1047-s390 # EBCDIC for S/390 Open Edition
|
||||
|
@ -22,9 +22,6 @@
|
||||
* and a 2.1 reader will be able to read 2.0.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/ucnv.h" /* ucnv_compareNames() */
|
||||
@ -34,6 +31,10 @@
|
||||
#include "unewdata.h"
|
||||
#include "uoptions.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
|
||||
/* TODO: Need to specify the maximum alias name length in a header (see ucnv_io.c::findalias()) */
|
||||
|
||||
#define STRING_STORE_SIZE 100000
|
||||
@ -42,6 +43,8 @@
|
||||
#define TAG_STORE_SIZE 20000
|
||||
#define MAX_TAG_COUNT 200
|
||||
|
||||
#define MAX_LINE_SIZE 32767
|
||||
|
||||
#define DATA_NAME "cnvalias"
|
||||
#define DATA_TYPE "dat"
|
||||
|
||||
@ -96,11 +99,23 @@ typedef struct {
|
||||
static Tag tags[MAX_TAG_COUNT];
|
||||
static uint16_t tagCount = 0;
|
||||
|
||||
/* Were the standard tags declared before the aliases? */
|
||||
UBool standardTagsUsed = FALSE;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
parseLine(const char *line);
|
||||
|
||||
static void
|
||||
parseFile(FileStream *in);
|
||||
|
||||
static int32_t
|
||||
chomp(char *line);
|
||||
|
||||
static void
|
||||
addOfficialTaggedStandards(char *line, int32_t lineLen);
|
||||
|
||||
static uint16_t
|
||||
addAlias(const char *alias, uint16_t converter);
|
||||
|
||||
@ -131,11 +146,10 @@ static UOption options[]={
|
||||
|
||||
extern int
|
||||
main(int argc, char* argv[]) {
|
||||
char line[512];
|
||||
char pathBuf[512];
|
||||
const char *path;
|
||||
FileStream *in;
|
||||
UNewDataMemory *out;
|
||||
char *s;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
int i;
|
||||
uint16_t tagOffset, stringOffset;
|
||||
@ -170,14 +184,13 @@ main(int argc, char* argv[]) {
|
||||
} else {
|
||||
path=options[4].value;
|
||||
if(path!=NULL && *path!=0) {
|
||||
uprv_strcpy(line, path);
|
||||
path=line+uprv_strlen(line);
|
||||
if(*(path-1)!=U_FILE_SEP_CHAR) {
|
||||
*((char *)path)=U_FILE_SEP_CHAR;
|
||||
++path;
|
||||
char *end = pathBuf+uprv_strlen(pathBuf);
|
||||
uprv_strcpy(pathBuf, path);
|
||||
if(*(end-1)!=U_FILE_SEP_CHAR) {
|
||||
*(end++)=U_FILE_SEP_CHAR;
|
||||
}
|
||||
uprv_strcpy((char *)path, "convrtrs.txt");
|
||||
path=line;
|
||||
uprv_strcpy(end, "convrtrs.txt");
|
||||
path=pathBuf;
|
||||
} else {
|
||||
path = "convrtrs.txt";
|
||||
}
|
||||
@ -188,21 +201,8 @@ main(int argc, char* argv[]) {
|
||||
exit(U_FILE_ACCESS_ERROR);
|
||||
}
|
||||
|
||||
/* read the list of aliases */
|
||||
while(T_FileStream_readLine(in, line, sizeof(line))!=NULL) {
|
||||
/* remove trailing newline characters */
|
||||
s=line;
|
||||
while(*s!=0) {
|
||||
if(*s=='\r' || *s=='\n') {
|
||||
*s=0;
|
||||
break;
|
||||
}
|
||||
++s;
|
||||
}
|
||||
|
||||
parseLine(line);
|
||||
}
|
||||
|
||||
parseFile(in);
|
||||
T_FileStream_close(in);
|
||||
|
||||
/* sort the aliases */
|
||||
@ -268,6 +268,83 @@ main(int argc, char* argv[]) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
parseFile(FileStream *in) {
|
||||
char line[MAX_LINE_SIZE];
|
||||
char lastLine[MAX_LINE_SIZE];
|
||||
int32_t lineSize = 0;
|
||||
int32_t lastLineSize;
|
||||
UBool validParse = TRUE;
|
||||
int32_t lineNum = 1;
|
||||
|
||||
/* read the list of aliases */
|
||||
while (validParse) {
|
||||
validParse = FALSE;
|
||||
|
||||
/* Read non-empty lines that don't start with a space character. */
|
||||
while (T_FileStream_readLine(in, lastLine, MAX_LINE_SIZE) != NULL) {
|
||||
lineNum++;
|
||||
lastLineSize = chomp(lastLine);
|
||||
if (lineSize == 0 || (lastLineSize > 0 && isspace(*lastLine))) {
|
||||
uprv_strcpy(line + lineSize, lastLine);
|
||||
lineSize += lastLineSize;
|
||||
} else if (lineSize > 0) {
|
||||
validParse = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (validParse) {
|
||||
if (isspace(*line)) {
|
||||
fprintf(stderr, "error: line %d: cannot start an alias with a space\n", lineNum-2);
|
||||
exit(1);
|
||||
} else if (line[0] == '{') {
|
||||
if (!standardTagsUsed && line[lineSize - 1] != '}') {
|
||||
fprintf(stderr, "error: line %d: alias needs to start with a converter name\n", lineNum);
|
||||
exit(1);
|
||||
}
|
||||
addOfficialTaggedStandards(line, lineSize);
|
||||
standardTagsUsed = TRUE;
|
||||
} else {
|
||||
parseLine(line);
|
||||
}
|
||||
/* Was the last line consumed */
|
||||
if (lastLineSize > 0) {
|
||||
uprv_strcpy(line, lastLine);
|
||||
lineSize = lastLineSize;
|
||||
}
|
||||
else {
|
||||
lineSize = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* This works almost like the Perl chomp.
|
||||
It removes the newlines, comments and trailing whitespace (not preceding whitespace).
|
||||
*/
|
||||
static int32_t
|
||||
chomp(char *line) {
|
||||
char *s = line;
|
||||
char *lastNonSpace = line;
|
||||
while(*s!=0) {
|
||||
/* truncate at a newline or a comment */
|
||||
if(*s == '\r' || *s == '\n' || *s == '#') {
|
||||
*s = 0;
|
||||
break;
|
||||
}
|
||||
if (!isspace(*s)) {
|
||||
lastNonSpace = s;
|
||||
}
|
||||
++s;
|
||||
}
|
||||
if (lastNonSpace++ > line) {
|
||||
*lastNonSpace = 0;
|
||||
s = lastNonSpace;
|
||||
}
|
||||
return (int32_t)(s - line);
|
||||
}
|
||||
|
||||
static void
|
||||
parseLine(const char *line) {
|
||||
uint16_t pos=0, start, limit, length, cnv;
|
||||
@ -278,14 +355,14 @@ parseLine(const char *line) {
|
||||
++pos;
|
||||
}
|
||||
|
||||
/* is there only a comment on this line? */
|
||||
if(line[pos]==0 || line[pos]=='#') {
|
||||
/* is there nothing on this line? */
|
||||
if(line[pos]==0) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the converter name */
|
||||
start=pos;
|
||||
while(line[pos]!=0 && line[pos]!='#' && !isspace((unsigned char)line[pos])) {
|
||||
while(line[pos]!=0 && !isspace((unsigned char)line[pos])) {
|
||||
++pos;
|
||||
}
|
||||
limit=pos;
|
||||
@ -408,12 +485,18 @@ static uint16_t
|
||||
getTagNumber(const char *tag, uint16_t tagLen) {
|
||||
char *atag;
|
||||
uint16_t t;
|
||||
UBool preferredName = (tag[tagLen - 1] == '*');
|
||||
|
||||
if (tagCount >= MAX_TAG_COUNT) {
|
||||
fprintf(stderr, "gencnval: too many tags\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
|
||||
if (preferredName) {
|
||||
/* puts(tag);*/
|
||||
tagLen--;
|
||||
}
|
||||
|
||||
for (t = 0; t < tagCount; ++t) {
|
||||
if (uprv_strlen(tags[t].tag) == tagLen && !uprv_strnicmp(tags[t].tag, tag, tagLen)) {
|
||||
return t;
|
||||
@ -421,18 +504,24 @@ getTagNumber(const char *tag, uint16_t tagLen) {
|
||||
}
|
||||
|
||||
/* we need to add this tag */
|
||||
|
||||
if (tagCount >= MAX_TAG_COUNT) {
|
||||
fprintf(stderr, "gencnval: too many tags\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
|
||||
/* allocate a new entry in the tag table */
|
||||
|
||||
atag = allocString(&tagBlock, tagLen + 1);
|
||||
uprv_memcpy(atag, tag, tagLen);
|
||||
atag[tagLen] = 0;
|
||||
|
||||
if (standardTagsUsed) {
|
||||
fprintf(stderr, "error: Tag \"%s\" is not declared at the beginning of the alias table.\n", atag);
|
||||
exit(1);
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "warning: Tag \"%s\" was added to the list of standards because it was not declared at beginning of the alias table.\n", atag);
|
||||
}
|
||||
|
||||
/* add the tag to the tag table */
|
||||
tags[tagCount].tag = atag;
|
||||
/* Set the array of pointers to NULL */
|
||||
@ -446,6 +535,40 @@ addTaggedAlias(uint16_t tag, const char *alias, uint16_t converter) {
|
||||
tags[tag].aliases[converter] = alias;
|
||||
}
|
||||
|
||||
static void
|
||||
addOfficialTaggedStandards(char *line, int32_t lineLen) {
|
||||
char *atag;
|
||||
char *tag = strchr(line, '{') + 1;
|
||||
uint16_t tagSize;
|
||||
static const char WHITESPACE[] = " \t";
|
||||
|
||||
if (tagCount >= MAX_TAG_COUNT) {
|
||||
fprintf(stderr, "gencnval: too many tags\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
strchr(tag, '}')[0] = 0;
|
||||
|
||||
tag = strtok(tag, WHITESPACE);
|
||||
while (tag != NULL) {
|
||||
/* printf("Adding original tag \"%s\"\n", tag);*/
|
||||
|
||||
tagSize = strlen(tag) + 1;
|
||||
/* allocate a new entry in the tag table */
|
||||
|
||||
atag = allocString(&tagBlock, tagSize);
|
||||
uprv_memcpy(atag, tag, tagSize);
|
||||
|
||||
/* add the tag to the tag table */
|
||||
tags[tagCount].tag = atag;
|
||||
/* Set the array of pointers to NULL */
|
||||
uprv_memset((void *)&tags[tagCount].aliases, 0, sizeof(tags[tagCount].aliases));
|
||||
tagCount++;
|
||||
|
||||
/* Get next tag */
|
||||
tag = strtok(NULL, WHITESPACE);
|
||||
}
|
||||
}
|
||||
|
||||
static uint16_t
|
||||
addAlias(const char *alias, uint16_t converter) {
|
||||
if(aliasCount>=MAX_ALIAS_COUNT) {
|
||||
|
@ -43,8 +43,8 @@ RSC=rc.exe
|
||||
# PROP Intermediate_Dir "Release"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
# ADD CPP /nologo /MD /Za /W3 /GX /O2 /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
# ADD CPP /nologo /MD /Za /W3 /GX /O2 /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
# ADD BASE RSC /l 0x409 /d "NDEBUG"
|
||||
# ADD RSC /l 0x409 /d "NDEBUG"
|
||||
BSC32=bscmake.exe
|
||||
@ -60,7 +60,7 @@ InputName=gencnval
|
||||
SOURCE="$(InputPath)"
|
||||
|
||||
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
|
||||
# End Custom Build
|
||||
|
||||
@ -77,8 +77,8 @@ SOURCE="$(InputPath)"
|
||||
# PROP Intermediate_Dir "Debug"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
|
||||
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FR /FD /GZ /c
|
||||
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
|
||||
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FR /FD /GZ /c
|
||||
# ADD BASE RSC /l 0x409 /d "_DEBUG"
|
||||
# ADD RSC /l 0x409 /d "_DEBUG"
|
||||
BSC32=bscmake.exe
|
||||
@ -94,7 +94,7 @@ InputName=gencnval
|
||||
SOURCE="$(InputPath)"
|
||||
|
||||
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
|
||||
# End Custom Build
|
||||
|
||||
@ -111,7 +111,7 @@ SOURCE="$(InputPath)"
|
||||
# PROP Intermediate_Dir "Release"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN64" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN64" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
# ADD CPP /nologo /MD /Za /W3 /I "..\..\common" /I "..\toolutil" /D"WIN64" /D"NDEBUG" /D"_CONSOLE" /D"_MBCS" /FD /c /O2 /GX /Op /QIA64_fmaopt /D"_IA64_" /Zi /D"WIN64" /D"WIN32" /D"_AFX_NO_DAO_SUPPORT" /Wp64 /Zm600
|
||||
# ADD BASE RSC /l 0x409 /d "NDEBUG"
|
||||
# ADD RSC /l 0x409 /d "NDEBUG"
|
||||
@ -128,7 +128,7 @@ InputName=gencnval
|
||||
SOURCE="$(InputPath)"
|
||||
|
||||
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
|
||||
# End Custom Build
|
||||
|
||||
@ -145,7 +145,7 @@ SOURCE="$(InputPath)"
|
||||
# PROP Intermediate_Dir "Debug"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN64" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
|
||||
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN64" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
|
||||
# ADD CPP /nologo /MDd /Za /W3 /Gm /I "..\..\common" /I "..\toolutil" /D"WIN64" /D"_DEBUG" /D"_CONSOLE" /D"_MBCS" /FR /FD /GZ /c /Od /GX /Op /QIA64_fmaopt /D"_IA64_" /Zi /D"WIN64" /D"WIN32" /D"_AFX_NO_DAO_SUPPORT" /Wp64 /Zm600
|
||||
# ADD BASE RSC /l 0x409 /d "_DEBUG"
|
||||
# ADD RSC /l 0x409 /d "_DEBUG"
|
||||
@ -162,7 +162,7 @@ InputName=gencnval
|
||||
SOURCE="$(InputPath)"
|
||||
|
||||
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
|
||||
# End Custom Build
|
||||
|
||||
@ -181,10 +181,6 @@ SOURCE="$(InputPath)"
|
||||
|
||||
SOURCE=.\gencnval.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=..\..\common\ucnv_io.c
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Header Files"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user