ICU-1970 add ISO_Comment

X-SVN-Rev: 9046
This commit is contained in:
Markus Scherer 2002-07-08 16:31:25 +00:00
parent 98dbc49f16
commit da30a01ee9
2 changed files with 117 additions and 49 deletions

View File

@ -215,6 +215,31 @@ u_charName(UChar32 code, UCharNameChoice nameChoice,
return u_terminateChars(buffer, bufferLength, length, pErrorCode);
}
#define _U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
/**
 * Get the ISO 10646 comment for a code point.
 *
 * The comment is stored in the names data like a normal character name and
 * is selected with the internal _U_ISO_COMMENT name choice.
 *
 * @param c            the code point to look up
 * @param dest         output buffer; may be NULL only if destCapacity==0
 * @param destCapacity capacity of dest in chars; must not be negative
 * @param pErrorCode   in/out error code; the function does nothing if this
 *                     is NULL or already indicates a failure, and sets
 *                     U_ILLEGAL_ARGUMENT_ERROR for bad dest/destCapacity
 * @return 0 on argument errors; otherwise the result of u_terminateChars()
 *         for the comment length (length 0 if c is out of range or the
 *         names data could not be loaded)
 */
U_CAPI int32_t U_EXPORT2
u_getISOComment(UChar32 c,
                char *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    int32_t length;
    uint16_t capacity16;

    /* check the argument values */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
        return u_terminateChars(dest, destCapacity, 0, pErrorCode);
    }

    /*
     * The capacity is handed to getName() as a 16-bit value.
     * Clamp it instead of truncating it: a plain (uint16_t) cast would turn
     * e.g. 0x10000 into 0, so that nothing is written into dest although
     * the caller provided plenty of room.
     */
    if(destCapacity>0xffff) {
        capacity16=0xffff;
    } else {
        capacity16=(uint16_t)destCapacity;
    }

    /* the ISO comment is stored like a normal character name */
    length=getName(uCharNames, (uint32_t)c, _U_ISO_COMMENT, dest, capacity16);
    return u_terminateChars(dest, destCapacity, length, pErrorCode);
}
U_CAPI UChar32 U_EXPORT2
u_charFromName(UCharNameChoice nameChoice,
const char *name,
@ -596,7 +621,7 @@ expandName(UCharNames *names,
uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
uint8_t c;
if(nameChoice==U_UNICODE_10_CHAR_NAME) {
if(nameChoice==U_UNICODE_10_CHAR_NAME || nameChoice==_U_ISO_COMMENT) {
/*
* skip the modern name if it is not requested _and_
* if the semicolon byte value is a character, not a token number
@ -608,6 +633,15 @@ expandName(UCharNames *names,
break;
}
}
if(nameChoice==_U_ISO_COMMENT) {
/* skip the Unicode 1.0 name as well to get the ISO comment */
while(nameLength>0) {
--nameLength;
if(*name++==';') {
break;
}
}
}
} else {
/*
* the semicolon byte value is a token number, therefore

View File

@ -16,7 +16,7 @@
* This program reads the Unicode character database text file,
* parses it, and extracts the character code,
* the "modern" character name, and optionally the
* Unicode 1.0 character name.
* Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
* It then tokenizes and compresses the names and builds
* compact binary tables for random-access lookup
* in a u_charName() API function.
@ -53,6 +53,9 @@
* tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
* append zero-terminated tokenString;
*
* Different strings for a code point - normal name, 1.0 name, and ISO comment -
* are separated by ';'.
*
* uint16_t groupCount;
* struct {
* uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
@ -242,7 +245,7 @@ static void
countWord(Word *word);
static void
addLine(uint32_t code, char *name1, int16_t name1Length, char *name2, int16_t name2Length);
addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
@ -257,7 +260,7 @@ static void
appendLineLengthNibble(uint8_t nibble);
static uint8_t *
allocLine(uint32_t length);
allocLine(int32_t length);
static uint8_t *
allocWord(uint32_t length);
@ -284,7 +287,7 @@ main(int argc, char* argv[]) {
/* preset then read command line options */
options[5].value=u_getDataDirectory();
options[6].value="3.1.1";
options[6].value="3.2";
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
/* error handling, printing usage message */
@ -355,10 +358,10 @@ static void
lineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode) {
char *names[3];
int16_t lengths[3];
static uint32_t prevCode=0;
uint32_t code=0;
char *name1Start, *name2Start;
int16_t name1Length, name2Length;
if(U_FAILURE(*pErrorCode)) {
return;
@ -367,25 +370,32 @@ lineFn(void *context,
code=uprv_strtoul(fields[0][0], NULL, 16);
/* get the character name */
name1Start=fields[1][0];
names[0]=fields[1][0];
if(fields[1][0][0]!='<') {
name1Length=(int16_t)(fields[1][1]-name1Start);
lengths[0]=(int16_t)(fields[1][1]-names[0]);
} else {
/* do not store pseudo-names in <> brackets */
name1Length=0;
lengths[0]=0;
}
/* store 1.0 names */
/* get the second character name, the one from Unicode 1.0 */
/* do not store pseudo-names in <> brackets */
name2Start=fields[10][0];
names[1]=fields[10][0];
if(*(UBool *)context && fields[10][0][0]!='<') {
name2Length=(int16_t)(fields[10][1]-name2Start);
lengths[1]=(int16_t)(fields[10][1]-names[1]);
} else {
name2Length=0;
lengths[1]=0;
}
if(name1Length+name2Length==0) {
/* get the ISO 10646 comment */
names[2]=fields[11][0];
lengths[2]=(int16_t)(fields[11][1]-names[2]);
if(lengths[2]!=0) {
char *s=names[2];
}
if(lengths[0]+lengths[1]+lengths[2]==0) {
return;
}
@ -406,22 +416,27 @@ lineFn(void *context,
}
prevCode=code;
/* printf("%lx:%.*s(%.*s)\n", code, name1Length, line+name1Start, name2Length, line+name2Start); */
parseName(names[0], lengths[0]);
parseName(names[1], lengths[1]);
parseName(names[2], lengths[2]);
parseName(name1Start, name1Length);
parseName(name2Start, name2Length);
addLine(code, name1Start, name1Length, name2Start, name2Length);
/*
* set the count argument to
* 1: only store regular names
* 2: store regular and 1.0 names
* 3: store names and ISO 10646 comment
*/
addLine(code, names, lengths, 3);
}
static void
parseDB(const char *filename, UBool store10Names) {
char *fields[11][2];
char *fields[15][2];
UErrorCode errorCode=U_ZERO_ERROR;
/* parsing the 11 fields 0..10 is enough for gennames */
u_parseDelimitedFile(filename, ';', fields, 11, lineFn, &store10Names, &errorCode);
u_parseDelimitedFile(filename, ';', fields, 15, lineFn, &store10Names, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
exit(errorCode);
}
@ -482,17 +497,23 @@ parseName(char *name, int16_t length) {
}
}
static U_INLINE
isWordChar(char c) {
return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
('J'<=c && c<='R') ||
('S'<=c && c<='Z') ||
('a'<=c && c<='i') || /* lowercase letters for ISO comments */
('j'<=c && c<='r') ||
('s'<=c && c<='z') ||
('0'<=c && c<='9');
}
static int16_t
skipNoise(char *line, int16_t start, int16_t limit) {
char c;
/* skip anything that is not part of a word in this sense */
while(start<limit &&
!(('A'<=(c=line[start]) && c<='I') || /* EBCDIC-safe check for letters */
('J'<=c && c<='R') ||
('S'<=c && c<='Z') ||
('0'<=c && c<='9'))
) {
while(start<limit && !isWordChar(line[start])) {
++start;
}
@ -504,17 +525,12 @@ getWord(char *line, int16_t start, int16_t limit) {
char c=0; /* initialize to avoid a compiler warning although the code was safe */
/* a unicode character name word consists of A-Z0-9 */
while(start<limit &&
(('A'<=(c=line[start]) && c<='I') || /* EBCDIC-safe check for letters */
('J'<=c && c<='R') ||
('S'<=c && c<='Z') ||
('0'<=c && c<='9'))
) {
while(start<limit && isWordChar(line[start])) {
++start;
}
/* include a following space or dash */
if(start<limit && (c==' ' || c=='-')) {
if(start<limit && ((c=line[start])==' ' || c=='-')) {
++start;
}
@ -1103,28 +1119,46 @@ countWord(Word *word) {
}
static void
addLine(uint32_t code, char *name1, int16_t name1Length, char *name2, int16_t name2Length) {
addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
uint8_t *stringStart;
Line *line;
int16_t length;
int16_t i, length;
if(lineCount==MAX_LINE_COUNT) {
fprintf(stderr, "gennames: too many lines\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
length=name1Length;
if(name2Length>0) {
length=(int16_t)(length+1+name2Length);
/* find the last non-empty name */
while(count>0 && lengths[count-1]==0) {
--count;
}
if(count==0) {
return; /* should not occur: caller should not have called */
}
stringStart=allocLine(length);
if(name1Length>0) {
uprv_memcpy(stringStart, name1, name1Length);
/* there will be (count-1) separator characters */
i=count;
length=count-1;
/* add lengths of strings */
while(i>0) {
length+=lengths[--i];
}
if(name2Length>0) {
stringStart[name1Length]=NAME_SEPARATOR_CHAR;
uprv_memcpy(stringStart+name1Length+1, name2, name2Length);
/* allocate line memory */
stringStart=allocLine(length);
/* copy all strings into the line memory */
length=0; /* number of chars copied so far */
for(i=0; i<count; ++i) {
if(i>0) {
stringStart[length++]=NAME_SEPARATOR_CHAR;
}
if(lengths[i]>0) {
uprv_memcpy(stringStart+length, names[i], lengths[i]);
length+=lengths[i];
}
}
line=lines+lineCount;
@ -1201,7 +1235,7 @@ appendLineLengthNibble(uint8_t nibble) {
}
static uint8_t *
allocLine(uint32_t length) {
allocLine(int32_t length) {
uint32_t top=lineTop+length;
uint8_t *p;