1999-08-16 21:50:52 +00:00
|
|
|
/*
*
* Copyright (C) 1998-1999, International Business Machines
* Corporation and others. All Rights Reserved.
*
* @version 1.0 06/19/98
* @author Helena Shih
* Based on Taligent international support for C++
*/
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <assert.h>
|
|
|
|
|
|
|
|
#include "ucmp16.h"
|
2000-04-25 21:33:15 +00:00
|
|
|
|
|
|
|
#if U_IOSTREAM_SOURCE >= 199711
|
|
|
|
#include <iostream>
|
|
|
|
using namespace std;
|
|
|
|
#elif U_IOSTREAM_SOURCE >= 198506
|
|
|
|
#include <iostream.h>
|
|
|
|
#endif
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
/* Case-mapping table shared by getArray() (which builds it on first call) and
 * writeArrays() (which prints it). Stays 0 until getArray() has opened it;
 * entries are created with the default value (int16_t)0xffff. */
CompactShortArray* ulxfrmArray = 0;
|
|
|
|
|
|
|
|
/*
 * Numeric codes for the character categories handled by this tool.
 * The trailing comment on each enumerator is the two-letter tag that
 * tagValues associates with it.
 */
enum ECharTypeMapping {
    UNASSIGNED              =  0,   /* Cn */
    UPPERCASE_LETTER        =  1,   /* Lu */
    LOWERCASE_LETTER        =  2,   /* Ll */
    TITLECASE_LETTER        =  3,   /* Lt */
    MODIFIER_LETTER         =  4,   /* Lm */
    OTHER_LETTER            =  5,   /* Lo */
    NON_SPACING_MARK        =  6,   /* Mn */
    ENCLOSING_MARK          =  7,   /* Me */
    COMBINING_SPACING_MARK  =  8,   /* Mc */
    DECIMAL_DIGIT_NUMBER    =  9,   /* Nd */
    LETTER_NUMBER           = 10,   /* Nl */
    OTHER_NUMBER            = 11,   /* No */
    SPACE_SEPARATOR         = 12,   /* Zs */
    LINE_SEPARATOR          = 13,   /* Zl */
    PARAGRAPH_SEPARATOR     = 14,   /* Zp */
    CONTROL                 = 15,   /* Cc */
    FORMAT                  = 16,   /* Cf */
    PRIVATE_USE             = 17,   /* Co */
    SURROGATE               = 18,   /* Cs */
    DASH_PUNCTUATION        = 19,   /* Pd */
    START_PUNCTUATION       = 20,   /* Ps */
    END_PUNCTUATION         = 21,   /* Pe */
    CONNECTOR_PUNCTUATION   = 22,   /* Pc */
    OTHER_PUNCTUATION       = 23,   /* Po */
    MATH_SYMBOL             = 24,   /* Sm */
    CURRENCY_SYMBOL         = 25,   /* Sc */
    MODIFIER_SYMBOL         = 26,   /* Sk */
    OTHER_SYMBOL            = 27,   /* So */
    INITIAL_PUNCTUATION     = 28,   /* Pi */
    FINAL_PUNCTUATION       = 29    /* Pf */
};
|
|
|
|
|
|
|
|
/* NOTE(review): presumably the highest code unit expected in the input data
 * file; nothing in the visible part of this file reads it - confirm whether
 * it is still needed. */
static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;
|
|
|
|
/* All recognised two-letter category tags, concatenated. The tag that starts
 * at character offset 2*k pairs with tagValues[k]; MakeProp() searches this
 * string to turn a tag into that index k. */
const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
|
|
|
|
const int16_t tagValues[] =
|
|
|
|
{
|
|
|
|
/* Mn */ (int16_t)NON_SPACING_MARK,
|
|
|
|
/* Mc */ (int16_t)COMBINING_SPACING_MARK,
|
|
|
|
/* Me */ (int16_t)ENCLOSING_MARK,
|
|
|
|
/* Nd */ (int16_t)DECIMAL_DIGIT_NUMBER,
|
|
|
|
/* Nl */ (int16_t)LETTER_NUMBER,
|
|
|
|
/* No */ (int16_t)OTHER_NUMBER,
|
|
|
|
/* Zs */ (int16_t)SPACE_SEPARATOR,
|
|
|
|
/* Zl */ (int16_t)LINE_SEPARATOR,
|
|
|
|
/* Zp */ (int16_t)PARAGRAPH_SEPARATOR,
|
|
|
|
/* Cc */ (int16_t)CONTROL,
|
|
|
|
/* Cf */ (int16_t)FORMAT,
|
|
|
|
/* Cs */ (int16_t)SURROGATE,
|
|
|
|
/* Co */ (int16_t)PRIVATE_USE,
|
|
|
|
/* Cn */ (int16_t)UNASSIGNED,
|
|
|
|
/* Lu */ (int16_t)UPPERCASE_LETTER,
|
|
|
|
/* Ll */ (int16_t)LOWERCASE_LETTER,
|
|
|
|
/* Lt */ (int16_t)TITLECASE_LETTER,
|
|
|
|
/* Lm */ (int16_t)MODIFIER_LETTER,
|
|
|
|
/* Lo */ (int16_t)OTHER_LETTER,
|
|
|
|
/* Pc */ (int16_t)CONNECTOR_PUNCTUATION,
|
|
|
|
/* Pd */ (int16_t)DASH_PUNCTUATION,
|
|
|
|
/* Ps */ (int16_t)START_PUNCTUATION,
|
|
|
|
/* Pe */ (int16_t)END_PUNCTUATION,
|
|
|
|
/* Po */ (int16_t)OTHER_PUNCTUATION,
|
|
|
|
/* Sm */ (int16_t)MATH_SYMBOL,
|
|
|
|
/* Sc */ (int16_t)CURRENCY_SYMBOL,
|
|
|
|
/* Sk */ (int16_t)MODIFIER_SYMBOL,
|
|
|
|
/* So */ (int16_t)OTHER_SYMBOL,
|
|
|
|
/* Pi */ (int16_t)INITIAL_PUNCTUATION,
|
|
|
|
/* Pf */ (int16_t)FINAL_PUNCTUATION
|
|
|
|
};
|
|
|
|
int
|
|
|
|
MakeProp(char* str)
|
|
|
|
{
|
|
|
|
int result = 0;
|
|
|
|
char* matchPosition;
|
|
|
|
|
|
|
|
matchPosition = strstr(tagStrings, str);
|
|
|
|
if (matchPosition == 0) fprintf(stderr, "unrecognized type letter %s", str);
|
|
|
|
else result = ((matchPosition - tagStrings) / 2);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Returns a pointer to the first character of the semicolon-separated field
 * number fieldIndex (0-based) inside line, or NULL when line holds fewer
 * than fieldIndex semicolons.
 */
static const char*
findField(const char* line, int fieldIndex)
{
    const char* cursor = line;

    while (cursor != NULL && fieldIndex-- > 0) {
        cursor = strchr(cursor, ';');
        if (cursor != NULL) {
            ++cursor;               // step over the separator
        }
    }
    return cursor;
}

/*
 * Reads a hexadecimal number from the start of text into *value.
 * Returns 1 on success. Returns 0 and leaves *value untouched when text is
 * NULL or does not start with a hexadecimal number (e.g. an empty field).
 */
static int
parseHexField(const char* text, int32_t* value)
{
    unsigned int parsed = 0;        // %X requires an unsigned int destination

    if (text == NULL || sscanf(text, "%X", &parsed) != 1) {
        return 0;
    }
    *value = (int32_t)parsed;
    return 1;
}

/**
 * Builds the global case-mapping table ulxfrmArray from a semicolon-separated
 * character data file and returns it.
 *
 * The table is built only once: when ulxfrmArray is already set it is
 * returned immediately and input is neither read nor closed. Otherwise every
 * line that does not start with '#' and is not empty is parsed:
 *   field 0   code unit (hex, must be below 65536)
 *   field 2   two-letter category tag
 *   field 7   digit value
 *   field 12  mapping stored for LOWERCASE_LETTER entries
 *   field 13  mapping stored for UPPERCASE_LETTER entries
 * Fields are located by index on the untouched line, so an empty or filled
 * earlier field cannot shift where later fields are read from.
 *
 * Side effects: for every DECIMAL_DIGIT_NUMBER / OTHER_NUMBER entry with a
 * digit value of 0..9 a " { 0xXXXX, d}, " row is written to cout; input is
 * closed after the last line; the table is compacted before returning.
 *
 * @param input  open data file; ownership passes to this function when the
 *               table still has to be built.
 * @return       ulxfrmArray (0 if input is NULL and no table exists yet).
 */
CompactShortArray*
getArray(FILE *input)
{
    if (ulxfrmArray != 0) {
        return ulxfrmArray;
    }
    if (input == NULL) {
        fprintf(stderr, "No input file to parse.\n");
        return ulxfrmArray;
    }

    try {
        char line[1000];

        ulxfrmArray = ucmp16_open((int16_t)0xffff);

        while (fgets(line, sizeof line, input) != NULL) {
            if (line[0] == '#' || line[0] == '\n' || line[0] == 0) continue;

            int32_t unicode = -1;           // stays -1 (and trips the assert) if unparsable
            int32_t otherunicode = 0xffff;  // default when the mapping field is empty
            int32_t digit = -1;             // default when the digit field is empty
            int hasMapping = FALSE;
            char type[3];
            const char* field;

            parseHexField(line, &unicode);
            assert(0 <= unicode && unicode < 65536);

            // category tag
            field = findField(line, 2);
            assert(field != NULL);
            strncpy(type, field, 2);
            type[2] = 0;
            int typeResult = tagValues[MakeProp(type)];

            // check for the decimal values
            field = findField(line, 7);
            assert(field != NULL);
            parseHexField(field, &digit);
            if (((typeResult == DECIMAL_DIGIT_NUMBER) || (typeResult == OTHER_NUMBER)) &&
                (digit >= 0 && digit <= 9)) {
                // formatted into its own buffer so the line being parsed stays intact
                char hexText[16];
                sprintf(hexText, "0x%04X", (unsigned int)unicode);
                cout << " { " << hexText << ", " << digit << "}, \n";
            }

            // the Unicode char has a equivalent uppercase
            field = findField(line, 12);
            assert(field != NULL);
            parseHexField(field, &otherunicode);
            if ((typeResult == LOWERCASE_LETTER) && (0 <= otherunicode && otherunicode < 65536)) {
                hasMapping = TRUE;
            }

            // uppercase entries take their mapping from the following field
            if ((typeResult == UPPERCASE_LETTER) && !hasMapping) {
                parseHexField(findField(line, 13), &otherunicode);
                if (0 <= otherunicode && otherunicode < 65536) {
                    hasMapping = TRUE;
                }
            }

            // only the first mapping seen for a code unit is kept
            if ((hasMapping == TRUE) && (ucmp16_get(ulxfrmArray, (UChar)unicode) == (int16_t)0xffff))
                ucmp16_set(ulxfrmArray, (UChar)unicode, (int16_t)otherunicode);
        }

        fclose(input);
        ucmp16_compact(ulxfrmArray);
    }
    catch (...) {
        fprintf(stderr, "Error Occured while parsing unicode data file.\n");
    }

    return ulxfrmArray;
}
|
|
|
|
|
|
|
|
void
|
|
|
|
writeArrays()
|
|
|
|
{
|
|
|
|
const int16_t* values = ucmp16_getArray(ulxfrmArray);
|
|
|
|
const uint16_t* indexes = ucmp16_getIndex(ulxfrmArray);
|
|
|
|
int32_t i;
|
|
|
|
int32_t cnt = ucmp16_getCount(ulxfrmArray);
|
|
|
|
cout << "\nconst uint32_t Unicode::caseIndex[] = {\n ";
|
|
|
|
for (i = 0; i < ucmp16_getkIndexCount()-1; i++)
|
|
|
|
{
|
|
|
|
cout << "(uint16_t)" << ((indexes[i] >= 0) ? (int)indexes[i] : (int)(indexes[i]+ucmp16_getkUnicodeCount()))
|
|
|
|
<< ", ";
|
|
|
|
if (i != 0)
|
|
|
|
if (i % 3 == 0)
|
|
|
|
cout << "\n ";
|
|
|
|
}
|
|
|
|
cout << " (uint16_t)" << ((indexes[ucmp16_getkIndexCount()-1] >= 0) ? (int)indexes[i] : (int)(indexes[i]+ucmp16_getkUnicodeCount()))
|
|
|
|
<< " };\n";
|
|
|
|
cout << "\nconst int16_t Unicode::caseValues[] = {\n ";
|
|
|
|
for (i = 0; i < cnt-1; i++)
|
|
|
|
{
|
|
|
|
cout << "(int16_t)" << (int16_t)values[i] << ", ";
|
|
|
|
if (i != 0)
|
|
|
|
if (i % 5 == 0)
|
|
|
|
cout << "\n ";
|
|
|
|
}
|
|
|
|
cout << " (char)" << (int16_t)values[cnt-1] << " }\n";
|
|
|
|
cout << "const int32_t Unicode::caseCount = " << cnt << ";\n";
|
|
|
|
}
|
|
|
|
/**
|
|
|
|
* The main function builds the CharType data array and prints it to System.out
|
|
|
|
*/
|
|
|
|
void main(int argc, char** argv)
|
|
|
|
{
|
|
|
|
CompactShortArray* arrays = 0;
|
|
|
|
FILE *input = 0;
|
|
|
|
if (argc != 2) {
|
|
|
|
printf("Usage : chartype filename\n\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
input = fopen(argv[1], "r");
|
|
|
|
if (input == 0) {
|
|
|
|
printf("Cannot open the input file: %s\n\n", argv[1]);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
arrays = getArray(input);
|
|
|
|
writeArrays();
|
|
|
|
}
|
|
|
|
|