scuffed-code/icu4c/source/tools/ulxfrm/ulxfrm.cpp
Andy Heninger 360f38dc1e ICU-903 updated copyright notices.
X-SVN-Rev: 4245
2001-03-21 23:22:16 +00:00

230 lines
6.8 KiB
C++

/*
*
* Copyright (C) 1998-2000, International Business Machines
* Corporation and others. All Rights Reserved.
*
* @version 1.0 06/19/98
* @author Helena Shih
* Based on Taligent international support for C++
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "ucmp16.h"
#if U_IOSTREAM_SOURCE >= 199711
#include <iostream>
using namespace std;
#elif U_IOSTREAM_SOURCE >= 198506
#include <iostream.h>
#endif
// The case mapping array shared by this tool: getArray() creates and fills it
// on first use, and writeArrays() reads it. Stays 0 until getArray() has run.
CompactShortArray* ulxfrmArray = 0;
/**
 * Numeric codes for the character types handled by this tool. The two-letter
 * tag beside each enumerator is the category abbreviation that the tagValues
 * table associates with it.
 */
enum ECharTypeMapping {
    UNASSIGNED              =  0,   // Cn
    UPPERCASE_LETTER        =  1,   // Lu
    LOWERCASE_LETTER        =  2,   // Ll
    TITLECASE_LETTER        =  3,   // Lt
    MODIFIER_LETTER         =  4,   // Lm
    OTHER_LETTER            =  5,   // Lo
    NON_SPACING_MARK        =  6,   // Mn
    ENCLOSING_MARK          =  7,   // Me
    COMBINING_SPACING_MARK  =  8,   // Mc
    DECIMAL_DIGIT_NUMBER    =  9,   // Nd
    LETTER_NUMBER           = 10,   // Nl
    OTHER_NUMBER            = 11,   // No
    SPACE_SEPARATOR         = 12,   // Zs
    LINE_SEPARATOR          = 13,   // Zl
    PARAGRAPH_SEPARATOR     = 14,   // Zp
    CONTROL                 = 15,   // Cc
    FORMAT                  = 16,   // Cf
    PRIVATE_USE             = 17,   // Co
    SURROGATE               = 18,   // Cs
    DASH_PUNCTUATION        = 19,   // Pd
    START_PUNCTUATION       = 20,   // Ps
    END_PUNCTUATION         = 21,   // Pe
    CONNECTOR_PUNCTUATION   = 22,   // Pc
    OTHER_PUNCTUATION       = 23,   // Po
    MATH_SYMBOL             = 24,   // Sm
    CURRENCY_SYMBOL         = 25,   // Sc
    MODIFIER_SYMBOL         = 26,   // Sk
    OTHER_SYMBOL            = 27,   // So
    INITIAL_PUNCTUATION     = 28,   // Pi
    FINAL_PUNCTUATION       = 29    // Pf
};
// NOTE(review): not referenced by any code visible in this file.
static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;

// All category abbreviations packed back to back, two characters apiece.
// The abbreviation starting at character 2*i belongs to tagValues[i], so the
// order here and in tagValues must be kept in step.
const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";

// The ECharTypeMapping code for each abbreviation in tagStrings, same order.
const int16_t tagValues[] =
{
    // marks
    /* Mn */ static_cast<int16_t>(NON_SPACING_MARK),
    /* Mc */ static_cast<int16_t>(COMBINING_SPACING_MARK),
    /* Me */ static_cast<int16_t>(ENCLOSING_MARK),

    // numbers
    /* Nd */ static_cast<int16_t>(DECIMAL_DIGIT_NUMBER),
    /* Nl */ static_cast<int16_t>(LETTER_NUMBER),
    /* No */ static_cast<int16_t>(OTHER_NUMBER),

    // separators
    /* Zs */ static_cast<int16_t>(SPACE_SEPARATOR),
    /* Zl */ static_cast<int16_t>(LINE_SEPARATOR),
    /* Zp */ static_cast<int16_t>(PARAGRAPH_SEPARATOR),

    // others
    /* Cc */ static_cast<int16_t>(CONTROL),
    /* Cf */ static_cast<int16_t>(FORMAT),
    /* Cs */ static_cast<int16_t>(SURROGATE),
    /* Co */ static_cast<int16_t>(PRIVATE_USE),
    /* Cn */ static_cast<int16_t>(UNASSIGNED),

    // letters
    /* Lu */ static_cast<int16_t>(UPPERCASE_LETTER),
    /* Ll */ static_cast<int16_t>(LOWERCASE_LETTER),
    /* Lt */ static_cast<int16_t>(TITLECASE_LETTER),
    /* Lm */ static_cast<int16_t>(MODIFIER_LETTER),
    /* Lo */ static_cast<int16_t>(OTHER_LETTER),

    // punctuation
    /* Pc */ static_cast<int16_t>(CONNECTOR_PUNCTUATION),
    /* Pd */ static_cast<int16_t>(DASH_PUNCTUATION),
    /* Ps */ static_cast<int16_t>(START_PUNCTUATION),
    /* Pe */ static_cast<int16_t>(END_PUNCTUATION),
    /* Po */ static_cast<int16_t>(OTHER_PUNCTUATION),

    // symbols
    /* Sm */ static_cast<int16_t>(MATH_SYMBOL),
    /* Sc */ static_cast<int16_t>(CURRENCY_SYMBOL),
    /* Sk */ static_cast<int16_t>(MODIFIER_SYMBOL),
    /* So */ static_cast<int16_t>(OTHER_SYMBOL),

    // quotation punctuation
    /* Pi */ static_cast<int16_t>(INITIAL_PUNCTUATION),
    /* Pf */ static_cast<int16_t>(FINAL_PUNCTUATION)
};
/**
 * Looks up a two-letter category abbreviation (for example "Lu") in
 * tagStrings and returns its position, which is also the matching index
 * into tagValues.
 *
 * Only an exact two-character tag on a two-character boundary is accepted;
 * a plain substring search would also match "", single letters, and pairs
 * that straddle two neighbouring tags.
 *
 * @param str NUL-terminated abbreviation to look up; may be NULL.
 * @return the 0-based index of the tag. When str is NULL or not a known tag
 *         the problem is reported on stderr and 0 is returned; note that 0
 *         is also the index of the first tag, so callers cannot tell the
 *         two cases apart from the return value alone.
 */
int
MakeProp(char* str)
{
    if (str != 0 && strlen(str) == 2) {
        int index = 0;
        // Walk the packed table one two-character tag at a time.
        for (const char* tag = tagStrings; tag[0] != 0 && tag[1] != 0; tag += 2, ++index) {
            if (tag[0] == str[0] && tag[1] == str[1]) return index;
        }
    }
    fprintf(stderr, "unrecognized type letter %s\n", (str != 0) ? str : "(null)");
    return 0;
}
/**
 * Returns a pointer just past the count-th ';' found at or after p, that is,
 * the start of the field lying count fields further along the line.
 * Returns NULL when p is NULL or the line runs out of separators.
 */
static char*
skipFields(char* p, int count)
{
    while (p != NULL && count-- > 0) {
        p = strchr(p, ';');
        if (p != NULL) ++p;
    }
    return p;
}

/**
 * Parses the hexadecimal number at the start of field into *value.
 * Returns non-zero on success. On failure (NULL pointer, or no hex digits
 * at that position, as with an empty field) *value is left untouched so the
 * caller's default stays in effect.
 */
static int
scanHexField(const char* field, int32_t* value)
{
    unsigned int parsed = 0;    // %X needs an unsigned int target
    if (field == NULL || sscanf(field, "%X", &parsed) != 1) return 0;
    *value = (int32_t)parsed;
    return 1;
}

/**
 * Builds the global case mapping array (ulxfrmArray) from a ';'-separated
 * Unicode data file the first time it is called, and returns it.
 *
 * For every usable line the function reads the code point (field 0), the
 * category abbreviation (field 2), the digit value (field 7) and the
 * upper/lower case mappings (fields 12 and 13):
 *  - for Nd/No characters whose digit value is 0..9 a "{ code, digit }" row
 *    is printed to standard output;
 *  - a lowercase letter is stored with its uppercase mapping, an uppercase
 *    letter with its lowercase mapping, unless the entry was already set.
 * Lines that start with '#', are empty, lack the needed fields or carry a
 * code point outside 0..0xFFFF are skipped; the latter two are reported on
 * stderr.
 *
 * @param input open data file. It is closed once it has been read. It is
 *              neither read nor closed when the array already exists.
 * @return the array, or 0 when input is 0 or the array could not be created.
 */
CompactShortArray*
getArray(FILE *input)
{
    if (ulxfrmArray != 0) return ulxfrmArray;   // built by an earlier call
    if (input == 0) {
        fprintf(stderr, "No unicode data file to read.\n");
        return ulxfrmArray;
    }
    try {
        ulxfrmArray = ucmp16_open((int16_t)0xffff);
        if (ulxfrmArray == 0) {
            fprintf(stderr, "Cannot create the case mapping array.\n");
            fclose(input);
            return ulxfrmArray;
        }

        char line[1000];
        while (fgets(line, (int)sizeof(line), input) != NULL) {
            if (line[0] == '#' || line[0] == '\n' || line[0] == 0) continue;

            // Locate every field by counting separators from a known field,
            // so an empty or non-empty neighbour cannot shift the position.
            char* categoryField = skipFields(line, 2);
            char* digitField = skipFields(categoryField, 5);
            char* upperField = skipFields(digitField, 5);
            char* lowerField = skipFields(upperField, 1);

            int32_t unicode = -1;
            if (!scanHexField(line, &unicode) || unicode < 0 || unicode >= 65536 ||
                upperField == NULL) {
                // The line still carries the newline fgets() stored, if any.
                fprintf(stderr, "Skipping unusable line: %s", line);
                continue;
            }

            char type[3];
            strncpy(type, categoryField, 2);
            type[2] = 0;
            const int typeResult = tagValues[MakeProp(type)];

            // check for the decimal values
            int32_t digit = -1;
            scanHexField(digitField, &digit);
            if ((typeResult == DECIMAL_DIGIT_NUMBER || typeResult == OTHER_NUMBER) &&
                (digit >= 0 && digit <= 9)) {
                // Separate buffer: the line itself is still being parsed.
                // unicode is at most 0xFFFF here, so the text is 6 characters.
                char codeText[16];
                sprintf(codeText, "0x%04X", (unsigned int)unicode);
                cout << " { " << codeText << ", " << digit << "}, \n";
            }

            // A missing mapping leaves 0xffff, which passes the range test
            // below and is then stored as is; that is the same value the
            // array was opened with.
            int32_t otherunicode = 0xffff;
            int mapped = 0;
            if (typeResult == LOWERCASE_LETTER) {
                // the Unicode char has a equivalent uppercase
                scanHexField(upperField, &otherunicode);
                mapped = (0 <= otherunicode && otherunicode < 65536);
            } else if (typeResult == UPPERCASE_LETTER) {
                scanHexField(lowerField, &otherunicode);
                mapped = (0 <= otherunicode && otherunicode < 65536);
            }

            // Keep the first mapping seen for a code point.
            if (mapped && (ucmp16_get(ulxfrmArray, (UChar)unicode) == (int16_t)0xffff))
                ucmp16_set(ulxfrmArray, (UChar)unicode, (int16_t)otherunicode);
        }
        fclose(input);
        ucmp16_compact(ulxfrmArray);
    }
    catch (...) {
        fprintf(stderr, "Error Occured while parsing unicode data file.\n");
    }
    return ulxfrmArray;
}
/**
 * Prints the contents of the global ulxfrmArray to standard output as C++
 * source text: the Unicode::caseIndex table, the Unicode::caseValues table
 * and Unicode::caseCount.
 *
 * ulxfrmArray is only created by getArray(), so that must have run first;
 * when the array is missing an error is reported on stderr and nothing is
 * printed.
 *
 * NOTE(review): caseIndex is declared as uint32_t in the emitted text while
 * its elements are cast to uint16_t; confirm against the declaration in the
 * Unicode class before changing either.
 */
void
writeArrays()
{
    if (ulxfrmArray == 0) {
        fprintf(stderr, "No case mapping array to write.\n");
        return;
    }

    const int16_t* values = ucmp16_getArray(ulxfrmArray);
    const uint16_t* indexes = ucmp16_getIndex(ulxfrmArray);
    const int32_t indexCount = ucmp16_getkIndexCount();
    const int32_t cnt = ucmp16_getCount(ulxfrmArray);
    int32_t i;

    // The index entries are unsigned, so they are printed as they are; a
    // "negative entry" adjustment could never apply.
    cout << "\nconst uint32_t Unicode::caseIndex[] = {\n ";
    for (i = 0; i < indexCount - 1; i++)
    {
        cout << "(uint16_t)" << (int)indexes[i] << ", ";
        if (i != 0 && i % 3 == 0)
            cout << "\n ";
    }
    if (indexCount > 0)
        cout << " (uint16_t)" << (int)indexes[indexCount - 1];
    cout << " };\n";

    cout << "\nconst int16_t Unicode::caseValues[] = {\n ";
    for (i = 0; i < cnt - 1; i++)
    {
        cout << "(int16_t)" << values[i] << ", ";
        if (i != 0 && i % 5 == 0)
            cout << "\n ";
    }
    // The last element gets the same int16_t cast as the others (a char cast
    // would truncate a 16-bit value) and the initializer is closed with ';'.
    if (cnt > 0)
        cout << " (int16_t)" << values[cnt - 1];
    cout << " };\n";

    cout << "const int32_t Unicode::caseCount = " << cnt << ";\n";
}
/**
 * Entry point. Opens the data file named by the single command-line
 * argument, builds the case mapping array from it and prints the generated
 * tables to standard output.
 *
 * @return 0 on success; 1 when the argument count is wrong, the file cannot
 *         be opened, or the array could not be built.
 */
int main(int argc, char** argv)
{
    if (argc != 2) {
        printf("Usage : chartype filename\n\n");
        return 1;
    }

    FILE *input = fopen(argv[1], "r");
    if (input == 0) {
        printf("Cannot open the input file: %s\n\n", argv[1]);
        return 1;
    }

    // getArray() closes the file once it has read it.
    if (getArray(input) == 0) {
        fprintf(stderr, "Cannot build the case mapping array.\n");
        return 1;
    }
    writeArrays();
    return 0;
}