/* * * Copyright (C) 1998-2000, International Business Machines * Corporation and others. All Rights Reserved. * * @version 1.0 06/19/98 * @author Helena Shih * Based on Taligent international support for C++ */ #include #include #include #include #include "ucmp16.h" #if U_IOSTREAM_SOURCE >= 199711 #include using namespace std; #elif U_IOSTREAM_SOURCE >= 198506 #include #endif CompactShortArray* ulxfrmArray = 0; enum ECharTypeMapping { UNASSIGNED = 0, UPPERCASE_LETTER = 1, LOWERCASE_LETTER = 2, TITLECASE_LETTER = 3, MODIFIER_LETTER = 4, OTHER_LETTER = 5, NON_SPACING_MARK = 6, ENCLOSING_MARK = 7, COMBINING_SPACING_MARK = 8, DECIMAL_DIGIT_NUMBER = 9, LETTER_NUMBER = 10, OTHER_NUMBER = 11, SPACE_SEPARATOR = 12, LINE_SEPARATOR = 13, PARAGRAPH_SEPARATOR = 14, CONTROL = 15, FORMAT = 16, PRIVATE_USE = 17, SURROGATE = 18, DASH_PUNCTUATION = 19, START_PUNCTUATION = 20, END_PUNCTUATION = 21, CONNECTOR_PUNCTUATION = 22, OTHER_PUNCTUATION = 23, MATH_SYMBOL = 24, CURRENCY_SYMBOL = 25, MODIFIER_SYMBOL = 26, OTHER_SYMBOL = 27, INITIAL_PUNCTUATION = 28, FINAL_PUNCTUATION = 29 }; static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD; const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf"; const int16_t tagValues[] = { /* Mn */ (int16_t)NON_SPACING_MARK, /* Mc */ (int16_t)COMBINING_SPACING_MARK, /* Me */ (int16_t)ENCLOSING_MARK, /* Nd */ (int16_t)DECIMAL_DIGIT_NUMBER, /* Nl */ (int16_t)LETTER_NUMBER, /* No */ (int16_t)OTHER_NUMBER, /* Zs */ (int16_t)SPACE_SEPARATOR, /* Zl */ (int16_t)LINE_SEPARATOR, /* Zp */ (int16_t)PARAGRAPH_SEPARATOR, /* Cc */ (int16_t)CONTROL, /* Cf */ (int16_t)FORMAT, /* Cs */ (int16_t)SURROGATE, /* Co */ (int16_t)PRIVATE_USE, /* Cn */ (int16_t)UNASSIGNED, /* Lu */ (int16_t)UPPERCASE_LETTER, /* Ll */ (int16_t)LOWERCASE_LETTER, /* Lt */ (int16_t)TITLECASE_LETTER, /* Lm */ (int16_t)MODIFIER_LETTER, /* Lo */ (int16_t)OTHER_LETTER, /* Pc */ (int16_t)CONNECTOR_PUNCTUATION, /* Pd */ (int16_t)DASH_PUNCTUATION, /* Ps */ (int16_t)START_PUNCTUATION, /* Pe */ (int16_t)END_PUNCTUATION, /* Po */ (int16_t)OTHER_PUNCTUATION, /* Sm */ (int16_t)MATH_SYMBOL, /* Sc */ (int16_t)CURRENCY_SYMBOL, /* Sk */ (int16_t)MODIFIER_SYMBOL, /* So */ (int16_t)OTHER_SYMBOL, /* Pi */ (int16_t)INITIAL_PUNCTUATION, /* Pf */ (int16_t)FINAL_PUNCTUATION }; int MakeProp(char* str) { int result = 0; char* matchPosition; matchPosition = strstr(tagStrings, str); if (matchPosition == 0) fprintf(stderr, "unrecognized type letter %s", str); else result = ((matchPosition - tagStrings) / 2); return result; } CompactShortArray* getArray(FILE *input) { if (ulxfrmArray == 0) { char buffer[1000]; char* bufferPtr; int set = FALSE; char type[3]; try { ulxfrmArray = ucmp16_open((int16_t)0xffff); int32_t unicode, otherunicode, digit, i; while (TRUE) { otherunicode = 0xffff; digit = -1; bufferPtr = fgets(buffer, 999, input); if (bufferPtr == NULL) break; if (bufferPtr[0] == '#' || bufferPtr[0] == '\n' || bufferPtr[0] == 0) continue; sscanf(bufferPtr, "%X", &unicode); assert(0 <= unicode && unicode < 65536); bufferPtr = strchr(bufferPtr, ';'); assert(bufferPtr != NULL); bufferPtr = strchr(bufferPtr + 1, ';'); strncpy(type, ++bufferPtr, 2); // go to start of third field assert(type != NULL); type[2] = 0; int typeResult = tagValues[MakeProp(type)]; // check for the decimal values bufferPtr++; for (i = 3; i < 8; i++) { bufferPtr = strchr(bufferPtr, ';'); assert(bufferPtr != NULL); bufferPtr++; } sscanf(bufferPtr, "%X", &digit); if (((typeResult == DECIMAL_DIGIT_NUMBER) || (typeResult == OTHER_NUMBER)) && (digit >= 0 && digit <= 9)){ buffer[10]; sprintf(buffer, "0x%04X", unicode); cout << " { " << buffer << ", " << digit << "}, \n"; } bufferPtr++; for (i = 8; i < 12; i++) { bufferPtr = strchr(bufferPtr, ';'); assert(bufferPtr != NULL); bufferPtr++; } sscanf(bufferPtr, "%X", &otherunicode); // the Unicode char has a equivalent uppercase if ((typeResult == LOWERCASE_LETTER) && (0 <= otherunicode && otherunicode < 65536)) { set = TRUE; } if ((typeResult == UPPERCASE_LETTER) && !set) { bufferPtr++; sscanf(bufferPtr, "%X", &otherunicode); if (0 <= otherunicode && otherunicode < 65536) { set = TRUE; } } if ((set == TRUE) && (ucmp16_get(ulxfrmArray, (UChar)unicode) == (int16_t)0xffff)) ucmp16_set(ulxfrmArray, (UChar)unicode, (int16_t)otherunicode); set = FALSE; } if (input) fclose(input); ucmp16_compact(ulxfrmArray); } catch (...) { fprintf(stderr, "Error Occured while parsing unicode data file.\n"); } } return ulxfrmArray; } void writeArrays() { const int16_t* values = ucmp16_getArray(ulxfrmArray); const uint16_t* indexes = ucmp16_getIndex(ulxfrmArray); int32_t i; int32_t cnt = ucmp16_getCount(ulxfrmArray); cout << "\nconst uint32_t Unicode::caseIndex[] = {\n "; for (i = 0; i < ucmp16_getkIndexCount()-1; i++) { cout << "(uint16_t)" << ((indexes[i] >= 0) ? (int)indexes[i] : (int)(indexes[i]+ucmp16_getkUnicodeCount())) << ", "; if (i != 0) if (i % 3 == 0) cout << "\n "; } cout << " (uint16_t)" << ((indexes[ucmp16_getkIndexCount()-1] >= 0) ? (int)indexes[i] : (int)(indexes[i]+ucmp16_getkUnicodeCount())) << " };\n"; cout << "\nconst int16_t Unicode::caseValues[] = {\n "; for (i = 0; i < cnt-1; i++) { cout << "(int16_t)" << (int16_t)values[i] << ", "; if (i != 0) if (i % 5 == 0) cout << "\n "; } cout << " (char)" << (int16_t)values[cnt-1] << " }\n"; cout << "const int32_t Unicode::caseCount = " << cnt << ";\n"; } /** * The main function builds the CharType data array and prints it to System.out */ void main(int argc, char** argv) { CompactShortArray* arrays = 0; FILE *input = 0; if (argc != 2) { printf("Usage : chartype filename\n\n"); exit(1); } input = fopen(argv[1], "r"); if (input == 0) { printf("Cannot open the input file: %s\n\n", argv[1]); exit(1); } arrays = getArray(input); writeArrays(); }