scuffed-code/icu4c/source/tools/ulxfrm/ulxfrm.cpp

/*
 *
 *   Copyright (C) 1998-2000, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 * @version	1.0 06/19/98
 * @author	Helena Shih
 * Based on Taligent international support for C++
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "ucmp16.h"

#if U_IOSTREAM_SOURCE >= 199711
#include <iostream>
using namespace std;
#elif U_IOSTREAM_SOURCE >= 198506
#include <iostream.h>
#endif

CompactShortArray* ulxfrmArray = 0;

	enum	ECharTypeMapping {
		UNASSIGNED				= 0,
	UPPERCASE_LETTER		= 1,
	LOWERCASE_LETTER		= 2,
	TITLECASE_LETTER		= 3,
	MODIFIER_LETTER			= 4,
	OTHER_LETTER			= 5,
	NON_SPACING_MARK		= 6,
	ENCLOSING_MARK			= 7,
	COMBINING_SPACING_MARK	= 8,
	DECIMAL_DIGIT_NUMBER	= 9,
	LETTER_NUMBER			= 10,
	OTHER_NUMBER			= 11,
	SPACE_SEPARATOR			= 12,
	LINE_SEPARATOR			= 13,
	PARAGRAPH_SEPARATOR		= 14,
	CONTROL					= 15,
	FORMAT					= 16,
	PRIVATE_USE				= 17,
	SURROGATE				= 18,
	DASH_PUNCTUATION		= 19,
	START_PUNCTUATION		= 20,
	END_PUNCTUATION			= 21,
	CONNECTOR_PUNCTUATION	= 22,
	OTHER_PUNCTUATION		= 23,
	MATH_SYMBOL				= 24,
	CURRENCY_SYMBOL			= 25,
	MODIFIER_SYMBOL			= 26,
	OTHER_SYMBOL			= 27,
	INITIAL_PUNCTUATION     = 28,
	FINAL_PUNCTUATION       = 29
	};

static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD;
const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
const int16_t tagValues[] =
	{
	/* Mn */ (int16_t)NON_SPACING_MARK,
	/* Mc */ (int16_t)COMBINING_SPACING_MARK,
	/* Me */ (int16_t)ENCLOSING_MARK,
	/* Nd */ (int16_t)DECIMAL_DIGIT_NUMBER,
	/* Nl */ (int16_t)LETTER_NUMBER,
	/* No */ (int16_t)OTHER_NUMBER,
	/* Zs */ (int16_t)SPACE_SEPARATOR,
	/* Zl */ (int16_t)LINE_SEPARATOR,
	/* Zp */ (int16_t)PARAGRAPH_SEPARATOR,
	/* Cc */ (int16_t)CONTROL,
	/* Cf */ (int16_t)FORMAT,
	/* Cs */ (int16_t)SURROGATE,
	/* Co */ (int16_t)PRIVATE_USE,
	/* Cn */ (int16_t)UNASSIGNED,
	/* Lu */ (int16_t)UPPERCASE_LETTER,
	/* Ll */ (int16_t)LOWERCASE_LETTER,
	/* Lt */ (int16_t)TITLECASE_LETTER,
	/* Lm */ (int16_t)MODIFIER_LETTER,
	/* Lo */ (int16_t)OTHER_LETTER,
	/* Pc */ (int16_t)CONNECTOR_PUNCTUATION,
	/* Pd */ (int16_t)DASH_PUNCTUATION,
	/* Ps */ (int16_t)START_PUNCTUATION,
	/* Pe */ (int16_t)END_PUNCTUATION,
	/* Po */ (int16_t)OTHER_PUNCTUATION,
	/* Sm */ (int16_t)MATH_SYMBOL,
	/* Sc */ (int16_t)CURRENCY_SYMBOL,
	/* Sk */ (int16_t)MODIFIER_SYMBOL,
	/* So */ (int16_t)OTHER_SYMBOL,
	/* Pi */ (int16_t)INITIAL_PUNCTUATION,
	/* Pf */ (int16_t)FINAL_PUNCTUATION
	};
int
MakeProp(char* str)
{
	int result = 0;
	char* matchPosition;

	matchPosition = strstr(tagStrings, str);
	if (matchPosition == 0) fprintf(stderr, "unrecognized type letter %s", str);
	else result = ((matchPosition - tagStrings) / 2);
	return result;
}

CompactShortArray*
getArray(FILE *input)
{
	if (ulxfrmArray == 0) {
		char	buffer[1000];
		char*	bufferPtr;
        int  set = FALSE;
        char type[3];

		try {
			ulxfrmArray = ucmp16_open((int16_t)0xffff);
			int32_t unicode, otherunicode, digit, i;
			while (TRUE) {
                otherunicode = 0xffff;
                digit = -1;
				bufferPtr = fgets(buffer, 999, input);
				if (bufferPtr == NULL) break;
				if (bufferPtr[0] == '#' || bufferPtr[0] == '\n' || bufferPtr[0] == 0) continue;
				sscanf(bufferPtr, "%X", &unicode);
				assert(0 <= unicode && unicode < 65536);
				bufferPtr = strchr(bufferPtr, ';');
				assert(bufferPtr != NULL);
                bufferPtr = strchr(bufferPtr + 1, ';');
				strncpy(type, ++bufferPtr, 2);	// go to start of third field
				assert(type != NULL);
				type[2] = 0;
  				int typeResult = tagValues[MakeProp(type)];
                // check for the decimal values
                bufferPtr++;
                for (i = 3; i < 8; i++) {
				    bufferPtr = strchr(bufferPtr, ';');
				    assert(bufferPtr != NULL);
				    bufferPtr++;
                }
    	    	sscanf(bufferPtr, "%X", &digit);
                if (((typeResult == DECIMAL_DIGIT_NUMBER) || (typeResult == OTHER_NUMBER)) &&
                    (digit >= 0 && digit <= 9)){
                    buffer[10];
                    sprintf(buffer, "0x%04X", unicode);
                    cout << "    { " << buffer << ", " << digit << "}, \n";
                }
                bufferPtr++;
                for (i = 8; i < 12; i++) {
				    bufferPtr = strchr(bufferPtr, ';');
				    assert(bufferPtr != NULL);
				    bufferPtr++;
                }
				sscanf(bufferPtr, "%X", &otherunicode);
                // the Unicode char has a equivalent uppercase
                if ((typeResult == LOWERCASE_LETTER) && (0 <= otherunicode && otherunicode < 65536)) {
                    set = TRUE;
                }
                if ((typeResult == UPPERCASE_LETTER) && !set) {
                    bufferPtr++;
			    	sscanf(bufferPtr, "%X", &otherunicode);
                    if (0 <= otherunicode && otherunicode < 65536) {
                        set = TRUE;
                    }
                }
				if ((set == TRUE) && (ucmp16_get(ulxfrmArray, (UChar)unicode) == (int16_t)0xffff))
					ucmp16_set(ulxfrmArray, (UChar)unicode, (int16_t)otherunicode);
                set = FALSE;
                }

			if (input) fclose(input);
			ucmp16_compact(ulxfrmArray);
		}
		catch (...) {
			fprintf(stderr, "Error Occured while parsing unicode data file.\n");
		}
	}
	return ulxfrmArray;
}

void
writeArrays()
{
	const int16_t* values = ucmp16_getArray(ulxfrmArray);
	const uint16_t* indexes = ucmp16_getIndex(ulxfrmArray);
	int32_t i;
    int32_t cnt = ucmp16_getCount(ulxfrmArray);
    cout << "\nconst uint32_t Unicode::caseIndex[] = {\n    ";
    for (i = 0; i < ucmp16_getkIndexCount()-1; i++)
    {
        cout << "(uint16_t)" << ((indexes[i] >= 0) ? (int)indexes[i] : (int)(indexes[i]+ucmp16_getkUnicodeCount()))
                         << ", ";
        if (i != 0)
            if (i % 3 == 0)
                cout << "\n    ";
    }
    cout << "    (uint16_t)" << ((indexes[ucmp16_getkIndexCount()-1] >= 0) ? (int)indexes[i] : (int)(indexes[i]+ucmp16_getkUnicodeCount()))
                       << " };\n";
    cout << "\nconst int16_t Unicode::caseValues[] = {\n    ";
    for (i = 0; i < cnt-1; i++)
    {
        cout << "(int16_t)" << (int16_t)values[i] << ", ";
        if (i != 0)
            if (i % 5 == 0)
                cout << "\n    ";
    }
    cout << "    (char)" << (int16_t)values[cnt-1] << " }\n";
	cout << "const int32_t Unicode::caseCount = " << cnt << ";\n";
}
/**
 * The main function builds the CharType data array and prints it to System.out
 */
void main(int argc, char** argv)
{
	CompactShortArray* arrays = 0;
    FILE *input = 0;
    if (argc != 2) {
        printf("Usage : chartype filename\n\n");
        exit(1);
    }
    input = fopen(argv[1], "r");
    if (input == 0) {
        printf("Cannot open the input file: %s\n\n", argv[1]);
        exit(1);
    }
	arrays = getArray(input);
	writeArrays();
}