diff --git a/icu4c/source/tools/genuca/Makefile.in b/icu4c/source/tools/genuca/Makefile.in new file mode 100644 index 0000000000..0ebc04c86b --- /dev/null +++ b/icu4c/source/tools/genuca/Makefile.in @@ -0,0 +1,121 @@ +## Makefile.in for ICU - tools/genuca +## Copyright (c) 1999, 2000, International Business Machines Corporation and +## others. All Rights Reserved. + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Platform-specific setup + +include @platform_make_fragment@ + +## + +SECTION = 8 + +MAN_FILES = $(TARGET).$(SECTION) + +## Build directory information +subdir = tools/genuca + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(TARGET).$(SECTION) $(DEPS) + +## Target information +TARGET = genuca + +ENABLE_STATIC = @ENABLE_STATIC@ + +ifneq ($(ENABLE_STATIC),) +LINK = $(LINK.cc) +else +LINK = $(LINK.c) +endif + +DEFS = @DEFS@ +CPPFLAGS = @CPPFLAGS@ -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/extra/ustdio -I$(srcdir)/../toolutil +CFLAGS = @CFLAGS@ +CXXFLAGS = @CXXFLAGS@ +ENABLE_RPATH = @ENABLE_RPATH@ +ifeq ($(ENABLE_RPATH),YES) +RPATHLDFLAGS = $(LD_RPATH)$(LD_RPATH_PRE)$(libdir) +endif +LDFLAGS = @LDFLAGS@ $(RPATHLDFLAGS) +LIBS = $(LIBUSTDIO) $(LIBICUI18N) $(LIBICUTOOLUTIL) $(LIBICUUC) @LIBS@ @LIB_M@ + +OBJECTS = UCAData.o cnttable.o + +DEPS = $(OBJECTS:.o=.d) + + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check \ +check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)/$(TARGET) + + $@ + 
+$(TARGET).pdf: $(TARGET).ps + ps2pdf $< $@ + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif diff --git a/icu4c/source/tools/genuca/cnttable.cpp b/icu4c/source/tools/genuca/cnttable.cpp new file mode 100644 index 0000000000..c8722c1898 --- /dev/null +++ b/icu4c/source/tools/genuca/cnttable.cpp @@ -0,0 +1,345 @@ +#include "cnttable.h" +#include "cmemory.h" + +void uprv_growTable(ContractionTable *tbl, UErrorCode *status) { + if(tbl->position == tbl->size) { + uint32_t *newData = (uint32_t *)realloc(tbl->CEs, 2*tbl->size*sizeof(uint32_t)); + UChar *newCPs = (UChar *)realloc(tbl->codePoints, 2*tbl->size*sizeof(UChar)); + if(newData == NULL || newCPs == NULL) { + fprintf(stderr, "out of memory for contractions\n"); + *status = U_MEMORY_ALLOCATION_ERROR; + return; + } + tbl->CEs = newData; + tbl->codePoints = newCPs; + tbl->size *= 2; + } +} + +CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status) { + if(U_FAILURE(*status)) { + return 0; + } + CntTable *tbl = (CntTable *)malloc(sizeof(CntTable)); + tbl->mapping = mapping; + //tbl->elements = uhash_open(uhash_hashLong, uhash_compareLong, status); + //uhash_setValueDeleter(tbl->elements, deleteCntElement); + tbl->elements = (ContractionTable **)malloc(INIT_EXP_TABLE_SIZE*sizeof(ContractionTable *)); + tbl->capacity = INIT_EXP_TABLE_SIZE; + memset(tbl->elements, 0, INIT_EXP_TABLE_SIZE*sizeof(ContractionTable *)); + tbl->size = 0; + tbl->position = 0; + tbl->CEs = NULL; + tbl->codePoints = NULL; + tbl->offsets = NULL; + return tbl; +} + +ContractionTable *addATableElement(CntTable *table, uint32_t *key, UErrorCode *status) { + ContractionTable *el = (ContractionTable *)malloc(sizeof(ContractionTable)); + el->CEs = (uint32_t *)malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t)); + el->codePoints = (UChar *)malloc(INIT_EXP_TABLE_SIZE*sizeof(UChar)); + el->position = 0; + el->size = INIT_EXP_TABLE_SIZE; + el->forward = TRUE; + 
memset(el->CEs, 'F', INIT_EXP_TABLE_SIZE*sizeof(uint32_t)); + memset(el->codePoints, 'F', INIT_EXP_TABLE_SIZE*sizeof(UChar)); + + el->reversed = (ContractionTable *)malloc(sizeof(ContractionTable)); + el->reversed->CEs = (uint32_t *)malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t)); + el->reversed->codePoints = (UChar *)malloc(INIT_EXP_TABLE_SIZE*sizeof(UChar)); + el->reversed->position = 0; + el->reversed->size = INIT_EXP_TABLE_SIZE; + el->reversed->forward = FALSE; + memset(el->reversed->CEs, 'R', INIT_EXP_TABLE_SIZE*sizeof(uint32_t)); + memset(el->reversed->codePoints, 'R', INIT_EXP_TABLE_SIZE*sizeof(UChar)); + + table->elements[table->size] = el; + + //uhash_put(table->elements, (void *)table->size, el, status); + + *key = table->size++; + + if(table->size > table->capacity) { + // do realloc + *status = U_MEMORY_ALLOCATION_ERROR; + } + + return el; +} + + +int32_t uprv_cnttab_constructTable(CntTable *table, uint32_t mainOffset, UErrorCode *status) { + if(U_FAILURE(*status)) { + return 0; + } + int32_t i = 0, j = 0; + + table->position = 0; + + if(table->offsets != NULL) { + free(table->offsets); + } + table->offsets = (int32_t *)malloc(table->size*sizeof(int32_t)); + + + /* See how much memory we need */ + for(i = 0; isize; i++) { + table->offsets[i] = table->position+mainOffset; + table->position += table->elements[i]->position; + if(table->elements[i]->reversed->position > 0) { + table->elements[i]->codePoints[0] = table->elements[i]->position; /* set offset for backwards table */ + table->position += table->elements[i]->reversed->position-1; + } + } + + /* Allocate it */ + if(table->CEs != NULL) { + free(table->CEs); + } + table->CEs = (uint32_t *)malloc(table->position*sizeof(uint32_t)); + memset(table->CEs, '?', table->position*sizeof(uint32_t)); + if(table->codePoints != NULL) { + free(table->codePoints); + } + table->codePoints = (UChar *)malloc(table->position*sizeof(UChar)); + memset(table->codePoints, '?', table->position*sizeof(UChar)); + + /* Now stuff 
the things in*/ + + UChar *cpPointer = table->codePoints; + uint32_t *CEPointer = table->CEs; + for(i = 0; isize; i++) { + int32_t size = table->elements[i]->position; + memcpy(cpPointer, table->elements[i]->codePoints, size*sizeof(UChar)); + memcpy(CEPointer, table->elements[i]->CEs, size*sizeof(uint32_t)); + for(j = 0; joffsets[getContractOffset(*(CEPointer+j))]); + } + } + cpPointer += size; + CEPointer += size; + if(table->elements[i]->reversed->position-1 > 0) { + int32_t size2 = table->elements[i]->reversed->position-1; + memcpy(cpPointer, (table->elements[i]->reversed->codePoints)+1, size2*sizeof(UChar)); + memcpy(CEPointer, (table->elements[i]->reversed->CEs)+1, size2*sizeof(uint32_t)); + for(j = 0; joffsets[getContractOffset(*(CEPointer+j))]); + } + } + cpPointer += size2; + CEPointer += size2; + } + } + + + uint32_t CE; + for(i = 0; i<=0xFFFF; i++) { + CE = ucmp32_get(table->mapping, i); + if(isContraction(CE)) { + CE = constructContractCE(table->offsets[getContractOffset(CE)]); + ucmp32_set(table->mapping, i, CE); + } + } + + + return table->position; +} + +void uprv_cnttab_close(CntTable *table) { + int32_t i = 0; + for(i = 0; isize; i++) { + free(table->elements[i]->reversed->CEs); + free(table->elements[i]->reversed->codePoints); + free(table->elements[i]->reversed); + free(table->elements[i]->CEs); + free(table->elements[i]->codePoints); + free(table->elements[i]); + } + free(table->CEs); + free(table->offsets); + free(table->codePoints); + free(table); +} + +/* this is for adding non contractions */ +uint32_t uprv_cnttab_changeLastCE(CntTable *table, uint32_t element, uint32_t value, UBool forward, UErrorCode *status) { + element &= 0xFFFFFF; + + ContractionTable *tbl = NULL; + if(U_FAILURE(*status)) { + return 0; + } + + if((element == 0xFFFFFF) || (tbl = table->elements[element]) == NULL) { + tbl = addATableElement(table, &element, status); + } + + if(forward == TRUE) { + tbl->CEs[tbl->position-1] = value; + } else { + 
tbl->reversed->CEs[tbl->reversed->position-1] = value; + } + + return(constructContractCE(element)); +} + + +/* inserts a part of contraction sequence in table. Sequences behind the offset are moved back. If element is non existent, it creates on. Returns element handle */ +uint32_t uprv_cnttab_insertContraction(CntTable *table, uint32_t element, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status) { + + element &= 0xFFFFFF; + ContractionTable *tbl = NULL; + + if(U_FAILURE(*status)) { + return 0; + } + + if((element == 0xFFFFFF) || (tbl = table->elements[element]) == NULL) { + tbl = addATableElement(table, &element, status); + } + + if(forward == FALSE) { + tbl = tbl->reversed; + } + + uprv_growTable(tbl, status); + + int32_t offset = 0; + + + while(tbl->codePoints[offset] < codePoint && offsetposition) { + offset++; + } + + int32_t i = tbl->position; + for(i = tbl->position; i > offset; i--) { + tbl->CEs[i] = tbl->CEs[i-1]; + tbl->codePoints[i] = tbl->codePoints[i-1]; + } + + tbl->CEs[offset] = value; + tbl->codePoints[offset] = codePoint; + + tbl->position++; + + return(constructContractCE(element)); +} + + +/* adds more contractions in table. If element is non existant, it creates on. Returns element handle */ +uint32_t uprv_cnttab_addContraction(CntTable *table, uint32_t element, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status) { + + element &= 0xFFFFFF; + + ContractionTable *tbl = NULL; + + if(U_FAILURE(*status)) { + return 0; + } + + if((element == 0xFFFFFF) || (tbl = table->elements[element]) == NULL) { + tbl = addATableElement(table, &element, status); + } + + if(forward == FALSE) { + tbl = tbl->reversed; + } + + uprv_growTable(tbl, status); + + tbl->CEs[tbl->position] = value; + tbl->codePoints[tbl->position] = codePoint; + + tbl->position++; + + return(constructContractCE(element)); +} + +/* sets a part of contraction sequence in table. If element is non existant, it creates on. 
Returns element handle */ +uint32_t uprv_cnttab_setContraction(CntTable *table, uint32_t element, int32_t offset, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status) { + + element &= 0xFFFFFF; + ContractionTable *tbl = NULL; + + if(U_FAILURE(*status)) { + return 0; + } + + if((element == 0xFFFFFF) || (tbl = table->elements[element]) == NULL) { + tbl = addATableElement(table, &element, status); + } + + if(forward == FALSE) { + tbl = tbl->reversed; + } + + if(offset >= tbl->size) { + *status = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + tbl->CEs[offset] = value; + tbl->codePoints[offset] = codePoint; + + //return(offset); + return(constructContractCE(element)); +} + +uint32_t uprv_cnttab_findCP(CntTable *table, uint32_t element, UChar codePoint, UBool forward, UErrorCode *status) { + + element &= 0xFFFFFF; + ContractionTable *tbl = NULL; + + if(U_FAILURE(*status)) { + return 0; + } + + if((element == 0xFFFFFF) || (tbl = table->elements[element]) == NULL) { + return 0; + } + + if(forward == FALSE) { + tbl = tbl->reversed; + } + + uint32_t position = 0; + + while(codePoint > tbl->codePoints[position]) { + position++; + if(position > tbl->position) { + return 0; + } + } + if (codePoint == tbl->codePoints[position]) { + return position; + } else { + return 0; + } +} + +uint32_t uprv_cnttab_getCE(CntTable *table, uint32_t element, uint32_t position, UBool forward, UErrorCode *status) { + + element &= 0xFFFFFF; + ContractionTable *tbl = NULL; + + if(U_FAILURE(*status)) { + return UCOL_NOT_FOUND; + } + + if((element == 0xFFFFFF) || (tbl = table->elements[element]) == NULL) { + return UCOL_NOT_FOUND; + } + + if(forward == FALSE) { + tbl = tbl->reversed; + } + + + if(position > tbl->position) { + return UCOL_NOT_FOUND; + } else { + return tbl->CEs[position]; + } +} diff --git a/icu4c/source/tools/genuca/cnttable.h b/icu4c/source/tools/genuca/cnttable.h new file mode 100644 index 0000000000..1c7ddbb41a --- /dev/null +++ b/icu4c/source/tools/genuca/cnttable.h 
@@ -0,0 +1,35 @@ +#ifndef UCOL_CNTTABLE_H +#define UCOL_CNTTABLE_H + +#include "uhash.h" +#include "UCAData.h" + +/* Builder state for contraction data: one ContractionTable per element handle, plus the flattened arrays filled in by uprv_cnttab_constructTable(). */ typedef struct { + ContractionTable **elements; /* per-element tables, indexed by element handle */ + CompactIntArray *mapping; /* code point to CE mapping; its contraction CEs are rewritten by uprv_cnttab_constructTable() */ + UChar *codePoints; /* flattened code points, allocated by uprv_cnttab_constructTable() */ + uint32_t *CEs; /* flattened CEs, parallel to codePoints */ + int32_t *offsets; /* start of each element in the flattened arrays, plus mainOffset */ + int32_t position; /* number of flattened entries, computed by uprv_cnttab_constructTable() */ + int32_t size; /* number of slots of `elements` in use */ + int32_t capacity; /* number of slots allocated for `elements` */ +} CntTable; + +/* creates an empty table that works on the given mapping; returns 0 if *status already indicates failure */ CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status); +/* constructs the flattened table for output; returns the number of flattened entries */ +int32_t uprv_cnttab_constructTable(CntTable *table, uint32_t mainOffset, UErrorCode *status); +/* frees the table and the elements it holds */ void uprv_cnttab_close(CntTable *table); +/* appends one more contraction to the element's table. If the element does not exist, it is created. Returns the element handle */ +uint32_t uprv_cnttab_addContraction(CntTable *table, uint32_t element, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status); +/* sets the entry at offset in the element's table. If the element does not exist, it is created. Returns the element handle */ +uint32_t uprv_cnttab_setContraction(CntTable *table, uint32_t element, int32_t offset, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status); +/* inserts a part of a contraction sequence into the element's table, ordered by code point. Entries behind the insertion point are moved back. If the element does not exist, it is created. 
Returns element handle */ +uint32_t uprv_cnttab_insertContraction(CntTable *table, uint32_t element, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status); +/* this is for adding non contractions: overwrites the CE of the last entry of the element's table */ +uint32_t uprv_cnttab_changeLastCE(CntTable *table, uint32_t element, uint32_t value, UBool forward, UErrorCode *status); + +/* returns the index of codePoint in the element's table, or 0 if the element or the code point is not found */ uint32_t uprv_cnttab_findCP(CntTable *table, uint32_t element, UChar codePoint, UBool forward, UErrorCode *status); + +/* returns the CE stored at position in the element's table, or UCOL_NOT_FOUND */ uint32_t uprv_cnttab_getCE(CntTable *table, uint32_t element, uint32_t position, UBool forward, UErrorCode *status); + +#endif diff --git a/icu4c/source/tools/genuca/genuca.8.in b/icu4c/source/tools/genuca/genuca.8.in new file mode 100644 index 0000000000..9513c76aad --- /dev/null +++ b/icu4c/source/tools/genuca/genuca.8.in @@ -0,0 +1,95 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" genuca.8: manual page for the genuca utility +.\" +.\" Copyright (C) 2000 IBM, Inc. and others. +.\" +.TH GENUCA 8 "22 February 2001" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B genuca +\- create the UCA data table +.SH SYNOPSIS +.B genuca +[ +.BR "\-V\fP, \fB\-\-version" +] +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +.IR bundle " \.\.\." +.SH DESCRIPTION +.B genuca +create the UCA data table +.I bundle +source files passed on the command line to their binary form. +The resulting files have a +.B .res +extension while resource bundle source files typically have a +.B .txt +extension. +The +.I bundle +file name should be a local identifier, e.g. +.B ja_JP.txt +for Japanese (Japan) data, or +.B root.txt +for the root bundle. +.PP +These binary files can then be read directly by ICU, or used by +.BR pkgdata (8) +for incorporation into a larger archive or library. +.SH OPTIONS +.TP +.BR \-V\fP, \fB\-\-version +Print the version of +.B genuca +and exit. 
+.TP +.BR \-h\fP, \fB\-?\fP, \fB\-\-help +Print help about usage and exit. +.TP +.BR \-v\fP, \fB\-\-verbose +Display extra informative messages during execution. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is specified by the environment variable +.BR ICU_DATA . +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is specified by the environment variable +.BR ICU_DATA . +.SH INVARIANT CHARACTERS +The +.B invariant character set +consists of the following set of characters, expressed as a standard POSIX +regular expression: +.BR "[a-z]|[A-Z]|[0-9]|_| |+|-|*|/" . +This is the set which is guaranteed to be available regardless of code page. +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thedatadir@/icu/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2001 IBM, Inc. and others. 
+.SH SEE ALSO +.BR pkgdata (8) diff --git a/icu4c/source/tools/genuca/genuca.cpp b/icu4c/source/tools/genuca/genuca.cpp new file mode 100644 index 0000000000..4746b47a5b --- /dev/null +++ b/icu4c/source/tools/genuca/genuca.cpp @@ -0,0 +1,956 @@ +#include "UCAData.h" +#include "cnttable.h" + +#include + +ExpansionTable expansions; +CntTable *contractions; +CompactIntArray *mapping = NULL; +/*UHashtable *elements = NULL;*/ +UCAElements le; + +void deleteElement(void *element) { + UCAElements *el = (UCAElements *)element; +/* + int32_t i = 0; + for(i = 0; i < el->noOfCEs; i++) { + free(el->primary[i]); + free(el->secondary[i]); + free(el->tertiary[i]); + } +*/ + //free(el); +} + +int32_t readElement(char **from, char *to, char separator, UErrorCode *status) { + if(U_FAILURE(*status)) { + return 0; + } + char buffer[1024]; + int32_t i = 0; + while(**from != separator) { + if(**from != ' ') { + *(buffer+i++) = **from; + } + (*from)++; + } + (*from)++; + *(buffer + i) = 0; + //*to = (char *)malloc(strlen(buffer)+1); + strcpy(to, buffer); + return i/2; +} + + +uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UBool caseBit, UErrorCode *status) { + if(U_FAILURE(*status)) { + return 0; + } + uint32_t value = 0; + char primsave = '\0'; + char secsave = '\0'; + char tersave = '\0'; + char *primend = primary+4; + if(strlen(primary) > 4) { + primsave = *primend; + *primend = '\0'; + } + char *secend = secondary+2; + if(strlen(secondary) > 2) { + secsave = *secend; + *secend = '\0'; + } + char *terend = tertiary+2; + if(strlen(tertiary) > 2) { + tersave = *terend; + *terend = '\0'; + } + uint32_t primvalue = (*primary!='\0')?strtoul(primary, &primend, 16):0; + uint32_t secvalue = (*secondary!='\0')?strtoul(secondary, &secend, 16):0; + uint32_t tervalue = (*tertiary!='\0')?strtoul(tertiary, &terend, 16):0; + if(primvalue <= 0xFF) { + primvalue <<= 8; + } + + value = ((primvalue<cSize == 1) { + return element->mapCE; + } + + /* this recursion currently 
feeds on the only element we have... We will have to copy it in order to accomodate */ + /* for both backward and forward cycles */ + + /* we encountered either an empty space or a non-contraction element */ + /* this means we are constructing a new contraction sequence */ + if(existingCE == UCOL_NOT_FOUND || !isContraction(existingCE)) { + /* if it wasn't contraction, we wouldn't end up here*/ + firstContractionOffset = uprv_cnttab_addContraction(contractions, -1, 0, existingCE, forward, status); + if(forward == FALSE) { + uprv_cnttab_addContraction(contractions, firstContractionOffset, 0, existingCE, TRUE, status); + uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, TRUE, status); + } + + UChar toAdd = element->cPoints[1]; + element->cPoints++; + element->cSize--; + uint32_t newCE = processContraction(element, UCOL_NOT_FOUND, forward, status); + element->cPoints--; + element->cSize++; + contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, toAdd, newCE, forward, status); + contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, forward, status); + contractionElement = constructContractCE(firstContractionOffset); + return contractionElement; + } else { /* we are adding to existing contraction */ + /* there were already some elements in the table, so we need to add a new contraction */ + /* Two things can happen here: either the codepoint is already in the table, or it is not */ + uint32_t position = uprv_cnttab_findCP(contractions, existingCE, *(element->cPoints+1), forward, status); + element->cPoints++; + element->cSize--; + if(position != 0) { /* if it is we just continue down the chain */ + uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, forward, status); + uint32_t newCE = processContraction(element, eCE, forward, status); + uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, forward, 
status); + } else { /* if it isn't, we will have to create a new sequence */ + uint32_t newCE = processContraction(element, UCOL_NOT_FOUND, forward, status); + uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, forward, status); + } + element->cPoints--; + element->cSize++; + return existingCE; + } +} + +int32_t addExpansion(uint32_t value, UErrorCode *status) { + if(U_FAILURE(*status)) { + return 0; + } + if(expansions.CEs == NULL) { + expansions.CEs = (uint32_t *)malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t)); + expansions.size = INIT_EXP_TABLE_SIZE; + expansions.position = 0; + } + + if(expansions.position == expansions.size) { + uint32_t *newData = (uint32_t *)realloc(expansions.CEs, 2*expansions.size*sizeof(uint32_t)); + if(newData == NULL) { + fprintf(stderr, "out of memory for expansions\n"); + *status = U_MEMORY_ALLOCATION_ERROR; + return -1; + } + expansions.CEs = newData; + expansions.size *= 2; + } + + expansions.CEs[expansions.position] = value; + return(expansions.position++); +} + +uint32_t inverseTable[0xFFFF][3]; +uint32_t inversePos = 0; +/*UChar *stringContinue[0xFFFF];*/ +UChar stringContinue[0xFFFF]; +uint32_t stringContSize[0xFFFF]; +uint32_t sContPos = 0; +uint32_t contSize = 0; + +#define UCOL_INV_SIZEMASK 0xFFF00000 +#define UCOL_INV_OFFSETMASK 0x000FFFFF +#define UCOL_INV_SHIFTVALUE 20 + +void addNewInverse(UCAElements *element, UErrorCode *status) { + + if(isContinuation(element->CEs[1])) { + fprintf(stderr, "+"); + } + inversePos++; + inverseTable[inversePos][0] = element->CEs[0]; + if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) { + inverseTable[inversePos][1] = element->CEs[1]; + } + if(element->cSize < 2) { + inverseTable[inversePos][2] = element->cPoints[0]; + } else { /* add a new store of cruft */ + inverseTable[inversePos][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos; + memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar)); + sContPos += 
element->cSize+1; + } +} + +void addToExistingInverse(UCAElements *element, uint32_t position, UErrorCode *status) { + + if((inverseTable[position][2] & UCOL_INV_SIZEMASK) == 0) { /* single element, have to make new extension place and put both guys there */ + stringContinue[sContPos] = inverseTable[position][2]; + inverseTable[position][2] = ((element->cSize+3) << UCOL_INV_SHIFTVALUE) | sContPos; + sContPos++; + stringContinue[sContPos++] = 0xFFFF; + memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar)); + sContPos += element->cSize; + stringContinue[sContPos++] = 0xFFFE; + } else { /* adding to the already existing continuing table */ + uint32_t contIndex = inverseTable[position][2] & UCOL_INV_OFFSETMASK; + uint32_t contSize = (inverseTable[position][2] & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE; + + if(contIndex+contSize < sContPos) { + /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/ + memcpy(stringContinue+contIndex+contSize+element->cSize+1, stringContinue+contIndex+contSize, (element->cSize+1)*sizeof(UChar)); + } + + stringContinue[contIndex+contSize-1] = 0xFFFF; + memcpy(stringContinue+contIndex+contSize, element->cPoints, element->cSize*sizeof(UChar)); + sContPos += element->cSize+1; + stringContinue[contIndex+contSize+element->cSize] = 0xFFFE; + + inverseTable[position][2] = ((contSize+element->cSize+1) << UCOL_INV_SHIFTVALUE) | contIndex; + } +} + +uint32_t addToInverse(UCAElements *element, UErrorCode *status) { + + if(inverseTable[inversePos][0] > element->CEs[0]) { + uint32_t position = inversePos; + while(inverseTable[--position][0] > element->CEs[0]) + addToExistingInverse(element, position, status); + } else if(inverseTable[inversePos][0] == element->CEs[0]) { + if(element->noOfCEs > 1 && isContinuation(element->CEs[1]) + && inverseTable[inversePos][1] != element->CEs[1]) { + /* also, we should do long primaries here */ + addNewInverse(element, status); + } else { + addToExistingInverse(element, inversePos, 
status); + } + } else { + addNewInverse(element, status); + } + return inversePos; +} + +InverseTableHeader *assembleInverseTable(UErrorCode *status) { + uint32_t i = 0; + InverseTableHeader *result = NULL; + uint32_t headerByteSize = paddedsize(sizeof(InverseTableHeader)); + uint32_t inverseTableByteSize = (inversePos+2)*sizeof(uint32_t)*3; + uint32_t contsByteSize = sContPos * sizeof(UChar); + + result = (InverseTableHeader *)malloc(headerByteSize + inverseTableByteSize + contsByteSize); + if(result != NULL) { + result->byteSize = headerByteSize + inverseTableByteSize + contsByteSize; + + inversePos++; + inverseTable[inversePos][0] = 0xFFFFFFFF; + inverseTable[inversePos][1] = 0xFFFFFFFF; + inverseTable[inversePos][2] = 0x0000FFFF; + inversePos++; + + result->tableSize = inversePos; + result->contsSize = sContPos; + + result->table = headerByteSize; + result->conts = headerByteSize + inverseTableByteSize; + + memcpy((uint8_t *)result + result->table, inverseTable, inverseTableByteSize); + memcpy((uint8_t *)result + result->conts, stringContinue, contsByteSize); + + } else { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + + { + UNewDataMemory *pData; + + long dataLength; + +#ifdef WIN32 + char *currdir = _getcwd(NULL, 0); +#else + char *currdir = getcwd(NULL, 0); +#endif + pData=udata_create(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, &invDataInfo, + U_COPYRIGHT_STRING, status); + + if(currdir != NULL) { + free(currdir); + } + + + if(U_FAILURE(*status)) { + fprintf(stderr, "Error: unable to create data memory, error %d\n", *status); + free(result); + return 0; + } + + /* write the data to the file */ + fprintf(stdout, "Writing out inverse table\n"); + udata_writeBlock(pData, result, result->byteSize); + + /* finish up */ + dataLength=udata_finish(pData, status); + if(U_FAILURE(*status)) { + fprintf(stderr, "Error: error %d writing the output file\n", *status); + free(result); + return 0; + } + } + return result; + + +} + + +/* This adds a read element, while 
testing for existence */ +uint32_t addAnElement(UCAElements *element, UErrorCode *status) { + + uint32_t i = 1, expansion = 0; + + if(U_FAILURE(*status)) { + return 0xFFFF; + } + if(element->noOfCEs == 1) { + if(element->isThai == FALSE) { + element->mapCE = element->CEs[0]; + } else { /* add thai - totally bad here */ + expansion = UCOL_SPECIAL_FLAG | (THAI_TAG<CEs[0], status)+(paddedsize(sizeof(UCATableHeader))>>2))<<4) + | 0x1; + element->mapCE = expansion; + } + } else { + expansion = UCOL_SPECIAL_FLAG | (EXPANSION_TAG<CEs[0], status)+(paddedsize(sizeof(UCATableHeader))>>2))<<4) + & 0xFFFFF0; + + for(i = 1; inoOfCEs; i++) { + addExpansion(element->CEs[i], status); + } + if(element->noOfCEs <= 0xF) { + expansion |= element->noOfCEs; + } else { + addExpansion(0, status); + } + element->mapCE = expansion; + } + + uint32_t CE = ucmp32_get(mapping, element->cPoints[0]); + + if(element->cSize > 1) { /* we're adding a contraction */ + /* and we need to deal with it */ + /* we could aready have something in table - or we might not */ + /* The fact is that we want to add or modify an existing contraction */ + /* and add it backwards then */ + uint32_t result = processContraction(element, CE, TRUE, status); + if(CE == UCOL_NOT_FOUND || !isContraction(CE)) { + ucmp32_set(mapping, element->cPoints[0], result); + } + /* add the reverse order */ + reverseElement(element); + CE = ucmp32_get(mapping, element->cPoints[0]); + result = processContraction(element, CE, FALSE, status); + if(CE == UCOL_NOT_FOUND || !isContraction(CE)) { + ucmp32_set(mapping, element->cPoints[0], result); + } + } else { /* easy case, */ + if( CE != UCOL_NOT_FOUND) { + if(isContraction(CE)) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */ + uprv_cnttab_setContraction(contractions, CE, 0, 0, element->mapCE, TRUE, status); + /* This loop has to change the CE at the end of contraction REDO!*/ + uprv_cnttab_changeLastCE(contractions, CE, element->mapCE, 
TRUE, status); + } else { + fprintf(stderr, "Fatal error - trying to overwrite already existing data for codepoint %04X\n", element->cPoints[0]); + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + } else { + ucmp32_set(mapping, element->cPoints[0], element->mapCE); + } + } + + + return CE; +} + +int32_t hex2num(char hex) { + if(hex>='0' && hex <='9') { + return hex-'0'; + } else if(hex>='a' && hex<='f') { + return hex-'a'+10; + } else if(hex>='A' && hex<='F') { + return hex-'A'+10; + } else { + return 0; + } +} + +/* Here's the fun part: */ +/* Normal CE produced by getSingleCEValue | 16P | 8S |0|C| 6T | */ +/* Continuation CE produced by processContinuation | 16P | 8S |1|0| 6T | */ +/* Long primary, produced by ???? | 24P |1|1| 6S | */ + +UCATableHeader *assembleTable(UChar variableTopValue, UErrorCode *status) { + if(U_FAILURE(*status)) { + return NULL; + } + + uint32_t beforeContractions = (paddedsize(sizeof(UCATableHeader))+paddedsize(expansions.position*sizeof(uint32_t)))/sizeof(UChar); + + int32_t contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status); + + ucmp32_compact(mapping, 1); + UMemoryStream *ms = uprv_mstrm_openNew(8192); + int32_t mappingSize = ucmp32_flattenMem(mapping, ms); + const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize); + + uint32_t tableOffset = 0; + uint8_t *dataStart; + + int32_t toAllocate = paddedsize(sizeof(UCATableHeader))+paddedsize(expansions.position*sizeof(uint32_t))+paddedsize(mappingSize)+paddedsize(contractionsSize*(sizeof(UChar)+sizeof(uint32_t))+paddedsize(0x100*sizeof(uint32_t))); + + dataStart = (uint8_t *)malloc(toAllocate); + UCATableHeader *myData = (UCATableHeader *)dataStart; + + /* Stuff everything with @ */ + memset(dataStart, '@', toAllocate); + + memset(dataStart+tableOffset, 0, sizeof(UCATableHeader)); + tableOffset += paddedsize(sizeof(UCATableHeader)); + + /* copy expansions */ + /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/ + myData->expansion = 
tableOffset; + memcpy(dataStart+tableOffset, expansions.CEs, expansions.position*sizeof(uint32_t)); + tableOffset += paddedsize(expansions.position*sizeof(uint32_t)); + + /* contractions block */ + /* copy contraction index */ + /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/ + myData->contractionIndex = tableOffset; + memcpy(dataStart+tableOffset, contractions->codePoints, contractionsSize*sizeof(UChar)); + tableOffset += paddedsize(contractionsSize*sizeof(UChar)); + + /* copy contraction collation elements */ + /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/ + myData->contractionCEs = tableOffset; + memcpy(dataStart+tableOffset, contractions->CEs, contractionsSize*sizeof(uint32_t)); + tableOffset += paddedsize(contractionsSize*sizeof(uint32_t)); + + /* copy mapping table */ + /*myData->mappingPosition = dataStart+tableOffset;*/ + myData->mappingPosition = tableOffset; + memcpy(dataStart+tableOffset, flattened, mappingSize); + tableOffset += paddedsize(mappingSize); + + /* construct the fast tracker for latin one*/ + myData->latinOneMapping = tableOffset; + uint32_t *store = (uint32_t*)(dataStart+tableOffset); + int32_t i = 0; + for(i = 0; i<=0xFF; i++) { + *(store++) = ucmp32_get(mapping,i); + tableOffset+=sizeof(uint32_t); + } + + if(tableOffset != toAllocate) { + fprintf(stderr, "calculation screwup!!! 
Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset); + *status = U_INTERNAL_PROGRAM_ERROR; + free(dataStart); + return 0; + } + + myData->size = tableOffset; + myData->variableTopValue = variableTopValue; + myData->strength = UCOL_TERTIARY; + myData->frenchCollation = UCOL_OFF; + myData->alternateHandling = UCOL_SHIFTED; /* attribute for handling variable elements*/ + myData->caseFirst = UCOL_LOWER_FIRST; /* who goes first, lower case or uppercase */ + myData->caseLevel = UCOL_OFF; /* do we have an extra case level */ + myData->normalizationMode = UCOL_ON; /* attribute for normalization */ + + + + /* This should happen upon ressurection */ + const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition; + myData->mapping = ucmp32_openFromData(&mapPosition, status); + return myData; +} + +void processFile(FILE *data, UErrorCode *status) { + if(U_FAILURE(*status)) { + return; + } +} + +UCAElements *readAnElement(FILE *data, UErrorCode *status) { + char buffer[2048], primary[100], secondary[100], tertiary[100]; + UBool detectedContraction; + int32_t i = 0; + char *pointer = NULL; + char *commentStart = NULL; + char *startCodePoint = NULL; + char *endCodePoint = NULL; + char *spacePointer = NULL; + char *result = fgets(buffer, 2048, data); + if(U_FAILURE(*status)) { + return 0; + } + *primary = *secondary = *tertiary = '\0'; + if(result == NULL) { + if(feof(data)) { + return NULL; + } else { + fprintf(stderr, "empty line but no EOF!\n"); + *status = U_INVALID_FORMAT_ERROR; + return NULL; + } + } + if(buffer[0] == '#' || buffer[0] == '\n') { + return NULL; // just a comment, skip whole line + } + + UCAElements *element = ≤ //(UCAElements *)malloc(sizeof(UCAElements)); + + if(buffer[0] == '[') { + element->variableTop = TRUE; + return element; // just a comment, skip whole line + } + element->variableTop = FALSE; + + startCodePoint = buffer; + endCodePoint = strchr(startCodePoint, ';'); + + if(endCodePoint == 0) { + fprintf(stderr, "error - 
line with no code point!\n"); + *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */ + return NULL; + } else { + *(endCodePoint) = 0; + } + + if(element != NULL) { + memset(element, 0, sizeof(*element)); + } else { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + + element->cPoints = element->uchars; + + spacePointer = strchr(buffer, ' '); + sscanf(buffer, "%04X", element->cPoints); /* read first code point */ + element->codepoint = element->cPoints[0]; + if(spacePointer == 0) { + detectedContraction = FALSE; + element->cSize = 1; + } else { + i = 1; + detectedContraction = TRUE; + while(spacePointer != NULL) { + sscanf(spacePointer+1, "%04X", (element->cPoints+i)); + i++; + spacePointer = strchr(spacePointer+1, ' '); + } + + element->cSize = i; + + //fprintf(stderr, "Number of codepoints in contraction: %i\n", i); + } + + startCodePoint = endCodePoint+1; + endCodePoint = strchr(startCodePoint, ';'); + + while(*startCodePoint != 'L' && *startCodePoint != 'S') { + startCodePoint++; + if(startCodePoint == endCodePoint) { + *status = U_INVALID_FORMAT_ERROR; + return NULL; + } + } + + if(*startCodePoint == 'S') { + element->caseBit = FALSE; + } else { + element->caseBit = TRUE; + } + + startCodePoint = endCodePoint+1; + + commentStart = strchr(startCodePoint, '#'); + if(commentStart == NULL) { + commentStart = strlen(startCodePoint) + startCodePoint; + } + + i = 0; + uint32_t CEindex = 0; + element->noOfCEs = 0; + for(;;) { + endCodePoint = strchr(startCodePoint, ']'); + if(endCodePoint == NULL || endCodePoint >= commentStart) { + break; + } + pointer = strchr(startCodePoint, '['); + pointer++; + + element->sizePrim[i]=readElement(&pointer, primary, ',', status); + element->sizeSec[i]=readElement(&pointer, secondary, ',', status); + element->sizeTer[i]=readElement(&pointer, tertiary, ']', status); + + + /* I want to get the CEs entered right here, including continuation */ +#if 0 + 
if(element->sizePrim[i]==3 && + strtoul(secondary, 0, 16)== UCOL_UNMARKED && + strtoul(tertiary, 0, 16) < 0x40) { + /* This is a test for a long primary - secondary has 6 bits and tertiary must be unmarked */ + /* fprintf(stderr, "Long primary in expansion for 0x%04X\n", element->codepoint);*/ + element->CEs[CEindex++] = (uint32_t)strtoul(primary, 0, 16) << 8 | 0xC0 | (strtoul(tertiary, 0, 16) & 0x3F); + /* Long primary, | 24P |1|1| 6T | */ + } else { +#endif /* we will try to go without long primaries */ + element->CEs[CEindex++] = getSingleCEValue(primary, secondary, tertiary, element->caseBit, status); + + uint32_t CEi = 1; + while(2*CEisizePrim[i] || CEisizeSec[i] || CEisizeTer[i]) { + uint32_t value = 0x80; /* Continuation marker */ + if(2*CEisizePrim[i]) { + value |= ((hex2num(*(primary+4*CEi))&0xF)<<28); + value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24); + } + + if(2*CEi+1sizePrim[i]) { + value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20); + value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16); + } + + if(CEisizeSec[i]) { + value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12); + value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8); + } + + if(CEisizeTer[i]) { + value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4); + value |= (hex2num(*(tertiary+2*CEi+1))&0xF); + } + + CEi++; + + element->CEs[CEindex++] = value; + } +#if 0 + } +#endif /* part for long primaries */ + + uint32_t terValue = strtoul(tertiary+strlen(tertiary)-2, NULL, 16); + if(terValue > 0x3F) { + fprintf(stderr, "Tertiary value %02X too big for %04X\n", terValue, element->codepoint); + } + startCodePoint = endCodePoint+1; + i++; + } + element->noOfCEs = CEindex; + + element->isThai = UCOL_ISTHAIPREVOWEL(element->codepoint); + + // we don't want any strange stuff after useful data! 
+ while(pointer < commentStart) { + if(*pointer != ' ') { + *status=U_INVALID_FORMAT_ERROR; + break; + } + *pointer++; + } + + /* + strcpy(element->comment, commentStart); + uhash_put(elements, (void *)element->codepoint, element, status); + */ + + if(U_FAILURE(*status)) { + fprintf(stderr, "problem putting stuff in hash table\n"); + *status = U_INTERNAL_PROGRAM_ERROR; + free(element); + return NULL; + } + + return element; + +} + +void reverseElement(UCAElements *el) { + int32_t i = 0; + UChar temp; + for(i = 0; icSize/2; i++) { + temp = el->cPoints[i]; + el->cPoints[i] = el->cPoints[el->cSize-i-1]; + el->cPoints[el->cSize-i-1] = temp; + } + el->codepoint = el->cPoints[0]; + uint32_t tempCE = 0, expansion = 0; + UErrorCode status = U_ZERO_ERROR; + if(el->noOfCEs>1) { /* this is an expansion that needs to be reversed and added - also, we need to change the mapValue */ + for(i = 0; inoOfCEs/2; i++) { + tempCE = el->CEs[i]; + el->CEs[i] = el->CEs[el->noOfCEs-i-1]; + el->CEs[el->noOfCEs-i-1] = tempCE; + } + expansion = UCOL_SPECIAL_FLAG | (EXPANSION_TAG<CEs[0], &status)+(paddedsize(sizeof(UCATableHeader))>>2))<<4) + & 0xFFFFF0; + + for(i = 1; inoOfCEs; i++) { + addExpansion(el->CEs[i], &status); + } + if(el->noOfCEs <= 0xF) { + expansion |= el->noOfCEs; + } else { + addExpansion(0, &status); + } + el->mapCE = expansion; + } + + +} + +void writeOutData(UCATableHeader *data, UErrorCode *status) { + if(U_FAILURE(*status)) { + return; + } + + UNewDataMemory *pData; + + long dataLength; + +#ifdef WIN32 + char *currdir = _getcwd(NULL, 0); +#else + char *currdir = getcwd(NULL, 0); +#endif +/* + pData=udata_create(getcwd(NULL, 0), UCA_DATA_TYPE, UCA_DATA_NAME, &dataInfo, + U_COPYRIGHT_STRING, status); +*/ + pData=udata_create(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, &dataInfo, + U_COPYRIGHT_STRING, status); + + if(currdir != NULL) { + free(currdir); + } + + + if(U_FAILURE(*status)) { + fprintf(stderr, "Error: unable to create data memory, error %d\n", *status); + return; + } + + 
/* write the data to the file */ + fprintf(stdout, "Writing out table\n"); + udata_writeBlock(pData, data, data->size); + + /* finish up */ + dataLength=udata_finish(pData, status); + if(U_FAILURE(*status)) { + fprintf(stderr, "Error: error %d writing the output file\n", *status); + return; + } +} + +int main(int argc, char* argv[]) { + FILE *data = fopen("FractionalUCA.txt", "r"); + //FILE *data = fopen("uca30codepointsort.txt", "r"); + int32_t i = 0, j = 0, k = 0, line = 0, thai = 0; + int32_t sizesPrim[35], sizesSec[35], sizesTer[35]; + int32_t terValue[0xffff], secValue[0xffff]; + int32_t sizeBreakDown[35][35][35]; + UErrorCode status = U_ZERO_ERROR; + UCAElements *element = NULL; + UChar variableTopValue = 0; + UBool foundVariableTop = FALSE; + + if(data == NULL) { + fprintf(stderr, "Couldn't open file\n"); + return -1; + } + + memset(secValue, 0, 0xffff*sizeof(int32_t)); + memset(terValue, 0, 0xffff*sizeof(int32_t)); + memset(sizesPrim, 0, 35*sizeof(int32_t)); + memset(sizesSec, 0, 35*sizeof(int32_t)); + memset(sizesTer, 0, 35*sizeof(int32_t)); + memset(sizeBreakDown, 0, 35*35*35*sizeof(int32_t)); + memset(&expansions, 0, sizeof(expansions)); + memset(&contractions, 0, sizeof(contractions)); + memset(inverseTable, 0, sizeof(int32_t)*3*0xFFFF); + + + mapping = ucmp32_open(UCOL_UNMAPPED); + contractions = uprv_cnttab_open(mapping, &status); + ucmp32_setRange(mapping, 0, 0xFFFF, UCOL_NOT_FOUND); + + /* + elements = uhash_open(uhash_hashLong, uhash_compareLong, &status); + + uhash_setValueDeleter(elements, deleteElement); + */ + + if(mapping == NULL) { + return(-1); + } + + while(!feof(data)) { + if(U_FAILURE(status)) { + fprintf(stderr, "Something returned an error %i while processing line: %i\nExiting...", status, line); + exit(status); + } + + element = readAnElement(data, &status); + line++; + if(element != NULL) { + /* this does statistics on CE lengths, but is currently broken */ +/* + for( i = 0; inoOfCEs; i++) { + sizesPrim[element->sizePrim[i]]++; + 
sizesSec[element->sizeSec[i]]++; + sizesTer[element->sizeTer[i]]++; + + sizeBreakDown[element->sizePrim[i]][element->sizeSec[i]][element->sizeTer[i]]++; + + if(element->sizePrim[i] == 2 && element->sizeSec[i]==2) { + terValue[strtoul(element->tertiary[i], 0, 16)]++; + secValue[strtoul(element->secondary[i], 0, 16)]++; + } + } +*/ + + + // we have read the line, now do something sensible with the read data! + if(element->variableTop == TRUE) { + foundVariableTop = TRUE; + continue; + } + + if(variableTopValue == 0 && foundVariableTop == TRUE) { + variableTopValue = element->cPoints[0]; + foundVariableTop = FALSE; + } + + /* we're first adding to inverse, because addAnElement will reverse the order */ + /* of code points and stuff... we don't want that to happen */ + uint32_t invResult = addToInverse(element, &status); + uint32_t result = addAnElement(element, &status); + //deleteElement(element); + } + } + + + fprintf(stderr, "Lines read: %i\n", line); + + + +/* + for(i = 0; i<35; i++) { + fprintf(stderr, "size %i: P:%i S:%i T:%i\n", i, sizesPrim[i], sizesSec[i], sizesTer[i]); + } + + for(i = 0; i<35; i++) { + UBool printedPrimary = FALSE; + for(j = 0; j<35; j++) { + for(k = 0; k<35; k++) { + if(sizeBreakDown[i][j][k] != 0) { + if(!printedPrimary) { + fprintf(stderr, "Primary: %i\n", i); + printedPrimary = TRUE; + } + fprintf(stderr, "Sec: %i, Ter: %i = %i\n", j, k, sizeBreakDown[i][j][k]); + } + } + } + } + + for(i = 0; i<(uint32_t)0xffff; i++) { + if(terValue[i] != 0) { + fprintf(stderr, "Tertiaries with value %04X : %i\n", i, terValue[i]); + } + if(secValue[i] != 0) { + fprintf(stderr, "Secondaries with value %04X : %i\n", i, secValue[i]); + } + } +*/ + /* test */ + UCATableHeader *myData = assembleTable(variableTopValue, &status); + writeOutData(myData, &status); + + InverseTableHeader *inverse = assembleInverseTable(&status); +/* + uint32_t *itab = (uint32_t *)((uint8_t *)inverse + inverse->table); + UChar *conts = (UChar *)((uint8_t *)inverse + 
inverse->conts); + for(i = 0; itableSize; i++) { + fprintf(stderr, "[%04X] 0x%08X 0x%08X 0x%08X\n", i, *(itab+3*i), *(itab+3*i+1), *(itab+3*i+2)); + if((*(itab+3*i+2) & UCOL_INV_SIZEMASK) != 0) { + uint32_t contIndex = *(itab+3*i+2) & UCOL_INV_OFFSETMASK; + uint32_t contSize = (*(itab+3*i+2) & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE; + fprintf(stderr, "\t"); + for(j = 0; jmapping); + + free(myData); + + + return 0; +} diff --git a/icu4c/source/tools/genuca/genuca.h b/icu4c/source/tools/genuca/genuca.h new file mode 100644 index 0000000000..86892cc3c2 --- /dev/null +++ b/icu4c/source/tools/genuca/genuca.h @@ -0,0 +1,100 @@ +#ifndef UCADATA_H +#define UCADATA_H + +#include +#include +#include "unicode/utypes.h" +#include "unicode/unicode.h" +#include "ucolimp.h" +#include "ucmp32.h" +#include "compitr.h" +#include "uhash.h" +#include "umemstrm.h" +#include "unewdata.h" +#ifdef WIN32 +#include +#else +#include +#endif + +#define paddedsize(something) ((something)+((((something)%4)!=0)?(4-(something)%4):0)) + +/* UDataInfo for UCA mapping table */ +static const UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(UChar), + 0, + + 0x55, 0x43, 0x6f, 0x6c, /* dataFormat="UCol" */ + 1, 0, 0, 0, /* formatVersion */ + 3, 0, 0, 0 /* dataVersion = Unicode Version*/ +}; + +/* UDataInfo for inverse UCA table */ +static const UDataInfo invDataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(UChar), + 0, + + 0x49, 0x6E, 0x76, 0x43, /* dataFormat="InvC" */ + 1, 0, 0, 0, /* formatVersion */ + 3, 0, 0, 0 /* dataVersion = Unicode Version*/ +}; + +typedef struct { + UChar codepoint; + UChar uchars[128]; + UChar *cPoints; + int32_t cSize; /* Number of characters in sequence - for contraction */ + int32_t noOfCEs; /* Number of collation elements */ + uint32_t CEs[128]; /* These are collation elements - there could be more than one - in case of expansion */ + uint32_t mapCE; /* This is the value element maps 
in original table */ + int32_t sizePrim[128]; + int32_t sizeSec[128]; + int32_t sizeTer[128]; + UBool variableTop; + UBool caseBit; + UBool isThai; +} UCAElements; + +typedef struct { + uint32_t *CEs; + int32_t position; + int32_t size; +} ExpansionTable; + +struct ContractionTable; + +struct ContractionTable { + UChar *codePoints; + uint32_t *CEs; + int32_t position; + int32_t size; + int32_t backSize; + UBool forward; + ContractionTable *reversed; +}; + +void deleteElement(void *element); +int32_t readElement(char **from, char *to, char separator, UErrorCode *status); +int32_t addExpansion(uint32_t value, UErrorCode *status); +uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UBool caseBit, UErrorCode *status); +uint32_t processContraction(UCAElements *element, uint32_t existingCE, UBool forward, UErrorCode *status); +void printOutTable(UCATableHeader *myData, UErrorCode *status); +UCATableHeader *assembleTable(UChar variableTopValue, UErrorCode *status); +void processFile(FILE *data, UErrorCode *status); +/* This adds a read element, while testing for existence */ +uint32_t addAnElement(UCAElements *element, UErrorCode *status); +UCAElements *readAnElement(FILE *data, UErrorCode *status); +void reverseElement(UCAElements *el); + + +#endif diff --git a/icu4c/source/tools/genuca/tblprint.cpp b/icu4c/source/tools/genuca/tblprint.cpp new file mode 100644 index 0000000000..e7d01ebdb8 --- /dev/null +++ b/icu4c/source/tools/genuca/tblprint.cpp @@ -0,0 +1,186 @@ +#include "tblprint.h" + +char *formatElementString(uint32_t CE, char *buffer) { + char temp[1024]; + UBool firstPrim = FALSE; + sprintf(buffer, "["); + if(UCOL_PRIMARYORDER(CE)>>8 != 0x02) { + sprintf(temp, "%02X ", UCOL_PRIMARYORDER(CE)>>8); + strcat(buffer, temp); + firstPrim = TRUE; + } + + if((UCOL_PRIMARYORDER(CE)&0xFF) != 0x02 || firstPrim == TRUE) { + sprintf(temp, "%02X", UCOL_PRIMARYORDER(CE)&0xFF); + strcat(buffer, temp); + } + firstPrim = FALSE; + + strcat(buffer, ","); + + 
if(UCOL_SECONDARYORDER(CE) != 0x02) { + sprintf(temp, " %02X", UCOL_SECONDARYORDER(CE)); + strcat(buffer, temp); + } + + strcat(buffer, ","); + + if((UCOL_TERTIARYORDER(CE)&0x7F) != 0x02) { + sprintf(temp, " %02X", UCOL_TERTIARYORDER(CE)&0x7F); + strcat(buffer, temp); + } + + strcat(buffer, "]"); + + return buffer; +} + +void printExp(uint32_t CE, uint32_t oldCE, char* primb, char* secb, char *terb, UBool *printedCont) { + char temp[1024]; + if(CE 0xFF) { + sprintf(temp, "%02X ", UCOL_PRIMARYORDER(oldCE)>>8); + strcat(primb, temp); + } + + if(UCOL_PRIMARYORDER(oldCE) != 0) { + sprintf(temp, "%02X ", UCOL_PRIMARYORDER(oldCE)&0xFF); + strcat(primb, temp); + } + if(UCOL_SECONDARYORDER(oldCE) != 0) { + sprintf(temp, "%02X ", UCOL_SECONDARYORDER(oldCE)); + strcat(secb, temp); + } + if(UCOL_TERTIARYORDER(oldCE) != 0) { + sprintf(temp, "%02X ", UCOL_TERTIARYORDER(oldCE)); + strcat(terb, temp); + } + fprintf(stdout, "[%s, %s, %s] ", primb, secb, terb); + *primb = *secb = *terb = *temp = 0; + } + *printedCont = FALSE; + } else { /* this is a contiunation, process accordingly */ + if(*printedCont == TRUE) { + oldCE &= 0x0FFFFFFF; + } + if(UCOL_PRIMARYORDER(oldCE) > 0xFF) { + sprintf(temp, "%02X ", UCOL_PRIMARYORDER(oldCE)>>8); + strcat(primb, temp); + } + + if(UCOL_PRIMARYORDER(oldCE) != 0) { + sprintf(temp, "%02X ", UCOL_PRIMARYORDER(oldCE)&0xFF); + strcat(primb, temp); + } + if(UCOL_SECONDARYORDER(oldCE) != 0) { + sprintf(temp, "%02X ", UCOL_SECONDARYORDER(oldCE)); + strcat(secb, temp); + } + if(UCOL_TERTIARYORDER(oldCE)&0x7F != 0) { + sprintf(temp, "%02X ", UCOL_TERTIARYORDER(oldCE)&0x7F); + strcat(terb, temp); + } + *printedCont = TRUE; + } +} + +void printOutTable(UCATableHeader *myData, UErrorCode *status) { + if(U_FAILURE(*status)) { + return; + } + int32_t i = 0, j = 0; + int32_t CE = 0; + uint32_t *address = NULL; + uint8_t size = 0; + char buffer[1024]; + for(i = 0; i<=0xFFFF; i++) { + CE = ucmp32_get(myData->mapping, i); + if(CE != UCOL_NOT_FOUND) { + 
fprintf(stdout, "%04X; ", i); + if(CE < UCOL_NOT_FOUND) { + fprintf(stdout, "%c; %s ", (UCOL_TERTIARYORDER(CE)&0x80)>>7?'L':'S', formatElementString(CE, buffer)); + } else { + int32_t tag = (CE&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT; + if(tag == SURROGATE_TAG) { + // do surrogates + } + if(tag == THAI_TAG) { + address = ((uint32_t*)myData+((CE&0x00FFFFF0)>>4)); + CE = *(address); + fprintf(stdout, "%c; %s ", (UCOL_TERTIARYORDER(CE)&0x80)>>7?'L':'S', formatElementString(CE, buffer)); + fprintf(stdout, "THAI - from %08X to %08X (offset %05X) ", CE, address, ((CE&0x00FFFFF0)>>4)); + } + if(tag == CONTRACTION_TAG) { + int16_t hasBackward = 0; + char conChars[1024]; + char temp[1024]; + sprintf(conChars, "%04X", i); + UChar *contractionCP = (UChar *)myData+getContractOffset(CE); + hasBackward = *(contractionCP); /* skip backward */ + UBool printSeq = FALSE; + address = (uint32_t *)((uint8_t*)myData+myData->contractionCEs)+(contractionCP - (UChar *)((uint8_t*)myData+myData->contractionIndex)); + while(*contractionCP != 0xFFFF) { + if(printSeq == TRUE) { + fprintf(stdout, "\n%s;",conChars); + } + CE = *(address); + fprintf(stdout, "%c; %s ", (UCOL_TERTIARYORDER(CE)&0x80)>>7?'L':'S', formatElementString(CE, buffer)); + fprintf(stdout, "Contraction "); + if(hasBackward != 0) { + fprintf(stdout, "Back = %i ", hasBackward); + } + + contractionCP++; + address++; + sprintf(temp, " %04X", *contractionCP); + strcat(conChars, temp); + printSeq = TRUE; + } + + + } + if(tag == EXPANSION_TAG) { + char primb[1024], secb[1024], terb[1024], temp[1024]; + UBool printedCont = FALSE; + uint32_t oldCE; + *primb = *secb = *terb = *temp = 0; + size = CE&0xF; + address = ((uint32_t*)myData+((CE&0x00FFFFF0)>>4)); + CE = *(address++); + fprintf(stdout, "%c; ", (UCOL_TERTIARYORDER(CE)&0x80)>>7?'L':'S'); + + if(size != 0) { + for(j = 1; jcomment); + */ + } + } +} + diff --git a/icu4c/source/tools/genuca/tblprint.h b/icu4c/source/tools/genuca/tblprint.h new file mode 100644 index 0000000000..245953029b --- 
/dev/null +++ b/icu4c/source/tools/genuca/tblprint.h @@ -0,0 +1,11 @@ +#ifndef TBLPRINT_H +#define TBLPRINT_H + +#include "unicode/utypes.h" +#include "ucadata.h" + +char *formatElementString(uint32_t CE, char *buffer); +void printExp(uint32_t CE, uint32_t oldCE, char* primb, char* secb, char *terb, UBool *printedCont); +void printOutTable(UCATableHeader *myData, UErrorCode *status); + +#endif \ No newline at end of file