ICU-867 initial version.

X-SVN-Rev: 3730
This commit is contained in:
George Rhoten 2001-02-22 21:18:29 +00:00
parent 131e27bb87
commit c3231963c4
8 changed files with 1849 additions and 0 deletions

View File

@ -0,0 +1,121 @@
## Makefile.in for ICU - tools/genuca
## Copyright (c) 1999, 2000, International Business Machines Corporation and
## others. All Rights Reserved.
## Source directory information
srcdir = @srcdir@
top_srcdir = @top_srcdir@
# Root of the build tree, relative to this subdirectory (tools/genuca is two levels down).
top_builddir = ../..
include $(top_builddir)/icudefs.mk
## Platform-specific setup
include @platform_make_fragment@
##
# Manual section number; the generated man page file is named $(TARGET).$(SECTION).
SECTION = 8
MAN_FILES = $(TARGET).$(SECTION)
## Build directory information
subdir = tools/genuca
## Extra files to remove for 'make clean'
# TARGET and DEPS are assigned further down; that works because '=' is expanded lazily.
CLEANFILES = *~ $(TARGET).$(SECTION) $(DEPS)
## Target information
TARGET = genuca
ENABLE_STATIC = @ENABLE_STATIC@
# Link through the C++ driver when ENABLE_STATIC is non-empty, otherwise through the C driver.
ifneq ($(ENABLE_STATIC),)
LINK = $(LINK.cc)
else
LINK = $(LINK.c)
endif
DEFS = @DEFS@
CPPFLAGS = @CPPFLAGS@ -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/extra/ustdio -I$(srcdir)/../toolutil
CFLAGS = @CFLAGS@
CXXFLAGS = @CXXFLAGS@
ENABLE_RPATH = @ENABLE_RPATH@
# RPATHLDFLAGS is only set when configure substituted YES for ENABLE_RPATH.
# LD_RPATH, LD_RPATH_PRE and libdir are not defined in this file.
ifeq ($(ENABLE_RPATH),YES)
RPATHLDFLAGS = $(LD_RPATH)$(LD_RPATH_PRE)$(libdir)
endif
LDFLAGS = @LDFLAGS@ $(RPATHLDFLAGS)
LIBS = $(LIBUSTDIO) $(LIBICUI18N) $(LIBICUTOOLUTIL) $(LIBICUUC) @LIBS@ @LIB_M@
OBJECTS = UCAData.o cnttable.o
# One dependency file (.d) per object file.
DEPS = $(OBJECTS:.o=.d)
## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
	distclean distclean-local dist dist-local check \
	check-local install-man

## Clear suffix list
.SUFFIXES :

## List of standard targets
## Each public target simply forwards to its "-local" counterpart.
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local

## Build the tool and its man page
all-local: $(TARGET) $(MAN_FILES)

## Install the binary into $(sbindir) and the man page via install-man
install-local: all-local install-man
	$(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
	$(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)/$(TARGET)

## Nothing to do for 'dist' in this directory
dist-local:

## Remove build products; the test guards against an empty CLEANFILES list
clean-local:
	test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
	$(RMV) $(TARGET) $(OBJECTS)

## Additionally remove the generated Makefile
distclean-local: clean-local
	$(RMV) Makefile

## No tests are defined here; 'check' only makes sure everything builds
check-local: all-local
## Regenerate this Makefile by re-running config.status from the top of the build tree
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
	cd $(top_builddir) \
	&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status

## Link the tool from its object files
$(TARGET) : $(OBJECTS)
	$(LINK) -o $@ $^ $(LIBS)

# the 'mv' will always fail if you are building in the source dir

# man page
## Only the first entry of MAN_FILES ($<) is installed
install-man: $(MAN_FILES)
	$(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
	$(INSTALL_DATA) $< $(DESTDIR)$(mandir)/man$(SECTION)

## Generate the man page from its .in template through config.status
$(TARGET).$(SECTION): $(srcdir)/$(TARGET).$(SECTION).in
	cd $(top_builddir) \
	&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status

# build postscript and pdf formats
$(TARGET).ps: $(TARGET).$(SECTION)
	groff -man < $< > $@

$(TARGET).pdf: $(TARGET).ps
	ps2pdf $< $@

## Include the dependency files, unless goals were named on the command line
## and every one of them ends in "clean"
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif

View File

@ -0,0 +1,345 @@
#include "cnttable.h"
#include "cmemory.h"
/**
 * Doubles the capacity of one contraction table when it is full.
 *
 * Nothing happens unless tbl->position == tbl->size. Each array is committed
 * to the table as soon as its realloc succeeds: realloc releases the old
 * block on success, so keeping the old pointer after a partial failure would
 * leave tbl->CEs dangling.
 *
 * @param tbl    table whose CEs and codePoints arrays are grown
 * @param status set to U_MEMORY_ALLOCATION_ERROR if either realloc fails;
 *               tbl->size is left unchanged in that case
 */
void uprv_growTable(ContractionTable *tbl, UErrorCode *status) {
    if(tbl->position == tbl->size) {
        uint32_t *newData = (uint32_t *)realloc(tbl->CEs, 2*tbl->size*sizeof(uint32_t));
        if(newData == NULL) {
            fprintf(stderr, "out of memory for contractions\n");
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        tbl->CEs = newData;

        UChar *newCPs = (UChar *)realloc(tbl->codePoints, 2*tbl->size*sizeof(UChar));
        if(newCPs == NULL) {
            /* CEs is already larger than tbl->size says, which is harmless */
            fprintf(stderr, "out of memory for contractions\n");
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        tbl->codePoints = newCPs;

        tbl->size *= 2;
    }
}
/**
 * Allocates an empty contraction table container.
 *
 * The element slot array starts with INIT_EXP_TABLE_SIZE zeroed entries; the
 * flattened output arrays (CEs, codePoints, offsets) stay NULL until
 * uprv_cnttab_constructTable() builds them.
 *
 * @param mapping code point to CE map stored in the new table (not copied)
 * @param status  error code; on entry a failure makes the call a no-op, on
 *                exit U_MEMORY_ALLOCATION_ERROR if an allocation failed
 * @return the new table, or 0 on failure
 */
CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return 0;
    }

    CntTable *tbl = (CntTable *)malloc(sizeof(CntTable));
    if(tbl == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    tbl->elements = (ContractionTable **)malloc(INIT_EXP_TABLE_SIZE*sizeof(ContractionTable *));
    if(tbl->elements == NULL) {
        free(tbl);
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    /* NULL slots mean "no element here" to the lookup code */
    memset(tbl->elements, 0, INIT_EXP_TABLE_SIZE*sizeof(ContractionTable *));

    tbl->mapping = mapping;
    tbl->capacity = INIT_EXP_TABLE_SIZE;
    tbl->size = 0;
    tbl->position = 0;
    tbl->CEs = NULL;
    tbl->codePoints = NULL;
    tbl->offsets = NULL;
    return tbl;
}
/**
 * Appends a new, empty contraction element (a forward table plus its reversed
 * companion) to the container and returns it.
 *
 * The slot array is grown before the new slot is written; previously the
 * element was stored at elements[size] first and the capacity was checked
 * afterwards, which wrote one pointer past the end of the array.
 *
 * @param table  container to extend
 * @param key    receives the index of the new element
 * @param status set to U_MEMORY_ALLOCATION_ERROR if any allocation fails
 * @return the new element, or NULL if an allocation failed (nothing is added)
 */
ContractionTable *addATableElement(CntTable *table, uint32_t *key, UErrorCode *status) {
    /* make room for the new slot before touching it */
    if(table->size >= table->capacity) {
        int32_t newCapacity = (table->capacity > 0) ? 2*table->capacity : INIT_EXP_TABLE_SIZE;
        ContractionTable **newElements =
            (ContractionTable **)realloc(table->elements, newCapacity*sizeof(ContractionTable *));
        if(newElements == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        /* lookups treat a NULL slot as "no element", so clear the new tail */
        memset(newElements + table->capacity, 0,
               (newCapacity - table->capacity)*sizeof(ContractionTable *));
        table->elements = newElements;
        table->capacity = newCapacity;
    }

    ContractionTable *el  = (ContractionTable *)malloc(sizeof(ContractionTable));
    ContractionTable *rev = (ContractionTable *)malloc(sizeof(ContractionTable));
    uint32_t *fwdCEs = (uint32_t *)malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t));
    UChar    *fwdCPs = (UChar *)malloc(INIT_EXP_TABLE_SIZE*sizeof(UChar));
    uint32_t *revCEs = (uint32_t *)malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t));
    UChar    *revCPs = (UChar *)malloc(INIT_EXP_TABLE_SIZE*sizeof(UChar));
    if(el == NULL || rev == NULL || fwdCEs == NULL || fwdCPs == NULL || revCEs == NULL || revCPs == NULL) {
        /* free(NULL) is a no-op, so release whatever did get allocated */
        free(revCPs);
        free(revCEs);
        free(fwdCPs);
        free(fwdCEs);
        free(rev);
        free(el);
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    /* forward table; unused slots are filled with the byte 'F' */
    el->CEs = fwdCEs;
    el->codePoints = fwdCPs;
    el->position = 0;
    el->size = INIT_EXP_TABLE_SIZE;
    el->forward = TRUE;
    memset(el->CEs, 'F', INIT_EXP_TABLE_SIZE*sizeof(uint32_t));
    memset(el->codePoints, 'F', INIT_EXP_TABLE_SIZE*sizeof(UChar));

    /* reversed table; unused slots are filled with the byte 'R' */
    rev->CEs = revCEs;
    rev->codePoints = revCPs;
    rev->position = 0;
    rev->size = INIT_EXP_TABLE_SIZE;
    rev->forward = FALSE;
    rev->reversed = NULL;
    memset(rev->CEs, 'R', INIT_EXP_TABLE_SIZE*sizeof(uint32_t));
    memset(rev->codePoints, 'R', INIT_EXP_TABLE_SIZE*sizeof(UChar));
    el->reversed = rev;

    table->elements[table->size] = el;
    *key = table->size++;
    return el;
}
/**
 * Flattens all contraction elements into the container's output arrays
 * (table->codePoints, table->CEs) and rewrites contraction CEs so that they
 * carry final offsets.
 *
 * Steps:
 *  1. Compute, per element, its start offset (running total + mainOffset) and
 *     the total number of entries. For an element with a non-empty reversed
 *     table, codePoints[0] of the forward table is overwritten with the
 *     forward length, and reversed entries 1.. are counted as well.
 *  2. (Re)allocate the flattened arrays.
 *  3. Copy forward entries, then reversed entries 1.., rewriting every
 *     contraction CE through table->offsets.
 *  4. Rewrite every contraction CE in table->mapping for code points
 *     0..0xFFFF in the same way.
 *
 * @param table      container to flatten
 * @param mainOffset value added to every element offset
 * @param status     error code; U_MEMORY_ALLOCATION_ERROR if an allocation fails
 * @return number of entries in the flattened arrays, or 0 on failure
 */
int32_t uprv_cnttab_constructTable(CntTable *table, uint32_t mainOffset, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return 0;
    }
    int32_t i = 0, j = 0;

    table->position = 0;

    if(table->offsets != NULL) {
        free(table->offsets);
    }
    table->offsets = (int32_t *)malloc(table->size*sizeof(int32_t));
    /* malloc(0) may legitimately return NULL, so only a non-empty request can fail */
    if(table->offsets == NULL && table->size > 0) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    /* See how much memory we need */
    for(i = 0; i<table->size; i++) {
        table->offsets[i] = table->position+mainOffset;
        table->position += table->elements[i]->position;
        if(table->elements[i]->reversed->position > 0) {
            table->elements[i]->codePoints[0] = table->elements[i]->position; /* set offset for backwards table */
            table->position += table->elements[i]->reversed->position-1;
        }
    }

    /* Allocate it */
    if(table->CEs != NULL) {
        free(table->CEs);
    }
    table->CEs = (uint32_t *)malloc(table->position*sizeof(uint32_t));
    if(table->CEs == NULL && table->position > 0) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    if(table->CEs != NULL) {
        memset(table->CEs, '?', table->position*sizeof(uint32_t));
    }

    if(table->codePoints != NULL) {
        free(table->codePoints);
    }
    table->codePoints = (UChar *)malloc(table->position*sizeof(UChar));
    if(table->codePoints == NULL && table->position > 0) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    if(table->codePoints != NULL) {
        memset(table->codePoints, '?', table->position*sizeof(UChar));
    }

    /* Now stuff the things in*/
    UChar *cpPointer = table->codePoints;
    uint32_t *CEPointer = table->CEs;
    for(i = 0; i<table->size; i++) {
        int32_t size = table->elements[i]->position;
        memcpy(cpPointer, table->elements[i]->codePoints, size*sizeof(UChar));
        memcpy(CEPointer, table->elements[i]->CEs, size*sizeof(uint32_t));
        for(j = 0; j<size; j++) {
            if(isContraction(*(CEPointer+j))) {
                *(CEPointer+j) = constructContractCE(table->offsets[getContractOffset(*(CEPointer+j))]);
            }
        }
        cpPointer += size;
        CEPointer += size;

        /* the first reversed entry is skipped; only entries 1.. are copied */
        if(table->elements[i]->reversed->position-1 > 0) {
            int32_t size2 = table->elements[i]->reversed->position-1;
            memcpy(cpPointer, (table->elements[i]->reversed->codePoints)+1, size2*sizeof(UChar));
            memcpy(CEPointer, (table->elements[i]->reversed->CEs)+1, size2*sizeof(uint32_t));
            for(j = 0; j<size2; j++) {
                if(isContraction(*(CEPointer+j))) {
                    *(CEPointer+j) = constructContractCE(table->offsets[getContractOffset(*(CEPointer+j))]);
                }
            }
            cpPointer += size2;
            CEPointer += size2;
        }
    }

    /* finally fix up the contraction CEs stored in the main mapping */
    uint32_t CE;
    for(i = 0; i<=0xFFFF; i++) {
        CE = ucmp32_get(table->mapping, i);
        if(isContraction(CE)) {
            CE = constructContractCE(table->offsets[getContractOffset(CE)]);
            ucmp32_set(table->mapping, i, CE);
        }
    }

    return table->position;
}
/**
 * Releases a container created by uprv_cnttab_open(), including every
 * element, each element's reversed table, the element slot array and the
 * flattened output arrays. The mapping is not touched.
 *
 * @param table container to destroy; NULL is ignored
 */
void uprv_cnttab_close(CntTable *table) {
    int32_t i = 0;
    if(table == NULL) {
        return;
    }
    for(i = 0; i<table->size; i++) {
        ContractionTable *el = table->elements[i];
        if(el == NULL) {
            continue;
        }
        if(el->reversed != NULL) {
            free(el->reversed->CEs);
            free(el->reversed->codePoints);
            free(el->reversed);
        }
        free(el->CEs);
        free(el->codePoints);
        free(el);
    }
    /* the slot array itself was previously leaked */
    free(table->elements);
    free(table->CEs);
    free(table->offsets);
    free(table->codePoints);
    free(table);
}
/**
 * Overwrites the CE of the last entry in an element's forward or reversed
 * table (used for adding non-contractions). If the element does not exist, a
 * new one is created first.
 *
 * @param table   container
 * @param element element handle; only the low 24 bits are used, and 0xFFFFFF
 *                means "create a new element"
 * @param value   CE to store
 * @param forward TRUE selects the forward table, anything else the reversed one
 * @param status  error code; U_INDEX_OUTOFBOUNDS_ERROR if the selected table
 *                has no entries (previously this wrote to CEs[-1])
 * @return contraction CE built from the element handle, or 0 on error
 */
uint32_t uprv_cnttab_changeLastCE(CntTable *table, uint32_t element, uint32_t value, UBool forward, UErrorCode *status) {
    element &= 0xFFFFFF;
    ContractionTable *tbl = NULL;
    if(U_FAILURE(*status)) {
        return 0;
    }

    /* only indices below table->size refer to existing slots */
    if(element != 0xFFFFFF && element < (uint32_t)table->size) {
        tbl = table->elements[element];
    }
    if(tbl == NULL) {
        tbl = addATableElement(table, &element, status);
        if(tbl == NULL || U_FAILURE(*status)) {
            return 0;
        }
    }

    ContractionTable *target = (forward == TRUE) ? tbl : tbl->reversed;
    if(target->position <= 0) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    target->CEs[target->position-1] = value;

    return(constructContractCE(element));
}
/**
 * Inserts a (codePoint, value) pair into an element's table, keeping the
 * code points in ascending order: entries at and after the insertion point
 * are moved back by one. If the element does not exist, it is created.
 *
 * @param table     container
 * @param element   element handle; only the low 24 bits are used, and
 *                  0xFFFFFF means "create a new element"
 * @param codePoint code point of the new entry
 * @param value     CE of the new entry
 * @param forward   FALSE selects the reversed table, anything else the forward one
 * @param status    error code; set if creating or growing the table fails
 * @return contraction CE built from the element handle, or 0 on error
 */
uint32_t uprv_cnttab_insertContraction(CntTable *table, uint32_t element, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status) {
    element &= 0xFFFFFF;
    ContractionTable *tbl = NULL;
    if(U_FAILURE(*status)) {
        return 0;
    }

    /* only indices below table->size refer to existing slots */
    if(element != 0xFFFFFF && element < (uint32_t)table->size) {
        tbl = table->elements[element];
    }
    if(tbl == NULL) {
        tbl = addATableElement(table, &element, status);
        if(tbl == NULL || U_FAILURE(*status)) {
            return 0;
        }
    }

    if(forward == FALSE) {
        tbl = tbl->reversed;
    }

    uprv_growTable(tbl, status);
    if(U_FAILURE(*status)) {
        /* a failed grow leaves no free slot, so writing would overflow */
        return 0;
    }

    /* test the bound first so the array is never read past the valid entries */
    int32_t offset = 0;
    while(offset < tbl->position && tbl->codePoints[offset] < codePoint) {
        offset++;
    }

    int32_t i;
    for(i = tbl->position; i > offset; i--) {
        tbl->CEs[i] = tbl->CEs[i-1];
        tbl->codePoints[i] = tbl->codePoints[i-1];
    }

    tbl->CEs[offset] = value;
    tbl->codePoints[offset] = codePoint;
    tbl->position++;

    return(constructContractCE(element));
}
/**
 * Appends a (codePoint, value) pair at the end of an element's table. If the
 * element does not exist, it is created.
 *
 * @param table     container
 * @param element   element handle; only the low 24 bits are used, and
 *                  0xFFFFFF means "create a new element"
 * @param codePoint code point of the new entry
 * @param value     CE of the new entry
 * @param forward   FALSE selects the reversed table, anything else the forward one
 * @param status    error code; set if creating or growing the table fails
 * @return contraction CE built from the element handle, or 0 on error
 */
uint32_t uprv_cnttab_addContraction(CntTable *table, uint32_t element, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status) {
    element &= 0xFFFFFF;
    ContractionTable *tbl = NULL;
    if(U_FAILURE(*status)) {
        return 0;
    }

    /* only indices below table->size refer to existing slots */
    if(element != 0xFFFFFF && element < (uint32_t)table->size) {
        tbl = table->elements[element];
    }
    if(tbl == NULL) {
        tbl = addATableElement(table, &element, status);
        if(tbl == NULL || U_FAILURE(*status)) {
            return 0;
        }
    }

    if(forward == FALSE) {
        tbl = tbl->reversed;
    }

    uprv_growTable(tbl, status);
    if(U_FAILURE(*status)) {
        /* a failed grow leaves no free slot, so writing would overflow */
        return 0;
    }

    tbl->CEs[tbl->position] = value;
    tbl->codePoints[tbl->position] = codePoint;
    tbl->position++;

    return(constructContractCE(element));
}
/**
 * Overwrites the entry at a given offset of an element's table. If the
 * element does not exist, it is created. The table's entry count
 * (position) is not changed by this call.
 *
 * @param table     container
 * @param element   element handle; only the low 24 bits are used, and
 *                  0xFFFFFF means "create a new element"
 * @param offset    index of the entry to overwrite; must lie within the
 *                  allocated size of the selected table
 * @param codePoint code point to store
 * @param value     CE to store
 * @param forward   FALSE selects the reversed table, anything else the forward one
 * @param status    error code; U_INDEX_OUTOFBOUNDS_ERROR for a bad offset
 * @return contraction CE built from the element handle, or 0 on error
 */
uint32_t uprv_cnttab_setContraction(CntTable *table, uint32_t element, int32_t offset, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status) {
    element &= 0xFFFFFF;
    ContractionTable *tbl = NULL;
    if(U_FAILURE(*status)) {
        return 0;
    }

    /* only indices below table->size refer to existing slots */
    if(element != 0xFFFFFF && element < (uint32_t)table->size) {
        tbl = table->elements[element];
    }
    if(tbl == NULL) {
        tbl = addATableElement(table, &element, status);
        if(tbl == NULL || U_FAILURE(*status)) {
            return 0;
        }
    }

    if(forward == FALSE) {
        tbl = tbl->reversed;
    }

    /* negative offsets were previously accepted and indexed before the array */
    if(offset < 0 || offset >= tbl->size) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    tbl->CEs[offset] = value;
    tbl->codePoints[offset] = codePoint;

    return(constructContractCE(element));
}
/**
 * Looks up a code point in an element's table, which is scanned as an
 * ascending list.
 *
 * @param table     container
 * @param element   element handle; only the low 24 bits are used
 * @param codePoint code point to look for
 * @param forward   FALSE selects the reversed table, anything else the forward one
 * @param status    error code; a failure on entry makes the call a no-op
 * @return index of the matching entry, or 0 if the element does not exist or
 *         the code point is not present. A match at index 0 is therefore
 *         indistinguishable from "not found".
 */
uint32_t uprv_cnttab_findCP(CntTable *table, uint32_t element, UChar codePoint, UBool forward, UErrorCode *status) {
    element &= 0xFFFFFF;
    ContractionTable *tbl = NULL;
    if(U_FAILURE(*status)) {
        return 0;
    }

    /* only indices below table->size refer to existing slots */
    if(element != 0xFFFFFF && element < (uint32_t)table->size) {
        tbl = table->elements[element];
    }
    if(tbl == NULL) {
        return 0;
    }

    if(forward == FALSE) {
        tbl = tbl->reversed;
    }

    /* stop at the last valid entry; the old test (position > tbl->position)
       allowed one read past the filled part of the array */
    int32_t position = 0;
    while(position < tbl->position && codePoint > tbl->codePoints[position]) {
        position++;
    }

    if(position < tbl->position && codePoint == tbl->codePoints[position]) {
        return position;
    }
    return 0;
}
/**
 * Returns the CE stored at a given position of an element's table.
 *
 * @param table    container
 * @param element  element handle; only the low 24 bits are used
 * @param position index of the entry to read
 * @param forward  FALSE selects the reversed table, anything else the forward one
 * @param status   error code; a failure on entry yields UCOL_NOT_FOUND
 * @return the CE, or UCOL_NOT_FOUND if the element does not exist or the
 *         position is not below the number of filled entries
 */
uint32_t uprv_cnttab_getCE(CntTable *table, uint32_t element, uint32_t position, UBool forward, UErrorCode *status) {
    element &= 0xFFFFFF;
    ContractionTable *tbl = NULL;
    if(U_FAILURE(*status)) {
        return UCOL_NOT_FOUND;
    }

    /* only indices below table->size refer to existing slots */
    if(element != 0xFFFFFF && element < (uint32_t)table->size) {
        tbl = table->elements[element];
    }
    if(tbl == NULL) {
        return UCOL_NOT_FOUND;
    }

    if(forward == FALSE) {
        tbl = tbl->reversed;
    }

    /* valid entries are 0 .. position-1; the old test (>) accepted one too many */
    if(position >= (uint32_t)tbl->position) {
        return UCOL_NOT_FOUND;
    }
    return tbl->CEs[position];
}

View File

@ -0,0 +1,35 @@
#ifndef UCOL_CNTTABLE_H
#define UCOL_CNTTABLE_H
#include "uhash.h"
#include "UCAData.h"
/* Container for all contraction elements plus the flattened output arrays
   built from them by uprv_cnttab_constructTable(). */
typedef struct {
/* array of element slots; a NULL slot means "no element" */
ContractionTable **elements;
/* code point to CE map supplied to uprv_cnttab_open(); not copied */
CompactIntArray *mapping;
/* flattened code points of all elements (NULL until constructed) */
UChar *codePoints;
/* flattened CEs, parallel to codePoints (NULL until constructed) */
uint32_t *CEs;
/* start offset of each element in the flattened arrays, including mainOffset */
int32_t *offsets;
/* number of entries in the flattened arrays */
int32_t position;
/* number of element slots in use */
int32_t size;
/* number of element slots allocated */
int32_t capacity;
} CntTable;
/* creates an empty table that keeps a reference to the given mapping; returns 0 if status already indicates a failure */
CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status);
/* constructs the flattened table for output; returns the number of entries in it */
int32_t uprv_cnttab_constructTable(CntTable *table, uint32_t mainOffset, UErrorCode *status);
/* frees the table, its elements and its flattened arrays */
void uprv_cnttab_close(CntTable *table);
/* adds more contractions to the table. If the element is nonexistent, it creates one. Returns the element handle */
uint32_t uprv_cnttab_addContraction(CntTable *table, uint32_t element, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status);
/* sets a part of a contraction sequence in the table. If the element is nonexistent, it creates one. Returns the element handle */
uint32_t uprv_cnttab_setContraction(CntTable *table, uint32_t element, int32_t offset, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status);
/* inserts a part of a contraction sequence in the table. Sequences behind the offset are moved back. If the element is nonexistent, it creates one. Returns the element handle */
uint32_t uprv_cnttab_insertContraction(CntTable *table, uint32_t element, UChar codePoint, uint32_t value, UBool forward, UErrorCode *status);
/* this is for adding non-contractions: overwrites the CE of the last entry of an element */
uint32_t uprv_cnttab_changeLastCE(CntTable *table, uint32_t element, uint32_t value, UBool forward, UErrorCode *status);
/* returns the position of codePoint within the element, or 0 if it is not there */
uint32_t uprv_cnttab_findCP(CntTable *table, uint32_t element, UChar codePoint, UBool forward, UErrorCode *status);
/* returns the CE stored at the given position of the element, or UCOL_NOT_FOUND */
uint32_t uprv_cnttab_getCE(CntTable *table, uint32_t element, uint32_t position, UBool forward, UErrorCode *status);
#endif

View File

@ -0,0 +1,95 @@
.\" Hey, Emacs! This is -*-nroff-*- you know...
.\"
.\" genuca.8: manual page for the genuca utility
.\"
.\" Copyright (C) 2000 IBM, Inc. and others.
.\"
.TH GENUCA 8 "22 February 2001" "ICU MANPAGE" "ICU @VERSION@ Manual"
.SH NAME
.B genuca
\- create the UCA data table
.SH SYNOPSIS
.B genuca
[
.BR "\-V\fP, \fB\-\-version"
]
[
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
]
[
.BR "\-v\fP, \fB\-\-verbose"
]
[
.BI "\-s\fP, \fB\-\-sourcedir" " source"
]
[
.BI "\-d\fP, \fB\-\-destdir" " destination"
]
.IR bundle " \.\.\."
.SH DESCRIPTION
.B genuca
creates the UCA data table from the
.I bundle
source files passed on the command line, converting them to their binary form.
The resulting files have a
.B .res
extension while resource bundle source files typically have a
.B .txt
extension.
The
.I bundle
file name should be a local identifier, e.g.
.B ja_JP.txt
for Japanese (Japan) data, or
.B root.txt
for the root bundle.
.PP
These binary files can then be read directly by ICU, or used by
.BR pkgdata (8)
for incorporation into a larger archive or library.
.SH OPTIONS
.TP
.BR \-V\fP, \fB\-\-version
Print the version of
.B genuca
and exit.
.TP
.BR \-h\fP, \fB\-?\fP, \fB\-\-help
Print help about usage and exit.
.TP
.BR \-v\fP, \fB\-\-verbose
Display extra informative messages during execution.
.TP
.BI "\-s\fP, \fB\-\-sourcedir" " source"
Set the source directory to
.IR source .
The default source directory is specified by the environment variable
.BR ICU_DATA .
.TP
.BI "\-d\fP, \fB\-\-destdir" " destination"
Set the destination directory to
.IR destination .
The default destination directory is specified by the environment variable
.BR ICU_DATA .
.SH INVARIANT CHARACTERS
The
.B invariant character set
consists of the following set of characters, expressed as a standard POSIX
regular expression:
.BR "[a-z]|[A-Z]|[0-9]|_| |+|-|*|/" .
This is the set which is guaranteed to be available regardless of code page.
.SH ENVIRONMENT
.TP 10
.B ICU_DATA
Specifies the directory containing ICU data. Defaults to
.BR @thedatadir@/icu/@VERSION@/ .
Some tools in ICU depend on the presence of the trailing slash. It is thus
important to make sure that it is present if
.B ICU_DATA
is set.
.SH VERSION
@VERSION@
.SH COPYRIGHT
Copyright (C) 2001 IBM, Inc. and others.
.SH SEE ALSO
.BR pkgdata (8)

View File

@ -0,0 +1,956 @@
#include "UCAData.h"
#include "cnttable.h"
#include <stdlib.h>
/* expansion CEs collected by addExpansion(); the array is allocated on first use */
ExpansionTable expansions;
/* contraction tables filled by processContraction(); not initialized in the part of the file shown here */
CntTable *contractions;
/* code point to CE map that addAnElement() reads and updates */
CompactIntArray *mapping = NULL;
/*UHashtable *elements = NULL;*/
/* single scratch element that readAnElement() fills in and returns a pointer to */
UCAElements le;
/**
 * Deleter hook for elements. It currently releases nothing: the freeing code
 * that used to be here was disabled, so the function is a no-op.
 *
 * @param element element to dispose of; ignored
 */
void deleteElement(void *element) {
    (void)element; /* intentionally unused */
}
int32_t readElement(char **from, char *to, char separator, UErrorCode *status) {
if(U_FAILURE(*status)) {
return 0;
}
char buffer[1024];
int32_t i = 0;
while(**from != separator) {
if(**from != ' ') {
*(buffer+i++) = **from;
}
(*from)++;
}
(*from)++;
*(buffer + i) = 0;
//*to = (char *)malloc(strlen(buffer)+1);
strcpy(to, buffer);
return i/2;
}
/**
 * Builds one collation element from hexadecimal weight strings.
 *
 * Only the first 4 characters of `primary` and the first 2 of `secondary`
 * and `tertiary` are used. Longer strings are cut temporarily by writing a
 * NUL and are restored before returning. strtoul() gets no end pointer: it
 * used to overwrite the pointers that mark the cut position, so the restore
 * could hit the wrong byte when parsing stopped early.
 *
 * @param primary   hex string of the primary weight (may be empty)
 * @param secondary hex string of the secondary weight (may be empty)
 * @param tertiary  hex string of the tertiary weight (may be empty)
 * @param caseBit   TRUE sets bit 0x40 of the result, anything else clears it
 * @param status    error code; a failure on entry makes the call return 0
 * @return the assembled CE; bit 0x80 is always cleared
 */
uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UBool caseBit, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return 0;
    }
    uint32_t value = 0;
    char primsave = '\0';
    char secsave = '\0';
    char tersave = '\0';

    /* fixed cut positions; these pointers must not change until the restore below */
    char *primend = primary+4;
    if(strlen(primary) > 4) {
        primsave = *primend;
        *primend = '\0';
    }
    char *secend = secondary+2;
    if(strlen(secondary) > 2) {
        secsave = *secend;
        *secend = '\0';
    }
    char *terend = tertiary+2;
    if(strlen(tertiary) > 2) {
        tersave = *terend;
        *terend = '\0';
    }

    uint32_t primvalue = (*primary!='\0')?strtoul(primary, NULL, 16):0;
    uint32_t secvalue = (*secondary!='\0')?strtoul(secondary, NULL, 16):0;
    uint32_t tervalue = (*tertiary!='\0')?strtoul(tertiary, NULL, 16):0;

    /* a primary that fits in one byte is moved to the high byte of the 16-bit field */
    if(primvalue <= 0xFF) {
        primvalue <<= 8;
    }

    value = ((primvalue<<UCOL_PRIMARYORDERSHIFT)&UCOL_PRIMARYORDERMASK)|
            ((secvalue<<UCOL_SECONDARYORDERSHIFT)&UCOL_SECONDARYORDERMASK)|
            (tervalue&UCOL_TERTIARYORDERMASK);

    // This CE is not special at all... a very uninteresting one...
    value &= 0xFFFFFF7F;

    // Here's case handling!
    if(caseBit == TRUE) {
        value |= 0x40; // 0100 0000 set case bit
    } else {
        value &= 0xFFFFFFBF; // ... 1011 1111 (reset case bit)
    }

    /* undo the temporary truncation */
    if(primsave!='\0') {
        *primend = primsave;
    }
    if(secsave!='\0') {
        *secend = secsave;
    }
    if(tersave!='\0') {
        *terend = tersave;
    }
    return value;
}
/**
 * Makes a byte-for-byte heap copy of an element.
 *
 * NOTE(review): the copy is shallow. readAnElement() points cPoints at the
 * element's own uchars member, so the copy's cPoints presumably still refers
 * to the original's storage -- confirm before using a copy after the
 * original has been modified.
 *
 * @param that element to copy
 * @return the copy, to be released with releaseUCACopy(), or NULL if the
 *         allocation failed
 */
UCAElements *copyUCAElement(UCAElements *that) {
    UCAElements *r = (UCAElements *)malloc(sizeof(*that));
    if(r == NULL) {
        return NULL;
    }
    memcpy(r, that, sizeof(*that));
    return r;
}
/**
 * Frees an element that was allocated with copyUCAElement().
 *
 * @param r copy to release; NULL is harmless
 */
void releaseUCACopy(UCAElements *r) {
    if(r != NULL) {
        free(r);
    }
}
/*
 * Recursively records the contraction held in `element` in the global
 * `contractions` table, consuming one code point per recursion level.
 *
 * element    - contraction being added; cPoints/cSize are advanced for the
 *              recursive call and put back afterwards
 * existingCE - value currently stored for the first code point (from the
 *              mapping or from an enclosing contraction table)
 * forward    - passed through to the uprv_cnttab_* calls to select the
 *              forward or reversed table
 * status     - error code; a failure on entry returns UCOL_NOT_FOUND
 *
 * Returns element->mapCE when a single code point is left, a newly built
 * contraction CE when a new sequence was started, or existingCE when an
 * existing contraction was extended.
 */
uint32_t processContraction(UCAElements *element, uint32_t existingCE, UBool forward, UErrorCode *status) {
if(U_FAILURE(*status)) {
return UCOL_NOT_FOUND;
}
/* NOTE(review): i and gotContractionOffset are never used below */
int32_t i = 0;
UBool gotContractionOffset = FALSE;
int32_t firstContractionOffset = 0;
int32_t contractionOffset = 0;
uint32_t contractionElement = UCOL_NOT_FOUND;
/* end of recursion */
if(element->cSize == 1) {
return element->mapCE;
}
/* this recursion currently feeds on the only element we have... We will have to copy it in order to accommodate */
/* for both backward and forward cycles */
/* we encountered either an empty space or a non-contraction element */
/* this means we are constructing a new contraction sequence */
if(existingCE == UCOL_NOT_FOUND || !isContraction(existingCE)) {
/* if it wasn't contraction, we wouldn't end up here*/
/* -1 is masked to 0xFFFFFF by the callee, which means "create a new element"; */
/* the first entry gets code point 0 and the existing CE */
firstContractionOffset = uprv_cnttab_addContraction(contractions, -1, 0, existingCE, forward, status);
if(forward == FALSE) {
/* for a backward contraction, also give the forward table its entries for code points 0 and 0xFFFF */
uprv_cnttab_addContraction(contractions, firstContractionOffset, 0, existingCE, TRUE, status);
uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, TRUE, status);
}
UChar toAdd = element->cPoints[1];
/* step to the next code point for the recursive call, then undo the step */
element->cPoints++;
element->cSize--;
uint32_t newCE = processContraction(element, UCOL_NOT_FOUND, forward, status);
element->cPoints--;
element->cSize++;
/* the second code point maps to the CE of the rest; 0xFFFF is added last and maps back to existingCE */
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, toAdd, newCE, forward, status);
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, forward, status);
contractionElement = constructContractCE(firstContractionOffset);
return contractionElement;
} else { /* we are adding to existing contraction */
/* there were already some elements in the table, so we need to add a new contraction */
/* Two things can happen here: either the codepoint is already in the table, or it is not */
uint32_t position = uprv_cnttab_findCP(contractions, existingCE, *(element->cPoints+1), forward, status);
element->cPoints++;
element->cSize--;
/* findCP returns 0 both for "not found" and for a match at index 0 */
if(position != 0) { /* if it is we just continue down the chain */
uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, forward, status);
uint32_t newCE = processContraction(element, eCE, forward, status);
uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, forward, status);
} else { /* if it isn't, we will have to create a new sequence */
uint32_t newCE = processContraction(element, UCOL_NOT_FOUND, forward, status);
uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, forward, status);
}
element->cPoints--;
element->cSize++;
return existingCE;
}
}
/**
 * Appends one CE to the global expansion table, allocating the table on
 * first use and doubling it when it is full.
 *
 * @param value  CE to append
 * @param status error code; U_MEMORY_ALLOCATION_ERROR if memory runs out
 * @return index at which the value was stored; 0 if status already indicated
 *         a failure on entry, -1 if an allocation failed
 */
int32_t addExpansion(uint32_t value, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return 0;
    }
    if(expansions.CEs == NULL) {
        expansions.CEs = (uint32_t *)malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t));
        if(expansions.CEs == NULL) {
            /* same reporting as the realloc failure below */
            fprintf(stderr, "out of memory for expansions\n");
            *status = U_MEMORY_ALLOCATION_ERROR;
            return -1;
        }
        expansions.size = INIT_EXP_TABLE_SIZE;
        expansions.position = 0;
    }

    if(expansions.position == expansions.size) {
        uint32_t *newData = (uint32_t *)realloc(expansions.CEs, 2*expansions.size*sizeof(uint32_t));
        if(newData == NULL) {
            fprintf(stderr, "out of memory for expansions\n");
            *status = U_MEMORY_ALLOCATION_ERROR;
            return -1;
        }
        expansions.CEs = newData;
        expansions.size *= 2;
    }

    expansions.CEs[expansions.position] = value;
    return(expansions.position++);
}
/* inverse table rows: [0] first CE, [1] continuation CE (if any), */
/* [2] either a single code point or (size << UCOL_INV_SHIFTVALUE) | offset into stringContinue */
uint32_t inverseTable[0xFFFF][3];
/* index of the last row written; addNewInverse() increments it before writing */
uint32_t inversePos = 0;
/*UChar *stringContinue[0xFFFF];*/
/* storage for code point sequences referenced from column [2] of inverseTable */
UChar stringContinue[0xFFFF];
/* NOTE(review): not referenced in the part of the file shown here */
uint32_t stringContSize[0xFFFF];
/* next free index in stringContinue */
uint32_t sContPos = 0;
/* NOTE(review): addToExistingInverse() declares a local with the same name, which hides this one there */
uint32_t contSize = 0;
/* column [2] layout: top 12 bits hold a size, low 20 bits an offset */
#define UCOL_INV_SIZEMASK 0xFFF00000
#define UCOL_INV_OFFSETMASK 0x000FFFFF
#define UCOL_INV_SHIFTVALUE 20
/*
 * Starts a new row in inverseTable for the given element.
 *
 * element - source of the CEs and code points
 * status  - not read or written here
 *
 * NOTE(review): neither inversePos nor sContPos is checked against the size
 * of its array before being used as an index.
 */
void addNewInverse(UCAElements *element, UErrorCode *status) {
/* NOTE(review): CEs[1] is read here without first checking noOfCEs > 1 */
if(isContinuation(element->CEs[1])) {
fprintf(stderr, "+");
}
/* pre-increment: the new row goes to inversePos+1, so row 0 is never written by this function */
inversePos++;
inverseTable[inversePos][0] = element->CEs[0];
if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
inverseTable[inversePos][1] = element->CEs[1];
}
if(element->cSize < 2) {
/* a single code point is stored directly in column [2] */
inverseTable[inversePos][2] = element->cPoints[0];
} else { /* add a new store of cruft */
/* size field is cSize+1: only cSize code points are copied, but one more slot is reserved after them */
inverseTable[inversePos][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos;
memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
sContPos += element->cSize+1;
}
}
/*
 * Adds the element's code points to an existing inverseTable row.
 *
 * element  - source of the code points
 * position - row of inverseTable to extend
 * status   - not read or written here
 *
 * Within stringContinue, 0xFFFF is written between sequences and 0xFFFE
 * after the last one.
 */
void addToExistingInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
if((inverseTable[position][2] & UCOL_INV_SIZEMASK) == 0) { /* single element, have to make new extension place and put both guys there */
/* move the directly stored code point into stringContinue, then append the new sequence */
stringContinue[sContPos] = inverseTable[position][2];
inverseTable[position][2] = ((element->cSize+3) << UCOL_INV_SHIFTVALUE) | sContPos;
sContPos++;
stringContinue[sContPos++] = 0xFFFF;
memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
sContPos += element->cSize;
stringContinue[sContPos++] = 0xFFFE;
} else { /* adding to the already existing continuing table */
uint32_t contIndex = inverseTable[position][2] & UCOL_INV_OFFSETMASK;
/* this local hides the file-level variable of the same name */
uint32_t contSize = (inverseTable[position][2] & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE;
if(contIndex+contSize < sContPos) {
/* other data follows this row's block, so it is shifted back to make room */
/* NOTE(review): only cSize+1 UChars are moved, not everything up to sContPos, and */
/* the offsets stored in other rows are not adjusted -- confirm this is intended */
/*fprintf(stderr, ".", sContPos, contIndex+contSize);*/
memcpy(stringContinue+contIndex+contSize+element->cSize+1, stringContinue+contIndex+contSize, (element->cSize+1)*sizeof(UChar));
}
/* overwrite the slot at the end of the block with a 0xFFFF separator */
stringContinue[contIndex+contSize-1] = 0xFFFF;
memcpy(stringContinue+contIndex+contSize, element->cPoints, element->cSize*sizeof(UChar));
sContPos += element->cSize+1;
stringContinue[contIndex+contSize+element->cSize] = 0xFFFE;
inverseTable[position][2] = ((contSize+element->cSize+1) << UCOL_INV_SHIFTVALUE) | contIndex;
}
}
/*
 * Records the element in the inverse table, either as a new row or as an
 * addition to an existing one, by comparing its first CE with the first CE
 * of the last row.
 *
 * element - element to record
 * status  - passed through to the helpers
 *
 * Returns the current value of inversePos.
 */
uint32_t addToInverse(UCAElements *element, UErrorCode *status) {
if(inverseTable[inversePos][0] > element->CEs[0]) {
/* the element sorts before the last row */
uint32_t position = inversePos;
/* NOTE(review): the while has no ';' of its own, so the call on the next line is its body: */
/* the element is added to every earlier row whose CE is still greater, and not to the row */
/* at which the loop stops. A "find the row, then add once" form may have been intended. */
/* position is unsigned and is not checked against 0 before the decrement. */
while(inverseTable[--position][0] > element->CEs[0])
addToExistingInverse(element, position, status);
} else if(inverseTable[inversePos][0] == element->CEs[0]) {
if(element->noOfCEs > 1 && isContinuation(element->CEs[1])
&& inverseTable[inversePos][1] != element->CEs[1]) {
/* also, we should do long primaries here */
addNewInverse(element, status);
} else {
addToExistingInverse(element, inversePos, status);
}
} else {
addNewInverse(element, status);
}
return inversePos;
}
/**
 * Packs the inverse table and its continuation strings into one heap block
 * (header, table rows, continuation UChars) and writes that block out through
 * the udata API.
 *
 * A terminator row (0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF) is appended to
 * inverseTable first, and inversePos ends up as the total row count.
 *
 * @param status error code; U_INDEX_OUTOFBOUNDS_ERROR if the terminator row
 *               does not fit, U_MEMORY_ALLOCATION_ERROR if the block cannot
 *               be allocated, otherwise whatever the udata calls report
 * @return the packed block (the caller frees it), or NULL on failure
 */
InverseTableHeader *assembleInverseTable(UErrorCode *status) {
    InverseTableHeader *result = NULL;

    /* rows 0..inversePos are in use and one more is needed for the terminator */
    const uint32_t maxRows = (uint32_t)(sizeof(inverseTable)/sizeof(inverseTable[0]));
    if(inversePos + 2 > maxRows) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return NULL;
    }

    uint32_t headerByteSize = paddedsize(sizeof(InverseTableHeader));
    uint32_t inverseTableByteSize = (inversePos+2)*sizeof(uint32_t)*3;
    uint32_t contsByteSize = sContPos * sizeof(UChar);

    result = (InverseTableHeader *)malloc(headerByteSize + inverseTableByteSize + contsByteSize);
    if(result == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    result->byteSize = headerByteSize + inverseTableByteSize + contsByteSize;

    /* terminator row */
    inversePos++;
    inverseTable[inversePos][0] = 0xFFFFFFFF;
    inverseTable[inversePos][1] = 0xFFFFFFFF;
    inverseTable[inversePos][2] = 0x0000FFFF;
    inversePos++;

    result->tableSize = inversePos;
    result->contsSize = sContPos;

    /* both sections are stored as byte offsets from the start of the block */
    result->table = headerByteSize;
    result->conts = headerByteSize + inverseTableByteSize;

    memcpy((uint8_t *)result + result->table, inverseTable, inverseTableByteSize);
    memcpy((uint8_t *)result + result->conts, stringContinue, contsByteSize);

    {
        UNewDataMemory *pData;

        pData=udata_create(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, &invDataInfo,
                           U_COPYRIGHT_STRING, status);
        if(U_FAILURE(*status)) {
            fprintf(stderr, "Error: unable to create data memory, error %d\n", *status);
            free(result);
            return NULL;
        }

        /* write the data to the file */
        fprintf(stdout, "Writing out inverse table\n");
        udata_writeBlock(pData, result, result->byteSize);

        /* finish up */
        udata_finish(pData, status);
        if(U_FAILURE(*status)) {
            fprintf(stderr, "Error: error %d writing the output file\n", *status);
            free(result);
            return NULL;
        }
    }

    return result;
}
/*
 * Adds a parsed element to the build tables, testing for existing entries.
 *
 * First computes element->mapCE: the single CE itself, a Thai special CE, or
 * an expansion CE whose CEs are appended to the expansion table. Then either
 * records a contraction (cSize > 1, forward and reversed) or stores mapCE for
 * the single code point.
 *
 * element - element to add; mapCE is written, and the element is reversed in
 *           place by reverseElement() on the contraction path
 * status  - error code; a failure on entry returns 0xFFFF
 *
 * Returns the value that was read from the mapping (the entry as it was
 * before this call wrote to it), not the CE that was stored.
 */
uint32_t addAnElement(UCAElements *element, UErrorCode *status) {
uint32_t i = 1, expansion = 0;
if(U_FAILURE(*status)) {
return 0xFFFF;
}
if(element->noOfCEs == 1) {
if(element->isThai == FALSE) {
element->mapCE = element->CEs[0];
} else { /* add thai - totally bad here */
/* offset of the stored CE in 32-bit units, counted past the padded table header, shifted left by 4; the low bits hold the count 1 */
expansion = UCOL_SPECIAL_FLAG | (THAI_TAG<<UCOL_TAG_SHIFT)
| ((addExpansion(element->CEs[0], status)+(paddedsize(sizeof(UCATableHeader))>>2))<<4)
| 0x1;
element->mapCE = expansion;
}
} else {
/* '&' binds tighter than '|', so the 0xFFFFF0 mask applies only to the shifted offset term */
expansion = UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
| ((addExpansion(element->CEs[0], status)+(paddedsize(sizeof(UCATableHeader))>>2))<<4)
& 0xFFFFF0;
for(i = 1; i<element->noOfCEs; i++) {
addExpansion(element->CEs[i], status);
}
/* up to 15 CEs: the count goes into the low 4 bits; otherwise a 0 CE is appended after the CEs instead */
if(element->noOfCEs <= 0xF) {
expansion |= element->noOfCEs;
} else {
addExpansion(0, status);
}
element->mapCE = expansion;
}
uint32_t CE = ucmp32_get(mapping, element->cPoints[0]);
if(element->cSize > 1) { /* we're adding a contraction */
/* and we need to deal with it */
/* we could already have something in table - or we might not */
/* The fact is that we want to add or modify an existing contraction */
/* and add it backwards then */
uint32_t result = processContraction(element, CE, TRUE, status);
/* the mapping is only updated when the code point did not already map to a contraction */
if(CE == UCOL_NOT_FOUND || !isContraction(CE)) {
ucmp32_set(mapping, element->cPoints[0], result);
}
/* add the reverse order */
reverseElement(element);
CE = ucmp32_get(mapping, element->cPoints[0]);
result = processContraction(element, CE, FALSE, status);
if(CE == UCOL_NOT_FOUND || !isContraction(CE)) {
ucmp32_set(mapping, element->cPoints[0], result);
}
} else { /* easy case, */
if( CE != UCOL_NOT_FOUND) {
if(isContraction(CE)) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */
/* entry 0 of the forward table gets code point 0 and the new CE */
uprv_cnttab_setContraction(contractions, CE, 0, 0, element->mapCE, TRUE, status);
/* This loop has to change the CE at the end of contraction REDO!*/
uprv_cnttab_changeLastCE(contractions, CE, element->mapCE, TRUE, status);
} else {
fprintf(stderr, "Fatal error - trying to overwrite already existing data for codepoint %04X\n", element->cPoints[0]);
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
} else {
ucmp32_set(mapping, element->cPoints[0], element->mapCE);
}
}
return CE;
}
/**
 * Converts one hexadecimal digit character to its numeric value.
 *
 * @param hex character to convert; '0'-'9', 'a'-'f' and 'A'-'F' are recognised
 * @return 0..15 for a hex digit, 0 for any other character
 */
int32_t hex2num(char hex) {
    int32_t digit = 0;

    if(hex >= 'A' && hex <= 'F') {
        digit = 10 + (hex - 'A');
    } else if(hex >= 'a' && hex <= 'f') {
        digit = 10 + (hex - 'a');
    } else if(hex >= '0' && hex <= '9') {
        digit = hex - '0';
    }

    return digit;
}
/* Here's the fun part: */
/* Normal CE produced by getSingleCEValue | 16P | 8S |0|C| 6T | */
/* Continuation CE produced by processContinuation | 16P | 8S |1|0| 6T | */
/* Long primary, produced by ???? | 24P |1|1| 6S | */
/**
 * Lays out the final UCA data image in one heap block and fills in its header.
 *
 * Sections, in write order (each recorded in the header as a byte offset):
 *   header | expansions | contraction code points | contraction CEs |
 *   flattened mapping | Latin-1 fast table (0x100 CEs)
 *
 * The allocation size is the sum of exactly the sizes by which the write
 * cursor advances below. The earlier formula padded the two contraction
 * arrays as one combined size, which only matches the writes for some
 * padding granularities.
 *
 * NOTE(review): the memory stream `ms` is never released in this function.
 *
 * @param variableTopValue value stored in the header's variableTopValue field
 * @param status error code; U_MEMORY_ALLOCATION_ERROR if the block cannot be
 *               allocated, U_INTERNAL_PROGRAM_ERROR if the bytes written do
 *               not match the computed size
 * @return the assembled image, or NULL on failure
 */
UCATableHeader *assembleTable(UChar variableTopValue, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return NULL;
    }

    /* contractions start after header and expansions; the offset is in UChar units */
    uint32_t beforeContractions = (paddedsize(sizeof(UCATableHeader))+paddedsize(expansions.position*sizeof(uint32_t)))/sizeof(UChar);

    int32_t contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status);
    if(U_FAILURE(*status)) {
        /* the flattened contraction arrays are not usable */
        return NULL;
    }

    ucmp32_compact(mapping, 1);
    UMemoryStream *ms = uprv_mstrm_openNew(8192);
    int32_t mappingSize = ucmp32_flattenMem(mapping, ms);
    const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);

    uint32_t tableOffset = 0;
    uint8_t *dataStart;

    /* one term per section, matching the tableOffset increments below */
    int32_t toAllocate = paddedsize(sizeof(UCATableHeader))
                       + paddedsize(expansions.position*sizeof(uint32_t))
                       + paddedsize(contractionsSize*sizeof(UChar))
                       + paddedsize(contractionsSize*sizeof(uint32_t))
                       + paddedsize(mappingSize)
                       + 0x100*sizeof(uint32_t);

    dataStart = (uint8_t *)malloc(toAllocate);
    if(dataStart == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    UCATableHeader *myData = (UCATableHeader *)dataStart;

    /* Stuff everything with @ so that padding bytes are recognisable */
    memset(dataStart, '@', toAllocate);
    memset(dataStart+tableOffset, 0, sizeof(UCATableHeader));
    tableOffset += paddedsize(sizeof(UCATableHeader));

    /* copy expansions */
    myData->expansion = tableOffset;
    memcpy(dataStart+tableOffset, expansions.CEs, expansions.position*sizeof(uint32_t));
    tableOffset += paddedsize(expansions.position*sizeof(uint32_t));

    /* contractions block */
    /* copy contraction index */
    myData->contractionIndex = tableOffset;
    memcpy(dataStart+tableOffset, contractions->codePoints, contractionsSize*sizeof(UChar));
    tableOffset += paddedsize(contractionsSize*sizeof(UChar));

    /* copy contraction collation elements */
    myData->contractionCEs = tableOffset;
    memcpy(dataStart+tableOffset, contractions->CEs, contractionsSize*sizeof(uint32_t));
    tableOffset += paddedsize(contractionsSize*sizeof(uint32_t));

    /* copy mapping table */
    myData->mappingPosition = tableOffset;
    memcpy(dataStart+tableOffset, flattened, mappingSize);
    tableOffset += paddedsize(mappingSize);

    /* construct the fast tracker for latin one*/
    myData->latinOneMapping = tableOffset;
    uint32_t *store = (uint32_t*)(dataStart+tableOffset);
    int32_t i = 0;
    for(i = 0; i<=0xFF; i++) {
        *(store++) = ucmp32_get(mapping,i);
        tableOffset+=sizeof(uint32_t);
    }

    if(tableOffset != (uint32_t)toAllocate) {
        fprintf(stderr, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset);
        *status = U_INTERNAL_PROGRAM_ERROR;
        free(dataStart);
        return NULL;
    }

    myData->size = tableOffset;

    myData->variableTopValue = variableTopValue;
    myData->strength = UCOL_TERTIARY;
    myData->frenchCollation = UCOL_OFF;
    myData->alternateHandling = UCOL_SHIFTED; /* attribute for handling variable elements*/
    myData->caseFirst = UCOL_LOWER_FIRST;     /* who goes first, lower case or uppercase */
    myData->caseLevel = UCOL_OFF;             /* do we have an extra case level */
    myData->normalizationMode = UCOL_ON;      /* attribute for normalization */

    /* This should happen upon resurrection: reopen the mapping from the copy inside the image */
    const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;
    myData->mapping = ucmp32_openFromData(&mapPosition, status);

    return myData;
}
/*
 * Placeholder for whole-file processing. No work is implemented: the
 * function inspects the incoming error code and returns, leaving both
 * `data` and `*status` untouched.
 */
void processFile(FILE *data, UErrorCode *status) {
    (void)data;     /* not consumed yet */
    if(U_FAILURE(*status)) {
        return;     /* a previous step already failed */
    }
    /* nothing to do */
}
/*
 * Reads the next line of the fractional UCA data file and parses it into the
 * file-level scratch element `le`.
 *
 * Line layout, as parsed below:
 *   <hex cp>[ <hex cp>...];<field containing 'L' or 'S'>;[P, S, T][P, S, T]... # comment
 * A line starting with '[' is only a variable-top marker; a line starting
 * with '#' or consisting of a bare newline is skipped. Only spaces may appear
 * between the last ']' and the comment (or the end of the string).
 *
 * Parameters:
 *   data   - open input stream positioned at the start of a line
 *   status - ICU error code; nothing is read when it already reports failure
 *
 * Returns:
 *   NULL for skipped lines, at end of file, and on error (with *status set);
 *   otherwise a pointer to `le`. That storage is reused by the next call, so
 *   the caller must not free it. For a variable-top marker only the
 *   `variableTop` member is updated.
 */
UCAElements *readAnElement(FILE *data, UErrorCode *status) {
    char buffer[2048], primary[100], secondary[100], tertiary[100];
    int32_t i = 0;
    unsigned int scanned = 0;       /* "%X" stores an unsigned int, which is wider than a UChar */
    char *pointer = NULL;
    char *commentStart = NULL;
    char *startCodePoint = NULL;
    char *endCodePoint = NULL;
    char *spacePointer = NULL;
    char *result = NULL;
    UCAElements *element = &le;     /* shared scratch element, never heap allocated */
    const int32_t maxCodePoints = (int32_t)(sizeof(element->uchars)/sizeof(element->uchars[0]));
    const int32_t maxGroups = (int32_t)(sizeof(element->sizePrim)/sizeof(element->sizePrim[0]));
    const uint32_t maxCEs = (uint32_t)(sizeof(element->CEs)/sizeof(element->CEs[0]));

    if(U_FAILURE(*status)) {
        return NULL;
    }

    result = fgets(buffer, sizeof(buffer), data);
    *primary = *secondary = *tertiary = '\0';

    if(result == NULL) {
        if(!feof(data)) {
            fprintf(stderr, "empty line but no EOF!\n");
            *status = U_INVALID_FORMAT_ERROR;
        }
        return NULL;
    }

    if(buffer[0] == '#' || buffer[0] == '\n') {
        return NULL; // just a comment, skip whole line
    }

    if(buffer[0] == '[') {
        /* variable-top marker: nothing else on the line is parsed */
        element->variableTop = TRUE;
        return element;
    }
    element->variableTop = FALSE;

    /* ---- field 1: code point sequence, terminated by ';' ---- */
    endCodePoint = strchr(buffer, ';');
    if(endCodePoint == NULL) {
        fprintf(stderr, "error - line with no code point!\n");
        *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
        return NULL;
    }
    *endCodePoint = 0;  /* cut the buffer so that strchr/sscanf stay inside field 1 */

    memset(element, 0, sizeof(*element));
    element->cPoints = element->uchars;

    spacePointer = strchr(buffer, ' ');
    scanned = 0;
    sscanf(buffer, "%04X", &scanned); /* read first code point */
    element->cPoints[0] = (UChar)scanned;
    element->codepoint = element->cPoints[0];

    /* every further space introduces one more code point (a contraction) */
    i = 1;
    while(spacePointer != NULL) {
        if(i >= maxCodePoints) {
            *status = U_INVALID_FORMAT_ERROR;
            return NULL;
        }
        scanned = 0;
        sscanf(spacePointer+1, "%04X", &scanned);
        element->cPoints[i++] = (UChar)scanned;
        spacePointer = strchr(spacePointer+1, ' ');
    }
    element->cSize = i;

    /* ---- field 2: the first 'L' or 'S' decides the case bit ---- */
    startCodePoint = endCodePoint+1;
    endCodePoint = strchr(startCodePoint, ';');
    if(endCodePoint == NULL) {
        *status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    while(startCodePoint < endCodePoint && *startCodePoint != 'L' && *startCodePoint != 'S') {
        startCodePoint++;
    }
    if(startCodePoint == endCodePoint) {
        *status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    if(*startCodePoint == 'S') {
        element->caseBit = FALSE;
    } else {
        element->caseBit = TRUE;
    }

    /* ---- field 3: bracketed collation elements up to the comment ---- */
    startCodePoint = endCodePoint+1;
    commentStart = strchr(startCodePoint, '#');
    if(commentStart == NULL) {
        commentStart = strlen(startCodePoint) + startCodePoint;
    }

    i = 0;
    uint32_t CEindex = 0;
    element->noOfCEs = 0;
    for(;;) {
        endCodePoint = strchr(startCodePoint, ']');
        if(endCodePoint == NULL || endCodePoint >= commentStart) {
            break;
        }
        pointer = strchr(startCodePoint, '[');
        if(pointer == NULL || pointer > endCodePoint || i >= maxGroups || CEindex >= maxCEs) {
            /* unbalanced brackets, or more data than the element can hold */
            *status = U_INVALID_FORMAT_ERROR;
            return NULL;
        }
        pointer++;

        element->sizePrim[i]=readElement(&pointer, primary, ',', status);
        element->sizeSec[i]=readElement(&pointer, secondary, ',', status);
        element->sizeTer[i]=readElement(&pointer, tertiary, ']', status);

        /* I want to get the CEs entered right here, including continuation. */
        /* Long primaries are deliberately not generated: every group becomes */
        /* one regular CE followed by as many continuation CEs as needed.     */
        element->CEs[CEindex++] = getSingleCEValue(primary, secondary, tertiary, element->caseBit, status);

        uint32_t CEi = 1;
        while(2*CEi<element->sizePrim[i] || CEi<element->sizeSec[i] || CEi<element->sizeTer[i]) {
            if(CEindex >= maxCEs) {
                *status = U_INVALID_FORMAT_ERROR;
                return NULL;
            }
            uint32_t value = 0x80; /* Continuation marker */
            if(2*CEi<element->sizePrim[i]) {
                value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
                value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24);
            }
            if(2*CEi+1<element->sizePrim[i]) {
                value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20);
                value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16);
            }
            if(CEi<element->sizeSec[i]) {
                value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12);
                value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8);
            }
            if(CEi<element->sizeTer[i]) {
                value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4);
                value |= (hex2num(*(tertiary+2*CEi+1))&0xF);
            }
            CEi++;
            element->CEs[CEindex++] = value;
        }

        /* warn when the last two hex digits of the tertiary do not fit in 6 bits */
        size_t terLen = strlen(tertiary);
        uint32_t terValue = strtoul(terLen >= 2 ? tertiary+terLen-2 : tertiary, NULL, 16);
        if(terValue > 0x3F) {
            fprintf(stderr, "Tertiary value %02X too big for %04X\n", terValue, element->codepoint);
        }

        startCodePoint = endCodePoint+1;
        i++;
    }
    element->noOfCEs = CEindex;
    element->isThai = UCOL_ISTHAIPREVOWEL(element->codepoint);

    // we don't want any strange stuff after useful data!
    // (pointer stays NULL when the line carried no collation element at all)
    if(pointer != NULL) {
        while(pointer < commentStart) {
            if(*pointer != ' ') {
                *status=U_INVALID_FORMAT_ERROR;
                break;
            }
            pointer++;
        }
    }

    if(U_FAILURE(*status)) {
        /* The message text is historical; no hash table is involved any more. */
        fprintf(stderr, "problem putting stuff in hash table\n");
        *status = U_INTERNAL_PROGRAM_ERROR;
        /* element points at static storage, so there is nothing to free */
        return NULL;
    }
    return element;
}
/*
 * Reverses an element in place.
 *
 * The code point sequence is mirrored and `codepoint` is refreshed from the
 * new first code unit. When the element carries more than one collation
 * element, the CE list is mirrored as well, every CE is appended to the
 * expansion table through addExpansion(), and `mapCE` is replaced by an
 * expansion special CE: special flag, expansion tag, the position returned
 * for the first CE (shifted by the padded header size in 32-bit words) in
 * bits 4..23, and the CE count in the low nibble. A count above 0xF is not
 * stored; a 0 is appended to the expansion table instead.
 *
 * Error codes produced by addExpansion() are collected locally and dropped.
 */
void reverseElement(UCAElements *el) {
    int32_t lo = 0, hi = 0;
    UErrorCode status = U_ZERO_ERROR;

    /* mirror the code points */
    for(lo = 0, hi = el->cSize-1; lo < el->cSize/2; lo++, hi--) {
        UChar heldChar = el->cPoints[lo];
        el->cPoints[lo] = el->cPoints[hi];
        el->cPoints[hi] = heldChar;
    }
    el->codepoint = el->cPoints[0];

    if(el->noOfCEs <= 1) {
        return;     /* a single CE is not an expansion, nothing more to do */
    }

    /* mirror the collation elements */
    for(lo = 0, hi = el->noOfCEs-1; lo < el->noOfCEs/2; lo++, hi--) {
        uint32_t heldCE = el->CEs[lo];
        el->CEs[lo] = el->CEs[hi];
        el->CEs[hi] = heldCE;
    }

    /* the first CE goes in first; its position becomes the offset bits */
    uint32_t offsetBits = (uint32_t)(((addExpansion(el->CEs[0], &status)
                                       + (paddedsize(sizeof(UCATableHeader))>>2))<<4)
                                     & 0xFFFFF0);
    uint32_t special = UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) | offsetBits;

    for(lo = 1; lo < el->noOfCEs; lo++) {
        addExpansion(el->CEs[lo], &status);
    }

    if(el->noOfCEs > 0xF) {
        addExpansion(0, &status);       /* too long for the nibble: zero-terminate instead */
    } else {
        special |= el->noOfCEs;
    }
    el->mapCE = special;
}
/*
 * Writes the assembled UCA table to an ICU data file.
 *
 * Parameters:
 *   data   - table header followed by its payload; data->size bytes starting
 *            at `data` are written as a single block
 *   status - ICU error code; nothing is done when it already reports failure,
 *            and it receives any error from creating or finishing the file
 *
 * The file is created by udata_create() with UCA_DATA_TYPE / UCA_DATA_NAME
 * and the `dataInfo` header. NULL is passed as the directory, so the location
 * is whatever udata_create() chooses for that case.
 */
void writeOutData(UCATableHeader *data, UErrorCode *status) {
    UNewDataMemory *pData;

    if(U_FAILURE(*status)) {
        return;
    }

    pData=udata_create(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, &dataInfo,
                       U_COPYRIGHT_STRING, status);
    if(U_FAILURE(*status)) {
        fprintf(stderr, "Error: unable to create data memory, error %d\n", *status);
        return;
    }

    /* write the data to the file */
    fprintf(stdout, "Writing out table\n");
    udata_writeBlock(pData, data, data->size);

    /* finish up; the returned length is not needed here */
    udata_finish(pData, status);
    if(U_FAILURE(*status)) {
        fprintf(stderr, "Error: error %d writing the output file\n", *status);
    }
}
int main(int argc, char* argv[]) {
FILE *data = fopen("FractionalUCA.txt", "r");
//FILE *data = fopen("uca30codepointsort.txt", "r");
int32_t i = 0, j = 0, k = 0, line = 0, thai = 0;
int32_t sizesPrim[35], sizesSec[35], sizesTer[35];
int32_t terValue[0xffff], secValue[0xffff];
int32_t sizeBreakDown[35][35][35];
UErrorCode status = U_ZERO_ERROR;
UCAElements *element = NULL;
UChar variableTopValue = 0;
UBool foundVariableTop = FALSE;
if(data == NULL) {
fprintf(stderr, "Couldn't open file\n");
return -1;
}
memset(secValue, 0, 0xffff*sizeof(int32_t));
memset(terValue, 0, 0xffff*sizeof(int32_t));
memset(sizesPrim, 0, 35*sizeof(int32_t));
memset(sizesSec, 0, 35*sizeof(int32_t));
memset(sizesTer, 0, 35*sizeof(int32_t));
memset(sizeBreakDown, 0, 35*35*35*sizeof(int32_t));
memset(&expansions, 0, sizeof(expansions));
memset(&contractions, 0, sizeof(contractions));
memset(inverseTable, 0, sizeof(int32_t)*3*0xFFFF);
mapping = ucmp32_open(UCOL_UNMAPPED);
contractions = uprv_cnttab_open(mapping, &status);
ucmp32_setRange(mapping, 0, 0xFFFF, UCOL_NOT_FOUND);
/*
elements = uhash_open(uhash_hashLong, uhash_compareLong, &status);
uhash_setValueDeleter(elements, deleteElement);
*/
if(mapping == NULL) {
return(-1);
}
while(!feof(data)) {
if(U_FAILURE(status)) {
fprintf(stderr, "Something returned an error %i while processing line: %i\nExiting...", status, line);
exit(status);
}
element = readAnElement(data, &status);
line++;
if(element != NULL) {
/* this does statistics on CE lengths, but is currently broken */
/*
for( i = 0; i<element->noOfCEs; i++) {
sizesPrim[element->sizePrim[i]]++;
sizesSec[element->sizeSec[i]]++;
sizesTer[element->sizeTer[i]]++;
sizeBreakDown[element->sizePrim[i]][element->sizeSec[i]][element->sizeTer[i]]++;
if(element->sizePrim[i] == 2 && element->sizeSec[i]==2) {
terValue[strtoul(element->tertiary[i], 0, 16)]++;
secValue[strtoul(element->secondary[i], 0, 16)]++;
}
}
*/
// we have read the line, now do something sensible with the read data!
if(element->variableTop == TRUE) {
foundVariableTop = TRUE;
continue;
}
if(variableTopValue == 0 && foundVariableTop == TRUE) {
variableTopValue = element->cPoints[0];
foundVariableTop = FALSE;
}
/* we're first adding to inverse, because addAnElement will reverse the order */
/* of code points and stuff... we don't want that to happen */
uint32_t invResult = addToInverse(element, &status);
uint32_t result = addAnElement(element, &status);
//deleteElement(element);
}
}
fprintf(stderr, "Lines read: %i\n", line);
/*
for(i = 0; i<35; i++) {
fprintf(stderr, "size %i: P:%i S:%i T:%i\n", i, sizesPrim[i], sizesSec[i], sizesTer[i]);
}
for(i = 0; i<35; i++) {
UBool printedPrimary = FALSE;
for(j = 0; j<35; j++) {
for(k = 0; k<35; k++) {
if(sizeBreakDown[i][j][k] != 0) {
if(!printedPrimary) {
fprintf(stderr, "Primary: %i\n", i);
printedPrimary = TRUE;
}
fprintf(stderr, "Sec: %i, Ter: %i = %i\n", j, k, sizeBreakDown[i][j][k]);
}
}
}
}
for(i = 0; i<(uint32_t)0xffff; i++) {
if(terValue[i] != 0) {
fprintf(stderr, "Tertiaries with value %04X : %i\n", i, terValue[i]);
}
if(secValue[i] != 0) {
fprintf(stderr, "Secondaries with value %04X : %i\n", i, secValue[i]);
}
}
*/
/* test */
UCATableHeader *myData = assembleTable(variableTopValue, &status);
writeOutData(myData, &status);
InverseTableHeader *inverse = assembleInverseTable(&status);
/*
uint32_t *itab = (uint32_t *)((uint8_t *)inverse + inverse->table);
UChar *conts = (UChar *)((uint8_t *)inverse + inverse->conts);
for(i = 0; i<inverse->tableSize; i++) {
fprintf(stderr, "[%04X] 0x%08X 0x%08X 0x%08X\n", i, *(itab+3*i), *(itab+3*i+1), *(itab+3*i+2));
if((*(itab+3*i+2) & UCOL_INV_SIZEMASK) != 0) {
uint32_t contIndex = *(itab+3*i+2) & UCOL_INV_OFFSETMASK;
uint32_t contSize = (*(itab+3*i+2) & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE;
fprintf(stderr, "\t");
for(j = 0; j<contSize; j++) {
if(*(conts+contIndex+j) < 0xFFFE) {
fprintf(stderr, "%04X ", *(conts+contIndex+j));
} else {
fprintf(stderr, "\n\t");
}
}
fprintf(stderr, "\n");
}
}
*/
uprv_cnttab_close(contractions);
ucmp32_close(mapping);
//printOutTable(myData, &status);
//uhash_close(elements);
ucmp32_close(myData->mapping);
free(myData);
return 0;
}

View File

@ -0,0 +1,100 @@
#ifndef UCADATA_H
#define UCADATA_H
#include <stdio.h>
#include <string.h>
#include "unicode/utypes.h"
#include "unicode/unicode.h"
#include "ucolimp.h"
#include "ucmp32.h"
#include "compitr.h"
#include "uhash.h"
#include "umemstrm.h"
#include "unewdata.h"
#ifdef WIN32
#include <direct.h>
#else
#include <unistd.h>
#endif
/*
 * Rounds a non-negative size up to the next multiple of 4; a value that is
 * already a multiple of 4 is returned unchanged. The argument is expanded
 * several times, so it must not have side effects.
 */
#define paddedsize(something) \
    ((something) + ((((something) % 4) == 0) ? 0 : (4 - (something) % 4)))
/* UDataInfo for UCA mapping table */
/* This header is handed to udata_create() when the UCA table is written. */
/* NOTE(review): the meaning of the unlabeled leading members follows the */
/* UDataInfo declaration in ICU's udata.h - confirm there. */
static const UDataInfo dataInfo={
sizeof(UDataInfo), /* size of this structure */
0,
U_IS_BIG_ENDIAN, /* taken from the build platform */
U_CHARSET_FAMILY, /* taken from the build platform */
sizeof(UChar),
0,
0x55, 0x43, 0x6f, 0x6c, /* dataFormat="UCol" */
1, 0, 0, 0, /* formatVersion */
3, 0, 0, 0 /* dataVersion = Unicode Version*/
};
/* UDataInfo for inverse UCA table */
/* Same layout as dataInfo; only the dataFormat signature differs. */
/* NOTE(review): the meaning of the unlabeled leading members follows the */
/* UDataInfo declaration in ICU's udata.h - confirm there. */
static const UDataInfo invDataInfo={
sizeof(UDataInfo), /* size of this structure */
0,
U_IS_BIG_ENDIAN, /* taken from the build platform */
U_CHARSET_FAMILY, /* taken from the build platform */
sizeof(UChar),
0,
0x49, 0x6E, 0x76, 0x43, /* dataFormat="InvC" */
1, 0, 0, 0, /* formatVersion */
3, 0, 0, 0 /* dataVersion = Unicode Version*/
};
/*
 * One parsed line of the UCA data file: a code point sequence together with
 * the collation elements (CEs) built for it. Filled in by readAnElement()
 * and modified in place by reverseElement().
 */
typedef struct {
UChar codepoint; /* first code unit of the sequence, kept equal to cPoints[0] */
UChar uchars[128]; /* storage for the code point sequence */
UChar *cPoints; /* the sequence itself; readAnElement() points it at uchars */
int32_t cSize; /* Number of characters in sequence - for contraction */
int32_t noOfCEs; /* Number of collation elements */
uint32_t CEs[128]; /* These are collation elements - there could be more than one - in case of expansion */
uint32_t mapCE; /* This is the value element maps in original table */
int32_t sizePrim[128]; /* per bracketed CE group: value readElement() returned for the primary weight */
int32_t sizeSec[128]; /* per bracketed CE group: value readElement() returned for the secondary weight */
int32_t sizeTer[128]; /* per bracketed CE group: value readElement() returned for the tertiary weight */
UBool variableTop; /* TRUE when the line started with '[' (marker line, no data parsed) */
UBool caseBit; /* FALSE when the second field is tagged 'S', TRUE when tagged 'L' */
UBool isThai; /* result of UCOL_ISTHAIPREVOWEL(codepoint) */
} UCAElements;
/*
 * Growing list of expansion CEs. When the table is assembled, the first
 * `position` entries of `CEs` are copied into the output image.
 */
typedef struct {
uint32_t *CEs; /* the stored collation elements */
int32_t position; /* number of entries in use */
int32_t size; /* presumably the allocated capacity of CEs - TODO confirm against addExpansion() */
} ExpansionTable;
/*
 * Contraction data with parallel code point and CE arrays.
 * NOTE(review): no code visible next to this header uses the struct by name,
 * so the member notes below are read off the names only - confirm against
 * the contraction table implementation.
 */
struct ContractionTable;
struct ContractionTable {
UChar *codePoints; /* presumably the code units of the contraction entries */
uint32_t *CEs; /* presumably the CE matching each entry of codePoints */
int32_t position; /* presumably the number of entries in use */
int32_t size; /* presumably the allocated capacity */
int32_t backSize;
UBool forward;
ContractionTable *reversed; /* uses the bare struct name as a type, so this header needs a C++ compiler */
};
/* Disposes of an element. NOTE(review): its only visible uses are commented out - confirm it is still needed. */
void deleteElement(void *element);
/* Reads one weight field from *from into `to`, up to `separator`. readAnElement() stores the */
/* returned value as the size of that weight; presumably *from is advanced past the separator. */
int32_t readElement(char **from, char *to, char separator, UErrorCode *status);
/* Appends one CE to the expansion table. reverseElement() uses the returned value as the CE's position. */
int32_t addExpansion(uint32_t value, UErrorCode *status);
/* Builds a single CE from the textual primary, secondary and tertiary weights and the case bit. */
uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UBool caseBit, UErrorCode *status);
uint32_t processContraction(UCAElements *element, uint32_t existingCE, UBool forward, UErrorCode *status);
/* Debug aid: prints the mapping of an assembled table to stdout. */
void printOutTable(UCATableHeader *myData, UErrorCode *status);
/* Lays out the final table image; returns 0 on failure. */
UCATableHeader *assembleTable(UChar variableTopValue, UErrorCode *status);
/* Currently an empty placeholder. */
void processFile(FILE *data, UErrorCode *status);
/* This adds a read element, while testing for existence */
uint32_t addAnElement(UCAElements *element, UErrorCode *status);
/* Parses the next line of the data file; returns NULL for skipped lines, end of file and errors. */
UCAElements *readAnElement(FILE *data, UErrorCode *status);
/* Reverses the code points (and, for expansions, the CEs) of an element in place. */
void reverseElement(UCAElements *el);
#endif

View File

@ -0,0 +1,186 @@
#include "tblprint.h"
char *formatElementString(uint32_t CE, char *buffer) {
char temp[1024];
UBool firstPrim = FALSE;
sprintf(buffer, "[");
if(UCOL_PRIMARYORDER(CE)>>8 != 0x02) {
sprintf(temp, "%02X ", UCOL_PRIMARYORDER(CE)>>8);
strcat(buffer, temp);
firstPrim = TRUE;
}
if((UCOL_PRIMARYORDER(CE)&0xFF) != 0x02 || firstPrim == TRUE) {
sprintf(temp, "%02X", UCOL_PRIMARYORDER(CE)&0xFF);
strcat(buffer, temp);
}
firstPrim = FALSE;
strcat(buffer, ",");
if(UCOL_SECONDARYORDER(CE) != 0x02) {
sprintf(temp, " %02X", UCOL_SECONDARYORDER(CE));
strcat(buffer, temp);
}
strcat(buffer, ",");
if((UCOL_TERTIARYORDER(CE)&0x7F) != 0x02) {
sprintf(temp, " %02X", UCOL_TERTIARYORDER(CE)&0x7F);
strcat(buffer, temp);
}
strcat(buffer, "]");
return buffer;
}
/*
 * Appends the non-zero primary bytes and the non-zero secondary weight of
 * `ce`, each as two hex digits followed by a space, to `primb` and `secb`.
 */
static void appendPrimaryAndSecondary(uint32_t ce, char *primb, char *secb) {
    if(UCOL_PRIMARYORDER(ce) > 0xFF) {
        sprintf(primb+strlen(primb), "%02X ", UCOL_PRIMARYORDER(ce)>>8);
    }
    if(UCOL_PRIMARYORDER(ce) != 0) {
        sprintf(primb+strlen(primb), "%02X ", UCOL_PRIMARYORDER(ce)&0xFF);
    }
    if(UCOL_SECONDARYORDER(ce) != 0) {
        sprintf(secb+strlen(secb), "%02X ", UCOL_SECONDARYORDER(ce));
    }
}

/*
 * Prints one step of an expansion while walking its CE list.
 *
 * Parameters:
 *   CE          - the CE that follows `oldCE` in the list; it only selects
 *                 the branch (below UCOL_NOT_FOUND or not)
 *   oldCE       - the CE whose weights are printed or accumulated
 *   primb, secb, terb - NUL-terminated accumulators for the weights of a
 *                 CE that is spread over several entries
 *   printedCont - in/out flag: TRUE while such a run is being accumulated
 *
 * When CE is below UCOL_NOT_FOUND, oldCE completes an item: it is printed
 * directly, or (if a run is pending) its weights are added to the
 * accumulators, which are then printed and cleared. Otherwise oldCE is only
 * accumulated and the run stays open.
 */
void printExp(uint32_t CE, uint32_t oldCE, char* primb, char* secb, char *terb, UBool *printedCont) {
    char temp[1024];

    if(CE<UCOL_NOT_FOUND) {
        if(*printedCont == FALSE) {
            fprintf(stdout, "%s ", formatElementString(oldCE, temp));
        } else {
            oldCE &= 0x0FFFFFFF;
            appendPrimaryAndSecondary(oldCE, primb, secb);
            if(UCOL_TERTIARYORDER(oldCE) != 0) {
                sprintf(terb+strlen(terb), "%02X ", UCOL_TERTIARYORDER(oldCE));
            }
            fprintf(stdout, "[%s, %s, %s] ", primb, secb, terb);
            *primb = *secb = *terb = 0;
        }
        *printedCont = FALSE;
    } else { /* this is a continuation, process accordingly */
        if(*printedCont == TRUE) {
            oldCE &= 0x0FFFFFFF;
        }
        appendPrimaryAndSecondary(oldCE, primb, secb);
        /* parenthesised on purpose: '!=' binds tighter than '&', so the old */
        /* unparenthesised test only looked at the lowest bit                */
        if((UCOL_TERTIARYORDER(oldCE)&0x7F) != 0) {
            sprintf(terb+strlen(terb), "%02X ", UCOL_TERTIARYORDER(oldCE)&0x7F);
        }
        *printedCont = TRUE;
    }
}
/*
 * Debug aid: prints the mapping of an assembled UCA table to stdout.
 *
 * For every code unit 0..0xFFFF whose mapping is not UCOL_NOT_FOUND the code
 * unit is printed, followed by:
 *   - the CE itself when the value is below UCOL_NOT_FOUND;
 *   - for Thai, contraction and expansion specials, the CEs the special
 *     value points to inside the table image.
 * Surrogate and charset specials are recognised by the data format but are
 * not printed.
 *
 * Parameters:
 *   myData - assembled table; its `mapping` member must be open
 *   status - ICU error code; nothing is printed when it reports failure
 */
void printOutTable(UCATableHeader *myData, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }
    int32_t i = 0, j = 0;
    int32_t CE = 0;
    uint32_t *address = NULL;
    uint8_t size = 0;
    char buffer[1024];

    for(i = 0; i<=0xFFFF; i++) {
        CE = ucmp32_get(myData->mapping, i);
        if(CE == UCOL_NOT_FOUND) {
            continue;
        }
        fprintf(stdout, "%04X; ", i);

        if(CE < UCOL_NOT_FOUND) {
            fprintf(stdout, "%c; %s ", (UCOL_TERTIARYORDER(CE)&0x80)>>7?'L':'S', formatElementString(CE, buffer));
            continue;
        }

        int32_t tag = (CE&UCOL_TAG_MASK)>>UCOL_TAG_SHIFT;

        if(tag == THAI_TAG) {
            /* remember the special value: CE is overwritten by the resolved CE below */
            int32_t specialCE = CE;
            uint32_t offset = (CE&0x00FFFFF0)>>4;
            address = ((uint32_t*)myData+offset);
            CE = *(address);
            fprintf(stdout, "%c; %s ", (UCOL_TERTIARYORDER(CE)&0x80)>>7?'L':'S', formatElementString(CE, buffer));
            /* a pointer must be printed with %p; %X expects an unsigned int */
            fprintf(stdout, "THAI - from %08X to %p (offset %05X) ", (unsigned int)specialCE, (void *)address, (unsigned int)offset);
        } else if(tag == CONTRACTION_TAG) {
            int16_t hasBackward = 0;
            char conChars[1024];
            char temp[1024];
            sprintf(conChars, "%04X", i);
            UChar *contractionCP = (UChar *)myData+getContractOffset(CE);
            hasBackward = *(contractionCP); /* skip backward */
            UBool printSeq = FALSE;
            /* the CE array runs parallel to the code point array: same index in both */
            address = (uint32_t *)((uint8_t*)myData+myData->contractionCEs)+(contractionCP - (UChar *)((uint8_t*)myData+myData->contractionIndex));
            while(*contractionCP != 0xFFFF) {
                if(printSeq == TRUE) {
                    fprintf(stdout, "\n%s;",conChars);
                }
                CE = *(address);
                fprintf(stdout, "%c; %s ", (UCOL_TERTIARYORDER(CE)&0x80)>>7?'L':'S', formatElementString(CE, buffer));
                fprintf(stdout, "Contraction ");
                if(hasBackward != 0) {
                    fprintf(stdout, "Back = %i ", hasBackward);
                }
                contractionCP++;
                address++;
                sprintf(temp, " %04X", *contractionCP);
                strcat(conChars, temp);
                printSeq = TRUE;
            }
        } else if(tag == EXPANSION_TAG) {
            char primb[1024], secb[1024], terb[1024], temp[1024];
            UBool printedCont = FALSE;
            uint32_t oldCE;
            *primb = *secb = *terb = *temp = 0;
            size = CE&0xF;  /* CE count; 0 means the list is terminated by a 0 entry */
            address = ((uint32_t*)myData+((CE&0x00FFFFF0)>>4));
            CE = *(address++);
            fprintf(stdout, "%c; ", (UCOL_TERTIARYORDER(CE)&0x80)>>7?'L':'S');
            if(size != 0) {
                for(j = 1; j<size; j++) {
                    oldCE = CE;
                    CE = *(address++);
                    printExp(CE, oldCE, primb, secb, terb, &printedCont);
                }
            } else {
                while(*address != 0) {
                    oldCE = CE;
                    CE = *(address++);
                    printExp(CE, oldCE, primb, secb, terb, &printedCont);
                }
            }
            /* flush the last CE and whatever is still accumulated */
            printExp(CE, CE, primb, secb, terb, &printedCont);
            if(*primb != '\0' || *secb != '\0' || *terb != '\0') {
                fprintf(stdout, "[%s, %s, %s] ", primb, secb, terb);
            }
        }
    }
}

View File

@ -0,0 +1,11 @@
#ifndef TBLPRINT_H
#define TBLPRINT_H
#include "unicode/utypes.h"
#include "ucadata.h"
/* Renders one collation element as "[primary, secondary, tertiary]" hex text into `buffer` and returns `buffer`. */
char *formatElementString(uint32_t CE, char *buffer);
/* Prints or accumulates one step of an expansion; primb/secb/terb collect weights, *printedCont tracks an open run. */
void printExp(uint32_t CE, uint32_t oldCE, char* primb, char* secb, char *terb, UBool *printedCont);
/* Debug aid: prints the mapping of an assembled table to stdout; does nothing when *status reports failure. */
void printOutTable(UCATableHeader *myData, UErrorCode *status);
#endif