5bd3f677d5
X-SVN-Rev: 21927
1730 lines
50 KiB
C++
1730 lines
50 KiB
C++
/*
|
|
*******************************************************************************
|
|
*
|
|
* Copyright (C) 2003, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*
|
|
*******************************************************************************
|
|
*
|
|
* File colprobe.cpp
|
|
*
|
|
* Modification History:
|
|
*
|
|
* Date Name Description
|
|
* 03/18/2003 weiv Creation.
|
|
*******************************************************************************
|
|
*/
|
|
|
|
#include "uoptions.h"
|
|
#include "unicode/ucol.h"
|
|
#include "unicode/ucoleitr.h"
|
|
#include "unicode/ures.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/usetiter.h"
|
|
#include "unicode/ustring.h"
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/uscript.h"
|
|
#include "uprops.h"
|
|
#include "hash.h"
|
|
#include "ucol_imp.h"
|
|
|
|
#include "unicode/ustdio.h"
|
|
#include "unicode/utrans.h"
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <io.h>
|
|
#include <fcntl.h>
|
|
|
|
#include "colprobe.h"
|
|
|
|
|
|
#ifdef WIN32
|
|
#include <windows.h>
|
|
#else
|
|
//
|
|
// Stubs for Windows API functions when building on UNIXes.
|
|
//
|
|
typedef int DWORD;
|
|
inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
|
|
#include <sys/time.h>
|
|
unsigned long timeGetTime() {
|
|
struct timeval t;
|
|
gettimeofday(&t, 0);
|
|
unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares.
|
|
val += t.tv_usec / 1000;
|
|
return val;
|
|
};
|
|
inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
|
|
const int LCMAP_SORTKEY = 0;
|
|
#define MAKELCID(a,b) 0
|
|
const int SORT_DEFAULT = 0;
|
|
#endif
|
|
|
|
#include "line.h"
|
|
|
|
static UBool gVerbose = FALSE;
|
|
static UBool gDebug = FALSE;
|
|
static UBool gQuiet = FALSE;
|
|
static UBool gExemplar = FALSE;
|
|
|
|
DWORD gWinLCID;
|
|
int gCount;
|
|
Line **gICULines;
|
|
UCollator *gCol;
|
|
UCollator *gUCA;
|
|
Line source;
|
|
Line target;
|
|
Line *gSource = &source;
|
|
Line *gTarget = ⌖
|
|
Hashtable gElements(FALSE);
|
|
Hashtable gExpansions(FALSE);
|
|
CompareFn gComparer;
|
|
|
|
const UChar separatorChar = 0x0030;
|
|
|
|
UFILE *out = NULL;
|
|
UFILE *err = NULL;
|
|
UFILE *log = NULL;
|
|
|
|
const char *progName = "colprobe";
|
|
|
|
const char *gLocale = NULL;
|
|
//char platform[256];
|
|
int32_t platformIndex = -1;
|
|
int32_t gPlatformNo = 0;
|
|
int32_t gPlatformIndexes[10];
|
|
int32_t gLocaleNo = 0;
|
|
const char* gLocales[100];
|
|
UBool gRulesStdin = FALSE;
|
|
|
|
enum {
|
|
HELP1,
|
|
HELP2,
|
|
VERBOSE,
|
|
QUIET,
|
|
VERSION,
|
|
ICUDATADIR,
|
|
COPYRIGHT,
|
|
LOCALE,
|
|
PLATFORM,
|
|
DEBUG,
|
|
EXEMPLAR,
|
|
RULESSTDIN
|
|
};
|
|
|
|
UOption options[]={
|
|
/*0*/ UOPTION_HELP_H,
|
|
/*1*/ UOPTION_HELP_QUESTION_MARK,
|
|
/*2*/ UOPTION_VERBOSE,
|
|
/*3*/ UOPTION_QUIET,
|
|
/*4*/ UOPTION_VERSION,
|
|
/*5*/ UOPTION_ICUDATADIR,
|
|
/*6*/ UOPTION_COPYRIGHT,
|
|
/*7*/ UOPTION_DEF("locale", 'l', UOPT_REQUIRES_ARG),
|
|
/*8*/ UOPTION_DEF("platform", 'p', UOPT_REQUIRES_ARG),
|
|
/*9*/ UOPTION_DEF("debug", 'D', UOPT_NO_ARG),
|
|
/*10*/ UOPTION_DEF("exemplar", 'E', UOPT_NO_ARG),
|
|
/*11*/ UOPTION_DEF("rulesstdin", 'R', UOPT_NO_ARG)
|
|
};
|
|
|
|
int Winstrcmp(const void *a, const void *b) {
|
|
gCount++;
|
|
int t;
|
|
t = CompareStringW(gWinLCID, 0,
|
|
(*(Line **)a)->name, (*(Line **)a)->len,
|
|
(*(Line **)b)->name, (*(Line **)b)->len);
|
|
return t-2;
|
|
}
|
|
|
|
int ICUstrcmp(const void *a, const void *b) {
|
|
gCount++;
|
|
UCollationResult t;
|
|
t = ucol_strcoll(gCol,
|
|
(*(Line **)a)->name, (*(Line **)a)->len,
|
|
(*(Line **)b)->name, (*(Line **)b)->len);
|
|
if (t == UCOL_LESS) return -1;
|
|
if (t == UCOL_GREATER) return +1;
|
|
return 0;
|
|
}
|
|
|
|
struct {
|
|
const char* name;
|
|
CompareFn comparer;
|
|
} platforms[] = {
|
|
{ "icu", ICUstrcmp },
|
|
{ "win", Winstrcmp}
|
|
};
|
|
|
|
|
|
void deleteLineElement(void *line) {
|
|
delete((Line *)line);
|
|
}
|
|
|
|
void stringToLower(char *string) {
|
|
uint32_t i = 0;
|
|
for(i = 0; i < strlen(string); i++) {
|
|
string[i] = tolower(string[i]);
|
|
}
|
|
}
|
|
|
|
void usage(const char *name) {
|
|
u_fprintf(out, "Usage: %s --locale loc_name --platform platform\n", name);
|
|
}
|
|
|
|
void listKnownPlatforms() {
|
|
int32_t i = 0;
|
|
u_fprintf(err, "Known platforms:\n");
|
|
for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
|
|
u_fprintf(err, "\t%s\n", platforms[i]);
|
|
}
|
|
}
|
|
|
|
void addPlatform(const char *platform) {
|
|
int32_t i;
|
|
//stringToLower(platform);
|
|
int32_t oldPlatformNo = gPlatformNo;
|
|
|
|
for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
|
|
if(strcmp(platform, platforms[i].name) == 0) {
|
|
gPlatformIndexes[gPlatformNo++] = i;
|
|
}
|
|
}
|
|
if(gPlatformNo == oldPlatformNo) {
|
|
u_fprintf(err, "Unknown platform %s\n", platform);
|
|
listKnownPlatforms();
|
|
}
|
|
}
|
|
|
|
void processArgs(int argc, char* argv[], UErrorCode &status)
|
|
{
|
|
int32_t i = 0;
|
|
U_MAIN_INIT_ARGS(argc, argv);
|
|
|
|
argc = u_parseArgs(argc, argv, (int32_t)(sizeof(options)/sizeof(options[0])), options);
|
|
|
|
if(argc < 0) {
|
|
u_fprintf(err, "Unknown option: %s\n", argv[-argc]);
|
|
usage(progName);
|
|
return;
|
|
}
|
|
|
|
if(options[0].doesOccur || options[1].doesOccur) {
|
|
usage(progName);
|
|
return;
|
|
}
|
|
if(options[VERBOSE].doesOccur) {
|
|
gVerbose = TRUE;
|
|
}
|
|
if(options[DEBUG].doesOccur) {
|
|
gDebug = TRUE;
|
|
gVerbose = TRUE;
|
|
}
|
|
if(options[EXEMPLAR].doesOccur) {
|
|
gExemplar = TRUE;
|
|
}
|
|
if(options[QUIET].doesOccur) {
|
|
gQuiet = TRUE;
|
|
}
|
|
/*
|
|
for(i = 8; i < 9; i++) {
|
|
if(!options[i].doesOccur) {
|
|
u_fprintf(err, "Option %s is required!\n", options[i].longName);
|
|
usage(progName);
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
}
|
|
if(options[i].value == NULL) {
|
|
u_fprintf(err, "Option %s needs an argument!\n", options[i].longName);
|
|
usage(progName);
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
}
|
|
}
|
|
*/
|
|
// ASCII based options specified on the command line
|
|
// this is for testing purposes, will allow to load
|
|
// up ICU rules and then poke through them.
|
|
// In that case, we test only ICU and don't need
|
|
// a locale.
|
|
if(options[RULESSTDIN].doesOccur) {
|
|
gRulesStdin = TRUE;
|
|
addPlatform("icu");
|
|
return;
|
|
}
|
|
|
|
if(options[LOCALE].doesOccur) {
|
|
gLocale = options[LOCALE].value;
|
|
} else {
|
|
for(i = 1; i < argc; i++) {
|
|
gLocales[gLocaleNo++] = argv[i];
|
|
}
|
|
}
|
|
if(options[PLATFORM].doesOccur) {
|
|
//strcpy(platform, options[PLATFORM].value);
|
|
//addPlatform("icu");
|
|
addPlatform(options[PLATFORM].value);
|
|
} else { // there is a list of platforms
|
|
u_fprintf(err, "Option %s is required!\n", options[i].longName);
|
|
usage(progName);
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
}
|
|
|
|
//
|
|
// Set up a Windows LCID
|
|
//
|
|
gWinLCID = uloc_getLCID(gLocale);
|
|
/*
|
|
if (gLocale != 0) {
|
|
gWinLCID = MAKELCID(gLocale, SORT_DEFAULT);
|
|
}
|
|
else {
|
|
gWinLCID = uloc_getLCID(gLocale);
|
|
}
|
|
*/
|
|
|
|
}
|
|
|
|
void printRules(const UChar *name, int32_t len, UFILE *file) {
|
|
// very rudimentary pretty rules print
|
|
int32_t i = 0;
|
|
UChar toPrint[16384];
|
|
int32_t toPrintIndex = 0;
|
|
for(i = 0; i < len; i++) {
|
|
if(name[i] == 0x0026) {
|
|
if(toPrintIndex) {
|
|
toPrint[toPrintIndex] = 0;
|
|
u_fprintf(file, "%U\n", toPrint);
|
|
toPrintIndex = 0;
|
|
toPrint[toPrintIndex++] = name[i];
|
|
} else {
|
|
toPrint[toPrintIndex++] = name[i];
|
|
}
|
|
} else {
|
|
toPrint[toPrintIndex++] = name[i];
|
|
}
|
|
}
|
|
if(toPrintIndex) {
|
|
toPrint[toPrintIndex] = 0;
|
|
u_fprintf(file, "%U\n", toPrint);
|
|
toPrintIndex = 0;
|
|
}
|
|
|
|
|
|
}
|
|
|
|
void escapeString(const UChar *name, int32_t len, UFILE *file) {
|
|
u_fprintf(file, "%U", name);
|
|
/*
|
|
int32_t j = 0;
|
|
for(j = 0; j < len; j++) {
|
|
if(name[j] >= 0x20 && name[j] < 0x80) {
|
|
u_fprintf(file, "%c", name[j]);
|
|
} else {
|
|
u_fprintf(file, "\\u%04X", name[j]);
|
|
}
|
|
}
|
|
*/
|
|
}
|
|
void escapeALine(Line *line, UFILE *file) {
|
|
escapeString(line->name, line->len, file);
|
|
}
|
|
|
|
void escapeExpansion(Line *line, UFILE *file) {
|
|
escapeString(line->expansionString, line->expLen, file);
|
|
}
|
|
|
|
void showNames(Line *line, UFILE *file) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
int32_t j = 0;
|
|
char charName[256];
|
|
for(j = 0; j < line->len; j++) {
|
|
u_charName(line->name[j], U_EXTENDED_CHAR_NAME, charName, 256, &status);
|
|
u_fprintf(file, "%s ", charName);
|
|
}
|
|
}
|
|
|
|
void setArray(Line **array, Line *contents, int32_t size) {
|
|
int32_t i = 0;
|
|
for(i = 0; i < size; i++) {
|
|
array[i] = contents+i;
|
|
}
|
|
}
|
|
|
|
// set an array from a Hashtable
|
|
int32_t
|
|
setArray(Line **array, Hashtable *table = &gElements) {
|
|
int32_t size = table->count();
|
|
int32_t hashIndex = -1;
|
|
const UHashElement *hashElement = NULL;
|
|
int32_t count = 0;
|
|
while((hashElement = table->nextElement(hashIndex)) != NULL) {
|
|
array[count++] = (Line *)hashElement->value.pointer;
|
|
}
|
|
return size;
|
|
}
|
|
|
|
UBool trySwamped(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) {
|
|
u_strcpy(gSource->name, (*smaller)->name);
|
|
gSource->name[(*smaller)->len] = separatorChar;
|
|
gSource->name[(*smaller)->len+1] = chars[0];
|
|
gSource->name[(*smaller)->len+2] = 0;
|
|
gSource->len = (*smaller)->len+2;
|
|
|
|
u_strcpy(gTarget->name, (*greater)->name);
|
|
gTarget->name[(*greater)->len] = separatorChar;
|
|
gTarget->name[(*greater)->len+1] = chars[1];
|
|
gTarget->name[(*greater)->len+2] = 0;
|
|
gTarget->len = (*greater)->len+2;
|
|
|
|
if(comparer(&gSource, &gTarget) > 0) {
|
|
return TRUE;
|
|
} else {
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
UBool trySwamps(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) {
|
|
gSource->name[0] = chars[0];
|
|
gSource->name[1] = separatorChar;
|
|
u_strcpy(gSource->name+2, (*smaller)->name);
|
|
gSource->len = (*smaller)->len+2;
|
|
|
|
gTarget->name[0] = chars[1];
|
|
gTarget->name[1] = separatorChar;
|
|
u_strcpy(gTarget->name+2, (*greater)->name);
|
|
gTarget->len = (*greater)->len+2;
|
|
|
|
if(comparer(&gSource, &gTarget) < 0) {
|
|
return TRUE;
|
|
} else {
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
UColAttributeValue
|
|
probeStrength(Line** prevLine, Line **currLine, CompareFn comparer) {
|
|
// Primary swamps secondary
|
|
// have pairs where [0] 2> [1]
|
|
UChar primSwamps[][2] = {
|
|
{ 0x00E0, 0x0061 },
|
|
{ 0x0450, 0x0435 },
|
|
{ 0x31a3, 0x310d }
|
|
};
|
|
// Secondary swamps tertiary
|
|
// have pairs where [0] 3> [1]
|
|
UChar secSwamps[][2] = {
|
|
{ 0x0053, 0x0073 },
|
|
{ 0x0415, 0x0435 },
|
|
{ 0x31b6, 0x310e }
|
|
};
|
|
// Secondary is swamped by primary
|
|
// have pairs where [0] 1> [1]
|
|
UChar secSwamped[][2] = {
|
|
{ 0x0062, 0x0061 },
|
|
{ 0x0436, 0x0454 },
|
|
{ 0x310e, 0x310d }
|
|
};
|
|
// Tertiary is swamped by secondary
|
|
// have pairs where [0] 2> [1]
|
|
UChar terSwamped[][2] = {
|
|
{ 0x00E0, 0x0061 },
|
|
{ 0x0450, 0x0435 },
|
|
{ 0x31a3, 0x310d }
|
|
};
|
|
int32_t i = 0;
|
|
// Tertiary swamps equal?
|
|
int result = 0;
|
|
// Choose the pair
|
|
i = 0;
|
|
/*
|
|
if((*prevLine)->name[0] > 0xFF && (*currLine)->name[0] > 0xFF) {
|
|
i = 0;
|
|
} else if((*prevLine)->name[0] < 0x0400 && (*currLine)->name[0] < 0x0400) {
|
|
i = 1;
|
|
} else {
|
|
i = 2;
|
|
}
|
|
*/
|
|
// are they equal?
|
|
if((result = comparer(prevLine, currLine)) == 0) {
|
|
return UCOL_IDENTICAL;
|
|
} else if(result > 0) {
|
|
//fprintf(stderr, "lines should be ordered!");
|
|
return UCOL_OFF;
|
|
} else if(trySwamps(prevLine, currLine, primSwamps[i], comparer)) {
|
|
return UCOL_PRIMARY;
|
|
} else if(trySwamps(prevLine, currLine, secSwamps[i], comparer)) {
|
|
return UCOL_SECONDARY;
|
|
} else if(trySwamped(prevLine, currLine, terSwamped[i], comparer)) {
|
|
// is there a tertiary difference
|
|
return UCOL_TERTIARY;
|
|
} else {
|
|
//fprintf(stderr, "Unknown strength!\n");
|
|
return UCOL_ON;
|
|
}
|
|
}
|
|
|
|
// This function tries to probe the set of lines
|
|
// (already sorted by qsort) and deduct the strengths
|
|
void
|
|
analyzeStrength(Line **lines, int32_t size, CompareFn comparer) {
|
|
int32_t i = 0;
|
|
|
|
for(i = 1; i < size; i++) {
|
|
Line **prevLine = lines+i-1;
|
|
Line **currLine = lines+i;
|
|
(*currLine)->strength = probeStrength(prevLine, currLine, comparer);
|
|
(*currLine)->sortedIndex = i;
|
|
(*currLine)->previous = *prevLine;
|
|
(*prevLine)->next = *currLine;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
void printStrength(UColAttributeValue strength, UFILE *file) {
|
|
u_fprintf(file, " ");
|
|
switch(strength) {
|
|
case UCOL_IDENTICAL:
|
|
u_fprintf(file, "=");
|
|
break;
|
|
case UCOL_TERTIARY:
|
|
//u_fprintf(file, "<3");
|
|
u_fprintf(file, "<<<");
|
|
break;
|
|
case UCOL_SECONDARY:
|
|
//u_fprintf(file, "<2");
|
|
u_fprintf(file, "<<");
|
|
break;
|
|
case UCOL_PRIMARY:
|
|
//u_fprintf(file, "<1");
|
|
u_fprintf(file, "<");
|
|
break;
|
|
case UCOL_OFF:
|
|
u_fprintf(file, ">?");
|
|
default:
|
|
u_fprintf(file, "?!");
|
|
break;
|
|
}
|
|
u_fprintf(file, " ");
|
|
}
|
|
|
|
void printStrength(Line *line, UFILE *file) {
|
|
printStrength(line->strength, file);
|
|
}
|
|
|
|
void printLine(Line *line, UFILE *file) {
|
|
escapeALine(line, file);
|
|
if(line->isExpansion) {
|
|
u_fprintf(file, "/");
|
|
escapeExpansion(line, file);
|
|
}
|
|
}
|
|
|
|
void printOrdering(Line **lines, int32_t size, UFILE *file, UBool useLinks = FALSE) {
|
|
int32_t i = 0;
|
|
|
|
//printLine(*lines);
|
|
//escapeALine(*lines); // Print first line
|
|
|
|
Line *line = NULL;
|
|
Line *previous = *lines;
|
|
if(previous->isReset) {
|
|
u_fprintf(file, "\n& ");
|
|
escapeALine(previous, file);
|
|
} else if(!previous->isRemoved) {
|
|
printLine(previous, file);
|
|
}
|
|
i = 1;
|
|
while(i < size && previous->next) {
|
|
if(useLinks) {
|
|
line = previous->next;
|
|
} else {
|
|
line = *(lines+i);
|
|
}
|
|
if(line->isReset) {
|
|
u_fprintf(file, "\n& ");
|
|
escapeALine(line, file);
|
|
} else if(!line->isRemoved) {
|
|
if(file == out) {
|
|
u_fprintf(file, "\n");
|
|
}
|
|
if(i > 0) {
|
|
printStrength(line, file);
|
|
}
|
|
printLine(line, file);
|
|
//escapeALine(line, file);
|
|
}
|
|
previous = line;
|
|
i++;
|
|
}
|
|
u_fprintf(file, "\n");
|
|
}
|
|
|
|
|
|
void setIndexes(Line **lines, int32_t size) {
|
|
int32_t i = 0;
|
|
(*lines)->sortedIndex = 0;
|
|
for(i = 1; i < size; i++) {
|
|
Line *line = *(lines+i);
|
|
Line *prev = *(lines+i-1);
|
|
line->previous = prev;
|
|
prev->next = line;
|
|
line->sortedIndex = i;
|
|
}
|
|
}
|
|
|
|
|
|
// this seems to be a dead end
|
|
void
|
|
noteExpansion(Line **gLines, Line *line, int32_t size, CompareFn comparer) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
UnicodeString key(line->name, line->len);
|
|
//Line *toInsert = (Line *)gElements.get(key);
|
|
Line *toInsert = (Line *)gExpansions.get(key);
|
|
if(toInsert != NULL) {
|
|
toInsert->isExpansion = TRUE;
|
|
u_strcpy(toInsert->expansionString, line->expansionString);
|
|
toInsert->expLen = line->expLen;
|
|
toInsert->previous->next = toInsert->next;
|
|
toInsert->next->previous = toInsert->previous;
|
|
gElements.remove(key);
|
|
} else {
|
|
toInsert = new Line(*line);
|
|
toInsert->isExpansion = TRUE;
|
|
gElements.put(UnicodeString(toInsert->name, toInsert->len), toInsert, status);
|
|
}
|
|
|
|
int32_t i = 0;
|
|
Line testLine;
|
|
Line *l = &testLine;
|
|
for(i = 0; i < size; i++) {
|
|
u_strcpy(testLine.name, (*(gLines+i))->name);
|
|
u_strcat(testLine.name, line->expansionString);
|
|
testLine.len = (*(gLines+i))->len + line->expLen;
|
|
if(comparer(&l, &line) > 0) {
|
|
toInsert->previous = *(gLines+i-1);
|
|
toInsert->next = *(gLines+i);
|
|
toInsert->previous->next = toInsert;
|
|
toInsert->next->previous = toInsert;
|
|
break;
|
|
}
|
|
}
|
|
if(gVerbose) {
|
|
u_fprintf(log, "Adding expansion\n");
|
|
escapeALine(line, log);
|
|
u_fprintf(log, "/");
|
|
escapeExpansion(line, log);
|
|
u_fprintf(log, " ");
|
|
}
|
|
}
|
|
|
|
void
|
|
positionExpansions(Line **gLines, int32_t size, CompareFn comparer) {
|
|
int result = 0;
|
|
Line *line = NULL;
|
|
Line *toMove = NULL;
|
|
int32_t i = 0, j = 0;
|
|
Line **sortedExpansions = new Line*[gExpansions.count()];
|
|
int32_t sortedExpansionsSize = setArray(sortedExpansions, &gExpansions);
|
|
qsort(sortedExpansions, sortedExpansionsSize, sizeof(Line *), comparer);
|
|
// Make a list of things in the vincinity of expansion candidate
|
|
for(j = 0; j < sortedExpansionsSize; j++) {
|
|
line = *(sortedExpansions+j);
|
|
UnicodeString key(line->name, line->len);
|
|
toMove = (Line *)gElements.get(key);
|
|
int32_t i = 0;
|
|
Line testLine, prevTestLine;
|
|
Line *l = &testLine;
|
|
Line *prevL = &prevTestLine;
|
|
// This can be further optimized, since we now know that we have a
|
|
// sorted list of expansions, so current can start from toMove, since all
|
|
// the elements before it are already smaller. In the beggining it needs to
|
|
// be on gLines, though.
|
|
Line *current = *gLines;
|
|
while(current) {
|
|
if(current == toMove) {
|
|
// we are wading through a sorted list
|
|
// if we found ourselves, it means that we
|
|
// are already in a right place, so no moving
|
|
// is needed, but we need to make sure we have
|
|
// the right strength.
|
|
toMove->strength = probeStrength(&prevL, &toMove, comparer);
|
|
if(0) {
|
|
u_fprintf(log, "Positioned expansion without moving ");
|
|
printLine(toMove, log);
|
|
u_fprintf(log, " new ordering: \n");
|
|
printOrdering(gLines, size, log, TRUE);
|
|
}
|
|
break;
|
|
} else {
|
|
u_strcpy(testLine.name, current->name);
|
|
if(!current->isExpansion) {
|
|
u_strcat(testLine.name, line->expansionString);
|
|
testLine.len = current->len + line->expLen;
|
|
} else {
|
|
testLine.len = current->len;
|
|
}
|
|
if(comparer(&l, &line) > 0) {
|
|
// remove from chain
|
|
if(toMove->next) {
|
|
toMove->next->strength = probeStrength(&(toMove->previous), &(toMove->next), comparer);
|
|
toMove->next->previous = toMove->previous;
|
|
}
|
|
if(toMove->previous) {
|
|
toMove->previous->next = toMove->next;
|
|
}
|
|
|
|
// insert
|
|
toMove->previous = current->previous;
|
|
toMove->next = current;
|
|
|
|
if(current->previous) {
|
|
current->previous->next = toMove;
|
|
}
|
|
current->previous = toMove;
|
|
|
|
toMove->strength = probeStrength(&prevL, &toMove, comparer);
|
|
toMove->next->strength = probeStrength(&toMove, &l, comparer);
|
|
if(0) {
|
|
u_fprintf(log, "Positioned expansion ");
|
|
printLine(toMove, log);
|
|
u_fprintf(log, " new ordering: \n");
|
|
printOrdering(gLines, size, log, TRUE);
|
|
}
|
|
if(toMove->strength == UCOL_IDENTICAL) {
|
|
// check for craziness such as s = ss/s
|
|
// such line would consist of previous (or next) concatenated with the expansion value
|
|
// make a test
|
|
UChar fullString[256];
|
|
u_strcpy(fullString, toMove->previous->name);
|
|
u_strcat(fullString, toMove->expansionString);
|
|
if(u_strcmp(fullString, toMove->name) == 0) {
|
|
toMove->previous->next = toMove->next;
|
|
toMove->next->previous = toMove->previous;
|
|
toMove->isRemoved = TRUE;
|
|
u_fprintf(log, "Removed: ");
|
|
printLine(toMove, log);
|
|
u_fprintf(log, "\n");
|
|
}
|
|
} else if(toMove->next->strength == UCOL_IDENTICAL) {
|
|
UChar fullString[256];
|
|
u_strcpy(fullString, toMove->next->name);
|
|
u_strcat(fullString, toMove->expansionString);
|
|
if(u_strcmp(fullString, toMove->name) == 0) {
|
|
toMove->next->strength = toMove->strength;
|
|
toMove->previous->next = toMove->next;
|
|
toMove->next->previous = toMove->previous;
|
|
toMove->isRemoved = TRUE;
|
|
u_fprintf(log, "Removed because of back: ");
|
|
printLine(toMove, log);
|
|
u_fprintf(log, "\n");
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
prevTestLine = testLine;
|
|
}
|
|
current = current->next;
|
|
}
|
|
}
|
|
delete[] sortedExpansions;
|
|
}
|
|
|
|
|
|
void
|
|
noteExpansion(Line *line) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
UnicodeString key(line->name, line->len);
|
|
Line *el = (Line *)gElements.get(key);
|
|
if(el != NULL) {
|
|
el->isExpansion = TRUE;
|
|
u_strcpy(el->expansionString, line->expansionString);
|
|
el->expLen = line->expLen;
|
|
} else {
|
|
Line *toInsert = new Line(*line);
|
|
toInsert->isExpansion = TRUE;
|
|
gElements.put(UnicodeString(line->name, line->len), toInsert, status);
|
|
}
|
|
|
|
Line *el2 = (Line *)gExpansions.get(key);
|
|
el2->isExpansion = TRUE;
|
|
u_strcpy(el2->expansionString, line->expansionString);
|
|
el2->expLen = line->expLen;
|
|
|
|
if(gDebug) {
|
|
u_fprintf(log, "Adding expansion\n");
|
|
printLine(line, log);
|
|
u_fprintf(log, "\n");
|
|
}
|
|
}
|
|
|
|
void
|
|
noteContraction(Line *line) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
Line *toInsert = new Line(*line);
|
|
toInsert->isContraction = TRUE;
|
|
gElements.put(UnicodeString(line->name, line->len), toInsert, status);
|
|
if(gVerbose) {
|
|
u_fprintf(log, "Adding contraction\n");
|
|
escapeALine(line, log);
|
|
u_fprintf(log, " ");
|
|
}
|
|
}
|
|
|
|
void
|
|
noteElement(Line *line) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
Line *toInsert = new Line(*line);
|
|
gElements.put(UnicodeString(line->name, line->len), toInsert, status);
|
|
if(0) { //if(gDebug)
|
|
escapeALine(line, log);
|
|
u_fprintf(log, " ");
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// This function checks if a combination of characters has changed place with the
|
|
// adjacent elements. If so, these are most probably contractions.
|
|
// However, it still needs to be checked if these contractions are fake - the
|
|
// test is simple - if xy is suspected contraction, if we get that x/y is expansion, then
|
|
// xy is a fake contraction.
|
|
int32_t
|
|
analyzeContractions(Line** lines, int32_t size, CompareFn comparer) {
|
|
int32_t i = 0, j = 0;
|
|
int32_t outOfOrder = 0;
|
|
UColAttributeValue strength = UCOL_OFF;
|
|
UColAttributeValue currStrength = UCOL_OFF;
|
|
Line **prevLine = lines;
|
|
Line **currLine = NULL;
|
|
Line **backupLine = NULL;
|
|
UBool prevIsContraction = FALSE, currIsContraction = FALSE;
|
|
// Problem here is detecting a contraction that is at the very end of the sorted list
|
|
for(i = 1; i < size; i++) {
|
|
currLine = lines+i;
|
|
strength = probeStrength(prevLine, currLine, comparer);
|
|
if(strength == UCOL_OFF || strength != (*currLine)->strength) {
|
|
prevIsContraction = FALSE;
|
|
currIsContraction = FALSE;
|
|
if(!outOfOrder) {
|
|
if(gVerbose) {
|
|
u_fprintf(log, "Possible contractions: ");
|
|
}
|
|
}
|
|
// now we have two elements that are different. The question is,
|
|
// which one of them is the contraction - which one has moved.
|
|
// Could be the previous, but could also be the current.
|
|
|
|
outOfOrder++;
|
|
|
|
// First, lets check whether the previous has jumped back
|
|
j = i+1;
|
|
// skip all the nexts that have smaller strength, they don't have an effect
|
|
while(j < size && (*(lines+j))->strength > (*currLine)->strength) {
|
|
j++;
|
|
}
|
|
// check if there are other elements of same or greater strength
|
|
while(j < size &&
|
|
(strength = probeStrength(prevLine, (backupLine = lines+j), comparer)) == UCOL_OFF) {
|
|
j++;
|
|
// if we skipped more than one, it might be in fact a contraction
|
|
prevIsContraction = TRUE;
|
|
}
|
|
if(prevIsContraction) {
|
|
noteContraction(*prevLine);
|
|
j = i-2;
|
|
// add all the previous elements with smaller strength, since they also
|
|
// will jump over and are contractions
|
|
while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) {
|
|
strength = probeStrength(lines+j, currLine, comparer);
|
|
if(strength == UCOL_OFF) {
|
|
noteContraction(*(lines+j));
|
|
}
|
|
j--;
|
|
}
|
|
}
|
|
|
|
// now we check if the current element is jumping forward,
|
|
// the dance steps are analogous to above.
|
|
j = i - 2;
|
|
while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) {
|
|
j--;
|
|
}
|
|
while(j >= 0 &&
|
|
(strength = probeStrength((backupLine = lines+j), currLine, comparer)) == UCOL_OFF) {
|
|
j--;
|
|
currIsContraction = TRUE;
|
|
}
|
|
if(currIsContraction) {
|
|
if(gVerbose) {
|
|
escapeALine(*currLine, log);
|
|
u_fprintf(log, " ");
|
|
}
|
|
j = i+1;
|
|
while(j < size && (*(lines+j))->strength > (*currLine)->strength) {
|
|
strength = probeStrength(prevLine, lines+j, comparer);
|
|
if(strength == UCOL_OFF) {
|
|
noteContraction(*(lines+j));
|
|
}
|
|
j++;
|
|
}
|
|
}
|
|
|
|
// Not sure about either. List both and then check
|
|
if(!(prevIsContraction || currIsContraction)) {
|
|
noteContraction(*prevLine);
|
|
noteContraction(*currLine);
|
|
}
|
|
}
|
|
prevLine = currLine;
|
|
}
|
|
if(outOfOrder) {
|
|
if(gVerbose) {
|
|
u_fprintf(log, "\n");
|
|
}
|
|
}
|
|
return outOfOrder;
|
|
}
|
|
|
|
int32_t
|
|
detectContractions(Line **gLines, Line *lines, int32_t size, CompareFn comparer) {
|
|
int32_t i = 0, j = 0;
|
|
int32_t noContractions = 0;
|
|
// Create and compare doubles:
|
|
Line *backupLines = new Line[size];
|
|
Line::copyArray(backupLines, lines, size);
|
|
// detect contractions
|
|
|
|
Line **gLinesBackup = NULL; //new Line*[size];
|
|
|
|
for(i = 0; i < size; i++) {
|
|
// preserve index and previous
|
|
Line::copyArray(lines, backupLines, size);
|
|
for(j = 0; j < size; j++) {
|
|
u_strcpy(lines[j].name, backupLines[i].name);
|
|
u_strcat(lines[j].name, backupLines[j].name);
|
|
lines[j].len = backupLines[i].len+backupLines[j].len;
|
|
}
|
|
|
|
if((noContractions += analyzeContractions(gLines, size, comparer)) && gDebug) {
|
|
if(gLinesBackup == NULL) {
|
|
gLinesBackup = new Line*[size];
|
|
}
|
|
// Show the sorted doubles, for debugging
|
|
setArray(gLinesBackup, lines, size);
|
|
qsort(gLinesBackup, size, sizeof(Line *), comparer);
|
|
//setIndexes(gLinesBackup, size);
|
|
analyzeStrength(gLinesBackup, size, comparer);
|
|
printOrdering(gLinesBackup, size, log);
|
|
}
|
|
if(!gQuiet) {
|
|
u_fprintf(log, ".");
|
|
}
|
|
}
|
|
if(!gQuiet) {
|
|
u_fprintf(log, "\n");
|
|
}
|
|
delete[] backupLines;
|
|
if(gLinesBackup) {
|
|
delete[] gLinesBackup;
|
|
}
|
|
return noContractions;
|
|
}
|
|
|
|
// gLines in this function is an array of sorted pointers.
|
|
// Contractions are already included.
|
|
int32_t
|
|
detectExpansions(Line **gLines, int32_t size, CompareFn comparer) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
// detect expansions
|
|
|
|
UColAttributeValue startStrength = UCOL_OFF, endStrength = UCOL_OFF,
|
|
strength = UCOL_OFF, previousStrength = UCOL_OFF;
|
|
Line start, end, src;
|
|
Line *startP = &start, *endP = &end, *srcP = &src;
|
|
Line *current = NULL;
|
|
memset(startP, 0, sizeof(Line));
|
|
memset(endP, 0, sizeof(Line));
|
|
memset(srcP, 0, sizeof(Line));
|
|
int32_t srcLen;
|
|
int32_t i = 0, j = 0, k = 0;
|
|
for(i = 0; i < size; i++) {
|
|
u_strcpy(start.name, (*(gLines+i))->name);
|
|
u_strcpy(end.name, (*(gLines+i))->name);
|
|
srcLen = (*(gLines+i))->len;
|
|
u_strcpy(start.name+srcLen, (*(gLines))->name);
|
|
start.len = srcLen + (*(gLines))->len;
|
|
u_strcpy(end.name+srcLen, (*(gLines+size-1))->name);
|
|
end.len = srcLen + (*(gLines+size-1))->len;
|
|
|
|
for(k = 0; k < size; k++) { // k is index of a thing that is not doubled
|
|
current = *(gLines+k);
|
|
// see if we have moved to front
|
|
// has it moved to the very beggining
|
|
if((startStrength = probeStrength((gLines+k), &startP, comparer)) != UCOL_OFF) {
|
|
continue; // this one is in the front
|
|
}
|
|
// has it moved to the very end?
|
|
if((endStrength = probeStrength(&endP, (gLines+k), comparer)) != UCOL_OFF) {
|
|
continue; // this one is in the back
|
|
}
|
|
// Potential Expansion
|
|
if(gDebug) { //gVerbose
|
|
u_fprintf(log, "Possible expansion: ");
|
|
escapeALine(*(gLines+k), log);
|
|
u_fprintf(log, " ");
|
|
}
|
|
// Now we have to make sure that this is really an expansion
|
|
// First, we have to find it
|
|
u_strcpy(src.name, (*(gLines+i))->name);
|
|
for(j = 0; j < size; j++) {
|
|
u_strcpy(src.name+srcLen, (*(gLines+j))->name);
|
|
src.len = srcLen + (*(gLines+j))->len;
|
|
if((strength = probeStrength(&srcP, (gLines+k), comparer)) == UCOL_OFF) {
|
|
strength = probeStrength((gLines+k), &srcP, comparer);
|
|
// we found it *(gLines+j-1) is the element that is interesting
|
|
// since gLines+j-1 < gLines+k < gLines+j
|
|
if(gDebug) { //gVerbose
|
|
u_fprintf(log, "i = %i, k = %i, j = %i ", i, k, j);
|
|
escapeALine(*(gLines+i), log);
|
|
escapeALine(*(gLines+j-1), log);
|
|
printStrength(previousStrength, log);
|
|
escapeALine(current, log);
|
|
printStrength(strength, log);
|
|
escapeALine(*(gLines+i), log);
|
|
escapeALine(*(gLines+j), log);
|
|
u_fprintf(log, "\n");
|
|
}
|
|
// check whether it is a contraction that is the same as an expansion
|
|
// or a multi character that doesn't do anything
|
|
current->addExpansionHit(i, j);
|
|
current->isExpansion = TRUE;
|
|
current->expIndex = k;
|
|
// cache expansion
|
|
gExpansions.put(UnicodeString(current->name, current->len), current, status); //new Line(*current)
|
|
break;
|
|
}
|
|
previousStrength = strength;
|
|
}
|
|
}
|
|
if(!gQuiet) {
|
|
u_fprintf(log, ".");
|
|
}
|
|
}
|
|
if(!gQuiet) {
|
|
u_fprintf(log, "\n");
|
|
}
|
|
// now we have identified possible expansions. We need to find out how do they expand.
|
|
// Let's iterate over expansions cache - it's easier.
|
|
const UHashElement *el = NULL;
|
|
int32_t hashIndex = -1;
|
|
Line *doubles = new Line[size*10];
|
|
Line **sorter = new Line*[size*10];
|
|
int32_t currSize = 0;
|
|
int32_t newSize = 0;
|
|
Line *prev = NULL;
|
|
Line *next = NULL;
|
|
Line *origin = NULL;
|
|
int result = 0;
|
|
// Make a list of things in the vincinity of expansion candidate
|
|
// in expansionPrefixes and expansionAfter we have stored the
|
|
// prefixes of stuff that caused the detection of an expansion
|
|
// and a position where the expansion was.
|
|
// For example (icu, de__PHONEBOOK), we had:
|
|
// aE <<< \u00E4 < af
|
|
// AD < \u00E4 <<< Ae
|
|
// From that we will construct the following sequence:
|
|
// AD < aE <<< \u00E4/ <<< Ae < af
|
|
// then we will take the vincinity of \u00E4:
|
|
// aE <<< \u00E4/ <<< Ae
|
|
// then we will choose the smallest expansion to be the expansion
|
|
// part: 'e'.
|
|
// if there is equality, we choose the equal part:
|
|
// (win32, de__PHONEBOOK):
|
|
// AD < \u00E4/ = ae <<< aE <<< Ae
|
|
// we choose 'e'.
|
|
|
|
while((el = gExpansions.nextElement(hashIndex)) != NULL) {
|
|
newSize = 0;
|
|
current = (Line *)el->value.pointer;
|
|
currSize = size*current->expansionPrefixesSize;
|
|
if(gDebug) {
|
|
escapeALine(current, log);
|
|
u_fprintf(log, " Number: %i\n", current->expansionPrefixesSize);
|
|
}
|
|
// construct the doubles
|
|
for(i = 0; i < current->expansionPrefixesSize; i++) {
|
|
doubles[newSize].suffix = current->expansionAfter[i]-1;
|
|
doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i]-1));
|
|
doubles[newSize].suffix = current->expansionAfter[i];
|
|
doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i]));
|
|
}
|
|
// add the expansion we're observing
|
|
doubles[newSize++] = *current;
|
|
setArray(sorter, doubles, newSize);
|
|
qsort(sorter, newSize, sizeof(Line*), comparer);
|
|
analyzeStrength(sorter, newSize, comparer);
|
|
if(gDebug) {
|
|
printOrdering(sorter, newSize, log);
|
|
}
|
|
i = 0;
|
|
while(**(sorter+i) != *current) {
|
|
i++;
|
|
}
|
|
// find the two additions
|
|
if((*(sorter+i))->strength == UCOL_IDENTICAL) {
|
|
// if we ae id
|
|
origin = *(gLines+((*(sorter+i-1))->suffix));
|
|
u_strcpy(current->expansionString, origin->name);
|
|
current->expLen = origin->len;
|
|
} else if(i < newSize-1 && (*(sorter+i+1))->strength == UCOL_IDENTICAL) {
|
|
origin = *(gLines+((*(sorter+i+1))->suffix));
|
|
u_strcpy(current->expansionString, origin->name);
|
|
current->expLen = origin->len;
|
|
} else {
|
|
if(i > 0) {
|
|
prev = *(gLines+(*(sorter+i-1))->suffix);
|
|
if(i < newSize-1) {
|
|
next = *(gLines+(*(sorter+i+1))->suffix);
|
|
result = comparer(&prev, &next);
|
|
if(result <= 0) {
|
|
u_strcpy(current->expansionString, prev->name);
|
|
current->expLen = prev->len;
|
|
} else {
|
|
u_strcpy(current->expansionString, next->name);
|
|
current->expLen = next->len;
|
|
}
|
|
}
|
|
}
|
|
if(0) { //if(gDebug)
|
|
u_fprintf(log, "Expansion is: ");
|
|
escapeALine(current, log);
|
|
u_fprintf(log, "/");
|
|
escapeExpansion(current, log);
|
|
u_fprintf(log, "\n");
|
|
}
|
|
}
|
|
noteExpansion(current);
|
|
//noteExpansion(gLines, current, size, comparer);
|
|
if(!gQuiet) {
|
|
u_fprintf(log, ".");
|
|
}
|
|
}
|
|
if(!gQuiet) {
|
|
u_fprintf(log, "\n");
|
|
}
|
|
delete[] doubles;
|
|
delete[] sorter;
|
|
return gExpansions.count();
|
|
}
|
|
|
|
UBool
|
|
isTailored(Line *line, UErrorCode &status) {
|
|
UBool result = FALSE;
|
|
UCollationElements *tailoring = ucol_openElements(gCol, line->name, line->len, &status);
|
|
UCollationElements *uca = ucol_openElements(gUCA, line->name, line->len, &status);
|
|
|
|
int32_t tailElement = UCOL_NULLORDER;
|
|
int32_t ucaElement = UCOL_NULLORDER;
|
|
|
|
do {
|
|
do {
|
|
tailElement = ucol_next(tailoring, &status);
|
|
} while(tailElement == 0);
|
|
do {
|
|
ucaElement = ucol_next(uca, &status);
|
|
} while(ucaElement == 0);
|
|
if(tailElement != ucaElement) {
|
|
result = TRUE;
|
|
break;
|
|
}
|
|
} while (tailElement != UCOL_NULLORDER && ucaElement != UCOL_NULLORDER);
|
|
|
|
ucol_closeElements(tailoring);
|
|
ucol_closeElements(uca);
|
|
return result;
|
|
}
|
|
|
|
void
|
|
reduceUntailored(Line **gLines, int32_t size){
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
Line *current = *(gLines);
|
|
Line *previous = NULL;
|
|
while(current) {
|
|
// if the current line is not tailored according to the UCA
|
|
if(!isTailored(current, status)) {
|
|
// we remove it
|
|
current->isRemoved = TRUE;
|
|
} else {
|
|
// if it's tailored
|
|
if(current->previous && current->previous->isRemoved == TRUE) {
|
|
previous = current->previous;
|
|
while(previous && (previous->strength > current->strength || previous->isExpansion || previous->isContraction) && previous->isRemoved) {
|
|
if(previous->previous && previous->previous->isRemoved) {
|
|
previous = previous->previous;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if(previous) {
|
|
previous->isReset = TRUE;
|
|
} else {
|
|
(*(gLines))->isReset = TRUE;
|
|
}
|
|
}
|
|
}
|
|
current = current->next;
|
|
}
|
|
}
|
|
|
|
void
|
|
constructAndAnalyze(Line **gLines, Line *lines, int32_t size, CompareFn comparer) {
|
|
int32_t i = 0, j = 0, k = 0;
|
|
// setup our compare arrays to point to single set.
|
|
|
|
// For contractions we need a block of data
|
|
setArray(gLines, lines, size);
|
|
//size = setArray(gLines);
|
|
|
|
qsort(gLines, size, sizeof(Line *), comparer);
|
|
|
|
// Establish who is previous according to the sort order
|
|
//setIndexes(gLines, size);
|
|
|
|
analyzeStrength(gLines, size, comparer);
|
|
if(gVerbose) {
|
|
u_fprintf(log, "Ordering:\n");
|
|
printOrdering(gLines, size, log);
|
|
}
|
|
|
|
//showDifferences(exemplarSetSize);
|
|
//dumpData(exemplarSetSize);
|
|
|
|
if(!gQuiet) {
|
|
u_fprintf(log, "Detecting contractions?\n");
|
|
}
|
|
int32_t noContractions = 0;
|
|
noContractions = detectContractions(gLines, lines, size, comparer);
|
|
if(!gQuiet) {
|
|
u_fprintf(log, "Detected %i contractions\n", noContractions);
|
|
}
|
|
|
|
// now we have suspected contractions in the table
|
|
// we have to re-sort the things
|
|
size = setArray(gLines);
|
|
qsort(gLines, size, sizeof(Line *), comparer);
|
|
analyzeStrength(gLines, size, comparer);
|
|
|
|
if(!gQuiet) {
|
|
u_fprintf(log, "Detecting expansions\n");
|
|
}
|
|
int32_t noExpansions = detectExpansions(gLines, size, comparer);
|
|
if(!gQuiet) {
|
|
u_fprintf(log, "Detected %i expansions\n", noExpansions);
|
|
}
|
|
|
|
positionExpansions(gLines, size, comparer);
|
|
|
|
if(gVerbose) {
|
|
u_fprintf(log, "After positioning expansions:\n");
|
|
printOrdering(gLines, size, log, TRUE);
|
|
}
|
|
//reduceUntailored(gLines, size);
|
|
if(!gQuiet) {
|
|
u_fprintf(out, "Final result\n");
|
|
}
|
|
printOrdering(gLines, size, out, TRUE);
|
|
printOrdering(gLines, size, log, TRUE);
|
|
}
|
|
|
|
// Check whether upper case comes before lower case or vice-versa
|
|
int32_t
|
|
checkCaseOrdering(void) {
|
|
UChar stuff[][3] = {
|
|
{ 0x0061, separatorChar, 0x0061}, //"aa",
|
|
{ 0x0061, separatorChar, 0x0041 }, //"a\\u00E0",
|
|
{ 0x0041, separatorChar, 0x0061 }, //"\\u00E0a",
|
|
{ 0x0041, separatorChar, 0x0041 }, //"\\u00E0a",
|
|
//{ 0x00E0, separatorChar, 0x00E0 } //"\\u00E0\\u00E0"
|
|
};
|
|
const int32_t size = sizeof(stuff)/sizeof(stuff[0]);
|
|
|
|
Line **sortedLines = new Line*[size];
|
|
Line lines[size];
|
|
|
|
int32_t i = 0;
|
|
int32_t ordered = 0, reversed = 0;
|
|
|
|
for(i = 0; i < size; i++) {
|
|
lines[i].setName(stuff[i], 3);
|
|
}
|
|
setArray(sortedLines, lines, size);
|
|
qsort(sortedLines, size, sizeof(Line*), gComparer);
|
|
|
|
for(i = 0; i < size; i++) {
|
|
if(*(sortedLines+i) == &lines[i]) {
|
|
ordered++;
|
|
}
|
|
if(*(sortedLines+i) == &lines[size-i-1]) {
|
|
reversed++;
|
|
}
|
|
}
|
|
|
|
delete[] sortedLines;
|
|
if(ordered == size) {
|
|
return 0; // in normal order
|
|
} else if(reversed == size) {
|
|
return 1; // in reversed order
|
|
} else {
|
|
return -1; // unknown order
|
|
}
|
|
}
|
|
|
|
|
|
// Check whether the secondaries are in the straight or reversed order
|
|
int32_t
|
|
checkSecondaryOrdering(void) {
|
|
UChar stuff[][5] = {
|
|
{ 0x0061, separatorChar, 0x0061, separatorChar, 0x00E0 }, //"aa",
|
|
{ 0x0061, separatorChar, 0x00E0, separatorChar, 0x0061 }, //"a\\u00E0",
|
|
{ 0x00E0, separatorChar, 0x0061, separatorChar, 0x0061 }, //"\\u00E0a",
|
|
//{ 0x00E0, separatorChar, 0x00E0 } //"\\u00E0\\u00E0"
|
|
};
|
|
const int32_t size = sizeof(stuff)/sizeof(stuff[0]);
|
|
|
|
Line **sortedLines = new Line*[size];
|
|
Line lines[size];
|
|
|
|
int32_t i = 0;
|
|
int32_t ordered = 0, reversed = 0;
|
|
|
|
for(i = 0; i < size; i++) {
|
|
lines[i].setName(stuff[i], 5);
|
|
}
|
|
setArray(sortedLines, lines, size);
|
|
qsort(sortedLines, size, sizeof(Line*), gComparer);
|
|
|
|
for(i = 0; i < size; i++) {
|
|
if(*(sortedLines+i) == &lines[i]) {
|
|
ordered++;
|
|
}
|
|
if(*(sortedLines+i) == &lines[size-i-1]) {
|
|
reversed++;
|
|
}
|
|
}
|
|
|
|
delete[] sortedLines;
|
|
if(ordered == size) {
|
|
return 0; // in normal order
|
|
} else if(reversed == size) {
|
|
return 1; // in reversed order
|
|
} else {
|
|
return -1; // unknown order
|
|
}
|
|
}
|
|
|
|
// We have to remove ignorable characters from the exemplar set,
|
|
// otherwise, we get messed up results
|
|
void removeIgnorableChars(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) {
|
|
UnicodeSet ignorables, primaryIgnorables;
|
|
UnicodeSetIterator exemplarUSetIter(exemplarUSet);
|
|
exemplarUSetIter.reset();
|
|
Line empty;
|
|
Line *emptyP = ∅
|
|
Line current;
|
|
Line *currLine = ¤t;
|
|
UColAttributeValue strength = UCOL_OFF;
|
|
|
|
|
|
while(exemplarUSetIter.next()) {
|
|
if(exemplarUSetIter.isString()) { // process a string
|
|
u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length());
|
|
currLine->len = exemplarUSetIter.getString().length();
|
|
strength = probeStrength(&emptyP, &currLine, comparer);
|
|
if(strength == UCOL_IDENTICAL) {
|
|
ignorables.add(exemplarUSetIter.getString());
|
|
} else if(strength > UCOL_PRIMARY) {
|
|
primaryIgnorables.add(exemplarUSetIter.getString());
|
|
}
|
|
} else { // process code point
|
|
UBool isError = FALSE;
|
|
UChar32 codePoint = exemplarUSetIter.getCodepoint();
|
|
currLine->len = 0;
|
|
U16_APPEND(currLine->name, currLine->len, 25, codePoint, isError);
|
|
strength = probeStrength(&emptyP, &currLine, comparer);
|
|
if(strength == UCOL_IDENTICAL) {
|
|
ignorables.add(codePoint);
|
|
} else if(strength > UCOL_PRIMARY) {
|
|
primaryIgnorables.add(codePoint);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
exemplarUSet.removeAll(ignorables);
|
|
exemplarUSet.removeAll(primaryIgnorables);
|
|
|
|
UnicodeString removedPattern;
|
|
if(ignorables.size()) {
|
|
u_fprintf(log, "Ignorables:\n");
|
|
ignorables.toPattern(removedPattern, TRUE);
|
|
removedPattern.setCharAt(removedPattern.length(), 0);
|
|
escapeString(removedPattern.getBuffer(), removedPattern.length(), log);
|
|
u_fprintf(log, "\n");
|
|
}
|
|
if(primaryIgnorables.size()) {
|
|
u_fprintf(log, "Primary ignorables:\n");
|
|
primaryIgnorables.toPattern(removedPattern, TRUE);
|
|
removedPattern.setCharAt(removedPattern.length(), 0);
|
|
escapeString(removedPattern.getBuffer(), removedPattern.length(), log);
|
|
u_fprintf(log, "\n");
|
|
}
|
|
|
|
}
|
|
|
|
// TODO: develop logic for choosing boundary characters - right now it is hardcoded
|
|
// It should be a function of used scripts. Also, check whether we need to save
|
|
// used script names
|
|
void addUtilityChars(UnicodeSet &exemplarUSet, UErrorCode &status) {
|
|
|
|
// in order to get nice rules, we need to add some characters to the
|
|
// starting set. These are mostly parts of compatibity composed characters,
|
|
// such as L-middle dot (middle dot is 0x00B7). If we don't add these, we would
|
|
// get a reset at a funky character, such as L-middle dot. This list will probably
|
|
// grow.
|
|
exemplarUSet.add(0x00B7);
|
|
|
|
// these things represent a script before the target script and
|
|
// a script after. More logic should be added so that these characters are
|
|
// chosen automatically
|
|
|
|
exemplarUSet.add(0x0038);
|
|
exemplarUSet.add(0x0039);
|
|
|
|
//exemplarUSet.add(0x0433);
|
|
//exemplarUSet.add(0x0436);
|
|
exemplarUSet.add(0xfa29);
|
|
exemplarUSet.add(0xfa28);
|
|
}
|
|
|
|
void
|
|
getExemplars(const char *locale, UnicodeSet &exemplars, UErrorCode &status) {
|
|
// first we fill out structures with exemplar characters.
|
|
UResourceBundle *res = ures_open(NULL, locale, &status);
|
|
int32_t exemplarLength = 0;
|
|
UnicodeString exemplarString = ures_getUnicodeStringByKey(res, "ExemplarCharacters", &status);
|
|
exemplars.clear();
|
|
exemplars.applyPattern(exemplarString, status);
|
|
ures_close(res);
|
|
}
|
|
|
|
void
|
|
prepareStartingSet(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) {
|
|
int32_t i = 0;
|
|
UnicodeString exemplarString;
|
|
exemplarUSet.toPattern(exemplarString);
|
|
// Produce case closure of exemplar characters
|
|
// Then we want to figure out what is the script of the exemplar characters
|
|
// just pick several and see their script
|
|
const char* usedScriptNames[USCRIPT_CODE_LIMIT];
|
|
int32_t numberOfUsedScripts = 0;
|
|
char scriptSetPattern[256];
|
|
UnicodeString pattern; // for debugging
|
|
UChar32 exChar = -1;
|
|
while(exemplarUSet.size() != 0 && (exChar = exemplarUSet.charAt(0)) != -1) {
|
|
int32_t scriptNo = u_getIntPropertyValue(exChar, UCHAR_SCRIPT);
|
|
usedScriptNames[numberOfUsedScripts] = u_getPropertyValueName(UCHAR_SCRIPT, scriptNo, U_SHORT_PROPERTY_NAME);
|
|
sprintf(scriptSetPattern, "[:%s:]", usedScriptNames[numberOfUsedScripts]);
|
|
numberOfUsedScripts++;
|
|
UnicodeSet scriptSet(UnicodeString(scriptSetPattern, ""), status);
|
|
exemplarUSet.removeAll(scriptSet);
|
|
exemplarUSet.toPattern(pattern, TRUE);
|
|
}
|
|
exemplarUSet.clear();
|
|
|
|
// always add ASCII
|
|
//exemplarUSet.addAll(UnicodeSet(UnicodeString("[\\u0020-\\u007f]", ""), status));
|
|
exemplarUSet.addAll(UnicodeSet(UnicodeString("[\\u0041-\\u005b]", ""), status));
|
|
if(gExemplar) {
|
|
exemplarUSet.applyPattern(exemplarString, status);
|
|
exemplarUSet.closeOver(USET_CASE);
|
|
if(!gQuiet) {
|
|
u_fprintf(out, "ICU exemplar characters:\n");
|
|
escapeString(exemplarString.getBuffer(), exemplarString.length(), out);
|
|
u_fprintf(out, "\n");
|
|
}
|
|
} else {
|
|
if(!gQuiet) {
|
|
u_fprintf(out, "Using scripts:\n");
|
|
}
|
|
// add interesting scripts
|
|
for(i = 0; i < numberOfUsedScripts; i++) {
|
|
sprintf(scriptSetPattern, "[:%s:]", usedScriptNames[i]);
|
|
exemplarUSet.addAll(UnicodeSet(UnicodeString(scriptSetPattern, ""), status));
|
|
if(!gQuiet) {
|
|
u_fprintf(out, "%s\n", scriptSetPattern);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
removeIgnorableChars(exemplarUSet, comparer, status);
|
|
|
|
addUtilityChars(exemplarUSet, status);
|
|
|
|
/*
|
|
// try to check whether tailored set and exemplar characters match.
|
|
USet *tailored = ucol_getTailoredSet(gCol, &status);
|
|
UBool tailoredContained = exemplarUSet.containsAll(*((UnicodeSet *)tailored));
|
|
if(!tailoredContained) {
|
|
((UnicodeSet *)tailored)->removeAll(exemplarUSet);
|
|
UnicodeString pattern;
|
|
((UnicodeSet *)tailored)->toPattern(pattern, TRUE);
|
|
}
|
|
uset_close(tailored);
|
|
*/
|
|
|
|
//return exemplarUSet;
|
|
}
|
|
|
|
void
|
|
setOutputFile(const char *name, UErrorCode &status) {
|
|
int32_t i = 0;
|
|
char filename[256];
|
|
strcpy(filename, name);
|
|
for(i = 0; i < gPlatformNo; i++) {
|
|
strcat(filename, "_");
|
|
strcat(filename, platforms[gPlatformIndexes[i]].name);
|
|
}
|
|
if(gExemplar) {
|
|
strcat(filename, "_exemplar");
|
|
} else {
|
|
strcat(filename, "_script");
|
|
}
|
|
strcat(filename, ".utf16.txt");
|
|
out = u_fopen(filename, "wb", "en", "utf-16");
|
|
}
|
|
|
|
void
|
|
processCollator(UCollator *col, UErrorCode &status) {
|
|
int32_t i = 0;
|
|
gCol = col;
|
|
UChar ruleString[16384];
|
|
int32_t ruleStringLength = ucol_getRulesEx(gCol, UCOL_TAILORING_ONLY, ruleString, 16384);
|
|
if(!gQuiet) {
|
|
u_fprintf(out, "ICU rules:\n");
|
|
printRules(ruleString, ruleStringLength, out);
|
|
printRules(ruleString, ruleStringLength, log);
|
|
//escapeString(ruleString, ruleStringLength, out);
|
|
u_fprintf(out, "\n");
|
|
}
|
|
const char *locale = ucol_getLocale(gCol, ULOC_REQUESTED_LOCALE, &status);
|
|
UnicodeSet exemplarUSet;
|
|
if(locale) {
|
|
getExemplars(locale, exemplarUSet, status);
|
|
} else {
|
|
exemplarUSet = *((UnicodeSet *)ucol_getTailoredSet(gCol, &status));
|
|
}
|
|
|
|
|
|
for(i = 0; i < gPlatformNo; i++) {
|
|
u_fprintf(out, "\nGenerating order for platform: %s\n", platforms[gPlatformIndexes[i]].name);
|
|
gComparer = platforms[gPlatformIndexes[i]].comparer;
|
|
|
|
prepareStartingSet(exemplarUSet, gComparer, status);
|
|
int32_t itemLen = 0;
|
|
// get the number of all the items from the set (both codepoints and strings)
|
|
int32_t exemplarSetSize = exemplarUSet.size();
|
|
UnicodeSetIterator exemplarUSetIter(exemplarUSet);
|
|
|
|
// allocate ICU lines
|
|
gICULines = new Line*[exemplarSetSize*5];
|
|
int32_t j = 0;
|
|
int32_t linesCount = 0;
|
|
Line *lines = new Line[exemplarSetSize];
|
|
|
|
int32_t reversedSecondary = checkSecondaryOrdering();
|
|
if(reversedSecondary == 0) {
|
|
u_fprintf(out, "Secondaries do not seem to be reversed\n");
|
|
} else if(reversedSecondary == 1) {
|
|
u_fprintf(out, "Secondaries are reversed\n");
|
|
if(gComparer == ICUstrcmp) {
|
|
ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
|
|
}
|
|
} else {
|
|
u_fprintf(out, "Cannot conclude if secondaries are reversed\n");
|
|
}
|
|
|
|
int32_t reversedCase = checkCaseOrdering();
|
|
if(reversedCase == 0) {
|
|
u_fprintf(out, "Case does not seem to be reversed\n");
|
|
} else if(reversedCase == 1) {
|
|
u_fprintf(out, "Case is reversed\n");
|
|
if(gComparer == ICUstrcmp) {
|
|
ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_OFF, &status);
|
|
}
|
|
} else {
|
|
u_fprintf(out, "Cannot conclude if case is reversed\n");
|
|
}
|
|
|
|
exemplarUSetIter.reset();
|
|
gElements.removeAll();
|
|
gExpansions.removeAll();
|
|
linesCount = 0;
|
|
|
|
while(exemplarUSetIter.next()) {
|
|
Line *currLine = lines+linesCount;
|
|
if(exemplarUSetIter.isString()) { // process a string
|
|
u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length());
|
|
currLine->len = exemplarUSetIter.getString().length();
|
|
} else { // process code point
|
|
UBool isError = FALSE;
|
|
currLine->len = 0;
|
|
U16_APPEND(currLine->name, currLine->len, 25, exemplarUSetIter.getCodepoint(), isError);
|
|
}
|
|
currLine->name[currLine->len] = 0; // zero terminate, for our evil ways
|
|
currLine->index = linesCount;
|
|
linesCount++;
|
|
noteElement(currLine);
|
|
}
|
|
constructAndAnalyze(gICULines, lines, exemplarSetSize, gComparer);
|
|
|
|
delete[] lines;
|
|
}
|
|
|
|
|
|
// cleanup globals
|
|
delete[] gICULines;
|
|
u_fflush(out);
|
|
u_fclose(out);
|
|
ucol_close(gCol);
|
|
}
|
|
|
|
void
|
|
processLocale(const char *locale, UErrorCode &status) {
|
|
gWinLCID = uloc_getLCID(locale);
|
|
|
|
UCollator *col = ucol_open(locale, &status);
|
|
|
|
setOutputFile(locale, status);
|
|
|
|
u_fprintf(out, "Locale %s (LCID:%06X)\n", locale, gWinLCID);
|
|
|
|
processCollator(col, status);
|
|
}
|
|
|
|
UBool
|
|
hasCollationElements(const char *locName) {
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
UResourceBundle *ColEl = NULL;
|
|
|
|
UResourceBundle *loc = ures_open(NULL, locName, &status);;
|
|
|
|
if(U_SUCCESS(status)) {
|
|
status = U_ZERO_ERROR;
|
|
ColEl = ures_getByKey(loc, "CollationElements", ColEl, &status);
|
|
if(status == U_ZERO_ERROR) { /* do the test - there are real elements */
|
|
ures_close(ColEl);
|
|
ures_close(loc);
|
|
return TRUE;
|
|
}
|
|
ures_close(ColEl);
|
|
ures_close(loc);
|
|
}
|
|
return FALSE;
|
|
}
|
|
|
|
int
|
|
main(int argc,
|
|
char* argv[])
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
err = u_finit(stderr, "en", "latin-1");
|
|
log = u_finit(stdout, "en", "latin-1");
|
|
|
|
/*
|
|
USet *wsp = uprv_openRuleWhiteSpaceSet(&status);
|
|
uset_add(wsp, 0x0041);
|
|
uset_remove(wsp, 0x0041);
|
|
UnicodeString pat;
|
|
((UnicodeSet *)wsp)->toPattern(pat, TRUE);
|
|
pat.setCharAt(pat.length(), 0);
|
|
escapeString(pat.getBuffer(), pat.length(), log);
|
|
u_fflush(log);
|
|
*/
|
|
|
|
UTransliterator *anyHex = utrans_open("[^\\u000a\\u0020-\\u007f] Any-Hex/Java", UTRANS_FORWARD, NULL, 0, NULL, &status);
|
|
u_fsettransliterator(log, U_WRITE, anyHex, &status);
|
|
|
|
processArgs(argc, argv, status);
|
|
int32_t i = 0;
|
|
|
|
|
|
gElements.setValueDeleter(deleteLineElement);
|
|
|
|
|
|
if(U_FAILURE(status) || gPlatformNo == 0) {
|
|
return -1;
|
|
}
|
|
|
|
gUCA = ucol_open("root", &status);
|
|
|
|
if(gRulesStdin) {
|
|
char buffer[1024];
|
|
UChar ruleBuffer[16384];
|
|
UChar *rules = ruleBuffer;
|
|
int32_t maxRuleLen = 16384;
|
|
int32_t rLen = 0;
|
|
while(gets(buffer)) {
|
|
if(buffer[0] != '/' && buffer[1] != '/') {
|
|
rLen = u_unescape(buffer, rules, maxRuleLen);
|
|
rules += rLen;
|
|
maxRuleLen -= rLen;
|
|
}
|
|
}
|
|
UParseError parseError;
|
|
//escapeString(ruleBuffer, rules-ruleBuffer, log);//
|
|
u_fprintf(log, "%U\n", ruleBuffer);
|
|
|
|
UCollator *col = ucol_openRules(ruleBuffer, rules-ruleBuffer, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &status);
|
|
if(U_SUCCESS(status)) {
|
|
setOutputFile("stdinRules", status);
|
|
processCollator(col, status);
|
|
} else {
|
|
u_fprintf(err, "Error %s\n", u_errorName(status));
|
|
}
|
|
} else {
|
|
|
|
if(gLocale) {
|
|
processLocale(gLocale, status);
|
|
} else if(gLocaleNo) {
|
|
for(i = 0; i < gLocaleNo; i++) {
|
|
processLocale(gLocales[i], status);
|
|
}
|
|
} else { // do the loop through all the locales
|
|
int32_t noOfLoc = uloc_countAvailable();
|
|
const char *locName = NULL;
|
|
for(i = 0; i<noOfLoc; i++) {
|
|
status = U_ZERO_ERROR;
|
|
locName = uloc_getAvailable(i);
|
|
if(hasCollationElements(locName)) {
|
|
processLocale(locName, status);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
ucol_close(gUCA);
|
|
|
|
u_fflush(log);
|
|
u_fclose(log);
|
|
u_fflush(err);
|
|
u_fclose(err);
|
|
|
|
return 0;
|
|
} |