5bd3f677d5
X-SVN-Rev: 21927
1079 lines
32 KiB
C++
1079 lines
32 KiB
C++
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*
* File colprobe.cpp
*
* Modification History:
*
*   Date        Name        Description
*   03/18/2003  weiv        Creation.
*******************************************************************************
*/
|
|
|
|
#include "uoptions.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ures.h"
#include "unicode/uniset.h"
#include "unicode/uset.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
#include "unicode/locid.h"
#include "unicode/ucnv.h"
#include "uprops.h"
#include "hash.h"
#include "ucol_imp.h"

#include "unicode/ustdio.h"
#include "unicode/utrans.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>

// unix tolower
#include <ctype.h>
// unix setlocale
#include <locale.h>

#include "colprobe.h"

#include "line.h"
#include "sortedlines.h"
#include "strengthprobe.h"
|
|
|
|
void testWin(StrengthProbe &probe, UErrorCode &status) ;
|
|
|
|
#if defined WIN32
|
|
#include <io.h>
|
|
#include <windows.h>
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
#include <direct.h>
|
|
|
|
int createDir(const char* dirName) {
|
|
struct _stat myStat;
|
|
int result = _stat(dirName, &myStat);
|
|
|
|
if(result == -1) {
|
|
result = _mkdir(dirName);
|
|
return result;
|
|
} else if(myStat.st_mode & _S_IFDIR) {
|
|
return 0;
|
|
} else {
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
//#elif defined POSIX
|
|
#else
|
|
#include <sys/stat.h>
|
|
#include <unistd.h>
|
|
|
|
/**
 * Makes sure that a directory named dirName exists (non-Windows build).
 *
 * @param dirName path to look up and, when stat() fails on it, to create
 * @return the result of mkdir() when stat() failed; 0 when the path is
 *         already a directory; 1 when it exists but is not a directory
 */
int createDir(const char* dirName) {
    struct stat info;

    if(stat(dirName, &info) == -1) {
        // Request read/write/execute for user, group and others.
        return mkdir(dirName, S_IRWXU|S_IRWXG|S_IRWXO);
    }
    return S_ISDIR(info.st_mode) ? 0 : 1;
}
|
|
//
|
|
// Stubs for Windows API functions when building on UNIXes.
|
|
//
|
|
typedef int DWORD;
|
|
inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
|
|
//#else
|
|
//#error "Not POSIX or Windows. Won't work."
|
|
#endif
|
|
|
|
#include "line.h"
|
|
|
|
static UBool gVerbose = FALSE;
|
|
static UBool gDebug = FALSE;
|
|
static UBool gQuiet = FALSE;
|
|
static UBool gExemplar = FALSE;
|
|
|
|
DWORD gWinLCID;
|
|
int gCount;
|
|
UCollator *gCol;
|
|
UCollator *gUCA;
|
|
UConverter *utf8cnv;
|
|
CompareFn gComparer;
|
|
int gRefNum;
|
|
UnicodeSet gExcludeSet;
|
|
UnicodeSet gRepertoire;
|
|
|
|
const UChar separatorChar = 0x0030;
|
|
|
|
UPrinter *logger;
|
|
UPrinter *debug;
|
|
UPrinter *tailoringBundle;
|
|
UPrinter *referenceBundle;
|
|
UPrinter *bundle;
|
|
FILE *fTailoringDump;
|
|
FILE *fDefaultDump;
|
|
|
|
const char *progName = "colprobe";
|
|
|
|
const char *gLocale = NULL;
|
|
int32_t platformIndex = -1;
|
|
int32_t gPlatformNo = 0;
|
|
int32_t gPlatformIndexes[10];
|
|
int32_t gLocaleNo = 0;
|
|
const char* gLocales[100];
|
|
UBool gRulesStdin = FALSE;
|
|
const char *outputFormat = "HTML";
|
|
const char *outExtension = "html";
|
|
|
|
// Indexes into options[] below; the order of both lists must stay in sync.
enum {
    HELP1 = 0,      // -h
    HELP2,          // -?
    VERBOSE,
    QUIET,
    VERSION,
    ICUDATADIR,
    COPYRIGHT,
    LOCALE,         // --locale
    PLATFORM,       // --platform
    DEBUG,          // --debug
    EXEMPLAR,       // --exemplar
    RULESSTDIN,     // --rulesstdin
    REFERENCE,      // --ref
    EXCLUDESET,     // --excludeset
    REPERTOIRE,     // --repertoire
    INTERACTIVE,    // --interactive
    PRINTREF,       // --printref
    DIFF,           // --diff
    OUTPUT          // --output
};
|
|
|
|
// Command line option table handed to u_parseArgs(). Each entry sits at the
// position named by the matching enum constant above.
UOption options[]={
    /* HELP1       */ UOPTION_HELP_H,
    /* HELP2       */ UOPTION_HELP_QUESTION_MARK,
    /* VERBOSE     */ UOPTION_VERBOSE,
    /* QUIET       */ UOPTION_QUIET,
    /* VERSION     */ UOPTION_VERSION,
    /* ICUDATADIR  */ UOPTION_ICUDATADIR,
    /* COPYRIGHT   */ UOPTION_COPYRIGHT,
    /* LOCALE      */ UOPTION_DEF("locale", 'l', UOPT_REQUIRES_ARG),
    /* PLATFORM    */ UOPTION_DEF("platform", 'p', UOPT_REQUIRES_ARG),
    /* DEBUG       */ UOPTION_DEF("debug", 'D', UOPT_NO_ARG),
    /* EXEMPLAR    */ UOPTION_DEF("exemplar", 'E', UOPT_NO_ARG),
    /* RULESSTDIN  */ UOPTION_DEF("rulesstdin", 'R', UOPT_NO_ARG),
    /* REFERENCE   */ UOPTION_DEF("ref", 'c', UOPT_REQUIRES_ARG),
    /* EXCLUDESET  */ UOPTION_DEF("excludeset", 'x', UOPT_REQUIRES_ARG),
    /* REPERTOIRE  */ UOPTION_DEF("repertoire", 't', UOPT_REQUIRES_ARG),
    /* INTERACTIVE */ UOPTION_DEF("interactive", 'I', UOPT_NO_ARG),
    /* PRINTREF    */ UOPTION_DEF("printref", 0, UOPT_NO_ARG),
    /* DIFF        */ UOPTION_DEF("diff", 0, UOPT_NO_ARG),
    /* OUTPUT      */ UOPTION_DEF("output", 0, UOPT_REQUIRES_ARG)
};
|
|
|
|
// Scratch space shared by the platform comparison callbacks: the NFC form of
// each operand (UTF-16) and its length ...
UChar   compA[256];
int32_t compALen = 0;
UChar   compB[256];
int32_t compBLen = 0;

// ... and the UTF-8 conversion of those forms, used by the UNIX callbacks.
char    compUTF8A[256];
int32_t compUTF8ALen = 0;
char    compUTF8B[256];
int32_t compUTF8BLen = 0;
|
|
|
|
int UNIXstrcmp(const void *a, const void *b) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
gCount++;
|
|
int t;
|
|
compALen = unorm_normalize((*(Line **)a)->name, (*(Line **)a)->len, UNORM_NFC, 0, compA, 256, &status);
|
|
compBLen = unorm_normalize((*(Line **)b)->name, (*(Line **)b)->len, UNORM_NFC, 0, compB, 256, &status);
|
|
compUTF8ALen = ucnv_fromUChars(utf8cnv, compUTF8A, 256, compA, compALen, &status);
|
|
compUTF8A[compUTF8ALen] = 0;
|
|
compUTF8BLen = ucnv_fromUChars(utf8cnv, compUTF8B, 256, compB, compBLen, &status);
|
|
compUTF8B[compUTF8BLen] = 0;
|
|
t = strcoll(compUTF8A, compUTF8B);
|
|
return t;
|
|
}
|
|
|
|
/**
 * Sort key getter built on the C library's strxfrm(). The string is
 * normalized to NFC, converted to UTF-8 with utf8cnv and transformed under
 * the current LC_COLLATE locale.
 *
 * @param string       UTF-16 text
 * @param len          length of string
 * @param buffer       receives the transformed bytes
 * @param buffCapacity size of buffer in bytes
 * @return the strxfrm() result plus one
 */
int UNIXgetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) {
    UErrorCode status = U_ZERO_ERROR;
    // One byte of the UTF-8 buffer is held back for the terminating NUL.
    const int32_t utf8Capacity = (int32_t)sizeof(compUTF8A) - 1;

    compALen = unorm_normalize(string, len, UNORM_NFC, 0, compA, 256, &status);
    compUTF8ALen = ucnv_fromUChars(utf8cnv, compUTF8A, utf8Capacity, compA, compALen, &status);
    if(compUTF8ALen > utf8Capacity) {
        // On overflow the converter reports the length it would have needed;
        // never index past the end of the buffer with it.
        compUTF8ALen = utf8Capacity;
    }
    compUTF8A[compUTF8ALen] = 0;

    return (strxfrm((char *)buffer, compUTF8A, buffCapacity)+1);
}
|
|
|
|
#ifdef WIN32
|
|
int Winstrcmp(const void *a, const void *b) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
gCount++;
|
|
int t;
|
|
//compALen = unorm_compose(compA, 256, (*(Line **)a)->name, (*(Line **)a)->len, FALSE, 0, &status);
|
|
//compBLen = unorm_compose(compB, 256, (*(Line **)b)->name, (*(Line **)b)->len, FALSE, 0, &status);
|
|
compALen = unorm_normalize((*(Line **)a)->name, (*(Line **)a)->len, UNORM_NFC, 0, compA, 256, &status);
|
|
compBLen = unorm_normalize((*(Line **)b)->name, (*(Line **)b)->len, UNORM_NFC, 0, compB, 256, &status);
|
|
t = CompareStringW(gWinLCID, SORT_STRINGSORT, //0,
|
|
compA, compALen,
|
|
compB, compBLen);
|
|
|
|
/*
|
|
t = CompareStringW(gWinLCID, 0,
|
|
(*(Line **)a)->name, (*(Line **)a)->len,
|
|
(*(Line **)b)->name, (*(Line **)b)->len);
|
|
*/
|
|
return t-2;
|
|
}
|
|
|
|
int WingetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
compALen = unorm_normalize(string, len, UNORM_NFC, 0, compA, 256, &status);
|
|
return LCMapStringW(gWinLCID, LCMAP_SORTKEY | SORT_STRINGSORT, compA, compALen, (unsigned short *)buffer, buffCapacity);
|
|
}
|
|
|
|
#if 0
// Disabled alternative: compares the two lines through their Windows sort
// keys instead of calling CompareStringW() directly.
int Winstrcmp(const void *a, const void *b) {
    UErrorCode status = U_ZERO_ERROR;
    uint8_t keyA[256], keyB[256];
    Line *left  = *(Line **)a;
    Line *right = *(Line **)b;

    int32_t keyALen = WingetSortKey(left->name, left->len, keyA, 256);
    int32_t keyBLen = WingetSortKey(right->name, right->len, keyB, 256);

    keyA[keyALen] = 0;
    keyB[keyBLen] = 0;

    return strcmp((const char *)keyA, (const char *)keyB);
}
#endif
|
|
|
|
#else
|
|
// Placeholder used when not building for Windows: both operands are ignored
// and every pair compares as equal.
int Winstrcmp(const void *a, const void *b) {
    (void)a;
    (void)b;
    return 0;
}
|
|
// Placeholder used when not building for Windows: writes nothing and
// reports a length of 0.
int WingetSortKey(const UChar * /*string*/, int32_t /*len*/, uint8_t * /*buffer*/, int32_t /*buffCapacity*/) {
    return 0;
}
|
|
#endif
|
|
|
|
int ICUstrcmp(const void *a, const void *b) {
|
|
gCount++;
|
|
UCollationResult t;
|
|
t = ucol_strcoll(gCol,
|
|
(*(Line **)a)->name, (*(Line **)a)->len,
|
|
(*(Line **)b)->name, (*(Line **)b)->len);
|
|
if (t == UCOL_LESS) return -1;
|
|
if (t == UCOL_GREATER) return +1;
|
|
return 0;
|
|
}
|
|
|
|
int ICUgetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) {
|
|
return ucol_getSortKey(gCol, string, len, buffer, buffCapacity);
|
|
}
|
|
|
|
struct {
|
|
const char* name;
|
|
CompareFn comparer;
|
|
GetSortKeyFn skgetter;
|
|
} platforms[] = {
|
|
{ "icu", ICUstrcmp, ICUgetSortKey },
|
|
{ "w2k", Winstrcmp, WingetSortKey},
|
|
{ "winxp", Winstrcmp, WingetSortKey},
|
|
{ "aix", UNIXstrcmp, UNIXgetSortKey},
|
|
{ "linux", UNIXstrcmp, UNIXgetSortKey}
|
|
};
|
|
|
|
|
|
/**
 * Lower-cases a NUL-terminated string in place using tolower().
 *
 * @param string text to convert; NULL is accepted and ignored
 */
void stringToLower(char *string) {
    if(string == NULL) {
        return;
    }
    // Walk to the terminator instead of re-measuring the string on every
    // iteration.
    for(char *cursor = string; *cursor != 0; cursor++) {
        // tolower() is only defined for unsigned char values (and EOF), so
        // plain char must not be passed through as a possibly negative int.
        *cursor = (char)tolower((unsigned char)*cursor);
    }
}
|
|
|
|
/**
 * Prints the one-line usage summary through logger.
 *
 * @param name program name to show in the summary
 */
void usage(const char *name) {
    logger->log("Usage: %s --locale loc_name --platform platform\n", name);
}
|
|
|
|
void listKnownPlatforms() {
|
|
uint32_t i = 0;
|
|
logger->log("Known platforms:\n");
|
|
for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
|
|
logger->log("\t%s\n", platforms[i]);
|
|
}
|
|
}
|
|
|
|
void addPlatform(const char *platform) {
|
|
uint32_t i;
|
|
//stringToLower(platform);
|
|
int32_t oldPlatformNo = gPlatformNo;
|
|
|
|
for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
|
|
if(strcmp(platform, platforms[i].name) == 0) {
|
|
gPlatformIndexes[gPlatformNo++] = i;
|
|
}
|
|
}
|
|
if(gPlatformNo == oldPlatformNo) {
|
|
logger->log("Unknown platform %s\n", platform);
|
|
listKnownPlatforms();
|
|
}
|
|
}
|
|
|
|
void processArgs(int argc, char* argv[], UErrorCode &status)
|
|
{
|
|
int32_t i = 0;
|
|
U_MAIN_INIT_ARGS(argc, argv);
|
|
|
|
argc = u_parseArgs(argc, argv, (int32_t)(sizeof(options)/sizeof(options[0])), options);
|
|
|
|
if(argc < 0) {
|
|
logger->log("Unknown option: %s\n", argv[-argc]);
|
|
usage(progName);
|
|
return;
|
|
}
|
|
|
|
if(options[0].doesOccur || options[1].doesOccur) {
|
|
usage(progName);
|
|
return;
|
|
}
|
|
if(options[VERBOSE].doesOccur) {
|
|
gVerbose = TRUE;
|
|
}
|
|
if(options[DEBUG].doesOccur) {
|
|
gDebug = TRUE;
|
|
gVerbose = TRUE;
|
|
}
|
|
if(options[EXEMPLAR].doesOccur) {
|
|
gExemplar = TRUE;
|
|
}
|
|
if(options[QUIET].doesOccur) {
|
|
gQuiet = TRUE;
|
|
}
|
|
|
|
// ASCII based options specified on the command line
|
|
// this is for testing purposes, will allow to load
|
|
// up ICU rules and then poke through them.
|
|
// In that case, we test only ICU and don't need
|
|
// a locale.
|
|
if(options[RULESSTDIN].doesOccur) {
|
|
gRulesStdin = TRUE;
|
|
addPlatform("icu");
|
|
return;
|
|
}
|
|
|
|
if(options[LOCALE].doesOccur) {
|
|
gLocale = options[LOCALE].value;
|
|
} else {
|
|
gLocale = argv[1];
|
|
//for(i = 1; i < argc; i++) {
|
|
//gLocales[gLocaleNo++] = argv[i];
|
|
//}
|
|
}
|
|
|
|
if(options[PLATFORM].doesOccur) {
|
|
addPlatform(options[PLATFORM].value);
|
|
} else { // there is a list of platforms
|
|
addPlatform("icu");
|
|
}
|
|
|
|
if(options[REFERENCE].doesOccur) {
|
|
for(i = 0; i < (int32_t)(sizeof(platforms)/sizeof(platforms[0])); i++) {
|
|
if(strcmp(options[REFERENCE].value, platforms[i].name) == 0) {
|
|
gRefNum = i;
|
|
break;
|
|
}
|
|
}
|
|
if(i == sizeof(platforms)/sizeof(platforms[0])) {
|
|
logger->log("Unknown reference %s!\n", options[REFERENCE].value);
|
|
status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
} else {
|
|
gRefNum = 0;
|
|
}
|
|
|
|
if(options[EXCLUDESET].doesOccur) {
|
|
gExcludeSet.applyPattern(UnicodeString(options[EXCLUDESET].value), status);
|
|
if(U_FAILURE(status)) {
|
|
logger->log("Cannot construct exclude set from argument %s. Error %s\n", options[EXCLUDESET].value, u_errorName(status));
|
|
return;
|
|
} else {
|
|
UnicodeString pattern;
|
|
logger->log(gExcludeSet.toPattern(pattern, TRUE), TRUE);
|
|
}
|
|
}
|
|
|
|
if(options[REPERTOIRE].doesOccur) {
|
|
gRepertoire.applyPattern(UnicodeString(options[REPERTOIRE].value), status);
|
|
if(U_FAILURE(status)) {
|
|
logger->log("Cannot construct repertoire from argument %s. Error %s\n", options[REPERTOIRE].value, u_errorName(status));
|
|
return;
|
|
}
|
|
}
|
|
|
|
if(options[OUTPUT].doesOccur) {
|
|
outputFormat = options[OUTPUT].value;
|
|
if(strcmp(outputFormat, "HTML") == 0) {
|
|
outExtension = "html";
|
|
} else if(strcmp(outputFormat, "XML") == 0) {
|
|
outExtension = "xml";
|
|
} else {
|
|
outExtension = "txt";
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
// Check whether upper case comes before lower case or vice-versa
|
|
int32_t
|
|
checkCaseOrdering(void) {
|
|
UChar stuff[][3] = {
|
|
{ 0x0061, separatorChar, 0x0061}, //"aa",
|
|
{ 0x0061, separatorChar, 0x0041 }, //"a\\u00E0",
|
|
{ 0x0041, separatorChar, 0x0061 }, //"\\u00E0a",
|
|
{ 0x0041, separatorChar, 0x0041 }, //"\\u00E0a",
|
|
//{ 0x00E0, separatorChar, 0x00E0 } //"\\u00E0\\u00E0"
|
|
};
|
|
const int32_t size = sizeof(stuff)/sizeof(stuff[0]);
|
|
|
|
Line **sortedLines = new Line*[size];
|
|
Line lines[size];
|
|
|
|
int32_t i = 0;
|
|
int32_t ordered = 0, reversed = 0;
|
|
|
|
for(i = 0; i < size; i++) {
|
|
lines[i].setName(stuff[i], 3);
|
|
}
|
|
//setArray(sortedLines, lines, size);
|
|
qsort(sortedLines, size, sizeof(Line*), gComparer);
|
|
|
|
for(i = 0; i < size; i++) {
|
|
if(*(sortedLines+i) == &lines[i]) {
|
|
ordered++;
|
|
}
|
|
if(*(sortedLines+i) == &lines[size-i-1]) {
|
|
reversed++;
|
|
}
|
|
}
|
|
|
|
delete[] sortedLines;
|
|
if(ordered == size) {
|
|
return 0; // in normal order
|
|
} else if(reversed == size) {
|
|
return 1; // in reversed order
|
|
} else {
|
|
return -1; // unknown order
|
|
}
|
|
}
|
|
|
|
void
|
|
getExemplars(const char *locale, UnicodeSet &exemplars, UErrorCode &status) {
|
|
// first we fill out structures with exemplar characters.
|
|
UResourceBundle *res = ures_open(NULL, locale, &status);
|
|
UnicodeString exemplarString = ures_getUnicodeStringByKey(res, "ExemplarCharacters", &status);
|
|
exemplars.clear();
|
|
exemplars.applyPattern(exemplarString, status);
|
|
ures_close(res);
|
|
}
|
|
|
|
|
|
void
|
|
getFileNames(const char *name, char *tailoringName, char *tailoringDumpName, char *defaultName, char *defaultDumpName, char *diffName) {
|
|
if(tailoringName) {
|
|
strcpy(tailoringName, platforms[gPlatformIndexes[0]].name);
|
|
strcat(tailoringName, "/");
|
|
strcat(tailoringName, name);
|
|
strcat(tailoringName, "_raw.");
|
|
strcat(tailoringName, outExtension);
|
|
}
|
|
if(tailoringDumpName) {
|
|
strcpy(tailoringDumpName, platforms[gPlatformIndexes[0]].name);
|
|
strcat(tailoringDumpName, "/");
|
|
strcat(tailoringDumpName, name);
|
|
strcat(tailoringDumpName, ".dump");
|
|
}
|
|
|
|
if(diffName) {
|
|
strcpy(diffName, platforms[gPlatformIndexes[0]].name);
|
|
strcat(diffName, "/");
|
|
strcat(diffName, name);
|
|
strcat(diffName, "_collation.");
|
|
strcat(diffName, outExtension);
|
|
}
|
|
|
|
if(defaultName) {
|
|
strcpy(defaultName, platforms[gRefNum].name);
|
|
strcat(defaultName, "/");
|
|
strcat(defaultName, name);
|
|
strcat(defaultName, "_default_raw.");
|
|
strcat(defaultName, outExtension);
|
|
}
|
|
|
|
if(defaultDumpName) {
|
|
strcpy(defaultDumpName, platforms[gRefNum].name);
|
|
strcat(defaultDumpName, "/");
|
|
strcat(defaultDumpName, name);
|
|
strcat(defaultDumpName, "_default.dump");
|
|
}
|
|
}
|
|
|
|
void
|
|
setFiles(const char *name, UErrorCode &status) {
|
|
if(U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
int32_t i = 0;
|
|
char tailoringName[256];
|
|
char tailoringDumpName[256];
|
|
char defaultName[256];
|
|
char defaultDumpName[256];
|
|
char diffName[256];
|
|
|
|
getFileNames(name, tailoringName, tailoringDumpName, defaultName, defaultDumpName, diffName);
|
|
if(options[PLATFORM].doesOccur && !options[DIFF].doesOccur) {
|
|
if(createDir(platforms[gPlatformIndexes[0]].name) == 0) {
|
|
tailoringBundle = new UPrinter(tailoringName, "en", "utf-8", NULL, FALSE);
|
|
fTailoringDump = fopen(tailoringDumpName, "wb");
|
|
} else {
|
|
status = U_FILE_ACCESS_ERROR;
|
|
return;
|
|
}
|
|
}
|
|
|
|
if(options[REFERENCE].doesOccur && !options[DIFF].doesOccur) {
|
|
if(createDir(platforms[gRefNum].name) == 0) {
|
|
referenceBundle = new UPrinter(defaultName, "en", "utf-8", NULL, FALSE);
|
|
fDefaultDump = fopen(defaultDumpName, "wb");
|
|
} else {
|
|
status = U_FILE_ACCESS_ERROR;
|
|
return;
|
|
}
|
|
}
|
|
|
|
if((options[PLATFORM].doesOccur && options[REFERENCE].doesOccur) || options[DIFF].doesOccur) {
|
|
if(createDir(platforms[gPlatformIndexes[0]].name) == 0) {
|
|
bundle = new UPrinter(diffName, "en", "utf-8", NULL, FALSE);
|
|
}
|
|
}
|
|
if(options[DIFF].doesOccur) {
|
|
fTailoringDump = fopen(tailoringDumpName, "rb");
|
|
fDefaultDump = fopen(defaultDumpName, "rb");
|
|
}
|
|
}
|
|
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
static UnicodeSet UNASSIGNED(UnicodeString("[:Cn:]"), status);
|
|
static UnicodeSet GENERAL_ACCENTS(UnicodeString("[[:block=Combining Diacritical Marks:]-[:Cn:]]"), status);
|
|
//static UnicodeSet ASCII_BASE(UnicodeString("[[:ASCII:]-[:L:]-[:N:]]"), status);
|
|
static UnicodeSet ASCII_BASE(UnicodeString("[[:ASCII:]]"), status);
|
|
static UnicodeSet ALPHABETIC(UnicodeString("[:alphabetic:]"), status);
|
|
//static UnicodeSet CONTROL(UnicodeString("[[:control:][\\u0000-\\u002F]]"), status);
|
|
static UnicodeSet BMP(UnicodeString("[\\u0000-\\uFFFF]"), status);
|
|
|
|
static UnicodeSet CONTROL(UnicodeString("[:control:]"), status);
|
|
|
|
/**
 * Switches every platform to the given locale: stores its LCID in gWinLCID,
 * sets the C library's LC_COLLATE category and replaces gCol with a
 * collator opened for the locale, with normalization turned on.
 *
 * @param locale locale name
 * @param status ICU error code for opening and configuring the collator
 * @return the new value of gCol
 */
UCollator *
setLocale(const char* locale, UErrorCode &status)
{
    gWinLCID = uloc_getLCID(locale);
    setlocale(LC_COLLATE, locale);

    // Drop the previous collator, if any, before opening the new one.
    if(gCol != NULL) {
        ucol_close(gCol);
    }

    gCol = ucol_open(locale, &status);
    ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);

    return gCol;
}
|
|
|
|
|
|
|
|
/**
 * Switches every platform to the reference settings: the LCID of "en", the
 * C library collation locale "en_US.UTF-8" and the ICU "root" collator with
 * normalization turned on.
 *
 * @param status ICU error code for opening and configuring the collator
 * @return the new value of gCol
 */
UCollator *
setReference(UErrorCode &status)
{
    gWinLCID = uloc_getLCID("en");
    setlocale(LC_COLLATE, "en_US.UTF-8");

    // Drop the previous collator, if any, before opening the new one.
    if(gCol != NULL) {
        ucol_close(gCol);
    }

    gCol = ucol_open("root", &status);
    ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);

    return gCol;
}
|
|
|
|
/**
 * Interactive mode: reads lines from stdin until fgets() returns NULL.
 * The lines are currently discarded.
 */
void
processInteractive() {
    char command[256];

    for(;;) {
        if(fgets(command, 256, stdin) == NULL) {
            break;
        }
    }
}
|
|
|
|
// Candidate sets of four probe characters for StrengthProbe. They are tried
// in order by processCollator() until the probe's sanity check passes.
UChar probeChars[][4] = {
    { 0x0061, 0x0062, 0x00E1, 0x0041 },  // Latin: a, b, a-acute, A
    { 0x0041, 0x0042, 0x00C1, 0x0061 },  // same, upper case first
    { 0x006E, 0x006F, 0x00F1, 0x004E },  // Latin: n, o, n-tilde, N
    { 0x004E, 0x004F, 0x00D1, 0x006E },  // same, upper case first
    { 0x0433, 0x0493, 0x0491, 0x0413 },  // Cyrillic
    { 0x0413, 0x0492, 0x0490, 0x0433 },  // same, upper case first
    { 0x3045, 0x3047, 0x3094, 0x3046 }   // Hiragana/Katakana (last resort)
};
|
|
|
|
void
|
|
processCollator(UCollator *col, UErrorCode &status) {
|
|
int32_t i = 0;
|
|
uint32_t j = 0;
|
|
gCol = col;
|
|
UChar ruleString[16384];
|
|
char myLoc[256];
|
|
|
|
int32_t ruleStringLength = ucol_getRulesEx(gCol, UCOL_TAILORING_ONLY, ruleString, 16384);
|
|
logger->log(UnicodeString(ruleString, ruleStringLength), TRUE);
|
|
const char *locale = ucol_getLocale(gCol, ULOC_REQUESTED_LOCALE, &status);
|
|
if(locale == NULL) {
|
|
locale = "en";
|
|
}
|
|
strcpy(myLoc, locale);
|
|
UnicodeSet exemplarUSet;
|
|
UnicodeSet RefRepertoire;
|
|
|
|
UnicodeSet tailored;
|
|
|
|
tailored = *((UnicodeSet *)ucol_getTailoredSet(gCol, &status));
|
|
tailored.removeAll(CONTROL);
|
|
|
|
|
|
UnicodeString pattern;
|
|
int sanityResult;
|
|
|
|
UnicodeSet hanSet;
|
|
UBool hanAppears = FALSE;
|
|
|
|
debug->log("\nGenerating order for platform: %s\n", platforms[gPlatformIndexes[0]].name);
|
|
gComparer = platforms[gPlatformIndexes[0]].comparer;
|
|
|
|
StrengthProbe probe(platforms[gPlatformIndexes[0]].comparer, platforms[gPlatformIndexes[0]].skgetter, 0x0030, probeChars[0][0], probeChars[0][1], probeChars[0][2], probeChars[0][3]);
|
|
sanityResult = probe.checkSanity();
|
|
j = 0;
|
|
while(sanityResult && j+1 < sizeof(probeChars)/sizeof(probeChars[0])) {
|
|
j++;
|
|
sanityResult = probe.setProbeChars(probeChars[j][0], probeChars[j][1], probeChars[j][2], probeChars[j][3]);
|
|
}
|
|
if(sanityResult) {
|
|
logger->log("Bad choice of probe characters! Sanity returned %i. Exiting\n", sanityResult, sanityResult);
|
|
return;
|
|
}
|
|
logger->log("Probe chars: %C, %C, %C, %C\n", probeChars[j][0], probeChars[j][1], probeChars[j][2], probeChars[j][3]);
|
|
|
|
debug->off();
|
|
|
|
if(gRepertoire.size()) {
|
|
exemplarUSet = gRepertoire;
|
|
} else {
|
|
generateRepertoire(locale, exemplarUSet, hanAppears, status);
|
|
}
|
|
exemplarUSet.addAll(tailored);
|
|
hanSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status);
|
|
exemplarUSet.removeAll(hanSet);
|
|
|
|
logger->log(exemplarUSet.toPattern(pattern, TRUE), TRUE);
|
|
|
|
exemplarUSet = flatten(exemplarUSet, status);
|
|
logger->log(exemplarUSet.toPattern(pattern, TRUE), TRUE);
|
|
|
|
if(!options[PRINTREF].doesOccur) {
|
|
|
|
logger->log("\n*** Detecting ordering for the locale\n\n");
|
|
|
|
debug->on();
|
|
SortedLines lines(exemplarUSet, gExcludeSet, probe, logger, debug);
|
|
lines.analyse(status);
|
|
lines.calculateSortKeys();
|
|
debug->log("\n*** Final order\n\n");
|
|
debug->log(lines.toPrettyString(TRUE, TRUE), TRUE);
|
|
lines.toFile(fTailoringDump, TRUE, status);
|
|
tailoringBundle->log(lines.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, TRUE, TRUE, hanAppears), TRUE);
|
|
//debug->off();
|
|
|
|
if(options[REFERENCE].doesOccur) {
|
|
status = U_ZERO_ERROR;
|
|
lines.getRepertoire(RefRepertoire);
|
|
setReference(status);
|
|
|
|
logger->log(exemplarUSet.toPattern(pattern, TRUE), TRUE);
|
|
logger->log(RefRepertoire.toPattern(pattern, TRUE), TRUE);
|
|
|
|
StrengthProbe RefProbe(platforms[gRefNum].comparer, platforms[gRefNum].skgetter);
|
|
logger->log("\n*** Detecting ordering for reference\n\n");
|
|
SortedLines RefLines(exemplarUSet, gExcludeSet, RefProbe, logger, debug);
|
|
RefLines.analyse(status);
|
|
referenceBundle->log(RefLines.toOutput(outputFormat, myLoc, platforms[gRefNum].name, NULL, TRUE, TRUE, FALSE), TRUE);
|
|
RefLines.toFile(fDefaultDump, TRUE, status);
|
|
|
|
lines.reduceDifference(RefLines);
|
|
logger->log("\n*** Final rules\n\n");
|
|
logger->log(lines.toPrettyString(TRUE), TRUE);
|
|
bundle->log(lines.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name, TRUE, TRUE, hanAppears), TRUE);
|
|
}
|
|
} else {
|
|
setReference(status);
|
|
StrengthProbe RefProbe(platforms[gRefNum].comparer, platforms[gRefNum].skgetter);
|
|
logger->log("\n*** Detecting ordering for reference\n\n");
|
|
SortedLines RefLines(exemplarUSet, gExcludeSet, RefProbe, logger, debug);
|
|
RefLines.analyse(status);
|
|
logger->log(RefLines.toPrettyString(TRUE), TRUE);
|
|
referenceBundle->log(RefLines.toOutput(outputFormat, myLoc, platforms[gRefNum].name, NULL, TRUE, TRUE, FALSE), TRUE);
|
|
}
|
|
if(hanAppears) {
|
|
// there are Han characters. This is a huge block. The best we can do is to just sort it, compare to empty
|
|
// and spit it out. Anything else would be a suicide (actually is - kernel just kills you :)
|
|
logger->log("\n*** Detecting order for Han\n");
|
|
debug->off();
|
|
setLocale(gLocale, status);
|
|
exemplarUSet.clear();
|
|
exemplarUSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status);
|
|
exemplarUSet = flatten(exemplarUSet, status);
|
|
SortedLines han(exemplarUSet, gExcludeSet, probe, logger, debug);
|
|
han.sort(TRUE, TRUE);
|
|
han.classifyRepertoire();
|
|
han.getBounds(status);
|
|
tailoringBundle->log("Han ordering:<br>\n");
|
|
tailoringBundle->log(han.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, TRUE, FALSE, FALSE), TRUE);
|
|
bundle->log(han.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, TRUE, FALSE, FALSE), TRUE);
|
|
}
|
|
ucol_close(gCol);
|
|
}
|
|
|
|
void
|
|
processLocale(const char *locale, UErrorCode &status) {
|
|
setLocale(locale, status);
|
|
setFiles(locale, status);
|
|
if(U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
|
|
debug->log("Locale %s (LCID:%06X, unix:%s)\n", locale, gWinLCID, setlocale(LC_COLLATE, NULL));
|
|
tailoringBundle->log("// Ordering for locale %s (LCID:%06X, unix:%s), platform %s reference %s<br>\n",
|
|
locale, gWinLCID, setlocale(LC_COLLATE, NULL),
|
|
platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name);
|
|
if(options[REFERENCE].doesOccur) {
|
|
referenceBundle->log("// Reference for locale %s (LCID:%06X, unix:%s), platform %s reference %s<br>\n",
|
|
locale, gWinLCID, setlocale(LC_COLLATE, NULL),
|
|
platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name);
|
|
}
|
|
|
|
|
|
processCollator(gCol, status);
|
|
}
|
|
|
|
|
|
|
|
/**
 * Tells whether the resource bundle of a locale carries its own
 * "CollationElements" entry.
 *
 * @param locName locale name
 * @return TRUE only when the bundle opens without a failure and the lookup
 *         finishes with exactly U_ZERO_ERROR; any other lookup status
 *         counts as "no"
 */
UBool
hasCollationElements(const char *locName) {
    UErrorCode status = U_ZERO_ERROR;
    UResourceBundle *loc = ures_open(NULL, locName, &status);

    if(U_FAILURE(status)) {
        return FALSE;
    }

    // Start the lookup from a clean status so that a warning left by
    // ures_open() does not mask the result.
    status = U_ZERO_ERROR;
    UResourceBundle *ColEl = ures_getByKey(loc, "CollationElements", NULL, &status);
    UBool found = (UBool)(status == U_ZERO_ERROR); /* do the test - there are real elements */

    ures_close(ColEl);
    ures_close(loc);
    return found;
}
|
|
|
|
int
|
|
main(int argc,
|
|
char* argv[])
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
logger = new UPrinter(stdout, "en", "latin-1");
|
|
debug = new UPrinter(stderr, "en", "latin-1");
|
|
|
|
/*
|
|
USet *wsp = uprv_openRuleWhiteSpaceSet(&status);
|
|
uset_add(wsp, 0x0041);
|
|
uset_remove(wsp, 0x0041);
|
|
UnicodeString pat;
|
|
((UnicodeSet *)wsp)->toPattern(pat, TRUE);
|
|
pat.setCharAt(pat.length(), 0);
|
|
escapeString(pat.getBuffer(), pat.length(), log);
|
|
u_fflush(log);
|
|
*/
|
|
|
|
processArgs(argc, argv, status);
|
|
int32_t i = 0;
|
|
|
|
|
|
|
|
if(U_FAILURE(status) || gPlatformNo == 0) {
|
|
return -1;
|
|
}
|
|
|
|
utf8cnv = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now.
|
|
gUCA = ucol_open("root", &status);
|
|
|
|
if(options[INTERACTIVE].doesOccur) {
|
|
processInteractive();
|
|
} else {
|
|
if(gRulesStdin) {
|
|
char buffer[1024];
|
|
UChar ruleBuffer[16384];
|
|
UChar *rules = ruleBuffer;
|
|
int32_t maxRuleLen = 16384;
|
|
int32_t rLen = 0;
|
|
while(fgets(buffer, 1024, stdin)) {
|
|
if(buffer[0] != '/' && buffer[1] != '/') {
|
|
rLen = u_unescape(buffer, rules, maxRuleLen);
|
|
rules += rLen;
|
|
maxRuleLen -= rLen;
|
|
}
|
|
}
|
|
UParseError parseError;
|
|
//escapeString(ruleBuffer, rules-ruleBuffer, log);//
|
|
debug->log("%U\n", ruleBuffer);
|
|
|
|
UCollator *col = ucol_openRules(ruleBuffer, rules-ruleBuffer, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &status);
|
|
if(U_SUCCESS(status)) {
|
|
setFiles("stdinRules", status);
|
|
processCollator(col, status);
|
|
} else {
|
|
logger->log("Error %s\n", u_errorName(status));
|
|
}
|
|
} else if(options[DIFF].doesOccur) {
|
|
logger->log("Diffing two dumps\n");
|
|
// must have locale, platform and ref in order to be
|
|
// able to find dump files.
|
|
setFiles(gLocale, status);
|
|
|
|
if(fTailoringDump && fDefaultDump) {
|
|
SortedLines tailoring(fTailoringDump, logger, debug, status);
|
|
logger->log(tailoring.toString(TRUE), TRUE);
|
|
SortedLines reference(fDefaultDump, logger, debug, status);
|
|
logger->log(reference.toString(TRUE), TRUE);
|
|
tailoring.reduceDifference(reference);
|
|
logger->log("\n*** Final rules\n\n");
|
|
logger->log(tailoring.toPrettyString(TRUE), TRUE);
|
|
//result->log(lines.toPrettyString(TRUE), TRUE);
|
|
bundle->log(tailoring.toOutput(outputFormat, gLocale, platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name, TRUE, TRUE, FALSE), TRUE);
|
|
}
|
|
|
|
} else {
|
|
if(gLocale) {
|
|
processLocale(gLocale, status);
|
|
} else if(gLocaleNo) {
|
|
for(i = 0; i < gLocaleNo; i++) {
|
|
processLocale(gLocales[i], status);
|
|
}
|
|
} else { // do the loop through all the locales
|
|
int32_t noOfLoc = uloc_countAvailable();
|
|
const char *locName = NULL;
|
|
for(i = 0; i<noOfLoc; i++) {
|
|
status = U_ZERO_ERROR;
|
|
locName = uloc_getAvailable(i);
|
|
if(hasCollationElements(locName)) {
|
|
processLocale(locName, status);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
ucol_close(gUCA);
|
|
ucnv_close(utf8cnv);
|
|
|
|
delete logger;
|
|
delete debug;
|
|
if(tailoringBundle) {
|
|
delete tailoringBundle;
|
|
}
|
|
if(referenceBundle) {
|
|
delete referenceBundle;
|
|
}
|
|
if(bundle) {
|
|
delete bundle;
|
|
}
|
|
if(fTailoringDump) {
|
|
fclose(fTailoringDump);
|
|
}
|
|
if(fDefaultDump) {
|
|
fclose(fDefaultDump);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
/**
 * Builds the label "<property long name>=<value long name>".
 *
 * @param prop property to name
 * @param i    value of that property to name
 * @return the combined label
 */
UnicodeString propertyAndValueName(UProperty prop, int32_t i) {
    const char *propertyName = u_getPropertyName(prop, U_LONG_PROPERTY_NAME);
    const char *valueName = u_getPropertyValueName(prop, i, U_LONG_PROPERTY_NAME);

    UnicodeString label;
    label.append(propertyName);
    label.append("=");
    label.append(valueName);
    return label;
}
|
|
|
|
|
|
void generateRepertoire(const char *locale, UnicodeSet &rep, UBool &hanAppears, UErrorCode &status) {
|
|
UnicodeString dispName;
|
|
debug->log("Getting repertoire for %s\n", locale);
|
|
tailoringBundle->log("// Scripts in repertoire: ");
|
|
if(options[REFERENCE].doesOccur) {
|
|
referenceBundle->log("// Scripts in repertoire: ");
|
|
}
|
|
rep.clear();
|
|
UnicodeSet delta;
|
|
|
|
UScriptCode script[256];
|
|
int32_t i = 0;
|
|
// now add the scripts for the locale
|
|
UProperty prop = UCHAR_SCRIPT;
|
|
int32_t scriptLength = uscript_getCode(locale, script, 256, &status);
|
|
if(scriptLength) {
|
|
for (i = 0; i < scriptLength; ++i) {
|
|
if(script[i] == USCRIPT_HAN) {
|
|
hanAppears = TRUE;
|
|
continue;
|
|
}
|
|
delta.applyIntPropertyValue(prop, script[i], status);
|
|
debug->log("Adding ");
|
|
debug->log(propertyAndValueName(prop, script[i]), TRUE);
|
|
tailoringBundle->log("// ");
|
|
tailoringBundle->log(propertyAndValueName(prop, script[i]), TRUE);
|
|
if(options[REFERENCE].doesOccur) {
|
|
referenceBundle->log("// ");
|
|
referenceBundle->log(propertyAndValueName(prop, script[i]), TRUE);
|
|
}
|
|
rep.addAll(delta);
|
|
}
|
|
} else {
|
|
delta.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_LATIN, status);
|
|
rep.addAll(delta);
|
|
}
|
|
|
|
// now see which blocks those overlap, and add
|
|
prop = UCHAR_BLOCK;
|
|
int32_t min = u_getIntPropertyMinValue(prop);
|
|
int32_t max = u_getIntPropertyMaxValue(prop);
|
|
UnicodeSet checkDelta;
|
|
for (i = min; i <= max; ++i) {
|
|
// skip certain blocks
|
|
const char *name = u_getPropertyValueName(prop, i, U_LONG_PROPERTY_NAME);
|
|
if (strcmp(name, "Superscripts_and_Subscripts") == 0
|
|
|| strcmp(name, "Letterlike_Symbols") == 0
|
|
|| strcmp(name, "Alphabetic_Presentation_Forms") == 0
|
|
|| strcmp(name, "Halfwidth_and_Fullwidth_Forms") == 0) continue;
|
|
|
|
delta.applyIntPropertyValue(prop, i, status).removeAll(UNASSIGNED);
|
|
if (!rep.containsSome(delta)) continue;
|
|
if (rep.containsAll(delta)) continue; // just to see what we are adding
|
|
debug->log("Adding ");
|
|
debug->log(propertyAndValueName(prop, i), TRUE);
|
|
tailoringBundle->log("// ");
|
|
tailoringBundle->log(propertyAndValueName(prop, i), TRUE);
|
|
if(options[REFERENCE].doesOccur) {
|
|
referenceBundle->log("// ");
|
|
referenceBundle->log(propertyAndValueName(prop, i), TRUE);
|
|
}
|
|
rep.addAll(delta);
|
|
}
|
|
|
|
// add ASCII and general accents
|
|
rep.addAll(GENERAL_ACCENTS).addAll(ASCII_BASE);
|
|
rep.removeAll(CONTROL);
|
|
//delta.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status);
|
|
//rep.removeAll(delta);
|
|
|
|
// now add the exemplar characters
|
|
// can't get at them from Java right now
|
|
tailoringBundle->log("<br>\n");
|
|
if(options[REFERENCE].doesOccur) {
|
|
referenceBundle->log("<br>\n");
|
|
}
|
|
}
|
|
|
|
/**
 * Returns the set of all code points that occur in the NFD or NFKD form of
 * any element (code point or string) of source.
 *
 * @param source set to decompose
 * @param status ICU error code passed to the normalizer
 * @return the union of the characters of all decompositions
 */
UnicodeSet flatten(const UnicodeSet &source, UErrorCode &status) {
    UnicodeSet flattened;
    UnicodeString nfd, nfkd, single;

    for(UnicodeSetIterator iter(source); iter.next(); ) {
        // Pick the text to normalize: the string element itself, or a
        // one-code-point string for a plain code point element.
        const UnicodeString *input;
        if(iter.isString()) {
            input = &iter.getString();
        } else {
            single.setTo(iter.getCodepoint());
            input = &single;
        }

        Normalizer::normalize(*input, UNORM_NFD, 0, nfd, status);
        Normalizer::normalize(*input, UNORM_NFKD, 0, nfkd, status);

        flattened.addAll(nfd);
        flattened.addAll(nfkd);
    }
    return flattened;
}
|
|
|
|
|
|
/**
 * Experiment: for each of a fixed list of "interesting" characters X and
 * every BMP unit j that the probe does not treat as identical to the empty
 * string, counts the units k with k < X for which the pair (j,k) does not
 * sort before (j,X), and logs that count.
 *
 * @param probe  strength probe used for all comparisons
 * @param status ICU error code used when building the local set
 */
void testWin(StrengthProbe &probe, UErrorCode &status)
{
    UnicodeSet trailings(UnicodeString("[\\uFE7D\\uFE7C\\u30FD\\uFF70\\u30FC\\u309D\\u3032\\u3031\\u3005\\u0651]"), status);
    char intChar[] = "\\uFE7D\\uFE7C\\u30FD\\uFF70\\u30FC\\u309D\\u3032\\u3031\\u3005\\u0651";
    UChar interesting[256];
    int32_t intLen = u_unescape(intChar, interesting, 256);
    int32_t count;
    Line myCh, combo, trial, inter, kLine;

    for(UChar i = 0; i < intLen; i++) {
        inter.setTo(interesting[i]);
        logger->log(inter.toString(TRUE), TRUE);
        logger->log("----------------------\n");

        for(UChar j = 0; j < 0xFFFF; j++) {
            myCh.setTo(j);
            if(probe.distanceFromEmptyString(myCh) == UCOL_IDENTICAL) {
                continue;
            }
            logger->log(myCh.toString(TRUE));

            combo.setTo(j);
            combo.append(interesting[i]);

            count = 0;
            for(UChar k = 0; k < 0xFFFF; k++) {
                kLine.setTo(k);
                trial.setTo(j);
                trial.append(k);
                if(probe.compare(kLine, inter) < 0 && probe.compare(trial, combo) >= 0) {
                    count++;
                }
            }
            logger->log("%i %i\n", count, count);
        }
    }
}
|
|
|