ICU-2199 use exemplar characters for chart generation
X-SVN-Rev: 12104
This commit is contained in:
parent
39d923f573
commit
1b1cf58c46
@ -37,6 +37,9 @@
|
||||
#include "cstring.h"
|
||||
#include "uoptions.h"
|
||||
#include "ucol_imp.h"
|
||||
#include <unicode/ures.h>
|
||||
#include <unicode/uniset.h>
|
||||
#include <unicode/usetiter.h>
|
||||
|
||||
/**
|
||||
* Command line option variables.
|
||||
@ -44,18 +47,19 @@
|
||||
* command line by the user.
|
||||
*/
|
||||
/*
 * Command-line option table. Other code in this file addresses entries by
 * numeric position (options[0] and options[1] for help, options[4] for the
 * destination directory, options[12] for "wholescripts"), so the order of
 * the rows is significant and the index comments must stay in sync with it.
 * NOTE(review): the un-numbered rows below appear to be the pre-change copy
 * of the same entries as rendered by the diff view -- confirm before relying
 * on this listing as a single initializer.
 */
static UOption options[]={
|
||||
UOPTION_HELP_H,
|
||||
UOPTION_HELP_QUESTION_MARK,
|
||||
{"locale", NULL, NULL, NULL, 'l', UOPT_REQUIRES_ARG, 0},
|
||||
{"serialize", NULL, NULL, NULL, 'z', UOPT_NO_ARG, 0},
|
||||
UOPTION_DESTDIR,
|
||||
UOPTION_SOURCEDIR,
|
||||
{"attribute", NULL, NULL, NULL, 'a', UOPT_REQUIRES_ARG, 0},
|
||||
{"rule", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0},
|
||||
{"normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0},
|
||||
{"scripts", NULL, NULL, NULL, 't', UOPT_NO_ARG, 0},
|
||||
{"reducehan", NULL, NULL, NULL, 'e', UOPT_NO_ARG, 0},
|
||||
UOPTION_VERBOSE
|
||||
/* 00 */ UOPTION_HELP_H,
|
||||
/* 01 */ UOPTION_HELP_QUESTION_MARK,
|
||||
/* 02 */ {"locale", NULL, NULL, NULL, 'l', UOPT_REQUIRES_ARG, 0},
|
||||
/* 03 */ {"serialize", NULL, NULL, NULL, 'z', UOPT_NO_ARG, 0},
|
||||
/* 04 */ UOPTION_DESTDIR,
|
||||
/* 05 */ UOPTION_SOURCEDIR,
|
||||
/* 06 */ {"attribute", NULL, NULL, NULL, 'a', UOPT_REQUIRES_ARG, 0},
|
||||
/* 07 */ {"rule", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0},
|
||||
/* 08 */ {"normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0},
|
||||
/* 09 */ {"scripts", NULL, NULL, NULL, 't', UOPT_NO_ARG, 0},
|
||||
/* 10 */ {"reducehan", NULL, NULL, NULL, 'e', UOPT_NO_ARG, 0},
|
||||
/* 11 */ UOPTION_VERBOSE,
|
||||
/* 12 */ {"wholescripts", NULL, NULL, NULL, 'W', UOPT_NO_ARG, 0}
|
||||
};
|
||||
|
||||
/**
|
||||
@ -263,6 +267,8 @@ void serialize(FILE *f, UChar *rule, int rlen, UBool contractiononly,
|
||||
|
||||
while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,
|
||||
&error)) != NULL) {
|
||||
chOffset = src.parsedToken.charsOffset;
|
||||
chLen = src.parsedToken.charsLen;
|
||||
// contractions handled here
|
||||
if (!contractiononly || chLen > 1) {
|
||||
ucol_setText(iter, rule + chOffset, chLen, &error);
|
||||
@ -940,6 +946,85 @@ inline UBool checkInScripts(UScriptCode script[], int scriptcount,
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the script elements and contractions belonging to the script
|
||||
* @param elems output list
|
||||
* @param locale locale
|
||||
* @return number of script elements
|
||||
* Add by Richard
|
||||
*/
|
||||
int getScriptElementsFromExemplars(ScriptElement scriptelem[], const char* locale) {
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
UChar32 codepoint = 0;
|
||||
|
||||
UResourceBundle* ures = ures_open(NULL, locale, &error);
|
||||
if (U_FAILURE(error)) {
|
||||
fprintf(stdout, "Can not find resource bundle for locale: %s\n", locale);
|
||||
return -1;
|
||||
}
|
||||
int32_t length;
|
||||
const UChar* exemplarChars = ures_getStringByKey(ures, "ExemplarCharacters", &length, &error);
|
||||
|
||||
if (U_FAILURE(error)) {
|
||||
fprintf(stdout, "Can not find ExemplarCharacters in resource bundle\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
UChar* upperChars = new UChar[length*2];
|
||||
if (upperChars == 0) {
|
||||
fprintf(stdout, "Memory error\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t destLength = u_strToUpper(upperChars, length*2, exemplarChars, -1, locale, &error);
|
||||
if (U_FAILURE(error)) {
|
||||
fprintf(stdout, "Error when u_strToUpper() \n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
UChar* pattern = new UChar[length + destLength + 10];
|
||||
UChar left[2] = {0x005b, 0x0};
|
||||
UChar right[2] = {0x005d, 0x0};
|
||||
pattern = u_strcpy(pattern, left);
|
||||
pattern = u_strcat(pattern, exemplarChars);
|
||||
pattern = u_strcat(pattern, upperChars);
|
||||
pattern = u_strcat(pattern, right);
|
||||
|
||||
UnicodeSet * uniset = new UnicodeSet(UnicodeString(pattern), error);
|
||||
if (U_FAILURE(error)) {
|
||||
fprintf(stdout, "Can not open USet \n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
UnicodeSetIterator* usetiter = new UnicodeSetIterator(*uniset);
|
||||
|
||||
int32_t count = 0;
|
||||
|
||||
while (usetiter -> next()) {
|
||||
if (usetiter -> isString()) {
|
||||
UnicodeString strItem = usetiter -> getString();
|
||||
|
||||
scriptelem[count].count = 0;
|
||||
for (int i = 0; i < strItem.length(); i++) {
|
||||
codepoint = strItem.char32At(i);
|
||||
UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch,
|
||||
scriptelem[count].count, codepoint);
|
||||
scriptelem[count].tailored = FALSE;
|
||||
}
|
||||
} else {
|
||||
codepoint = usetiter -> getCodepoint();
|
||||
scriptelem[count].count = 0;
|
||||
UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch,
|
||||
scriptelem[count].count, codepoint);
|
||||
scriptelem[count].tailored = FALSE;
|
||||
}
|
||||
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the script elements and contractions belonging to the script
|
||||
* @param script list
|
||||
@ -1173,7 +1258,9 @@ void outputHTMLFooter()
|
||||
* @param script code list
|
||||
* @param scriptcount number of scripts
|
||||
*/
|
||||
void serializeScripts(UScriptCode script[], int scriptcount)
|
||||
//void serializeScripts(UScriptCode script[], int scriptcount)
|
||||
//Richard
|
||||
void serializeScripts(UScriptCode script[], int scriptcount, const char* locale = NULL)
|
||||
{
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
|
||||
@ -1183,7 +1270,13 @@ void serializeScripts(UScriptCode script[], int scriptcount)
|
||||
fprintf(stdout, "Memory error\n");
|
||||
return;
|
||||
}
|
||||
int count = getScriptElements(script, scriptcount, scriptelem);
|
||||
int count = 0;
|
||||
if(locale) {
|
||||
count = getScriptElementsFromExemplars(scriptelem, locale);
|
||||
} else {
|
||||
count = getScriptElements(script, scriptcount, scriptelem);
|
||||
}
|
||||
|
||||
// Sort script elements using Quicksort algorithm:
|
||||
qsort(scriptelem, count, sizeof(ScriptElement), compareCodepoints);
|
||||
markTailored(script, scriptcount, scriptelem, count);
|
||||
@ -1247,7 +1340,7 @@ void outputHTMLHeader(const char *locale, UScriptCode script[],
|
||||
fprintf(OUTPUT_, "<table border=0>\n");
|
||||
UChar displayname[64];
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
int32_t size = uloc_getDisplayName(locale, NULL, displayname, 64, &error);
|
||||
int32_t size = uloc_getDisplayName(locale, "en_US", displayname, 64, &error);
|
||||
char utf8displayname[128];
|
||||
if (U_FAILURE(error)) {
|
||||
utf8displayname[0] = 0;
|
||||
@ -1362,6 +1455,8 @@ void serializeScripts() {
|
||||
if (options[4].doesOccur) {
|
||||
strcpy(filename, options[4].value);
|
||||
dirlength = appendDirSeparator(filename);
|
||||
} else {
|
||||
filename[0] = 0;
|
||||
}
|
||||
|
||||
const char *locale;
|
||||
@ -1417,7 +1512,14 @@ void serializeScripts() {
|
||||
}
|
||||
outputHTMLHeader(locale, scriptcode, scriptcount);
|
||||
fprintf(stdout, "%s\n", locale);
|
||||
serializeScripts(scriptcode, scriptcount);
|
||||
|
||||
if(options[12].doesOccur) {
|
||||
// use whole scripts
|
||||
serializeScripts(scriptcode, scriptcount);
|
||||
} else {
|
||||
// use exemplar chars
|
||||
serializeScripts(scriptcode, scriptcount, locale);
|
||||
}
|
||||
fclose(OUTPUT_);
|
||||
}
|
||||
ucol_close(COLLATOR_);
|
||||
@ -1451,7 +1553,7 @@ int main(int argc, char *argv[]) {
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
if (argc < 0 || options[0].doesOccur || options[1].doesOccur) {
|
||||
fprintf(stdout, "Usage: strperf options...\n"
|
||||
fprintf(stdout, "Usage: dumpce options...\n"
|
||||
"--help\n"
|
||||
" Display this message.\n"
|
||||
"--locale name|all\n"
|
||||
@ -1471,7 +1573,9 @@ int main(int argc, char *argv[]) {
|
||||
"--scripts\n"
|
||||
" Codepoints from all scripts are sorted and serialized.\n"
|
||||
"--reducehan\n"
|
||||
" Only 200 Han script characters will be displayed with the use of --scripts.\n\n");
|
||||
" Only 200 Han script characters will be displayed with the use of --scripts.\n"
|
||||
"--wholescripts\n"
|
||||
" Show collation order for whole scripts instead of just for exemplar characters of a locale\n\n");
|
||||
|
||||
fprintf(stdout, "Example to generate *.txt files : dumpce --serialize --locale af --destdir /temp --attribute UCOL_STRENGTH=UCOL_DEFAULT_STRENGTH,4=17\n\n");
|
||||
fprintf(stdout, "Example to generate *.html files for oss web display: dumpce --scripts --destdir /temp --reducehan\n");
|
||||
|
Loading…
Reference in New Issue
Block a user