ICU-8972 genprops: start to read ppucd.txt, use that for Decomposition_Type

X-SVN-Rev: 31137
This commit is contained in:
Markus Scherer 2011-12-16 06:51:58 +00:00
parent d25c4ca662
commit 03f9f2307e
5 changed files with 138 additions and 91 deletions

View File

@ -251,13 +251,6 @@ static UTrie2 *pTrie=NULL;
/* -------------------------------------------------------------------------- */
/*
 * Converts the version string v with u_versionFromString() and copies the
 * resulting 4 bytes into dataInfo.dataVersion, replacing whatever version
 * was stored there before.
 */
U_CFUNC void
setUnicodeVersion(const char *v) {
    UVersionInfo parsedVersion;

    u_versionFromString(parsedVersion, v);
    uprv_memcpy(dataInfo.dataVersion, parsedVersion, 4);
}
U_CFUNC void
initStore() {
UErrorCode errorCode=U_ZERO_ERROR;
@ -496,6 +489,31 @@ generateData(const char *dataDir, UBool csource) {
}
}
// PropsWriter subclass defined in this file.
// It overrides both notification hooks of the base class; see the
// member definitions below for what each one does.
class CorePropsWriter : public PropsWriter {
public:
    // Receives the Unicode version of the data being processed.
    virtual void setUnicodeVersion(const UVersionInfo version);

    // Receives the property values for one code point range.
    virtual void setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode);
};
void
CorePropsWriter::setUnicodeVersion(const UVersionInfo version) {
uprv_memcpy(dataInfo.dataVersion, version, 4);
}
// Deliberately empty: this writer does not consume any property values here.
// The parameter names are commented out because none of them is used.
void
CorePropsWriter::setProps(const UniProps & /*props*/,
                          const UnicodeSet & /*newValues*/,
                          UErrorCode & /*errorCode*/) {
}
// Factory for the CorePropsWriter.
// Returns NULL without allocating if errorCode already indicates a failure.
// If the allocation yields NULL, errorCode is set to
// U_MEMORY_ALLOCATION_ERROR and NULL is returned.
PropsWriter *
createCorePropsWriter(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        PropsWriter *writer=new CorePropsWriter();
        if(writer!=NULL) {
            return writer;
        }
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return NULL;
}
/*
* Hey, Emacs, please set the following:
*

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2010, International Business Machines
* Copyright (C) 1999-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -23,27 +23,36 @@
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/localpointer.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/uclean.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "genprops.h"
#include "propsvec.h"
#include "ppucd.h"
#include "toolutil.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "uprops.h"
#include "propsvec.h"
U_CDECL_BEGIN
#include "genprops.h"
U_CDECL_END
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
// TODO: remove
#define USE_NEW 1
U_NAMESPACE_USE
UBool beVerbose=FALSE, haveCopyright=TRUE;
// Default implementations of the PropsWriter hooks: both do nothing,
// so a subclass only needs to override the ones it cares about.
void
PropsWriter::setUnicodeVersion(const UVersionInfo /*version*/) {
}

void
PropsWriter::setProps(const UniProps & /*props*/,
                      const UnicodeSet & /*newValues*/,
                      UErrorCode & /*errorCode*/) {
}
/* prototypes --------------------------------------------------------------- */
static void
@ -59,7 +68,6 @@ enum
COPYRIGHT,
DESTDIR,
SOURCEDIR,
UNICODE_VERSION,
ICUDATADIR,
CSOURCE
};
@ -72,7 +80,6 @@ static UOption options[]={
UOPTION_COPYRIGHT,
UOPTION_DESTDIR,
UOPTION_SOURCEDIR,
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
UOPTION_ICUDATADIR,
UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
};
@ -82,14 +89,12 @@ main(int argc, char* argv[]) {
char filename[300];
const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
char *basename=NULL;
UErrorCode errorCode=U_ZERO_ERROR;
U_MAIN_INIT_ARGS(argc, argv);
/* preset then read command line options */
options[DESTDIR].value=u_getDataDirectory();
options[SOURCEDIR].value="";
options[UNICODE_VERSION].value="";
options[ICUDATADIR].value=u_getDataDirectory();
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
@ -136,17 +141,47 @@ main(int argc, char* argv[]) {
srcDir=options[SOURCEDIR].value;
destDir=options[DESTDIR].value;
/* initialize */
initStore();
IcuToolErrorCode errorCode("genprops");
LocalPointer<PropsWriter> corePropsWriter(createCorePropsWriter(errorCode));
LocalPointer<PropsWriter> props2Writer(createProps2Writer(errorCode));
if(errorCode.isFailure()) {
fprintf(stderr, "genprops: unable to create PropsWriters - %s\n", errorCode.errorName());
return errorCode.reset();
}
CharString ppucdPath(srcDir, errorCode);
ppucdPath.appendPathPart("ppucd.txt", errorCode);
PreparsedUCD ppucd(ppucdPath.data(), errorCode);
if(errorCode.isFailure()) {
fprintf(stderr, "genprops: unable to open %s - %s\n",
ppucdPath.data(), errorCode.errorName());
return errorCode.reset();
}
PreparsedUCD::LineType lineType;
UnicodeSet newValues;
int i=0;
while((lineType=ppucd.readLine(errorCode))!=PreparsedUCD::NO_LINE) {
if(ppucd.lineHasPropertyValues()) {
const UniProps *props=ppucd.getProps(newValues, errorCode);
props2Writer->setProps(*props, newValues, errorCode);
} else if(lineType==PreparsedUCD::UNICODE_VERSION_LINE) {
const UVersionInfo &version=ppucd.getUnicodeVersion();
corePropsWriter->setUnicodeVersion(version);
}
++i;
}
printf("*** parsed %d lines from ppucd.txt\n", i);
if(argc>=2) {
suffix=argv[1];
} else {
suffix=NULL;
}
if(options[UNICODE_VERSION].doesOccur) {
setUnicodeVersion(options[UNICODE_VERSION].value);
}
/* else use the default dataVersion in store.c */
if (options[ICUDATADIR].doesOccur) {
u_setDataDirectory(options[ICUDATADIR].value);
}
@ -158,16 +193,13 @@ main(int argc, char* argv[]) {
*basename++=U_FILE_SEP_CHAR;
}
/* initialize */
initStore();
/* process UnicodeData.txt */
writeUCDFilename(basename, "UnicodeData", suffix);
parseDB(filename, &errorCode);
parseDB(filename, errorCode);
/* process additional properties files */
*basename=0;
generateAdditionalProperties(filename, suffix, &errorCode);
generateAdditionalProperties(filename, suffix, errorCode);
/* process parsed data */
if(U_SUCCESS(errorCode)) {
@ -259,28 +291,6 @@ genCategoryNames[U_CHAR_CATEGORY_COUNT]={
"Pi", "Pf"
};
/*
 * Names of the compatibility decomposition types, as they appear between
 * '<' and '>' at the start of the decomposition field (field 5) of a
 * UnicodeData line.
 *
 * The table is searched with getTokenIndex(); the index of the matching
 * entry is the decomposition type value that gets stored under
 * UPROPS_DT_MASK, so the order of the entries must not be changed.
 */
const char *const
decompositionTypeNames[U_DT_COUNT]={
NULL, /* no name to match; presumably the "no decomposition" value - TODO confirm against the U_DT_ enum */
NULL, /* no name to match; presumably U_DT_CANONICAL, which the parser sets without a name lookup - TODO confirm */
"compat",
"circle",
"final",
"font",
"fraction",
"initial",
"isolated",
"medial",
"narrow",
"noBreak",
"small",
"square",
"sub",
"super",
"vertical",
"wide"
};
static struct {
uint32_t first, last, props;
char name[80];
@ -320,34 +330,6 @@ unicodeDataLineFn(void *context,
exit(U_PARSE_ERROR);
}
/* get decomposition type, field 5 */
if(fields[5][0]<fields[5][1]) {
/* there is some decomposition */
if(*fields[5][0]!='<') {
/* canonical */
i=U_DT_CANONICAL;
} else {
/* get compatibility type */
end=fields[5][0]+1;
while(end<fields[5][1] && *end!='>') {
++end;
}
*end='#';
i=getTokenIndex(decompositionTypeNames, U_DT_COUNT, fields[5][0]+1);
if(i<0) {
fprintf(stderr, "genprops: unknown decomposition type \"%s\" at code 0x%lx\n",
fields[5][0], (unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
}
upvec_setValue(pv, p.code, p.code, 2, (uint32_t)i, UPROPS_DT_MASK, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(*pErrorCode));
exit(*pErrorCode);
}
}
/* decimal digit value, field 6 */
if(fields[6][0]<fields[6][1]) {
value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
@ -544,14 +526,6 @@ repeatAreaProps() {
repeatProps(0x100000, 0x10fffd, puaProps);
}
}
/* Hangul have canonical decompositions */
errorCode=U_ZERO_ERROR;
upvec_setValue(pv, 0xac00, 0xd7a3, 2, (uint32_t)U_DT_CANONICAL, UPROPS_DT_MASK, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops error: unable to set decomposition type: %s\n", u_errorName(errorCode));
exit(errorCode);
}
}
static void

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2010, International Business Machines
* Copyright (C) 1999-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -18,13 +18,25 @@
#define __GENPROPS_H__
#include "unicode/utypes.h"
#include "utrie.h"
#include "unicode/uniset.h"
#include "ppucd.h"
#include "propsvec.h"
/* file definitions */
#define DATA_NAME "uprops"
#define DATA_TYPE "icu"
/*
 * Base class for objects that receive parsed Unicode property data.
 *
 * The member functions are notification hooks; a subclass overrides the
 * ones it needs. Instances are created by the create...Writer() factory
 * functions and are owned and deleted through PropsWriter pointers.
 */
class PropsWriter {
public:
    // Virtual destructor: subclass instances are deleted through
    // PropsWriter pointers (e.g. LocalPointer<PropsWriter>), which is
    // undefined behavior unless the base class destructor is virtual.
    virtual ~PropsWriter() {}

    // Called with the Unicode version of the data being processed.
    virtual void setUnicodeVersion(const UVersionInfo version);

    // Called with the properties of one code point range.
    // props: the property values for the range.
    // newValues: set of the properties whose values are new for this call.
    // errorCode: in/out ICU error code.
    virtual void setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode);

    // virtual writeCSourceFile(icusrcroot);
    // virtual writeBinaryData(icusrcroot);
};
// Factory functions for the concrete PropsWriter implementations.
// Each returns a newly allocated writer which the caller must delete,
// or NULL if errorCode indicates a failure on entry or the allocation fails
// (in which case errorCode is set to U_MEMORY_ALLOCATION_ERROR).
PropsWriter *createCorePropsWriter(UErrorCode &errorCode);
PropsWriter *createProps2Writer(UErrorCode &errorCode);
/* character properties */
typedef struct {
uint32_t code;
@ -52,9 +64,6 @@ isToken(const char *token, const char *s);
U_CFUNC int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s);
U_CFUNC void
setUnicodeVersion(const char *v);
U_CFUNC void
initStore(void);

View File

@ -503,6 +503,20 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);
newTrie=upvec_compactToUTrie2WithRowIndexes(pv, pErrorCode);
// TODO: remove
#if 0
const uint32_t *pvArray;
int32_t pvRows;
pvArray=upvec_getArray(pv, &pvRows, NULL);
for(int32_t c=0; c<=0x10ffff; ++c) {
uint16_t ri=utrie2_get32(newTrie, c);
uint32_t v2=pvArray[ri+2];
int32_t dt=v2&UPROPS_DT_MASK;
if(dt!=0) {
printf("%04x %d\n", c, dt);
}
}
#endif
if(U_FAILURE(*pErrorCode)) {
fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n",
u_errorName(*pErrorCode));
@ -923,3 +937,33 @@ writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROP
return additionalPropsSize;
}
// PropsWriter subclass defined in this file.
// Only setProps() is overridden; setUnicodeVersion() keeps the base class
// behavior.
class Props2Writer : public PropsWriter {
public:
    // Receives the property values for one code point range.
    virtual void setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode);
};
void
Props2Writer::setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
if(newValues.contains(UCHAR_DECOMPOSITION_TYPE)) {
upvec_setValue(pv, props.start, props.end,
2, (uint32_t)props.getIntProp(UCHAR_DECOMPOSITION_TYPE), UPROPS_DT_MASK,
&errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops error: unable to set decomposition type: %s\n",
u_errorName(errorCode));
exit(errorCode);
}
}
}
// Factory for the Props2Writer.
// Returns NULL without allocating if errorCode already indicates a failure.
// If the allocation yields NULL, errorCode is set to
// U_MEMORY_ALLOCATION_ERROR and NULL is returned.
PropsWriter *
createProps2Writer(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        PropsWriter *writer=new Props2Writer();
        if(writer!=NULL) {
            return writer;
        }
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return NULL;
}

View File

@ -600,6 +600,8 @@ def ParseUnicodeData(in_file):
raise SyntaxError(
"error: unterminated range started at\n %s\n" %
range_first_line)
# Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt.
SetPropertyValue("dt", "Can", 0xac00, 0xd7a3)
_alg_names_ranges.sort()