00a996d08f
X-SVN-Rev: 28563
365 lines
12 KiB
C++
365 lines
12 KiB
C++
/*
|
|
*******************************************************************************
|
|
* Copyright (C) 2010, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*******************************************************************************
|
|
* file name: genuts46.cpp
|
|
* encoding: US-ASCII
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 2010mar02
|
|
* created by: Markus W. Scherer
|
|
*
|
|
* quick & dirty tool to recreate the UTS #46 data table according to the spec
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string>
|
|
#include <string.h>
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/errorcode.h"
|
|
#include "unicode/normalizer2.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/unistr.h"
|
|
#include "unicode/usetiter.h"
|
|
#include "unicode/usprep.h"
|
|
#include "sprpimpl.h" // HACK
|
|
|
|
/**
|
|
* icu::ErrorCode subclass for easy UErrorCode handling.
|
|
* The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
|
|
*/
|
|
class ExitingErrorCode : public icu::ErrorCode {
|
|
public:
|
|
/**
|
|
* @param loc A short string describing where the ExitingErrorCode is used.
|
|
*/
|
|
ExitingErrorCode(const char *loc) : location(loc) {}
|
|
virtual ~ExitingErrorCode();
|
|
protected:
|
|
virtual void handleFailure() const;
|
|
private:
|
|
const char *location;
|
|
};
|
|
|
|
/**
 * Reports a pending failure when the object goes out of scope.
 * Calling handleFailure() from the destructor is safe because
 * our handleFailure() does not throw exceptions.
 */
ExitingErrorCode::~ExitingErrorCode() {
    if(!isFailure()) {
        return;  // nothing to report
    }
    handleFailure();
}
|
|
|
|
void ExitingErrorCode::handleFailure() const {
|
|
fprintf(stderr, "error at %s: %s\n", location, errorName());
|
|
exit(errorCode);
|
|
}
|
|
|
|
static int
|
|
toIDNA2003(const UStringPrepProfile *prep, UChar32 c, icu::UnicodeString &destString) {
|
|
UChar src[2];
|
|
int32_t srcLength=0;
|
|
U16_APPEND_UNSAFE(src, srcLength, c);
|
|
UChar *dest;
|
|
int32_t destLength;
|
|
dest=destString.getBuffer(32);
|
|
if(dest==NULL) {
|
|
return FALSE;
|
|
}
|
|
UErrorCode errorCode=U_ZERO_ERROR;
|
|
destLength=usprep_prepare(prep, src, srcLength,
|
|
dest, destString.getCapacity(),
|
|
USPREP_DEFAULT, NULL, &errorCode);
|
|
destString.releaseBuffer(destLength);
|
|
if(errorCode==U_STRINGPREP_PROHIBITED_ERROR) {
|
|
return -1;
|
|
} else {
|
|
// Returns FALSE=0 for U_STRINGPREP_UNASSIGNED_ERROR and processing errors,
|
|
// TRUE=1 if c is valid or mapped.
|
|
return U_SUCCESS(errorCode);
|
|
}
|
|
}
|
|
|
|
/**
 * Status values written to the output table.
 * The numeric values are used as indexes into statusNames[],
 * so the order here must match the order of that array.
 */
enum Status {
    DISALLOWED,
    IGNORED,
    MAPPED,
    DEVIATION,
    VALID,
    DISALLOWED_STD3_VALID,
    DISALLOWED_STD3_MAPPED
};
|
|
/**
 * Output spelling of each status; printLine() indexes this array
 * with a Status value, so the entries are listed in enum order.
 */
static const char *const statusNames[]={
    "disallowed",
    "ignored",
    "mapped",
    "deviation",
    "valid",
    "disallowed_STD3_valid",
    "disallowed_STD3_mapped"
};
|
|
|
|
static void
|
|
printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) {
|
|
if(start==end) {
|
|
printf("%04lX ", (long)start);
|
|
} else {
|
|
printf("%04lX..%04lX ", (long)start, (long)end);
|
|
}
|
|
printf("; %s", statusNames[status]);
|
|
if(status==MAPPED || status==DEVIATION || !mapping.isEmpty()) {
|
|
printf(" ;");
|
|
const UChar *buffer=mapping.getBuffer();
|
|
int32_t length=mapping.length();
|
|
int32_t i=0;
|
|
UChar32 c;
|
|
while(i<length) {
|
|
U16_NEXT(buffer, i, length, c);
|
|
printf(" %04lX", (long)c);
|
|
}
|
|
}
|
|
puts("");
|
|
}
|
|
|
|
static void
|
|
getAgeIfAssigned(UChar32 c, UVersionInfo age) {
|
|
if(u_isdefined(c)) {
|
|
u_charAge(c, age);
|
|
} else if(U_IS_UNICODE_NONCHAR(c)) {
|
|
age[0]=0;
|
|
age[1]=0;
|
|
age[2]=0;
|
|
age[3]=1;
|
|
} else {
|
|
memset(age, 0, 4);
|
|
}
|
|
}
|
|
|
|
extern int
|
|
main(int argc, const char *argv[]) {
|
|
ExitingErrorCode errorCode("genuts46");
|
|
|
|
// predefined base sets
|
|
icu::UnicodeSet unassignedSet(UNICODE_STRING_SIMPLE("[:Cn:]"), errorCode);
|
|
|
|
icu::UnicodeSet labelSeparators(
|
|
UNICODE_STRING_SIMPLE("[\\u002E\\u3002\\uFF0E\\uFF61]"), errorCode);
|
|
|
|
icu::UnicodeSet mappedSet(
|
|
UNICODE_STRING_SIMPLE("[:Changes_When_NFKC_Casefolded:]"), errorCode);
|
|
mappedSet.removeAll(labelSeparators); // simplifies checking of mapped characters
|
|
|
|
icu::UnicodeSet baseValidSet(icu::UnicodeString(
|
|
"[[[[:^Changes_When_NFKC_Casefolded:]"
|
|
"-[:C:]-[:Z:]"
|
|
"-[:Block=Ideographic_Description_Characters:]]"
|
|
"[:ascii:]]-[.]]", -1, US_INV), errorCode);
|
|
|
|
// Characters that are disallowed when STD3 rules are applied,
|
|
// but valid when STD3 rules are not applied.
|
|
icu::UnicodeSet disallowedSTD3Set(icu::UnicodeString(
|
|
"[[:ascii:]-[\\u002D.a-zA-Z0-9]]", -1, US_INV), errorCode);
|
|
|
|
icu::UnicodeSet deviationSet(
|
|
UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode);
|
|
errorCode.assertSuccess();
|
|
|
|
// derived sets
|
|
icu::LocalUStringPrepProfilePointer namePrep(usprep_openByType(USPREP_RFC3491_NAMEPREP, errorCode));
|
|
const icu::Normalizer2 *nfkc_cf=
|
|
icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode);
|
|
errorCode.assertSuccess();
|
|
|
|
// HACK: The StringPrep API performs a BiDi check according to the data.
|
|
// We need to override that for this data generation, by resetting an internal flag.
|
|
namePrep->checkBiDi=FALSE;
|
|
|
|
icu::UnicodeSet baseExclusionSet;
|
|
icu::UnicodeString cString, mapping, namePrepResult;
|
|
for(UChar32 c=0; c<=0x10ffff; ++c) {
|
|
if(c==0xd800) {
|
|
c=0xe000;
|
|
}
|
|
int namePrepStatus=toIDNA2003(namePrep.getAlias(), c, namePrepResult);
|
|
if(namePrepStatus!=0) {
|
|
// get the UTS #46 base mapping value
|
|
switch(c) {
|
|
case 0xff0e:
|
|
case 0x3002:
|
|
case 0xff61:
|
|
mapping.setTo(0x2e);
|
|
break;
|
|
default:
|
|
cString.setTo(c);
|
|
nfkc_cf->normalize(cString, mapping, errorCode);
|
|
break;
|
|
}
|
|
if(
|
|
namePrepStatus>0 ?
|
|
// c is valid or mapped in IDNA2003
|
|
!labelSeparators.contains(c) && namePrepResult!=mapping :
|
|
// namePrepStatus<0: c is prohibited in IDNA2003
|
|
baseValidSet.contains(c) || (cString!=mapping && baseValidSet.containsAll(mapping))
|
|
) {
|
|
baseExclusionSet.add(c);
|
|
}
|
|
}
|
|
}
|
|
|
|
icu::UnicodeSet disallowedSet(0, 0x10ffff);
|
|
disallowedSet.
|
|
removeAll(labelSeparators).
|
|
removeAll(deviationSet).
|
|
removeAll(mappedSet).
|
|
removeAll(baseValidSet).
|
|
addAll(baseExclusionSet).
|
|
addAll(unassignedSet);
|
|
|
|
const icu::Normalizer2 *nfd=
|
|
icu::Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
|
|
errorCode.assertSuccess();
|
|
|
|
icu::UnicodeSet ignoredSet; // will be a subset of mappedSet
|
|
icu::UnicodeSet removeSet;
|
|
icu::UnicodeString nfdString;
|
|
{
|
|
icu::UnicodeSetIterator iter(mappedSet);
|
|
while(iter.next()) {
|
|
UChar32 c=iter.getCodepoint();
|
|
cString.setTo(c);
|
|
nfkc_cf->normalize(cString, mapping, errorCode);
|
|
if(!baseValidSet.containsAll(mapping)) {
|
|
fprintf(stderr, "U+%04lX mapped -> disallowed: mapping not wholly in base valid set\n", (long)c);
|
|
disallowedSet.add(c);
|
|
removeSet.add(c);
|
|
} else if(mapping.isEmpty()) {
|
|
ignoredSet.add(c);
|
|
}
|
|
}
|
|
mappedSet.removeAll(removeSet);
|
|
}
|
|
errorCode.assertSuccess();
|
|
|
|
icu::UnicodeSet validSet(baseValidSet);
|
|
validSet.
|
|
removeAll(labelSeparators). // non-ASCII label separators will be mapped in the end
|
|
removeAll(deviationSet).
|
|
removeAll(disallowedSet).
|
|
removeAll(mappedSet).
|
|
add(0x2e); // not mapped, simply valid
|
|
UBool madeChange;
|
|
do {
|
|
madeChange=FALSE;
|
|
{
|
|
removeSet.clear();
|
|
icu::UnicodeSetIterator iter(validSet);
|
|
while(iter.next()) {
|
|
UChar32 c=iter.getCodepoint();
|
|
if(nfd->getDecomposition(c, nfdString) && !validSet.containsAll(nfdString)) {
|
|
fprintf(stderr, "U+%04lX valid -> disallowed: NFD not wholly valid\n", (long)c);
|
|
disallowedSet.add(c);
|
|
removeSet.add(c);
|
|
madeChange=TRUE;
|
|
}
|
|
}
|
|
validSet.removeAll(removeSet);
|
|
}
|
|
{
|
|
removeSet.clear();
|
|
icu::UnicodeSetIterator iter(mappedSet);
|
|
while(iter.next()) {
|
|
UChar32 c=iter.getCodepoint();
|
|
cString.setTo(c);
|
|
nfkc_cf->normalize(cString, mapping, errorCode);
|
|
nfd->normalize(mapping, nfdString, errorCode);
|
|
if(!validSet.containsAll(nfdString)) {
|
|
fprintf(stderr, "U+%04lX mapped -> disallowed: NFD of mapping not wholly valid\n", (long)c);
|
|
disallowedSet.add(c);
|
|
removeSet.add(c);
|
|
madeChange=TRUE;
|
|
}
|
|
}
|
|
mappedSet.removeAll(removeSet);
|
|
}
|
|
} while(madeChange);
|
|
errorCode.assertSuccess();
|
|
|
|
// finish up
|
|
labelSeparators.remove(0x2e).freeze(); // U+002E is simply valid
|
|
deviationSet.freeze();
|
|
ignoredSet.freeze();
|
|
validSet.freeze();
|
|
mappedSet.freeze();
|
|
disallowedSTD3Set.freeze();
|
|
|
|
// output
|
|
UChar32 prevStart=0, c=0;
|
|
Status prevStatus=DISALLOWED_STD3_VALID, status;
|
|
icu::UnicodeString prevMapping;
|
|
UVersionInfo prevAge={ 1, 1, 0, 0 }, age;
|
|
|
|
icu::UnicodeSetIterator iter(disallowedSet);
|
|
while(iter.nextRange()) {
|
|
UChar32 start=iter.getCodepoint();
|
|
while(c<start) {
|
|
mapping.remove();
|
|
if(labelSeparators.contains(c)) {
|
|
status=MAPPED;
|
|
mapping.setTo(0x2e);
|
|
} else if(deviationSet.contains(c)) {
|
|
status=DEVIATION;
|
|
cString.setTo(c);
|
|
nfkc_cf->normalize(cString, mapping, errorCode);
|
|
} else if(ignoredSet.contains(c)) {
|
|
status=IGNORED;
|
|
} else if(validSet.contains(c)) {
|
|
if(disallowedSTD3Set.contains(c)) {
|
|
fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: itself not STD3\n", (long)c);
|
|
status=DISALLOWED_STD3_VALID;
|
|
} else if( nfd->getDecomposition(c, nfdString) &&
|
|
disallowedSTD3Set.containsSome(nfdString)
|
|
) {
|
|
fprintf(stderr, "U+%04lX valid -> disallowed_STD3_valid: NFD not wholly STD3\n", (long)c);
|
|
status=DISALLOWED_STD3_VALID;
|
|
} else {
|
|
status=VALID;
|
|
}
|
|
} else if(mappedSet.contains(c)) {
|
|
cString.setTo(c);
|
|
nfkc_cf->normalize(cString, mapping, errorCode);
|
|
if(disallowedSTD3Set.containsSome(mapping)) {
|
|
fprintf(stderr, "U+%04lX mapped -> disallowed_STD3_mapped\n", (long)c);
|
|
status=DISALLOWED_STD3_MAPPED;
|
|
} else {
|
|
status=MAPPED;
|
|
}
|
|
} else {
|
|
fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c);
|
|
}
|
|
// Print a new line where the status, the mapping or
|
|
// the character age change.
|
|
getAgeIfAssigned(c, age);
|
|
if( prevStart<c &&
|
|
(status!=prevStatus || mapping!=prevMapping || 0!=memcmp(prevAge, age, 4))
|
|
) {
|
|
printLine(prevStart, c-1, prevStatus, prevMapping);
|
|
prevStart=c;
|
|
prevStatus=status;
|
|
prevMapping=mapping;
|
|
memcpy(prevAge, age, 4);
|
|
}
|
|
++c;
|
|
}
|
|
// c==start is disallowed
|
|
if(prevStart<c) {
|
|
printLine(prevStart, c-1, prevStatus, prevMapping);
|
|
}
|
|
prevStart=c;
|
|
prevStatus=DISALLOWED;
|
|
prevMapping.remove();
|
|
getAgeIfAssigned(c, prevAge);
|
|
UChar32 end=iter.getCodepointEnd();
|
|
while(++c<=end) {
|
|
getAgeIfAssigned(c, age);
|
|
if(prevStart<c && 0!=memcmp(prevAge, age, 4)) {
|
|
printLine(prevStart, c-1, prevStatus, prevMapping);
|
|
prevStart=c;
|
|
memcpy(prevAge, age, 4);
|
|
}
|
|
}
|
|
}
|
|
if(prevStart<c) {
|
|
printLine(prevStart, c-1, prevStatus, prevMapping);
|
|
}
|
|
return 0;
|
|
}
|