ICU-7264 generate UTS #46 data with disallowed_STD3_valid and disallowed_STD3_mapped

X-SVN-Rev: 28560
This commit is contained in:
Markus Scherer 2010-08-31 05:48:38 +00:00
parent 2347b7736e
commit ab9fc77dfb

View File

@ -17,6 +17,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string> #include <string>
#include <string.h>
#include "unicode/utypes.h" #include "unicode/utypes.h"
#include "unicode/errorcode.h" #include "unicode/errorcode.h"
#include "unicode/normalizer2.h" #include "unicode/normalizer2.h"
@ -78,9 +79,13 @@ toIDNA2003(const UStringPrepProfile *prep, UChar32 c, icu::UnicodeString &destSt
} }
} }
enum Status { DISALLOWED, IGNORED, MAPPED, DEVIATION, VALID }; enum Status {
DISALLOWED, IGNORED, MAPPED, DEVIATION, VALID,
DISALLOWED_STD3_VALID, DISALLOWED_STD3_MAPPED
};
static const char *const statusNames[]={ static const char *const statusNames[]={
"disallowed", "ignored", "mapped", "deviation", "valid" "disallowed", "ignored", "mapped", "deviation", "valid",
"disallowed_STD3_valid", "disallowed_STD3_mapped"
}; };
static void static void
@ -105,6 +110,20 @@ printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &m
puts(""); puts("");
} }
static void
getAgeIfAssigned(UChar32 c, UVersionInfo age) {
if(u_isdefined(c)) {
u_charAge(c, age);
} else if(U_IS_UNICODE_NONCHAR(c)) {
age[0]=0;
age[1]=0;
age[2]=0;
age[3]=1;
} else {
memset(age, 0, 4);
}
}
extern int extern int
main(int argc, const char *argv[]) { main(int argc, const char *argv[]) {
ExitingErrorCode errorCode("genuts46"); ExitingErrorCode errorCode("genuts46");
@ -120,11 +139,15 @@ main(int argc, const char *argv[]) {
mappedSet.removeAll(labelSeparators); // simplifies checking of mapped characters mappedSet.removeAll(labelSeparators); // simplifies checking of mapped characters
icu::UnicodeSet baseValidSet(icu::UnicodeString( icu::UnicodeSet baseValidSet(icu::UnicodeString(
"[[[:^Changes_When_NFKC_Casefolded:]" "[[[[:^Changes_When_NFKC_Casefolded:]"
"-[:C:]-[:Z:]" "-[:C:]-[:Z:]"
"-[:Block=Ideographic_Description_Characters:]" "-[:Block=Ideographic_Description_Characters:]]"
"-[:ascii:]]" "[:ascii:]]-[.]]", -1, US_INV), errorCode);
"[\\u002Da-zA-Z0-9]]", -1, US_INV), errorCode);
// Characters that are disallowed when STD3 rules are applied,
// but valid when STD3 rules are not applied.
icu::UnicodeSet disallowedSTD3Set(icu::UnicodeString(
"[[:ascii:]-[\\u002D.a-zA-Z0-9]]", -1, US_INV), errorCode);
icu::UnicodeSet deviationSet( icu::UnicodeSet deviationSet(
UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode); UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode);
@ -258,11 +281,13 @@ main(int argc, const char *argv[]) {
ignoredSet.freeze(); ignoredSet.freeze();
validSet.freeze(); validSet.freeze();
mappedSet.freeze(); mappedSet.freeze();
disallowedSTD3Set.freeze();
// output // output
UChar32 prevStart=0, c=0; UChar32 prevStart=0, c=0;
Status prevStatus=DISALLOWED, status; Status prevStatus=DISALLOWED_STD3_VALID, status;
icu::UnicodeString prevMapping; icu::UnicodeString prevMapping;
UVersionInfo prevAge={ 1, 1, 0, 0 }, age;
icu::UnicodeSetIterator iter(disallowedSet); icu::UnicodeSetIterator iter(disallowedSet);
while(iter.nextRange()) { while(iter.nextRange()) {
@ -278,20 +303,32 @@ main(int argc, const char *argv[]) {
nfkc_cf->normalize(cString, mapping, errorCode); nfkc_cf->normalize(cString, mapping, errorCode);
} else if(ignoredSet.contains(c)) { } else if(ignoredSet.contains(c)) {
status=IGNORED; status=IGNORED;
} else if(disallowedSTD3Set.contains(c)) {
status=DISALLOWED_STD3_VALID;
} else if(validSet.contains(c)) { } else if(validSet.contains(c)) {
status=VALID; status=VALID;
} else if(mappedSet.contains(c)) { } else if(mappedSet.contains(c)) {
status=MAPPED;
cString.setTo(c); cString.setTo(c);
nfkc_cf->normalize(cString, mapping, errorCode); nfkc_cf->normalize(cString, mapping, errorCode);
if(disallowedSTD3Set.containsSome(mapping)) {
status=DISALLOWED_STD3_MAPPED;
} else {
status=MAPPED;
}
} else { } else {
fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c); fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c);
} }
if(prevStart<c && status!=prevStatus || mapping!=prevMapping) { // Print a new line where the status, the mapping or
// the character age change.
getAgeIfAssigned(c, age);
if( prevStart<c &&
(status!=prevStatus || mapping!=prevMapping || 0!=memcmp(prevAge, age, 4))
) {
printLine(prevStart, c-1, prevStatus, prevMapping); printLine(prevStart, c-1, prevStatus, prevMapping);
prevStart=c; prevStart=c;
prevStatus=status; prevStatus=status;
prevMapping=mapping; prevMapping=mapping;
memcpy(prevAge, age, 4);
} }
++c; ++c;
} }
@ -302,7 +339,16 @@ main(int argc, const char *argv[]) {
prevStart=c; prevStart=c;
prevStatus=DISALLOWED; prevStatus=DISALLOWED;
prevMapping.remove(); prevMapping.remove();
c=iter.getCodepointEnd()+1; getAgeIfAssigned(c, prevAge);
UChar32 end=iter.getCodepointEnd();
while(++c<=end) {
getAgeIfAssigned(c, age);
if(prevStart<c && 0!=memcmp(prevAge, age, 4)) {
printLine(prevStart, c-1, prevStatus, prevMapping);
prevStart=c;
memcpy(prevAge, age, 4);
}
}
} }
if(prevStart<c) { if(prevStart<c) {
printLine(prevStart, c-1, prevStatus, prevMapping); printLine(prevStart, c-1, prevStatus, prevMapping);