ICU-1709 do not tweak the UCD data to mark some control characters
as spacing ones. X-SVN-Rev: 7749
This commit is contained in:
parent
3699fd2f22
commit
504a75e931
@ -30,17 +30,11 @@
|
|||||||
#include "utrie.h"
|
#include "utrie.h"
|
||||||
#include "ustr_imp.h"
|
#include "ustr_imp.h"
|
||||||
|
|
||||||
/*
|
|
||||||
* Since genprops overrides the general category for some control codes,
|
|
||||||
* we need to hardcode ISO 8 controls for u_iscntrl(), u_isprint(), etc.
|
|
||||||
*/
|
|
||||||
#define IS_ISO_8_CONTROL(c) ((uint32_t)(c)<0x20 || (uint32_t)((c)-0x7f)<=0x20)
|
|
||||||
|
|
||||||
/* dynamically loaded Unicode character properties -------------------------- */
|
/* dynamically loaded Unicode character properties -------------------------- */
|
||||||
|
|
||||||
/* fallback properties for the ASCII range if the data cannot be loaded */
|
/* fallback properties for the ASCII range if the data cannot be loaded */
|
||||||
/* these are printed by genprops in verbose mode */
|
/* these are printed by genprops in verbose mode */
|
||||||
static const uint32_t staticProps32Table[]={
|
static uint32_t staticProps32Table[0xa0]={
|
||||||
/* 0x00 */ 0x48f,
|
/* 0x00 */ 0x48f,
|
||||||
/* 0x01 */ 0x48f,
|
/* 0x01 */ 0x48f,
|
||||||
/* 0x02 */ 0x48f,
|
/* 0x02 */ 0x48f,
|
||||||
@ -50,11 +44,11 @@ static const uint32_t staticProps32Table[]={
|
|||||||
/* 0x06 */ 0x48f,
|
/* 0x06 */ 0x48f,
|
||||||
/* 0x07 */ 0x48f,
|
/* 0x07 */ 0x48f,
|
||||||
/* 0x08 */ 0x48f,
|
/* 0x08 */ 0x48f,
|
||||||
/* 0x09 */ 0x20c,
|
/* 0x09 */ 0x20f,
|
||||||
/* 0x0a */ 0x1ce,
|
/* 0x0a */ 0x1cf,
|
||||||
/* 0x0b */ 0x20c,
|
/* 0x0b */ 0x20f,
|
||||||
/* 0x0c */ 0x24d,
|
/* 0x0c */ 0x24f,
|
||||||
/* 0x0d */ 0x1ce,
|
/* 0x0d */ 0x1cf,
|
||||||
/* 0x0e */ 0x48f,
|
/* 0x0e */ 0x48f,
|
||||||
/* 0x0f */ 0x48f,
|
/* 0x0f */ 0x48f,
|
||||||
/* 0x10 */ 0x48f,
|
/* 0x10 */ 0x48f,
|
||||||
@ -69,10 +63,10 @@ static const uint32_t staticProps32Table[]={
|
|||||||
/* 0x19 */ 0x48f,
|
/* 0x19 */ 0x48f,
|
||||||
/* 0x1a */ 0x48f,
|
/* 0x1a */ 0x48f,
|
||||||
/* 0x1b */ 0x48f,
|
/* 0x1b */ 0x48f,
|
||||||
/* 0x1c */ 0x1ce,
|
/* 0x1c */ 0x1cf,
|
||||||
/* 0x1d */ 0x1ce,
|
/* 0x1d */ 0x1cf,
|
||||||
/* 0x1e */ 0x1ce,
|
/* 0x1e */ 0x1cf,
|
||||||
/* 0x1f */ 0x20c,
|
/* 0x1f */ 0x20f,
|
||||||
/* 0x20 */ 0x24c,
|
/* 0x20 */ 0x24c,
|
||||||
/* 0x21 */ 0x297,
|
/* 0x21 */ 0x297,
|
||||||
/* 0x22 */ 0x297,
|
/* 0x22 */ 0x297,
|
||||||
@ -114,8 +108,8 @@ static const uint32_t staticProps32Table[]={
|
|||||||
/* 0x46 */ 0x2000001,
|
/* 0x46 */ 0x2000001,
|
||||||
/* 0x47 */ 0x2000001,
|
/* 0x47 */ 0x2000001,
|
||||||
/* 0x48 */ 0x2000001,
|
/* 0x48 */ 0x2000001,
|
||||||
/* 0x49 */ 0x2000001,
|
/* 0x49 */ 0x1, /* has exception */
|
||||||
/* 0x4a */ 0x2000001,
|
/* 0x4a */ 0x300001, /* has exception */
|
||||||
/* 0x4b */ 0x2000001,
|
/* 0x4b */ 0x2000001,
|
||||||
/* 0x4c */ 0x2000001,
|
/* 0x4c */ 0x2000001,
|
||||||
/* 0x4d */ 0x2000001,
|
/* 0x4d */ 0x2000001,
|
||||||
@ -146,7 +140,7 @@ static const uint32_t staticProps32Table[]={
|
|||||||
/* 0x66 */ 0x2000002,
|
/* 0x66 */ 0x2000002,
|
||||||
/* 0x67 */ 0x2000002,
|
/* 0x67 */ 0x2000002,
|
||||||
/* 0x68 */ 0x2000002,
|
/* 0x68 */ 0x2000002,
|
||||||
/* 0x69 */ 0x2000002,
|
/* 0x69 */ 0x600002, /* has exception */
|
||||||
/* 0x6a */ 0x2000002,
|
/* 0x6a */ 0x2000002,
|
||||||
/* 0x6b */ 0x2000002,
|
/* 0x6b */ 0x2000002,
|
||||||
/* 0x6c */ 0x2000002,
|
/* 0x6c */ 0x2000002,
|
||||||
@ -174,7 +168,7 @@ static const uint32_t staticProps32Table[]={
|
|||||||
/* 0x82 */ 0x48f,
|
/* 0x82 */ 0x48f,
|
||||||
/* 0x83 */ 0x48f,
|
/* 0x83 */ 0x48f,
|
||||||
/* 0x84 */ 0x48f,
|
/* 0x84 */ 0x48f,
|
||||||
/* 0x85 */ 0x1ce,
|
/* 0x85 */ 0x1cf,
|
||||||
/* 0x86 */ 0x48f,
|
/* 0x86 */ 0x48f,
|
||||||
/* 0x87 */ 0x48f,
|
/* 0x87 */ 0x48f,
|
||||||
/* 0x88 */ 0x48f,
|
/* 0x88 */ 0x48f,
|
||||||
@ -200,7 +194,7 @@ static const uint32_t staticProps32Table[]={
|
|||||||
/* 0x9c */ 0x48f,
|
/* 0x9c */ 0x48f,
|
||||||
/* 0x9d */ 0x48f,
|
/* 0x9d */ 0x48f,
|
||||||
/* 0x9e */ 0x48f,
|
/* 0x9e */ 0x48f,
|
||||||
/* 0x9f */ 0x48f
|
/* 0x9f */ 0x48f,
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -424,18 +418,6 @@ u_charType(UChar32 c) {
|
|||||||
return (int8_t)GET_CATEGORY(props);
|
return (int8_t)GET_CATEGORY(props);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Gets the Unicode character's general category, as per the UCD.*/
|
|
||||||
U_CAPI int8_t U_EXPORT2
|
|
||||||
u_charUCDType(UChar32 c) {
|
|
||||||
if (IS_ISO_8_CONTROL(c)) {
|
|
||||||
return U_CONTROL_CHAR;
|
|
||||||
} else {
|
|
||||||
uint32_t props;
|
|
||||||
GET_PROPS(c, props);
|
|
||||||
return (int8_t)GET_CATEGORY(props);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Enumerate all code points with their general categories. */
|
/* Enumerate all code points with their general categories. */
|
||||||
struct _EnumTypeCallback {
|
struct _EnumTypeCallback {
|
||||||
UCharEnumTypeRange *enumRange;
|
UCharEnumTypeRange *enumRange;
|
||||||
@ -547,26 +529,26 @@ u_isbase(UChar32 c) {
|
|||||||
/* Checks if the Unicode character is a control character.*/
|
/* Checks if the Unicode character is a control character.*/
|
||||||
U_CAPI UBool U_EXPORT2
|
U_CAPI UBool U_EXPORT2
|
||||||
u_iscntrl(UChar32 c) {
|
u_iscntrl(UChar32 c) {
|
||||||
if(IS_ISO_8_CONTROL(c)) {
|
uint32_t props;
|
||||||
return TRUE;
|
GET_PROPS(c, props);
|
||||||
} else {
|
return (UBool)(
|
||||||
uint32_t props;
|
((1UL<<GET_CATEGORY(props))&
|
||||||
GET_PROPS(c, props);
|
(1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
||||||
return (UBool)(
|
)!=0);
|
||||||
((1UL<<GET_CATEGORY(props))&
|
|
||||||
(1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
|
||||||
)!=0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Some control characters that are used as space. */
|
||||||
|
#define IS_THAT_CONTROL_SPACE(c) \
|
||||||
|
((c>=0x09 && c <= 0x0d) || (c>=0x1c && c <=0x1f) || c==0x85)
|
||||||
|
|
||||||
/* Checks if the Unicode character is a space character.*/
|
/* Checks if the Unicode character is a space character.*/
|
||||||
U_CAPI UBool U_EXPORT2
|
U_CAPI UBool U_EXPORT2
|
||||||
u_isspace(UChar32 c) {
|
u_isspace(UChar32 c) {
|
||||||
uint32_t props;
|
uint32_t props;
|
||||||
GET_PROPS(c, props);
|
GET_PROPS(c, props);
|
||||||
return (UBool)(((1UL<<GET_CATEGORY(props))&
|
return (UBool)((((1UL<<GET_CATEGORY(props))&
|
||||||
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
||||||
)!=0);
|
)!=0) || IS_THAT_CONTROL_SPACE(c));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Checks if the Unicode character is a whitespace character.*/
|
/* Checks if the Unicode character is a whitespace character.*/
|
||||||
@ -574,27 +556,24 @@ U_CAPI UBool U_EXPORT2
|
|||||||
u_isWhitespace(UChar32 c) {
|
u_isWhitespace(UChar32 c) {
|
||||||
uint32_t props;
|
uint32_t props;
|
||||||
GET_PROPS(c, props);
|
GET_PROPS(c, props);
|
||||||
return (UBool)(((1UL<<GET_CATEGORY(props))&
|
return (UBool)((((1UL<<GET_CATEGORY(props))&
|
||||||
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
||||||
)!=0 &&
|
)!=0 &&
|
||||||
c!=0xa0 && c!=0x202f && c!=0xfeff); /* exclude no-break spaces */
|
c!=0xa0 && c!=0x202f && c!=0xfeff) || /* exclude no-break spaces */
|
||||||
|
IS_THAT_CONTROL_SPACE(c));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Checks if the Unicode character is printable.*/
|
/* Checks if the Unicode character is printable.*/
|
||||||
U_CAPI UBool U_EXPORT2
|
U_CAPI UBool U_EXPORT2
|
||||||
u_isprint(UChar32 c) {
|
u_isprint(UChar32 c) {
|
||||||
if(IS_ISO_8_CONTROL(c)) {
|
uint32_t props;
|
||||||
return FALSE;
|
GET_PROPS(c, props);
|
||||||
} else {
|
return (UBool)(
|
||||||
uint32_t props;
|
((1UL<<GET_CATEGORY(props))&
|
||||||
GET_PROPS(c, props);
|
~(1UL<<U_UNASSIGNED|
|
||||||
return (UBool)(
|
1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
|
||||||
((1UL<<GET_CATEGORY(props))&
|
1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
|
||||||
~(1UL<<U_UNASSIGNED|
|
)!=0);
|
||||||
1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
|
|
||||||
1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
|
|
||||||
)!=0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Checks if the Unicode character can start a Unicode identifier.*/
|
/* Checks if the Unicode character can start a Unicode identifier.*/
|
||||||
@ -1091,9 +1070,6 @@ u_charCellWidth(UChar32 ch)
|
|||||||
|
|
||||||
/* these Unicode character types are scattered throughout the Unicode range, so
|
/* these Unicode character types are scattered throughout the Unicode range, so
|
||||||
special-case for them*/
|
special-case for them*/
|
||||||
if(IS_ISO_8_CONTROL(ch)) {
|
|
||||||
return U_ZERO_WIDTH;
|
|
||||||
}
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case U_UNASSIGNED:
|
case U_UNASSIGNED:
|
||||||
case U_NON_SPACING_MARK:
|
case U_NON_SPACING_MARK:
|
||||||
|
@ -1299,15 +1299,8 @@ static uint8_t getCharCat(UChar32 cp) {
|
|||||||
return U_NONCHARACTER_CODE_POINT;
|
return U_NONCHARACTER_CODE_POINT;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Undo ICU exceptions to the UCD when determining the
|
if ((cat = u_charType(cp)) == U_SURROGATE) {
|
||||||
category. */
|
cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
|
||||||
|
|
||||||
if (u_iscntrl(cp)) {
|
|
||||||
cat = U_CONTROL_CHAR;
|
|
||||||
} else {
|
|
||||||
if ((cat = u_charType(cp)) == U_SURROGATE) {
|
|
||||||
cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return cat;
|
return cat;
|
||||||
|
@ -993,8 +993,6 @@ u_charCellWidth(UChar32 c);
|
|||||||
* Returns a value indicating a character category.
|
* Returns a value indicating a character category.
|
||||||
* The categories are taken from the Unicode Character Database (UCD) in
|
* The categories are taken from the Unicode Character Database (UCD) in
|
||||||
* UnicodeData.txt.
|
* UnicodeData.txt.
|
||||||
* ICU changes the category of some of the ISO control characters to various
|
|
||||||
* separators categories.
|
|
||||||
*
|
*
|
||||||
* @param c the character to be tested
|
* @param c the character to be tested
|
||||||
* @return a value of type int, the character category.
|
* @return a value of type int, the character category.
|
||||||
|
@ -495,29 +495,8 @@ unicodeDataLineFn(void *context,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* get general category, field 2 */
|
/* get general category, field 2 */
|
||||||
/* we override the general category of some control characters */
|
*fields[2][1]=0;
|
||||||
switch(c) {
|
type = (int8_t)tagValues[MakeProp(fields[2][0])];
|
||||||
case 9:
|
|
||||||
case 0xb:
|
|
||||||
case 0x1f:
|
|
||||||
type = U_SPACE_SEPARATOR;
|
|
||||||
break;
|
|
||||||
case 0xc:
|
|
||||||
type = U_LINE_SEPARATOR;
|
|
||||||
break;
|
|
||||||
case 0xa:
|
|
||||||
case 0xd:
|
|
||||||
case 0x1c:
|
|
||||||
case 0x1d:
|
|
||||||
case 0x1e:
|
|
||||||
case 0x85:
|
|
||||||
type = U_PARAGRAPH_SEPARATOR;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
*fields[2][1]=0;
|
|
||||||
type = (int8_t)tagValues[MakeProp(fields[2][0])];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if(u_charType(c)!=type) {
|
if(u_charType(c)!=type) {
|
||||||
log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
|
log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
|
||||||
}
|
}
|
||||||
|
@ -209,7 +209,7 @@ void TransliteratorTest::TestInstantiation() {
|
|||||||
rules, UTRANS_FORWARD, parseError,status);
|
rules, UTRANS_FORWARD, parseError,status);
|
||||||
if (u == 0) {
|
if (u == 0) {
|
||||||
errln(UnicodeString("FAIL: ") + id +
|
errln(UnicodeString("FAIL: ") + id +
|
||||||
".toRules() => bad rules" +
|
".createFromRules() => bad rules" +
|
||||||
/*", parse error " + parseError.code +*/
|
/*", parse error " + parseError.code +*/
|
||||||
", line " + parseError.line +
|
", line " + parseError.line +
|
||||||
", offset " + parseError.offset +
|
", offset " + parseError.offset +
|
||||||
@ -2967,7 +2967,7 @@ void TransliteratorTest::TestAnchorMasking(){
|
|||||||
Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
|
Transliterator* t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD,parseError,status);
|
||||||
if(U_FAILURE(status)){
|
if(U_FAILURE(status)){
|
||||||
errln(UnicodeString("FAIL: ") + "ID" +
|
errln(UnicodeString("FAIL: ") + "ID" +
|
||||||
".toRules() => bad rules" +
|
".createFromRules() => bad rules" +
|
||||||
/*", parse error " + parseError.code +*/
|
/*", parse error " + parseError.code +*/
|
||||||
", line " + parseError.line +
|
", line " + parseError.line +
|
||||||
", offset " + parseError.offset +
|
", offset " + parseError.offset +
|
||||||
|
@ -342,29 +342,8 @@ UnicodeTest::unicodeDataLineFn(void *context,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* get general category, field 2 */
|
/* get general category, field 2 */
|
||||||
/* we override the general category of some control characters */
|
*fields[2][1]=0;
|
||||||
switch(c) {
|
type = (int8_t)tagValues[me->MakeProp(fields[2][0])];
|
||||||
case 9:
|
|
||||||
case 0xb:
|
|
||||||
case 0x1f:
|
|
||||||
type = U_SPACE_SEPARATOR;
|
|
||||||
break;
|
|
||||||
case 0xc:
|
|
||||||
type = U_LINE_SEPARATOR;
|
|
||||||
break;
|
|
||||||
case 0xa:
|
|
||||||
case 0xd:
|
|
||||||
case 0x1c:
|
|
||||||
case 0x1d:
|
|
||||||
case 0x1e:
|
|
||||||
case 0x85:
|
|
||||||
type = U_PARAGRAPH_SEPARATOR;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
*fields[2][1]=0;
|
|
||||||
type = (int8_t)tagValues[me->MakeProp(fields[2][0])];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if(Unicode::getType(c)!=type) {
|
if(Unicode::getType(c)!=type) {
|
||||||
me->errln("error: Unicode::getType(U+%04lx)==%u instead of %u\n", c, Unicode::getType(c), type);
|
me->errln("error: Unicode::getType(U+%04lx)==%u instead of %u\n", c, Unicode::getType(c), type);
|
||||||
*pErrorCode = U_PARSE_ERROR;
|
*pErrorCode = U_PARSE_ERROR;
|
||||||
|
@ -525,23 +525,6 @@ bidiNames[U_CHAR_DIRECTION_COUNT]={
|
|||||||
"WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
|
"WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
|
||||||
};
|
};
|
||||||
|
|
||||||
/* control code properties */
|
|
||||||
static const struct {
|
|
||||||
uint32_t code;
|
|
||||||
uint8_t generalCategory;
|
|
||||||
} controlProps[]={
|
|
||||||
/* TAB */ {0x9, U_SPACE_SEPARATOR},
|
|
||||||
/* VT */ {0xb, U_SPACE_SEPARATOR},
|
|
||||||
/* LF */ {0xa, U_PARAGRAPH_SEPARATOR},
|
|
||||||
/* FF */ {0xc, U_LINE_SEPARATOR},
|
|
||||||
/* CR */ {0xd, U_PARAGRAPH_SEPARATOR},
|
|
||||||
/* FS */ {0x1c, U_PARAGRAPH_SEPARATOR},
|
|
||||||
/* GS */ {0x1d, U_PARAGRAPH_SEPARATOR},
|
|
||||||
/* RS */ {0x1e, U_PARAGRAPH_SEPARATOR},
|
|
||||||
/* US */ {0x1f, U_SPACE_SEPARATOR},
|
|
||||||
/* NL */ {0x85, U_PARAGRAPH_SEPARATOR}
|
|
||||||
};
|
|
||||||
|
|
||||||
static struct {
|
static struct {
|
||||||
uint32_t first, last, props;
|
uint32_t first, last, props;
|
||||||
char name[80];
|
char name[80];
|
||||||
@ -714,15 +697,6 @@ unicodeDataLineFn(void *context,
|
|||||||
}
|
}
|
||||||
p.titleCase=value;
|
p.titleCase=value;
|
||||||
|
|
||||||
/* override properties for some common control characters */
|
|
||||||
if(p.generalCategory==U_CONTROL_CHAR) {
|
|
||||||
for(i=0; i<sizeof(controlProps)/sizeof(controlProps[0]); ++i) {
|
|
||||||
if(controlProps[i].code==p.code) {
|
|
||||||
p.generalCategory=controlProps[i].generalCategory;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* set additional properties from previously parsed files */
|
/* set additional properties from previously parsed files */
|
||||||
if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
|
if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
|
||||||
p.mirrorMapping=mirrorMappings[mirrorIndex++][1];
|
p.mirrorMapping=mirrorMappings[mirrorIndex++][1];
|
||||||
|
Loading…
Reference in New Issue
Block a user