ICU-1970 add properties: decomposition type, joining group, joining type, line break
X-SVN-Rev: 9039
This commit is contained in:
parent
0dcc70147f
commit
642c3b43aa
@ -233,7 +233,9 @@ enum UProperty {
|
||||
/** Enumerated property Canonical_Combining_Class.
|
||||
Same as u_getCombiningClass, returns 8-bit numeric values. @draft ICU 2.2 */
|
||||
UCHAR_CANONICAL_COMBINING_CLASS,
|
||||
/* UCHAR_DECOMPOSITION_TYPE, -- ### later ICU release */
|
||||
/** Enumerated property Decomposition_Type.
|
||||
Returns UDecompositionType values. @draft ICU 2.2 */
|
||||
UCHAR_DECOMPOSITION_TYPE,
|
||||
/** Enumerated property East_Asian_Width.
|
||||
See http://www.unicode.org/reports/tr11/
|
||||
Returns UEastAsianWidth values. @draft ICU 2.2 */
|
||||
@ -241,9 +243,15 @@ enum UProperty {
|
||||
/** Enumerated property General_Category.
|
||||
Same as u_charType, returns UCharCategory values. @draft ICU 2.2 */
|
||||
UCHAR_GENERAL_CATEGORY,
|
||||
/* UCHAR_JOINING_GROUP, -- ### later ICU release */
|
||||
/* UCHAR_JOINING_TYPE, -- ### later ICU release */
|
||||
/* UCHAR_LINE_BREAK, -- ### later ICU release */
|
||||
/** Enumerated property Joining_Group.
|
||||
Returns UJoiningGroup values. @draft ICU 2.2 */
|
||||
UCHAR_JOINING_GROUP,
|
||||
/** Enumerated property Joining_Type.
|
||||
Returns UJoiningType values. @draft ICU 2.2 */
|
||||
UCHAR_JOINING_TYPE,
|
||||
/** Enumerated property Line_Break.
|
||||
Returns ULineBreak values. @draft ICU 2.2 */
|
||||
UCHAR_LINE_BREAK,
|
||||
/** Enumerated property Numeric_Type.
|
||||
Returns UNumericType values. @draft ICU 2.2 */
|
||||
UCHAR_NUMERIC_TYPE,
|
||||
@ -1036,6 +1044,154 @@ enum UCharNameChoice {
|
||||
/** @stable */
|
||||
typedef enum UCharNameChoice UCharNameChoice;
|
||||
|
||||
/**
|
||||
* Decomposition Type constants.
|
||||
*
|
||||
* @see UCHAR_DECOMPOSITION_TYPE
|
||||
* @draft ICU 2.2
|
||||
*/
|
||||
enum UDecompositionType {
|
||||
/*
 * Enumerators have no explicit initializers, so they number consecutively
 * from U_DT_NONE==0. u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE)
 * returns these values from the 5-bit field masked by UPROPS_DT_MASK in
 * properties vector word 2, and u_getIntPropertyMaxValue() reports
 * U_DT_COUNT-1, so U_DT_COUNT has to remain the last enumerator.
 * NOTE(review): the order presumably has to match the generated property
 * data -- confirm before inserting or reordering values.
 */
U_DT_NONE,
|
||||
U_DT_CANONICAL,
|
||||
U_DT_COMPAT,
|
||||
U_DT_CIRCLE,
|
||||
U_DT_FINAL,
|
||||
U_DT_FONT,
|
||||
U_DT_FRACTION,
|
||||
U_DT_INITIAL,
|
||||
U_DT_ISOLATED,
|
||||
U_DT_MEDIAL,
|
||||
U_DT_NARROW,
|
||||
U_DT_NOBREAK,
|
||||
U_DT_SMALL,
|
||||
U_DT_SQUARE,
|
||||
U_DT_SUB,
|
||||
U_DT_SUPER,
|
||||
U_DT_VERTICAL,
|
||||
U_DT_WIDE,
|
||||
U_DT_COUNT /* 18 */
|
||||
};
|
||||
typedef enum UDecompositionType UDecompositionType;
|
||||
|
||||
/**
|
||||
* Joining Type constants.
|
||||
*
|
||||
* @see UCHAR_JOINING_TYPE
|
||||
* @draft ICU 2.2
|
||||
*/
|
||||
enum UJoiningType {
|
||||
/*
 * Enumerators number consecutively from U_JT_NON_JOINING==0.
 * The properties vector word 2 layout reserves bits 13..11
 * (UPROPS_JT_MASK/UPROPS_JT_SHIFT) for this property.
 * u_getIntPropertyValue(c, UCHAR_JOINING_TYPE) additionally reports
 * U_JT_TRANSPARENT for Mn and Cf characters other than ZWNJ and ZWJ when
 * the value read from the data is 0 (ArabicShaping.txt: T = Mn + Cf - ZWNJ - ZWJ).
 * u_getIntPropertyMaxValue() reports U_JT_COUNT-1, so U_JT_COUNT has to
 * remain the last enumerator.
 */
U_JT_NON_JOINING,
|
||||
U_JT_JOIN_CAUSING,
|
||||
U_JT_DUAL_JOINING,
|
||||
U_JT_LEFT_JOINING,
|
||||
U_JT_RIGHT_JOINING,
|
||||
U_JT_TRANSPARENT,
|
||||
U_JT_COUNT /* 6 */
|
||||
};
|
||||
typedef enum UJoiningType UJoiningType;
|
||||
|
||||
/**
|
||||
* Joining Group constants.
|
||||
*
|
||||
* @see UCHAR_JOINING_GROUP
|
||||
* @draft ICU 2.2
|
||||
*/
|
||||
enum UJoiningGroup {
|
||||
/*
 * Enumerators number consecutively from U_JG_NO_JOINING_GROUP==0.
 * u_getIntPropertyValue(c, UCHAR_JOINING_GROUP) returns these values from
 * bits 10..5 of properties vector word 2 (UPROPS_JG_MASK/UPROPS_JG_SHIFT);
 * that 6-bit field can hold 0..63, which covers the 51 values listed here.
 * u_getIntPropertyMaxValue() reports U_JG_COUNT-1, so U_JG_COUNT has to
 * remain the last enumerator.
 * NOTE(review): the order presumably has to match the generated property
 * data -- confirm before inserting or reordering values.
 */
U_JG_NO_JOINING_GROUP,
|
||||
U_JG_AIN,
|
||||
U_JG_ALAPH,
|
||||
U_JG_ALEF,
|
||||
U_JG_BEH,
|
||||
U_JG_BETH,
|
||||
U_JG_DAL,
|
||||
U_JG_DALATH_RISH,
|
||||
U_JG_E,
|
||||
U_JG_FEH,
|
||||
U_JG_FINAL_SEMKATH,
|
||||
U_JG_GAF,
|
||||
U_JG_GAMAL,
|
||||
U_JG_HAH,
|
||||
U_JG_HAMZA_ON_HEH_GOAL,
|
||||
U_JG_HE,
|
||||
U_JG_HEH,
|
||||
U_JG_HEH_GOAL,
|
||||
U_JG_HETH,
|
||||
U_JG_KAF,
|
||||
U_JG_KAPH,
|
||||
U_JG_KNOTTED_HEH,
|
||||
U_JG_LAM,
|
||||
U_JG_LAMADH,
|
||||
U_JG_MEEM,
|
||||
U_JG_MIM,
|
||||
U_JG_NOON,
|
||||
U_JG_NUN,
|
||||
U_JG_PE,
|
||||
U_JG_QAF,
|
||||
U_JG_QAPH,
|
||||
U_JG_REH,
|
||||
U_JG_REVERSED_PE,
|
||||
U_JG_SAD,
|
||||
U_JG_SADHE,
|
||||
U_JG_SEEN,
|
||||
U_JG_SEMKATH,
|
||||
U_JG_SHIN,
|
||||
U_JG_SWASH_KAF,
|
||||
U_JG_SYRIAC_WAW,
|
||||
U_JG_TAH,
|
||||
U_JG_TAW,
|
||||
U_JG_TEH_MARBUTA,
|
||||
U_JG_TETH,
|
||||
U_JG_WAW,
|
||||
U_JG_YEH,
|
||||
U_JG_YEH_BARREE,
|
||||
U_JG_YEH_WITH_TAIL,
|
||||
U_JG_YUDH,
|
||||
U_JG_YUDH_HE,
|
||||
U_JG_ZAIN,
|
||||
U_JG_COUNT /* 51 */
|
||||
};
|
||||
typedef enum UJoiningGroup UJoiningGroup;
|
||||
|
||||
/**
|
||||
* Line Break constants.
|
||||
*
|
||||
* @see UCHAR_LINE_BREAK
|
||||
* @draft ICU 2.2
|
||||
*/
|
||||
enum ULineBreak {
    /*
     * Enumerators number consecutively from U_LB_UNKNOWN==0.
     * u_getIntPropertyValue(c, UCHAR_LINE_BREAK) returns these values from
     * bits 22..18 of properties vector word 0 (UPROPS_LB_MASK/UPROPS_LB_SHIFT);
     * a stored 0 is reported as U_LB_ALPHABETIC for assigned characters and
     * stays U_LB_UNKNOWN ("XX" in LineBreak.txt) for unassigned ones.
     * u_getIntPropertyMaxValue() reports U_LB_COUNT-1, so U_LB_COUNT has to
     * remain the last enumerator and the numeric values must not shift.
     */
    U_LB_UNKNOWN,
    U_LB_AMBIGUOUS,
    U_LB_ALPHABETIC,
    U_LB_BREAK_BOTH,
    U_LB_BREAK_AFTER,
    U_LB_BREAK_BEFORE,
    U_LB_MANDATORY_BREAK,
    U_LB_CONTINGENT_BREAK,
    U_LB_CLOSE_PUNCTUATION,
    U_LB_COMBINING_MARK,
    U_LB_CARRIAGE_RETURN,
    U_LB_EXCLAMATION,
    U_LB_GLUE,
    U_LB_HYPHEN,
    U_LB_IDEOGRAPHIC,
    /* Correctly spelled name; new code should use this one. */
    U_LB_INSEPARABLE,
    /*
     * Original, misspelled name. Kept as an alias with the same value so
     * existing callers keep compiling; it does not consume a new value.
     */
    U_LB_INSEPERABLE=U_LB_INSEPARABLE,
    U_LB_INFIX_NUMERIC,
    U_LB_LINE_FEED,
    U_LB_NONSTARTER,
    U_LB_NUMERIC,
    U_LB_OPEN_PUNCTUATION,
    U_LB_POSTFIX_NUMERIC,
    U_LB_PREFIX_NUMERIC,
    U_LB_QUOTATION,
    U_LB_COMPLEX_CONTEXT,
    U_LB_SURROGATE,
    U_LB_SPACE,
    U_LB_BREAK_SYMBOLS,
    U_LB_ZWSPACE,
    U_LB_COUNT /* 29 */
};
typedef enum ULineBreak ULineBreak;
|
||||
|
||||
/**
|
||||
* Numeric Type constants.
|
||||
*
|
||||
|
@ -26,7 +26,9 @@
|
||||
|
||||
/* helper definitions ------------------------------------------------------- */
|
||||
|
||||
#define CGJ 0x34f
|
||||
#define CGJ 0x034f
|
||||
#define ZWNJ 0x200C
|
||||
#define ZWJ 0x200D
|
||||
|
||||
/**
|
||||
* Unicode property names and property value names are compared
|
||||
@ -259,6 +261,7 @@ u_isUWhiteSpace(UChar32 c) {
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_getIntPropertyValue(UChar32 c, UProperty which) {
|
||||
UErrorCode errorCode;
|
||||
int32_t i;
|
||||
|
||||
if(which<UCHAR_BINARY_START) {
|
||||
return 0; /* undefined */
|
||||
@ -274,22 +277,40 @@ u_getIntPropertyValue(UChar32 c, UProperty which) {
|
||||
return (int32_t)ublock_getCode(c);
|
||||
case UCHAR_CANONICAL_COMBINING_CLASS:
|
||||
return u_getCombiningClass(c);
|
||||
#if 0 /* ### */
|
||||
case UCHAR_DECOMPOSITION_TYPE:
|
||||
return ;
|
||||
#endif
|
||||
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_DT_MASK);
|
||||
case UCHAR_EAST_ASIAN_WIDTH:
|
||||
return (int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_EA_WIDTH_MASK)>>UPROPS_EA_WIDTH_SHIFT;
|
||||
return (int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_EA_MASK)>>UPROPS_EA_SHIFT;
|
||||
case UCHAR_GENERAL_CATEGORY:
|
||||
return (int32_t)u_charType(c);
|
||||
#if 0 /* ### */
|
||||
case UCHAR_JOINING_GROUP:
|
||||
return ;
|
||||
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_JG_MASK)>>UPROPS_JG_SHIFT;
|
||||
case UCHAR_JOINING_TYPE:
|
||||
return ;
|
||||
/*
|
||||
* ArabicShaping.txt:
|
||||
* Note: Characters of joining type T and most characters of
|
||||
* joining type U are not explicitly listed in this file.
|
||||
*
|
||||
* Characters of joining type T can [be] derived by the following formula:
|
||||
* T = Mn + Cf - ZWNJ - ZWJ
|
||||
*/
|
||||
i=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_JG_MASK)>>UPROPS_JG_SHIFT;
|
||||
if(i==0 && c!=ZWNJ && c!=ZWJ && (FLAG(u_charType(c))&(_Mn|_Cf))!=0) {
|
||||
i=(int32_t)U_JT_TRANSPARENT;
|
||||
}
|
||||
return i;
|
||||
case UCHAR_LINE_BREAK:
|
||||
return ;
|
||||
#endif
|
||||
/*
|
||||
* LineBreak.txt:
|
||||
* - Assigned characters that are not listed explicitly are given the value
|
||||
* "AL".
|
||||
* - Unassigned characters are given the value "XX".
|
||||
*/
|
||||
i=(int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_LB_MASK)>>UPROPS_LB_SHIFT;
|
||||
if(i==0 && u_charType(c)!=0) {
|
||||
i=(int32_t)U_LB_ALPHABETIC;
|
||||
}
|
||||
return i;
|
||||
case UCHAR_NUMERIC_TYPE:
|
||||
return (int32_t)GET_NUMERIC_TYPE(u_getUnicodeProperties(c, -1));
|
||||
case UCHAR_SCRIPT:
|
||||
@ -317,6 +338,8 @@ u_getIntPropertyMinValue(UProperty which) {
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_getIntPropertyMaxValue(UProperty which) {
|
||||
int32_t max;
|
||||
|
||||
if(which<UCHAR_BINARY_START) {
|
||||
return 0; /* undefined */
|
||||
} else if(which<UCHAR_BINARY_LIMIT) {
|
||||
@ -328,31 +351,33 @@ u_getIntPropertyMaxValue(UProperty which) {
|
||||
case UCHAR_BIDI_CLASS:
|
||||
return (int32_t)U_CHAR_DIRECTION_COUNT-1;
|
||||
case UCHAR_BLOCK:
|
||||
/* ### TODO This should be data-driven from uprops.dat */
|
||||
return (int32_t)UBLOCK_COUNT-1;
|
||||
max=(uprv_getMaxValues()&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT;
|
||||
if(max==0) {
|
||||
max=(int32_t)UBLOCK_COUNT-1;
|
||||
}
|
||||
return max;
|
||||
case UCHAR_CANONICAL_COMBINING_CLASS:
|
||||
return 0xff; /* TODO do we need to be more precise, getting the actual maximum? */
|
||||
#if 0 /* ### */
|
||||
case UCHAR_DECOMPOSITION_TYPE:
|
||||
return ;
|
||||
#endif
|
||||
return (int32_t)U_DT_COUNT-1;
|
||||
case UCHAR_EAST_ASIAN_WIDTH:
|
||||
return (int32_t)U_EA_COUNT-1;
|
||||
case UCHAR_GENERAL_CATEGORY:
|
||||
return (int32_t)U_CHAR_CATEGORY_COUNT-1;
|
||||
#if 0 /* ### */
|
||||
case UCHAR_JOINING_GROUP:
|
||||
return ;
|
||||
return (int32_t)U_JG_COUNT-1;
|
||||
case UCHAR_JOINING_TYPE:
|
||||
return ;
|
||||
return (int32_t)U_JT_COUNT-1;
|
||||
case UCHAR_LINE_BREAK:
|
||||
return ;
|
||||
#endif
|
||||
return (int32_t)U_LB_COUNT-1;
|
||||
case UCHAR_NUMERIC_TYPE:
|
||||
return (int32_t)U_NT_COUNT-1;
|
||||
case UCHAR_SCRIPT:
|
||||
/* ### TODO This should be data-driven from uprops.dat */
|
||||
return (int32_t)USCRIPT_CODE_LIMIT-1;
|
||||
max=uprv_getMaxValues()&UPROPS_SCRIPT_MASK;
|
||||
if(max==0) {
|
||||
max=(int32_t)USCRIPT_CODE_LIMIT-1;
|
||||
}
|
||||
return max;
|
||||
default:
|
||||
return 0; /* undefined */
|
||||
}
|
||||
|
@ -30,7 +30,10 @@ enum {
|
||||
UPROPS_ADDITIONAL_VECTORS_INDEX,
|
||||
UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX,
|
||||
|
||||
UPROPS_RESERVED_INDEX,
|
||||
UPROPS_RESERVED_INDEX, /* 6 */
|
||||
|
||||
/* maximum values for block and script codes, bits used as in vector word 0 */
|
||||
UPROPS_MAX_VALUES_INDEX=10,
|
||||
|
||||
UPROPS_INDEX_COUNT=16
|
||||
};
|
||||
@ -73,13 +76,14 @@ enum {
|
||||
};
|
||||
|
||||
/* number of properties vector words */
|
||||
#define UPROPS_VECTOR_WORDS 2
|
||||
#define UPROPS_VECTOR_WORDS 3
|
||||
|
||||
/*
|
||||
* Properties in vector word 0
|
||||
* Bits
|
||||
* 31..24 DerivedAge version major/minor one nibble each
|
||||
* 23..18 reserved
|
||||
* 23 reserved
|
||||
* 22..18 Line Break
|
||||
* 17..15 East Asian Width
|
||||
* 14.. 7 UBlockCode
|
||||
* 6.. 0 UScriptCode
|
||||
@ -89,8 +93,11 @@ enum {
|
||||
#define UPROPS_AGE_MASK 0xff000000
|
||||
#define UPROPS_AGE_SHIFT 24
|
||||
|
||||
#define UPROPS_EA_WIDTH_MASK 0x00038000
|
||||
#define UPROPS_EA_WIDTH_SHIFT 15
|
||||
#define UPROPS_LB_MASK 0x007C0000
|
||||
#define UPROPS_LB_SHIFT 18
|
||||
|
||||
#define UPROPS_EA_MASK 0x00038000
|
||||
#define UPROPS_EA_SHIFT 15
|
||||
|
||||
#define UPROPS_BLOCK_MASK 0x00007f80
|
||||
#define UPROPS_BLOCK_SHIFT 7
|
||||
@ -142,6 +149,21 @@ enum {
|
||||
UPROPS_BINARY_1_TOP
|
||||
};
|
||||
|
||||
/*
|
||||
* Properties in vector word 2
|
||||
* Bits
|
||||
* 13..11 Joining Type
|
||||
* 10.. 5 Joining Group
|
||||
* 4.. 0 Decomposition Type
|
||||
*/
|
||||
#define UPROPS_JT_MASK 0x00003800
|
||||
#define UPROPS_JT_SHIFT 11
|
||||
|
||||
#define UPROPS_JG_MASK 0x000007e0
|
||||
#define UPROPS_JG_SHIFT 5
|
||||
|
||||
#define UPROPS_DT_MASK 0x0000001f
|
||||
|
||||
/**
|
||||
* Get a properties vector word for a code point.
|
||||
* Implemented in uchar.c for uprops.c.
|
||||
@ -151,6 +173,13 @@ enum {
|
||||
U_CFUNC uint32_t
|
||||
u_getUnicodeProperties(UChar32 c, int32_t column);
|
||||
|
||||
/**
|
||||
* Get the maximum values for some enum/int properties.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
uprv_getMaxValues(void);
|
||||
|
||||
/**
|
||||
* Unicode property names and property value names are compared
|
||||
* "loosely". Property[Value]Aliases.txt say:
|
||||
|
Loading…
Reference in New Issue
Block a user