ICU-1970 add properties: decomposition type, joining group, joining type, line break

X-SVN-Rev: 9039
This commit is contained in:
Markus Scherer 2002-07-04 16:46:36 +00:00
parent 0dcc70147f
commit 642c3b43aa
3 changed files with 241 additions and 31 deletions

View File

@ -233,7 +233,9 @@ enum UProperty {
/** Enumerated property Canonical_Combining_Class.
Same as u_getCombiningClass, returns 8-bit numeric values. @draft ICU 2.2 */
UCHAR_CANONICAL_COMBINING_CLASS,
/* UCHAR_DECOMPOSITION_TYPE, -- ### later ICU release */
/** Enumerated property Decomposition_Type.
Returns UDecompositionType values. @draft ICU 2.2 */
UCHAR_DECOMPOSITION_TYPE,
/** Enumerated property East_Asian_Width.
See http://www.unicode.org/reports/tr11/
Returns UEastAsianWidth values. @draft ICU 2.2 */
@ -241,9 +243,15 @@ enum UProperty {
/** Enumerated property General_Category.
Same as u_charType, returns UCharCategory values. @draft ICU 2.2 */
UCHAR_GENERAL_CATEGORY,
/* UCHAR_JOINING_GROUP, -- ### later ICU release */
/* UCHAR_JOINING_TYPE, -- ### later ICU release */
/* UCHAR_LINE_BREAK, -- ### later ICU release */
/** Enumerated property Joining_Group.
Returns UJoiningGroup values. @draft ICU 2.2 */
UCHAR_JOINING_GROUP,
/** Enumerated property Joining_Type.
Returns UJoiningType values. @draft ICU 2.2 */
UCHAR_JOINING_TYPE,
/** Enumerated property Line_Break.
Returns ULineBreak values. @draft ICU 2.2 */
UCHAR_LINE_BREAK,
/** Enumerated property Numeric_Type.
Returns UNumericType values. @draft ICU 2.2 */
UCHAR_NUMERIC_TYPE,
@ -1036,6 +1044,154 @@ enum UCharNameChoice {
/** @stable */
typedef enum UCharNameChoice UCharNameChoice;
/**
 * Values of the enumerated property Decomposition_Type.
 *
 * Constants are numbered consecutively from 0; U_DT_COUNT is not a
 * property value but the number of values defined here.
 * NOTE(review): the numbers are presumably shared with the generated
 * property data, so they are spelled out to guard against reordering - confirm.
 *
 * @see UCHAR_DECOMPOSITION_TYPE
 * @draft ICU 2.2
 */
enum UDecompositionType {
    U_DT_NONE      = 0,
    U_DT_CANONICAL = 1,
    U_DT_COMPAT    = 2,
    U_DT_CIRCLE    = 3,
    U_DT_FINAL     = 4,
    U_DT_FONT      = 5,
    U_DT_FRACTION  = 6,
    U_DT_INITIAL   = 7,
    U_DT_ISOLATED  = 8,
    U_DT_MEDIAL    = 9,
    U_DT_NARROW    = 10,
    U_DT_NOBREAK   = 11,
    U_DT_SMALL     = 12,
    U_DT_SQUARE    = 13,
    U_DT_SUB       = 14,
    U_DT_SUPER     = 15,
    U_DT_VERTICAL  = 16,
    U_DT_WIDE      = 17,
    /** Number of Decomposition_Type values; keep last. */
    U_DT_COUNT     = 18
};

typedef enum UDecompositionType UDecompositionType;
/**
 * Values of the enumerated property Joining_Type.
 *
 * Constants are numbered consecutively from 0; U_JT_COUNT is not a
 * property value but the number of values defined here.
 *
 * @see UCHAR_JOINING_TYPE
 * @draft ICU 2.2
 */
enum UJoiningType {
    U_JT_NON_JOINING   = 0,
    U_JT_JOIN_CAUSING  = 1,
    U_JT_DUAL_JOINING  = 2,
    U_JT_LEFT_JOINING  = 3,
    U_JT_RIGHT_JOINING = 4,
    U_JT_TRANSPARENT   = 5,
    /** Number of Joining_Type values; keep last. */
    U_JT_COUNT         = 6
};

typedef enum UJoiningType UJoiningType;
/**
 * Values of the enumerated property Joining_Group.
 *
 * Apart from the leading U_JG_NO_JOINING_GROUP, the constants are listed
 * in alphabetical order and numbered consecutively from 0; U_JG_COUNT is
 * not a property value but the number of values defined here.
 *
 * @see UCHAR_JOINING_GROUP
 * @draft ICU 2.2
 */
enum UJoiningGroup {
    U_JG_NO_JOINING_GROUP   = 0,
    U_JG_AIN                = 1,
    U_JG_ALAPH              = 2,
    U_JG_ALEF               = 3,
    U_JG_BEH                = 4,
    U_JG_BETH               = 5,
    U_JG_DAL                = 6,
    U_JG_DALATH_RISH        = 7,
    U_JG_E                  = 8,
    U_JG_FEH                = 9,
    U_JG_FINAL_SEMKATH      = 10,
    U_JG_GAF                = 11,
    U_JG_GAMAL              = 12,
    U_JG_HAH                = 13,
    U_JG_HAMZA_ON_HEH_GOAL  = 14,
    U_JG_HE                 = 15,
    U_JG_HEH                = 16,
    U_JG_HEH_GOAL           = 17,
    U_JG_HETH               = 18,
    U_JG_KAF                = 19,
    U_JG_KAPH               = 20,
    U_JG_KNOTTED_HEH        = 21,
    U_JG_LAM                = 22,
    U_JG_LAMADH             = 23,
    U_JG_MEEM               = 24,
    U_JG_MIM                = 25,
    U_JG_NOON               = 26,
    U_JG_NUN                = 27,
    U_JG_PE                 = 28,
    U_JG_QAF                = 29,
    U_JG_QAPH               = 30,
    U_JG_REH                = 31,
    U_JG_REVERSED_PE        = 32,
    U_JG_SAD                = 33,
    U_JG_SADHE              = 34,
    U_JG_SEEN               = 35,
    U_JG_SEMKATH            = 36,
    U_JG_SHIN               = 37,
    U_JG_SWASH_KAF          = 38,
    U_JG_SYRIAC_WAW         = 39,
    U_JG_TAH                = 40,
    U_JG_TAW                = 41,
    U_JG_TEH_MARBUTA        = 42,
    U_JG_TETH               = 43,
    U_JG_WAW                = 44,
    U_JG_YEH                = 45,
    U_JG_YEH_BARREE         = 46,
    U_JG_YEH_WITH_TAIL      = 47,
    U_JG_YUDH               = 48,
    U_JG_YUDH_HE            = 49,
    U_JG_ZAIN               = 50,
    /** Number of Joining_Group values; keep last. */
    U_JG_COUNT              = 51
};

typedef enum UJoiningGroup UJoiningGroup;
/**
 * Values of the enumerated property Line_Break.
 *
 * Constants are numbered consecutively from 0; U_LB_COUNT is not a
 * property value but the number of values defined here.
 *
 * U_LB_INSEPARABLE is the correctly spelled name; the original misspelled
 * U_LB_INSEPERABLE is kept with the same value so existing callers
 * continue to compile.
 *
 * @see UCHAR_LINE_BREAK
 * @draft ICU 2.2
 */
enum ULineBreak {
    U_LB_UNKNOWN           = 0,
    U_LB_AMBIGUOUS         = 1,
    U_LB_ALPHABETIC        = 2,
    U_LB_BREAK_BOTH        = 3,
    U_LB_BREAK_AFTER       = 4,
    U_LB_BREAK_BEFORE      = 5,
    U_LB_MANDATORY_BREAK   = 6,
    U_LB_CONTINGENT_BREAK  = 7,
    U_LB_CLOSE_PUNCTUATION = 8,
    U_LB_COMBINING_MARK    = 9,
    U_LB_CARRIAGE_RETURN   = 10,
    U_LB_EXCLAMATION       = 11,
    U_LB_GLUE              = 12,
    U_LB_HYPHEN            = 13,
    U_LB_IDEOGRAPHIC       = 14,
    /** Misspelled legacy name; prefer U_LB_INSEPARABLE. */
    U_LB_INSEPERABLE       = 15,
    /** Correctly spelled alias of U_LB_INSEPERABLE (same value). */
    U_LB_INSEPARABLE       = U_LB_INSEPERABLE,
    U_LB_INFIX_NUMERIC     = 16,
    U_LB_LINE_FEED         = 17,
    U_LB_NONSTARTER        = 18,
    U_LB_NUMERIC           = 19,
    U_LB_OPEN_PUNCTUATION  = 20,
    U_LB_POSTFIX_NUMERIC   = 21,
    U_LB_PREFIX_NUMERIC    = 22,
    U_LB_QUOTATION         = 23,
    U_LB_COMPLEX_CONTEXT   = 24,
    U_LB_SURROGATE         = 25,
    U_LB_SPACE             = 26,
    U_LB_BREAK_SYMBOLS     = 27,
    U_LB_ZWSPACE           = 28,
    /** Number of distinct Line_Break values (the alias is not counted); keep last. */
    U_LB_COUNT             = 29
};

typedef enum ULineBreak ULineBreak;
/**
* Numeric Type constants.
*

View File

@ -26,7 +26,9 @@
/* helper definitions ------------------------------------------------------- */
#define CGJ 0x34f
#define CGJ 0x034f
#define ZWNJ 0x200C
#define ZWJ 0x200D
/**
* Unicode property names and property value names are compared
@ -259,6 +261,7 @@ u_isUWhiteSpace(UChar32 c) {
U_CAPI int32_t U_EXPORT2
u_getIntPropertyValue(UChar32 c, UProperty which) {
UErrorCode errorCode;
int32_t i;
if(which<UCHAR_BINARY_START) {
return 0; /* undefined */
@ -274,22 +277,40 @@ u_getIntPropertyValue(UChar32 c, UProperty which) {
return (int32_t)ublock_getCode(c);
case UCHAR_CANONICAL_COMBINING_CLASS:
return u_getCombiningClass(c);
#if 0 /* ### */
case UCHAR_DECOMPOSITION_TYPE:
return ;
#endif
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_DT_MASK);
case UCHAR_EAST_ASIAN_WIDTH:
return (int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_EA_WIDTH_MASK)>>UPROPS_EA_WIDTH_SHIFT;
return (int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_EA_MASK)>>UPROPS_EA_SHIFT;
case UCHAR_GENERAL_CATEGORY:
return (int32_t)u_charType(c);
#if 0 /* ### */
case UCHAR_JOINING_GROUP:
return ;
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_JG_MASK)>>UPROPS_JG_SHIFT;
case UCHAR_JOINING_TYPE:
return ;
/*
* ArabicShaping.txt:
* Note: Characters of joining type T and most characters of
* joining type U are not explicitly listed in this file.
*
* Characters of joining type T can [be] derived by the following formula:
* T = Mn + Cf - ZWNJ - ZWJ
*/
i=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_JG_MASK)>>UPROPS_JG_SHIFT;
if(i==0 && c!=ZWNJ && c!=ZWJ && (FLAG(u_charType(c))&(_Mn|_Cf))!=0) {
i=(int32_t)U_JT_TRANSPARENT;
}
return i;
case UCHAR_LINE_BREAK:
return ;
#endif
/*
* LineBreak.txt:
* - Assigned characters that are not listed explicitly are given the value
* "AL".
* - Unassigned characters are given the value "XX".
*/
i=(int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_LB_MASK)>>UPROPS_LB_SHIFT;
if(i==0 && u_charType(c)!=0) {
i=(int32_t)U_LB_ALPHABETIC;
}
return i;
case UCHAR_NUMERIC_TYPE:
return (int32_t)GET_NUMERIC_TYPE(u_getUnicodeProperties(c, -1));
case UCHAR_SCRIPT:
@ -317,6 +338,8 @@ u_getIntPropertyMinValue(UProperty which) {
U_CAPI int32_t U_EXPORT2
u_getIntPropertyMaxValue(UProperty which) {
int32_t max;
if(which<UCHAR_BINARY_START) {
return 0; /* undefined */
} else if(which<UCHAR_BINARY_LIMIT) {
@ -328,31 +351,33 @@ u_getIntPropertyMaxValue(UProperty which) {
case UCHAR_BIDI_CLASS:
return (int32_t)U_CHAR_DIRECTION_COUNT-1;
case UCHAR_BLOCK:
/* ### TODO This should be data-driven from uprops.dat */
return (int32_t)UBLOCK_COUNT-1;
max=(uprv_getMaxValues()&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT;
if(max==0) {
max=(int32_t)UBLOCK_COUNT-1;
}
return max;
case UCHAR_CANONICAL_COMBINING_CLASS:
return 0xff; /* TODO do we need to be more precise, getting the actual maximum? */
#if 0 /* ### */
case UCHAR_DECOMPOSITION_TYPE:
return ;
#endif
return (int32_t)U_DT_COUNT-1;
case UCHAR_EAST_ASIAN_WIDTH:
return (int32_t)U_EA_COUNT-1;
case UCHAR_GENERAL_CATEGORY:
return (int32_t)U_CHAR_CATEGORY_COUNT-1;
#if 0 /* ### */
case UCHAR_JOINING_GROUP:
return ;
return (int32_t)U_JG_COUNT-1;
case UCHAR_JOINING_TYPE:
return ;
return (int32_t)U_JT_COUNT-1;
case UCHAR_LINE_BREAK:
return ;
#endif
return (int32_t)U_LB_COUNT-1;
case UCHAR_NUMERIC_TYPE:
return (int32_t)U_NT_COUNT-1;
case UCHAR_SCRIPT:
/* ### TODO This should be data-driven from uprops.dat */
return (int32_t)USCRIPT_CODE_LIMIT-1;
max=uprv_getMaxValues()&UPROPS_SCRIPT_MASK;
if(max==0) {
max=(int32_t)USCRIPT_CODE_LIMIT-1;
}
return max;
default:
return 0; /* undefined */
}

View File

@ -30,7 +30,10 @@ enum {
UPROPS_ADDITIONAL_VECTORS_INDEX,
UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX,
UPROPS_RESERVED_INDEX,
UPROPS_RESERVED_INDEX, /* 6 */
/* maximum values for block and script codes, bits used as in vector word 0 */
UPROPS_MAX_VALUES_INDEX=10,
UPROPS_INDEX_COUNT=16
};
@ -73,13 +76,14 @@ enum {
};
/* number of properties vector words */
#define UPROPS_VECTOR_WORDS 2
#define UPROPS_VECTOR_WORDS 3
/*
* Properties in vector word 0
* Bits
* 31..24 DerivedAge version major/minor one nibble each
* 23..18 reserved
* 23 reserved
* 22..18 Line Break
* 17..15 East Asian Width
* 14.. 7 UBlockCode
* 6.. 0 UScriptCode
@ -89,8 +93,11 @@ enum {
#define UPROPS_AGE_MASK 0xff000000
#define UPROPS_AGE_SHIFT 24
#define UPROPS_EA_WIDTH_MASK 0x00038000
#define UPROPS_EA_WIDTH_SHIFT 15
#define UPROPS_LB_MASK 0x007C0000
#define UPROPS_LB_SHIFT 18
#define UPROPS_EA_MASK 0x00038000
#define UPROPS_EA_SHIFT 15
#define UPROPS_BLOCK_MASK 0x00007f80
#define UPROPS_BLOCK_SHIFT 7
@ -142,6 +149,21 @@ enum {
UPROPS_BINARY_1_TOP
};
/*
 * Bit fields of properties vector word 2, lowest field first:
 *    4.. 0   Decomposition Type (starts at bit 0, so no shift is defined)
 *   10.. 5   Joining Group
 *   13..11   Joining Type
 *
 * To read a field: (word & UPROPS_xx_MASK) >> UPROPS_xx_SHIFT
 */

/* Decomposition Type: 5 bits */
#define UPROPS_DT_MASK 0x0000001f

/* Joining Group: 6 bits */
#define UPROPS_JG_SHIFT 5
#define UPROPS_JG_MASK 0x000007e0

/* Joining Type: 3 bits */
#define UPROPS_JT_SHIFT 11
#define UPROPS_JT_MASK 0x00003800
/**
* Get a properties vector word for a code point.
* Implemented in uchar.c for uprops.c.
@ -151,6 +173,13 @@ enum {
U_CFUNC uint32_t
u_getUnicodeProperties(UChar32 c, int32_t column);
/**
 * Get the maximum values for some enum/int properties.
 *
 * The result packs the maxima for the block and script codes into one
 * word, using the same bit positions as properties vector word 0
 * (extract them with UPROPS_BLOCK_MASK/UPROPS_BLOCK_SHIFT and
 * UPROPS_SCRIPT_MASK). Callers treat a field value of 0 as "not
 * available" and fall back to their compile-time limits.
 *
 * @return the packed maximum values
 * @internal
 */
U_CFUNC int32_t
uprv_getMaxValues(void);
/**
* Unicode property names and property value names are compared
* "loosely". Property[Value]Aliases.txt say: