ICU-10128 update ICU to Unicode 6.3 beta (merge from branches/markus/uni63 at r33585)

X-SVN-Rev: 33663
This commit is contained in:
Markus Scherer 2013-05-15 22:16:48 +00:00
parent 2982958b06
commit 54eb776527
29 changed files with 278596 additions and 676 deletions

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and
* Copyright (C) 1996-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -16,7 +16,6 @@ import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.IntTrie;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.CollationParsedRuleBuilder.InverseUCA;
import com.ibm.icu.text.RuleBasedCollator.LeadByteConstants;
import com.ibm.icu.text.RuleBasedCollator.UCAConstants;
@ -138,13 +137,14 @@ final class CollatorReader {
*/
private CollatorReader(InputStream inputStream, boolean readICUHeader) throws IOException {
if (readICUHeader) {
byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, UCA_AUTHENTICATE_);
// weiv: check that we have the correct Unicode version in
// binary files
VersionInfo UCDVersion = UCharacter.getUnicodeVersion();
if (UnicodeVersion[0] != UCDVersion.getMajor() || UnicodeVersion[1] != UCDVersion.getMinor()) {
throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
}
ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, UCA_AUTHENTICATE_);
// Note: In ICU 51 and earlier,
// we used to check that the UCA data version (readHeader() return value)
// matches the UCD version (UCharacter.getUnicodeVersion())
// but that complicated version updates, and
// a mismatch is "only" a problem for handling canonical equivalence.
// It need not be a fatal error.
// throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
}
m_dataInputStream_ = new DataInputStream(inputStream);
}
@ -512,15 +512,11 @@ final class CollatorReader {
* thrown when error occurs while reading the inverse uca
*/
private static CollationParsedRuleBuilder.InverseUCA readInverseUCA(InputStream inputStream) throws IOException {
byte[] UnicodeVersion = ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_,
INVERSE_UCA_AUTHENTICATE_);
ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_, INVERSE_UCA_AUTHENTICATE_);
// weiv: check that we have the correct Unicode version in
// binary files
VersionInfo UCDVersion = UCharacter.getUnicodeVersion();
if (UnicodeVersion[0] != UCDVersion.getMajor() || UnicodeVersion[1] != UCDVersion.getMinor()) {
throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
}
// TODO: Check that the invuca data version (readHeader() return value)
// matches the ucadata version.
// throw new IOException(WRONG_UNICODE_VERSION_ERROR_);
CollationParsedRuleBuilder.InverseUCA result = new CollationParsedRuleBuilder.InverseUCA();
DataInputStream input = new DataInputStream(inputStream);
@ -616,7 +612,7 @@ final class CollatorReader {
/**
* Wrong unicode version error string
*/
private static final String WRONG_UNICODE_VERSION_ERROR_ = "Unicode version in binary image is not compatible with the current Unicode version";
// private static final String WRONG_UNICODE_VERSION_ERROR_ = "Unicode version in binary image is not compatible with the current Unicode version";
/**
* Size of expansion table in bytes

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2004-2011, International Business Machines
* Copyright (C) 2004-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -152,6 +152,8 @@ public final class UBiDiProps {
return (max&MAX_JG_MASK)>>MAX_JG_SHIFT;
case UProperty.JOINING_TYPE:
return (max&JT_MASK)>>JT_SHIFT;
case UProperty.BIDI_PAIRED_BRACKET_TYPE:
return (max&BPT_MASK)>>BPT_SHIFT;
default:
return -1; /* undefined */
}
@ -165,12 +167,8 @@ public final class UBiDiProps {
return getFlagFromProps(trie.get(c), IS_MIRRORED_SHIFT);
}
public final int getMirror(int c) {
int props;
int delta;
props=trie.get(c);
delta=((short)props)>>MIRROR_DELTA_SHIFT;
private final int getMirror(int c, int props) {
int delta=getMirrorDeltaFromProps(props);
if(delta!=ESC_MIRROR_DELTA) {
return c+delta;
} else {
@ -198,6 +196,11 @@ public final class UBiDiProps {
}
}
/**
* Looks up the trie value for c and delegates to the
* getMirror(int, int) overload with that value.
*
* @param c the code point to look up
* @return the result of getMirror(c, props) for the looked-up props
*/
public final int getMirror(int c) {
int props=trie.get(c);
return getMirror(c, props);
}
/**
* Reports whether the bit at position BIDI_CONTROL_SHIFT is set
* in the trie value for c.
*
* @param c the code point to look up
* @return true if that bit is set, false otherwise
*/
public final boolean isBidiControl(int c) {
return getFlagFromProps(trie.get(c), BIDI_CONTROL_SHIFT);
}
@ -222,6 +225,19 @@ public final class UBiDiProps {
}
}
/**
* Extracts the Bidi_Paired_Bracket_Type (bpt) field from the trie value
* for c: the bits selected by BPT_MASK, shifted down by BPT_SHIFT.
*
* @param c the code point to look up
* @return the bpt field value; 0 when none of the BPT_MASK bits are set
*/
public final int getPairedBracketType(int c) {
return (trie.get(c)&BPT_MASK)>>BPT_SHIFT;
}
/**
* Maps c to its paired bracket, using the same mapping as getMirror
* but only when the Bidi_Paired_Bracket_Type bits of c are non-zero.
*
* @param c the code point to map
* @return c itself if no BPT_MASK bit is set in its trie value,
*         otherwise the result of getMirror(c, props)
*/
public final int getPairedBracket(int c) {
int props=trie.get(c);
// All bpt bits clear: c has no paired bracket, so it maps to itself.
if((props&BPT_MASK)==0) {
return c;
} else {
// Reuse the already-fetched props instead of a second trie lookup.
return getMirror(c, props);
}
}
// data members -------------------------------------------------------- ***
private int indexes[];
private int mirrors[];
@ -254,7 +270,7 @@ public final class UBiDiProps {
/* CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */
private static final int JT_SHIFT=5; /* joining type: 3 bits (7..5) */
/* private static final int _SHIFT=8, reserved: 2 bits (9..8) */
private static final int BPT_SHIFT=8; /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */
private static final int JOIN_CONTROL_SHIFT=10;
private static final int BIDI_CONTROL_SHIFT=11;
@ -266,6 +282,7 @@ public final class UBiDiProps {
private static final int CLASS_MASK= 0x0000001f;
private static final int JT_MASK= 0x000000e0;
private static final int BPT_MASK= 0x00000300;
private static final int MAX_JG_MASK= 0x00ff0000;
@ -275,6 +292,9 @@ public final class UBiDiProps {
/**
* Tests a single bit of a properties word.
*
* @param props the properties word to test
* @param shift the bit position to test (0 = least significant bit)
* @return true if the bit at position shift is 1, false if it is 0
*/
private static final boolean getFlagFromProps(int props, int shift) {
return ((props>>shift)&1)!=0;
}
/**
* Extracts the signed mirror delta from a properties word.
*
* @param props the properties word
* @return the low 16 bits of props, taken as a signed short and
*         arithmetically shifted right by MIRROR_DELTA_SHIFT;
*         the result may be negative
*/
private static final int getMirrorDeltaFromProps(int props) {
// The cast binds tighter than >>: props is narrowed to a signed short
// first, so the shift sign-extends from bit 15 rather than bit 31.
return (short)props>>MIRROR_DELTA_SHIFT;
}
private static final int ESC_MIRROR_DELTA=-4;
//private static final int MIN_MIRROR_DELTA=-3;

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 1996-2012, International Business Machines Corporation and
* Copyright (C) 1996-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -560,6 +560,11 @@ public final class UCharacterProperty
new IntProperty(2, GCB_MASK, GCB_SHIFT), // GRAPHEME_CLUSTER_BREAK
new IntProperty(2, SB_MASK, SB_SHIFT), // SENTENCE_BREAK
new IntProperty(2, WB_MASK, WB_SHIFT), // WORD_BREAK
new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
int getValue(int c) {
return UBiDiProps.INSTANCE.getPairedBracketType(c);
}
},
};
public int getIntPropertyValue(int c, int which) {

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 1996-2012, International Business Machines Corporation and
* Copyright (C) 1996-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -2823,10 +2823,16 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
public static final int NEWLINE = 12;
/** @stable ICU 50 */
public static final int REGIONAL_INDICATOR = 13; /*[RI]*/ /* new in Unicode 6.2/ICU 50 */
/** @stable ICU 52 */
public static final int HEBREW_LETTER = 14; /*[HL]*/ /* from here on: new in Unicode 6.3/ICU 52 */
/** @stable ICU 52 */
public static final int SINGLE_QUOTE = 15; /*[SQ]*/
/** @stable ICU 52 */
public static final int DOUBLE_QUOTE = 16; /*[DQ]*/
/**
* @stable ICU 4.0
*/
public static final int COUNT = 14;
public static final int COUNT = 17;
}
/**
@ -3139,6 +3145,34 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
public static final int COUNT = 6;
}
/**
* Bidi Paired Bracket Type constants.
*
* @see UProperty#BIDI_PAIRED_BRACKET_TYPE
* @stable ICU 52
*/
public static interface BidiPairedBracketType {
/**
* Not a paired bracket.
* @stable ICU 52
*/
public static final int NONE = 0;
/**
* Open paired bracket.
* @stable ICU 52
*/
public static final int OPEN = 1;
/**
* Close paired bracket.
* @stable ICU 52
*/
public static final int CLOSE = 2;
/**
* Number of Bidi Paired Bracket Type constants defined above
* (NONE, OPEN, CLOSE); one more than the highest value.
* @stable ICU 52
*/
public static final int COUNT = 3;
}
// public data members -----------------------------------------------
/**
@ -3937,6 +3971,26 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
return UBiDiProps.INSTANCE.getMirror(ch);
}
/**
* {@icu} Maps the specified character to its paired bracket character.
* For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int).
* Otherwise c itself is returned.
* See http://www.unicode.org/reports/tr9/
*
* @param c the code point to be mapped
* @return the paired bracket code point,
* or c itself if there is no such mapping
* (Bidi_Paired_Bracket_Type=None)
*
* @see UProperty#BIDI_PAIRED_BRACKET
* @see UProperty#BIDI_PAIRED_BRACKET_TYPE
* @see #getMirror(int)
* @stable ICU 52
*/
public static int getBidiPairedBracket(int c) {
// Pure delegation: the mapping itself is done by UBiDiProps.getPairedBracket.
return UBiDiProps.INSTANCE.getPairedBracket(c);
}
/**
* {@icu} Returns the combining class of the argument codepoint
* @param ch code point whose combining is to be retrieved

View File

@ -1,7 +1,7 @@
/**
*******************************************************************************
* Copyright (C) 1996-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 1996-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -78,6 +78,14 @@ public final class UCharacterDirection implements ECharacterDirection {
return "Non-Spacing Mark";
case BOUNDARY_NEUTRAL :
return "Boundary Neutral";
case FIRST_STRONG_ISOLATE:
return "First Strong Isolate";
case LEFT_TO_RIGHT_ISOLATE:
return "Left-to-Right Isolate";
case RIGHT_TO_LEFT_ISOLATE:
return "Right-to-Left Isolate";
case POP_DIRECTIONAL_ISOLATE:
return "Pop Directional Isolate";
}
return "Unassigned";
}

View File

@ -1,7 +1,7 @@
/**
*******************************************************************************
* Copyright (C) 2004-2007, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 2004-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -475,11 +475,35 @@ public class UCharacterEnums {
*/
public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = (byte)BOUNDARY_NEUTRAL;
/**
* Directional type FSI
* @stable ICU 52
*/
public static final byte FIRST_STRONG_ISOLATE = 19;
/**
* Directional type LRI
* @stable ICU 52
*/
public static final byte LEFT_TO_RIGHT_ISOLATE = 20;
/**
* Directional type RLI
* @stable ICU 52
*/
public static final byte RIGHT_TO_LEFT_ISOLATE = 21;
/**
* Directional type PDI
* @stable ICU 52
*/
public static final byte POP_DIRECTIONAL_ISOLATE = 22;
/**
* Number of directional types
* @stable ICU 2.1
*/
public static final int CHAR_DIRECTION_COUNT = 19;
public static final int CHAR_DIRECTION_COUNT = 23;
/**
* Undefined bidirectional character type. Undefined <code>char</code>

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 1996-2012, International Business Machines Corporation and
* Copyright (C) 1996-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -605,7 +605,7 @@ public interface UProperty
/**
* Enumerated property Hangul_Syllable_Type, new in Unicode 4.
* Returns HangulSyllableType values.
* Returns UCharacter.HangulSyllableType values.
* @stable ICU 2.6
*/
public static final int HANGUL_SYLLABLE_TYPE = 0x100B;
@ -664,7 +664,7 @@ public interface UProperty
* Enumerated property Grapheme_Cluster_Break (new in Unicode 4.1).
* Used in UAX #29: Text Boundaries
* (http://www.unicode.org/reports/tr29/)
* Returns UGraphemeClusterBreak values.
* Returns UCharacter.GraphemeClusterBreak values.
* @stable ICU 3.4
*/
public static final int GRAPHEME_CLUSTER_BREAK = 0x1012;
@ -673,7 +673,7 @@ public interface UProperty
* Enumerated property Sentence_Break (new in Unicode 4.1).
* Used in UAX #29: Text Boundaries
* (http://www.unicode.org/reports/tr29/)
* Returns USentenceBreak values.
* Returns UCharacter.SentenceBreak values.
* @stable ICU 3.4
*/
public static final int SENTENCE_BREAK = 0x1013;
@ -682,17 +682,26 @@ public interface UProperty
* Enumerated property Word_Break (new in Unicode 4.1).
* Used in UAX #29: Text Boundaries
* (http://www.unicode.org/reports/tr29/)
* Returns UWordBreakValues values.
* Returns UCharacter.WordBreak values.
* @stable ICU 3.4
*/
public static final int WORD_BREAK = 0x1014;
/**
* Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
* Used in UAX #9: Unicode Bidirectional Algorithm
* (http://www.unicode.org/reports/tr9/)
* Returns UCharacter.BidiPairedBracketType values.
* @stable ICU 52
*/
public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
/**
* One more than the last constant for enumerated/integer Unicode
* properties.
* @stable ICU 2.4
*/
public static final int INT_LIMIT = 0x1015;
public static final int INT_LIMIT = 0x1016;
/**
* Bitmask property General_Category_Mask.
@ -835,16 +844,21 @@ public interface UProperty
*/
public static final int UPPERCASE_MAPPING = 0x400C;
/**
* String property Bidi_Paired_Bracket (new in Unicode 6.3).
* Corresponds to UCharacter.getBidiPairedBracket.
* @stable ICU 52
*/
public static final int BIDI_PAIRED_BRACKET = 0x400D;
/**
* One more than the last constant for string Unicode properties.
* @stable ICU 2.4
*/
public static final int STRING_LIMIT = 0x400D;
public static final int STRING_LIMIT = 0x400E;
/**
* Provisional property Script_Extensions (new in Unicode 6.0).
* As a provisional property, it may be modified or removed
* in future versions of the Unicode Standard, and thus in ICU.
* Miscellaneous property Script_Extensions (new in Unicode 6.0).
* Some characters are commonly used in multiple scripts.
* For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
* Corresponds to UScript.hasScript and UScript.getScriptExtensions.

View File

@ -523,11 +523,16 @@ public final class UScript {
* @stable ICU 3.6
*/
public static final int PHOENICIAN = 91; /* Phnx */
/**
* ISO 15924 script code
* @stable ICU 52
*/
public static final int MIAO = 92; /* Plrd */
/**
* ISO 15924 script code
* @stable ICU 3.6
*/
public static final int PHONETIC_POLLARD = 92; /* Plrd */
public static final int PHONETIC_POLLARD = MIAO;
/**
* ISO 15924 script code
* @stable ICU 3.6
@ -584,7 +589,6 @@ public final class UScript {
*/
public static final int UNKNOWN = 103;/* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */
/* Private use codes from Qaaa - Qabx are not supported*/
/**
* ISO 15924 script code
* @stable ICU 3.8
@ -878,6 +882,18 @@ public final class UScript {
* @stable ICU 49
*/
public static final int TIRHUTA = 158;/* Tirh */
/**
* ISO 15924 script code
* @stable ICU 52
*/
public static final int CAUCASIAN_ALBANIAN = 159; /* Aghb */
/**
* ISO 15924 script code
* @stable ICU 52
*/
public static final int MAHAJANI = 160; /* Mahj */
/* Private use codes from Qaaa - Qabx are not supported */
/**
* One higher than the last ISO 15924 script code integer.
@ -885,7 +901,7 @@ public final class UScript {
* for which integer constants are added above.
* @stable ICU 2.4
*/
public static final int CODE_LIMIT = 159;
public static final int CODE_LIMIT = 161;
private static final String kLocaleScript = "LocaleScript";
@ -1324,6 +1340,8 @@ public final class UScript {
0,
0,
0,
0,
0,
// End copy-paste from parsescriptmetadata.py
};

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2001-2012, International Business Machines
* Copyright (C) 2001-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
@ -850,6 +850,10 @@ public class Bidi {
static final byte PDF = UCharacterDirection.POP_DIRECTIONAL_FORMAT;
static final byte NSM = UCharacterDirection.DIR_NON_SPACING_MARK;
static final byte BN = UCharacterDirection.BOUNDARY_NEUTRAL;
static final byte FSI = UCharacterDirection.FIRST_STRONG_ISOLATE;
static final byte LRI = UCharacterDirection.LEFT_TO_RIGHT_ISOLATE;
static final byte RLI = UCharacterDirection.RIGHT_TO_LEFT_ISOLATE;
static final byte PDI = UCharacterDirection.POP_DIRECTIONAL_ISOLATE;
static final int MASK_R_AL = (1 << R | 1 << AL);
@ -3954,11 +3958,14 @@ public class Bidi {
int dir;
if (customClassifier == null ||
(dir = customClassifier.classify(c)) == Bidi.CLASS_DEFAULT) {
return bdp.getClass(c);
} else {
return dir;
(dir = customClassifier.classify(c)) == Bidi.CLASS_DEFAULT) {
dir = bdp.getClass(c);
}
if (dir > 18) {
// TODO: Implement Unicode 6.3 BiDi isolates in the ICU BiDi code.
dir = ON;
}
return dir;
}
/**

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
* Copyright (C) 1996-2013, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 1996-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -143,6 +143,12 @@ public final class VersionInfo implements Comparable<VersionInfo>
*/
public static final VersionInfo UNICODE_6_2;
/**
* Unicode 6.3 version
* @stable ICU 52
*/
public static final VersionInfo UNICODE_6_3;
/**
* ICU4J current release version
* @stable ICU 2.8
@ -505,10 +511,11 @@ public final class VersionInfo implements Comparable<VersionInfo>
UNICODE_6_0 = getInstance(6, 0, 0, 0);
UNICODE_6_1 = getInstance(6, 1, 0, 0);
UNICODE_6_2 = getInstance(6, 2, 0, 0);
UNICODE_6_3 = getInstance(6, 3, 0, 0);
ICU_VERSION = getInstance(52, 0, 1, 0);
ICU_DATA_VERSION = getInstance(52, 0, 1, 0);
UNICODE_VERSION = UNICODE_6_2;
UNICODE_VERSION = UNICODE_6_3;
UCOL_RUNTIME_VERSION = getInstance(7);
UCOL_BUILDER_VERSION = getInstance(8);
@ -529,9 +536,9 @@ public final class VersionInfo implements Comparable<VersionInfo>
/**
* Gets the int from the version numbers
* @param major non-negative version number
* @param minor non-negativeversion number
* @param milli non-negativeversion number
* @param micro non-negativeversion number
* @param minor non-negative version number
* @param milli non-negative version number
* @param micro non-negative version number
*/
private static int getInt(int major, int minor, int milli, int micro)
{

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b246383d534609ffb6a14c49236f9c325b445cb60e97c0ddf5948ac47406d82f
size 10576546
oid sha256:dd2c9faa74a7029d57014b097e4b380f038c54d837ca90b197321e183eca432d
size 10578576

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:43cc25f91794de5c1ba2aa48032a9e69653da5e00d0f0fc5cc72db5d2fb0a170
oid sha256:ee9e2b3884dcfd3d4905d0005280aa429b8b1846667f9abb475157cd6fc3f3d0
size 98429

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bbedc52b17fb2c3437c1d6498655b6223654395d08edf79d661401f3dab224d0
size 724510
oid sha256:2dd62bbd4ef0fe4b89295bffdb22c67e718effb1ada3a5e28e516b7e3576adec
size 724514

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2002-2012, International Business Machines Corporation and
* Copyright (C) 2002-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -436,7 +436,11 @@ public class CollationAPITest extends TestFmwk {
// Assume that the UCD and UCA versions are the same,
// rather than hardcoding (and updating each time) a particular UCA version.
VersionInfo ucdVersion = UCharacter.getUnicodeVersion();
doAssert(col.getUCAVersion().equals(ucdVersion), "Expected UCA version "+ucdVersion.toString()+" got "+col.getUCAVersion().toString());
VersionInfo ucaVersion = col.getUCAVersion();
doAssert(logKnownIssue("9101", "update to collv2 & UCA 6.3") ?
ucdVersion.getMajor() == 6 && ucdVersion.getMinor() == 3 :
ucaVersion.equals(ucdVersion),
"Expected UCA version "+ucdVersion.toString()+" got "+col.getUCAVersion().toString());
doAssert((col.compare("ab", "abc") < 0), "ab < abc comparison failed");
doAssert((col.compare("ab", "AB") < 0), "ab < AB comparison failed");

View File

@ -1,10 +1,10 @@
# NormalizationCorrections-6.2.0.txt
# Date: 2012-05-15, 22:25:00 GMT [KW, LI]
# NormalizationCorrections-6.3.0.txt
# Date: 2013-01-02, 08:39:00 GMT [KW, LI]
#
# This file is a normative contributory data file in the
# Unicode Character Database.
#
# Copyright (c) 1991-2012 Unicode, Inc.
# Copyright (c) 1991-2013 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# The normalization stability policy of the Unicode Consortium

View File

@ -1,8 +1,8 @@
# NormalizationTest-6.2.0.txt
# Date: 2012-08-14, 17:54:58 GMT [MD]
# NormalizationTest-6.3.0.txt
# Date: 2012-12-20, 22:18:30 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2012 Unicode, Inc.
# Copyright (c) 1991-2013 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#

View File

@ -1,8 +1,8 @@
# SpecialCasing-6.2.0.txt
# Date: 2012-05-23, 20:35:15 GMT [MD]
# SpecialCasing-6.3.0.txt
# Date: 2013-03-12, 22:36:00 GMT [LI temp]
#
# Unicode Character Database
# Copyright (c) 1991-2012 Unicode, Inc.
# Copyright (c) 1991-2013 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -39,7 +39,7 @@
# A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
#
# A context for a character C is defined by Section 3.13 Default Case
# Operations, of The Unicode Standard, Version 5.0.
# Algorithms, of The Unicode Standard, Version 6.3.
# (This is identical to the context defined by Unicode 4.1.0,
# as specified in http://www.unicode.org/versions/Unicode4.1.0/)
#
@ -273,4 +273,3 @@ FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
# EOF

View File

@ -1509,6 +1509,7 @@
0619;ARABIC SMALL DAMMA;Mn;31;NSM;;;;;N;;;;;
061A;ARABIC SMALL KASRA;Mn;32;NSM;;;;;N;;;;;
061B;ARABIC SEMICOLON;Po;0;AL;;;;;N;;;;;
061C;ARABIC LETTER MARK;Cf;0;AL;;;;;N;;;;;
061E;ARABIC TRIPLE DOT PUNCTUATION MARK;Po;0;AL;;;;;N;;;;;
061F;ARABIC QUESTION MARK;Po;0;AL;;;;;N;;;;;
0620;ARABIC LETTER KASHMIRI YEH;Lo;0;AL;;;;;N;;;;;
@ -5296,7 +5297,7 @@
180B;MONGOLIAN FREE VARIATION SELECTOR ONE;Mn;0;NSM;;;;;N;;;;;
180C;MONGOLIAN FREE VARIATION SELECTOR TWO;Mn;0;NSM;;;;;N;;;;;
180D;MONGOLIAN FREE VARIATION SELECTOR THREE;Mn;0;NSM;;;;;N;;;;;
180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;;
180E;MONGOLIAN VOWEL SEPARATOR;Cf;0;BN;;;;;N;;;;;
1810;MONGOLIAN DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
1811;MONGOLIAN DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
1812;MONGOLIAN DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
@ -5751,7 +5752,7 @@
1A18;BUGINESE VOWEL SIGN U;Mn;220;NSM;;;;;N;;;;;
1A19;BUGINESE VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
1A1A;BUGINESE VOWEL SIGN O;Mc;0;L;;;;;N;;;;;
1A1B;BUGINESE VOWEL SIGN AE;Mc;0;L;;;;;N;;;;;
1A1B;BUGINESE VOWEL SIGN AE;Mn;0;NSM;;;;;N;;;;;
1A1E;BUGINESE PALLAWA;Po;0;L;;;;;N;;;;;
1A1F;BUGINESE END OF SECTION;Po;0;L;;;;;N;;;;;
1A20;TAI THAM LETTER HIGH KA;Lo;0;L;;;;;N;;;;;
@ -7116,6 +7117,10 @@
2062;INVISIBLE TIMES;Cf;0;BN;;;;;N;;;;;
2063;INVISIBLE SEPARATOR;Cf;0;BN;;;;;N;;;;;
2064;INVISIBLE PLUS;Cf;0;BN;;;;;N;;;;;
2066;LEFT-TO-RIGHT ISOLATE;Cf;0;LRI;;;;;N;;;;;
2067;RIGHT-TO-LEFT ISOLATE;Cf;0;RLI;;;;;N;;;;;
2068;FIRST STRONG ISOLATE;Cf;0;FSI;;;;;N;;;;;
2069;POP DIRECTIONAL ISOLATE;Cf;0;PDI;;;;;N;;;;;
206A;INHIBIT SYMMETRIC SWAPPING;Cf;0;BN;;;;;N;;;;;
206B;ACTIVATE SYMMETRIC SWAPPING;Cf;0;BN;;;;;N;;;;;
206C;INHIBIT ARABIC FORM SHAPING;Cf;0;BN;;;;;N;;;;;
@ -18740,8 +18745,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
12453;CUNEIFORM NUMERIC SIGN FOUR BAN2 VARIANT FORM;Nl;0;L;;;;4;N;;;;;
12454;CUNEIFORM NUMERIC SIGN FIVE BAN2;Nl;0;L;;;;5;N;;;;;
12455;CUNEIFORM NUMERIC SIGN FIVE BAN2 VARIANT FORM;Nl;0;L;;;;5;N;;;;;
12456;CUNEIFORM NUMERIC SIGN NIGIDAMIN;Nl;0;L;;;;-1;N;;;;;
12457;CUNEIFORM NUMERIC SIGN NIGIDAESH;Nl;0;L;;;;-1;N;;;;;
12456;CUNEIFORM NUMERIC SIGN NIGIDAMIN;Nl;0;L;;;;2;N;;;;;
12457;CUNEIFORM NUMERIC SIGN NIGIDAESH;Nl;0;L;;;;3;N;;;;;
12458;CUNEIFORM NUMERIC SIGN ONE ESHE3;Nl;0;L;;;;1;N;;;;;
12459;CUNEIFORM NUMERIC SIGN TWO ESHE3;Nl;0;L;;;;2;N;;;;;
1245A;CUNEIFORM NUMERIC SIGN ONE THIRD DISH;Nl;0;L;;;;1/3;N;;;;;

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines Corporation and
* Copyright (C) 2010-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -27,6 +27,11 @@ public class BiDiConformanceTest extends TestFmwk {
public BiDiConformanceTest() {}
public void TestBidiTest() throws IOException {
if(logKnownIssue("10142",
"Update the ICU BiDi code to implement the additions in the " +
"Unicode 6.3 BiDi Algorithm, and reenable the BiDi conformance test.")) {
return;
}
BufferedReader bidiTestFile=TestUtil.getDataReader("unicode/BidiTest.txt");
Bidi ubidi=new Bidi();
ubidi.setCustomClassifier(new ConfTestBidiClassifier());
@ -140,7 +145,12 @@ outerLoop:
0x4f, // 'O' for RLO
0x2a, // '*' for PDF
0x60, // '`' for NSM
0x7c // '|' for BN
0x7c, // '|' for BN
// new in Unicode 6.3/ICU 52
0x53, // 'S' for FSI
0x69, // 'i' for LRI
0x49, // 'I' for RLI
0x2e // '.' for PDI
};
private class ConfTestBidiClassifier extends BidiClassifier {
public ConfTestBidiClassifier() {
@ -159,7 +169,7 @@ outerLoop:
}
}
private static final int biDiClassNameLengths[]={
1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 0
1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 0
};
private void parseInputStringFromBiDiClasses() {
inputStringBuilder.delete(0, 0x7fffffff);
@ -178,6 +188,8 @@ outerLoop:
if((lineIndex+2)<line.length() && line.charAt(lineIndex+1)=='R') {
if((c2=line.charAt(lineIndex+2))=='E') {
biDiClass=UCharacterDirection.LEFT_TO_RIGHT_EMBEDDING;
} else if(line.charAt(lineIndex+2)=='I') {
biDiClass=UCharacterDirection.LEFT_TO_RIGHT_ISOLATE;
} else if(c2=='O') {
biDiClass=UCharacterDirection.LEFT_TO_RIGHT_OVERRIDE;
}
@ -188,6 +200,8 @@ outerLoop:
if((lineIndex+2)<line.length() && line.charAt(lineIndex+1)=='L') {
if((c2=line.charAt(lineIndex+2))=='E') {
biDiClass=UCharacterDirection.RIGHT_TO_LEFT_EMBEDDING;
} else if(line.charAt(lineIndex+2)=='I') {
biDiClass=UCharacterDirection.RIGHT_TO_LEFT_ISOLATE;
} else if(c2=='O') {
biDiClass=UCharacterDirection.RIGHT_TO_LEFT_OVERRIDE;
}
@ -226,12 +240,18 @@ outerLoop:
biDiClass=UCharacterDirection.WHITE_SPACE_NEUTRAL;
} else if(c0=='O' && (lineIndex+1)<line.length() && line.charAt(lineIndex+1)=='N') {
biDiClass=UCharacterDirection.OTHER_NEUTRAL;
} else if(c0=='P' && (lineIndex+2)<line.length() &&
line.charAt(lineIndex+1)=='D' && line.charAt(lineIndex+2)=='F') {
biDiClass=UCharacterDirection.POP_DIRECTIONAL_FORMAT;
} else if(c0=='P' && (lineIndex+2)<line.length() && line.charAt(lineIndex+1)=='D') {
if(line.charAt(lineIndex+2)=='F') {
biDiClass=UCharacterDirection.POP_DIRECTIONAL_FORMAT;
} else if(line.charAt(lineIndex+2)=='I') {
biDiClass=UCharacterDirection.POP_DIRECTIONAL_ISOLATE;
}
} else if(c0=='N' && (lineIndex+2)<line.length() &&
line.charAt(lineIndex+1)=='S' && line.charAt(lineIndex+2)=='M') {
biDiClass=UCharacterDirection.DIR_NON_SPACING_MARK;
} else if(c0=='F' && (lineIndex+2)<line.length() &&
line.charAt(lineIndex+1)=='S' && line.charAt(lineIndex+2)=='I') {
biDiClass=UCharacterDirection.FIRST_STRONG_ISOLATE;
}
// Now we verify that the class name is terminated properly,
// and not just the start of a longer word.

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2001-2010, International Business Machines
* Copyright (C) 2001-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
@ -28,7 +28,9 @@ public class BidiTest extends TestFmwk {
/* L R EN ES ET AN CS B S WS ON */
0x61, 0x5d0, 0x30, 0x2f, 0x25, 0x660, 0x2c, 0xa, 0x9, 0x20, 0x26,
/* LRE LRO AL RLE RLO PDF NSM BN */
0x202a, 0x202d, 0x627, 0x202b, 0x202e, 0x202c, 0x308, 0x200c
0x202a, 0x202d, 0x627, 0x202b, 0x202e, 0x202c, 0x308, 0x200c,
/* FSI LRI RLI PDI */
0x2068, 0x2066, 0x2067, 0x2069 /* new in Unicode 6.3/ICU 52 */
};
static {

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2001-2007, International Business Machines
* Copyright (C) 2001-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
@ -37,11 +37,16 @@ public class TestData {
protected static final int PDF = UCharacterDirection.POP_DIRECTIONAL_FORMAT;
protected static final int NSM = UCharacterDirection.DIR_NON_SPACING_MARK;
protected static final int BN = UCharacterDirection.BOUNDARY_NEUTRAL;
protected static final int FSI = UCharacterDirection.FIRST_STRONG_ISOLATE;
protected static final int LRI = UCharacterDirection.LEFT_TO_RIGHT_ISOLATE;
protected static final int RLI = UCharacterDirection.RIGHT_TO_LEFT_ISOLATE;
protected static final int PDI = UCharacterDirection.POP_DIRECTIONAL_ISOLATE;
protected static final int DEF = Bidi.CLASS_DEFAULT;
protected static final String[] dirPropNames = {
"L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S", "WS", "ON",
"LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
"LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN",
"FSI", "LRI", "RLI", "PDI" /* new in Unicode 6.3/ICU 52 */
};
protected static final short[][] testDirProps = {
{ L, L, WS, L, WS, EN, L, B }, // 0

View File

@ -579,6 +579,8 @@ public class TestUScript extends TestFmwk {
"Afak", "Jurc", "Mroo", "Nshu", "Sharada", "Sora_Sompeng", "Takri", "Tang", "Wole",
/* new in ICU 49 */
"Hluw", "Khoj", "Tirh",
/* new in ICU 52 */
"Aghb", "Mahj"
};
String[] expectedShort = new String[]{
"Bali", "Batk", "Blis", "Brah", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyp",
@ -601,6 +603,8 @@ public class TestUScript extends TestFmwk {
"Afak", "Jurc", "Mroo", "Nshu", "Shrd", "Sora", "Takr", "Tang", "Wole",
/* new in ICU 49 */
"Hluw", "Khoj", "Tirh",
/* new in ICU 52 */
"Aghb", "Mahj"
};
if(expectedLong.length!=(UScript.CODE_LIMIT-UScript.BALINESE)) {
errln("need to add new script codes in lang.TestUScript.java!");

View File

@ -1,7 +1,7 @@
/**
*******************************************************************************
* Copyright (C) 2001-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 2001-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.test.lang;
@ -65,6 +65,10 @@ public class UCharacterDirectionTest extends TestFmwk
"Pop Directional Format",
"Non-Spacing Mark",
"Boundary Neutral",
"First Strong Isolate",
"Left-to-Right Isolate",
"Right-to-Left Isolate",
"Pop Directional Isolate",
"Unassigned"};
for (int i = UCharacterDirection.LEFT_TO_RIGHT;

View File

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 1996-2012, International Business Machines Corporation and
* Copyright (C) 1996-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -46,7 +46,7 @@ public final class UCharacterTest extends TestFmwk
/**
* ICU4J data version number
*/
private final VersionInfo VERSION_ = VersionInfo.getInstance("6.2.0.0");
private final VersionInfo VERSION_ = VersionInfo.getInstance("6.3.0.0");
// constructor ===================================================
@ -534,6 +534,16 @@ public final class UCharacterTest extends TestFmwk
if(c3!=start) {
errln("getMirror() does not roundtrip: U+"+hex(start)+"->U+"+hex(c2)+"->U+"+hex(c3));
}
c3=UCharacter.getBidiPairedBracket(start);
if(UCharacter.getIntPropertyValue(start, UProperty.BIDI_PAIRED_BRACKET_TYPE)==UCharacter.BidiPairedBracketType.NONE) {
if(c3!=start) {
errln("u_getBidiPairedBracket(U+"+hex(start)+") != self for bpt(c)==None");
}
} else {
if(c3!=c2) {
errln("u_getBidiPairedBracket(U+"+hex(start)+") != U+"+hex(c2)+" = bmg(c)'");
}
}
} while(++start<=end);
}
@ -673,10 +683,10 @@ public final class UCharacterTest extends TestFmwk
final String TYPE =
"LuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf";
// directory types used in the UnicodeData file
// directionality types used in the UnicodeData file
// padded by spaces to make each type size 4
final String DIR =
"L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN ";
"L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN FSI LRI RLI PDI ";
Normalizer2 nfc = Normalizer2.getNFCInstance();
Normalizer2 nfkc = Normalizer2.getNFKCInstance();
@ -802,7 +812,7 @@ public final class UCharacterTest extends TestFmwk
}
int i=UCharacter.getIntPropertyValue(ch, UProperty.DECOMPOSITION_TYPE);
assertEquals(
String.format("error: u_getIntPropertyValue(U+%04x, UCHAR_DECOMPOSITION_TYPE) is wrong", ch),
String.format("error: UCharacter.getIntPropertyValue(U+%04x, UProperty.DECOMPOSITION_TYPE) is wrong", ch),
dt, i);
/* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
String mapping=nfkc.getRawDecomposition(ch);
@ -1492,6 +1502,8 @@ public final class UCharacterTest extends TestFmwk
{ 0x07C0, UCharacterDirection.RIGHT_TO_LEFT_ARABIC },
{ 0x08A0, UCharacterDirection.RIGHT_TO_LEFT },
{ 0x0900, UCharacterDirection.RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
{ 0x20A0, UCharacterDirection.LEFT_TO_RIGHT },
{ 0x20D0, UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
{ 0xFB1D, UCharacterDirection.LEFT_TO_RIGHT },
{ 0xFB50, UCharacterDirection.RIGHT_TO_LEFT },
{ 0xFE00, UCharacterDirection.RIGHT_TO_LEFT_ARABIC },
@ -2067,6 +2079,20 @@ public final class UCharacterTest extends TestFmwk
{ 0x08ba, UProperty.BIDI_CLASS, UCharacterDirection.RIGHT_TO_LEFT_ARABIC },
{ 0x1eee4, UProperty.BIDI_CLASS, UCharacterDirection.RIGHT_TO_LEFT_ARABIC },
{ -1, 0x630, 0 }, /* version break for Unicode 6.3 */
/* unassigned code points in the currency symbols block now default to ET */
{ 0x20C0, UProperty.BIDI_CLASS, UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR },
{ 0x20CF, UProperty.BIDI_CLASS, UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR },
/* new property in Unicode 6.3 */
{ 0x0027, UProperty.BIDI_PAIRED_BRACKET_TYPE, UCharacter.BidiPairedBracketType.NONE },
{ 0x0028, UProperty.BIDI_PAIRED_BRACKET_TYPE, UCharacter.BidiPairedBracketType.OPEN },
{ 0x0029, UProperty.BIDI_PAIRED_BRACKET_TYPE, UCharacter.BidiPairedBracketType.CLOSE },
{ 0xFF5C, UProperty.BIDI_PAIRED_BRACKET_TYPE, UCharacter.BidiPairedBracketType.NONE },
{ 0xFF5B, UProperty.BIDI_PAIRED_BRACKET_TYPE, UCharacter.BidiPairedBracketType.OPEN },
{ 0xFF5D, UProperty.BIDI_PAIRED_BRACKET_TYPE, UCharacter.BidiPairedBracketType.CLOSE },
/* undefined UProperty values */
{ 0x61, 0x4a7, 0 },
{ 0x234bc, 0x15ed, 0 }
@ -2121,6 +2147,9 @@ public final class UCharacterTest extends TestFmwk
if(UCharacter.getIntPropertyMaxValue(UProperty.WORD_BREAK)!=UCharacter.WordBreak.COUNT-1) {
errln("error: UCharacter.getIntPropertyMaxValue(UProperty.WORD_BREAK) wrong\n");
}
if(UCharacter.getIntPropertyMaxValue(UProperty.BIDI_PAIRED_BRACKET_TYPE)!=UCharacter.BidiPairedBracketType.COUNT-1) {
errln("error: UCharacter.getIntPropertyMaxValue(UProperty.BIDI_PAIRED_BRACKET_TYPE) wrong\n");
}
/*JB#2410*/
if( UCharacter.getIntPropertyMaxValue(0x2345)!=-1) {
errln("error: UCharacter.getIntPropertyMaxValue(0x2345) wrong\n");
@ -2227,8 +2256,6 @@ public final class UCharacterTest extends TestFmwk
// where UCharacter.NO_NUMERIC_VALUE is turned into -1.
// getNumericValue() returns -2 if the code point has a value
// which is not a non-negative integer. (This is mostly auto-converted to -2.)
{ 0x12456, UCharacter.NumericType.NUMERIC, -1. },
{ 0x12457, UCharacter.NumericType.NUMERIC, -1. },
{ 0x0F33, UCharacter.NumericType.NUMERIC, -1./2. },
{ 0x0C66, UCharacter.NumericType.DECIMAL, 0 },
{ 0x96f6, UCharacter.NumericType.NUMERIC, 0 },
@ -2389,6 +2416,32 @@ public final class UCharacterTest extends TestFmwk
}
}
/**
 * Cross-checks the Bidi_Paired_Bracket_Type (bpt) property against related properties.
 *
 * Background, as stated in BidiBrackets-6.3.0.txt: the code points listed there were
 * originally derived from General_Category (gc), Bidi_Class (bc), Bidi_Mirrored (Bidi_M)
 * and Bidi_Mirroring_Glyph (bmg). Two characters A and B form a pair if A has gc=Ps and
 * B has gc=Pe, both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket maps
 * A to B and vice versa, and their bpt values are Open and Close, respectively.
 */
public void TestBidiPairedBracketType() {
    // All code points whose bpt value is something other than None.
    UnicodeSet pairedBrackets = new UnicodeSet("[:^bpt=n:]");
    boolean hasPairedBrackets = !pairedBrackets.isEmpty();
    assertTrue("bpt!=None is not empty", hasPairedBrackets);

    // Invariants that are expected to hold in every Unicode version:
    // every paired bracket is mirrored and has Bidi_Class Other_Neutral.
    UnicodeSet bidiMirrored = new UnicodeSet("[:Bidi_M:]");
    UnicodeSet bidiClassON = new UnicodeSet("[:bc=ON:]");
    boolean allMirrored = bidiMirrored.containsAll(pairedBrackets);
    assertTrue("bpt!=None is a subset of Bidi_M", allMirrored);
    boolean allOtherNeutral = bidiClassON.containsAll(pairedBrackets);
    assertTrue("bpt!=None is a subset of bc=ON", allOtherNeutral);

    // Relationships that hold at least initially in Unicode 6.3:
    // opening brackets are Ps, closing brackets are Pe.
    UnicodeSet openBrackets = new UnicodeSet("[:bpt=o:]");
    UnicodeSet closeBrackets = new UnicodeSet("[:bpt=c:]");
    UnicodeSet openPunctuation = new UnicodeSet("[:Ps:]");
    UnicodeSet closePunctuation = new UnicodeSet("[:Pe:]");
    boolean openWithinPs = openPunctuation.containsAll(openBrackets);
    assertTrue("bpt=Open is a subset of Ps", openWithinPs);
    boolean closeWithinPe = closePunctuation.containsAll(closeBrackets);
    assertTrue("bpt=Close is a subset of Pe", closeWithinPe);
}
public void TestIsBMP()
{
int ch[] = {0x0, -1, 0xffff, 0x10ffff, 0xff, 0x1ffff};

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2012, International Business Machines Corporation and
* Copyright (C) 1996-2013, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -588,6 +588,8 @@ public class RBBITest extends TestFmwk {
}
}
// TODO: Move these test cases to rbbitst.txt if they aren't there already, then remove this test. It is redundant.
public void TestTailoredBreaks() {
class TBItem {
private int type;
@ -661,7 +663,7 @@ public class RBBITest extends TestFmwk {
// KIND_WORD "en_US_POSIX"
final String posxWordText = "Can't have breaks in xx:yy or struct.field for CS-types.";
final int[] posxWordTOffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
final int[] posxWordROffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 };
final int[] posxWordROffsets = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 };
// KIND_SENTENCE "el"
final String elSentText = "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " +
"\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3";

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2003-2012 International Business Machines Corporation and
* Copyright (C) 2003-2013 International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -262,8 +262,12 @@ public class RBBITestMonkey extends TestFmwk {
UnicodeSet fCRSet;
UnicodeSet fLFSet;
UnicodeSet fNewlineSet;
UnicodeSet fRegionalIndicatorSet;
UnicodeSet fKatakanaSet;
UnicodeSet fHebrew_LetterSet;
UnicodeSet fALetterSet;
UnicodeSet fSingle_QuoteSet;
UnicodeSet fDouble_QuoteSet;
UnicodeSet fMidNumLetSet;
UnicodeSet fMidLetterSet;
UnicodeSet fMidNumSet;
@ -271,9 +275,7 @@ public class RBBITestMonkey extends TestFmwk {
UnicodeSet fFormatSet;
UnicodeSet fExtendSet;
UnicodeSet fExtendNumLetSet;
UnicodeSet fRegionalIndicatorSet;
UnicodeSet fOtherSet;
UnicodeSet fOtherSet;
UnicodeSet fDictionaryCjkSet;
@ -284,9 +286,13 @@ public class RBBITestMonkey extends TestFmwk {
fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]");
fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]");
fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]");
fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
fALetterSet.removeAll(fDictionaryCjkSet);
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
@ -294,7 +300,6 @@ public class RBBITestMonkey extends TestFmwk {
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]");
fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
fOtherSet = new UnicodeSet();
fOtherSet.complement();
@ -302,7 +307,10 @@ public class RBBITestMonkey extends TestFmwk {
fOtherSet.removeAll(fLFSet);
fOtherSet.removeAll(fNewlineSet);
fOtherSet.removeAll(fALetterSet);
fOtherSet.removeAll(fSingle_QuoteSet);
fOtherSet.removeAll(fDouble_QuoteSet);
fOtherSet.removeAll(fKatakanaSet);
fOtherSet.removeAll(fHebrew_LetterSet);
fOtherSet.removeAll(fMidLetterSet);
fOtherSet.removeAll(fMidNumSet);
fOtherSet.removeAll(fNumericSet);
@ -319,8 +327,12 @@ public class RBBITestMonkey extends TestFmwk {
fSets.add(fCRSet);
fSets.add(fLFSet);
fSets.add(fNewlineSet);
fSets.add(fRegionalIndicatorSet);
fSets.add(fHebrew_LetterSet);
fSets.add(fALetterSet);
//fSets.add(fKatakanaSet); // TODO: work out how to test katakana
fSets.add(fSingle_QuoteSet);
fSets.add(fDouble_QuoteSet);
fSets.add(fMidLetterSet);
fSets.add(fMidNumLetSet);
fSets.add(fMidNumSet);
@ -328,7 +340,6 @@ public class RBBITestMonkey extends TestFmwk {
fSets.add(fFormatSet);
fSets.add(fExtendSet);
fSets.add(fExtendNumLetSet);
fSets.add(fRegionalIndicatorSet);
fSets.add(fOtherSet);
}
@ -407,25 +418,39 @@ public class RBBITestMonkey extends TestFmwk {
break;
}
// Rule (5). ALetter x ALetter
if (fALetterSet.contains(c1) &&
fALetterSet.contains(c2)) {
// Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
(fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
continue;
}
// Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
// Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
//
if ( fALetterSet.contains(c1) &&
(fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
setContains(fALetterSet, c3)) {
if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
(fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
(setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
continue;
}
// Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
if (fALetterSet.contains(c0) &&
(fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
fALetterSet.contains(c2)) {
// Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
(fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
(fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
continue;
}
// Rule (7a) Hebrew_Letter x Single_Quote
if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
continue;
}
// Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
continue;
}
// Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
continue;
}
@ -435,29 +460,29 @@ public class RBBITestMonkey extends TestFmwk {
continue;
}
// Rule (9) ALetter x Numeric
if (fALetterSet.contains(c1) &&
fNumericSet.contains(c2)) {
// Rule (9) (ALetter | Hebrew_Letter) x Numeric
if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
fNumericSet.contains(c2)) {
continue;
}
// Rule (10) Numeric x ALetter
// Rule (10) Numeric x (ALetter | Hebrew_Letter)
if (fNumericSet.contains(c1) &&
fALetterSet.contains(c2)) {
(fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
continue;
}
// Rule (11) Numeric (MidNum | MidNumLet) x Numeric
if ( fNumericSet.contains(c0) &&
(fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
// Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
if (fNumericSet.contains(c0) &&
(fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
fNumericSet.contains(c2)) {
continue;
}
// Rule (12) Numeric x (MidNum | MidNumLet) Numeric
// Rule (12) Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
if (fNumericSet.contains(c1) &&
(fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
setContains(fNumericSet, c3)) {
(fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
setContains(fNumericSet, c3)) {
continue;
}
@ -466,19 +491,21 @@ public class RBBITestMonkey extends TestFmwk {
fKatakanaSet.contains(c2)) {
continue;
}
// Rule 13a (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) ||
// Rule 13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
fExtendNumLetSet.contains(c2)) {
continue;
}
// Rule 13b ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)
// Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
if (fExtendNumLetSet.contains(c1) &&
(fALetterSet.contains(c2) || fNumericSet.contains(c2) ||
fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) {
(fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
fNumericSet.contains(c2) || fKatakanaSet.contains(c2))) {
continue;
}
// Rule 13c Do not break between Regional Indicators.
// Regional_Indicator × Regional_Indicator
@ -1976,7 +2003,6 @@ public void TestRTWordMonkey() {
if (params.inclusion >= 9) {
loopCount = 2000;
}
logln("Word Break Monkey Test");
RBBIWordMonkey m = new RBBIWordMonkey();
BreakIterator bi = BreakIterator.getWordInstance(Locale.US);

View File

@ -711,10 +711,11 @@ Bangkok)•</data>
# UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
# Words don't include colon or period (cldrbug #1969).
# Unicode 6.3 change: colon now breaks words.
<locale en_US>
<word>
<data>•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \
<data>•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:yy<200> •or<200> •struct.field<200> \
•for<200> •CS<200>-•types<200>.•</data>
<locale en_US_POSIX>

View File

@ -529,8 +529,8 @@ public class SpoofCheckerTest extends TestFmwk {
{"アaー〆", "HIGHLY_RESTRICTIVE", "[]", "Latn Kana", "", ""},
{"a1١", "UNRESTRICTIVE", "[0٠]", "Latn", "Arab Thaa", "Arab Thaa"},
{"a1١۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Arab", "", ""},
{"١ー〆aア1१۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Kana Arab Deva", "", ""},
{"aアー〆1१١۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Kana Arab Deva", "", ""},
{"١ー〆aア1१۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Kana Arab", "Deva Kthi", "Deva Kthi"},
{"aアー〆1१١۱", "UNRESTRICTIVE", "[0٠۰]", "Latn Kana Arab", "Deva Kthi", "Deva Kthi"},
};
for (String[] test : tests) {
String testString = test[0];