diff --git a/tools/unicode/c/genprops/pnames_data.h b/tools/unicode/c/genprops/pnames_data.h index 63936fe415..0773a22329 100644 --- a/tools/unicode/c/genprops/pnames_data.h +++ b/tools/unicode/c/genprops/pnames_data.h @@ -5,7 +5,7 @@ * machine-generated by: icu/tools/unicode/py/preparseucd.py */ -#define UNICODE_VERSION { 6, 1, 0, 0 } +#define UNICODE_VERSION { 6, 2, 0, 0 } static const Value VALUES_binprop[2] = { Value(0, "N No F False"), @@ -450,7 +450,7 @@ static const Value VALUES_jt[6] = { Value(U_JT_TRANSPARENT, "T Transparent"), }; -static const Value VALUES_lb[39] = { +static const Value VALUES_lb[40] = { Value(U_LB_UNKNOWN, "XX Unknown"), Value(U_LB_AMBIGUOUS, "AI Ambiguous"), Value(U_LB_ALPHABETIC, "AL Alphabetic"), @@ -490,6 +490,7 @@ static const Value VALUES_lb[39] = { Value(U_LB_CLOSE_PARENTHESIS, "CP Close_Parenthesis"), Value(U_LB_CONDITIONAL_JAPANESE_STARTER, "CJ Conditional_Japanese_Starter"), Value(U_LB_HEBREW_LETTER, "HL Hebrew_Letter"), + Value(U_LB_ZERO_WIDTH_JOINER, "ZJ Zero_Width_Joiner"), }; static const Value VALUES_nt[4] = { @@ -810,7 +811,7 @@ static const Value VALUES_tccc[56] = { Value(240, "IS Iota_Subscript"), }; -static const Value VALUES_GCB[12] = { +static const Value VALUES_GCB[14] = { Value(U_GCB_OTHER, "XX Other"), Value(U_GCB_CONTROL, "CN Control"), Value(U_GCB_CR, "CR CR"), @@ -823,6 +824,8 @@ static const Value VALUES_GCB[12] = { Value(U_GCB_V, "V V"), Value(U_GCB_SPACING_MARK, "SM SpacingMark"), Value(U_GCB_PREPEND, "PP Prepend"), + Value(U_GCB_AFTER_JOINER, "AJ After_Joiner"), + Value(U_GCB_JOINER, "J Joiner"), }; static const Value VALUES_SB[15] = { @@ -843,7 +846,7 @@ static const Value VALUES_SB[15] = { Value(U_SB_SCONTINUE, "SC SContinue"), }; -static const Value VALUES_WB[13] = { +static const Value VALUES_WB[15] = { Value(U_WB_OTHER, "XX Other"), Value(U_WB_ALETTER, "LE ALetter"), Value(U_WB_FORMAT, "FO Format"), @@ -857,6 +860,8 @@ static const Value VALUES_WB[13] = { Value(U_WB_LF, "LF LF"), Value(U_WB_MIDNUMLET, "MB MidNumLet"), Value(U_WB_NEWLINE, "NL Newline"), + Value(U_WB_AFTER_JOINER, "AJ After_Joiner"), + Value(U_WB_JOINER, "J Joiner"), }; static const Value VALUES_gcm[38] = { @@ -966,7 +971,7 @@ static const Property PROPERTIES[94] = { Property(UCHAR_GENERAL_CATEGORY, "gc General_Category", VALUES_gc, 30), Property(UCHAR_JOINING_GROUP, "jg Joining_Group", VALUES_jg, 58), Property(UCHAR_JOINING_TYPE, "jt Joining_Type", VALUES_jt, 6), - Property(UCHAR_LINE_BREAK, "lb Line_Break", VALUES_lb, 39), + Property(UCHAR_LINE_BREAK, "lb Line_Break", VALUES_lb, 40), Property(UCHAR_NUMERIC_TYPE, "nt Numeric_Type", VALUES_nt, 4), Property(UCHAR_SCRIPT, "sc Script", VALUES_sc, 159), Property(UCHAR_HANGUL_SYLLABLE_TYPE, "hst Hangul_Syllable_Type", VALUES_hst, 6), @@ -976,9 +981,9 @@ static const Property PROPERTIES[94] = { Property(UCHAR_NFKC_QUICK_CHECK, "NFKC_QC NFKC_Quick_Check", VALUES_NFKC_QC, 3), Property(UCHAR_LEAD_CANONICAL_COMBINING_CLASS, "lccc Lead_Canonical_Combining_Class", VALUES_lccc, 56), Property(UCHAR_TRAIL_CANONICAL_COMBINING_CLASS, "tccc Trail_Canonical_Combining_Class", VALUES_tccc, 56), - Property(UCHAR_GRAPHEME_CLUSTER_BREAK, "GCB Grapheme_Cluster_Break", VALUES_GCB, 12), + Property(UCHAR_GRAPHEME_CLUSTER_BREAK, "GCB Grapheme_Cluster_Break", VALUES_GCB, 14), Property(UCHAR_SENTENCE_BREAK, "SB Sentence_Break", VALUES_SB, 15), - Property(UCHAR_WORD_BREAK, "WB Word_Break", VALUES_WB, 13), + Property(UCHAR_WORD_BREAK, "WB Word_Break", VALUES_WB, 15), Property(UCHAR_GENERAL_CATEGORY_MASK, "gcm General_Category_Mask", VALUES_gcm, 38), Property(UCHAR_NUMERIC_VALUE, "nv Numeric_Value"), Property(UCHAR_AGE, "age Age"), diff --git a/tools/unicode/py/preparseucd.py b/tools/unicode/py/preparseucd.py index db378e1f83..c8c56a1f33 100755 --- a/tools/unicode/py/preparseucd.py +++ b/tools/unicode/py/preparseucd.py @@ -2029,10 +2029,13 @@ def main(): for (basename, path, parser) in files: print "Parsing %s" % basename value = _files[basename] + # Unicode data files are in UTF-8. + charset = "UTF-8" if basename == "NamesList.txt": - in_file = codecs.open(path, "r", "ISO-8859-1") - else: - in_file = open(path, "r") + # The NamesList used to be in Latin-1 before Unicode 6.2. + numeric_ucd_version = [int(field) for field in _ucd_version.split('.')] + if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1" + in_file = codecs.open(path, "r", charset) with in_file: parser(in_file) _null_or_defaults = _null_values.copy()