ICU-2015 add uprv_getInclusions(), formerly implemented in upropset.cpp

X-SVN-Rev: 10100
2002-10-30 18:12:49 +00:00 · 2002-10-30 18:12:49 +00:00 · 72802d9da6
commit 72802d9da6
parent 4e051554eb
2 changed files with 139 additions and 0 deletions
--- a/icu4c/source/common/uprops.c
+++ b/icu4c/source/common/uprops.c
@@ -398,3 +398,132 @@ u_getIntPropertyMaxValue(UProperty which) {
        return 0; /* undefined */
    }
 }
+
+/*----------------------------------------------------------------
+ * Inclusions list
+ *----------------------------------------------------------------*/
+
+/* From UnicodeData:
+ * 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+ * 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+ * 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
+ * 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
+ * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+ * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+ * D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
+ * DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+ * DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
+ * DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+ * DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;
+ * DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+ * E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
+ * F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
+ * 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
+ * 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
+ * F0000;<Plane 15 Private Use, First>;Co;0;L;;;;;N;;;;;
+ * FFFFD;<Plane 15 Private Use, Last>;Co;0;L;;;;;N;;;;;
+ * 100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
+ * 10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
+ *
+ * Large Blocks of Unassigned: (from DerivedGeneralCategory)
+ * 1044E..1CFFF  ; Cn # [52146]
+ * 1D800..1FFFF  ; Cn # [10240]
+ * 2A6D7..2F7FF  ; Cn # [20777]
+ * 2FA1E..E0000  ; Cn # [722403]
+ * E0080..EFFFF  ; Cn # [65408]
+ *
+ * ---
+ *
+ * TODO: The Inclusion List should be generated from the UCD for each
+ * version.  Currently it is static.
+ *
+ * ---
+ *
+ * ### TODO ICU 2.4 markus Ideas for getting properties-unique code point ranges:
+ *
+ * To enumerate properties efficiently, one needs to know ranges of
+ * repetitive values, so that the value of only each start code point
+ * can be applied to the whole range.
+ * This information is in principle available in the uprops.icu data.
+ *
+ * There are two obstacles:
+ *
+ * 1. Some properties are computed from multiple data structures,
+ *    making it necessary to get repetitive ranges by intersecting
+ *    ranges from multiple tries.
+ *
+ * 2. It is not economical to write code for getting repetitive ranges
+ *    that are precise for each of some 50 properties.
+ *
+ * Compromise ideas:
+ *
+ * - Get ranges per trie, not per individual property.
+ *   Each range contains the same values for a whole group of properties.
+ *   This would generate currently five range sets, two for uprops.icu tries
+ *   and three for unorm.icu tries.
+ *
+ * - Combine sets of ranges for multiple tries to get sufficient sets
+ *   for properties, e.g., the uprops.icu main and auxiliary tries
+ *   for all non-normalization properties.
+ *
+ * Ideas for representing ranges and combining them:
+ *
+ * - A UnicodeSet could hold just the start code points of ranges.
+ *   Multiple sets are easily combined by or-ing them together.
+ *
+ * - Alternatively, a UnicodeSet could hold each even-numbered range.
+ *   All ranges could be enumerated by using each start code point
+ *   (for the even-numbered ranges) as well as each limit (end+1) code point
+ *   (for the odd-numbered ranges).
+ *   It should be possible to combine two such sets by xor-ing them,
+ *   but no more than two.
+ *
+ * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
+ * but the first one is certainly simpler and applicable for combining more than
+ * two range sets.
+ *
+ * It is possible to combine all range sets for all uprops/unorm tries into one
+ * set that can be used for all properties.
+ * As an optimization, there could be less-combined range sets for certain
+ * groups of properties.
+ * The relationship of which less-combined range set to use for which property
+ * depends on the implementation of the properties and must be hardcoded
+ * - somewhat error-prone and higher maintenance but can be tested easily
+ * by building property sets "the simple way" in test code.
+ *
+ * ---
+ *
+ * Do not use a UnicodeSet pattern because that causes infinite recursion;
+ * UnicodeSet depends on the inclusions set.
+ */
+
+U_CAPI void U_EXPORT2
+uprv_getInclusions(USet* set) {
+    /* Inclusive {first, last} runs of code points taken out of the set.
+     * Each run covers the second through last code point of a range
+     * whose first code point (one below "first") stays in the set. */
+    static const UChar32 removed[][2] = {
+        { 0x3401,   0x4DB5   },
+        { 0x4E01,   0x9FA5   },
+        { 0xAC01,   0xD7A3   },
+        { 0xD801,   0xDB7F   },
+        { 0xDB81,   0xDBFF   },
+        { 0xDC01,   0xDFFF   },
+        { 0xE001,   0xF8FF   },
+        { 0x1044F,  0x1CFFF  },
+        { 0x1D801,  0x1FFFF  },
+        { 0x20001,  0x2A6D6  },
+        { 0x2A6D8,  0x2F7FF  },
+        { 0x2FA1F,  0xE0000  },
+        { 0xE0081,  0xEFFFF  },
+        { 0xF0001,  0xFFFFD  },
+        { 0x100001, 0x10FFFD }
+    };
+    int32_t row;
+
+    /* Start from all of Unicode, then take out every listed run. */
+    uset_addRange(set, 0, 0x10FFFF);
+    for (row = 0; row < (int32_t)(sizeof(removed) / sizeof(removed[0])); ++row) {
+        uset_removeRange(set, removed[row][0], removed[row][1]);
+    }
+}
--- a/icu4c/source/common/uprops.h
+++ b/icu4c/source/common/uprops.h
@@ -271,4 +271,14 @@ uprv_getCharNameCharacters(USet* set);
 U_CAPI void U_EXPORT2
 uprv_getISOCommentCharacters(USet* set);

+/**
+ * Fill a set with every code point U+0000..U+10FFFF _except_ the second
+ * through last code points of certain ranges.  These are ranges of
+ * characters whose properties are all exactly alike, e.g. CJK
+ * Ideographs from U+4E00 to U+9FA5 (U+4E00 itself stays in the set).
+ * @param set USet to receive result.  Existing contents are lost.
+ */
+U_CAPI void U_EXPORT2
+uprv_getInclusions(USet* set);
+
 #endif