2002-12-06 01:40:42 +00:00
|
|
|
# Copyright (c) 2002, International Business Machines Corporation and
|
|
|
|
# others. All Rights Reserved.
|
2002-06-25 17:23:07 +00:00
|
|
|
#
|
|
|
|
# file: line.txt
|
|
|
|
#
|
|
|
|
# Line Breaking Rules for ICU rules based break iteration.
|
|
|
|
# Implement default line breaking as defined by Unicode TR 14.
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
# Character Classes defined by Unicode TR 14.
|
|
|
|
# These are generated by a script from the Unicode LineBreak derived
|
|
|
|
# properties file.
|
|
|
|
#
|
|
|
|
|
|
|
|
############ Start of Script-Generated Definitions #######################
|
|
|
|
|
|
|
|
$LF = [ \u000A];
|
|
|
|
|
|
|
|
$IN = [ \u2024-\u2026];
|
|
|
|
|
|
|
|
$SY = [ \u002F];
|
|
|
|
|
|
|
|
$EX = [ \u0021 \u003F \u2762-\u2763 \uFE56-\uFE57 \uFF01 \uFF1F];
|
|
|
|
|
|
|
|
$BA = [ \u0009 \u007C \u00AD \u058A \u0F0B \u1361 \u1680 \u17D5 \u2000-\u2006
|
|
|
|
\u2008-\u200A \u2010 \u2012-\u2013 \u2027 \u205F];
|
|
|
|
|
|
|
|
$IS = [ \u002C \u002E \u003A-\u003B \u0589];
|
|
|
|
|
|
|
|
$BB = [ \u00B4 \u02C8 \u02CC \u1806];
|
|
|
|
|
|
|
|
$SA = [ \u0E01-\u0E30 \u0E32-\u0E33 \u0E40-\u0E46 \u0E81-\u0E82 \u0E84 \u0E87-\u0E88
|
|
|
|
\u0E8A \u0E8D \u0E94-\u0E97 \u0E99-\u0E9F \u0EA1-\u0EA3 \u0EA5
|
|
|
|
\u0EA7 \u0EAA-\u0EAB \u0EAD-\u0EB0 \u0EB2-\u0EB3 \u0EBD \u0EC0-\u0EC4
|
|
|
|
\u0EC6 \u0EDC-\u0EDD \u1000-\u1021 \u1023-\u1027 \u1029-\u102A
|
|
|
|
\u1050-\u1055 \u1780-\u17B3];
|
|
|
|
|
|
|
|
$CB = [ \uFFFC];
|
|
|
|
|
|
|
|
$XX = [ \uE000-\uF8FF \U000F0000-\U000FFFFD \U00100000-\U0010FFFD];
|
|
|
|
|
|
|
|
$HY = [ \u002D];
|
|
|
|
|
|
|
|
$AI = [ \u00A1 \u00A7-\u00A8 \u00AA \u00B2-\u00B3 \u00B6-\u00BA \u00BC-\u00BF
|
|
|
|
\u00C6 \u00D0 \u00D7-\u00D8 \u00DE-\u00E1 \u00E6 \u00E8-\u00EA
|
|
|
|
\u00EC-\u00ED \u00F0 \u00F2-\u00F3 \u00F7-\u00FA \u00FC \u00FE
|
|
|
|
\u0101 \u0111 \u0113 \u011B \u0126-\u0127 \u012B \u0131-\u0133
|
|
|
|
\u0138 \u013F-\u0142 \u0144 \u0148-\u014A \u014D \u0152-\u0153
|
|
|
|
\u0166-\u0167 \u016B \u01CE \u01D0 \u01D2 \u01D4 \u01D6 \u01D8
|
|
|
|
\u01DA \u01DC \u0251 \u0261 \u02C7 \u02C9-\u02CB \u02CD \u02D0
|
|
|
|
\u02D8-\u02DB \u02DD \u0391-\u03A1 \u03A3-\u03A9 \u03B1-\u03C1
|
|
|
|
\u03C3-\u03C9 \u0401 \u0410-\u044F \u0451 \u2015-\u2016 \u2020-\u2021
|
|
|
|
\u203B \u2074 \u207F \u2081-\u2084 \u2105 \u2113 \u2121-\u2122
|
|
|
|
\u212B \u2140 \u2154-\u2155 \u215B \u215E \u2160-\u216B \u2170-\u2179
|
|
|
|
\u2190-\u2199 \u21D2 \u21D4 \u2200 \u2202-\u2203 \u2207-\u2208
|
|
|
|
\u220B \u220F \u2211 \u2215 \u221A \u221D-\u2220 \u2223 \u2225
|
|
|
|
\u2227-\u222C \u222E \u2234-\u2237 \u223C-\u223D \u2248 \u224C
|
|
|
|
\u2252 \u2260-\u2261 \u2264-\u2267 \u226A-\u226B \u226E-\u226F
|
|
|
|
\u2282-\u2283 \u2286-\u2287 \u2295 \u2299 \u22A5 \u22BF \u2312
|
|
|
|
\u2460-\u24BF \u24D0-\u24E9 \u24EB-\u24FE \u2500-\u254B \u2550-\u2574
|
|
|
|
\u2580-\u258F \u2592-\u2595 \u25A0-\u25A1 \u25A3-\u25A9 \u25B2-\u25B3
|
|
|
|
\u25B6-\u25B7 \u25BC-\u25BD \u25C0-\u25C1 \u25C6-\u25C8 \u25CB
|
|
|
|
\u25CE-\u25D1 \u25E2-\u25E5 \u25EF \u2605-\u2606 \u2609 \u260E-\u260F
|
|
|
|
\u2616-\u2617 \u261C \u261E \u2640 \u2642 \u2660-\u2661 \u2663-\u2665
|
|
|
|
\u2667-\u266A \u266C-\u266D \u266F \uFFFD];
|
|
|
|
|
|
|
|
$ZW = [ \u200B];
|
|
|
|
|
|
|
|
$SG = [ \uD800-\uDFFF];
|
|
|
|
|
|
|
|
$AL = [ \u0023 \u0026 \u002A \u003C-\u003E \u0040-\u005A \u005E-\u007A \u007E
|
|
|
|
\u00A6 \u00A9 \u00AC \u00AE-\u00AF \u00B5 \u00C0-\u00C5 \u00C7-\u00CF
|
|
|
|
\u00D1-\u00D6 \u00D9-\u00DD \u00E2-\u00E5 \u00E7 \u00EB \u00EE-\u00EF
|
|
|
|
\u00F1 \u00F4-\u00F6 \u00FB \u00FD \u00FF-\u0100 \u0102-\u0110
|
|
|
|
\u0112 \u0114-\u011A \u011C-\u0125 \u0128-\u012A \u012C-\u0130
|
|
|
|
\u0134-\u0137 \u0139-\u013E \u0143 \u0145-\u0147 \u014B-\u014C
|
|
|
|
\u014E-\u0151 \u0154-\u0165 \u0168-\u016A \u016C-\u01CD \u01CF
|
|
|
|
\u01D1 \u01D3 \u01D5 \u01D7 \u01D9 \u01DB \u01DD-\u0220 \u0222-\u0233
|
|
|
|
\u0250 \u0252-\u0260 \u0262-\u02AD \u02B0-\u02C6 \u02CE-\u02CF
|
|
|
|
\u02D1-\u02D7 \u02DC \u02DE-\u02EE \u0374-\u0375 \u037A \u037E
|
|
|
|
\u0384-\u038A \u038C \u038E-\u0390 \u03AA-\u03B0 \u03C2 \u03CA-\u03CE
|
|
|
|
\u03D0-\u03F6 \u0400 \u0402-\u040F \u0450 \u0452-\u0482 \u048A-\u04CE
|
|
|
|
\u04D0-\u04F5 \u04F8-\u04F9 \u0500-\u050F \u0531-\u0556 \u0559-\u055F
|
|
|
|
\u0561-\u0587 \u05BE \u05C0 \u05C3 \u05D0-\u05EA \u05F0-\u05F4
|
|
|
|
\u060C \u061B \u061F \u0621-\u063A \u0640-\u064A \u066A-\u066F
|
|
|
|
\u0671-\u06D5 \u06E5-\u06E6 \u06E9 \u06FA-\u06FE \u0700-\u070D
|
|
|
|
\u0710 \u0712-\u072C \u0780-\u07A5 \u07B1 \u0905-\u0939 \u093D
|
|
|
|
\u0950 \u0958-\u0961 \u0964-\u0965 \u0970 \u0985-\u098C \u098F-\u0990
|
|
|
|
\u0993-\u09A8 \u09AA-\u09B0 \u09B2 \u09B6-\u09B9 \u09DC-\u09DD
|
|
|
|
\u09DF-\u09E1 \u09F0-\u09F1 \u09F4-\u09FA \u0A05-\u0A0A \u0A0F-\u0A10
|
|
|
|
\u0A13-\u0A28 \u0A2A-\u0A30 \u0A32-\u0A33 \u0A35-\u0A36 \u0A38-\u0A39
|
|
|
|
\u0A59-\u0A5C \u0A5E \u0A72-\u0A74 \u0A85-\u0A8B \u0A8D \u0A8F-\u0A91
|
|
|
|
\u0A93-\u0AA8 \u0AAA-\u0AB0 \u0AB2-\u0AB3 \u0AB5-\u0AB9 \u0ABD
|
|
|
|
\u0AD0 \u0AE0 \u0B05-\u0B0C \u0B0F-\u0B10 \u0B13-\u0B28 \u0B2A-\u0B30
|
|
|
|
\u0B32-\u0B33 \u0B36-\u0B39 \u0B3D \u0B5C-\u0B5D \u0B5F-\u0B61
|
|
|
|
\u0B70 \u0B83 \u0B85-\u0B8A \u0B8E-\u0B90 \u0B92-\u0B95 \u0B99-\u0B9A
|
|
|
|
\u0B9C \u0B9E-\u0B9F \u0BA3-\u0BA4 \u0BA8-\u0BAA \u0BAE-\u0BB5
|
|
|
|
\u0BB7-\u0BB9 \u0BF0-\u0BF2 \u0C05-\u0C0C \u0C0E-\u0C10 \u0C12-\u0C28
|
|
|
|
\u0C2A-\u0C33 \u0C35-\u0C39 \u0C60-\u0C61 \u0C85-\u0C8C \u0C8E-\u0C90
|
|
|
|
\u0C92-\u0CA8 \u0CAA-\u0CB3 \u0CB5-\u0CB9 \u0CDE \u0CE0-\u0CE1
|
|
|
|
\u0D05-\u0D0C \u0D0E-\u0D10 \u0D12-\u0D28 \u0D2A-\u0D39 \u0D60-\u0D61
|
|
|
|
\u0D85-\u0D96 \u0D9A-\u0DB1 \u0DB3-\u0DBB \u0DBD \u0DC0-\u0DC6
|
|
|
|
\u0DF4 \u0E4F \u0F00-\u0F0A \u0F0D-\u0F17 \u0F1A-\u0F1F \u0F2A-\u0F34
|
|
|
|
\u0F36 \u0F38 \u0F40-\u0F47 \u0F49-\u0F6A \u0F85 \u0F88-\u0F8B
|
|
|
|
\u0FBE-\u0FC5 \u0FC7-\u0FCC \u0FCF \u104A-\u104F \u10A0-\u10C5
|
|
|
|
\u10D0-\u10F8 \u10FB \u1200-\u1206 \u1208-\u1246 \u1248 \u124A-\u124D
|
|
|
|
\u1250-\u1256 \u1258 \u125A-\u125D \u1260-\u1286 \u1288 \u128A-\u128D
|
|
|
|
\u1290-\u12AE \u12B0 \u12B2-\u12B5 \u12B8-\u12BE \u12C0 \u12C2-\u12C5
|
|
|
|
\u12C8-\u12CE \u12D0-\u12D6 \u12D8-\u12EE \u12F0-\u130E \u1310
|
|
|
|
\u1312-\u1315 \u1318-\u131E \u1320-\u1346 \u1348-\u135A \u1362-\u1368
|
|
|
|
\u1372-\u137C \u13A0-\u13F4 \u1401-\u1676 \u1681-\u169A \u16A0-\u16F0
|
|
|
|
\u1700-\u170C \u170E-\u1711 \u1720-\u1731 \u1735-\u1736 \u1740-\u1751
|
|
|
|
\u1760-\u176C \u176E-\u1770 \u17DC \u1800-\u1805 \u1807-\u180A
|
|
|
|
\u1820-\u1877 \u1880-\u18A8 \u1E00-\u1E9B \u1EA0-\u1EF9 \u1F00-\u1F15
|
|
|
|
\u1F18-\u1F1D \u1F20-\u1F45 \u1F48-\u1F4D \u1F50-\u1F57 \u1F59
|
|
|
|
\u1F5B \u1F5D \u1F5F-\u1F7D \u1F80-\u1FB4 \u1FB6-\u1FC4 \u1FC6-\u1FD3
|
|
|
|
\u1FD6-\u1FDB \u1FDD-\u1FEF \u1FF2-\u1FF4 \u1FF6-\u1FFE \u2017
|
|
|
|
\u2022-\u2023 \u2038 \u203D-\u2043 \u2047-\u2052 \u2057 \u2061-\u2063
|
|
|
|
\u2070-\u2071 \u2075-\u207C \u2080 \u2085-\u208C \u2100-\u2102
|
|
|
|
\u2104 \u2106-\u2108 \u210A-\u2112 \u2114-\u2115 \u2117-\u2120
|
|
|
|
\u2123-\u2125 \u2127-\u212A \u212C-\u213A \u213D-\u213F \u2141-\u214B
|
|
|
|
\u2153 \u2156-\u215A \u215C-\u215D \u215F \u216C-\u216F \u217A-\u2183
|
|
|
|
\u219A-\u21D1 \u21D3 \u21D5-\u21FF \u2201 \u2204-\u2206 \u2209-\u220A
|
|
|
|
\u220C-\u220E \u2210 \u2214 \u2216-\u2219 \u221B-\u221C \u2221-\u2222
|
|
|
|
\u2224 \u2226 \u222D \u222F-\u2233 \u2238-\u223B \u223E-\u2247
|
|
|
|
\u2249-\u224B \u224D-\u2251 \u2253-\u225F \u2262-\u2263 \u2268-\u2269
|
|
|
|
\u226C-\u226D \u2270-\u2281 \u2284-\u2285 \u2288-\u2294 \u2296-\u2298
|
|
|
|
\u229A-\u22A4 \u22A6-\u22BE \u22C0-\u2311 \u2313-\u2328 \u232B-\u23B3
|
|
|
|
\u23B7-\u23CE \u2400-\u2426 \u2440-\u244A \u24C0-\u24CF \u24EA
|
|
|
|
\u254C-\u254F \u2575-\u257F \u2590-\u2591 \u2596-\u259F \u25A2
|
|
|
|
\u25AA-\u25B1 \u25B4-\u25B5 \u25B8-\u25BB \u25BE-\u25BF \u25C2-\u25C5
|
|
|
|
\u25C9-\u25CA \u25CC-\u25CD \u25D2-\u25E1 \u25E6-\u25EE \u25F0-\u2604
|
|
|
|
\u2607-\u2608 \u260A-\u260D \u2610-\u2613 \u2619-\u261B \u261D
|
|
|
|
\u261F-\u263F \u2641 \u2643-\u265F \u2662 \u2666 \u266B \u266E
|
|
|
|
\u2670-\u267D \u2680-\u2689 \u2701-\u2704 \u2706-\u2709 \u270C-\u2727
|
|
|
|
\u2729-\u274B \u274D \u274F-\u2752 \u2756 \u2758-\u275A \u2761
|
|
|
|
\u2764-\u2767 \u2776-\u2794 \u2798-\u27AF \u27B1-\u27BE \u27D0-\u27E5
|
|
|
|
\u27F0-\u2982 \u2999-\u29D7 \u29DC-\u29FB \u29FE-\u2AFF \uFB00-\uFB06
|
|
|
|
\uFB13-\uFB17 \uFB1D \uFB1F-\uFB36 \uFB38-\uFB3C \uFB3E \uFB40-\uFB41
|
|
|
|
\uFB43-\uFB44 \uFB46-\uFBB1 \uFBD3-\uFD3D \uFD50-\uFD8F \uFD92-\uFDC7
|
|
|
|
\uFDF0-\uFDFB \uFE70-\uFE74 \uFE76-\uFEFC \uFF66 \uFF71-\uFF9D
|
|
|
|
\uFFA0-\uFFBE \uFFC2-\uFFC7 \uFFCA-\uFFCF \uFFD2-\uFFD7 \uFFDA-\uFFDC
|
|
|
|
\uFFE8-\uFFEE \U00010300-\U0001031E \U00010320-\U00010323 \U00010330-\U0001034A
|
|
|
|
\U00010400-\U00010425 \U00010428-\U0001044D \U0001D000-\U0001D0F5
|
|
|
|
\U0001D100-\U0001D126 \U0001D12A-\U0001D164 \U0001D16A-\U0001D16C
|
|
|
|
\U0001D183-\U0001D184 \U0001D18C-\U0001D1A9 \U0001D1AE-\U0001D1DD
|
|
|
|
\U0001D400-\U0001D454 \U0001D456-\U0001D49C \U0001D49E-\U0001D49F
|
|
|
|
\U0001D4A2 \U0001D4A5-\U0001D4A6 \U0001D4A9-\U0001D4AC \U0001D4AE-\U0001D4B9
|
|
|
|
\U0001D4BB \U0001D4BD-\U0001D4C0 \U0001D4C2-\U0001D4C3 \U0001D4C5-\U0001D505
|
|
|
|
\U0001D507-\U0001D50A \U0001D50D-\U0001D514 \U0001D516-\U0001D51C
|
|
|
|
\U0001D51E-\U0001D539 \U0001D53B-\U0001D53E \U0001D540-\U0001D544
|
|
|
|
\U0001D546 \U0001D54A-\U0001D550 \U0001D552-\U0001D6A3 \U0001D6A8-\U0001D7C9];
|
|
|
|
|
|
|
|
$OP = [ \u0028 \u005B \u007B \u0F3A \u0F3C \u169B \u201A \u201E \u2045 \u207D
|
|
|
|
\u208D \u2329 \u23B4 \u2768 \u276A \u276C \u276E \u2770 \u2772
|
|
|
|
\u2774 \u27E6 \u27E8 \u27EA \u2983 \u2985 \u2987 \u2989 \u298B
|
|
|
|
\u298D \u298F \u2991 \u2993 \u2995 \u2997 \u29D8 \u29DA \u29FC
|
|
|
|
\u3008 \u300A \u300C \u300E \u3010 \u3014 \u3016 \u3018 \u301A
|
|
|
|
\u301D \uFD3E \uFE35 \uFE37 \uFE39 \uFE3B \uFE3D \uFE3F \uFE41
|
|
|
|
\uFE43 \uFE59 \uFE5B \uFE5D \uFF08 \uFF3B \uFF5B \uFF5F \uFF62];
|
|
|
|
|
|
|
|
$BK = [ \u000C \u2028-\u2029];
|
|
|
|
|
|
|
|
$PO = [ \u0025 \u00A2 \u00B0 \u2030-\u2037 \u20A7 \u2103 \u2109 \u2126 \uFDFC
|
|
|
|
\uFE6A \uFF05 \uFFE0];
|
|
|
|
|
|
|
|
$NS = [ \u0E5A-\u0E5B \u17D4 \u17D6-\u17DA \u203C \u2044 \u3005 \u301C \u303B-\u303C
|
|
|
|
\u3041 \u3043 \u3045 \u3047 \u3049 \u3063 \u3083 \u3085 \u3087
|
|
|
|
\u308E \u3095-\u3096 \u309B-\u309E \u30A0-\u30A1 \u30A3 \u30A5
|
|
|
|
\u30A7 \u30A9 \u30C3 \u30E3 \u30E5 \u30E7 \u30EE \u30F5-\u30F6
|
|
|
|
\u30FB \u30FD \u31F0-\u31FF \uFE54-\uFE55 \uFF1A-\uFF1B \uFF65
|
|
|
|
\uFF67-\uFF70 \uFF9E-\uFF9F];
|
|
|
|
|
|
|
|
$CL = [ \u0029 \u005D \u007D \u0F3B \u0F3D \u169C \u2046 \u207E \u208E \u232A
|
|
|
|
\u23B5 \u2769 \u276B \u276D \u276F \u2771 \u2773 \u2775 \u27E7
|
|
|
|
\u27E9 \u27EB \u2984 \u2986 \u2988 \u298A \u298C \u298E \u2990
|
|
|
|
\u2992 \u2994 \u2996 \u2998 \u29D9 \u29DB \u29FD \u3001-\u3002
|
|
|
|
\u3009 \u300B \u300D \u300F \u3011 \u3015 \u3017 \u3019 \u301B
|
|
|
|
\u301E-\u301F \uFD3F \uFE36 \uFE38 \uFE3A \uFE3C \uFE3E \uFE40
|
|
|
|
\uFE42 \uFE44 \uFE50 \uFE52 \uFE5A \uFE5C \uFE5E \uFF09 \uFF0C
|
|
|
|
\uFF0E \uFF3D \uFF5D \uFF60-\uFF61 \uFF63-\uFF64];
|
|
|
|
|
|
|
|
$NU = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
|
|
|
|
\u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
|
|
|
|
\u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
|
|
|
|
\u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
|
|
|
|
|
|
|
|
$CM = [ \u0000-\u0008 \u000B \u000E-\u001F \u007F-\u009F \u0300-\u034F \u0360-\u036F
|
|
|
|
\u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9 \u05BB-\u05BD
|
|
|
|
\u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06E4
|
|
|
|
\u06E7-\u06E8 \u06EA-\u06ED \u070F \u0711 \u0730-\u074A \u07A6-\u07B0
|
|
|
|
\u0901-\u0903 \u093C \u093E-\u094D \u0951-\u0954 \u0962-\u0963
|
|
|
|
\u0981-\u0983 \u09BC \u09BE-\u09C4 \u09C7-\u09C8 \u09CB-\u09CD
|
|
|
|
\u09D7 \u09E2-\u09E3 \u0A02 \u0A3C \u0A3E-\u0A42 \u0A47-\u0A48
|
|
|
|
\u0A4B-\u0A4D \u0A70-\u0A71 \u0A81-\u0A83 \u0ABC \u0ABE-\u0AC5
|
|
|
|
\u0AC7-\u0AC9 \u0ACB-\u0ACD \u0B01-\u0B03 \u0B3C \u0B3E-\u0B43
|
|
|
|
\u0B47-\u0B48 \u0B4B-\u0B4D \u0B56-\u0B57 \u0B82 \u0BBE-\u0BC2
|
|
|
|
\u0BC6-\u0BC8 \u0BCA-\u0BCD \u0BD7 \u0C01-\u0C03 \u0C3E-\u0C44
|
|
|
|
\u0C46-\u0C48 \u0C4A-\u0C4D \u0C55-\u0C56 \u0C82-\u0C83 \u0CBE-\u0CC4
|
|
|
|
\u0CC6-\u0CC8 \u0CCA-\u0CCD \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D43
|
|
|
|
\u0D46-\u0D48 \u0D4A-\u0D4D \u0D57 \u0D82-\u0D83 \u0DCA \u0DCF-\u0DD4
|
|
|
|
\u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E3A \u0E47-\u0E4E
|
|
|
|
\u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
|
|
|
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F84 \u0F86-\u0F87
|
|
|
|
\u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C-\u1032 \u1036-\u1039
|
|
|
|
\u1056-\u1059 \u1160-\u11A2 \u11A8-\u11F9 \u1712-\u1714 \u1732-\u1734
|
|
|
|
\u1752-\u1753 \u1772-\u1773 \u17B4-\u17D3 \u180B-\u180E \u18A9
|
|
|
|
\u200C-\u200F \u202A-\u202E \u206A-\u206F \u20D0-\u20EA \u302A-\u302F
|
|
|
|
\u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFFF9-\uFFFB
|
|
|
|
\U0001D165-\U0001D169 \U0001D16D-\U0001D182 \U0001D185-\U0001D18B
|
|
|
|
\U0001D1AA-\U0001D1AD \U000E0001 \U000E0020-\U000E007F];
|
|
|
|
|
|
|
|
$PR = [ \u0024 \u002B \u005C \u00A3-\u00A5 \u00B1 \u09F2-\u09F3 \u0E3F \u17DB
|
|
|
|
\u20A0-\u20A6 \u20A8-\u20B1 \u2116 \u2212-\u2213 \uFE69 \uFF04
|
|
|
|
\uFFE1 \uFFE5-\uFFE6];
|
|
|
|
|
|
|
|
$B2 = [ \u2014];
|
|
|
|
|
|
|
|
$ID = [ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
|
|
|
|
\u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
|
|
|
|
\u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
|
|
|
|
\u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
|
|
|
|
\u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
|
|
|
|
\u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
|
|
|
|
\u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
|
|
|
|
\u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
|
|
|
|
\u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
|
|
|
|
\uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
|
|
|
|
\uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
|
|
|
|
\uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
|
|
|
|
\uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
|
|
|
|
\U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
|
|
|
|
|
|
|
|
$SP = [ \u0020];
|
|
|
|
|
|
|
|
$QU = [ \u0022 \u0027 \u00AB \u00BB \u2018-\u2019 \u201B-\u201D \u201F \u2039-\u203A
|
|
|
|
\u23B6 \u275B-\u275E];
|
|
|
|
|
|
|
|
$CR = [ \u000D];
|
|
|
|
|
|
|
|
$GL = [ \u00A0 \u0F0C \u2007 \u2011 \u202F \u2060 \uFEFF];
|
|
|
|
|
|
|
|
############ End of Script-Generated Definitions #######################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
# Thai Dictionary related definitions and rules
|
|
|
|
#
|
|
|
|
|
|
|
|
# Thai characters in U+0E01-U+0E4E that are matched through $ThaiRange instead
# of being treated as ordinary alphabetics (see $ALPlus). The set leaves out
# U+0E2F ($paiyannoi), U+0E3B-U+0E3F, U+0E45 and U+0E46 ($maiyamok).
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
|
|
|
|
# U+0E2F, THAI CHARACTER PAIYANNOI. Used by $thai_etc and by the forward rules.
$paiyannoi = [\u0e2f];
|
|
|
|
# U+0E46, THAI CHARACTER MAIYAMOK. $Closings accepts it after a word, so no
# break is placed in front of it.
$maiyamok = [\u0e46];
|
|
|
|
# The three-character sequence paiyannoi, U+0E25, paiyannoi. It is matched as
# a single unit through $ThaiRange.
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
# Character classes from TR 29. Needed for finding characters.
|
|
|
|
#
|
|
|
|
# $Extend is all combining characters, and none of the other cruft that
|
|
|
|
# TR14 puts into $CM, which is its concept of combining marks.
|
|
|
|
#
|
|
|
|
$Extend = # From UNIDATA/DerivedCoreProperties.txt
|
|
|
|
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
|
|
|
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
|
|
|
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
|
|
|
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
|
|
|
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
|
|
|
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
|
|
|
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
|
|
|
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
|
|
|
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
|
|
|
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
|
|
|
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
|
|
|
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
|
|
|
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
|
|
|
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
|
|
|
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
|
|
|
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
|
|
|
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
|
|
|
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
|
|
|
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
|
|
|
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
|
|
|
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
|
|
|
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
|
|
|
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
|
|
|
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
|
|
|
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
|
|
|
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
|
|
|
|
|
|
|
|
2002-12-24 20:53:22 +00:00
|
|
|
#
|
|
|
|
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width) and
|
|
|
|
# SA (South East Asian: Thai, Lao, Khmer) as $AL (Alphabetic)
|
|
|
|
#
|
|
|
|
# $SA characters that are also in $dictionary are subtracted here, so they are
# matched only through $ThaiRange and never as ordinary alphabetics.
$ALPlus = $AL | $AI | [$SA - $dictionary];
|
|
|
|
|
2002-06-25 17:23:07 +00:00
|
|
|
#
|
|
|
|
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
|
|
|
# TODO: This is going to produce some odd results, because of the non-combining
|
|
|
|
# chars that are included in $CM. Use $Extend instead, where possible.
|
|
|
|
#
|
2002-12-24 20:53:22 +00:00
|
|
|
# An $ALPlus character together with any marks that follow it. This definition
# uses the broad TR 14 $CM set, not $Extend (see the TODO above).
$ALcm = $ALPlus $CM*;
|
2002-06-25 17:23:07 +00:00
|
|
|
# An $ID character together with any marks that follow it. Like $ALcm, this
# uses the broad TR 14 $CM set, not $Extend.
$IDcm = $ID $CM*;
|
|
|
|
# For each of the classes below, "<class>cm" is one character of that class
# followed by any number of $Extend characters. Unlike $ALcm and $IDcm, these
# use $Extend rather than $CM.
$NUcm = $NU $Extend*;
|
|
|
|
$HYcm = $HY $Extend*;
|
|
|
|
$SPcm = $SP $Extend*;
|
|
|
|
$QUcm = $QU $Extend*;
|
|
|
|
$POcm = $PO $Extend*;
|
|
|
|
$OPcm = $OP $Extend*;
|
|
|
|
$BAcm = $BA $Extend*;
|
|
|
|
$BBcm = $BB $Extend*;
|
|
|
|
$NScm = $NS $Extend*;
|
|
|
|
$GLcm = $GL $Extend*;
|
|
|
|
$B2cm = $B2 $Extend*;
|
|
|
|
$INcm = $IN $Extend*;
|
|
|
|
|
|
|
|
|
|
|
|
# New Lines. Always break after, never break before.
|
|
|
|
# Rule LB 3
|
|
|
|
#
|
|
|
|
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
|
|
|
|
# Because we never break before these things, $Endings
|
|
|
|
# appears at the end of line break rule.
|
|
|
|
#
|
|
|
|
# A new line sequence: the two-character pair CR LF, or a single $BK, $CR or
# $LF character.
$NLF = $CR $LF | $BK | $CR | $LF;
|
|
|
|
# Tail of the general forward rule: any spaces, then any zero width spaces,
# then at most one new line sequence. Every part may be absent, so $Endings
# can match the empty string.
$Endings = $SPcm* $ZW* $NLF?;
|
|
|
|
# Like $Endings, but never empty: any spaces followed by either a new line
# sequence, or exactly one zero width space and an optional new line sequence.
$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;
|
|
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
# Openings Sequences that can precede Words, and that should not be separated from them.
|
|
|
|
# Rules LB 9, 10
|
|
|
|
#
|
|
|
|
# Zero or more opening groups. Each group is an optional $QU character with
# trailing spaces, then one $OP character, then any spaces.
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Closings Sequences that follow words, and that should not be separated from them,
|
|
|
|
# Rule LB 8, 11, 15
|
|
|
|
# Zero or more items, in any order, each being one of:
#   - optional spaces, then $CL (optionally followed by spaces and an $NScm),
#     or $EX, or $IS, or $SY, then any $Extend characters;
#   - $BAcm, $HYcm, $NScm, or $maiyamok.
$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm | $maiyamok)*;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Words. Includes mixed Alpha-numerics.
|
|
|
|
# Rules 11a, 16, 17, 19, more or less.
|
|
|
|
#
|
|
|
|
# Body of a number: either a single $IDcm, or a run of one or more items where
# each item is $NUcm, $ALcm, or an $IS character immediately followed by $NUcm.
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
|
|
|
|
# Optional $PR, optional $OPcm or $HYcm, the number body, optional $CL, and
# optional $POcm. Note that $PR and $CL are used bare here, without trailing
# $Extend characters.
$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18
|
|
|
|
# Alpha-numeric word (rules 16, 17): a single $IDcm, or a run of $ALcm and
# $NUcm items, optionally followed by one $POcm or one $INcm.
$Word = ($IDcm | ($NUcm | $ALcm)+) ($POcm | $INcm)?;
|
|
|
|
# Dashes (rule 11a): any number of $B2cm items, each optionally followed by
# spaces. Can match the empty string.
$Dashes = ($B2cm $SPcm*)*;
|
|
|
|
# A run of one or more $dictionary characters, or the $thai_etc sequence.
$ThaiRange = $dictionary+ | $thai_etc;
|
|
|
|
# Any one of the word-like units defined above. Because $Dashes can match the
# empty string, $WordLikeThing can as well.
$WordLikeThing = $Number | $Word | $Dashes | $ThaiRange;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# $Word15 has three alternatives:
#   1. any $BBcm items, an optional $WordLikeThing, then any $BAcm / $HYcm /
#      $NScm items (every part is optional);
#   2. one character that is not in [:Cc:], $BK, $CR, $LF, $ZW, $SP or $GL,
#      followed by any $Extend characters;
#   3. one character that is not in $BK, $CR, $LF, $ZW, $SP or $GL.
$Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) | # Rule 15. Stuff sticks around words.
|
|
|
|
[^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* | # Allow characters that don't meet the
|
|
|
|
[^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD
|
|
|
|
# to be glued.
|
|
|
|
|
|
|
|
# One $Word15, optionally preceded by a $GLcm or $QUcm, followed by any number
# of ($GLcm or $QUcm) + $Word15 pairs.
$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
|
|
|
|
# Rules 13, 14
|
|
|
|
|
|
|
|
#
|
|
|
|
# The actual rules, a combination of everything defined above.
|
|
|
|
#
|
|
|
|
# Forward rule: a segment that finishes with $EndingsMandatory. An optional
# $paiyannoi may sit directly in front of that ending.
$Openings $GluedWord $Closings $paiyannoi? $EndingsMandatory;
|
|
|
|
# Forward rule: the general case, finishing with the (possibly empty) $Endings.
$Openings $GluedWord $Closings $Endings;
|
|
|
|
|
|
|
|
# Forward rule with look-ahead ('/'): the segment ends right after a
# $paiyannoi, provided the following text is either a character other than
# U+0E25 and $Extend, or U+0E25 followed by a character other than $paiyannoi
# and $Extend. This keeps the $thai_etc sequence from being split after its
# first character.
$Openings $GluedWord $Closings $paiyannoi /
|
|
|
|
([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);
|
|
|
|
|
|
|
|
|
|
|
|
#"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
|
|
|
|
# + "\u0e25[^$paiyannoi$_ignore_]);"
|
|
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
# Reverse Rules.
|
|
|
|
#
|
2002-12-09 22:36:10 +00:00
|
|
|
# Back up to a hard break or a space that will cause a boundary.
|
|
|
|
# Not all spaces cause line breaks. $SpaceGlue represents a sequence
|
|
|
|
# containing a space that may inhibit a break from occurring.
|
2002-06-25 17:23:07 +00:00
|
|
|
#
|
2002-12-09 22:36:10 +00:00
|
|
|
# Two shapes: one of $ZW, $CL, $IS, $NS, $OP followed by any $Extend characters
# and a space; or one or more ($Extend* space) groups followed by an $OP.
# NOTE(review): this is used only by the reverse rule, so the sequences appear
# to be written in backwards text order (compare "$LF $CR" there) - confirm.
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
|
|
|
|
# Any character other than a space ($SP) or one of $BK, $CR, $LF.
$ClumpingChars = [^$SP $BK $CR $LF];
|
|
|
|
|
|
|
|
# The leading '!' marks this as a reverse rule. It takes two arbitrary
# characters, then any run of $ClumpingChars interleaved with $SpaceGlue
# sequences, and finally either one more character or the pair $LF $CR
# (presumably a CR LF pair as seen when moving backwards through the text).
!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
|
|
|
|
|