scuffed-code/icu4c/source/data/brkitr/word_th.txt
2005-04-22 21:49:52 +00:00

73 lines
2.2 KiB
Plaintext

# Copyright (c) 2002-2005, International Business Machines Corporation and
# others. All Rights Reserved.
#
# word_th.txt Word Breaking Rules for ICU Rules Based Break Iterator (Thai variant).
#
# TODO: Shift this over to being based on the current default (non-Thai)
# word rules, including exact reverse rules. Postponed
# because of interactions with dictionary implementation.
#
# Character class definitions.
# Word-level categories are taken from the Unicode Word_Break property;
# $Control and $Extend are taken from Grapheme_Cluster_Break.
#
$Katakana = [\p{Word_Break = Katakana}];
$ALetter = [\p{Word_Break = ALetter}];
$MidLetter = [\p{Word_Break = MidLetter}];
# Drawn from Word_Break like the sibling categories (previously Line_Break).
$Numeric = [\p{Word_Break = Numeric}];
$MidNum = [\p{Word_Break = MidNum}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$Hiragana = [\p{Hiragana}];
# $Control has to be the control characters themselves. The lesser rule
# "[^$Control] $Extend*;" negates this set; defining $Control with a leading
# '^' made that rule match control characters instead of excluding them.
$Control = [\p{Grapheme_Cluster_Break = Control}];
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
#
# "Ex" forms: one base character of the category followed by any number of
# $Extend characters, so the trailing Extend characters stay with their base.
#
$ALetterEx = $ALetter $Extend*;
$NumericEx = $Numeric $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$MidNumEx = $MidNum $Extend*;
$ExtendNumLetEx = $ExtendNumLet $Extend*;
#
# Thai Dictionary Related Rules. Identify runs that will be subdivided into words
# using the dictionary.
#
# Characters that form dictionary-segmented runs: U+0E01-0E2E, U+0E30-0E3A,
# U+0E40-0E44 and U+0E47-0E4E. U+0E2F and U+0E46 are not in this set; they
# get their own variables just below.
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
# U+0E2F (paiyannoi) and U+0E46 (maiyamok), kept apart from $dictionary so the
# rules below can treat them specially.
$paiyannoi = [\u0e2f];
$maiyamok = [\u0e46];
# The three-character sequence paiyannoi, U+0E25, paiyannoi, matched as a unit.
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
# A dictionary run, optionally followed by a maiyamok that may itself be
# preceded by a paiyannoi.
$dictionary+ ($paiyannoi? $maiyamok)?;
# A dictionary run followed by a paiyannoi. The '/' marks where the boundary is
# placed; the text after it is look-ahead context only. The context requires
# that the next character is either
#   - not U+0E25, not a maiyamok and not an $Extend character, or
#   - U+0E25 followed by something that is neither a paiyannoi nor $Extend,
# i.e. the paiyannoi is not the first character of a $thai_etc sequence.
$dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]);
# The $thai_etc sequence on its own.
$thai_etc;
#
# The Big Rule. Gloms Non-Thai words together.
#
# Letter run: ALetter clusters, where two neighbouring clusters may have at
# most one MidLetter cluster between them.
$AlphaClump   = $ALetterEx ( $MidLetterEx? $ALetterEx )*;
# Digit run: the same shape, using Numeric clusters and MidNum separators.
$NumericClump = $NumericEx ( $MidNumEx? $NumericEx )*;
# One match is any non-empty sequence of digit runs, ExtendNumLet clusters
# and letter runs, in any order.
( $NumericClump | $ExtendNumLetEx | $AlphaClump )+;
#
# Lesser rules
#
# Any number of Hiragana characters, each with its trailing $Extend characters.
($Hiragana $Extend*)*;
# The same for Katakana characters.
($Katakana $Extend*)*;
# One character that is outside the $Control set, together with any $Extend
# characters that follow it.
[^$Control] $Extend*;
# A CR LF pair is matched as a single unit.
\r\n;
# Fallback: any single character.
.;
#
# Reverse Rules. Back up over any of the chars that can group together.
# (Reverse rules do not need to be exact; they can back up a bit too far,
# but must back up at least enough.)
#
# A leading '!' marks a reverse rule.
# Back over every character class that the forward "Big Rule" can combine.
! ( $ALetter | $MidLetter | $Numeric | $ExtendNumLet | $MidNum | $Extend )*;
# Back over runs of Hiragana, and of Katakana, including $Extend characters.
! ($Hiragana | $Extend)*;
! ($Katakana | $Extend)*;
# Back over any $Extend characters plus one further character.
! $Extend* .;
# LF followed by CR: presumably the forward "\r\n" pair written in the order it
# is met when scanning backward - confirm against the ICU version in use.
! \n\r;
# Back over Thai text: dictionary characters, paiyannoi and maiyamok.
# U+0E25 is listed explicitly although the U+0E01-0E2E range of $dictionary
# already contains it.
! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*;