2005-04-22 21:49:52 +00:00
|
|
|
# Copyright (c) 2002-2005, International Business Machines Corporation and
|
2002-12-06 01:40:42 +00:00
|
|
|
# others. All Rights Reserved.
|
2002-06-25 17:23:07 +00:00
|
|
|
#
|
|
|
|
# word.txt Word Breaking Rules for ICU Rules Based Break Iterator.
|
|
|
|
#
|
2005-04-22 21:49:52 +00:00
|
|
|
# TODO: Shift this over to being based on the current default (non-Thai)
|
|
|
|
# word rules, including exact reverse rules. Postponed
|
|
|
|
# because of interactions with dictionary implementation.
|
2002-06-25 17:23:07 +00:00
|
|
|
|
|
|
|
|
2005-04-22 21:49:52 +00:00
|
|
|
# Characters whose Word_Break property value is Katakana.
$Katakana = [\p{Word_Break = Katakana}];
|
|
|
|
# Characters whose Word_Break property value is ALetter.
$ALetter = [\p{Word_Break = ALetter}];
|
|
|
|
# Characters whose Word_Break property value is MidLetter. $AlphaClump allows
# one of these between two letters without splitting the word.
$MidLetter = [\p{Word_Break = MidLetter}];
|
|
|
|
# Characters whose Word_Break property value is Numeric. This is taken from
# Word_Break (not Line_Break) so that it agrees with the other word classes
# defined in this file.
$Numeric = [\p{Word_Break = Numeric}];
|
|
|
|
# Characters whose Word_Break property value is MidNum. $NumericClump allows
# one of these between two digits without splitting the number.
$MidNum = [\p{Word_Break = MidNum}];
|
|
|
|
# Characters whose Word_Break property value is ExtendNumLet. The big forward
# rule lets these join alphabetic and numeric clumps into one word.
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
2002-06-25 17:23:07 +00:00
|
|
|
|
2005-04-22 21:49:52 +00:00
|
|
|
# Characters matched by the Unicode property expression \p{Hiragana}.
$Hiragana = [\p{Hiragana}];
|
2003-12-08 17:50:35 +00:00
|
|
|
|
2005-04-22 21:49:52 +00:00
|
|
|
# Characters whose Grapheme_Cluster_Break property value is Control.
# The set must contain the control characters themselves (no leading ^):
# the forward rule "[^$Control] $Extend*" negates it to select any
# non-control base character that may carry trailing $Extend characters.
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
|
|
|
# Characters whose Grapheme_Cluster_Break property value is Extend. The *Ex
# definitions and the forward rules keep these attached to the character
# that precedes them.
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
|
|
|
|
# One $ALetter character followed by any number of $Extend characters.
$ALetterEx = $ALetter $Extend*;
|
|
|
|
# One $Numeric character followed by any number of $Extend characters.
$NumericEx = $Numeric $Extend*;
|
|
|
|
# One $MidLetter character followed by any number of $Extend characters.
$MidLetterEx = $MidLetter $Extend*;
|
|
|
|
# One $MidNum character followed by any number of $Extend characters.
$MidNumEx = $MidNum $Extend*;
|
|
|
|
# One $ExtendNumLet character followed by any number of $Extend characters.
$ExtendNumLetEx = $ExtendNumLet $Extend*;
|
2002-06-25 17:23:07 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#
|
2005-04-22 21:49:52 +00:00
|
|
|
# Thai Dictionary Related Rules. Identify runs that will be subdivided into words
|
|
|
|
# using the dictionary.
|
2002-06-25 17:23:07 +00:00
|
|
|
#
|
|
|
|
# Thai characters whose runs are subdivided into words using the dictionary:
# U+0E01-U+0E2E, U+0E30-U+0E3A, U+0E40-U+0E44 and U+0E47-U+0E4E.
# U+0E2F and U+0E46 are not in this set; they get their own definitions.
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
|
|
|
|
# U+0E2F, THAI CHARACTER PAIYANNOI.
$paiyannoi = [\u0e2f];
|
|
|
|
# U+0E46, THAI CHARACTER MAIYAMOK.
$maiyamok = [\u0e46];
|
|
|
|
# The three-character sequence paiyannoi, U+0E25, paiyannoi.
# NOTE(review): judging by the name this is the Thai "etc." abbreviation - confirm.
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
|
|
|
|
|
|
|
|
|
|
|
|
# A run of dictionary characters, optionally followed by a maiyamok that may
# itself be preceded by a paiyannoi.
$dictionary+ ($paiyannoi? $maiyamok)?;
|
|
|
|
# A run of dictionary characters plus a trailing paiyannoi. The part after "/"
# is look-ahead context only: the next character must either be something other
# than U+0E25, maiyamok or $Extend, or be U+0E25 followed by something other
# than paiyannoi or $Extend.
# NOTE(review): this appears intended to keep paiyannoi out of the run when it
# begins a $thai_etc sequence or precedes a maiyamok - confirm.
$dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]);
|
|
|
|
# The $thai_etc sequence is matched as a single unit.
$thai_etc;
|
|
|
|
|
|
|
|
|
|
|
|
#
|
2005-04-22 21:49:52 +00:00
|
|
|
# The Big Rule. Gloms Non-Thai words together.
|
2002-06-25 17:23:07 +00:00
|
|
|
#
|
2005-04-22 21:49:52 +00:00
|
|
|
# One or more $NumericEx; consecutive ones may be separated by at most one
# $MidNumEx.
$NumericClump = $NumericEx ($MidNumEx? $NumericEx)*;
|
|
|
|
# One or more $ALetterEx; consecutive ones may be separated by at most one
# $MidLetterEx.
$AlphaClump = $ALetterEx ($MidLetterEx? $ALetterEx)*;
|
|
|
|
# Any non-empty sequence of alphabetic clumps, numeric clumps and
# $ExtendNumLetEx units is matched as a single word.
($AlphaClump | $NumericClump | $ExtendNumLetEx)+;
|
2002-06-25 17:23:07 +00:00
|
|
|
|
|
|
|
#
|
|
|
|
# Lesser rules
|
|
|
|
#
|
|
|
|
# Zero or more Hiragana characters, each with its trailing $Extend characters,
# matched together.
($Hiragana $Extend*)*;
|
|
|
|
# Zero or more Katakana characters, each with its trailing $Extend characters,
# matched together.
($Katakana $Extend*)*;
|
2005-04-22 21:49:52 +00:00
|
|
|
# Any single character outside the $Control set, together with the $Extend
# characters that follow it.
[^$Control] $Extend*;
|
2002-06-25 17:23:07 +00:00
|
|
|
# A CR immediately followed by LF is matched as a single unit.
\r\n;
|
|
|
|
# Fallback: any single character.
.;
|
|
|
|
|
|
|
|
#
|
|
|
|
# Reverse Rules. Back up over any of the chars that can group together.
|
|
|
|
# (Reverse rules do not need to be exact; they can back up a bit too far,
|
|
|
|
# but must back up at least enough.)
|
|
|
|
#
|
2005-04-22 21:49:52 +00:00
|
|
|
# Reverse rule (leading "!"): back up over any run of the character classes
# that the big forward rule can group together, including $Extend.
! ( $ALetter | $MidLetter | $Numeric | $ExtendNumLet | $MidNum | $Extend )*;
|
2002-06-25 17:23:07 +00:00
|
|
|
# Reverse rule: back up over any run of Hiragana and $Extend characters.
! ($Hiragana | $Extend)*;
|
|
|
|
# Reverse rule: back up over any run of Katakana and $Extend characters.
! ($Katakana | $Extend)*;
|
|
|
|
# Reverse rule: any number of $Extend characters followed by one arbitrary
# character.
# NOTE(review): the pattern looks written in the order the text is met when
# scanning backwards (extenders first, then their base) - confirm.
! $Extend* .;
|
|
|
|
# Reverse rule for the CR LF pair.
# NOTE(review): LF is listed first, presumably because the text is scanned
# backwards here - confirm before "correcting" the order.
! \n\r;
|
|
|
|
|
|
|
|
# Reverse rule: back up over any run of the Thai characters used by the
# dictionary-related forward rules ($dictionary, paiyannoi, maiyamok, U+0E25).
! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*;
|