# Copyright (c) 2002-2005, International Business Machines Corporation and
# others. All Rights Reserved.
#
#  word.txt    Word Breaking Rules for ICU Rules Based Break Iterator.
#
#     TODO:    Shift this over to being based on the current default (non-Thai)
#              word rules, including exact reverse rules.  Postponed
#              because of interactions with dictionary implementation.


#
#  Character classes for the word rules.  Each one is the set of characters
#  carrying the named value of the Unicode Word_Break property (UAX #29).
#
$Katakana     = [\p{Word_Break = Katakana}];
$ALetter      = [\p{Word_Break = ALetter}];
$MidLetter    = [\p{Word_Break = MidLetter}];
#  Numeric is taken from Word_Break like its siblings, not from Line_Break.
#  The two properties are not the same set: for example U+066C ARABIC
#  THOUSANDS SEPARATOR is Line_Break=Numeric but Word_Break=MidNum, so using
#  Line_Break here would put it in both $Numeric and $MidNum.
$Numeric      = [\p{Word_Break = Numeric}];
$MidNum       = [\p{Word_Break = MidNum}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];

#  Characters matched by \p{Hiragana}; used by the Hiragana run rules below.
$Hiragana     = [\p{Hiragana}];

#
#  Grapheme-cluster helper classes.
#
#  $Control is the set of Grapheme_Cluster_Break=Control characters themselves.
#  It must NOT be a negated set: the forward rule "[^$Control] $Extend*"
#  already negates it to mean "any non-control character plus its trailing
#  Extend characters".  A negated definition here would turn that rule into
#  "a control character plus trailing Extend characters".
#
$Control        = [\p{Grapheme_Cluster_Break = Control}];
$Extend         = [\p{Grapheme_Cluster_Break = Extend}];

#  "Ex" variants: the base class followed by any number of Extend characters,
#  so that combining marks stay attached to the character they follow.
$ALetterEx      = $ALetter $Extend*;
$NumericEx      = $Numeric $Extend*;
$MidLetterEx    = $MidLetter $Extend*;
$MidNumEx       = $MidNum  $Extend*;
$ExtendNumLetEx = $ExtendNumLet $Extend*;



#
#  Thai Dictionary Related Rules.  Identify runs that will be subdivided into words
#                                  using the dictionary.
#

#  $dictionary: the Thai code points U+0E01-U+0E2E, U+0E30-U+0E3A, U+0E40-U+0E44
#               and U+0E47-U+0E4E.  U+0E2F and U+0E46 fall outside these ranges
#               and get their own sets just below.
#  NOTE(review): this definition was previously annotated as breaking the
#               iterator on mixed Thai and English text.  That cannot be
#               verified from this file alone -- confirm before relying on it.
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e];
#  Single-character sets for U+0E2F and U+0E46.
$paiyannoi  = [\u0e2f];
$maiyamok   = [\u0e46];
#  The three-character sequence U+0E2F U+0E25 U+0E2F (presumably the Thai
#  "etc." abbreviation, going by the variable name).
$thai_etc   = $paiyannoi \u0e25 $paiyannoi;


#  A run of dictionary characters, optionally ending in maiyamok, which may
#  itself be preceded by a paiyannoi.
$dictionary+ ($paiyannoi? $maiyamok)?;
#  A run of dictionary characters plus a trailing paiyannoi.  The "/" marks the
#  break position; what follows it is look-ahead context only.  The context
#  requires the next character to be either
#    - something other than U+0E25, maiyamok or an Extend character, or
#    - U+0E25 followed by something other than paiyannoi or an Extend character.
#  (Presumably this keeps a paiyannoi that starts $thai_etc, or that precedes
#  maiyamok, from being absorbed by this rule -- TODO confirm.)
$dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]);
#  The $thai_etc sequence on its own.
$thai_etc;


#
#  The Big Rule.  Gloms Non-Thai words together.
#
#  $NumericClump: NumericEx items, where each pair of neighbours may be
#                 separated by at most one MidNumEx.
#  $AlphaClump:   the same shape for ALetterEx items with an optional
#                 MidLetterEx between neighbours.
$NumericClump    = $NumericEx ($MidNumEx? $NumericEx)*;
$AlphaClump      = $ALetterEx ($MidLetterEx? $ALetterEx)*;
#  One word: one or more alpha clumps, numeric clumps or ExtendNumLetEx items,
#  in any order, with nothing in between.
($AlphaClump | $NumericClump | $ExtendNumLetEx)+;

#
#  Lesser rules
#
#  Runs of Hiragana (resp. Katakana) characters, each with its trailing Extend
#  characters.
#  NOTE(review): the outer "*" lets these two rules match zero characters;
#  confirm whether "+" was intended.
($Hiragana $Extend*)*;
($Katakana $Extend*)*;
#  One character outside the $Control set, together with any Extend characters
#  that follow it.
[^$Control] $Extend*;
#  CR LF is kept together as a single unit.
\r\n;
#  Fallback: any single character.
.;

#
#  Reverse Rules.   Back up over any of the chars that can group together.
#                   (Reverse rules do not need to be exact; they can back up a bit too far,
#                   but must back up at least enough.)
#
#  A leading "!" marks a rule as a reverse rule.
#
#  Back over any mix of the character classes that the forward "Big Rule" can
#  combine into one word, plus Extend characters.
! ( $ALetter | $MidLetter | $Numeric | $ExtendNumLet | $MidNum | $Extend )*;
#  Back over Hiragana (resp. Katakana) runs and their Extend characters.
! ($Hiragana | $Extend)*;
! ($Katakana | $Extend)*;
#  Back over any Extend characters and then one arbitrary character.
! $Extend* .;
#  CR LF as encountered when moving backwards: LF first, then CR.
! \n\r;

#  Back over any run of the characters used by the Thai dictionary rules
#  ($dictionary, paiyannoi, maiyamok and U+0E25).
! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*;
