OLD | NEW |
(Empty) | |
| 1 # |
| 2 # Copyright (C) 2002-2010, International Business Machines Corporation and oth
ers. |
| 3 # All Rights Reserved. |
| 4 # |
| 5 # file: sent_el.txt |
| 6 # |
| 7 # ICU Sentence Break Rules |
| 8 # See Unicode Standard Annex #29. |
| 9 # These rules are based on UAX 29 Revision 16 for Unicode Version 6.0 |
| 10 # |
| 11 |
| 12 |
| 13 # Character categories as defined in TR 29: one set per Sentence_Break |
| 14 # property value. $STerm is widened with U+003B (semicolon) and U+037E |
| 15 # (Greek question mark); presumably the Greek tailoring - TODO confirm. |
| 16 $CR = [\p{Sentence_Break = CR}]; |
| 17 $LF = [\p{Sentence_Break = LF}]; |
| 18 $Extend = [\p{Sentence_Break = Extend}]; |
| 19 $Sep = [\p{Sentence_Break = Sep}]; |
| 20 $Format = [\p{Sentence_Break = Format}]; |
| 21 $Sp = [\p{Sentence_Break = Sp}]; |
| 22 $Lower = [\p{Sentence_Break = Lower}]; |
| 23 $Upper = [\p{Sentence_Break = Upper}]; |
| 24 $OLetter = [\p{Sentence_Break = OLetter}]; |
| 25 $Numeric = [\p{Sentence_Break = Numeric}]; |
| 26 $ATerm = [\p{Sentence_Break = ATerm}]; |
| 27 $SContinue = [\p{Sentence_Break = SContinue}]; |
| 28 $STerm = [\p{Sentence_Break = STerm} [\u003B \u037E]]; |
| 29 $Close = [\p{Sentence_Break = Close}]; |
| 30 |
| 31 # |
| 32 # Define extended forms of the character classes: each base class followed |
| 33 # by any run of trailing Extend or Format chars, so the rules below can |
| 34 # treat those chars as part of the preceding character. Rules 4 and 5. |
| 35 |
| 36 $SpEx = $Sp ($Extend | $Format)*; |
| 37 $LowerEx = $Lower ($Extend | $Format)*; |
| 38 $UpperEx = $Upper ($Extend | $Format)*; |
| 39 $OLetterEx = $OLetter ($Extend | $Format)*; |
| 40 $NumericEx = $Numeric ($Extend | $Format)*; |
| 41 $ATermEx = $ATerm ($Extend | $Format)*; |
| 42 $SContinueEx= $SContinue ($Extend | $Format)*; |
| 43 $STermEx = $STerm ($Extend | $Format)*; |
| 44 $CloseEx = $Close ($Extend | $Format)*; |
| 45 |
| 46 |
| 47 ## ------------------------------------------------- |
| 48 # Forward rules, with rule chaining switched on by !!chain. |
| 49 !!chain; |
| 50 !!forward; |
| 51 |
| 52 # Rule 3 - break after separators. Keep CR/LF together. |
| 53 # Matches a CR immediately followed by LF as one unit. |
| 54 $CR $LF; |
| 55 |
| 56 |
| 57 # Rule 4 - Break after $Sep. |
| 58 # Rule 5 - Ignore $Format and $Extend |
| 59 # At most one non-separator char, then any run of $Extend / $Format. |
| 60 [^$Sep $CR $LF]? ($Extend | $Format)*; |
| 61 |
| 62 |
| 63 # Rule 6 - keep $ATerm together with a following $Numeric. |
| 64 $ATermEx $NumericEx; |
| 65 |
| 66 # Rule 7 - keep $Upper $ATerm $Upper together. |
| 67 $UpperEx $ATermEx $UpperEx; |
| 68 |
| 69 # Rule 8 - $ATerm, closers, spaces, then non-letters leading to a $Lower. |
| 70 $NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend |
$Format)*; |
| 71 $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; |
| 72 |
| 73 # Rule 8a - terminator, closers, spaces, then $SContinue or a terminator. |
| 74 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx); |
| 75 |
| 76 # Rule 9, 10, 11 - terminator with trailing closers, spaces, optional separator. |
| 77 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; |
| 78 |
| 79 # Rule 12 - otherwise, do not break. Second form carries status value 100. |
| 80 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Form
at | $Close | $Sp)* .; |
| 81 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Form
at | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100}; |
| 82 |
| 83 ## ------------------------------------------------- |
| 84 |
| 85 !!reverse; |
| 86 # Reversed forms of the ...Ex classes: the Extend/Format run comes first. |
| 87 $SpEx_R = ($Extend | $Format)* $Sp; |
| 88 $ATermEx_R = ($Extend | $Format)* $ATerm; |
| 89 $STermEx_R = ($Extend | $Format)* $STerm; |
| 90 $CloseEx_R = ($Extend | $Format)* $Close; |
| 91 |
| 92 # |
| 93 # Reverse rules. |
| 94 # For now, use the old style inexact reverse rules, which are easier |
| 95 # to write, but less efficient. |
| 96 # TODO: exact reverse rules. It appears that exact reverse rules |
| 97 # may require improving support for look-ahead breaks in the |
| 98 # builder. Needs more investigation. |
| 99 # The single rule below is explained in the comment that follows it. |
| 100 |
| 101 [{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_
R* ($STermEx_R | $ATermEx_R))*; |
| 102 #.*; |
| 103 |
| 104 # Explanation for this rule: |
| 105 # |
| 106 # It needs to back over |
| 107 # The $Sep at which we probably begin |
| 108 # All of the non $Sep chars leading to the preceding $Sep |
| 109 # The preceding $Sep, which will be the second one that the rule matches. |
| 110 # Any immediately preceding STerm or ATerm sequences. We need to see the
se |
| 111 # to get the correct rule status when moving forwards again. |
| 112 # |
| 113 # [{bof}] inhibits rule chaining. Without this, the rule would loop |
| 114 # on itself and match the entire string. |
| 115 # |
| 116 # (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because po
sition might be |
| 117 # at the beginning of the string at this point, and we don't w
ant to fail. |
| 118 # Can only use {eof} once, and it is used later. |
| 119 # |
OLD | NEW |