OLD | NEW |
1 # | 1 # |
2 # Copyright (C) 2002-2011, International Business Machines Corporation and others. | 2 # Copyright (C) 2002-2015, International Business Machines Corporation and others. |
3 # All Rights Reserved. | 3 # All Rights Reserved. |
4 # | 4 # |
5 # file: sent.txt | 5 # file: sent.txt |
6 # | 6 # |
7 # ICU Sentence Break Rules | 7 # ICU Sentence Break Rules |
8 # See Unicode Standard Annex #29. | 8 # See Unicode Standard Annex #29. |
9 # These rules are based on UAX #29 Revision 19 for Unicode Version 6.1 | 9 # These rules are based on UAX #29 Revision 26 for Unicode Version 8.0 |
10 # | 10 # |
11 | 11 |
12 | 12 |
13 # | 13 # |
14 # Character categories as defined in TR 29 | 14 # Character categories as defined in TR 29 |
15 # | 15 # |
16 $CR = [\p{Sentence_Break = CR}]; | 16 $CR = [\p{Sentence_Break = CR}]; |
17 $LF = [\p{Sentence_Break = LF}]; | 17 $LF = [\p{Sentence_Break = LF}]; |
18 $Extend = [\p{Sentence_Break = Extend}]; | 18 $Extend = [\p{Sentence_Break = Extend}]; |
19 $Sep = [\p{Sentence_Break = Sep}]; | 19 $Sep = [\p{Sentence_Break = Sep}]; |
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
57 # Rule 4 - Break after $Sep. | 57 # Rule 4 - Break after $Sep. |
58 # Rule 5 - Ignore $Format and $Extend | 58 # Rule 5 - Ignore $Format and $Extend |
59 # | 59 # |
60 [^$Sep $CR $LF]? ($Extend | $Format)*; | 60 [^$Sep $CR $LF]? ($Extend | $Format)*; |
61 | 61 |
62 | 62 |
63 # Rule 6 | 63 # Rule 6 |
64 $ATermEx $NumericEx; | 64 $ATermEx $NumericEx; |
65 | 65 |
66 # Rule 7 | 66 # Rule 7 |
67 $UpperEx $ATermEx $UpperEx; | 67 ($UpperEx | $LowerEx) $ATermEx $UpperEx; |
68 | 68 |
69 #Rule 8 | 69 #Rule 8 |
70 $NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*; | 70 $NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*; |
71 $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; | 71 $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; |
72 | 72 |
73 # Rule 8a | 73 # Rule 8a |
74 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx); | 74 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx); |
75 | 75 |
76 #Rule 9, 10, 11 | 76 #Rule 9, 10, 11 |
77 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; | 77 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
110 # Any immediately preceding STerm or ATerm sequences. We need to see these | 110 # Any immediately preceding STerm or ATerm sequences. We need to see these |
111 # to get the correct rule status when moving forwards again. | 111 # to get the correct rule status when moving forwards again. |
112 # | 112 # |
113 # [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match | 113 # [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match |
114 # the entire string. | 114 # the entire string. |
115 # | 115 # |
116 # (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be | 116 # (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be |
117 # at the beginning of the string at this point, and we don't want to fail. | 117 # at the beginning of the string at this point, and we don't want to fail. |
118 # Can only use {eof} once, and it is used later. | 118 # Can only use {eof} once, and it is used later. |
119 # | 119 # |
OLD | NEW |