OLD | NEW |
1 # Copyright (c) 2002-2010 International Business Machines Corporation and | 1 # Copyright (c) 2002-2010 International Business Machines Corporation and |
2 # others. All Rights Reserved. | 2 # others. All Rights Reserved. |
3 # | 3 # |
4 # file: line.txt | 4 # file: line.txt |
5 # | 5 # |
6 # Line Breaking Rules | 6 # Line Breaking Rules |
7 # Implement default line breaking as defined by | 7 # Implement default line breaking as defined by |
8 # Unicode Standard Annex #14 Revision 24 for Unicode 6.0 | 8 # Unicode Standard Annex #14 Revision 24 for Unicode 6.0 |
9 # http://www.unicode.org/reports/tr14/ | 9 # http://www.unicode.org/reports/tr14/ |
10 # | 10 # |
11 # TODO: Rule LB 8 remains as it was in Unicode 5.2 | 11 # TODO: Rule LB 8 remains as it was in Unicode 5.2 |
12 # This is only because of a limitation of ICU break engine implementatio
n, | 12 # This is only because of a limitation of ICU break engine implementatio
n, |
13 # not because the older behavior is desirable. | 13 # not because the older behavior is desirable. |
| 14 # |
| 15 # CHROME: Hebrew tailoring was incorporated as well as some |
| 16 # other minor changes (CL, OP, ID). |
14 | 17 |
15 # | 18 # |
16 # Character Classes defined by TR 14. | 19 # Character Classes defined by TR 14. |
17 # | 20 # |
18 | 21 |
19 !!chain; | 22 !!chain; |
20 !!LBCMNoChain; | 23 !!LBCMNoChain; |
21 | 24 |
22 | 25 |
23 !!lookAheadHardBreak; | 26 !!lookAheadHardBreak; |
(...skipping 26 matching lines...) Expand all Loading... |
50 # | 53 # |
51 # !!lookAheadHardBreak forces the run time state machine to | 54 # !!lookAheadHardBreak forces the run time state machine to |
52 # stop immediately when a look ahead rule ( '/' operator) matches, | 55 # stop immediately when a look ahead rule ( '/' operator) matches, |
53 # and set the match position to that of the look-ahead operator, | 56 # and set the match position to that of the look-ahead operator, |
54 # no matter what other rules may be in play at the time. | 57 # no matter what other rules may be in play at the time. |
55 # | 58 # |
56 # See rule LB 19 for an example. | 59 # See rule LB 19 for an example. |
57 # | 60 # |
58 | 61 |
59 $AI = [:LineBreak = Ambiguous:]; | 62 $AI = [:LineBreak = Ambiguous:]; |
60 $AL = [:LineBreak = Alphabetic:]; | 63 $AL = [[:LineBreak = Alphabetic:] - [[:Hebrew:] & [:Letter:]] - [\u23B4\u23B5]]
; |
61 $BA = [:LineBreak = Break_After:]; | 64 $HL = [[:Hebrew:] & [:Letter:]]; |
| 65 $BA = [[:LineBreak = Break_After:] - [\u2010]]; |
| 66 $HH = [\u2010]; |
62 $BB = [:LineBreak = Break_Before:]; | 67 $BB = [:LineBreak = Break_Before:]; |
63 $BK = [:LineBreak = Mandatory_Break:]; | 68 $BK = [:LineBreak = Mandatory_Break:]; |
64 $B2 = [:LineBreak = Break_Both:]; | 69 $B2 = [:LineBreak = Break_Both:]; |
65 $CB = [:LineBreak = Contingent_Break:]; | 70 $CB = [:LineBreak = Contingent_Break:]; |
66 $CL = [:LineBreak = Close_Punctuation:]; | 71 $CL = [[:LineBreak = Close_Punctuation:] [\uFE51\uFE10\u23B5]]; |
67 $CM = [:LineBreak = Combining_Mark:]; | 72 $CM = [:LineBreak = Combining_Mark:]; |
68 $CP = [:LineBreak = Close_Parenthesis:]; | 73 $CP = [:LineBreak = Close_Parenthesis:]; |
69 $CR = [:LineBreak = Carriage_Return:]; | 74 $CR = [:LineBreak = Carriage_Return:]; |
70 $EX = [:LineBreak = Exclamation:]; | 75 $EX = [:LineBreak = Exclamation:]; |
71 $GL = [:LineBreak = Glue:]; | 76 $GL = [:LineBreak = Glue:]; |
72 $HY = [:LineBreak = Hyphen:]; | 77 $HY = [:LineBreak = Hyphen:]; |
73 $H2 = [:LineBreak = H2:]; | 78 $H2 = [:LineBreak = H2:]; |
74 $H3 = [:LineBreak = H3:]; | 79 $H3 = [:LineBreak = H3:]; |
75 $ID = [:LineBreak = Ideographic:]; | 80 $ID = [[:LineBreak = Ideographic:] - [\uFE51]]; |
76 $IN = [:LineBreak = Inseperable:]; | 81 $IN = [:LineBreak = Inseperable:]; |
77 $IS = [:LineBreak = Infix_Numeric:]; | 82 $IS = [[:LineBreak = Infix_Numeric:] - [\uFE10]]; |
78 $JL = [:LineBreak = JL:]; | 83 $JL = [:LineBreak = JL:]; |
79 $JV = [:LineBreak = JV:]; | 84 $JV = [:LineBreak = JV:]; |
80 $JT = [:LineBreak = JT:]; | 85 $JT = [:LineBreak = JT:]; |
81 $LF = [:LineBreak = Line_Feed:]; | 86 $LF = [:LineBreak = Line_Feed:]; |
82 $NL = [:LineBreak = Next_Line:]; | 87 $NL = [:LineBreak = Next_Line:]; |
83 $NS = [:LineBreak = Nonstarter:]; | 88 $NS = [:LineBreak = Nonstarter:]; |
84 $NU = [:LineBreak = Numeric:]; | 89 $NU = [:LineBreak = Numeric:]; |
85 $OP = [:LineBreak = Open_Punctuation:]; | 90 $OP = [[:LineBreak = Open_Punctuation:] \u23B4]; |
86 $PO = [:LineBreak = Postfix_Numeric:]; | 91 $PO = [:LineBreak = Postfix_Numeric:]; |
87 $PR = [:LineBreak = Prefix_Numeric:]; | 92 $PR = [:LineBreak = Prefix_Numeric:]; |
88 $QU = [:LineBreak = Quotation:]; | 93 $QU = [:LineBreak = Quotation:]; |
89 $SA = [:LineBreak = Complex_Context:]; | 94 $SA = [:LineBreak = Complex_Context:]; |
90 $SG = [:LineBreak = Surrogate:]; | 95 $SG = [:LineBreak = Surrogate:]; |
91 $SP = [:LineBreak = Space:]; | 96 $SP = [:LineBreak = Space:]; |
92 $SY = [:LineBreak = Break_Symbols:]; | 97 $SY = [:LineBreak = Break_Symbols:]; |
93 $WJ = [:LineBreak = Word_Joiner:]; | 98 $WJ = [:LineBreak = Word_Joiner:]; |
94 $XX = [:LineBreak = Unknown:]; | 99 $XX = [:LineBreak = Unknown:]; |
95 $ZW = [:LineBreak = ZWSpace:]; | 100 $ZW = [:LineBreak = ZWSpace:]; |
96 | 101 |
97 # Dictionary character set, for triggering language-based break engines. Curre
ntly | 102 # Dictionary character set, for triggering language-based break engines. Curre
ntly |
98 # limited to LineBreak=Complex_Context. Note that this set only works in Unico
de | 103 # limited to LineBreak=Complex_Context. Note that this set only works in Unico
de |
99 # 5.0 or later as the definition of Complex_Context was corrected to include a
ll | 104 # 5.0 or later as the definition of Complex_Context was corrected to include a
ll |
100 # characters requiring dictionary break. | 105 # characters requiring dictionary break. |
101 | 106 |
102 $dictionary = [:LineBreak = Complex_Context:]; | 107 $dictionary = [:LineBreak = Complex_Context:]; |
103 | 108 |
104 # | 109 # |
105 # Rule LB1. By default, treat AI (characters with ambiguous east Asian width)
, | 110 # Rule LB1. By default, treat AI (characters with ambiguous east Asian width)
, |
106 # SA (South East Asian: Thai, Lao, Khmer) | 111 # SA (South East Asian: Thai, Lao, Khmer) |
107 # SG (Unpaired Surrogates) | 112 # SG (Unpaired Surrogates) |
108 # XX (Unknown, unassigned) | 113 # XX (Unknown, unassigned) |
109 # as $AL (Alphabetic) | 114 # as $AL (Alphabetic) |
110 # | 115 # |
111 $ALPlus = [$AL $AI $SA $SG $XX]; | 116 $ALPlus = [$AL $HL $AI $SA $SG $XX]; |
112 | 117 |
113 # | 118 # |
114 # Combining Marks. X $CM* behaves as if it were X. Rule LB6. | 119 # Combining Marks. X $CM* behaves as if it were X. Rule LB6. |
115 # | 120 # |
116 $ALcm = $ALPlus $CM*; | 121 $ALcm = $ALPlus $CM*; |
| 122 $HLcm = $HL $CM*; |
117 $BAcm = $BA $CM*; | 123 $BAcm = $BA $CM*; |
| 124 $HHcm = $HH $CM*; |
118 $BBcm = $BB $CM*; | 125 $BBcm = $BB $CM*; |
119 $B2cm = $B2 $CM*; | 126 $B2cm = $B2 $CM*; |
120 $CLcm = $CL $CM*; | 127 $CLcm = $CL $CM*; |
121 $CPcm = $CP $CM*; | 128 $CPcm = $CP $CM*; |
122 $EXcm = $EX $CM*; | 129 $EXcm = $EX $CM*; |
123 $GLcm = $GL $CM*; | 130 $GLcm = $GL $CM*; |
124 $HYcm = $HY $CM*; | 131 $HYcm = $HY $CM*; |
125 $H2cm = $H2 $CM*; | 132 $H2cm = $H2 $CM*; |
126 $H3cm = $H3 $CM*; | 133 $H3cm = $H3 $CM*; |
127 $IDcm = $ID $CM*; | 134 $IDcm = $ID $CM*; |
(...skipping 13 matching lines...) Expand all Loading... |
141 | 148 |
142 ## ------------------------------------------------- | 149 ## ------------------------------------------------- |
143 | 150 |
144 !!forward; | 151 !!forward; |
145 | 152 |
146 # | 153 # |
147 # Each class of character can stand by itself as an unbroken token, with traili
ng combining stuff | 154 # Each class of character can stand by itself as an unbroken token, with traili
ng combining stuff |
148 # | 155 # |
149 $ALPlus $CM+; | 156 $ALPlus $CM+; |
150 $BA $CM+; | 157 $BA $CM+; |
| 158 $HH $CM+; |
151 $BB $CM+; | 159 $BB $CM+; |
152 $B2 $CM+; | 160 $B2 $CM+; |
153 $CL $CM+; | 161 $CL $CM+; |
154 $CP $CM+; | 162 $CP $CM+; |
155 $EX $CM+; | 163 $EX $CM+; |
156 $GL $CM+; | 164 $GL $CM+; |
157 $HY $CM+; | 165 $HY $CM+; |
158 $H2 $CM+; | 166 $H2 $CM+; |
159 $H3 $CM+; | 167 $H3 $CM+; |
160 $ID $CM+; | 168 $ID $CM+; |
(...skipping 22 matching lines...) Expand all Loading... |
183 $CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs | 191 $CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs |
184 $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs | 192 $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs |
185 | 193 |
186 # | 194 # |
187 # AL_FOLLOW set of chars that can unconditionally follow an AL | 195 # AL_FOLLOW set of chars that can unconditionally follow an AL |
188 # Needed in rules where stand-alone $CM s are treated as AL. | 196 # Needed in rules where stand-alone $CM s are treated as AL. |
189 # Chaining is disabled with CM because it causes other failures, | 197 # Chaining is disabled with CM because it causes other failures, |
190 # so for this one case we need to manually list out longer sequences. | 198 # so for this one case we need to manually list out longer sequences. |
191 # | 199 # |
192 $AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP]; | 200 $AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP]; |
193 $AL_FOLLOW_CM = [$CL $CP $EX $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPl
us]; | 201 $AL_FOLLOW_CM = [$CL $CP $EX $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $
ALPlus]; |
194 $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; | 202 $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; |
195 | 203 |
196 | 204 |
197 # | 205 # |
198 # Rule LB 4, 5 Mandatory (Hard) breaks. | 206 # Rule LB 4, 5 Mandatory (Hard) breaks. |
199 # | 207 # |
200 $LB4Breaks = [$BK $CR $LF $NL]; | 208 $LB4Breaks = [$BK $CR $LF $NL]; |
201 $LB4NonBreaks = [^$BK $CR $LF $NL]; | 209 $LB4NonBreaks = [^$BK $CR $LF $NL]; |
202 $CR $LF {100}; | 210 $CR $LF {100}; |
203 | 211 |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
245 # LB 12 Do not break after NBSP and related characters. | 253 # LB 12 Do not break after NBSP and related characters. |
246 # GL x | 254 # GL x |
247 # | 255 # |
248 $GLcm $CAN_CM $CM*; | 256 $GLcm $CAN_CM $CM*; |
249 $GLcm $CANT_CM; | 257 $GLcm $CANT_CM; |
250 | 258 |
251 # | 259 # |
252 # LB 12a Do not break before NBSP and related characters ... | 260 # LB 12a Do not break before NBSP and related characters ... |
253 # [^SP BA HY] x GL | 261 # [^SP BA HY] x GL |
254 # | 262 # ($HH is U+2010, removed from $BA in this file, so it is excluded explicitly.) |
255 [[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm; | 263 [[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GLcm; |
256 $CM+ GLcm; | 264 $CM+ $GLcm; # by rule 10, stand-alone CM behaves as AL |
257 | 265 |
258 | 266 |
259 | 267 |
260 # | 268 # |
261 # LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. | 269 # LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. |
262 # | 270 # |
263 $LB8NonBreaks $CL; | 271 $LB8NonBreaks $CL; |
264 $CAN_CM $CM* $CL; | 272 $CAN_CM $CM* $CL; |
265 $CM+ $CL; # by rule 10, stand-alone CM behaves as AL | 273 $CM+ $CL; # by rule 10, stand-alone CM behaves as AL |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
318 | 326 |
319 # LB 20 | 327 # LB 20 |
320 # <break> $CB | 328 # <break> $CB |
321 # $CB <break> | 329 # $CB <break> |
322 | 330 |
323 $LB20NonBreaks = [$LB18NonBreaks - $CB]; | 331 $LB20NonBreaks = [$LB18NonBreaks - $CB]; |
324 | 332 |
325 # LB 21 x (BA | HY | NS) | 333 # LB 21 x (BA | HY | NS) |
326 # BB x | 334 # BB x |
327 # | 335 # |
328 $LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); | 336 $LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm); |
329 | 337 |
330 $BBcm [^$CB]; # $BB x | 338 $BBcm [^$CB]; # $BB x |
331 $BBcm $LB20NonBreaks $CM*; | 339 $BBcm $LB20NonBreaks $CM*; |
332 | 340 |
333 # LB 22 | 341 # LB 22 |
334 $ALcm $INcm; | 342 $ALcm $INcm; |
335 $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL | 343 $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL |
336 $IDcm $INcm; | 344 $IDcm $INcm; |
337 $INcm $INcm; | 345 $INcm $INcm; |
338 $NUcm $INcm; | 346 $NUcm $INcm; |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
374 $CM+ $ALcm; # The $CM+ is from rule 10, an unattached CM is treated as AL | 382 $CM+ $ALcm; # The $CM+ is from rule 10, an unattached CM is treated as AL |
375 | 383 |
376 # LB 29 | 384 # LB 29 |
377 $IScm $ALcm; | 385 $IScm $ALcm; |
378 | 386 |
379 # LB 30 | 387 # LB 30 |
380 ($ALcm | $NUcm) $OPcm; | 388 ($ALcm | $NUcm) $OPcm; |
381 $CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as A
L. | 389 $CM+ $OPcm; # The $CM+ is from rule 10, an unattached CM is treated as A
L. |
382 $CPcm ($ALcm | $NUcm); | 390 $CPcm ($ALcm | $NUcm); |
383 | 391 |
| 392 # (LB 31) Chrome tailoring (behavior change): do not break after a hyphen (HY or U+2010) that follows a Hebrew letter and precedes an alphabetic. |
| 393 $HLcm ($HY | $HH) $ALcm; |
384 | 394 |
385 # | 395 # |
386 # Reverse Rules. | 396 # Reverse Rules. |
387 # | 397 # |
388 ## ------------------------------------------------- | 398 ## ------------------------------------------------- |
389 | 399 |
390 !!reverse; | 400 !!reverse; |
391 | 401 |
392 $CM+ $ALPlus; | 402 $CM+ $ALPlus; |
393 $CM+ $BA; | 403 $CM+ $BA; |
| 404 $CM+ $HH; |
394 $CM+ $BB; | 405 $CM+ $BB; |
395 $CM+ $B2; | 406 $CM+ $B2; |
396 $CM+ $CL; | 407 $CM+ $CL; |
397 $CM+ $CP; | 408 $CM+ $CP; |
398 $CM+ $EX; | 409 $CM+ $EX; |
399 $CM+ $GL; | 410 $CM+ $GL; |
400 $CM+ $HY; | 411 $CM+ $HY; |
401 $CM+ $H2; | 412 $CM+ $H2; |
402 $CM+ $H3; | 413 $CM+ $H3; |
403 $CM+ $ID; | 414 $CM+ $ID; |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
472 # LB 11 | 483 # LB 11 |
473 $CM* $WJ $CM* $CAN_CM; | 484 $CM* $WJ $CM* $CAN_CM; |
474 $CM* $WJ [$LB8NonBreaks-$CM]; | 485 $CM* $WJ [$LB8NonBreaks-$CM]; |
475 | 486 |
476 $CANT_CM $CM* $WJ; | 487 $CANT_CM $CM* $WJ; |
477 $CM* $CAN_CM $CM* $WJ; | 488 $CM* $CAN_CM $CM* $WJ; |
478 | 489 |
479 # LB 12a | 490 # LB 12a |
480 # [^SP BA HY] x GL | 491 # [^SP BA HY] x GL |
481 # | 492 # |
482 $CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; | 493 $CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]]; |
483 | 494 |
484 # LB 12 | 495 # LB 12 |
485 # GL x | 496 # GL x |
486 # | 497 # |
487 $CANT_CM $CM* $GL; | 498 $CANT_CM $CM* $GL; |
488 $CM* $CAN_CM $CM* $GL; | 499 $CM* $CAN_CM $CM* $GL; |
489 | 500 |
490 | 501 |
491 # LB 13 | 502 # LB 13 |
492 $CL $CM+ $CAN_CM; | 503 $CL $CM+ $CAN_CM; |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
542 | 553 |
543 $CM* $CAN_CM $CM* $QU; # QU x . | 554 $CM* $CAN_CM $CM* $QU; # QU x . |
544 $CANT_CM $CM* $QU; | 555 $CANT_CM $CM* $QU; |
545 | 556 |
546 # | 557 # |
547 # LB 20 Break before and after CB. | 558 # LB 20 Break before and after CB. |
548 # nothing needed here. | 559 # nothing needed here. |
549 # | 560 # |
550 | 561 |
551 # LB 21 | 562 # LB 21 |
552 $CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) | 563 $CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS
) |
553 | 564 |
554 $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . | 565 $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . |
555 [^$CB] $CM* $BB; # | 566 [^$CB] $CM* $BB; # |
556 | 567 |
557 | 568 |
558 | 569 |
559 # LB 22 | 570 # LB 22 |
560 $CM* $IN $CM* $ALPlus; | 571 $CM* $IN $CM* $ALPlus; |
561 $CM* $IN $CM* $ID; | 572 $CM* $IN $CM* $ID; |
562 $CM* $IN $CM* $IN; | 573 $CM* $IN $CM* $IN; |
(...skipping 27 matching lines...) Expand all Loading... |
590 $CM* $ALPlus $CM* $ALPlus; | 601 $CM* $ALPlus $CM* $ALPlus; |
591 | 602 |
592 | 603 |
593 # LB 29 | 604 # LB 29 |
594 $CM* $ALPlus $CM* $IS; | 605 $CM* $ALPlus $CM* $IS; |
595 | 606 |
596 # LB 30 | 607 # LB 30 |
597 $CM* $OP $CM* ($ALPlus | $NU); | 608 $CM* $OP $CM* ($ALPlus | $NU); |
598 $CM* ($ALPlus | $NU) $CM* $CP; | 609 $CM* ($ALPlus | $NU) $CM* $CP; |
599 | 610 |
| 611 # (LB 31) Chrome tailoring (behavior change), reverse form: do not break after a hyphen (HY or U+2010) that follows a Hebrew letter and precedes an alphabetic. |
| 612 $CM* $ALPlus ($HY | $HH) $CM* $HL; |
600 | 613 |
601 ## ------------------------------------------------- | 614 ## ------------------------------------------------- |
602 | 615 |
603 !!safe_reverse; | 616 !!safe_reverse; |
604 | 617 |
605 # LB 9 | 618 # LB 9 |
606 $CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; | 619 $CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; |
607 $CM+ $SP / .; | 620 $CM+ $SP / .; |
608 | 621 |
609 # LB 14 | 622 # LB 14 |
(...skipping 24 matching lines...) Expand all Loading... |
634 # of context. | 647 # of context. |
635 # | 648 # |
636 # It might be slightly more efficient to have specific rules | 649 # It might be slightly more efficient to have specific rules |
637 # instead of one generic one, but only if we could | 650 # instead of one generic one, but only if we could |
638 # turn off rule chaining. We don't want to move more | 651 # turn off rule chaining. We don't want to move more |
639 # than necessary. | 652 # than necessary. |
640 # | 653 # |
641 [$CM $OP $QU $CL $CP $B2 $PR $HY $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR
$HY $dictionary]; | 654 [$CM $OP $QU $CL $CP $B2 $PR $HY $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR
$HY $dictionary]; |
642 $dictionary $dictionary; | 655 $dictionary $dictionary; |
643 | 656 |
OLD | NEW |