OLD | NEW |
1 # Copyright (c) 2002-2013 International Business Machines Corporation and | 1 # Copyright (c) 2002-2015 International Business Machines Corporation and |
2 # others. All Rights Reserved. | 2 # others. All Rights Reserved. |
3 # | 3 # |
4 # file: line_fi.txt | 4 # file: line_loose_fi.txt |
5 # | 5 # |
6 # Line Breaking Rules | 6 # Line Breaking Rules |
7 # Implement default line breaking as defined by | 7 # Implement default line breaking as defined by |
8 # Unicode Standard Annex #14 Revision 29 for Unicode 6.2 | 8 # Unicode Standard Annex #14 Revision 34 for Unicode 8.0 |
9 # http://www.unicode.org/reports/tr14/ | 9 # http://www.unicode.org/reports/tr14/ |
| 10 # tailored as noted in 2nd paragraph below. |
10 # | 11 # |
11 # TODO: Rule LB 8 remains as it was in Unicode 5.2 | 12 # TODO: Rule LB 8 remains as it was in Unicode 5.2 |
12 # This is only because of a limitation of ICU break engine implementatio
n, | 13 # This is only because of a limitation of ICU break engine implementatio
n, |
13 # not because the older behavior is desirable. | 14 # not because the older behavior is desirable. |
| 15 # |
| 16 # This tailors the line break behavior both for Finnish and to correspond
to CSS |
| 17 # line-break=loose (BCP47 -u-lb-loose) as defined for languages other th
an |
| 18 # Chinese & Japanese. |
| 19 # It sets characters of class CJ to behave like ID. |
| 20 # In addition, it allows breaks before 3005, 303B, 309D, 309E, 30FD, 30F
E (all NS). |
14 | 21 |
15 # | 22 # |
16 # Character Classes defined by TR 14. | 23 # Character Classes defined by TR 14. |
17 # | 24 # |
18 | 25 |
19 !!chain; | 26 !!chain; |
20 !!LBCMNoChain; | 27 !!LBCMNoChain; |
21 | 28 |
22 | 29 |
23 !!lookAheadHardBreak; | 30 !!lookAheadHardBreak; |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
68 $CL = [:LineBreak = Close_Punctuation:]; | 75 $CL = [:LineBreak = Close_Punctuation:]; |
69 $CM = [:LineBreak = Combining_Mark:]; | 76 $CM = [:LineBreak = Combining_Mark:]; |
70 $CP = [:LineBreak = Close_Parenthesis:]; | 77 $CP = [:LineBreak = Close_Parenthesis:]; |
71 $CR = [:LineBreak = Carriage_Return:]; | 78 $CR = [:LineBreak = Carriage_Return:]; |
72 $EX = [:LineBreak = Exclamation:]; | 79 $EX = [:LineBreak = Exclamation:]; |
73 $GL = [:LineBreak = Glue:]; | 80 $GL = [:LineBreak = Glue:]; |
74 $HL = [:LineBreak = Hebrew_Letter:]; | 81 $HL = [:LineBreak = Hebrew_Letter:]; |
75 $HY = [:LineBreak = Hyphen:]; | 82 $HY = [:LineBreak = Hyphen:]; |
76 $H2 = [:LineBreak = H2:]; | 83 $H2 = [:LineBreak = H2:]; |
77 $H3 = [:LineBreak = H3:]; | 84 $H3 = [:LineBreak = H3:]; |
78 $ID = [:LineBreak = Ideographic:]; | 85 $ID = [[:LineBreak = Ideographic:] $CJ]; |
79 $IN = [:LineBreak = Inseperable:]; | 86 $IN = [:LineBreak = Inseperable:]; |
80 $IS = [:LineBreak = Infix_Numeric:]; | 87 $IS = [:LineBreak = Infix_Numeric:]; |
81 $JL = [:LineBreak = JL:]; | 88 $JL = [:LineBreak = JL:]; |
82 $JV = [:LineBreak = JV:]; | 89 $JV = [:LineBreak = JV:]; |
83 $JT = [:LineBreak = JT:]; | 90 $JT = [:LineBreak = JT:]; |
84 $LF = [:LineBreak = Line_Feed:]; | 91 $LF = [:LineBreak = Line_Feed:]; |
85 $NL = [:LineBreak = Next_Line:]; | 92 $NL = [:LineBreak = Next_Line:]; |
86 $NS = [[:LineBreak = Nonstarter:] $CJ]; | 93 $NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE]; |
| 94 $NS = [[:LineBreak = Nonstarter:] - $NSX]; |
87 $NU = [:LineBreak = Numeric:]; | 95 $NU = [:LineBreak = Numeric:]; |
88 $OP = [:LineBreak = Open_Punctuation:]; | 96 $OP = [:LineBreak = Open_Punctuation:]; |
89 $PO = [:LineBreak = Postfix_Numeric:]; | 97 $PO = [:LineBreak = Postfix_Numeric:]; |
90 $PR = [:LineBreak = Prefix_Numeric:]; | 98 $PR = [:LineBreak = Prefix_Numeric:]; |
91 $QU = [:LineBreak = Quotation:]; | 99 $QU = [:LineBreak = Quotation:]; |
92 $RI = [:LineBreak = Regional_Indicator:]; | 100 $RI = [:LineBreak = Regional_Indicator:]; |
93 $SA = [:LineBreak = Complex_Context:]; | 101 $SA = [:LineBreak = Complex_Context:]; |
94 $SG = [:LineBreak = Surrogate:]; | 102 $SG = [:LineBreak = Surrogate:]; |
95 $SP = [:LineBreak = Space:]; | 103 $SP = [:LineBreak = Space:]; |
96 $SY = [:LineBreak = Break_Symbols:]; | 104 $SY = [:LineBreak = Break_Symbols:]; |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
130 $HYcm = $HY $CM*; | 138 $HYcm = $HY $CM*; |
131 $H2cm = $H2 $CM*; | 139 $H2cm = $H2 $CM*; |
132 $H3cm = $H3 $CM*; | 140 $H3cm = $H3 $CM*; |
133 $IDcm = $ID $CM*; | 141 $IDcm = $ID $CM*; |
134 $INcm = $IN $CM*; | 142 $INcm = $IN $CM*; |
135 $IScm = $IS $CM*; | 143 $IScm = $IS $CM*; |
136 $JLcm = $JL $CM*; | 144 $JLcm = $JL $CM*; |
137 $JVcm = $JV $CM*; | 145 $JVcm = $JV $CM*; |
138 $JTcm = $JT $CM*; | 146 $JTcm = $JT $CM*; |
139 $NScm = $NS $CM*; | 147 $NScm = $NS $CM*; |
| 148 $NSXcm = $NSX $CM*; |
140 $NUcm = $NU $CM*; | 149 $NUcm = $NU $CM*; |
141 $OPcm = $OP $CM*; | 150 $OPcm = $OP $CM*; |
142 $POcm = $PO $CM*; | 151 $POcm = $PO $CM*; |
143 $PRcm = $PR $CM*; | 152 $PRcm = $PR $CM*; |
144 $QUcm = $QU $CM*; | 153 $QUcm = $QU $CM*; |
145 $RIcm = $RI $CM*; | 154 $RIcm = $RI $CM*; |
146 $SYcm = $SY $CM*; | 155 $SYcm = $SY $CM*; |
147 $WJcm = $WJ $CM*; | 156 $WJcm = $WJ $CM*; |
148 | 157 |
149 ## ------------------------------------------------- | 158 ## ------------------------------------------------- |
(...skipping 16 matching lines...) Expand all Loading... |
166 $HY $CM+; | 175 $HY $CM+; |
167 $H2 $CM+; | 176 $H2 $CM+; |
168 $H3 $CM+; | 177 $H3 $CM+; |
169 $ID $CM+; | 178 $ID $CM+; |
170 $IN $CM+; | 179 $IN $CM+; |
171 $IS $CM+; | 180 $IS $CM+; |
172 $JL $CM+; | 181 $JL $CM+; |
173 $JV $CM+; | 182 $JV $CM+; |
174 $JT $CM+; | 183 $JT $CM+; |
175 $NS $CM+; | 184 $NS $CM+; |
| 185 $NSX $CM+; |
176 $NU $CM+; | 186 $NU $CM+; |
177 $OP $CM+; | 187 $OP $CM+; |
178 $PO $CM+; | 188 $PO $CM+; |
179 $PR $CM+; | 189 $PR $CM+; |
180 $QU $CM+; | 190 $QU $CM+; |
181 $RI $CM+; | 191 $RI $CM+; |
182 $SY $CM+; | 192 $SY $CM+; |
183 $WJ $CM+; | 193 $WJ $CM+; |
184 | 194 |
185 # | 195 # |
186 # CAN_CM is the set of characters that may combine with CM combining chars. | 196 # CAN_CM is the set of characters that may combine with CM combining chars. |
187 # Note that Linebreak UAX 14's concept of a combining char and the rules | 197 # Note that Linebreak UAX 14's concept of a combining char and the rules |
188 # for what they can combine with are _very_ different from the rest of U
nicode. | 198 # for what they can combine with are _very_ different from the rest of U
nicode. |
189 # | 199 # |
190 # Note that $CM itself is left out of this set. If CM is needed as a ba
se | 200 # Note that $CM itself is left out of this set. If CM is needed as a ba
se |
191 # it must be listed separately in the rule. | 201 # it must be listed separately in the rule. |
192 # | 202 # |
193 $CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs | 203 $CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs |
194 $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs | 204 $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs |
195 | 205 |
196 # | 206 # |
197 # AL_FOLLOW set of chars that can unconditionally follow an AL | 207 # AL_FOLLOW set of chars that can unconditionally follow an AL |
198 # Needed in rules where stand-alone $CM s are treated as AL. | 208 # Needed in rules where stand-alone $CM s are treated as AL. |
199 # Chaining is disabled with CM because it causes other failures, | 209 # Chaining is disabled with CM because it causes other failures, |
200 # so for this one case we need to manually list out longer sequences. | 210 # so for this one case we need to manually list out longer sequences. |
201 # | 211 # |
202 $AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP]; | 212 $AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP]; |
203 $AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $
NU $ALPlus]; | 213 $AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $NSX
$IN $NU $ALPlus]; |
204 $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; | 214 $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; |
205 | 215 |
206 | 216 |
207 # | 217 # |
208 # Rule LB 4, 5 Mandatory (Hard) breaks. | 218 # Rule LB 4, 5 Mandatory (Hard) breaks. |
209 # | 219 # |
210 $LB4Breaks = [$BK $CR $LF $NL]; | 220 $LB4Breaks = [$BK $CR $LF $NL]; |
211 $LB4NonBreaks = [^$BK $CR $LF $NL]; | 221 $LB4NonBreaks = [^$BK $CR $LF $NL]; |
212 $CR $LF {100}; | 222 $CR $LF {100}; |
213 | 223 |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
296 # | 306 # |
297 $OPcm $SP* $CAN_CM $CM*; | 307 $OPcm $SP* $CAN_CM $CM*; |
298 $OPcm $SP* $CANT_CM; | 308 $OPcm $SP* $CANT_CM; |
299 | 309 |
300 $OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL | 310 $OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL |
301 | 311 |
302 # LB 15 | 312 # LB 15 |
303 $QUcm $SP* $OPcm; | 313 $QUcm $SP* $OPcm; |
304 | 314 |
305 # LB 16 | 315 # LB 16 |
| 316 # Do not break between closing punctuation and $NS, even with intervening spaces |
| 317 # But DO allow a break between closing punctuation and $NSX, so don't include it
here |
306 ($CLcm | $CPcm) $SP* $NScm; | 318 ($CLcm | $CPcm) $SP* $NScm; |
307 | 319 |
308 # LB 17 | 320 # LB 17 |
309 $B2cm $SP* $B2cm; | 321 $B2cm $SP* $B2cm; |
310 | 322 |
311 # | 323 # |
312 # LB 18 Break after spaces. | 324 # LB 18 Break after spaces. |
313 # | 325 # |
314 $LB18NonBreaks = [$LB8NonBreaks - [$SP]]; | 326 $LB18NonBreaks = [$LB8NonBreaks - [$SP]]; |
315 $LB18Breaks = [$LB8Breaks $SP]; | 327 $LB18Breaks = [$LB8Breaks $SP]; |
(...skipping 13 matching lines...) Expand all Loading... |
329 # LB 20 | 341 # LB 20 |
330 # <break> $CB | 342 # <break> $CB |
331 # $CB <break> | 343 # $CB <break> |
332 | 344 |
333 $LB20NonBreaks = [$LB18NonBreaks - $CB]; | 345 $LB20NonBreaks = [$LB18NonBreaks - $CB]; |
334 | 346 |
335 # LB 20.09 added rule for Finnish tailoring | 347 # LB 20.09 added rule for Finnish tailoring |
336 # LB 21 x (BA | HY | NS) | 348 # LB 21 x (BA | HY | NS) |
337 # BB x | 349 # BB x |
338 # | 350 # |
| 351 # DO allow breaks here before NSXcm, so don't include it |
339 $LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL; | 352 $LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm) / $AL; |
340 $LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm); | 353 $LB20NonBreaks $CM* ($BAcm | $HHcm | $HYcm | $NScm); |
341 ($HY | $HH) $AL; | 354 ($HY | $HH) $AL; |
342 | 355 |
343 $BBcm [^$CB]; # $BB x | 356 $BBcm [^$CB]; # $BB x |
344 $BBcm $LB20NonBreaks $CM*; | 357 $BBcm $LB20NonBreaks $CM*; |
345 | 358 |
346 # LB 21a Don't break after Hebrew + Hyphen | 359 # LB 21a Don't break after Hebrew + Hyphen |
347 # HL (HY | BA) x | 360 # HL (HY | BA) x |
348 # | 361 # |
349 $HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?; | 362 $HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?; |
350 | 363 |
351 # LB 21b (forward) Don't break between SY and HL | 364 # LB 21b (forward) Don't break between SY and HL |
352 # (break between HL and SY already disallowed by LB 13 above) | 365 # (break between HL and SY already disallowed by LB 13 above) |
353 $SYcm $HLcm; | 366 $SYcm $HLcm; |
354 | 367 |
355 # LB 22 | 368 # LB 22 |
356 ($ALcm | $HLcm) $INcm; | 369 ($ALcm | $HLcm) $INcm; |
357 $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL | 370 $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL |
| 371 $EXcm $INcm; |
358 $IDcm $INcm; | 372 $IDcm $INcm; |
359 $INcm $INcm; | 373 $INcm $INcm; |
360 $NUcm $INcm; | 374 $NUcm $INcm; |
361 | 375 |
362 | 376 |
363 # LB 23 | 377 # LB 23 |
364 $IDcm $POcm; | 378 $IDcm $POcm; |
365 $ALcm $NUcm; # includes $LB19 | 379 $ALcm $NUcm; # includes $LB19 |
366 $HLcm $NUcm; | 380 $HLcm $NUcm; |
367 $CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL | 381 $CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
428 $CM+ $HY; | 442 $CM+ $HY; |
429 $CM+ $H2; | 443 $CM+ $H2; |
430 $CM+ $H3; | 444 $CM+ $H3; |
431 $CM+ $ID; | 445 $CM+ $ID; |
432 $CM+ $IN; | 446 $CM+ $IN; |
433 $CM+ $IS; | 447 $CM+ $IS; |
434 $CM+ $JL; | 448 $CM+ $JL; |
435 $CM+ $JV; | 449 $CM+ $JV; |
436 $CM+ $JT; | 450 $CM+ $JT; |
437 $CM+ $NS; | 451 $CM+ $NS; |
| 452 $CM+ $NSX; |
438 $CM+ $NU; | 453 $CM+ $NU; |
439 $CM+ $OP; | 454 $CM+ $OP; |
440 $CM+ $PO; | 455 $CM+ $PO; |
441 $CM+ $PR; | 456 $CM+ $PR; |
442 $CM+ $QU; | 457 $CM+ $QU; |
443 $CM+ $RI; | 458 $CM+ $RI; |
444 $CM+ $SY; | 459 $CM+ $SY; |
445 $CM+ $WJ; | 460 $CM+ $WJ; |
446 $CM+; | 461 $CM+; |
447 | 462 |
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
546 $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; | 561 $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; |
547 $CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; | 562 $CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; |
548 $SY $CM $SP+ $OP; # TODO: Experiment. Remove. | 563 $SY $CM $SP+ $OP; # TODO: Experiment. Remove. |
549 | 564 |
550 | 565 |
551 | 566 |
552 # LB 15 | 567 # LB 15 |
553 $CM* $OP $SP* $CM* $QU; | 568 $CM* $OP $SP* $CM* $QU; |
554 | 569 |
555 # LB 16 | 570 # LB 16 |
| 571 # Don't include $NSX here |
556 $CM* $NS $SP* $CM* ($CL | $CP); | 572 $CM* $NS $SP* $CM* ($CL | $CP); |
557 | 573 |
558 # LB 17 | 574 # LB 17 |
559 $CM* $B2 $SP* $CM* $B2; | 575 $CM* $B2 $SP* $CM* $B2; |
560 | 576 |
561 # LB 18 break after spaces | 577 # LB 18 break after spaces |
562 # Nothing explicit needed here. | 578 # Nothing explicit needed here. |
563 | 579 |
564 | 580 |
565 # | 581 # |
566 # LB 19 | 582 # LB 19 |
567 # | 583 # |
568 $CM* $QU $CM* $CAN_CM; # . x QU | 584 $CM* $QU $CM* $CAN_CM; # . x QU |
569 $CM* $QU $LB18NonBreaks; | 585 $CM* $QU $LB18NonBreaks; |
570 | 586 |
571 | 587 |
572 $CM* $CAN_CM $CM* $QU; # QU x . | 588 $CM* $CAN_CM $CM* $QU; # QU x . |
573 $CANT_CM $CM* $QU; | 589 $CANT_CM $CM* $QU; |
574 | 590 |
575 # | 591 # |
576 # LB 20 Break before and after CB. | 592 # LB 20 Break before and after CB. |
577 # nothing needed here. | 593 # nothing needed here. |
578 # | 594 # |
579 | 595 |
580 # LB 20.09 added rule for Finnish tailoring | 596 # LB 20.09 added rule for Finnish tailoring |
581 $AL ($HY | $HH) / $SP; | 597 $AL ($HY | $HH) / $SP; |
582 | 598 |
583 # LB 21 | 599 # LB 21 |
| 600 # Don't include $NSX here |
584 $CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS
) | 601 $CM* ($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS
) |
585 | 602 |
586 $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . | 603 $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . |
587 [^$CB] $CM* $BB; # | 604 [^$CB] $CM* $BB; # |
588 | 605 |
589 # LB21a | 606 # LB21a |
590 [^$CB] $CM* ($HY | $BA | $HH) $CM* $HL; | 607 [^$CB] $CM* ($HY | $BA | $HH) $CM* $HL; |
591 | 608 |
592 # LB21b (reverse) | 609 # LB21b (reverse) |
593 $CM* $HL $CM* $SY; | 610 $CM* $HL $CM* $SY; |
594 | 611 |
595 # LB 22 | 612 # LB 22 |
596 $CM* $IN $CM* ($ALPlus | $HL); | 613 $CM* $IN $CM* ($ALPlus | $HL); |
| 614 $CM* $IN $CM* $EX; |
597 $CM* $IN $CM* $ID; | 615 $CM* $IN $CM* $ID; |
598 $CM* $IN $CM* $IN; | 616 $CM* $IN $CM* $IN; |
599 $CM* $IN $CM* $NU; | 617 $CM* $IN $CM* $NU; |
600 | 618 |
601 # LB 23 | 619 # LB 23 |
602 $CM* $PO $CM* $ID; | 620 $CM* $PO $CM* $ID; |
603 $CM* $NU $CM* ($ALPlus | $HL); | 621 $CM* $NU $CM* ($ALPlus | $HL); |
604 $CM* ($ALPlus | $HL) $CM* $NU; | 622 $CM* ($ALPlus | $HL) $CM* $NU; |
605 | 623 |
606 # LB 24 | 624 # LB 24 |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
675 # of context. | 693 # of context. |
676 # | 694 # |
677 # It might be slightly more efficient to have specific rules | 695 # It might be slightly more efficient to have specific rules |
678 # instead of one generic one, but only if we could | 696 # instead of one generic one, but only if we could |
679 # turn off rule chaining. We don't want to move more | 697 # turn off rule chaining. We don't want to move more |
680 # than necessary. | 698 # than necessary. |
681 # | 699 # |
682 [$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2
$PR $HY $BA $dictionary]; | 700 [$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2
$PR $HY $BA $dictionary]; |
683 $dictionary $dictionary; | 701 $dictionary $dictionary; |
684 | 702 |
OLD | NEW |