OLD | NEW |
1 # Copyright (c) 2002-2013 International Business Machines Corporation and | 1 # Copyright (c) 2002-2015 International Business Machines Corporation and |
2 # others. All Rights Reserved. | 2 # others. All Rights Reserved. |
3 # | 3 # |
4 # file: line_ja.txt | 4 # file: line_loose_cj.txt |
5 # | 5 # |
6 # Line Breaking Rules | 6 # Line Breaking Rules |
7 # Implement default line breaking as defined by | 7 # Implement default line breaking as defined by |
8 # Unicode Standard Annex #14 Revision 29 for Unicode 6.2 | 8 # Unicode Standard Annex #14 Revision 34 for Unicode 8.0 |
9 # http://www.unicode.org/reports/tr14/ | 9 # http://www.unicode.org/reports/tr14/ |
| 10 # tailored as noted in 2nd paragraph below. |
10 # | 11 # |
11 # TODO: Rule LB 8 remains as it was in Unicode 5.2 | 12 # TODO: Rule LB 8 remains as it was in Unicode 5.2 |
12 # This is only because of a limitation of ICU break engine implementation, | 13 # This is only because of a limitation of ICU break engine implementation, |
13 # not because the older behavior is desirable. | 14 # not because the older behavior is desirable. |
| 15 # |
| 16 # This tailors the line break behavior to correspond to CSS |
| 17 # line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. |
| 18 # It sets characters of class CJ to behave like ID. |
| 19 # In addition, it allows breaks: |
| 20 # * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) |
| 21 # * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS) |
| 22 # * between characters of LineBreak class IN such as 2026 |
| 23 # * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B, |
| 24 # FF65 (all NS) and FF01, FF1F (both EX). |
| 25 # * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W; |
| 26 # this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0 |
| 27 # * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W; |
| 28 # this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6 |
| 29 |
14 | 30 |
15 # | 31 # |
16 # Character Classes defined by TR 14. | 32 # Character Classes defined by TR 14. |
17 # | 33 # |
18 | 34 |
19 !!chain; | 35 !!chain; |
20 !!LBCMNoChain; | 36 !!LBCMNoChain; |
21 | 37 |
22 | 38 |
23 !!lookAheadHardBreak; | 39 !!lookAheadHardBreak; |
(...skipping 27 matching lines...) Expand all Loading... |
51 # !!lookAheadHardBreak forces the run time state machine to | 67 # !!lookAheadHardBreak forces the run time state machine to |
52 # stop immediately when a look ahead rule ( '/' operator) matches, | 68 # stop immediately when a look ahead rule ( '/' operator) matches, |
53 # and set the match position to that of the look-ahead operator, | 69 # and set the match position to that of the look-ahead operator, |
54 # no matter what other rules may be in play at the time. | 70 # no matter what other rules may be in play at the time. |
55 # | 71 # |
56 # See rule LB 19 for an example. | 72 # See rule LB 19 for an example. |
57 # | 73 # |
58 | 74 |
59 $AI = [:LineBreak = Ambiguous:]; | 75 $AI = [:LineBreak = Ambiguous:]; |
60 $AL = [:LineBreak = Alphabetic:]; | 76 $AL = [:LineBreak = Alphabetic:]; |
61 $BA = [:LineBreak = Break_After:]; | 77 $BAX = [\u2010 \u2013]; |
| 78 $BA = [[:LineBreak = Break_After:] - $BAX]; |
62 $BB = [:LineBreak = Break_Before:]; | 79 $BB = [:LineBreak = Break_Before:]; |
63 $BK = [:LineBreak = Mandatory_Break:]; | 80 $BK = [:LineBreak = Mandatory_Break:]; |
64 $B2 = [:LineBreak = Break_Both:]; | 81 $B2 = [:LineBreak = Break_Both:]; |
65 $CB = [:LineBreak = Contingent_Break:]; | 82 $CB = [:LineBreak = Contingent_Break:]; |
66 $CJ = [:LineBreak = Conditional_Japanese_Starter:]; | 83 $CJ = [:LineBreak = Conditional_Japanese_Starter:]; |
67 $CL = [:LineBreak = Close_Punctuation:]; | 84 $CL = [:LineBreak = Close_Punctuation:]; |
68 $CM = [:LineBreak = Combining_Mark:]; | 85 $CM = [:LineBreak = Combining_Mark:]; |
69 $CP = [:LineBreak = Close_Parenthesis:]; | 86 $CP = [:LineBreak = Close_Parenthesis:]; |
70 $CR = [:LineBreak = Carriage_Return:]; | 87 $CR = [:LineBreak = Carriage_Return:]; |
71 $EX = [:LineBreak = Exclamation:]; | 88 $EXX = [\uFF01 \uFF1F]; |
| 89 $EX = [[:LineBreak = Exclamation:] - $EXX]; |
72 $GL = [:LineBreak = Glue:]; | 90 $GL = [:LineBreak = Glue:]; |
73 $HL = [:LineBreak = Hebrew_Letter:]; | 91 $HL = [:LineBreak = Hebrew_Letter:]; |
74 $HY = [:LineBreak = Hyphen:]; | 92 $HY = [:LineBreak = Hyphen:]; |
75 $H2 = [:LineBreak = H2:]; | 93 $H2 = [:LineBreak = H2:]; |
76 $H3 = [:LineBreak = H3:]; | 94 $H3 = [:LineBreak = H3:]; |
77 $ID = [[:LineBreak = Ideographic:] $CJ]; | 95 $ID = [[:LineBreak = Ideographic:] $CJ]; |
78 $IN = [:LineBreak = Inseperable:]; | 96 $IN = [:LineBreak = Inseperable:]; |
79 $IS = [:LineBreak = Infix_Numeric:]; | 97 $IS = [:LineBreak = Infix_Numeric:]; |
80 $JL = [:LineBreak = JL:]; | 98 $JL = [:LineBreak = JL:]; |
81 $JV = [:LineBreak = JV:]; | 99 $JV = [:LineBreak = JV:]; |
82 $JT = [:LineBreak = JT:]; | 100 $JT = [:LineBreak = JT:]; |
83 $LF = [:LineBreak = Line_Feed:]; | 101 $LF = [:LineBreak = Line_Feed:]; |
84 $NL = [:LineBreak = Next_Line:]; | 102 $NL = [:LineBreak = Next_Line:]; |
85 $NS = [:LineBreak = Nonstarter:]; | 103 $NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u
2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65]; |
| 104 $NS = [[:LineBreak = Nonstarter:] - $NSX]; |
86 $NU = [:LineBreak = Numeric:]; | 105 $NU = [:LineBreak = Numeric:]; |
87 $OP = [:LineBreak = Open_Punctuation:]; | 106 $OP = [:LineBreak = Open_Punctuation:]; |
88 $PO = [:LineBreak = Postfix_Numeric:]; | 107 $POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0]; |
89 $PR = [:LineBreak = Prefix_Numeric:]; | 108 $PO = [[:LineBreak = Postfix_Numeric:] - $POX]; |
| 109 $PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6]; |
| 110 $PR = [[:LineBreak = Prefix_Numeric:] - $PRX]; |
90 $QU = [:LineBreak = Quotation:]; | 111 $QU = [:LineBreak = Quotation:]; |
91 $RI = [:LineBreak = Regional_Indicator:]; | 112 $RI = [:LineBreak = Regional_Indicator:]; |
92 $SA = [:LineBreak = Complex_Context:]; | 113 $SA = [:LineBreak = Complex_Context:]; |
93 $SG = [:LineBreak = Surrogate:]; | 114 $SG = [:LineBreak = Surrogate:]; |
94 $SP = [:LineBreak = Space:]; | 115 $SP = [:LineBreak = Space:]; |
95 $SY = [:LineBreak = Break_Symbols:]; | 116 $SY = [:LineBreak = Break_Symbols:]; |
96 $WJ = [:LineBreak = Word_Joiner:]; | 117 $WJ = [:LineBreak = Word_Joiner:]; |
97 $XX = [:LineBreak = Unknown:]; | 118 $XX = [:LineBreak = Unknown:]; |
98 $ZW = [:LineBreak = ZWSpace:]; | 119 $ZW = [:LineBreak = ZWSpace:]; |
99 | 120 |
(...skipping 11 matching lines...) Expand all Loading... |
111 # XX (Unknown, unassigned) | 132 # XX (Unknown, unassigned) |
112 # as $AL (Alphabetic) | 133 # as $AL (Alphabetic) |
113 # | 134 # |
114 $ALPlus = [$AL $AI $SA $SG $XX]; | 135 $ALPlus = [$AL $AI $SA $SG $XX]; |
115 | 136 |
116 # | 137 # |
117 # Combining Marks. X $CM* behaves as if it were X. Rule LB6. | 138 # Combining Marks. X $CM* behaves as if it were X. Rule LB6. |
118 # | 139 # |
119 $ALcm = $ALPlus $CM*; | 140 $ALcm = $ALPlus $CM*; |
120 $BAcm = $BA $CM*; | 141 $BAcm = $BA $CM*; |
| 142 $BAXcm = $BAX $CM*; |
121 $BBcm = $BB $CM*; | 143 $BBcm = $BB $CM*; |
122 $B2cm = $B2 $CM*; | 144 $B2cm = $B2 $CM*; |
123 $CLcm = $CL $CM*; | 145 $CLcm = $CL $CM*; |
124 $CPcm = $CP $CM*; | 146 $CPcm = $CP $CM*; |
125 $EXcm = $EX $CM*; | 147 $EXcm = $EX $CM*; |
| 148 $EXXcm = $EXX $CM*; |
126 $GLcm = $GL $CM*; | 149 $GLcm = $GL $CM*; |
127 $HLcm = $HL $CM*; | 150 $HLcm = $HL $CM*; |
128 $HYcm = $HY $CM*; | 151 $HYcm = $HY $CM*; |
129 $H2cm = $H2 $CM*; | 152 $H2cm = $H2 $CM*; |
130 $H3cm = $H3 $CM*; | 153 $H3cm = $H3 $CM*; |
131 $IDcm = $ID $CM*; | 154 $IDcm = $ID $CM*; |
132 $INcm = $IN $CM*; | 155 $INcm = $IN $CM*; |
133 $IScm = $IS $CM*; | 156 $IScm = $IS $CM*; |
134 $JLcm = $JL $CM*; | 157 $JLcm = $JL $CM*; |
135 $JVcm = $JV $CM*; | 158 $JVcm = $JV $CM*; |
136 $JTcm = $JT $CM*; | 159 $JTcm = $JT $CM*; |
137 $NScm = $NS $CM*; | 160 $NScm = $NS $CM*; |
| 161 $NSXcm = $NSX $CM*; |
138 $NUcm = $NU $CM*; | 162 $NUcm = $NU $CM*; |
139 $OPcm = $OP $CM*; | 163 $OPcm = $OP $CM*; |
140 $POcm = $PO $CM*; | 164 $POcm = $PO $CM*; |
| 165 $POXcm = $POX $CM*; |
141 $PRcm = $PR $CM*; | 166 $PRcm = $PR $CM*; |
| 167 $PRXcm = $PRX $CM*; |
142 $QUcm = $QU $CM*; | 168 $QUcm = $QU $CM*; |
143 $RIcm = $RI $CM*; | 169 $RIcm = $RI $CM*; |
144 $SYcm = $SY $CM*; | 170 $SYcm = $SY $CM*; |
145 $WJcm = $WJ $CM*; | 171 $WJcm = $WJ $CM*; |
146 | 172 |
147 ## ------------------------------------------------- | 173 ## ------------------------------------------------- |
148 | 174 |
149 !!forward; | 175 !!forward; |
150 | 176 |
151 # | 177 # |
152 # Each class of character can stand by itself as an unbroken token, with traili
ng combining stuff | 178 # Each class of character can stand by itself as an unbroken token, with traili
ng combining stuff |
153 # | 179 # |
154 $ALPlus $CM+; | 180 $ALPlus $CM+; |
155 $BA $CM+; | 181 $BA $CM+; |
| 182 $BAX $CM+; |
156 $BB $CM+; | 183 $BB $CM+; |
157 $B2 $CM+; | 184 $B2 $CM+; |
158 $CL $CM+; | 185 $CL $CM+; |
159 $CP $CM+; | 186 $CP $CM+; |
160 $EX $CM+; | 187 $EX $CM+; |
| 188 $EXX $CM+; |
161 $GL $CM+; | 189 $GL $CM+; |
162 $HL $CM+; | 190 $HL $CM+; |
163 $HY $CM+; | 191 $HY $CM+; |
164 $H2 $CM+; | 192 $H2 $CM+; |
165 $H3 $CM+; | 193 $H3 $CM+; |
166 $ID $CM+; | 194 $ID $CM+; |
167 $IN $CM+; | 195 $IN $CM+; |
168 $IS $CM+; | 196 $IS $CM+; |
169 $JL $CM+; | 197 $JL $CM+; |
170 $JV $CM+; | 198 $JV $CM+; |
171 $JT $CM+; | 199 $JT $CM+; |
172 $NS $CM+; | 200 $NS $CM+; |
| 201 $NSX $CM+; |
173 $NU $CM+; | 202 $NU $CM+; |
174 $OP $CM+; | 203 $OP $CM+; |
175 $PO $CM+; | 204 $PO $CM+; |
| 205 $POX $CM+; |
176 $PR $CM+; | 206 $PR $CM+; |
| 207 $PRX $CM+; |
177 $QU $CM+; | 208 $QU $CM+; |
178 $RI $CM+; | 209 $RI $CM+; |
179 $SY $CM+; | 210 $SY $CM+; |
180 $WJ $CM+; | 211 $WJ $CM+; |
181 | 212 |
182 # | 213 # |
183 # CAN_CM is the set of characters that may combine with CM combining chars. | 214 # CAN_CM is the set of characters that may combine with CM combining chars. |
184 # Note that Linebreak UAX 14's concept of a combining char and the rules | 215 # Note that Linebreak UAX 14's concept of a combining char and the rules |
185 # for what they can combine with are _very_ different from the rest of U
nicode. | 216 # for what they can combine with are _very_ different from the rest of U
nicode. |
186 # | 217 # |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
252 # LB 12 Do not break after NBSP and related characters. | 283 # LB 12 Do not break after NBSP and related characters. |
253 # GL x | 284 # GL x |
254 # | 285 # |
255 $GLcm $CAN_CM $CM*; | 286 $GLcm $CAN_CM $CM*; |
256 $GLcm $CANT_CM; | 287 $GLcm $CANT_CM; |
257 | 288 |
258 # | 289 # |
259 # LB 12a Do not break before NBSP and related characters ... | 290 # LB 12a Do not break before NBSP and related characters ... |
260 # [^SP BA HY] x GL | 291 # [^SP BA HY] x GL |
261 # | 292 # |
262 [[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm; | 293 [[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm; |
263 $CM+ GLcm; | 294 $CM+ $GLcm; # by rule 10, stand-alone CM behaves as AL; '$' was missing, so GLcm matched literal text |
264 | 295 |
265 | 296 |
266 | 297 |
267 # | 298 # |
268 # LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. | 299 # LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. |
269 # | 300 # |
| 301 # Do not include $EXX here |
270 $LB8NonBreaks $CL; | 302 $LB8NonBreaks $CL; |
271 $CAN_CM $CM* $CL; | 303 $CAN_CM $CM* $CL; |
272 $CM+ $CL; # by rule 10, stand-alone CM behaves as AL | 304 $CM+ $CL; # by rule 10, stand-alone CM behaves as AL |
273 | 305 |
274 $LB8NonBreaks $CP; | 306 $LB8NonBreaks $CP; |
275 $CAN_CM $CM* $CP; | 307 $CAN_CM $CM* $CP; |
276 $CM+ $CP; # by rule 10, stand-alone CM behaves as AL | 308 $CM+ $CP; # by rule 10, stand-alone CM behaves as AL |
277 | 309 |
278 $LB8NonBreaks $EX; | 310 $LB8NonBreaks $EX; |
279 $CAN_CM $CM* $EX; | 311 $CAN_CM $CM* $EX; |
(...skipping 13 matching lines...) Expand all Loading... |
293 # | 325 # |
294 $OPcm $SP* $CAN_CM $CM*; | 326 $OPcm $SP* $CAN_CM $CM*; |
295 $OPcm $SP* $CANT_CM; | 327 $OPcm $SP* $CANT_CM; |
296 | 328 |
297 $OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL | 329 $OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL |
298 | 330 |
299 # LB 15 | 331 # LB 15 |
300 $QUcm $SP* $OPcm; | 332 $QUcm $SP* $OPcm; |
301 | 333 |
302 # LB 16 | 334 # LB 16 |
| 335 # Do not break between closing punctuation and $NS, even with intervening spaces |
| 336 # But DO allow a break between closing punctuation and $NSX, don't include it here |
303 ($CLcm | $CPcm) $SP* $NScm; | 337 ($CLcm | $CPcm) $SP* $NScm; |
304 | 338 |
305 # LB 17 | 339 # LB 17 |
306 $B2cm $SP* $B2cm; | 340 $B2cm $SP* $B2cm; |
307 | 341 |
308 # | 342 # |
309 # LB 18 Break after spaces. | 343 # LB 18 Break after spaces. |
310 # | 344 # |
311 $LB18NonBreaks = [$LB8NonBreaks - [$SP]]; | 345 $LB18NonBreaks = [$LB8NonBreaks - [$SP]]; |
312 $LB18Breaks = [$LB8Breaks $SP]; | 346 $LB18Breaks = [$LB8Breaks $SP]; |
(...skipping 12 matching lines...) Expand all Loading... |
325 | 359 |
326 # LB 20 | 360 # LB 20 |
327 # <break> $CB | 361 # <break> $CB |
328 # $CB <break> | 362 # $CB <break> |
329 | 363 |
330 $LB20NonBreaks = [$LB18NonBreaks - $CB]; | 364 $LB20NonBreaks = [$LB18NonBreaks - $CB]; |
331 | 365 |
332 # LB 21 x (BA | HY | NS) | 366 # LB 21 x (BA | HY | NS) |
333 # BB x | 367 # BB x |
334 # | 368 # |
| 369 # DO allow breaks here before $BAXcm and $NSXcm, so don't include them |
335 $LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); | 370 $LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); |
336 | 371 |
337 $BBcm [^$CB]; # $BB x | 372 $BBcm [^$CB]; # $BB x |
338 $BBcm $LB20NonBreaks $CM*; | 373 $BBcm $LB20NonBreaks $CM*; |
339 | 374 |
340 # LB 21a Don't break after Hebrew + Hyphen | 375 # LB 21a Don't break after Hebrew + Hyphen |
341 # HL (HY | BA) x | 376 # HL (HY | BA) x |
342 # | 377 # |
343 $HLcm ($HYcm | $BAcm) [^$CB]?; | 378 $HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?; |
344 | 379 |
345 # LB 21b (forward) Don't break between SY and HL | 380 # LB 21b (forward) Don't break between SY and HL |
346 # (break between HL and SY already disallowed by LB 13 above) | 381 # (break between HL and SY already disallowed by LB 13 above) |
347 $SYcm $HLcm; | 382 $SYcm $HLcm; |
348 | 383 |
349 # LB 22 | 384 # LB 22 |
350 ($ALcm | $HLcm) $INcm; | 385 ($ALcm | $HLcm) $INcm; |
351 $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL | 386 $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL |
| 387 $EXcm $INcm; |
352 $IDcm $INcm; | 388 $IDcm $INcm; |
353 $INcm $INcm; | 389 # $INcm $INcm; # delete this rule for CSS loose |
354 $NUcm $INcm; | 390 $NUcm $INcm; |
355 | 391 |
356 | 392 |
357 # $LB 23 | 393 # LB 23 |
| 394 # Do not include $POX here |
358 $IDcm $POcm; | 395 $IDcm $POcm; |
359 $ALcm $NUcm; # includes $LB19 | 396 $ALcm $NUcm; # includes $LB19 |
360 $HLcm $NUcm; | 397 $HLcm $NUcm; |
361 $CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL | 398 $CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL |
362 $NUcm $ALcm; | 399 $NUcm $ALcm; |
363 $NUcm $HLcm; | 400 $NUcm $HLcm; |
364 | 401 |
365 # | 402 # |
366 # LB 24 | 403 # LB 24 |
367 # | 404 # |
| 405 # Do not include $PRX here |
368 $PRcm $IDcm; | 406 $PRcm $IDcm; |
369 $PRcm ($ALcm | $HLcm); | 407 $PRcm ($ALcm | $HLcm); |
370 $POcm ($ALcm | $HLcm); | 408 ($POcm | $POXcm) ($ALcm | $HLcm); |
371 | 409 |
372 # | 410 # |
373 # LB 25 Numbers. | 411 # LB 25 Numbers. |
374 # | 412 # |
375 ($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)
? ($PRcm | $POcm)?; | 413 # Here do not include $PRX at the beginning or $POX at the end |
| 414 ($PRcm | $POcm | $POXcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm
| $CPcm)? ($PRcm | $PRXcm | $POcm)?; |
376 | 415 |
377 # LB 26 Do not break a Korean syllable | 416 # LB 26 Do not break a Korean syllable |
378 # | 417 # |
379 $JLcm ($JLcm | $JVcm | $H2cm | $H3cm); | 418 $JLcm ($JLcm | $JVcm | $H2cm | $H3cm); |
380 ($JVcm | $H2cm) ($JVcm | $JTcm); | 419 ($JVcm | $H2cm) ($JVcm | $JTcm); |
381 ($JTcm | $H3cm) $JTcm; | 420 ($JTcm | $H3cm) $JTcm; |
382 | 421 |
383 # LB 27 Treat korean Syllable Block the same as ID (don't break it) | 422 # LB 27 Treat korean Syllable Block the same as ID (don't break it) |
| 423 # Do not include $POX or $PRX here |
384 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm; | 424 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm; |
385 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm; | 425 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm; |
386 $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); | 426 $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); |
387 | 427 |
388 | 428 |
389 # LB 28 Do not break between alphabetics | 429 # LB 28 Do not break between alphabetics |
390 # | 430 # |
391 ($ALcm | $HLcm) ($ALcm | $HLcm); | 431 ($ALcm | $HLcm) ($ALcm | $HLcm); |
392 $CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treat
ed as AL | 432 $CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treat
ed as AL |
393 | 433 |
(...skipping 10 matching lines...) Expand all Loading... |
404 | 444 |
405 # | 445 # |
406 # Reverse Rules. | 446 # Reverse Rules. |
407 # | 447 # |
408 ## ------------------------------------------------- | 448 ## ------------------------------------------------- |
409 | 449 |
410 !!reverse; | 450 !!reverse; |
411 | 451 |
412 $CM+ $ALPlus; | 452 $CM+ $ALPlus; |
413 $CM+ $BA; | 453 $CM+ $BA; |
| 454 $CM+ $BAX; |
414 $CM+ $BB; | 455 $CM+ $BB; |
415 $CM+ $B2; | 456 $CM+ $B2; |
416 $CM+ $CL; | 457 $CM+ $CL; |
417 $CM+ $CP; | 458 $CM+ $CP; |
418 $CM+ $EX; | 459 $CM+ $EX; |
| 460 $CM+ $EXX; |
419 $CM+ $GL; | 461 $CM+ $GL; |
420 $CM+ $HL; | 462 $CM+ $HL; |
421 $CM+ $HY; | 463 $CM+ $HY; |
422 $CM+ $H2; | 464 $CM+ $H2; |
423 $CM+ $H3; | 465 $CM+ $H3; |
424 $CM+ $ID; | 466 $CM+ $ID; |
425 $CM+ $IN; | 467 $CM+ $IN; |
426 $CM+ $IS; | 468 $CM+ $IS; |
427 $CM+ $JL; | 469 $CM+ $JL; |
428 $CM+ $JV; | 470 $CM+ $JV; |
429 $CM+ $JT; | 471 $CM+ $JT; |
430 $CM+ $NS; | 472 $CM+ $NS; |
| 473 $CM+ $NSX; |
431 $CM+ $NU; | 474 $CM+ $NU; |
432 $CM+ $OP; | 475 $CM+ $OP; |
433 $CM+ $PO; | 476 $CM+ $PO; |
| 477 $CM+ $POX; |
434 $CM+ $PR; | 478 $CM+ $PR; |
| 479 $CM+ $PRX; |
435 $CM+ $QU; | 480 $CM+ $QU; |
436 $CM+ $RI; | 481 $CM+ $RI; |
437 $CM+ $SY; | 482 $CM+ $SY; |
438 $CM+ $WJ; | 483 $CM+ $WJ; |
439 $CM+; | 484 $CM+; |
440 | 485 |
441 | 486 |
442 # | 487 # |
443 # Sequences of the form (shown forwards) | 488 # Sequences of the form (shown forwards) |
444 # [CANT_CM] <break> [CM] [whatever] | 489 # [CANT_CM] <break> [CM] [whatever] |
(...skipping 11 matching lines...) Expand all Loading... |
456 # a rule compiler
bug which complains about | 501 # a rule compiler
bug which complains about |
457 # empty sets other
wise. | 502 # empty sets other
wise. |
458 | 503 |
459 # | 504 # |
460 # Sequences of the form (shown forwards) | 505 # Sequences of the form (shown forwards) |
461 # [CANT_CM] <break> [CM] <break> [PR] | 506 # [CANT_CM] <break> [CM] <break> [PR] |
462 # The CM needs to behave as an AL | 507 # The CM needs to behave as an AL |
463 # This rule is concerned about getting the second of the two <breaks> in place. | 508 # This rule is concerned about getting the second of the two <breaks> in place. |
464 # | 509 # |
465 | 510 |
466 [$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}]; | 511 [$PR $PRX ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}]; |
467 | 512 |
468 | 513 |
469 | 514 |
470 # LB 4, 5, 5 | 515 # LB 4, 5, 5 |
471 | 516 |
472 $LB4Breaks [$LB4NonBreaks-$CM]; | 517 $LB4Breaks [$LB4NonBreaks-$CM]; |
473 $LB4Breaks $CM+ $CAN_CM; | 518 $LB4Breaks $CM+ $CAN_CM; |
474 $LF $CR; | 519 $LF $CR; |
475 | 520 |
476 | 521 |
(...skipping 17 matching lines...) Expand all Loading... |
494 # LB 11 | 539 # LB 11 |
495 $CM* $WJ $CM* $CAN_CM; | 540 $CM* $WJ $CM* $CAN_CM; |
496 $CM* $WJ [$LB8NonBreaks-$CM]; | 541 $CM* $WJ [$LB8NonBreaks-$CM]; |
497 | 542 |
498 $CANT_CM $CM* $WJ; | 543 $CANT_CM $CM* $WJ; |
499 $CM* $CAN_CM $CM* $WJ; | 544 $CM* $CAN_CM $CM* $WJ; |
500 | 545 |
501 # LB 12a | 546 # LB 12a |
502 # [^SP BA HY] x GL | 547 # [^SP BA HY] x GL |
503 # | 548 # |
504 $CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; | 549 $CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]]; |
505 | 550 |
506 # LB 12 | 551 # LB 12 |
507 # GL x | 552 # GL x |
508 # | 553 # |
509 $CANT_CM $CM* $GL; | 554 $CANT_CM $CM* $GL; |
510 $CM* $CAN_CM $CM* $GL; | 555 $CM* $CAN_CM $CM* $GL; |
511 | 556 |
512 | 557 |
513 # LB 13 | 558 # LB 13 |
| 559 # Do not include $EXX here |
514 $CL $CM+ $CAN_CM; | 560 $CL $CM+ $CAN_CM; |
515 $CP $CM+ $CAN_CM; | 561 $CP $CM+ $CAN_CM; |
516 $EX $CM+ $CAN_CM; | 562 $EX $CM+ $CAN_CM; |
517 $IS $CM+ $CAN_CM; | 563 $IS $CM+ $CAN_CM; |
518 $SY $CM+ $CAN_CM; | 564 $SY $CM+ $CAN_CM; |
519 | 565 |
520 $CL [$LB8NonBreaks-$CM]; | 566 $CL [$LB8NonBreaks-$CM]; |
521 $CP [$LB8NonBreaks-$CM]; | 567 $CP [$LB8NonBreaks-$CM]; |
522 $EX [$LB8NonBreaks-$CM]; | 568 $EX [$LB8NonBreaks-$CM]; |
523 $IS [$LB8NonBreaks-$CM]; | 569 $IS [$LB8NonBreaks-$CM]; |
(...skipping 15 matching lines...) Expand all Loading... |
539 $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; | 585 $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; |
540 $CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; | 586 $CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; |
541 $SY $CM $SP+ $OP; # TODO: Experiment. Remove. | 587 $SY $CM $SP+ $OP; # TODO: Experiment. Remove. |
542 | 588 |
543 | 589 |
544 | 590 |
545 # LB 15 | 591 # LB 15 |
546 $CM* $OP $SP* $CM* $QU; | 592 $CM* $OP $SP* $CM* $QU; |
547 | 593 |
548 # LB 16 | 594 # LB 16 |
| 595 # Don't include $NSX here |
549 $CM* $NS $SP* $CM* ($CL | $CP); | 596 $CM* $NS $SP* $CM* ($CL | $CP); |
550 | 597 |
551 # LB 17 | 598 # LB 17 |
552 $CM* $B2 $SP* $CM* $B2; | 599 $CM* $B2 $SP* $CM* $B2; |
553 | 600 |
554 # LB 18 break after spaces | 601 # LB 18 break after spaces |
555 # Nothing explicit needed here. | 602 # Nothing explicit needed here. |
556 | 603 |
557 | 604 |
558 # | 605 # |
559 # LB 19 | 606 # LB 19 |
560 # | 607 # |
561 $CM* $QU $CM* $CAN_CM; # . x QU | 608 $CM* $QU $CM* $CAN_CM; # . x QU |
562 $CM* $QU $LB18NonBreaks; | 609 $CM* $QU $LB18NonBreaks; |
563 | 610 |
564 | 611 |
565 $CM* $CAN_CM $CM* $QU; # QU x . | 612 $CM* $CAN_CM $CM* $QU; # QU x . |
566 $CANT_CM $CM* $QU; | 613 $CANT_CM $CM* $QU; |
567 | 614 |
568 # | 615 # |
569 # LB 20 Break before and after CB. | 616 # LB 20 Break before and after CB. |
570 # nothing needed here. | 617 # nothing needed here. |
571 # | 618 # |
572 | 619 |
573 # LB 21 | 620 # LB 21 |
| 621 # Don't include $BAX or $NSX here |
574 $CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) | 622 $CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) |
575 | 623 |
576 $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . | 624 $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . |
577 [^$CB] $CM* $BB; # | 625 [^$CB] $CM* $BB; # |
578 | 626 |
579 # LB21a | 627 # LB21a |
580 [^$CB] $CM* ($HY | $BA) $CM* $HL; | 628 [^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL; |
581 | 629 |
582 # LB21b (reverse) | 630 # LB21b (reverse) |
583 $CM* $HL $CM* $SY; | 631 $CM* $HL $CM* $SY; |
584 | 632 |
585 # LB 22 | 633 # LB 22 |
586 $CM* $IN $CM* ($ALPlus | $HL); | 634 $CM* $IN $CM* ($ALPlus | $HL); |
| 635 $CM* $IN $CM* $EX; |
587 $CM* $IN $CM* $ID; | 636 $CM* $IN $CM* $ID; |
588 $CM* $IN $CM* $IN; | 637 # $CM* $IN $CM* $IN; # delete this rule for CSS loose |
589 $CM* $IN $CM* $NU; | 638 $CM* $IN $CM* $NU; |
590 | 639 |
591 # LB 23 | 640 # LB 23 |
| 641 # Do not include $POX here |
592 $CM* $PO $CM* $ID; | 642 $CM* $PO $CM* $ID; |
593 $CM* $NU $CM* ($ALPlus | $HL); | 643 $CM* $NU $CM* ($ALPlus | $HL); |
594 $CM* ($ALPlus | $HL) $CM* $NU; | 644 $CM* ($ALPlus | $HL) $CM* $NU; |
595 | 645 |
596 # LB 24 | 646 # LB 24 |
| 647 # Do not include $PRX here |
597 $CM* $ID $CM* $PR; | 648 $CM* $ID $CM* $PR; |
598 $CM* ($ALPlus | $HL) $CM* $PR; | 649 $CM* ($ALPlus | $HL) $CM* $PR; |
599 $CM* ($ALPlus | $HL) $CM* $PO; | 650 $CM* ($ALPlus | $HL) $CM* ($PO | $POX); |
600 | 651 |
601 | 652 |
602 # LB 25 | 653 # LB 25 |
603 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM*
($OP | $HY))? ($CM* ($PR | $PO))?; | 654 # Here do not include $POX at the beginning or $PRX at the end |
| 655 ($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $N
U ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?; |
604 | 656 |
605 # LB 26 | 657 # LB 26 |
606 $CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; | 658 $CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; |
607 $CM* ($JT | $JV) $CM* ($H2 | $JV); | 659 $CM* ($JT | $JV) $CM* ($H2 | $JV); |
608 $CM* $JT $CM* ($H3 | $JT); | 660 $CM* $JT $CM* ($H3 | $JT); |
609 | 661 |
610 # LB 27 | 662 # LB 27 |
| 663 # Do not include $POX or $PRX here |
611 $CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); | 664 $CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); |
612 $CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); | 665 $CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); |
613 $CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; | 666 $CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; |
614 | 667 |
615 # LB 28 | 668 # LB 28 |
616 $CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); | 669 $CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); |
617 | 670 |
618 | 671 |
619 # LB 29 | 672 # LB 29 |
620 $CM* ($ALPlus | $HL) $CM* $IS; | 673 $CM* ($ALPlus | $HL) $CM* $IS; |
(...skipping 19 matching lines...) Expand all Loading... |
640 # LB 15 | 693 # LB 15 |
641 $SP+ $CM* $QU; | 694 $SP+ $CM* $QU; |
642 | 695 |
643 # LB 16 | 696 # LB 16 |
644 $SP+ $CM* ($CL | $CP); | 697 $SP+ $CM* ($CL | $CP); |
645 | 698 |
646 # LB 17 | 699 # LB 17 |
647 $SP+ $CM* $B2; | 700 $SP+ $CM* $B2; |
648 | 701 |
649 # LB 21 | 702 # LB 21a |
650 $CM* ($HY | $BA) $CM* $HL; | 703 $CM* ($HY | $BA | $BAX) $CM* $HL; |
651 | 704 |
652 # LB 25 | 705 # LB 25 |
653 ($CM* ($IS | $SY))+ $CM* $NU; | 706 ($CM* ($IS | $SY))+ $CM* $NU; |
654 ($CL | $CP) $CM* ($NU | $IS | $SY); | 707 ($CL | $CP) $CM* ($NU | $IS | $SY); |
655 | 708 |
656 # For dictionary-based break | 709 # For dictionary-based break |
657 $dictionary $dictionary; | 710 $dictionary $dictionary; |
658 | 711 |
659 ## ------------------------------------------------- | 712 ## ------------------------------------------------- |
660 | 713 |
661 !!safe_forward; | 714 !!safe_forward; |
662 | 715 |
663 # Skip forward over all character classes that are involved in | 716 # Skip forward over all character classes that are involved in |
664 # rules containing patterns with possibly more than one char | 717 # rules containing patterns with possibly more than one char |
665 # of context. | 718 # of context. |
666 # | 719 # |
667 # It might be slightly more efficient to have specific rules | 720 # It might be slightly more efficient to have specific rules |
668 # instead of one generic one, but only if we could | 721 # instead of one generic one, but only if we could |
669 # turn off rule chaining. We don't want to move more | 722 # turn off rule chaining. We don't want to move more |
670 # than necessary. | 723 # than necessary. |
671 # | 724 # |
672 [$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2
$PR $HY $BA $dictionary]; | 725 [$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $
CL $CP $B2 $PR $PRX $HY $BA $BAX $dictionary]; |
673 $dictionary $dictionary; | 726 $dictionary $dictionary; |
674 | 727 |
OLD | NEW |