| OLD | NEW |
| 1 # Copyright (c) 2002-2013 International Business Machines Corporation and | 1 # Copyright (c) 2002-2015 International Business Machines Corporation and |
| 2 # others. All Rights Reserved. | 2 # others. All Rights Reserved. |
| 3 # | 3 # |
| 4 # file: line_ja.txt | 4 # file: line_loose_cj.txt |
| 5 # | 5 # |
| 6 # Line Breaking Rules | 6 # Line Breaking Rules |
| 7 # Implement default line breaking as defined by | 7 # Implement default line breaking as defined by |
| 8 # Unicode Standard Annex #14 Revision 29 for Unicode 6.2 | 8 # Unicode Standard Annex #14 Revision 34 for Unicode 8.0 |
| 9 # http://www.unicode.org/reports/tr14/ | 9 # http://www.unicode.org/reports/tr14/ |
| 10 # tailored as noted in 2nd paragraph below. |
| 10 # | 11 # |
| 11 # TODO: Rule LB 8 remains as it was in Unicode 5.2 | 12 # TODO: Rule LB 8 remains as it was in Unicode 5.2 |
| 12 # This is only because of a limitation of ICU break engine implementation, | 13 # This is only because of a limitation of ICU break engine implementation, |
| 13 # not because the older behavior is desirable. | 14 # not because the older behavior is desirable. |
| 15 # |
| 16 # This tailors the line break behavior to correspond to CSS |
| 17 # line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. |
| 18 # It sets characters of class CJ to behave like ID. |
| 19 # In addition, it allows breaks: |
| 20 # * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS) |
| 21 # * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS) |
| 22 # * between characters of LineBreak class IN such as 2026 |
| 23 # * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B, |
| 24 # FF65 (all NS) and FF01, FF1F (both EX). |
| 25 # * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W; |
| 26 # this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0 |
| 27 # * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W; |
| 28 # this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6 |
| 29 |
| 14 | 30 |
| 15 # | 31 # |
| 16 # Character Classes defined by TR 14. | 32 # Character Classes defined by TR 14. |
| 17 # | 33 # |
| 18 | 34 |
| 19 !!chain; | 35 !!chain; |
| 20 !!LBCMNoChain; | 36 !!LBCMNoChain; |
| 21 | 37 |
| 22 | 38 |
| 23 !!lookAheadHardBreak; | 39 !!lookAheadHardBreak; |
| (...skipping 27 matching lines...) Expand all Loading... |
| 51 # !!lookAheadHardBreak forces the run time state machine to | 67 # !!lookAheadHardBreak forces the run time state machine to |
| 52 # stop immediately when a look ahead rule ( '/' operator) matches, | 68 # stop immediately when a look ahead rule ( '/' operator) matches, |
| 53 # and set the match position to that of the look-ahead operator, | 69 # and set the match position to that of the look-ahead operator, |
| 54 # no matter what other rules may be in play at the time. | 70 # no matter what other rules may be in play at the time. |
| 55 # | 71 # |
| 56 # See rule LB 19 for an example. | 72 # See rule LB 19 for an example. |
| 57 # | 73 # |
| 58 | 74 |
| 59 $AI = [:LineBreak = Ambiguous:]; | 75 $AI = [:LineBreak = Ambiguous:]; |
| 60 $AL = [:LineBreak = Alphabetic:]; | 76 $AL = [:LineBreak = Alphabetic:]; |
| 61 $BA = [:LineBreak = Break_After:]; | 77 $BAX = [\u2010 \u2013]; |
| 78 $BA = [[:LineBreak = Break_After:] - $BAX]; |
| 62 $BB = [:LineBreak = Break_Before:]; | 79 $BB = [:LineBreak = Break_Before:]; |
| 63 $BK = [:LineBreak = Mandatory_Break:]; | 80 $BK = [:LineBreak = Mandatory_Break:]; |
| 64 $B2 = [:LineBreak = Break_Both:]; | 81 $B2 = [:LineBreak = Break_Both:]; |
| 65 $CB = [:LineBreak = Contingent_Break:]; | 82 $CB = [:LineBreak = Contingent_Break:]; |
| 66 $CJ = [:LineBreak = Conditional_Japanese_Starter:]; | 83 $CJ = [:LineBreak = Conditional_Japanese_Starter:]; |
| 67 $CL = [:LineBreak = Close_Punctuation:]; | 84 $CL = [:LineBreak = Close_Punctuation:]; |
| 68 $CM = [:LineBreak = Combining_Mark:]; | 85 $CM = [:LineBreak = Combining_Mark:]; |
| 69 $CP = [:LineBreak = Close_Parenthesis:]; | 86 $CP = [:LineBreak = Close_Parenthesis:]; |
| 70 $CR = [:LineBreak = Carriage_Return:]; | 87 $CR = [:LineBreak = Carriage_Return:]; |
| 71 $EX = [:LineBreak = Exclamation:]; | 88 $EXX = [\uFF01 \uFF1F]; |
| 89 $EX = [[:LineBreak = Exclamation:] - $EXX]; |
| 72 $GL = [:LineBreak = Glue:]; | 90 $GL = [:LineBreak = Glue:]; |
| 73 $HL = [:LineBreak = Hebrew_Letter:]; | 91 $HL = [:LineBreak = Hebrew_Letter:]; |
| 74 $HY = [:LineBreak = Hyphen:]; | 92 $HY = [:LineBreak = Hyphen:]; |
| 75 $H2 = [:LineBreak = H2:]; | 93 $H2 = [:LineBreak = H2:]; |
| 76 $H3 = [:LineBreak = H3:]; | 94 $H3 = [:LineBreak = H3:]; |
| 77 $ID = [[:LineBreak = Ideographic:] $CJ]; | 95 $ID = [[:LineBreak = Ideographic:] $CJ]; |
| 78 $IN = [:LineBreak = Inseperable:]; | 96 $IN = [:LineBreak = Inseperable:]; |
| 79 $IS = [:LineBreak = Infix_Numeric:]; | 97 $IS = [:LineBreak = Infix_Numeric:]; |
| 80 $JL = [:LineBreak = JL:]; | 98 $JL = [:LineBreak = JL:]; |
| 81 $JV = [:LineBreak = JV:]; | 99 $JV = [:LineBreak = JV:]; |
| 82 $JT = [:LineBreak = JT:]; | 100 $JT = [:LineBreak = JT:]; |
| 83 $LF = [:LineBreak = Line_Feed:]; | 101 $LF = [:LineBreak = Line_Feed:]; |
| 84 $NL = [:LineBreak = Next_Line:]; | 102 $NL = [:LineBreak = Next_Line:]; |
| 85 $NS = [:LineBreak = Nonstarter:]; | 103 $NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u
2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65]; |
| 104 $NS = [[:LineBreak = Nonstarter:] - $NSX]; |
| 86 $NU = [:LineBreak = Numeric:]; | 105 $NU = [:LineBreak = Numeric:]; |
| 87 $OP = [:LineBreak = Open_Punctuation:]; | 106 $OP = [:LineBreak = Open_Punctuation:]; |
| 88 $PO = [:LineBreak = Postfix_Numeric:]; | 107 $POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0]; |
| 89 $PR = [:LineBreak = Prefix_Numeric:]; | 108 $PO = [[:LineBreak = Postfix_Numeric:] - $POX]; |
| 109 $PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6]; |
| 110 $PR = [[:LineBreak = Prefix_Numeric:] - $PRX]; |
| 90 $QU = [:LineBreak = Quotation:]; | 111 $QU = [:LineBreak = Quotation:]; |
| 91 $RI = [:LineBreak = Regional_Indicator:]; | 112 $RI = [:LineBreak = Regional_Indicator:]; |
| 92 $SA = [:LineBreak = Complex_Context:]; | 113 $SA = [:LineBreak = Complex_Context:]; |
| 93 $SG = [:LineBreak = Surrogate:]; | 114 $SG = [:LineBreak = Surrogate:]; |
| 94 $SP = [:LineBreak = Space:]; | 115 $SP = [:LineBreak = Space:]; |
| 95 $SY = [:LineBreak = Break_Symbols:]; | 116 $SY = [:LineBreak = Break_Symbols:]; |
| 96 $WJ = [:LineBreak = Word_Joiner:]; | 117 $WJ = [:LineBreak = Word_Joiner:]; |
| 97 $XX = [:LineBreak = Unknown:]; | 118 $XX = [:LineBreak = Unknown:]; |
| 98 $ZW = [:LineBreak = ZWSpace:]; | 119 $ZW = [:LineBreak = ZWSpace:]; |
| 99 | 120 |
| (...skipping 11 matching lines...) Expand all Loading... |
| 111 # XX (Unknown, unassigned) | 132 # XX (Unknown, unassigned) |
| 112 # as $AL (Alphabetic) | 133 # as $AL (Alphabetic) |
| 113 # | 134 # |
| 114 $ALPlus = [$AL $AI $SA $SG $XX]; | 135 $ALPlus = [$AL $AI $SA $SG $XX]; |
| 115 | 136 |
| 116 # | 137 # |
| 117 # Combining Marks. X $CM* behaves as if it were X. Rule LB6. | 138 # Combining Marks. X $CM* behaves as if it were X. Rule LB6. |
| 118 # | 139 # |
| 119 $ALcm = $ALPlus $CM*; | 140 $ALcm = $ALPlus $CM*; |
| 120 $BAcm = $BA $CM*; | 141 $BAcm = $BA $CM*; |
| 142 $BAXcm = $BAX $CM*; |
| 121 $BBcm = $BB $CM*; | 143 $BBcm = $BB $CM*; |
| 122 $B2cm = $B2 $CM*; | 144 $B2cm = $B2 $CM*; |
| 123 $CLcm = $CL $CM*; | 145 $CLcm = $CL $CM*; |
| 124 $CPcm = $CP $CM*; | 146 $CPcm = $CP $CM*; |
| 125 $EXcm = $EX $CM*; | 147 $EXcm = $EX $CM*; |
| 148 $EXXcm = $EXX $CM*; |
| 126 $GLcm = $GL $CM*; | 149 $GLcm = $GL $CM*; |
| 127 $HLcm = $HL $CM*; | 150 $HLcm = $HL $CM*; |
| 128 $HYcm = $HY $CM*; | 151 $HYcm = $HY $CM*; |
| 129 $H2cm = $H2 $CM*; | 152 $H2cm = $H2 $CM*; |
| 130 $H3cm = $H3 $CM*; | 153 $H3cm = $H3 $CM*; |
| 131 $IDcm = $ID $CM*; | 154 $IDcm = $ID $CM*; |
| 132 $INcm = $IN $CM*; | 155 $INcm = $IN $CM*; |
| 133 $IScm = $IS $CM*; | 156 $IScm = $IS $CM*; |
| 134 $JLcm = $JL $CM*; | 157 $JLcm = $JL $CM*; |
| 135 $JVcm = $JV $CM*; | 158 $JVcm = $JV $CM*; |
| 136 $JTcm = $JT $CM*; | 159 $JTcm = $JT $CM*; |
| 137 $NScm = $NS $CM*; | 160 $NScm = $NS $CM*; |
| 161 $NSXcm = $NSX $CM*; |
| 138 $NUcm = $NU $CM*; | 162 $NUcm = $NU $CM*; |
| 139 $OPcm = $OP $CM*; | 163 $OPcm = $OP $CM*; |
| 140 $POcm = $PO $CM*; | 164 $POcm = $PO $CM*; |
| 165 $POXcm = $POX $CM*; |
| 141 $PRcm = $PR $CM*; | 166 $PRcm = $PR $CM*; |
| 167 $PRXcm = $PRX $CM*; |
| 142 $QUcm = $QU $CM*; | 168 $QUcm = $QU $CM*; |
| 143 $RIcm = $RI $CM*; | 169 $RIcm = $RI $CM*; |
| 144 $SYcm = $SY $CM*; | 170 $SYcm = $SY $CM*; |
| 145 $WJcm = $WJ $CM*; | 171 $WJcm = $WJ $CM*; |
| 146 | 172 |
| 147 ## ------------------------------------------------- | 173 ## ------------------------------------------------- |
| 148 | 174 |
| 149 !!forward; | 175 !!forward; |
| 150 | 176 |
| 151 # | 177 # |
| 152 # Each class of character can stand by itself as an unbroken token, with trailing combining stuff | 178 # Each class of character can stand by itself as an unbroken token, with trailing combining stuff |
| 153 # | 179 # |
| 154 $ALPlus $CM+; | 180 $ALPlus $CM+; |
| 155 $BA $CM+; | 181 $BA $CM+; |
| 182 $BAX $CM+; |
| 156 $BB $CM+; | 183 $BB $CM+; |
| 157 $B2 $CM+; | 184 $B2 $CM+; |
| 158 $CL $CM+; | 185 $CL $CM+; |
| 159 $CP $CM+; | 186 $CP $CM+; |
| 160 $EX $CM+; | 187 $EX $CM+; |
| 188 $EXX $CM+; |
| 161 $GL $CM+; | 189 $GL $CM+; |
| 162 $HL $CM+; | 190 $HL $CM+; |
| 163 $HY $CM+; | 191 $HY $CM+; |
| 164 $H2 $CM+; | 192 $H2 $CM+; |
| 165 $H3 $CM+; | 193 $H3 $CM+; |
| 166 $ID $CM+; | 194 $ID $CM+; |
| 167 $IN $CM+; | 195 $IN $CM+; |
| 168 $IS $CM+; | 196 $IS $CM+; |
| 169 $JL $CM+; | 197 $JL $CM+; |
| 170 $JV $CM+; | 198 $JV $CM+; |
| 171 $JT $CM+; | 199 $JT $CM+; |
| 172 $NS $CM+; | 200 $NS $CM+; |
| 201 $NSX $CM+; |
| 173 $NU $CM+; | 202 $NU $CM+; |
| 174 $OP $CM+; | 203 $OP $CM+; |
| 175 $PO $CM+; | 204 $PO $CM+; |
| 205 $POX $CM+; |
| 176 $PR $CM+; | 206 $PR $CM+; |
| 207 $PRX $CM+; |
| 177 $QU $CM+; | 208 $QU $CM+; |
| 178 $RI $CM+; | 209 $RI $CM+; |
| 179 $SY $CM+; | 210 $SY $CM+; |
| 180 $WJ $CM+; | 211 $WJ $CM+; |
| 181 | 212 |
| 182 # | 213 # |
| 183 # CAN_CM is the set of characters that may combine with CM combining chars. | 214 # CAN_CM is the set of characters that may combine with CM combining chars. |
| 184 # Note that Linebreak UAX 14's concept of a combining char and the rules | 215 # Note that Linebreak UAX 14's concept of a combining char and the rules |
| 185 # for what they can combine with are _very_ different from the rest of Unicode. | 216 # for what they can combine with are _very_ different from the rest of Unicode. |
| 186 # | 217 # |
| (...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 252 # LB 12 Do not break after NBSP and related characters. | 283 # LB 12 Do not break after NBSP and related characters. |
| 253 # GL x | 284 # GL x |
| 254 # | 285 # |
| 255 $GLcm $CAN_CM $CM*; | 286 $GLcm $CAN_CM $CM*; |
| 256 $GLcm $CANT_CM; | 287 $GLcm $CANT_CM; |
| 257 | 288 |
| 258 # | 289 # |
| 259 # LB 12a Do not break before NBSP and related characters ... | 290 # LB 12a Do not break before NBSP and related characters ... |
| 260 # [^SP BA HY] x GL | 291 # [^SP BA HY] x GL |
| 261 # | 292 # |
| 262 [[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm; | 293 [[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm; |
| 263 $CM+ GLcm; | 294 $CM+ GLcm; |
| 264 | 295 |
| 265 | 296 |
| 266 | 297 |
| 267 # | 298 # |
| 268 # LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. | 299 # LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. |
| 269 # | 300 # |
| 301 # Do not include $EXX here |
| 270 $LB8NonBreaks $CL; | 302 $LB8NonBreaks $CL; |
| 271 $CAN_CM $CM* $CL; | 303 $CAN_CM $CM* $CL; |
| 272 $CM+ $CL; # by rule 10, stand-alone CM behaves as AL | 304 $CM+ $CL; # by rule 10, stand-alone CM behaves as AL |
| 273 | 305 |
| 274 $LB8NonBreaks $CP; | 306 $LB8NonBreaks $CP; |
| 275 $CAN_CM $CM* $CP; | 307 $CAN_CM $CM* $CP; |
| 276 $CM+ $CP; # by rule 10, stand-alone CM behaves as AL | 308 $CM+ $CP; # by rule 10, stand-alone CM behaves as AL |
| 277 | 309 |
| 278 $LB8NonBreaks $EX; | 310 $LB8NonBreaks $EX; |
| 279 $CAN_CM $CM* $EX; | 311 $CAN_CM $CM* $EX; |
| (...skipping 13 matching lines...) Expand all Loading... |
| 293 # | 325 # |
| 294 $OPcm $SP* $CAN_CM $CM*; | 326 $OPcm $SP* $CAN_CM $CM*; |
| 295 $OPcm $SP* $CANT_CM; | 327 $OPcm $SP* $CANT_CM; |
| 296 | 328 |
| 297 $OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL | 329 $OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL |
| 298 | 330 |
| 299 # LB 15 | 331 # LB 15 |
| 300 $QUcm $SP* $OPcm; | 332 $QUcm $SP* $OPcm; |
| 301 | 333 |
| 302 # LB 16 | 334 # LB 16 |
| 335 # Do not break between closing punctuation and $NS, even with intervening spaces |
| 336 # But DO allow a break between closing punctuation and $NSX, don't include it here |
| 303 ($CLcm | $CPcm) $SP* $NScm; | 337 ($CLcm | $CPcm) $SP* $NScm; |
| 304 | 338 |
| 305 # LB 17 | 339 # LB 17 |
| 306 $B2cm $SP* $B2cm; | 340 $B2cm $SP* $B2cm; |
| 307 | 341 |
| 308 # | 342 # |
| 309 # LB 18 Break after spaces. | 343 # LB 18 Break after spaces. |
| 310 # | 344 # |
| 311 $LB18NonBreaks = [$LB8NonBreaks - [$SP]]; | 345 $LB18NonBreaks = [$LB8NonBreaks - [$SP]]; |
| 312 $LB18Breaks = [$LB8Breaks $SP]; | 346 $LB18Breaks = [$LB8Breaks $SP]; |
| (...skipping 12 matching lines...) Expand all Loading... |
| 325 | 359 |
| 326 # LB 20 | 360 # LB 20 |
| 327 # <break> $CB | 361 # <break> $CB |
| 328 # $CB <break> | 362 # $CB <break> |
| 329 | 363 |
| 330 $LB20NonBreaks = [$LB18NonBreaks - $CB]; | 364 $LB20NonBreaks = [$LB18NonBreaks - $CB]; |
| 331 | 365 |
| 332 # LB 21 x (BA | HY | NS) | 366 # LB 21 x (BA | HY | NS) |
| 333 # BB x | 367 # BB x |
| 334 # | 368 # |
| 369 # DO allow breaks here before $BAXcm and $NSXcm, so don't include them |
| 335 $LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); | 370 $LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); |
| 336 | 371 |
| 337 $BBcm [^$CB]; # $BB x | 372 $BBcm [^$CB]; # $BB x |
| 338 $BBcm $LB20NonBreaks $CM*; | 373 $BBcm $LB20NonBreaks $CM*; |
| 339 | 374 |
| 340 # LB 21a Don't break after Hebrew + Hyphen | 375 # LB 21a Don't break after Hebrew + Hyphen |
| 341 # HL (HY | BA) x | 376 # HL (HY | BA) x |
| 342 # | 377 # |
| 343 $HLcm ($HYcm | $BAcm) [^$CB]?; | 378 $HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?; |
| 344 | 379 |
| 345 # LB 21b (forward) Don't break between SY and HL | 380 # LB 21b (forward) Don't break between SY and HL |
| 346 # (break between HL and SY already disallowed by LB 13 above) | 381 # (break between HL and SY already disallowed by LB 13 above) |
| 347 $SYcm $HLcm; | 382 $SYcm $HLcm; |
| 348 | 383 |
| 349 # LB 22 | 384 # LB 22 |
| 350 ($ALcm | $HLcm) $INcm; | 385 ($ALcm | $HLcm) $INcm; |
| 351 $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL | 386 $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL |
| 387 $EXcm $INcm; |
| 352 $IDcm $INcm; | 388 $IDcm $INcm; |
| 353 $INcm $INcm; | 389 # $INcm $INcm; # delete this rule for CSS loose |
| 354 $NUcm $INcm; | 390 $NUcm $INcm; |
| 355 | 391 |
| 356 | 392 |
| 357 # $LB 23 | 393 # LB 23 |
| 394 # Do not include $POX here |
| 358 $IDcm $POcm; | 395 $IDcm $POcm; |
| 359 $ALcm $NUcm; # includes $LB19 | 396 $ALcm $NUcm; # includes $LB19 |
| 360 $HLcm $NUcm; | 397 $HLcm $NUcm; |
| 361 $CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL | 398 $CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL |
| 362 $NUcm $ALcm; | 399 $NUcm $ALcm; |
| 363 $NUcm $HLcm; | 400 $NUcm $HLcm; |
| 364 | 401 |
| 365 # | 402 # |
| 366 # LB 24 | 403 # LB 24 |
| 367 # | 404 # |
| 405 # Do not include $PRX here |
| 368 $PRcm $IDcm; | 406 $PRcm $IDcm; |
| 369 $PRcm ($ALcm | $HLcm); | 407 $PRcm ($ALcm | $HLcm); |
| 370 $POcm ($ALcm | $HLcm); | 408 ($POcm | $POXcm) ($ALcm | $HLcm); |
| 371 | 409 |
| 372 # | 410 # |
| 373 # LB 25 Numbers. | 411 # LB 25 Numbers. |
| 374 # | 412 # |
| 375 ($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)
? ($PRcm | $POcm)?; | 413 # Here do not include $PRX at the beginning or $POX at the end |
| 414 ($PRcm | $POcm | $POXcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm
| $CPcm)? ($PRcm | $PRXcm | $POcm)?; |
| 376 | 415 |
| 377 # LB 26 Do not break a Korean syllable | 416 # LB 26 Do not break a Korean syllable |
| 378 # | 417 # |
| 379 $JLcm ($JLcm | $JVcm | $H2cm | $H3cm); | 418 $JLcm ($JLcm | $JVcm | $H2cm | $H3cm); |
| 380 ($JVcm | $H2cm) ($JVcm | $JTcm); | 419 ($JVcm | $H2cm) ($JVcm | $JTcm); |
| 381 ($JTcm | $H3cm) $JTcm; | 420 ($JTcm | $H3cm) $JTcm; |
| 382 | 421 |
| 383 # LB 27 Treat Korean Syllable Block the same as ID (don't break it) | 422 # LB 27 Treat Korean Syllable Block the same as ID (don't break it) |
| 423 # Do not include $POX or $PRX here |
| 384 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm; | 424 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm; |
| 385 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm; | 425 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm; |
| 386 $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); | 426 $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); |
| 387 | 427 |
| 388 | 428 |
| 389 # LB 28 Do not break between alphabetics | 429 # LB 28 Do not break between alphabetics |
| 390 # | 430 # |
| 391 ($ALcm | $HLcm) ($ALcm | $HLcm); | 431 ($ALcm | $HLcm) ($ALcm | $HLcm); |
| 392 $CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treat
ed as AL | 432 $CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treat
ed as AL |
| 393 | 433 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 404 | 444 |
| 405 # | 445 # |
| 406 # Reverse Rules. | 446 # Reverse Rules. |
| 407 # | 447 # |
| 408 ## ------------------------------------------------- | 448 ## ------------------------------------------------- |
| 409 | 449 |
| 410 !!reverse; | 450 !!reverse; |
| 411 | 451 |
| 412 $CM+ $ALPlus; | 452 $CM+ $ALPlus; |
| 413 $CM+ $BA; | 453 $CM+ $BA; |
| 454 $CM+ $BAX; |
| 414 $CM+ $BB; | 455 $CM+ $BB; |
| 415 $CM+ $B2; | 456 $CM+ $B2; |
| 416 $CM+ $CL; | 457 $CM+ $CL; |
| 417 $CM+ $CP; | 458 $CM+ $CP; |
| 418 $CM+ $EX; | 459 $CM+ $EX; |
| 460 $CM+ $EXX; |
| 419 $CM+ $GL; | 461 $CM+ $GL; |
| 420 $CM+ $HL; | 462 $CM+ $HL; |
| 421 $CM+ $HY; | 463 $CM+ $HY; |
| 422 $CM+ $H2; | 464 $CM+ $H2; |
| 423 $CM+ $H3; | 465 $CM+ $H3; |
| 424 $CM+ $ID; | 466 $CM+ $ID; |
| 425 $CM+ $IN; | 467 $CM+ $IN; |
| 426 $CM+ $IS; | 468 $CM+ $IS; |
| 427 $CM+ $JL; | 469 $CM+ $JL; |
| 428 $CM+ $JV; | 470 $CM+ $JV; |
| 429 $CM+ $JT; | 471 $CM+ $JT; |
| 430 $CM+ $NS; | 472 $CM+ $NS; |
| 473 $CM+ $NSX; |
| 431 $CM+ $NU; | 474 $CM+ $NU; |
| 432 $CM+ $OP; | 475 $CM+ $OP; |
| 433 $CM+ $PO; | 476 $CM+ $PO; |
| 477 $CM+ $POX; |
| 434 $CM+ $PR; | 478 $CM+ $PR; |
| 479 $CM+ $PRX; |
| 435 $CM+ $QU; | 480 $CM+ $QU; |
| 436 $CM+ $RI; | 481 $CM+ $RI; |
| 437 $CM+ $SY; | 482 $CM+ $SY; |
| 438 $CM+ $WJ; | 483 $CM+ $WJ; |
| 439 $CM+; | 484 $CM+; |
| 440 | 485 |
| 441 | 486 |
| 442 # | 487 # |
| 443 # Sequences of the form (shown forwards) | 488 # Sequences of the form (shown forwards) |
| 444 # [CANT_CM] <break> [CM] [whatever] | 489 # [CANT_CM] <break> [CM] [whatever] |
| (...skipping 11 matching lines...) Expand all Loading... |
| 456 # a rule compiler
bug which complains about | 501 # a rule compiler
bug which complains about |
| 457 # empty sets other
wise. | 502 # empty sets other
wise. |
| 458 | 503 |
| 459 # | 504 # |
| 460 # Sequences of the form (shown forwards) | 505 # Sequences of the form (shown forwards) |
| 461 # [CANT_CM] <break> [CM] <break> [PR] | 506 # [CANT_CM] <break> [CM] <break> [PR] |
| 462 # The CM needs to behave as an AL | 507 # The CM needs to behave as an AL |
| 463 # This rule is concerned about getting the second of the two <breaks> in place. | 508 # This rule is concerned about getting the second of the two <breaks> in place. |
| 464 # | 509 # |
| 465 | 510 |
| 466 [$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}]; | 511 [$PR $PRX ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}]; |
| 467 | 512 |
| 468 | 513 |
| 469 | 514 |
| 470 # LB 4, 5, 5 | 515 # LB 4, 5, 5 |
| 471 | 516 |
| 472 $LB4Breaks [$LB4NonBreaks-$CM]; | 517 $LB4Breaks [$LB4NonBreaks-$CM]; |
| 473 $LB4Breaks $CM+ $CAN_CM; | 518 $LB4Breaks $CM+ $CAN_CM; |
| 474 $LF $CR; | 519 $LF $CR; |
| 475 | 520 |
| 476 | 521 |
| (...skipping 17 matching lines...) Expand all Loading... |
| 494 # LB 11 | 539 # LB 11 |
| 495 $CM* $WJ $CM* $CAN_CM; | 540 $CM* $WJ $CM* $CAN_CM; |
| 496 $CM* $WJ [$LB8NonBreaks-$CM]; | 541 $CM* $WJ [$LB8NonBreaks-$CM]; |
| 497 | 542 |
| 498 $CANT_CM $CM* $WJ; | 543 $CANT_CM $CM* $WJ; |
| 499 $CM* $CAN_CM $CM* $WJ; | 544 $CM* $CAN_CM $CM* $WJ; |
| 500 | 545 |
| 501 # LB 12a | 546 # LB 12a |
| 502 # [^SP BA HY] x GL | 547 # [^SP BA HY] x GL |
| 503 # | 548 # |
| 504 $CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; | 549 $CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]]; |
| 505 | 550 |
| 506 # LB 12 | 551 # LB 12 |
| 507 # GL x | 552 # GL x |
| 508 # | 553 # |
| 509 $CANT_CM $CM* $GL; | 554 $CANT_CM $CM* $GL; |
| 510 $CM* $CAN_CM $CM* $GL; | 555 $CM* $CAN_CM $CM* $GL; |
| 511 | 556 |
| 512 | 557 |
| 513 # LB 13 | 558 # LB 13 |
| 559 # Do not include $EXX here |
| 514 $CL $CM+ $CAN_CM; | 560 $CL $CM+ $CAN_CM; |
| 515 $CP $CM+ $CAN_CM; | 561 $CP $CM+ $CAN_CM; |
| 516 $EX $CM+ $CAN_CM; | 562 $EX $CM+ $CAN_CM; |
| 517 $IS $CM+ $CAN_CM; | 563 $IS $CM+ $CAN_CM; |
| 518 $SY $CM+ $CAN_CM; | 564 $SY $CM+ $CAN_CM; |
| 519 | 565 |
| 520 $CL [$LB8NonBreaks-$CM]; | 566 $CL [$LB8NonBreaks-$CM]; |
| 521 $CP [$LB8NonBreaks-$CM]; | 567 $CP [$LB8NonBreaks-$CM]; |
| 522 $EX [$LB8NonBreaks-$CM]; | 568 $EX [$LB8NonBreaks-$CM]; |
| 523 $IS [$LB8NonBreaks-$CM]; | 569 $IS [$LB8NonBreaks-$CM]; |
| (...skipping 15 matching lines...) Expand all Loading... |
| 539 $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; | 585 $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; |
| 540 $CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; | 586 $CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; |
| 541 $SY $CM $SP+ $OP; # TODO: Experiment. Remove. | 587 $SY $CM $SP+ $OP; # TODO: Experiment. Remove. |
| 542 | 588 |
| 543 | 589 |
| 544 | 590 |
| 545 # LB 15 | 591 # LB 15 |
| 546 $CM* $OP $SP* $CM* $QU; | 592 $CM* $OP $SP* $CM* $QU; |
| 547 | 593 |
| 548 # LB 16 | 594 # LB 16 |
| 595 # Don't include $NSX here |
| 549 $CM* $NS $SP* $CM* ($CL | $CP); | 596 $CM* $NS $SP* $CM* ($CL | $CP); |
| 550 | 597 |
| 551 # LB 17 | 598 # LB 17 |
| 552 $CM* $B2 $SP* $CM* $B2; | 599 $CM* $B2 $SP* $CM* $B2; |
| 553 | 600 |
| 554 # LB 18 break after spaces | 601 # LB 18 break after spaces |
| 555 # Nothing explicit needed here. | 602 # Nothing explicit needed here. |
| 556 | 603 |
| 557 | 604 |
| 558 # | 605 # |
| 559 # LB 19 | 606 # LB 19 |
| 560 # | 607 # |
| 561 $CM* $QU $CM* $CAN_CM; # . x QU | 608 $CM* $QU $CM* $CAN_CM; # . x QU |
| 562 $CM* $QU $LB18NonBreaks; | 609 $CM* $QU $LB18NonBreaks; |
| 563 | 610 |
| 564 | 611 |
| 565 $CM* $CAN_CM $CM* $QU; # QU x . | 612 $CM* $CAN_CM $CM* $QU; # QU x . |
| 566 $CANT_CM $CM* $QU; | 613 $CANT_CM $CM* $QU; |
| 567 | 614 |
| 568 # | 615 # |
| 569 # LB 20 Break before and after CB. | 616 # LB 20 Break before and after CB. |
| 570 # nothing needed here. | 617 # nothing needed here. |
| 571 # | 618 # |
| 572 | 619 |
| 573 # LB 21 | 620 # LB 21 |
| 621 # Don't include $BAX or $NSX here |
| 574 $CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) | 622 $CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) |
| 575 | 623 |
| 576 $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . | 624 $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . |
| 577 [^$CB] $CM* $BB; # | 625 [^$CB] $CM* $BB; # |
| 578 | 626 |
| 579 # LB21a | 627 # LB21a |
| 580 [^$CB] $CM* ($HY | $BA) $CM* $HL; | 628 [^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL; |
| 581 | 629 |
| 582 # LB21b (reverse) | 630 # LB21b (reverse) |
| 583 $CM* $HL $CM* $SY; | 631 $CM* $HL $CM* $SY; |
| 584 | 632 |
| 585 # LB 22 | 633 # LB 22 |
| 586 $CM* $IN $CM* ($ALPlus | $HL); | 634 $CM* $IN $CM* ($ALPlus | $HL); |
| 635 $CM* $IN $CM* $EX; |
| 587 $CM* $IN $CM* $ID; | 636 $CM* $IN $CM* $ID; |
| 588 $CM* $IN $CM* $IN; | 637 # $CM* $IN $CM* $IN; # delete this rule for CSS loose |
| 589 $CM* $IN $CM* $NU; | 638 $CM* $IN $CM* $NU; |
| 590 | 639 |
| 591 # LB 23 | 640 # LB 23 |
| 641 # Do not include $POX here |
| 592 $CM* $PO $CM* $ID; | 642 $CM* $PO $CM* $ID; |
| 593 $CM* $NU $CM* ($ALPlus | $HL); | 643 $CM* $NU $CM* ($ALPlus | $HL); |
| 594 $CM* ($ALPlus | $HL) $CM* $NU; | 644 $CM* ($ALPlus | $HL) $CM* $NU; |
| 595 | 645 |
| 596 # LB 24 | 646 # LB 24 |
| 647 # Do not include $PRX here |
| 597 $CM* $ID $CM* $PR; | 648 $CM* $ID $CM* $PR; |
| 598 $CM* ($ALPlus | $HL) $CM* $PR; | 649 $CM* ($ALPlus | $HL) $CM* $PR; |
| 599 $CM* ($ALPlus | $HL) $CM* $PO; | 650 $CM* ($ALPlus | $HL) $CM* ($PO | $POX); |
| 600 | 651 |
| 601 | 652 |
| 602 # LB 25 | 653 # LB 25 |
| 603 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM*
($OP | $HY))? ($CM* ($PR | $PO))?; | 654 # Here do not include $POX at the beginning or $PRX at the end |
| 655 ($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $N
U ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?; |
| 604 | 656 |
| 605 # LB 26 | 657 # LB 26 |
| 606 $CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; | 658 $CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; |
| 607 $CM* ($JT | $JV) $CM* ($H2 | $JV); | 659 $CM* ($JT | $JV) $CM* ($H2 | $JV); |
| 608 $CM* $JT $CM* ($H3 | $JT); | 660 $CM* $JT $CM* ($H3 | $JT); |
| 609 | 661 |
| 610 # LB 27 | 662 # LB 27 |
| 663 # Do not include $POX or $PRX here |
| 611 $CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); | 664 $CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); |
| 612 $CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); | 665 $CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); |
| 613 $CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; | 666 $CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; |
| 614 | 667 |
| 615 # LB 28 | 668 # LB 28 |
| 616 $CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); | 669 $CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); |
| 617 | 670 |
| 618 | 671 |
| 619 # LB 29 | 672 # LB 29 |
| 620 $CM* ($ALPlus | $HL) $CM* $IS; | 673 $CM* ($ALPlus | $HL) $CM* $IS; |
| (...skipping 19 matching lines...) Expand all Loading... |
| 640 # LB 15 | 693 # LB 15 |
| 641 $SP+ $CM* $QU; | 694 $SP+ $CM* $QU; |
| 642 | 695 |
| 643 # LB 16 | 696 # LB 16 |
| 644 $SP+ $CM* ($CL | $CP); | 697 $SP+ $CM* ($CL | $CP); |
| 645 | 698 |
| 646 # LB 17 | 699 # LB 17 |
| 647 $SP+ $CM* $B2; | 700 $SP+ $CM* $B2; |
| 648 | 701 |
| 649 # LB 21 | 702 # LB 21 |
| 650 $CM* ($HY | $BA) $CM* $HL; | 703 $CM* ($HY | $BA | $BAX) $CM* $HL; |
| 651 | 704 |
| 652 # LB 25 | 705 # LB 25 |
| 653 ($CM* ($IS | $SY))+ $CM* $NU; | 706 ($CM* ($IS | $SY))+ $CM* $NU; |
| 654 ($CL | $CP) $CM* ($NU | $IS | $SY); | 707 ($CL | $CP) $CM* ($NU | $IS | $SY); |
| 655 | 708 |
| 656 # For dictionary-based break | 709 # For dictionary-based break |
| 657 $dictionary $dictionary; | 710 $dictionary $dictionary; |
| 658 | 711 |
| 659 ## ------------------------------------------------- | 712 ## ------------------------------------------------- |
| 660 | 713 |
| 661 !!safe_forward; | 714 !!safe_forward; |
| 662 | 715 |
| 663 # Skip forward over all character classes that are involved in | 716 # Skip forward over all character classes that are involved in |
| 664 # rules containing patterns with possibly more than one char | 717 # rules containing patterns with possibly more than one char |
| 665 # of context. | 718 # of context. |
| 666 # | 719 # |
| 667 # It might be slightly more efficient to have specific rules | 720 # It might be slightly more efficient to have specific rules |
| 668 # instead of one generic one, but only if we could | 721 # instead of one generic one, but only if we could |
| 669 # turn off rule chaining. We don't want to move more | 722 # turn off rule chaining. We don't want to move more |
| 670 # than necessary. | 723 # than necessary. |
| 671 # | 724 # |
| 672 [$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2
$PR $HY $BA $dictionary]; | 725 [$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $dictionary]+ [^$CM $OP $QU $
CL $CP $B2 $PR $PRX $HY $BA $BAX $dictionary]; |
| 673 $dictionary $dictionary; | 726 $dictionary $dictionary; |
| 674 | 727 |
| OLD | NEW |