#!/usr/bin/perl
#  ********************************************************************
#  * COPYRIGHT:
#  * Copyright (c) 2002-2007, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************
#
#  regexcst.pl
#      Compile the regular expression parser state table data into
#      initialized C data.
#
#      Usage:
#          cd icu/source/i18n
#          perl regexcst.pl < regexcst.txt > regexcst.h
#
#      The output file, regexcst.h, is included by some of the .cpp regex
#      implementation files.  This perl script is NOT run as part
#      of a normal ICU build.  It is run by hand when needed, and the
#      regexcst.h generated file is put back into cvs.
#
#      See regexcst.txt for a description of the input format for this script.
#
#      This script is derived from rbbicst.pl, which performs the same function
#      for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
#      merged?
#
#      All diagnostics are written to STDERR, because STDOUT is the generated
#      header.  Nothing is written to STDOUT unless the whole input is valid.
#

use strict;
use warnings;

my $num_states = 1;    # Always the state number for the line being compiled.
my $line_num   = 0;    # The line number in the input file.
my $errors     = 0;    # Number of non-fatal input errors seen so far.

my %states = ("pop" => 255);    # Add "pop" to the list of defined state names.
                                # This prevents any state from being labelled
                                # with "pop", and resolves references to "pop"
                                # in the next state field.

# Per-row data, all indexed by state (row) number.
my @state_names;            # label of the state that starts at this row, if any
my @state_line_num;         # input line number, for later error messages
my @state_literal_chars;    # the literal character matched by this row, if any
my @state_char_class;       # the character class name matched by this row, if any
my @state_flag;             # "TRUE" / "FALSE": the 'n' (next char) flag
my @state_dest_state;       # name of the destination state
my @state_push_state;       # name of the state to push, if any
my @state_func_name;        # name of the action to perform

my %char_classes;           # character class name => table index
my %actions;                # set of action names that appeared

# Character classes that are handled by special case code in the state machine
# and therefore get fixed table values and no kRuleSet_ constant.
my %special_class_index = ("default" => 255, "quoted" => 254, "eof" => 253);

#
#  fatal(@messages)
#     Report an unrecoverable input error on STDERR, one line per message,
#     and terminate with the same exit status the script has always used.
#
sub fatal {
    my @messages = @_;
    for my $message (@messages) {
        print STDERR " regexcst.pl: $message\n";
    }
    exit(-1);
}

LINE: while (my $line = <>) {
    chomp $line;
    $line_num++;
    my @fields = split ' ', $line;

    # Remove # comments, which are any fields beginning with a #, plus all
    #  that follow on the line.
    for my $i (0 .. $#fields) {
        if ($fields[$i] =~ /^#/) {
            splice @fields, $i;
            last;
        }
    }

    # Ignore blank lines, and those with no fields left after stripping comments.
    next LINE unless @fields;

    #
    # State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name of a state.
    #    Put in a hash, together with the current state number,
    #    so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        my $state_name = shift @fields;
        $state_name =~ s/:$//;    # strip off the trailing colon from the state name.

        if (exists $states{$state_name}) {
            print STDERR " regexcst.pl: at line $line_num duplicate definition of state $state_name\n";
            $errors++;
        }
        $states{$state_name}      = $num_states;
        $state_names[$num_states] = $state_name;

        # If the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and
        # fall through to handle the rest of the line.
        next LINE unless @fields;
    }

    #
    # State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g. '['
    #                    or a name of a character class, e.g. white_space
    #

    # The next-state field of a table row is a uint8_t in which 255 means "pop",
    # so a real state number must stay below that.
    if ($num_states >= 255) {
        fatal("at line $line_num, too many states; state numbers must be below 255.");
    }

    $state_line_num[$num_states] = $line_num;    # remember line number with each state
                                                 #  so we can make better error messages later.
    #
    # First field, character class or literal character for this transition.
    #
    my $char_field = shift @fields;
    if ($char_field =~ /^'.'$/) {
        # We've got a quoted literal character.  Take the middle character
        # rather than deleting quotes, so that ''' (a quoted quote) works.
        my $literal = substr($char_field, 1, 1);
        if (ord($literal) > 127) {
            # Table values 128-255 are character class indexes.
            fatal("at line $line_num, literal character is not ASCII.",
                  "    scanning $char_field");
        }
        $state_literal_chars[$num_states] = $literal;
    }
    else {
        # We've got the name of a character class.
        if ($char_field =~ /\W/) {
            fatal("at line $line_num, bad character literal or character class name.",
                  "    scanning $char_field");
        }
        $state_char_class[$num_states] = $char_field;
    }

    #
    # do the 'n' flag
    #
    $state_flag[$num_states] = "FALSE";
    if (@fields && $fields[0] eq "n") {
        $state_flag[$num_states] = "TRUE";
        shift @fields;
    }

    #
    # do the destination state.
    #
    if (!@fields) {
        fatal("at line $line_num, destination state missing.");
    }
    $state_dest_state[$num_states] = shift @fields;

    #
    # do the push state, if present.
    #
    if (@fields && $fields[0] =~ /^\^/) {
        my $push_state = substr(shift @fields, 1);
        if ($push_state eq "") {
            fatal("at line $line_num, expected state after ^ (no spaces).");
        }
        $state_push_state[$num_states] = $push_state;
    }

    #
    # Lastly, do the optional action name.
    #
    if (@fields) {
        $state_func_name[$num_states] = shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #
    if (@fields) {
        print STDERR " regexcst.pl: at line $line_num, unexpected extra stuff on input line.\n";
        print STDERR "     scanning $fields[0]\n";
        $errors++;
    }
    $num_states++;
}

#
# We've read in the whole file, now go back and output the
#  C source code for the state transition table.
#
# We read all states first, before writing anything, so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
#   for the names of the actions that appeared.
#
for my $state (1 .. $num_states - 1) {
    if (defined $state_char_class[$state]) {
        $char_classes{ $state_char_class[$state] } = 1;
    }
    if (!defined $state_func_name[$state]) {
        $state_func_name[$state] = "doNOP";
    }
    $actions{ $state_func_name[$state] } = 1;
}

#
# Check that all of the destination states have been defined
#
$states{"exit"} = 0;    # Predefined state name, terminates state machine.
for my $state (1 .. $num_states - 1) {
    my $dest_state = $state_dest_state[$state];
    if (!exists $states{$dest_state}) {
        print STDERR "Error at line $state_line_num[$state]: target state \"$dest_state\" is not defined.\n";
        $errors++;
    }

    # A push state value of 0 means "no push" in the table, so "exit" (0) is
    # rejected here along with names that were never defined.
    my $push_state = $state_push_state[$state];
    if (defined $push_state && !$states{$push_state}) {
        print STDERR "Error at line $state_line_num[$state]: push state \"$push_state\" is not defined.\n";
        $errors++;
    }
}

#
# Assign the table index for each character class.
#   State Table values for Unicode char sets start at 128.
#   Sets "default", "quoted", etc. get special handling.  They have no
#   corresponding UnicodeSet object in the state machine, but are handled by
#   special case code, so no kRuleSet_ constant is emitted for them.
#
#   Names are visited in sorted order so that the generated header is the
#   same on every run; plain hash order is not stable.
#
my @set_names;    # ordinary class names, in the order their constants are emitted
my $next_set_index = 128;
for my $set_name (sort keys %char_classes) {
    if (exists $special_class_index{$set_name}) {
        $char_classes{$set_name} = $special_class_index{$set_name};
        next;
    }
    if ($next_set_index >= 253) {
        print STDERR "Error: too many character classes; indexes 253-255 are reserved.\n";
        $errors++;
        last;
    }
    $char_classes{$set_name} = $next_set_index++;
    push @set_names, $set_name;
}

die "regexcst.pl: $errors error(s) in input; no output generated.\n" if $errors > 0;

# NOTE(review): the include guard RBBIRPT_H and the enumerator rbbiLastAction
#   below look like leftovers from rbbicst.pl.  They are kept unchanged because
#   they are part of the generated header that other files may depend on.
print "//---------------------------------------------------------------------------------\n";
print "//\n";
print "// Generated Header File. Do not edit by hand.\n";
print "// This file contains the state table for the ICU Regular Expression Pattern Parser\n";
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
print "// the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
print "// Copyright (C) 2002-2007 International Business Machines Corporation\n";
print "// and others. All rights reserved. \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";
print "#ifndef RBBIRPT_H\n";
print "#define RBBIRPT_H\n";
print "\n";
print "U_NAMESPACE_BEGIN\n";

#
# Emit the constants for indices of Unicode Sets
#   Define one constant for each of the ordinary character classes encountered.
#
print "//\n";
print "// Character classes for regex pattern scanning.\n";
print "//\n";
for my $set_name (@set_names) {
    print " static const uint8_t kRuleSet_$set_name = $char_classes{$set_name};\n";
}
print "\n\n";

#
# Emit the enum for the actions to be performed.
#   Sorted, for the same reproducibility reason as the character classes.
#
print "enum Regex_PatternParseAction {\n";
for my $action (sort keys %actions) {
    print " $action,\n";
}
print " rbbiLastAction};\n\n";

#
# Emit the struct definition for transition table elements.
#
print "//-------------------------------------------------------------------------------\n";
print "//\n";
print "// RegexTableEl represents the structure of a row in the transition table\n";
print "// for the pattern parser state machine.\n";
print "//-------------------------------------------------------------------------------\n";
print "struct RegexTableEl {\n";
print " Regex_PatternParseAction fAction;\n";
print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
print " // 128-255: character class index\n";
print " uint8_t fNextState; // 0-250: normal next-state numbers\n";
print " // 255: pop next-state from stack.\n";
print " uint8_t fPushState;\n";
print " UBool fNextChar;\n";
print "};\n\n";

#
# emit the state transition table
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print " {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
for my $state (1 .. $num_states - 1) {
    print " , {$state_func_name[$state],";
    if (defined $state_literal_chars[$state]) {
        my $literal = $state_literal_chars[$state];
        # Use the numeric value, so EBCDIC machines are ok.  The character is
        # passed as an argument, never spliced into the format, so that a
        # literal '%' cannot be taken as a conversion.
        printf(" %d /* %s */,", ord($literal), $literal);
    }
    else {
        print " $char_classes{$state_char_class[$state]},";
    }
    print " $states{$state_dest_state[$state]},";

    # The push-state field is optional.  If omitted, fill field with a zero,
    #  which flags the state machine that there is no push state.
    if (!defined $state_push_state[$state]) {
        print "0, ";
    }
    else {
        print " $states{$state_push_state[$state]},";
    }
    print " $state_flag[$state]} ";

    # Put out a C++ comment showing the number (index) of this state row,
    #   and, if this is the first row of the table for this state, the state name.
    print " // $state ";
    if (defined $state_names[$state] && $state_names[$state] ne "") {
        print " $state_names[$state]";
    }
    print "\n";
}
print " };\n";


#
# emit a mapping array from state numbers to state names.
#
#    This array is used for producing debugging output from the pattern parser.
#
print "static const char * const RegexStateNames[] = {";
for my $state (0 .. $num_states - 1) {
    if (defined $state_names[$state] && $state_names[$state] ne "") {
        print " \"$state_names[$state]\",\n";
    }
    else {
        print " 0,\n";
    }
}
print " 0};\n\n";

print "U_NAMESPACE_END\n";
print "#endif\n";