icu46/source/i18n/regexcst.pl - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/regexcst.pl

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:executable
+ *
Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 #!/usr/bin/perl

	2 # ********************************************************************

	3 # * COPYRIGHT:

	4 # * Copyright (c) 2002-2007, International Business Machines Corporation and

	5 # * others. All Rights Reserved.

	6 # ********************************************************************

	7 #

	8 # regexcst.pl

	9 # Compile the regular expression parser state table data into initialized C data.

	10 # Usage:

	11 # cd icu/source/i18n

	12 # perl regexcst.pl < regexcst.txt > regexcst.h

	13 #

	14 # The output file, regexcst.h, is included by some of the .cpp regex

	15 # implementation files. This perl script is NOT run as part

	16 # of a normal ICU build. It is run by hand when needed, and the

	17 # regexcst.h generated file is put back into cvs.

	18 #

	19 # See regexcst.txt for a description of the input format for this script.

	20 #

	21 # This script is derived from rbbicst.pl, which performs the same function

	22 # for the Rule Based Break Iterator Rule Parser. Perhaps they could be

	23 # merged?

	24 #

	25

	26

	#
	# Input pass.
	#
	# Reads the state definitions from the input (files named on the command
	# line, or STDIN) and records one table row per transition line in the
	# parallel arrays @state_line_num, @state_literal_chars, @state_char_class,
	# @state_flag, @state_dest_state, @state_push_state and @state_func_name,
	# all indexed by row number.  State labels are recorded in %states
	# (name -> row number) and @stateNames (row number -> name).
	#
	# Diagnostics are written to STDERR: the documented usage redirects STDOUT
	# into the generated header, so messages printed there would end up inside it.
	#
	$num_states = 1;    # Row number of the transition line being compiled.
	$line_num   = 0;    # Current line number in the input, for diagnostics.

	# Pre-register "pop".  A label named "pop" is then reported as a duplicate,
	# and "pop" used in the next-state field resolves to 255.
	$states{"pop"} = 255;

	line_loop: while (<>) {
	    chomp;
	    @fields = split;
	    $line_num++;

	    # A field beginning with '#' starts a comment: drop it and everything after it.
	    for ($i = 0; $i < @fields; $i++) {
	        if ($fields[$i] =~ /^#/) {
	            splice(@fields, $i);
	            last;
	        }
	    }

	    # Nothing left (blank line or comment-only line).
	    next line_loop if (@fields == 0);

	    #
	    # State label handling.
	    #   A first token ending in ':' names a state.  Remember the row number
	    #   it refers to so that references by name can be resolved later.
	    #
	    if ($fields[0] =~ /:$/) {
	        $state_name = $fields[0];
	        $state_name =~ s/:$//;          # strip only the trailing colon

	        if ($states{$state_name} != 0) {
	            # Reported but not fatal; the later definition wins.
	            print STDERR " regexcst: at line $line_num duplicate definition of state $state_name\n";
	        }
	        $states{$state_name}     = $num_states;
	        $stateNames[$num_states] = $state_name;

	        # A label alone on its line applies to the next transition line;
	        # otherwise the transition follows on this same line.
	        next line_loop if (@fields == 1);
	        shift @fields;
	    }

	    #
	    # State transition line.  Syntax:
	    #     character [n] target-state [^push-state] [function-name]
	    #   where [something] is optional, and character is either a single
	    #   quoted character, e.g. '[', or the name of a character class,
	    #   e.g. white_space.
	    #
	    $state_line_num[$num_states] = $line_num;   # for later error messages

	    # First field: quoted literal character, or character class name.
	    if ($fields[0] =~ /^'(.)'$/) {
	        # Capture the character between the quotes, so that a quoted
	        # apostrophe is kept rather than stripped to an empty string.
	        $state_literal_chars[$num_states] = $1;
	    } else {
	        $state_char_class[$num_states] = $fields[0];
	        if ($fields[0] =~ /\W/) {
	            print STDERR " regexcst: at line $line_num, bad character literal or character class name.\n";
	            print STDERR " scanning $fields[0]\n";
	            exit(-1);
	        }
	    }
	    shift @fields;

	    # Optional 'n' flag.
	    if (@fields && $fields[0] eq "n") {
	        $state_flag[$num_states] = "TRUE";
	        shift @fields;
	    } else {
	        $state_flag[$num_states] = "FALSE";
	    }

	    # Destination state (required).
	    if (@fields == 0) {
	        print STDERR " regexcst: at line $line_num, destination state missing.\n";
	        exit(-1);
	    }
	    $state_dest_state[$num_states] = shift @fields;

	    # Optional push state, written as ^name with no space after the caret.
	    if (@fields && $fields[0] =~ /^\^(.*)$/) {
	        my $push_name = $1;
	        if ($push_name eq "") {
	            print STDERR " regexcst: at line $line_num, expected state after ^ (no spaces).\n";
	            exit(-1);
	        }
	        $state_push_state[$num_states] = $push_name;
	        shift @fields;
	    }

	    # Optional action (function) name.
	    if (@fields) {
	        $state_func_name[$num_states] = shift @fields;
	    }

	    # Anything still left on the line is unexpected.  Reported, not fatal.
	    if (@fields > 0) {
	        print STDERR " regexcst: at line $line_num, unexpected extra stuff on input line.\n";
	        print STDERR " scanning $fields[0]\n";
	    }

	    $num_states++;
	}

	157

	#
	# The whole input has been read.  All states are read before anything is
	# written so that the row numbers of destination states are known when the
	# C source for the transition table is emitted.
	#
	# This pass builds two hashes keyed by name: %charClasses for every
	# character class that appeared, and %actions for every action that
	# appeared.  Rows without an action are given "doNOP".
	#
	for ($state = 1; $state < $num_states; $state++) {
	    my $class_name = $state_char_class[$state];
	    if (defined($class_name) && $class_name ne "") {
	        $charClasses{$class_name} ||= 1;
	    }

	    if (!defined($state_func_name[$state]) || $state_func_name[$state] eq "") {
	        $state_func_name[$state] = "doNOP";
	    }

	    # Key on @state_func_name, the array the parser fills in.
	    $actions{$state_func_name[$state]} ||= 1;
	}

	183

	#
	# Check that every destination state and every push state named on a
	# transition line has been defined.  All problems are reported (to STDERR)
	# before the script stops.
	#
	$states{"exit"} = 0;    # Predefined state name, terminates the state machine.
	$errors = 0;
	for ($state = 1; $state < $num_states; $state++) {
	    my $dest = $state_dest_state[$state];
	    my $push = $state_push_state[$state];

	    # A row number of 0 means "never defined"; "exit" is the one name
	    # that legitimately maps to 0.
	    if ($dest ne "exit" && !$states{$dest}) {
	        print STDERR "Error at line $state_line_num[$state]: target state \"$dest\" is not defined.\n";
	        $errors++;
	    }
	    if (defined($push) && $push ne "" && !$states{$push}) {
	        print STDERR "Error at line $state_line_num[$state]: push state \"$push\" is not defined.\n";
	        $errors++;
	    }
	}

	die "regexcst: $errors undefined state reference(s).\n" if ($errors > 0);

	201

	#
	# Emit the banner comment, include guard and namespace opener of the
	# generated header.
	#
	# NOTE(review): the guard macro RBBIRPT_H looks like a leftover from
	# rbbicst.pl.  It is kept as is, because renaming it changes the generated
	# file; confirm whether it should be specific to this header.
	#
	$banner_rule = "//" . ("-" x 81) . "\n";

	print $banner_rule;
	print "//\n",
	      "// Generated Header File. Do not edit by hand.\n",
	      "// This file contains the state table for the ICU Regular Expression Pattern Parser\n",
	      "// It is generated by the Perl script \"regexcst.pl\" from\n",
	      "// the rule parser state definitions file \"regexcst.txt\".\n",
	      "//\n",
	      "// Copyright (C) 2002-2007 International Business Machines Corporation \n",
	      "// and others. All rights reserved. \n",
	      "//\n";
	print $banner_rule;

	print "#ifndef RBBIRPT_H\n";
	print "#define RBBIRPT_H\n";
	print "\n";
	print "U_NAMESPACE_BEGIN\n";

	217

	#
	# Emit the constants for the indices of the Unicode sets.
	#   One constant is defined for each ordinary character class encountered,
	#   numbered upwards from 128.  The value chosen for each class is stored
	#   back into %charClasses for use when the transition table is written.
	#
	#   The classes "default", "quoted" and "eof" get the fixed values 255, 254
	#   and 253 and no constant is emitted for them; per the original notes they
	#   have no corresponding UnicodeSet object and are handled by special-case
	#   code in the state machine.
	#
	#   Class names are visited in sorted order so that regenerating the header
	#   from the same input always yields the same numbering.
	#
	print "//\n";
	print "// Character classes for regex pattern scanning.\n";
	print "//\n";

	%reserved_class_values = ("default" => 255, "quoted" => 254, "eof" => 253);

	$i = 128;   # State table values for Unicode char sets range from 128-250.
	foreach $setName (sort keys %charClasses) {
	    if (exists $reserved_class_values{$setName}) {
	        $charClasses{$setName} = $reserved_class_values{$setName};
	        next;
	    }

	    # Normal character class: emit its constant and remember the value.
	    print " static const uint8_t kRuleSet_$setName = $i;\n";
	    $charClasses{$setName} = $i;
	    $i++;
	}
	print "\n\n";

	246

	#
	# Emit the enum of parser actions, one enumerator per action name that
	# appeared in the input, followed by the closing sentinel rbbiLastAction.
	# Names are written in sorted order so that the generated enum does not
	# depend on Perl's hash ordering.
	#
	print "enum Regex_PatternParseAction {\n";
	foreach $act (sort keys %actions) {
	    print " $act,\n";
	}
	print " rbbiLastAction};\n\n";

	255

	#
	# Emit the struct definition for transition table elements.
	#
	$struct_rule = "//" . ("-" x 79) . "\n";

	print $struct_rule;
	print "//\n";
	print "// RegexTableEl represents the structure of a row in the transition table\n";
	print "// for the pattern parser state machine.\n";
	print $struct_rule;

	print "struct RegexTableEl {\n",
	      " Regex_PatternParseAction fAction;\n",
	      " uint8_t fCharClass; // 0-127: an individual ASCII character\n",
	      " // 128-255: character class index\n",
	      " uint8_t fNextState; // 0-250: normal next-state numbers\n",
	      " // 255: pop next-state from stack.\n",
	      " uint8_t fPushState;\n",
	      " UBool fNextChar;\n",
	      "};\n\n";

	273

	#
	# Emit the state transition table, one initializer per row.
	#   Row 0 is a dummy; real states start at index 1.
	#
	print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
	print " {doNOP, 0, 0, 0, TRUE}\n";
	for ($state = 1; $state < $num_states; $state++) {
	    print " , {$state_func_name[$state],";

	    # Character column: numeric code of a literal, or the class value.
	    my $literal = $state_literal_chars[$state];
	    if (defined($literal) && $literal ne "") {
	        # Use the numeric value, so EBCDIC machines are ok.  The character
	        # is passed as an argument rather than interpolated into the
	        # format, so that a literal '%' is not read as a directive.
	        printf(" %d /* %s */,", ord($literal), $literal);
	    } else {
	        print " $charClasses{$state_char_class[$state]},";
	    }

	    print " $states{$state_dest_state[$state]},";

	    # The push-state field is optional.  If omitted, the field is filled
	    # with a zero, which flags the state machine that there is no push state.
	    if ($state_push_state[$state] eq "") {
	        print "0, ";
	    } else {
	        print " $states{$state_push_state[$state]},";
	    }

	    print " $state_flag[$state]} ";

	    # Trailing C++ comment: the row index and, on the first row of a
	    # state, the state name.
	    print " // $state ";
	    if ($stateNames[$state] ne "") {
	        print " $stateNames[$state]";
	    }
	    print "\n";
	}
	print " };\n";

	307

	308

	#
	# Emit the array mapping state numbers to state names, then close the
	# namespace and the include guard.
	#
	# The array has one entry per table row (row 0 included): the quoted state
	# name for the first row of a labelled state, 0 for every other row, and a
	# final terminating 0.  It is used for producing debugging output from the
	# pattern parser.
	#
	print "static const char * const RegexStateNames[] = {";
	foreach my $row (0 .. $num_states - 1) {
	    my $label = $stateNames[$row];
	    my $entry = (defined($label) && $label ne "") ? " \"$label\",\n" : " 0,\n";
	    print $entry;
	}
	print " 0};\n\n";

	print "U_NAMESPACE_END\n";
	print "#endif\n";

	326

	327

	328

OLD	NEW

« no previous file with comments | « icu46/source/i18n/regexcst.h ('k') | icu46/source/i18n/regexcst.txt » ('j') | no next file with comments »