OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright 2014 The Chromium Authors. All rights reserved. | 2 # Copyright 2014 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """ | 6 """ |
7 A Deterministic acyclic finite state automaton (DAFSA) is a compact | 7 A Deterministic acyclic finite state automaton (DAFSA) is a compact |
8 representation of an unordered word list (dictionary). | 8 representation of an unordered word list (dictionary). |
9 | 9 |
10 http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton | 10 http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton |
(...skipping 174 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
185 | 185 |
186 5: 0x61 <char> label character 0x61 -> match "a" | 186 5: 0x61 <char> label character 0x61 -> match "a" |
187 6: 0x61 <char> label character 0x61 -> match "a" | 187 6: 0x61 <char> label character 0x61 -> match "a" |
188 7: 0x81 <return_value> 0x81 & 0x0F -> return 1 | 188 7: 0x81 <return_value> 0x81 & 0x0F -> return 1 |
189 | 189 |
190 8: 0x62 <char> label character 0x62 -> match "b" | 190 8: 0x62 <char> label character 0x62 -> match "b" |
191 9: 0x62 <char> label character 0x62 -> match "b" | 191 9: 0x62 <char> label character 0x62 -> match "b" |
192 10: 0x82 <return_value> 0x82 & 0x0F -> return 2 | 192 10: 0x82 <return_value> 0x82 & 0x0F -> return 2 |
193 """ | 193 """ |
194 | 194 |
| 195 import argparse |
195 import sys | 196 import sys |
196 | 197 |
| 198 |
| 199 parser = argparse.ArgumentParser() |
| 200 parser.add_argument('infile', nargs='?', type=argparse.FileType('r')) |
| 201 parser.add_argument('outfile', nargs='?', type=argparse.FileType('w')) |
| 202 parser.add_argument('--reverse_inputs', action='store_true', |
| 203 help='Generates a DAFSA on reversed input strings; this is ' |
| 204 'useful for suffix queries') |
| 205 args = parser.parse_args() |
| 206 |
| 207 |
197 class InputError(Exception): | 208 class InputError(Exception): |
198 """Exception raised for errors in the input file.""" | 209 """Exception raised for errors in the input file.""" |
199 | 210 |
200 | 211 |
201 def to_dafsa(words): | 212 def to_dafsa(words): |
202 """Generates a DAFSA from a word list and returns the source node. | 213 """Generates a DAFSA from a word list and returns the source node. |
203 | 214 |
204 Each word is split into characters so that each character is represented by | 215 Each word is split into characters so that each character is represented by |
205 a unique node. It is assumed the word list is not empty. | 216 a unique node. It is assumed the word list is not empty. |
206 """ | 217 """ |
(...skipping 239 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
446 end = lines.index('%%', begin) | 457 end = lines.index('%%', begin) |
447 lines = lines[begin:end] | 458 lines = lines[begin:end] |
448 for line in lines: | 459 for line in lines: |
449 if line[-3:-1] != ', ': | 460 if line[-3:-1] != ', ': |
450 raise InputError('Expected "domainname, <digit>", found "%s"' % line) | 461 raise InputError('Expected "domainname, <digit>", found "%s"' % line) |
451 # Technically the DAFSA format can support return values in the range | 462 # Technically the DAFSA format can support return values in the range |
452 # [0-31], but only the first three bits have any defined meaning. | 463 # [0-31], but only the first three bits have any defined meaning. |
453 if not line.endswith(('0', '1', '2', '3', '4', '5', '6', '7')): | 464 if not line.endswith(('0', '1', '2', '3', '4', '5', '6', '7')): |
454 raise InputError('Expected value to be in the range of 0-7, found "%s"' % | 465 raise InputError('Expected value to be in the range of 0-7, found "%s"' % |
455 line[-1]) | 466 line[-1]) |
456 return [line[:-3] + line[-1] for line in lines] | 467 result = [] |
457 | 468 for line in lines: |
| 469 word, result_code = line[:-3], line[-1] |
| 470 if args.reverse_inputs: |
| 471 word = word[::-1] |
| 472 result.append(word + result_code) |
| 473 result.sort() |
| 474 return result |
458 | 475 |
459 def main(): | 476 def main(): |
460 if len(sys.argv) != 3: | 477 with args.infile as infile, args.outfile as outfile: |
461 print('usage: %s infile outfile' % sys.argv[0]) | |
462 return 1 | |
463 with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile: | |
464 outfile.write(words_to_cxx(parse_gperf(infile))) | 478 outfile.write(words_to_cxx(parse_gperf(infile))) |
465 return 0 | 479 return 0 |
466 | 480 |
467 | 481 |
468 if __name__ == '__main__': | 482 if __name__ == '__main__': |
469 sys.exit(main()) | 483 sys.exit(main()) |
OLD | NEW |