Chromium Code Reviews| Index: testing/libfuzzer/dictionary_generator.py |
| diff --git a/testing/libfuzzer/dictionary_generator.py b/testing/libfuzzer/dictionary_generator.py |
| new file mode 100755 |
| index 0000000000000000000000000000000000000000..95961744065f04754301d5d0bd03b576f4d7f48e |
| --- /dev/null |
| +++ b/testing/libfuzzer/dictionary_generator.py |
| @@ -0,0 +1,209 @@ |
| +#!/usr/bin/python2 |
| +# |
| +# Copyright 2016 The Chromium Authors. All rights reserved. |
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| +"""Generate a dictionary for libfuzzer or AFL-based fuzzer. |
| + |
| +Invoked manually using a fuzzer binary and target format/protocol specification. |
| +Works better for text formats or protocols. For binary ones it may be useless. |
| +""" |
| + |
| +import argparse |
| +import HTMLParser |
| +import io |
| +import os |
| +import random |
| +import re |
| +import shutil |
| +import string |
| +import sys |
| +import time |
| + |
| + |
| +ALLOWED_CHARS = string.letters + string.digits + '#$%&()*+,-./:;<=>?@[]^_`{|}~ ' |
|
inferno
2016/07/04 00:21:56
maybe better to just use ascii table 32-126 ?
mmoroz
2016/07/06 15:06:19
Removed this white-listing altogether.
|
| + |
| + |
| +def EscapeDictionaryElement(element): |
|
aizatsky
2016/07/01 22:28:55
Consider using string_escape:
https://docs.python
mmoroz
2016/07/06 15:06:18
Thank you, it is almost exactly what I need!
|
| + """Escape all unprintable and control characters in an element.""" |
| + res = '' |
|
inferno
2016/07/04 00:21:55
s/res/result. full variable names are preferred.
mmoroz
2016/07/06 15:06:18
Done.
|
| + for c in element: |
| + if c in ALLOWED_CHARS: |
| + res += c |
| + else: |
| + res += HexEscapeSingleChar(c) |
| + |
| + return res |
| + |
| + |
| +def ExtractWordsFromLines(lines): |
| + """Extract all words from a list of strings.""" |
| + words = set() |
| + for l in lines: |
|
inferno
2016/07/04 00:21:55
nit: s/l/line, s/w/word
mmoroz
2016/07/06 15:06:18
Done.
|
| + for w in l.split(): |
| + words.add(w) |
| + |
| + return words |
| + |
| + |
| +def FindNumberOfLeadingSpaces(line): |
| + """Calculate number of leading whitespace characters in the string.""" |
| + n = 0 |
|
inferno
2016/07/04 00:21:55
nit: maybe just line.count(' ') ? or we care about
mmoroz
2016/07/06 15:06:19
I'm afraid that specification in textual form may
|
| + while n < len(line) and line[n].isspace(): |
| + n += 1 |
| + |
| + return n |
| + |
| + |
| +def FindQuotedText(text): |
| + """Find text with an offset, e.g. code samples in RFCs or data examples.""" |
|
aizatsky
2016/07/01 22:28:55
Find space-indented text blocks, e.g. ...
mmoroz
2016/07/06 15:06:19
Done.
|
| + strings = text.split('\n') |
| + found_strings = [] |
| + min_number_of_spaces = 1 |
| + previous_number_of_spaces = 0 |
| + previous_strings = '' |
| + |
| + # Go through every line and concatenate quoted lines into one string. |
|
inferno
2016/07/04 00:21:55
Can you add more explanation on the algorithm and add an e
mmoroz
2016/07/06 15:06:19
Made it a bit simpler, added comments and renamed
|
| + for i in xrange(1, len(strings), 1): |
| + n = FindNumberOfLeadingSpaces(strings[i]) |
| + if n < previous_number_of_spaces and previous_strings: |
| + # Current line is not quoted, save previously concatenated lines. |
| + found_strings.append(previous_strings) |
| + previous_strings = '' |
| + previous_number_of_spaces = n |
| + continue |
| + |
| + if n >= min_number_of_spaces: |
| + if n > previous_number_of_spaces: |
| + # Beginning of a quoted text, start concatenation. |
| + previous_strings = strings[i][n : ] |
| + elif n == previous_number_of_spaces and previous_strings: |
| + # Or continuation of a quoted text, concatenate it. |
| + previous_strings += '\n' + strings[i][n : ] |
| + |
| + previous_number_of_spaces = n |
| + |
| + return found_strings |
| + |
| + |
| +def GenerateDictionary(path_to_binary, path_to_spec, strategy): |
| + """Generate a dictionary for given pair of fuzzer binary and specification.""" |
| + words_from_binary = ExtractWordsFromBinary(path_to_binary) |
| + words_from_spec = ParseSpec(path_to_spec) |
| + |
| + dictionary_words = set() |
| + |
| + if 'i' in strategy: |
| + # Strategy i: only words which are common for binary and for specification. |
| + dictionary_words = words_from_binary.intersection(words_from_spec) |
| + |
| + if 'q' in strategy: |
| + # Strategy q: add words from all quoted strings from specification. |
| + # TODO(mmoroz): experimental and very noisy. Not recommended to use. |
| + spec_data = ReadSpecification(path_to_spec) |
| + quoted_strings = FindQuotedText(spec_data) |
| + quoted_words = ExtractWordsFromLines(quoted_strings) |
| + dictionary_words = dictionary_words.union(quoted_words) |
| + |
| + if 'u' in strategy: |
| + # Strategy u: add all uppercase words from specification. |
| + uppercase_words = set(w for w in words_from_spec if w.isupper()) |
| + dictionary_words = dictionary_words.union(uppercase_words) |
| + |
| + return dictionary_words |
| + |
| + |
| +def HexEscapeSingleChar(c): |
| + """Replace a character with its hex-escaped representation.""" |
| + return '\\x' + c.encode('hex') |
| + |
| + |
| +def ExtractWordsFromBinary(filepath, min_length=4): |
| + """Extract words (split strings) from a binary executable file.""" |
| + rodata_path = PreprocessBinary(filepath) |
|
Oliver Chang
2016/07/01 19:10:45
is rodata_path ever deleted?
mmoroz
2016/07/06 15:06:19
Done. Now it works via tempfile, thanks for the su
|
| + words = [] |
| + |
| + # Use different encodings for strings extraction. |
| + for encoding in 'sSbBlL': |
| + cmd = 'strings -e %s -n %d %s' % (encoding, min_length, rodata_path) |
|
aizatsky
2016/07/01 22:28:55
Wouldn't it be simpler to simply use regexp to ext
mmoroz
2016/07/06 15:06:18
Nice idea! Looks like it gives even a bit "cleaner
|
| + strings_raw = os.popen(cmd).read() |
| + words += strings_raw.split() |
| + |
| + return set(words) |
| + |
| + |
| +def ParseSpec(filepath): |
|
inferno
2016/07/04 00:21:56
nit: s/ParseSpec/ExtractWordsFromSpec
mmoroz
2016/07/06 15:06:18
Done.
|
| + """Extract words from a specification.""" |
| + data = ReadSpecification(filepath) |
| + words = data.split() |
| + return set(words) |
| + |
| + |
| +def PreprocessBinary(filepath): |
| + """Create a stripped copy of the binary and extract its .rodata section.""" |
| + temp_filepath = os.path.join('/tmp', os.path.basename(filepath) + '_copy') |
|
Oliver Chang
2016/07/01 19:10:45
can you use the tempfile instead?
inferno
2016/07/04 00:21:56
+1
mmoroz
2016/07/06 15:06:18
Done.
|
| + shutil.copyfile(filepath, temp_filepath) |
| + |
| + # Strip all symbols to reduce amount of redundant strings. |
| + os.popen('strip --strip-all %s' % temp_filepath) |
|
inferno
2016/07/04 00:21:56
os.popen is deprecated, use subprocess module.
mmoroz
2016/07/06 15:06:18
Thanks! Done.
|
| + |
| + # Extract .rodata section to reduce amount of redundant strings. |
| + rodata_filepath = os.path.join('/tmp', os.path.basename(filepath) + '_rodata') |
| + os.popen('objcopy -j .rodata %s %s 2>&1' % (temp_filepath, rodata_filepath)) |
| + |
| + return rodata_filepath |
| + |
| + |
| +def ReadAndDecodeHTML(filepath): |
| + """Return HTML-decoded content of the file.""" |
| + data = io.open(filepath, 'r', encoding='utf-8').read() |
|
inferno
2016/07/04 00:21:55
forgot to close file :(, also just plain open shou
mmoroz
2016/07/06 15:06:19
My bad, I thought that it is not mandatory to expl
|
| + html_parser = HTMLParser.HTMLParser() |
| + data = html_parser.unescape(data) |
| + return data.encode('ascii', 'ignore') |
| + |
| + |
| +def ReadSpecification(filepath): |
| + """Read a specification file and convert it to ASCII text if needed.""" |
|
inferno
2016/07/04 00:21:55
description part "convert it to ASCII text if need
mmoroz
2016/07/06 15:06:19
Done.
|
| + if filepath.endswith('.txt'): |
|
inferno
2016/07/04 00:21:56
This seems too hacky, i wonder if people will know
mmoroz
2016/07/06 15:06:19
Good point, but we don't know exactly should file
|
| + data = open(filepath).read() |
| + else: |
| + # If specification is not a .txt file, decode possible HTML entities. |
| + data = ReadAndDecodeHTML(filepath) |
|
inferno
2016/07/04 00:21:55
maybe split ReadAndDecodeHtml into ReadFile and de
mmoroz
2016/07/06 15:06:18
Yeah, but regular open() works fine for everything
|
| + |
| + return data |
| + |
| + |
| +def WriteDictionary(dictionary_path, dictionary): |
| + """Write given dictionary to a file.""" |
| + with open(dictionary_path, 'wb') as f: |
| + f.write('# This is an automatically generated dictionary.\n') |
| + for word in dictionary: |
| + if not word: |
| + continue |
| + line = '"%s"\n' % EscapeDictionaryElement(word) |
| + f.write(line) |
| + |
| + |
| +def main(): |
| + parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.") |
| + parser.add_argument('--fuzzer', required=True, |
| + help='Path to a fuzzer binary executable. It is ' |
| + 'recommended to use a binary built with ' |
| + '"use_libfuzzer=false is_asan=false" to get a better ' |
| + 'dictionary with fewer number of redundant elements.') |
| + parser.add_argument('--out', required=True, |
| + help='Path to a file to write a dictionary into.') |
| + parser.add_argument('--spec', required=True, |
|
Oliver Chang
2016/07/01 19:10:45
can the spec option be optional? is it reasonable
mmoroz
2016/07/06 15:06:18
I don't think it will be reasonable, there are too
|
| + help='Path to a target specification (in textual form).') |
| + parser.add_argument('--strategy', default='iu', |
| + help='Generation strategy [iqu] ("iu" is default value): ' |
| + 'i - intersection, q - quoted, u - uppercase.') |
| + args = parser.parse_args() |
| + |
| + dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy) |
|
inferno
2016/07/04 00:21:55
I think you should check the platform so that bina
mmoroz
2016/07/06 15:06:19
Good point, thanks!
My plan is to focus on dynami
|
| + WriteDictionary(args.out, dictionary) |
| + |
| + |
| +if __name__ == '__main__': |
| + main() |