Chromium Code Reviews

| OLD | NEW |
|---|---|
| (Empty) | |
| 1 #!/usr/bin/python2 | |
| 2 # | |
| 3 # Copyright 2016 The Chromium Authors. All rights reserved. | |
| 4 # Use of this source code is governed by a BSD-style license that can be | |
| 5 # found in the LICENSE file. | |
| 6 | |
| 7 """Generate a dictionary for libfuzzer or AFL-based fuzzer. | |
| 8 | |
| 9 Invoked manually using a fuzzer binary and target format/protocol specification. | |
| 10 Works better for text formats or protocols; for binary ones it may be useless. | |
| 11 """ | |
| 12 | |
| 13 import argparse | |
| 14 import HTMLParser | |
| 15 import io | |
| 16 import os | |
| 17 import random | |
| 18 import re | |
| 19 import shutil | |
| 20 import string | |
| 21 import sys | |
| 22 import time | |
| 23 | |
| 24 | |
| 25 ALLOWED_CHARS = string.letters + string.digits + '#$%&()*+,-./:;<=>?@[]^_`{|}~ ' | |
|
inferno
2016/07/04 00:21:56
maybe better to just use ascii table 32-126 ?
mmoroz
2016/07/06 15:06:19
Removed this white-listing at all.
| |
| 26 | |
| 27 | |
| 28 def EscapeDictionaryElement(element): | |
|
aizatsky
2016/07/01 22:28:55
Consider using string_escape:
https://docs.python
mmoroz
2016/07/06 15:06:18
Thank you, it is almost exactly what I need!
| |
| 29 """Escape all unprintable and control characters in an element.""" | |
| 30 res = '' | |
|
inferno
2016/07/04 00:21:55
s/res/result. full variable names are preferred.
mmoroz
2016/07/06 15:06:18
Done.
| |
| 31 for c in element: | |
| 32 if c in ALLOWED_CHARS: | |
| 33 res += c | |
| 34 else: | |
| 35 res += HexEscapeSingleChar(c) | |
| 36 | |
| 37 return res | |
| 38 | |
| 39 | |
| 40 def ExtractWordsFromLines(lines): | |
| 41 """Extract all words from a list of strings.""" | |
| 42 words = set() | |
| 43 for l in lines: | |
|
inferno
2016/07/04 00:21:55
nit: s/l/line, s/w/word
mmoroz
2016/07/06 15:06:18
Done.
| |
| 44 for w in l.split(): | |
| 45 words.add(w) | |
| 46 | |
| 47 return words | |
| 48 | |
| 49 | |
| 50 def FindNumberOfLeadingSpaces(line): | |
| 51 """Calculate number of leading whitespace characters in the string.""" | |
| 52 n = 0 | |
|
inferno
2016/07/04 00:21:55
nit: maybe just line.count(' ') ? or we care about
mmoroz
2016/07/06 15:06:19
I'm afraid that specification in textual form may
| |
| 53 while n < len(line) and line[n].isspace(): | |
| 54 n += 1 | |
| 55 | |
| 56 return n | |
| 57 | |
| 58 | |
| 59 def FindQuotedText(text): | |
| 60 """Find text with an offset, e.g. code samples in RFCs or data examples.""" | |
|
aizatsky
2016/07/01 22:28:55
Find space-indented text blocks, e.g. ...
mmoroz
2016/07/06 15:06:19
Done.
| |
| 61 strings = text.split('\n') | |
| 62 found_strings = [] | |
| 63 min_number_of_spaces = 1 | |
| 64 previous_number_of_spaces = 0 | |
| 65 previous_strings = '' | |
| 66 | |
| 67 # Go through every line and concatenate quoted lines into one string. | |
|
inferno
2016/07/04 00:21:55
Can you more explanation on algorithm and add an e
mmoroz
2016/07/06 15:06:19
Made it a bit simpler, added comments and renamed
| |
| 68 for i in xrange(1, len(strings), 1): | |
| 69 n = FindNumberOfLeadingSpaces(strings[i]) | |
| 70 if n < previous_number_of_spaces and previous_strings: | |
| 71 # Current line is not quoted, save previously concatenated lines. | |
| 72 found_strings.append(previous_strings) | |
| 73 previous_strings = '' | |
| 74 previous_number_of_spaces = n | |
| 75 continue | |
| 76 | |
| 77 if n >= min_number_of_spaces: | |
| 78 if n > previous_number_of_spaces: | |
| 79 # Beginning of a quoted text, start concatenation. | |
| 80 previous_strings = strings[i][n : ] | |
| 81 elif n == previous_number_of_spaces and previous_strings: | |
| 82 # Or continuation of a quoted text, concatenate it. | |
| 83 previous_strings += '\n' + strings[i][n : ] | |
| 84 | |
| 85 previous_number_of_spaces = n | |
| 86 | |
| 87 return found_strings | |
| 88 | |
| 89 | |
| 90 def GenerateDictionary(path_to_binary, path_to_spec, strategy): | |
| 91 """Generate a dictionary for given pair of fuzzer binary and specification.""" | |
| 92 words_from_binary = ExtractWordsFromBinary(path_to_binary) | |
| 93 words_from_spec = ParseSpec(path_to_spec) | |
| 94 | |
| 95 dictionary_words = set() | |
| 96 | |
| 97 if 'i' in strategy: | |
| 98 # Strategy i: only words which are common for binary and for specification. | |
| 99 dictionary_words = words_from_binary.intersection(words_from_spec) | |
| 100 | |
| 101 if 'q' in strategy: | |
| 102 # Strategy q: add words from all quoted strings from specification. | |
| 103 # TODO(mmoroz): experimental and very noisy. Not recommended to use. | |
| 104 spec_data = ReadSpecification(path_to_spec) | |
| 105 quoted_strings = FindQuotedText(spec_data) | |
| 106 quoted_words = ExtractWordsFromLines(quoted_strings) | |
| 107 dictionary_words = dictionary_words.union(quoted_words) | |
| 108 | |
| 109 if 'u' in strategy: | |
| 110 # Strategy u: add all uppercase words from specification. | |
| 111 uppercase_words = set(w for w in words_from_spec if w.isupper()) | |
| 112 dictionary_words = dictionary_words.union(uppercase_words) | |
| 113 | |
| 114 return dictionary_words | |
| 115 | |
| 116 | |
| 117 def HexEscapeSingleChar(c): | |
| 118 """Replace a character with its hex-escaped representation.""" | |
| 119 return '\\x' + c.encode('hex') | |
| 120 | |
| 121 | |
| 122 def ExtractWordsFromBinary(filepath, min_length=4): | |
| 123 """Extract words (split strings) from a binary executable file.""" | |
| 124 rodata_path = PreprocessBinary(filepath) | |
|
Oliver Chang
2016/07/01 19:10:45
is rodata_path ever deleted?
mmoroz
2016/07/06 15:06:19
Done. Now it works via tempfile, thanks for the su
| |
| 125 words = [] | |
| 126 | |
| 127 # Use different encodings for strings extraction. | |
| 128 for encoding in 'sSbBlL': | |
| 129 cmd = 'strings -e %s -n %d %s' % (encoding, min_length, rodata_path) | |
|
aizatsky
2016/07/01 22:28:55
Wouldn't it be simpler to simply use regexp to ext
mmoroz
2016/07/06 15:06:18
Nice idea! Looks like it gives even a bit "cleaner
| |
| 130 strings_raw = os.popen(cmd).read() | |
| 131 words += strings_raw.split() | |
| 132 | |
| 133 return set(words) | |
| 134 | |
| 135 | |
| 136 def ParseSpec(filepath): | |
|
inferno
2016/07/04 00:21:56
nit: s/ParseSpec/ExtractWordsFromSpec
mmoroz
2016/07/06 15:06:18
Done.
| |
| 137 """Extract words from a specification.""" | |
| 138 data = ReadSpecification(filepath) | |
| 139 words = data.split() | |
| 140 return set(words) | |
| 141 | |
| 142 | |
| 143 def PreprocessBinary(filepath): | |
| 144 """Create a stripped copy of the binary and extract the .rodata section.""" | |
| 145 temp_filepath = os.path.join('/tmp', os.path.basename(filepath) + '_copy') | |
|
Oliver Chang
2016/07/01 19:10:45
can you use the tempfile instead?
inferno
2016/07/04 00:21:56
+1
mmoroz
2016/07/06 15:06:18
Done.
| |
| 146 shutil.copyfile(filepath, temp_filepath) | |
| 147 | |
| 148 # Strip all symbols to reduce amount of redundant strings. | |
| 149 os.popen('strip --strip-all %s' % temp_filepath) | |
|
inferno
2016/07/04 00:21:56
os.popen is deprecated, use subprocess module.
mmoroz
2016/07/06 15:06:18
Thanks! Done.
| |
| 150 | |
| 151 # Extract .rodata section to reduce amount of redundant strings. | |
| 152 rodata_filepath = os.path.join('/tmp', os.path.basename(filepath) + '_rodata') | |
| 153 os.popen('objcopy -j .rodata %s %s 2>&1' % (temp_filepath, rodata_filepath)) | |
| 154 | |
| 155 return rodata_filepath | |
| 156 | |
| 157 | |
| 158 def ReadAndDecodeHTML(filepath): | |
| 159 """Return HTML-decoded content of the file.""" | |
| 160 data = io.open(filepath, 'r', encoding='utf-8').read() | |
|
inferno
2016/07/04 00:21:55
forgot to close file :(, also just plain open shou
mmoroz
2016/07/06 15:06:19
My bad, I thought that it is not mandatory to expl
| |
| 161 html_parser = HTMLParser.HTMLParser() | |
| 162 data = html_parser.unescape(data) | |
| 163 return data.encode('ascii', 'ignore') | |
| 164 | |
| 165 | |
| 166 def ReadSpecification(filepath): | |
| 167 """Read a specification file and convert it to ASCII text if needed.""" | |
|
inferno
2016/07/04 00:21:55
description part "convert it to ASCII text if need
mmoroz
2016/07/06 15:06:19
Done.
| |
| 168 if filepath.endswith('.txt'): | |
|
inferno
2016/07/04 00:21:56
This seems too hacky, i wonder if people will know
mmoroz
2016/07/06 15:06:19
Good point, but we don't know exactly should file
| |
| 169 data = open(filepath).read() | |
| 170 else: | |
| 171 # If specification is not a .txt file, decode possible HTML entities. | |
| 172 data = ReadAndDecodeHTML(filepath) | |
|
inferno
2016/07/04 00:21:55
maybe split ReadAndDecodeHtml into ReadFile and de
mmoroz
2016/07/06 15:06:18
Yeah, but regular open() works fine for everything
| |
| 173 | |
| 174 return data | |
| 175 | |
| 176 | |
| 177 def WriteDictionary(dictionary_path, dictionary): | |
| 178 """Write given dictionary to a file.""" | |
| 179 with open(dictionary_path, 'wb') as f: | |
| 180 f.write('# This is an automatically generated dictionary.\n') | |
| 181 for word in dictionary: | |
| 182 if not word: | |
| 183 continue | |
| 184 line = '"%s"\n' % EscapeDictionaryElement(word) | |
| 185 f.write(line) | |
| 186 | |
| 187 | |
| 188 def main(): | |
| 189 parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.") | |
| 190 parser.add_argument('--fuzzer', required=True, | |
| 191 help='Path to a fuzzer binary executable. It is ' | |
| 192 'recommended to use a binary built with ' | |
| 193 '"use_libfuzzer=false is_asan=false" to get a better ' | |
| 194 'dictionary with fewer number of redundant elements.') | |
| 195 parser.add_argument('--out', required=True, | |
| 196 help='Path to a file to write a dictionary into.') | |
| 197 parser.add_argument('--spec', required=True, | |
|
Oliver Chang
2016/07/01 19:10:45
can the spec option be optional? is it reasonable
mmoroz
2016/07/06 15:06:18
I don't think it will be reasonable, there are too
| |
| 198 help='Path to a target specification (in textual form).') | |
| 199 parser.add_argument('--strategy', default='iu', | |
| 200 help='Generation strategy [iqu] ("iu" is default value): ' | |
| 201 'i - intersection, q - quoted, u - uppercase.') | |
| 202 args = parser.parse_args() | |
| 203 | |
| 204 dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy) | |
|
inferno
2016/07/04 00:21:55
I think you should check the platform so that bina
mmoroz
2016/07/06 15:06:19
Good point, thanks!
My plan is to focus on dynami
| |
| 205 WriteDictionary(args.out, dictionary) | |
| 206 | |
| 207 | |
| 208 if __name__ == '__main__': | |
| 209 main() | |
| OLD | NEW |