OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/python2 | |
2 # | |
3 # Copyright 2016 The Chromium Authors. All rights reserved. | |
4 # Use of this source code is governed by a BSD-style license that can be | |
5 # found in the LICENSE file. | |
6 | |
7 """Generate a dictionary for libfuzzer or AFL-based fuzzer. | |
8 | |
9 Invoked manually using a fuzzer binary and target format/protocol specification. | |
10 Works better for text formats or protocols; for binary ones it may be useless. | |
11 """ | |
12 | |
import argparse
import HTMLParser
import io
import os
import random
import re
import shutil
import string
import subprocess
import sys
import tempfile
import time
23 | |
24 | |
# Characters that EscapeDictionaryElement copies into the dictionary verbatim;
# every other character is hex-escaped. string.ascii_letters is used instead of
# string.letters so that the set does not vary with the process locale.
# Note that '"', '\\', '!' and "'" are not part of the set.
ALLOWED_CHARS = (string.ascii_letters + string.digits +
                 '#$%&()*+,-./:;<=>?@[]^_`{|}~ ')
inferno
2016/07/04 00:21:56
maybe better to just use ascii table 32-126 ?
mmoroz
2016/07/06 15:06:19
Removed this white-listing at all.
| |
26 | |
27 | |
def EscapeDictionaryElement(element):
  """Escape all unprintable and control characters in an element.

  Args:
    element: Dictionary word to escape.

  Returns:
    The word in which every character that is not in ALLOWED_CHARS has been
    replaced by the result of HexEscapeSingleChar for that character.
  """
  # Join once instead of growing the result string with repeated '+='.
  return ''.join(
      c if c in ALLOWED_CHARS else HexEscapeSingleChar(c) for c in element)
38 | |
39 | |
def ExtractWordsFromLines(lines):
  """Extract all words from a list of strings.

  Args:
    lines: Iterable of strings.

  Returns:
    A set containing every whitespace-separated token found in the strings.
  """
  words = set()
  for line in lines:
    words.update(line.split())

  return words
48 | |
49 | |
def FindNumberOfLeadingSpaces(line):
  """Calculate number of leading whitespace characters in the string.

  Any whitespace character counts (tabs included), not only ' '.

  Args:
    line: String to inspect.

  Returns:
    Number of whitespace characters before the first non-whitespace one; the
    full length of the string if it holds nothing but whitespace.
  """
  return len(line) - len(line.lstrip())
57 | |
58 | |
def FindQuotedText(text):
  """Find space-indented text blocks, e.g. code samples in RFCs or data examples.

  The text is scanned line by line. A block starts at a line that is indented
  deeper than the line before it, grows while the following lines keep exactly
  the same indentation, and is saved once a line with smaller indentation is
  seen (or the text ends). The common indentation is removed from every line
  of a block.

  Args:
    text: Whole specification as a single string.

  Returns:
    List of strings, one per block found, with lines joined by '\n'.
  """
  found_strings = []
  min_number_of_spaces = 1
  previous_number_of_spaces = 0
  previous_strings = ''

  # Go through every line (the first one included) and concatenate quoted
  # lines into one string.
  for line in text.split('\n'):
    n = FindNumberOfLeadingSpaces(line)
    if n < previous_number_of_spaces and previous_strings:
      # Current line is not quoted, save previously concatenated lines.
      found_strings.append(previous_strings)
      previous_strings = ''
      previous_number_of_spaces = n
      continue

    if n >= min_number_of_spaces:
      if n > previous_number_of_spaces:
        # Beginning of a quoted text, start concatenation. Note that a deeper
        # indentation inside an open block restarts the block from this line.
        previous_strings = line[n:]
      elif n == previous_number_of_spaces and previous_strings:
        # Or continuation of a quoted text, concatenate it.
        previous_strings += '\n' + line[n:]

    previous_number_of_spaces = n

  # The text may end while a block is still open (no trailing newline or
  # dedented line); do not lose that last block.
  if previous_strings:
    found_strings.append(previous_strings)

  return found_strings
88 | |
89 | |
def GenerateDictionary(path_to_binary, path_to_spec, strategy):
  """Build the set of dictionary words for a fuzzer binary and a specification.

  Args:
    path_to_binary: Path handed to ExtractWordsFromBinary.
    path_to_spec: Path handed to ParseSpec (and ReadSpecification for 'q').
    strategy: String of strategy letters; 'i', 'q' and 'u' are recognized and
        their results are combined.

  Returns:
    Set of words selected by the requested strategies.
  """
  binary_words = ExtractWordsFromBinary(path_to_binary)
  spec_words = ParseSpec(path_to_spec)

  selected = set()

  if 'i' in strategy:
    # Strategy i: keep only the words present both in binary and in the spec.
    selected = binary_words & spec_words

  if 'q' in strategy:
    # Strategy q: add words from all quoted strings from specification.
    # TODO(mmoroz): experimental and very noisy. Not recommended to use.
    quoted_blocks = FindQuotedText(ReadSpecification(path_to_spec))
    selected = selected | ExtractWordsFromLines(quoted_blocks)

  if 'u' in strategy:
    # Strategy u: add every uppercase word of the specification.
    selected = selected | set(word for word in spec_words if word.isupper())

  return selected
115 | |
116 | |
def HexEscapeSingleChar(c):
  """Replace a character with its hex-escaped representation.

  Args:
    c: A single character.

  Returns:
    '\\xNN' for a character whose code fits in one byte. A wider character is
    encoded as UTF-8 and every resulting byte is escaped as '\\xNN'.
  """
  code = ord(c)
  if code <= 0xff:
    return '\\x%02x' % code

  # bytearray yields integers on both Python 2 and Python 3.
  return ''.join('\\x%02x' % byte for byte in bytearray(c.encode('utf-8')))
120 | |
121 | |
def ExtractWordsFromBinary(filepath, min_length=4):
  """Extract words (split strings) from a binary executable file.

  The binary is first reduced by PreprocessBinary; the `strings` utility is
  then run over the resulting file once per supported encoding, and its output
  is split on whitespace.

  Args:
    filepath: Path to the binary executable.
    min_length: Minimal string length passed to `strings -n`.

  Returns:
    Set of words found in the binary.

  Raises:
    subprocess.CalledProcessError: if `strings` exits with a non-zero status.
  """
  rodata_path = PreprocessBinary(filepath)
  words = set()

  try:
    # Use different encodings for strings extraction.
    for encoding in 'sSbBlL':
      # An argument list (no shell) keeps paths with spaces or shell
      # metacharacters intact.
      cmd = ['strings', '-e', encoding, '-n', str(min_length), rodata_path]
      words.update(subprocess.check_output(cmd).split())
  finally:
    # The extracted section is only needed while `strings` runs.
    if os.path.exists(rodata_path):
      os.remove(rodata_path)

  return words
134 | |
135 | |
def ParseSpec(filepath):
  """Return the set of whitespace-separated words of a specification file."""
  return set(ReadSpecification(filepath).split())
141 | |
142 | |
def PreprocessBinary(filepath):
  """Create a stripped copy of the binary and extract its .rodata section.

  Args:
    filepath: Path to the binary executable; the file itself is not modified.

  Returns:
    Path to a temporary file holding the output of `objcopy -j .rodata`. The
    caller is responsible for deleting it.

  Raises:
    subprocess.CalledProcessError: if `strip` or `objcopy` exits with a
        non-zero status.
  """
  prefix = os.path.basename(filepath) + '_'
  # mkstemp creates uniquely named files instead of predictable /tmp paths.
  copy_fd, temp_filepath = tempfile.mkstemp(prefix=prefix, suffix='_copy')
  os.close(copy_fd)
  rodata_fd, rodata_filepath = tempfile.mkstemp(prefix=prefix,
                                                suffix='_rodata')
  os.close(rodata_fd)

  try:
    shutil.copyfile(filepath, temp_filepath)

    # Strip all symbols to reduce amount of redundant strings.
    subprocess.check_call(['strip', '--strip-all', temp_filepath])

    # Extract .rodata section to reduce amount of redundant strings. The
    # output of objcopy (stdout and stderr) is captured and discarded.
    subprocess.check_output(
        ['objcopy', '-j', '.rodata', temp_filepath, rodata_filepath],
        stderr=subprocess.STDOUT)
  except Exception:
    # Nothing useful was produced; do not leave the output file behind.
    os.remove(rodata_filepath)
    raise
  finally:
    # The stripped copy is only an intermediate artifact.
    os.remove(temp_filepath)

  return rodata_filepath
156 | |
157 | |
def ReadAndDecodeHTML(filepath):
  """Return HTML-decoded content of the file.

  The file is read as UTF-8, HTML entities are unescaped, and the result is
  encoded to ASCII with characters that cannot be represented dropped.

  Args:
    filepath: Path to the file to read.

  Returns:
    ASCII-encoded content of the file.
  """
  # The context manager closes the file even if reading fails.
  with io.open(filepath, 'r', encoding='utf-8') as f:
    data = f.read()

  data = HTMLParser.HTMLParser().unescape(data)
  return data.encode('ascii', 'ignore')
164 | |
165 | |
def ReadSpecification(filepath):
  """Read a specification file and return its content.

  Files whose name ends with '.txt' are returned exactly as read. Any other
  file is passed through ReadAndDecodeHTML.

  Args:
    filepath: Path to the specification file.

  Returns:
    Content of the specification.
  """
  if filepath.endswith('.txt'):
    # The context manager closes the file as soon as it has been read.
    with open(filepath) as f:
      return f.read()

  # If specification is not a .txt file, decode possible HTML entities.
  return ReadAndDecodeHTML(filepath)
175 | |
176 | |
def WriteDictionary(dictionary_path, dictionary):
  """Write given dictionary to a file.

  The file starts with a comment line, followed by one double-quoted, escaped
  entry per line. Empty words are skipped.

  Args:
    dictionary_path: Path of the file to (over)write.
    dictionary: Iterable of words.
  """
  with open(dictionary_path, 'wb') as f:
    f.write('# This is an automatically generated dictionary.\n')
    # Sort the words so that the same input always yields the same file.
    for word in sorted(dictionary):
      if not word:
        continue
      f.write('"%s"\n' % EscapeDictionaryElement(word))
186 | |
187 | |
def main():
  """Parse command-line arguments, then generate and write the dictionary.

  Exits through parser.error() if --strategy is empty or contains a letter
  other than 'i', 'q' or 'u'.
  """
  parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  # Unknown letters would otherwise be ignored silently and could produce a
  # dictionary that contains nothing but the header line.
  unknown_letters = set(args.strategy) - set('iqu')
  if unknown_letters or not args.strategy:
    parser.error('--strategy must be a non-empty combination of "i", "q" and '
                 '"u", got "%s".' % args.strategy)

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy)
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()
OLD | NEW |