OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/python2 | |
2 # | |
3 # Copyright 2016 The Chromium Authors. All rights reserved. | |
4 # Use of this source code is governed by a BSD-style license that can be | |
5 # found in the LICENSE file. | |
6 | |
7 """Generate a dictionary for libFuzzer or AFL-based fuzzer. | |
8 | |
9 Invoked manually using a fuzzer binary and target format/protocol specification. | |
10 Works better for text formats or protocols; it may be useless for binary ones. | |
11 """ | |
12 | |
13 import argparse | |
14 import HTMLParser | |
15 import io | |
16 import logging | |
17 import os | |
18 import re | |
19 import shutil | |
20 import string | |
21 import subprocess | |
22 import sys | |
23 import tempfile | |
24 | |
25 | |
# Encodings tried, one after another, when extracting strings from the binary.
ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']
27 | |
28 | |
def EscapeDictionaryElement(element):
  """Escape an element so it can be written as a quoted dictionary entry.

  Args:
    element: the word to escape (byte string or text).

  Returns:
    A string in which backslashes and double quotes are backslash-escaped,
    other printable ASCII characters are kept unchanged, and every remaining
    byte is written as a two-digit hex escape.
  """
  # Only three escapes are produced: \\ for a backslash, \" for a double quote
  # and \xNN for anything unprintable. The previous implementation replaced
  # '"' with '\"', which is the same one-character string, so quotes were
  # never escaped; it also relied on the 'string_escape' codec, which emits
  # \' and \n style sequences that the dictionary syntax does not document.
  if not isinstance(element, bytes):
    element = element.encode('utf-8')

  escaped = []
  for code in bytearray(element):
    char = chr(code)
    if char in ('\\', '"'):
      escaped.append('\\' + char)
    elif 0x20 <= code < 0x7F:
      escaped.append(char)
    else:
      escaped.append('\\x%02x' % code)

  return ''.join(escaped)
32 | |
33 | |
def ExtractWordsFromLines(lines):
  """Extract all words from a list of strings.

  Args:
    lines: iterable of strings.

  Returns:
    A set with every whitespace-separated word found in |lines|.
  """
  return {word for line in lines for word in line.split()}
42 | |
43 | |
def FindNumberOfLeadingSpaces(line):
  """Calculate number of leading whitespace characters in the string.

  Args:
    line: the string to inspect.

  Returns:
    The count of whitespace characters before the first non-whitespace one;
    len(line) if the string consists only of whitespace.
  """
  return len(line) - len(line.lstrip())
51 | |
52 | |
def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  Args:
    text: the whole document as a single string.

  Returns:
    A list of strings, one per indented block, with the block's lines joined
    by '\\n' and their leading whitespace removed.
  """
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in text.split('\n'):
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = len(line) - len(line.lstrip())

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += '\n' + line[n:]

    if n < previous_number_of_spaces and current_block:
      # Current line is not indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''

    previous_number_of_spaces = n

  # A block that runs up to the end of the text has no dedented line after it
  # to trigger the save above, so store it here instead of dropping it.
  if current_block:
    indented_blocks.append(current_block)

  return indented_blocks
84 | |
85 | |
def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Generate a dictionary for given pair of fuzzer binary and specification.

  Args:
    path_to_binary: path to the fuzzer binary.
    path_to_spec: path to the specification file.
    strategy: string of strategy letters; 'i', 'q' and 'u' are recognised and
        any combination of them may be given.
    is_html: whether the specification should be HTML-decoded when read.

  Returns:
    A set of dictionary words.

  Exits the process with status 1 if either path does not exist.
  """
  for filepath in (path_to_binary, path_to_spec):
    if not os.path.exists(filepath):
      logging.error('%s doesn\'t exist. Exit.', filepath)
      sys.exit(1)

  words_from_spec = ExtractWordsFromSpec(path_to_spec, is_html)
  dictionary_words = set()

  if 'i' in strategy:
    # Strategy i: only words which are common for binary and for specification.
    # The binary is processed only here because no other strategy uses it.
    words_from_binary = ExtractWordsFromBinary(path_to_binary)
    dictionary_words |= words_from_binary & words_from_spec

  if 'q' in strategy:
    # Strategy q: add words from all quoted strings from specification.
    # TODO(mmoroz): experimental and very noisy. Not recommended to use.
    spec_data = ReadSpecification(path_to_spec, is_html)
    quoted_strings = FindIndentedText(spec_data)
    dictionary_words |= ExtractWordsFromLines(quoted_strings)

  if 'u' in strategy:
    # Strategy u: add all uppercase words from specification.
    dictionary_words |= {w for w in words_from_spec if w.isupper()}

  return dictionary_words
116 | |
117 | |
def HexEscapeSingleChar(c):
  """Replace a character with its hex-escaped representation.

  Args:
    c: the character to escape.

  Returns:
    The string '\\x' followed by the two-digit lowercase hex code of |c|.
  """
  # NOTE(review): this helper was flagged as unused during code review
  # ("unused, please remove"); it is kept here in case something outside this
  # file still calls it.
  return '\\x' + ''.join('%02x' % ord(char) for char in c)
121 | |
122 | |
# Shortest run of printable characters that is treated as a string.
MIN_STRING_LENGTH = 4


def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Extract words (split strings) from a binary executable file.

  Args:
    filepath: path to the binary.
    min_length: minimal length of a printable character run to be considered
        a string.

  Returns:
    A set of whitespace-separated words found in the extracted strings.
  """
  rodata = PreprocessAndReadRodata(filepath)
  words = set()

  strings_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)
  # Use different encodings for strings extraction.
  for encoding in ENCODING_TYPES:
    data = rodata.decode(encoding, 'ignore').encode('ascii', 'ignore')
    raw_strings = strings_re.findall(data)
    for raw_string in raw_strings:
      words.update(raw_string.split())

  return words
137 | |
138 | |
def ExtractWordsFromSpec(filepath, is_html):
  """Extract words from a specification.

  Args:
    filepath: path to the specification file.
    is_html: whether the file should be HTML-decoded when read.

  Returns:
    A set of whitespace-separated words found in the specification.
  """
  return set(ReadSpecification(filepath, is_html).split())
144 | |
145 | |
def _RunTool(cmd, **kwargs):
  """Run |cmd|; return its exit code, or 1 if it could not be launched."""
  try:
    return subprocess.call(cmd, **kwargs)
  except OSError:
    return 1


def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract .rodata section.

  Args:
    filepath: path to the binary.

  Returns:
    The raw contents of the extracted .rodata section. If extraction fails,
    the contents of the stripped copy are returned instead; if stripping
    fails as well, the contents of the original file.
  """
  # Work inside a private directory that is removed as a whole afterwards:
  # the external tools may replace or delete the files they are given, so
  # cleanup must not depend on those exact files still being present.
  temp_dir = tempfile.mkdtemp(prefix='.rodata_')
  try:
    stripped_filepath = os.path.join(temp_dir, 'stripped')
    shutil.copyfile(filepath, stripped_filepath)

    # Strip all symbols to reduce amount of redundant strings.
    strip_cmd = ['strip', '--strip-all', stripped_filepath]
    if _RunTool(strip_cmd):
      logging.warning('Failed to strip the binary. Using the original version.')
      stripped_filepath = filepath

    # Extract .rodata section to reduce amount of redundant strings.
    rodata_filepath = os.path.join(temp_dir, 'rodata')
    objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath,
                   rodata_filepath]

    # Hide output from stderr since objcopy prints a warning.
    with open(os.devnull, 'w') as devnull:
      result = _RunTool(objcopy_cmd, stderr=devnull)

    if result:
      logging.warning('Failed to extract .rodata section. Using the whole file.')
      rodata_filepath = stripped_filepath

    # The data is binary, so read it without any text-mode translation.
    with open(rodata_filepath, 'rb') as f:
      return f.read()
  finally:
    shutil.rmtree(temp_dir, ignore_errors=True)
179 | |
180 | |
def ReadAndDecodeHTML(filepath):
  """Return HTML-decoded content of the file.

  Args:
    filepath: path to a UTF-8 encoded file.

  Returns:
    The file contents with HTML entities replaced by the characters they
    stand for, encoded as ASCII; characters outside ASCII are dropped.
  """
  # Keep using HTMLParser where it is available, and fall back to the
  # documented html.unescape() on interpreters without that module.
  try:
    import HTMLParser
    unescape = HTMLParser.HTMLParser().unescape
  except ImportError:
    from html import unescape

  with io.open(filepath, 'r', encoding='utf-8') as f:
    data = f.read()

  return unescape(data).encode('ascii', 'ignore')
188 | |
189 | |
def ReadSpecification(filepath, is_html):
  """Read a specification file and return its contents.

  Args:
    filepath: path to the specification file.
    is_html: if true, the file is read through ReadAndDecodeHTML; otherwise
        it is read as is.

  Returns:
    The contents of the file.
  """
  if is_html:
    return ReadAndDecodeHTML(filepath)

  # Use a context manager so the file is closed once it has been read.
  with open(filepath) as f:
    return f.read()
198 | |
199 | |
def WriteDictionary(dictionary_path, dictionary):
  """Write given dictionary to a file.

  Args:
    dictionary_path: path of the file to create or overwrite.
    dictionary: iterable of words; empty words are skipped.

  The file starts with a comment line, followed by one double-quoted,
  escaped word per line.
  """
  with open(dictionary_path, 'wb') as f:
    f.write('# This is an automatically generated dictionary.\n')
    # Sort the words so that the same input always produces the same file.
    for word in sorted(dictionary):
      if not word:
        continue
      f.write('"%s"\n' % EscapeDictionaryElement(word))
209 | |
210 | |
def main():
  """Parse command line arguments, generate a dictionary and write it out."""
  parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  # Parse the flag as an integer: without a type, "--html 0" is stored as the
  # string '0', and bool('0') is True, so HTML decoding could not be disabled
  # explicitly.
  parser.add_argument('--html', type=int, choices=[0, 1], default=0,
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()
OLD | NEW |