Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(15)

Side by Side Diff: testing/libfuzzer/dictionary_generator.py

Issue 2115563002: [libfuzzer] Added script for dictionary generation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Fix unused imports, missing copyrights, rename one function. Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/python2
2 #
3 # Copyright 2016 The Chromium Authors. All rights reserved.
4 # Use of this source code is governed by a BSD-style license that can be
5 # found in the LICENSE file.
6
7 """Generate a dictionary for libFuzzer or AFL-based fuzzer.
8
9 Invoked manually using a fuzzer binary and target format/protocol specification.
10 Works best for text-based formats and protocols; it may be of little use for binary ones.
11 """
12
13 import argparse
14 import HTMLParser
15 import io
16 import logging
17 import os
18 import re
19 import shutil
20 import string
21 import subprocess
22 import sys
23 import tempfile
24
25
# Encodings tried, one after another, when decoding a binary's data to look
# for embedded strings.
ENCODING_TYPES = [
    'ascii',
    'utf_16_be',
    'utf_16_le',
    'utf_32_be',
    'utf_32_le',
]
27
28
29 def EscapeDictionaryElement(element):
30 """Escape all unprintable and control characters in an element."""
31 return element.encode('string_escape').replace('"', '\"')
32
33
def ExtractWordsFromLines(lines):
  """Return the set of whitespace-separated words found in |lines|.

  Args:
    lines: an iterable of strings.

  Returns:
    A set containing every distinct word produced by splitting each string on
    whitespace.
  """
  return set(word for line in lines for word in line.split())
42
43
def FindNumberOfLeadingSpaces(line):
  """Count the whitespace characters at the start of |line|.

  Args:
    line: the string to inspect.

  Returns:
    The index of the first non-whitespace character, or len(line) when the
    string is empty or consists of whitespace only.
  """
  for position, character in enumerate(line):
    if not character.isspace():
      return position

  # Every character (if any) was whitespace.
  return len(line)
51
52
def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  A block starts on a non-empty line that is indented deeper than the previous
  non-empty line and continues while the following non-empty lines keep exactly
  that indentation. It is saved when a less indented line is reached or when
  the text ends. A line indented deeper than the current block starts a new
  block and replaces the lines collected so far.

  Args:
    text: the whole document as one string.

  Returns:
    A list of strings, one per block. Each string holds the block's lines with
    their leading whitespace removed, joined by newline characters.
  """
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in text.split('\n'):
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = len(line) - len(line.lstrip())

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += '\n' + line[n:]

    if n < previous_number_of_spaces and current_block:
      # Current line is not indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''

    previous_number_of_spaces = n

  # A block that runs up to the end of the text is never followed by a less
  # indented line, so it has to be saved here or it would be lost.
  if current_block:
    indented_blocks.append(current_block)

  return indented_blocks
84
85
def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Build the set of dictionary words for a fuzzer binary and a specification.

  Args:
    path_to_binary: path to the fuzzer binary.
    path_to_spec: path to the specification file.
    strategy: string of strategy letters; 'i', 'q' and 'u' are recognized and
        may be combined.
    is_html: forwarded to the functions that read the specification.

  Returns:
    A set of words selected by the requested strategies.

  The process exits with status 1 if either path does not exist.
  """
  for required_path in (path_to_binary, path_to_spec):
    if os.path.exists(required_path):
      continue
    logging.error('%s doesn\'t exist. Exit.', required_path)
    sys.exit(1)

  binary_words = ExtractWordsFromBinary(path_to_binary)
  spec_words = ExtractWordsFromSpec(path_to_spec, is_html)

  selected_words = set()

  if 'i' in strategy:
    # Strategy i: keep only words present in both the binary and the spec.
    selected_words = binary_words.intersection(spec_words)

  if 'q' in strategy:
    # Strategy q: add the words of every indented text block in the spec.
    # TODO(mmoroz): experimental and very noisy. Not recommended to use.
    spec_text = ReadSpecification(path_to_spec, is_html)
    indented_blocks = FindIndentedText(spec_text)
    selected_words = selected_words.union(
        ExtractWordsFromLines(indented_blocks))

  if 'u' in strategy:
    # Strategy u: add every uppercase word of the spec.
    selected_words = selected_words.union(
        word for word in spec_words if word.isupper())

  return selected_words
116
117
118 def HexEscapeSingleChar(c):
inferno 2016/07/06 15:33:23 unused, please remove.
mmoroz 2016/07/06 16:12:37 Done.
119 """Replace a character with its hex-escaped representation."""
120 return '\\x' + c.encode('hex')
121
122
def ExtractWordsFromBinary(filepath, min_length=4):
  """Extract words (split strings) from a binary executable file.

  The data returned by PreprocessAndReadRodata is decoded with each encoding in
  ENCODING_TYPES and reduced to ASCII. Runs of at least |min_length| characters
  outside the control and non-ASCII ranges are then split on whitespace.

  Args:
    filepath: path to the binary.
    min_length: minimal length of a character run to be considered a string.

  Returns:
    A set of the words found.
  """
  rodata = PreprocessAndReadRodata(filepath)
  printable_run_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

  words = set()
  # Use different encodings for strings extraction.
  for encoding in ENCODING_TYPES:
    ascii_data = rodata.decode(encoding, 'ignore').encode('ascii', 'ignore')
    for raw_string in printable_run_re.findall(ascii_data):
      words.update(raw_string.split())

  return words
137
138
def ExtractWordsFromSpec(filepath, is_html):
  """Return the set of whitespace-separated words of a specification file.

  Args:
    filepath: path to the specification.
    is_html: forwarded to ReadSpecification.
  """
  return set(ReadSpecification(filepath, is_html).split())
144
145
146 def PreprocessAndReadRodata(filepath):
147 """Create a stripped copy of the binary and extract .rodata section."""
148 stripped_file = tempfile.NamedTemporaryFile(prefix='.stripped_')
149 stripped_filepath = stripped_file.name
150 shutil.copyfile(filepath, stripped_filepath)
151
152 # Strip all symbols to reduce amount of redundant strings.
153 strip_cmd = ['strip', '--strip-all', stripped_filepath]
154 result = subprocess.call(strip_cmd)
155 if result:
156 logging.warning('Failed to strip the binary. Using the original version.')
157 stripped_filepath = filepath
158
159 # Extract .rodata section to reduce amount of redundant strings.
160 rodata_file = tempfile.NamedTemporaryFile(prefix='.rodata_')
161 rodata_filepath = rodata_file.name
162 objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath, rodata_filepath]
163
164 # Hide output from stderr since objcopy prints a warning.
165 with open(os.devnull, 'w') as devnull:
166 result = subprocess.call(objcopy_cmd, stderr=devnull)
167
168 if result:
169 logging.warning('Failed to extract .rodata section. Using the whole file.')
170 rodata_filepath = stripped_filepath
171
172 with open(rodata_filepath) as f:
173 data = f.read()
174
175 stripped_file.close()
176 rodata_file.close()
177
178 return data
179
180
181 def ReadAndDecodeHTML(filepath):
182 """Return HTML-decoded content of the file."""
183 with io.open(filepath, 'r', encoding='utf-8') as f:
184 data = f.read()
185 html_parser = HTMLParser.HTMLParser()
186 data = html_parser.unescape(data)
187 return data.encode('ascii', 'ignore')
188
189
def ReadSpecification(filepath, is_html):
  """Read a specification file and return its contents.

  Args:
    filepath: path to the specification.
    is_html: if true, the file is read through ReadAndDecodeHTML; otherwise its
        content is returned as is.

  Returns:
    The content of the specification as a string.
  """
  if is_html:
    return ReadAndDecodeHTML(filepath)

  # Use a context manager so that the file handle is closed right away instead
  # of whenever the file object happens to be garbage collected.
  with open(filepath) as f:
    return f.read()
198
199
def WriteDictionary(dictionary_path, dictionary):
  """Write the given dictionary to a file, one quoted entry per line.

  Args:
    dictionary_path: path of the file to write; it is overwritten.
    dictionary: an iterable of words. Empty words are skipped.
  """
  with open(dictionary_path, 'wb') as out_file:
    out_file.write('# This is an automatically generated dictionary.\n')
    for entry in dictionary:
      if entry:
        out_file.write('"%s"\n' % EscapeDictionaryElement(entry))
209
210
def main():
  """Parse the command line, generate a dictionary and write it to --out."""
  parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  # Parse the flag as an integer: without a type the value arrives as a string
  # and bool('0') is True, so "--html 0" would wrongly enable HTML decoding.
  parser.add_argument('--html', default=0, type=int, choices=[0, 1],
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)
233
234
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
  main()
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698