Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(440)

Side by Side Diff: testing/libfuzzer/dictionary_generator.py

Issue 2115563002: [libfuzzer] Added script for dictionary generation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Restore xml.dict, will replace with generated one in a separate CL. Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/python2
2 #
3 # Copyright 2016 The Chromium Authors. All rights reserved.
4 # Use of this source code is governed by a BSD-style license that can be
5 # found in the LICENSE file.
6
7 """Generate a dictionary for libFuzzer or AFL-based fuzzer.
8
9 Invoked manually using a fuzzer binary and target format/protocol specification.
10 Works better for text formats or protocols. For binary ones may be useless.
11 """
12
13 import argparse
14 import HTMLParser
15 import io
16 import logging
17 import os
18 import re
19 import shutil
20 import string
21 import subprocess
22 import sys
23 import tempfile
24
25
# Encodings tried, one after another, when decoding the data read from the
# fuzzer binary in order to find strings embedded in it.
ENCODING_TYPES = [
    'ascii',
    'utf_16_be',
    'utf_16_le',
    'utf_32_be',
    'utf_32_le',
]

# Default minimal length of a character run extracted from the binary.
MIN_STRING_LENGTH = 4
28
29
30 def DecodeHTML(html_data):
31 """HTML-decoding of the data."""
32 html_parser = HTMLParser.HTMLParser()
33 data = html_parser.unescape(html_data.decode('ascii', 'ignore'))
34 return data.encode('ascii', 'ignore')
35
36
def EscapeDictionaryElement(element):
  r"""Escape an element so it can be written as a quoted dictionary entry.

  Only the escape sequences understood by the dictionary syntax are produced:
  a backslash becomes \\, a double quote becomes \" and every character
  outside of the printable ASCII range (0x20-0x7E) becomes \xNN. All other
  characters, including the single quote, are copied unchanged.

  Args:
    element: a byte string (or ASCII-only text) to be escaped.

  Returns:
    The escaped string, without the surrounding double quotes.
  """
  escaped = []
  for char in element:
    code = ord(char)
    if char in ('\\', '"'):
      # These two characters would otherwise terminate or corrupt the quoted
      # entry, so they get a backslash prefix.
      escaped.append('\\' + char)
    elif 0x20 <= code < 0x7F:
      escaped.append(char)
    else:
      # Control and non-ASCII characters are written in hexadecimal form.
      escaped.append('\\x%02x' % code)

  return ''.join(escaped)
40
41
def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Collect whitespace-separated words found in a binary executable file.

  The data returned by PreprocessAndReadRodata() is decoded with every
  encoding from ENCODING_TYPES. Runs of at least |min_length| characters in
  the 0x20-0x7E range are located in the result and split on whitespace.

  Returns:
    A set with all words that were found.
  """
  rodata = PreprocessAndReadRodata(filepath)
  printable_run_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

  found_words = set()
  for encoding in ENCODING_TYPES:
    # Anything that does not survive the round trip to ASCII is discarded.
    ascii_data = rodata.decode(encoding, 'ignore').encode('ascii', 'ignore')
    for printable_run in printable_run_re.findall(ascii_data):
      found_words.update(printable_run.split())

  return found_words
56
57
def ExtractWordsFromLines(lines):
  """Return the set of whitespace-separated words found in |lines|."""
  return set(word for line in lines for word in line.split())
66
67
def ExtractWordsFromSpec(filepath, is_html):
  """Return the set of whitespace-separated words of a specification file.

  The file is read with ReadSpecification(), which receives |is_html| as is.
  """
  return set(ReadSpecification(filepath, is_html).split())
73
74
def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  A block starts on a line that has more leading whitespace than the previous
  non-empty line, continues while the following non-empty lines keep the same
  indentation and ends on the first line that is indented less, or at the end
  of the text.

  Args:
    text: the whole text to scan, as a single string.

  Returns:
    A list of strings, one per block found. The indentation is removed from
    every line of a block and the lines are joined with a newline character.
  """
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in text.split('\n'):
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = FindNumberOfLeadingSpaces(line)

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += '\n' + line[n:]
    elif n < previous_number_of_spaces and current_block:
      # Current line is not indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''

    previous_number_of_spaces = n

  if current_block:
    # The text ended inside of an indented block. No less indented line
    # follows it, so the block has to be saved here or it would be lost.
    indented_blocks.append(current_block)

  return indented_blocks
106
107
def FindNumberOfLeadingSpaces(line):
  """Return how many whitespace characters |line| starts with."""
  for index, char in enumerate(line):
    if not char.isspace():
      # First non-whitespace character, its index is the answer.
      return index

  # The line is empty or consists of whitespace characters only.
  return len(line)
115
116
def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Build the set of dictionary words for a fuzzer binary and specification.

  |strategy| is a string of one-letter flags ('i', 'q', 'u'), each of them
  enabling one source of words, see the comments below. The process exits
  with status 1 if either of the given paths does not exist.

  Returns:
    A set of words selected by the enabled strategies.
  """
  for required_path in (path_to_binary, path_to_spec):
    if os.path.exists(required_path):
      continue
    logging.error('%s doesn\'t exist. Exit.', required_path)
    sys.exit(1)

  binary_words = ExtractWordsFromBinary(path_to_binary)
  spec_words = ExtractWordsFromSpec(path_to_spec, is_html)

  selected_words = set()

  # Strategy i: only words which are common for binary and for specification.
  if 'i' in strategy:
    selected_words = binary_words & spec_words

  # Strategy q: add words from all space-indented text blocks of specification.
  # TODO(mmoroz): experimental and very noisy. Not recommended to use.
  if 'q' in strategy:
    indented_blocks = FindIndentedText(ReadSpecification(path_to_spec, is_html))
    selected_words = selected_words | ExtractWordsFromLines(indented_blocks)

  # Strategy u: add all uppercase words from specification.
  if 'u' in strategy:
    selected_words = selected_words | set(
        word for word in spec_words if word.isupper())

  return selected_words
147
148
def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract .rodata section.

  Both steps are best effort: if `strip` fails, the original binary is used
  instead of the stripped copy, and if `objcopy` fails, the whole (stripped or
  original) file is read instead of the extracted section.

  Args:
    filepath: path to the binary executable file.

  Returns:
    Raw contents of the file produced by the last successful step.
  """
  # Context managers guarantee that both temporary files are closed (and so
  # removed) even if copying, running the tools or reading raises an exception.
  with tempfile.NamedTemporaryFile(prefix='.stripped_') as stripped_file, \
       tempfile.NamedTemporaryFile(prefix='.rodata_') as rodata_file:
    stripped_filepath = stripped_file.name
    shutil.copyfile(filepath, stripped_filepath)

    # Strip all symbols to reduce amount of redundant strings.
    strip_cmd = ['strip', '--strip-all', stripped_filepath]
    if subprocess.call(strip_cmd):
      logging.warning('Failed to strip the binary. Using the original version.')
      stripped_filepath = filepath

    # Extract .rodata section to reduce amount of redundant strings.
    rodata_filepath = rodata_file.name
    objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath,
                   rodata_filepath]

    # Hide output from stderr since objcopy prints a warning.
    with open(os.devnull, 'w') as devnull:
      result = subprocess.call(objcopy_cmd, stderr=devnull)

    if result:
      logging.warning('Failed to extract .rodata section. Using the whole file.')
      rodata_filepath = stripped_filepath

    # The data is binary, so it must not go through any text mode translation.
    with open(rodata_filepath, 'rb') as file_handle:
      return file_handle.read()
182
183
def ReadSpecification(filepath, is_html):
  """Return contents of a specification file.

  If |is_html| is true, the contents are passed through DecodeHTML() first.
  """
  with open(filepath, 'r') as spec_file:
    contents = spec_file.read()

  return DecodeHTML(contents) if is_html else contents
193
194
def WriteDictionary(dictionary_path, dictionary):
  """Write words from |dictionary| to a file, one quoted entry per line.

  The file starts with a comment line saying that it was generated. Empty
  words are skipped, all the others go through EscapeDictionaryElement().
  """
  header = '# This is an automatically generated dictionary.\n'
  with open(dictionary_path, 'wb') as output_file:
    output_file.write(header)
    for entry in dictionary:
      if entry:
        output_file.write('"%s"\n' % EscapeDictionaryElement(entry))
204
205
def main():
  """Parse command line arguments, generate a dictionary and write it out."""
  parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  # The value has to be converted to an integer. Without a type it stays a
  # string and bool('0') is True, so "--html 0" would enable HTML decoding.
  parser.add_argument('--html', default=0, type=int, choices=[0, 1],
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()
OLDNEW
« no previous file with comments | « content/test/data/fuzzer_dictionaries/renderer_fuzzer.dict ('k') | testing/libfuzzer/fuzzers/BUILD.gn » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698