Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(146)

Side by Side Diff: testing/libfuzzer/dictionary_generator.py

Issue 2115563002: [libfuzzer] Added script for dictionary generation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: A couple of fixes ported from local version. Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/python2
2 #
3 # Copyright 2016 The Chromium Authors. All rights reserved.
4 # Use of this source code is governed by a BSD-style license that can be
5 # found in the LICENSE file.
6
7 """Generate a dictionary for libfuzzer or AFL-based fuzzer.
8
9 Invoked manually using a fuzzer binary and target format/protocol specification.
10 Works best for text formats or protocols; it may be of little use for binary ones.
11 """
12
import argparse
import HTMLParser
import io
import os
import random
import re
import shutil
import string
import subprocess
import sys
import tempfile
import time
23
24
# Characters that are written to the dictionary as is. The double quote and
# the backslash are intentionally absent from this list.
# string.ascii_letters is used instead of string.letters, which is
# locale-dependent and exists in Python 2 only.
ALLOWED_CHARS = (string.ascii_letters + string.digits +
                 '#$%&()*+,-./:;<=>?@[]^_`{|}~ ')
inferno 2016/07/04 00:21:56 maybe better to just use ascii table 32-126 ?
mmoroz 2016/07/06 15:06:19 Removed this white-listing at all.
26
27
def EscapeDictionaryElement(element):
  """Escape all unprintable and control characters in an element.

  Characters listed in ALLOWED_CHARS are kept as is; every other character is
  replaced with the value returned by HexEscapeSingleChar.

  Args:
    element: A dictionary word (string).

  Returns:
    The escaped string.
  """
  # NOTE(review): the 'string_escape' codec was suggested in review as an
  # alternative to this explicit white-list.
  # The result is joined once rather than grown with '+=' per character.
  return ''.join(
      c if c in ALLOWED_CHARS else HexEscapeSingleChar(c) for c in element)
38
39
def ExtractWordsFromLines(lines):
  """Extract all words from a list of strings.

  Args:
    lines: An iterable of strings.

  Returns:
    A set with every whitespace-separated word found in the given strings.
  """
  words = set()
  for line in lines:
    # str.split() without arguments splits on any run of whitespace.
    words.update(line.split())

  return words
48
49
def FindNumberOfLeadingSpaces(line):
  """Calculate number of leading whitespace characters in the string.

  Any whitespace character counts (tabs included), not only ' '. Counting
  plain spaces with line.count(' ') was considered in review and not adopted.

  Args:
    line: A string.

  Returns:
    The number of whitespace characters before the first non-whitespace one,
    or the length of the string if it contains only whitespace.
  """
  return len(line) - len(line.lstrip())
57
58
def FindQuotedText(text):
  """Find space-indented text blocks, e.g. code samples in RFCs.

  The text is scanned line by line while tracking the indentation (number of
  leading whitespace characters) of the previous line:
    - a line indented deeper than the previous one starts a new block (text
      collected for a shallower block is dropped at that point);
    - following lines with exactly the same indentation are appended to it;
    - a less indented line (an empty line included) ends the block.

  For example, 'Intro\n  GET /\n  Host: a\nOutro' yields ['GET /\nHost: a'].

  Args:
    text: Specification text as a single string.

  Returns:
    A list of strings, one per block found, with the indentation removed and
    the lines of a block joined by '\n'.
  """
  found_strings = []
  min_number_of_spaces = 1
  previous_number_of_spaces = 0
  previous_strings = ''

  # Every line is examined, the first one included, so a block that opens the
  # text is not truncated.
  for line in text.split('\n'):
    stripped_line = line.lstrip()
    number_of_spaces = len(line) - len(stripped_line)

    if number_of_spaces < previous_number_of_spaces and previous_strings:
      # Current line is not quoted, save previously concatenated lines.
      found_strings.append(previous_strings)
      previous_strings = ''
      previous_number_of_spaces = number_of_spaces
      continue

    if number_of_spaces >= min_number_of_spaces:
      if number_of_spaces > previous_number_of_spaces:
        # Beginning of a quoted text, start concatenation.
        previous_strings = stripped_line
      elif number_of_spaces == previous_number_of_spaces and previous_strings:
        # Or continuation of a quoted text, concatenate it.
        previous_strings += '\n' + stripped_line

    previous_number_of_spaces = number_of_spaces

  # A block that runs until the end of the text has no dedented line after it,
  # so it has to be saved here.
  if previous_strings:
    found_strings.append(previous_strings)

  return found_strings
88
89
def GenerateDictionary(path_to_binary, path_to_spec, strategy):
  """Build the set of dictionary words for a fuzzer binary and its spec.

  Args:
    path_to_binary: Path to the fuzzer binary.
    path_to_spec: Path to the specification file.
    strategy: String of strategy letters; 'i', 'q' and 'u' are recognised and
        may be combined. Other characters have no effect.

  Returns:
    A set of words selected by the requested strategies.
  """
  binary_words = ExtractWordsFromBinary(path_to_binary)
  spec_words = ParseSpec(path_to_spec)

  selected_words = set()

  # Strategy i: only words which are common for binary and for specification.
  if 'i' in strategy:
    selected_words = binary_words & spec_words

  # Strategy q: add words from all quoted strings from specification.
  # TODO(mmoroz): experimental and very noisy. Not recommended to use.
  if 'q' in strategy:
    quoted_blocks = FindQuotedText(ReadSpecification(path_to_spec))
    selected_words |= ExtractWordsFromLines(quoted_blocks)

  # Strategy u: add all uppercase words from specification.
  if 'u' in strategy:
    selected_words |= set(word for word in spec_words if word.isupper())

  return selected_words
115
116
def HexEscapeSingleChar(c):
  """Replace a character with its hex-escaped representation.

  Args:
    c: A single character. A byte string is escaped byte by byte; a text
        (unicode) character is first encoded to UTF-8.

  Returns:
    A string of '\\xNN' escapes, one per byte, with lowercase hex digits.
  """
  # The 'hex' codec exists in Python 2 only and fails on non-ASCII unicode
  # input, so the bytes are formatted explicitly instead.
  if not isinstance(c, bytes):
    c = c.encode('utf-8')
  # bytearray yields integers on both Python 2 and Python 3.
  return ''.join('\\x%02x' % byte for byte in bytearray(c))
120
121
def ExtractWordsFromBinary(filepath, min_length=4):
  """Extract words (split strings) from a binary executable file.

  Args:
    filepath: Path to the fuzzer binary.
    min_length: Minimal string length passed to `strings` via -n.

  Returns:
    A set of whitespace-separated words printed by `strings` for the file
    returned by PreprocessBinary.
  """
  rodata_path = PreprocessBinary(filepath)
  words = set()

  try:
    # Use different encodings for strings extraction.
    # NOTE(review): extracting strings with a regexp instead of the `strings`
    # tool was suggested in review as a simpler alternative.
    for encoding in 'sSbBlL':
      # Arguments are passed as a list (no shell), so paths with spaces or
      # shell metacharacters are handled correctly.
      process = subprocess.Popen(
          ['strings', '-e', encoding, '-n', '%d' % min_length, rodata_path],
          stdout=subprocess.PIPE)
      strings_raw, _ = process.communicate()
      # The exit status is not checked, as before: an encoding for which
      # `strings` fails simply contributes no words.
      words.update(strings_raw.split())
  finally:
    # The extracted section is a temporary artifact, do not leave it behind.
    if os.path.exists(rodata_path):
      os.remove(rodata_path)

  return words
134
135
def ParseSpec(filepath):
  """Return the set of whitespace-separated words of a specification file."""
  # NOTE(review): renaming this function to ExtractWordsFromSpec was suggested
  # in review; the name is kept here so that existing callers keep working.
  return set(ReadSpecification(filepath).split())
141
142
def PreprocessBinary(filepath):
  """Create a stripped copy of the binary and extract its .rodata section.

  Args:
    filepath: Path to the fuzzer binary.

  Returns:
    Path to a temporary file written by `objcopy -j .rodata`. The caller is
    responsible for deleting it.
  """
  basename = os.path.basename(filepath)
  # mkstemp creates unique, private files instead of predictable /tmp names.
  copy_fd, temp_filepath = tempfile.mkstemp(prefix=basename + '_copy_')
  os.close(copy_fd)
  rodata_fd, rodata_filepath = tempfile.mkstemp(prefix=basename + '_rodata_')
  os.close(rodata_fd)

  try:
    shutil.copyfile(filepath, temp_filepath)

    # Strip all symbols to reduce amount of redundant strings. Arguments are
    # passed as a list (no shell), so unusual paths are safe. As before, the
    # exit status of the tools is not checked.
    subprocess.call(['strip', '--strip-all', temp_filepath])

    # Extract .rodata section to reduce amount of redundant strings. The
    # output of objcopy is discarded, as it was with the former '2>&1' pipe.
    with open(os.devnull, 'w') as devnull:
      subprocess.call(
          ['objcopy', '-j', '.rodata', temp_filepath, rodata_filepath],
          stdout=devnull, stderr=devnull)
  except Exception:
    # Do not leave the output file behind if anything above raised.
    os.remove(rodata_filepath)
    raise
  finally:
    # The stripped copy is only an intermediate artifact.
    os.remove(temp_filepath)

  return rodata_filepath
156
157
def ReadAndDecodeHTML(filepath):
  """Return HTML-decoded content of the file.

  Args:
    filepath: Path to a file, read as UTF-8 text.

  Returns:
    The file content with HTML entities unescaped and then encoded to ASCII;
    characters that cannot be encoded are dropped.
  """
  # The context manager closes the file even if reading fails.
  with io.open(filepath, 'r', encoding='utf-8') as spec_file:
    data = spec_file.read()

  html_parser = HTMLParser.HTMLParser()
  data = html_parser.unescape(data)
  return data.encode('ascii', 'ignore')
164
165
def ReadSpecification(filepath):
  """Read a specification file, decoding HTML entities unless it is a .txt.

  Args:
    filepath: Path to the specification file.

  Returns:
    The content of the file. Files whose name does not end with '.txt' are
    read through ReadAndDecodeHTML.
  """
  # NOTE(review): choosing the reader by file extension was called hacky in
  # review; the check is kept because the file type is not otherwise known.
  if not filepath.endswith('.txt'):
    # If specification is not a .txt file, decode possible HTML entities.
    return ReadAndDecodeHTML(filepath)

  # The context manager closes the file instead of leaking the handle.
  with open(filepath) as spec_file:
    return spec_file.read()
175
176
def WriteDictionary(dictionary_path, dictionary):
  """Save the dictionary: a header comment, then one quoted entry per line.

  Args:
    dictionary_path: Path of the file to (over)write.
    dictionary: Iterable of words; empty words are skipped. Every word is
        escaped with EscapeDictionaryElement and wrapped in double quotes.
  """
  header = '# This is an automatically generated dictionary.\n'
  with open(dictionary_path, 'wb') as dictionary_file:
    dictionary_file.write(header)
    dictionary_file.writelines(
        '"%s"\n' % EscapeDictionaryElement(entry)
        for entry in dictionary if entry)
186
187
def main():
  """Parse command line flags, generate the dictionary and write it out."""
  arg_parser = argparse.ArgumentParser(
      description="Generate fuzzer dictionary.")
  arg_parser.add_argument(
      '--fuzzer', required=True,
      help='Path to a fuzzer binary executable. It is '
           'recommended to use a binary built with '
           '"use_libfuzzer=false is_asan=false" to get a better '
           'dictionary with fewer number of redundant elements.')
  arg_parser.add_argument(
      '--out', required=True,
      help='Path to a file to write a dictionary into.')
  # NOTE(review): making --spec optional was discussed in review; it stays
  # required.
  arg_parser.add_argument(
      '--spec', required=True,
      help='Path to a target specification (in textual form).')
  arg_parser.add_argument(
      '--strategy', default='iu',
      help='Generation strategy [iqu] ("iu" is default value): '
           'i - intersection, q - quoted, u - uppercase.')
  options = arg_parser.parse_args()

  # NOTE(review): a platform check before processing the binary was suggested
  # in review; none is performed here.
  words = GenerateDictionary(options.fuzzer, options.spec, options.strategy)
  WriteDictionary(options.out, words)


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
  main()
OLDNEW
« net/BUILD.gn ('K') | « net/BUILD.gn ('k') | testing/libfuzzer/fuzzers/BUILD.gn » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698