testing/libfuzzer/dictionary_generator.py - Issue 2115563002: [libfuzzer] Added script for dictionary generation.

Side by Side Diff: testing/libfuzzer/dictionary_generator.py

Issue 2115563002: [libfuzzer] Added script for dictionary generation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Fix unused imports, missing copyrights, rename one function. Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/python2

	2 #

	3 # Copyright 2016 The Chromium Authors. All rights reserved.

	4 # Use of this source code is governed by a BSD-style license that can be

	5 # found in the LICENSE file.

	6

	7 """Generate a dictionary for libFuzzer or AFL-based fuzzer.

	8

	9 Invoked manually using a fuzzer binary and target format/protocol specification.

	10 Works better for text formats or protocols. For binary ones may be useless.

	11 """

	12

	13 import argparse

	14 import HTMLParser

	15 import io

	16 import logging

	17 import os

	18 import re

	19 import shutil

	20 import string

	21 import subprocess

	22 import sys

	23 import tempfile

	24

	25

	26 ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']

	27

	28

	29 def EscapeDictionaryElement(element):

	30 """Escape all unprintable and control characters in an element."""

	31 return element.encode('string_escape').replace('"', '\"')

	32

	33

	34 def ExtractWordsFromLines(lines):

	35 """Extract all words from a list of strings."""

	36 words = set()

	37 for line in lines:

	38 for word in line.split():

	39 words.add(word)

	40

	41 return words

	42

	43

	44 def FindNumberOfLeadingSpaces(line):

	45 """Calculate number of leading whitespace characters in the string."""

	46 n = 0

	47 while n < len(line) and line[n].isspace():

	48 n += 1

	49

	50 return n

	51

	52

	53 def FindIndentedText(text):

	54 """Find space-indented text blocks, e.g. code or data samples in RFCs."""

	55 lines = text.split('\n')

	56 indented_blocks = []

	57 current_block = ''

	58 previous_number_of_spaces = 0

	59

	60 # Go through every line and concatenate space-indented blocks into lines.

	61 for i in xrange(0, len(lines), 1):

	62 if not lines[i]:

	63 # Ignore empty lines.

	64 continue

	65

	66 # Space-indented text blocks have more leading spaces than regular text.

	67 n = FindNumberOfLeadingSpaces(lines[i])

	68

	69 if n > previous_number_of_spaces:

	70 # Beginning of a space-indented text block, start concatenation.

	71 current_block = lines[i][n : ]

	72 elif n == previous_number_of_spaces and current_block:

	73 # Or continuation of a space-indented text block, concatenate lines.

	74 current_block += '\n' + lines[i][n : ]

	75

	76 if n < previous_number_of_spaces and current_block:

	77 # Current line is not indented, save previously concatenated lines.

	78 indented_blocks.append(current_block)

	79 current_block = ''

	80

	81 previous_number_of_spaces = n

	82

	83 return indented_blocks

	84

	85

	86 def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):

	87 """Generate a dictionary for given pair of fuzzer binary and specification."""

	88 for filepath in [path_to_binary, path_to_spec]:

	89 if not os.path.exists(filepath):

	90 logging.error('%s doesn\'t exist. Exit.', filepath)

	91 sys.exit(1)

	92

	93 words_from_binary = ExtractWordsFromBinary(path_to_binary)

	94 words_from_spec = ExtractWordsFromSpec(path_to_spec, is_html)

	95

	96 dictionary_words = set()

	97

	98 if 'i' in strategy:

	99 # Strategy i: only words which are common for binary and for specification.

	100 dictionary_words = words_from_binary.intersection(words_from_spec)

	101

	102 if 'q' in strategy:

	103 # Strategy q: add words from all quoted strings from specification.

	104 # TODO(mmoroz): experimental and very noisy. Not recommended to use.

	105 spec_data = ReadSpecification(path_to_spec, is_html)

	106 quoted_strings = FindIndentedText(spec_data)

	107 quoted_words = ExtractWordsFromLines(quoted_strings)

	108 dictionary_words = dictionary_words.union(quoted_words)

	109

	110 if 'u' in strategy:

	111 # Strategy u: add all uppercase words from specification.

	112 uppercase_words = set(w for w in words_from_spec if w.isupper())

	113 dictionary_words = dictionary_words.union(uppercase_words)

	114

	115 return dictionary_words

	116

	117

	118 def HexEscapeSingleChar(c):
	inferno 2016/07/06 15:33:23 unused, please remove. unused, please remove. mmoroz 2016/07/06 16:12:37 Done. Show quoted text On 2016/07/06 15:33:23, inferno wrote: > unused, please remove. Done.
	119 """Replace a character with its hex-escaped representation."""

	120 return '\\x' + c.encode('hex')

	121

	122

	123 def ExtractWordsFromBinary(filepath, min_length=4):
	inferno 2016/07/06 15:33:23 nit: Put 4 in a global nit: Put 4 in a global mmoroz 2016/07/06 16:12:37 Done. Show quoted text On 2016/07/06 15:33:23, inferno wrote: > nit: Put 4 in a global Done.
	124 """Extract words (splitted strings) from a binary executable file."""

	125 rodata = PreprocessAndReadRodata(filepath)

	126 words = []

	127

	128 strings_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

	129 # Use different encodings for strings extraction.

	130 for encoding in ENCODING_TYPES:

	131 data = rodata.decode(encoding, 'ignore').encode('ascii', 'ignore')

	132 strings_raw = strings_re.findall(data)
	inferno 2016/07/06 15:33:23 s/strings_raw/raw_strings s/strings_raw/raw_strings mmoroz 2016/07/06 16:12:37 Done. Show quoted text On 2016/07/06 15:33:23, inferno wrote: > s/strings_raw/raw_strings Done.
	133 for splitted_line in map(lambda line: line.split(), strings_raw):

	134 words += splitted_line

	135

	136 return set(words)

	137

	138

	139 def ExtractWordsFromSpec(filepath, is_html):

	140 """Extract words from a specification."""

	141 data = ReadSpecification(filepath, is_html)

	142 words = data.split()

	143 return set(words)

	144

	145

	146 def PreprocessAndReadRodata(filepath):

	147 """Create a stripped copy of the binary and extract .rodata section."""

	148 stripped_file = tempfile.NamedTemporaryFile(prefix='.stripped_')

	149 stripped_filepath = stripped_file.name

	150 shutil.copyfile(filepath, stripped_filepath)

	151

	152 # Strip all symbols to reduce amount of redundant strings.

	153 strip_cmd = ['strip', '--strip-all', stripped_filepath]

	154 result = subprocess.call(strip_cmd)

	155 if result:

	156 logging.warning('Failed to strip the binary. Using the original version.')

	157 stripped_filepath = filepath

	158

	159 # Extract .rodata section to reduce amount of redundant strings.

	160 rodata_file = tempfile.NamedTemporaryFile(prefix='.rodata_')

	161 rodata_filepath = rodata_file.name

	162 objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath, rodata_filepath]

	163

	164 # Hide output from stderr since objcopy prints a warning.

	165 with open(os.devnull, 'w') as devnull:

	166 result = subprocess.call(objcopy_cmd, stderr=devnull)

	167

	168 if result:

	169 logging.warning('Failed to extract .rodata section. Using the whole file.')

	170 rodata_filepath = stripped_filepath

	171

	172 with open(rodata_filepath) as f:

	173 data = f.read()

	174

	175 stripped_file.close()

	176 rodata_file.close()

	177

	178 return data

	179

	180

	181 def ReadAndDecodeHTML(filepath):

	182 """Return HTML-decoded content of the file."""

	183 with io.open(filepath, 'r', encoding='utf-8') as f:

	184 data = f.read()

	185 html_parser = HTMLParser.HTMLParser()

	186 data = html_parser.unescape(data)

	187 return data.encode('ascii', 'ignore')

	188

	189

	190 def ReadSpecification(filepath, is_html):

	191 """Read a specification file and return its contents."""

	192 if not is_html:

	193 data = open(filepath).read()
	inferno 2016/07/06 15:33:23 nit: forgot to close file. with open(filepath, 'r nit: forgot to close file. with open(filepath, 'r') as file_handle: data = file_handle.read() if is_html: data = DecodeHTML(data) No need to read file in ReadAndDecodeHTML then, just rename to DecodeHTML. mmoroz 2016/07/06 16:12:37 HTMLParser.unescape fails in this case. But decode Show quoted text On 2016/07/06 15:33:23, inferno wrote: > nit: forgot to close file. > > with open(filepath, 'r') as file_handle: > data = file_handle.read() > > if is_html: > data = DecodeHTML(data) > > No need to read file in ReadAndDecodeHTML then, just rename to DecodeHTML. HTMLParser.unescape fails in this case. But decode('ascii', 'ignore') prevents HTMLParser from failure and produces the same results, so I add it. mmoroz 2016/07/06 16:12:37 Done. Show quoted text On 2016/07/06 15:33:23, inferno wrote: > nit: forgot to close file. > > with open(filepath, 'r') as file_handle: > data = file_handle.read() > > if is_html: > data = DecodeHTML(data) > > No need to read file in ReadAndDecodeHTML then, just rename to DecodeHTML. Done.
	194 else:

	195 data = ReadAndDecodeHTML(filepath)

	196

	197 return data

	198

	199

	200 def WriteDictionary(dictionary_path, dictionary):

	201 """Write given dictionary to a file."""

	202 with open(dictionary_path, 'wb') as f:

	203 f.write('# This is an automatically generated dictionary.\n')

	204 for word in dictionary:

	205 if not word:

	206 continue

	207 line = '"%s"\n' % EscapeDictionaryElement(word)

	208 f.write(line)

	209

	210

	211 def main():

	212 parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")

	213 parser.add_argument('--fuzzer', required=True,

	214 help='Path to a fuzzer binary executable. It is '

	215 'recommended to use a binary built with '

	216 '"use_libfuzzer=false is_asan=false" to get a better '

	217 'dictionary with fewer number of redundant elements.')

	218 parser.add_argument('--spec', required=True,

	219 help='Path to a target specification (in textual form).')

	220 parser.add_argument('--html', default=0,

	221 help='Decode HTML [01] (0 is default value): '

	222 '1 - if specification has HTML entities to be decoded.')

	223 parser.add_argument('--out', required=True,

	224 help='Path to a file to write a dictionary into.')

	225 parser.add_argument('--strategy', default='iu',

	226 help='Generation strategy [iqu] ("iu" is default value): '

	227 'i - intersection, q - quoted, u - uppercase.')

	228 args = parser.parse_args()

	229

	230 dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,

	231 is_html=bool(args.html))

	232 WriteDictionary(args.out, dictionary)

	233

	234

# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
  main()

OLD	NEW