testing/libfuzzer/dictionary_generator.py - Issue 2115563002: [libfuzzer] Added script for dictionary generation.

Side by Side Diff: testing/libfuzzer/dictionary_generator.py

Issue 2115563002: [libfuzzer] Added script for dictionary generation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Restore xml.dict, will replace with generated one in a separate CL. Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/python2

	2 #

	3 # Copyright 2016 The Chromium Authors. All rights reserved.

	4 # Use of this source code is governed by a BSD-style license that can be

	5 # found in the LICENSE file.

	6

	7 """Generate a dictionary for libFuzzer or AFL-based fuzzer.

	8

	9 Invoked manually using a fuzzer binary and target format/protocol specification.

	10 Works better for text formats or protocols. For binary ones may be useless.

	11 """

	12

	13 import argparse

	14 import HTMLParser

	15 import io

	16 import logging

	17 import os

	18 import re

	19 import shutil

	20 import string

	21 import subprocess

	22 import sys

	23 import tempfile

	24

	25

	26 ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']

	27 MIN_STRING_LENGTH = 4

	28

	29

	30 def DecodeHTML(html_data):

	31 """HTML-decoding of the data."""

	32 html_parser = HTMLParser.HTMLParser()

	33 data = html_parser.unescape(html_data.decode('ascii', 'ignore'))

	34 return data.encode('ascii', 'ignore')

	35

	36

	37 def EscapeDictionaryElement(element):

	38 """Escape all unprintable and control characters in an element."""

	39 return element.encode('string_escape').replace('"', '\"')

	40

	41

	42 def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):

	43 """Extract words (splitted strings) from a binary executable file."""

	44 rodata = PreprocessAndReadRodata(filepath)

	45 words = []

	46

	47 strings_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

	48 # Use different encodings for strings extraction.

	49 for encoding in ENCODING_TYPES:

	50 data = rodata.decode(encoding, 'ignore').encode('ascii', 'ignore')

	51 raw_strings = strings_re.findall(data)

	52 for splitted_line in map(lambda line: line.split(), raw_strings):

	53 words += splitted_line

	54

	55 return set(words)

	56

	57

	58 def ExtractWordsFromLines(lines):

	59 """Extract all words from a list of strings."""

	60 words = set()

	61 for line in lines:

	62 for word in line.split():

	63 words.add(word)

	64

	65 return words

	66

	67

	68 def ExtractWordsFromSpec(filepath, is_html):

	69 """Extract words from a specification."""

	70 data = ReadSpecification(filepath, is_html)

	71 words = data.split()

	72 return set(words)

	73

	74

	75 def FindIndentedText(text):

	76 """Find space-indented text blocks, e.g. code or data samples in RFCs."""

	77 lines = text.split('\n')

	78 indented_blocks = []

	79 current_block = ''

	80 previous_number_of_spaces = 0

	81

	82 # Go through every line and concatenate space-indented blocks into lines.

	83 for i in xrange(0, len(lines), 1):

	84 if not lines[i]:

	85 # Ignore empty lines.

	86 continue

	87

	88 # Space-indented text blocks have more leading spaces than regular text.

	89 n = FindNumberOfLeadingSpaces(lines[i])

	90

	91 if n > previous_number_of_spaces:

	92 # Beginning of a space-indented text block, start concatenation.

	93 current_block = lines[i][n : ]

	94 elif n == previous_number_of_spaces and current_block:

	95 # Or continuation of a space-indented text block, concatenate lines.

	96 current_block += '\n' + lines[i][n : ]

	97

	98 if n < previous_number_of_spaces and current_block:

	99 # Current line is not indented, save previously concatenated lines.

	100 indented_blocks.append(current_block)

	101 current_block = ''

	102

	103 previous_number_of_spaces = n

	104

	105 return indented_blocks

	106

	107

	108 def FindNumberOfLeadingSpaces(line):

	109 """Calculate number of leading whitespace characters in the string."""

	110 n = 0

	111 while n < len(line) and line[n].isspace():

	112 n += 1

	113

	114 return n

	115

	116

	117 def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):

	118 """Generate a dictionary for given pair of fuzzer binary and specification."""

	119 for filepath in [path_to_binary, path_to_spec]:

	120 if not os.path.exists(filepath):

	121 logging.error('%s doesn\'t exist. Exit.', filepath)

	122 sys.exit(1)

	123

	124 words_from_binary = ExtractWordsFromBinary(path_to_binary)

	125 words_from_spec = ExtractWordsFromSpec(path_to_spec, is_html)

	126

	127 dictionary_words = set()

	128

	129 if 'i' in strategy:

	130 # Strategy i: only words which are common for binary and for specification.

	131 dictionary_words = words_from_binary.intersection(words_from_spec)

	132

	133 if 'q' in strategy:

	134 # Strategy q: add words from all quoted strings from specification.

	135 # TODO(mmoroz): experimental and very noisy. Not recommended to use.

	136 spec_data = ReadSpecification(path_to_spec, is_html)

	137 quoted_strings = FindIndentedText(spec_data)

	138 quoted_words = ExtractWordsFromLines(quoted_strings)

	139 dictionary_words = dictionary_words.union(quoted_words)

	140

	141 if 'u' in strategy:

	142 # Strategy u: add all uppercase words from specification.

	143 uppercase_words = set(w for w in words_from_spec if w.isupper())

	144 dictionary_words = dictionary_words.union(uppercase_words)

	145

	146 return dictionary_words

	147

	148

	149 def PreprocessAndReadRodata(filepath):

	150 """Create a stripped copy of the binary and extract .rodata section."""

	151 stripped_file = tempfile.NamedTemporaryFile(prefix='.stripped_')

	152 stripped_filepath = stripped_file.name

	153 shutil.copyfile(filepath, stripped_filepath)

	154

	155 # Strip all symbols to reduce amount of redundant strings.

	156 strip_cmd = ['strip', '--strip-all', stripped_filepath]

	157 result = subprocess.call(strip_cmd)

	158 if result:

	159 logging.warning('Failed to strip the binary. Using the original version.')

	160 stripped_filepath = filepath

	161

	162 # Extract .rodata section to reduce amount of redundant strings.

	163 rodata_file = tempfile.NamedTemporaryFile(prefix='.rodata_')

	164 rodata_filepath = rodata_file.name

	165 objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath, rodata_filepath]

	166

	167 # Hide output from stderr since objcopy prints a warning.

	168 with open(os.devnull, 'w') as devnull:

	169 result = subprocess.call(objcopy_cmd, stderr=devnull)

	170

	171 if result:

	172 logging.warning('Failed to extract .rodata section. Using the whole file.')

	173 rodata_filepath = stripped_filepath

	174

	175 with open(rodata_filepath) as file_handle:

	176 data = file_handle.read()

	177

	178 stripped_file.close()

	179 rodata_file.close()

	180

	181 return data

	182

	183

	184 def ReadSpecification(filepath, is_html):

	185 """Read a specification file and return its contents."""

	186 with open(filepath, 'r') as file_handle:

	187 data = file_handle.read()

	188

	189 if is_html:

	190 data = DecodeHTML(data)

	191

	192 return data

	193

	194

	195 def WriteDictionary(dictionary_path, dictionary):

	196 """Write given dictionary to a file."""

	197 with open(dictionary_path, 'wb') as file_handle:

	198 file_handle.write('# This is an automatically generated dictionary.\n')

	199 for word in dictionary:

	200 if not word:

	201 continue

	202 line = '"%s"\n' % EscapeDictionaryElement(word)

	203 file_handle.write(line)

	204

	205

	206 def main():

	207 parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")

	208 parser.add_argument('--fuzzer', required=True,

	209 help='Path to a fuzzer binary executable. It is '

	210 'recommended to use a binary built with '

	211 '"use_libfuzzer=false is_asan=false" to get a better '

	212 'dictionary with fewer number of redundant elements.')

	213 parser.add_argument('--spec', required=True,

	214 help='Path to a target specification (in textual form).')

	215 parser.add_argument('--html', default=0,

	216 help='Decode HTML [01] (0 is default value): '

	217 '1 - if specification has HTML entities to be decoded.')

	218 parser.add_argument('--out', required=True,

	219 help='Path to a file to write a dictionary into.')

	220 parser.add_argument('--strategy', default='iu',

	221 help='Generation strategy [iqu] ("iu" is default value): '

	222 'i - intersection, q - quoted, u - uppercase.')

	223 args = parser.parse_args()

	224

	225 dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,

	226 is_html=bool(args.html))

	227 WriteDictionary(args.out, dictionary)

	228

	229

	230 if __name__ == '__main__':

	231 main()

OLD	NEW

« no previous file with comments | « content/test/data/fuzzer_dictionaries/renderer_fuzzer.dict ('k') | testing/libfuzzer/fuzzers/BUILD.gn » ('j') | no next file with comments »