OLD | NEW |
(Empty) | |
#!/usr/bin/env bash
#
# american fuzzy lop - corpus minimization tool
# ---------------------------------------------
#
# Written and maintained by Michal Zalewski <lcamtuf@google.com>
#
# Copyright 2014, 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# This tool tries to find the smallest subset of files in the input directory
# that still trigger the full range of instrumentation data points seen in
# the starting corpus. This has two uses:
#
#   - Screening large corpora of input files before using them as a seed for
#     afl-fuzz. The tool will remove functionally redundant files and likely
#     leave you with a much smaller set.
#
#     (In this case, you probably also want to consider running afl-tmin on
#     the individual files later on to reduce their size.)
#
#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
#     planning to feed it to more resource-intensive tools. The tool achieves
#     this by removing all entries that used to trigger unique behaviors in the
#     past, but have been made obsolete by later finds.
#
# Note that the tool doesn't modify the files themselves. For that, you want
# afl-tmin.
#
# This script must use bash because other shells may have hardcoded limits on
# array sizes.
#

echo "corpus minimization tool for afl-fuzz by <lcamtuf@google.com>"
echo

#########
# SETUP #
#########

# Process command-line options...

MEM_LIMIT=100
TIMEOUT=none

unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE

while getopts "+i:o:f:m:t:eQC" opt; do

  case "$opt" in

    "i")
      IN_DIR="$OPTARG"
      ;;

    "o")
      OUT_DIR="$OPTARG"
      ;;

    "f")
      STDIN_FILE="$OPTARG"
      ;;

    "m")
      MEM_LIMIT="$OPTARG"
      MEM_LIMIT_GIVEN=1
      ;;

    "t")
      TIMEOUT="$OPTARG"
      ;;

    "e")
      EXTRA_PAR="$EXTRA_PAR -e"
      ;;

    "C")
      export AFL_CMIN_CRASHES_ONLY=1
      ;;

    "Q")
      EXTRA_PAR="$EXTRA_PAR -Q"
      # QEMU mode needs more headroom; only bump the limit if the user
      # did not set one explicitly with -m.
      test "$MEM_LIMIT_GIVEN" = "" && MEM_LIMIT=250
      QEMU_MODE=1
      ;;

    "?")
      exit 1
      ;;

  esac

done

shift $((OPTIND-1))

TARGET_BIN="$1"

if [[ "$TARGET_BIN" = "" || "$IN_DIR" = "" || "$OUT_DIR" = "" ]]; then

  cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]

Required parameters:

  -i dir        - input directory with the starting corpus
  -o dir        - output directory for minimized files

Execution control settings:

  -f file       - location read by the fuzzed program (stdin)
  -m megs       - memory limit for child process ($MEM_LIMIT MB)
  -t msec       - run time limit for child process (none)
  -Q            - use binary-only instrumentation (QEMU mode)

Minimization settings:

  -C            - keep crashing inputs, reject everything else
  -e            - solve for edge coverage only, ignore hit counts

For additional tips, please consult docs/README.

_EOF_
  exit 1
fi

# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script.

# Succeeds (returns 0) if the given path lives under /tmp or /var/tmp.
in_tmp_dir() {
  echo "$1" | grep -qE '^(/var)?/tmp/'
}

if in_tmp_dir "$IN_DIR" || in_tmp_dir "$TARGET_BIN" || in_tmp_dir "$OUT_DIR" || \
   in_tmp_dir "$STDIN_FILE" || in_tmp_dir "$PWD"; then
  echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2
  exit 1
fi

# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.

TRACE_DIR="$OUT_DIR/.traces"

if [ "$STDIN_FILE" = "" ]; then

  if echo "$*" | grep -qF '@@'; then
    STDIN_FILE="$TRACE_DIR/.cur_input"
  fi

fi

# Check for obvious errors.

if [[ ! "$MEM_LIMIT" = "none" ]]; then

  if [ "$MEM_LIMIT" -lt "5" ]; then
    echo "[-] Error: dangerously low memory limit." 1>&2
    exit 1
  fi

fi

if [[ ! "$TIMEOUT" = "none" ]]; then

  if [ "$TIMEOUT" -lt "10" ]; then
    echo "[-] Error: dangerously low timeout." 1>&2
    exit 1
  fi

fi

if [[ ! -f "$TARGET_BIN" || ! -x "$TARGET_BIN" ]]; then

  # Not a usable path - fall back to a $PATH lookup ('command -v' is the
  # portable replacement for 'which').
  TNEW="$(command -v "$TARGET_BIN" 2>/dev/null)"

  if [[ ! -f "$TNEW" || ! -x "$TNEW" ]]; then
    echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
    exit 1
  fi

  TARGET_BIN="$TNEW"

fi

if [[ "$AFL_SKIP_BIN_CHECK" = "" && "$QEMU_MODE" = "" ]]; then

  # Instrumented binaries reference the shared-memory environment variable;
  # its absence means the target was not built with afl-gcc/afl-clang.
  if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
    exit 1
  fi

fi

if [ ! -d "$IN_DIR" ]; then
  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
  exit 1
fi

test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"

# Clean up any leftovers from a previous run. Note that -maxdepth is a
# positional option and must precede tests such as -name.
find "$OUT_DIR" -maxdepth 1 -name 'id[:_]*' -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null

rmdir "$OUT_DIR" 2>/dev/null

if [ -d "$OUT_DIR" ]; then
  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
  exit 1
fi

mkdir -m 700 -p "$TRACE_DIR" || exit 1

if [ ! "$STDIN_FILE" = "" ]; then
  rm -f "$STDIN_FILE" || exit 1
  touch "$STDIN_FILE" || exit 1
fi

if [ "$AFL_PATH" = "" ]; then
  SHOWMAP="${0%/afl-cmin}/afl-showmap"
else
  SHOWMAP="$AFL_PATH/afl-showmap"
fi

if [ ! -x "$SHOWMAP" ]; then
  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# The $(( )) wrapper strips the whitespace padding that BSD wc emits.
IN_COUNT=$(( $(ls -- "$IN_DIR" 2>/dev/null | wc -l) ))

if [ "$IN_COUNT" = "0" ]; then
  echo "No inputs in the target directory - nothing to be done."
  rm -rf "$TRACE_DIR"
  exit 1
fi

# NOTE: parsing ls output is safe here only because AFL queue entries never
# contain whitespace or glob characters in their names.
FIRST_FILE=$(ls "$IN_DIR" | head -1)

# Prefer hard links over copies when the filesystem allows it.
if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
  CP_TOOL=ln
else
  CP_TOOL=cp
fi

# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.

echo "[*] Testing the target binary..."

# $EXTRA_PAR is deliberately unquoted: it may hold several flags (-e -Q).
if [ "$STDIN_FILE" = "" ]; then

  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"

else

  cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null

fi

FIRST_COUNT=$(( $(grep -c . "$TRACE_DIR/.run_test") ))

if [ "$FIRST_COUNT" -gt "0" ]; then

  echo "[+] OK, $FIRST_COUNT tuples recorded."

else

  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1

fi

# Let's roll!

#############################
# STEP 1: COLLECTING TRACES #
#############################

echo "[*] Obtaining traces for input files in '$IN_DIR'..."

(

  CUR=0

  if [ "$STDIN_FILE" = "" ]; then

    while read -r fn; do

      CUR=$((CUR+1))
      printf '\r    Processing file %s/%s... ' "$CUR" "$IN_COUNT"

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"

    done < <(ls "$IN_DIR")

  else

    while read -r fn; do

      CUR=$((CUR+1))
      printf '\r    Processing file %s/%s... ' "$CUR" "$IN_COUNT"

      cp "$IN_DIR/$fn" "$STDIN_FILE"

      "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -A "$STDIN_FILE" -- "$@" </dev/null

    done < <(ls "$IN_DIR")

  fi

)

echo

##########################
# STEP 2: SORTING TUPLES #
##########################

# With this out of the way, we sort all tuples by popularity across all
# datasets. The reasoning here is that we won't be able to avoid the files
# that trigger unique tuples anyway, so we will want to start with them and
# see what's left.

echo "[*] Sorting trace sets (this may take a while)..."

ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
  sort | uniq -c | sort -n >"$TRACE_DIR/.all_uniq"

TUPLE_COUNT=$(( $(grep -c . "$TRACE_DIR/.all_uniq") ))

echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."

#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################

# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could be still pulled off in
# a shell script.

echo "[*] Finding best candidates for each tuple..."

CUR=0

while read -r fn; do

  CUR=$((CUR+1))
  printf '\r    Processing file %s/%s... ' "$CUR" "$IN_COUNT"

  # Append "tuple fname" pairs; 'ls -rS' below yields files smallest-first.
  sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"

done < <(ls -rS "$IN_DIR")

echo

##############################
# STEP 4: LOADING CANDIDATES #
##############################

# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.

echo "[*] Sorting candidate list (be patient)..."

sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
  sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"

if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!"
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1
fi

# The sed command converted the sorted list to a shell script that populates
# BEST_FILE[tuple]="fname". Let's load that!

. "$TRACE_DIR/.candidate_script"

##########################
# STEP 5: WRITING OUTPUT #
##########################

# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.

echo "[*] Processing candidates and writing output files..."

CUR=0

touch "$TRACE_DIR/.already_have"

while read -r cnt tuple; do

  CUR=$((CUR+1))
  printf '\r    Processing tuple %s/%s... ' "$CUR" "$TUPLE_COUNT"

  # If we already have this tuple, skip it.

  grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue

  # Bare 'tuple' in the subscript is evaluated arithmetically by bash,
  # which is what the generated .candidate_script relies on.
  FN=${BEST_FILE[tuple]}

  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"

  # Deduplicate the "already have" list only every fifth round; a plain
  # append is much cheaper and duplicates are harmless in between.
  if [ "$((CUR % 5))" = "0" ]; then
    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi

done <"$TRACE_DIR/.all_uniq"

echo

# Wrap in $(( )) so BSD wc's space padding can't break the comparison below.
OUT_COUNT=$(( $(ls -- "$OUT_DIR" | wc -l) ))

if [ "$OUT_COUNT" = "1" ]; then
  echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi

echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo

test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"

exit 0
OLD | NEW |