| Index: src/trusted/validator_ragel/compress_regular_instructions.py
|
| ===================================================================
|
| --- src/trusted/validator_ragel/compress_regular_instructions.py (revision 0)
|
| +++ src/trusted/validator_ragel/compress_regular_instructions.py (revision 0)
|
| @@ -0,0 +1,1733 @@
|
| +# Copyright (c) 2013 The Native Client Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style license that can be
|
| +# found in the LICENSE file.
|
| +
|
| +"""
|
| +Traverse the validator's DFA, collect all "normal" instructions and then
|
| +compress output. Note: "anybyte fields" (immediates and displacements)
|
| +are always filled with zeros. Otherwise processing of sextillions (sic!)
|
| +of possibilities will take too long.
|
| +
|
| +Each rule is applied only when all variants are accepted by validator.
|
| +The following compression rules are present:
|
| +
|
| +1. Compress ModR/M (+SIB & displacement).
|
| + Instruction: 00 00 add %al,(%rax)
|
| + ...
|
| + Instruction: 00 ff add %bh,%bh
|
| + becomes
|
| + Instruction: 00 XX add [%al..%bh],[%al..%bh or memory]
|
| +
|
| +1a. Compress ModR/M (+SIB & displacement) memory-only.
|
| + Instruction: f0 01 00 lock add %eax,(%eax)
|
| + ...
|
| + Instruction: f0 01 bf 00 00 00 00 lock add %edi,0x0(%edi)
|
| + becomes
|
| + Instruction: f0 01 XX lock add [%eax..edi],[memory]
|
| +
|
| +1b. Compress ModR/M register only.
|
| + Instruction: 66 0f 50 c0 movmskpd %xmm0,%eax
|
| + ...
|
| + Instruction: 66 0f 50 ff movmskpd %xmm7,%edi
|
| + becomes
|
| + Instruction: 66 0f 50 XX movmskpd [%xmm0..%xmm7],[%eax..edi]
|
| +
|
| +2. Compress ModR/M (+SIB & displacement) with opcode extension.
|
| + Instruction: 0f 90 00 seto (%eax)
|
| + ...
|
| + Instruction: 0f 90 c7 seto %bh
|
| + becomes
|
| + Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
|
| +
|
| +2a. Compress ModR/M (+SIB & displacement) memory-only with opcode extension.
|
| + Instruction: f0 ff 00 lock incl (%eax)
|
| + ...
|
| + Instruction: f0 ff 84 ff 00 00 00 00 lock incl 0x0(%edi,%edi,8)
|
| + becomes
|
| + Instruction: f0 ff XX/1 lock decl [memory]
|
| +
|
| +2b. Compress ModR/M register-only with opcode extension.
|
| + Instruction: 0f 71 d0 00 psrlw $0x0,%mm0
|
| + ...
|
| + Instruction: 0f 71 d7 00 psrlw $0x0,%mm7
|
| + becomes
|
| + Instruction: 66 0f 71 XX/2 00 psrlw $0x0,[%mm0..%mm7]
|
| +
|
| +3. Compress register-in-opcode.
|
| + Instruction: d9 c0 fld %st(0)
|
| + ...
|
| + Instruction: d9 c7 fld %st(7)
|
| + becomes
|
| + Instruction: Instruction: d9 c[0..7] fld [%st(0)..%st(7)]
|
| +
|
| + Only applies if all possible register accesses are accepted by validator.
|
| +
|
| +4. Special compressor for "set" instruction.
|
| + Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
|
| + ...
|
| + Instruction: 0f 90 XX/7 seto [%al..%bh or memory]
|
| + becomes
|
| + Instruction: 0f 90 XX seto [%al..%bh or memory]
|
| +"""
|
| +
|
| +import itertools
|
| +import multiprocessing
|
| +import optparse
|
| +import os
|
| +import re
|
| +import subprocess
|
| +import sys
|
| +import tempfile
|
| +import traceback
|
| +
|
| +import dfa_parser
|
| +import dfa_traversal
|
| +import validator
|
| +
|
| +
|
| +# Register names in 'natual' order (as defined by IA32/x86-64 ABI)
|
| +#
|
| +# X86-64 ABI splits all registers in groups of 8 because it uses 3-bit field
|
| +# in opcode, ModR/M, and/or SIB bytes to encode them.
|
| +#
|
| +# In most cases there are 16 registers of a given kind and two such groups,
|
| +# but there are couple of exceptions:
|
| +# 1. There are 20 8-bit registers and three groups (two of them overlap)
|
| +# 2. There are eight X87 and MMX registers thus two groups are identical
|
| +#
|
| +# We use typical register from a group to name the whole group. Most groups
|
| +# are named by the first register but 'spl' group is named by the fifth register
|
| +# because it's first four registers are the same as 'al' group.
|
| +# We use mnemonic name 'mmalt' to represent the "evil mirror" of the 'mm0'
|
| +# group: it the same as 'mm0', but implies use of the appropriate REX bit.
|
| +REGISTERS = {
|
| + 'al': [ 'al', 'cl', 'dl', 'bl', 'ah', 'ch', 'dh', 'bh' ],
|
| + 'spl': [ 'al', 'cl', 'dl', 'bl', 'spl', 'bpl', 'sil', 'dil' ],
|
| + 'ax': [ 'ax', 'cx', 'dx', 'bx', 'sp', 'bp', 'si', 'di' ],
|
| + 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'esp', 'ebp', 'esi', 'edi' ],
|
| + 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'rsp', 'rbp', 'rsi', 'rdi' ],
|
| + 'r8b': [ 'r{}b'.format(N) for N in range(8,16) ],
|
| + 'r8w': [ 'r{}w'.format(N) for N in range(8,16) ],
|
| + 'r8d': [ 'r{}d'.format(N) for N in range(8,16) ],
|
| + 'r8': [ 'r{}'.format(N) for N in range(8,16) ],
|
| + 'mm0': [ 'mm{}'.format(N) for N in range(8) ],
|
| + 'mmalt': [ 'mm{}'.format(N) for N in range(8) ],
|
| + 'st(0)': [ 'st({})'.format(N) for N in range(8) ],
|
| + 'xmm0': [ 'xmm{}'.format(N) for N in range(8) ],
|
| + 'xmm8': [ 'xmm{}'.format(N) for N in range(8,16) ],
|
| + 'ymm0': [ 'ymm{}'.format(N) for N in range(8) ],
|
| + 'ymm8': [ 'ymm{}'.format(N) for N in range(8,16) ]
|
| +}
|
| +
|
| +
|
| +NOP = 0x90
|
| +
|
| +
|
| +def PadToBundleSize(bytes):
|
| + assert len(bytes) <= validator.BUNDLE_SIZE
|
| + return bytes + [NOP] * (validator.BUNDLE_SIZE - len(bytes))
|
| +
|
| +
|
| +# In x86-64 mode we have so-called 'restricted register' which is used to
|
| +# tie two groups together. Some instructions require particular value to
|
| +# be stored in this variable, while some accept any non-special restricted
|
| +# register (%ebp and %esp are special because they can only be accepted by
|
| +# a few 'special' instructions).
|
| +#
|
| +# You can find more details in the "NaCl SFI model on x86-64 systems" manual.
|
| +#
|
| +# We try to feed all possible 'restricted registers' into validator and then
|
| +# classify the instruction using this map. If set of acceptable 'restricted
|
| +# registers' is not here, then it's an error in validator.
|
| +ACCEPTABLE_X86_64_INPUTS = {
|
| + 0x00001: 'input_rr=%eax',
|
| + 0x00002: 'input_rr=%ecx',
|
| + 0x00004: 'input_rr=%edx',
|
| + 0x00008: 'input_rr=%ebx',
|
| + 0x00010: 'input_rr=%esp',
|
| + 0x00020: 'input_rr=%ebp',
|
| + 0x00040: 'input_rr=%esi',
|
| + 0x00080: 'input_rr=%edi',
|
| + 0x00100: 'input_rr=%r8d',
|
| + 0x00200: 'input_rr=%r9d',
|
| + 0x00400: 'input_rr=%r10d',
|
| + 0x00800: 'input_rr=%r11d',
|
| + 0x01000: 'input_rr=%r12d',
|
| + 0x02000: 'input_rr=%r13d',
|
| + 0x04000: 'input_rr=%r14d',
|
| + 0x08000: 'input_rr=%r15d',
|
| + 0x1ffcf: 'input_rr=any_nonspecial'
|
| +}
|
| +
|
| +# Any instruction must produce either None or one of fifteen registers as an
|
| +# output 'restricted register' value. 'r15d' is NOT acceptable as an output.
|
| +ACCEPTABLE_X86_64_OUTPUT_REGISTERS = tuple(
|
| + '%' + reg for reg in (REGISTERS['eax'] + REGISTERS['r8d'])[0:-1])
|
| +
|
| +
|
| +def ValidateInstruction(instruction, validator_inst):
|
| + bundle = ''.join(map(chr, PadToBundleSize(instruction)))
|
| + if options.bitness == 32:
|
| + result = validator_inst.ValidateChunk(bundle, bitness=32)
|
| + return result, []
|
| + else:
|
| + valid_inputs = 0
|
| + known_final_rr = None
|
| + output_rr = None
|
| + # Note that iteration order is aligned with ACCEPTABLE_X86_64_INPUTS array
|
| + # above.
|
| + for bit, initial_rr in enumerate(validator.ALL_REGISTERS + [None]):
|
| + valid, final_rr = validator_inst.ValidateAndGetFinalRestrictedRegister(
|
| + bundle, len(instruction), initial_rr)
|
| + if valid:
|
| + # final_rr should not depend on input_rr
|
| + assert valid_inputs == 0 or known_final_rr == final_rr
|
| + valid_inputs |= 1 << bit
|
| + known_final_rr = final_rr
|
| + # If nothing is accepted then instruction is not valid. Easy and simple.
|
| + if valid_inputs == 0: return False, []
|
| + # If returned value in unacceptable we'll get IndexError here and this
|
| + # test will fail
|
| + if known_final_rr is not None:
|
| + output_rr = ACCEPTABLE_X86_64_OUTPUT_REGISTERS[known_final_rr]
|
| + # If collected valid_inputs are unacceptable we'll get KeyError here and
|
| + # this test will fail
|
| + return True, [ACCEPTABLE_X86_64_INPUTS[valid_inputs],
|
| + 'output_rr={}'.format(output_rr)]
|
| +
|
| +
|
| +class WorkerState(object):
|
| + def __init__(self, prefix, validator):
|
| + self.total_instructions = 0
|
| + self.num_valid = 0
|
| + self.validator = validator
|
| + self.output = set()
|
| + self.trace = []
|
| +
|
| +
|
| + def ReceiveInstruction(self, bytes):
|
| + self.total_instructions += 1
|
| + result, notes = ValidateInstruction(bytes, self.validator)
|
| + if result:
|
| + self.num_valid += 1
|
| + dis = self.validator.DisassembleChunk(
|
| + ''.join(map(chr, bytes)),
|
| + bitness=options.bitness)
|
| + for line_nr in xrange(len(dis)):
|
| + dis[line_nr] = str(dis[line_nr])
|
| + assert dis[line_nr][0:17] == 'Instruction(0x' + str(line_nr) + ': '
|
| + assert dis[line_nr][-1:] == ')'
|
| + dis[line_nr] = dis[line_nr][17:-1]
|
| + # If %rip is involved then comment will be different depending on the
|
| + # instruction length. Eliminate it.
|
| + if '(%rip)' in dis[0]:
|
| + dis[0] = re.sub(' # 0x[ ]*[0-9a-fA-F]*', '', dis[0])
|
| + # Zero displacements are represented as 0x0 for all instructions except
|
| + # jumps where they disassembled as non-zero due to %eip/%rip-relative
|
| + # addressing. We replace this displacement with %eip/%rip to simplify
|
| + # compression.
|
| + if ' 0x' in dis[0] and ' 0x0' not in dis[0]:
|
| + for bytes in xrange(1, 16):
|
| + dis[0] = re.sub(
|
| + '(' + '(?:[0-9a-fA-F][0-9a-fA-F] ){' + str(bytes) + '} .* )' +
|
| + hex(bytes) + '(.*)',
|
| + r'\1%eip\2' if options.bitness == 32 else r'\1%rip\2',
|
| + dis[0]);
|
| + dis[0] = 'Instruction: ' + dis[0]
|
| + dis += notes
|
| + self.output.add('; '.join(dis))
|
| +
|
| +
|
| + def RecordTrace(self, compressor_nr, instruction):
|
| + self.trace.append((compressor_nr, instruction))
|
| +
|
| +
|
| +# Compressor has three slots: regex (which picks apart given instruction),
|
| +# subst (which is used to create the compressed version) and replacements (which
|
| +# are used to generate set of instructions from a given code).
|
| +#
|
| +# Example compressor:
|
| +# regex = r'.*?[0-9a-fA-F]([0-7]) \w* (%e(?:[abcd]x|[sb]p|[sd]i)).*()'
|
| +# subst = ('[0-7]', '[%eax..%edi]', ' # register in opcode')
|
| +# replacements = ((0, '%eax', ''), (1, '%ecx', ''),
|
| +# (2, '%edx', ''), (3, '%ebx', '')
|
| +# (4, '%esp', ''), (5, '%ebp', ''),
|
| +# (6, '%esi', ''), (7, '%edi', ''))
|
| +#
|
| +# When faced with instruction '40 inc %eax' it will capture the following
|
| +# pieces of said instruction: '4{0} inc {%eax}{}'.
|
| +#
|
| +# Then it will replace groups to produce the following 8 instructions:
|
| +# '40 inc %eax'
|
| +# '41 inc %ecx'
|
| +# '42 inc %edx'
|
| +# '43 inc %ebx'
|
| +# '44 inc %esp'
|
| +# '45 inc %ebp'
|
| +# '46 inc %esi'
|
| +# '47 inc %edi'
|
| +#
|
| +# If all these instructions can be found in a set of instructions then
|
| +# compressor will remove them from said set and will substitute them with
|
| +# one "compressed instruction" '4[0-7] inc [%eax..%edi] # register in opcode'.
|
| +class Compressor(object):
|
| + __slots__ = [
|
| + 'regex',
|
| + 'subst',
|
| + 'replacements'
|
| + ]
|
| +
|
| + def __init__(self, regex, subst, replacements):
|
| + self.regex = re.compile(regex)
|
| + self.subst = subst
|
| + self.replacements = replacements
|
| +
|
| +
|
| +def CompressionTemplate(instruction, match, mark):
|
| + """ Replace all match groups with the mark. """
|
| + pos = 0
|
| + format_str = ''
|
| + for group in range(0, len(match.groups())):
|
| + format_str += instruction[pos:match.start(group + 1)] + mark
|
| + pos = match.end(group + 1)
|
| + return format_str + instruction[pos:]
|
| +
|
| +
|
| +def CompressOneMatch(instructions, instruction, match, compressor):
|
| + format_str = CompressionTemplate(instruction, match, '{}')
|
| + subset = set()
|
| + for replacement in compressor.replacements:
|
| + replacement_str = format_str.format(*replacement)
|
| + if not replacement_str in instructions:
|
| + return (False, instructions)
|
| + subset.add(replacement_str)
|
| + assert instruction in subset
|
| + instructions -= subset
|
| + instructions.add(format_str.format(*compressor.subst))
|
| + return (True, instructions)
|
| +
|
| +
|
| +def CompressOneInstruction(instructions, compressors, split, cache):
|
| + sorted_instructions = (sorted(i for i in instructions if i > split) +
|
| + sorted(i for i in instructions if i < split))
|
| + for instruction in sorted_instructions:
|
| + if instruction in cache:
|
| + compressors_list = cache[instruction]
|
| + for compressor_nr, match, compressor in compressors_list:
|
| + result, instructions = CompressOneMatch(
|
| + instructions, instruction, match, compressor)
|
| + if result:
|
| + return (instructions, compressor_nr, instruction)
|
| + else:
|
| + compressors_list = []
|
| + for compressor_nr, compressor in enumerate(compressors):
|
| + match = compressor.regex.match(instruction)
|
| + if match:
|
| + compressors_list.append((compressor_nr, match, compressor))
|
| + result, instructions = CompressOneMatch(
|
| + instructions, instruction, match, compressor)
|
| + if result:
|
| + return (instructions, compressor_nr, instruction)
|
| + cache[instruction] = compressors_list
|
| + return (instructions, False, False)
|
| +
|
| +
|
| +def Compressed(instructions, compressors, show_progress):
|
| + split = ''
|
| + cache = {}
|
| + while True:
|
| + instructions, rule, split = CompressOneInstruction(
|
| + instructions, compressors, split, cache)
|
| + if rule is False: break
|
| + show_progress(rule, split)
|
| + return instructions
|
| +
|
| +
|
| +def Worker((prefix, state_index)):
|
| + worker_state = WorkerState(prefix, worker_validator)
|
| +
|
| + try:
|
| + dfa_traversal.TraverseTree(
|
| + dfa.states[state_index],
|
| + final_callback=worker_state.ReceiveInstruction,
|
| + prefix=prefix,
|
| + anyfield=0)
|
| + if (prefix[0] != 0x0f or prefix[1] != 0x0f): # Skip 3DNow! instructions
|
| + worker_state.output = Compressed(set(worker_state.output),
|
| + compressors,
|
| + worker_state.RecordTrace)
|
| + except Exception as e:
|
| + traceback.print_exc() # because multiprocessing imap swallows traceback
|
| + raise
|
| +
|
| + return (
|
| + prefix,
|
| + worker_state.total_instructions,
|
| + worker_state.num_valid,
|
| + worker_state.output,
|
| + worker_state.trace)
|
| +
|
| +
|
| +def ParseOptions():
|
| + parser = optparse.OptionParser(usage='%prog [options] xmlfile')
|
| +
|
| + parser.add_option('--bitness',
|
| + choices=['32', '64'],
|
| + help='The subarchitecture: 32 or 64')
|
| + parser.add_option('--validator_dll',
|
| + help='Path to librdfa_validator_dll')
|
| + parser.add_option('--decoder_dll',
|
| + help='Path to librdfa_decoder_dll')
|
| +
|
| + options, args = parser.parse_args()
|
| + options.bitness = int(options.bitness)
|
| +
|
| + if len(args) != 1:
|
| + parser.error('specify one xml file')
|
| +
|
| + (xml_file, ) = args
|
| +
|
| + return options, xml_file
|
| +
|
| +
|
| +# Version suitable for use in regular expressions
|
| +REGISTERS_RE = REGISTERS.copy()
|
| +REGISTERS_RE['st(0)'] = [ r'st\({}\)'.format(N) for N in range(8) ]
|
| +
|
| +# Index names in 'natual' order (as defined by IA32/x86-64 ABI)
|
| +INDEXES = {
|
| + 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'eiz', 'ebp', 'esi', 'edi' ],
|
| + 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'riz', 'rbp', 'rsi', 'rdi' ],
|
| + 'r8': [ 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15' ]
|
| +}
|
| +# Register which can be used as base in 64-bit mode in all their incarnations.
|
| +X86_64_BASE_REGISTERS = set([
|
| + '%spl', '%bpl', '%r15b',
|
| + '%sp', '%bp', '%r15w',
|
| + '%esp', '%ebp', '%r15d',
|
| + '%rsp', '%rbp', '%r15',
|
| + '%rip'
|
| +])
|
| +
|
| +
|
| +def InstructionIsDangerous(input, output, register_write,
|
| + writes_to, memory_accessed=False,
|
| + base_text='%riz', index_text='%riz'):
|
| + """ Check if instruction with given replacements will be dangerous
|
| +
|
| + Args:
|
| + input: input argument
|
| + output: output argument
|
| + register_write: three-state selector
|
| + 'sandbox' - instruction can be used to produce "restricted register"
|
| + 'protect' - instruction can damage output, protect "special registers"
|
| + 'ignore' - instruction does not affect it's operands (e.g. test) or
|
| + is used with non-GP registers (X87, MMX, XMM, etc)
|
| + memory_accessed: True if instruction accesses memory
|
| + base: base register (if memory is accessed)
|
| + index: index register (if memory is accessed)
|
| +
|
| + Returns:
|
| + True if instruction should be rejected by validator
|
| + """
|
| + if memory_accessed:
|
| + if base_text not in X86_64_BASE_REGISTERS:
|
| + return True
|
| + # Surprisingly enough %r15 is considered "valid index register" by our
|
| + # validator: it can not be ever produced as "restricted register" which
|
| + # means that such instruction can not ever occur in real program (it's
|
| + # guaranteed because ACCEPTABLE_X86_64_OUTPUT_REGISTERS excludes it), but
|
| + # when instruction is tested in isolation it's allowed.
|
| + if index_text in X86_64_BASE_REGISTERS - set(['%r15']):
|
| + return True
|
| + if register_write == 'protect' and output in X86_64_BASE_REGISTERS:
|
| + return True
|
| + if register_write == 'sandbox' and output == '%r15d':
|
| + return True
|
| + if writes_to == 'both' and input in X86_64_BASE_REGISTERS:
|
| + return True
|
| + return False
|
| +
|
| +
|
| +def AppendOperandsReplacement(replacement, rm_text, reg, modrm, writes_to):
|
| + """ Appends replacement text to replacement list
|
| +
|
| + Args:
|
| + replacement: replacement list
|
| + rm_text: replacement for rm field
|
| + reg: register kind (or None if reg field is used as opcode extension)
|
| + modrm: modrm byte
|
| + writes_to: three-state selector
|
| + 'reg' - instruction uses rm as source, reg as destination
|
| + 'rm' - instruction uses reg as source, rm as destination
|
| + 'both' - instruction writes to both reg and rm
|
| +
|
| + Returns:
|
| + input: textual representation of input argument
|
| + output: textual representation of output argument
|
| +
|
| + Side-effect:
|
| + output (if reg is None) or (input, output) tuple (if reg is not None)
|
| + are added to replacement list.
|
| + """
|
| + if reg is None:
|
| + assert writes_to == 'rm'
|
| + input, output = None, rm_text
|
| + replacement.append(output)
|
| + else:
|
| + reg_field = (modrm >> 3) & 0x07
|
| + reg_text = '%' + REGISTERS[reg][reg_field]
|
| + if writes_to == 'reg':
|
| + input, output = rm_text, reg_text
|
| + else: # rm, both
|
| + input, output = reg_text, rm_text
|
| + replacement.extend([input, output])
|
| + return input, output
|
| +
|
| +
|
| +def ModRMRegisterReplacements(rm, reg=None, writes_to='rm', opcode_bits=0,
|
| + register_write='ignore', note=''):
|
| + """Creates replacement tuples list for register-to-register instructions
|
| +
|
| + Args:
|
| + rm: rm operand kind (see REGISTERS array)
|
| + reg: reg operand kind (see REGISTERS array) or None if reg is not used
|
| + writes_to: three-state selector
|
| + 'reg' - instruction uses rm as source, reg as destination
|
| + 'rm' - instruction uses reg as source, rm as destination
|
| + 'both' - instruction writes to both reg and rm
|
| + opcode_bits: opcode extensions code (used when reg is None)
|
| + register_write: three-state selector
|
| + 'sandbox' - instruction can be used to produce "restricted register"
|
| + 'protect' - instruction can damage output, protect "special registers"
|
| + 'ignore' - instruction does not affect it's operands (e.g. test) or
|
| + is used with non-GP registers (X87, MMX, XMM, etc)
|
| + note: note to include at the end of input instruction
|
| + Returns:
|
| + List of replacement tuples
|
| + """
|
| + # Reg field can be used either as reg or as opcode extension, but not both
|
| + assert reg is None or opcode_bits == 0
|
| +
|
| + output_key = (options.bitness, reg, rm, writes_to, opcode_bits,
|
| + register_write)
|
| + if output_key in ModRMRegisterReplacements.replacements:
|
| + return ModRMRegisterReplacements.replacements[output_key]
|
| +
|
| + replacements = []
|
| +
|
| + # Two upper bits of ModR/M byte (mod field) must be equal to 11
|
| + # This gives us range from 0xc0 to 0xff but we are going from the
|
| + # end to make rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
|
| + if reg is None:
|
| + # reg field is used as opcode extension
|
| + byte_range = [byte
|
| + for byte in range(0xff, 0xbf, -1)
|
| + if (byte >> 3) & 0x7 == opcode_bits]
|
| + else:
|
| + byte_range = range(0xff, 0xbf, -1)
|
| +
|
| + for modrm in byte_range:
|
| + rm_field = (modrm & 0x07)
|
| + rm_text = '%' + REGISTERS[rm][rm_field]
|
| + byte_text = '{:02x}'.format(modrm)
|
| + replacement = [byte_text]
|
| + input, output = AppendOperandsReplacement(
|
| + replacement, rm_text, reg, modrm, writes_to)
|
| + if options.bitness == 64:
|
| + replacement.append('any_nonspecial') # input_rr
|
| + replacement.append(output if register_write == 'sandbox' else None)
|
| + if InstructionIsDangerous(input, output, register_write, writes_to):
|
| + continue
|
| + replacement.append(note)
|
| + replacements.append(tuple(replacement))
|
| + ModRMRegisterReplacements.replacements[output_key] = tuple(replacements)
|
| + return ModRMRegisterReplacements.replacements[output_key]
|
| +ModRMRegisterReplacements.replacements = {}
|
| +
|
| +
|
| +def BaseOnlyMemoryOperand(modrm, base):
|
| + """Creates replacement tuples list for register-to-memory instructions
|
| + (base only, no SIB)
|
| +
|
| + Args:
|
| + modrm: modrm byte
|
| + base: register kind for base
|
| + Returns:
|
| + bytes_text: replacement for "bytes" group
|
| + rm_text: textual representation of "rm" argument
|
| + base_text: textual representation of "base" register
|
| + """
|
| + mod_field = (modrm >> 6) & 0x03
|
| + rm_field = (modrm & 0x07)
|
| + base_text = '%' + REGISTERS[base][rm_field]
|
| + # If RM field == %rbp and MOD field is zero then it's absolute address
|
| + # in 32-bit mode and %rip-based address in 64-bit mode
|
| + if mod_field == 0 and rm_field == validator.REG_RBP:
|
| + bytes_text = '{:02x} 00 00 00 00'.format(modrm)
|
| + rm_text = '0x0' if options.bitness == 32 else '0x0(%rip)'
|
| + base_text = '%eiz' if options.bitness == 32 else '%rip'
|
| + # Memory access with just a base register
|
| + elif mod_field == 0:
|
| + bytes_text = '{:02x}'.format(modrm)
|
| + rm_text = '({})'.format(base_text)
|
| + # Memory access with base and 8bit offset
|
| + elif mod_field == 1:
|
| + bytes_text = '{:02x} 00'.format(modrm)
|
| + rm_text = '0x0({})'.format(base_text)
|
| + # Memory access with base and 32bit offset
|
| + else: # mod_field == 2
|
| + bytes_text = '{:02x} 00 00 00 00'.format(modrm)
|
| + rm_text = '0x0({})'.format(base_text)
|
| + return bytes_text, rm_text, base_text
|
| +
|
| +
|
| +def SIBMemoryOperand(modrm, sib, base, index):
|
| + """Creates replacement tuples list for register-to-memory instructions
|
| + (base only, no SIB)
|
| +
|
| + Args:
|
| + modrm: modrm byte
|
| + base: register kind for base
|
| + Returns:
|
| + bytes_text: replacement for "bytes" group
|
| + rm_text: textual representation of "rm" argument
|
| + base_text: textual representation of "base" register
|
| + index_text: textual representation of "index" register
|
| + """
|
| + mod_field = (modrm >> 6) & 0x03
|
| + scale_field = (sib >> 6) & 0x03
|
| + index_field = (sib >> 3) & 0x07
|
| + base_field = (sib & 0x07)
|
| + index_text = '%' + INDEXES[index][index_field]
|
| + base_text = '%' + REGISTERS[base][base_field]
|
| + scale_text = str(1 << scale_field)
|
| + # If BASE is %rbp and MOD == 0 then index with 32bit offset is used
|
| + if mod_field == 0 and base_field == validator.REG_RBP:
|
| + bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
|
| + if (options.bitness == 64 and
|
| + index_text == '%riz' and
|
| + scale_text == '1'):
|
| + rm_text = '0x0'
|
| + else:
|
| + rm_text = '0x0(,{},{})'.format(index_text, scale_text)
|
| + # There are no base in this case
|
| + base_text = '%eiz' if options.bitness == 32 else '%riz'
|
| + # Memory access with base and index (no offset)
|
| + elif mod_field == 0:
|
| + bytes_text = '{:02x} {:02x}'.format(modrm, sib)
|
| + rm_text = '({},{},{})'.format(base_text, index_text, scale_text)
|
| + # Memory access with base, index and 8bit offset
|
| + elif mod_field == 1:
|
| + bytes_text = '{:02x} {:02x} 00'.format(modrm, sib)
|
| + rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
|
| + # Memory access with base, index and 32bit offset
|
| + elif mod_field == 2:
|
| + bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
|
| + rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
|
| + # Pretty-printing of access via %rsp (or %r12)
|
| + if (base_field == validator.REG_RSP and
|
| + index_text in ('%eiz', '%riz') and
|
| + scale_text == '1'):
|
| + if mod_field == 0: # no offset
|
| + rm_text = '({})'.format(base_text)
|
| + else: # 8-bit or 32-bit offset
|
| + rm_text = '0x0({})'.format(base_text)
|
| + return bytes_text, rm_text, base_text, index_text
|
| +
|
| +
|
| +def ModRMMemoryReplacements(reg=None, writes_to='rm', opcode_bits=0,
|
| + memory_accessed=True, register_write='ignore',
|
| + base_r8=False, index_r8=False, note=''):
|
| + """Creates replacement tuples list for register-to-memory instructions
|
| +
|
| + Args:
|
| + reg: reg operand kind (see REGISTERS array) or None if reg is not used
|
| + writes_to: three-state selector
|
| + 'reg' - instruction uses rm as source, reg as destination
|
| + 'rm' - instruction uses reg as source, rm as destination
|
| + 'both' - instruction writes to both reg and rm
|
| + opcode_bits: opcode extensions code (used when reg is None)
|
| + memory_accessed: True if instruction accesses memory
|
| + register_write: three-state selector
|
| + 'sandbox' - instruction can be used to produce "restricted register"
|
| + 'protect' - instruction can damage output, protect "special registers"
|
| + 'ignore' - instruction does not affect it's operands (e.g. test) or
|
| + is used with non-GP registers (X87, MMX, XMM, etc)
|
| + index_r8: True if REX.X bit in the instruction set to 1
|
| + note: note to include at the end of input instruction
|
| +
|
| + Returns:
|
| + List of replacement tuples
|
| + """
|
| + # Reg field can be used either as reg or as opcode extension, but not both
|
| + assert reg is None or opcode_bits == 0
|
| +
|
| + output_key = (options.bitness, reg, writes_to, opcode_bits,
|
| + base_r8, index_r8, memory_accessed, register_write)
|
| + if output_key in ModRMMemoryReplacements.replacements:
|
| + return ModRMMemoryReplacements.replacements[output_key]
|
| +
|
| + if options.bitness == 32:
|
| + base = 'eax'
|
| + index = 'eax'
|
| + else:
|
| + base = 'r8' if base_r8 else 'rax'
|
| + index = 'r8' if index_r8 else 'rax'
|
| +
|
| + replacements = []
|
| +
|
| + # Two upper bits of ModR/M byte (mod field) must be equal to 00, 01, or 10
|
| + # This gives us range from 0x00 to 0xbf but we are going from the end to make
|
| + # rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
|
| + if reg is None:
|
| + # reg field is used as opcode extension
|
| + byte_range = [byte
|
| + for byte in range(0xbf, -1, -1)
|
| + if (byte >> 3) & 0x7 == opcode_bits]
|
| + else:
|
| + byte_range = range(0xbf, -1, -1)
|
| +
|
| + for modrm in byte_range:
|
| + # If RM field != %rsp then there are no SIB byte
|
| + if (modrm & 0x07) != validator.REG_RSP:
|
| + bytes_text, rm_text, base_text = BaseOnlyMemoryOperand(modrm, base)
|
| + replacement = [bytes_text]
|
| + input, output = AppendOperandsReplacement(
|
| + replacement, rm_text, reg, modrm, writes_to)
|
| + if options.bitness == 64:
|
| + replacement.append('any_nonspecial')
|
| + # If writes_to is equal to 'reg' then output is register
|
| + if writes_to == 'reg' and register_write == 'sandbox':
|
| + replacement.append(output)
|
| + else:
|
| + # Note that instruction like xchg could write to another register:
|
| + # it's "input"! But validator currently does not support this case
|
| + # thus we are ignoring it here and writing "None" in this case, too.
|
| + replacement.append(None)
|
| + if InstructionIsDangerous(input, output, register_write, writes_to,
|
| + memory_accessed, base_text):
|
| + continue
|
| + replacement.append(note)
|
| + replacements.append(tuple(replacement))
|
| + else:
|
| + # If RM field == %rsp then we have SIB byte
|
| + for sib in xrange(0x100):
|
| + bytes_text, rm_text, base_text, index_text = SIBMemoryOperand(
|
| + modrm, sib, base, index)
|
| + replacement = [bytes_text]
|
| + input, output = AppendOperandsReplacement(
|
| + replacement, rm_text, reg, modrm, writes_to)
|
| + if options.bitness == 64:
|
| + if not memory_accessed or index_text == '%riz':
|
| + replacement.append('any_nonspecial')
|
| + else:
|
| + if index_r8:
|
| + # Convert %r8 to %r8d, %r9 to %r9d, etc
|
| + replacement.append(index_text + 'd')
|
| + else:
|
| + # Convert %rax to %eax, %rsp to %esp, etc
|
| + replacement.append('%e' + index_text[2:])
|
| + # If writes_to is equal to 'reg' then output is register
|
| + if writes_to == 'reg' and register_write == 'sandbox':
|
| + replacement.append(output)
|
| + else:
|
| + # Note that instruction like xchg could write to another register:
|
| + # it's "input"! But validator currently does not support this case
|
| + # thus we are ignoring it here and writing "None" in this case, too.
|
| + replacement.append(None)
|
| + if InstructionIsDangerous(input, output, register_write, writes_to,
|
| + memory_accessed, base_text, index_text):
|
| + continue
|
| + replacement.append(note)
|
| + replacements.append(tuple(replacement))
|
| + ModRMMemoryReplacements.replacements[output_key] = tuple(replacements)
|
| + return ModRMMemoryReplacements.replacements[output_key]
|
| +ModRMMemoryReplacements.replacements = {}
|
| +
|
| +
|
| +# Map from "REX bit off" group of registers to "REX bit on" group of registers
|
| +r8 = {
|
| + 'al': 'r8b',
|
| + 'ax': 'r8w',
|
| + 'eax': 'r8d',
|
| + 'rax': 'r8',
|
| + 'mm0': 'mmalt',
|
| + 'xmm0': 'xmm8',
|
| + 'ymm0': 'ymm8'
|
| +}
|
| +
|
| +
|
| +def RegisterKinds():
|
| + """ Return list of register kinds we process with register compressors """
|
| +
|
| + if options.bitness == 32:
|
| + return ('al', 'ax', 'eax', 'mm0', 'xmm0', 'ymm0')
|
| + else:
|
| + return ('al', 'spl', 'ax', 'eax', 'rax', 'mm0', 'xmm0', 'ymm0',
|
| + 'r8b', 'r8w', 'r8d', 'r8', 'mmalt', 'xmm8', 'ymm8')
|
| +
|
| +
|
| +def RegisterKindPairs():
|
| + """ Return hand-picked pairs which we must consider in compressors """
|
| +
|
| + if options.bitness == 32:
|
| + return (
|
| + ( 'al', 'al'),
|
| + ( 'ax', 'al'),
|
| + ( 'al', 'ax'),
|
| + ( 'ax', 'ax'),
|
| + ( 'eax', 'al'),
|
| + ( 'al', 'eax'),
|
| + ( 'eax', 'ax'),
|
| + ( 'ax', 'eax'),
|
| + ( 'eax', 'eax'),
|
| + ( 'eax', 'mm0'),
|
| + ( 'mm0', 'eax'),
|
| + ( 'eax', 'xmm0'),
|
| + ('xmm0', 'eax'),
|
| + ( 'mm0', 'mm0'),
|
| + ( 'mm0', 'xmm0'),
|
| + ('xmm0', 'mm0'),
|
| + ('xmm0', 'xmm0'),
|
| + ('xmm0', 'ymm0'),
|
| + ('ymm0', 'xmm0'),
|
| + ('ymm0', 'ymm0')
|
| + )
|
| + else:
|
| + return (
|
| + ( 'al', 'al'),
|
| + ( 'spl', 'spl'), ( 'spl', 'r8b'), ( 'r8b', 'spl'), ( 'r8b', 'r8b'),
|
| + ( 'ax', 'al'),
|
| + ( 'ax', 'spl'), ( 'ax', 'r8b'), ( 'r8w', 'spl'), ( 'r8w', 'r8b'),
|
| + ( 'al', 'ax'),
|
| + ( 'spl', 'ax'), ( 'spl', 'r8w'), ( 'r8b', 'ax'), ( 'r8b', 'r8w'),
|
| + ( 'ax', 'ax'), ( 'ax', 'r8w'), ( 'r8w', 'ax'), ( 'r8w', 'r8w'),
|
| + ( 'eax', 'al'),
|
| + ( 'eax', 'spl'), ( 'eax', 'r8b'), ( 'r8d', 'spl'), ( 'r8d', 'r8b'),
|
| + ( 'al', 'eax'),
|
| + ( 'spl', 'eax'), ( 'spl', 'r8d'), ( 'r8b', 'eax'), ( 'r8b', 'r8d'),
|
| + ( 'eax', 'ax'), ( 'eax', 'r8w'), ( 'r8d', 'ax'), ( 'r8d', 'r8w'),
|
| + ( 'ax', 'eax'), ( 'ax', 'r8d'), ( 'r8w', 'eax'), ( 'r8w', 'r8d'),
|
| + ( 'eax', 'eax'), ( 'eax', 'r8d'), ( 'r8d', 'eax'), ( 'r8d', 'r8d'),
|
| + ( 'rax', 'al'),
|
| + ( 'rax', 'spl'), ( 'rax', 'r8b'), ( 'r8', 'spl'), ( 'r8', 'r8b'),
|
| + ( 'al', 'rax'),
|
| + ( 'spl', 'rax'), ( 'spl', 'r8'), ( 'r8b', 'rax'), ( 'r8b', 'r8'),
|
| + ( 'rax', 'ax'), ( 'rax', 'r8w'), ( 'r8', 'ax'), ( 'r8', 'r8w'),
|
| + ( 'ax', 'rax'), ( 'ax', 'r8'), ( 'r8w', 'rax'), ( 'r8w', 'r8'),
|
| + ( 'rax', 'eax'), ( 'rax', 'r8d'), ( 'r8', 'eax'), ( 'r8', 'r8d'),
|
| + ( 'eax', 'rax'), ( 'eax', 'r8'), ( 'r8d', 'rax'), ( 'r8d', 'r8'),
|
| + ( 'rax', 'rax'), ( 'rax', 'r8'), ( 'r8', 'rax'), ( 'r8', 'r8'),
|
| + ( 'eax', 'mm0'), ( 'eax','mmalt'), ( 'r8d', 'mm0'), ( 'eax', 'mmalt'),
|
| + ( 'rax', 'mm0'), ( 'rax','mmalt'), ( 'r8', 'mm0'), ( 'r8', 'mmalt'),
|
| + ( 'mm0', 'eax'), ('mmalt', 'eax'), ( 'mm0', 'r8d'), ('mmalt', 'r8d'),
|
| + ( 'mm0', 'rax'), ('mmalt', 'rax'), ( 'mm0', 'r8'), ('mmalt', 'r8'),
|
| + ( 'eax', 'xmm0'), ( 'eax', 'xmm8'), ( 'r8d', 'xmm0'), ( 'r8d', 'xmm8'),
|
| + ( 'rax', 'xmm0'), ( 'rax', 'xmm8'), ( 'r8', 'xmm0'), ( 'r8', 'xmm8'),
|
| + ('xmm0', 'eax'), ('xmm0', 'r8d'), ('xmm8', 'eax'), ('xmm8', 'r8d'),
|
| + ('xmm0', 'rax'), ('xmm0', 'r8'), ('xmm8', 'rax'), ('xmm8', 'r8'),
|
| + ( 'mm0', 'mm0'), ('mmalt', 'mm0'), ( 'mm0','mmalt'), ('mmalt','mmalt'),
|
| + ( 'mm0', 'xmm0'), ('mmalt','xmm0'), ( 'mm0', 'xmm8'), ('mmalt', 'xmm8'),
|
| + ('xmm0', 'mm0'), ('xmm8', 'mm0'), ('xmm0','mmalt'), ('xmm8', 'mmalt'),
|
| + ('xmm0', 'xmm0'), ('xmm0', 'xmm8'), ('xmm8', 'xmm0'), ('xmm8', 'xmm8'),
|
| + ('xmm0', 'ymm0'), ('xmm0', 'ymm8'), ('xmm8', 'ymm0'), ('xmm8', 'ymm8'),
|
| + ('ymm0', 'xmm0'), ('ymm0', 'xmm8'), ('ymm8', 'xmm0'), ('ymm8', 'xmm8'),
|
| + ('ymm0', 'ymm0'), ('ymm0', 'ymm8'), ('ymm8', 'ymm0'), ('ymm8', 'ymm8')
|
| + )
|
| +
|
| +
|
| +def ProtectIndexRegister(registers_interval):
|
| + """ Remove index-forbidden regsiters from the registers_interval """
|
| +
|
| + replacements = {
|
| + '%al..%dil': '%al|%cl|%dl|%bl|%sil|%dil',
|
| + '%ax..%di': '%ax|%cx|%dx|%bx|%si|%di',
|
| + '%eax..%edi': '%eax|%ecx|%edx|%ebx|%esi|%edi',
|
| + '%rax..%rdi': '%rax|%rcx|%rdx|%rbx|%rsi|%rdi',
|
| + '%al..%r15b': '%al..%bl|%ah..%bh|%sil..%r15b',
|
| + '%ax..%r15w': '%ax..%bx|%si..%r15w',
|
| + '%eax..%r15d': '%eax..%ebx|%esi..%r15d',
|
| + '%rax..%r15': '%rax..%rbx|%rsi..%r15'
|
| + }
|
| + return replacements.get(registers_interval, registers_interval)
|
| +
|
| +
|
| +def ProtectBaseRegister(registers_interval):
|
| + """ Remove base-allowed from the registers_interval """
|
| +
|
| + replacements = {
|
| + '%al..%r15b': '%al..%bl|%ah..%bh|%sil..%r14b',
|
| + '%ax..%r15w': '%ax..%bx|%si..%r14w',
|
| + '%eax..%r15d': '%eax..%ebx|%esi..%r14d',
|
| + '%rax..%r15': '%rax..%rbx|%rsi..%r14',
|
| + '%r8b..%r15b': '%r8b..%r14b',
|
| + '%r8w..%r15w': '%r8w..%r14w',
|
| + '%r8d..%r15d': '%r8d..%r14d',
|
| + '%r8..%r15': '%r8..%r14'
|
| + }
|
| + if registers_interval in replacements:
|
| + return replacements.get(registers_interval, registers_interval)
|
| + else:
|
| + return ProtectIndexRegister(registers_interval)
|
| +
|
| +
|
| +def RegisterRegex(register_kind, opcode_bits = 0):
|
| + """ Returns modrm regex for the first register """
|
| +
|
| + regex_mod = '^.*?({:02x})'.format(0xc0 + opcode_bits * 8)
|
| + regex_rm = '(%' + REGISTERS[register_kind][0] + ')'
|
| + return regex_mod, regex_rm
|
| +
|
| +
|
| +def MemoryRegex(base_r8, index_r8, opcode_bits = 0):
|
| + """ Returns one regex for bytes and one for the textual representation. """
|
| +
|
| + # We only need to process ModR/M+SIB '04 04' or '04 07' here. We only match
|
| + # memory operands without offset, so we don't need to add zeros to the end of
|
| + # regex_modrm and regex_modrm regular expressions.
|
| + #
|
| + # We pick version which is "always there" (in both 32bit and 64bit mode) and
|
| + # which is not 0x00 (to avoid confusion with offsets and immediates).
|
| + if base_r8:
|
| + regex_modrm_sib = '^.*?({:02x} 07)'.format(0x04 + opcode_bits * 8)
|
| + else:
|
| + regex_modrm_sib = '^.*?({:02x} 04)'.format(0x04 + opcode_bits * 8)
|
| + if options.bitness == 32:
|
| + regex_mem = r'(\(%esp,%eax,1\))'
|
| + elif base_r8:
|
| + regex_mem = r'(\(%r15,%r8,1\))' if index_r8 else r'(\(%r15,%rax,1\))'
|
| + else:
|
| + regex_mem = r'(\(%rsp,%r8,1\))' if index_r8 else r'(\(%rsp,%rax,1\))'
|
| + return regex_modrm_sib, regex_mem
|
| +
|
| +
|
| +def RegToRmCompressor(rm, reg, writes_to, memory_accessed=True,
|
| + register_write='ignore', index_r8=False,
|
| + rm_compression='register', is_3Dnow=False, notes=''):
|
| + """Returns a list of reg <-> rm compressors for a given set of parameters.
|
| +
|
| + Args:
|
| + rm: rm operand kind (see REGISTERS array)
|
| + reg: reg operand kind (see REGISTERS array) or None if reg is not used
|
| + writes_to: three-state selector
|
| + 'reg' - instruction uses rm as source, reg as destination
|
| + 'rm' - instruction uses reg as source, rm as destination
|
| + 'both' - instruction writes to both reg and rm
|
| + memory_accessed: True if instruction accesses memory
|
| + register_write: three-state selector
|
| + 'sandbox' - instruction can be used to produce "restricted register"
|
| + 'protect' - instruction can damage output, protect "special registers"
|
| + 'ignore' - instruction does not affect it's operands (e.g. test) or
|
| + is used with non-GP registers (X87, MMX, XMM, etc)
|
| + index_r8: True if REX.X bit in the instruction set to 1
|
| + rm_compression: three-state selector
|
| + 'register' - instruction supports register-only rm parameter
|
| + 'memory' - instruction supports memory-only rm parameter
|
| + 'register_or_memory' - instruction supports all kinds of rm parameters
|
| + is_3DNow - True if instruction is 3DNow! style (opcode in immidiate)
|
| + notes - Notes to add to the description
|
| +
|
| + Returns:
|
| + List of compressors
|
| + """
|
| +
|
| + start_reg = REGISTERS[reg][0]
|
| + end_reg = REGISTERS[reg][-1]
|
| + mark_reg = '%{}..%{}'.format(start_reg, end_reg)
|
| + # Exclude rbp/rsp/r15 from the interval, if instruction can write to reg.
|
| + if register_write != 'ignore' and writes_to in ('reg', 'both'):
|
| + mark_reg = ProtectBaseRegister(mark_reg)
|
| + start_rm = REGISTERS[rm][0]
|
| + end_rm = REGISTERS[rm][-1]
|
| + mark_rm = '%{}..%{}'.format(start_rm, end_rm)
|
| + # Exclude rbp/rsp/r15 from the interval, if instruction can write to rm.
|
| + if register_write != 'ignore' and writes_to in ('rm', 'both'):
|
| + mark_rm = ProtectBaseRegister(mark_rm)
|
| + # Prepare substitution texts
|
| + subst_reg = '[{}]'.format(mark_reg)
|
| + if rm_compression == 'register':
|
| + subst_rm = '[{}]'.format(mark_rm)
|
| + elif rm_compression == 'register_or_memory':
|
| + subst_rm = '[{} or memory]'.format(mark_rm)
|
| + else:
|
| + assert rm_compression == 'memory'
|
| + subst_rm = '[memory]'
|
| + base_r8 = rm in r8.values()
|
| + regex, regex_reg = RegisterRegex(reg)
|
| + if rm_compression == 'register':
|
| + regex, regex_rm = RegisterRegex(rm)
|
| + else: # memory and register_or_memory
|
| + regex, regex_rm = MemoryRegex(base_r8, index_r8)
|
| + if is_3Dnow:
|
| + # Immediate byte is opcode extension with 3DNow! instructions.
|
| + regex += ' [0-9a-fA-F][0-9a-fA-F]'
|
| + else:
|
| + # Accept immediate (which is always zeros in our case).
|
| + regex += '(?: 00)*'
|
| + # 2 spaces separate byte code from instruction name.
|
| + regex += ' '
|
| + # Optional "lock" prefix.
|
| + regex += '(?:lock )?'
|
| + # Instruction name.
|
| + regex += r'\w* '
|
| + # Immediate or implicit operand that can precede reg/rm pair.
|
| + regex += r'(?:\$0x0,|\$0x0,\$0x0,|%cl,|%xmm0,)?'
|
| + regex_output_rr = None
|
| + subst_output_rr = None
|
| + if writes_to == 'reg':
|
| + regex += regex_rm + ',' + regex_reg
|
| + if register_write == 'sandbox':
|
| + assert reg in ('eax', 'r8d')
|
| + regex_output_rr = '%' + reg
|
| + subst_output_rr = subst_reg
|
| + subst = ('XX', subst_rm, subst_reg)
|
| + else:
|
| + regex += regex_reg + ',' + regex_rm
|
| + if register_write == 'sandbox':
|
| + assert rm in ('eax', 'r8d')
|
| + # Instruction can be sandboxing one and yet produce output_rr == None
|
| + # This looks strange, but it's perfectly logical because we can't ever
|
| + # sandbox memory!
|
| + if rm_compression == 'register':
|
| + regex_output_rr = '%' + rm
|
| + if rm_compression != 'memory':
|
| + subst_output_rr = '[{}]'.format(mark_rm)
|
| + subst = ('XX', subst_reg, subst_rm)
|
| + # Skip implicit arguments (if any)
|
| + regex += '.*'
|
| + if options.bitness == 64:
|
| + if memory_accessed and rm_compression != 'register':
|
| + regex += '; input_rr=(%r8d)' if index_r8 else '; input_rr=(%eax)'
|
| + subst_input_rr = '%r8d..%r15d' if index_r8 else '%eax..%edi'
|
| + subst_input_rr = '[{}]'.format(ProtectIndexRegister(subst_input_rr))
|
| + else:
|
| + regex += '; input_rr=(any_nonspecial)'
|
| + subst_input_rr = 'any_nonspecial'
|
| + regex += '; output_rr=({})'.format(regex_output_rr)
|
| + subst = subst + (subst_input_rr, subst_output_rr)
|
| + # With "or memory" or "memory" compressors we always can see where is
|
| + # reg and where is rm, but with reg to rm or rm to reg it's impossible.
|
| + # Add the appropriate comment
|
| + notes = () if notes == '' else (notes, )
|
| + if rm_compression == 'register':
|
| + notes += ('rm to reg' if writes_to == 'reg' else 'reg to rm', )
|
| + notes = '; '.join(notes)
|
| + if notes != '':
|
| + notes = ' # ' + notes
|
| + subst += (notes, )
|
| + regex += '()$'
|
| + replacement = ()
|
| + if rm_compression != 'memory':
|
| + replacement += ModRMRegisterReplacements(
|
| + reg=reg, rm=rm, writes_to=writes_to, register_write=register_write)
|
| + if rm_compression != 'register':
|
| + replacement += ModRMMemoryReplacements(
|
| + reg=reg, base_r8=base_r8, index_r8=index_r8, writes_to=writes_to,
|
| + memory_accessed=memory_accessed, register_write=register_write)
|
| + return Compressor(regex, subst, replacement)
|
| +
|
| +
|
| +def RegToRmCompressors():
|
| + """ Return list of all Reg to RM (and RM to Reg) compressors. """
|
| +
|
| + compressors = []
|
| +
|
| + for reg, rm in RegisterKindPairs():
|
| + instruction_kinds = [
|
| + # Normal instructions with two operands (rm to reg)
|
| + {'writes_to': 'reg'},
|
| + # Normal instructions with two operands (reg to rm)
|
| + {'writes_to': 'rm'}
|
| + ]
|
| + # In 64 bit mode we have many different types of instructions depending
|
| + # on whether we are accessing memory or whether we are writing to registers.
|
| + if options.bitness == 64:
|
| + # Lea in 64 bit mode is truly unique instruction for now
|
| + if reg in ('ax', 'r8w', 'eax', 'r8d', 'rax', 'r8'):
|
| + register_write = 'sandbox' if reg in ('eax', 'r8d') else 'protect'
|
| + instruction_kinds.append(
|
| + {'writes_to': 'reg',
|
| + 'memory_accessed': False,
|
| + 'register_write': register_write,
|
| + 'notes': 'lea'})
|
| + # There are few more forms in 64 bit case (rm to reg)
|
| + if reg in ('eax', 'r8d'):
|
| + # Zero-extending version.
|
| + instruction_kinds.append(
|
| + {'writes_to': 'reg',
|
| + 'register_write': 'sandbox'})
|
| + # More forms in 64 bit case (reg to rm)
|
| + if rm in ('eax', 'r8d'):
|
| + # Zero-extending version.
|
| + instruction_kinds.append(
|
| + {'writes_to': 'rm',
|
| + 'register_write': 'sandbox'})
|
| + # Zero-extending xchg/xadd
|
| + instruction_kinds.append(
|
| + {'writes_to': 'both',
|
| + 'register_write': 'sandbox',
|
| + 'notes': 'write to both'})
|
| + # Still more forms for 64 bit case (rm to reg).
|
| + if reg in ('al', 'spl', 'ax', 'eax', 'rax', 'r8b', 'r8w', 'r8d', 'r8'):
|
| + # Dangerous instructions (rm to reg)
|
| + instruction_kinds.append(
|
| + {'writes_to': 'reg',
|
| + 'register_write': 'protect'})
|
| + # Still more forms for 64 bit case (reg to rm)
|
| + if rm in ('al', 'spl', 'ax', 'eax', 'rax', 'r8b', 'r8w', 'r8d', 'r8'):
|
| + # Dangerous instructions (reg to rm)
|
| + instruction_kinds.append(
|
| + {'writes_to': 'rm',
|
| + 'register_write': 'protect'})
|
| + # Dangerous xchg/xadd
|
| + instruction_kinds.append(
|
| + {'writes_to': 'both',
|
| + 'register_write': 'protect',
|
| + 'notes': 'write to both'})
|
| + # 3DNow! instructions
|
| + if reg in ('mm0', 'mmalt') or rm in ('mm0', 'mmalt'):
|
| + instruction_kinds.append(
|
| + {'writes_to': 'reg',
|
| + 'is_3Dnow': True})
|
| + for instruction_kind in instruction_kinds:
|
| + for rm_compression in ('register', 'memory', 'register_or_memory'):
|
| + if rm_compression == 'register':
|
| + # Register-to-register instructions can not access memory anyway
|
| + # which means special compressors are not needed.
|
| + if 'memory_accessed' in instruction_kind:
|
| + continue
|
| + elif options.bitness == 64:
|
| + # We generate 2 variants of compressors with memory operands to
|
| + # enumerate 2 possible values of index_r8 property. Register only
|
| + # instructions have only one version since they don't use index.
|
| + compressors.append(RegToRmCompressor(
|
| + reg=reg, rm=rm, index_r8=True,
|
| + rm_compression=rm_compression, **instruction_kind))
|
| + # Create "normal" compressor now
|
| + compressors.append(RegToRmCompressor(
|
| + reg=reg, rm=rm,
|
| + rm_compression=rm_compression, **instruction_kind))
|
| + return compressors
|
| +
|
| +
|
| +def RmCompressor(rm, opcode_bits, memory_accessed=True, index_r8=False,
|
| + rm_compression='register', register_write='ignore'):
|
| + """Returns a list of rm compressors for a given set of parameters.
|
| +
|
| + Args:
|
| + rm: rm operand kind (see REGISTERS array)
|
| + memory_accessed: True if instruction accesses memory
|
| + index_r8: True if REX.X bit in the instruction set to 1
|
| + rm_compression: three-state selector
|
| + 'register' - instruction supports register-only rm parameter
|
| + 'memory' - instruction supports memory-only rm parameter
|
| + 'register_or_memory' - instruction supports all kinds of rm parameters
|
| + register_write: three-state selector
|
| + 'sandbox' - instruction can be used to produce "restricted register"
|
| + 'protect' - instruction can damage output, protect "special registers"
|
| + 'ignore' - instruction does not affect it's operands (e.g. test) or
|
| + is used with non-GP registers (X87, MMX, XMM, etc)
|
| +
|
| + Returns:
|
| + List of compressors
|
| + """
|
| + start_rm = REGISTERS[rm][0]
|
| + end_rm = REGISTERS[rm][-1]
|
| + mark_rm = '%{}..%{}'.format(start_rm, end_rm)
|
| + # Exclude rbp/rsp/r15 from the interval, if instruction can write to rm.
|
| + if register_write != 'ignore':
|
| + mark_rm = ProtectBaseRegister(mark_rm)
|
| + byte_mark = 'XX/' + str(opcode_bits)
|
| + # Prepare substitution texts
|
| + if rm_compression == 'register':
|
| + subst_rm = '[{}]'.format(mark_rm)
|
| + elif rm_compression == 'register_or_memory':
|
| + subst_rm = '[{} or memory]'.format(mark_rm)
|
| + else:
|
| + assert rm_compression == 'memory'
|
| + subst_rm = '[memory]'
|
| + base_r8 = rm in r8.values()
|
| + if rm_compression == 'register':
|
| + regex, regex_rm = RegisterRegex(rm, opcode_bits)
|
| + else: # memory and register_or_memory
|
| + regex, regex_rm = MemoryRegex(base_r8, index_r8, opcode_bits)
|
| + # Regex here matches everything AFTER the ModR/M (+ SIB) bytes.
|
| + regex += '(?: 00)*'
|
| + # 2 spaces separate byte code from instruction name.
|
| + regex += ' '
|
| + # Optional "lock" prefix.
|
| + regex += '(?:lock )?'
|
| + # Instruction name.
|
| + regex += r'\w* '
|
| + # Immediate or implicit operand that can precede rm argument.
|
| + regex += r'(?:\$0x0,|%cl,)?'
|
| + # Register or memory access
|
| + regex += regex_rm
|
| + # Skip everything after that
|
| + regex += '.*'
|
| + regex_output_rr = None
|
| + subst_output_rr = None
|
| + subst = (byte_mark, subst_rm)
|
| + if options.bitness == 32:
|
| + subst += ('', )
|
| + else:
|
| + if register_write == 'sandbox':
|
| + assert rm in ('eax', 'r8d')
|
| + # Instruction can be sandboxing one and yet produce output_rr == None
|
| + # This looks strange, but it's perfectly logical because we can't ever
|
| + # sandbox memory!
|
| + if rm_compression == 'register':
|
| + regex_output_rr = '%' + rm
|
| + if rm_compression != 'memory':
|
| + subst_output_rr = '[{}]'.format(mark_rm)
|
| + if memory_accessed and rm_compression != 'register':
|
| + regex += '; input_rr=(%r8d)' if index_r8 else '; input_rr=(%eax)'
|
| + subst_input_rr = '%r8d..%r15d' if index_r8 else '%eax..%edi'
|
| + subst_input_rr = '[{}]'.format(ProtectIndexRegister(subst_input_rr))
|
| + else:
|
| + regex += '; input_rr=(any_nonspecial)'
|
| + subst_input_rr = 'any_nonspecial'
|
| + regex += '; output_rr=({})'.format(regex_output_rr)
|
| + subst = subst + (subst_input_rr, subst_output_rr, '')
|
| + regex += '()'
|
| + replacement = ()
|
| + if rm_compression != 'memory':
|
| + replacement += ModRMRegisterReplacements(
|
| + rm=rm, opcode_bits=opcode_bits, register_write=register_write)
|
| + if rm_compression != 'register':
|
| + replacement += ModRMMemoryReplacements(
|
| + base_r8=base_r8, index_r8=index_r8, opcode_bits=opcode_bits,
|
| + memory_accessed=memory_accessed, register_write=register_write)
|
| + return Compressor(regex, subst, replacement)
|
| +
|
| +
|
| +def RmCompressors():
|
| + """ Return list of all RM (with reg as opcode extension) compressors. """
|
| + compressors = []
|
| +
|
| + for rm in RegisterKinds():
|
| + for opcode_bits in xrange(8):
|
| + instruction_kinds = [
|
| + # The most basic form
|
| + {}
|
| + ]
|
| + # In 64 bit mode we have many different types of instructions depending
|
| + # on whether we are accessing memory or whether we are writing to
|
| + # registers.
|
| + if options.bitness == 64:
|
| + # No memory access (e.g. prefetch)
|
| + instruction_kinds.append(
|
| + {'memory_accessed':False})
|
| + # More forms in 64 bit case.
|
| + if rm in ('eax', 'r8d'):
|
| + # Zero-extending version.
|
| + instruction_kinds.append(
|
| + {'register_write':'sandbox'})
|
| + # Still more forms for 64 bit case (reg to rm).
|
| + if rm in ('al', 'spl', 'ax', 'eax', 'rax',
|
| + 'r8b', 'r8w', 'r8d', 'r8'):
|
| + # Dangerous instructions.
|
| + instruction_kinds.append(
|
| + {'register_write':'protect'})
|
| + for instruction_kind in instruction_kinds:
|
| + for rm_compression in ('register', 'memory', 'register_or_memory'):
|
| + if rm_compression == 'register':
|
| + # Register-to-register instructions can not access memory anyway
|
| + # which means special compressors are not needed.
|
| + if 'memory_accessed' in instruction_kind:
|
| + continue
|
| + elif options.bitness == 64:
|
| + # We generate 2 variants of compressors with memory operands to
|
| + # enumerate 2 possible values of index_r8 property. Register only
|
| + # instructions have only one version since they don't use index.
|
| + compressors.append(RmCompressor(
|
| + rm=rm, opcode_bits=opcode_bits, index_r8=True,
|
| + rm_compression=rm_compression, **instruction_kind))
|
| + # Create "normal" compressor now
|
| + compressors.append(RmCompressor(
|
| + rm=rm, opcode_bits=opcode_bits,
|
| + rm_compression=rm_compression, **instruction_kind))
|
| + return compressors
|
| +
|
| +
|
| +def OpcodeCompressors():
|
| + """ Return "register in opcode" compressors. """
|
| + compressors = []
|
| +
|
| + for reg in RegisterKinds() + ('st(0)',):
|
| + for opcode in xrange(8):
|
| + for text1, text2, nibble in (
|
| + ('[0..7]', '[8..f]', xrange(8)),
|
| + ('[012367]', '[89abef]', (0, 1, 2, 3, 6, 7)),
|
| + ('[0..6]', '[8..e]', xrange(7))
|
| + ):
|
| + start_reg = REGISTERS[reg][0]
|
| + end_reg = REGISTERS[reg][-1]
|
| + mark_reg = '%{}..%{}'.format(start_reg, end_reg)
|
| + if 7 not in nibble:
|
| + if ProtectBaseRegister(mark_reg) != mark_reg:
|
| + mark_reg = ProtectBaseRegister(mark_reg)
|
| + else:
|
| + continue
|
| + subst_reg = '[{}]'.format(mark_reg)
|
| + # Note that we use instruction with 1st register, not 0th register
|
| + # here to avoid ambiguity when opcode is 0x00
|
| + compressors.append(Compressor(
|
| + r'.*?[0-9a-fA-F](1)(?: 00)*'
|
| + r' \w* (?:\$0x0,|%ax,|%st,)?'
|
| + r'(%' + REGISTERS_RE[reg][1] + ').*',
|
| + (text1, subst_reg),
|
| + [('{:x}'.format(n), '%' + REGISTERS[reg][n])
|
| + for n in nibble]))
|
| + compressors.append(Compressor(
|
| + r'.*?[0-9a-fA-F](8)(?: 00)*'
|
| + r' \w* (?:\$0x0,|%ax,|%st,)?'
|
| + r'(%' + REGISTERS_RE[reg][0] + ').*',
|
| + (text2, subst_reg),
|
| + [('{:x}'.format(n + 8), '%' + REGISTERS[reg][n])
|
| + for n in nibble]))
|
| + # Another version for 64 bit case
|
| + if options.bitness == 64 and reg in ('eax', 'r8d'):
|
| + compressors.append(Compressor(
|
| + r'.*?[0-9a-fA-F](1)(?: 00)*'
|
| + r' \w* (?:\$0x0,|%ax,|%st,)?'
|
| + r'(%' + REGISTERS_RE[reg][1] + ').*'
|
| + r'output_rr=(%'+ REGISTERS_RE[reg][1] + ').*',
|
| + (text1, subst_reg, subst_reg),
|
| + [['{:x}'.format(n)] + ['%' + REGISTERS[reg][n]] * 2
|
| + for n in nibble]))
|
| + compressors.append(Compressor(
|
| + r'.*?[0-9a-fA-F](8)(?: 00)*'
|
| + r' \w* (?:\$0x0,|%ax,|%st,)?'
|
| + r'(%' + REGISTERS_RE[reg][0] + ').*'
|
| + r'output_rr=(%'+ REGISTERS_RE[reg][0] + ').*',
|
| + (text2, subst_reg, subst_reg),
|
| + [['{:x}'.format(n + 8)] + ['%' + REGISTERS[reg][n]] * 2
|
| + for n in nibble]))
|
| + return compressors
|
| +
|
| +
|
| +def MemoryNonMemoryCompressors():
|
| + """ Return memory/nonmemory compressors. """
|
| +
|
| + compressors = []
|
| +
|
| + if options.bitness == 32:
|
| + letters_and_registers = (('b', 'al', ''), ('w', 'ax', ''), ('l', 'eax', ''))
|
| + else:
|
| + letters_and_registers = (
|
| + ('b', 'al', 'eax'), ('b', 'spl', 'eax'), ('b', 'r8b', 'r8d'),
|
| + ('w', 'ax', 'eax'), ('w', 'r8w', 'r8d'),
|
| + ('l', 'eax', 'eax'), ('l', 'r8d', 'r8d'),
|
| + ('q', 'rax', 'eax'), ('q', 'r8', 'r8d')
|
| + )
|
| + for letter, reg, out_reg in letters_and_registers:
|
| + start_reg = REGISTERS[reg][0]
|
| + end_reg = REGISTERS[reg][-1]
|
| + for rmR15 in (True, False):
|
| + rm_mark = '%{}..%{}'.format(start_reg, end_reg)
|
| + if not rmR15:
|
| + rm_mark = ProtectBaseRegister(rm_mark)
|
| + all_regs = '[{}]'.format(rm_mark)
|
| + regs_mark = '[{} or memory]'.format(rm_mark)
|
| + if options.bitness == 64:
|
| + start_out = REGISTERS[out_reg][0]
|
| + end_out = REGISTERS[out_reg][-1]
|
| + mark_out = '%{}..%{}'.format(start_out, end_out)
|
| + out_regs = '[{}]'.format(ProtectBaseRegister(mark_out))
|
| + for notes in ('', ' # rm to reg', ' # reg to rm'):
|
| + compressors.append(Compressor(
|
| + r'.* \w*(' + letter + r') .*(\[memory]).*()',
|
| + ('[{}]?'.format(letter), regs_mark, ''),
|
| + ((letter, '[memory]', ''), ('', all_regs, notes))))
|
| + if options.bitness == 64:
|
| + for index_reg in ('eax', 'r8d'):
|
| + start_index = REGISTERS[index_reg][0]
|
| + end_index = REGISTERS[index_reg][-1]
|
| + index_mark = '%{}..%{}'.format(start_index, end_index)
|
| + index_regs = '[{}]'.format(ProtectIndexRegister(index_mark))
|
| + for output_rrs in ((None, out_regs),
|
| + (out_regs, None),
|
| + (None, None)):
|
| + compressors.append(Compressor(
|
| + r'.* \w*(' + letter + r') .*(\[memory]).*; '
|
| + r'input_rr=(\[(?:%[a-z0-9]*(?:\.\.|\|))+%[a-z0-9]*]); '
|
| + r'output_rr=(\[(?:%[a-z0-9]*(?:\.\.|\|))+%[a-z0-9]*]|None)()',
|
| + ('[{}]?'.format(letter), regs_mark, index_regs,
|
| + output_rrs[1] if output_rrs[0] is None else output_rrs[0],
|
| + ''),
|
| + ((letter, '[memory]', index_regs, output_rrs[0], ''),
|
| + ('', all_regs, 'any_nonspecial', output_rrs[1], notes))))
|
| + return compressors
|
| +
|
| +
|
| +def RexCompressor(rm, rmR15, reg, regR15, rexw, input_rr, output_rr, rm_to_reg):
|
| + """ Return REX compressor (or nothing) for a given set of paramenters.
|
| +
|
| + Args:
|
| + rm: rm operand kind (see REGISTERS array) or None if rm is not used
|
| + reg: reg operand kind (see REGISTERS array) or None if reg is not used
|
| + rmR15: True if R15 register is included
|
| + regR15: True if R15 register is included
|
| + rexw: True if REX.W should be set
|
| + input_rr: True if input_rr is used
|
| + output_rr: true if output_rr is used
|
| + rm_to_reg: True if direction is rm to reg
|
| + """
|
| +
|
| + def MakeRegex(string):
|
| + return string.replace('.', r'\.').replace('|', r'\|')
|
| +
|
| + # reg and rm can be of three different possible intervals
|
| + # start_reg/rm to end_reg/rm (e.g. from %al to %bh)
|
| + # start_reg/rm to end_reg0/rm0 (e.g from %al to %dil)
|
| + # start_reg8/rm8 to end_reg8/rm8 (e.g. from %r8b to %r15b)
|
| + # First form can be observed if there are not REX, second form
|
| + # if REX.R/REX.B is not set and third form is where it's set.
|
| + if reg:
|
| + start_reg = REGISTERS[reg][0]
|
| + start_reg8 = REGISTERS[r8[reg]][0]
|
| + end_reg = REGISTERS[reg][-1]
|
| + end_reg0 = 'dil' if reg == 'al' else end_reg
|
| + end_reg8 = REGISTERS[r8[reg]][-1]
|
| + if rexw:
|
| + mark_reg = '%{}..%{}'.format(start_reg, end_reg0)
|
| + else:
|
| + mark_reg = '%{}..%{}'.format(start_reg, end_reg)
|
| + if not regR15:
|
| + mark_reg = ProtectBaseRegister(mark_reg)
|
| + regex_reg = r'\[({})]'.format(MakeRegex(mark_reg))
|
| + if rm:
|
| + start_rm = REGISTERS[rm][0]
|
| + start_rm8 = REGISTERS[r8[rm]][0]
|
| + end_rm = REGISTERS[rm][-1]
|
| + end_rm0 = 'dil' if rm == 'al' else end_rm
|
| + end_rm8 = REGISTERS[r8[rm]][-1]
|
| + if rexw:
|
| + mark_rm = '%{}..%{}'.format(start_rm, end_rm0)
|
| + else:
|
| + mark_rm = '%{}..%{}'.format(start_rm, end_rm)
|
| + if not rmR15:
|
| + mark_rm = ProtectBaseRegister(mark_rm)
|
| + regex_rm = r'\[({})(?: or memory)?]'.format(MakeRegex(mark_rm))
|
| +
|
| + # Legacy prefixes
|
| + regex = '.*:(?: 26| 2e| 36| 3e| 64| 65| 66| 67| f0| f2| f3)*'
|
| + # REX
|
| + regex += '( 48).*' if rexw else '( 40|).*'
|
| + # Replacement text
|
| + replacement_tuple = (' [REX:48..4f]' if rexw else ' [REX:40..47]?', )
|
| + if reg:
|
| + replacement_regs = '%{}..%{}'.format(start_reg, end_reg8)
|
| + if not regR15:
|
| + replacement_regs = ProtectBaseRegister(replacement_regs)
|
| + if rm:
|
| + replacement_rms = '%{}..%{}'.format(start_rm, end_rm8)
|
| + if not rmR15:
|
| + replacement_rms = ProtectBaseRegister(replacement_rms)
|
| + # Instruction arguments
|
| + if not reg and not rm:
|
| + pass
|
| + elif not reg and rm:
|
| + regex += regex_rm + '.*'
|
| + replacement_tuple += (replacement_rms, )
|
| + elif reg and not rm:
|
| + regex += regex_reg + '.*'
|
| + replacement_tuple += (replacement_regs, )
|
| + elif rm_to_reg:
|
| + regex += regex_rm + ',' + regex_reg + '.*'
|
| + replacement_tuple += (replacement_rms, replacement_regs)
|
| + else:
|
| + regex += regex_reg + ',' + regex_rm + '.*'
|
| + replacement_tuple += (replacement_regs, replacement_rms)
|
| + # Input and output restricted registers
|
| + if input_rr:
|
| + regex += r'input_rr=\[({})].*'.format(
|
| + MakeRegex(ProtectIndexRegister('%eax..%edi')))
|
| + replacement_tuple += (ProtectIndexRegister('%eax..%r15d'), )
|
| + if output_rr:
|
| + regex += r'output_rr=\[({})].*'.format(
|
| + MakeRegex(ProtectIndexRegister('%eax..%edi')))
|
| + replacement_tuple += (ProtectBaseRegister('%eax..%r15d'), )
|
| + # Replacement cases
|
| + replacement_tuples = ()
|
| + for byte in (range(0x48, 0x50) if rexw else range(0x40, 0x48) + [0]):
|
| + replacement_case = (' {:02x}'.format(byte) if byte else '', )
|
| + if rm:
|
| + if byte & 0x1:
|
| + replacement_rms = '%{}..%{}'.format(start_rm8, end_rm8)
|
| + elif byte:
|
| + replacement_rms = '%{}..%{}'.format(start_rm, end_rm0)
|
| + else:
|
| + replacement_rms = '%{}..%{}'.format(start_rm, end_rm)
|
| + if not rmR15:
|
| + replacement_rms = ProtectBaseRegister(replacement_rms)
|
| + if byte & 0x2:
|
| + replacement_index = '%r8d..%r15d'
|
| + else:
|
| + replacement_index = '%eax..%edi'
|
| + if reg:
|
| + if byte & 0x4:
|
| + replacement_regs = '%{}..%{}'.format(start_reg8, end_reg8)
|
| + elif byte:
|
| + replacement_regs = '%{}..%{}'.format(start_reg, end_reg0)
|
| + else:
|
| + replacement_regs = '%{}..%{}'.format(start_reg, end_reg)
|
| + if not regR15:
|
| + replacement_regs = ProtectBaseRegister(replacement_regs)
|
| + if not reg and not rm:
|
| + pass
|
| + elif not reg and rm:
|
| + replacement_case += (replacement_rms, )
|
| + final_rr = '%r8d..%r15d' if byte & 0x1 else '%eax..%edi'
|
| + elif reg and not rm:
|
| + replacement_case += (replacement_regs, )
|
| + final_rr = '%r8d..%r15d' if byte & 0x4 else '%eax..%edi'
|
| + elif rm_to_reg:
|
| + replacement_case += (replacement_rms, replacement_regs)
|
| + final_rr = '%r8d..%r15d' if byte & 0x4 else '%eax..%edi'
|
| + else:
|
| + replacement_case += (replacement_regs, replacement_rms)
|
| + final_rr = '%r8d..%r15d' if byte & 0x1 else '%eax..%edi'
|
| + if input_rr: replacement_case += (ProtectIndexRegister(replacement_index), )
|
| + if output_rr: replacement_case += (ProtectBaseRegister(final_rr), )
|
| + replacement_tuples += (replacement_case, )
|
| + return Compressor(regex, replacement_tuple, replacement_tuples)
|
| +
|
| +
|
| +def RexCompressors():
|
| + """ Return "REX" compressors which combine different REX prefixes. """
|
| +
|
| + if options.bitness != 64:
|
| + return []
|
| +
|
| + compressors = []
|
| +
|
| + # First pretty complex set of compressors to combine versions of REX with
|
| + # three lowest bits in different states.
|
| + register_kind_pairs = (
|
| + ( None, None),
|
| + ( 'al', 'al'), ( 'al', None), (None, 'al'),
|
| + ( 'ax', 'al'), ( 'al', 'ax'),
|
| + ( 'ax', 'ax'), ( 'ax', None), (None, 'ax'),
|
| + ( 'eax', 'al'), ( 'al', 'eax'),
|
| + ( 'eax', 'ax'), ( 'ax', 'eax'),
|
| + ( 'eax', 'eax'), ( 'eax', None), (None, 'eax'),
|
| + ( 'rax', 'al'), ( 'al', 'rax'),
|
| + ( 'rax', 'ax'), ( 'ax', 'rax'),
|
| + ( 'rax', 'eax'), ( 'eax', 'rax'),
|
| + ( 'rax', 'rax'), ( 'rax', None), (None, 'rax'),
|
| + ( 'eax', 'mm0'), ( 'mm0', 'eax'),
|
| + ( 'rax', 'mm0'), ( 'mm0', 'rax'),
|
| + ( 'mm0', 'eax'), ( 'eax', 'mm0'),
|
| + ( 'mm0', 'rax'), ( 'rax', 'mm0'),
|
| + ( 'eax', 'xmm0'),
|
| + ( 'rax', 'xmm0'),
|
| + ('xmm0', 'eax'),
|
| + ('xmm0', 'rax'),
|
| + ( 'mm0', 'mm0'), ( 'mm0', None), (None, 'mm0'),
|
| + ( 'mm0', 'xmm0'),
|
| + ('xmm0', 'mm0'),
|
| + ('xmm0', 'xmm0'),
|
| + ('xmm0', 'ymm0'), ('xmm0', None), (None, 'xmm0'),
|
| + ('ymm0', 'xmm0'),
|
| + ('ymm0', 'ymm0'), ('ymm0', None), (None, 'ymm0'),
|
| + )
|
| + for reg, rm in register_kind_pairs:
|
| + for regR15 in (True, False):
|
| + for rmR15 in (True, False):
|
| + for rexw in (True, False):
|
| + for input_rr in (True, False):
|
| + for output_rr in (True, False):
|
| + for rm_to_reg in (True, False):
|
| + # These combinations will just produce useless duplicates
|
| + if not reg and not regR15: continue
|
| + if not rm and not rmR15: continue
|
| + if not reg and not rm and (output_rr or rm_to_reg): continue
|
| + compressors.append(RexCompressor(
|
| + rm=rm, rmR15=rmR15, reg=reg, regR15=regR15,
|
| + rexw=rexw, input_rr=input_rr, output_rr=output_rr,
|
| + rm_to_reg=rm_to_reg))
|
| +
|
| + # This is pretty simple compressor to combine two lines with different REX.W
|
| + # bits (only if they are otherwise identical).
|
| + compressors.append(Compressor(
|
| + r'.*(\[REX:40\.\.47]\?).*', ('[REX:40..4f]?', ),
|
| + (('[REX:40..47]?', ), ('[REX:48..4f]', ))))
|
| + return compressors
|
| +
|
| +
|
| +def SpecialCompressors():
|
| + """ Return all "special" compressors. """
|
| +
|
| + compressors = []
|
| +
|
| + # Special compressors: will handle some cosmetic issues.
|
| + #
|
| + # SETxx ignores reg field and thus are described as many separate instructions
|
| + compressors.append(Compressor(
|
| + '.*0f 9[0-9a-fA-F] XX(/[0-7]) set.*', ('', ),
|
| + [('/' + str(i), ) for i in range(8)]))
|
| + # BSWAP is described with opcode "0f c8+r", not "0f /1" in manual
|
| + if options.bitness == 32:
|
| + compressors.append(Compressor(
|
| + '.*(XX/1) bswap.*ax.*', ('c[8..f]', ), (('XX/1', ), )))
|
| + else:
|
| + compressors.append(Compressor(
|
| + '.*(XX/1) bswap.*ax.*', ('c[89abef]', ), (('XX/1', ), )))
|
| + compressors.append(Compressor(
|
| + '.*(XX/1) bswap.*r8.*', ('c[8..e]', ), (('XX/1', ), )))
|
| + # Add mark '# write to both' to certain versions of CMPXCHG, XADD, and XCHG
|
| + if options.bitness == 64:
|
| + compressors.append(Compressor(
|
| + r'.* (?:cmpxchg|xadd|xchg).*%al\.\.%bh[^#]*()$',
|
| + (' # write to both', ),
|
| + (('', ), )))
|
| + # "and $0xe0,[%eax..%edi]" is treated specially which means that we list all
|
| + # versions of and "[$0x1..$0xff],[%eax..%edi]" separately here.
|
| + # Without this rule these ands comprise 2/3 of the whole output!
|
| + if options.bitness == 32:
|
| + compressors.append(Compressor(
|
| + r'.*83 (e0 01 and \$0x1,%eax)()',
|
| + ('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]',
|
| + ' # special and'),
|
| + [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS['eax'][r]),
|
| + '')
|
| + for i in range(0x01, 0x100) for r in range(8)] +
|
| + [('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]',
|
| + '')]))
|
| + else:
|
| + for reg in ('eax', 'r8d'):
|
| + start_reg = REGISTERS[reg][0]
|
| + end_reg = REGISTERS[reg][-1]
|
| + mark_reg = '%{}..%{}'.format(start_reg, end_reg)
|
| + mark_reg = ProtectBaseRegister(mark_reg)
|
| + for index_reg in ('eax', 'r8d'):
|
| + start_index = REGISTERS[index_reg][0]
|
| + end_index = REGISTERS[index_reg][-1]
|
| + mark_index = '%{}..%{}'.format(start_index, end_index)
|
| + mark_index = ProtectIndexRegister(mark_index)
|
| + compressors.append(Compressor(
|
| + r'.*83 (e0 01 and \$0x1,%' + reg + ').*'
|
| + 'input_rr=(any_nonspecial); output_rr=(%' + reg + ')()',
|
| + ('XX/4 00 and[l]? $0x0,[{} or memory]'.format(mark_reg),
|
| + '[{}]'.format(mark_index),
|
| + '[{}]'.format(mark_reg),
|
| + ' # special and'),
|
| + [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS[reg][r]),
|
| + 'any_nonspecial', '%' + REGISTERS[reg][r], '')
|
| + for i in range(0x01, 0x100) for r in range(7 + (reg == 'eax'))] +
|
| + [('XX/4 00 and[l]? $0x0,[{} or memory]'.format(mark_reg),
|
| + '[{}]'.format(mark_index),
|
| + '[{}]'.format(mark_reg),
|
| + '')]))
|
| +
|
| + # "and $e0" and similar are used to align %rsp. All negative values are
|
| + # accepted by validator and there are 127 of these.
|
| + # Consolidate them into one line.
|
| + if options.bitness == 64:
|
| + compressors.append(Compressor(
|
| + r'.*(?:81|83) (?:e4|e5) (80) (?:00 00 00 |) and \$0x(80),%r[bs]p.*()',
|
| + ('[80..ff]', '[80..ff]', ' # alignment and'),
|
| + [('{:02x}'.format(i), '{:02x}'.format(i), '')
|
| + for i in range(0x80, 0x100)]))
|
| + return compressors
|
| +
|
| +
|
| +def PrepareCompressors():
|
| + """ Return list of all compressors sorted from bigger ones to smaller ones """
|
| +
|
| + return tuple(sorted(
|
| + RegToRmCompressors() +
|
| + RmCompressors() +
|
| + OpcodeCompressors() +
|
| + MemoryNonMemoryCompressors() +
|
| + RexCompressors() +
|
| + SpecialCompressors(),
|
| + key=lambda compressor: -len(compressor.replacements)))
|
| +
|
| +
|
| +def ShowProgress(rule, instruction):
|
| + if rule not in ShowProgress.rules_shown:
|
| + first_print = True
|
| + ShowProgress.rules_shown[rule]=len(ShowProgress.rules_shown)
|
| + else:
|
| + first_print = False
|
| + print >> sys.stderr, '-------- Compressed --------'
|
| + print >> sys.stderr, 'Rule:', ShowProgress.rules_shown[rule]
|
| + print >> sys.stderr, '--------'
|
| + compressor = compressors[rule]
|
| + match = compressor.regex.match(instruction)
|
| + assert match
|
| + format_str = CompressionTemplate(instruction, match, '{{{}}}')
|
| + replacements = sorted(format_str.format(*replacement)
|
| + for replacement in compressor.replacements)
|
| + if len(compressor.replacements) <= 4 or first_print:
|
| + for replacement in replacements:
|
| + print >> sys.stderr, replacement
|
| + else:
|
| + print >> sys.stderr, replacements[0]
|
| + print >> sys.stderr, '...'
|
| + print >> sys.stderr, replacements[-1]
|
| + print >> sys.stderr, '--------'
|
| + print >> sys.stderr, 'Compressed', format_str.format(*compressor.subst)
|
| +ShowProgress.rules_shown = {}
|
| +
|
| +
|
| +def main():
|
| + # We are keeping these global to share state graph and compressors
|
| + # between workers spawned by multiprocess. Passing them every time is slow.
|
| + global options, xml_file
|
| + global dfa
|
| + global worker_validator
|
| + options, xml_file = ParseOptions()
|
| + dfa = dfa_parser.ParseXml(xml_file)
|
| + worker_validator = validator.Validator(
|
| + validator_dll=options.validator_dll,
|
| + decoder_dll=options.decoder_dll)
|
| + global compressors
|
| + compressors = PrepareCompressors()
|
| +
|
| + assert dfa.initial_state.is_accepting
|
| + assert not dfa.initial_state.any_byte
|
| +
|
| + print >> sys.stderr, len(dfa.states), 'states'
|
| +
|
| + num_suffixes = dfa_traversal.GetNumSuffixes(dfa.initial_state)
|
| +
|
| + # We can't just write 'num_suffixes[dfa.initial_state]' because
|
| + # initial state is accepting.
|
| + total_instructions = sum(
|
| + num_suffixes[t.to_state]
|
| + for t in dfa.initial_state.forward_transitions.values())
|
| + print >> sys.stderr, total_instructions, 'regular instructions total'
|
| +
|
| + tasks = dfa_traversal.CreateTraversalTasks(dfa.states, dfa.initial_state)
|
| + print >> sys.stderr, len(tasks), 'tasks'
|
| +
|
| + pool = multiprocessing.Pool()
|
| +
|
| + results = pool.imap(Worker, tasks)
|
| +
|
| + total = 0
|
| + num_valid = 0
|
| + full_output = set()
|
| + for prefix, count, valid_count, output, trace in results:
|
| + print >> sys.stderr, 'Prefix:', ', '.join(map(hex, prefix))
|
| + total += count
|
| + num_valid += valid_count
|
| + full_output |= output
|
| + for rule, instruction in trace:
|
| + ShowProgress(rule, instruction)
|
| + for instruction in sorted(Compressed(full_output,
|
| + compressors,
|
| + ShowProgress)):
|
| + print instruction
|
| +
|
| + print >> sys.stderr, total, 'instructions were processed'
|
| + print >> sys.stderr, num_valid, 'valid instructions'
|
| +
|
| +
|
| +if __name__ == '__main__':
|
| + main()
|
|
|