src/trusted/validator_ragel/compress_regular_instructions.py - Issue 49183002: Regular instructions golden file test.

Unified Diff: src/trusted/validator_ragel/compress_regular_instructions.py

Issue 49183002: Regular instructions golden file test. Base URL: svn://svn.chromium.org/native_client/trunk/src/native_client/

Patch Set: Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/trusted/validator_ragel/compress_regular_instructions.py

===================================================================

--- src/trusted/validator_ragel/compress_regular_instructions.py (revision 0)

+++ src/trusted/validator_ragel/compress_regular_instructions.py (revision 0)

@@ -0,0 +1,1463 @@

+# Use of this source code is governed by a BSD-style license that can be

+# found in the LICENSE file.

+"""

+Traverse the validator's DFA, collect all "normal" instructions and then

+compress output. Note: "anybyte fields" (immediates and displacements)

+are always filled with zeros. Otherwise processing of sextillions (sic!)

+of possibilities will take too long.

+Each rule is applied only when all variants are accepted by validator.

+The following compression rules are present:

+1. Compress ModR/M (+SIB & displacement).

+ Instruction: 00 00 add %al,(%rax)

+ ...

+ Instruction: 00 ff add %bh,%bh

+ becomes

+ Instruction: 00 XX add [%al..%bh],[%al..%bh or memory]

+1a. Compress ModR/M (+SIB & displacement) memory-only.

+ Instruction: f0 01 00 lock add %eax,(%eax)

+ ...

+ Instruction: f0 01 bf 00 00 00 00 lock add %edi,0x0(%edi)

+ becomes

+ Instruction: f0 01 XX lock add [%eax..edi],[memory]

+1b. Compress ModR/M register only.

+ Instruction: 66 0f 50 c0 movmskpd %xmm0,%eax

+ ...

+ Instruction: 66 0f 50 ff movmskpd %xmm7,%edi

+ becomes

+ Instruction: 66 0f 50 XX movmskpd [%xmm0..%xmm7],[%eax..edi]

+2. Compress ModR/M (+SIB & displacement) with opcode extension.

+ Instruction: 0f 90 00 seto (%eax)

+ ...

+ Instruction: 0f 90 c7 seto %bh

+ becomes

+ Instruction: 0f 90 XX/0 seto [%al..%bh or memory]

+2a. Compress ModR/M (+SIB & displacement) memory-only with opcode extension.

+ Instruction: f0 ff 00 lock incl (%eax)

+ ...

+ Instruction: f0 ff 84 ff 00 00 00 00 lock incl 0x0(%edi,%edi,8)

+ becomes

+ Instruction: f0 ff XX/0 lock incl [memory]

+2b. Compress ModR/M register-only with opcode extension.

+ Instruction: 0f 71 d0 00 psrlw $0x0,%mm0

+ ...

+ Instruction: 0f 71 d7 00 psrlw $0x0,%mm7

+ becomes

+ Instruction: 0f 71 XX/2 00 psrlw $0x0,[%mm0..%mm7]

+3. Compress register-in-opcode.

+ Instruction: d9 c0 fld %st(0)

+ ...

+ Instruction: d9 c7 fld %st(7)

+ becomes

+ Instruction: d9 c[0..7] fld [%st(0)..%st(7)]

+ Only applies if all possible register accesses are accepted by validator.

+4. Special compressor for "set" instruction.

+ Instruction: 0f 90 XX/0 seto [%al..%bh or memory]

+ ...

+ Instruction: 0f 90 XX/7 seto [%al..%bh or memory]

+ becomes

+ Instruction: 0f 90 XX seto [%al..%bh or memory]

+"""

+import itertools

+import multiprocessing

+import optparse

+import os

+import re

+import subprocess

+import sys

+import tempfile

+import traceback

+import dfa_parser

+import dfa_traversal

+import validator

+# Register names in 'natural' order (as defined by IA32/x86-64 ABI)

+# X86-64 ABI splits all registers in groups of 8 because it uses 3-bit field

+# in opcode, ModR/M, and/or SIB bytes to encode them.

+# In most cases there are 16 registers of a given kind and two such groups,

+# but there are couple of exceptions:

+# 1. There are 20 8-bit registers and three groups (two of them overlap)

+# 2. There are eight X87 and MMX registers thus two groups are identical

+# We use typical register from a group to name the whole group. Most groups

+# use first register, but 'spl' group uses fifth register because its first

+# four registers are the same as 'al' group. We use mnemonic name 'mmalt'

+# to represent the "evil mirror" of the 'mm0' group.

# Register groups, keyed by the name of a representative register of the
# group.  Every group lists its eight members in encoding order (index N in
# a list corresponds to the value N of a 3-bit register field).
REGISTERS = {
    'al': ['al', 'cl', 'dl', 'bl', 'ah', 'ch', 'dh', 'bh'],
    'spl': ['al', 'cl', 'dl', 'bl', 'spl', 'bpl', 'sil', 'dil'],
    'ax': ['ax', 'cx', 'dx', 'bx', 'sp', 'bp', 'si', 'di'],
    'eax': ['eax', 'ecx', 'edx', 'ebx', 'esp', 'ebp', 'esi', 'edi'],
    'rax': ['rax', 'rcx', 'rdx', 'rbx', 'rsp', 'rbp', 'rsi', 'rdi'],
    'r8b': ['r{}b'.format(N) for N in range(8, 16)],
    'r8w': ['r{}w'.format(N) for N in range(8, 16)],
    'r8d': ['r{}d'.format(N) for N in range(8, 16)],
    'r8': ['r{}'.format(N) for N in range(8, 16)],
    'mm0': ['mm{}'.format(N) for N in range(8)],
    # 'mmalt' has the same members as 'mm0'; it is a separate key so that
    # both groups can be referred to independently.
    'mmalt': ['mm{}'.format(N) for N in range(8)],
    'st(0)': ['st({})'.format(N) for N in range(8)],
    'xmm0': ['xmm{}'.format(N) for N in range(8)],
    'xmm8': ['xmm{}'.format(N) for N in range(8, 16)],
    'ymm0': ['ymm{}'.format(N) for N in range(8)],
    'ymm8': ['ymm{}'.format(N) for N in range(8, 16)],
}

# Opcode byte used as filler when padding an instruction to a full bundle.
NOP = 0x90


def PadToBundleSize(bytes):
    """Return `bytes` followed by NOPs up to validator.BUNDLE_SIZE entries.

    Args:
      bytes: list of byte values; must not be longer than a bundle.
    Returns:
      New list of exactly validator.BUNDLE_SIZE byte values.
    """
    missing = validator.BUNDLE_SIZE - len(bytes)
    assert missing >= 0
    return bytes + [NOP] * missing

+# In x86-64 mode we have so-called 'restricted register' which is used to

+# tie two groups together. Some instructions require particular value to

+# be stored in this variable, while some accept any non-special restricted

+# register (%ebp and %esp are special because they can only be accepted by

+# a few 'special' instructions).

+# You can find more details in the "NaCl SFI model on x86-64 systems" manual.

+# We try to feed all possible 'restricted registers' into validator and then

+# classify the instruction using this map. If set of acceptable 'restricted

+# registers' is not here, then it's an error in validator.

# Map from the bitmask of accepted initial 'restricted register' values to
# the note describing it.  Bit N of the key stands for the N-th value tried
# by ValidateInstruction (bits 0..15 carry a register name each, bit 16 is
# the extra "None" entry appended to the list of registers).
ACCEPTABLE_X86_64_INPUTS = {
    0x00001: 'input_rr=%eax',
    0x00002: 'input_rr=%ecx',
    0x00004: 'input_rr=%edx',
    0x00008: 'input_rr=%ebx',
    0x00010: 'input_rr=%esp',
    0x00020: 'input_rr=%ebp',
    0x00040: 'input_rr=%esi',
    0x00080: 'input_rr=%edi',
    0x00100: 'input_rr=%r8d',
    0x00200: 'input_rr=%r9d',
    0x00400: 'input_rr=%r10d',
    0x00800: 'input_rr=%r11d',
    0x01000: 'input_rr=%r12d',
    0x02000: 'input_rr=%r13d',
    0x04000: 'input_rr=%r14d',
    0x08000: 'input_rr=%r15d',
    # Every bit except the %esp (0x10) and %ebp (0x20) ones.
    0x1ffcf: 'input_rr=any_nonspecial',
}

+# Any instruction must produce either None or one of fifteen registers as an

+# output 'restricted register' value. 'r15d' is NOT acceptable as an output.

# '%'-prefixed names of the 'eax' group followed by the 'r8d' group without
# its last member (fifteen names in total).
ACCEPTABLE_X86_64_OUTPUT_REGISTERS = tuple(
    '%' + name for name in REGISTERS['eax'] + REGISTERS['r8d'][:-1])

def ValidateInstruction(instruction, validator_inst):
    """Validate one instruction padded with NOPs to a full bundle.

    Args:
      instruction: list of byte values of the instruction.
      validator_inst: object providing ValidateChunk and
          ValidateAndGetFinalRestrictedRegister.
    Returns:
      (valid, notes) pair.  In 32-bit mode notes is always empty.  In 64-bit
      mode an accepted instruction gets two notes: the 'input_rr=...'
      description of accepted initial restricted registers and
      'output_rr=...' with the final restricted register (or None).
    Raises:
      KeyError: the set of accepted initial restricted registers is not
          listed in ACCEPTABLE_X86_64_INPUTS.
      IndexError: the final restricted register is out of range for
          ACCEPTABLE_X86_64_OUTPUT_REGISTERS.
    """
    bundle = ''.join(chr(byte) for byte in PadToBundleSize(instruction))

    if options.bitness == 32:
        return validator_inst.ValidateChunk(bundle, bitness=32), []

    accepted_mask = 0
    known_final_rr = None
    # Note that iteration order is aligned with ACCEPTABLE_X86_64_INPUTS:
    # bit N of the mask corresponds to the N-th initial value tried.
    for bit, initial_rr in enumerate(validator.ALL_REGISTERS + [None]):
        valid, final_rr = validator_inst.ValidateAndGetFinalRestrictedRegister(
            bundle, len(instruction), initial_rr)
        if not valid:
            continue
        # final_rr should not depend on input_rr
        assert accepted_mask == 0 or known_final_rr == final_rr
        accepted_mask |= 1 << bit
        known_final_rr = final_rr

    # If nothing is accepted then instruction is not valid.
    if accepted_mask == 0:
        return False, []

    if known_final_rr is None:
        output_rr = None
    else:
        output_rr = ACCEPTABLE_X86_64_OUTPUT_REGISTERS[known_final_rr]

    return True, [ACCEPTABLE_X86_64_INPUTS[accepted_mask],
                  'output_rr={}'.format(output_rr)]

class WorkerState(object):
    """Accumulates validated, textually normalized instructions of a worker."""

    def __init__(self, prefix, validator):
        # `prefix` is accepted but not stored.
        self.validator = validator
        self.total_instructions = 0
        self.num_valid = 0
        self.output = set()
        self.trace = []

    def ReceiveInstruction(self, bytes):
        """Validate one instruction and record its normalized disassembly.

        Args:
          bytes: list of byte values of the instruction.
        """
        self.total_instructions += 1
        result, notes = ValidateInstruction(bytes, self.validator)
        if not result:
            return

        self.num_valid += 1
        dis = self.validator.DisassembleChunk(
            ''.join(chr(byte) for byte in bytes),
            bitness=options.bitness)
        # Strip the 'Instruction(0xN: ' wrapper from every disassembled line.
        for line_nr, line in enumerate(dis):
            text = str(line)
            assert text[0:17] == 'Instruction(0x' + str(line_nr) + ': '
            assert text[-1:] == ')'
            dis[line_nr] = text[17:-1]

        first = dis[0]
        # If %rip is involved then comment will be different depending on
        # the instruction length. Eliminate it.
        if '(%rip)' in first:
            first = re.sub(' # 0x[ ]*[0-9a-fA-F]*', '', first)
        # Zero displacements are represented as 0x0 for all instructions
        # except jumps where they disassembled as non-zero due to
        # %eip/%rip-relative addressing. We replace this displacement with
        # %eip/%rip to simplify compression.
        if ' 0x' in first and ' 0x0' not in first:
            if options.bitness == 32:
                substitute = '\\1%eip\\2'
            else:
                substitute = '\\1%rip\\2'
            for length in xrange(1, 16):
                pattern = ('((?:[0-9a-fA-F][0-9a-fA-F] ){' + str(length) +
                           '} .* )' + hex(length) + '(.*)')
                first = re.sub(pattern, substitute, first)

        dis[0] = 'Instruction: ' + first
        dis += notes
        self.output.add('; '.join(dis))

    def RecordTrace(self, compressor_nr, instruction):
        """Remember that compressor `compressor_nr` fired on `instruction`."""
        self.trace.append((compressor_nr, instruction))

+# Compressor has three slots: regex (which picks apart given instruction),

+# subst (which is used to denote compressed version) and replacements (which

+# are used to generate set of instructions from a given code).

+# Example compressor:

+# regex = '.*?[0-9a-fA-F]([0-7]) \\w* (%e(?:[abcd]x|[sb]p|[sd]i)).*()'

+# subst = ('[0-7]', '[%eax..%edi]', ' # register in opcode')

+# replacements = ((0, '%eax'), (1, '%ecx'), (2, '%edx'), (3, '%ebx')

+# (4, '%esp'), (5, '%ebp'), (6, '%esi'), (7, '%edi'))

+# When faced with instruction '40 inc %eax' it will capture the following

+# pieces of said instruction: '4[0] inc [%eax]'.

+# Then it will produce the following eight instructions:

+# '40 inc %eax'

+# '41 inc %ecx'

+# '42 inc %edx'

+# '43 inc %ebx'

+# '44 inc %esp'

+# '45 inc %ebp'

+# '46 inc %esi'

+# '47 inc %edi'

+# If all these instructions can be found in a set of instructions then

+# compressor will remove them from said set and will insert one replacement

+# "compressed instruction" '4[0-7] inc [%eax..%edi] # register in opcode'.

+# Note that last group is only used in the replacement. It's used to grab marks

+# added by previous compressors and to replace them with a new mark.

class Compressor(object):
    """One compression rule.

    Attributes (slots):
      regex: compiled regular expression which picks apart an instruction.
      subst: values substituted into the template to build the compressed
          instruction.
      replacements: list of value tuples used to generate the full set of
          instructions covered by the rule.
    """

    __slots__ = [
        'regex',
        'subst',
        'replacements'
    ]

    def __init__(self, regex, subst, replacements=None):
        if replacements is None:
            replacements = []
        self.regex = re.compile(regex)
        self.subst = subst
        self.replacements = replacements

def CompressionTemplate(instruction, match, mark):
    """Replace all match groups but the last one with the mark.

    Args:
      instruction: the string `match` was produced from.
      match: regex match object; its last group only delimits the end of
          the kept text.
      mark: text put in place of every group except the last.
    Returns:
      `instruction` up to the start of the last group, with the span of
      every earlier group replaced by `mark`.
    """
    last_group = len(match.groups())
    pieces = []
    cursor = 0
    for group in range(1, last_group):
        pieces.append(instruction[cursor:match.start(group)])
        pieces.append(mark)
        cursor = match.end(group)
    pieces.append(instruction[cursor:match.start(last_group)])
    return ''.join(pieces)

def CompressOneMatch(instructions, instruction, match, compressor):
    """Try to apply one compressor to one matched instruction.

    Args:
      instructions: set of instruction strings (updated in place on success).
      instruction: the instruction `match` was produced from.
      match: regex match of compressor.regex against `instruction`.
      compressor: compressor supplying `replacements` and `subst`.
    Returns:
      (applied, instructions) pair.  `applied` is True only when every
      expansion generated from compressor.replacements is in the set; then
      the expansions are removed and the compressed form is added.
    """
    template = CompressionTemplate(instruction, match, '{}')
    expanded = set()
    for values in compressor.replacements:
        candidate = template.format(*values)
        if candidate not in instructions:
            return (False, instructions)
        expanded.add(candidate)
    instructions -= expanded
    # The extra '{}' receives the last element of subst (the new mark).
    instructions.add((template + '{}').format(*compressor.subst))
    return (True, instructions)

def CompressOneInstruction(instructions, compressors, split, cache):
    """Apply the first compressor that fits any instruction.

    Instructions greater than `split` are tried first (in sorted order),
    then the ones smaller than `split`.

    Args:
      instructions: set of instruction strings.
      compressors: list of compressors, in priority order.
      split: instruction string the scan starts after.
      cache: dict from instruction to the list of
          (compressor_nr, match, compressor) triples which matched it;
          filled in for instructions no compressor could be applied to.
    Returns:
      (instructions, compressor_nr, instruction) for the applied compressor
      or (instructions, False, False) when nothing could be compressed.
    """
    after_split = sorted(i for i in instructions if i > split)
    before_split = sorted(i for i in instructions if i < split)
    for instruction in after_split + before_split:
        matching = cache.get(instruction)
        if matching is not None:
            # Regexes were already tried on this instruction: only retry
            # the compressors which are known to match it.
            for compressor_nr, match, compressor in matching:
                result, instructions = CompressOneMatch(
                    instructions, instruction, match, compressor)
                if result:
                    return (instructions, compressor_nr, instruction)
            continue

        matching = []
        for compressor_nr, compressor in enumerate(compressors):
            match = compressor.regex.match(instruction)
            if not match:
                continue
            matching.append((compressor_nr, match, compressor))
            result, instructions = CompressOneMatch(
                instructions, instruction, match, compressor)
            if result:
                return (instructions, compressor_nr, instruction)
        cache[instruction] = matching
    return (instructions, False, False)

def Compressed(instructions, compressors, show_progress):
    """Apply compressors repeatedly until none of them can be applied.

    Args:
      instructions: set of instruction strings.
      compressors: list of compressors, in priority order.
      show_progress: callback invoked as show_progress(rule, instruction)
          after every successful compression step.
    Returns:
      The compressed set of instructions.
    """
    cache = {}
    instructions, rule, split = CompressOneInstruction(
        instructions, compressors, '', cache)
    # `rule` may legitimately be 0, thus the identity test against False.
    while rule is not False:
        show_progress(rule, split)
        instructions, rule, split = CompressOneInstruction(
            instructions, compressors, split, cache)
    return instructions

def Worker(task):
    """Traverse one DFA subtree and compress the instructions found there.

    Args:
      task: (prefix, state_index) pair - bytes leading to the subtree and
          index of its root in dfa.states.  Passed as a single tuple; it is
          unpacked here rather than in the signature because tuple
          parameters are not valid syntax in Python 3.
    Returns:
      (prefix, total_instructions, num_valid, output, trace) tuple.
    """
    prefix, state_index = task
    worker_state = WorkerState(prefix, worker_validator)
    try:
        dfa_traversal.TraverseTree(
            dfa.states[state_index],
            final_callback=worker_state.ReceiveInstruction,
            prefix=prefix,
            anyfield=0)
        if prefix[0] != 0x0f or prefix[1] != 0x0f:  # Skip 3DNow! instructions
            worker_state.output = Compressed(set(worker_state.output),
                                             compressors,
                                             worker_state.RecordTrace)
    except Exception:
        traceback.print_exc()  # because multiprocessing imap swallows traceback
        raise
    return (
        prefix,
        worker_state.total_instructions,
        worker_state.num_valid,
        worker_state.output,
        worker_state.trace)

def ParseOptions():
    """Parse the command line.

    Returns:
      (options, xml_file) pair; options.bitness is converted to int.
    Exits with a usage error (via parser.error) if --bitness is missing or
    if the number of positional arguments is not exactly one.
    """
    parser = optparse.OptionParser(usage='%prog [options] xmlfile')
    parser.add_option('--bitness',
                      choices=['32', '64'],
                      help='The subarchitecture: 32 or 64')
    parser.add_option('--validator_dll',
                      help='Path to librdfa_validator_dll')
    parser.add_option('--decoder_dll',
                      help='Path to librdfa_decoder_dll')
    options, args = parser.parse_args()
    # Without this check int(None) below would die with a bare TypeError.
    if options.bitness is None:
        parser.error('specify --bitness (32 or 64)')
    options.bitness = int(options.bitness)
    if len(args) != 1:
        parser.error('specify one xml file')
    (xml_file, ) = args
    return options, xml_file

+# Version suitable for use in regular expressions

# Same groups as REGISTERS, but with names usable inside regular
# expressions: the parentheses of the X87 register names are escaped.
REGISTERS_RE = REGISTERS.copy()
REGISTERS_RE['st(0)'] = ['st\\({}\\)'.format(N) for N in range(8)]
# The group is reachable under its escaped name as well.
REGISTERS_RE['st\\(0\\)'] = REGISTERS_RE['st(0)']

+# Index names in 'natural' order (as defined by IA32/x86-64 ABI)

# Index register groups, keyed like REGISTERS.  Position 4 of the 'eax' and
# 'rax' groups holds the 'eiz'/'riz' pseudo-register name.
INDEXES = {
    'eax': ['eax', 'ecx', 'edx', 'ebx', 'eiz', 'ebp', 'esi', 'edi'],
    'rax': ['rax', 'rcx', 'rdx', 'rbx', 'riz', 'rbp', 'rsi', 'rdi'],
    'r8': ['r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15'],
}

+# Registers (in all incarnations) which can be used as base in 64-bit mode

# All width variants of the stack pointer, the frame pointer and %r15, plus
# %rip.  InstructionIsDangerous rejects memory accesses whose base is not in
# this set and protected writes whose output is in it.
X86_64_BASE_REGISTERS = {
    '%spl', '%sp', '%esp', '%rsp',
    '%bpl', '%bp', '%ebp', '%rbp',
    '%r15b', '%r15w', '%r15d', '%r15',
    '%rip',
}

def InstructionIsDangerous(input, output, register_write,
                           writes_to, memory_accessed=False,
                           base_text='%riz', index_text='%riz'):
    """Check if instruction with given replacements will be dangerous.

    Args:
      input: input argument
      output: output argument
      register_write: three-state selector
          'sandbox' - instruction can be used to produce "restricted
                      register"
          'protect' - instruction can damage output, protect "special
                      registers"
          'ignore' - instruction does not affect its operands (e.g. test)
                     or is used with non-GP registers (X87, MMX, XMM, etc)
      writes_to: 'reg', 'rm' or 'both'; only 'both' is examined here (then
          `input` is checked against the special registers as well)
      memory_accessed: True if instruction accesses memory
      base_text: base register (only examined if memory is accessed)
      index_text: index register (only examined if memory is accessed)
    Returns:
      True if instruction should be rejected by validator
    """
    special = X86_64_BASE_REGISTERS
    # Memory may only be addressed through a special base register, and the
    # index may not be one of them (with the exception of %r15).
    if memory_accessed and (
            base_text not in special or
            index_text in special - set(['%r15'])):
        return True
    return ((register_write == 'protect' and output in special) or
            (register_write == 'sandbox' and output == '%r15d') or
            (writes_to == 'both' and input in special))

def AppendOperandsReplacement(replacement, rm_text, reg, modrm, writes_to):
    """Append operand texts of one ModR/M byte to a replacement list.

    Args:
      replacement: replacement list (extended in place)
      rm_text: replacement for rm field
      reg: register kind (or None if reg field is used as opcode extension)
      modrm: modrm byte
      writes_to: three-state selector
          'reg' - instruction uses rm as source, reg as destination
          'rm' - instruction uses reg as source, rm as destination
          'both' - instruction writes to both reg and rm
    Returns:
      (input, output) pair of textual operand representations; input is
      None when reg is None.
    Side-effect:
      output (if reg is None) or input and output (if reg is not None) are
      added to the replacement list.
    """
    if reg is None:
        # Without a reg operand the only possible destination is rm.
        assert writes_to == 'rm'
        replacement.append(rm_text)
        return None, rm_text

    reg_text = '%' + REGISTERS[reg][(modrm >> 3) & 0x07]
    if writes_to == 'reg':
        operands = (rm_text, reg_text)
    else:  # rm, both
        operands = (reg_text, rm_text)
    replacement.extend(operands)
    return operands

def ModRMRegisterReplacements(rm, reg=None, writes_to='rm', opcode_bits=0,
                              register_write='ignore'):
    """Creates replacement tuples list for register-to-register instructions.

    Results are memoized in ModRMRegisterReplacements.replacements.

    Args:
      rm: rm operand kind (see REGISTERS array)
      reg: reg operand kind (see REGISTERS array) or None if reg is not used
      writes_to: three-state selector
          'reg' - instruction uses rm as source, reg as destination
          'rm' - instruction uses reg as source, rm as destination
          'both' - instruction writes to both reg and rm
      opcode_bits: opcode extensions code (used when reg is None)
      register_write: three-state selector
          'sandbox' - instruction can be used to produce "restricted
                      register"
          'protect' - instruction can damage output, protect "special
                      registers"
          'ignore' - instruction does not affect its operands (e.g. test)
                     or is used with non-GP registers (X87, MMX, XMM, etc)
    Returns:
      Tuple of replacement tuples
    """
    # Reg field can be used either as reg or as opcode extension, but not both
    assert reg is None or opcode_bits == 0

    memo = ModRMRegisterReplacements.replacements
    output_key = (options.bitness, reg, rm, writes_to, opcode_bits,
                  register_write)
    try:
        return memo[output_key]
    except KeyError:
        pass

    # Two upper bits of ModR/M byte (mod field) must be equal to 11
    # This gives us range from 0xc0 to 0xff but we are going from the
    # end to make rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
    byte_range = range(0xff, 0xbf, -1)
    if reg is None:
        # reg field is used as opcode extension
        byte_range = [byte for byte in byte_range
                      if (byte >> 3) & 0x7 == opcode_bits]

    collected = []
    for modrm in byte_range:
        rm_text = '%' + REGISTERS[rm][modrm & 0x07]
        replacement = ['{:02x}'.format(modrm)]
        input, output = AppendOperandsReplacement(
            replacement, rm_text, reg, modrm, writes_to)
        if options.bitness == 64:
            replacement.append('any_nonspecial')  # input_rr
            if register_write == 'sandbox':
                replacement.append(output)  # output_rr
            else:
                replacement.append(None)
            if InstructionIsDangerous(input, output, register_write,
                                      writes_to):
                continue
        collected.append(tuple(replacement))

    memo[output_key] = tuple(collected)
    return memo[output_key]


ModRMRegisterReplacements.replacements = {}

def BaseOnlyMemoryOperand(modrm, base):
    """Describes the memory operand of a ModR/M byte without a SIB byte.

    Displacement bytes, if any, are rendered as zeros.

    Args:
      modrm: modrm byte
      base: register kind for base
    Returns:
      bytes_text: replacement for "bytes" group
      rm_text: textual representation of "rm" argument
      base_text: textual representation of "base" register
    """
    mod_field = (modrm >> 6) & 0x03
    rm_field = modrm & 0x07
    base_text = '%' + REGISTERS[base][rm_field]

    if mod_field == 0:
        if rm_field == validator.REG_RBP:
            # If RM field == %rbp and MOD field is zero then it's absolute
            # address in 32-bit mode and %rip-based address in 64-bit mode
            is_32bit = options.bitness == 32
            return ('{:02x} 00 00 00 00'.format(modrm),
                    '0x0' if is_32bit else '0x0(%rip)',
                    '%eiz' if is_32bit else '%rip')
        # Memory access with just a base register
        return '{:02x}'.format(modrm), '({})'.format(base_text), base_text

    if mod_field == 1:
        # Memory access with base and 8bit offset
        bytes_text = '{:02x} 00'.format(modrm)
    else:
        # Memory access with base and 32bit offset
        bytes_text = '{:02x} 00 00 00 00'.format(modrm)
    return bytes_text, '0x0({})'.format(base_text), base_text

def SIBMemoryOperand(modrm, sib, base, index):
    """Describes the memory operand of a ModR/M byte followed by a SIB byte.

    Displacement bytes, if any, are rendered as zeros.

    Args:
      modrm: modrm byte (mod field must be 0, 1 or 2)
      sib: sib byte
      base: register kind for base (see REGISTERS array)
      index: register kind for index (see INDEXES array)
    Returns:
      bytes_text: replacement for "bytes" group
      rm_text: textual representation of "rm" argument
      base_text: textual representation of "base" register
      index_text: textual representation of "index" register
    Raises:
      ValueError: mod field of modrm is 3 (register operand, no SIB byte).
    """
    mod_field = (modrm >> 6) & 0x03
    scale_field = (sib >> 6) & 0x03
    index_field = (sib >> 3) & 0x07
    base_field = (sib & 0x07)
    index_text = '%' + INDEXES[index][index_field]
    base_text = '%' + REGISTERS[base][base_field]
    scale_text = str(1 << scale_field)

    # If BASE is %rbp and MOD == 0 then index with 32bit offset is used
    if mod_field == 0 and base_field == validator.REG_RBP:
        bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
        # In 64-bit mode this case is displayed as simple absolute address
        if (options.bitness == 64 and
                index_text == '%riz' and
                scale_text == '1'):
            rm_text = '0x0'
        else:
            rm_text = '0x0(,{},{})'.format(index_text, scale_text)
        # There are no base in this case
        base_text = '%eiz' if options.bitness == 32 else '%riz'
    # Memory access with base and index (no offset)
    elif mod_field == 0:
        bytes_text = '{:02x} {:02x}'.format(modrm, sib)
        rm_text = '({},{},{})'.format(base_text, index_text, scale_text)
    # Memory access with base, index and 8bit offset
    elif mod_field == 1:
        bytes_text = '{:02x} {:02x} 00'.format(modrm, sib)
        rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
    # Memory access with base, index and 32bit offset
    elif mod_field == 2:
        bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
        rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
    else:
        # Previously this fell through with bytes_text/rm_text unbound.
        raise ValueError(
            'modrm {:02x} denotes a register operand'.format(modrm))

    # Pretty-printing of access via %rsp (or %r12)
    if (base_field == validator.REG_RSP and
            index_text in ('%eiz', '%riz') and
            scale_text == '1'):
        if mod_field == 0:  # no offset
            rm_text = '({})'.format(base_text)
        else:  # 8-bit or 32-bit offset
            rm_text = '0x0({})'.format(base_text)
    return bytes_text, rm_text, base_text, index_text

def ModRMMemoryReplacements(reg=None, writes_to='rm', opcode_bits=0,
                            memory_accessed=True, register_write='ignore',
                            base_r8=False, index_r8=False):
    """Creates replacement tuples list for register-to-memory instructions.

    Results are memoized in ModRMMemoryReplacements.replacements.

    Args:
      reg: reg operand kind (see REGISTERS array) or None if reg is not used
      writes_to: three-state selector
          'reg' - instruction uses rm as source, reg as destination
          'rm' - instruction uses reg as source, rm as destination
          'both' - instruction writes to both reg and rm
      opcode_bits: opcode extensions code (used when reg is None)
      memory_accessed: True if instruction accesses memory
      register_write: three-state selector
          'sandbox' - instruction can be used to produce "restricted
                      register"
          'protect' - instruction can damage output, protect "special
                      registers"
          'ignore' - instruction does not affect its operands (e.g. test)
                     or is used with non-GP registers (X87, MMX, XMM, etc)
      base_r8: True selects the 'r8' group for the base register in 64-bit
          mode (the 'rax' group otherwise)
      index_r8: True if REX.X bit in the instruction set to 1
    Returns:
      Tuple of replacement tuples
    """
    # Reg field can be used either as reg or as opcode extension, but not both
    assert reg is None or opcode_bits == 0

    output_key = (options.bitness, reg, writes_to, opcode_bits,
                  base_r8, index_r8, memory_accessed, register_write)
    if output_key in ModRMMemoryReplacements.replacements:
        return ModRMMemoryReplacements.replacements[output_key]

    if options.bitness == 32:
        base = 'eax'
        index = 'eax'
    else:
        base = 'r8' if base_r8 else 'rax'
        index = 'r8' if index_r8 else 'rax'

    replacements = []

    # Two upper bits of ModR/M byte (mod field) must be equal to 00, 01, or 10
    # This gives us range from 0x00 to 0xbf but we are going from the end to
    # make rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
    if reg is None:
        # reg field is used as opcode extension
        byte_range = [byte
                      for byte in range(0xbf, -1, -1)
                      if (byte >> 3) & 0x7 == opcode_bits]
    else:
        byte_range = range(0xbf, -1, -1)

    for modrm in byte_range:
        # If RM field != %rsp then there are no SIB byte
        if (modrm & 0x07) != validator.REG_RSP:
            bytes_text, rm_text, base_text = BaseOnlyMemoryOperand(modrm, base)
            replacement = [bytes_text]
            input, output = AppendOperandsReplacement(
                replacement, rm_text, reg, modrm, writes_to)
            if options.bitness == 64:
                replacement.append('any_nonspecial')
                # writes_to == 'reg' means that output is a register
                if writes_to == 'reg' and register_write == 'sandbox':
                    replacement.append(output)
                else:
                    # xchg with memory can not be used to sandbox its
                    # operand, only instruction which explicitly writes to
                    # reg operand can do that
                    replacement.append(None)
                if InstructionIsDangerous(input, output, register_write,
                                          writes_to, memory_accessed,
                                          base_text):
                    continue
            replacements.append(tuple(replacement))
        else:
            # If RM field == %rsp then we have SIB byte
            for sib in xrange(0x100):
                bytes_text, rm_text, base_text, index_text = SIBMemoryOperand(
                    modrm, sib, base, index)
                replacement = [bytes_text]
                input, output = AppendOperandsReplacement(
                    replacement, rm_text, reg, modrm, writes_to)
                if options.bitness == 64:
                    if not memory_accessed or index_text == '%riz':
                        replacement.append('any_nonspecial')
                    elif index_r8:
                        # Convert %r8 to %r8d, %r9 to %r9d, etc
                        replacement.append(index_text + 'd')
                    else:
                        # Convert %rax to %eax, %rsp to %esp, etc
                        replacement.append('%e' + index_text[2:])
                    # writes_to == 'reg' means that output is a register
                    if writes_to == 'reg' and register_write == 'sandbox':
                        replacement.append(output)
                    else:
                        # xchg with memory can not be used to sandbox its
                        # operand, only instruction which explicitly writes
                        # to reg operand can do that
                        replacement.append(None)
                    if InstructionIsDangerous(input, output, register_write,
                                              writes_to, memory_accessed,
                                              base_text, index_text):
                        continue
                replacements.append(tuple(replacement))

    ModRMMemoryReplacements.replacements[output_key] = tuple(replacements)
    return ModRMMemoryReplacements.replacements[output_key]


ModRMMemoryReplacements.replacements = {}

+def PrepareCompressors():

+ global compressors

+ global main_compressors

+ global register_compressors

+ global memory_compressors

+ # "Larger" compressors should be tried first, then "smaller" ones.

+ main_compressors = []

+ register_compressors = []

+ memory_compressors = []

+ extra_compressors = []

+ # Map from "REX bit off" group of registers to "REX bit on" group of registers

+ r8 = {

+ 'al': 'r8b',

+ 'ax': 'r8w',

+ 'eax': 'r8d',

+ 'rax': 'r8',

+ 'mm0': 'mmalt',

+ 'xmm0': 'xmm8',

+ 'ymm0': 'ymm8'

+ }

+ if options.bitness == 32:

+ register_kinds = ('al', 'ax', 'eax', 'mm0', 'xmm0', 'ymm0')

+ register_kind_pairs = (

+ ( 'al', 'al'),

+ ( 'ax', 'al'),

+ ( 'ax', 'ax'),

+ ( 'eax', 'al'),

+ ( 'eax', 'ax'),

+ ( 'eax', 'eax'),

+ ( 'eax', 'mm0'),

+ ( 'mm0', 'eax'),

+ ( 'eax', 'xmm0'),

+ ('xmm0', 'eax'),

+ ( 'mm0', 'mm0'),

+ ( 'mm0', 'xmm0'),

+ ('xmm0', 'mm0'),

+ ('xmm0', 'xmm0'),

+ ('xmm0', 'ymm0'),

+ ('ymm0', 'xmm0'),

+ ('ymm0', 'ymm0')

+ )

+ else:

+ register_kinds = ('al', 'spl', 'ax', 'eax', 'rax', 'mm0', 'xmm0', 'ymm0',

+ 'r8b', 'r8w', 'r8d', 'r8', 'mmalt', 'xmm8', 'ymm8')

+ register_kind_pairs = (

+ ( 'al', 'al'),

+ ( 'spl', 'spl'), ( 'spl', 'r8b'), ( 'r8b', 'spl'), ( 'r8b', 'r8b'),

+ ( 'ax', 'al'),

+ ( 'ax', 'spl'), ( 'ax', 'r8b'), ( 'r8w', 'spl'), ( 'r8w', 'r8b'),

+ ( 'ax', 'ax'), ( 'ax', 'r8w'), ( 'r8w', 'ax'), ( 'r8w', 'r8w'),

+ ( 'eax', 'al'),

+ ( 'eax', 'spl'), ( 'eax', 'r8b'), ( 'r8d', 'spl'), ( 'r8d', 'r8b'),

+ ( 'eax', 'ax'), ( 'eax', 'r8w'), ( 'r8d', 'ax'), ( 'r8d', 'r8w'),

+ ( 'eax', 'eax'), ( 'eax', 'r8d'), ( 'r8d', 'eax'), ( 'r8d', 'r8d'),

+ ( 'rax', 'al'),

+ ( 'rax', 'spl'), ( 'rax', 'r8b'), ( 'r8', 'spl'), ( 'r8', 'r8b'),

+ ( 'rax', 'ax'), ( 'rax', 'r8w'), ( 'r8', 'ax'), ( 'r8', 'r8w'),

+ ( 'rax', 'eax'), ( 'rax', 'r8d'), ( 'r8', 'eax'), ( 'r8', 'r8d'),

+ ( 'rax', 'rax'), ( 'rax', 'r8'), ( 'r8', 'rax'), ( 'r8', 'r8'),

+ ( 'eax', 'mm0'), ( 'eax','mmalt'), ( 'r8d', 'mm0'), ( 'eax', 'mmalt'),

+ ( 'rax', 'mm0'), ( 'rax','mmalt'), ( 'r8', 'mm0'), ( 'r8', 'mmalt'),

+ ( 'mm0', 'eax'), ('mmalt', 'eax'), ( 'mm0', 'r8d'), ('mmalt', 'r8d'),

+ ( 'mm0', 'rax'), ('mmalt', 'rax'), ( 'mm0', 'r8'), ('mmalt', 'r8'),

+ ( 'eax', 'xmm0'), ( 'eax', 'xmm8'), ( 'r8d', 'xmm0'), ( 'r8d', 'xmm8'),

+ ( 'rax', 'xmm0'), ( 'rax', 'xmm8'), ( 'r8', 'xmm0'), ( 'r8', 'xmm8'),

+ ('xmm0', 'eax'), ('xmm0', 'r8d'), ('xmm8', 'eax'), ('xmm8', 'r8d'),

+ ('xmm0', 'rax'), ('xmm0', 'r8'), ('xmm8', 'rax'), ('xmm8', 'r8'),

+ ( 'mm0', 'mm0'), ('mmalt', 'mm0'), ( 'mm0','mmalt'), ('mmalt','mmalt'),

+ ( 'mm0', 'xmm0'), ('mmalt','xmm0'), ( 'mm0', 'xmm8'), ('mmalt', 'xmm8'),

+ ('xmm0', 'mm0'), ('xmm8', 'mm0'), ('xmm0','mmalt'), ('xmm8', 'mmalt'),

+ ('xmm0', 'xmm0'), ('xmm0', 'xmm8'), ('xmm8', 'xmm0'), ('xmm8', 'xmm8'),

+ ('xmm0', 'ymm0'), ('xmm0', 'ymm8'), ('xmm8', 'ymm0'), ('xmm8', 'ymm8'),

+ ('ymm0', 'xmm0'), ('ymm0', 'xmm8'), ('ymm8', 'xmm0'), ('ymm8', 'xmm8'),

+ ('ymm0', 'ymm0'), ('ymm0', 'ymm8'), ('ymm8', 'ymm0'), ('ymm8', 'ymm8')

+ )

+ # Largest compressors: both reg and rm fields are used

+ for reg, rm in register_kind_pairs:

+ start_reg = REGISTERS[reg][0]

+ end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]

+ start_rm = REGISTERS[rm][0]

+ end_rm = REGISTERS[rm][-1 if rm[0:2] != 'r8' else -2]

+ instruction_kinds = [

+ # Normal instructions with two operands (rm to reg)

+ ({'writes_to':'reg'}, '', ' # rm to reg', ''),

+ # Normal instructions with two operands (reg to rm)

+ ({'writes_to':'rm'}, '', ' # reg to rm', '')

+ ]

+ # Lea in 64 bit mode is truly unique instruction for now

+ if options.bitness == 64 and reg in ('eax', 'r8d', 'rax', 'r8'):

+ instruction_kinds = [

+ ({'writes_to':'reg', 'memory_accessed':False,

+ 'register_write':'sandbox' if reg in ('eax', 'r8d') else 'protect'},

+ ' # lea', ' # rm to reg; lea', ' # lea')] + instruction_kinds

+ # There are few more forms in 64 bit case (rm to reg)

+ if options.bitness == 64 and reg in ('eax', 'r8d'):

+ # Zero-extending version.

+ instruction_kinds.append(

+ ({'writes_to':'reg', 'register_write':'sandbox'},

+ '', ' # rm to reg', ''))

+ # More forms in 64 bit case (reg to rm)

+ if options.bitness == 64 and rm in ('eax', 'r8d'):

+ # Zero-extending version.

+ instruction_kinds.append(

+ ({'writes_to':'rm', 'register_write':'sandbox'},

+ '', ' # reg to rm', ''))

+ # Zero-extending xchg/xadd

+ instruction_kinds.append(

+ ({'writes_to':'both', 'register_write':'sandbox'},

+ ' # write to both',

+ ' # reg to rm; write to both',

+ ' # write to both'))

+ # Still more forms for 64 bit case (rm to reg).

+ if options.bitness == 64 and reg in ('al', 'spl', 'ax', 'eax', 'rax',

+ 'r8b', 'r8w', 'r8d', 'r8'):

+ # Dangerous instructions (rm to reg)

+ instruction_kinds.append(

+ ({'writes_to':'reg', 'register_write':'protect'},

+ '', ' # rm to reg', ''))

+ # Still more forms for 64 bit case (reg to rm)

+ if options.bitness == 64 and rm in ('al', 'spl', 'ax', 'eax', 'rax',

+ 'r8b', 'r8w', 'r8d', 'r8'):

+ # Dangerous instructions (reg to rm)

+ instruction_kinds.append(

+ ({'writes_to':'rm', 'register_write':'protect'},

+ '', ' # reg to rm', ''))

+ # Dangerous xchg/xadd

+ instruction_kinds.append(

+ ({'writes_to':'both', 'register_write':'protect'},

+ ' # write to both',

+ ' # reg to rm; write to both',

+ ' # write to both'))

+ # 3DNow! instructions

+ instruction_kinds.append(

+ ({'writes_to':'reg', '3dnow':'yes'}, '', ' # rm to reg', ''))

+ for args, notes, notes_register, notes_memory in instruction_kinds:

+ regex = '(?: 00)*'

+ # Additional byte is opcode extension with 3DNow! instructions.

+ if '3dnow' in args:

+ regex = ' [0-9a-fA-F][0-9a-fA-F]'

+ args.pop('3dnow')

+ regex += ' (?:lock )?\\w* (?:\\$0x0,|\\$0x0,\\$0x0,|%cl,|%xmm0,)?'

+ # We only need to process ModR/M+SIB '04 04' or '04 07' here

+ if options.bitness == 32:

+ regex_mem = '\$%esp,%eax,1\$'

+ else:

+ regex_mem = '\$(?:%rsp|%r15),(?:%rax|%r8),1\$'

+ output = None

+ output_note = None

+ if args['writes_to'] == 'reg':

+ regex += '(%' + REGISTERS[rm][0] + '|' + regex_mem + ')'

+ regex += ',(%' + REGISTERS[reg][0] + ')'

+ if 'register_write' in args and args['register_write'] == 'sandbox':

+ assert reg in ('eax', 'r8d')

+ output = '%' + reg + '|None'

+ output_note = '[%eax..%edi]' if reg == 'eax' else '[%r8d..%r14d]'

+ subst = (

+ 'XX', '[%{}..%{} or memory]'.format(start_rm, end_rm),

+ '[%{}..%{}]'.format(start_reg, end_reg), notes)

+ subst_register = (

+ 'XX', '[%{}..%{}]'.format(start_rm, end_rm),

+ '[%{}..%{}]'.format(start_reg, end_reg), notes_register)

+ subst_memory = (

+ 'XX', '[memory]',

+ '[%{}..%{}]'.format(start_reg, end_reg), notes_memory)

+ else:

+ regex += '(%' + REGISTERS[reg][0] + ')'

+ regex += ',(%' + REGISTERS[rm][0] + '|' + regex_mem + ')'

+ if 'register_write'in args and args['register_write'] == 'sandbox':

+ assert rm in ('eax', 'r8d')

+ output = '%' + rm + '|None'

+ output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'

+ subst = (

+ 'XX', '[%{}..%{}]'.format(start_reg, end_reg),

+ '[%{}..%{} or memory]'.format(start_rm, end_rm), notes)

+ subst_register = (

+ 'XX', '[%{}..%{}]'.format(start_reg, end_reg),

+ '[%{}..%{}]'.format(start_rm, end_rm), notes_register)

+ subst_memory = (

+ 'XX', '[%{}..%{}]'.format(start_reg, end_reg),

+ '[memory]', notes_memory)

+ regex += '.*'

+ if options.bitness == 64:

+ regex += '; input_rr=(%eax|%r8d|any_nonspecial)'

+ regex += '; output_rr=({})'.format(output)

+ if 'memory_accessed' in args:

+ input_note = 'any_nonspecial'

+ input_note_r8 = 'any_nonspecial'

+ else:

+ input_note = '[%eax..%edi]'

+ input_note_r8 = '[%r8d..%r15d]'

+ subst_r8 = subst[0:-1] + (input_note_r8, output_note) + subst[-1:]

+ subst = subst[0:-1] + (input_note, output_note) + subst[-1:]

+ subst_memory_r8 = subst_memory[0:-1] + (

+ input_note_r8, output_note) + subst_memory[-1:]

+ subst_memory = subst_memory[0:-1] + (

+ input_note, output_note) + subst_memory[-1:]

+ subst_register = subst_register[0:-1] + (

+ 'any_nonspecial', output_note) + subst_register[-1:]

+ regex += '()'

+ base_r8 = rm in r8.values()

+ memory_replacement = ModRMMemoryReplacements(

+ reg=reg, base_r8=base_r8, **args)

+ memory_compressors.append(Compressor(

+ '.*?(04 0[47])' + regex, subst_memory, memory_replacement))

+ if options.bitness == 64:

+ memory_replacement_r8 = ModRMMemoryReplacements(

+ reg=reg, base_r8=base_r8, index_r8=True, **args)

+ memory_compressors.append(Compressor(

+ '.*?(04 0[47])' + regex, subst_memory_r8, memory_replacement_r8))

+ # Instructions with no memory access are instructions which are doing

+ # something with memory address (e.g. lea) and as such they don't have

+ # non-memory forms.

+ if not 'memory_accessed' in args:

+ register_replacement = ModRMRegisterReplacements(rm=rm, reg=reg, **args)

+ register_compressors.append(Compressor(

+ '.*?(c0)' + regex, subst_register, register_replacement))

+ main_replacement = register_replacement + memory_replacement

+ main_compressors.append(Compressor(

+ '.*?(04 0[47])' + regex, subst, main_replacement))

+ if options.bitness == 64:

+ main_replacement_r8 = register_replacement + memory_replacement_r8

+ main_compressors.append(Compressor(

+ '.*?(04 0[47])' + regex, subst_r8, main_replacement_r8))

+ # Smaller compressors: only rm field is used.

+ for rm in register_kinds:

+ start_rm = REGISTERS[rm][0]

+ end_rm = REGISTERS[rm][-1 if rm[0:2] != 'r8' else -2]

+ for opcode_bits in xrange(8):

+ XX_byte_mark = 'XX/' + str(opcode_bits)

+ instruction_kinds = [

+ # The most basic form

+ ({}, '', '', '')

+ ]

+ if options.bitness == 64:

+ # No memory access (e.g. prefetch)

+ instruction_kinds = [

+ ({'memory_accessed':False}, '', '', '')] + instruction_kinds

+ # More forms in 64 bit case.

+ if options.bitness == 64 and rm in ('eax', 'r8d'):

+ # Zero-extending version.

+ instruction_kinds.append(

+ ({'register_write':'sandbox'}, '', '', ''))

+ # Still more forms for 64 bit case (reg to rm).

+ if options.bitness == 64 and rm in ('al', 'spl', 'ax', 'eax', 'rax',

+ 'r8b', 'r8w', 'r8d', 'r8'):

+ # Dangerous instructions.

+ instruction_kinds.append(

+ ({'register_write':'protect'}, '', '', ''))

+ for args, notes, notes_register, notes_memory in instruction_kinds:

+ subst = (XX_byte_mark, '[%{}..%{} or memory]'.format(start_rm, end_rm),

+ notes)

+ subst_register = (XX_byte_mark, '[%{}..%{}]'.format(start_rm, end_rm),

+ notes_register)

+ subst_memory = (XX_byte_mark, '[memory]',

+ notes_memory)

+ regex = ('(?: 00)* (?:lock )?\\w* (?:\\$0x0,|%cl,)?'

+ '(%' + REGISTERS[rm][0] + '|' + regex_mem + ').*')

+ output = None

+ output_note = None

+ if options.bitness == 64:

+ if 'register_write' in args and args['register_write'] == 'sandbox':

+ assert rm in ('eax', 'r8d')

+ output = '%' + rm + '|None'

+ output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'

+ regex += '; input_rr=(%eax|%r8d|any_nonspecial)'

+ regex += '; output_rr=({})'.format(output)

+ if 'memory_accessed' in args:

+ input_note = 'any_nonspecial'

+ input_note_r8 = 'any_nonspecial'

+ else:

+ input_note = '[%eax..%edi]'

+ input_note_r8 = '[%r8d..%r15d]'

+ subst_r8 = subst[0:-1] + (input_note_r8, output_note) + subst[-1:]

+ subst = subst[0:-1] + (input_note, output_note) + subst[-1:]

+ subst_memory_r8 = subst_memory[0:-1] + (

+ input_note_r8, output_note) + subst_memory[-1:]

+ subst_memory = subst_memory[0:-1] + (

+ input_note, output_note) + subst_memory[-1:]

+ subst_register = subst_register[0:-1] + (

+ 'any_nonspecial', output_note) + subst_register[-1:]

+ regex += '()'

+ base_r8 = rm in r8.values()

+ memory_replacement = ModRMMemoryReplacements(

+ reg=None, base_r8=base_r8, opcode_bits=opcode_bits, **args)

+ memory_compressors.append(Compressor(

+ '.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8) + regex,

+ subst_memory, memory_replacement))

+ if options.bitness == 64:

+ memory_replacement_r8 = ModRMMemoryReplacements(

+ reg=None, base_r8=base_r8, index_r8=True, opcode_bits=opcode_bits,

+ **args)

+ memory_compressors.append(Compressor(

+ '.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8) + regex,

+ subst_memory_r8, memory_replacement_r8))

+ # Instructions with no memory access are instructions which are doing

+ # something with memory address (e.g. prefetch) and as such they don't

+ # have non-memory forms.

+ if not 'memory_accessed' in args:

+ register_replacement = ModRMRegisterReplacements(

+ reg=None, rm=rm, opcode_bits=opcode_bits, **args)

+ register_compressors.append(Compressor(

+ '.*?({:02x})'.format(0xc0 + opcode_bits * 8) + regex,

+ subst_register, register_replacement))

+ main_replacement = register_replacement + memory_replacement

+ main_compressors.append(Compressor(

+ '.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8) + regex,

+ subst, main_replacement))

+ if options.bitness == 64:

+ main_replacement_r8 = register_replacement + memory_replacement_r8

+ main_compressors.append(Compressor(

+ '.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8) + regex,

+ subst_r8, main_replacement_r8))

+ # Even smaller compressors: only low 3 bits of opcode are used.

+ for reg in register_kinds + ('st(0)',):

+ start_reg = REGISTERS[reg][0]

+ end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]

+ for opcode in xrange(8):

+ for text1, text2, nibble in (

+ ('[0..7]', '[8..f]', xrange(8)),

+ ('[012367]', '[89abef]', (0, 1, 2, 3, 6, 7)),

+ ('[0..6]', '[8..e]', xrange(7))

+ ):

+ # Note that we use 2nd line here to avoid ambiguity when opcode is 0x00

+ extra_compressors.append(Compressor(

+ '.*?[0-9a-fA-F](1)(?: 00)*'

+ ' \\w* (?:\\$0x0,|%ax,|%st,)?'

+ '(%(?:' + REGISTERS_RE[reg][1] + ')).*()',

+ (text1, '[%{}..%{}]'.format(start_reg, end_reg), ''),

+ tuple(('{:x}'.format(n), '%' + REGISTERS[reg][n])

+ for n in nibble)))

+ extra_compressors.append(Compressor(

+ '.*?[0-9a-fA-F](8)(?: 00)*'

+ ' \\w* (?:\\$0x0,|%ax,|%st,)?'

+ '(%(?:' + REGISTERS_RE[reg][0] + ')).*()',

+ (text2, '[%{}..%{}]'.format(start_reg, end_reg), ''),

+ tuple(('{:x}'.format(n + 8), '%' + REGISTERS[reg][n])

+ for n in nibble)))

+ # Another version for 64 bit case

+ if options.bitness == 64 and reg in ('eax', 'r8d'):

+ extra_compressors.append(Compressor(

+ '.*?[0-9a-fA-F](1)(?: 00)*'

+ ' \\w* (?:\\$0x0,|%ax,|%st,)?'

+ '(%(?:' + REGISTERS_RE[reg][1] + ')).*'

+ 'output_rr=(%(?:'+ REGISTERS_RE[reg][1] + ')).*()',

+ tuple([text1] + ['[%{}..%{}]'.format(start_reg, end_reg)] * 2 +

+ ['']),

+ tuple(['{:x}'.format(n)] + ['%' + REGISTERS[reg][n]] * 2

+ for n in nibble)))

+ extra_compressors.append(Compressor(

+ '.*?[0-9a-fA-F](8)(?: 00)*'

+ ' \\w* (?:\\$0x0,|%ax,|%st,)?'

+ '(%(?:' + REGISTERS_RE[reg][0] + ')).*'

+ 'output_rr=(%(?:'+ REGISTERS_RE[reg][0] + ')).*()',

+ tuple([text2] + ['[%{}..%{}]'.format(start_reg, end_reg)] * 2 +

+ ['']),

+ tuple(['{:x}'.format(n + 8)] + ['%' + REGISTERS[reg][n]] * 2

+ for n in nibble)))

+ compressors = (main_compressors + memory_compressors + register_compressors +

+ extra_compressors)

+ # Special compressors: will handle some cosmetic issues.

+ #

+ # SETxx ignores reg field and thus are described as many separate instructions

+ compressors.append(Compressor(

+ '.*0f 9[0-9a-fA-F] XX(/[0-7]) set.*()', ('', ''),

+ [('/' + str(i), ) for i in range(8)]))

+ # BSWAP is described with opcode "0f c8+r", not "0f /1" in manual

+ if options.bitness == 32:

+ compressors.append(Compressor(

+ '.*(XX/1) bswap.*ax.*()', ('c[8..f]', ''), [('XX/1', )]))

+ else:

+ compressors.append(Compressor(

+ '.*(XX/1) bswap.*ax.*()', ('c[89abef]', ''), [('XX/1', )]))

+ compressors.append(Compressor(

+ '.*(XX/1) bswap.*r8.*()', ('c[8..e]', ''), [('XX/1', )]))

+ # Add mark '# write to both' to certain versions of CMPXCHG, XADD, and XCHG

+ if options.bitness == 64:

+ compressors.append(Compressor(

+ '.* (?:cmpxchg|xadd|xchg).*%al\\.\\.%bh[^#]*()$',

+ (' # write to both', ), ((), )))

+ # "and $0xe0,[%eax..%edi]" is treated specially which means that we list all

+ # versions of and "[$0x1..$0xff],[%eax..%edi]" separately here.

+ # Without this rule these ands comprise 2/3 of the whole output!

+ if options.bitness == 32:

+ compressors.append(Compressor(

+ '.*83 (e0 01 and \\$0x1,%eax)()',

+ ('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', ' # special and'),

+ [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS['eax'][r]), )

+ for i in range(0x01, 0x100) for r in range(8)] +

+ [('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', )]))

+ else:

+ for reg in ('eax', 'r8d'):

+ start_reg = REGISTERS[reg][0]

+ end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]

+ for index_reg in ('eax', 'r8d'):

+ start_index = REGISTERS[index_reg][0]

+ end_index = REGISTERS[index_reg][-1]

+ compressors.append(Compressor(

+ '.*83 (e0 01 and \\$0x1,%' + reg + ').*'

+ 'input_rr=(any_nonspecial); output_rr=(%' + reg + ')()',

+ ('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,

+ end_reg), '[%{}..%{}]'.format(start_index, end_index),

+ '[%{}..%{}]'.format(start_reg, end_reg),

+ ' # special and'),

+ [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS[reg][r]),

+ 'any_nonspecial', '%' + REGISTERS[reg][r])

+ for i in range(0x01, 0x100) for r in range(7 + (reg == 'eax'))] +

+ [('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,

+ end_reg), '[%{}..%{}]'.format(start_index, end_index),

+ '[%{}..%{}]'.format(start_reg, end_reg))]))

+ # "and $e0" and similar are used to align %rsp. All negative values are

+ # accepted by validator and there are 127 of these.

+ # Consolidate them into one line.

+ if options.bitness == 64:

+ compressors.append(Compressor(

+ '.*(?:81|83) (?:e4|e5) (80) (?:00 00 00 |) and \\$0x(80),%r[bs]p.*()',

+ ('[80..ff]', '[80..ff]', ' # alignment and'),

+ [('{:02x}'.format(i), '{:02x}'.format(i)) for i in range(0x80, 0x100)]))

+ # Merge memory and non-memory access

+ if options.bitness == 32:

+ letters_and_registers = (('b', 'al', ''), ('w', 'ax', ''), ('l', 'eax', ''))

+ else:

+ letters_and_registers = (

+ ('b', 'al', 'eax'), ('b', 'spl', 'eax'), ('b', 'r8b', 'r8d'),

+ ('w', 'ax', 'eax'), ('w', 'r8w', 'r8d'),

+ ('l', 'eax', 'eax'), ('l', 'r8d', 'r8d'),

+ ('q', 'rax', 'eax'), ('q', 'r8', 'r8d')

+ )

+ for letter, reg, out_reg in letters_and_registers:

+ start_reg = REGISTERS[reg][0]

+ end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]

+ all_regs = '[%{}..%{}]'.format(start_reg, end_reg)

+ regs_mark = '[%{}..%{} or memory]'.format(start_reg, end_reg)

+ if options.bitness == 64:

+ start_out = REGISTERS[out_reg][0]

+ end_out = REGISTERS[out_reg][-1 if out_reg[0:2] != 'r8' else -2]

+ out_regs = '[%{}..%{}]'.format(start_out, end_out)

+ for notes in ('', ' # rm to reg', ' # reg to rm'):

+ compressors.append(Compressor(

+ '.* \\w*(' + letter + ') .*(\\[memory]).*()()',

+ ('[{}]?'.format(letter), regs_mark, '', ''),

+ ((letter, '[memory]', ''), ('', all_regs, notes))))

+ if options.bitness == 64:

+ for index_reg in ('eax', 'r8d'):

+ start_index = REGISTERS[index_reg][0]

+ end_index = REGISTERS[index_reg][-1]

+ index_regs = '[%{}..%{}]'.format(start_index, end_index)

+ for output_rrs in ((None, out_regs), (out_regs, None), (None, None)):

+ compressors.append(Compressor(

+ '.* \\w*(' + letter + ') .*(\\[memory]).*; '

+ 'input_rr=(\\[%[a-z0-9]*..%[a-z0-9]*\\]); '

+ 'output_rr=(\\[%[a-z0-9]*..%[a-z0-9]*\\]|None)()()',

+ ('[{}]?'.format(letter), regs_mark, index_regs,

+ output_rrs[0] if output_rrs[0] is not None else output_rrs[1],

+ '', ''),

+ ((letter, '[memory]', index_regs, output_rrs[0], ''),

+ ('', all_regs, 'any_nonspecial', output_rrs[1], notes))))

+ # REX compressors

+ if options.bitness == 64:

+ # First pretty complex set of compressors to combine versions of REX with

+ # three lowest bits in different states.

+ register_kind_pairs = (

+ ( None, None),

+ ( 'al', 'al'), ( 'al', None), (None, 'al'),

+ ( 'ax', 'al'), ( 'al', 'ax'),

+ ( 'ax', 'ax'), ( 'ax', None), (None, 'ax'),

+ ( 'eax', 'al'), ( 'al', 'eax'),

+ ( 'eax', 'ax'), ( 'ax', 'eax'),

+ ( 'eax', 'eax'), ( 'eax', None), (None, 'eax'),

+ ( 'rax', 'al'), ( 'al', 'rax'),

+ ( 'rax', 'ax'), ( 'ax', 'rax'),

+ ( 'rax', 'eax'), ( 'eax', 'rax'),

+ ( 'rax', 'rax'), ( 'rax', None), (None, 'rax'),

+ ( 'eax', 'mm0'), ( 'mm0', 'eax'),

+ ( 'rax', 'mm0'), ( 'mm0', 'rax'),

+ ( 'mm0', 'eax'), ( 'eax', 'mm0'),

+ ( 'mm0', 'rax'), ( 'rax', 'mm0'),

+ ( 'eax', 'xmm0'),

+ ( 'rax', 'xmm0'),

+ ('xmm0', 'eax'),

+ ('xmm0', 'rax'),

+ ( 'mm0', 'mm0'), ( 'mm0', None), (None, 'mm0'),

+ ( 'mm0', 'xmm0'),

+ ('xmm0', 'mm0'),

+ ('xmm0', 'xmm0'),

+ ('xmm0', 'ymm0'), ('xmm0', None), (None, 'xmm0'),

+ ('ymm0', 'xmm0'),

+ ('ymm0', 'ymm0'), ('ymm0', None), (None, 'ymm0'),

+ )

+ for reg, rm in register_kind_pairs:

+ for last_reg, last_rm in ((-1, -1), (-1, -2), (-2, -1), (-2, -2)):

+ if reg:

+ start_reg = REGISTERS[reg][0]

+ start_reg8 = REGISTERS[r8[reg]][0]

+ end_reg = REGISTERS[reg][-1]

+ end_reg0 = 'dil' if reg == 'al' else end_reg

+ end_reg8 = REGISTERS[r8[reg]][last_reg]

+ reg_regex = '\\[(%' + start_reg + '\\.\\.%' + end_reg + ')]'

+ reg_regex0 = '\\[(%' + start_reg + '\\.\\.%' + end_reg0 + ')]'

+ elif last_reg == -2:

+ continue

+ if rm:

+ start_rm = REGISTERS[rm][0]

+ start_rm8 = REGISTERS[r8[rm]][0]

+ end_rm = REGISTERS[rm][-1]

+ end_rm0 = 'dil' if rm == 'al' else end_rm

+ end_rm8 = REGISTERS[r8[rm]][last_rm]

+ rm_regex = ('\\[(%' + start_rm + '\\.\\.%' + end_rm + ')'

+ '(?: or memory)?]')

+ rm_regex0 = ('\\[(%' + start_rm + '\\.\\.%' + end_rm0 + ')'

+ '(?: or memory)?]')

+ elif last_rm == -2:

+ continue

+ for rexw in (True, False):

+ for input_rr in (True, False):

+ for output_rr in (True, False) if reg or rm else (None, ):

+ for rm_to_reg in (True, False) if reg and rm else (None, ):

+ # Legacy prefixes

+ regex = '.*:(?: 26| 2e| 36| 3e| 64| 65| 66| 67| f0| f2| f3)*'

+ # REX

+ regex += '( 48).*' if rexw else '( 40|).*'

+ # Replacement text

+ replacement_tuple = (

+ ' [REX:48..4f]' if rexw else ' [REX:40..47]?', )

+ if reg:

+ replacement_regs = '%{}..%{}'.format(start_reg, end_reg8)

+ if rm:

+ replacement_rms = '%{}..%{}'.format(start_rm, end_rm8)

+ # Instruction arguments

+ if not reg and not rm:

+ pass

+ elif not reg and rm:

+ if rexw:

+ regex += rm_regex0 + '.*'

+ else:

+ regex += rm_regex + '.*'

+ replacement_tuple += (replacement_rms, )

+ elif reg and not rm:

+ if rexw:

+ regex += reg_regex0 + '.*'

+ else:

+ regex += reg_regex + '.*'

+ replacement_tuple += (replacement_regs, )

+ elif rm_to_reg:

+ if rexw:

+ regex += rm_regex0 + ',' + reg_regex0 + '.*'

+ else:

+ regex += rm_regex + ',' + reg_regex + '.*'

+ replacement_tuple += (replacement_rms, replacement_regs)

+ else:

+ if rexw:

+ regex += reg_regex0 + ',' + rm_regex0 + '.*'

+ else:

+ regex += reg_regex + ',' + rm_regex + '.*'

+ replacement_tuple += (replacement_regs, replacement_rms)

+ # Input and output restricted registers

+ if input_rr:

+ regex += 'input_rr=\\[(%eax\\.\\.%edi)].*'

+ replacement_tuple += ('%eax..%r15d', )

+ if output_rr:

+ regex += 'output_rr=\\[(%eax\\.\\.%edi)].*'

+ replacement_tuple += ('%eax..%r14d', )

+ regex += '()'

+ replacement_tuple += ('', )

+ # Replacement cases

+ replacement_tuples = ()

+ for byte in (range(0x48, 0x50)

+ if rexw

+ else range(0x40, 0x48) + ['']):

+ replacement_case = (

+ ' {:02x}'.format(byte) if byte else byte, )

+ if byte:

+ if rm:

+ if byte & 0x1:

+ replacement_rms = '%{}..%{}'.format(start_rm8, end_rm8)

+ else:

+ replacement_rms = '%{}..%{}'.format(start_rm, end_rm0)

+ if byte & 0x2:

+ replacement_index = '%r8d..%r15d'

+ else:

+ replacement_index = '%eax..%edi'

+ if reg:

+ if byte & 0x4:

+ replacement_regs = '%{}..%{}'.format(start_reg8,

+ end_reg8)

+ else:

+ replacement_regs = '%{}..%{}'.format(start_reg,

+ end_reg0)

+ else:

+ if rm:

+ replacement_rms = '%{}..%{}'.format(start_rm, end_rm)

+ replacement_index = '%eax..%edi'

+ if reg:

+ replacement_regs = '%{}..%{}'.format(start_reg, end_reg)

+ if not reg and not rm:

+ pass

+ elif not reg and rm:

+ replacement_case += (replacement_rms, )

+ if byte:

+ final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'

+ else:

+ final_rr = '%eax..%edi'

+ elif reg and not rm:

+ replacement_case += (replacement_regs, )

+ if byte:

+ final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'

+ else:

+ final_rr = '%eax..%edi'

+ elif rm_to_reg:

+ replacement_case += (replacement_rms, replacement_regs)

+ if byte:

+ final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'

+ else:

+ final_rr = '%eax..%edi'

+ else:

+ replacement_case += (replacement_regs, replacement_rms)

+ if byte:

+ final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'

+ else:

+ final_rr = '%eax..%edi'

+ if input_rr: replacement_case += (replacement_index, )

+ if output_rr: replacement_case += (final_rr, )

+ replacement_tuples += (replacement_case, )

+ compressors.append(Compressor(

+ regex, replacement_tuple, replacement_tuples))

+ # This is pretty simple compressor to combine two lines with different REX.W

+ # bits (only if they are otherwise identical).

+ compressors.append(Compressor(

+ '.*(\\[REX:40\\.\\.47]\\?).*()', ('[REX:40..4f]?', ''),

+ (('[REX:40..47]?', ), ('[REX:48..4f]', ))))

def ShowProgress(rule, instruction):
  """Report on stderr one application of a compression rule.

  Args:
    rule: index of the applied rule in the global `compressors` list.
    instruction: text line that the rule's regex must match (asserted).

  Rules are numbered for display in the order they are first reported; the
  numbering is kept in ShowProgress.rules_shown. Every expanded variant is
  printed when the rule is reported for the first time or has at most four
  replacements. Otherwise only the first and the last variants (in sorted
  order) are printed, separated by '...'.
  """
  seen = ShowProgress.rules_shown
  first_print = rule not in seen
  if first_print:
    # Display number == how many distinct rules were reported before this one.
    seen[rule] = len(seen)

  for line in ('-------- Compressed --------',
               'Rule: {}'.format(seen[rule]),
               '--------'):
    print >> sys.stderr, line

  compressor = compressors[rule]
  match = compressor.regex.match(instruction)
  assert match
  format_str = CompressionTemplate(instruction, match, '{{{}}}')

  expanded = [format_str.format(*replacement)
              for replacement in compressor.replacements]
  expanded.sort()

  if first_print or len(compressor.replacements) <= 4:
    shown = expanded
  else:
    # Long list that was already shown in full once: abbreviate it.
    shown = [expanded[0], '...', expanded[-1]]
  for line in shown:
    print >> sys.stderr, line

  print >> sys.stderr, '--------'
  compressed_line = (format_str + '{{{}}}').format(*compressor.subst)
  print >> sys.stderr, 'Compressed', compressed_line


# Maps rule index -> sequential display number, in order of first report.
ShowProgress.rules_shown = {}

def main():
  """Traverse the validator DFA in parallel and print compressed output.

  Parses options and the DFA XML, creates the validator, prepares the
  compressors, then distributes traversal tasks over a multiprocessing pool.
  Per-prefix progress, rule traces and final statistics go to stderr; the
  sorted, compressed instruction list goes to stdout.
  """
  # We are keeping these global to share state graph and compressors
  # between workers spawned by multiprocessing. Passing them every time is
  # slow.
  global options, xml_file
  global dfa
  global worker_validator

  options, xml_file = ParseOptions()
  dfa = dfa_parser.ParseXml(xml_file)
  worker_validator = validator.Validator(
      validator_dll=options.validator_dll,
      decoder_dll=options.decoder_dll)
  PrepareCompressors()

  assert dfa.initial_state.is_accepting
  assert not dfa.initial_state.any_byte

  print >> sys.stderr, len(dfa.states), 'states'

  num_suffixes = dfa_traversal.GetNumSuffixes(dfa.initial_state)

  # We can't just write 'num_suffixes[dfa.initial_state]' because
  # initial state is accepting.
  total_instructions = sum(
      num_suffixes[t.to_state]
      for t in dfa.initial_state.forward_transitions.values())
  print >> sys.stderr, total_instructions, 'regular instructions total'

  tasks = dfa_traversal.CreateTraversalTasks(dfa.states, dfa.initial_state)
  print >> sys.stderr, len(tasks), 'tasks'

  total = 0
  num_valid = 0
  full_output = set()

  pool = multiprocessing.Pool()
  try:
    for prefix, count, valid_count, output, trace in pool.imap(Worker, tasks):
      print >> sys.stderr, 'Prefix:', ', '.join(map(hex, prefix))
      total += count
      num_valid += valid_count
      full_output |= output
      for rule, instruction in trace:
        ShowProgress(rule, instruction)
  finally:
    # Release the worker processes as soon as their results are consumed (or
    # the loop failed): the final compression below runs in this process
    # only. After a complete iteration no work is outstanding, so
    # terminate() does not discard anything.
    pool.terminate()
    pool.join()

  for instruction in sorted(Compressed(full_output,
                                       compressors,
                                       ShowProgress)):
    # Parenthesised single argument: identical output with the Python 2
    # print statement.
    print(instruction)

  print >> sys.stderr, total, 'instructions were processed'
  print >> sys.stderr, num_valid, 'valid instructions'


if __name__ == '__main__':
  main()

« no previous file with comments | « no previous file | src/trusted/validator_ragel/testdata/32bit_regular.golden » ('j') | no next file with comments »