Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(287)

Unified Diff: src/trusted/validator_ragel/compress_regular_instructions.py

Issue 49183002: Regular instructions golden file test. Base URL: svn://svn.chromium.org/native_client/trunk/src/native_client/
Patch Set: Created 7 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | src/trusted/validator_ragel/testdata/32bit_regular.golden » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/trusted/validator_ragel/compress_regular_instructions.py
===================================================================
--- src/trusted/validator_ragel/compress_regular_instructions.py (revision 0)
+++ src/trusted/validator_ragel/compress_regular_instructions.py (revision 0)
@@ -0,0 +1,1636 @@
+# Copyright (c) 2013 The Native Client Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+Traverse the validator's DFA, collect all "normal" instruction and then
+compress output. Note: "anybyte fields" (immediates and displacements)
+are always filled with zeros. Otherwise processing of sextillions (sic!)
+of possibilities will take too long.
+
+Each rule is applied only when all variants are accepted by validator.
+The following compression rules are present:
+
+1. Compress ModR/M (+SIB & displacement).
+ Instruction: 00 00 add %al,(%rax)
+ ...
+ Instruction: 00 ff add %bh,%bh
+ becomes
+ Instruction: 00 XX add [%al..%bh],[%al..%bh or memory]
+
+1a. Compress ModR/M (+SIB & displacement) memory-only.
+ Instruction: f0 01 00 lock add %eax,(%eax)
+ ...
+ Instruction: f0 01 bf 00 00 00 00 lock add %edi,0x0(%edi)
+ becomes
+ Instruction: f0 01 XX lock add [%eax..edi],[memory]
+
+1b. Compress ModR/M register only.
+ Instruction: 66 0f 50 c0 movmskpd %xmm0,%eax
+ ...
+ Instruction: 66 0f 50 ff movmskpd %xmm7,%edi
+ becomes
+ Instruction: 66 0f 50 XX movmskpd [%xmm0..%xmm7],[%eax..edi]
+
+2. Compress ModR/M (+SIB & displacement) with opcode extension.
+ Instruction: 0f 90 00 seto (%eax)
+ ...
+ Instruction: 0f 90 c7 seto %bh
+ becomes
+ Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
+
+2a. Compress ModR/M (+SIB & displacement) memory-only with opcode extension.
+ Instruction: f0 ff 00 lock incl (%eax)
+ ...
+ Instruction: f0 ff 84 ff 00 00 00 00 lock incl 0x0(%edi,%edi,8)
+ becomes
+ Instruction: f0 ff XX/1 lock decl [memory]
+
+2b. Compress ModR/M register-only with opcode extension.
+ Instruction: 0f 71 d0 00 psrlw $0x0,%mm0
+ ...
+ Instruction: 0f 71 d7 00 psrlw $0x0,%mm7
+ becomes
+ Instruction: 66 0f 71 XX/2 00 psrlw $0x0,[%mm0..%mm7]
+
+3. Compress register-in-opcode.
+ Instruction: d9 c0 fld %st(0)
+ ...
+ Instruction: d9 c7 fld %st(7)
+ becomes
+ Instruction: Instruction: d9 c[0..7] fld [%st(0)..%st(7)]
+
+ Only applies if all possible register accesses are accepted by validator.
+
+4. Special compressor for "set" instruction.
+ Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
+ ...
+ Instruction: 0f 90 XX/7 seto [%al..%bh or memory]
+ becomes
+ Instruction: 0f 90 XX seto [%al..%bh or memory]
+"""
+
+import itertools
+import multiprocessing
+import optparse
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import traceback
+
+import dfa_parser
+import dfa_traversal
+import validator
+
+
+# Register names in 'natual' order (as defined by IA32/x86-64 ABI)
+#
+# X86-64 ABI splits all registers in groups of 8 because it uses 3-bit field
+# in opcode, ModR/M, and/or SIB bytes to encode them.
+#
+# In most cases there are 16 registers of a given kind and two such groups,
+# but there are couple of exceptions:
+# 1. There are 20 8-bit registers and three groups (two of them overlap)
+# 2. There are eight X87 and MMX registers thus two groups are identical
+#
+# We use typical register from a group to name the whole group. Most groups
+# use first register, but 'spl' group uses fifth register because it's first
+# four registers are the same as 'al' group. We use mnemonic name 'mmalt'
+# to represent the "evil mirror" of the 'mm0' group.
+REGISTERS = {
+ 'al': [ 'al', 'cl', 'dl', 'bl', 'ah', 'ch', 'dh', 'bh' ],
+ 'spl': [ 'al', 'cl', 'dl', 'bl', 'spl', 'bpl', 'sil', 'dil' ],
+ 'ax': [ 'ax', 'cx', 'dx', 'bx', 'sp', 'bp', 'si', 'di' ],
+ 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'esp', 'ebp', 'esi', 'edi' ],
+ 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'rsp', 'rbp', 'rsi', 'rdi' ],
+ 'r8b': [ 'r{}b'.format(N) for N in range(8,16) ],
+ 'r8w': [ 'r{}w'.format(N) for N in range(8,16) ],
+ 'r8d': [ 'r{}d'.format(N) for N in range(8,16) ],
+ 'r8': [ 'r{}'.format(N) for N in range(8,16) ],
+ 'mm0': [ 'mm{}'.format(N) for N in range(8) ],
+ 'mmalt': [ 'mm{}'.format(N) for N in range(8) ],
+ 'st(0)': [ 'st({})'.format(N) for N in range(8) ],
+ 'xmm0': [ 'xmm{}'.format(N) for N in range(8) ],
+ 'xmm8': [ 'xmm{}'.format(N) for N in range(8,16) ],
+ 'ymm0': [ 'ymm{}'.format(N) for N in range(8) ],
+ 'ymm8': [ 'ymm{}'.format(N) for N in range(8,16) ]
+}
+
+
+NOP = 0x90
+
+
+def PadToBundleSize(bytes):
+ assert len(bytes) <= validator.BUNDLE_SIZE
+ return bytes + [NOP] * (validator.BUNDLE_SIZE - len(bytes))
+
+
+# In x86-64 mode we have so-called 'restricted register' which is used to
+# tie two groups together. Some instructions require particular value to
+# be stored in this variable, while some accept any non-special restricted
+# register (%ebp and %esp are special because they can only be accepted by
+# a few 'special' instructions).
+#
+# You can find more details in the "NaCl SFI model on x86-64 systems" manual.
+#
+# We try to feed all possible 'restricted registers' into validator and then
+# classify the instruction using this map. If set of acceptable 'restricted
+# registers' is not here, then it's an error in validator.
+ACCEPTABLE_X86_64_INPUTS = {
+ 0x00001: 'input_rr=%eax',
+ 0x00002: 'input_rr=%ecx',
+ 0x00004: 'input_rr=%edx',
+ 0x00008: 'input_rr=%ebx',
+ 0x00010: 'input_rr=%esp',
+ 0x00020: 'input_rr=%ebp',
+ 0x00040: 'input_rr=%esi',
+ 0x00080: 'input_rr=%edi',
+ 0x00100: 'input_rr=%r8d',
+ 0x00200: 'input_rr=%r9d',
+ 0x00400: 'input_rr=%r10d',
+ 0x00800: 'input_rr=%r11d',
+ 0x01000: 'input_rr=%r12d',
+ 0x02000: 'input_rr=%r13d',
+ 0x04000: 'input_rr=%r14d',
+ 0x08000: 'input_rr=%r15d',
+ 0x1ffcf: 'input_rr=any_nonspecial'
+}
+
+# Any instruction must produce either None or one of fifteen registers as an
+# output 'restricted register' value. 'r15d' is NOT acceptable as an output.
+ACCEPTABLE_X86_64_OUTPUT_REGISTERS = tuple(
+ '%' + reg for reg in (REGISTERS['eax'] + REGISTERS['r8d'])[0:-1])
+
+
+def ValidateInstruction(instruction, validator_inst):
+ bundle = ''.join(map(chr, PadToBundleSize(instruction)))
+ if options.bitness == 32:
+ result = validator_inst.ValidateChunk(bundle, bitness=32)
+ return result, []
+ else:
+ valid_inputs = 0
+ known_final_rr = None
+ output_rr = None
+ # Note that iteration order is aligned with ACCEPTABLE_X86_64_INPUTS array
+ # above.
+ for bit, initial_rr in enumerate(validator.ALL_REGISTERS + [None]):
+ valid, final_rr = validator_inst.ValidateAndGetFinalRestrictedRegister(
+ bundle, len(instruction), initial_rr)
+ if valid:
+ # final_rr should not depend on input_rr
+ assert valid_inputs == 0 or known_final_rr == final_rr
+ valid_inputs |= 1 << bit
+ known_final_rr = final_rr
+ # If nothing is accepted then instruction is not valid. Easy and simple.
+ if valid_inputs == 0: return False, []
+ # If returned value in unacceptable we'll get IndexError here and this
+ # test will fail
+ if known_final_rr is not None:
+ output_rr = ACCEPTABLE_X86_64_OUTPUT_REGISTERS[known_final_rr]
+ # If collected valid_inputs are unacceptable we'll get KeyError here and
+ # this test will fail
+ return True, [ACCEPTABLE_X86_64_INPUTS[valid_inputs],
+ 'output_rr={}'.format(output_rr)]
+
+
+class WorkerState(object):
+ def __init__(self, prefix, validator):
+ self.total_instructions = 0
+ self.num_valid = 0
+ self.validator = validator
+ self.output = set()
+ self.trace = []
+
+
+ def ReceiveInstruction(self, bytes):
+ self.total_instructions += 1
+ result, notes = ValidateInstruction(bytes, self.validator)
+ if result:
+ self.num_valid += 1
+ dis = self.validator.DisassembleChunk(
+ ''.join(map(chr, bytes)),
+ bitness=options.bitness)
+ for line_nr in xrange(len(dis)):
+ dis[line_nr] = str(dis[line_nr])
+ assert dis[line_nr][0:17] == 'Instruction(0x' + str(line_nr) + ': '
+ assert dis[line_nr][-1:] == ')'
+ dis[line_nr] = dis[line_nr][17:-1]
+ # If %rip is involved then comment will be different depending on the
+ # instruction length. Eliminate it.
+ if '(%rip)' in dis[0]:
+ dis[0] = re.sub(' # 0x[ ]*[0-9a-fA-F]*', '', dis[0])
+ # Zero displacements are represented as 0x0 for all instructions except
+ # jumps where they disassembled as non-zero due to %eip/%rip-relative
+ # addressing. We replace this displacement with %eip/%rip to simplify
+ # compression.
+ if ' 0x' in dis[0] and ' 0x0' not in dis[0]:
+ for bytes in xrange(1, 16):
+ dis[0] = re.sub(
+ '(' + '(?:[0-9a-fA-F][0-9a-fA-F] ){' + str(bytes) + '} .* )' +
+ hex(bytes) + '(.*)',
+ '\\1%eip\\2' if options.bitness == 32 else '\\1%rip\\2',
+ dis[0]);
+ dis[0] = 'Instruction: ' + dis[0]
+ dis += notes
+ self.output.add('; '.join(dis))
+
+
+ def RecordTrace(self, compressor_nr, instruction):
+ self.trace.append((compressor_nr, instruction))
+
+
+# Compressor has three slots: regex (which picks apart given instruction),
+# subst (which is used to denote compressed version) and replacements (which
+# are used to generate set of instructions from a given code).
+#
+# Example compressor:
+# regex = '.*?[0-9a-fA-F]([0-7]) \\w* (%e(?:[abcd]x|[sb]p|[sd]i)).*()'
+# subst = ('[0-7]', '[%eax..%edi]', ' # register in opcode')
+# replacements = ((0, '%eax'), (1, '%ecx'), (2, '%edx'), (3, '%ebx')
+# (4, '%esp'), (5, '%ebp'), (6, '%esi'), (7, '%edi'))
+#
+# When faced with instriuction '40 inc %eax' it will capture the following
+# pieces of said instruction: '4[0] inc [%eax]'.
+#
+# Then it will produce the following eight instructions:
+# '40 inc %eax'
+# '41 inc %ecx'
+# '42 inc %edx'
+# '43 inc %ebx'
+# '44 inc %esp'
+# '45 inc %ebp'
+# '46 inc %esi'
+# '47 inc %edi'
+#
+# If all these instructions can be found in a set of instructions then
+# compressor will remove them from said set and will insert one replacement
+# "compressed instruction" '4[0-7] inc [%eax..%edi] # register in opcode'.
+#
+# Note that last group is only used in the replacement. It's used to grab marks
+# added by previous compressors and to replace them with a new mark.
+class Compressor(object):
+ __slots__ = [
+ 'regex',
+ 'subst',
+ 'replacements'
+ ]
+
+ def __init__(self, regex, subst, replacements=None):
+ self.regex = re.compile(regex)
+ self.subst = subst
+ self.replacements = [] if replacements is None else replacements
+
+
+def CompressionTemplate(instruction, match, mark):
+ """ Replace all match groups with the mark. """
+ pos = 0
+ format_str = ''
+ for group in range(1, len(match.groups())):
+ format_str += instruction[pos:match.start(group)] + mark
+ pos = match.end(group)
+ return format_str + instruction[pos:match.start(len(match.groups()))]
+
+
+def CompressOneMatch(instructions, instruction, match, compressor):
+ format_str = CompressionTemplate(instruction, match, '{}')
+ subset = set()
+ for replacement in compressor.replacements:
+ replacement_str = format_str.format(*replacement)
+ if not replacement_str in instructions:
+ return (False, instructions)
+ subset.add(replacement_str)
+ instructions -= subset
+ instructions.add((format_str + '{}').format(*compressor.subst))
+ return (True, instructions)
+
+
+def CompressOneInstruction(instructions, compressors, split, cache):
+ sorted_instructions = (sorted(i for i in instructions if i > split) +
+ sorted(i for i in instructions if i < split))
+ for instruction in sorted_instructions:
+ if instruction in cache:
+ compressors_list = cache[instruction]
+ for compressor_nr, match, compressor in compressors_list:
+ result, instructions = CompressOneMatch(
+ instructions, instruction, match, compressor)
+ if result:
+ return (instructions, compressor_nr, instruction)
+ else:
+ compressors_list = []
+ for compressor_nr, compressor in enumerate(compressors):
+ match = compressor.regex.match(instruction)
+ if match:
+ compressors_list.append((compressor_nr, match, compressor))
+ result, instructions = CompressOneMatch(
+ instructions, instruction, match, compressor)
+ if result:
+ return (instructions, compressor_nr, instruction)
+ cache[instruction] = compressors_list
+ return (instructions, False, False)
+
+
+def Compressed(instructions, compressors, show_progress):
+ split = ''
+ cache = {}
+ while True:
+ instructions, rule, split = CompressOneInstruction(
+ instructions, compressors, split, cache)
+ if rule is False: break
+ show_progress(rule, split)
+ return instructions
+
+
+def Worker((prefix, state_index)):
+ worker_state = WorkerState(prefix, worker_validator)
+
+ try:
+ dfa_traversal.TraverseTree(
+ dfa.states[state_index],
+ final_callback=worker_state.ReceiveInstruction,
+ prefix=prefix,
+ anyfield=0)
+ if (prefix[0] != 0x0f or prefix[1] != 0x0f): # Skip 3DNow! instructions
+ worker_state.output = Compressed(set(worker_state.output),
+ compressors,
+ worker_state.RecordTrace)
+ except Exception as e:
+ traceback.print_exc() # because multiprocessing imap swallows traceback
+ raise
+
+ return (
+ prefix,
+ worker_state.total_instructions,
+ worker_state.num_valid,
+ worker_state.output,
+ worker_state.trace)
+
+
+def ParseOptions():
+ parser = optparse.OptionParser(usage='%prog [options] xmlfile')
+
+ parser.add_option('--bitness',
+ choices=['32', '64'],
+ help='The subarchitecture: 32 or 64')
+ parser.add_option('--validator_dll',
+ help='Path to librdfa_validator_dll')
+ parser.add_option('--decoder_dll',
+ help='Path to librdfa_decoder_dll')
+
+ options, args = parser.parse_args()
+ options.bitness = int(options.bitness)
+
+ if len(args) != 1:
+ parser.error('specify one xml file')
+
+ (xml_file, ) = args
+
+ return options, xml_file
+
+
+# Version suitable for use in regular expressions
+REGISTERS_RE = REGISTERS.copy()
+REGISTERS_RE['st(0)'] = [ 'st\\({}\\)'.format(N) for N in range(8) ]
+REGISTERS_RE['st\\(0\\)'] = REGISTERS_RE['st(0)']
+
+# Index names in 'natual' order (as defined by IA32/x86-64 ABI)
+INDEXES = {
+ 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'eiz', 'ebp', 'esi', 'edi' ],
+ 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'riz', 'rbp', 'rsi', 'rdi' ],
+ 'r8': [ 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15' ]
+}
+# Register which can not be used as base in 64-bit mode in all incarnations
+X86_64_BASE_REGISTERS = set([
+ '%spl', '%bpl', '%r15b',
+ '%sp', '%bp', '%r15w',
+ '%esp', '%ebp', '%r15d',
+ '%rsp', '%rbp', '%r15',
+ '%rip'
+])
+
+
+def InstructionIsDangerous(input, output, register_write,
+ writes_to, memory_accessed=False,
+ base_text='%riz', index_text='%riz'):
+ """ Check if instruction with given replacements will be dangerous
+
+ Args:
+ input: input argument
+ output: output argument
+ register_write: three-state selector
+ 'sandbox' - instruction can be used to produce "restricted register"
+ 'protect' - instruction can damage output, protect "special registers"
+ 'ignore' - instruction does not affect it's operands (e.g. test) or
+ is used with non-GP registers (X87, MMX, XMM, etc)
+ memory_accessed: True if instruction accesses memory
+ base: base register (if memory is accessed)
+ index: index register (if memory is accessed)
+
+ Returns:
+ True if instruction should be rejected by validator
+ """
+ if memory_accessed:
+ if base_text not in X86_64_BASE_REGISTERS:
+ return True
+ if index_text in X86_64_BASE_REGISTERS - set(['%r15']):
+ return True
+ if register_write == 'protect' and output in X86_64_BASE_REGISTERS:
+ return True
+ if register_write == 'sandbox' and output == '%r15d':
+ return True
+ if writes_to == 'both' and input in X86_64_BASE_REGISTERS:
+ return True
+ return False
+
+
+def AppendOperandsReplacement(replacement, rm_text, reg, modrm, writes_to):
+ """ Appends replacement text to replacement list
+
+ Args:
+ replacement: replacement list
+ rm_text: replacement for rm field
+ reg: register kind (or None if reg field is used as opcode extension)
+ modrm: modrm byte
+ writes_to: three-state selector
+ 'reg' - instruction uses rm as source, reg as destination
+ 'rm' - instruction uses reg as source, rm as destination
+ 'both' - instruction writes to both reg and rm
+
+ Returns:
+ input: textual representation of input argument
+ output: textual representation of output argument
+
+ Side-effect:
+ output (if reg is None) or (input, output) tuple (if reg is not None)
+ are added to replacement list.
+ """
+ if reg is None:
+ assert writes_to == 'rm'
+ input, output = None, rm_text
+ replacement.append(output)
+ else:
+ reg_field = (modrm >> 3) & 0x07
+ reg_text = '%' + REGISTERS[reg][reg_field]
+ if writes_to == 'reg':
+ input, output = rm_text, reg_text
+ else: # rm, both
+ input, output = reg_text, rm_text
+ replacement.extend([input, output])
+ return input, output
+
+
+def ModRMRegisterReplacements(rm, reg=None, writes_to='rm', opcode_bits=0,
+ register_write='ignore'):
+ """Creates replacement tuples list for register-to-register instructions
+
+ Args:
+ rm: rm operand kind (see REGISTERS array)
+ reg: reg operand kind (see REGISTERS array) or None if reg is not used
+ writes_to: three-state selector
+ 'reg' - instruction uses rm as source, reg as destination
+ 'rm' - instruction uses reg as source, rm as destination
+ 'both' - instruction writes to both reg and rm
+ opcode_bits: opcode extensions code (used when reg is None)
+ register_write: three-state selector
+ 'sandbox' - instruction can be used to produce "restricted register"
+ 'protect' - instruction can damage output, protect "special registers"
+ 'ignore' - instruction does not affect it's operands (e.g. test) or
+ is used with non-GP registers (X87, MMX, XMM, etc)
+ Returns:
+ List of replacement tuples
+ """
+ # Reg field can be used either as reg or as opcode extension, but not both
+ assert reg is None or opcode_bits == 0
+
+ output_key = (options.bitness, reg, rm, writes_to, opcode_bits,
+ register_write)
+ if output_key in ModRMRegisterReplacements.replacements:
+ return ModRMRegisterReplacements.replacements[output_key]
+
+ replacements = []
+
+ # Two upper bits of ModR/M byte (mod field) must be equal to 11
+ # This gives us range from 0xc0 to 0xff but we are going from the
+ # end to make rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
+ if reg is None:
+ # reg field is used as opcode extension
+ byte_range = [byte
+ for byte in range(0xff, 0xbf, -1)
+ if (byte >> 3) & 0x7 == opcode_bits]
+ else:
+ byte_range = range(0xff, 0xbf, -1)
+
+ for modrm in byte_range:
+ rm_field = (modrm & 0x07)
+ rm_text = '%' + REGISTERS[rm][rm_field]
+ byte_text = '{:02x}'.format(modrm)
+ replacement = [byte_text]
+ input, output = AppendOperandsReplacement(
+ replacement, rm_text, reg, modrm, writes_to)
+ if options.bitness == 64:
+ replacement.append('any_nonspecial') # input_rr
+ replacement.append(output if register_write == 'sandbox' else None)
+ if InstructionIsDangerous(input, output, register_write, writes_to):
+ continue
+ replacements.append(tuple(replacement))
+ ModRMRegisterReplacements.replacements[output_key] = tuple(replacements)
+ return ModRMRegisterReplacements.replacements[output_key]
+ModRMRegisterReplacements.replacements = {}
+
+
+def BaseOnlyMemoryOperand(modrm, base):
+ """Creates replacement tuples list for register-to-memory instructions
+ (base only, no SIB)
+
+ Args:
+ modrm: modrm byte
+ base: register kind for base
+ Returns:
+ bytes_text: replacement for "bytes" group
+ rm_text: textual representation of "rm" argument
+ base_text: textual representation of "base" register
+ """
+ mod_field = (modrm >> 6) & 0x03
+ rm_field = (modrm & 0x07)
+ base_text = '%' + REGISTERS[base][rm_field]
+ # If RM field == %rbp and MOD field is zero then it's absolute address
+ # in 32-bit mode and %rip-based address in 64-bit mode
+ if mod_field == 0 and rm_field == validator.REG_RBP:
+ bytes_text = '{:02x} 00 00 00 00'.format(modrm)
+ rm_text = '0x0' if options.bitness == 32 else '0x0(%rip)'
+ base_text = '%eiz' if options.bitness == 32 else '%rip'
+ # Memory access with just a base register
+ elif mod_field == 0:
+ bytes_text = '{:02x}'.format(modrm)
+ rm_text = '({})'.format(base_text)
+ # Memory access with base and 8bit offset
+ elif mod_field == 1:
+ bytes_text = '{:02x} 00'.format(modrm)
+ rm_text = '0x0({})'.format(base_text)
+ # Memory access with base and 32bit offset
+ else: # mod_field == 2
+ bytes_text = '{:02x} 00 00 00 00'.format(modrm)
+ rm_text = '0x0({})'.format(base_text)
+ return bytes_text, rm_text, base_text
+
+
+def SIBMemoryOperand(modrm, sib, base, index):
+ """Creates replacement tuples list for register-to-memory instructions
+ (base only, no SIB)
+
+ Args:
+ modrm: modrm byte
+ base: register kind for base
+ Returns:
+ bytes_text: replacement for "bytes" group
+ rm_text: textual representation of "rm" argument
+ base_text: textual representation of "base" register
+ index_text: textual representation of "index" register
+ """
+ mod_field = (modrm >> 6) & 0x03
+ scale_field = (sib >> 6) & 0x03
+ index_field = (sib >> 3) & 0x07
+ base_field = (sib & 0x07)
+ index_text = '%' + INDEXES[index][index_field]
+ base_text = '%' + REGISTERS[base][base_field]
+ scale_text = str(1 << scale_field)
+ # If BASE is %rbp and MOD == 0 then index with 32bit offset is used
+ if mod_field == 0 and base_field == validator.REG_RBP:
+ bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
+ if (options.bitness == 64 and
+ index_text == '%riz' and
+ scale_text == '1'):
+ rm_text = '0x0'
+ else:
+ rm_text = '0x0(,{},{})'.format(index_text, scale_text)
+ # There are no base in this case
+ base_text = '%eiz' if options.bitness == 32 else '%riz'
+ # Memory access with base and index (no offset)
+ elif mod_field == 0:
+ bytes_text = '{:02x} {:02x}'.format(modrm, sib)
+ rm_text = '({},{},{})'.format(base_text, index_text, scale_text)
+ # Memory access with base, index and 8bit offset
+ elif mod_field == 1:
+ bytes_text = '{:02x} {:02x} 00'.format(modrm, sib)
+ rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
+ # Memory access with base, index and 32bit offset
+ elif mod_field == 2:
+ bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
+ rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
+ # Pretty-printing of access via %rsp (or %r12)
+ if (base_field == validator.REG_RSP and
+ index_text in ('%eiz', '%riz') and
+ scale_text == '1'):
+ if mod_field == 0: # no offset
+ rm_text = '({})'.format(base_text)
+ else: # 8-bit or 32-bit offset
+ rm_text = '0x0({})'.format(base_text)
+ return bytes_text, rm_text, base_text, index_text
+
+
+def ModRMMemoryReplacements(reg=None, writes_to='rm', opcode_bits=0,
+ memory_accessed=True, register_write='ignore',
+ base_r8=False, index_r8=False):
+ """Creates replacement tuples list for register-to-memory instructions
+
+ Args:
+ reg: reg operand kind (see REGISTERS array) or None if reg is not used
+ writes_to: three-state selector
+ 'reg' - instruction uses rm as source, reg as destination
+ 'rm' - instruction uses reg as source, rm as destination
+ 'both' - instruction writes to both reg and rm
+ opcode_bits: opcode extensions code (used when reg is None)
+ memory_accessed: True if instruction accesses memory
+ register_write: three-state selector
+ 'sandbox' - instruction can be used to produce "restricted register"
+ 'protect' - instruction can damage output, protect "special registers"
+ 'ignore' - instruction does not affect it's operands (e.g. test) or
+ is used with non-GP registers (X87, MMX, XMM, etc)
+ index_r8: True if REX.X bit in the instruction set to 1
+
+ Returns:
+ List of replacement tuples
+ """
+ # Reg field can be used either as reg or as opcode extension, but not both
+ assert reg is None or opcode_bits == 0
+
+ output_key = (options.bitness, reg, writes_to, opcode_bits,
+ base_r8, index_r8, memory_accessed, register_write)
+ if output_key in ModRMMemoryReplacements.replacements:
+ return ModRMMemoryReplacements.replacements[output_key]
+
+ if options.bitness == 32:
+ base = 'eax'
+ index = 'eax'
+ else:
+ base = 'r8' if base_r8 else 'rax'
+ index = 'r8' if index_r8 else 'rax'
+
+ replacements = []
+
+ # Two upper bits of ModR/M byte (mod field) must be equal to 00, 01, or 10
+ # This gives us range from 0x00 to 0xbf but we are going from the end to make
+ # rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
+ if reg is None:
+ # reg field is used as opcode extension
+ byte_range = [byte
+ for byte in range(0xbf, -1, -1)
+ if (byte >> 3) & 0x7 == opcode_bits]
+ else:
+ byte_range = range(0xbf, -1, -1)
+
+ for modrm in byte_range:
+ # If RM field != %rsp then there are no SIB byte
+ if (modrm & 0x07) != validator.REG_RSP:
+ bytes_text, rm_text, base_text = BaseOnlyMemoryOperand(modrm, base)
+ replacement = [bytes_text]
+ input, output = AppendOperandsReplacement(
+ replacement, rm_text, reg, modrm, writes_to)
+ if options.bitness == 64:
+ replacement.append('any_nonspecial')
+ # If writes_to is equal to 'reg' then output is register
+ if writes_to == 'reg' and register_write == 'sandbox':
+ replacement.append(output)
+ else:
+ # Note that instruction like xchg could write to another register:
+ # it's "input"! But validator currently does not support this case
+ # thus we are ignoring it here and writing "None" in this case, too.
+ replacement.append(None)
+ if InstructionIsDangerous(input, output, register_write, writes_to,
+ memory_accessed, base_text):
+ continue
+ replacements.append(tuple(replacement))
+ else:
+ # If RM field == %rsp then we have SIB byte
+ for sib in xrange(0x100):
+ bytes_text, rm_text, base_text, index_text = SIBMemoryOperand(
+ modrm, sib, base, index)
+ replacement = [bytes_text]
+ input, output = AppendOperandsReplacement(
+ replacement, rm_text, reg, modrm, writes_to)
+ if options.bitness == 64:
+ if not memory_accessed or index_text == '%riz':
+ replacement.append('any_nonspecial')
+ else:
+ if index_r8:
+ # Convert %r8 to %r8d, %r9 to %r9d, etc
+ replacement.append(index_text + 'd')
+ else:
+ # Convert %rax to %eax, %rsp to %esp, etc
+ replacement.append('%e' + index_text[2:])
+ # If writes_to is equal to 'reg' then output is register
+ if writes_to == 'reg' and register_write == 'sandbox':
+ replacement.append(output)
+ else:
+ # Note that instruction like xchg could write to another register:
+ # it's "input"! But validator currently does not support this case
+ # thus we are ignoring it here and writing "None" in this case, too.
+ replacement.append(None)
+ if InstructionIsDangerous(input, output, register_write, writes_to,
+ memory_accessed, base_text, index_text):
+ continue
+ replacements.append(tuple(replacement))
+ ModRMMemoryReplacements.replacements[output_key] = tuple(replacements)
+ return ModRMMemoryReplacements.replacements[output_key]
+ModRMMemoryReplacements.replacements = {}
+
+
+# Map from "REX bit off" group of registers to "REX bit on" group of registers
+r8 = {
+ 'al': 'r8b',
+ 'ax': 'r8w',
+ 'eax': 'r8d',
+ 'rax': 'r8',
+ 'mm0': 'mmalt',
+ 'xmm0': 'xmm8',
+ 'ymm0': 'ymm8'
+}
+
+
+def RegisterKinds():
+ """ Return list of register kinds we process with register compressors """
+
+ if options.bitness == 32:
+ return ('al', 'ax', 'eax', 'mm0', 'xmm0', 'ymm0')
+ else:
+ return ('al', 'spl', 'ax', 'eax', 'rax', 'mm0', 'xmm0', 'ymm0',
+ 'r8b', 'r8w', 'r8d', 'r8', 'mmalt', 'xmm8', 'ymm8')
+
+
+def RegisterKindPairs():
+ """ Return hand-picked pairs which we must consider in compressors """
+
+ if options.bitness == 32:
+ return (
+ ( 'al', 'al'),
+ ( 'ax', 'al'),
+ ( 'al', 'ax'),
+ ( 'ax', 'ax'),
+ ( 'eax', 'al'),
+ ( 'al', 'eax'),
+ ( 'eax', 'ax'),
+ ( 'ax', 'eax'),
+ ( 'eax', 'eax'),
+ ( 'eax', 'mm0'),
+ ( 'mm0', 'eax'),
+ ( 'eax', 'xmm0'),
+ ('xmm0', 'eax'),
+ ( 'mm0', 'mm0'),
+ ( 'mm0', 'xmm0'),
+ ('xmm0', 'mm0'),
+ ('xmm0', 'xmm0'),
+ ('xmm0', 'ymm0'),
+ ('ymm0', 'xmm0'),
+ ('ymm0', 'ymm0')
+ )
+ else:
+ return (
+ ( 'al', 'al'),
+ ( 'spl', 'spl'), ( 'spl', 'r8b'), ( 'r8b', 'spl'), ( 'r8b', 'r8b'),
+ ( 'ax', 'al'),
+ ( 'ax', 'spl'), ( 'ax', 'r8b'), ( 'r8w', 'spl'), ( 'r8w', 'r8b'),
+ ( 'al', 'ax'),
+ ( 'spl', 'ax'), ( 'spl', 'r8w'), ( 'r8b', 'ax'), ( 'r8b', 'r8w'),
+ ( 'ax', 'ax'), ( 'ax', 'r8w'), ( 'r8w', 'ax'), ( 'r8w', 'r8w'),
+ ( 'eax', 'al'),
+ ( 'eax', 'spl'), ( 'eax', 'r8b'), ( 'r8d', 'spl'), ( 'r8d', 'r8b'),
+ ( 'al', 'eax'),
+ ( 'spl', 'eax'), ( 'spl', 'r8d'), ( 'r8b', 'eax'), ( 'r8b', 'r8d'),
+ ( 'eax', 'ax'), ( 'eax', 'r8w'), ( 'r8d', 'ax'), ( 'r8d', 'r8w'),
+ ( 'ax', 'eax'), ( 'ax', 'r8d'), ( 'r8w', 'eax'), ( 'r8w', 'r8d'),
+ ( 'eax', 'eax'), ( 'eax', 'r8d'), ( 'r8d', 'eax'), ( 'r8d', 'r8d'),
+ ( 'rax', 'al'),
+ ( 'rax', 'spl'), ( 'rax', 'r8b'), ( 'r8', 'spl'), ( 'r8', 'r8b'),
+ ( 'al', 'rax'),
+ ( 'spl', 'rax'), ( 'spl', 'r8'), ( 'r8b', 'rax'), ( 'r8b', 'r8'),
+ ( 'rax', 'ax'), ( 'rax', 'r8w'), ( 'r8', 'ax'), ( 'r8', 'r8w'),
+ ( 'ax', 'rax'), ( 'ax', 'r8'), ( 'r8w', 'rax'), ( 'r8w', 'r8'),
+ ( 'rax', 'eax'), ( 'rax', 'r8d'), ( 'r8', 'eax'), ( 'r8', 'r8d'),
+ ( 'eax', 'rax'), ( 'eax', 'r8'), ( 'r8d', 'rax'), ( 'r8d', 'r8'),
+ ( 'rax', 'rax'), ( 'rax', 'r8'), ( 'r8', 'rax'), ( 'r8', 'r8'),
+ ( 'eax', 'mm0'), ( 'eax','mmalt'), ( 'r8d', 'mm0'), ( 'eax', 'mmalt'),
+ ( 'rax', 'mm0'), ( 'rax','mmalt'), ( 'r8', 'mm0'), ( 'r8', 'mmalt'),
+ ( 'mm0', 'eax'), ('mmalt', 'eax'), ( 'mm0', 'r8d'), ('mmalt', 'r8d'),
+ ( 'mm0', 'rax'), ('mmalt', 'rax'), ( 'mm0', 'r8'), ('mmalt', 'r8'),
+ ( 'eax', 'xmm0'), ( 'eax', 'xmm8'), ( 'r8d', 'xmm0'), ( 'r8d', 'xmm8'),
+ ( 'rax', 'xmm0'), ( 'rax', 'xmm8'), ( 'r8', 'xmm0'), ( 'r8', 'xmm8'),
+ ('xmm0', 'eax'), ('xmm0', 'r8d'), ('xmm8', 'eax'), ('xmm8', 'r8d'),
+ ('xmm0', 'rax'), ('xmm0', 'r8'), ('xmm8', 'rax'), ('xmm8', 'r8'),
+ ( 'mm0', 'mm0'), ('mmalt', 'mm0'), ( 'mm0','mmalt'), ('mmalt','mmalt'),
+ ( 'mm0', 'xmm0'), ('mmalt','xmm0'), ( 'mm0', 'xmm8'), ('mmalt', 'xmm8'),
+ ('xmm0', 'mm0'), ('xmm8', 'mm0'), ('xmm0','mmalt'), ('xmm8', 'mmalt'),
+ ('xmm0', 'xmm0'), ('xmm0', 'xmm8'), ('xmm8', 'xmm0'), ('xmm8', 'xmm8'),
+ ('xmm0', 'ymm0'), ('xmm0', 'ymm8'), ('xmm8', 'ymm0'), ('xmm8', 'ymm8'),
+ ('ymm0', 'xmm0'), ('ymm0', 'xmm8'), ('ymm8', 'xmm0'), ('ymm8', 'xmm8'),
+ ('ymm0', 'ymm0'), ('ymm0', 'ymm8'), ('ymm8', 'ymm0'), ('ymm8', 'ymm8')
+ )
+
+
+def MemRegex(opcode_bits = 0):
+ """ Returns two memory regexes: for bytes and textual representation. """
halyavin 2013/12/06 16:17:58 It is 3 regex now.
khim 2013/12/06 16:28:50 Done.
+
+ modrm_regex = '^.*?({:02x})'.format(0xc0 + opcode_bits * 8)
+ # We only need to process ModR/M+SIB '04 04' or '04 07' here.
+ modrm_sib_regex = '^.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8)
+ if options.bitness == 32:
+ regex_mem = '\\(%esp,%eax,1\\)'
+ else:
+ regex_mem = '\\((?:%rsp|%r15),(?:%rax|%r8),1\\)'
+ return modrm_regex, modrm_sib_regex, regex_mem
+
+
+def RegToRmCompressors(rm, reg, writes_to, memory_accessed=True,
+ register_write='ignore', is_3Dnow=False, notes=''):
+ """Returns a list of reg <-> rm compressors for a given set of parameters.
+
+ Args:
+ rm: rm operand kind (see REGISTERS array)
+ reg: reg operand kind (see REGISTERS array) or None if reg is not used
+ writes_to: three-state selector
+ 'reg' - instruction uses rm as source, reg as destination
+ 'rm' - instruction uses reg as source, rm as destination
+ 'both' - instruction writes to both reg and rm
+ memory_accessed: True if instruction accesses memory
+ register_write: three-state selector
+ 'sandbox' - instruction can be used to produce "restricted register"
+ 'protect' - instruction can damage output, protect "special registers"
+ 'ignore' - instruction does not affect it's operands (e.g. test) or
+ is used with non-GP registers (X87, MMX, XMM, etc)
+ is_3DNow - True if instruction is 3DNow! style (opcode in immidiate)
+ notes - Notes to add to the description
+
+ Returns:
+ List of compressors
+ """
+
+ compressors = []
+
+ start_reg = REGISTERS[reg][0]
+ end_reg = REGISTERS[reg][-1]
+ # Exclude r15 from the interval, if instruction can write to reg.
+ if (end_reg.startswith('r15') and register_write != 'ignore' and
+ writes_to in ('reg', 'both')):
+ end_reg = REGISTERS[reg][-2]
+ start_rm = REGISTERS[rm][0]
+ end_rm = REGISTERS[rm][-1]
+ # Exclude r15 from the interval, if instruction can write to rm.
+ if (end_rm.startswith('r15') and register_write != 'ignore' and
+ writes_to in ('rm', 'both')):
+ end_rm = REGISTERS[rm][-2]
+
+ # Regex here matches everything AFTER the ModR/M (+ SIB) bytes.
+ regex = '(?: 00)*'
+ # Additional byte is opcode extension with 3DNow! instructions.
+ if is_3Dnow:
halyavin 2013/12/06 16:05:47 Write it as if else and replace "Additional byte"
khim 2013/12/06 16:28:50 Done.
+ regex = ' [0-9a-fA-F][0-9a-fA-F]'
+ # 2 spaces separate byte code from instruction name.
+ regex += ' '
+ # Optional "lock" prefix.
+ regex += '(?:lock )?'
+ # Instruction name.
+ regex += '\\w* '
+ # Immediate or implicit operand that can precede reg/rm pair.
+ regex += '(?:\\$0x0,|\\$0x0,\\$0x0,|%cl,|%xmm0,)?'
+ modrm_regex, modrm_sib_regex, regex_mem = MemRegex()
+ output = None
+ output_note = None
+ if writes_to == 'reg':
+ regex += '(%' + REGISTERS[rm][0] + '|' + regex_mem + ')'
+ regex += ',(%' + REGISTERS[reg][0] + ')'
+ if register_write == 'sandbox':
+ assert reg in ('eax', 'r8d')
+ output = '%' + reg + '|None'
+ output_note = '[%eax..%edi]' if reg == 'eax' else '[%r8d..%r14d]'
+ subst = (
+ 'XX', '[%{}..%{} or memory]'.format(start_rm, end_rm),
+ '[%{}..%{}]'.format(start_reg, end_reg))
+ subst_register = (
+ 'XX', '[%{}..%{}]'.format(start_rm, end_rm),
+ '[%{}..%{}]'.format(start_reg, end_reg))
+ subst_memory = (
+ 'XX', '[memory]',
+ '[%{}..%{}]'.format(start_reg, end_reg))
+ else:
+ regex += '(%' + REGISTERS[reg][0] + ')'
+ regex += ',(%' + REGISTERS[rm][0] + '|' + regex_mem + ')'
+ if register_write == 'sandbox':
+ assert rm in ('eax', 'r8d')
+ output = '%' + rm + '|None'
+ output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'
+ subst = (
+ 'XX', '[%{}..%{}]'.format(start_reg, end_reg),
+ '[%{}..%{} or memory]'.format(start_rm, end_rm))
+ subst_register = (
+ 'XX', '[%{}..%{}]'.format(start_reg, end_reg),
+ '[%{}..%{}]'.format(start_rm, end_rm))
+ subst_memory = (
+ 'XX', '[%{}..%{}]'.format(start_reg, end_reg),
+ '[memory]')
+ # Skip implicit arguments (if any)
+ regex += '.*'
+ if options.bitness == 64:
+ regex += '; input_rr=(%eax|%r8d|any_nonspecial)'
+ regex += '; output_rr=({})'.format(output)
+ if memory_accessed:
+ input_note = '[%eax..%edi]'
+ input_note_r8 = '[%r8d..%r15d]'
+ else:
+ input_note = 'any_nonspecial'
+ input_note_r8 = 'any_nonspecial'
+ subst_r8 = subst + (input_note_r8, output_note)
+ subst = subst + (input_note, output_note)
+ subst_memory_r8 = subst_memory + (input_note_r8, output_note)
+ subst_memory = subst_memory + (input_note, output_note)
+ subst_register = subst_register + ('any_nonspecial', output_note)
+ # With "or memory" or "memory" compressors we always can see where is
+ # reg and where is rm, but with reg to rm or rm to reg it's impossible.
+ # Add the appropriate comment
+ if writes_to == 'reg':
+ notes_register = ' # rm to reg'
+ else:
+ notes_register = ' # reg to rm'
+ if notes != '':
+ notes_register += '; ' + notes
+ notes = ' # ' + notes
+ subst = subst + (notes, )
+ subst_memory = subst_memory + (notes, )
+ subst_register = subst_register + (notes_register, )
+ if options.bitness == 64:
+ subst_r8 = subst_r8 + (notes, )
+ subst_memory_r8 = subst_memory_r8 + (notes, )
+ regex += '()$'
+ base_r8 = rm in r8.values()
+ memory_replacement = ModRMMemoryReplacements(
+ reg=reg, base_r8=base_r8, writes_to=writes_to,
+ memory_accessed=memory_accessed, register_write=register_write)
+ compressors.append(Compressor(
+ modrm_sib_regex + regex, subst_memory, memory_replacement))
+ if options.bitness == 64:
+ memory_replacement_r8 = ModRMMemoryReplacements(
+ reg=reg, base_r8=base_r8, index_r8=True, writes_to=writes_to,
+ memory_accessed=memory_accessed, register_write=register_write)
+ compressors.append(Compressor(
+ modrm_sib_regex + regex, subst_memory_r8, memory_replacement_r8))
+ # Instructions with no memory access are instructions which are doing
+ # something with memory address (e.g. lea) and as such they don't have
+ # non-memory forms.
+ if memory_accessed:
+ register_replacement = ModRMRegisterReplacements(
+ reg=reg, rm=rm, writes_to=writes_to, register_write=register_write)
+ compressors.append(Compressor(
+ modrm_regex + regex, subst_register, register_replacement))
+ main_replacement = register_replacement + memory_replacement
+ compressors.append(Compressor(
+ modrm_sib_regex + regex, subst, main_replacement))
+ if options.bitness == 64:
+ main_replacement_r8 = register_replacement + memory_replacement_r8
+ compressors.append(Compressor(
+ modrm_sib_regex + regex, subst_r8, main_replacement_r8))
+ return compressors
+
+
+def AllRegToRmCompressors():
+ """ Return list of all Reg to RM (and RM to Reg) compressors. """
+
+ compressors = []
+
+ for reg, rm in RegisterKindPairs():
+ instruction_kinds = [
+ # Normal instructions with two operands (rm to reg)
+ {'writes_to': 'reg'},
+ # Normal instructions with two operands (reg to rm)
+ {'writes_to': 'rm'}
+ ]
+ # In 64 bit mode we have many different types of instructions depending
+ # on whether we are accessing memory or whether we are writing to registers.
+ if options.bitness == 64:
+ # Lea in 64 bit mode is truly unique instruction for now
+ if reg in ('ax', 'r8w', 'eax', 'r8d', 'rax', 'r8'):
+ register_write = 'sandbox' if reg in ('eax', 'r8d') else 'protect'
+ instruction_kinds.append(
+ {'writes_to': 'reg',
+ 'memory_accessed': False,
+ 'register_write': register_write,
+ 'notes': 'lea'})
+ # There are few more forms in 64 bit case (rm to reg)
+ if reg in ('eax', 'r8d'):
+ # Zero-extending version.
+ instruction_kinds.append(
+ {'writes_to': 'reg',
+ 'register_write': 'sandbox'})
+ # More forms in 64 bit case (reg to rm)
+ if rm in ('eax', 'r8d'):
+ # Zero-extending version.
+ instruction_kinds.append(
+ {'writes_to': 'rm',
+ 'register_write': 'sandbox'})
+ # Zero-extending xchg/xadd
+ instruction_kinds.append(
+ {'writes_to': 'both',
+ 'register_write': 'sandbox',
+ 'notes': 'write to both'})
+ # Still more forms for 64 bit case (rm to reg).
+ if reg in ('al', 'spl', 'ax', 'eax', 'rax', 'r8b', 'r8w', 'r8d', 'r8'):
+ # Dangerous instructions (rm to reg)
+ instruction_kinds.append(
+ {'writes_to': 'reg',
+ 'register_write': 'protect'})
+ # Still more forms for 64 bit case (reg to rm)
+ if rm in ('al', 'spl', 'ax', 'eax', 'rax', 'r8b', 'r8w', 'r8d', 'r8'):
+ # Dangerous instructions (reg to rm)
+ instruction_kinds.append(
+ {'writes_to': 'rm',
+ 'register_write': 'protect'})
+ # Dangerous xchg/xadd
+ instruction_kinds.append(
+ {'writes_to': 'both',
+ 'register_write': 'protect',
+ 'notes': 'write to both'})
+ # 3DNow! instructions
+ if reg in ('mm0', 'mmalt') or rm in ('mm0', 'mmalt'):
+ instruction_kinds.append(
+ {'writes_to': 'reg',
+ 'is_3Dnow': True})
+ for instruction_kind in instruction_kinds:
+ compressors.extend(RegToRmCompressors(reg=reg, rm=rm, **instruction_kind))
+ return compressors
+
+
+def RmCompressors(rm, opcode_bits,
+ memory_accessed=True, register_write='ignore'):
+ """Returns a list of rm compressors for a given set of parameters.
+
+ Args:
+ rm: rm operand kind (see REGISTERS array)
+ memory_accessed: True if instruction accesses memory
+ register_write: three-state selector
+ 'sandbox' - instruction can be used to produce "restricted register"
+ 'protect' - instruction can damage output, protect "special registers"
+ 'ignore' - instruction does not affect it's operands (e.g. test) or
+ is used with non-GP registers (X87, MMX, XMM, etc)
+
+ Returns:
+ List of compressors
+ """
+ compressors = []
+
+ start_rm = REGISTERS[rm][0]
+ end_rm = REGISTERS[rm][-1]
+ # Exclude r15 from the interval, if instruction can write to rm.
+ if end_rm.startswith('r15') and register_write != 'ignore':
+ end_rm = REGISTERS[rm][-2]
+ byte_mark = 'XX/' + str(opcode_bits)
+
+ subst = (byte_mark, '[%{}..%{} or memory]'.format(start_rm, end_rm), '')
+ subst_register = (byte_mark, '[%{}..%{}]'.format(start_rm, end_rm), '')
+ subst_memory = (byte_mark, '[memory]', '')
+
+ # Regex here matches everything AFTER the ModR/M (+ SIB) bytes.
+ regex = '(?: 00)*'
+ # 2 spaces separate byte code from instruction name.
+ regex += ' '
+ # Optional "lock" prefix.
+ regex += '(?:lock )?'
+ # Instruction name.
+ regex += '\\w* '
+ # Immediate or implicit operand that can precede reg/rm pair.
+ regex += '(?:\\$0x0,|\\$0x0,|%cl,)?'
+ modrm_regex, modrm_sib_regex, regex_mem = MemRegex(opcode_bits)
+ # First register or memory access
+ regex += '(%' + REGISTERS[rm][0] + '|' + regex_mem + ').*'
+ output = None
+ output_note = None
+ if options.bitness == 64:
+ if register_write == 'sandbox':
+ assert rm in ('eax', 'r8d')
+ output = '%' + rm + '|None'
+ output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'
+ regex += '; input_rr=(%eax|%r8d|any_nonspecial)'
+ regex += '; output_rr=({})'.format(output)
+ if memory_accessed:
+ input_note = '[%eax..%edi]'
+ input_note_r8 = '[%r8d..%r15d]'
+ else:
+ input_note = 'any_nonspecial'
+ input_note_r8 = 'any_nonspecial'
+ subst_r8 = subst[0:-1] + (input_note_r8, output_note) + subst[-1:]
+ subst = subst[0:-1] + (input_note, output_note) + subst[-1:]
+ subst_memory_r8 = subst_memory[0:-1] + (
+ input_note_r8, output_note) + subst_memory[-1:]
+ subst_memory = subst_memory[0:-1] + (
+ input_note, output_note) + subst_memory[-1:]
+ subst_register = subst_register[0:-1] + (
+ 'any_nonspecial', output_note) + subst_register[-1:]
+ regex += '()'
+ base_r8 = rm in r8.values()
+ memory_replacement = ModRMMemoryReplacements(
+ reg=None, base_r8=base_r8, opcode_bits=opcode_bits,
+ memory_accessed=memory_accessed, register_write=register_write)
+ compressors.append(Compressor(
+ modrm_sib_regex + regex, subst_memory, memory_replacement))
+ if options.bitness == 64:
+ memory_replacement_r8 = ModRMMemoryReplacements(
+ reg=None, base_r8=base_r8, index_r8=True, opcode_bits=opcode_bits,
+ memory_accessed=memory_accessed, register_write=register_write)
+ compressors.append(Compressor(
+ modrm_sib_regex + regex, subst_memory_r8, memory_replacement_r8))
+ # Instructions with no memory access are instructions which are doing
+ # something with memory address (e.g. prefetch) and as such they don't
+ # have non-memory forms.
+ if memory_accessed:
+ register_replacement = ModRMRegisterReplacements(
+ reg=None, rm=rm, opcode_bits=opcode_bits, register_write=register_write)
+ compressors.append(Compressor(
+ modrm_regex + regex, subst_register, register_replacement))
+ main_replacement = register_replacement + memory_replacement
+ compressors.append(Compressor(
+ modrm_sib_regex + regex, subst, main_replacement))
+ if options.bitness == 64:
+ main_replacement_r8 = register_replacement + memory_replacement_r8
+ compressors.append(Compressor(
+ '.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8) + regex,
+ subst_r8, main_replacement_r8))
+ return compressors
+
+
+def AllRmCompressors():
+ """ Return list of all RM (with reg as opcode extension) compressors. """
+ compressors = []
+
+ for rm in RegisterKinds():
+ for opcode_bits in xrange(8):
+ instruction_kinds = [
+ # The most basic form
+ {}
+ ]
+ # In 64 bit mode we have many different types of instructions depending
+ # on whether we are accessing memory or whether we are writing to
+ # registers.
+ if options.bitness == 64:
+ # No memory access (e.g. prefetch)
+ instruction_kinds.append(
+ {'memory_accessed':False})
+ # More forms in 64 bit case.
+ if rm in ('eax', 'r8d'):
+ # Zero-extending version.
+ instruction_kinds.append(
+ {'register_write':'sandbox'})
+ # Still more forms for 64 bit case (reg to rm).
+ if rm in ('al', 'spl', 'ax', 'eax', 'rax',
+ 'r8b', 'r8w', 'r8d', 'r8'):
+ # Dangerous instructions.
+ instruction_kinds.append(
+ {'register_write':'protect'})
+ for instruction_kind in instruction_kinds:
+ compressors.extend(RmCompressors(
+ rm=rm, opcode_bits=opcode_bits, **instruction_kind))
+ return compressors
+
+
+def AllOpcodeCompressors():
+ """ Return "register in opcode" compressors. """
+ compressors = []
+
+ for reg in RegisterKinds() + ('st(0)',):
+ for opcode in xrange(8):
+ for text1, text2, nibble in (
+ ('[0..7]', '[8..f]', xrange(8)),
+ ('[012367]', '[89abef]', (0, 1, 2, 3, 6, 7)),
+ ('[0..6]', '[8..e]', xrange(7))
+ ):
+ start_reg = REGISTERS[reg][0]
+ end_reg = REGISTERS[reg][-1 if 7 in nibble else -2]
+ subst_regs = '[%{}..%{}]'.format(start_reg, end_reg)
+ # Note that we use instruction with 1st register, not 0th register
+ # here to avoid ambiguity when opcode is 0x00
+ compressors.append(Compressor(
+ '.*?[0-9a-fA-F](1)(?: 00)*'
+ ' \\w* (?:\\$0x0,|%ax,|%st,)?'
halyavin 2013/12/06 16:30:38 Use raw literals when you are escaping '\\'
khim 2013/12/08 23:23:05 Done.
+ '(%' + REGISTERS_RE[reg][1] + ').*()',
+ (text1, subst_regs, ''),
+ tuple(('{:x}'.format(n), '%' + REGISTERS[reg][n])
+ for n in nibble)))
+ compressors.append(Compressor(
+ '.*?[0-9a-fA-F](8)(?: 00)*'
+ ' \\w* (?:\\$0x0,|%ax,|%st,)?'
+ '(%' + REGISTERS_RE[reg][0] + ').*()',
+ (text2, subst_regs, ''),
+ tuple(('{:x}'.format(n + 8), '%' + REGISTERS[reg][n])
+ for n in nibble)))
+ # Another version for 64 bit case
+ if options.bitness == 64 and reg in ('eax', 'r8d'):
+ compressors.append(Compressor(
+ '.*?[0-9a-fA-F](1)(?: 00)*'
+ ' \\w* (?:\\$0x0,|%ax,|%st,)?'
+ '(%' + REGISTERS_RE[reg][1] + ').*'
+ 'output_rr=(%'+ REGISTERS_RE[reg][1] + ').*()',
+ (text1, subst_regs, subst_regs, ''),
+ tuple(['{:x}'.format(n)] + ['%' + REGISTERS[reg][n]] * 2
+ for n in nibble)))
+ compressors.append(Compressor(
+ '.*?[0-9a-fA-F](8)(?: 00)*'
+ ' \\w* (?:\\$0x0,|%ax,|%st,)?'
+ '(%' + REGISTERS_RE[reg][0] + ').*'
+ 'output_rr=(%'+ REGISTERS_RE[reg][0] + ').*()',
+ (text2, subst_regs, subst_regs, ''),
+ tuple(['{:x}'.format(n + 8)] + ['%' + REGISTERS[reg][n]] * 2
+ for n in nibble)))
+ return compressors
+
+
+def AllMemoryNonMemoryCompressors():
+ """ Return memory/nonmemory compressors. """
+
+ compressors = []
+
+ if options.bitness == 32:
+ letters_and_registers = (('b', 'al', ''), ('w', 'ax', ''), ('l', 'eax', ''))
+ else:
+ letters_and_registers = (
+ ('b', 'al', 'eax'), ('b', 'spl', 'eax'), ('b', 'r8b', 'r8d'),
+ ('w', 'ax', 'eax'), ('w', 'r8w', 'r8d'),
+ ('l', 'eax', 'eax'), ('l', 'r8d', 'r8d'),
+ ('q', 'rax', 'eax'), ('q', 'r8', 'r8d')
+ )
+ for letter, reg, out_reg in letters_and_registers:
+ start_reg = REGISTERS[reg][0]
+ if reg[0:2] == 'r8':
+ end_regs = (REGISTERS[reg][-2], REGISTERS[reg][-1])
+ else:
+ end_regs = (REGISTERS[reg][-1], )
+ for end_reg in end_regs:
+ all_regs = '[%{}..%{}]'.format(start_reg, end_reg)
+ regs_mark = '[%{}..%{} or memory]'.format(start_reg, end_reg)
+ if options.bitness == 64:
+ start_out = REGISTERS[out_reg][0]
+ end_out = REGISTERS[out_reg][-1 if out_reg[0:2] != 'r8' else -2]
+ out_regs = '[%{}..%{}]'.format(start_out, end_out)
+ for notes in ('', ' # rm to reg', ' # reg to rm'):
+ compressors.append(Compressor(
+ '.* \\w*(' + letter + ') .*(\\[memory]).*()()',
+ ('[{}]?'.format(letter), regs_mark, '', ''),
+ ((letter, '[memory]', ''), ('', all_regs, notes))))
+ if options.bitness == 64:
+ for index_reg in ('eax', 'r8d'):
+ start_index = REGISTERS[index_reg][0]
+ end_index = REGISTERS[index_reg][-1]
+ index_regs = '[%{}..%{}]'.format(start_index, end_index)
+ for output_rrs in ((None, out_regs),
+ (out_regs, None),
+ (None, None)):
+ compressors.append(Compressor(
+ '.* \\w*(' + letter + ') .*(\\[memory]).*; '
+ 'input_rr=(\\[%[a-z0-9]*..%[a-z0-9]*\\]); '
+ 'output_rr=(\\[%[a-z0-9]*..%[a-z0-9]*\\]|None)()()',
+ ('[{}]?'.format(letter), regs_mark, index_regs,
+ output_rrs[1] if output_rrs[0] is None else output_rrs[0],
+ '', ''),
+ ((letter, '[memory]', index_regs, output_rrs[0], ''),
+ ('', all_regs, 'any_nonspecial', output_rrs[1], notes))))
+ return compressors
+
+
+def RexCompressor(rm, rmR15, reg, regR15, rexw, input_rr, output_rr, rm_to_reg):
+ """ Return REX compressor (or nothing) for a given set of paramenters.
+
+ Args:
+ rm: rm operand kind (see REGISTERS array) or None if rm is not used
+ reg: reg operand kind (see REGISTERS array) or None if reg is not used
+ rmR15: True if R15 register is included
+ regR15: True if R15 register is included
+ rexw: True if REX.W should be set
+ input_rr: True if input_rr is used
+ output_rr: true if output_rr is used
+ rm_to_reg: True if direction is rm to reg
+ """
+
+ # reg and rm can be of three different possible intervals
+ # start_reg/rm to end_reg/rm (e.g. from %al to %bh)
+ # start_reg/rm to end_reg0/rm0 (e.g from %al to %dil)
+ # start_reg8/rm8 to end_reg8/rm8 (e.g. from %r8b to %r15b)
+ # First form can be observed if there are not REX, second form
+ # if REX.R/REX.B is not set and third form is where it's set.
+ if reg:
+ start_reg = REGISTERS[reg][0]
+ start_reg8 = REGISTERS[r8[reg]][0]
+ end_reg = REGISTERS[reg][-1]
+ end_reg0 = 'dil' if reg == 'al' else end_reg
+ end_reg8 = REGISTERS[r8[reg]][-1 if regR15 else -2]
+ if rexw:
+ reg_regex = '\\[(%' + start_reg + '\\.\\.%' + end_reg0 + ')]'
+ else:
+ reg_regex = '\\[(%' + start_reg + '\\.\\.%' + end_reg + ')]'
+ if rm:
+ start_rm = REGISTERS[rm][0]
+ start_rm8 = REGISTERS[r8[rm]][0]
+ end_rm = REGISTERS[rm][-1]
+ end_rm0 = 'dil' if rm == 'al' else end_rm
+ end_rm8 = REGISTERS[r8[rm]][-1 if rmR15 else -2]
+ if rexw:
+ rm_regex = ('\\[(%' + start_rm + '\\.\\.%' + end_rm0 + ')'
+ '(?: or memory)?]')
+ else:
+ rm_regex = ('\\[(%' + start_rm + '\\.\\.%' + end_rm + ')'
+ '(?: or memory)?]')
+
+ # Legacy prefixes
+ regex = '.*:(?: 26| 2e| 36| 3e| 64| 65| 66| 67| f0| f2| f3)*'
+ # REX
+ regex += '( 48).*' if rexw else '( 40|).*'
+ # Replacement text
+ replacement_tuple = (' [REX:48..4f]' if rexw else ' [REX:40..47]?', )
+ if reg: replacement_regs = '%{}..%{}'.format(start_reg, end_reg8)
+ if rm: replacement_rms = '%{}..%{}'.format(start_rm, end_rm8)
+ # Instruction arguments
+ if not reg and not rm:
+ pass
+ elif not reg and rm:
+ regex += rm_regex + '.*'
+ replacement_tuple += (replacement_rms, )
+ elif reg and not rm:
+ regex += reg_regex + '.*'
+ replacement_tuple += (replacement_regs, )
+ elif rm_to_reg:
+ regex += rm_regex + ',' + reg_regex + '.*'
+ replacement_tuple += (replacement_rms, replacement_regs)
+ else:
+ regex += reg_regex + ',' + rm_regex + '.*'
+ replacement_tuple += (replacement_regs, replacement_rms)
+ # Input and output restricted registers
+ if input_rr:
+ regex += 'input_rr=\\[(%eax\\.\\.%edi)].*'
+ replacement_tuple += ('%eax..%r15d', )
+ if output_rr:
+ regex += 'output_rr=\\[(%eax\\.\\.%edi)].*'
+ replacement_tuple += ('%eax..%r14d', )
+ regex += '()'
+ replacement_tuple += ('', )
+ # Replacement cases
+ replacement_tuples = ()
+ for byte in (range(0x48, 0x50) if rexw else range(0x40, 0x48) + [0]):
+ replacement_case = (' {:02x}'.format(byte) if byte else '', )
+ if rm:
+ if byte & 0x1:
+ replacement_rms = '%{}..%{}'.format(start_rm8, end_rm8)
+ elif byte:
+ replacement_rms = '%{}..%{}'.format(start_rm, end_rm0)
+ else:
+ replacement_rms = '%{}..%{}'.format(start_rm, end_rm)
+ if byte & 0x2:
+ replacement_index = '%r8d..%r15d'
+ else:
+ replacement_index = '%eax..%edi'
+ if reg:
+ if byte & 0x4:
+ replacement_regs = '%{}..%{}'.format(start_reg8, end_reg8)
+ elif byte:
+ replacement_regs = '%{}..%{}'.format(start_reg, end_reg0)
+ else:
+ replacement_regs = '%{}..%{}'.format(start_reg, end_reg)
+ if not reg and not rm:
+ pass
+ elif not reg and rm:
+ replacement_case += (replacement_rms, )
+ final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'
+ elif reg and not rm:
+ replacement_case += (replacement_regs, )
+ final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'
+ elif rm_to_reg:
+ replacement_case += (replacement_rms, replacement_regs)
+ final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'
+ else:
+ replacement_case += (replacement_regs, replacement_rms)
+ final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'
+ if input_rr: replacement_case += (replacement_index, )
+ if output_rr: replacement_case += (final_rr, )
+ replacement_tuples += (replacement_case, )
+ return Compressor(regex, replacement_tuple, replacement_tuples)
+
+
+def AllRexCompressors():
+ """ Return "REX" compressors which combine different REX prefixes. """
+
+ if options.bitness != 64:
+ return []
+
+ compressors = []
+
+ # First pretty complex set of compressors to combine versions of REX with
+ # three lowest bits in different states.
+ register_kind_pairs = (
+ ( None, None),
+ ( 'al', 'al'), ( 'al', None), (None, 'al'),
+ ( 'ax', 'al'), ( 'al', 'ax'),
+ ( 'ax', 'ax'), ( 'ax', None), (None, 'ax'),
+ ( 'eax', 'al'), ( 'al', 'eax'),
+ ( 'eax', 'ax'), ( 'ax', 'eax'),
+ ( 'eax', 'eax'), ( 'eax', None), (None, 'eax'),
+ ( 'rax', 'al'), ( 'al', 'rax'),
+ ( 'rax', 'ax'), ( 'ax', 'rax'),
+ ( 'rax', 'eax'), ( 'eax', 'rax'),
+ ( 'rax', 'rax'), ( 'rax', None), (None, 'rax'),
+ ( 'eax', 'mm0'), ( 'mm0', 'eax'),
+ ( 'rax', 'mm0'), ( 'mm0', 'rax'),
+ ( 'mm0', 'eax'), ( 'eax', 'mm0'),
+ ( 'mm0', 'rax'), ( 'rax', 'mm0'),
+ ( 'eax', 'xmm0'),
+ ( 'rax', 'xmm0'),
+ ('xmm0', 'eax'),
+ ('xmm0', 'rax'),
+ ( 'mm0', 'mm0'), ( 'mm0', None), (None, 'mm0'),
+ ( 'mm0', 'xmm0'),
+ ('xmm0', 'mm0'),
+ ('xmm0', 'xmm0'),
+ ('xmm0', 'ymm0'), ('xmm0', None), (None, 'xmm0'),
+ ('ymm0', 'xmm0'),
+ ('ymm0', 'ymm0'), ('ymm0', None), (None, 'ymm0'),
+ )
+ for reg, rm in register_kind_pairs:
+ for regR15 in (True, False):
+ for rmR15 in (True, False):
+ for rexw in (True, False):
+ for input_rr in (True, False):
+ for output_rr in (True, False):
+ for rm_to_reg in (True, False):
+ # These combinations will just produce useless duplicates
+ if not reg and not regR15: continue
+ if not rm and not rmR15: continue
+ if not reg and not rm and (output_rr or rm_to_reg): continue
+ compressors.append(RexCompressor(
+ rm=rm, rmR15=rmR15, reg=reg, regR15=regR15,
+ rexw=rexw, input_rr=input_rr, output_rr=output_rr,
+ rm_to_reg=rm_to_reg))
+
+ # This is pretty simple compressor to combine two lines with different REX.W
+ # bits (only if they are otherwise identical).
+ compressors.append(Compressor(
+ '.*(\\[REX:40\\.\\.47]\\?).*()', ('[REX:40..4f]?', ''),
+ (('[REX:40..47]?', ), ('[REX:48..4f]', ))))
+ return compressors
+
+
+def AllSpecialCompressors():
+ """ Return all "special" compressors. """
+
+ compressors = []
+
+ # Special compressors: will handle some cosmetic issues.
+ #
+ # SETxx ignores reg field and thus are described as many separate instructions
+ compressors.append(Compressor(
+ '.*0f 9[0-9a-fA-F] XX(/[0-7]) set.*()', ('', ''),
+ [('/' + str(i), ) for i in range(8)]))
+ # BSWAP is described with opcode "0f c8+r", not "0f /1" in manual
+ if options.bitness == 32:
+ compressors.append(Compressor(
+ '.*(XX/1) bswap.*ax.*()', ('c[8..f]', ''), [('XX/1', )]))
+ else:
+ compressors.append(Compressor(
+ '.*(XX/1) bswap.*ax.*()', ('c[89abef]', ''), [('XX/1', )]))
+ compressors.append(Compressor(
+ '.*(XX/1) bswap.*r8.*()', ('c[8..e]', ''), [('XX/1', )]))
+ # Add mark '# write to both' to certain versions of CMPXCHG, XADD, and XCHG
+ if options.bitness == 64:
+ compressors.append(Compressor(
+ '.* (?:cmpxchg|xadd|xchg).*%al\\.\\.%bh[^#]*()$',
+ (' # write to both', ), ((), )))
+ # "and $0xe0,[%eax..%edi]" is treated specially which means that we list all
+ # versions of and "[$0x1..$0xff],[%eax..%edi]" separately here.
+ # Without this rule these ands comprise 2/3 of the whole output!
+ if options.bitness == 32:
+ compressors.append(Compressor(
+ '.*83 (e0 01 and \\$0x1,%eax)()',
+ ('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', ' # special and'),
+ [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS['eax'][r]), )
+ for i in range(0x01, 0x100) for r in range(8)] +
+ [('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', )]))
+ else:
+ for reg in ('eax', 'r8d'):
+ start_reg = REGISTERS[reg][0]
+ end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]
+ for index_reg in ('eax', 'r8d'):
+ start_index = REGISTERS[index_reg][0]
+ end_index = REGISTERS[index_reg][-1]
+ compressors.append(Compressor(
+ '.*83 (e0 01 and \\$0x1,%' + reg + ').*'
+ 'input_rr=(any_nonspecial); output_rr=(%' + reg + ')()',
+ ('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,
+ end_reg), '[%{}..%{}]'.format(start_index, end_index),
+ '[%{}..%{}]'.format(start_reg, end_reg),
+ ' # special and'),
+ [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS[reg][r]),
+ 'any_nonspecial', '%' + REGISTERS[reg][r])
+ for i in range(0x01, 0x100) for r in range(7 + (reg == 'eax'))] +
+ [('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,
+ end_reg), '[%{}..%{}]'.format(start_index, end_index),
+ '[%{}..%{}]'.format(start_reg, end_reg))]))
+
+ # "and $e0" and similar are used to align %rsp. All negative values are
+ # accepted by validator and there are 127 of these.
+ # Consolidate them into one line.
+ if options.bitness == 64:
+ compressors.append(Compressor(
+ '.*(?:81|83) (?:e4|e5) (80) (?:00 00 00 |) and \\$0x(80),%r[bs]p.*()',
+ ('[80..ff]', '[80..ff]', ' # alignment and'),
+ [('{:02x}'.format(i), '{:02x}'.format(i)) for i in range(0x80, 0x100)]))
+ return compressors
+
+
+def PrepareCompressors():
+ """ Return list of all compressors sorted from bigger ones to smaller ones """
+
+ return tuple(sorted(
+ AllRegToRmCompressors() +
+ AllRmCompressors() +
+ AllOpcodeCompressors() +
+ AllMemoryNonMemoryCompressors() +
+ AllRexCompressors() +
+ AllSpecialCompressors(),
+ key=lambda compressor: -len(compressor.replacements)))
+
+
+def ShowProgress(rule, instruction):
+ if rule not in ShowProgress.rules_shown:
+ first_print = True
+ ShowProgress.rules_shown[rule]=len(ShowProgress.rules_shown)
+ else:
+ first_print = False
+ print >> sys.stderr, '-------- Compressed --------'
+ print >> sys.stderr, 'Rule:', ShowProgress.rules_shown[rule]
+ print >> sys.stderr, '--------'
+ compressor = compressors[rule]
+ match = compressor.regex.match(instruction)
+ assert match
+ format_str = CompressionTemplate(instruction, match, '{{{}}}')
+ replacements = sorted(format_str.format(*replacement)
+ for replacement in compressor.replacements)
+ if len(compressor.replacements) <= 4 or first_print:
+ for replacement in replacements:
+ print >> sys.stderr, replacement
+ else:
+ print >> sys.stderr, replacements[0]
+ print >> sys.stderr, '...'
+ print >> sys.stderr, replacements[-1]
+ print >> sys.stderr, '--------'
+ print >> sys.stderr, 'Compressed', (
+ format_str + '{{{}}}').format(*compressor.subst)
+ShowProgress.rules_shown = {}
+
+
+def main():
+ # We are keeping these global to share state graph and compressors
+ # between workers spawned by multiprocess. Passing them every time is slow.
+ global options, xml_file
+ global dfa
+ global worker_validator
+ options, xml_file = ParseOptions()
+ dfa = dfa_parser.ParseXml(xml_file)
+ worker_validator = validator.Validator(
+ validator_dll=options.validator_dll,
+ decoder_dll=options.decoder_dll)
+ global compressors
+ compressors = PrepareCompressors()
+
+ assert dfa.initial_state.is_accepting
+ assert not dfa.initial_state.any_byte
+
+ print >> sys.stderr, len(dfa.states), 'states'
+
+ num_suffixes = dfa_traversal.GetNumSuffixes(dfa.initial_state)
+
+ # We can't just write 'num_suffixes[dfa.initial_state]' because
+ # initial state is accepting.
+ total_instructions = sum(
+ num_suffixes[t.to_state]
+ for t in dfa.initial_state.forward_transitions.values())
+ print >> sys.stderr, total_instructions, 'regular instructions total'
+
+ tasks = dfa_traversal.CreateTraversalTasks(dfa.states, dfa.initial_state)
+ print >> sys.stderr, len(tasks), 'tasks'
+
+ pool = multiprocessing.Pool()
+
+ results = pool.imap(Worker, tasks)
+
+ total = 0
+ num_valid = 0
+ full_output = set()
+ for prefix, count, valid_count, output, trace in results:
+ print >> sys.stderr, 'Prefix:', ', '.join(map(hex, prefix))
+ total += count
+ num_valid += valid_count
+ full_output |= output
+ for rule, instruction in trace:
+ ShowProgress(rule, instruction)
+ for instruction in sorted(Compressed(full_output,
+ compressors,
+ ShowProgress)):
+ print instruction
+
+ print >> sys.stderr, total, 'instructions were processed'
+ print >> sys.stderr, num_valid, 'valid instructions'
+
+
+if __name__ == '__main__':
+ main()
« no previous file with comments | « no previous file | src/trusted/validator_ragel/testdata/32bit_regular.golden » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698