src/trusted/validator_ragel/compress_regular_instructions.py - Issue 49183002: Regular instructions golden file test.

Side by Side Diff: src/trusted/validator_ragel/compress_regular_instructions.py

Issue 49183002: Regular instructions golden file test. Base URL: svn://svn.chromium.org/native_client/trunk/src/native_client/

Patch Set: Created 7 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 # Copyright (c) 2013 The Native Client Authors. All rights reserved.

	2 # Use of this source code is governed by a BSD-style license that can be

	3 # found in the LICENSE file.

	4

	5 """

	6 Traverse the validator's DFA, collect all "normal" instructions and then

	7 compress output. Note: "anybyte fields" (immediates and displacements)

	8 are always filled with zeros. Otherwise processing of sextillions (sic!)

	9 of possibilities will take too long.

	10

	11 Each rule is applied only when all variants are accepted by validator.

	12 The following compression rules are present:

	13

	14 1. Compress ModR/M (+SIB & displacement).

	15 Instruction: 00 00 add %al,(%rax)

	16 ...

	17 Instruction: 00 ff add %bh,%bh

	18 becomes

	19 Instruction: 00 XX add [%al..%bh],[%al..%bh or memory]

	20

	21 1a. Compress ModR/M (+SIB & displacement) memory-only.

	22 Instruction: f0 01 00 lock add %eax,(%eax)

	23 ...

	24 Instruction: f0 01 bf 00 00 00 00 lock add %edi,0x0(%edi)

	25 becomes

	26 Instruction: f0 01 XX lock add [%eax..edi],[memory]

	27

	28 1b. Compress ModR/M register only.

	29 Instruction: 66 0f 50 c0 movmskpd %xmm0,%eax

	30 ...

	31 Instruction: 66 0f 50 ff movmskpd %xmm7,%edi

	32 becomes

	33 Instruction: 66 0f 50 XX movmskpd [%xmm0..%xmm7],[%eax..edi]

	34

	35 2. Compress ModR/M (+SIB & displacement) with opcode extension.

	36 Instruction: 0f 90 00 seto (%eax)

	37 ...

	38 Instruction: 0f 90 c7 seto %bh

	39 becomes

	40 Instruction: 0f 90 XX/0 seto [%al..%bh or memory]

	41

	42 2a. Compress ModR/M (+SIB & displacement) memory-only with opcode extension.

	43 Instruction: f0 ff 00 lock incl (%eax)

	44 ...

	45 Instruction: f0 ff 84 ff 00 00 00 00 lock incl 0x0(%edi,%edi,8)

	46 becomes

	47 Instruction: f0 ff XX/0 lock incl [memory]

	48

	49 2b. Compress ModR/M register-only with opcode extension.

	50 Instruction: 0f 71 d0 00 psrlw $0x0,%mm0

	51 ...

	52 Instruction: 0f 71 d7 00 psrlw $0x0,%mm7

	53 becomes

	54 Instruction: 0f 71 XX/2 00 psrlw $0x0,[%mm0..%mm7]

	55

	56 3. Compress register-in-opcode.

	57 Instruction: d9 c0 fld %st(0)

	58 ...

	59 Instruction: d9 c7 fld %st(7)

	60 becomes

	61 Instruction: d9 c[0..7] fld [%st(0)..%st(7)]

	62

	63 Only applies if all possible register accesses are accepted by validator.

	64

	65 4. Special compressor for "set" instruction.

	66 Instruction: 0f 90 XX/0 seto [%al..%bh or memory]

	67 ...

	68 Instruction: 0f 90 XX/7 seto [%al..%bh or memory]

	69 becomes

	70 Instruction: 0f 90 XX seto [%al..%bh or memory]

	71 """

	72

	73 import itertools

	74 import multiprocessing

	75 import optparse

	76 import os

	77 import re

	78 import subprocess

	79 import sys

	80 import tempfile

	81 import traceback

	82

	83 import dfa_parser

	84 import dfa_traversal

	85 import validator

	86

	87

	88 # Register names in 'natural' order (as defined by IA32/x86-64 ABI)

	89 #

	90 # X86-64 ABI splits all registers in groups of 8 because it uses 3-bit field

	91 # in opcode, ModR/M, and/or SIB bytes to encode them.

	92 #

	93 # In most cases there are 16 registers of a given kind and two such groups,

	94 # but there are couple of exceptions:

	95 # 1. There are 20 8-bit registers and three groups (two of them overlap)

	96 # 2. There are eight X87 and MMX registers thus two groups are identical

	97 #

	98 # We use typical register from a group to name the whole group. Most groups

	99 # use first register, but 'spl' group uses fifth register because its first

	100 # four registers are the same as 'al' group. We use mnemonic name 'mmalt'

	101 # to represent the "evil mirror" of the 'mm0' group.

	102 REGISTERS = {

	103 'al': [ 'al', 'cl', 'dl', 'bl', 'ah', 'ch', 'dh', 'bh' ],

	104 'spl': [ 'al', 'cl', 'dl', 'bl', 'spl', 'bpl', 'sil', 'dil' ],

	105 'ax': [ 'ax', 'cx', 'dx', 'bx', 'sp', 'bp', 'si', 'di' ],

	106 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'esp', 'ebp', 'esi', 'edi' ],

	107 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'rsp', 'rbp', 'rsi', 'rdi' ],

	108 'r8b': [ 'r{}b'.format(N) for N in range(8,16) ],

	109 'r8w': [ 'r{}w'.format(N) for N in range(8,16) ],

	110 'r8d': [ 'r{}d'.format(N) for N in range(8,16) ],

	111 'r8': [ 'r{}'.format(N) for N in range(8,16) ],

	112 'mm0': [ 'mm{}'.format(N) for N in range(8) ],

	113 'mmalt': [ 'mm{}'.format(N) for N in range(8) ],

	114 'st(0)': [ 'st({})'.format(N) for N in range(8) ],

	115 'xmm0': [ 'xmm{}'.format(N) for N in range(8) ],

	116 'xmm8': [ 'xmm{}'.format(N) for N in range(8,16) ],

	117 'ymm0': [ 'ymm{}'.format(N) for N in range(8) ],

	118 'ymm8': [ 'ymm{}'.format(N) for N in range(8,16) ]

	119 }

	120

	121

	122 NOP = 0x90

	123

	124

	125 def PadToBundleSize(bytes):

	126 assert len(bytes) <= validator.BUNDLE_SIZE

	127 return bytes + [NOP] * (validator.BUNDLE_SIZE - len(bytes))

	128

	129

	130 # In x86-64 mode we have so-called 'restricted register' which is used to

	131 # tie two groups together. Some instructions require particular value to

	132 # be stored in this variable, while some accept any non-special restricted

	133 # register (%ebp and %esp are special because they can only be accepted by

	134 # a few 'special' instructions).

	135 #

	136 # You can find more details in the "NaCl SFI model on x86-64 systems" manual.

	137 #

	138 # We try to feed all possible 'restricted registers' into validator and then

	139 # classify the instruction using this map. If set of acceptable 'restricted

	140 # registers' is not here, then it's an error in validator.

	141 ACCEPTABLE_X86_64_INPUTS = {

	142 0x00001: 'input_rr=%eax',

	143 0x00002: 'input_rr=%ecx',

	144 0x00004: 'input_rr=%edx',

	145 0x00008: 'input_rr=%ebx',

	146 0x00010: 'input_rr=%esp',

	147 0x00020: 'input_rr=%ebp',

	148 0x00040: 'input_rr=%esi',

	149 0x00080: 'input_rr=%edi',

	150 0x00100: 'input_rr=%r8d',

	151 0x00200: 'input_rr=%r9d',

	152 0x00400: 'input_rr=%r10d',

	153 0x00800: 'input_rr=%r11d',

	154 0x01000: 'input_rr=%r12d',

	155 0x02000: 'input_rr=%r13d',

	156 0x04000: 'input_rr=%r14d',

	157 0x08000: 'input_rr=%r15d',

	158 0x1ffcf: 'input_rr=any_nonspecial'

	159 }

	160

	161 # Any instruction must produce either None or one of fifteen registers as an

	162 # output 'restricted register' value. 'r15d' is NOT acceptable as an output.

	163 ACCEPTABLE_X86_64_OUTPUT_REGISTERS = tuple(

	164 '%' + reg for reg in (REGISTERS['eax'] + REGISTERS['r8d'])[0:-1])

	165

	166

	167 def ValidateInstruction(instruction, validator_inst):

	168 bundle = ''.join(map(chr, PadToBundleSize(instruction)))

	169 if options.bitness == 32:

	170 result = validator_inst.ValidateChunk(bundle, bitness=32)

	171 return result, []

	172 else:

	173 valid_inputs = 0

	174 known_final_rr = None

	175 output_rr = None

	176 # Note that iteration order is aligned with ACCEPTABLE_X86_64_INPUTS array

	177 # above.

	178 for bit, initial_rr in enumerate(validator.ALL_REGISTERS + [None]):

	179 valid, final_rr = validator_inst.ValidateAndGetFinalRestrictedRegister(

	180 bundle, len(instruction), initial_rr)

	181 if valid:

	182 # final_rr should not depend on input_rr

	183 assert valid_inputs == 0 or known_final_rr == final_rr

	184 valid_inputs \|= 1 << bit

	185 known_final_rr = final_rr

	186 # If nothing is accepted then instruction is not valid. Easy and simple.

	187 if valid_inputs == 0: return False, []

	188 # If returned value in unacceptable we'll get IndexError here and this

	189 # test will fail

	190 if known_final_rr is not None:

	191 output_rr = ACCEPTABLE_X86_64_OUTPUT_REGISTERS[known_final_rr]

	192 # If collected valid_inputs are unacceptable we'll get KeyError here and

	193 # this test will fail

	194 return True, [ACCEPTABLE_X86_64_INPUTS[valid_inputs],

	195 'output_rr={}'.format(output_rr)]

	196

	197

class WorkerState(object):
  """ Accumulates validated, disassembled instructions for one worker.

  Attributes are plain counters and containers filled in by
  ReceiveInstruction and RecordTrace.
  """

  def __init__(self, prefix, validator):
    # prefix is accepted but not stored.
    self.total_instructions = 0   # every instruction seen
    self.num_valid = 0            # instructions accepted by the validator
    self.validator = validator
    self.output = set()           # textual form of accepted instructions
    self.trace = []               # (compressor_nr, instruction) pairs


  def ReceiveInstruction(self, bytes):
    """ Validate one instruction and, if accepted, record its disassembly.

    Args:
      bytes: list of byte values of the instruction
    """
    self.total_instructions += 1
    result, notes = ValidateInstruction(bytes, self.validator)
    if result:
      self.num_valid += 1
      dis = self.validator.DisassembleChunk(
          ''.join(map(chr, bytes)),
          bitness=options.bitness)
      # Each entry must stringify as 'Instruction(0x<N>: ...)'; strip that
      # 17-character wrapper and the closing parenthesis.
      for line_nr in xrange(len(dis)):
        dis[line_nr] = str(dis[line_nr])
        assert dis[line_nr][0:17] == 'Instruction(0x' + str(line_nr) + ': '
        assert dis[line_nr][-1:] == ')'
        dis[line_nr] = dis[line_nr][17:-1]
      # If %rip is involved then comment will be different depending on the
      # instruction length.  Eliminate it.
      # NOTE(review): as written the pattern removes ' # 0x', exactly one
      # space and exactly one hex digit.  It looks like '*' quantifiers may
      # have been lost - confirm against the real disassembler output.
      if '(%rip)' in dis[0]:
        dis[0] = re.sub(' # 0x[ ][0-9a-fA-F]', '', dis[0])
      # Zero displacements are represented as 0x0 for all instructions except
      # jumps where they disassembled as non-zero due to %eip/%rip-relative
      # addressing.  We replace this displacement with %eip/%rip to simplify
      # compression.
      if ' 0x' in dis[0] and ' 0x0' not in dis[0]:
        # The loop variable reuses (and overwrites) the 'bytes' parameter,
        # which is no longer needed at this point.  Each candidate length is
        # tried as both the number of hex byte pairs and the hex target.
        for bytes in xrange(1, 16):
          dis[0] = re.sub(
              '(' + '(?:[0-9a-fA-F][0-9a-fA-F] ){' + str(bytes) + '} .* )' +
              hex(bytes) + '(.*)',
              '\\1%eip\\2' if options.bitness == 32 else '\\1%rip\\2',
              dis[0]);
      dis[0] = 'Instruction: ' + dis[0]
      # Validator notes (input_rr/output_rr in 64-bit mode) become extra
      # '; '-separated fields of the recorded line.
      dis += notes
      self.output.add('; '.join(dis))


  def RecordTrace(self, compressor_nr, instruction):
    """ Remember which compressor was applied to which instruction. """
    self.trace.append((compressor_nr, instruction))

	242

	243

	244 # Compressor has three slots: regex (which picks apart given instruction),

	245 # subst (which is used to denote compressed version) and replacements (which

	246 # are used to generate set of instructions from a given code).

	247 #

	248 # Example compressor:

	249 # regex = '.*?[0-9a-fA-F]([0-7]) \\w* (%e(?:[abcd]x|[sb]p|[sd]i)).*()'

	250 # subst = ('[0-7]', '[%eax..%edi]', ' # register in opcode')

	251 # replacements = ((0, '%eax'), (1, '%ecx'), (2, '%edx'), (3, '%ebx'),

	252 # (4, '%esp'), (5, '%ebp'), (6, '%esi'), (7, '%edi'))

	253 #

	254 # When faced with instruction '40 inc %eax' it will capture the following

	255 # pieces of said instruction: '4[0] inc [%eax]'.

	256 #

	257 # Then it will produce the following eight instructions:

	258 # '40 inc %eax'

	259 # '41 inc %ecx'

	260 # '42 inc %edx'

	261 # '43 inc %ebx'

	262 # '44 inc %esp'

	263 # '45 inc %ebp'

	264 # '46 inc %esi'

	265 # '47 inc %edi'

	266 #

	267 # If all these instructions can be found in a set of instructions then

	268 # compressor will remove them from said set and will insert one replacement

	269 # "compressed instruction" '4[0-7] inc [%eax..%edi] # register in opcode'.

	270 #

	271 # Note that last group is only used in the replacement. It's used to grab marks

	272 # added by previous compressors and to replace them with a new mark.

	273 class Compressor(object):

	274 __slots__ = [

	275 'regex',

	276 'subst',

	277 'replacements'

	278 ]

	279

	280 def __init__(self, regex, subst, replacements=None):

	281 self.regex = re.compile(regex)

	282 self.subst = subst

	283 self.replacements = [] if replacements is None else replacements

	284

	285

	286 def CompressionTemplate(instruction, match, mark):

	287 """ Replace all match groups with the mark. """

	288 pos = 0

	289 format_str = ''

	290 for group in range(1, len(match.groups())):

	291 format_str += instruction[pos:match.start(group)] + mark

	292 pos = match.end(group)

	293 return format_str + instruction[pos:match.start(len(match.groups()))]

	294

	295

	296 def CompressOneMatch(instructions, instruction, match, compressor):

	297 format_str = CompressionTemplate(instruction, match, '{}')

	298 subset = set()

	299 for replacement in compressor.replacements:

	300 replacement_str = format_str.format(*replacement)

	301 if not replacement_str in instructions:

	302 return (False, instructions)

	303 subset.add(replacement_str)

	304 instructions -= subset

	305 instructions.add((format_str + '{}').format(*compressor.subst))

	306 return (True, instructions)

	307

	308

	309 def CompressOneInstruction(instructions, compressors, split, cache):

	310 sorted_instructions = (sorted(i for i in instructions if i > split) +

	311 sorted(i for i in instructions if i < split))

	312 for instruction in sorted_instructions:

	313 if instruction in cache:

	314 compressors_list = cache[instruction]

	315 for compressor_nr, match, compressor in compressors_list:

	316 result, instructions = CompressOneMatch(

	317 instructions, instruction, match, compressor)

	318 if result:

	319 return (instructions, compressor_nr, instruction)

	320 else:

	321 compressors_list = []

	322 for compressor_nr, compressor in enumerate(compressors):

	323 match = compressor.regex.match(instruction)

	324 if match:

	325 compressors_list.append((compressor_nr, match, compressor))

	326 result, instructions = CompressOneMatch(

	327 instructions, instruction, match, compressor)

	328 if result:

	329 return (instructions, compressor_nr, instruction)

	330 cache[instruction] = compressors_list

	331 return (instructions, False, False)

	332

	333

	334 def Compressed(instructions, compressors, show_progress):

	335 split = ''

	336 cache = {}

	337 while True:

	338 instructions, rule, split = CompressOneInstruction(

	339 instructions, compressors, split, cache)

	340 if rule is False: break

	341 show_progress(rule, split)

	342 return instructions

	343

	344

	345 def Worker((prefix, state_index)):

	346 worker_state = WorkerState(prefix, worker_validator)

	347

	348 try:

	349 dfa_traversal.TraverseTree(

	350 dfa.states[state_index],

	351 final_callback=worker_state.ReceiveInstruction,

	352 prefix=prefix,

	353 anyfield=0)

	354 if (prefix[0] != 0x0f or prefix[1] != 0x0f): # Skip 3DNow! instructions

	355 worker_state.output = Compressed(set(worker_state.output),

	356 compressors,

	357 worker_state.RecordTrace)

	358 except Exception as e:

	359 traceback.print_exc() # because multiprocessing imap swallows traceback

	360 raise

	361

	362 return (

	363 prefix,

	364 worker_state.total_instructions,

	365 worker_state.num_valid,

	366 worker_state.output,

	367 worker_state.trace)

	368

	369

	370 def ParseOptions():

	371 parser = optparse.OptionParser(usage='%prog [options] xmlfile')

	372

	373 parser.add_option('--bitness',

	374 choices=['32', '64'],

	375 help='The subarchitecture: 32 or 64')

	376 parser.add_option('--validator_dll',

	377 help='Path to librdfa_validator_dll')

	378 parser.add_option('--decoder_dll',

	379 help='Path to librdfa_decoder_dll')

	380

	381 options, args = parser.parse_args()

	382 options.bitness = int(options.bitness)

	383

	384 if len(args) != 1:

	385 parser.error('specify one xml file')

	386

	387 (xml_file, ) = args

	388

	389 return options, xml_file

	390

	391

	392 # Version suitable for use in regular expressions

	393 REGISTERS_RE = REGISTERS.copy()

	394 REGISTERS_RE['st(0)'] = [ 'st\${}\$'.format(N) for N in range(8) ]

	395 REGISTERS_RE['st\$0\$'] = REGISTERS_RE['st(0)']

	396

	397 # Index names in 'natural' order (as defined by IA32/x86-64 ABI)

	398 INDEXES = {

	399 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'eiz', 'ebp', 'esi', 'edi' ],

	400 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'riz', 'rbp', 'rsi', 'rdi' ],

	401 'r8': [ 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15' ]

	402 }

	403 # Register which can not be used as base in 64-bit mode in all incarnations

	404 X86_64_BASE_REGISTERS = set([

	405 '%spl', '%bpl', '%r15b',

	406 '%sp', '%bp', '%r15w',

	407 '%esp', '%ebp', '%r15d',

	408 '%rsp', '%rbp', '%r15',

	409 '%rip'

	410 ])

	411

	412

	413 def InstructionIsDangerous(input, output, register_write,

	414 writes_to, memory_accessed=False,

	415 base_text='%riz', index_text='%riz'):

	416 """ Check if instruction with given replacements will be dangerous

	417

	418 Args:

	419 input: input argument

	420 output: output argument

	421 register_write: three-state selector

	422 'sandbox' - instruction can be used to produce "restricted register"

	423 'protect' - instruction can damage output, protect "special registers"

	424 'ignore' - instruction does not affect it's operands (e.g. test) or

	425 is used with non-GP registers (X87, MMX, XMM, etc)

	426 memory_accessed: True if instruction accesses memory

	427 base: base register (if memory is accessed)

	428 index: index register (if memory is accessed)

	429

	430 Returns:

	431 True if instruction should be rejected by validator

	432 """

	433 if memory_accessed:

	434 if base_text not in X86_64_BASE_REGISTERS:

	435 return True

	436 if index_text in X86_64_BASE_REGISTERS - set(['%r15']):

	437 return True

	438 if register_write == 'protect' and output in X86_64_BASE_REGISTERS:

	439 return True

	440 if register_write == 'sandbox' and output == '%r15d':

	441 return True

	442 if writes_to == 'both' and input in X86_64_BASE_REGISTERS:

	443 return True

	444 return False

	445

	446

	447 def AppendOperandsReplacement(replacement, rm_text, reg, modrm, writes_to):

	448 """ Appends replacement text to replacement list

	449

	450 Args:

	451 replacement: replacement list

	452 rm_text: replacement for rm field

	453 reg: register kind (or None if reg field is used as opcode extension)

	454 modrm: modrm byte

	455 writes_to: three-state selector

	456 'reg' - instruction uses rm as source, reg as destination

	457 'rm' - instruction uses reg as source, rm as destination

	458 'both' - instruction writes to both reg and rm

	459

	460 Returns:

	461 input: textual representation of input argument

	462 output: textual representation of output argument

	463

	464 Side-effect:

	465 output (if reg is None) or (input, output) tuple (if reg is not None)

	466 are added to replacement list.

	467 """

	468 if reg is None:

	469 assert writes_to == 'rm'

	470 input, output = None, rm_text

	471 replacement.append(output)

	472 else:

	473 reg_field = (modrm >> 3) & 0x07

	474 reg_text = '%' + REGISTERS[reg][reg_field]

	475 if writes_to == 'reg':

	476 input, output = rm_text, reg_text

	477 else: # rm, both

	478 input, output = reg_text, rm_text

	479 replacement.extend([input, output])

	480 return input, output

	481

	482

	483 def ModRMRegisterReplacements(rm, reg=None, writes_to='rm', opcode_bits=0,

	484 register_write='ignore'):

	485 """Creates replacement tuples list for register-to-register instructions

	486

	487 Args:

	488 rm: rm operand kind (see REGISTERS array)

	489 reg: reg operand kind (see REGISTERS array) or None if reg is not used

	490 writes_to: three-state selector

	491 'reg' - instruction uses rm as source, reg as destination

	492 'rm' - instruction uses reg as source, rm as destination

	493 'both' - instruction writes to both reg and rm

	494 opcode_bits: opcode extensions code (used when reg is None)

	495 register_write: three-state selector

	496 'sandbox' - instruction can be used to produce "restricted register"

	497 'protect' - instruction can damage output, protect "special registers"

	498 'ignore' - instruction does not affect it's operands (e.g. test) or

	499 is used with non-GP registers (X87, MMX, XMM, etc)

	500 Returns:

	501 List of replacement tuples

	502 """

	503 # Reg field can be used either as reg or as opcode extension, but not both

	504 assert reg is None or opcode_bits == 0

	505

	506 output_key = (options.bitness, reg, rm, writes_to, opcode_bits,

	507 register_write)

	508 if output_key in ModRMRegisterReplacements.replacements:

	509 return ModRMRegisterReplacements.replacements[output_key]

	510

	511 replacements = []

	512

	513 # Two upper bits of ModR/M byte (mod field) must be equal to 11

	514 # This gives us range from 0xc0 to 0xff but we are going from the

	515 # end to make rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).

	516 if reg is None:

	517 # reg field is used as opcode extension

	518 byte_range = [byte

	519 for byte in range(0xff, 0xbf, -1)

	520 if (byte >> 3) & 0x7 == opcode_bits]

	521 else:

	522 byte_range = range(0xff, 0xbf, -1)

	523

	524 for modrm in byte_range:

	525 rm_field = (modrm & 0x07)

	526 rm_text = '%' + REGISTERS[rm][rm_field]

	527 byte_text = '{:02x}'.format(modrm)

	528 replacement = [byte_text]

	529 input, output = AppendOperandsReplacement(

	530 replacement, rm_text, reg, modrm, writes_to)

	531 if options.bitness == 64:

	532 replacement.append('any_nonspecial') # input_rr

	533 replacement.append(output if register_write == 'sandbox' else None)

	534 if InstructionIsDangerous(input, output, register_write, writes_to):

	535 continue

	536 replacements.append(tuple(replacement))

	537 ModRMRegisterReplacements.replacements[output_key] = tuple(replacements)

	538 return ModRMRegisterReplacements.replacements[output_key]

	539 ModRMRegisterReplacements.replacements = {}

	540

	541

	542 def BaseOnlyMemoryOperand(modrm, base):

	543 """Creates replacement tuples list for register-to-memory instructions

	544 (base only, no SIB)

	545

	546 Args:

	547 modrm: modrm byte

	548 base: register kind for base

	549 Returns:

	550 bytes_text: replacement for "bytes" group

	551 rm_text: textual representation of "rm" argument

	552 base_text: textual representation of "base" register

	553 """

	554 mod_field = (modrm >> 6) & 0x03

	555 rm_field = (modrm & 0x07)

	556 base_text = '%' + REGISTERS[base][rm_field]

	557 # If RM field == %rbp and MOD field is zero then it's absolute address

	558 # in 32-bit mode and %rip-based address in 64-bit mode

	559 if mod_field == 0 and rm_field == validator.REG_RBP:

	560 bytes_text = '{:02x} 00 00 00 00'.format(modrm)

	561 rm_text = '0x0' if options.bitness == 32 else '0x0(%rip)'

	562 base_text = '%eiz' if options.bitness == 32 else '%rip'

	563 # Memory access with just a base register

	564 elif mod_field == 0:

	565 bytes_text = '{:02x}'.format(modrm)

	566 rm_text = '({})'.format(base_text)

	567 # Memory access with base and 8bit offset

	568 elif mod_field == 1:

	569 bytes_text = '{:02x} 00'.format(modrm)

	570 rm_text = '0x0({})'.format(base_text)

	571 # Memory access with base and 32bit offset

	572 else: # mod_field == 2

	573 bytes_text = '{:02x} 00 00 00 00'.format(modrm)

	574 rm_text = '0x0({})'.format(base_text)

	575 return bytes_text, rm_text, base_text

	576

	577

	578 def SIBMemoryOperand(modrm, sib, base, index):

	579 """Creates replacement tuples list for register-to-memory instructions

	580 (base only, no SIB)

	581

	582 Args:

	583 modrm: modrm byte

	584 base: register kind for base

	585 Returns:

	586 bytes_text: replacement for "bytes" group

	587 rm_text: textual representation of "rm" argument

	588 base_text: textual representation of "base" register

	589 index_text: textual representation of "index" register

	590 """

	591 mod_field = (modrm >> 6) & 0x03

	592 scale_field = (sib >> 6) & 0x03

	593 index_field = (sib >> 3) & 0x07

	594 base_field = (sib & 0x07)

	595 index_text = '%' + INDEXES[index][index_field]

	596 base_text = '%' + REGISTERS[base][base_field]

	597 scale_text = str(1 << scale_field)

	598 # If BASE is %rbp and MOD == 0 then index with 32bit offset is used

	599 if mod_field == 0 and base_field == validator.REG_RBP:

	600 bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)

	601 if (options.bitness == 64 and

	602 index_text == '%riz' and

	603 scale_text == '1'):

	604 rm_text = '0x0'

	605 else:

	606 rm_text = '0x0(,{},{})'.format(index_text, scale_text)

	607 # There are no base in this case

	608 base_text = '%eiz' if options.bitness == 32 else '%riz'

	609 # Memory access with base and index (no offset)

	610 elif mod_field == 0:

	611 bytes_text = '{:02x} {:02x}'.format(modrm, sib)

	612 rm_text = '({},{},{})'.format(base_text, index_text, scale_text)

	613 # Memory access with base, index and 8bit offset

	614 elif mod_field == 1:

	615 bytes_text = '{:02x} {:02x} 00'.format(modrm, sib)

	616 rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)

	617 # Memory access with base, index and 32bit offset

	618 elif mod_field == 2:

	619 bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)

	620 rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)

	621 # Pretty-printing of access via %rsp (or %r12)

	622 if (base_field == validator.REG_RSP and

	623 index_text in ('%eiz', '%riz') and

	624 scale_text == '1'):

	625 if mod_field == 0: # no offset

	626 rm_text = '({})'.format(base_text)

	627 else: # 8-bit or 32-bit offset

	628 rm_text = '0x0({})'.format(base_text)

	629 return bytes_text, rm_text, base_text, index_text

	630

	631

	632 def ModRMMemoryReplacements(reg=None, writes_to='rm', opcode_bits=0,

	633 memory_accessed=True, register_write='ignore',

	634 base_r8=False, index_r8=False):

	635 """Creates replacement tuples list for register-to-memory instructions

	636

	637 Args:

	638 reg: reg operand kind (see REGISTERS array) or None if reg is not used

	639 writes_to: three-state selector

	640 'reg' - instruction uses rm as source, reg as destination

	641 'rm' - instruction uses reg as source, rm as destination

	642 'both' - instruction writes to both reg and rm

	643 opcode_bits: opcode extensions code (used when reg is None)

	644 memory_accessed: True if instruction accesses memory

	645 register_write: three-state selector

	646 'sandbox' - instruction can be used to produce "restricted register"

	647 'protect' - instruction can damage output, protect "special registers"

	648 'ignore' - instruction does not affect it's operands (e.g. test) or

	649 is used with non-GP registers (X87, MMX, XMM, etc)

	650 index_r8: True if REX.X bit in the instruction set to 1

	651

	652 Returns:

	653 List of replacement tuples

	654 """

	655 # Reg field can be used either as reg or as opcode extension, but not both

	656 assert reg is None or opcode_bits == 0

	657

	658 output_key = (options.bitness, reg, writes_to, opcode_bits,

	659 base_r8, index_r8, memory_accessed, register_write)

	660 if output_key in ModRMMemoryReplacements.replacements:

	661 return ModRMMemoryReplacements.replacements[output_key]

	662

	663 if options.bitness == 32:

	664 base = 'eax'

	665 index = 'eax'

	666 else:

	667 base = 'r8' if base_r8 else 'rax'

	668 index = 'r8' if index_r8 else 'rax'

	669

	670 replacements = []

	671

	672 # Two upper bits of ModR/M byte (mod field) must be equal to 00, 01, or 10

	673 # This gives us range from 0x00 to 0xbf but we are going from the end to make

	674 # rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).

	675 if reg is None:

	676 # reg field is used as opcode extension

	677 byte_range = [byte

	678 for byte in range(0xbf, -1, -1)

	679 if (byte >> 3) & 0x7 == opcode_bits]

	680 else:

	681 byte_range = range(0xbf, -1, -1)

	682

	683 for modrm in byte_range:

	684 # If RM field != %rsp then there are no SIB byte

	685 if (modrm & 0x07) != validator.REG_RSP:

	686 bytes_text, rm_text, base_text = BaseOnlyMemoryOperand(modrm, base)

	687 replacement = [bytes_text]

	688 input, output = AppendOperandsReplacement(

	689 replacement, rm_text, reg, modrm, writes_to)

	690 if options.bitness == 64:

	691 replacement.append('any_nonspecial')

	692 # If writes_to is equal to 'reg' then output is register

	693 if writes_to == 'reg' and register_write == 'sandbox':

	694 replacement.append(output)

	695 else:

	696 # Note that instruction like xchg could write to another register:

	697 # it's "input"! But validator currently does not support this case

	698 # thus we are ignoring it here and writing "None" in this case, too.

	699 replacement.append(None)

	700 if InstructionIsDangerous(input, output, register_write, writes_to,

	701 memory_accessed, base_text):

	702 continue

	703 replacements.append(tuple(replacement))

	704 else:

	705 # If RM field == %rsp then we have SIB byte

	706 for sib in xrange(0x100):

	707 bytes_text, rm_text, base_text, index_text = SIBMemoryOperand(

	708 modrm, sib, base, index)

	709 replacement = [bytes_text]

	710 input, output = AppendOperandsReplacement(

	711 replacement, rm_text, reg, modrm, writes_to)

	712 if options.bitness == 64:

	713 if not memory_accessed or index_text == '%riz':

	714 replacement.append('any_nonspecial')

	715 else:

	716 if index_r8:

	717 # Convert %r8 to %r8d, %r9 to %r9d, etc

	718 replacement.append(index_text + 'd')

	719 else:

	720 # Convert %rax to %eax, %rsp to %esp, etc

	721 replacement.append('%e' + index_text[2:])

	722 # If writes_to is equal to 'reg' then output is register

	723 if writes_to == 'reg' and register_write == 'sandbox':

	724 replacement.append(output)

	725 else:

	726 # Note that instruction like xchg could write to another register:

	727 # it's "input"! But validator currently does not support this case

	728 # thus we are ignoring it here and writing "None" in this case, too.

	729 replacement.append(None)

	730 if InstructionIsDangerous(input, output, register_write, writes_to,

	731 memory_accessed, base_text, index_text):

	732 continue

	733 replacements.append(tuple(replacement))

	734 ModRMMemoryReplacements.replacements[output_key] = tuple(replacements)

	735 return ModRMMemoryReplacements.replacements[output_key]

	736 ModRMMemoryReplacements.replacements = {}

	737

	738

	739 # Map from "REX bit off" group of registers to "REX bit on" group of registers

	740 r8 = {

	741 'al': 'r8b',

	742 'ax': 'r8w',

	743 'eax': 'r8d',

	744 'rax': 'r8',

	745 'mm0': 'mmalt',

	746 'xmm0': 'xmm8',

	747 'ymm0': 'ymm8'

	748 }

	749

	750

	751 def RegisterKinds():

	752 """ Return list of register kinds we process with register compressors """

	753

	754 if options.bitness == 32:

	755 return ('al', 'ax', 'eax', 'mm0', 'xmm0', 'ymm0')

	756 else:

	757 return ('al', 'spl', 'ax', 'eax', 'rax', 'mm0', 'xmm0', 'ymm0',

	758 'r8b', 'r8w', 'r8d', 'r8', 'mmalt', 'xmm8', 'ymm8')

	759

	760

	761 def RegisterKindPairs():

	762 """ Return hand-picked pairs which we must consider in compressors """

	763

	764 if options.bitness == 32:

	765 return (

	766 ( 'al', 'al'),

	767 ( 'ax', 'al'),

	768 ( 'al', 'ax'),

	769 ( 'ax', 'ax'),

	770 ( 'eax', 'al'),

	771 ( 'al', 'eax'),

	772 ( 'eax', 'ax'),

	773 ( 'ax', 'eax'),

	774 ( 'eax', 'eax'),

	775 ( 'eax', 'mm0'),

	776 ( 'mm0', 'eax'),

	777 ( 'eax', 'xmm0'),

	778 ('xmm0', 'eax'),

	779 ( 'mm0', 'mm0'),

	780 ( 'mm0', 'xmm0'),

	781 ('xmm0', 'mm0'),

	782 ('xmm0', 'xmm0'),

	783 ('xmm0', 'ymm0'),

	784 ('ymm0', 'xmm0'),

	785 ('ymm0', 'ymm0')

	786 )

	787 else:

	788 return (

	789 ( 'al', 'al'),

	790 ( 'spl', 'spl'), ( 'spl', 'r8b'), ( 'r8b', 'spl'), ( 'r8b', 'r8b'),

	791 ( 'ax', 'al'),

	792 ( 'ax', 'spl'), ( 'ax', 'r8b'), ( 'r8w', 'spl'), ( 'r8w', 'r8b'),

	793 ( 'al', 'ax'),

	794 ( 'spl', 'ax'), ( 'spl', 'r8w'), ( 'r8b', 'ax'), ( 'r8b', 'r8w'),

	795 ( 'ax', 'ax'), ( 'ax', 'r8w'), ( 'r8w', 'ax'), ( 'r8w', 'r8w'),

	796 ( 'eax', 'al'),

	797 ( 'eax', 'spl'), ( 'eax', 'r8b'), ( 'r8d', 'spl'), ( 'r8d', 'r8b'),

	798 ( 'al', 'eax'),

	799 ( 'spl', 'eax'), ( 'spl', 'r8d'), ( 'r8b', 'eax'), ( 'r8b', 'r8d'),

	800 ( 'eax', 'ax'), ( 'eax', 'r8w'), ( 'r8d', 'ax'), ( 'r8d', 'r8w'),

	801 ( 'ax', 'eax'), ( 'ax', 'r8d'), ( 'r8w', 'eax'), ( 'r8w', 'r8d'),

	802 ( 'eax', 'eax'), ( 'eax', 'r8d'), ( 'r8d', 'eax'), ( 'r8d', 'r8d'),

	803 ( 'rax', 'al'),

	804 ( 'rax', 'spl'), ( 'rax', 'r8b'), ( 'r8', 'spl'), ( 'r8', 'r8b'),

	805 ( 'al', 'rax'),

	806 ( 'spl', 'rax'), ( 'spl', 'r8'), ( 'r8b', 'rax'), ( 'r8b', 'r8'),

	807 ( 'rax', 'ax'), ( 'rax', 'r8w'), ( 'r8', 'ax'), ( 'r8', 'r8w'),

	808 ( 'ax', 'rax'), ( 'ax', 'r8'), ( 'r8w', 'rax'), ( 'r8w', 'r8'),

	809 ( 'rax', 'eax'), ( 'rax', 'r8d'), ( 'r8', 'eax'), ( 'r8', 'r8d'),

	810 ( 'eax', 'rax'), ( 'eax', 'r8'), ( 'r8d', 'rax'), ( 'r8d', 'r8'),

	811 ( 'rax', 'rax'), ( 'rax', 'r8'), ( 'r8', 'rax'), ( 'r8', 'r8'),

	812 ( 'eax', 'mm0'), ( 'eax','mmalt'), ( 'r8d', 'mm0'), ( 'eax', 'mmalt'),

	813 ( 'rax', 'mm0'), ( 'rax','mmalt'), ( 'r8', 'mm0'), ( 'r8', 'mmalt'),

	814 ( 'mm0', 'eax'), ('mmalt', 'eax'), ( 'mm0', 'r8d'), ('mmalt', 'r8d'),

	815 ( 'mm0', 'rax'), ('mmalt', 'rax'), ( 'mm0', 'r8'), ('mmalt', 'r8'),

	816 ( 'eax', 'xmm0'), ( 'eax', 'xmm8'), ( 'r8d', 'xmm0'), ( 'r8d', 'xmm8'),

	817 ( 'rax', 'xmm0'), ( 'rax', 'xmm8'), ( 'r8', 'xmm0'), ( 'r8', 'xmm8'),

	818 ('xmm0', 'eax'), ('xmm0', 'r8d'), ('xmm8', 'eax'), ('xmm8', 'r8d'),

	819 ('xmm0', 'rax'), ('xmm0', 'r8'), ('xmm8', 'rax'), ('xmm8', 'r8'),

	820 ( 'mm0', 'mm0'), ('mmalt', 'mm0'), ( 'mm0','mmalt'), ('mmalt','mmalt'),

	821 ( 'mm0', 'xmm0'), ('mmalt','xmm0'), ( 'mm0', 'xmm8'), ('mmalt', 'xmm8'),

	822 ('xmm0', 'mm0'), ('xmm8', 'mm0'), ('xmm0','mmalt'), ('xmm8', 'mmalt'),

	823 ('xmm0', 'xmm0'), ('xmm0', 'xmm8'), ('xmm8', 'xmm0'), ('xmm8', 'xmm8'),

	824 ('xmm0', 'ymm0'), ('xmm0', 'ymm8'), ('xmm8', 'ymm0'), ('xmm8', 'ymm8'),

	825 ('ymm0', 'xmm0'), ('ymm0', 'xmm8'), ('ymm8', 'xmm0'), ('ymm8', 'xmm8'),

	826 ('ymm0', 'ymm0'), ('ymm0', 'ymm8'), ('ymm8', 'ymm0'), ('ymm8', 'ymm8')

	827 )

	828

	829

	830 def RegToRmCompressors(rm, reg, writes_to, memory_accessed=True,

	831 register_write='ignore', x3Dnow=False, notes=''):

	832 """Returns a list of reg <-> rm compressors for a given set of parameters.

	833

	834 Args:

	835 rm: rm operand kind (see REGISTERS array)

	836 reg: reg operand kind (see REGISTERS array) or None if reg is not used

	837 writes_to: three-state selector

	838 'reg' - instruction uses rm as source, reg as destination

	839 'rm' - instruction uses reg as source, rm as destination

	840 'both' - instruction writes to both reg and rm

	841 memory_accessed: True if instruction accesses memory

	842 register_write: three-state selector

	843 'sandbox' - instruction can be used to produce "restricted register"

	844 'protect' - instruction can damage output, protect "special registers"

	845 'ignore' - instruction does not affect it's operands (e.g. test) or

	846 is used with non-GP registers (X87, MMX, XMM, etc)

	847 x3DNow - True if instruction is 3DNow! style (opcode in immidiate)

	848 notes - Notes to add to the description

	849

	850 Returns:

	851 List of compressors

	852 """

	853

	854 compressors = []

	855

	856 start_reg = REGISTERS[reg][0]

	857 if reg[0:2] == 'r8' and register_write != 'ignore' and writes_to != 'rm':
	halyavin 2013/12/06 10:58:46 end_reg = REGISTERS[reg][-1] # Exclude r15 from th end_reg = REGISTERS[reg][-1] # Exclude r15 from the interval, if instruction can write to reg. if (end_reg.startswith('r15') and register_write != 'ignore' and writes_to in ['reg', 'both']): end_reg = REGISTERS[reg][-2] khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 10:58:46, halyavin wrote: > end_reg = REGISTERS[reg][-1] > # Exclude r15 from the interval, if instruction can write to reg. > if (end_reg.startswith('r15') and register_write != 'ignore' and > writes_to in ['reg', 'both']): > end_reg = REGISTERS[reg][-2] Done.
	858 end_reg = REGISTERS[reg][-2]

	859 else:

	860 end_reg = REGISTERS[reg][-1]

	861 start_rm = REGISTERS[rm][0]

	862 if rm[0:2] == 'r8' and register_write != 'ignore' and writes_to != 'reg':
	halyavin 2013/12/06 10:58:46 The same as above. The same as above. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 10:58:46, halyavin wrote: > The same as above. Done.
	863 end_rm = REGISTERS[rm][-2]

	864 else:

	865 end_rm = REGISTERS[rm][-1]

	866

	867 regex = '(?: 00)*'
	halyavin 2013/12/06 10:58:46 This line is confusing. Since this is the start of This line is confusing. Since this is the start of the regular expression, I assumed that it matches the start of the instruction. Do we need these zeros at all? There could be zero of them anyway. khim 2013/12/06 15:57:36 Well, it's does not work that way. Tried to add th Show quoted text On 2013/12/06 10:58:46, halyavin wrote: > This line is confusing. Since this is the start of the regular expression, I assumed that it matches the start of the instruction. Well, it's does not work that way. Tried to add the clarifying comment. Show quoted text > Do we need these zeros at all? There could be zero of them anyway. Sure, most instructions have zero zeros here, but some (like movabs) may have up to eight zeros here.
	868 # Additional byte is opcode extension with 3DNow! instructions.
	halyavin 2013/12/06 10:58:46 # The last byte of the 3DNow! instruction is opcod # The last byte of the 3DNow! instruction is opcode extension. The same question as above: do we need this at all? khim 2013/12/06 15:57:36 Yes, we do. Do you really think I would add useles Show quoted text On 2013/12/06 10:58:46, halyavin wrote: > # The last byte of the 3DNow! instruction is opcode extension. > The same question as above: do we need this at all? Yes, we do. Do you really think I would add useless pieces of code to the copressor which works just fine as it is. This time here is even comment which explains just WHY do we need these!
	869 if x3Dnow:

	870 regex = ' [0-9a-fA-F][0-9a-fA-F]'

	871 regex += ' (?:lock )?\\w* (?:\\$0x0,\|\\$0x0,\\$0x0,\|%cl,\|%xmm0,)?'
	halyavin 2013/12/06 10:58:46 # 2 spaces separate byte code from instruction nam # 2 spaces separate byte code from instruction name. regex += ' ' regex += '(?:lock )?' # Instruction name regex += '\\w* ' # Immediate or implicit operand that can precede reg/rm pair. regex += '(?:...)?' Why do we need %xmm0 and 2 immediates here? I thought that enter/leave do not have any register or memory parameters. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 10:58:46, halyavin wrote: > # 2 spaces separate byte code from instruction name. > regex += ' ' > regex += '(?:lock )?' > # Instruction name > regex += '\\w* ' > # Immediate or implicit operand that can precede reg/rm pair. > regex += '(?:...)?' > Done. Show quoted text > Why do we need %xmm0 and 2 immediates here? %xmm0 is used e.g. in pblendvb, two immediates are used e.g. in insertq Show quoted text > I thought that enter/leave do not have any register or memory parameters. They don't, you are correct. These are not for them.
	872 # We only need to process ModR/M+SIB '04 04' or '04 07' here

	873 if options.bitness == 32:

	874 regex_mem = '\$%esp,%eax,1\$'

	875 else:

	876 regex_mem = '\$(?:%rsp\|%r15),(?:%rax\|%r8),1\$'

	877 output = None

	878 output_note = None

	879 # With "or memory" or "memory" compressors we always can see where is

	880 # reg and where is rm, but with reg to rm or rm to reg it's impossible.

	881 # Add the appropriate comment

	882 if notes == '':

	883 if writes_to == 'reg':

	884 notes_register = ' # rm to reg'

	885 else:

	886 notes_register = ' # reg to rm'

	887 else:
	halyavin 2013/12/06 10:58:46 if writes_to == 'reg': notes_register = ' # rm t if writes_to == 'reg': notes_register = ' # rm to reg' else: notes_register = ' # reg to rm' if notes != '': notes_register += '; ' + notes notes = ' # ' + notes khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 10:58:46, halyavin wrote: > if writes_to == 'reg': > notes_register = ' # rm to reg' > else: > notes_register = ' # reg to rm' > if notes != '': > notes_register += '; ' + notes > notes = ' # ' + notes Done.
	888 if writes_to == 'reg':

	889 notes_register = '# rm to reg; ' + notes

	890 else:

	891 notes_register = '# reg to rm;' + notes

	892 notes = ' # ' + notes
	halyavin 2013/12/06 10:58:46 Why we have space before # here but not in notes_r Why we have space before # here but not in notes_register? Can we omit '# ' and write function def ToComment(comment): if comment != '': return '# ' + comment return '' khim 2013/12/06 15:57:36 What will it save? Show quoted text On 2013/12/06 10:58:46, halyavin wrote: > Why we have space before # here but not in notes_register? > Can we omit '# ' and write function > def ToComment(comment): > if comment != '': > return '# ' + comment > return '' What will it save?
	893 if writes_to == 'reg':

	894 regex += '(%' + REGISTERS[rm][0] + '\|' + regex_mem + ')'

	895 regex += ',(%' + REGISTERS[reg][0] + ')'

	896 if register_write == 'sandbox':

	897 assert reg in ('eax', 'r8d')

	898 output = '%' + reg + '\|None'

	899 output_note = '[%eax..%edi]' if reg == 'eax' else '[%r8d..%r14d]'

	900 subst = (

	901 'XX', '[%{}..%{} or memory]'.format(start_rm, end_rm),

	902 '[%{}..%{}]'.format(start_reg, end_reg), notes)

	903 subst_register = (

	904 'XX', '[%{}..%{}]'.format(start_rm, end_rm),

	905 '[%{}..%{}]'.format(start_reg, end_reg), notes_register)

	906 subst_memory = (

	907 'XX', '[memory]',

	908 '[%{}..%{}]'.format(start_reg, end_reg), notes)

	909 else:

	910 regex += '(%' + REGISTERS[reg][0] + ')'

	911 regex += ',(%' + REGISTERS[rm][0] + '\|' + regex_mem + ')'

	912 if register_write == 'sandbox':
	halyavin 2013/12/06 10:58:46 Can we have situation that second operand is memor Can we have situation that second operand is memory but register_write == 'sandbox'? khim 2013/12/06 15:57:36 Yes, why not? Show quoted text On 2013/12/06 10:58:46, halyavin wrote: > Can we have situation that second operand is memory but register_write == > 'sandbox'? Yes, why not?
	913 assert rm in ('eax', 'r8d')

	914 output = '%' + rm + '\|None'

	915 output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'

	916 subst = (

	917 'XX', '[%{}..%{}]'.format(start_reg, end_reg),

	918 '[%{}..%{} or memory]'.format(start_rm, end_rm), notes)

	919 subst_register = (

	920 'XX', '[%{}..%{}]'.format(start_reg, end_reg),

	921 '[%{}..%{}]'.format(start_rm, end_rm), notes_register)

	922 subst_memory = (

	923 'XX', '[%{}..%{}]'.format(start_reg, end_reg),

	924 '[memory]', notes)

	925 regex += '.*'
	halyavin 2013/12/06 11:25:43 Can we use '$' instead? Can we use '$' instead? khim 2013/12/06 15:57:36 We can, but of course not here. Done. Show quoted text On 2013/12/06 11:25:43, halyavin wrote: > Can we use '$' instead? We can, but of course not here. Done.
	926 if options.bitness == 64:

	927 regex += '; input_rr=(%eax\|%r8d\|any_nonspecial)'

	928 regex += '; output_rr=({})'.format(output)

	929 if memory_accessed:

	930 input_note = '[%eax..%edi]'

	931 input_note_r8 = '[%r8d..%r15d]'

	932 else:

	933 input_note = 'any_nonspecial'

	934 input_note_r8 = 'any_nonspecial'

	935 subst_r8 = subst[0:-1] + (input_note_r8, output_note) + subst[-1:]
	halyavin 2013/12/06 11:25:43 Add notes at the end. Add notes at the end. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 11:25:43, halyavin wrote: > Add notes at the end. Done.
	936 subst = subst[0:-1] + (input_note, output_note) + subst[-1:]

	937 subst_memory_r8 = subst_memory[0:-1] + (

	938 input_note_r8, output_note) + subst_memory[-1:]

	939 subst_memory = subst_memory[0:-1] + (

	940 input_note, output_note) + subst_memory[-1:]

	941 subst_register = subst_register[0:-1] + (

	942 'any_nonspecial', output_note) + subst_register[-1:]

	943 regex += '()'

	944 base_r8 = rm in r8.values()

	945 memory_replacement = ModRMMemoryReplacements(

	946 reg=reg, base_r8=base_r8, writes_to=writes_to,

	947 memory_accessed=memory_accessed, register_write=register_write)

	948 compressors.append(Compressor(

	949 '.*?(04 0[47])' + regex, subst_memory, memory_replacement))
	halyavin 2013/12/06 11:25:43 mod_rm_sib_regex = '(04 0[47])' mod_rm_sib_regex = '(04 0[47])' khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 11:25:43, halyavin wrote: > mod_rm_sib_regex = '(04 0[47])' Done.
	950 if options.bitness == 64:

	951 memory_replacement_r8 = ModRMMemoryReplacements(

	952 reg=reg, base_r8=base_r8, index_r8=True, writes_to=writes_to,

	953 memory_accessed=memory_accessed, register_write=register_write)

	954 compressors.append(Compressor(

	955 '.*?(04 0[47])' + regex, subst_memory_r8, memory_replacement_r8))

	956 # Instructions with no memory access are instructions which are doing

	957 # something with memory address (e.g. lea) and as such they don't have

	958 # non-memory forms.

	959 if memory_accessed:

	960 register_replacement = ModRMRegisterReplacements(

	961 reg=reg, rm=rm, writes_to=writes_to, register_write=register_write)

	962 compressors.append(Compressor(

	963 '.*?(c0)' + regex, subst_register, register_replacement))

	964 main_replacement = register_replacement + memory_replacement

	965 compressors.append(Compressor(

	966 '.*?(04 0[47])' + regex, subst, main_replacement))

	967 if options.bitness == 64:

	968 main_replacement_r8 = register_replacement + memory_replacement_r8

	969 compressors.append(Compressor(

	970 '.*?(04 0[47])' + regex, subst_r8, main_replacement_r8))

	971 return compressors

	972

	973

	974 def AllRegToRmCompressors():

	975 """ Return list of all Reg to RM (and RM to Reg) compressors. """

	976

	977 compressors = []

	978

	979 for reg, rm in RegisterKindPairs():

	980 instruction_kinds = [

	981 # Normal instructions with two operands (rm to reg)

	982 {'writes_to': 'reg'},

	983 # Normal instructions with two operands (reg to rm)

	984 {'writes_to': 'rm'}

	985 ]

	986 # In 64 bit mode we have many different types of instructions depending

	987 # on whether we are accessing memory or whether we are writing to registers.

	988 if options.bitness == 64:

	989 # Lea in 64 bit mode is truly unique instruction for now

	990 if reg in ('ax', 'r8w', 'eax', 'r8d', 'rax', 'r8'):

	991 instruction_kinds = [
	halyavin 2013/12/06 11:41:33 Use append and extract register_write to local var Use append and extract register_write to local variable. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 11:41:33, halyavin wrote: > Use append and extract register_write to local variable. Done.
	992 {'writes_to': 'reg', 'memory_accessed':False,

	993 'register_write': ['protect', 'sandbox'][reg in ('eax', 'r8d')],

	994 'notes': 'lea'}] + instruction_kinds

	995 # There are few more forms in 64 bit case (rm to reg)

	996 if reg in ('eax', 'r8d'):

	997 # Zero-extending version.

	998 instruction_kinds.append(

	999 {'writes_to':'reg', 'register_write':'sandbox'})

	1000 # More forms in 64 bit case (reg to rm)

	1001 if rm in ('eax', 'r8d'):

	1002 # Zero-extending version.

	1003 instruction_kinds.append(

	1004 {'writes_to':'rm', 'register_write':'sandbox'})

	1005 # Zero-extending xchg/xadd

	1006 instruction_kinds.append(

	1007 {'writes_to':'both', 'register_write':'sandbox',

	1008 'notes': 'write to both'})

	1009 # Still more forms for 64 bit case (rm to reg).

	1010 if reg in ('al', 'spl', 'ax', 'eax', 'rax', 'r8b', 'r8w', 'r8d', 'r8'):

	1011 # Dangerous instructions (rm to reg)

	1012 instruction_kinds.append(

	1013 {'writes_to':'reg', 'register_write':'protect'})

	1014 # Still more forms for 64 bit case (reg to rm)

	1015 if rm in ('al', 'spl', 'ax', 'eax', 'rax', 'r8b', 'r8w', 'r8d', 'r8'):

	1016 # Dangerous instructions (reg to rm)

	1017 instruction_kinds.append(

	1018 {'writes_to':'rm', 'register_write':'protect'})

	1019 # Dangerous xchg/xadd

	1020 instruction_kinds.append(

	1021 {'writes_to':'both', 'register_write':'protect',

	1022 'notes': 'write to both'})

	1023 # 3DNow! instructions

	1024 if reg in ('mm0', 'mmalt') or rm in ('mm0', 'mmalt'):

	1025 instruction_kinds.append(

	1026 {'writes_to':'reg', 'x3Dnow':'yes'})
	halyavin 2013/12/06 11:31:17 'yes'->True 'yes'->True halyavin 2013/12/06 11:31:17 'x3Dnow'-> 'is_3Dnow' 'x3Dnow'-> 'is_3Dnow' khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 11:31:17, halyavin wrote: > 'yes'->True Done. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 11:31:17, halyavin wrote: > 'x3Dnow'-> 'is_3Dnow' Done.
	1027 for instruction_kind in instruction_kinds:

	1028 compressors.extend(RegToRmCompressors(reg=reg, rm=rm, **instruction_kind))

	1029 return compressors

	1030

	1031

	1032 def RmCompressors(rm, opcode_bits,

	1033 memory_accessed=True, register_write='ignore'):

	1034 """Returns a list of rm compressors for a given set of parameters.

	1035

	1036 Args:

	1037 rm: rm operand kind (see REGISTERS array)

	1038 memory_accessed: True if instruction accesses memory

	1039 register_write: three-state selector

	1040 'sandbox' - instruction can be used to produce "restricted register"

	1041 'protect' - instruction can damage output, protect "special registers"

	1042 'ignore' - instruction does not affect it's operands (e.g. test) or

	1043 is used with non-GP registers (X87, MMX, XMM, etc)

	1044

	1045 Returns:

	1046 List of compressors

	1047 """

	1048 compressors = []

	1049

	1050 start_rm = REGISTERS[rm][0]

	1051 if rm[0:2] == 'r8' and register_write != 'ignore':
	halyavin 2013/12/06 12:38:27 The same as in previous function. The same as in previous function. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 12:38:27, halyavin wrote: > The same as in previous function. Done.
	1052 end_rm = REGISTERS[rm][-2]

	1053 else:

	1054 end_rm = REGISTERS[rm][-1]

	1055 byte_mark = 'XX/' + str(opcode_bits)

	1056

	1057 subst = (byte_mark, '[%{}..%{} or memory]'.format(start_rm, end_rm), '')

	1058 subst_register = (byte_mark, '[%{}..%{}]'.format(start_rm, end_rm), '')

	1059 subst_memory = (byte_mark, '[memory]', '')

	1060 # We only need to process ModR/M+SIB '04 04' or '04 07' here
	halyavin 2013/12/06 12:38:27 Extract to function. Note, that you will need opco Extract to function. Note, that you will need opcode_bits as a parameter. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 12:38:27, halyavin wrote: > Extract to function. Note, that you will need opcode_bits as a parameter. Done.
	1061 if options.bitness == 32:

	1062 regex_mem = '\$%esp,%eax,1\$'

	1063 else:

	1064 regex_mem = '\$(?:%rsp\|%r15),(?:%rax\|%r8),1\$'

	1065 regex = ('(?: 00)* (?:lock )?\\w* (?:\\$0x0,\|%cl,)?'
	halyavin 2013/12/06 12:38:27 Split into multiple additions as in previous funct Split into multiple additions as in previous function. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 12:38:27, halyavin wrote: > Split into multiple additions as in previous function. Done.
	1066 '(%' + REGISTERS[rm][0] + '\|' + regex_mem + ').*')

	1067 output = None

	1068 output_note = None

	1069 if options.bitness == 64:

	1070 if register_write == 'sandbox':

	1071 assert rm in ('eax', 'r8d')

	1072 output = '%' + rm + '\|None'

	1073 output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'

	1074 regex += '; input_rr=(%eax\|%r8d\|any_nonspecial)'

	1075 regex += '; output_rr=({})'.format(output)

	1076 if memory_accessed:

	1077 input_note = '[%eax..%edi]'

	1078 input_note_r8 = '[%r8d..%r15d]'

	1079 else:

	1080 input_note = 'any_nonspecial'

	1081 input_note_r8 = 'any_nonspecial'

	1082 subst_r8 = subst[0:-1] + (input_note_r8, output_note) + subst[-1:]

	1083 subst = subst[0:-1] + (input_note, output_note) + subst[-1:]

	1084 subst_memory_r8 = subst_memory[0:-1] + (

	1085 input_note_r8, output_note) + subst_memory[-1:]

	1086 subst_memory = subst_memory[0:-1] + (

	1087 input_note, output_note) + subst_memory[-1:]

	1088 subst_register = subst_register[0:-1] + (

	1089 'any_nonspecial', output_note) + subst_register[-1:]

	1090 regex += '()'

	1091 base_r8 = rm in r8.values()

	1092 memory_replacement = ModRMMemoryReplacements(

	1093 reg=None, base_r8=base_r8, opcode_bits=opcode_bits,

	1094 memory_accessed=memory_accessed, register_write=register_write)

	1095 compressors.append(Compressor(

	1096 '.?({:02x} 0[47])'.format(0x04 + opcode_bits 8) + regex,

	1097 subst_memory, memory_replacement))

	1098 if options.bitness == 64:

	1099 memory_replacement_r8 = ModRMMemoryReplacements(

	1100 reg=None, base_r8=base_r8, index_r8=True, opcode_bits=opcode_bits,

	1101 memory_accessed=memory_accessed, register_write=register_write)

	1102 compressors.append(Compressor(

	1103 '.?({:02x} 0[47])'.format(0x04 + opcode_bits 8) + regex,

	1104 subst_memory_r8, memory_replacement_r8))

	1105 # Instructions with no memory access are instructions which are doing

	1106 # something with memory address (e.g. prefetch) and as such they don't

	1107 # have non-memory forms.

	1108 if memory_accessed:

	1109 register_replacement = ModRMRegisterReplacements(

	1110 reg=None, rm=rm, opcode_bits=opcode_bits, register_write=register_write)

	1111 compressors.append(Compressor(

	1112 '.?({:02x})'.format(0xc0 + opcode_bits 8) + regex,

	1113 subst_register, register_replacement))

	1114 main_replacement = register_replacement + memory_replacement

	1115 compressors.append(Compressor(

	1116 '.?({:02x} 0[47])'.format(0x04 + opcode_bits 8) + regex,

	1117 subst, main_replacement))

	1118 if options.bitness == 64:

	1119 main_replacement_r8 = register_replacement + memory_replacement_r8

	1120 compressors.append(Compressor(

	1121 '.?({:02x} 0[47])'.format(0x04 + opcode_bits 8) + regex,

	1122 subst_r8, main_replacement_r8))

	1123 return compressors

	1124

	1125

	1126 def AllRmCompressors():

	1127 """ Return list of all RM (with reg as opcode extension) compressors. """

	1128 compressors = []

	1129

	1130 for rm in RegisterKinds():

	1131 for opcode_bits in xrange(8):

	1132 instruction_kinds = [

	1133 # The most basic form

	1134 {}

	1135 ]

	1136 # In 64 bit mode we have many different types of instructions depending

	1137 # on whether we are accessing memory or whether we are writing to

	1138 # registers.

	1139 if options.bitness == 64:

	1140 # No memory access (e.g. prefetch)

	1141 instruction_kinds = [

	1142 {'memory_accessed':False}] + instruction_kinds

	1143 # More forms in 64 bit case.

	1144 if rm in ('eax', 'r8d'):

	1145 # Zero-extending version.

	1146 instruction_kinds.append(

	1147 {'register_write':'sandbox'})

	1148 # Still more forms for 64 bit case (reg to rm).

	1149 if rm in ('al', 'spl', 'ax', 'eax', 'rax',

	1150 'r8b', 'r8w', 'r8d', 'r8'):

	1151 # Dangerous instructions.

	1152 instruction_kinds.append(

	1153 {'register_write':'protect'})

	1154 for instruction_kind in instruction_kinds:

	1155 compressors.extend(RmCompressors(

	1156 rm=rm, opcode_bits=opcode_bits, **instruction_kind))

	1157 return compressors

	1158

	1159

	1160 def AllOpcodeCompressors():

	1161 """ Return "register in opcode" compressors. """

	1162 compressors = []

	1163

	1164 for reg in RegisterKinds() + ('st(0)',):

	1165 for opcode in xrange(8):

	1166 for text1, text2, nibble in (

	1167 ('[0..7]', '[8..f]', xrange(8)),

	1168 ('[012367]', '[89abef]', (0, 1, 2, 3, 6, 7)),

	1169 ('[0..6]', '[8..e]', xrange(7))

	1170 ):

	1171 start_reg = REGISTERS[reg][0]

	1172 end_reg = REGISTERS[reg][-2 if text2[-2] == 'e' else -1]
	halyavin 2013/12/06 13:02:51 use nibble instead. use nibble instead. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 13:02:51, halyavin wrote: > use nibble instead. Done.
	1173 # Note that we use 2nd line here to avoid ambiguity when opcode is 0x00

	1174 compressors.append(Compressor(

	1175 '.?[0-9a-fA-F](1)(?: 00)'

	1176 ' \\w* (?:\\$0x0,\|%ax,\|%st,)?'

	1177 '(%(?:' + REGISTERS_RE[reg][1] + ')).*()',
	halyavin 2013/12/06 13:02:51 I have no idea why sometimes it is [1] and sometim I have no idea why sometimes it is [1] and sometimes it is [0] halyavin 2013/12/06 13:02:51 Why not '(%' + REGISTERS_RE[reg][1] + ').()'? Why not '(%' + REGISTERS_RE[reg][1] + ').()'? khim 2013/12/06 15:57:36 There are comment here which explains it! Tryed t Show quoted text On 2013/12/06 13:02:51, halyavin wrote: > I have no idea why sometimes it is [1] and sometimes it is [0] There are comment here which explains it! Tryed to change it. khim 2013/12/06 15:57:36 For historical reasons. Fixed. Show quoted text On 2013/12/06 13:02:51, halyavin wrote: > Why not '(%' + REGISTERS_RE[reg][1] + ').*()'? For historical reasons. Fixed.
	1178 (text1, '[%{}..%{}]'.format(start_reg, end_reg), ''),

	1179 tuple(('{:x}'.format(n), '%' + REGISTERS[reg][n])

	1180 for n in nibble)))

	1181 compressors.append(Compressor(

	1182 '.?[0-9a-fA-F](8)(?: 00)'

	1183 ' \\w* (?:\\$0x0,\|%ax,\|%st,)?'

	1184 '(%(?:' + REGISTERS_RE[reg][0] + ')).*()',

	1185 (text2, '[%{}..%{}]'.format(start_reg, end_reg), ''),

	1186 tuple(('{:x}'.format(n + 8), '%' + REGISTERS[reg][n])

	1187 for n in nibble)))

	1188 # Another version for 64 bit case

	1189 if options.bitness == 64 and reg in ('eax', 'r8d'):

	1190 compressors.append(Compressor(

	1191 '.?[0-9a-fA-F](1)(?: 00)'

	1192 ' \\w* (?:\\$0x0,\|%ax,\|%st,)?'

	1193 '(%(?:' + REGISTERS_RE[reg][1] + ')).*'

	1194 'output_rr=(%(?:'+ REGISTERS_RE[reg][1] + ')).*()',

	1195 tuple([text1] + ['[%{}..%{}]'.format(start_reg, end_reg)] * 2 +
	halyavin 2013/12/06 13:02:51 Extract '[%{}..%{}]'.format(start_reg, end_reg) to Extract '[%{}..%{}]'.format(start_reg, end_reg) to variable. khim 2013/12/06 15:57:36 Done. Show quoted text On 2013/12/06 13:02:51, halyavin wrote: > Extract '[%{}..%{}]'.format(start_reg, end_reg) to variable. Done.
	1196 ['']),

	1197 tuple(['{:x}'.format(n)] + ['%' + REGISTERS[reg][n]] * 2

	1198 for n in nibble)))

	1199 compressors.append(Compressor(

	1200 '.?[0-9a-fA-F](8)(?: 00)'

	1201 ' \\w* (?:\\$0x0,\|%ax,\|%st,)?'

	1202 '(%(?:' + REGISTERS_RE[reg][0] + ')).*'

	1203 'output_rr=(%(?:'+ REGISTERS_RE[reg][0] + ')).*()',

	1204 tuple([text2] + ['[%{}..%{}]'.format(start_reg, end_reg)] * 2 +

	1205 ['']),

	1206 tuple(['{:x}'.format(n + 8)] + ['%' + REGISTERS[reg][n]] * 2

	1207 for n in nibble)))

	1208 return compressors

	1209

	1210

	1211 def AllMemoryNonMemoryCompressors():

	1212 """ Return memory/nonmemory compressors. """

	1213

	1214 compressors = []

	1215

	1216 if options.bitness == 32:

	1217 letters_and_registers = (('b', 'al', ''), ('w', 'ax', ''), ('l', 'eax', ''))

	1218 else:

	1219 letters_and_registers = (

	1220 ('b', 'al', 'eax'), ('b', 'spl', 'eax'), ('b', 'r8b', 'r8d'),

	1221 ('w', 'ax', 'eax'), ('w', 'r8w', 'r8d'),

	1222 ('l', 'eax', 'eax'), ('l', 'r8d', 'r8d'),

	1223 ('q', 'rax', 'eax'), ('q', 'r8', 'r8d')

	1224 )

	1225 for letter, reg, out_reg in letters_and_registers:

	1226 start_reg = REGISTERS[reg][0]

	1227 if reg[0:2] == 'r8':

	1228 end_regs = (REGISTERS[reg][-2], REGISTERS[reg][-1])

	1229 else:

	1230 end_regs = (REGISTERS[reg][-1], )

	1231 for end_reg in end_regs:

	1232 all_regs = '[%{}..%{}]'.format(start_reg, end_reg)

	1233 regs_mark = '[%{}..%{} or memory]'.format(start_reg, end_reg)

	1234 if options.bitness == 64:

	1235 start_out = REGISTERS[out_reg][0]

	1236 end_out = REGISTERS[out_reg][-1 if out_reg[0:2] != 'r8' else -2]

	1237 out_regs = '[%{}..%{}]'.format(start_out, end_out)

	1238 for notes in ('', ' # rm to reg', ' # reg to rm'):

	1239 compressors.append(Compressor(

	1240 '.* \\w(' + letter + ') .(\\[memory]).*()()',

	1241 ('[{}]?'.format(letter), regs_mark, '', ''),

	1242 ((letter, '[memory]', ''), ('', all_regs, notes))))

	1243 if options.bitness == 64:

	1244 for index_reg in ('eax', 'r8d'):

	1245 start_index = REGISTERS[index_reg][0]

	1246 end_index = REGISTERS[index_reg][-1]

	1247 index_regs = '[%{}..%{}]'.format(start_index, end_index)

	1248 for output_rrs in ((None, out_regs),

	1249 (out_regs, None),

	1250 (None, None)):

	1251 compressors.append(Compressor(

	1252 '.* \\w(' + letter + ') .(\\[memory]).*; '

	1253 'input_rr=(\\[%[a-z0-9]..%[a-z0-9]\\]); '

	1254 'output_rr=(\\[%[a-z0-9]..%[a-z0-9]\\]\|None)()()',

	1255 ('[{}]?'.format(letter), regs_mark, index_regs,

	1256 output_rrs[1] if output_rrs[0] is None else output_rrs[0],

	1257 '', ''),

	1258 ((letter, '[memory]', index_regs, output_rrs[0], ''),

	1259 ('', all_regs, 'any_nonspecial', output_rrs[1], notes))))

	1260 return compressors

	1261

	1262

	1263 def RexCompressor(rm, rmR15, reg, regR15, rexw, input_rr, output_rr, rm_to_reg):

	1264 """ Return REX compressor (or nothing) for a given set of paramenters.

	1265

	1266 Args:

	1267 rm: rm operand kind (see REGISTERS array) or None if rm is not used

	1268 reg: reg operand kind (see REGISTERS array) or None if reg is not used

	1269 rmR15: True if R15 register is included

	1270 regR15: True if R15 register is included

	1271 rexw: True if REX.W should be set

	1272 input_rr: True if input_rr is used

	1273 output_rr: true if output_rr is used

	1274 rm_to_reg: True if direction is rm to reg

	1275 """

	1276

	1277 # reg and rm can be of three different possible intervals

	1278 # start_reg/rm to end_reg/rm (e.g. from %al to %bh)

	1279 # start_reg/rm to end_reg0/rm0 (e.g from %al to %dil)

	1280 # start_reg8/rm8 to end_reg8/rm8 (e.g. from %r8b to %r15b)

	1281 # First form can be observed if there are not REX, second form

	1282 # if REX.R/REX.B is not set and third form is where it's set.

	1283 if reg:

	1284 start_reg = REGISTERS[reg][0]

	1285 start_reg8 = REGISTERS[r8[reg]][0]

	1286 end_reg = REGISTERS[reg][-1]

	1287 end_reg0 = 'dil' if reg == 'al' else end_reg

	1288 end_reg8 = REGISTERS[r8[reg]][-1 if regR15 else -2]

	1289 if rexw:

	1290 reg_regex = '\\[(%' + start_reg + '\\.\\.%' + end_reg0 + ')]'

	1291 else:

	1292 reg_regex = '\\[(%' + start_reg + '\\.\\.%' + end_reg + ')]'

	1293 if rm:

	1294 start_rm = REGISTERS[rm][0]

	1295 start_rm8 = REGISTERS[r8[rm]][0]

	1296 end_rm = REGISTERS[rm][-1]

	1297 end_rm0 = 'dil' if rm == 'al' else end_rm

	1298 end_rm8 = REGISTERS[r8[rm]][-1 if rmR15 else -2]

	1299 if rexw:

	1300 rm_regex = ('\\[(%' + start_rm + '\\.\\.%' + end_rm0 + ')'

	1301 '(?: or memory)?]')

	1302 else:

	1303 rm_regex = ('\\[(%' + start_rm + '\\.\\.%' + end_rm + ')'

	1304 '(?: or memory)?]')

	1305

	1306 # Legacy prefixes

	1307 regex = '.:(?: 26\| 2e\| 36\| 3e\| 64\| 65\| 66\| 67\| f0\| f2\| f3)'

	1308 # REX

	1309 regex += '( 48).' if rexw else '( 40\|).'

	1310 # Replacement text

	1311 replacement_tuple = (' [REX:48..4f]' if rexw else ' [REX:40..47]?', )

	1312 if reg: replacement_regs = '%{}..%{}'.format(start_reg, end_reg8)

	1313 if rm: replacement_rms = '%{}..%{}'.format(start_rm, end_rm8)

	1314 # Instruction arguments

	1315 if not reg and not rm:

	1316 pass

	1317 elif not reg and rm:

	1318 regex += rm_regex + '.*'

	1319 replacement_tuple += (replacement_rms, )

	1320 elif reg and not rm:

	1321 regex += reg_regex + '.*'

	1322 replacement_tuple += (replacement_regs, )

	1323 elif rm_to_reg:

	1324 regex += rm_regex + ',' + reg_regex + '.*'

	1325 replacement_tuple += (replacement_rms, replacement_regs)

	1326 else:

	1327 regex += reg_regex + ',' + rm_regex + '.*'

	1328 replacement_tuple += (replacement_regs, replacement_rms)

	1329 # Input and output restricted registers

	1330 if input_rr:

	1331 regex += 'input_rr=\\[(%eax\\.\\.%edi)].*'

	1332 replacement_tuple += ('%eax..%r15d', )

	1333 if output_rr:

	1334 regex += 'output_rr=\\[(%eax\\.\\.%edi)].*'

	1335 replacement_tuple += ('%eax..%r14d', )

	1336 regex += '()'

	1337 replacement_tuple += ('', )

	1338 # Replacement cases

	1339 replacement_tuples = ()

	1340 for byte in (range(0x48, 0x50) if rexw else range(0x40, 0x48) + [0]):

	1341 replacement_case = (' {:02x}'.format(byte) if byte else '', )

	1342 if rm:

	1343 if byte & 0x1:

	1344 replacement_rms = '%{}..%{}'.format(start_rm8, end_rm8)

	1345 elif byte:

	1346 replacement_rms = '%{}..%{}'.format(start_rm, end_rm0)

	1347 else:

	1348 replacement_rms = '%{}..%{}'.format(start_rm, end_rm)

	1349 if byte & 0x2:

	1350 replacement_index = '%r8d..%r15d'

	1351 else:

	1352 replacement_index = '%eax..%edi'

	1353 if reg:

	1354 if byte & 0x4:

	1355 replacement_regs = '%{}..%{}'.format(start_reg8, end_reg8)

	1356 elif byte:

	1357 replacement_regs = '%{}..%{}'.format(start_reg, end_reg0)

	1358 else:

	1359 replacement_regs = '%{}..%{}'.format(start_reg, end_reg)

	1360 if not reg and not rm:

	1361 pass

	1362 elif not reg and rm:

	1363 replacement_case += (replacement_rms, )

	1364 final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'

	1365 elif reg and not rm:

	1366 replacement_case += (replacement_regs, )

	1367 final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'

	1368 elif rm_to_reg:

	1369 replacement_case += (replacement_rms, replacement_regs)

	1370 final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'

	1371 else:

	1372 replacement_case += (replacement_regs, replacement_rms)

	1373 final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'

	1374 if input_rr: replacement_case += (replacement_index, )

	1375 if output_rr: replacement_case += (final_rr, )

	1376 replacement_tuples += (replacement_case, )

	1377 return Compressor(regex, replacement_tuple, replacement_tuples)

	1378

	1379

	1380 def AllRexCompressors():

	1381 """ Return "REX" compressors which combine different REX prefixes. """

	1382

	1383 if options.bitness != 64:

	1384 return []

	1385

	1386 compressors = []

	1387

	1388 # First pretty complex set of compressors to combine versions of REX with

	1389 # three lowest bits in different states.

	1390 register_kind_pairs = (

	1391 ( None, None),

	1392 ( 'al', 'al'), ( 'al', None), (None, 'al'),

	1393 ( 'ax', 'al'), ( 'al', 'ax'),

	1394 ( 'ax', 'ax'), ( 'ax', None), (None, 'ax'),

	1395 ( 'eax', 'al'), ( 'al', 'eax'),

	1396 ( 'eax', 'ax'), ( 'ax', 'eax'),

	1397 ( 'eax', 'eax'), ( 'eax', None), (None, 'eax'),

	1398 ( 'rax', 'al'), ( 'al', 'rax'),

	1399 ( 'rax', 'ax'), ( 'ax', 'rax'),

	1400 ( 'rax', 'eax'), ( 'eax', 'rax'),

	1401 ( 'rax', 'rax'), ( 'rax', None), (None, 'rax'),

	1402 ( 'eax', 'mm0'), ( 'mm0', 'eax'),

	1403 ( 'rax', 'mm0'), ( 'mm0', 'rax'),

	1404 ( 'mm0', 'eax'), ( 'eax', 'mm0'),

	1405 ( 'mm0', 'rax'), ( 'rax', 'mm0'),

	1406 ( 'eax', 'xmm0'),

	1407 ( 'rax', 'xmm0'),

	1408 ('xmm0', 'eax'),

	1409 ('xmm0', 'rax'),

	1410 ( 'mm0', 'mm0'), ( 'mm0', None), (None, 'mm0'),

	1411 ( 'mm0', 'xmm0'),

	1412 ('xmm0', 'mm0'),

	1413 ('xmm0', 'xmm0'),

	1414 ('xmm0', 'ymm0'), ('xmm0', None), (None, 'xmm0'),

	1415 ('ymm0', 'xmm0'),

	1416 ('ymm0', 'ymm0'), ('ymm0', None), (None, 'ymm0'),

	1417 )

	1418 for reg, rm in register_kind_pairs:

	1419 for regR15 in (True, False):

	1420 for rmR15 in (True, False):

	1421 for rexw in (True, False):

	1422 for input_rr in (True, False):

	1423 for output_rr in (True, False):

	1424 for rm_to_reg in (True, False):

	1425 # These combinations will just produce useless duplicates

	1426 if not reg and not regR15: continue

	1427 if not rm and not rmR15: continue

	1428 if not reg and not rm and (output_rr or rm_to_reg): continue

	1429 compressors.append(RexCompressor(

	1430 rm=rm, rmR15=rmR15, reg=reg, regR15=regR15,

	1431 rexw=rexw, input_rr=input_rr, output_rr=output_rr,

	1432 rm_to_reg=rm_to_reg))

	1433

	1434 # This is pretty simple compressor to combine two lines with different REX.W

	1435 # bits (only if they are otherwise identical).

	1436 compressors.append(Compressor(

	1437 '.(\\[REX:40\\.\\.47]\\?).()', ('[REX:40..4f]?', ''),

	1438 (('[REX:40..47]?', ), ('[REX:48..4f]', ))))

	1439 return compressors

	1440

	1441

	1442 def AllSpecialCompressors():

	1443 """ Return all "special" compressors. """

	1444

	1445 compressors = []

	1446

	1447 # Special compressors: will handle some cosmetic issues.

	1448 #

	1449 # SETxx ignores reg field and thus are described as many separate instructions

	1450 compressors.append(Compressor(

	1451 '.0f 9[0-9a-fA-F] XX(/[0-7]) set.()', ('', ''),

	1452 [('/' + str(i), ) for i in range(8)]))

	1453 # BSWAP is described with opcode "0f c8+r", not "0f /1" in manual

	1454 if options.bitness == 32:

	1455 compressors.append(Compressor(

	1456 '.(XX/1) bswap.ax.*()', ('c[8..f]', ''), [('XX/1', )]))

	1457 else:

	1458 compressors.append(Compressor(

	1459 '.(XX/1) bswap.ax.*()', ('c[89abef]', ''), [('XX/1', )]))

	1460 compressors.append(Compressor(

	1461 '.(XX/1) bswap.r8.*()', ('c[8..e]', ''), [('XX/1', )]))

	1462 # Add mark '# write to both' to certain versions of CMPXCHG, XADD, and XCHG

	1463 if options.bitness == 64:

	1464 compressors.append(Compressor(

	1465 '.* (?:cmpxchg\|xadd\|xchg).%al\\.\\.%bh[^#]()$',

	1466 (' # write to both', ), ((), )))

	1467 # "and $0xe0,[%eax..%edi]" is treated specially which means that we list all

	1468 # versions of and "[$0x1..$0xff],[%eax..%edi]" separately here.

	1469 # Without this rule these ands comprise 2/3 of the whole output!

	1470 if options.bitness == 32:

	1471 compressors.append(Compressor(

	1472 '.*83 (e0 01 and \\$0x1,%eax)()',

	1473 ('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', ' # special and'),

	1474 [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS['eax'][r]), )

	1475 for i in range(0x01, 0x100) for r in range(8)] +

	1476 [('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', )]))

	1477 else:

	1478 for reg in ('eax', 'r8d'):

	1479 start_reg = REGISTERS[reg][0]

	1480 end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]

	1481 for index_reg in ('eax', 'r8d'):

	1482 start_index = REGISTERS[index_reg][0]

	1483 end_index = REGISTERS[index_reg][-1]

	1484 compressors.append(Compressor(

	1485 '.83 (e0 01 and \\$0x1,%' + reg + ').'

	1486 'input_rr=(any_nonspecial); output_rr=(%' + reg + ')()',

	1487 ('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,

	1488 end_reg), '[%{}..%{}]'.format(start_index, end_index),

	1489 '[%{}..%{}]'.format(start_reg, end_reg),

	1490 ' # special and'),

	1491 [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS[reg][r]),

	1492 'any_nonspecial', '%' + REGISTERS[reg][r])

	1493 for i in range(0x01, 0x100) for r in range(7 + (reg == 'eax'))] +

	1494 [('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,

	1495 end_reg), '[%{}..%{}]'.format(start_index, end_index),

	1496 '[%{}..%{}]'.format(start_reg, end_reg))]))

	1497

	1498 # "and $e0" and similar are used to align %rsp. All negative values are

	1499 # accepted by validator and there are 127 of these.

	1500 # Consolidate them into one line.

	1501 if options.bitness == 64:

	1502 compressors.append(Compressor(

	1503 '.(?:81\|83) (?:e4\|e5) (80) (?:00 00 00 \|) and \\$0x(80),%r[bs]p.()',

	1504 ('[80..ff]', '[80..ff]', ' # alignment and'),

	1505 [('{:02x}'.format(i), '{:02x}'.format(i)) for i in range(0x80, 0x100)]))

	1506 return compressors

	1507

	1508

	1509 def PrepareCompressors():

	1510 """ Return list of all compressors sorted from bigger ones to smaller ones """

	1511

	1512 return tuple(sorted(

	1513 AllRegToRmCompressors() +

	1514 AllRmCompressors() +

	1515 AllOpcodeCompressors() +

	1516 AllMemoryNonMemoryCompressors() +

	1517 AllRexCompressors() +

	1518 AllSpecialCompressors(),

	1519 key=lambda compressor: -len(compressor.replacements)))

	1520

	1521

	1522 def ShowProgress(rule, instruction):

	1523 if rule not in ShowProgress.rules_shown:

	1524 first_print = True

	1525 ShowProgress.rules_shown[rule]=len(ShowProgress.rules_shown)

	1526 else:

	1527 first_print = False

	1528 print >> sys.stderr, '-------- Compressed --------'

	1529 print >> sys.stderr, 'Rule:', ShowProgress.rules_shown[rule]

	1530 print >> sys.stderr, '--------'

	1531 compressor = compressors[rule]

	1532 match = compressor.regex.match(instruction)

	1533 assert match

	1534 format_str = CompressionTemplate(instruction, match, '{{{}}}')

	1535 replacements = sorted(format_str.format(*replacement)

	1536 for replacement in compressor.replacements)

	1537 if len(compressor.replacements) <= 4 or first_print:

	1538 for replacement in replacements:

	1539 print >> sys.stderr, replacement

	1540 else:

	1541 print >> sys.stderr, replacements[0]

	1542 print >> sys.stderr, '...'

	1543 print >> sys.stderr, replacements[-1]

	1544 print >> sys.stderr, '--------'

	1545 print >> sys.stderr, 'Compressed', (

	1546 format_str + '{{{}}}').format(*compressor.subst)

	1547 ShowProgress.rules_shown = {}

	1548

	1549

	1550 def main():

	1551 # We are keeping these global to share state graph and compressors

	1552 # between workers spawned by multiprocess. Passing them every time is slow.

	1553 global options, xml_file

	1554 global dfa

	1555 global worker_validator

	1556 options, xml_file = ParseOptions()

	1557 dfa = dfa_parser.ParseXml(xml_file)

	1558 worker_validator = validator.Validator(

	1559 validator_dll=options.validator_dll,

	1560 decoder_dll=options.decoder_dll)

	1561 global compressors

	1562 compressors = PrepareCompressors()

	1563

	1564 assert dfa.initial_state.is_accepting

	1565 assert not dfa.initial_state.any_byte

	1566

	1567 print >> sys.stderr, len(dfa.states), 'states'

	1568

	1569 num_suffixes = dfa_traversal.GetNumSuffixes(dfa.initial_state)

	1570

	1571 # We can't just write 'num_suffixes[dfa.initial_state]' because

	1572 # initial state is accepting.

	1573 total_instructions = sum(

	1574 num_suffixes[t.to_state]

	1575 for t in dfa.initial_state.forward_transitions.values())

	1576 print >> sys.stderr, total_instructions, 'regular instructions total'

	1577

	1578 tasks = dfa_traversal.CreateTraversalTasks(dfa.states, dfa.initial_state)

	1579 print >> sys.stderr, len(tasks), 'tasks'

	1580

	1581 pool = multiprocessing.Pool()

	1582

	1583 results = pool.imap(Worker, tasks)

	1584

	1585 total = 0

	1586 num_valid = 0

	1587 full_output = set()

	1588 for prefix, count, valid_count, output, trace in results:

	1589 print >> sys.stderr, 'Prefix:', ', '.join(map(hex, prefix))

	1590 total += count

	1591 num_valid += valid_count

	1592 full_output \|= output

	1593 for rule, instruction in trace:

	1594 ShowProgress(rule, instruction)

	1595 for instruction in sorted(Compressed(full_output,

	1596 compressors,

	1597 ShowProgress)):

	1598 print instruction

	1599

	1600 print >> sys.stderr, total, 'instructions were processed'

	1601 print >> sys.stderr, num_valid, 'valid instructions'

	1602

	1603

# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
  main()

OLD	NEW

« no previous file with comments | « no previous file | src/trusted/validator_ragel/testdata/32bit_regular.golden » ('j') | no next file with comments »