Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(9)

Side by Side Diff: src/trusted/validator_ragel/compress_regular_instructions.py

Issue 49183002: Regular instructions golden file test. Base URL: svn://svn.chromium.org/native_client/trunk/src/native_client/
Patch Set: Created 7 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | src/trusted/validator_ragel/testdata/32bit_regular.golden » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright (c) 2013 The Native Client Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 """
6 Traverse the validator's DFA, collect all "normal" instruction and then
halyavin 2013/12/07 17:27:15 instruction -> instructions
khim 2013/12/08 23:22:31 Done.
7 compress output. Note: "anybyte fields" (immediates and displacements)
8 are always filled with zeros. Otherwise processing of sextillions (sic!)
9 of possibilities will take too long.
10
11 Each rule is applied only when all variants are accepted by validator.
12 The following compression rules are present:
13
14 1. Compress ModR/M (+SIB & displacement).
15 Instruction: 00 00 add %al,(%rax)
16 ...
17 Instruction: 00 ff add %bh,%bh
18 becomes
19 Instruction: 00 XX add [%al..%bh],[%al..%bh or memory]
20
21 1a. Compress ModR/M (+SIB & displacement) memory-only.
22 Instruction: f0 01 00 lock add %eax,(%eax)
23 ...
24 Instruction: f0 01 bf 00 00 00 00 lock add %edi,0x0(%edi)
25 becomes
26 Instruction: f0 01 XX lock add [%eax..edi],[memory]
27
28 1b. Compress ModR/M register only.
29 Instruction: 66 0f 50 c0 movmskpd %xmm0,%eax
30 ...
31 Instruction: 66 0f 50 ff movmskpd %xmm7,%edi
32 becomes
33 Instruction: 66 0f 50 XX movmskpd [%xmm0..%xmm7],[%eax..edi]
34
35 2. Compress ModR/M (+SIB & displacement) with opcode extension.
36 Instruction: 0f 90 00 seto (%eax)
37 ...
38 Instruction: 0f 90 c7 seto %bh
39 becomes
40 Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
41
42 2a. Compress ModR/M (+SIB & displacement) memory-only with opcode extension.
43 Instruction: f0 ff 00 lock incl (%eax)
44 ...
45 Instruction: f0 ff 84 ff 00 00 00 00 lock incl 0x0(%edi,%edi,8)
46 becomes
47 Instruction: f0 ff XX/1 lock decl [memory]
48
49 2b. Compress ModR/M register-only with opcode extension.
50 Instruction: 0f 71 d0 00 psrlw $0x0,%mm0
51 ...
52 Instruction: 0f 71 d7 00 psrlw $0x0,%mm7
53 becomes
54 Instruction: 66 0f 71 XX/2 00 psrlw $0x0,[%mm0..%mm7]
55
56 3. Compress register-in-opcode.
57 Instruction: d9 c0 fld %st(0)
58 ...
59 Instruction: d9 c7 fld %st(7)
60 becomes
61 Instruction: Instruction: d9 c[0..7] fld [%st(0)..%st(7)]
62
63 Only applies if all possible register accesses are accepted by validator.
64
65 4. Special compressor for "set" instruction.
66 Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
67 ...
68 Instruction: 0f 90 XX/7 seto [%al..%bh or memory]
69 becomes
70 Instruction: 0f 90 XX seto [%al..%bh or memory]
71 """
72
73 import itertools
74 import multiprocessing
75 import optparse
76 import os
77 import re
78 import subprocess
79 import sys
80 import tempfile
81 import traceback
82
83 import dfa_parser
84 import dfa_traversal
85 import validator
86
87
88 # Register names in 'natual' order (as defined by IA32/x86-64 ABI)
89 #
90 # X86-64 ABI splits all registers in groups of 8 because it uses 3-bit field
91 # in opcode, ModR/M, and/or SIB bytes to encode them.
92 #
93 # In most cases there are 16 registers of a given kind and two such groups,
94 # but there are couple of exceptions:
95 # 1. There are 20 8-bit registers and three groups (two of them overlap)
96 # 2. There are eight X87 and MMX registers thus two groups are identical
97 #
98 # We use typical register from a group to name the whole group. Most groups
99 # use first register, but 'spl' group uses fifth register because it's first
halyavin 2013/12/07 17:27:15 are named by the first register but 'spl' group is
khim 2013/12/08 23:22:31 Done.
100 # four registers are the same as 'al' group. We use mnemonic name 'mmalt'
101 # to represent the "evil mirror" of the 'mm0' group.
102 REGISTERS = {
103 'al': [ 'al', 'cl', 'dl', 'bl', 'ah', 'ch', 'dh', 'bh' ],
104 'spl': [ 'al', 'cl', 'dl', 'bl', 'spl', 'bpl', 'sil', 'dil' ],
105 'ax': [ 'ax', 'cx', 'dx', 'bx', 'sp', 'bp', 'si', 'di' ],
106 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'esp', 'ebp', 'esi', 'edi' ],
107 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'rsp', 'rbp', 'rsi', 'rdi' ],
108 'r8b': [ 'r{}b'.format(N) for N in range(8,16) ],
109 'r8w': [ 'r{}w'.format(N) for N in range(8,16) ],
110 'r8d': [ 'r{}d'.format(N) for N in range(8,16) ],
111 'r8': [ 'r{}'.format(N) for N in range(8,16) ],
112 'mm0': [ 'mm{}'.format(N) for N in range(8) ],
113 'mmalt': [ 'mm{}'.format(N) for N in range(8) ],
114 'st(0)': [ 'st({})'.format(N) for N in range(8) ],
115 'xmm0': [ 'xmm{}'.format(N) for N in range(8) ],
116 'xmm8': [ 'xmm{}'.format(N) for N in range(8,16) ],
117 'ymm0': [ 'ymm{}'.format(N) for N in range(8) ],
118 'ymm8': [ 'ymm{}'.format(N) for N in range(8,16) ]
119 }
120
121
122 NOP = 0x90
123
124
125 def PadToBundleSize(bytes):
126 assert len(bytes) <= validator.BUNDLE_SIZE
127 return bytes + [NOP] * (validator.BUNDLE_SIZE - len(bytes))
128
129
130 # In x86-64 mode we have so-called 'restricted register' which is used to
131 # tie two groups together. Some instructions require particular value to
132 # be stored in this variable, while some accept any non-special restricted
133 # register (%ebp and %esp are special because they can only be accepted by
134 # a few 'special' instructions).
135 #
136 # You can find more details in the "NaCl SFI model on x86-64 systems" manual.
137 #
138 # We try to feed all possible 'restricted registers' into validator and then
139 # classify the instruction using this map. If set of acceptable 'restricted
140 # registers' is not here, then it's an error in validator.
141 ACCEPTABLE_X86_64_INPUTS = {
142 0x00001: 'input_rr=%eax',
143 0x00002: 'input_rr=%ecx',
144 0x00004: 'input_rr=%edx',
145 0x00008: 'input_rr=%ebx',
146 0x00010: 'input_rr=%esp',
147 0x00020: 'input_rr=%ebp',
148 0x00040: 'input_rr=%esi',
149 0x00080: 'input_rr=%edi',
150 0x00100: 'input_rr=%r8d',
151 0x00200: 'input_rr=%r9d',
152 0x00400: 'input_rr=%r10d',
153 0x00800: 'input_rr=%r11d',
154 0x01000: 'input_rr=%r12d',
155 0x02000: 'input_rr=%r13d',
156 0x04000: 'input_rr=%r14d',
157 0x08000: 'input_rr=%r15d',
158 0x1ffcf: 'input_rr=any_nonspecial'
159 }
160
161 # Any instruction must produce either None or one of fifteen registers as an
162 # output 'restricted register' value. 'r15d' is NOT acceptable as an output.
163 ACCEPTABLE_X86_64_OUTPUT_REGISTERS = tuple(
164 '%' + reg for reg in (REGISTERS['eax'] + REGISTERS['r8d'])[0:-1])
165
166
167 def ValidateInstruction(instruction, validator_inst):
168 bundle = ''.join(map(chr, PadToBundleSize(instruction)))
169 if options.bitness == 32:
170 result = validator_inst.ValidateChunk(bundle, bitness=32)
171 return result, []
172 else:
173 valid_inputs = 0
174 known_final_rr = None
175 output_rr = None
176 # Note that iteration order is aligned with ACCEPTABLE_X86_64_INPUTS array
177 # above.
178 for bit, initial_rr in enumerate(validator.ALL_REGISTERS + [None]):
179 valid, final_rr = validator_inst.ValidateAndGetFinalRestrictedRegister(
180 bundle, len(instruction), initial_rr)
181 if valid:
182 # final_rr should not depend on input_rr
183 assert valid_inputs == 0 or known_final_rr == final_rr
184 valid_inputs |= 1 << bit
185 known_final_rr = final_rr
186 # If nothing is accepted then instruction is not valid. Easy and simple.
187 if valid_inputs == 0: return False, []
188 # If returned value in unacceptable we'll get IndexError here and this
189 # test will fail
190 if known_final_rr is not None:
191 output_rr = ACCEPTABLE_X86_64_OUTPUT_REGISTERS[known_final_rr]
192 # If collected valid_inputs are unacceptable we'll get KeyError here and
193 # this test will fail
194 return True, [ACCEPTABLE_X86_64_INPUTS[valid_inputs],
195 'output_rr={}'.format(output_rr)]
196
197
198 class WorkerState(object):
199 def __init__(self, prefix, validator):
200 self.total_instructions = 0
201 self.num_valid = 0
202 self.validator = validator
203 self.output = set()
204 self.trace = []
205
206
207 def ReceiveInstruction(self, bytes):
208 self.total_instructions += 1
209 result, notes = ValidateInstruction(bytes, self.validator)
210 if result:
211 self.num_valid += 1
212 dis = self.validator.DisassembleChunk(
213 ''.join(map(chr, bytes)),
214 bitness=options.bitness)
215 for line_nr in xrange(len(dis)):
216 dis[line_nr] = str(dis[line_nr])
217 assert dis[line_nr][0:17] == 'Instruction(0x' + str(line_nr) + ': '
218 assert dis[line_nr][-1:] == ')'
219 dis[line_nr] = dis[line_nr][17:-1]
220 # If %rip is involved then comment will be different depending on the
221 # instruction length. Eliminate it.
222 if '(%rip)' in dis[0]:
223 dis[0] = re.sub(' # 0x[ ]*[0-9a-fA-F]*', '', dis[0])
224 # Zero displacements are represented as 0x0 for all instructions except
225 # jumps where they disassembled as non-zero due to %eip/%rip-relative
226 # addressing. We replace this displacement with %eip/%rip to simplify
227 # compression.
228 if ' 0x' in dis[0] and ' 0x0' not in dis[0]:
229 for bytes in xrange(1, 16):
230 dis[0] = re.sub(
231 '(' + '(?:[0-9a-fA-F][0-9a-fA-F] ){' + str(bytes) + '} .* )' +
232 hex(bytes) + '(.*)',
233 '\\1%eip\\2' if options.bitness == 32 else '\\1%rip\\2',
234 dis[0]);
235 dis[0] = 'Instruction: ' + dis[0]
236 dis += notes
237 self.output.add('; '.join(dis))
238
239
240 def RecordTrace(self, compressor_nr, instruction):
241 self.trace.append((compressor_nr, instruction))
242
243
244 # Compressor has three slots: regex (which picks apart given instruction),
245 # subst (which is used to denote compressed version) and replacements (which
halyavin 2013/12/07 17:27:15 which is used to create the compressed version
khim 2013/12/08 23:22:31 Done.
246 # are used to generate set of instructions from a given code).
247 #
248 # Example compressor:
249 # regex = '.*?[0-9a-fA-F]([0-7]) \\w* (%e(?:[abcd]x|[sb]p|[sd]i)).*()'
250 # subst = ('[0-7]', '[%eax..%edi]', ' # register in opcode')
251 # replacements = ((0, '%eax'), (1, '%ecx'), (2, '%edx'), (3, '%ebx')
252 # (4, '%esp'), (5, '%ebp'), (6, '%esi'), (7, '%edi'))
253 #
254 # When faced with instriuction '40 inc %eax' it will capture the following
halyavin 2013/12/07 17:27:15 with instruction
khim 2013/12/08 23:22:31 Done.
255 # pieces of said instruction: '4[0] inc [%eax]'.
halyavin 2013/12/07 17:27:15 Add the empty group which will be replaced with th
khim 2013/12/08 23:22:31 Done.
256 #
257 # Then it will produce the following eight instructions:
258 # '40 inc %eax'
259 # '41 inc %ecx'
260 # '42 inc %edx'
261 # '43 inc %ebx'
262 # '44 inc %esp'
263 # '45 inc %ebp'
264 # '46 inc %esi'
265 # '47 inc %edi'
266 #
267 # If all these instructions can be found in a set of instructions then
268 # compressor will remove them from said set and will insert one replacement
269 # "compressed instruction" '4[0-7] inc [%eax..%edi] # register in opcode'.
270 #
271 # Note that last group is only used in the replacement. It's used to grab marks
halyavin 2013/12/07 17:27:15 the last group
halyavin 2013/12/07 17:27:15 Replacement is a poor choice of word in this sente
khim 2013/12/08 23:22:31 Done.
khim 2013/12/08 23:22:31 Done.
272 # added by previous compressors and to replace them with a new mark.
273 class Compressor(object):
274 __slots__ = [
275 'regex',
276 'subst',
277 'replacements'
278 ]
279
280 def __init__(self, regex, subst, replacements=None):
halyavin 2013/12/07 17:27:15 replacements are never None. Such rules are useles
khim 2013/12/08 23:22:31 Done.
281 self.regex = re.compile(regex)
282 self.subst = subst
283 self.replacements = [] if replacements is None else replacements
halyavin 2013/12/07 17:27:15 This line can be simplified to self.replacements=r
khim 2013/12/08 23:22:31 Done.
284
285
286 def CompressionTemplate(instruction, match, mark):
287 """ Replace all match groups with the mark. """
288 pos = 0
289 format_str = ''
290 for group in range(1, len(match.groups())):
291 format_str += instruction[pos:match.start(group)] + mark
292 pos = match.end(group)
293 return format_str + instruction[pos:match.start(len(match.groups()))]
halyavin 2013/12/07 17:27:15 You are not replacing the last group with the mark
khim 2013/12/08 23:22:31 Actually this behavior is bad. Take a look on All
294
295
296 def CompressOneMatch(instructions, instruction, match, compressor):
297 format_str = CompressionTemplate(instruction, match, '{}')
298 subset = set()
299 for replacement in compressor.replacements:
300 replacement_str = format_str.format(*replacement)
301 if not replacement_str in instructions:
302 return (False, instructions)
303 subset.add(replacement_str)
304 instructions -= subset
halyavin 2013/12/07 19:51:15 Add "assert instruction in subset". We may see a f
khim 2013/12/08 20:13:04 No surprises. Done.
305 instructions.add((format_str + '{}').format(*compressor.subst))
306 return (True, instructions)
307
308
309 def CompressOneInstruction(instructions, compressors, split, cache):
310 sorted_instructions = (sorted(i for i in instructions if i > split) +
311 sorted(i for i in instructions if i < split))
312 for instruction in sorted_instructions:
313 if instruction in cache:
314 compressors_list = cache[instruction]
315 for compressor_nr, match, compressor in compressors_list:
316 result, instructions = CompressOneMatch(
317 instructions, instruction, match, compressor)
318 if result:
319 return (instructions, compressor_nr, instruction)
320 else:
321 compressors_list = []
322 for compressor_nr, compressor in enumerate(compressors):
323 match = compressor.regex.match(instruction)
324 if match:
325 compressors_list.append((compressor_nr, match, compressor))
326 result, instructions = CompressOneMatch(
327 instructions, instruction, match, compressor)
328 if result:
329 return (instructions, compressor_nr, instruction)
330 cache[instruction] = compressors_list
331 return (instructions, False, False)
332
333
334 def Compressed(instructions, compressors, show_progress):
335 split = ''
336 cache = {}
337 while True:
338 instructions, rule, split = CompressOneInstruction(
339 instructions, compressors, split, cache)
340 if rule is False: break
341 show_progress(rule, split)
342 return instructions
343
344
345 def Worker((prefix, state_index)):
346 worker_state = WorkerState(prefix, worker_validator)
347
348 try:
349 dfa_traversal.TraverseTree(
350 dfa.states[state_index],
351 final_callback=worker_state.ReceiveInstruction,
352 prefix=prefix,
353 anyfield=0)
354 if (prefix[0] != 0x0f or prefix[1] != 0x0f): # Skip 3DNow! instructions
355 worker_state.output = Compressed(set(worker_state.output),
356 compressors,
357 worker_state.RecordTrace)
358 except Exception as e:
359 traceback.print_exc() # because multiprocessing imap swallows traceback
360 raise
361
362 return (
363 prefix,
364 worker_state.total_instructions,
365 worker_state.num_valid,
366 worker_state.output,
367 worker_state.trace)
368
369
370 def ParseOptions():
371 parser = optparse.OptionParser(usage='%prog [options] xmlfile')
372
373 parser.add_option('--bitness',
374 choices=['32', '64'],
375 help='The subarchitecture: 32 or 64')
376 parser.add_option('--validator_dll',
377 help='Path to librdfa_validator_dll')
378 parser.add_option('--decoder_dll',
379 help='Path to librdfa_decoder_dll')
380
381 options, args = parser.parse_args()
382 options.bitness = int(options.bitness)
383
384 if len(args) != 1:
385 parser.error('specify one xml file')
386
387 (xml_file, ) = args
388
389 return options, xml_file
390
391
392 # Version suitable for use in regular expressions
393 REGISTERS_RE = REGISTERS.copy()
394 REGISTERS_RE['st(0)'] = [ 'st\\({}\\)'.format(N) for N in range(8) ]
395 REGISTERS_RE['st\\(0\\)'] = REGISTERS_RE['st(0)']
halyavin 2013/12/07 17:27:15 Do we need this line?
khim 2013/12/08 23:22:31 Probably not. Removed.
396
397 # Index names in 'natual' order (as defined by IA32/x86-64 ABI)
398 INDEXES = {
399 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'eiz', 'ebp', 'esi', 'edi' ],
400 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'riz', 'rbp', 'rsi', 'rdi' ],
401 'r8': [ 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15' ]
402 }
403 # Register which can not be used as base in 64-bit mode in all incarnations
halyavin 2013/12/07 17:27:15 The comment is wrong. We do can use %r15, %rbp, %r
khim 2013/12/08 23:22:31 Done.
404 X86_64_BASE_REGISTERS = set([
405 '%spl', '%bpl', '%r15b',
406 '%sp', '%bp', '%r15w',
407 '%esp', '%ebp', '%r15d',
408 '%rsp', '%rbp', '%r15',
409 '%rip'
410 ])
411
412
413 def InstructionIsDangerous(input, output, register_write,
414 writes_to, memory_accessed=False,
415 base_text='%riz', index_text='%riz'):
416 """ Check if instruction with given replacements will be dangerous
417
418 Args:
419 input: input argument
420 output: output argument
421 register_write: three-state selector
422 'sandbox' - instruction can be used to produce "restricted register"
423 'protect' - instruction can damage output, protect "special registers"
424 'ignore' - instruction does not affect it's operands (e.g. test) or
425 is used with non-GP registers (X87, MMX, XMM, etc)
426 memory_accessed: True if instruction accesses memory
427 base: base register (if memory is accessed)
428 index: index register (if memory is accessed)
429
430 Returns:
431 True if instruction should be rejected by validator
432 """
433 if memory_accessed:
434 if base_text not in X86_64_BASE_REGISTERS:
435 return True
436 if index_text in X86_64_BASE_REGISTERS - set(['%r15']):
halyavin 2013/12/07 17:27:15 We wanted to add comment here.
khim 2013/12/08 23:22:31 Done.
437 return True
438 if register_write == 'protect' and output in X86_64_BASE_REGISTERS:
439 return True
440 if register_write == 'sandbox' and output == '%r15d':
441 return True
442 if writes_to == 'both' and input in X86_64_BASE_REGISTERS:
443 return True
444 return False
445
446
447 def AppendOperandsReplacement(replacement, rm_text, reg, modrm, writes_to):
448 """ Appends replacement text to replacement list
449
450 Args:
451 replacement: replacement list
452 rm_text: replacement for rm field
453 reg: register kind (or None if reg field is used as opcode extension)
454 modrm: modrm byte
455 writes_to: three-state selector
456 'reg' - instruction uses rm as source, reg as destination
457 'rm' - instruction uses reg as source, rm as destination
458 'both' - instruction writes to both reg and rm
459
460 Returns:
461 input: textual representation of input argument
462 output: textual representation of output argument
463
464 Side-effect:
465 output (if reg is None) or (input, output) tuple (if reg is not None)
466 are added to replacement list.
467 """
468 if reg is None:
469 assert writes_to == 'rm'
470 input, output = None, rm_text
471 replacement.append(output)
472 else:
473 reg_field = (modrm >> 3) & 0x07
474 reg_text = '%' + REGISTERS[reg][reg_field]
475 if writes_to == 'reg':
476 input, output = rm_text, reg_text
477 else: # rm, both
478 input, output = reg_text, rm_text
479 replacement.extend([input, output])
480 return input, output
481
482
483 def ModRMRegisterReplacements(rm, reg=None, writes_to='rm', opcode_bits=0,
484 register_write='ignore'):
485 """Creates replacement tuples list for register-to-register instructions
486
487 Args:
488 rm: rm operand kind (see REGISTERS array)
489 reg: reg operand kind (see REGISTERS array) or None if reg is not used
490 writes_to: three-state selector
491 'reg' - instruction uses rm as source, reg as destination
492 'rm' - instruction uses reg as source, rm as destination
493 'both' - instruction writes to both reg and rm
494 opcode_bits: opcode extensions code (used when reg is None)
495 register_write: three-state selector
496 'sandbox' - instruction can be used to produce "restricted register"
497 'protect' - instruction can damage output, protect "special registers"
498 'ignore' - instruction does not affect it's operands (e.g. test) or
499 is used with non-GP registers (X87, MMX, XMM, etc)
500 Returns:
501 List of replacement tuples
502 """
503 # Reg field can be used either as reg or as opcode extension, but not both
504 assert reg is None or opcode_bits == 0
505
506 output_key = (options.bitness, reg, rm, writes_to, opcode_bits,
507 register_write)
508 if output_key in ModRMRegisterReplacements.replacements:
509 return ModRMRegisterReplacements.replacements[output_key]
510
511 replacements = []
512
513 # Two upper bits of ModR/M byte (mod field) must be equal to 11
514 # This gives us range from 0xc0 to 0xff but we are going from the
515 # end to make rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
516 if reg is None:
517 # reg field is used as opcode extension
518 byte_range = [byte
519 for byte in range(0xff, 0xbf, -1)
520 if (byte >> 3) & 0x7 == opcode_bits]
521 else:
522 byte_range = range(0xff, 0xbf, -1)
523
524 for modrm in byte_range:
525 rm_field = (modrm & 0x07)
526 rm_text = '%' + REGISTERS[rm][rm_field]
527 byte_text = '{:02x}'.format(modrm)
528 replacement = [byte_text]
529 input, output = AppendOperandsReplacement(
530 replacement, rm_text, reg, modrm, writes_to)
531 if options.bitness == 64:
532 replacement.append('any_nonspecial') # input_rr
533 replacement.append(output if register_write == 'sandbox' else None)
534 if InstructionIsDangerous(input, output, register_write, writes_to):
535 continue
536 replacements.append(tuple(replacement))
537 ModRMRegisterReplacements.replacements[output_key] = tuple(replacements)
538 return ModRMRegisterReplacements.replacements[output_key]
539 ModRMRegisterReplacements.replacements = {}
540
541
542 def BaseOnlyMemoryOperand(modrm, base):
543 """Creates replacement tuples list for register-to-memory instructions
544 (base only, no SIB)
545
546 Args:
547 modrm: modrm byte
548 base: register kind for base
549 Returns:
550 bytes_text: replacement for "bytes" group
551 rm_text: textual representation of "rm" argument
552 base_text: textual representation of "base" register
553 """
554 mod_field = (modrm >> 6) & 0x03
555 rm_field = (modrm & 0x07)
556 base_text = '%' + REGISTERS[base][rm_field]
557 # If RM field == %rbp and MOD field is zero then it's absolute address
558 # in 32-bit mode and %rip-based address in 64-bit mode
559 if mod_field == 0 and rm_field == validator.REG_RBP:
560 bytes_text = '{:02x} 00 00 00 00'.format(modrm)
561 rm_text = '0x0' if options.bitness == 32 else '0x0(%rip)'
562 base_text = '%eiz' if options.bitness == 32 else '%rip'
563 # Memory access with just a base register
564 elif mod_field == 0:
565 bytes_text = '{:02x}'.format(modrm)
566 rm_text = '({})'.format(base_text)
567 # Memory access with base and 8bit offset
568 elif mod_field == 1:
569 bytes_text = '{:02x} 00'.format(modrm)
570 rm_text = '0x0({})'.format(base_text)
571 # Memory access with base and 32bit offset
572 else: # mod_field == 2
573 bytes_text = '{:02x} 00 00 00 00'.format(modrm)
574 rm_text = '0x0({})'.format(base_text)
575 return bytes_text, rm_text, base_text
576
577
578 def SIBMemoryOperand(modrm, sib, base, index):
579 """Creates replacement tuples list for register-to-memory instructions
580 (base only, no SIB)
581
582 Args:
583 modrm: modrm byte
584 base: register kind for base
585 Returns:
586 bytes_text: replacement for "bytes" group
587 rm_text: textual representation of "rm" argument
588 base_text: textual representation of "base" register
589 index_text: textual representation of "index" register
590 """
591 mod_field = (modrm >> 6) & 0x03
592 scale_field = (sib >> 6) & 0x03
593 index_field = (sib >> 3) & 0x07
594 base_field = (sib & 0x07)
595 index_text = '%' + INDEXES[index][index_field]
596 base_text = '%' + REGISTERS[base][base_field]
597 scale_text = str(1 << scale_field)
598 # If BASE is %rbp and MOD == 0 then index with 32bit offset is used
599 if mod_field == 0 and base_field == validator.REG_RBP:
600 bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
601 if (options.bitness == 64 and
602 index_text == '%riz' and
603 scale_text == '1'):
604 rm_text = '0x0'
605 else:
606 rm_text = '0x0(,{},{})'.format(index_text, scale_text)
607 # There are no base in this case
608 base_text = '%eiz' if options.bitness == 32 else '%riz'
609 # Memory access with base and index (no offset)
610 elif mod_field == 0:
611 bytes_text = '{:02x} {:02x}'.format(modrm, sib)
612 rm_text = '({},{},{})'.format(base_text, index_text, scale_text)
613 # Memory access with base, index and 8bit offset
614 elif mod_field == 1:
615 bytes_text = '{:02x} {:02x} 00'.format(modrm, sib)
616 rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
617 # Memory access with base, index and 32bit offset
618 elif mod_field == 2:
619 bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
620 rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
621 # Pretty-printing of access via %rsp (or %r12)
622 if (base_field == validator.REG_RSP and
623 index_text in ('%eiz', '%riz') and
624 scale_text == '1'):
625 if mod_field == 0: # no offset
626 rm_text = '({})'.format(base_text)
627 else: # 8-bit or 32-bit offset
628 rm_text = '0x0({})'.format(base_text)
629 return bytes_text, rm_text, base_text, index_text
630
631
632 def ModRMMemoryReplacements(reg=None, writes_to='rm', opcode_bits=0,
633 memory_accessed=True, register_write='ignore',
634 base_r8=False, index_r8=False):
635 """Creates replacement tuples list for register-to-memory instructions
636
637 Args:
638 reg: reg operand kind (see REGISTERS array) or None if reg is not used
639 writes_to: three-state selector
640 'reg' - instruction uses rm as source, reg as destination
641 'rm' - instruction uses reg as source, rm as destination
642 'both' - instruction writes to both reg and rm
643 opcode_bits: opcode extensions code (used when reg is None)
644 memory_accessed: True if instruction accesses memory
645 register_write: three-state selector
646 'sandbox' - instruction can be used to produce "restricted register"
647 'protect' - instruction can damage output, protect "special registers"
648 'ignore' - instruction does not affect it's operands (e.g. test) or
649 is used with non-GP registers (X87, MMX, XMM, etc)
650 index_r8: True if REX.X bit in the instruction set to 1
651
652 Returns:
653 List of replacement tuples
654 """
655 # Reg field can be used either as reg or as opcode extension, but not both
656 assert reg is None or opcode_bits == 0
657
658 output_key = (options.bitness, reg, writes_to, opcode_bits,
659 base_r8, index_r8, memory_accessed, register_write)
660 if output_key in ModRMMemoryReplacements.replacements:
661 return ModRMMemoryReplacements.replacements[output_key]
662
663 if options.bitness == 32:
664 base = 'eax'
665 index = 'eax'
666 else:
667 base = 'r8' if base_r8 else 'rax'
668 index = 'r8' if index_r8 else 'rax'
669
670 replacements = []
671
672 # Two upper bits of ModR/M byte (mod field) must be equal to 00, 01, or 10
673 # This gives us range from 0x00 to 0xbf but we are going from the end to make
674 # rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
675 if reg is None:
676 # reg field is used as opcode extension
677 byte_range = [byte
678 for byte in range(0xbf, -1, -1)
679 if (byte >> 3) & 0x7 == opcode_bits]
680 else:
681 byte_range = range(0xbf, -1, -1)
682
683 for modrm in byte_range:
684 # If RM field != %rsp then there are no SIB byte
685 if (modrm & 0x07) != validator.REG_RSP:
686 bytes_text, rm_text, base_text = BaseOnlyMemoryOperand(modrm, base)
687 replacement = [bytes_text]
688 input, output = AppendOperandsReplacement(
689 replacement, rm_text, reg, modrm, writes_to)
690 if options.bitness == 64:
691 replacement.append('any_nonspecial')
692 # If writes_to is equal to 'reg' then output is register
693 if writes_to == 'reg' and register_write == 'sandbox':
694 replacement.append(output)
695 else:
696 # Note that instruction like xchg could write to another register:
697 # it's "input"! But validator currently does not support this case
698 # thus we are ignoring it here and writing "None" in this case, too.
699 replacement.append(None)
700 if InstructionIsDangerous(input, output, register_write, writes_to,
701 memory_accessed, base_text):
702 continue
703 replacements.append(tuple(replacement))
704 else:
705 # If RM field == %rsp then we have SIB byte
706 for sib in xrange(0x100):
707 bytes_text, rm_text, base_text, index_text = SIBMemoryOperand(
708 modrm, sib, base, index)
709 replacement = [bytes_text]
710 input, output = AppendOperandsReplacement(
711 replacement, rm_text, reg, modrm, writes_to)
712 if options.bitness == 64:
713 if not memory_accessed or index_text == '%riz':
714 replacement.append('any_nonspecial')
715 else:
716 if index_r8:
717 # Convert %r8 to %r8d, %r9 to %r9d, etc
718 replacement.append(index_text + 'd')
719 else:
720 # Convert %rax to %eax, %rsp to %esp, etc
721 replacement.append('%e' + index_text[2:])
722 # If writes_to is equal to 'reg' then output is register
723 if writes_to == 'reg' and register_write == 'sandbox':
724 replacement.append(output)
725 else:
726 # Note that instruction like xchg could write to another register:
727 # it's "input"! But validator currently does not support this case
728 # thus we are ignoring it here and writing "None" in this case, too.
729 replacement.append(None)
730 if InstructionIsDangerous(input, output, register_write, writes_to,
731 memory_accessed, base_text, index_text):
732 continue
733 replacements.append(tuple(replacement))
734 ModRMMemoryReplacements.replacements[output_key] = tuple(replacements)
735 return ModRMMemoryReplacements.replacements[output_key]
736 ModRMMemoryReplacements.replacements = {}
737
738
739 # Map from "REX bit off" group of registers to "REX bit on" group of registers
740 r8 = {
741 'al': 'r8b',
742 'ax': 'r8w',
743 'eax': 'r8d',
744 'rax': 'r8',
745 'mm0': 'mmalt',
746 'xmm0': 'xmm8',
747 'ymm0': 'ymm8'
748 }
749
750
751 def RegisterKinds():
752 """ Return list of register kinds we process with register compressors """
753
754 if options.bitness == 32:
755 return ('al', 'ax', 'eax', 'mm0', 'xmm0', 'ymm0')
756 else:
757 return ('al', 'spl', 'ax', 'eax', 'rax', 'mm0', 'xmm0', 'ymm0',
758 'r8b', 'r8w', 'r8d', 'r8', 'mmalt', 'xmm8', 'ymm8')
759
760
761 def RegisterKindPairs():
762 """ Return hand-picked pairs which we must consider in compressors """
763
764 if options.bitness == 32:
765 return (
766 ( 'al', 'al'),
767 ( 'ax', 'al'),
768 ( 'al', 'ax'),
769 ( 'ax', 'ax'),
770 ( 'eax', 'al'),
771 ( 'al', 'eax'),
772 ( 'eax', 'ax'),
773 ( 'ax', 'eax'),
774 ( 'eax', 'eax'),
775 ( 'eax', 'mm0'),
776 ( 'mm0', 'eax'),
777 ( 'eax', 'xmm0'),
778 ('xmm0', 'eax'),
779 ( 'mm0', 'mm0'),
780 ( 'mm0', 'xmm0'),
781 ('xmm0', 'mm0'),
782 ('xmm0', 'xmm0'),
783 ('xmm0', 'ymm0'),
784 ('ymm0', 'xmm0'),
785 ('ymm0', 'ymm0')
786 )
787 else:
788 return (
789 ( 'al', 'al'),
790 ( 'spl', 'spl'), ( 'spl', 'r8b'), ( 'r8b', 'spl'), ( 'r8b', 'r8b'),
791 ( 'ax', 'al'),
792 ( 'ax', 'spl'), ( 'ax', 'r8b'), ( 'r8w', 'spl'), ( 'r8w', 'r8b'),
793 ( 'al', 'ax'),
794 ( 'spl', 'ax'), ( 'spl', 'r8w'), ( 'r8b', 'ax'), ( 'r8b', 'r8w'),
795 ( 'ax', 'ax'), ( 'ax', 'r8w'), ( 'r8w', 'ax'), ( 'r8w', 'r8w'),
796 ( 'eax', 'al'),
797 ( 'eax', 'spl'), ( 'eax', 'r8b'), ( 'r8d', 'spl'), ( 'r8d', 'r8b'),
798 ( 'al', 'eax'),
799 ( 'spl', 'eax'), ( 'spl', 'r8d'), ( 'r8b', 'eax'), ( 'r8b', 'r8d'),
800 ( 'eax', 'ax'), ( 'eax', 'r8w'), ( 'r8d', 'ax'), ( 'r8d', 'r8w'),
801 ( 'ax', 'eax'), ( 'ax', 'r8d'), ( 'r8w', 'eax'), ( 'r8w', 'r8d'),
802 ( 'eax', 'eax'), ( 'eax', 'r8d'), ( 'r8d', 'eax'), ( 'r8d', 'r8d'),
803 ( 'rax', 'al'),
804 ( 'rax', 'spl'), ( 'rax', 'r8b'), ( 'r8', 'spl'), ( 'r8', 'r8b'),
805 ( 'al', 'rax'),
806 ( 'spl', 'rax'), ( 'spl', 'r8'), ( 'r8b', 'rax'), ( 'r8b', 'r8'),
807 ( 'rax', 'ax'), ( 'rax', 'r8w'), ( 'r8', 'ax'), ( 'r8', 'r8w'),
808 ( 'ax', 'rax'), ( 'ax', 'r8'), ( 'r8w', 'rax'), ( 'r8w', 'r8'),
809 ( 'rax', 'eax'), ( 'rax', 'r8d'), ( 'r8', 'eax'), ( 'r8', 'r8d'),
810 ( 'eax', 'rax'), ( 'eax', 'r8'), ( 'r8d', 'rax'), ( 'r8d', 'r8'),
811 ( 'rax', 'rax'), ( 'rax', 'r8'), ( 'r8', 'rax'), ( 'r8', 'r8'),
812 ( 'eax', 'mm0'), ( 'eax','mmalt'), ( 'r8d', 'mm0'), ( 'eax', 'mmalt'),
813 ( 'rax', 'mm0'), ( 'rax','mmalt'), ( 'r8', 'mm0'), ( 'r8', 'mmalt'),
814 ( 'mm0', 'eax'), ('mmalt', 'eax'), ( 'mm0', 'r8d'), ('mmalt', 'r8d'),
815 ( 'mm0', 'rax'), ('mmalt', 'rax'), ( 'mm0', 'r8'), ('mmalt', 'r8'),
816 ( 'eax', 'xmm0'), ( 'eax', 'xmm8'), ( 'r8d', 'xmm0'), ( 'r8d', 'xmm8'),
817 ( 'rax', 'xmm0'), ( 'rax', 'xmm8'), ( 'r8', 'xmm0'), ( 'r8', 'xmm8'),
818 ('xmm0', 'eax'), ('xmm0', 'r8d'), ('xmm8', 'eax'), ('xmm8', 'r8d'),
819 ('xmm0', 'rax'), ('xmm0', 'r8'), ('xmm8', 'rax'), ('xmm8', 'r8'),
820 ( 'mm0', 'mm0'), ('mmalt', 'mm0'), ( 'mm0','mmalt'), ('mmalt','mmalt'),
821 ( 'mm0', 'xmm0'), ('mmalt','xmm0'), ( 'mm0', 'xmm8'), ('mmalt', 'xmm8'),
822 ('xmm0', 'mm0'), ('xmm8', 'mm0'), ('xmm0','mmalt'), ('xmm8', 'mmalt'),
823 ('xmm0', 'xmm0'), ('xmm0', 'xmm8'), ('xmm8', 'xmm0'), ('xmm8', 'xmm8'),
824 ('xmm0', 'ymm0'), ('xmm0', 'ymm8'), ('xmm8', 'ymm0'), ('xmm8', 'ymm8'),
825 ('ymm0', 'xmm0'), ('ymm0', 'xmm8'), ('ymm8', 'xmm0'), ('ymm8', 'xmm8'),
826 ('ymm0', 'ymm0'), ('ymm0', 'ymm8'), ('ymm8', 'ymm0'), ('ymm8', 'ymm8')
827 )
828
829
830 def MemRegex(opcode_bits = 0):
831 """ Returns three regexes: two for bytes and one textual representation. """
832
833 modrm_regex = '^.*?({:02x})'.format(0xc0 + opcode_bits * 8)
834 # We only need to process ModR/M+SIB '04 04' or '04 07' here.
835 modrm_sib_regex = '^.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8)
836 if options.bitness == 32:
halyavin 2013/12/07 17:27:15 # We only match memory operands without offset, so
khim 2013/12/08 23:22:31 Done.
837 regex_mem = '\\(%esp,%eax,1\\)'
halyavin 2013/12/07 17:27:15 regex_mem->mem_regex for consistency.
khim 2013/12/08 23:22:31 Actually it makes more sense to put _mem as suffix
838 else:
839 regex_mem = '\\((?:%rsp|%r15),(?:%rax|%r8),1\\)'
840 return modrm_regex, modrm_sib_regex, regex_mem
841
842
843 def RegToRmCompressors(rm, reg, writes_to, memory_accessed=True,
halyavin 2013/12/07 19:51:15 Big refactoring (and so it is better to make it in
khim 2013/12/08 20:13:04 Done.
844 register_write='ignore', is_3Dnow=False, notes=''):
845 """Returns a list of reg <-> rm compressors for a given set of parameters.
846
847 Args:
848 rm: rm operand kind (see REGISTERS array)
849 reg: reg operand kind (see REGISTERS array) or None if reg is not used
850 writes_to: three-state selector
851 'reg' - instruction uses rm as source, reg as destination
852 'rm' - instruction uses reg as source, rm as destination
853 'both' - instruction writes to both reg and rm
854 memory_accessed: True if instruction accesses memory
855 register_write: three-state selector
856 'sandbox' - instruction can be used to produce "restricted register"
857 'protect' - instruction can damage output, protect "special registers"
858 'ignore' - instruction does not affect it's operands (e.g. test) or
859 is used with non-GP registers (X87, MMX, XMM, etc)
860 is_3DNow - True if instruction is 3DNow! style (opcode in immidiate)
861 notes - Notes to add to the description
862
863 Returns:
864 List of compressors
865 """
866
867 compressors = []
868
869 start_reg = REGISTERS[reg][0]
870 end_reg = REGISTERS[reg][-1]
871 # Exclude r15 from the interval, if instruction can write to reg.
872 if (end_reg.startswith('r15') and register_write != 'ignore' and
873 writes_to in ('reg', 'both')):
874 end_reg = REGISTERS[reg][-2]
875 start_rm = REGISTERS[rm][0]
876 end_rm = REGISTERS[rm][-1]
877 # Exclude r15 from the interval, if instruction can write to rm.
878 if (end_rm.startswith('r15') and register_write != 'ignore' and
879 writes_to in ('rm', 'both')):
880 end_rm = REGISTERS[rm][-2]
881
882 # Regex here matches everything AFTER the ModR/M (+ SIB) bytes.
883 if is_3Dnow:
884 # Immediate byte is opcode extension with 3DNow! instructions.
885 regex = ' [0-9a-fA-F][0-9a-fA-F]'
886 else:
887 # Accept immediate (which is always zeros in our case).
888 regex = '(?: 00)*'
889 # 2 spaces separate byte code from instruction name.
890 regex += ' '
891 # Optional "lock" prefix.
892 regex += '(?:lock )?'
893 # Instruction name.
894 regex += '\\w* '
895 # Immediate or implicit operand that can precede reg/rm pair.
896 regex += '(?:\\$0x0,|\\$0x0,\\$0x0,|%cl,|%xmm0,)?'
897 modrm_regex, modrm_sib_regex, regex_mem = MemRegex()
898 output = None
halyavin 2013/12/07 17:27:15 rename to output_rr_regex
khim 2013/12/08 23:22:31 regex_output_rr
899 output_note = None
halyavin 2013/12/07 17:27:15 rename to output_rr_subst
khim 2013/12/08 23:22:31 subst_output_rr
900 if writes_to == 'reg':
901 regex += '(%' + REGISTERS[rm][0] + '|' + regex_mem + ')'
902 regex += ',(%' + REGISTERS[reg][0] + ')'
903 if register_write == 'sandbox':
904 assert reg in ('eax', 'r8d')
905 output = '%' + reg + '|None'
halyavin 2013/12/07 17:27:15 We need to either remove None from here or explain
khim 2013/12/08 23:22:31 Removed.
906 output_note = '[%eax..%edi]' if reg == 'eax' else '[%r8d..%r14d]'
907 subst = (
908 'XX', '[%{}..%{} or memory]'.format(start_rm, end_rm),
909 '[%{}..%{}]'.format(start_reg, end_reg))
910 subst_register = (
911 'XX', '[%{}..%{}]'.format(start_rm, end_rm),
912 '[%{}..%{}]'.format(start_reg, end_reg))
913 subst_memory = (
914 'XX', '[memory]',
915 '[%{}..%{}]'.format(start_reg, end_reg))
916 else:
917 regex += '(%' + REGISTERS[reg][0] + ')'
918 regex += ',(%' + REGISTERS[rm][0] + '|' + regex_mem + ')'
919 if register_write == 'sandbox':
920 assert rm in ('eax', 'r8d')
921 output = '%' + rm + '|None'
halyavin 2013/12/07 17:27:15 We need to add comment of why we are doing such sl
khim 2013/12/08 23:22:31 Done.
922 output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'
923 subst = (
924 'XX', '[%{}..%{}]'.format(start_reg, end_reg),
925 '[%{}..%{} or memory]'.format(start_rm, end_rm))
926 subst_register = (
927 'XX', '[%{}..%{}]'.format(start_reg, end_reg),
928 '[%{}..%{}]'.format(start_rm, end_rm))
929 subst_memory = (
930 'XX', '[%{}..%{}]'.format(start_reg, end_reg),
931 '[memory]')
932 # Skip implicit arguments (if any)
933 regex += '.*'
934 if options.bitness == 64:
halyavin 2013/12/07 17:49:32 # We generate 2 variants of compressors with memor
khim 2013/12/08 23:22:31 Done.
935 regex += '; input_rr=(%eax|%r8d|any_nonspecial)'
936 regex += '; output_rr=({})'.format(output)
937 if memory_accessed:
halyavin 2013/12/07 17:27:15 Can we move regex += '; input_rr=(%eax|%r8d)' here
khim 2013/12/08 23:22:31 Done.
938 input_note = '[%eax..%edi]'
halyavin 2013/12/07 17:27:15 input_note->input_rr_subst
khim 2013/12/08 23:22:31 subst_input_rr
939 input_note_r8 = '[%r8d..%r15d]'
halyavin 2013/12/07 17:27:15 input_note_r8->input_rr_subst_r8
khim 2013/12/08 23:22:31 Removed.
940 else:
941 input_note = 'any_nonspecial'
942 input_note_r8 = 'any_nonspecial'
halyavin 2013/12/07 17:49:32 r8->index8
khim 2013/12/08 23:22:31 Removed
943 subst_r8 = subst + (input_note_r8, output_note)
944 subst = subst + (input_note, output_note)
945 subst_memory_r8 = subst_memory + (input_note_r8, output_note)
946 subst_memory = subst_memory + (input_note, output_note)
947 subst_register = subst_register + ('any_nonspecial', output_note)
948 # With "or memory" or "memory" compressors we always can see where is
949 # reg and where is rm, but with reg to rm or rm to reg it's impossible.
950 # Add the appropriate comment
951 if writes_to == 'reg':
952 notes_register = ' # rm to reg'
halyavin 2013/12/07 17:27:15 notes_register->notes_register_subst
khim 2013/12/08 23:22:31 N/A: Changed (and hopefully simplified) logic diff
953 else:
954 notes_register = ' # reg to rm'
955 if notes != '':
956 notes_register += '; ' + notes
957 notes = ' # ' + notes
halyavin 2013/12/07 17:27:15 notes_subst = '# ' + notes
khim 2013/12/08 23:22:31 N/A: Changed (and hopefully simplified) logic diff
958 subst += (notes, )
959 subst_memory += (notes, )
960 subst_register += (notes_register, )
961 if options.bitness == 64:
962 subst_r8 += (notes, )
963 subst_memory_r8 += (notes, )
964 regex += '()$'
965 base_r8 = rm in r8.values()
966 memory_replacement = ModRMMemoryReplacements(
967 reg=reg, base_r8=base_r8, writes_to=writes_to,
968 memory_accessed=memory_accessed, register_write=register_write)
969 compressors.append(Compressor(
970 modrm_sib_regex + regex, subst_memory, memory_replacement))
971 if options.bitness == 64:
972 memory_replacement_r8 = ModRMMemoryReplacements(
973 reg=reg, base_r8=base_r8, index_r8=True, writes_to=writes_to,
974 memory_accessed=memory_accessed, register_write=register_write)
975 compressors.append(Compressor(
976 modrm_sib_regex + regex, subst_memory_r8, memory_replacement_r8))
977 # Instructions with no memory access are instructions which are doing
978 # something with memory address (e.g. lea) and as such they don't have
979 # non-memory forms.
980 if memory_accessed:
981 register_replacement = ModRMRegisterReplacements(
982 reg=reg, rm=rm, writes_to=writes_to, register_write=register_write)
983 compressors.append(Compressor(
984 modrm_regex + regex, subst_register, register_replacement))
985 main_replacement = register_replacement + memory_replacement
986 compressors.append(Compressor(
987 modrm_sib_regex + regex, subst, main_replacement))
988 if options.bitness == 64:
989 main_replacement_r8 = register_replacement + memory_replacement_r8
990 compressors.append(Compressor(
991 modrm_sib_regex + regex, subst_r8, main_replacement_r8))
992 return compressors
993
994
995 def AllRegToRmCompressors():
996 """ Return list of all Reg to RM (and RM to Reg) compressors. """
997
998 compressors = []
999
1000 for reg, rm in RegisterKindPairs():
1001 instruction_kinds = [
1002 # Normal instructions with two operands (rm to reg)
1003 {'writes_to': 'reg'},
1004 # Normal instructions with two operands (reg to rm)
1005 {'writes_to': 'rm'}
1006 ]
1007 # In 64 bit mode we have many different types of instructions depending
1008 # on whether we are accessing memory or whether we are writing to registers.
1009 if options.bitness == 64:
1010 # Lea in 64 bit mode is truly unique instruction for now
1011 if reg in ('ax', 'r8w', 'eax', 'r8d', 'rax', 'r8'):
1012 register_write = 'sandbox' if reg in ('eax', 'r8d') else 'protect'
1013 instruction_kinds.append(
1014 {'writes_to': 'reg',
1015 'memory_accessed': False,
1016 'register_write': register_write,
1017 'notes': 'lea'})
1018 # There are few more forms in 64 bit case (rm to reg)
1019 if reg in ('eax', 'r8d'):
1020 # Zero-extending version.
1021 instruction_kinds.append(
1022 {'writes_to': 'reg',
1023 'register_write': 'sandbox'})
1024 # More forms in 64 bit case (reg to rm)
1025 if rm in ('eax', 'r8d'):
1026 # Zero-extending version.
1027 instruction_kinds.append(
1028 {'writes_to': 'rm',
1029 'register_write': 'sandbox'})
1030 # Zero-extending xchg/xadd
1031 instruction_kinds.append(
1032 {'writes_to': 'both',
1033 'register_write': 'sandbox',
1034 'notes': 'write to both'})
1035 # Still more forms for 64 bit case (rm to reg).
1036 if reg in ('al', 'spl', 'ax', 'eax', 'rax', 'r8b', 'r8w', 'r8d', 'r8'):
1037 # Dangerous instructions (rm to reg)
1038 instruction_kinds.append(
1039 {'writes_to': 'reg',
1040 'register_write': 'protect'})
1041 # Still more forms for 64 bit case (reg to rm)
1042 if rm in ('al', 'spl', 'ax', 'eax', 'rax', 'r8b', 'r8w', 'r8d', 'r8'):
1043 # Dangerous instructions (reg to rm)
1044 instruction_kinds.append(
1045 {'writes_to': 'rm',
1046 'register_write': 'protect'})
1047 # Dangerous xchg/xadd
1048 instruction_kinds.append(
1049 {'writes_to': 'both',
1050 'register_write': 'protect',
1051 'notes': 'write to both'})
1052 # 3DNow! instructions
1053 if reg in ('mm0', 'mmalt') or rm in ('mm0', 'mmalt'):
1054 instruction_kinds.append(
1055 {'writes_to': 'reg',
1056 'is_3Dnow': True})
1057 for instruction_kind in instruction_kinds:
1058 compressors.extend(RegToRmCompressors(reg=reg, rm=rm, **instruction_kind))
1059 return compressors
1060
1061
1062 def RmCompressors(rm, opcode_bits,
1063 memory_accessed=True, register_write='ignore'):
1064 """Returns a list of rm compressors for a given set of parameters.
1065
1066 Args:
1067 rm: rm operand kind (see REGISTERS array)
1068 memory_accessed: True if instruction accesses memory
1069 register_write: three-state selector
1070 'sandbox' - instruction can be used to produce "restricted register"
1071 'protect' - instruction can damage output, protect "special registers"
1072 'ignore' - instruction does not affect it's operands (e.g. test) or
1073 is used with non-GP registers (X87, MMX, XMM, etc)
1074
1075 Returns:
1076 List of compressors
1077 """
1078 compressors = []
1079
1080 start_rm = REGISTERS[rm][0]
1081 end_rm = REGISTERS[rm][-1]
1082 # Exclude r15 from the interval, if instruction can write to rm.
1083 if end_rm.startswith('r15') and register_write != 'ignore':
1084 end_rm = REGISTERS[rm][-2]
1085 byte_mark = 'XX/' + str(opcode_bits)
1086
1087 subst = (byte_mark, '[%{}..%{} or memory]'.format(start_rm, end_rm))
1088 subst_register = (byte_mark, '[%{}..%{}]'.format(start_rm, end_rm))
1089 subst_memory = (byte_mark, '[memory]')
1090
1091 # Regex here matches everything AFTER the ModR/M (+ SIB) bytes.
1092 regex = '(?: 00)*'
1093 # 2 spaces separate byte code from instruction name.
1094 regex += ' '
1095 # Optional "lock" prefix.
1096 regex += '(?:lock )?'
1097 # Instruction name.
1098 regex += '\\w* '
1099 # Immediate or implicit operand that can precede rm argument.
1100 regex += '(?:\\$0x0,|%cl,)?'
1101 modrm_regex, modrm_sib_regex, regex_mem = MemRegex(opcode_bits)
1102 # First register or memory access
1103 regex += '(%' + REGISTERS[rm][0] + '|' + regex_mem + ').*'
1104 output = None
1105 output_note = None
1106 if options.bitness == 32:
1107 subst += ('', )
1108 subst_memory += ('', )
1109 subst_register += ('', )
1110 else:
1111 if register_write == 'sandbox':
1112 assert rm in ('eax', 'r8d')
1113 output = '%' + rm + '|None'
1114 output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'
1115 regex += '; input_rr=(%eax|%r8d|any_nonspecial)'
1116 regex += '; output_rr=({})'.format(output)
1117 if memory_accessed:
1118 input_note = '[%eax..%edi]'
1119 input_note_r8 = '[%r8d..%r15d]'
1120 else:
1121 input_note = 'any_nonspecial'
1122 input_note_r8 = 'any_nonspecial'
1123 subst_r8 = subst + (input_note_r8, output_note, '')
1124 subst = subst + (input_note, output_note, '')
1125 subst_memory_r8 = subst_memory + (input_note_r8, output_note, '')
1126 subst_memory = subst_memory + (input_note, output_note, '')
1127 subst_register = subst_register + ('any_nonspecial', output_note, '')
1128 regex += '()'
1129 base_r8 = rm in r8.values()
1130 memory_replacement = ModRMMemoryReplacements(
1131 reg=None, base_r8=base_r8, opcode_bits=opcode_bits,
1132 memory_accessed=memory_accessed, register_write=register_write)
1133 compressors.append(Compressor(
1134 modrm_sib_regex + regex, subst_memory, memory_replacement))
1135 if options.bitness == 64:
1136 memory_replacement_r8 = ModRMMemoryReplacements(
1137 reg=None, base_r8=base_r8, index_r8=True, opcode_bits=opcode_bits,
1138 memory_accessed=memory_accessed, register_write=register_write)
1139 compressors.append(Compressor(
1140 modrm_sib_regex + regex, subst_memory_r8, memory_replacement_r8))
1141 # Instructions with no memory access are instructions which are doing
1142 # something with memory address (e.g. prefetch) and as such they don't
1143 # have non-memory forms.
1144 if memory_accessed:
1145 register_replacement = ModRMRegisterReplacements(
1146 reg=None, rm=rm, opcode_bits=opcode_bits, register_write=register_write)
1147 compressors.append(Compressor(
1148 modrm_regex + regex, subst_register, register_replacement))
1149 main_replacement = register_replacement + memory_replacement
1150 compressors.append(Compressor(
1151 modrm_sib_regex + regex, subst, main_replacement))
1152 if options.bitness == 64:
1153 main_replacement_r8 = register_replacement + memory_replacement_r8
1154 compressors.append(Compressor(
1155 modrm_sib_regex + regex, subst_r8, main_replacement_r8))
1156 return compressors
1157
1158
1159 def AllRmCompressors():
1160 """ Return list of all RM (with reg as opcode extension) compressors. """
1161 compressors = []
1162
1163 for rm in RegisterKinds():
1164 for opcode_bits in xrange(8):
1165 instruction_kinds = [
1166 # The most basic form
1167 {}
1168 ]
1169 # In 64 bit mode we have many different types of instructions depending
1170 # on whether we are accessing memory or whether we are writing to
1171 # registers.
1172 if options.bitness == 64:
1173 # No memory access (e.g. prefetch)
1174 instruction_kinds.append(
1175 {'memory_accessed':False})
1176 # More forms in 64 bit case.
1177 if rm in ('eax', 'r8d'):
1178 # Zero-extending version.
1179 instruction_kinds.append(
1180 {'register_write':'sandbox'})
1181 # Still more forms for 64 bit case (reg to rm).
1182 if rm in ('al', 'spl', 'ax', 'eax', 'rax',
1183 'r8b', 'r8w', 'r8d', 'r8'):
1184 # Dangerous instructions.
1185 instruction_kinds.append(
1186 {'register_write':'protect'})
1187 for instruction_kind in instruction_kinds:
1188 compressors.extend(RmCompressors(
1189 rm=rm, opcode_bits=opcode_bits, **instruction_kind))
1190 return compressors
1191
1192
1193 def AllOpcodeCompressors():
1194 """ Return "register in opcode" compressors. """
1195 compressors = []
1196
1197 for reg in RegisterKinds() + ('st(0)',):
1198 for opcode in xrange(8):
1199 for text1, text2, nibble in (
1200 ('[0..7]', '[8..f]', xrange(8)),
1201 ('[012367]', '[89abef]', (0, 1, 2, 3, 6, 7)),
1202 ('[0..6]', '[8..e]', xrange(7))
1203 ):
1204 start_reg = REGISTERS[reg][0]
1205 end_reg = REGISTERS[reg][-1 if 7 in nibble else -2]
1206 subst_regs = '[%{}..%{}]'.format(start_reg, end_reg)
1207 # Note that we use instruction with 1st register, not 0th register
1208 # here to avoid ambiguity when opcode is 0x00
1209 compressors.append(Compressor(
1210 '.*?[0-9a-fA-F](1)(?: 00)*'
1211 ' \\w* (?:\\$0x0,|%ax,|%st,)?'
1212 '(%' + REGISTERS_RE[reg][1] + ').*()',
1213 (text1, subst_regs, ''),
1214 tuple(('{:x}'.format(n), '%' + REGISTERS[reg][n])
1215 for n in nibble)))
1216 compressors.append(Compressor(
1217 '.*?[0-9a-fA-F](8)(?: 00)*'
1218 ' \\w* (?:\\$0x0,|%ax,|%st,)?'
1219 '(%' + REGISTERS_RE[reg][0] + ').*()',
1220 (text2, subst_regs, ''),
1221 tuple(('{:x}'.format(n + 8), '%' + REGISTERS[reg][n])
1222 for n in nibble)))
1223 # Another version for 64 bit case
1224 if options.bitness == 64 and reg in ('eax', 'r8d'):
1225 compressors.append(Compressor(
1226 '.*?[0-9a-fA-F](1)(?: 00)*'
1227 ' \\w* (?:\\$0x0,|%ax,|%st,)?'
1228 '(%' + REGISTERS_RE[reg][1] + ').*'
1229 'output_rr=(%'+ REGISTERS_RE[reg][1] + ').*()',
1230 (text1, subst_regs, subst_regs, ''),
1231 tuple(['{:x}'.format(n)] + ['%' + REGISTERS[reg][n]] * 2
1232 for n in nibble)))
1233 compressors.append(Compressor(
1234 '.*?[0-9a-fA-F](8)(?: 00)*'
1235 ' \\w* (?:\\$0x0,|%ax,|%st,)?'
1236 '(%' + REGISTERS_RE[reg][0] + ').*'
1237 'output_rr=(%'+ REGISTERS_RE[reg][0] + ').*()',
1238 (text2, subst_regs, subst_regs, ''),
1239 tuple(['{:x}'.format(n + 8)] + ['%' + REGISTERS[reg][n]] * 2
1240 for n in nibble)))
1241 return compressors
1242
1243
1244 def AllMemoryNonMemoryCompressors():
1245 """ Return memory/nonmemory compressors. """
1246
1247 compressors = []
1248
1249 if options.bitness == 32:
1250 letters_and_registers = (('b', 'al', ''), ('w', 'ax', ''), ('l', 'eax', ''))
1251 else:
1252 letters_and_registers = (
1253 ('b', 'al', 'eax'), ('b', 'spl', 'eax'), ('b', 'r8b', 'r8d'),
1254 ('w', 'ax', 'eax'), ('w', 'r8w', 'r8d'),
1255 ('l', 'eax', 'eax'), ('l', 'r8d', 'r8d'),
1256 ('q', 'rax', 'eax'), ('q', 'r8', 'r8d')
1257 )
1258 for letter, reg, out_reg in letters_and_registers:
1259 start_reg = REGISTERS[reg][0]
1260 if reg[0:2] == 'r8':
1261 end_regs = (REGISTERS[reg][-2], REGISTERS[reg][-1])
1262 else:
1263 end_regs = (REGISTERS[reg][-1], )
1264 for end_reg in end_regs:
1265 all_regs = '[%{}..%{}]'.format(start_reg, end_reg)
1266 regs_mark = '[%{}..%{} or memory]'.format(start_reg, end_reg)
1267 if options.bitness == 64:
1268 start_out = REGISTERS[out_reg][0]
1269 end_out = REGISTERS[out_reg][-1 if out_reg[0:2] != 'r8' else -2]
1270 out_regs = '[%{}..%{}]'.format(start_out, end_out)
1271 for notes in ('', ' # rm to reg', ' # reg to rm'):
1272 compressors.append(Compressor(
1273 '.* \\w*(' + letter + ') .*(\\[memory]).*()()',
1274 ('[{}]?'.format(letter), regs_mark, '', ''),
1275 ((letter, '[memory]', ''), ('', all_regs, notes))))
1276 if options.bitness == 64:
1277 for index_reg in ('eax', 'r8d'):
1278 start_index = REGISTERS[index_reg][0]
1279 end_index = REGISTERS[index_reg][-1]
1280 index_regs = '[%{}..%{}]'.format(start_index, end_index)
1281 for output_rrs in ((None, out_regs),
1282 (out_regs, None),
1283 (None, None)):
1284 compressors.append(Compressor(
1285 '.* \\w*(' + letter + ') .*(\\[memory]).*; '
1286 'input_rr=(\\[%[a-z0-9]*..%[a-z0-9]*\\]); '
1287 'output_rr=(\\[%[a-z0-9]*..%[a-z0-9]*\\]|None)()()',
1288 ('[{}]?'.format(letter), regs_mark, index_regs,
1289 output_rrs[1] if output_rrs[0] is None else output_rrs[0],
1290 '', ''),
1291 ((letter, '[memory]', index_regs, output_rrs[0], ''),
1292 ('', all_regs, 'any_nonspecial', output_rrs[1], notes))))
1293 return compressors
1294
1295
1296 def RexCompressor(rm, rmR15, reg, regR15, rexw, input_rr, output_rr, rm_to_reg):
1297 """ Return REX compressor (or nothing) for a given set of paramenters.
1298
1299 Args:
1300 rm: rm operand kind (see REGISTERS array) or None if rm is not used
1301 reg: reg operand kind (see REGISTERS array) or None if reg is not used
1302 rmR15: True if R15 register is included
1303 regR15: True if R15 register is included
1304 rexw: True if REX.W should be set
1305 input_rr: True if input_rr is used
1306 output_rr: true if output_rr is used
1307 rm_to_reg: True if direction is rm to reg
1308 """
1309
1310 # reg and rm can be of three different possible intervals
1311 # start_reg/rm to end_reg/rm (e.g. from %al to %bh)
1312 # start_reg/rm to end_reg0/rm0 (e.g from %al to %dil)
1313 # start_reg8/rm8 to end_reg8/rm8 (e.g. from %r8b to %r15b)
1314 # First form can be observed if there are not REX, second form
1315 # if REX.R/REX.B is not set and third form is where it's set.
1316 if reg:
1317 start_reg = REGISTERS[reg][0]
1318 start_reg8 = REGISTERS[r8[reg]][0]
1319 end_reg = REGISTERS[reg][-1]
1320 end_reg0 = 'dil' if reg == 'al' else end_reg
1321 end_reg8 = REGISTERS[r8[reg]][-1 if regR15 else -2]
1322 if rexw:
1323 reg_regex = '\\[(%' + start_reg + '\\.\\.%' + end_reg0 + ')]'
1324 else:
1325 reg_regex = '\\[(%' + start_reg + '\\.\\.%' + end_reg + ')]'
1326 if rm:
1327 start_rm = REGISTERS[rm][0]
1328 start_rm8 = REGISTERS[r8[rm]][0]
1329 end_rm = REGISTERS[rm][-1]
1330 end_rm0 = 'dil' if rm == 'al' else end_rm
1331 end_rm8 = REGISTERS[r8[rm]][-1 if rmR15 else -2]
1332 if rexw:
1333 rm_regex = ('\\[(%' + start_rm + '\\.\\.%' + end_rm0 + ')'
1334 '(?: or memory)?]')
1335 else:
1336 rm_regex = ('\\[(%' + start_rm + '\\.\\.%' + end_rm + ')'
1337 '(?: or memory)?]')
1338
1339 # Legacy prefixes
1340 regex = '.*:(?: 26| 2e| 36| 3e| 64| 65| 66| 67| f0| f2| f3)*'
1341 # REX
1342 regex += '( 48).*' if rexw else '( 40|).*'
1343 # Replacement text
1344 replacement_tuple = (' [REX:48..4f]' if rexw else ' [REX:40..47]?', )
1345 if reg: replacement_regs = '%{}..%{}'.format(start_reg, end_reg8)
1346 if rm: replacement_rms = '%{}..%{}'.format(start_rm, end_rm8)
1347 # Instruction arguments
1348 if not reg and not rm:
1349 pass
1350 elif not reg and rm:
1351 regex += rm_regex + '.*'
1352 replacement_tuple += (replacement_rms, )
1353 elif reg and not rm:
1354 regex += reg_regex + '.*'
1355 replacement_tuple += (replacement_regs, )
1356 elif rm_to_reg:
1357 regex += rm_regex + ',' + reg_regex + '.*'
1358 replacement_tuple += (replacement_rms, replacement_regs)
1359 else:
1360 regex += reg_regex + ',' + rm_regex + '.*'
1361 replacement_tuple += (replacement_regs, replacement_rms)
1362 # Input and output restricted registers
1363 if input_rr:
1364 regex += 'input_rr=\\[(%eax\\.\\.%edi)].*'
1365 replacement_tuple += ('%eax..%r15d', )
1366 if output_rr:
1367 regex += 'output_rr=\\[(%eax\\.\\.%edi)].*'
1368 replacement_tuple += ('%eax..%r14d', )
1369 regex += '()'
1370 replacement_tuple += ('', )
1371 # Replacement cases
1372 replacement_tuples = ()
1373 for byte in (range(0x48, 0x50) if rexw else range(0x40, 0x48) + [0]):
1374 replacement_case = (' {:02x}'.format(byte) if byte else '', )
1375 if rm:
1376 if byte & 0x1:
1377 replacement_rms = '%{}..%{}'.format(start_rm8, end_rm8)
1378 elif byte:
1379 replacement_rms = '%{}..%{}'.format(start_rm, end_rm0)
1380 else:
1381 replacement_rms = '%{}..%{}'.format(start_rm, end_rm)
1382 if byte & 0x2:
1383 replacement_index = '%r8d..%r15d'
1384 else:
1385 replacement_index = '%eax..%edi'
1386 if reg:
1387 if byte & 0x4:
1388 replacement_regs = '%{}..%{}'.format(start_reg8, end_reg8)
1389 elif byte:
1390 replacement_regs = '%{}..%{}'.format(start_reg, end_reg0)
1391 else:
1392 replacement_regs = '%{}..%{}'.format(start_reg, end_reg)
1393 if not reg and not rm:
1394 pass
1395 elif not reg and rm:
1396 replacement_case += (replacement_rms, )
1397 final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'
1398 elif reg and not rm:
1399 replacement_case += (replacement_regs, )
1400 final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'
1401 elif rm_to_reg:
1402 replacement_case += (replacement_rms, replacement_regs)
1403 final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'
1404 else:
1405 replacement_case += (replacement_regs, replacement_rms)
1406 final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'
1407 if input_rr: replacement_case += (replacement_index, )
1408 if output_rr: replacement_case += (final_rr, )
1409 replacement_tuples += (replacement_case, )
1410 return Compressor(regex, replacement_tuple, replacement_tuples)
1411
1412
1413 def AllRexCompressors():
1414 """ Return "REX" compressors which combine different REX prefixes. """
1415
1416 if options.bitness != 64:
1417 return []
1418
1419 compressors = []
1420
1421 # First pretty complex set of compressors to combine versions of REX with
1422 # three lowest bits in different states.
1423 register_kind_pairs = (
1424 ( None, None),
1425 ( 'al', 'al'), ( 'al', None), (None, 'al'),
1426 ( 'ax', 'al'), ( 'al', 'ax'),
1427 ( 'ax', 'ax'), ( 'ax', None), (None, 'ax'),
1428 ( 'eax', 'al'), ( 'al', 'eax'),
1429 ( 'eax', 'ax'), ( 'ax', 'eax'),
1430 ( 'eax', 'eax'), ( 'eax', None), (None, 'eax'),
1431 ( 'rax', 'al'), ( 'al', 'rax'),
1432 ( 'rax', 'ax'), ( 'ax', 'rax'),
1433 ( 'rax', 'eax'), ( 'eax', 'rax'),
1434 ( 'rax', 'rax'), ( 'rax', None), (None, 'rax'),
1435 ( 'eax', 'mm0'), ( 'mm0', 'eax'),
1436 ( 'rax', 'mm0'), ( 'mm0', 'rax'),
1437 ( 'mm0', 'eax'), ( 'eax', 'mm0'),
1438 ( 'mm0', 'rax'), ( 'rax', 'mm0'),
1439 ( 'eax', 'xmm0'),
1440 ( 'rax', 'xmm0'),
1441 ('xmm0', 'eax'),
1442 ('xmm0', 'rax'),
1443 ( 'mm0', 'mm0'), ( 'mm0', None), (None, 'mm0'),
1444 ( 'mm0', 'xmm0'),
1445 ('xmm0', 'mm0'),
1446 ('xmm0', 'xmm0'),
1447 ('xmm0', 'ymm0'), ('xmm0', None), (None, 'xmm0'),
1448 ('ymm0', 'xmm0'),
1449 ('ymm0', 'ymm0'), ('ymm0', None), (None, 'ymm0'),
1450 )
1451 for reg, rm in register_kind_pairs:
1452 for regR15 in (True, False):
1453 for rmR15 in (True, False):
1454 for rexw in (True, False):
1455 for input_rr in (True, False):
1456 for output_rr in (True, False):
1457 for rm_to_reg in (True, False):
1458 # These combinations will just produce useless duplicates
1459 if not reg and not regR15: continue
1460 if not rm and not rmR15: continue
1461 if not reg and not rm and (output_rr or rm_to_reg): continue
1462 compressors.append(RexCompressor(
1463 rm=rm, rmR15=rmR15, reg=reg, regR15=regR15,
1464 rexw=rexw, input_rr=input_rr, output_rr=output_rr,
1465 rm_to_reg=rm_to_reg))
1466
1467 # This is pretty simple compressor to combine two lines with different REX.W
1468 # bits (only if they are otherwise identical).
1469 compressors.append(Compressor(
1470 '.*(\\[REX:40\\.\\.47]\\?).*()', ('[REX:40..4f]?', ''),
1471 (('[REX:40..47]?', ), ('[REX:48..4f]', ))))
1472 return compressors
1473
1474
1475 def AllSpecialCompressors():
1476 """ Return all "special" compressors. """
1477
1478 compressors = []
1479
1480 # Special compressors: will handle some cosmetic issues.
1481 #
1482 # SETxx ignores reg field and thus are described as many separate instructions
1483 compressors.append(Compressor(
1484 '.*0f 9[0-9a-fA-F] XX(/[0-7]) set.*()', ('', ''),
1485 [('/' + str(i), ) for i in range(8)]))
1486 # BSWAP is described with opcode "0f c8+r", not "0f /1" in manual
1487 if options.bitness == 32:
1488 compressors.append(Compressor(
1489 '.*(XX/1) bswap.*ax.*()', ('c[8..f]', ''), [('XX/1', )]))
1490 else:
1491 compressors.append(Compressor(
1492 '.*(XX/1) bswap.*ax.*()', ('c[89abef]', ''), [('XX/1', )]))
1493 compressors.append(Compressor(
1494 '.*(XX/1) bswap.*r8.*()', ('c[8..e]', ''), [('XX/1', )]))
1495 # Add mark '# write to both' to certain versions of CMPXCHG, XADD, and XCHG
1496 if options.bitness == 64:
1497 compressors.append(Compressor(
1498 '.* (?:cmpxchg|xadd|xchg).*%al\\.\\.%bh[^#]*()$',
1499 (' # write to both', ), ((), )))
1500 # "and $0xe0,[%eax..%edi]" is treated specially which means that we list all
1501 # versions of and "[$0x1..$0xff],[%eax..%edi]" separately here.
1502 # Without this rule these ands comprise 2/3 of the whole output!
1503 if options.bitness == 32:
1504 compressors.append(Compressor(
1505 '.*83 (e0 01 and \\$0x1,%eax)()',
1506 ('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', ' # special and'),
1507 [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS['eax'][r]), )
1508 for i in range(0x01, 0x100) for r in range(8)] +
1509 [('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', )]))
1510 else:
1511 for reg in ('eax', 'r8d'):
1512 start_reg = REGISTERS[reg][0]
1513 end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]
1514 for index_reg in ('eax', 'r8d'):
1515 start_index = REGISTERS[index_reg][0]
1516 end_index = REGISTERS[index_reg][-1]
1517 compressors.append(Compressor(
1518 '.*83 (e0 01 and \\$0x1,%' + reg + ').*'
1519 'input_rr=(any_nonspecial); output_rr=(%' + reg + ')()',
1520 ('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,
1521 end_reg), '[%{}..%{}]'.format(start_index, end_index),
1522 '[%{}..%{}]'.format(start_reg, end_reg),
1523 ' # special and'),
1524 [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS[reg][r]),
1525 'any_nonspecial', '%' + REGISTERS[reg][r])
1526 for i in range(0x01, 0x100) for r in range(7 + (reg == 'eax'))] +
1527 [('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,
1528 end_reg), '[%{}..%{}]'.format(start_index, end_index),
1529 '[%{}..%{}]'.format(start_reg, end_reg))]))
1530
1531 # "and $e0" and similar are used to align %rsp. All negative values are
1532 # accepted by validator and there are 127 of these.
1533 # Consolidate them into one line.
1534 if options.bitness == 64:
1535 compressors.append(Compressor(
1536 '.*(?:81|83) (?:e4|e5) (80) (?:00 00 00 |) and \\$0x(80),%r[bs]p.*()',
1537 ('[80..ff]', '[80..ff]', ' # alignment and'),
1538 [('{:02x}'.format(i), '{:02x}'.format(i)) for i in range(0x80, 0x100)]))
1539 return compressors
1540
1541
1542 def PrepareCompressors():
1543 """ Return list of all compressors sorted from bigger ones to smaller ones """
1544
1545 return tuple(sorted(
1546 AllRegToRmCompressors() +
1547 AllRmCompressors() +
1548 AllOpcodeCompressors() +
1549 AllMemoryNonMemoryCompressors() +
1550 AllRexCompressors() +
1551 AllSpecialCompressors(),
1552 key=lambda compressor: -len(compressor.replacements)))
1553
1554
1555 def ShowProgress(rule, instruction):
1556 if rule not in ShowProgress.rules_shown:
1557 first_print = True
1558 ShowProgress.rules_shown[rule]=len(ShowProgress.rules_shown)
1559 else:
1560 first_print = False
1561 print >> sys.stderr, '-------- Compressed --------'
1562 print >> sys.stderr, 'Rule:', ShowProgress.rules_shown[rule]
1563 print >> sys.stderr, '--------'
1564 compressor = compressors[rule]
1565 match = compressor.regex.match(instruction)
1566 assert match
1567 format_str = CompressionTemplate(instruction, match, '{{{}}}')
1568 replacements = sorted(format_str.format(*replacement)
1569 for replacement in compressor.replacements)
1570 if len(compressor.replacements) <= 4 or first_print:
1571 for replacement in replacements:
1572 print >> sys.stderr, replacement
1573 else:
1574 print >> sys.stderr, replacements[0]
1575 print >> sys.stderr, '...'
1576 print >> sys.stderr, replacements[-1]
1577 print >> sys.stderr, '--------'
1578 print >> sys.stderr, 'Compressed', (
1579 format_str + '{{{}}}').format(*compressor.subst)
1580 ShowProgress.rules_shown = {}
1581
1582
1583 def main():
1584 # We are keeping these global to share state graph and compressors
1585 # between workers spawned by multiprocess. Passing them every time is slow.
1586 global options, xml_file
1587 global dfa
1588 global worker_validator
1589 options, xml_file = ParseOptions()
1590 dfa = dfa_parser.ParseXml(xml_file)
1591 worker_validator = validator.Validator(
1592 validator_dll=options.validator_dll,
1593 decoder_dll=options.decoder_dll)
1594 global compressors
1595 compressors = PrepareCompressors()
1596
1597 assert dfa.initial_state.is_accepting
1598 assert not dfa.initial_state.any_byte
1599
1600 print >> sys.stderr, len(dfa.states), 'states'
1601
1602 num_suffixes = dfa_traversal.GetNumSuffixes(dfa.initial_state)
1603
1604 # We can't just write 'num_suffixes[dfa.initial_state]' because
1605 # initial state is accepting.
1606 total_instructions = sum(
1607 num_suffixes[t.to_state]
1608 for t in dfa.initial_state.forward_transitions.values())
1609 print >> sys.stderr, total_instructions, 'regular instructions total'
1610
1611 tasks = dfa_traversal.CreateTraversalTasks(dfa.states, dfa.initial_state)
1612 print >> sys.stderr, len(tasks), 'tasks'
1613
1614 pool = multiprocessing.Pool()
1615
1616 results = pool.imap(Worker, tasks)
1617
1618 total = 0
1619 num_valid = 0
1620 full_output = set()
1621 for prefix, count, valid_count, output, trace in results:
1622 print >> sys.stderr, 'Prefix:', ', '.join(map(hex, prefix))
1623 total += count
1624 num_valid += valid_count
1625 full_output |= output
1626 for rule, instruction in trace:
1627 ShowProgress(rule, instruction)
1628 for instruction in sorted(Compressed(full_output,
1629 compressors,
1630 ShowProgress)):
1631 print instruction
1632
1633 print >> sys.stderr, total, 'instructions were processed'
1634 print >> sys.stderr, num_valid, 'valid instructions'
1635
1636
1637 if __name__ == '__main__':
1638 main()
OLDNEW
« no previous file with comments | « no previous file | src/trusted/validator_ragel/testdata/32bit_regular.golden » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698