Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(213)

Side by Side Diff: src/trusted/validator_ragel/compress_regular_instructions.py

Issue 49183002: Regular instructions golden file test. Base URL: svn://svn.chromium.org/native_client/trunk/src/native_client/
Patch Set: Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | src/trusted/validator_ragel/testdata/32bit_regular.golden » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright (c) 2013 The Native Client Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 """
6 Traverse the validator's DFA, collect all "normal" instruction and then
7 compress output. Note: "anybyte fields" (immediates and displacements)
8 are always filled with zeros. Otherwise processing of sextillions (sic!)
9 of possibilities will take too long.
10
11 Each rule is applied only when all variants are accepted by validator.
12 The following compression rules are present:
13
14 1. Compress ModR/M (+SIB & displacement).
15 Instruction: 00 00 add %al,(%rax)
16 ...
17 Instruction: 00 ff add %bh,%bh
18 becomes
19 Instruction: 00 XX add [%al..%bh],[%al..%bh or memory]
20
21 1a. Compress ModR/M (+SIB & displacement) memory-only.
22 Instruction: f0 01 00 lock add %eax,(%eax)
23 ...
24 Instruction: f0 01 bf 00 00 00 00 lock add %edi,0x0(%edi)
25 becomes
26 Instruction: f0 01 XX lock add [%eax..edi],[memory]
27
28 1b. Compress ModR/M register only.
29 Instruction: 66 0f 50 c0 movmskpd %xmm0,%eax
30 ...
31 Instruction: 66 0f 50 ff movmskpd %xmm7,%edi
32 becomes
33 Instruction: 66 0f 50 XX movmskpd [%xmm0..%xmm7],[%eax..edi]
34
35 2. Compress ModR/M (+SIB & displacement) with opcode extension.
36 Instruction: 0f 90 00 seto (%eax)
37 ...
38 Instruction: 0f 90 c7 seto %bh
39 becomes
40 Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
41
42 2a. Compress ModR/M (+SIB & displacement) memory-only with opcode extension.
43 Instruction: f0 ff 00 lock incl (%eax)
44 ...
45 Instruction: f0 ff 84 ff 00 00 00 00 lock incl 0x0(%edi,%edi,8)
46 becomes
47 Instruction: f0 ff XX/1 lock decl [memory]
48
49 2b. Compress ModR/M register-only with opcode extension.
50 Instruction: 0f 71 d0 00 psrlw $0x0,%mm0
51 ...
52 Instruction: 0f 71 d7 00 psrlw $0x0,%mm7
53 becomes
54 Instruction: 66 0f 71 XX/2 00 psrlw $0x0,[%mm0..%mm7]
55
56 3. Compress register-in-opcode.
57 Instruction: d9 c0 fld %st(0)
58 ...
59 Instruction: d9 c7 fld %st(7)
60 becomes
61 Instruction: Instruction: d9 c[0..7] fld [%st(0)..%st(7)]
62
63 Only applies if all possible register accesses are accepted by validator.
64
65 4. Special compressor for "set" instruction.
66 Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
67 ...
68 Instruction: 0f 90 XX/7 seto [%al..%bh or memory]
69 becomes
70 Instruction: 0f 90 XX seto [%al..%bh or memory]
71 """
72
73 import itertools
74 import multiprocessing
75 import optparse
76 import os
77 import re
78 import subprocess
79 import sys
80 import tempfile
81 import traceback
82
83 import dfa_parser
84 import dfa_traversal
85 import validator
86
87
88 # Register names in 'natual' order (as defined by IA32/x86-64 ABI)
89 #
90 # X86-64 ABI splits all registers in groups of 8 because it uses 3-bit field
91 # in opcode, ModR/M, and/or SIB bytes to encode them.
92 #
93 # In most cases there are 16 registers of a given kind and two such groups,
94 # but there are couple of exceptions:
95 # 1. There are 20 8-bit registers and three groups (two of them overlap)
96 # 2. There are eight X87 and MMX registers thus two groups are identical
97 #
98 # We use typical register from a group to name the whole group. Most groups
99 # use first register, but 'spl' group uses fifth register because it's first
100 # four registers are the same as 'al' group. We use mnemonic name 'mmalt'
101 # to represent the "evil mirror" of the 'mm0' group.
102 REGISTERS = {
103 'al': [ 'al', 'cl', 'dl', 'bl', 'ah', 'ch', 'dh', 'bh' ],
104 'spl': [ 'al', 'cl', 'dl', 'bl', 'spl', 'bpl', 'sil', 'dil' ],
105 'ax': [ 'ax', 'cx', 'dx', 'bx', 'sp', 'bp', 'si', 'di' ],
106 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'esp', 'ebp', 'esi', 'edi' ],
107 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'rsp', 'rbp', 'rsi', 'rdi' ],
108 'r8b': [ 'r{}b'.format(N) for N in range(8,16) ],
109 'r8w': [ 'r{}w'.format(N) for N in range(8,16) ],
110 'r8d': [ 'r{}d'.format(N) for N in range(8,16) ],
111 'r8': [ 'r{}'.format(N) for N in range(8,16) ],
112 'mm0': [ 'mm{}'.format(N) for N in range(8) ],
113 'mmalt': [ 'mm{}'.format(N) for N in range(8) ],
114 'st(0)': [ 'st({})'.format(N) for N in range(8) ],
115 'xmm0': [ 'xmm{}'.format(N) for N in range(8) ],
116 'xmm8': [ 'xmm{}'.format(N) for N in range(8,16) ],
117 'ymm0': [ 'ymm{}'.format(N) for N in range(8) ],
118 'ymm8': [ 'ymm{}'.format(N) for N in range(8,16) ]
119 }
120
121
122 NOP = 0x90
123
124
125 def PadToBundleSize(bytes):
126 assert len(bytes) <= validator.BUNDLE_SIZE
127 return bytes + [NOP] * (validator.BUNDLE_SIZE - len(bytes))
128
129
130 # In x86-64 mode we have so-called 'restricted register' which is used to
131 # tie two groups together. Some instructions require particular value to
132 # be stored in this variable, while some accept any non-special restricted
133 # register (%ebp and %esp are special because they can only be accepted by
134 # a few 'special' instructions).
135 #
136 # You can find more details in the "NaCl SFI model on x86-64 systems" manual.
137 #
138 # We try to feed all possible 'restricted registers' into validator and then
139 # classify the instruction using this map. If set of acceptable 'restricted
140 # registers' is not here, then it's an error in validator.
141 ACCEPTABLE_X86_64_INPUTS = {
142 0x00001: 'input_rr=%eax',
143 0x00002: 'input_rr=%ecx',
144 0x00004: 'input_rr=%edx',
145 0x00008: 'input_rr=%ebx',
146 0x00010: 'input_rr=%esp',
147 0x00020: 'input_rr=%ebp',
148 0x00040: 'input_rr=%esi',
149 0x00080: 'input_rr=%edi',
150 0x00100: 'input_rr=%r8d',
151 0x00200: 'input_rr=%r9d',
152 0x00400: 'input_rr=%r10d',
153 0x00800: 'input_rr=%r11d',
154 0x01000: 'input_rr=%r12d',
155 0x02000: 'input_rr=%r13d',
156 0x04000: 'input_rr=%r14d',
157 0x08000: 'input_rr=%r15d',
158 0x1ffcf: 'input_rr=any_nonspecial'
159 }
160
161 # Any instruction must produce either None or one of fifteen registers as an
162 # output 'restricted register' value. 'r15d' is NOT acceptable as an output.
163 ACCEPTABLE_X86_64_OUTPUT_REGISTERS = tuple(
164 '%' + reg for reg in (REGISTERS['eax'] + REGISTERS['r8d'])[0:-1])
165
166
167 def ValidateInstruction(instruction, validator_inst):
168 bundle = ''.join(map(chr, PadToBundleSize(instruction)))
169 if options.bitness == 32:
170 result = validator_inst.ValidateChunk(bundle, bitness=32)
171 return result, []
172 else:
173 valid_inputs = 0
174 known_final_rr = None
175 output_rr = None
176 # Note that iteration order is aligned with ACCEPTABLE_X86_64_INPUTS array
177 # above.
178 for bit, initial_rr in enumerate(validator.ALL_REGISTERS + [None]):
179 valid, final_rr = validator_inst.ValidateAndGetFinalRestrictedRegister(
180 bundle, len(instruction), initial_rr)
181 if valid:
182 # final_rr should not depend on input_rr
183 assert valid_inputs == 0 or known_final_rr == final_rr
184 valid_inputs |= 1 << bit
185 known_final_rr = final_rr
186 # If nothing is accepted then instruction is not valid. Easy and simple.
187 if valid_inputs == 0: return False, []
188 # If returned value in unacceptable we'll get IndexError here and this
189 # test will fail
190 if known_final_rr is not None:
191 output_rr = ACCEPTABLE_X86_64_OUTPUT_REGISTERS[known_final_rr]
192 # If collected valid_inputs are unacceptable we'll get KeyError here and
193 # this test will fail
194 return True, [ACCEPTABLE_X86_64_INPUTS[valid_inputs],
195 'output_rr={}'.format(output_rr)]
196
197
198 class WorkerState(object):
199 def __init__(self, prefix, validator):
200 self.total_instructions = 0
201 self.num_valid = 0
202 self.validator = validator
203 self.output = set()
204 self.trace = []
205
206
207 def ReceiveInstruction(self, bytes):
208 self.total_instructions += 1
209 result, notes = ValidateInstruction(bytes, self.validator)
210 if result:
211 self.num_valid += 1
212 dis = self.validator.DisassembleChunk(
213 ''.join(map(chr, bytes)),
214 bitness=options.bitness)
215 for line_nr in xrange(len(dis)):
216 dis[line_nr] = str(dis[line_nr])
217 assert dis[line_nr][0:17] == 'Instruction(0x' + str(line_nr) + ': '
218 assert dis[line_nr][-1:] == ')'
219 dis[line_nr] = dis[line_nr][17:-1]
220 # If %rip is involved then comment will be different depending on the
221 # instruction length. Eliminate it.
222 if '(%rip)' in dis[0]:
223 dis[0] = re.sub(' # 0x[ ]*[0-9a-fA-F]*', '', dis[0])
224 # Zero displacements are represented as 0x0 for all instructions except
225 # jumps where they disassembled as non-zero due to %eip/%rip-relative
226 # addressing. We replace this displacement with %eip/%rip to simplify
227 # compression.
228 if ' 0x' in dis[0] and ' 0x0' not in dis[0]:
229 for bytes in xrange(1, 16):
230 dis[0] = re.sub(
231 '(' + '(?:[0-9a-fA-F][0-9a-fA-F] ){' + str(bytes) + '} .* )' +
232 hex(bytes) + '(.*)',
233 '\\1%eip\\2' if options.bitness == 32 else '\\1%rip\\2',
234 dis[0]);
235 dis[0] = 'Instruction: ' + dis[0]
236 dis += notes
237 self.output.add('; '.join(dis))
238
239
240 def RecordTrace(self, compressor_nr, instruction):
241 self.trace.append((compressor_nr, instruction))
242
243
244 # Compressor has three slots: regex (which picks apart given instruction),
245 # subst (which is used to denote compressed version) and replacements (which
246 # are used to generate set of instructions from a given code).
247 #
248 # Example compressor:
249 # regex = '.*?[0-9a-fA-F]([0-7]) \\w* (%e(?:[abcd]x|[sb]p|[sd]i)).*()'
250 # subst = ('[0-7]', '[%eax..%edi]', ' # register in opcode')
251 # replacements = ((0, '%eax'), (1, '%ecx'), (2, '%edx'), (3, '%ebx')
252 # (4, '%esp'), (5, '%ebp'), (6, '%esi'), (7, '%edi'))
253 #
254 # When faced with instriuction '40 inc %eax' it will capture the following
255 # pieces of said instruction: '4[0] inc [%eax]'.
256 #
257 # Then it will produce the following eight instructions:
258 # '40 inc %eax'
259 # '41 inc %ecx'
260 # '42 inc %edx'
261 # '43 inc %ebx'
262 # '44 inc %esp'
263 # '45 inc %ebp'
264 # '46 inc %esi'
265 # '47 inc %edi'
266 #
267 # If all these instructions can be found in a set of instructions then
268 # compressor will remove them from said set and will insert one replacement
269 # "compressed instruction" '4[0-7] inc [%eax..%edi] # register in opcode'.
270 #
271 # Note that last group is only used in the replacement. It's used to grab marks
272 # added by previous compressors and to replace them with a new mark.
273 class Compressor(object):
274 __slots__ = [
275 'regex',
276 'subst',
277 'replacements'
278 ]
279
280 def __init__(self, regex, subst, replacements=None):
281 self.regex = re.compile(regex)
282 self.subst = subst
283 self.replacements = [] if replacements is None else replacements
284
285
286 def CompressionTemplate(instruction, match, mark):
287 """ Replace all match groups with the mark. """
288 pos = 0
289 format_str = ''
290 for group in range(1, len(match.groups())):
291 format_str += instruction[pos:match.start(group)] + mark
292 pos = match.end(group)
293 return format_str + instruction[pos:match.start(len(match.groups()))]
294
295
296 def CompressOneMatch(instructions, instruction, match, compressor):
297 format_str = CompressionTemplate(instruction, match, '{}')
298 subset = set()
299 for replacement in compressor.replacements:
300 replacement_str = format_str.format(*replacement)
301 if not replacement_str in instructions:
302 return (False, instructions)
303 subset.add(replacement_str)
304 instructions -= subset
305 instructions.add((format_str + '{}').format(*compressor.subst))
306 return (True, instructions)
307
308
309 def CompressOneInstruction(instructions, compressors, split, cache):
310 sorted_instructions = (sorted(i for i in instructions if i > split) +
311 sorted(i for i in instructions if i < split))
312 for instruction in sorted_instructions:
313 if instruction in cache:
314 compressors_list = cache[instruction]
315 for compressor_nr, match, compressor in compressors_list:
316 result, instructions = CompressOneMatch(
317 instructions, instruction, match, compressor)
318 if result:
319 return (instructions, compressor_nr, instruction)
320 else:
321 compressors_list = []
322 for compressor_nr, compressor in enumerate(compressors):
323 match = compressor.regex.match(instruction)
324 if match:
325 compressors_list.append((compressor_nr, match, compressor))
326 result, instructions = CompressOneMatch(
327 instructions, instruction, match, compressor)
328 if result:
329 return (instructions, compressor_nr, instruction)
330 cache[instruction] = compressors_list
331 return (instructions, False, False)
332
333
334 def Compressed(instructions, compressors, show_progress):
335 split = ''
336 cache = {}
337 while True:
338 instructions, rule, split = CompressOneInstruction(
339 instructions, compressors, split, cache)
340 if rule is False: break
341 show_progress(rule, split)
342 return instructions
343
344
345 def Worker((prefix, state_index)):
346 worker_state = WorkerState(prefix, worker_validator)
347
348 try:
349 dfa_traversal.TraverseTree(
350 dfa.states[state_index],
351 final_callback=worker_state.ReceiveInstruction,
352 prefix=prefix,
353 anyfield=0)
354 if (prefix[0] != 0x0f or prefix[1] != 0x0f): # Skip 3DNow! instructions
355 worker_state.output = Compressed(set(worker_state.output),
356 compressors,
357 worker_state.RecordTrace)
358 except Exception as e:
359 traceback.print_exc() # because multiprocessing imap swallows traceback
360 raise
361
362 return (
363 prefix,
364 worker_state.total_instructions,
365 worker_state.num_valid,
366 worker_state.output,
367 worker_state.trace)
368
369
370 def ParseOptions():
371 parser = optparse.OptionParser(usage='%prog [options] xmlfile')
372
373 parser.add_option('--bitness',
374 choices=['32', '64'],
375 help='The subarchitecture: 32 or 64')
376 parser.add_option('--validator_dll',
377 help='Path to librdfa_validator_dll')
378 parser.add_option('--decoder_dll',
379 help='Path to librdfa_decoder_dll')
380
381 options, args = parser.parse_args()
382 options.bitness = int(options.bitness)
383
384 if len(args) != 1:
385 parser.error('specify one xml file')
386
387 (xml_file, ) = args
388
389 return options, xml_file
390
391
392 # Version suitable for use in regular expressions
393 REGISTERS_RE = REGISTERS.copy()
394 REGISTERS_RE['st(0)'] = [ 'st\\({}\\)'.format(N) for N in range(8) ]
395 REGISTERS_RE['st\\(0\\)'] = REGISTERS_RE['st(0)']
396
397 # Index names in 'natual' order (as defined by IA32/x86-64 ABI)
398 INDEXES = {
399 'eax': [ 'eax', 'ecx', 'edx', 'ebx', 'eiz', 'ebp', 'esi', 'edi' ],
400 'rax': [ 'rax', 'rcx', 'rdx', 'rbx', 'riz', 'rbp', 'rsi', 'rdi' ],
401 'r8': [ 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15' ]
402 }
403 # Register which can not be used as base in 64-bit mode in all incarnations
404 X86_64_BASE_REGISTERS = set([
405 '%spl', '%bpl', '%r15b',
406 '%sp', '%bp', '%r15w',
407 '%esp', '%ebp', '%r15d',
408 '%rsp', '%rbp', '%r15',
409 '%rip'
410 ])
411
412
413 def InstructionIsDangerous(input, output, register_write,
414 writes_to, memory_accessed=False,
415 base_text='%r15', index_text='%r15'):
halyavin 2013/11/18 09:50:47 Can we use %riz as default value instead?
khim 2013/11/18 12:10:21 Hmm... Yes, I just wanted to use "always valid" re
416 """ Check if instruction with given replacements will be dangerous
417
418 Args:
419 input: input argument
420 output: output argument
421 register_write: three-state selector
422 'sandbox' - instruction can be used to produce "restricted register"
423 'protect' - instruction can damage output, protect "special registers"
424 'ignore' - instruction does not affect it's operands (e.g. test) or
425 is used with non-GP registers (X87, MMX, XMM, etc)
426 memory_accessed: True if instruction accesses memory
427 base: base register (if memory is accessed)
428 index: index register (if memory is accessed)
429
430 Returns:
431 True if instruction should be rejected by validator
432 """
433 if memory_accessed:
434 if base_text not in X86_64_BASE_REGISTERS:
435 return True
436 if index_text in X86_64_BASE_REGISTERS - set(['%r15']):
halyavin 2013/11/18 09:50:47 Why index_text can be %r15? Is (%r15,%r15,) valid?
khim 2013/11/18 12:10:21 That's two separate questions :-) Yes, I need a ne
437 return True
438 if register_write == 'protect' and output in X86_64_BASE_REGISTERS:
439 return True
440 if register_write == 'sandbox' and output == '%r15d':
441 return True
442 if (writes_to == 'both' and input in X86_64_BASE_REGISTERS):
halyavin 2013/11/18 09:50:47 We don't need brackets here.
khim 2013/11/18 12:10:21 Done.
443 return True
444 return False
445
446
447 def AppendOperandsReplacement(replacement, rm_text, reg, modrm, writes_to):
448 """ Appends replacement text to replacement list
449
450 Args:
451 replacement: replacement list
452 rm_text: replacement for rm field
453 reg: register kind (or None if reg field is used as opcode extension)
454 reg_field: value of reg field in ModR/M byte (is only used if reg != None)
halyavin 2013/11/18 09:50:47 You changed the argument from reg_field to modrm.
khim 2013/11/18 12:10:21 Done.
455 writes_to: three-state selector
456 'reg' - instruction uses rm as source, reg as destination
457 'rm' - instruction uses reg as source, rm as destination
458 'both' - instruction writes to both reg and rm
459
460 Returns:
461 input: textual representation of input argument
462 output: textual representation of output argument
463
464 Side-effect:
465 output (if reg is None) or (input, output) tuple (if reg is not None)
466 are added to replacement list.
467 """
468 reg_field = (modrm >> 3) & 0x07
halyavin 2013/11/18 09:50:47 Move this line to else clause.
khim 2013/11/18 12:10:21 Done.
469 if reg is None:
470 assert writes_to == 'rm'
471 input, output = None, rm_text
472 replacement.append(output)
473 else:
474 reg_text = '%' + REGISTERS[reg][reg_field]
475 if writes_to == 'reg':
476 input, output = rm_text, reg_text
477 else: # rm, both
478 input, output = reg_text, rm_text
479 replacement.extend([input, output])
480 return input, output
481
482
483 def ModRMRegisterReplacements(rm, reg=None, writes_to='rm', opcode_bits=0,
484 register_write='ignore'):
485 """Creates replacement tuples list for register-to-register instructions
486
487 Args:
488 rm: rm operand kind (see REGISTERS array)
489 reg: reg operand kind (see REGISTERS array) or None if reg is not used
490 writes_to: three-state selector
491 'reg' - instruction uses rm as source, reg as destination
492 'rm' - instruction uses reg as source, rm as destination
493 'both' - instruction writes to both reg and rm
494 opcode_bits: opcode extensions code (used when reg is None)
495 register_write: three-state selector
496 'sandbox' - instruction can be used to produce "restricted register"
497 'protect' - instruction can damage output, protect "special registers"
498 'ignore' - instruction does not affect it's operands (e.g. test) or
499 is used with non-GP registers (X87, MMX, XMM, etc)
500 Returns:
501 List of replacement tuples
502 """
503 # Reg field can be used either as reg or as opcode extension, but not both
504 assert reg is None or opcode_bits == 0
505
506 output_key = (options.bitness, reg, rm, writes_to, opcode_bits,
507 register_write)
508 if output_key in ModRMRegisterReplacements.replacements:
509 return ModRMRegisterReplacements.replacements[output_key]
510
511 replacements = []
512
513 # Two upper bits of ModR/M byte (mod field) must be equal to 11
514 # This gives us range from 0xc0 to 0xff but we are going from the
515 # end to make rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
516 if reg is None:
517 # reg field is used as opcode extension
518 byte_range = [byte
519 for byte in range(0xff, 0xbf, -1)
520 if (byte >> 3) & 0x7 == opcode_bits]
521 else:
522 byte_range = range(0xff, 0xbf, -1)
523
524 for modrm in byte_range:
525 rm_field = (modrm & 0x07)
526 rm_text = '%' + REGISTERS[rm][rm_field]
527 byte_text = '{:02x}'.format(modrm)
528 replacement = [byte_text]
529 input, output = AppendOperandsReplacement(
530 replacement, rm_text, reg, modrm, writes_to)
531 if options.bitness == 64:
532 replacement.append('any_nonspecial') # input_rr
533 replacement.append(output if register_write == 'sandbox' else None)
534 if InstructionIsDangerous(input, output, register_write, writes_to):
535 continue
536 replacements.append(tuple(replacement))
537 ModRMRegisterReplacements.replacements[output_key] = tuple(replacements)
538 return ModRMRegisterReplacements.replacements[output_key]
539 ModRMRegisterReplacements.replacements = {}
540
541
542 def BaseOnlyMemoryOperand(modrm, base):
543 """Creates replacement tuples list for register-to-memory instructions
544 (base only, no SIB)
545
546 Args:
547 modrm: modrm byte
548 base: register kind for base
549 Returns:
550 bytes_text: replacement for "bytes" group
551 rm_text: textual representation of "rm" argument
552 base_text: textual representation of "base" register
553 """
554 mod_field = (modrm >> 6) & 0x03
555 rm_field = (modrm & 0x07)
556 base_text = '%' + REGISTERS[base][rm_field]
557 # If RM field == %rbp and MOD field is zero then it's absolute address
558 # in 32-bit mode and %rip-based address in 64-bit mode
559 if mod_field == 0 and rm_field == validator.REG_RBP:
560 bytes_text = '{:02x} 00 00 00 00'.format(modrm)
561 rm_text = '0x0' if options.bitness == 32 else '0x0(%rip)'
562 base_text = '%rip'
halyavin 2013/11/18 09:50:47 Use %riz or %eiz in 32-bit case.
khim 2013/11/18 12:10:21 Done.
563 # Memory access with just a base register
564 elif mod_field == 0:
565 bytes_text = '{:02x}'.format(modrm)
566 rm_text = '({})'.format(base_text)
567 # Memory access with base and 8bit offset
568 elif mod_field == 1:
569 bytes_text = '{:02x} 00'.format(modrm)
570 rm_text = '0x0({})'.format(base_text)
571 # Memory access with base and 32bit offset
572 else: # mod_field == 2
573 bytes_text = '{:02x} 00 00 00 00'.format(modrm)
574 rm_text = '0x0({})'.format(base_text)
575 return bytes_text, rm_text, base_text
576
577
578 def SIBMemoryOperand(modrm, sib, base, index):
579 """Creates replacement tuples list for register-to-memory instructions
580 (base only, no SIB)
581
582 Args:
583 modrm: modrm byte
584 base: register kind for base
585 Returns:
586 bytes_text: replacement for "bytes" group
587 rm_text: textual representation of "rm" argument
588 base_text: textual representation of "base" register
589 index_text: textual representation of "base" register
halyavin 2013/11/18 09:50:47 base->index
khim 2013/11/18 12:10:21 Done.
590 """
591 mod_field = (modrm >> 6) & 0x03
592 scale_field = (sib >> 6) & 0x03
593 index_field = (sib >> 3) & 0x07
594 base_field = (sib & 0x07)
595 index_text = '%' + INDEXES[index][index_field]
596 base_text = '%' + REGISTERS[base][base_field]
597 scale_text = str(1 << scale_field)
598 # If BASE is %rbp and MOD == 0 then index with 32bit offset is used
599 if mod_field == 0 and base_field == validator.REG_RBP:
600 bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
601 # In 64-bit mode this case is displayed as simple absolute address
602 # In 32-bit mode there are another, shorter, form, but it's used
603 # for %rip-relative addressing in 64-bit mode
604 if (options.bitness == 64 and
605 index_text == '%riz' and
606 scale_field == 0):
halyavin 2013/11/18 09:50:47 scale_text == '1'. It will make easier to understa
khim 2013/11/18 12:10:21 Done.
607 rm_text = '0x0'
608 else:
609 rm_text = '0x0(,{},{})'.format(index_text, scale_text)
610 # There are no base in this case
611 base_text = ''
halyavin 2013/11/18 09:50:47 Use %riz/%eiz.
khim 2013/11/18 12:10:21 Done.
612 # Memory access with base and index (no offset)
613 elif mod_field == 0:
614 bytes_text = '{:02x} {:02x}'.format(modrm, sib)
615 rm_text = '({},{},{})'.format(base_text, index_text, scale_text)
616 # Memory access with base, index and 8bit offset
617 elif mod_field == 1:
618 bytes_text = '{:02x} {:02x} 00'.format(modrm, sib)
619 rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
620 # Memory access with base, index and 32bit offset
621 elif mod_field == 2:
622 bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
623 rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
624 # Pretty-printing of access via %rsp (or %r12)
625 if (base_field == validator.REG_RSP and
626 index_text in ('%eiz', '%riz') and
627 scale_field == 0):
halyavin 2013/11/18 09:50:47 scale_text == '1'
khim 2013/11/18 12:10:21 Done.
628 rm_text = ('0x0({})' if mod_field else '({})').format(base_text)
halyavin 2013/11/18 09:50:47 if mod_field in (1, 2): # 8-bit or 32-bit offset
khim 2013/11/18 12:10:21 Grr. Four lines instead of one. Done.
629 return bytes_text, rm_text, base_text, index_text
630
631
632 def ModRMMemoryReplacements(reg=None, writes_to='rm', opcode_bits=0,
633 memory_accessed=True, register_write='ignore',
634 base_r8=False, index_r8=False):
635 """Creates replacement tuples list for register-to-memory instructions
636
637 Args:
638 rm: rm operand kind (see REGISTERS array)
639 reg: reg operand kind (see REGISTERS array) or None if reg is not used
640 writes_to: three-state selector
641 'reg' - instruction uses rm as source, reg as destination
642 'rm' - instruction uses reg as source, rm as destination
643 'both' - instruction writes to both reg and rm
644 opcode_bits: opcode extensions code (used when reg is None)
645 memory_accessed: True if instruction accesses memory
646 register_write: three-state selector
647 'sandbox' - instruction can be used to produce "restricted register"
648 'protect' - instruction can damage output, protect "special registers"
649 'ignore' - instruction does not affect it's operands (e.g. test) or
650 is used with non-GP registers (X87, MMX, XMM, etc)
651 index_r8: True if REX.X bit in the instruction set to 1
652
653 Returns:
654 List of replacement tuples
655 """
656 # Reg field can be used either as reg or as opcode extension, but not both
657 assert reg is None or opcode_bits == 0
658
659 output_key = (options.bitness, reg, writes_to, opcode_bits,
660 base_r8, index_r8, memory_accessed, register_write)
661 if output_key in ModRMMemoryReplacements.replacements:
662 return ModRMMemoryReplacements.replacements[output_key]
663
664 if options.bitness == 32:
665 base = 'eax'
666 index = 'eax'
667 else:
668 base = 'r8' if base_r8 else 'rax'
669 index = 'r8' if index_r8 else 'rax'
670
671 replacements = []
672
673 # Two upper bits of ModR/M byte (mod field) must be equal to 00, 01, or 10
674 # This gives us range from 0x00 to 0xbf but we are going from the end to make
675 # rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
676 if reg is None:
677 # reg field is used as opcode extension
678 byte_range = [byte
679 for byte in range(0xbf, -1, -1)
680 if (byte >> 3) & 0x7 == opcode_bits]
681 else:
682 byte_range = range(0xbf, -1, -1)
683
684 for modrm in byte_range:
685 # If RM field != %rsp then there are no SIB byte
686 if (modrm & 0x07) != validator.REG_RSP:
687 bytes_text, rm_text, base_text = BaseOnlyMemoryOperand(modrm, base)
688 replacement = [bytes_text]
689 input, output = AppendOperandsReplacement(
690 replacement, rm_text, reg, modrm, writes_to)
691 if options.bitness == 64:
692 replacement.append('any_nonspecial')
693 # xchg with memory can not be used to sandbox it's operand, only
694 # instriuction which explicitly writes to reg operand can do that
695 if writes_to == 'reg' and register_write == 'sandbox':
696 replacement.append(output)
697 else:
698 replacement.append(None)
699 if InstructionIsDangerous(input, output, register_write, writes_to,
700 memory_accessed, base_text):
701 continue
702 replacement = tuple(replacement)
703 replacements.append(replacement)
704 else:
705 # If RM field == %rsp then we have SIB byte
706 for sib in xrange(0x100):
707 bytes_text, rm_text, base_text, index_text = SIBMemoryOperand(
708 modrm, sib, base, index)
709 replacement = [bytes_text]
710 input, output = AppendOperandsReplacement(
711 replacement, rm_text, reg, modrm, writes_to)
712 if options.bitness == 64:
713 if not memory_accessed or index_text == '%riz':
714 replacement.append('any_nonspecial')
715 else:
716 if index_r8:
717 # Convert %r8 to %r8, %r9 to %r9d, etc
halyavin 2013/11/18 09:50:47 Convert %r8 to %r8d
khim 2013/11/18 12:10:21 Done.
718 replacement.append(index_text + 'd')
719 else:
720 # Convert %rax to %eax, %rsp to %esp, etc
721 replacement.append('%e' + index_text[2:])
722 # xchg with memory can not be used to sandbox it's operand, only
723 # instriuction which explicitly write to reg operand can do that
halyavin 2013/11/18 09:50:47 instriuction->instruction
khim 2013/11/18 12:10:21 Done.
724 if writes_to == 'reg' and register_write == 'sandbox':
725 replacement.append(output)
726 else:
727 replacement.append(None)
728 if InstructionIsDangerous(input, output, register_write, writes_to,
729 memory_accessed, base_text, index_text):
730 continue
731 replacements.append(tuple(replacement))
732 ModRMMemoryReplacements.replacements[output_key] = tuple(replacements)
733 return ModRMMemoryReplacements.replacements[output_key]
734 ModRMMemoryReplacements.replacements = {}
735
736
737 def PrepareCompressors():
738 global compressors
739 global main_compressors
740 global register_compressors
741 global memory_compressors
742
743 # "Larger" compressors should be tried first, then "smaller" ones.
744 main_compressors = []
745 register_compressors = []
746 memory_compressors = []
747 extra_compressors = []
748
749 # Map from "REX bit off" group of registers to "REX bit on" group of registers
750 r8 = {
751 'al': 'r8b',
752 'ax': 'r8w',
753 'eax': 'r8d',
754 'rax': 'r8',
755 'mm0': 'mmalt',
756 'xmm0': 'xmm8',
757 'ymm0': 'ymm8'
758 }
759
760 if options.bitness == 32:
761 register_kinds = ('al', 'ax', 'eax', 'mm0', 'xmm0', 'ymm0')
762 register_kind_pairs = (
763 ( 'al', 'al'),
764 ( 'ax', 'al'),
765 ( 'ax', 'ax'),
766 ( 'eax', 'al'),
767 ( 'eax', 'ax'),
768 ( 'eax', 'eax'),
769 ( 'eax', 'mm0'),
770 ( 'mm0', 'eax'),
771 ( 'eax', 'xmm0'),
772 ('xmm0', 'eax'),
773 ( 'mm0', 'mm0'),
774 ( 'mm0', 'xmm0'),
775 ('xmm0', 'mm0'),
776 ('xmm0', 'xmm0'),
777 ('xmm0', 'ymm0'),
778 ('ymm0', 'xmm0'),
779 ('ymm0', 'ymm0')
780 )
781 else:
782 register_kinds = ('al', 'spl', 'ax', 'eax', 'rax', 'mm0', 'xmm0', 'ymm0',
783 'r8b', 'r8w', 'r8d', 'r8', 'mmalt', 'xmm8', 'ymm8')
784 register_kind_pairs = (
785 ( 'al', 'al'),
786 ( 'spl', 'spl'), ( 'spl', 'r8b'), ( 'r8b', 'spl'), ( 'r8b', 'r8b'),
787 ( 'ax', 'al'),
788 ( 'ax', 'spl'), ( 'ax', 'r8b'), ( 'r8w', 'spl'), ( 'r8w', 'r8b'),
789 ( 'ax', 'ax'), ( 'ax', 'r8w'), ( 'r8w', 'ax'), ( 'r8w', 'r8w'),
790 ( 'eax', 'al'),
791 ( 'eax', 'spl'), ( 'eax', 'r8b'), ( 'r8d', 'spl'), ( 'r8d', 'r8b'),
792 ( 'eax', 'ax'), ( 'eax', 'r8w'), ( 'r8d', 'ax'), ( 'r8d', 'r8w'),
793 ( 'eax', 'eax'), ( 'eax', 'r8d'), ( 'r8d', 'eax'), ( 'r8d', 'r8d'),
794 ( 'rax', 'al'),
795 ( 'rax', 'spl'), ( 'rax', 'r8b'), ( 'r8', 'spl'), ( 'r8', 'r8b'),
796 ( 'rax', 'ax'), ( 'rax', 'r8w'), ( 'r8', 'ax'), ( 'r8', 'r8w'),
797 ( 'rax', 'eax'), ( 'rax', 'r8d'), ( 'r8', 'eax'), ( 'r8', 'r8d'),
798 ( 'rax', 'rax'), ( 'rax', 'r8'), ( 'r8', 'rax'), ( 'r8', 'r8'),
799 ( 'eax', 'mm0'), ( 'eax','mmalt'), ( 'r8d', 'mm0'), ( 'eax', 'mmalt'),
800 ( 'rax', 'mm0'), ( 'rax','mmalt'), ( 'r8', 'mm0'), ( 'r8', 'mmalt'),
801 ( 'mm0', 'eax'), ('mmalt', 'eax'), ( 'mm0', 'r8d'), ('mmalt', 'r8d'),
802 ( 'mm0', 'rax'), ('mmalt', 'rax'), ( 'mm0', 'r8'), ('mmalt', 'r8'),
803 ( 'eax', 'xmm0'), ( 'eax', 'xmm8'), ( 'r8d', 'xmm0'), ( 'r8d', 'xmm8'),
804 ( 'rax', 'xmm0'), ( 'rax', 'xmm8'), ( 'r8', 'xmm0'), ( 'r8', 'xmm8'),
805 ('xmm0', 'eax'), ('xmm0', 'r8d'), ('xmm8', 'eax'), ('xmm8', 'r8d'),
806 ('xmm0', 'rax'), ('xmm0', 'r8'), ('xmm8', 'rax'), ('xmm8', 'r8'),
807 ( 'mm0', 'mm0'), ('mmalt', 'mm0'), ( 'mm0','mmalt'), ('mmalt','mmalt'),
808 ( 'mm0', 'xmm0'), ('mmalt','xmm0'), ( 'mm0', 'xmm8'), ('mmalt', 'xmm8'),
809 ('xmm0', 'mm0'), ('xmm8', 'mm0'), ('xmm0','mmalt'), ('xmm8', 'mmalt'),
810 ('xmm0', 'xmm0'), ('xmm0', 'xmm8'), ('xmm8', 'xmm0'), ('xmm8', 'xmm8'),
811 ('xmm0', 'ymm0'), ('xmm0', 'ymm8'), ('xmm8', 'ymm0'), ('xmm8', 'ymm8'),
812 ('ymm0', 'xmm0'), ('ymm0', 'xmm8'), ('ymm8', 'xmm0'), ('ymm8', 'xmm8'),
813 ('ymm0', 'ymm0'), ('ymm0', 'ymm8'), ('ymm8', 'ymm0'), ('ymm8', 'ymm8')
814 )
815
816 # Largest compressors: both reg and rm fields are used
817 for reg, rm in register_kind_pairs:
818 start_reg = REGISTERS[reg][0]
819 end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]
820 start_rm = REGISTERS[rm][0]
821 end_rm = REGISTERS[rm][-1 if rm[0:2] != 'r8' else -2]
822 instruction_kinds = [
823 # Normal instructions with two operands (rm to reg)
824 ({'writes_to':'reg'}, '', ' # rm to reg', ''),
825 # Normal instructions with two operands (reg to rm)
826 ({'writes_to':'rm'}, '', ' # reg to rm', '')
827 ]
828 # Lea in 64 bit mode is truly unique instruction for now
829 if options.bitness == 64 and reg in ('eax', 'r8d', 'rax', 'r8'):
830 instruction_kinds = [
831 ({'writes_to':'reg', 'memory_accessed':False,
832 'register_write':'sandbox' if reg in ('eax', 'r8d') else 'protect'},
833 ' # lea', ' # rm to reg; lea', ' # lea')] + instruction_kinds
834 # There are few more forms in 64 bit case (rm to reg)
835 if options.bitness == 64 and reg in ('eax', 'r8d'):
836 # Zero-extending version.
837 instruction_kinds.append(
838 ({'writes_to':'reg', 'register_write':'sandbox'},
839 '', ' # rm to reg', ''))
840 # More forms in 64 bit case (reg to rm)
841 if options.bitness == 64 and rm in ('eax', 'r8d'):
842 # Zero-extending version.
843 instruction_kinds.append(
844 ({'writes_to':'rm', 'register_write':'sandbox'},
845 '', ' # reg to rm', ''))
846 # Zero-extending xchg/xadd
847 instruction_kinds.append(
848 ({'writes_to':'both', 'register_write':'sandbox'},
849 ' # write to both',
850 ' # reg to rm; write to both',
851 ' # write to both'))
852 # Still more forms for 64 bit case (rm to reg).
853 if options.bitness == 64 and reg in ('al', 'spl', 'ax', 'eax', 'rax',
854 'r8b', 'r8w', 'r8d', 'r8'):
855 # Dangerous instructions (rm to reg)
856 instruction_kinds.append(
857 ({'writes_to':'reg', 'register_write':'protect'},
858 '', ' # rm to reg', ''))
859 # Still more forms for 64 bit case (reg to rm)
860 if options.bitness == 64 and rm in ('al', 'spl', 'ax', 'eax', 'rax',
861 'r8b', 'r8w', 'r8d', 'r8'):
862 # Dangerous instructions (reg to rm)
863 instruction_kinds.append(
864 ({'writes_to':'rm', 'register_write':'protect'},
865 '', ' # reg to rm', ''))
866 # Dangerous xchg/xadd
867 instruction_kinds.append(
868 ({'writes_to':'both', 'register_write':'protect'},
869 ' # write to both',
870 ' # reg to rm; write to both',
871 ' # write to both'))
872 # 3DNow! instructions
873 instruction_kinds.append(
874 ({'writes_to':'reg', '3dnow':'yes'}, '', ' # rm to reg', ''))
875 for args, notes, notes_register, notes_memory in instruction_kinds:
876 regex = '(?: 00)*'
877 # Additional byte is opcode extension with 3DNow! instructions.
878 if '3dnow' in args:
879 regex = ' [0-9a-fA-F][0-9a-fA-F]'
880 args.pop('3dnow')
881 regex += ' (?:lock )?\\w* (?:\\$0x0,|\\$0x0,\\$0x0,|%cl,|%xmm0,)?'
882 # We only need to process ModR/M+SIB '04 04' or '04 07' here
883 if options.bitness == 32:
884 regex_mem = '\\(%esp,%eax,1\\)'
885 else:
886 regex_mem = '\\((?:%rsp|%r15),(?:%rax|%r8),1\\)'
887 output = None
888 output_note = None
889 if args['writes_to'] == 'reg':
890 regex += '(%' + REGISTERS[rm][0] + '|' + regex_mem + ')'
891 regex += ',(%' + REGISTERS[reg][0] + ')'
892 if 'register_write' in args and args['register_write'] == 'sandbox':
893 assert reg in ('eax', 'r8d')
894 output = '%' + reg + '|None'
895 output_note = '[%eax..%edi]' if reg == 'eax' else '[%r8d..%r14d]'
896 subst = (
897 'XX', '[%{}..%{} or memory]'.format(start_rm, end_rm),
898 '[%{}..%{}]'.format(start_reg, end_reg), notes)
899 subst_register = (
900 'XX', '[%{}..%{}]'.format(start_rm, end_rm),
901 '[%{}..%{}]'.format(start_reg, end_reg), notes_register)
902 subst_memory = (
903 'XX', '[memory]',
904 '[%{}..%{}]'.format(start_reg, end_reg), notes_memory)
905 else:
906 regex += '(%' + REGISTERS[reg][0] + ')'
907 regex += ',(%' + REGISTERS[rm][0] + '|' + regex_mem + ')'
908 if 'register_write'in args and args['register_write'] == 'sandbox':
909 assert rm in ('eax', 'r8d')
910 output = '%' + rm + '|None'
911 output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'
912 subst = (
913 'XX', '[%{}..%{}]'.format(start_reg, end_reg),
914 '[%{}..%{} or memory]'.format(start_rm, end_rm), notes)
915 subst_register = (
916 'XX', '[%{}..%{}]'.format(start_reg, end_reg),
917 '[%{}..%{}]'.format(start_rm, end_rm), notes_register)
918 subst_memory = (
919 'XX', '[%{}..%{}]'.format(start_reg, end_reg),
920 '[memory]', notes_memory)
921 regex += '.*'
922 if options.bitness == 64:
923 regex += '; input_rr=(%eax|%r8d|any_nonspecial)'
924 regex += '; output_rr=({})'.format(output)
925 if 'memory_accessed' in args:
926 input_note = 'any_nonspecial'
927 input_note_r8 = 'any_nonspecial'
928 else:
929 input_note = '[%eax..%edi]'
930 input_note_r8 = '[%r8d..%r15d]'
931 subst_r8 = subst[0:-1] + (input_note_r8, output_note) + subst[-1:]
932 subst = subst[0:-1] + (input_note, output_note) + subst[-1:]
933 subst_memory_r8 = subst_memory[0:-1] + (
934 input_note_r8, output_note) + subst_memory[-1:]
935 subst_memory = subst_memory[0:-1] + (
936 input_note, output_note) + subst_memory[-1:]
937 subst_register = subst_register[0:-1] + (
938 'any_nonspecial', output_note) + subst_register[-1:]
939 regex += '()'
940 base_r8 = rm in r8.values()
941 memory_replacement = ModRMMemoryReplacements(
942 reg=reg, base_r8=base_r8, **args)
943 memory_compressors.append(Compressor(
944 '.*?(04 0[47])' + regex, subst_memory, memory_replacement))
945 if options.bitness == 64:
946 memory_replacement_r8 = ModRMMemoryReplacements(
947 reg=reg, base_r8=base_r8, index_r8=True, **args)
948 memory_compressors.append(Compressor(
949 '.*?(04 0[47])' + regex, subst_memory_r8, memory_replacement_r8))
950 # Instructions with no memory access are instructions which are doing
951 # something with memory address (e.g. lea) and as such they don't have
952 # non-memory forms.
953 if not 'memory_accessed' in args:
954 register_replacement = ModRMRegisterReplacements(rm=rm, reg=reg, **args)
955 register_compressors.append(Compressor(
956 '.*?(c0)' + regex, subst_register, register_replacement))
957 main_replacement = register_replacement + memory_replacement
958 main_compressors.append(Compressor(
959 '.*?(04 0[47])' + regex, subst, main_replacement))
960 if options.bitness == 64:
961 main_replacement_r8 = register_replacement + memory_replacement_r8
962 main_compressors.append(Compressor(
963 '.*?(04 0[47])' + regex, subst_r8, main_replacement_r8))
964
965 # Smaller compressors: only rm field is used.
966 for rm in register_kinds:
967 start_rm = REGISTERS[rm][0]
968 end_rm = REGISTERS[rm][-1 if rm[0:2] != 'r8' else -2]
969 for opcode_bits in xrange(8):
970 XX_byte_mark = 'XX/' + str(opcode_bits)
971 instruction_kinds = [
972 # The most basic form
973 ({}, '', '', '')
974 ]
975 if options.bitness == 64:
976 # No memory access (e.g. prefetch)
977 instruction_kinds = [
978 ({'memory_accessed':False}, '', '', '')] + instruction_kinds
979 # More forms in 64 bit case.
980 if options.bitness == 64 and rm in ('eax', 'r8d'):
981 # Zero-extending version.
982 instruction_kinds.append(
983 ({'register_write':'sandbox'}, '', '', ''))
984 # Still more forms for 64 bit case (reg to rm).
985 if options.bitness == 64 and rm in ('al', 'spl', 'ax', 'eax', 'rax',
986 'r8b', 'r8w', 'r8d', 'r8'):
987 # Dangerous instructions.
988 instruction_kinds.append(
989 ({'register_write':'protect'}, '', '', ''))
990 for args, notes, notes_register, notes_memory in instruction_kinds:
991 subst = (XX_byte_mark, '[%{}..%{} or memory]'.format(start_rm, end_rm),
992 notes)
993 subst_register = (XX_byte_mark, '[%{}..%{}]'.format(start_rm, end_rm),
994 notes_register)
995 subst_memory = (XX_byte_mark, '[memory]',
996 notes_memory)
997 regex = ('(?: 00)* (?:lock )?\\w* (?:\\$0x0,|%cl,)?'
998 '(%' + REGISTERS[rm][0] + '|' + regex_mem + ').*')
999 output = None
1000 output_note = None
1001 if options.bitness == 64:
1002 if 'register_write' in args and args['register_write'] == 'sandbox':
1003 assert rm in ('eax', 'r8d')
1004 output = '%' + rm + '|None'
1005 output_note = '[%eax..%edi]' if rm == 'eax' else '[%r8d..%r14d]'
1006 regex += '; input_rr=(%eax|%r8d|any_nonspecial)'
1007 regex += '; output_rr=({})'.format(output)
1008 if 'memory_accessed' in args:
1009 input_note = 'any_nonspecial'
1010 input_note_r8 = 'any_nonspecial'
1011 else:
1012 input_note = '[%eax..%edi]'
1013 input_note_r8 = '[%r8d..%r15d]'
1014 subst_r8 = subst[0:-1] + (input_note_r8, output_note) + subst[-1:]
1015 subst = subst[0:-1] + (input_note, output_note) + subst[-1:]
1016 subst_memory_r8 = subst_memory[0:-1] + (
1017 input_note_r8, output_note) + subst_memory[-1:]
1018 subst_memory = subst_memory[0:-1] + (
1019 input_note, output_note) + subst_memory[-1:]
1020 subst_register = subst_register[0:-1] + (
1021 'any_nonspecial', output_note) + subst_register[-1:]
1022 regex += '()'
1023 base_r8 = rm in r8.values()
1024 memory_replacement = ModRMMemoryReplacements(
1025 reg=None, base_r8=base_r8, opcode_bits=opcode_bits, **args)
1026 memory_compressors.append(Compressor(
1027 '.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8) + regex,
1028 subst_memory, memory_replacement))
1029 if options.bitness == 64:
1030 memory_replacement_r8 = ModRMMemoryReplacements(
1031 reg=None, base_r8=base_r8, index_r8=True, opcode_bits=opcode_bits,
1032 **args)
1033 memory_compressors.append(Compressor(
1034 '.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8) + regex,
1035 subst_memory_r8, memory_replacement_r8))
1036 # Instructions with no memory access are instructions which are doing
1037 # something with memory address (e.g. prefetch) and as such they don't
1038 # have non-memory forms.
1039 if not 'memory_accessed' in args:
1040 register_replacement = ModRMRegisterReplacements(
1041 reg=None, rm=rm, opcode_bits=opcode_bits, **args)
1042 register_compressors.append(Compressor(
1043 '.*?({:02x})'.format(0xc0 + opcode_bits * 8) + regex,
1044 subst_register, register_replacement))
1045 main_replacement = register_replacement + memory_replacement
1046 main_compressors.append(Compressor(
1047 '.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8) + regex,
1048 subst, main_replacement))
1049 if options.bitness == 64:
1050 main_replacement_r8 = register_replacement + memory_replacement_r8
1051 main_compressors.append(Compressor(
1052 '.*?({:02x} 0[47])'.format(0x04 + opcode_bits * 8) + regex,
1053 subst_r8, main_replacement_r8))
1054
1055 # Even smaller compressors: only low 3 bits of opcode are used.
1056 for reg in register_kinds + ('st(0)',):
1057 start_reg = REGISTERS[reg][0]
1058 end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]
1059 for opcode in xrange(8):
1060 for text1, text2, nibble in (
1061 ('[0..7]', '[8..f]', xrange(8)),
1062 ('[012367]', '[89abef]', (0, 1, 2, 3, 6, 7)),
1063 ('[0..6]', '[8..e]', xrange(7))
1064 ):
1065 # Note that we use 2nd line here to avoid ambiguity when opcode is 0x00
1066 extra_compressors.append(Compressor(
1067 '.*?[0-9a-fA-F](1)(?: 00)*'
1068 ' \\w* (?:\\$0x0,|%ax,|%st,)?'
1069 '(%(?:' + REGISTERS_RE[reg][1] + ')).*()',
1070 (text1, '[%{}..%{}]'.format(start_reg, end_reg), ''),
1071 tuple(('{:x}'.format(n), '%' + REGISTERS[reg][n])
1072 for n in nibble)))
1073 extra_compressors.append(Compressor(
1074 '.*?[0-9a-fA-F](8)(?: 00)*'
1075 ' \\w* (?:\\$0x0,|%ax,|%st,)?'
1076 '(%(?:' + REGISTERS_RE[reg][0] + ')).*()',
1077 (text2, '[%{}..%{}]'.format(start_reg, end_reg), ''),
1078 tuple(('{:x}'.format(n + 8), '%' + REGISTERS[reg][n])
1079 for n in nibble)))
1080 # Another version for 64 bit case
1081 if options.bitness == 64 and reg in ('eax', 'r8d'):
1082 extra_compressors.append(Compressor(
1083 '.*?[0-9a-fA-F](1)(?: 00)*'
1084 ' \\w* (?:\\$0x0,|%ax,|%st,)?'
1085 '(%(?:' + REGISTERS_RE[reg][1] + ')).*'
1086 'output_rr=(%(?:'+ REGISTERS_RE[reg][1] + ')).*()',
1087 tuple([text1] + ['[%{}..%{}]'.format(start_reg, end_reg)] * 2 +
1088 ['']),
1089 tuple(['{:x}'.format(n)] + ['%' + REGISTERS[reg][n]] * 2
1090 for n in nibble)))
1091 extra_compressors.append(Compressor(
1092 '.*?[0-9a-fA-F](8)(?: 00)*'
1093 ' \\w* (?:\\$0x0,|%ax,|%st,)?'
1094 '(%(?:' + REGISTERS_RE[reg][0] + ')).*'
1095 'output_rr=(%(?:'+ REGISTERS_RE[reg][0] + ')).*()',
1096 tuple([text2] + ['[%{}..%{}]'.format(start_reg, end_reg)] * 2 +
1097 ['']),
1098 tuple(['{:x}'.format(n + 8)] + ['%' + REGISTERS[reg][n]] * 2
1099 for n in nibble)))
1100 compressors = (main_compressors + memory_compressors + register_compressors +
1101 extra_compressors)
1102
1103 # Special compressors: will handle some cosmetic issues.
1104 #
1105 # SETxx ignores reg field and thus are described as many separate instructions
1106 compressors.append(Compressor(
1107 '.*0f 9[0-9a-fA-F] XX(/[0-7]) set.*()', ('', ''),
1108 [('/' + str(i), ) for i in range(8)]))
1109 # BSWAP is described with opcode "0f c8+r", not "0f /1" in manual
1110 if options.bitness == 32:
1111 compressors.append(Compressor(
1112 '.*(XX/1) bswap.*ax.*()', ('c[8..f]', ''), [('XX/1', )]))
1113 else:
1114 compressors.append(Compressor(
1115 '.*(XX/1) bswap.*ax.*()', ('c[89abef]', ''), [('XX/1', )]))
1116 compressors.append(Compressor(
1117 '.*(XX/1) bswap.*r8.*()', ('c[8..e]', ''), [('XX/1', )]))
1118 # Add mark '# write to both' to certain versions of CMPXCHG, XADD, and XCHG
1119 if options.bitness == 64:
1120 compressors.append(Compressor(
1121 '.* (?:cmpxchg|xadd|xchg).*%al\\.\\.%bh[^#]*()$',
1122 (' # write to both', ), ((), )))
1123 # "and $0xe0,[%eax..%edi]" is treated specially which means that we list all
1124 # versions of and "[$0x1..$0xff],[%eax..%edi]" separately here.
1125 # Without this rule these ands comprise 2/3 of the whole output!
1126 if options.bitness == 32:
1127 compressors.append(Compressor(
1128 '.*83 (e0 01 and \\$0x1,%eax)()',
1129 ('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', ' # special and'),
1130 [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS['eax'][r]), )
1131 for i in range(0x01, 0x100) for r in range(8)] +
1132 [('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]', )]))
1133 else:
1134 for reg in ('eax', 'r8d'):
1135 start_reg = REGISTERS[reg][0]
1136 end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]
1137 for index_reg in ('eax', 'r8d'):
1138 start_index = REGISTERS[index_reg][0]
1139 end_index = REGISTERS[index_reg][-1]
1140 compressors.append(Compressor(
1141 '.*83 (e0 01 and \\$0x1,%' + reg + ').*'
1142 'input_rr=(any_nonspecial); output_rr=(%' + reg + ')()',
1143 ('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,
1144 end_reg), '[%{}..%{}]'.format(start_index, end_index),
1145 '[%{}..%{}]'.format(start_reg, end_reg),
1146 ' # special and'),
1147 [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS[reg][r]),
1148 'any_nonspecial', '%' + REGISTERS[reg][r])
1149 for i in range(0x01, 0x100) for r in range(7 + (reg == 'eax'))] +
1150 [('XX/4 00 and[l]? $0x0,[%{}..%{} or memory]'.format(start_reg,
1151 end_reg), '[%{}..%{}]'.format(start_index, end_index),
1152 '[%{}..%{}]'.format(start_reg, end_reg))]))
1153
1154 # "and $e0" and similar are used to align %rsp. All negative values are
1155 # accepted by validator and there are 127 of these.
1156 # Consolidate them into one line.
1157 if options.bitness == 64:
1158 compressors.append(Compressor(
1159 '.*(?:81|83) (?:e4|e5) (80) (?:00 00 00 |) and \\$0x(80),%r[bs]p.*()',
1160 ('[80..ff]', '[80..ff]', ' # alignment and'),
1161 [('{:02x}'.format(i), '{:02x}'.format(i)) for i in range(0x80, 0x100)]))
1162
1163 # Merge memory and non-memory access
1164 if options.bitness == 32:
1165 letters_and_registers = (('b', 'al', ''), ('w', 'ax', ''), ('l', 'eax', ''))
1166 else:
1167 letters_and_registers = (
1168 ('b', 'al', 'eax'), ('b', 'spl', 'eax'), ('b', 'r8b', 'r8d'),
1169 ('w', 'ax', 'eax'), ('w', 'r8w', 'r8d'),
1170 ('l', 'eax', 'eax'), ('l', 'r8d', 'r8d'),
1171 ('q', 'rax', 'eax'), ('q', 'r8', 'r8d')
1172 )
1173 for letter, reg, out_reg in letters_and_registers:
1174 start_reg = REGISTERS[reg][0]
1175 end_reg = REGISTERS[reg][-1 if reg[0:2] != 'r8' else -2]
1176 all_regs = '[%{}..%{}]'.format(start_reg, end_reg)
1177 regs_mark = '[%{}..%{} or memory]'.format(start_reg, end_reg)
1178 if options.bitness == 64:
1179 start_out = REGISTERS[out_reg][0]
1180 end_out = REGISTERS[out_reg][-1 if out_reg[0:2] != 'r8' else -2]
1181 out_regs = '[%{}..%{}]'.format(start_out, end_out)
1182 for notes in ('', ' # rm to reg', ' # reg to rm'):
1183 compressors.append(Compressor(
1184 '.* \\w*(' + letter + ') .*(\\[memory]).*()()',
1185 ('[{}]?'.format(letter), regs_mark, '', ''),
1186 ((letter, '[memory]', ''), ('', all_regs, notes))))
1187 if options.bitness == 64:
1188 for index_reg in ('eax', 'r8d'):
1189 start_index = REGISTERS[index_reg][0]
1190 end_index = REGISTERS[index_reg][-1]
1191 index_regs = '[%{}..%{}]'.format(start_index, end_index)
1192 for output_rrs in ((None, out_regs), (out_regs, None), (None, None)):
1193 compressors.append(Compressor(
1194 '.* \\w*(' + letter + ') .*(\\[memory]).*; '
1195 'input_rr=(\\[%[a-z0-9]*..%[a-z0-9]*\\]); '
1196 'output_rr=(\\[%[a-z0-9]*..%[a-z0-9]*\\]|None)()()',
1197 ('[{}]?'.format(letter), regs_mark, index_regs,
1198 output_rrs[0] if output_rrs[0] is not None else output_rrs[1],
1199 '', ''),
1200 ((letter, '[memory]', index_regs, output_rrs[0], ''),
1201 ('', all_regs, 'any_nonspecial', output_rrs[1], notes))))
1202
1203 # REX compressors
1204 if options.bitness == 64:
1205 # First pretty complex set of compressors to combine versions of REX with
1206 # three lowest bits in different states.
1207 register_kind_pairs = (
1208 ( None, None),
1209 ( 'al', 'al'), ( 'al', None), (None, 'al'),
1210 ( 'ax', 'al'), ( 'al', 'ax'),
1211 ( 'ax', 'ax'), ( 'ax', None), (None, 'ax'),
1212 ( 'eax', 'al'), ( 'al', 'eax'),
1213 ( 'eax', 'ax'), ( 'ax', 'eax'),
1214 ( 'eax', 'eax'), ( 'eax', None), (None, 'eax'),
1215 ( 'rax', 'al'), ( 'al', 'rax'),
1216 ( 'rax', 'ax'), ( 'ax', 'rax'),
1217 ( 'rax', 'eax'), ( 'eax', 'rax'),
1218 ( 'rax', 'rax'), ( 'rax', None), (None, 'rax'),
1219 ( 'eax', 'mm0'), ( 'mm0', 'eax'),
1220 ( 'rax', 'mm0'), ( 'mm0', 'rax'),
1221 ( 'mm0', 'eax'), ( 'eax', 'mm0'),
1222 ( 'mm0', 'rax'), ( 'rax', 'mm0'),
1223 ( 'eax', 'xmm0'),
1224 ( 'rax', 'xmm0'),
1225 ('xmm0', 'eax'),
1226 ('xmm0', 'rax'),
1227 ( 'mm0', 'mm0'), ( 'mm0', None), (None, 'mm0'),
1228 ( 'mm0', 'xmm0'),
1229 ('xmm0', 'mm0'),
1230 ('xmm0', 'xmm0'),
1231 ('xmm0', 'ymm0'), ('xmm0', None), (None, 'xmm0'),
1232 ('ymm0', 'xmm0'),
1233 ('ymm0', 'ymm0'), ('ymm0', None), (None, 'ymm0'),
1234 )
1235 for reg, rm in register_kind_pairs:
1236 for last_reg, last_rm in ((-1, -1), (-1, -2), (-2, -1), (-2, -2)):
1237 if reg:
1238 start_reg = REGISTERS[reg][0]
1239 start_reg8 = REGISTERS[r8[reg]][0]
1240 end_reg = REGISTERS[reg][-1]
1241 end_reg0 = 'dil' if reg == 'al' else end_reg
1242 end_reg8 = REGISTERS[r8[reg]][last_reg]
1243 reg_regex = '\\[(%' + start_reg + '\\.\\.%' + end_reg + ')]'
1244 reg_regex0 = '\\[(%' + start_reg + '\\.\\.%' + end_reg0 + ')]'
1245 elif last_reg == -2:
1246 continue
1247 if rm:
1248 start_rm = REGISTERS[rm][0]
1249 start_rm8 = REGISTERS[r8[rm]][0]
1250 end_rm = REGISTERS[rm][-1]
1251 end_rm0 = 'dil' if rm == 'al' else end_rm
1252 end_rm8 = REGISTERS[r8[rm]][last_rm]
1253 rm_regex = ('\\[(%' + start_rm + '\\.\\.%' + end_rm + ')'
1254 '(?: or memory)?]')
1255 rm_regex0 = ('\\[(%' + start_rm + '\\.\\.%' + end_rm0 + ')'
1256 '(?: or memory)?]')
1257 elif last_rm == -2:
1258 continue
1259 for rexw in (True, False):
1260 for input_rr in (True, False):
1261 for output_rr in (True, False) if reg or rm else (None, ):
1262 for rm_to_reg in (True, False) if reg and rm else (None, ):
1263 # Legacy prefixes
1264 regex = '.*:(?: 26| 2e| 36| 3e| 64| 65| 66| 67| f0| f2| f3)*'
1265 # REX
1266 regex += '( 48).*' if rexw else '( 40|).*'
1267 # Replacement text
1268 replacement_tuple = (
1269 ' [REX:48..4f]' if rexw else ' [REX:40..47]?', )
1270 if reg:
1271 replacement_regs = '%{}..%{}'.format(start_reg, end_reg8)
1272 if rm:
1273 replacement_rms = '%{}..%{}'.format(start_rm, end_rm8)
1274 # Instruction arguments
1275 if not reg and not rm:
1276 pass
1277 elif not reg and rm:
1278 if rexw:
1279 regex += rm_regex0 + '.*'
1280 else:
1281 regex += rm_regex + '.*'
1282 replacement_tuple += (replacement_rms, )
1283 elif reg and not rm:
1284 if rexw:
1285 regex += reg_regex0 + '.*'
1286 else:
1287 regex += reg_regex + '.*'
1288 replacement_tuple += (replacement_regs, )
1289 elif rm_to_reg:
1290 if rexw:
1291 regex += rm_regex0 + ',' + reg_regex0 + '.*'
1292 else:
1293 regex += rm_regex + ',' + reg_regex + '.*'
1294 replacement_tuple += (replacement_rms, replacement_regs)
1295 else:
1296 if rexw:
1297 regex += reg_regex0 + ',' + rm_regex0 + '.*'
1298 else:
1299 regex += reg_regex + ',' + rm_regex + '.*'
1300 replacement_tuple += (replacement_regs, replacement_rms)
1301 # Input and output restricted registers
1302 if input_rr:
1303 regex += 'input_rr=\\[(%eax\\.\\.%edi)].*'
1304 replacement_tuple += ('%eax..%r15d', )
1305 if output_rr:
1306 regex += 'output_rr=\\[(%eax\\.\\.%edi)].*'
1307 replacement_tuple += ('%eax..%r14d', )
1308 regex += '()'
1309 replacement_tuple += ('', )
1310 # Replacement cases
1311 replacement_tuples = ()
1312 for byte in (range(0x48, 0x50)
1313 if rexw
1314 else range(0x40, 0x48) + ['']):
1315 replacement_case = (
1316 ' {:02x}'.format(byte) if byte else byte, )
1317 if byte:
1318 if rm:
1319 if byte & 0x1:
1320 replacement_rms = '%{}..%{}'.format(start_rm8, end_rm8)
1321 else:
1322 replacement_rms = '%{}..%{}'.format(start_rm, end_rm0)
1323 if byte & 0x2:
1324 replacement_index = '%r8d..%r15d'
1325 else:
1326 replacement_index = '%eax..%edi'
1327 if reg:
1328 if byte & 0x4:
1329 replacement_regs = '%{}..%{}'.format(start_reg8,
1330 end_reg8)
1331 else:
1332 replacement_regs = '%{}..%{}'.format(start_reg,
1333 end_reg0)
1334 else:
1335 if rm:
1336 replacement_rms = '%{}..%{}'.format(start_rm, end_rm)
1337 replacement_index = '%eax..%edi'
1338 if reg:
1339 replacement_regs = '%{}..%{}'.format(start_reg, end_reg)
1340 if not reg and not rm:
1341 pass
1342 elif not reg and rm:
1343 replacement_case += (replacement_rms, )
1344 if byte:
1345 final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'
1346 else:
1347 final_rr = '%eax..%edi'
1348 elif reg and not rm:
1349 replacement_case += (replacement_regs, )
1350 if byte:
1351 final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'
1352 else:
1353 final_rr = '%eax..%edi'
1354 elif rm_to_reg:
1355 replacement_case += (replacement_rms, replacement_regs)
1356 if byte:
1357 final_rr = '%r8d..%r14d' if byte & 0x4 else '%eax..%edi'
1358 else:
1359 final_rr = '%eax..%edi'
1360 else:
1361 replacement_case += (replacement_regs, replacement_rms)
1362 if byte:
1363 final_rr = '%r8d..%r14d' if byte & 0x1 else '%eax..%edi'
1364 else:
1365 final_rr = '%eax..%edi'
1366 if input_rr: replacement_case += (replacement_index, )
1367 if output_rr: replacement_case += (final_rr, )
1368 replacement_tuples += (replacement_case, )
1369 compressors.append(Compressor(
1370 regex, replacement_tuple, replacement_tuples))
1371 # This is pretty simple compressor to combine two lines with different REX.W
1372 # bits (only if they are otherwise identical).
1373 compressors.append(Compressor(
1374 '.*(\\[REX:40\\.\\.47]\\?).*()', ('[REX:40..4f]?', ''),
1375 (('[REX:40..47]?', ), ('[REX:48..4f]', ))))
1376
1377
1378 def ShowProgress(rule, instruction):
1379 if rule not in ShowProgress.rules_shown:
1380 first_print = True
1381 ShowProgress.rules_shown[rule]=len(ShowProgress.rules_shown)
1382 else:
1383 first_print = False
1384 print >> sys.stderr, '-------- Compressed --------'
1385 print >> sys.stderr, 'Rule:', ShowProgress.rules_shown[rule]
1386 print >> sys.stderr, '--------'
1387 compressor = compressors[rule]
1388 match = compressor.regex.match(instruction)
1389 assert match
1390 format_str = CompressionTemplate(instruction, match, '{{{}}}')
1391 replacements = sorted(format_str.format(*replacement)
1392 for replacement in compressor.replacements)
1393 if len(compressor.replacements) <= 4 or first_print:
1394 for replacement in replacements:
1395 print >> sys.stderr, replacement
1396 else:
1397 print >> sys.stderr, replacements[0]
1398 print >> sys.stderr, '...'
1399 print >> sys.stderr, replacements[-1]
1400 print >> sys.stderr, '--------'
1401 print >> sys.stderr, 'Compressed', (
1402 format_str + '{{{}}}').format(*compressor.subst)
1403 ShowProgress.rules_shown = {}
1404
1405
1406 def main():
1407 # We are keeping these global to share state graph and compressors
1408 # between workers spawned by multiprocess. Passing them every time is slow.
1409 global options, xml_file
1410 global dfa
1411 global worker_validator
1412 options, xml_file = ParseOptions()
1413 dfa = dfa_parser.ParseXml(xml_file)
1414 worker_validator = validator.Validator(
1415 validator_dll=options.validator_dll,
1416 decoder_dll=options.decoder_dll)
1417 PrepareCompressors()
1418
1419 assert dfa.initial_state.is_accepting
1420 assert not dfa.initial_state.any_byte
1421
1422 print >> sys.stderr, len(dfa.states), 'states'
1423
1424 num_suffixes = dfa_traversal.GetNumSuffixes(dfa.initial_state)
1425
1426 # We can't just write 'num_suffixes[dfa.initial_state]' because
1427 # initial state is accepting.
1428 total_instructions = sum(
1429 num_suffixes[t.to_state]
1430 for t in dfa.initial_state.forward_transitions.values())
1431 print >> sys.stderr, total_instructions, 'regular instructions total'
1432
1433 tasks = dfa_traversal.CreateTraversalTasks(dfa.states, dfa.initial_state)
1434 print >> sys.stderr, len(tasks), 'tasks'
1435
1436 pool = multiprocessing.Pool()
1437
1438 results = pool.imap(Worker, tasks)
1439
1440 total = 0
1441 num_valid = 0
1442 full_output = set()
1443 for prefix, count, valid_count, output, trace in results:
1444 print >> sys.stderr, 'Prefix:', ', '.join(map(hex, prefix))
1445 total += count
1446 num_valid += valid_count
1447 full_output |= output
1448 for rule, instruction in trace:
1449 ShowProgress(rule, instruction)
1450 for instruction in sorted(Compressed(full_output,
1451 compressors,
1452 ShowProgress)):
1453 print instruction
1454
1455 print >> sys.stderr, total, 'instructions were processed'
1456 print >> sys.stderr, num_valid, 'valid instructions'
1457
1458
1459 if __name__ == '__main__':
1460 main()
OLDNEW
« no previous file with comments | « no previous file | src/trusted/validator_ragel/testdata/32bit_regular.golden » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698