Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(85)

Side by Side Diff: src/trusted/validator_ragel/compress_regular_instructions.py

Issue 49183002: Regular instructions golden file test. Base URL: svn://svn.chromium.org/native_client/trunk/src/native_client/
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 # Copyright (c) 2013 The Native Client Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 """
6 Traverse the validator's DFA, collect all "normal" instructions and then
7 compress output. Note: "anybyte fields" (immediates and displacements)
8 are always filled with zeros. Otherwise processing of sextillions (sic!)
9 of possibilities will take too long.
10
11 Each rule is applied only when all variants are accepted by validator.
12 The following compression rules are present:
13
14 1. Compress ModR/M (+SIB & displacement).
15 Instruction: 00 00 add %al,(%rax)
16 ...
17 Instruction: 00 ff add %bh,%bh
18 becomes
19 Instruction: 00 XX add [%al..%bh],[%al..%bh or memory]
20
21 1a. Compress ModR/M (+SIB & displacement) memory-only.
22 Instruction: f0 01 00 lock add %eax,(%eax)
23 ...
24 Instruction: f0 01 bf 00 00 00 00 lock add %edi,0x0(%edi)
25 becomes
26 Instruction: f0 01 XX lock add [%eax..%edi],[memory]
27
28 1b. Compress ModR/M register only.
29 Instruction: 66 0f 50 c0 movmskpd %xmm0,%eax
30 ...
31 Instruction: 66 0f 50 ff movmskpd %xmm7,%edi
32 becomes
33 Instruction: 66 0f 50 XX movmskpd [%xmm0..%xmm7],[%eax..%edi]
34
35 2. Compress ModR/M (+SIB & displacement) with opcode extension.
36 Instruction: 0f 90 00 seto (%eax)
37 ...
38 Instruction: 0f 90 c7 seto %bh
39 becomes
40 Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
41
42 2a. Compress ModR/M (+SIB & displacement) memory-only with opcode extension.
43 Instruction: f0 ff 00 lock incl (%eax)
44 ...
45 Instruction: f0 ff 84 ff 00 00 00 00 lock incl 0x0(%edi,%edi,8)
46 becomes
47 Instruction: f0 ff XX/0 lock incl [memory]
48
49 2b. Compress ModR/M register-only with opcode extension.
50 Instruction: 0f 71 d0 00 psrlw $0x0,%mm0
51 ...
52 Instruction: 0f 71 d7 00 psrlw $0x0,%mm7
53 becomes
54 Instruction: 0f 71 XX/2 00 psrlw $0x0,[%mm0..%mm7]
55
56 3. Compress register-in-opcode.
57 Instruction: d9 c0 fld %st(0)
58 ...
59 Instruction: d9 c7 fld %st(7)
60 becomes
61 Instruction: d9 c[0..7] fld [%st(0)..%st(7)]
62
63 Only applies if all possible register accesses are accepted by validator.
64
65 4. Special compressor for "set" instruction.
66 Instruction: 0f 90 XX/0 seto [%al..%bh or memory]
67 ...
68 Instruction: 0f 90 XX/7 seto [%al..%bh or memory]
69 becomes
70 Instruction: 0f 90 XX seto [%al..%bh or memory]
71 """
72
73 import itertools
74 import multiprocessing
75 import optparse
76 import os
77 import re
78 import subprocess
79 import sys
80 import tempfile
81 import traceback
82
83 import dfa_parser
84 import dfa_traversal
85 import validator
86
87
# Register names in 'natural' order (as defined by IA32/x86-64 ABI).
#
# The x86-64 ABI splits all registers into groups of 8 because a 3-bit field
# in the opcode, ModR/M, and/or SIB byte is used to encode them.
#
# In most cases there are 16 registers of a given kind and two such groups,
# but there are a couple of exceptions:
#   1. There are 20 8-bit registers and three groups (two of them overlap).
#   2. There are only eight X87 and MMX registers, thus the two groups are
#      identical.
#
# A typical register from a group names the whole group.  Most groups are
# named by their first register, but the 'spl' group is named by its fifth
# register because its first four registers are the same as in the 'al' group.
# The mnemonic name 'mmalt' represents the "evil mirror" of the 'mm0' group:
# it is the same as 'mm0', but implies use of the appropriate REX bit.
REGISTERS = {
    'al': ['al', 'cl', 'dl', 'bl', 'ah', 'ch', 'dh', 'bh'],
    'spl': ['al', 'cl', 'dl', 'bl', 'spl', 'bpl', 'sil', 'dil'],
    'ax': ['ax', 'cx', 'dx', 'bx', 'sp', 'bp', 'si', 'di'],
    'eax': ['eax', 'ecx', 'edx', 'ebx', 'esp', 'ebp', 'esi', 'edi'],
    'rax': ['rax', 'rcx', 'rdx', 'rbx', 'rsp', 'rbp', 'rsi', 'rdi'],
    'r8b': ['r%db' % N for N in range(8, 16)],
    'r8w': ['r%dw' % N for N in range(8, 16)],
    'r8d': ['r%dd' % N for N in range(8, 16)],
    'r8': ['r%d' % N for N in range(8, 16)],
    'mm0': ['mm%d' % N for N in range(8)],
    'mmalt': ['mm%d' % N for N in range(8)],
    'st(0)': ['st(%d)' % N for N in range(8)],
    'xmm0': ['xmm%d' % N for N in range(8)],
    'xmm8': ['xmm%d' % N for N in range(8, 16)],
    'ymm0': ['ymm%d' % N for N in range(8)],
    'ymm8': ['ymm%d' % N for N in range(8, 16)],
}
121
122
# Single-byte filler appended after the instruction under test.
NOP = 0x90


def PadToBundleSize(bytes):
  """Return `bytes` extended with NOPs up to validator.BUNDLE_SIZE entries.

  Args:
      bytes: list of byte values; must not be longer than a bundle.
  Returns:
      New list of exactly validator.BUNDLE_SIZE byte values.
  """
  padding_length = validator.BUNDLE_SIZE - len(bytes)
  assert padding_length >= 0
  return bytes + [NOP] * padding_length
129
130
# In x86-64 mode there is a so-called 'restricted register' which is used to
# tie two groups together.  Some instructions require a particular value to
# be stored in this variable, while some accept any non-special restricted
# register (%ebp and %esp are special because they can only be accepted by
# a few 'special' instructions).
#
# You can find more details in the "NaCl SFI model on x86-64 systems" manual.
#
# We try to feed all possible 'restricted registers' into the validator and
# then classify the instruction using this map.  The key is a bit mask with
# one bit per accepted initial restricted register.  If the set of acceptable
# 'restricted registers' is not here, then it's an error in the validator.
ACCEPTABLE_X86_64_INPUTS = dict(
    (1 << bit, 'input_rr=%' + name)
    for bit, name in enumerate((
        'eax', 'ecx', 'edx', 'ebx', 'esp', 'ebp', 'esi', 'edi',
        'r8d', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d')))
# Everything (including bit 16) except the %esp and %ebp bits.
ACCEPTABLE_X86_64_INPUTS[0x1ffcf] = 'input_rr=any_nonspecial'
161
# Any instruction must produce either None or one of fifteen registers as an
# output 'restricted register' value.  'r15d' (the last name of the 'r8d'
# group) is NOT acceptable as an output, hence the dropped final element.
ACCEPTABLE_X86_64_OUTPUT_REGISTERS = tuple(
    '%' + name for name in (REGISTERS['eax'] + REGISTERS['r8d'])[:-1])
166
167
def ValidateInstruction(instruction, validator_inst):
  """Run the validator on one instruction padded to a full bundle.

  Args:
      instruction: list of byte values of the instruction.
      validator_inst: validator object providing ValidateChunk and
          ValidateAndGetFinalRestrictedRegister.
  Returns:
      (result, notes) pair.  In 32-bit mode result is whatever ValidateChunk
      returned and notes is empty.  In 64-bit mode result is True if at least
      one initial restricted register was accepted; notes then holds the
      'input_rr=...' and 'output_rr=...' descriptions.
  """
  bundle = ''.join(chr(byte) for byte in PadToBundleSize(instruction))
  if options.bitness == 32:
    return validator_inst.ValidateChunk(bundle, bitness=32), []

  accepted_mask = 0
  accepted_final_rr = None
  # Note that iteration order is aligned with the ACCEPTABLE_X86_64_INPUTS
  # keys: bit N of the mask corresponds to the N-th candidate tried here.
  for bit, initial_rr in enumerate(validator.ALL_REGISTERS + [None]):
    valid, final_rr = validator_inst.ValidateAndGetFinalRestrictedRegister(
        bundle, len(instruction), initial_rr)
    if not valid:
      continue
    # final_rr should not depend on input_rr
    assert accepted_mask == 0 or accepted_final_rr == final_rr
    accepted_mask |= 1 << bit
    accepted_final_rr = final_rr

  # If nothing is accepted then instruction is not valid.  Easy and simple.
  if accepted_mask == 0:
    return False, []

  if accepted_final_rr is None:
    output_rr = None
  else:
    # If returned value is unacceptable we'll get IndexError here and this
    # test will fail
    output_rr = ACCEPTABLE_X86_64_OUTPUT_REGISTERS[accepted_final_rr]
  # If collected mask is unacceptable we'll get KeyError here and this test
  # will fail
  input_note = ACCEPTABLE_X86_64_INPUTS[accepted_mask]
  return True, [input_note, 'output_rr={}'.format(output_rr)]
197
198
class WorkerState(object):
  """Collects validated instructions and compression trace for one traversal.

  Attributes are filled by ReceiveInstruction (counters and `output`) and by
  RecordTrace (`trace`).
  """

  def __init__(self, prefix, validator):
    # `prefix` is accepted for the caller's convenience but is not stored.
    self.validator = validator
    self.total_instructions = 0
    self.num_valid = 0
    self.output = set()
    self.trace = []


  def ReceiveInstruction(self, bytes):
    """Validate one instruction and, if accepted, record its disassembly.

    Args:
        bytes: list of byte values of the instruction.
    """
    self.total_instructions += 1
    accepted, notes = ValidateInstruction(bytes, self.validator)
    if not accepted:
      return
    self.num_valid += 1
    dis = self.validator.DisassembleChunk(
        ''.join(map(chr, bytes)),
        bitness=options.bitness)
    # Strip the 'Instruction(0x<N>: ' wrapper and the closing ')'.
    for line_nr, raw_line in enumerate(dis):
      line = str(raw_line)
      assert line[0:17] == 'Instruction(0x' + str(line_nr) + ': '
      assert line[-1:] == ')'
      dis[line_nr] = line[17:-1]
    # If %rip is involved then comment will be different depending on the
    # instruction length.  Eliminate it.
    if '(%rip)' in dis[0]:
      dis[0] = re.sub(' # 0x[ ]*[0-9a-fA-F]*', '', dis[0])
    # Zero displacements are represented as 0x0 for all instructions except
    # jumps where they are disassembled as non-zero due to %eip/%rip-relative
    # addressing.  We replace this displacement with %eip/%rip to simplify
    # compression.
    if ' 0x' in dis[0] and ' 0x0' not in dis[0]:
      target = r'\1%eip\2' if options.bitness == 32 else r'\1%rip\2'
      for length in range(1, 16):
        pattern = ('(' + '(?:[0-9a-fA-F][0-9a-fA-F] ){' + str(length) +
                   '} .* )' + hex(length) + '(.*)')
        dis[0] = re.sub(pattern, target, dis[0])
    dis[0] = 'Instruction: ' + dis[0]
    dis += notes
    self.output.add('; '.join(dis))


  def RecordTrace(self, compressor_nr, instruction):
    """Remember which compressor fired on which instruction."""
    self.trace.append((compressor_nr, instruction))
243
244
# A compressor has three slots: regex (which picks apart a given instruction),
# subst (which is used to create the compressed version) and replacements
# (which are used to generate the set of instructions from a given code).
#
# Example compressor:
#   regex = r'.*?[0-9a-fA-F]([0-7]) \w* (%e(?:[abcd]x|[sb]p|[sd]i)).*()'
#   subst = ('[0-7]', '[%eax..%edi]', ' # register in opcode')
#   replacements = ((0, '%eax', ''), (1, '%ecx', ''),
#                   (2, '%edx', ''), (3, '%ebx', ''),
#                   (4, '%esp', ''), (5, '%ebp', ''),
#                   (6, '%esi', ''), (7, '%edi', ''))
#
# When faced with instruction '40 inc %eax' it will capture the following
# pieces of said instruction: '4{0} inc {%eax}{}'.
#
# Then it will replace the groups to produce the following 8 instructions:
#   '40 inc %eax'
#   '41 inc %ecx'
#   '42 inc %edx'
#   '43 inc %ebx'
#   '44 inc %esp'
#   '45 inc %ebp'
#   '46 inc %esi'
#   '47 inc %edi'
#
# If all these instructions can be found in a set of instructions then the
# compressor will remove them from said set and will substitute them with
# one "compressed instruction" '4[0-7] inc [%eax..%edi] # register in opcode'.
class Compressor(object):
  __slots__ = ['regex', 'subst', 'replacements']

  def __init__(self, regex, subst, replacements):
    # The pattern is compiled once here; it is matched against many
    # instructions later.
    self.regex = re.compile(regex)
    self.replacements = replacements
    self.subst = subst
284
285
def CompressionTemplate(instruction, match, mark):
  """ Return `instruction` with every match group replaced by `mark`.

  Args:
      instruction: the string `match` was produced from
      match: regex match object whose groups are to be replaced
      mark: text substituted for each group
  """
  pieces = []
  cursor = 0
  for group_nr in range(1, len(match.groups()) + 1):
    # Text between the previous group and this one is copied verbatim.
    pieces.append(instruction[cursor:match.start(group_nr)])
    pieces.append(mark)
    cursor = match.end(group_nr)
  pieces.append(instruction[cursor:])
  return ''.join(pieces)
294
295
def CompressOneMatch(instructions, instruction, match, compressor):
  """ Try to fold all variants of `instruction` into one compressed entry.

  Args:
      instructions: set of instruction strings (modified in place on success)
      instruction: instruction the compressor regex matched
      match: the regex match object for `instruction`
      compressor: Compressor supplying `replacements` and `subst`

  Returns:
      (True, instructions) if every variant was present and got replaced by
      the compressed form, (False, instructions) otherwise.
  """
  template = CompressionTemplate(instruction, match, '{}')
  variants = set()
  for values in compressor.replacements:
    variant = template.format(*values)
    # A single missing variant makes the whole compression inapplicable.
    if variant not in instructions:
      return (False, instructions)
    variants.add(variant)
  assert instruction in variants
  instructions -= variants
  instructions.add(template.format(*compressor.subst))
  return (True, instructions)
308
309
def CompressOneInstruction(instructions, compressors, split, cache):
  """ Apply the first compressor that succeeds on any instruction.

  Instructions strictly greater than `split` are tried first (in sorted
  order), followed by the ones strictly smaller than it.

  Args:
      instructions: set of instruction strings
      compressors: sequence of Compressor objects
      split: string at which the sorted order is rotated
      cache: dict from instruction to the list of
          (compressor_nr, match, compressor) triples whose regex matched it;
          filled in for instructions that could not be compressed

  Returns:
      (instructions, compressor_nr, instruction) on success or
      (instructions, False, False) if nothing can be compressed.
  """
  after_split = sorted(item for item in instructions if item > split)
  before_split = sorted(item for item in instructions if item < split)
  for instruction in after_split + before_split:
    candidates = cache.get(instruction)
    if candidates is not None:
      # Regexes were already matched on an earlier pass; only retry the
      # compression itself.
      for compressor_nr, match, compressor in candidates:
        compressed, instructions = CompressOneMatch(
            instructions, instruction, match, compressor)
        if compressed:
          return (instructions, compressor_nr, instruction)
      continue
    candidates = []
    for compressor_nr, compressor in enumerate(compressors):
      match = compressor.regex.match(instruction)
      if not match:
        continue
      candidates.append((compressor_nr, match, compressor))
      compressed, instructions = CompressOneMatch(
          instructions, instruction, match, compressor)
      if compressed:
        return (instructions, compressor_nr, instruction)
    cache[instruction] = candidates
  return (instructions, False, False)
333
334
def Compressed(instructions, compressors, show_progress):
  """ Repeatedly compress `instructions` until no compressor applies.

  Args:
      instructions: set of instruction strings
      compressors: sequence of Compressor objects
      show_progress: callback invoked as show_progress(compressor_nr,
          instruction) after every successful compression

  Returns:
      The compressed set of instructions.
  """
  cache = {}
  split = ''
  while True:
    instructions, rule, split = CompressOneInstruction(
        instructions, compressors, split, cache)
    # Compressor number 0 is a valid rule, hence the identity check.
    if rule is False:
      return instructions
    show_progress(rule, split)
344
345
def Worker(prefix_and_state_index):
  """ Traverse one DFA subtree and compress the instructions found there.

  Relies on the module-level names `worker_validator`, `dfa` and
  `compressors`, which are defined outside this function.

  Args:
      prefix_and_state_index: (prefix, state_index) pair; prefix is the
          sequence of bytes already consumed, state_index selects the DFA
          state to start from.

  Returns:
      (prefix, total_instructions, num_valid, output, trace) tuple.
  """
  # Tuple parameters in the signature are a syntax error on Python 3
  # (PEP 3113), so the pair is unpacked here instead.
  prefix, state_index = prefix_and_state_index
  worker_state = WorkerState(prefix, worker_validator)

  try:
    dfa_traversal.TraverseTree(
        dfa.states[state_index],
        final_callback=worker_state.ReceiveInstruction,
        prefix=prefix,
        anyfield=0)
    if (prefix[0] != 0x0f or prefix[1] != 0x0f):  # Skip 3DNow! instructions
      worker_state.output = Compressed(set(worker_state.output),
                                       compressors,
                                       worker_state.RecordTrace)
  except Exception:
    traceback.print_exc()  # because multiprocessing imap swallows traceback
    raise

  return (
      prefix,
      worker_state.total_instructions,
      worker_state.num_valid,
      worker_state.output,
      worker_state.trace)
369
370
def ParseOptions():
  """ Parse the command line.

  Returns:
      (options, xml_file) pair; options.bitness is converted to int.

  Exits with a usage error (via parser.error) if --bitness is missing or if
  the number of positional arguments is not exactly one.
  """
  parser = optparse.OptionParser(usage='%prog [options] xmlfile')

  parser.add_option('--bitness',
                    choices=['32', '64'],
                    help='The subarchitecture: 32 or 64')
  parser.add_option('--validator_dll',
                    help='Path to librdfa_validator_dll')
  parser.add_option('--decoder_dll',
                    help='Path to librdfa_decoder_dll')

  options, args = parser.parse_args()
  # Without this check a missing --bitness would crash in int(None) with an
  # unhelpful TypeError.
  if options.bitness is None:
    parser.error('specify --bitness (32 or 64)')
  options.bitness = int(options.bitness)

  if len(args) != 1:
    parser.error('specify one xml file')

  (xml_file, ) = args

  return options, xml_file
391
392
# Version of REGISTERS suitable for use in regular expressions: only the
# 'st(N)' names contain characters (parentheses) that need escaping.
REGISTERS_RE = dict(REGISTERS)
REGISTERS_RE['st(0)'] = [r'st\(%d\)' % N for N in range(8)]
396
# Index names in 'natural' order (as defined by IA32/x86-64 ABI).  Slot 4
# holds the pseudo-register %eiz/%riz instead of the stack pointer.
INDEXES = {
    'eax': ['eax', 'ecx', 'edx', 'ebx', 'eiz', 'ebp', 'esi', 'edi'],
    'rax': ['rax', 'rcx', 'rdx', 'rbx', 'riz', 'rbp', 'rsi', 'rdi'],
    'r8': ['r%d' % N for N in range(8, 16)],
}
# Registers which can be used as base in 64-bit mode in all their incarnations.
X86_64_BASE_REGISTERS = set(
    '%' + name
    for name in ('spl', 'bpl', 'r15b',
                 'sp', 'bp', 'r15w',
                 'esp', 'ebp', 'r15d',
                 'rsp', 'rbp', 'r15',
                 'rip'))
411
412
def InstructionIsDangerous(input, output, register_write,
                           writes_to, memory_accessed=False,
                           base_text='%riz', index_text='%riz'):
  """ Check if instruction with given replacements will be dangerous

  Args:
      input: input argument
      output: output argument
      register_write: three-state selector
          'sandbox' - instruction can be used to produce "restricted register"
          'protect' - instruction can damage output, protect "special registers"
          'ignore' - instruction does not affect its operands (e.g. test) or
                     is used with non-GP registers (X87, MMX, XMM, etc)
      writes_to: 'both' means the input operand is written as well; any other
          value leaves the input operand unchecked
      memory_accessed: True if instruction accesses memory
      base_text: base register (if memory is accessed)
      index_text: index register (if memory is accessed)

  Returns:
      True if instruction should be rejected by validator
  """
  if memory_accessed:
    # Surprisingly enough %r15 is considered "valid index register" by our
    # validator: it can not be ever produced as "restricted register" which
    # means that such instruction can not ever occur in real program (it's
    # guaranteed because ACCEPTABLE_X86_64_OUTPUT_REGISTERS excludes it), but
    # when instruction is tested in isolation it's allowed.
    if (base_text not in X86_64_BASE_REGISTERS or
        index_text in X86_64_BASE_REGISTERS - set(['%r15'])):
      return True
  return (
      (register_write == 'protect' and output in X86_64_BASE_REGISTERS) or
      (register_write == 'sandbox' and output == '%r15d') or
      (writes_to == 'both' and input in X86_64_BASE_REGISTERS))
450
451
def AppendOperandsReplacement(replacement, rm_text, reg, modrm, writes_to):
  """ Appends operand texts to the replacement list

  Args:
      replacement: replacement list (extended in place)
      rm_text: replacement for rm field
      reg: register kind (or None if reg field is used as opcode extension)
      modrm: modrm byte
      writes_to: three-state selector
          'reg' - instruction uses rm as source, reg as destination
          'rm' - instruction uses reg as source, rm as destination
          'both' - instruction writes to both reg and rm

  Returns:
      input: textual representation of input argument (None if reg is None)
      output: textual representation of output argument

  Side-effect:
      output (if reg is None) or input and output (if reg is not None)
      are added to replacement list.
  """
  if reg is None:
    # With an opcode extension the only operand is rm, and it is the output.
    assert writes_to == 'rm'
    replacement.append(rm_text)
    return None, rm_text

  reg_text = '%' + REGISTERS[reg][(modrm >> 3) & 0x07]
  if writes_to == 'reg':
    operands = (rm_text, reg_text)
  else:  # rm, both
    operands = (reg_text, rm_text)
  replacement.extend(operands)
  return operands
486
487
def ModRMRegisterReplacements(rm, reg=None, writes_to='rm', opcode_bits=0,
                              register_write='ignore', note=''):
  """Creates replacement tuples list for register-to-register instructions

  Results are memoized in ModRMRegisterReplacements.replacements.

  Args:
      rm: rm operand kind (see REGISTERS array)
      reg: reg operand kind (see REGISTERS array) or None if reg is not used
      writes_to: three-state selector
          'reg' - instruction uses rm as source, reg as destination
          'rm' - instruction uses reg as source, rm as destination
          'both' - instruction writes to both reg and rm
      opcode_bits: opcode extensions code (used when reg is None)
      register_write: three-state selector
          'sandbox' - instruction can be used to produce "restricted register"
          'protect' - instruction can damage output, protect "special registers"
          'ignore' - instruction does not affect its operands (e.g. test) or
                     is used with non-GP registers (X87, MMX, XMM, etc)
      note: note to include at the end of input instruction
  Returns:
      Tuple of replacement tuples
  """
  # Reg field can be used either as reg or as opcode extension, but not both
  assert reg is None or opcode_bits == 0

  # `note` ends up inside every replacement tuple, so it must be part of the
  # cache key; otherwise a later call with a different note would get tuples
  # carrying the note of the first call.
  output_key = (options.bitness, reg, rm, writes_to, opcode_bits,
                register_write, note)
  if output_key in ModRMRegisterReplacements.replacements:
    return ModRMRegisterReplacements.replacements[output_key]

  replacements = []

  # Two upper bits of ModR/M byte (mod field) must be equal to 11
  # This gives us range from 0xc0 to 0xff but we are going from the
  # end to make rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
  if reg is None:
    # reg field is used as opcode extension
    byte_range = [byte
                  for byte in range(0xff, 0xbf, -1)
                  if (byte >> 3) & 0x7 == opcode_bits]
  else:
    byte_range = range(0xff, 0xbf, -1)

  for modrm in byte_range:
    rm_field = (modrm & 0x07)
    rm_text = '%' + REGISTERS[rm][rm_field]
    byte_text = '{:02x}'.format(modrm)
    replacement = [byte_text]
    input, output = AppendOperandsReplacement(
        replacement, rm_text, reg, modrm, writes_to)
    if options.bitness == 64:
      replacement.append('any_nonspecial')  # input_rr
      replacement.append(output if register_write == 'sandbox' else None)
      # Variants the validator must reject are not part of the expected set.
      if InstructionIsDangerous(input, output, register_write, writes_to):
        continue
    replacement.append(note)
    replacements.append(tuple(replacement))
  ModRMRegisterReplacements.replacements[output_key] = tuple(replacements)
  return ModRMRegisterReplacements.replacements[output_key]
ModRMRegisterReplacements.replacements = {}
547
548
def BaseOnlyMemoryOperand(modrm, base):
  """Describes the memory operand selected by a ModR/M byte without SIB

  All displacement bytes are zero.

  Args:
      modrm: modrm byte
      base: register kind for base
  Returns:
      bytes_text: replacement for "bytes" group
      rm_text: textual representation of "rm" argument
      base_text: textual representation of "base" register
  """
  mod_field = (modrm >> 6) & 0x03
  rm_field = (modrm & 0x07)
  base_text = '%' + REGISTERS[base][rm_field]
  modrm_text = '{:02x}'.format(modrm)
  disp32_text = modrm_text + ' 00 00 00 00'

  if mod_field == 0:
    if rm_field == validator.REG_RBP:
      # If RM field == %rbp and MOD field is zero then it's absolute address
      # in 32-bit mode and %rip-based address in 64-bit mode
      if options.bitness == 32:
        return disp32_text, '0x0', '%eiz'
      return disp32_text, '0x0(%rip)', '%rip'
    # Memory access with just a base register
    return modrm_text, '({})'.format(base_text), base_text

  offset_rm_text = '0x0({})'.format(base_text)
  if mod_field == 1:
    # Memory access with base and 8bit offset
    return modrm_text + ' 00', offset_rm_text, base_text
  # Memory access with base and 32bit offset (mod_field == 2)
  return disp32_text, offset_rm_text, base_text
583
584
def SIBMemoryOperand(modrm, sib, base, index):
  """Describes the memory operand selected by ModR/M and SIB bytes

  All displacement bytes are zero.  Only mod field values 0, 1 and 2 are
  handled.

  Args:
      modrm: modrm byte
      sib: sib byte
      base: register kind for base
      index: register kind for index (see INDEXES array)
  Returns:
      bytes_text: replacement for "bytes" group
      rm_text: textual representation of "rm" argument
      base_text: textual representation of "base" register
      index_text: textual representation of "index" register
  """
  mod_field = (modrm >> 6) & 0x03
  scale_text = str(1 << ((sib >> 6) & 0x03))
  index_text = '%' + INDEXES[index][(sib >> 3) & 0x07]
  base_field = (sib & 0x07)
  base_text = '%' + REGISTERS[base][base_field]

  if mod_field == 0 and base_field == validator.REG_RBP:
    # If BASE is %rbp and MOD == 0 then index with 32bit offset is used
    bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
    bare_address = (options.bitness == 64 and
                    index_text == '%riz' and
                    scale_text == '1')
    if bare_address:
      rm_text = '0x0'
    else:
      rm_text = '0x0(,{},{})'.format(index_text, scale_text)
    # There is no base in this case
    if options.bitness == 32:
      base_text = '%eiz'
    else:
      base_text = '%riz'
  elif mod_field == 0:
    # Memory access with base and index (no offset)
    bytes_text = '{:02x} {:02x}'.format(modrm, sib)
    rm_text = '({},{},{})'.format(base_text, index_text, scale_text)
  elif mod_field == 1:
    # Memory access with base, index and 8bit offset
    bytes_text = '{:02x} {:02x} 00'.format(modrm, sib)
    rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)
  elif mod_field == 2:
    # Memory access with base, index and 32bit offset
    bytes_text = '{:02x} {:02x} 00 00 00 00'.format(modrm, sib)
    rm_text = '0x0({},{},{})'.format(base_text, index_text, scale_text)

  # Pretty-printing of access via %rsp (or %r12)
  if (base_field == validator.REG_RSP and
      index_text in ('%eiz', '%riz') and
      scale_text == '1'):
    if mod_field == 0:
      # no offset
      rm_text = '({})'.format(base_text)
    else:
      # 8-bit or 32-bit offset
      rm_text = '0x0({})'.format(base_text)
  return bytes_text, rm_text, base_text, index_text
637
638
def ModRMMemoryReplacements(reg=None, writes_to='rm', opcode_bits=0,
                            memory_accessed=True, register_write='ignore',
                            base_r8=False, index_r8=False, note=''):
  """Creates replacement tuples list for register-to-memory instructions

  Results are memoized in ModRMMemoryReplacements.replacements.

  Args:
      reg: reg operand kind (see REGISTERS array) or None if reg is not used
      writes_to: three-state selector
          'reg' - instruction uses rm as source, reg as destination
          'rm' - instruction uses reg as source, rm as destination
          'both' - instruction writes to both reg and rm
      opcode_bits: opcode extensions code (used when reg is None)
      memory_accessed: True if instruction accesses memory
      register_write: three-state selector
          'sandbox' - instruction can be used to produce "restricted register"
          'protect' - instruction can damage output, protect "special registers"
          'ignore' - instruction does not affect its operands (e.g. test) or
                     is used with non-GP registers (X87, MMX, XMM, etc)
      base_r8: True if base comes from the 'r8' group (64-bit mode only)
      index_r8: True if REX.X bit in the instruction set to 1
      note: note to include at the end of input instruction

  Returns:
      Tuple of replacement tuples
  """
  # Reg field can be used either as reg or as opcode extension, but not both
  assert reg is None or opcode_bits == 0

  # `note` ends up inside every replacement tuple, so it must be part of the
  # cache key; otherwise a later call with a different note would get tuples
  # carrying the note of the first call.
  output_key = (options.bitness, reg, writes_to, opcode_bits,
                base_r8, index_r8, memory_accessed, register_write, note)
  if output_key in ModRMMemoryReplacements.replacements:
    return ModRMMemoryReplacements.replacements[output_key]

  if options.bitness == 32:
    base = 'eax'
    index = 'eax'
  else:
    base = 'r8' if base_r8 else 'rax'
    index = 'r8' if index_r8 else 'rax'

  replacements = []

  # Two upper bits of ModR/M byte (mod field) must be equal to 00, 01, or 10
  # This gives us range from 0x00 to 0xbf but we are going from the end to make
  # rejection faster (%r15 is equal to 0x7 and %rbp is 0x5).
  if reg is None:
    # reg field is used as opcode extension
    byte_range = [byte
                  for byte in range(0xbf, -1, -1)
                  if (byte >> 3) & 0x7 == opcode_bits]
  else:
    byte_range = range(0xbf, -1, -1)

  for modrm in byte_range:
    # If RM field != %rsp then there are no SIB byte
    if (modrm & 0x07) != validator.REG_RSP:
      bytes_text, rm_text, base_text = BaseOnlyMemoryOperand(modrm, base)
      replacement = [bytes_text]
      input, output = AppendOperandsReplacement(
          replacement, rm_text, reg, modrm, writes_to)
      if options.bitness == 64:
        replacement.append('any_nonspecial')
        # If writes_to is equal to 'reg' then output is register
        if writes_to == 'reg' and register_write == 'sandbox':
          replacement.append(output)
        else:
          # Note that instruction like xchg could write to another register:
          # it's "input"!  But validator currently does not support this case
          # thus we are ignoring it here and writing "None" in this case, too.
          replacement.append(None)
        if InstructionIsDangerous(input, output, register_write, writes_to,
                                  memory_accessed, base_text):
          continue
      replacement.append(note)
      replacements.append(tuple(replacement))
    else:
      # If RM field == %rsp then we have SIB byte
      for sib in range(0x100):
        bytes_text, rm_text, base_text, index_text = SIBMemoryOperand(
            modrm, sib, base, index)
        replacement = [bytes_text]
        input, output = AppendOperandsReplacement(
            replacement, rm_text, reg, modrm, writes_to)
        if options.bitness == 64:
          if not memory_accessed or index_text == '%riz':
            replacement.append('any_nonspecial')
          else:
            if index_r8:
              # Convert %r8 to %r8d, %r9 to %r9d, etc
              replacement.append(index_text + 'd')
            else:
              # Convert %rax to %eax, %rsp to %esp, etc
              replacement.append('%e' + index_text[2:])
          # If writes_to is equal to 'reg' then output is register
          if writes_to == 'reg' and register_write == 'sandbox':
            replacement.append(output)
          else:
            # Note that instruction like xchg could write to another register:
            # it's "input"!  But validator currently does not support this
            # case thus we are ignoring it here and writing "None" in this
            # case, too.
            replacement.append(None)
          if InstructionIsDangerous(input, output, register_write, writes_to,
                                    memory_accessed, base_text, index_text):
            continue
        replacement.append(note)
        replacements.append(tuple(replacement))
  ModRMMemoryReplacements.replacements[output_key] = tuple(replacements)
  return ModRMMemoryReplacements.replacements[output_key]
ModRMMemoryReplacements.replacements = {}
747
748
# Map from "REX bit off" group of registers to "REX bit on" group of registers
r8 = dict(
    al='r8b',
    ax='r8w',
    eax='r8d',
    rax='r8',
    mm0='mmalt',
    xmm0='xmm8',
    ymm0='ymm8')
759
760
def RegisterKinds():
  """ Return tuple of register kinds we process with register compressors

  The result depends on options.bitness: 32 selects the short list, any
  other value selects the list that also has the 64-bit-only groups.
  """
  kinds_32 = ('al', 'ax', 'eax', 'mm0', 'xmm0', 'ymm0')
  kinds_64 = ('al', 'spl', 'ax', 'eax', 'rax', 'mm0', 'xmm0', 'ymm0',
              'r8b', 'r8w', 'r8d', 'r8', 'mmalt', 'xmm8', 'ymm8')
  return kinds_32 if options.bitness == 32 else kinds_64
769
770
def RegisterKindPairs():
  """ Return hand-picked pairs which we must consider in compressors

  The result depends on options.bitness: 32 selects the short table, any
  other value selects the 64-bit table.  Every pair occurs exactly once.
  """

  if options.bitness == 32:
    return (
        (  'al',   'al'),
        (  'ax',   'al'),
        (  'al',   'ax'),
        (  'ax',   'ax'),
        ( 'eax',   'al'),
        (  'al',  'eax'),
        ( 'eax',   'ax'),
        (  'ax',  'eax'),
        ( 'eax',  'eax'),
        ( 'eax',  'mm0'),
        ( 'mm0',  'eax'),
        ( 'eax', 'xmm0'),
        ('xmm0',  'eax'),
        ( 'mm0',  'mm0'),
        ( 'mm0', 'xmm0'),
        ('xmm0',  'mm0'),
        ('xmm0', 'xmm0'),
        ('xmm0', 'ymm0'),
        ('ymm0', 'xmm0'),
        ('ymm0', 'ymm0')
    )
  else:
    return (
        ('al', 'al'),
        ('spl', 'spl'), ('spl', 'r8b'), ('r8b', 'spl'), ('r8b', 'r8b'),
        ('ax', 'al'),
        ('ax', 'spl'), ('ax', 'r8b'), ('r8w', 'spl'), ('r8w', 'r8b'),
        ('al', 'ax'),
        ('spl', 'ax'), ('spl', 'r8w'), ('r8b', 'ax'), ('r8b', 'r8w'),
        ('ax', 'ax'), ('ax', 'r8w'), ('r8w', 'ax'), ('r8w', 'r8w'),
        ('eax', 'al'),
        ('eax', 'spl'), ('eax', 'r8b'), ('r8d', 'spl'), ('r8d', 'r8b'),
        ('al', 'eax'),
        ('spl', 'eax'), ('spl', 'r8d'), ('r8b', 'eax'), ('r8b', 'r8d'),
        ('eax', 'ax'), ('eax', 'r8w'), ('r8d', 'ax'), ('r8d', 'r8w'),
        ('ax', 'eax'), ('ax', 'r8d'), ('r8w', 'eax'), ('r8w', 'r8d'),
        ('eax', 'eax'), ('eax', 'r8d'), ('r8d', 'eax'), ('r8d', 'r8d'),
        ('rax', 'al'),
        ('rax', 'spl'), ('rax', 'r8b'), ('r8', 'spl'), ('r8', 'r8b'),
        ('al', 'rax'),
        ('spl', 'rax'), ('spl', 'r8'), ('r8b', 'rax'), ('r8b', 'r8'),
        ('rax', 'ax'), ('rax', 'r8w'), ('r8', 'ax'), ('r8', 'r8w'),
        ('ax', 'rax'), ('ax', 'r8'), ('r8w', 'rax'), ('r8w', 'r8'),
        ('rax', 'eax'), ('rax', 'r8d'), ('r8', 'eax'), ('r8', 'r8d'),
        ('eax', 'rax'), ('eax', 'r8'), ('r8d', 'rax'), ('r8d', 'r8'),
        ('rax', 'rax'), ('rax', 'r8'), ('r8', 'rax'), ('r8', 'r8'),
        # The last pair of this row used to repeat ('eax', 'mmalt'), which
        # left ('r8d', 'mmalt') out of the table.
        ('eax', 'mm0'), ('eax', 'mmalt'), ('r8d', 'mm0'), ('r8d', 'mmalt'),
        ('rax', 'mm0'), ('rax', 'mmalt'), ('r8', 'mm0'), ('r8', 'mmalt'),
        ('mm0', 'eax'), ('mmalt', 'eax'), ('mm0', 'r8d'), ('mmalt', 'r8d'),
        ('mm0', 'rax'), ('mmalt', 'rax'), ('mm0', 'r8'), ('mmalt', 'r8'),
        ('eax', 'xmm0'), ('eax', 'xmm8'), ('r8d', 'xmm0'), ('r8d', 'xmm8'),
        ('rax', 'xmm0'), ('rax', 'xmm8'), ('r8', 'xmm0'), ('r8', 'xmm8'),
        ('xmm0', 'eax'), ('xmm0', 'r8d'), ('xmm8', 'eax'), ('xmm8', 'r8d'),
        ('xmm0', 'rax'), ('xmm0', 'r8'), ('xmm8', 'rax'), ('xmm8', 'r8'),
        ('mm0', 'mm0'), ('mmalt', 'mm0'), ('mm0', 'mmalt'), ('mmalt', 'mmalt'),
        ('mm0', 'xmm0'), ('mmalt', 'xmm0'), ('mm0', 'xmm8'), ('mmalt', 'xmm8'),
        ('xmm0', 'mm0'), ('xmm8', 'mm0'), ('xmm0', 'mmalt'), ('xmm8', 'mmalt'),
        ('xmm0', 'xmm0'), ('xmm0', 'xmm8'), ('xmm8', 'xmm0'), ('xmm8', 'xmm8'),
        ('xmm0', 'ymm0'), ('xmm0', 'ymm8'), ('xmm8', 'ymm0'), ('xmm8', 'ymm8'),
        ('ymm0', 'xmm0'), ('ymm0', 'xmm8'), ('ymm8', 'xmm0'), ('ymm8', 'xmm8'),
        ('ymm0', 'ymm0'), ('ymm0', 'ymm8'), ('ymm8', 'ymm0'), ('ymm8', 'ymm8')
    )
838
839
def ProtectIndexRegister(registers_interval):
  """Remove index-forbidden registers from the registers_interval.

  Args:
    registers_interval: textual register interval, e.g. '%eax..%edi'.

  Returns:
    The rewritten interval when registers_interval is one of the known
    intervals listed below, otherwise registers_interval itself, unchanged.
  """

  index_safe = {
      '%al..%dil': '%al|%cl|%dl|%bl|%sil|%dil',
      '%ax..%di': '%ax|%cx|%dx|%bx|%si|%di',
      '%eax..%edi': '%eax|%ecx|%edx|%ebx|%esi|%edi',
      '%rax..%rdi': '%rax|%rcx|%rdx|%rbx|%rsi|%rdi',
      '%al..%r15b': '%al..%bl|%ah..%bh|%sil..%r15b',
      '%ax..%r15w': '%ax..%bx|%si..%r15w',
      '%eax..%r15d': '%eax..%ebx|%esi..%r15d',
      '%rax..%r15': '%rax..%rbx|%rsi..%r15'
  }
  try:
    return index_safe[registers_interval]
  except KeyError:
    # Not an interval we know how to narrow: hand it back untouched.
    return registers_interval
854
855
def ProtectBaseRegister(registers_interval):
  """Remove base-forbidden registers from the registers_interval.

  Intervals that end with an r15-family register are narrowed using the table
  below (their replacement ends with the matching r14-family register).  Every
  other interval is passed on to ProtectIndexRegister.

  Args:
    registers_interval: textual register interval, e.g. '%eax..%r15d'.

  Returns:
    The narrowed interval as a string.
  """

  base_safe = {
      '%al..%r15b': '%al..%bl|%ah..%bh|%sil..%r14b',
      '%ax..%r15w': '%ax..%bx|%si..%r14w',
      '%eax..%r15d': '%eax..%ebx|%esi..%r14d',
      '%rax..%r15': '%rax..%rbx|%rsi..%r14',
      '%r8b..%r15b': '%r8b..%r14b',
      '%r8w..%r15w': '%r8w..%r14w',
      '%r8d..%r15d': '%r8d..%r14d',
      '%r8..%r15': '%r8..%r14'
  }
  narrowed = base_safe.get(registers_interval)
  if narrowed is not None:
    return narrowed
  # No r15-family register involved: only the index rules apply.
  return ProtectIndexRegister(registers_interval)
873
874
def RegisterRegex(register_kind, opcode_bits = 0):
  """Return the ModR/M byte regex and the text regex for the first register.

  Args:
    register_kind: key into the REGISTERS table.
    opcode_bits: value added (multiplied by 8) to the 0xc0 ModR/M byte.

  Returns:
    Pair (regex for the byte code, regex for the register name).
  """

  modrm_byte = 0xc0 + opcode_bits * 8
  regex_mod = '^.*?({:02x})'.format(modrm_byte)
  first_register = REGISTERS[register_kind][0]
  return regex_mod, '(%' + first_register + ')'
881
882
def MemoryRegex(base_r8, index_r8, opcode_bits = 0):
  """Return one regex for the bytes and one for the textual representation.

  Args:
    base_r8: True selects the '07' SIB byte and the %r15 base, False selects
        the '04' SIB byte and the %rsp base.
    index_r8: True selects %r8 as the index register, False selects %rax.
    opcode_bits: value added (multiplied by 8) to the 0x04 ModR/M byte.

  Returns:
    Pair (regex for ModR/M + SIB bytes, regex for the memory operand text).
  """

  # We only need to process ModR/M+SIB '04 04' or '04 07' here.  We only match
  # memory operands without offset, so we don't need to add zeros to the end of
  # either of the two returned regular expressions.
  #
  # We pick version which is "always there" (in both 32bit and 64bit mode) and
  # which is not 0x00 (to avoid confusion with offsets and immediates).
  sib_byte = '07' if base_r8 else '04'
  regex_modrm_sib = '^.*?({:02x} {})'.format(0x04 + opcode_bits * 8, sib_byte)

  if options.bitness == 32:
    # 32-bit mode has a single textual form, whatever base_r8/index_r8 say.
    regex_mem = r'(\(%esp,%eax,1\))'
  else:
    base_name = 'r15' if base_r8 else 'rsp'
    index_name = 'r8' if index_r8 else 'rax'
    regex_mem = r'(\(%{},%{},1\))'.format(base_name, index_name)
  return regex_modrm_sib, regex_mem
903
904
def RegToRmCompressor(rm, reg, writes_to, memory_accessed=True,
                      register_write='ignore', index_r8=False,
                      rm_compression='register', is_3Dnow=False, notes=''):
  """Returns a reg <-> rm compressor for a given set of parameters.

  Args:
    rm: rm operand kind (see REGISTERS array)
    reg: reg operand kind (see REGISTERS array)
    writes_to: three-state selector
      'reg' - instruction uses rm as source, reg as destination
      'rm' - instruction uses reg as source, rm as destination
      'both' - instruction writes to both reg and rm
    memory_accessed: True if instruction accesses memory
    register_write: three-state selector
      'sandbox' - instruction can be used to produce "restricted register"
      'protect' - instruction can damage output, protect "special registers"
      'ignore' - instruction does not affect its operands (e.g. test) or
                 is used with non-GP registers (X87, MMX, XMM, etc)
    index_r8: True if REX.X bit in the instruction set to 1
    rm_compression: three-state selector
      'register' - instruction supports register-only rm parameter
      'memory' - instruction supports memory-only rm parameter
      'register_or_memory' - instruction supports all kinds of rm parameters
    is_3Dnow: True if instruction is 3DNow! style (opcode in immediate)
    notes: Notes to add to the description

  Returns:
    A single Compressor built from the regex, the substitution tuple and the
    replacement tuples assembled below.
  """

  start_reg = REGISTERS[reg][0]
  end_reg = REGISTERS[reg][-1]
  mark_reg = '%{}..%{}'.format(start_reg, end_reg)
  # Exclude rbp/rsp/r15 from the interval, if instruction can write to reg.
  if register_write != 'ignore' and writes_to in ('reg', 'both'):
    mark_reg = ProtectBaseRegister(mark_reg)
  start_rm = REGISTERS[rm][0]
  end_rm = REGISTERS[rm][-1]
  mark_rm = '%{}..%{}'.format(start_rm, end_rm)
  # Exclude rbp/rsp/r15 from the interval, if instruction can write to rm.
  if register_write != 'ignore' and writes_to in ('rm', 'both'):
    mark_rm = ProtectBaseRegister(mark_rm)
  # Prepare substitution texts
  subst_reg = '[{}]'.format(mark_reg)
  if rm_compression == 'register':
    subst_rm = '[{}]'.format(mark_rm)
  elif rm_compression == 'register_or_memory':
    subst_rm = '[{} or memory]'.format(mark_rm)
  else:
    assert rm_compression == 'memory'
    subst_rm = '[memory]'
  base_r8 = rm in r8.values()
  # Only regex_reg is kept from this call; regex itself is overwritten by one
  # of the two calls just below.
  regex, regex_reg = RegisterRegex(reg)
  if rm_compression == 'register':
    regex, regex_rm = RegisterRegex(rm)
  else:  # memory and register_or_memory
    regex, regex_rm = MemoryRegex(base_r8, index_r8)
  if is_3Dnow:
    # Immediate byte is opcode extension with 3DNow! instructions.
    regex += ' [0-9a-fA-F][0-9a-fA-F]'
  else:
    # Accept immediate (which is always zeros in our case).
    regex += '(?: 00)*'
  # 2 spaces separate byte code from instruction name.
  # NOTE(review): the literal below holds a single space in this view although
  # the comment says two - confirm against the checked-in file.
  regex += ' '
  # Optional "lock" prefix.
  regex += '(?:lock )?'
  # Instruction name.
  regex += r'\w* '
  # Immediate or implicit operand that can precede reg/rm pair.
  regex += r'(?:\$0x0,|\$0x0,\$0x0,|%cl,|%xmm0,)?'
  regex_output_rr = None
  subst_output_rr = None
  # Operand order in the text follows the direction: "rm,reg" when the
  # instruction writes to reg, "reg,rm" otherwise ('rm' and 'both').
  if writes_to == 'reg':
    regex += regex_rm + ',' + regex_reg
    if register_write == 'sandbox':
      assert reg in ('eax', 'r8d')
      regex_output_rr = '%' + reg
      subst_output_rr = subst_reg
    subst = ('XX', subst_rm, subst_reg)
  else:
    regex += regex_reg + ',' + regex_rm
    if register_write == 'sandbox':
      assert rm in ('eax', 'r8d')
      # Instruction can be sandboxing one and yet produce output_rr == None
      # This looks strange, but it's perfectly logical because we can't ever
      # sandbox memory!
      if rm_compression == 'register':
        regex_output_rr = '%' + rm
      if rm_compression != 'memory':
        subst_output_rr = '[{}]'.format(mark_rm)
    subst = ('XX', subst_reg, subst_rm)
  # Skip implicit arguments (if any)
  regex += '.*'
  # input_rr/output_rr groups are only added to the regex (and to subst) in
  # 64 bit mode.
  if options.bitness == 64:
    if memory_accessed and rm_compression != 'register':
      regex += '; input_rr=(%r8d)' if index_r8 else '; input_rr=(%eax)'
      subst_input_rr = '%r8d..%r15d' if index_r8 else '%eax..%edi'
      subst_input_rr = '[{}]'.format(ProtectIndexRegister(subst_input_rr))
    else:
      regex += '; input_rr=(any_nonspecial)'
      subst_input_rr = 'any_nonspecial'
    regex += '; output_rr=({})'.format(regex_output_rr)
    subst = subst + (subst_input_rr, subst_output_rr)
  # With "or memory" or "memory" compressors we always can see where is
  # reg and where is rm, but with reg to rm or rm to reg it's impossible.
  # Add the appropriate comment
  notes = () if notes == '' else (notes, )
  if rm_compression == 'register':
    notes += ('rm to reg' if writes_to == 'reg' else 'reg to rm', )
  notes = '; '.join(notes)
  if notes != '':
    notes = ' # ' + notes
  subst += (notes, )
  # Empty trailing group: the place where the notes text is substituted.
  regex += '()$'
  replacement = ()
  if rm_compression != 'memory':
    replacement += ModRMRegisterReplacements(
        reg=reg, rm=rm, writes_to=writes_to, register_write=register_write)
  if rm_compression != 'register':
    replacement += ModRMMemoryReplacements(
        reg=reg, base_r8=base_r8, index_r8=index_r8, writes_to=writes_to,
        memory_accessed=memory_accessed, register_write=register_write)
  return Compressor(regex, subst, replacement)
1029
1030
def RegToRmCompressors():
  """Return list of all Reg to RM (and RM to Reg) compressors.

  For every (reg, rm) pair from RegisterKindPairs() a list of instruction
  "kinds" (keyword arguments for RegToRmCompressor) is collected, and then one
  compressor is created per kind and per rm_compression mode.  In 64 bit mode
  memory-capable modes get an additional index_r8=True compressor.
  """

  # Operand kinds for which a "protect" variant is generated in 64 bit mode.
  protected_kinds = ('al', 'spl', 'ax', 'eax', 'rax',
                     'r8b', 'r8w', 'r8d', 'r8')
  result = []

  for reg, rm in RegisterKindPairs():
    # Normal instructions with two operands: rm to reg, then reg to rm.
    kinds = [{'writes_to': 'reg'}, {'writes_to': 'rm'}]
    # In 64 bit mode we have many different types of instructions depending
    # on whether we are accessing memory or whether we are writing to registers.
    if options.bitness == 64:
      # Lea in 64 bit mode is truly unique instruction for now
      if reg in ('ax', 'r8w', 'eax', 'r8d', 'rax', 'r8'):
        if reg in ('eax', 'r8d'):
          lea_write = 'sandbox'
        else:
          lea_write = 'protect'
        kinds.append({'writes_to': 'reg',
                      'memory_accessed': False,
                      'register_write': lea_write,
                      'notes': 'lea'})
      # Zero-extending version (rm to reg).
      if reg in ('eax', 'r8d'):
        kinds.append({'writes_to': 'reg', 'register_write': 'sandbox'})
      # Zero-extending version (reg to rm), then zero-extending xchg/xadd.
      if rm in ('eax', 'r8d'):
        kinds.extend([
            {'writes_to': 'rm', 'register_write': 'sandbox'},
            {'writes_to': 'both',
             'register_write': 'sandbox',
             'notes': 'write to both'}])
      # Dangerous instructions (rm to reg).
      if reg in protected_kinds:
        kinds.append({'writes_to': 'reg', 'register_write': 'protect'})
      # Dangerous instructions (reg to rm), then dangerous xchg/xadd.
      if rm in protected_kinds:
        kinds.extend([
            {'writes_to': 'rm', 'register_write': 'protect'},
            {'writes_to': 'both',
             'register_write': 'protect',
             'notes': 'write to both'}])
    # 3DNow! instructions
    if reg in ('mm0', 'mmalt') or rm in ('mm0', 'mmalt'):
      kinds.append({'writes_to': 'reg', 'is_3Dnow': True})

    for kind in kinds:
      for rm_compression in ('register', 'memory', 'register_or_memory'):
        touches_memory = rm_compression != 'register'
        # Register-to-register instructions can not access memory anyway
        # which means special compressors are not needed.
        if not touches_memory and 'memory_accessed' in kind:
          continue
        # We generate 2 variants of compressors with memory operands to
        # enumerate 2 possible values of index_r8 property.  Register only
        # instructions have only one version since they don't use index.
        if touches_memory and options.bitness == 64:
          result.append(RegToRmCompressor(
              reg=reg, rm=rm, index_r8=True,
              rm_compression=rm_compression, **kind))
        # The "normal" compressor always comes last.
        result.append(RegToRmCompressor(
            reg=reg, rm=rm,
            rm_compression=rm_compression, **kind))
  return result
1112
1113
def RmCompressor(rm, opcode_bits, memory_accessed=True, index_r8=False,
                 rm_compression='register', register_write='ignore'):
  """Returns an rm compressor for a given set of parameters.

  Args:
    rm: rm operand kind (see REGISTERS array)
    opcode_bits: opcode extension value; shown as 'XX/<opcode_bits>' in the
        compressed line and passed on to the ModR/M regex helpers
    memory_accessed: True if instruction accesses memory
    index_r8: True if REX.X bit in the instruction set to 1
    rm_compression: three-state selector
      'register' - instruction supports register-only rm parameter
      'memory' - instruction supports memory-only rm parameter
      'register_or_memory' - instruction supports all kinds of rm parameters
    register_write: three-state selector
      'sandbox' - instruction can be used to produce "restricted register"
      'protect' - instruction can damage output, protect "special registers"
      'ignore' - instruction does not affect its operands (e.g. test) or
                 is used with non-GP registers (X87, MMX, XMM, etc)

  Returns:
    A single Compressor built from the regex, the substitution tuple and the
    replacement tuples assembled below.
  """
  start_rm = REGISTERS[rm][0]
  end_rm = REGISTERS[rm][-1]
  mark_rm = '%{}..%{}'.format(start_rm, end_rm)
  # Exclude rbp/rsp/r15 from the interval, if instruction can write to rm.
  if register_write != 'ignore':
    mark_rm = ProtectBaseRegister(mark_rm)
  byte_mark = 'XX/' + str(opcode_bits)
  # Prepare substitution texts
  if rm_compression == 'register':
    subst_rm = '[{}]'.format(mark_rm)
  elif rm_compression == 'register_or_memory':
    subst_rm = '[{} or memory]'.format(mark_rm)
  else:
    assert rm_compression == 'memory'
    subst_rm = '[memory]'
  base_r8 = rm in r8.values()
  if rm_compression == 'register':
    regex, regex_rm = RegisterRegex(rm, opcode_bits)
  else:  # memory and register_or_memory
    regex, regex_rm = MemoryRegex(base_r8, index_r8, opcode_bits)
  # Regex here matches everything AFTER the ModR/M (+ SIB) bytes.
  regex += '(?: 00)*'
  # 2 spaces separate byte code from instruction name.
  # NOTE(review): the literal below holds a single space in this view although
  # the comment says two - confirm against the checked-in file.
  regex += ' '
  # Optional "lock" prefix.
  regex += '(?:lock )?'
  # Instruction name.
  regex += r'\w* '
  # Immediate or implicit operand that can precede rm argument.
  regex += r'(?:\$0x0,|%cl,)?'
  # Register or memory access
  regex += regex_rm
  # Skip everything after that
  regex += '.*'
  regex_output_rr = None
  subst_output_rr = None
  subst = (byte_mark, subst_rm)
  if options.bitness == 32:
    # No input_rr/output_rr groups in 32 bit mode; the empty string fills the
    # final empty regex group added below.
    subst += ('', )
  else:
    if register_write == 'sandbox':
      assert rm in ('eax', 'r8d')
      # Instruction can be sandboxing one and yet produce output_rr == None
      # This looks strange, but it's perfectly logical because we can't ever
      # sandbox memory!
      if rm_compression == 'register':
        regex_output_rr = '%' + rm
      if rm_compression != 'memory':
        subst_output_rr = '[{}]'.format(mark_rm)
    if memory_accessed and rm_compression != 'register':
      regex += '; input_rr=(%r8d)' if index_r8 else '; input_rr=(%eax)'
      subst_input_rr = '%r8d..%r15d' if index_r8 else '%eax..%edi'
      subst_input_rr = '[{}]'.format(ProtectIndexRegister(subst_input_rr))
    else:
      regex += '; input_rr=(any_nonspecial)'
      subst_input_rr = 'any_nonspecial'
    regex += '; output_rr=({})'.format(regex_output_rr)
    subst = subst + (subst_input_rr, subst_output_rr, '')
  # Empty trailing group, matched by the last ('') element of subst.
  regex += '()'
  replacement = ()
  if rm_compression != 'memory':
    replacement += ModRMRegisterReplacements(
        rm=rm, opcode_bits=opcode_bits, register_write=register_write)
  if rm_compression != 'register':
    replacement += ModRMMemoryReplacements(
        base_r8=base_r8, index_r8=index_r8, opcode_bits=opcode_bits,
        memory_accessed=memory_accessed, register_write=register_write)
  return Compressor(regex, subst, replacement)
1203
1204
def RmCompressors():
  """Return list of all RM (with reg as opcode extension) compressors.

  For every register kind and every opcode extension value (0..7) one
  compressor is created per instruction "kind" (keyword arguments for
  RmCompressor) and per rm_compression mode.  In 64 bit mode memory-capable
  modes get an additional index_r8=True compressor.
  """
  result = []

  for rm in RegisterKinds():
    # The kinds depend only on rm and bitness, so collect them once per rm.
    # The most basic form comes first.
    kinds = [{}]
    # In 64 bit mode we have many different types of instructions depending
    # on whether we are accessing memory or whether we are writing to
    # registers.
    if options.bitness == 64:
      # No memory access (e.g. prefetch)
      kinds.append({'memory_accessed':False})
      # Zero-extending version.
      if rm in ('eax', 'r8d'):
        kinds.append({'register_write':'sandbox'})
      # Dangerous instructions.
      if rm in ('al', 'spl', 'ax', 'eax', 'rax',
                'r8b', 'r8w', 'r8d', 'r8'):
        kinds.append({'register_write':'protect'})

    for opcode_bits in xrange(8):
      for kind in kinds:
        for rm_compression in ('register', 'memory', 'register_or_memory'):
          touches_memory = rm_compression != 'register'
          # Register-to-register instructions can not access memory anyway
          # which means special compressors are not needed.
          if not touches_memory and 'memory_accessed' in kind:
            continue
          # We generate 2 variants of compressors with memory operands to
          # enumerate 2 possible values of index_r8 property.  Register only
          # instructions have only one version since they don't use index.
          if touches_memory and options.bitness == 64:
            result.append(RmCompressor(
                rm=rm, opcode_bits=opcode_bits, index_r8=True,
                rm_compression=rm_compression, **kind))
          # The "normal" compressor always comes last.
          result.append(RmCompressor(
              rm=rm, opcode_bits=opcode_bits,
              rm_compression=rm_compression, **kind))
  return result
1252
1253
def OpcodeCompressors():
  """Return "register in opcode" compressors.

  For every register kind (plus 'st(0)') up to three register sets are
  handled: all eight registers, registers 0-3 and 6-7, and registers 0-6.
  For each set two compressors are created: one matching an opcode whose last
  hex digit is 1 (digits 0..7 are substituted) and one matching an opcode whose
  last hex digit is 8 (digits 8..f are substituted).  In 64 bit mode the 'eax'
  and 'r8d' kinds get two more compressors which also rewrite output_rr.

  The original code wrapped all of this in a loop over eight opcode values
  whose variable was never used, so every compressor was produced eight times;
  each compressor is now produced once.
  """
  compressors = []

  for reg in RegisterKinds() + ('st(0)',):
    for text1, text2, nibble in (
        ('[0..7]', '[8..f]', xrange(8)),
        ('[012367]', '[89abef]', (0, 1, 2, 3, 6, 7)),
        ('[0..6]', '[8..e]', xrange(7))
    ):
      start_reg = REGISTERS[reg][0]
      end_reg = REGISTERS[reg][-1]
      mark_reg = '%{}..%{}'.format(start_reg, end_reg)
      if 7 not in nibble:
        # The set without register 7 is only useful when protecting the
        # interval actually changes it; otherwise skip the variant.
        protected_mark = ProtectBaseRegister(mark_reg)
        if protected_mark == mark_reg:
          continue
        mark_reg = protected_mark
      subst_reg = '[{}]'.format(mark_reg)
      # Note that we use instruction with 1st register, not 0th register
      # here to avoid ambiguity when opcode is 0x00
      compressors.append(Compressor(
          r'.*?[0-9a-fA-F](1)(?: 00)*'
          r' \w* (?:\$0x0,|%ax,|%st,)?'
          r'(%' + REGISTERS_RE[reg][1] + ').*',
          (text1, subst_reg),
          [('{:x}'.format(n), '%' + REGISTERS[reg][n])
           for n in nibble]))
      compressors.append(Compressor(
          r'.*?[0-9a-fA-F](8)(?: 00)*'
          r' \w* (?:\$0x0,|%ax,|%st,)?'
          r'(%' + REGISTERS_RE[reg][0] + ').*',
          (text2, subst_reg),
          [('{:x}'.format(n + 8), '%' + REGISTERS[reg][n])
           for n in nibble]))
      # Another version for 64 bit case
      if options.bitness == 64 and reg in ('eax', 'r8d'):
        compressors.append(Compressor(
            r'.*?[0-9a-fA-F](1)(?: 00)*'
            r' \w* (?:\$0x0,|%ax,|%st,)?'
            r'(%' + REGISTERS_RE[reg][1] + ').*'
            r'output_rr=(%'+ REGISTERS_RE[reg][1] + ').*',
            (text1, subst_reg, subst_reg),
            [['{:x}'.format(n)] + ['%' + REGISTERS[reg][n]] * 2
             for n in nibble]))
        compressors.append(Compressor(
            r'.*?[0-9a-fA-F](8)(?: 00)*'
            r' \w* (?:\$0x0,|%ax,|%st,)?'
            r'(%' + REGISTERS_RE[reg][0] + ').*'
            r'output_rr=(%'+ REGISTERS_RE[reg][0] + ').*',
            (text2, subst_reg, subst_reg),
            [['{:x}'.format(n + 8)] + ['%' + REGISTERS[reg][n]] * 2
             for n in nibble]))
  return compressors
1309
1310
def MemoryNonMemoryCompressors():
  """ Return memory/nonmemory compressors.

  Each compressor merges a "[memory]" line (instruction name carrying a size
  suffix letter) with the matching register line (no suffix letter) into one
  "[... or memory]" line with an optional suffix ("[b]?", "[w]?", ...).
  """

  compressors = []

  # Tuples are (suffix letter, register kind, register kind used for
  # output_rr).  The third element is unused in 32 bit mode.
  if options.bitness == 32:
    letters_and_registers = (('b', 'al', ''), ('w', 'ax', ''), ('l', 'eax', ''))
  else:
    letters_and_registers = (
        ('b', 'al', 'eax'), ('b', 'spl', 'eax'), ('b', 'r8b', 'r8d'),
        ('w', 'ax', 'eax'), ('w', 'r8w', 'r8d'),
        ('l', 'eax', 'eax'), ('l', 'r8d', 'r8d'),
        ('q', 'rax', 'eax'), ('q', 'r8', 'r8d')
    )
  for letter, reg, out_reg in letters_and_registers:
    start_reg = REGISTERS[reg][0]
    end_reg = REGISTERS[reg][-1]
    # Two passes: full register interval first, protected interval second.
    for rmR15 in (True, False):
      rm_mark = '%{}..%{}'.format(start_reg, end_reg)
      if not rmR15:
        rm_mark = ProtectBaseRegister(rm_mark)
      all_regs = '[{}]'.format(rm_mark)
      regs_mark = '[{} or memory]'.format(rm_mark)
      if options.bitness == 64:
        start_out = REGISTERS[out_reg][0]
        end_out = REGISTERS[out_reg][-1]
        mark_out = '%{}..%{}'.format(start_out, end_out)
        out_regs = '[{}]'.format(ProtectBaseRegister(mark_out))
      # The register line may or may not carry a direction note.
      for notes in ('', ' # rm to reg', ' # reg to rm'):
        # Variant without input_rr/output_rr fields.
        compressors.append(Compressor(
            r'.* \w*(' + letter + r') .*(\[memory]).*()',
            ('[{}]?'.format(letter), regs_mark, ''),
            ((letter, '[memory]', ''), ('', all_regs, notes))))
        if options.bitness == 64:
          for index_reg in ('eax', 'r8d'):
            start_index = REGISTERS[index_reg][0]
            end_index = REGISTERS[index_reg][-1]
            index_mark = '%{}..%{}'.format(start_index, end_index)
            index_regs = '[{}]'.format(ProtectIndexRegister(index_mark))
            # Pairs are (output_rr of the memory line, output_rr of the
            # register line); the merged line shows whichever is not None.
            for output_rrs in ((None, out_regs),
                               (out_regs, None),
                               (None, None)):
              compressors.append(Compressor(
                  r'.* \w*(' + letter + r') .*(\[memory]).*; '
                  r'input_rr=(\[(?:%[a-z0-9]*(?:\.\.|\|))+%[a-z0-9]*]); '
                  r'output_rr=(\[(?:%[a-z0-9]*(?:\.\.|\|))+%[a-z0-9]*]|None)()',
                  ('[{}]?'.format(letter), regs_mark, index_regs,
                   output_rrs[1] if output_rrs[0] is None else output_rrs[0],
                   ''),
                  ((letter, '[memory]', index_regs, output_rrs[0], ''),
                   ('', all_regs, 'any_nonspecial', output_rrs[1], notes))))
  return compressors
1363
1364
def RexCompressor(rm, rmR15, reg, regR15, rexw, input_rr, output_rr, rm_to_reg):
  """ Return REX compressor for a given set of parameters.

  Args:
    rm: rm operand kind (see REGISTERS array) or None if rm is not used
    reg: reg operand kind (see REGISTERS array) or None if reg is not used
    rmR15: True if R15 register is included
    regR15: True if R15 register is included
    rexw: True if REX.W should be set
    input_rr: True if input_rr is used
    output_rr: True if output_rr is used
    rm_to_reg: True if direction is rm to reg

  Returns:
    A Compressor whose replacement cases enumerate the REX bytes 48..4f when
    rexw is set, or 40..47 plus "no REX byte" otherwise.
  """

  def MakeRegex(string):
    # Escape the two characters of interval marks which are special in a regex.
    return string.replace('.', r'\.').replace('|', r'\|')

  # reg and rm can be of three different possible intervals
  #   start_reg/rm to end_reg/rm (e.g. from %al to %bh)
  #   start_reg/rm to end_reg0/rm0 (e.g from %al to %dil)
  #   start_reg8/rm8 to end_reg8/rm8 (e.g. from %r8b to %r15b)
  # First form can be observed if there are not REX, second form
  # if REX.R/REX.B is not set and third form is where it's set.
  if reg:
    start_reg = REGISTERS[reg][0]
    start_reg8 = REGISTERS[r8[reg]][0]
    end_reg = REGISTERS[reg][-1]
    end_reg0 = 'dil' if reg == 'al' else end_reg
    end_reg8 = REGISTERS[r8[reg]][-1]
    if rexw:
      mark_reg = '%{}..%{}'.format(start_reg, end_reg0)
    else:
      mark_reg = '%{}..%{}'.format(start_reg, end_reg)
    if not regR15:
      mark_reg = ProtectBaseRegister(mark_reg)
    regex_reg = r'\[({})]'.format(MakeRegex(mark_reg))
  if rm:
    start_rm = REGISTERS[rm][0]
    start_rm8 = REGISTERS[r8[rm]][0]
    end_rm = REGISTERS[rm][-1]
    end_rm0 = 'dil' if rm == 'al' else end_rm
    end_rm8 = REGISTERS[r8[rm]][-1]
    if rexw:
      mark_rm = '%{}..%{}'.format(start_rm, end_rm0)
    else:
      mark_rm = '%{}..%{}'.format(start_rm, end_rm)
    if not rmR15:
      mark_rm = ProtectBaseRegister(mark_rm)
    regex_rm = r'\[({})(?: or memory)?]'.format(MakeRegex(mark_rm))

  # Legacy prefixes
  regex = '.*:(?: 26| 2e| 36| 3e| 64| 65| 66| 67| f0| f2| f3)*'
  # REX
  regex += '( 48).*' if rexw else '( 40|).*'
  # Replacement text
  replacement_tuple = (' [REX:48..4f]' if rexw else ' [REX:40..47]?', )
  # Intervals shown in the compressed line span from the first "low" register
  # to the last "r8 family" register.
  if reg:
    replacement_regs = '%{}..%{}'.format(start_reg, end_reg8)
    if not regR15:
      replacement_regs = ProtectBaseRegister(replacement_regs)
  if rm:
    replacement_rms = '%{}..%{}'.format(start_rm, end_rm8)
    if not rmR15:
      replacement_rms = ProtectBaseRegister(replacement_rms)
  # Instruction arguments
  if not reg and not rm:
    pass
  elif not reg and rm:
    regex += regex_rm + '.*'
    replacement_tuple += (replacement_rms, )
  elif reg and not rm:
    regex += regex_reg + '.*'
    replacement_tuple += (replacement_regs, )
  elif rm_to_reg:
    regex += regex_rm + ',' + regex_reg + '.*'
    replacement_tuple += (replacement_rms, replacement_regs)
  else:
    regex += regex_reg + ',' + regex_rm + '.*'
    replacement_tuple += (replacement_regs, replacement_rms)
  # Input and output restricted registers
  if input_rr:
    regex += r'input_rr=\[({})].*'.format(
        MakeRegex(ProtectIndexRegister('%eax..%edi')))
    replacement_tuple += (ProtectIndexRegister('%eax..%r15d'), )
  if output_rr:
    regex += r'output_rr=\[({})].*'.format(
        MakeRegex(ProtectIndexRegister('%eax..%edi')))
    replacement_tuple += (ProtectBaseRegister('%eax..%r15d'), )
  # Replacement cases
  replacement_tuples = ()
  # byte == 0 stands for "no REX prefix at all" (non-REX.W case only).
  for byte in (range(0x48, 0x50) if rexw else range(0x40, 0x48) + [0]):
    replacement_case = (' {:02x}'.format(byte) if byte else '', )
    if rm:
      # Bit 0 of the REX byte selects the r8 family for rm.
      if byte & 0x1:
        replacement_rms = '%{}..%{}'.format(start_rm8, end_rm8)
      elif byte:
        replacement_rms = '%{}..%{}'.format(start_rm, end_rm0)
      else:
        replacement_rms = '%{}..%{}'.format(start_rm, end_rm)
      if not rmR15:
        replacement_rms = ProtectBaseRegister(replacement_rms)
    # Bit 1 of the REX byte selects the r8 family for the index register.
    if byte & 0x2:
      replacement_index = '%r8d..%r15d'
    else:
      replacement_index = '%eax..%edi'
    if reg:
      # Bit 2 of the REX byte selects the r8 family for reg.
      if byte & 0x4:
        replacement_regs = '%{}..%{}'.format(start_reg8, end_reg8)
      elif byte:
        replacement_regs = '%{}..%{}'.format(start_reg, end_reg0)
      else:
        replacement_regs = '%{}..%{}'.format(start_reg, end_reg)
      if not regR15:
        replacement_regs = ProtectBaseRegister(replacement_regs)
    # final_rr follows the operand that is written last in the text: reg for
    # "rm,reg" order, rm for "reg,rm" order.  It is not set when neither reg
    # nor rm is given.
    if not reg and not rm:
      pass
    elif not reg and rm:
      replacement_case += (replacement_rms, )
      final_rr = '%r8d..%r15d' if byte & 0x1 else '%eax..%edi'
    elif reg and not rm:
      replacement_case += (replacement_regs, )
      final_rr = '%r8d..%r15d' if byte & 0x4 else '%eax..%edi'
    elif rm_to_reg:
      replacement_case += (replacement_rms, replacement_regs)
      final_rr = '%r8d..%r15d' if byte & 0x4 else '%eax..%edi'
    else:
      replacement_case += (replacement_regs, replacement_rms)
      final_rr = '%r8d..%r15d' if byte & 0x1 else '%eax..%edi'
    if input_rr: replacement_case += (ProtectIndexRegister(replacement_index), )
    if output_rr: replacement_case += (ProtectBaseRegister(final_rr), )
    replacement_tuples += (replacement_case, )
  return Compressor(regex, replacement_tuple, replacement_tuples)
1497
1498
def RexCompressors():
  """Return "REX" compressors which combine different REX prefixes.

  Returns an empty list unless options.bitness is 64.  Otherwise one
  RexCompressor is created for every useful combination of a (reg, rm) kind
  pair with the regR15, rmR15, rexw, input_rr, output_rr and rm_to_reg flags,
  followed by one final compressor which merges lines that differ only in
  REX.W.

  The pair table used to list the four GP/MMX pairs twice, which produced
  duplicates of the very same compressors; each pair is now listed once.
  """

  if options.bitness != 64:
    return []

  compressors = []

  # First pretty complex set of compressors to combine versions of REX with
  # three lowest bits in different states.
  register_kind_pairs = (
      (None, None),
      ('al', 'al'), ('al', None), (None, 'al'),
      ('ax', 'al'), ('al', 'ax'),
      ('ax', 'ax'), ('ax', None), (None, 'ax'),
      ('eax', 'al'), ('al', 'eax'),
      ('eax', 'ax'), ('ax', 'eax'),
      ('eax', 'eax'), ('eax', None), (None, 'eax'),
      ('rax', 'al'), ('al', 'rax'),
      ('rax', 'ax'), ('ax', 'rax'),
      ('rax', 'eax'), ('eax', 'rax'),
      ('rax', 'rax'), ('rax', None), (None, 'rax'),
      ('eax', 'mm0'), ('mm0', 'eax'),
      ('rax', 'mm0'), ('mm0', 'rax'),
      ('eax', 'xmm0'),
      ('rax', 'xmm0'),
      ('xmm0', 'eax'),
      ('xmm0', 'rax'),
      ('mm0', 'mm0'), ('mm0', None), (None, 'mm0'),
      ('mm0', 'xmm0'),
      ('xmm0', 'mm0'),
      ('xmm0', 'xmm0'),
      ('xmm0', 'ymm0'), ('xmm0', None), (None, 'xmm0'),
      ('ymm0', 'xmm0'),
      ('ymm0', 'ymm0'), ('ymm0', None), (None, 'ymm0'),
  )
  for reg, rm in register_kind_pairs:
    for regR15 in (True, False):
      # Without reg the regR15 flag changes nothing: keep only one value.
      if not reg and not regR15:
        continue
      for rmR15 in (True, False):
        # Without rm the rmR15 flag changes nothing: keep only one value.
        if not rm and not rmR15:
          continue
        for rexw in (True, False):
          for input_rr in (True, False):
            for output_rr in (True, False):
              for rm_to_reg in (True, False):
                # Without any operand there is neither an output register
                # nor a direction: these would be useless duplicates.
                if not reg and not rm and (output_rr or rm_to_reg):
                  continue
                compressors.append(RexCompressor(
                    rm=rm, rmR15=rmR15, reg=reg, regR15=regR15,
                    rexw=rexw, input_rr=input_rr, output_rr=output_rr,
                    rm_to_reg=rm_to_reg))

  # This is pretty simple compressor to combine two lines with different REX.W
  # bits (only if they are otherwise identical).
  compressors.append(Compressor(
      r'.*(\[REX:40\.\.47]\?).*', ('[REX:40..4f]?', ),
      (('[REX:40..47]?', ), ('[REX:48..4f]', ))))
  return compressors
1559
1560
def SpecialCompressors():
  """ Return all "special" compressors.

  These handle a handful of individual cases (SETxx, BSWAP, the
  "write to both" mark, and two families of AND with an immediate) which the
  generic compressors do not present well.
  """

  compressors = []

  # Special compressors: will handle some cosmetic issues.
  #
  # SETxx ignores reg field and thus are described as many separate instructions
  compressors.append(Compressor(
      '.*0f 9[0-9a-fA-F] XX(/[0-7]) set.*', ('', ),
      [('/' + str(i), ) for i in range(8)]))
  # BSWAP is described with opcode "0f c8+r", not "0f /1" in manual
  if options.bitness == 32:
    compressors.append(Compressor(
        '.*(XX/1) bswap.*ax.*', ('c[8..f]', ), (('XX/1', ), )))
  else:
    compressors.append(Compressor(
        '.*(XX/1) bswap.*ax.*', ('c[89abef]', ), (('XX/1', ), )))
    compressors.append(Compressor(
        '.*(XX/1) bswap.*r8.*', ('c[8..e]', ), (('XX/1', ), )))
  # Add mark '# write to both' to certain versions of CMPXCHG, XADD, and XCHG
  if options.bitness == 64:
    compressors.append(Compressor(
        r'.* (?:cmpxchg|xadd|xchg).*%al\.\.%bh[^#]*()$',
        (' # write to both', ),
        (('', ), )))
  # "and $0xe0,[%eax..%edi]" is treated specially which means that we list all
  # versions of and "[$0x1..$0xff],[%eax..%edi]" separately here.
  # Without this rule these ands comprise 2/3 of the whole output!
  if options.bitness == 32:
    # The replacement list holds every immediate 0x01..0xff for each of the
    # eight registers plus the already compressed "$0x0" line.
    compressors.append(Compressor(
        r'.*83 (e0 01 and \$0x1,%eax)()',
        ('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]',
         ' # special and'),
        [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS['eax'][r]),
          '')
         for i in range(0x01, 0x100) for r in range(8)] +
        [('XX/4 00 and[l]? $0x0,[%eax..%edi or memory]',
          '')]))
  else:
    for reg in ('eax', 'r8d'):
      start_reg = REGISTERS[reg][0]
      end_reg = REGISTERS[reg][-1]
      mark_reg = '%{}..%{}'.format(start_reg, end_reg)
      mark_reg = ProtectBaseRegister(mark_reg)
      for index_reg in ('eax', 'r8d'):
        start_index = REGISTERS[index_reg][0]
        end_index = REGISTERS[index_reg][-1]
        mark_index = '%{}..%{}'.format(start_index, end_index)
        mark_index = ProtectIndexRegister(mark_index)
        # Eight registers are enumerated for 'eax' and seven for 'r8d'
        # (the boolean in range() below counts as 1 or 0).
        compressors.append(Compressor(
            r'.*83 (e0 01 and \$0x1,%' + reg + ').*'
            'input_rr=(any_nonspecial); output_rr=(%' + reg + ')()',
            ('XX/4 00 and[l]? $0x0,[{} or memory]'.format(mark_reg),
             '[{}]'.format(mark_index),
             '[{}]'.format(mark_reg),
             ' # special and'),
            [('e{} {:02x} and $0x{:x},%{}'.format(r, i, i, REGISTERS[reg][r]),
              'any_nonspecial', '%' + REGISTERS[reg][r], '')
             for i in range(0x01, 0x100) for r in range(7 + (reg == 'eax'))] +
            [('XX/4 00 and[l]? $0x0,[{} or memory]'.format(mark_reg),
              '[{}]'.format(mark_index),
              '[{}]'.format(mark_reg),
              '')]))

  # "and $e0" and similar are used to align %rsp. All negative values are
  # accepted by validator and there are 127 of these.
  # Consolidate them into one line.
  if options.bitness == 64:
    compressors.append(Compressor(
        r'.*(?:81|83) (?:e4|e5) (80) (?:00 00 00 |) and \$0x(80),%r[bs]p.*()',
        ('[80..ff]', '[80..ff]', ' # alignment and'),
        [('{:02x}'.format(i), '{:02x}'.format(i), '')
         for i in range(0x80, 0x100)]))
  return compressors
1636
1637
def PrepareCompressors():
  """Gather all compressors, ordered from the biggest to the smallest.

  The "size" of a compressor is the number of entries in its `replacements`.

  Returns:
    A tuple holding the compressors produced by every *Compressors() factory,
    those with the most replacements first. Compressors of equal size keep
    the relative order in which the factories produced them.
  """
  every_compressor = (
      RegToRmCompressors() +
      RmCompressors() +
      OpcodeCompressors() +
      MemoryNonMemoryCompressors() +
      RexCompressors() +
      SpecialCompressors())

  # sorted() stays stable with reverse=True, so ties are ordered exactly as
  # they would be by an ascending sort on the negated size.
  by_size = sorted(every_compressor,
                   key=lambda compressor: len(compressor.replacements),
                   reverse=True)
  return tuple(by_size)
1649
1650
def ShowProgress(rule, instruction):
  """Print to stderr a report about one applied compression rule.

  Args:
    rule: index of the applied compressor in the global `compressors`.
    instruction: text the compressor's regex is matched against; the match
        has to succeed.

  Rules are numbered in the order they are first reported. The first report
  for a rule lists every replacement; later reports list them all only when
  the compressor has at most four, otherwise just the first and the last one.
  """
  shown = ShowProgress.rules_shown
  first_print = rule not in shown
  if first_print:
    # Number rules by the order in which they are first reported.
    shown[rule] = len(shown)

  print >> sys.stderr, '-------- Compressed --------'
  print >> sys.stderr, 'Rule:', shown[rule]
  print >> sys.stderr, '--------'

  compressor = compressors[rule]
  match = compressor.regex.match(instruction)
  assert match
  format_str = CompressionTemplate(instruction, match, '{{{}}}')
  replacements = sorted([format_str.format(*variant)
                         for variant in compressor.replacements])

  if first_print or len(compressor.replacements) <= 4:
    lines_to_show = replacements
  else:
    # Long list which was already shown in full once: abbreviate it.
    lines_to_show = [replacements[0], '...', replacements[-1]]
  for line in lines_to_show:
    print >> sys.stderr, line

  print >> sys.stderr, '--------'
  print >> sys.stderr, 'Compressed', format_str.format(*compressor.subst)
# Maps rule -> sequence number assigned when the rule was first reported.
ShowProgress.rules_shown = {}
1676
1677
1678 def main():
1679 # We are keeping these global to share state graph and compressors
1680 # between workers spawned by multiprocess. Passing them every time is slow.
1681 global options, xml_file
1682 global dfa
1683 global worker_validator
1684 options, xml_file = ParseOptions()
1685 dfa = dfa_parser.ParseXml(xml_file)
1686 worker_validator = validator.Validator(
1687 validator_dll=options.validator_dll,
1688 decoder_dll=options.decoder_dll)
1689 global compressors
1690 compressors = PrepareCompressors()
1691
1692 assert dfa.initial_state.is_accepting
1693 assert not dfa.initial_state.any_byte
1694
1695 print >> sys.stderr, len(dfa.states), 'states'
1696
1697 num_suffixes = dfa_traversal.GetNumSuffixes(dfa.initial_state)
1698
1699 # We can't just write 'num_suffixes[dfa.initial_state]' because
1700 # initial state is accepting.
1701 total_instructions = sum(
1702 num_suffixes[t.to_state]
1703 for t in dfa.initial_state.forward_transitions.values())
1704 print >> sys.stderr, total_instructions, 'regular instructions total'
1705
1706 tasks = dfa_traversal.CreateTraversalTasks(dfa.states, dfa.initial_state)
1707 print >> sys.stderr, len(tasks), 'tasks'
1708
1709 pool = multiprocessing.Pool()
1710
1711 results = pool.imap(Worker, tasks)
1712
1713 total = 0
1714 num_valid = 0
1715 full_output = set()
1716 for prefix, count, valid_count, output, trace in results:
1717 print >> sys.stderr, 'Prefix:', ', '.join(map(hex, prefix))
1718 total += count
1719 num_valid += valid_count
1720 full_output |= output
1721 for rule, instruction in trace:
1722 ShowProgress(rule, instruction)
1723 for instruction in sorted(Compressed(full_output,
1724 compressors,
1725 ShowProgress)):
1726 print instruction
1727
1728 print >> sys.stderr, total, 'instructions were processed'
1729 print >> sys.stderr, num_valid, 'valid instructions'
1730
1731
# Run the tool only when executed as a script, not when imported as a module
# (e.g. by the worker processes of the multiprocessing pool).
if __name__ == '__main__':
  main()
OLDNEW
« no previous file with comments | « src/trusted/validator_ragel/build.scons ('k') | src/trusted/validator_ragel/testdata/32bit_regular.golden » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698