Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/ia32/codegen-ia32.cc

Issue 2582001: Add optimized version of memcpy on ia32. (Closed)
Patch Set: Created 10 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2010 the V8 project authors. All rights reserved. 1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 13476 matching lines...) Expand 10 before | Expand all | Expand 10 after
13487 13487
13488 // Compare flat ascii strings. 13488 // Compare flat ascii strings.
13489 GenerateCompareFlatAsciiStrings(masm, edx, eax, ecx, ebx, edi); 13489 GenerateCompareFlatAsciiStrings(masm, edx, eax, ecx, ebx, edi);
13490 13490
13491 // Call the runtime; it returns -1 (less), 0 (equal), or 1 (greater) 13491 // Call the runtime; it returns -1 (less), 0 (equal), or 1 (greater)
13492 // tagged as a small integer. 13492 // tagged as a small integer.
13493 __ bind(&runtime); 13493 __ bind(&runtime);
13494 __ TailCallRuntime(Runtime::kStringCompare, 2, 1); 13494 __ TailCallRuntime(Runtime::kStringCompare, 2, 1);
13495 } 13495 }
13496 13496
13497
13498 MemCopyFunction CreateMemCopyFunction() {
13499 size_t actual_size;
13500 byte* buffer = static_cast<byte*>(OS::Allocate(Assembler::kMinimalBufferSize,
13501 &actual_size,
13502 true));
13503 CHECK(buffer);
13504 HandleScope handles;
13505 MacroAssembler assembler(buffer, static_cast<int>(actual_size));
Erik Corry 2010/06/03 20:29:49 Might as well just call this 'masm'?
Lasse Reichstein 2010/06/04 11:52:13 True. Just have to redefine __ as well.
13506 MacroAssembler* masm = &assembler; // For the __ macro.
13507
13508 // Generated code is put into a fixed, unmovable, buffer, and not into
13509 // the V8 heap. We can't, and don't, refer to any relocatable addresses
13510 // (e.g. the JavaScript nan-object).
13511
13512 // 32-bit C declaration (cdecl) function calls pass arguments on the stack.
13513
13514 // Stack layout:
13515 // esp[12]: Third argument, size.
13516 // esp[8]: Second argument, source pointer.
13517 // esp[4]: First argument, destination pointer.
13518 // esp[0]: return address
13519
13520 const int kDestinationOffset = 1 * kPointerSize;
13521 const int kSourceOffset = 2 * kPointerSize;
13522 const int kSizeOffset = 3 * kPointerSize;
13523
13524 int stack_offset = 0; // Update if we change the stack height.
13525
13526 if (FLAG_debug_code) {
13527 __ cmp(Operand(esp, kSizeOffset + stack_offset),
13528 Immediate(kMinComplexMemCopy));
13529 Label ok;
13530 __ j(greater_equal, &ok);
13531 __ int3();
13532 __ bind(&ok);
13533 }
13534 if (CpuFeatures::IsSupported(SSE2)) {
13535 CpuFeatures::Scope enable(SSE2);
13536 __ push(edi);
13537 __ push(esi);
13538 stack_offset += 2 * kPointerSize;
13539 __ mov(edi, Operand(esp, stack_offset + kDestinationOffset));
13540 __ mov(esi, Operand(esp, stack_offset + kSourceOffset));
13541 __ mov(ecx, Operand(esp, stack_offset + kSizeOffset));
Erik Corry 2010/06/04 07:18:10 I think the clarity of this code would benefit fro
Lasse Reichstein 2010/06/04 11:52:13 Done.
13542
13543 __ movdqu(xmm0, Operand(esi, 0));
13544 __ movdqu(Operand(edi, 0), xmm0);
13545 __ mov(edx, edi);
13546 __ and_(edx, 0x0F);
Erik Corry 2010/06/04 07:18:10 0x0F -> 0xF
Lasse Reichstein 2010/06/04 11:52:13 Done.
13547 __ neg(edx);
13548 __ add(Operand(edx), Immediate(16));
13549 __ add(edi, Operand(edx));
13550 __ add(esi, Operand(edx));
13551 __ sub(Operand(ecx), edx);
Erik Corry 2010/06/04 07:18:10 Where do the bytes you skipped over here get copie
Lasse Reichstein 2010/06/04 11:52:13 They were copied just before. I only increase src/
13552
13553 // edi is now aligned. Check if esi is also aligned.
13554 Label unaligned_source;
13555 __ test(Operand(esi), Immediate(0x0F));
13556 __ j(not_zero, &unaligned_source);
13557 {
13558 __ IncrementCounter(&Counters::memcopy_aligned, 1);
13559 // Copy loop for aligned source and destination.
13560 __ mov(edx, ecx);
13561 __ shr(ecx, 5);
13562 {
13563 // Main copy loop.
13564 Label loop;
13565 __ bind(&loop);
13566 __ prefetch(Operand(esi, 0x20), 1);
13567 __ movdqa(xmm0, Operand(esi, 0x00));
Erik Corry 2010/06/04 07:18:10 Apart from the dqa/dqu this seems to be duplicated
Lasse Reichstein 2010/06/04 11:52:13 Correct. This is the fast case code where source i
13568 __ movdqa(xmm1, Operand(esi, 0x10));
13569 __ add(Operand(esi), Immediate(0x20));
13570
13571 __ movdqa(Operand(edi, 0x00), xmm0);
13572 __ movdqa(Operand(edi, 0x10), xmm1);
13573 __ add(Operand(edi), Immediate(0x20));
13574
13575 __ dec(ecx);
13576 __ j(not_zero, &loop);
13577 }
13578
13579 // At most 31 bytes to copy.
Erik Corry 2010/06/04 07:18:10 This code seems to be duplicated lower down.
Lasse Reichstein 2010/06/04 11:52:13 Not identically, the second copy uses movdqu for t
13580 Label move_less_16;
13581 __ test(Operand(edx), Immediate(0x10));
13582 __ j(zero, &move_less_16);
13583 __ movdqa(xmm0, Operand(esi, 0));
13584 __ add(Operand(esi), Immediate(0x10));
13585 __ movdqa(Operand(edi, 0), xmm0);
13586 __ add(Operand(edi), Immediate(0x10));
13587 __ bind(&move_less_16);
13588
13589 // At most 15 bytes to copy. Copy 16 bytes at end of string.
13590 __ and_(edx, 0x0F);
13591 __ movdqu(xmm0, Operand(esi, edx, times_1, -16));
13592 __ movdqu(Operand(edi, edx, times_1, -16), xmm0);
13593
13594 __ pop(esi);
13595 __ pop(edi);
13596 __ ret(0);
13597 }
13598 __ Align(16);
13599 {
13600 // Copy loop for unaligned source and aligned destination.
13601 // If source is not aligned, we can't read it as efficiently.
13602 __ bind(&unaligned_source);
13603 __ IncrementCounter(&Counters::memcopy_unaligned, 1);
13604 __ mov(edx, ecx);
13605 __ shr(ecx, 5);
13606 {
13607 // Main copy loop
13608 Label loop;
13609 __ bind(&loop);
13610 __ prefetch(Operand(esi, 0x20), 1);
13611 __ movdqu(xmm0, Operand(esi, 0x00));
13612 __ movdqu(xmm1, Operand(esi, 0x10));
13613 __ add(Operand(esi), Immediate(0x20));
13614
13615 __ movdqa(Operand(edi, 0x00), xmm0);
13616 __ movdqa(Operand(edi, 0x10), xmm1);
13617 __ add(Operand(edi), Immediate(0x20));
13618
13619 __ dec(ecx);
13620 __ j(not_zero, &loop);
13621 }
13622
13623 // At most 31 bytes to copy.
13624 Label move_less_16;
13625 __ test(Operand(edx), Immediate(0x10));
13626 __ j(zero, &move_less_16);
13627 __ movdqu(xmm0, Operand(esi, 0));
13628 __ add(Operand(esi), Immediate(0x10));
13629 __ movdqa(Operand(edi, 0), xmm0);
13630 __ add(Operand(edi), Immediate(0x10));
13631 __ bind(&move_less_16);
13632
13633 // At most 15 bytes to copy. Copy 16 bytes at end of string.
13634 __ and_(edx, 0x0F);
13635 __ movdqu(xmm0, Operand(esi, edx, times_1, -0x10));
13636 __ movdqu(Operand(edi, edx, times_1, -0x10), xmm0);
13637
13638 __ pop(esi);
13639 __ pop(edi);
13640 __ ret(0);
13641 }
13642
13643 } else {
13644 __ IncrementCounter(&Counters::memcopy_noxmm, 1);
13645 // SSE2 not supported. Unlikely to happen in practice.
13646 __ push(edi);
13647 __ push(esi);
13648 stack_offset += 2 * kPointerSize;
13649 __ cld();
13650 __ mov(edi, Operand(esp, stack_offset + kDestinationOffset));
13651 __ mov(esi, Operand(esp, stack_offset + kSourceOffset));
13652 __ mov(ecx, Operand(esp, stack_offset + kSizeOffset));
13653
13654 // Copy the first word.
13655 __ mov(eax, Operand(esi, 0));
13656 __ mov(Operand(edi, 0), eax);
13657
13658 // Increment esi,edi so that edi is aligned.
13659 __ mov(edx, edi);
13660 __ and_(edx, 0x03);
13661 __ neg(edx);
13662 __ add(Operand(edx), Immediate(4)); // edx = 4 - (edi & 3)
13663 __ add(edi, Operand(edx));
13664 __ add(esi, Operand(edx));
13665 __ sub(Operand(ecx), edx);
13666 // edi is now aligned, ecx holds number of remaining bytes to copy.
13667 __ mov(edx, ecx);
13668 __ shr(ecx, 2); // Make word count instead of byte count.
13669
13670 __ rep_movs();
13671
13672 // At most 3 bytes left to copy. Copy 4 bytes at end of string.
13673 __ and_(edx, 3);
13674 __ mov(eax, Operand(esi, edx, times_1, -4));
13675 __ mov(Operand(edi, edx, times_1, -4), eax);
13676
13677 __ pop(esi);
13678 __ pop(edi);
13679 __ ret(0);
13680 }
13681
13682 CodeDesc desc;
13683 assembler.GetCode(&desc);
13684 // Call the function from C++.
13685 return FUNCTION_CAST<MemCopyFunction>(buffer);
13686 }
13687
13497 #undef __ 13688 #undef __
13498 13689
13499 } } // namespace v8::internal 13690 } } // namespace v8::internal
13500 13691
13501 #endif // V8_TARGET_ARCH_IA32 13692 #endif // V8_TARGET_ARCH_IA32
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698