Index: src/ia32/codegen-ia32.cc
diff --git a/src/ia32/codegen-ia32.cc b/src/ia32/codegen-ia32.cc
index 7d8116dcb68c3f74a4e3433005f98b8e25097f41..e8b56f7b2f39dec63d7bfcc732188c4ab40b8853 100644
--- a/src/ia32/codegen-ia32.cc
+++ b/src/ia32/codegen-ia32.cc
@@ -13494,6 +13494,197 @@ void StringCompareStub::Generate(MacroAssembler* masm) { |
__ TailCallRuntime(Runtime::kStringCompare, 2, 1); |
} |
+ |
+MemCopyFunction CreateMemCopyFunction() { |
+ size_t actual_size; |
+ byte* buffer = static_cast<byte*>(OS::Allocate(Assembler::kMinimalBufferSize, |
+ &actual_size, |
+ true)); |
+ CHECK(buffer); |
+ HandleScope handles; |
+ MacroAssembler assembler(buffer, static_cast<int>(actual_size)); |
Erik Corry
2010/06/03 20:29:49
Might as well just call this 'masm'?
Lasse Reichstein
2010/06/04 11:52:13
True. Just have to redefine __ as well.
|
+ MacroAssembler* masm = &assembler; // For the __ macro. |
+ |
+ // Generated code is put into a fixed, unmovable, buffer, and not into |
+ // the V8 heap. We can't, and don't, refer to any relocatable addresses |
+ // (e.g. the JavaScript nan-object). |
+ |
+ // 32-bit C declaration function calls pass arguments on stack. |
+ |
+ // Stack layout: |
+ // esp[12]: Third argument, size. |
+ // esp[8]: Second argument, source pointer. |
+ // esp[4]: First argument, destination pointer. |
+ // esp[0]: return address |
+ |
+ const int kDestinationOffset = 1 * kPointerSize; |
+ const int kSourceOffset = 2 * kPointerSize; |
+ const int kSizeOffset = 3 * kPointerSize; |
+ |
+ int stack_offset = 0; // Update if we change the stack height. |
+ |
+ if (FLAG_debug_code) { |
+ __ cmp(Operand(esp, kSizeOffset + stack_offset), |
+ Immediate(kMinComplexMemCopy)); |
+ Label ok; |
+ __ j(greater_equal, &ok); |
+ __ int3(); |
+ __ bind(&ok); |
+ } |
+ if (CpuFeatures::IsSupported(SSE2)) { |
+ CpuFeatures::Scope enable(SSE2); |
+ __ push(edi); |
+ __ push(esi); |
+ stack_offset += 2 * kPointerSize; |
+ __ mov(edi, Operand(esp, stack_offset + kDestinationOffset)); |
+ __ mov(esi, Operand(esp, stack_offset + kSourceOffset)); |
+ __ mov(ecx, Operand(esp, stack_offset + kSizeOffset)); |
Erik Corry
2010/06/04 07:18:10
I think the clarity of this code would benefit fro
Lasse Reichstein
2010/06/04 11:52:13
Done.
|
+ |
+ __ movdqu(xmm0, Operand(esi, 0)); |
+ __ movdqu(Operand(edi, 0), xmm0); |
+ __ mov(edx, edi); |
+ __ and_(edx, 0x0F); |
Erik Corry
2010/06/04 07:18:10
0x0F -> 0xF
Lasse Reichstein
2010/06/04 11:52:13
Done.
|
+ __ neg(edx); |
+ __ add(Operand(edx), Immediate(16)); |
+ __ add(edi, Operand(edx)); |
+ __ add(esi, Operand(edx)); |
+ __ sub(Operand(ecx), edx); |
Erik Corry
2010/06/04 07:18:10
Where do the bytes you skipped over here get copie
Lasse Reichstein
2010/06/04 11:52:13
They were copied just before. I only increase src/
|
+ |
+ // edi is now aligned. Check if esi is also aligned. |
+ Label unaligned_source; |
+ __ test(Operand(esi), Immediate(0x0F)); |
+ __ j(not_zero, &unaligned_source); |
+ { |
+ __ IncrementCounter(&Counters::memcopy_aligned, 1); |
+ // Copy loop for aligned source and destination. |
+ __ mov(edx, ecx); |
+ __ shr(ecx, 5); |
+ { |
+ // Main copy loop. |
+ Label loop; |
+ __ bind(&loop); |
+ __ prefetch(Operand(esi, 0x20), 1); |
+ __ movdqa(xmm0, Operand(esi, 0x00)); |
Erik Corry
2010/06/04 07:18:10
Apart from the dqa/dqu this seems to be duplicated
Lasse Reichstein
2010/06/04 11:52:13
Correct.
This is the fast case code where source i
|
+ __ movdqa(xmm1, Operand(esi, 0x10)); |
+ __ add(Operand(esi), Immediate(0x20)); |
+ |
+ __ movdqa(Operand(edi, 0x00), xmm0); |
+ __ movdqa(Operand(edi, 0x10), xmm1); |
+ __ add(Operand(edi), Immediate(0x20)); |
+ |
+ __ dec(ecx); |
+ __ j(not_zero, &loop); |
+ } |
+ |
+ // At most 31 bytes to copy. |
Erik Corry
2010/06/04 07:18:10
This code seems to be duplicated lower down.
Lasse Reichstein
2010/06/04 11:52:13
Not identically, the second copy uses movdqu for t
|
+ Label move_less_16; |
+ __ test(Operand(edx), Immediate(0x10)); |
+ __ j(zero, &move_less_16); |
+ __ movdqa(xmm0, Operand(esi, 0)); |
+ __ add(Operand(esi), Immediate(0x10)); |
+ __ movdqa(Operand(edi, 0), xmm0); |
+ __ add(Operand(edi), Immediate(0x10)); |
+ __ bind(&move_less_16); |
+ |
+ // At most 15 bytes to copy. Copy 16 bytes at end of string. |
+ __ and_(edx, 0x0F); |
+ __ movdqu(xmm0, Operand(esi, edx, times_1, -16)); |
+ __ movdqu(Operand(edi, edx, times_1, -16), xmm0); |
+ |
+ __ pop(esi); |
+ __ pop(edi); |
+ __ ret(0); |
+ } |
+ __ Align(16); |
+ { |
+ // Copy loop for unaligned source and aligned destination. |
+ // If source is not aligned, we can't read it as efficiently. |
+ __ bind(&unaligned_source); |
+ __ IncrementCounter(&Counters::memcopy_unaligned, 1); |
+ __ mov(edx, ecx); |
+ __ shr(ecx, 5); |
+ { |
+ // Main copy loop |
+ Label loop; |
+ __ bind(&loop); |
+ __ prefetch(Operand(esi, 0x20), 1); |
+ __ movdqu(xmm0, Operand(esi, 0x00)); |
+ __ movdqu(xmm1, Operand(esi, 0x10)); |
+ __ add(Operand(esi), Immediate(0x20)); |
+ |
+ __ movdqa(Operand(edi, 0x00), xmm0); |
+ __ movdqa(Operand(edi, 0x10), xmm1); |
+ __ add(Operand(edi), Immediate(0x20)); |
+ |
+ __ dec(ecx); |
+ __ j(not_zero, &loop); |
+ } |
+ |
+ // At most 31 bytes to copy. |
+ Label move_less_16; |
+ __ test(Operand(edx), Immediate(0x10)); |
+ __ j(zero, &move_less_16); |
+ __ movdqu(xmm0, Operand(esi, 0)); |
+ __ add(Operand(esi), Immediate(0x10)); |
+ __ movdqa(Operand(edi, 0), xmm0); |
+ __ add(Operand(edi), Immediate(0x10)); |
+ __ bind(&move_less_16); |
+ |
+ // At most 15 bytes to copy. Copy 16 bytes at end of string. |
+ __ and_(edx, 0x0F); |
+ __ movdqu(xmm0, Operand(esi, edx, times_1, -0x10)); |
+ __ movdqu(Operand(edi, edx, times_1, -0x10), xmm0); |
+ |
+ __ pop(esi); |
+ __ pop(edi); |
+ __ ret(0); |
+ } |
+ |
+ } else { |
+ __ IncrementCounter(&Counters::memcopy_noxmm, 1); |
+ // SSE2 not supported. Unlikely to happen in practice. |
+ __ push(edi); |
+ __ push(esi); |
+ stack_offset += 2 * kPointerSize; |
+ __ cld(); |
+ __ mov(edi, Operand(esp, stack_offset + kDestinationOffset)); |
+ __ mov(esi, Operand(esp, stack_offset + kSourceOffset)); |
+ __ mov(ecx, Operand(esp, stack_offset + kSizeOffset)); |
+ |
+ // Copy the first word. |
+ __ mov(eax, Operand(esi, 0)); |
+ __ mov(Operand(edi, 0), eax); |
+ |
+ // Increment esi,edi so that edi is aligned. |
+ __ mov(edx, edi); |
+ __ and_(edx, 0x03); |
+ __ neg(edx); |
+ __ add(Operand(edx), Immediate(4)); // edx = 4 - (edi & 3) |
+ __ add(edi, Operand(edx)); |
+ __ add(esi, Operand(edx)); |
+ __ sub(Operand(ecx), edx); |
+ // edi is now aligned, ecx holds number of remaning bytes to copy. |
+ __ mov(edx, ecx); |
+ __ shr(ecx, 2); // Make word count instead of byte count. |
+ |
+ __ rep_movs(); |
+ |
+ // At most 3 bytes left to copy. Copy 4 bytes at end of string. |
+ __ and_(edx, 3); |
+ __ mov(eax, Operand(esi, edx, times_1, -4)); |
+ __ mov(Operand(edi, edx, times_1, -4), eax); |
+ |
+ __ pop(esi); |
+ __ pop(edi); |
+ __ ret(0); |
+ } |
+ |
+ CodeDesc desc; |
+ assembler.GetCode(&desc); |
+ // Call the function from C++. |
+ return FUNCTION_CAST<MemCopyFunction>(buffer); |
+} |
+ |
#undef __ |
} } // namespace v8::internal |