Chromium Code Reviews

Unified Diff: src/arm/codegen-arm.cc

Issue 12920009: Use generated Neon version of MemCopy() on ARM, if platform supports it. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 7 years, 9 months ago
Index: src/arm/codegen-arm.cc
===================================================================
--- src/arm/codegen-arm.cc (revision 14076)
+++ src/arm/codegen-arm.cc (working copy)
@@ -114,10 +114,159 @@
#endif
}
+static void MemCopyWrapper(void* dest, const void* src, size_t size) {
+ memcpy(dest, src, size);
+}
+// Based on Bionic's memcpy.s.
+OS::MemCopyFunction CreateMemCopyFunction() {
+ size_t actual_size;
+ static const int kCacheLineSize = 64;
Rodolph Perfetta 2013/04/30 17:08:37 This is true on A8 and A15 but not A9.
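A hedged follow-up sketch, not part of the CL: Cortex-A9 has 32-byte cache lines, so if A9 is a target the conservative choice is the smaller stride, e.g.:

  static const int kCacheLineSize = 32;  // assumption: tuned for A9; A8/A15 use 64-byte lines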
+ static const int kPrefetchDistance = kCacheLineSize * 4;
+ // Allocate buffer in executable space.
+ byte* buffer = static_cast<byte*>(OS::Allocate(1 * KB,
+ &actual_size,
+ true));
+ if (buffer == NULL) return &MemCopyWrapper;
+ if (!CpuFeatures::IsSupported(NEON)) return &MemCopyWrapper;
+
+ MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size));
+
+ CpuFeatureScope use_neon(&masm, NEON);
+ Label less16, aligned16, aligned8, skip_copy8, skip_copy4,
+ fix_remainder, main_loop, has32, less32;
+
+ // ----------- S t a t e -------------
+ // -- r0 : dest
+ // -- r1 : src
+ // -- r2 : count
+ // -----------------------------------
+
+ __ push(lr);
+
+ // Start preloading as early as possible.
+ // TODO: consider using pldw for write preload, if it makes sense on the chip.
+ __ pld(r1, kCacheLineSize * 0);
+ __ pld(r1, kCacheLineSize * 1);
+
+#ifdef DEBUG
+ Label check_ok;
+ // Do we have at least 16 bytes to copy (needed for the alignment below)?
+ ASSERT(OS::kMinComplexMemCopy >= 16);
Rodolph Perfetta 2013/04/30 17:08:37 STATIC_ASSERT
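A minimal sketch of the reviewer's suggestion; STATIC_ASSERT is V8's compile-time check, so this requirement no longer needs to sit under #ifdef DEBUG:

  STATIC_ASSERT(OS::kMinComplexMemCopy >= 16);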
+ __ cmp(r2, Operand(OS::kMinComplexMemCopy));
+ __ b(&check_ok, hs);
+ __ bkpt(0);
+ __ bind(&check_ok);
+#endif
+
+ // Align destination to half cache-line for the write-buffer.
+ __ rsb(r3, r0, Operand(0));
+ __ and_(r3, r3, Operand(0xf), SetCC);
+ __ b(&aligned16, eq);
+
+ // Copy up to 15 bytes (count in r3).
+ __ sub(r2, r2, r3);
+ __ mov(ip, Operand(r3, LSL, 31), SetCC);
+ __ ldrb(lr, MemOperand(r1, 1, PostIndex), mi);
+ __ strb(lr, MemOperand(r0, 1, PostIndex), mi);
+ __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs);
+ __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs);
Rodolph Perfetta 2013/04/30 17:08:37 use ldrh instead of 2 ldrb. Same for stores.
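A hedged sketch of the suggested rewrite, assuming the assembler's conditional, post-indexed halfword forms; one ldrh/strh pair replaces the two conditional byte loads and the two conditional byte stores:

  __ ldrh(ip, MemOperand(r1, 2, PostIndex), cs);
  __ strh(ip, MemOperand(r0, 2, PostIndex), cs);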
+ __ strb(ip, MemOperand(r0, 1, PostIndex), cs);
+ __ strb(lr, MemOperand(r0, 1, PostIndex), cs);
+ __ mov(ip, Operand(r3, LSL, 29), SetCC);
+ __ b(&aligned8, ge);
+ // Copies 4 bytes, destination 32-bit aligned.
+ __ vld4(8, r1, d0, element_0, Writeback);
Rodolph Perfetta 2013/04/30 17:08:37 I am not sure why you are using vld4. Currently yo
+ __ vst4(8, r0, d0, element_0, Writeback, 32 / 8);
+ __ bind(&aligned8);
+ __ b(&aligned16, cc);
+ // Copies 8 bytes, destination 64-bit aligned.
+ __ vld1(8, r1, d0, d0, Writeback);
+ __ vst1(8, r0, d0, d0, Writeback, 64 / 8);
+
+ __ bind(&aligned16);
+ // Immediately preload the next cache line, which we may need.
+ __ pld(r1, kCacheLineSize * 0);
+ __ pld(r1, kCacheLineSize * 1);
+
+ // Make sure we have at least 64 bytes to copy.
+ __ sub(r2, r2, Operand(64), SetCC);
+ __ b(&fix_remainder, lo);
+
+ // Preload all the cache lines we need.
+ // NOTE: the number of pld below depends on kPrefetchDistance,
+ // ideally we would increase the distance in the main loop to
+ // avoid the goofy code below. In practice this doesn't seem to make
+ // a big difference.
+ __ pld(r1, kCacheLineSize * 2);
+ __ pld(r1, kCacheLineSize * 3);
+ __ pld(r1, kPrefetchDistance);
+
+ // The main loop copies 64 bytes at a time.
+ __ bind(&main_loop);
+ __ vld1(8, r1, d0, d3, Writeback);
+ __ vld1(8, r1, d4, d7, Writeback);
+ __ pld(r1, kPrefetchDistance);
+ __ sub(r2, r2, Operand(64), SetCC);
+ __ vst1(8, r0, d0, d3, Writeback, 128 / 8);
+ __ vst1(8, r0, d4, d7, Writeback, 128 / 8);
+ __ b(&main_loop, hs);
+
+ // Fix up the remaining count and make sure we have >= 32 bytes left.
+ __ bind(&fix_remainder);
+ __ add(r2, r2, Operand(64));
+ __ sub(r2, r2, Operand(32), SetCC);
+ __ b(&less32, lo);
+
+ // Copy 32 bytes at a time. These cache lines were already preloaded.
+ __ bind(&has32);
+ __ vld1(8, r1, d0, d3, Writeback);
+ __ sub(r2, r2, Operand(32), SetCC);
+ __ vst1(8, r0, d0, d3, Writeback, 128 / 8);
+ __ b(&has32, hs);
Rodolph Perfetta 2013/04/30 17:08:37 If I followed correctly when you enter the has32 b
+
+ // Less than 32 left.
+ __ bind(&less32);
+ __ add(r2, r2, Operand(32));
+ __ tst(r2, Operand(0x10));
+ __ b(&less16, eq);
+ // Copies 16 bytes, 128-bit aligned.
+ __ vld1(8, r1, d0, d1, Writeback);
+ __ vst1(8, r0, d0, d1, Writeback, 128 / 8);
+
+ // Copy up to 15 bytes (count in r2).
+ __ bind(&less16);
+ __ mov(ip, Operand(r2, LSL, 29), SetCC);
+ __ b(&skip_copy8, cc);
+ __ vld1(8, r1, d0, d0, Writeback);
+ __ vst1(8, r0, d0, d0, Writeback);
+ __ bind(&skip_copy8);
+ __ b(&skip_copy4, ge);
Rodolph Perfetta 2013/04/30 17:08:37 ge implies N flag == V flag, shift with SetCC don'
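A hedged sketch of one possible fix, assuming the intent is to test bit 2 of the count (now in the sign position after LSL #29): pl reads only the N flag, which the flag-setting mov does update:

  __ b(&skip_copy4, pl);  // skip the 4-byte copy when bit 2 of r2 is clear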
+ __ vld4(8, r1, d0, element_0, Writeback);
+ __ vst4(8, r0, d0, element_0, Writeback);
+ __ bind(&skip_copy4);
+ __ mov(ip, Operand(r2, LSL, 31), SetCC);
+ __ ldrb(r3, MemOperand(r1, 1, PostIndex), mi);
+ __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs);
+ __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs);
Rodolph Perfetta 2013/04/30 17:08:37 ldrh, then strh below.
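As in the alignment prologue, a hedged sketch of the halfword form (same assumptions as above):

  __ ldrh(ip, MemOperand(r1, 2, PostIndex), cs);
  __ strh(ip, MemOperand(r0, 2, PostIndex), cs);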
+ __ strb(r3, MemOperand(r0, 1, PostIndex), mi);
+ __ strb(ip, MemOperand(r0, 1, PostIndex), cs);
+ __ strb(lr, MemOperand(r0, 1, PostIndex), cs);
+
+ __ pop(lr);
+ __ bx(lr);
Rodolph Perfetta 2013/04/30 17:08:37 You can combine both operations above with: __ p
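A hedged sketch of the combined epilogue, assuming the truncated suggestion is the usual ARM idiom of popping the saved return address straight into pc:

  __ pop(pc);  // restores the pushed lr value into pc and returns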
+
+ CodeDesc desc;
+ masm.GetCode(&desc);
+ ASSERT(!RelocInfo::RequiresRelocation(desc));
+
+ CPU::FlushICache(buffer, actual_size);
+ OS::ProtectCode(buffer, actual_size);
+ return FUNCTION_CAST<OS::MemCopyFunction>(buffer);
+}
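For context, a hedged sketch of how the generated stub might be consumed on the platform side (an assumption about the src/platform.h part of this CL, not shown here); names other than OS::MemCopyFunction and CreateMemCopyFunction are illustrative:

  static OS::MemCopyFunction memcopy_function = NULL;

  void OS::PostSetUp() {
    // Install the generated routine once; CreateMemCopyFunction() already
    // falls back to MemCopyWrapper when NEON is unsupported or the code
    // buffer cannot be allocated.
    memcopy_function = CreateMemCopyFunction();
  }

  void OS::MemCopy(void* dest, const void* src, size_t size) {
    (*memcopy_function)(dest, src, size);
  }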
+
#undef __
-
UnaryMathFunction CreateSqrtFunction() {
return &sqrt;
}