Chromium Code Reviews

Unified Diff: src/arm/codegen-arm.cc

Issue 12920009: Use generated Neon version of MemCopy() on ARM, if platform supports it. (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 7 years, 9 months ago
Index: src/arm/codegen-arm.cc
===================================================================
--- src/arm/codegen-arm.cc (revision 14076)
+++ src/arm/codegen-arm.cc (working copy)
@@ -114,10 +114,158 @@
#endif
}
+static void MemCopyWrapper(void* dest, const void* src, size_t size) {
+ memcpy(dest, src, size);
+}
+// Based on Bionic's memcpy.s
hans 2013/04/02 12:34:29 period at the end of comments, here and below
Nike 2013/04/03 15:04:06 Done.
+OS::MemCopyFunction CreateMemCopyFunction() {
+ size_t actual_size;
+ static const int kCacheLineSize = 64;
+ static const int kPrefetchDistance = kCacheLineSize * 4;
+ // Allocate buffer in executable space.
+ byte* buffer = static_cast<byte*>(OS::Allocate(1 * KB,
+ &actual_size,
+ true));
+ if (buffer == NULL) return &MemCopyWrapper;
+ if (!CpuFeatures::IsSupported(NEON)) return &MemCopyWrapper;
+
+ MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size));
+
+ CpuFeatureScope use_neon(&masm, NEON);
+ Label less16, aligned16, aligned8, skip_copy8, skip_copy4,
+ fix_remainder, main_loop, has32, less32;
+
+ // ----------- S t a t e -------------
+ // -- r0 : dest
+ // -- r1 : src
+ // -- r2 : count
+ // -----------------------------------
+
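For reference, the stub is invoked through OS::MemCopyFunction (presumably declared in src/platform.h, which this CL also touches); its signature must match MemCopyWrapper above, and under the ARM AAPCS the first three arguments arrive in exactly the registers listed in the state comment, so the generated code can use r0-r2 directly:

    // Signature inferred from MemCopyWrapper above; register mapping per the AAPCS.
    typedef void (*MemCopyFunction)(void* dest /* r0 */,
                                    const void* src /* r1 */,
                                    size_t size /* r2 */);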
+ __ push(lr);
+
+ // Start preloading as early as possible
+ __ pld(r1, kCacheLineSize * 0);
+ __ pld(r1, kCacheLineSize * 1);
+
+#ifdef DEBUG
+ Label check_ok;
+ // Do we have at least 16 bytes to copy (needed for alignment below)?
+ ASSERT(OS::kMinComplexMemCopy >= 16);
+ __ cmp(r2, Operand(OS::kMinComplexMemCopy));
+ __ b(&check_ok, hs);
+ __ bkpt(0);
+__ bind(&check_ok);
+#endif
+
+ // Align destination to half cache-line for the write-buffer
+ __ rsb(r3, r0, Operand(0));
+ __ and_(r3, r3, Operand(0xf), SetCC);
+ __ b(&aligned16, eq);
+
+ // Copy up to 15 bytes (count in r3)
+ __ sub(r2, r2, r3);
+ __ mov(ip, Operand(r3, LSL, 31), SetCC);
+ __ ldrb(lr, MemOperand(r1, 1, PostIndex), mi);
+ __ strb(lr, MemOperand(r0, 1, PostIndex), mi);
+ __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs);
+ __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs);
+ __ strb(ip, MemOperand(r0, 1, PostIndex), cs);
+ __ strb(lr, MemOperand(r0, 1, PostIndex), cs);
+ __ mov(ip, Operand(r3, LSL, 29), SetCC);
+ __ b(&aligned8, ge);
+ // Copies 4 bytes, destination 32-bits aligned
+ __ vld4(8, r1, d0, element_0, Writeback);
+ __ vst4(8, r0, d0, element_0, Writeback, 32 / 8);
+__ bind(&aligned8);
+ __ b(&aligned16, cc);
+ // Copies 8 bytes, destination 64-bits aligned
+ __ vld1(8, r1, d0, d0, Writeback);
+ __ vst1(8, r0, d0, d0, Writeback, 64 / 8);
+
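The prologue up to this point is roughly the following C: the rsb/and pair computes how many bytes are needed to 16-byte-align the destination, and the LSL-with-SetCC moves test individual bits of that count (an illustrative sketch, not code from the patch):

    uint8_t* d = static_cast<uint8_t*>(dest);
    const uint8_t* s = static_cast<const uint8_t*>(src);
    size_t head = (0 - reinterpret_cast<uintptr_t>(d)) & 0xf;   // rsb + and: bytes until d is 16-byte aligned
    size -= head;
    if (head & 1) { *d++ = *s++; }                               // mi-conditional ldrb/strb
    if (head & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }  // cs-conditional ldrb/strb pair
    if (head & 4) { memcpy(d, s, 4); d += 4; s += 4; }           // vld4/vst4 single 32-bit element
    if (head & 8) { memcpy(d, s, 8); d += 8; s += 8; }           // vld1/vst1 of one d-register
    // d is now 16-byte aligned.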
+__ bind(&aligned16);
+ // Preload immediately the next cache line, which we may need
+ __ pld(r1, kCacheLineSize * 0);
+ __ pld(r1, kCacheLineSize * 1);
+
+ // Make sure we have at least 64 bytes to copy
+ __ sub(r2, r2, Operand(64), SetCC);
+ __ b(&fix_remainder, lo);
+
+ // Preload all the cache lines we need.
+ // NOTE: the number of pld below depends on kPrefetchDistance;
+ // ideally we would increase the distance in the main loop to
+ // avoid the goofy code below. In practice this doesn't seem to make
+ // a big difference.
+ __ pld(r1, kCacheLineSize * 2);
+ __ pld(r1, kCacheLineSize * 3);
+ __ pld(r1, kPrefetchDistance);
+
+ // The main loop copies 64 bytes at a time
+__ bind(&main_loop);
+ __ vld1(8, r1, d0, d3, Writeback);
+ __ vld1(8, r1, d4, d7, Writeback);
+ __ pld(r1, kPrefetchDistance);
+ __ sub(r2, r2, Operand(64), SetCC);
+ __ vst1(8, r0, d0, d3, Writeback, 128 / 8);
+ __ vst1(8, r0, d4, d7, Writeback, 128 / 8);
+ __ b(&main_loop, hs);
+
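Continuing the C sketch above, the aligned16 check plus the main loop amount to streaming 64 bytes per iteration while prefetching ahead (illustrative only; __builtin_prefetch stands in for the pld the stub emits):

    while (size >= 64) {
      __builtin_prefetch(s + kPrefetchDistance);  // keep roughly four cache lines in flight
      memcpy(d, s, 64);                           // two vld1/vst1 pairs moving d0-d7
      d += 64;
      s += 64;
      size -= 64;
    }
    // Fewer than 64 bytes remain; fix_remainder/has32/less32 below handle them.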
+ // Fix-up the remaining count and make sure we have >= 32 bytes left
+__ bind(&fix_remainder);
+ __ add(r2, r2, Operand(64));
+ __ sub(r2, r2, Operand(32), SetCC);
+ __ b(&less32, lo);
+
+ // 32 bytes at a time. These cache lines were already preloaded
+__ bind(&has32);
+ __ vld1(8, r1, d0, d3, Writeback);
+ __ sub(r2, r2, Operand(32), SetCC);
+ __ vst1(8, r0, d0, d3, Writeback, 128 / 8);
+ __ b(&has32, hs);
+
+ // Less than 32 left
+__ bind(&less32);
+ __ add(r2, r2, Operand(32));
+ __ tst(r2, Operand(0x10));
+ __ b(&less16, eq);
+ // Copies 16 bytes, 128-bits aligned
+ __ vld1(8, r1, d0, d1, Writeback);
+ __ vst1(8, r0, d0, d1, Writeback, 128 / 8);
+
+ // Copy up to 15 bytes (count in r2)
+__ bind(&less16);
+ __ mov(ip, Operand(r2, LSL, 29), SetCC);
+ __ b(&skip_copy8, cc);
+ __ vld1(8, r1, d0, d0, Writeback);
+ __ vst1(8, r0, d0, d0, Writeback);
+__ bind(&skip_copy8);
+ __ b(&skip_copy4, ge);
+ __ vld4(8, r1, d0, element_0, Writeback);
+ __ vst4(8, r0, d0, element_0, Writeback);
+__ bind(&skip_copy4);
+ __ mov(ip, Operand(r2, LSL, 31), SetCC);
+ __ ldrb(r3, MemOperand(r1, 1, PostIndex), mi);
+ __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs);
+ __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs);
+ __ strb(r3, MemOperand(r0, 1, PostIndex), mi);
+ __ strb(ip, MemOperand(r0, 1, PostIndex), cs);
+ __ strb(lr, MemOperand(r0, 1, PostIndex), cs);
+
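The tail from less32 down to the conditional byte copies mirrors the alignment prologue, consuming one bit of the remaining count per step (continuation of the illustrative C sketch, not code from the patch):

    if (size & 16) { memcpy(d, s, 16); d += 16; s += 16; }  // one 128-bit vld1/vst1
    if (size & 8)  { memcpy(d, s, 8);  d += 8;  s += 8;  }  // one 64-bit vld1/vst1
    if (size & 4)  { memcpy(d, s, 4);  d += 4;  s += 4;  }  // vld4/vst4 single element
    if (size & 1)  { *d++ = *s++; }                          // mi-conditional ldrb/strb
    if (size & 2)  { d[0] = s[0]; d[1] = s[1]; }             // cs-conditional ldrb/strb pair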
+ __ pop(lr);
+ __ bx(lr);
+
+ CodeDesc desc;
+ masm.GetCode(&desc);
+ ASSERT(!RelocInfo::RequiresRelocation(desc));
+
+ CPU::FlushICache(buffer, actual_size);
+ OS::ProtectCode(buffer, actual_size);
+ return FUNCTION_CAST<OS::MemCopyFunction>(buffer);
+}
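The pointer returned here is presumably installed behind OS::MemCopy in src/platform.h, mirroring the existing ia32 arrangement; a sketch of that caller side, where memcopy_function and the startup hook are assumed names rather than code from this patch:

    // Assumed glue (sketch): a function pointer set once at startup,
    // e.g. memcopy_function = CreateMemCopyFunction(); in OS::PostSetUp().
    static OS::MemCopyFunction memcopy_function = NULL;

    void OS::MemCopy(void* dest, const void* src, size_t size) {
      if (size >= kMinComplexMemCopy) {
        (*memcopy_function)(dest, src, size);  // generated NEON stub, or MemCopyWrapper fallback
      } else {
        memcpy(dest, src, size);               // small copies stay on libc, per the DEBUG check above
      }
    }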
+
#undef __
-
UnaryMathFunction CreateSqrtFunction() {
return &sqrt;
}