| Index: src/arm/codegen-arm.cc |
| =================================================================== |
| --- src/arm/codegen-arm.cc (revision 14076) |
| +++ src/arm/codegen-arm.cc (working copy) |
| @@ -114,10 +114,158 @@ |
| #endif |
| } |
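| +// Fallback used when the NEON routine cannot be generated (no NEON support |
| +// or code-space allocation fails); it simply forwards to the C library memcpy. |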
| +static void MemCopyWrapper(void* dest, const void* src, size_t size) { |
| + memcpy(dest, src, size); |
| +} |
| +// Based on Bionic's memcpy.s |

hans 2013/04/02 12:34:29: period at the end of comments, here and below
Nike 2013/04/03 15:04:06: Done.

| +OS::MemCopyFunction CreateMemCopyFunction() { |
| + size_t actual_size; |
| + static const int kCacheLineSize = 64; |
| + static const int kPrefetchDistance = kCacheLineSize * 4; |
| + // Allocate buffer in executable space. |
| + byte* buffer = static_cast<byte*>(OS::Allocate(1 * KB, |
| + &actual_size, |
| + true)); |
| + if (buffer == NULL) return &MemCopyWrapper; |
| + if (!CpuFeatures::IsSupported(NEON)) return &MemCopyWrapper; |
| + |
| + MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size)); |
| + |
| + CpuFeatureScope use_neon(&masm, NEON); |
| + Label less16, aligned16, aligned8, skip_copy8, skip_copy4, |
| + fix_remainder, main_loop, has32, less32; |
| + |
| + // ----------- S t a t e ------------- |
| + // -- r0 : dest |
| + // -- r1 : src |
| + // -- r2 : count |
| + // ----------------------------------- |
| + |
| + __ push(lr); |
| + |
| + // Start preloading as early as possible |
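| + // pld is only a prefetch hint; these cover the first two cache lines of src. |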
| + __ pld(r1, kCacheLineSize * 0); |
| + __ pld(r1, kCacheLineSize * 1); |
| + |
| +#ifdef DEBUG |
| + Label check_ok; |
| + // Do we have at least 16 bytes to copy (needed for the alignment below)? |
| + ASSERT(OS::kMinComplexMemCopy >= 16); |
| + __ cmp(r2, Operand(OS::kMinComplexMemCopy)); |
| + __ b(&check_ok, hs); |
| + __ bkpt(0); |
| + __ bind(&check_ok); |
| +#endif |
| + |
| + // Align destination to half cache-line for the write-buffer |
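| + // r3 = (-dest) & 0xf is the number of bytes needed to reach a 16-byte boundary. |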
| + __ rsb(r3, r0, Operand(0)); |
| + __ and_(r3, r3, Operand(0xf), SetCC); |
| + __ b(&aligned16, eq); |
| + |
| + // Copy up to 15 bytes (count in r3) |
| + __ sub(r2, r2, r3); |
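| + // LSL #31: bit 0 of r3 -> N (mi: copy 1 byte), bit 1 -> C (cs: copy 2 bytes). |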
| + __ mov(ip, Operand(r3, LSL, 31), SetCC); |
| + __ ldrb(lr, MemOperand(r1, 1, PostIndex), mi); |
| + __ strb(lr, MemOperand(r0, 1, PostIndex), mi); |
| + __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs); |
| + __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs); |
| + __ strb(ip, MemOperand(r0, 1, PostIndex), cs); |
| + __ strb(lr, MemOperand(r0, 1, PostIndex), cs); |
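| + // LSL #29: bit 2 of r3 -> N (optional 4-byte copy), bit 3 -> C (optional 8-byte copy). |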
| + __ mov(ip, Operand(r3, LSL, 29), SetCC); |
| + __ b(&aligned8, ge); |
| + // Copies 4 bytes; the destination is 32-bit aligned |
| + __ vld4(8, r1, d0, element_0, Writeback); |
| + __ vst4(8, r0, d0, element_0, Writeback, 32 / 8); |
| + __ bind(&aligned8); |
| + __ b(&aligned16, cc); |
| + // Copies 8 bytes; the destination is 64-bit aligned |
| + __ vld1(8, r1, d0, d0, Writeback); |
| + __ vst1(8, r0, d0, d0, Writeback, 64 / 8); |
| + |
| + __ bind(&aligned16); |
| + // Immediately preload the next cache line, which we may need |
| + __ pld(r1, kCacheLineSize * 0); |
| + __ pld(r1, kCacheLineSize * 1); |
| + |
| + // Make sure we have at least 64 bytes to copy |
| + __ sub(r2, r2, Operand(64), SetCC); |
| + __ b(&fix_remainder, lo); |
| + |
| + // Preload all the cache lines we need. |
| + // NOTE: the number of pld instructions below depends on kPrefetchDistance; |
| + // ideally we would increase the distance in the main loop to avoid the |
| + // goofy code below. In practice this doesn't seem to make a big difference. |
| + __ pld(r1, kCacheLineSize * 2); |
| + __ pld(r1, kCacheLineSize * 3); |
| + __ pld(r1, kPrefetchDistance); |
| + |
| + // The main loop copies 64 bytes at a time |
| + __ bind(&main_loop); |
| + __ vld1(8, r1, d0, d3, Writeback); |
| + __ vld1(8, r1, d4, d7, Writeback); |
| + __ pld(r1, kPrefetchDistance); |
| + __ sub(r2, r2, Operand(64), SetCC); |
| + __ vst1(8, r0, d0, d3, Writeback, 128 / 8); |
| + __ vst1(8, r0, d4, d7, Writeback, 128 / 8); |
| + __ b(&main_loop, hs); |
| + |
| + // Fix-up the remaining count and make sure we have >= 32 bytes left |
| + __ bind(&fix_remainder); |
| + __ add(r2, r2, Operand(64)); |
| + __ sub(r2, r2, Operand(32), SetCC); |
| + __ b(&less32, lo); |
| + |
| + // 32 bytes at a time. These cache lines were already preloaded |
| + __ bind(&has32); |
| + __ vld1(8, r1, d0, d3, Writeback); |
| + __ sub(r2, r2, Operand(32), SetCC); |
| + __ vst1(8, r0, d0, d3, Writeback, 128 / 8); |
| + __ b(&has32, hs); |
| + |
| + // Less than 32 left |
| + __ bind(&less32); |
| + __ add(r2, r2, Operand(32)); |
| + __ tst(r2, Operand(0x10)); |
| + __ b(&less16, eq); |
| + // Copies 16 bytes; the destination is 128-bit aligned |
| + __ vld1(8, r1, d0, d1, Writeback); |
| + __ vst1(8, r0, d0, d1, Writeback, 128 / 8); |
| + |
| + // Copy up to 15 bytes (count in r2) |
| + __ bind(&less16); |
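| + // LSL #29: bit 3 of r2 -> C (8-byte copy), bit 2 -> N (4-byte copy). |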
| + __ mov(ip, Operand(r2, LSL, 29), SetCC); |
| + __ b(&skip_copy8, cc); |
| + __ vld1(8, r1, d0, d0, Writeback); |
| + __ vst1(8, r0, d0, d0, Writeback); |
| + __ bind(&skip_copy8); |
| + __ b(&skip_copy4, ge); |
| + __ vld4(8, r1, d0, element_0, Writeback); |
| + __ vst4(8, r0, d0, element_0, Writeback); |
| + __ bind(&skip_copy4); |
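| + // LSL #31: bit 1 of r2 -> C (copy 2 bytes), bit 0 -> N (copy 1 byte). |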
| + __ mov(ip, Operand(r2, LSL, 31), SetCC); |
| + __ ldrb(r3, MemOperand(r1, 1, PostIndex), mi); |
| + __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs); |
| + __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs); |
| + __ strb(r3, MemOperand(r0, 1, PostIndex), mi); |
| + __ strb(ip, MemOperand(r0, 1, PostIndex), cs); |
| + __ strb(lr, MemOperand(r0, 1, PostIndex), cs); |
| + |
| + __ pop(lr); |
| + __ bx(lr); |
| + |
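| + // Finalize the generated code: flush the instruction cache and protect the |
| + // buffer before returning it as a function pointer. |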
| + CodeDesc desc; |
| + masm.GetCode(&desc); |
| + ASSERT(!RelocInfo::RequiresRelocation(desc)); |
| + |
| + CPU::FlushICache(buffer, actual_size); |
| + OS::ProtectCode(buffer, actual_size); |
| + return FUNCTION_CAST<OS::MemCopyFunction>(buffer); |
| +} |
| + |
| #undef __ |
| - |
| UnaryMathFunction CreateSqrtFunction() { |
| return &sqrt; |
| } |
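
For reference, a rough C++ sketch of the copy strategy the generated routine in CreateMemCopyFunction() follows: an alignment prologue, a 64-byte main loop, and 32/16/8/4/2/1-byte tails selected by the bits of the remaining count. The name MemCopySketch and the plain memcpy calls are illustrative only; they stand in for the NEON vld1/vst1 transfers and pld prefetches emitted above, and the sketch assumes count >= OS::kMinComplexMemCopy (at least 16), as the ASSERT in the patch does.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative sketch only -- not part of the patch. Mirrors the control flow
// of the generated routine, with plain memcpy in place of the NEON transfers.
static void MemCopySketch(uint8_t* dest, const uint8_t* src, size_t count) {
  // Alignment prologue: copy up to 15 bytes so dest reaches a 16-byte boundary
  // (the rsb/and_ sequence above computes the same head count into r3).
  size_t head = (0 - reinterpret_cast<uintptr_t>(dest)) & 0xf;
  memcpy(dest, src, head);
  dest += head; src += head; count -= head;

  // Main loop: 64 bytes per iteration, matching the two 32-byte vld1/vst1
  // pairs under the main_loop label above.
  while (count >= 64) {
    memcpy(dest, src, 64);
    dest += 64; src += 64; count -= 64;
  }

  // Tails of 32, 16, 8, 4, 2 and 1 bytes, selected by the bits of the
  // remaining count (the LSL #31 / LSL #29 flag tricks above).
  for (size_t chunk = 32; chunk != 0; chunk >>= 1) {
    if (count & chunk) {
      memcpy(dest, src, chunk);
      dest += chunk; src += chunk;
    }
  }
}

Aligning the destination rather than the source follows the "write-buffer" comment in the patch; the NEON loads tolerate the resulting unaligned source addresses.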