Chromium Code Reviews

Side by Side Diff: src/arm/codegen-arm.cc

Issue 12920009: Use generated Neon version of MemCopy() on ARM, if platform supports it. (Closed)
Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 7 years, 8 months ago
// Copyright 2012 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
(...skipping 96 matching lines...)
  OS::ProtectCode(buffer, actual_size);

#if !defined(USE_SIMULATOR)
  return FUNCTION_CAST<UnaryMathFunction>(buffer);
#else
  fast_exp_arm_machine_code = buffer;
  return &fast_exp_simulator;
#endif
}

static void MemCopyWrapper(void* dest, const void* src, size_t size) {
  memcpy(dest, src, size);
}

// Based on Bionic's memcpy.s.
OS::MemCopyFunction CreateMemCopyFunction() {
  size_t actual_size;
  static const int kCacheLineSize = 64;
Rodolph Perfetta 2013/04/30 17:08:37 This is true on A8 and A15 but not A9.
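(Context for the comment above: 64 bytes matches the Cortex-A8 and A15 line size, while Cortex-A9 uses 32-byte cache lines, so a hard-coded 64 makes the pld stride skip every other line on A9. A minimal sketch of a runtime query, assuming a Linux/glibc target where sysconf exposes the L1 cache keys; it can report 0 on some ARM kernels, hence the fallback:)

#include <unistd.h>

// Hypothetical helper, not part of this CL: ask the OS for the L1 data
// cache line size and fall back to 64 bytes when it is unknown.
static int L1DCacheLineSize() {
  long size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);  // 0 or -1 when unknown.
  return size > 0 ? static_cast<int>(size) : 64;
}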
  static const int kPrefetchDistance = kCacheLineSize * 4;
  // Allocate buffer in executable space.
  byte* buffer = static_cast<byte*>(OS::Allocate(1 * KB,
                                                 &actual_size,
                                                 true));
  if (buffer == NULL) return &MemCopyWrapper;
  if (!CpuFeatures::IsSupported(NEON)) return &MemCopyWrapper;

  MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size));

  CpuFeatureScope use_neon(&masm, NEON);
  Label less16, aligned16, aligned8, skip_copy8, skip_copy4,
      fix_remainder, main_loop, has32, less32;

  // ----------- S t a t e -------------
  //  -- r0    : dest
  //  -- r1    : src
  //  -- r2    : count
  // -----------------------------------

  __ push(lr);

  // Start preloading as early as possible.
  // TODO: consider using pldw for the write preload, if it makes sense on
  // the chip.
  __ pld(r1, kCacheLineSize * 0);
  __ pld(r1, kCacheLineSize * 1);

#ifdef DEBUG
  Label check_ok;
  // Do we have at least 16 bytes to copy (needed for the alignment below)?
  ASSERT(OS::kMinComplexMemCopy >= 16);
Rodolph Perfetta 2013/04/30 17:08:37 STATIC_ASSERT
  __ cmp(r2, Operand(OS::kMinComplexMemCopy));
  __ b(&check_ok, hs);
  __ bkpt(0);
  __ bind(&check_ok);
#endif

  // Align destination to half cache-line for the write-buffer.
  __ rsb(r3, r0, Operand(0));
  __ and_(r3, r3, Operand(0xf), SetCC);
  __ b(&aligned16, eq);
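  // (The rsb/and pair above computes r3 = (0 - dest) & 0xf, i.e. how many
  //  bytes must be copied to reach a 16-byte boundary; the LSL-31 and
  //  LSL-29 shifts below move r3's low bits into the N and C flags so the
  //  conditional loads/stores copy 1, 2, 4 and 8 bytes as needed.)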

  // Copy up to 15 bytes (count in r3).
  __ sub(r2, r2, r3);
  __ mov(ip, Operand(r3, LSL, 31), SetCC);
  __ ldrb(lr, MemOperand(r1, 1, PostIndex), mi);
  __ strb(lr, MemOperand(r0, 1, PostIndex), mi);
  __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs);
  __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs);
Rodolph Perfetta 2013/04/30 17:08:37 use ldrh instead of 2 ldrb. Same for stores.
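  // A sketch of the suggestion above (assuming the conditional post-indexed
  // ldrh/strh emitters, and that unaligned halfword access is enabled):
  //   __ ldrh(ip, MemOperand(r1, 2, PostIndex), cs);
  //   __ strh(ip, MemOperand(r0, 2, PostIndex), cs);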
  __ strb(ip, MemOperand(r0, 1, PostIndex), cs);
  __ strb(lr, MemOperand(r0, 1, PostIndex), cs);
  __ mov(ip, Operand(r3, LSL, 29), SetCC);
  __ b(&aligned8, ge);
  // Copies 4 bytes, destination 32-bit aligned.
  __ vld4(8, r1, d0, element_0, Writeback);
Rodolph Perfetta 2013/04/30 17:08:37 I am not sure why you are using vld4. Currently yo
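  // (For reference: vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! does load four
  //  bytes, but scatters them into lane 0 of four registers; a single-lane
  //  vld1.32 {d0[0]}, [r1]! moves the same four bytes into one register.)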
  __ vst4(8, r0, d0, element_0, Writeback, 32 / 8);
  __ bind(&aligned8);
  __ b(&aligned16, cc);
  // Copies 8 bytes, destination 64-bit aligned.
  __ vld1(8, r1, d0, d0, Writeback);
  __ vst1(8, r0, d0, d0, Writeback, 64 / 8);

  __ bind(&aligned16);
  // Immediately preload the next cache line, which we may need.
  __ pld(r1, kCacheLineSize * 0);
  __ pld(r1, kCacheLineSize * 1);

  // Make sure we have at least 64 bytes to copy.
  __ sub(r2, r2, Operand(64), SetCC);
  __ b(&fix_remainder, lo);

  // Preload all the cache lines we need.
  // NOTE: the number of pld below depends on kPrefetchDistance; ideally we
  // would increase the distance in the main loop to avoid the goofy code
  // below. In practice this doesn't seem to make a big difference.
  __ pld(r1, kCacheLineSize * 2);
  __ pld(r1, kCacheLineSize * 3);
  __ pld(r1, kPrefetchDistance);

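  // (Arithmetic behind the preloads: kPrefetchDistance is 4 * 64 = 256
  //  bytes, i.e. four cache lines. Lines 0 and 1 were requested at
  //  aligned16, so the three pld above add lines 2 and 3 plus the line at
  //  the steady-state distance; the main loop then consumes one line per
  //  iteration while issuing one pld, staying four lines ahead.)
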
  // The main loop copies 64 bytes at a time.
  __ bind(&main_loop);
  __ vld1(8, r1, d0, d3, Writeback);
  __ vld1(8, r1, d4, d7, Writeback);
  __ pld(r1, kPrefetchDistance);
  __ sub(r2, r2, Operand(64), SetCC);
  __ vst1(8, r0, d0, d3, Writeback, 128 / 8);
  __ vst1(8, r0, d4, d7, Writeback, 128 / 8);
  __ b(&main_loop, hs);
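  // (Each vld1/vst1 pair above moves four d-registers, 4 * 8 = 32 bytes,
  //  so one iteration streams 64 bytes while issuing a single pld at the
  //  prefetch distance.)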

  // Fix up the remaining count and make sure we have >= 32 bytes left.
  __ bind(&fix_remainder);
  __ add(r2, r2, Operand(64));
  __ sub(r2, r2, Operand(32), SetCC);
  __ b(&less32, lo);

  // 32 bytes at a time. These cache lines were already preloaded.
  __ bind(&has32);
  __ vld1(8, r1, d0, d3, Writeback);
  __ sub(r2, r2, Operand(32), SetCC);
  __ vst1(8, r0, d0, d3, Writeback, 128 / 8);
  __ b(&has32, hs);
Rodolph Perfetta 2013/04/30 17:08:37 If I followed correctly when you enter the has32 b

  // Less than 32 left.
  __ bind(&less32);
  __ add(r2, r2, Operand(32));
  __ tst(r2, Operand(0x10));
  __ b(&less16, eq);
  // Copies 16 bytes, 128-bit aligned.
  __ vld1(8, r1, d0, d1, Writeback);
  __ vst1(8, r0, d0, d1, Writeback, 128 / 8);

  // Copy up to 15 bytes (count in r2).
  __ bind(&less16);
  __ mov(ip, Operand(r2, LSL, 29), SetCC);
  __ b(&skip_copy8, cc);
  __ vld1(8, r1, d0, d0, Writeback);
  __ vst1(8, r0, d0, d0, Writeback);
  __ bind(&skip_copy8);
  __ b(&skip_copy4, ge);
Rodolph Perfetta 2013/04/30 17:08:37 ge implies N flag == V flag, shift with SetCC don't update the V flag.
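  // One possible fix, sketched here rather than taken from the CL: after
  // the LSL-29 mov above, bit 2 of r2 sits in the N flag and NEON loads do
  // not touch CPSR, so test N alone:
  //   __ b(&skip_copy4, pl);  // pl: N clear, i.e. bit 2 of the count is 0.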
  __ vld4(8, r1, d0, element_0, Writeback);
  __ vst4(8, r0, d0, element_0, Writeback);
  __ bind(&skip_copy4);
  __ mov(ip, Operand(r2, LSL, 31), SetCC);
  __ ldrb(r3, MemOperand(r1, 1, PostIndex), mi);
  __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs);
  __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs);
Rodolph Perfetta 2013/04/30 17:08:37 ldrh, then strh below.
  __ strb(r3, MemOperand(r0, 1, PostIndex), mi);
  __ strb(ip, MemOperand(r0, 1, PostIndex), cs);
  __ strb(lr, MemOperand(r0, 1, PostIndex), cs);

  __ pop(lr);
  __ bx(lr);
Rodolph Perfetta 2013/04/30 17:08:37 You can combine both operations above with: __ pop(pc);

  CodeDesc desc;
  masm.GetCode(&desc);
  ASSERT(!RelocInfo::RequiresRelocation(desc));

  CPU::FlushICache(buffer, actual_size);
  OS::ProtectCode(buffer, actual_size);
  return FUNCTION_CAST<OS::MemCopyFunction>(buffer);
}
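For context, a minimal sketch of the caller-side wiring such a generator typically gets (hypothetical; the real plumbing lives in src/platform*.cc around the OS::MemCopyFunction typedef this patch touches):

// Hypothetical wiring, not part of this CL.
static OS::MemCopyFunction memcopy_function = &MemCopyWrapper;

void InitMemCopy() {
  // CreateMemCopyFunction() already falls back to &MemCopyWrapper when the
  // buffer allocation fails or NEON is unsupported.
  memcopy_function = CreateMemCopyFunction();
}

void OS::MemCopy(void* dest, const void* src, size_t size) {
  (*memcopy_function)(dest, src, size);  // One indirect call per copy.
}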

#undef __


UnaryMathFunction CreateSqrtFunction() {
  return &sqrt;
}

// -------------------------------------------------------------------------
// Platform-specific RuntimeCallHelper functions.

void StubRuntimeCallHelper::BeforeCall(MacroAssembler* masm) const {
  masm->EnterFrame(StackFrame::INTERNAL);
  ASSERT(!masm->has_frame());
(...skipping 568 matching lines...)
    patcher.masm()->add(r0, pc, Operand(-8));
    patcher.masm()->ldr(pc, MemOperand(pc, -4));
    patcher.masm()->dd(reinterpret_cast<uint32_t>(stub->instruction_start()));
  }
}


} }  // namespace v8::internal

#endif  // V8_TARGET_ARCH_ARM