Chromium Code Reviews

Side by Side Diff: src/arm/codegen-arm.cc

Issue 12920009: Use generated Neon version of MemCopy() on ARM, if platform supports it. (Closed)
Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 7 years, 8 months ago
// Copyright 2012 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
(...skipping 96 matching lines...)
  OS::ProtectCode(buffer, actual_size);

#if !defined(USE_SIMULATOR)
  return FUNCTION_CAST<UnaryMathFunction>(buffer);
#else
  fast_exp_arm_machine_code = buffer;
  return &fast_exp_simulator;
#endif
}

static void MemCopyWrapper(void* dest, const void* src, size_t size) {
  memcpy(dest, src, size);
}

// Based on Bionic's memcpy.s.
OS::MemCopyFunction CreateMemCopyFunction() {
  size_t actual_size;
  static const int kCacheLineSize = 64;
Rodolph Perfetta 2013/04/30 17:08:37 This is true on A8 and A15 but not A9.
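(Context for the comment above: 64 bytes matches the Cortex-A8 and A15 line size, while Cortex-A9 uses 32-byte cache lines, so a hard-coded 64 makes the pld stride skip every other line on A9. A minimal sketch of a runtime query, assuming a Linux/glibc target where sysconf exposes the L1 cache keys; it can report 0 on some ARM kernels, hence the fallback:)

#include <unistd.h>

// Hypothetical helper, not part of this CL: ask the OS for the L1 data
// cache line size and fall back to 64 bytes when it is unknown.
static int L1DCacheLineSize() {
  long size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);  // 0 or -1 when unknown.
  return size > 0 ? static_cast<int>(size) : 64;
}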
  static const int kPrefetchDistance = kCacheLineSize * 4;
  // Allocate buffer in executable space.
  byte* buffer = static_cast<byte*>(OS::Allocate(1 * KB,
                                                 &actual_size,
                                                 true));
  if (buffer == NULL) return &MemCopyWrapper;
  if (!CpuFeatures::IsSupported(NEON)) return &MemCopyWrapper;

  MacroAssembler masm(NULL, buffer, static_cast<int>(actual_size));

  CpuFeatureScope use_neon(&masm, NEON);
  Label less16, aligned16, aligned8, skip_copy8, skip_copy4,
      fix_remainder, main_loop, has32, less32;

  // ----------- S t a t e -------------
  //  -- r0    : dest
  //  -- r1    : src
  //  -- r2    : count
  // -----------------------------------

  __ push(lr);

  // Start preloading as early as possible.
  // TODO: consider using pldw for the write preload, if it makes sense on
  // the chip.
  __ pld(r1, kCacheLineSize * 0);
  __ pld(r1, kCacheLineSize * 1);

#ifdef DEBUG
  Label check_ok;
  // Do we have at least 16 bytes to copy (needed for the alignment below)?
  ASSERT(OS::kMinComplexMemCopy >= 16);
Rodolph Perfetta 2013/04/30 17:08:37 STATIC_ASSERT
  __ cmp(r2, Operand(OS::kMinComplexMemCopy));
  __ b(&check_ok, hs);
  __ bkpt(0);
  __ bind(&check_ok);
#endif

  // Align destination to half cache-line for the write-buffer.
  __ rsb(r3, r0, Operand(0));
  __ and_(r3, r3, Operand(0xf), SetCC);
  __ b(&aligned16, eq);
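  // (The rsb/and pair above computes r3 = (0 - dest) & 0xf, i.e. how many
  //  bytes must be copied to reach a 16-byte boundary; the LSL-31 and
  //  LSL-29 shifts below move r3's low bits into the N and C flags so the
  //  conditional loads/stores copy 1, 2, 4 and 8 bytes as needed.)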

  // Copy up to 15 bytes (count in r3).
  __ sub(r2, r2, r3);
  __ mov(ip, Operand(r3, LSL, 31), SetCC);
  __ ldrb(lr, MemOperand(r1, 1, PostIndex), mi);
  __ strb(lr, MemOperand(r0, 1, PostIndex), mi);
  __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs);
  __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs);
Rodolph Perfetta 2013/04/30 17:08:37 use ldrh instead of 2 ldrb. Same for stores.
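  // A sketch of the suggestion above (assuming the conditional post-indexed
  // ldrh/strh emitters, and that unaligned halfword access is enabled):
  //   __ ldrh(ip, MemOperand(r1, 2, PostIndex), cs);
  //   __ strh(ip, MemOperand(r0, 2, PostIndex), cs);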
  __ strb(ip, MemOperand(r0, 1, PostIndex), cs);
  __ strb(lr, MemOperand(r0, 1, PostIndex), cs);
  __ mov(ip, Operand(r3, LSL, 29), SetCC);
  __ b(&aligned8, ge);
  // Copies 4 bytes, destination 32-bit aligned.
  __ vld4(8, r1, d0, element_0, Writeback);
Rodolph Perfetta 2013/04/30 17:08:37 I am not sure why you are using vld4. Currently yo
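  // (For reference: vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! does load four
  //  bytes, but scatters them into lane 0 of four registers; a single-lane
  //  vld1.32 {d0[0]}, [r1]! moves the same four bytes into one register.)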
  __ vst4(8, r0, d0, element_0, Writeback, 32 / 8);
  __ bind(&aligned8);
  __ b(&aligned16, cc);
  // Copies 8 bytes, destination 64-bit aligned.
  __ vld1(8, r1, d0, d0, Writeback);
  __ vst1(8, r0, d0, d0, Writeback, 64 / 8);

  __ bind(&aligned16);
  // Immediately preload the next cache line, which we may need.
  __ pld(r1, kCacheLineSize * 0);
  __ pld(r1, kCacheLineSize * 1);

  // Make sure we have at least 64 bytes to copy.
  __ sub(r2, r2, Operand(64), SetCC);
  __ b(&fix_remainder, lo);

  // Preload all the cache lines we need.
  // NOTE: the number of pld below depends on kPrefetchDistance; ideally we
  // would increase the distance in the main loop to avoid the goofy code
  // below. In practice this doesn't seem to make a big difference.
  __ pld(r1, kCacheLineSize * 2);
  __ pld(r1, kCacheLineSize * 3);
  __ pld(r1, kPrefetchDistance);

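  // (Arithmetic behind the preloads: kPrefetchDistance is 4 * 64 = 256
  //  bytes, i.e. four cache lines. Lines 0 and 1 were requested at
  //  aligned16, so the three pld above add lines 2 and 3 plus the line at
  //  the steady-state distance; the main loop then consumes one line per
  //  iteration while issuing one pld, staying four lines ahead.)
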
  // The main loop copies 64 bytes at a time.
  __ bind(&main_loop);
  __ vld1(8, r1, d0, d3, Writeback);
  __ vld1(8, r1, d4, d7, Writeback);
  __ pld(r1, kPrefetchDistance);
  __ sub(r2, r2, Operand(64), SetCC);
  __ vst1(8, r0, d0, d3, Writeback, 128 / 8);
  __ vst1(8, r0, d4, d7, Writeback, 128 / 8);
  __ b(&main_loop, hs);
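  // (Each vld1/vst1 pair above moves four d-registers, 4 * 8 = 32 bytes,
  //  so one iteration streams 64 bytes while issuing a single pld at the
  //  prefetch distance.)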

  // Fix up the remaining count and make sure we have >= 32 bytes left.
  __ bind(&fix_remainder);
  __ add(r2, r2, Operand(64));
  __ sub(r2, r2, Operand(32), SetCC);
  __ b(&less32, lo);

  // 32 bytes at a time. These cache lines were already preloaded.
  __ bind(&has32);
  __ vld1(8, r1, d0, d3, Writeback);
  __ sub(r2, r2, Operand(32), SetCC);
  __ vst1(8, r0, d0, d3, Writeback, 128 / 8);
  __ b(&has32, hs);
Rodolph Perfetta 2013/04/30 17:08:37 If I followed correctly when you enter the has32 b

  // Less than 32 left.
  __ bind(&less32);
  __ add(r2, r2, Operand(32));
  __ tst(r2, Operand(0x10));
  __ b(&less16, eq);
  // Copies 16 bytes, 128-bit aligned.
  __ vld1(8, r1, d0, d1, Writeback);
  __ vst1(8, r0, d0, d1, Writeback, 128 / 8);

  // Copy up to 15 bytes (count in r2).
  __ bind(&less16);
  __ mov(ip, Operand(r2, LSL, 29), SetCC);
  __ b(&skip_copy8, cc);
  __ vld1(8, r1, d0, d0, Writeback);
  __ vst1(8, r0, d0, d0, Writeback);
  __ bind(&skip_copy8);
  __ b(&skip_copy4, ge);
Rodolph Perfetta 2013/04/30 17:08:37 ge implies N flag == V flag, shift with SetCC don't update the V flag.
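  // One possible fix, sketched here rather than taken from the CL: after
  // the LSL-29 mov above, bit 2 of r2 sits in the N flag and NEON loads do
  // not touch CPSR, so test N alone:
  //   __ b(&skip_copy4, pl);  // pl: N clear, i.e. bit 2 of the count is 0.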
  __ vld4(8, r1, d0, element_0, Writeback);
  __ vst4(8, r0, d0, element_0, Writeback);
  __ bind(&skip_copy4);
  __ mov(ip, Operand(r2, LSL, 31), SetCC);
  __ ldrb(r3, MemOperand(r1, 1, PostIndex), mi);
  __ ldrb(ip, MemOperand(r1, 1, PostIndex), cs);
  __ ldrb(lr, MemOperand(r1, 1, PostIndex), cs);
Rodolph Perfetta 2013/04/30 17:08:37 ldrh, then strh below.
  __ strb(r3, MemOperand(r0, 1, PostIndex), mi);
  __ strb(ip, MemOperand(r0, 1, PostIndex), cs);
  __ strb(lr, MemOperand(r0, 1, PostIndex), cs);

  __ pop(lr);
  __ bx(lr);
Rodolph Perfetta 2013/04/30 17:08:37 You can combine both operations above with: __ pop(pc);

  CodeDesc desc;
  masm.GetCode(&desc);
  ASSERT(!RelocInfo::RequiresRelocation(desc));

  CPU::FlushICache(buffer, actual_size);
  OS::ProtectCode(buffer, actual_size);
  return FUNCTION_CAST<OS::MemCopyFunction>(buffer);
}
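For context, a minimal sketch of the caller-side wiring such a generator typically gets (hypothetical; the real plumbing lives in src/platform*.cc around the OS::MemCopyFunction typedef this patch touches):

// Hypothetical wiring, not part of this CL.
static OS::MemCopyFunction memcopy_function = &MemCopyWrapper;

void InitMemCopy() {
  // CreateMemCopyFunction() already falls back to &MemCopyWrapper when the
  // buffer allocation fails or NEON is unsupported.
  memcopy_function = CreateMemCopyFunction();
}

void OS::MemCopy(void* dest, const void* src, size_t size) {
  (*memcopy_function)(dest, src, size);  // One indirect call per copy.
}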

#undef __


UnaryMathFunction CreateSqrtFunction() {
  return &sqrt;
}

// -------------------------------------------------------------------------
// Platform-specific RuntimeCallHelper functions.

void StubRuntimeCallHelper::BeforeCall(MacroAssembler* masm) const {
  masm->EnterFrame(StackFrame::INTERNAL);
  ASSERT(!masm->has_frame());
(...skipping 568 matching lines...)
    patcher.masm()->add(r0, pc, Operand(-8));
    patcher.masm()->ldr(pc, MemOperand(pc, -4));
    patcher.masm()->dd(reinterpret_cast<uint32_t>(stub->instruction_start()));
  }
}


} }  // namespace v8::internal

#endif  // V8_TARGET_ARCH_ARM