fusl/src/string/arm/memcpy_le.S - Issue 1712063002: [fusl] Consistently name asm files .s, not .S

Side by Side Diff: fusl/src/string/arm/memcpy_le.S

Issue 1712063002: [fusl] Consistently name asm files .s, not .S (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 #ifndef __ARMEB__

2

3 /*

4 * Copyright (C) 2008 The Android Open Source Project

5 * All rights reserved.

6 *

7 * Redistribution and use in source and binary forms, with or without

8 * modification, are permitted provided that the following conditions

9 * are met:

10 * * Redistributions of source code must retain the above copyright

11 * notice, this list of conditions and the following disclaimer.

12 * * Redistributions in binary form must reproduce the above copyright

13 * notice, this list of conditions and the following disclaimer in

14 * the documentation and/or other materials provided with the

15 * distribution.

16 *

17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

21 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS

24 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED

25 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,

26 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT

27 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

28 * SUCH DAMAGE.

29 */

30

31

32 /*

33 * Optimized memcpy() for ARM.

34 *

35 * note that memcpy() always returns the destination pointer,

36 * so we have to preserve R0.

37 */

38

39 /*

40 * This file has been modified from the original for use in musl libc.

41 * The main changes are: addition of .type memcpy,%function to make the

42 * code safely callable from thumb mode, adjusting the return

43 * instructions to be compatible with pre-thumb ARM cpus, and removal

44 * of prefetch code that is not compatible with older cpus.

45 */

46

47 .syntax unified

48

49 .global memcpy

50 .type memcpy,%function

51 memcpy:

52 /* The stack must always be 64-bit aligned to be compliant with the

53 * ARM ABI. Since we have to save R0, we might as well save R4

54 * which we can use for better pipelining of the reads below

55 */

56 .fnstart

57 .save {r0, r4, lr}

58 stmfd sp!, {r0, r4, lr}

59 /* Making room for r5-r11 which will be spilled later */

60 .pad #28

61 sub sp, sp, #28

62

63 /* it simplifies things to take care of len<4 early */

64 cmp r2, #4

65 blo copy_last_3_and_return

66

67 /* compute the offset to align the source

68 * offset = (4-(src&3))&3 = -src & 3

69 */

70 rsb r3, r1, #0

71 ands r3, r3, #3

72 beq src_aligned

73

74 /* align source to 32 bits. We need to insert 2 instructions between

75 * a ldr[b|h] and str[b|h] because byte and half-word instructions

76 * stall 2 cycles.

77 */

78 movs r12, r3, lsl #31

79 sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */

80 ldrbmi r3, [r1], #1

81 ldrbcs r4, [r1], #1

82 ldrbcs r12,[r1], #1

83 strbmi r3, [r0], #1

84 strbcs r4, [r0], #1

85 strbcs r12,[r0], #1

86

87 src_aligned:

88

89 /* see if src and dst are aligned together (congruent) */

90 eor r12, r0, r1

91 tst r12, #3

92 bne non_congruent

93

94 /* Use post-increment mode for stm to spill r5-r11 to reserved stack

95 * frame. Don't update sp.

96 */

97 stmea sp, {r5-r11}

98

99 /* align the destination to a cache-line */

100 rsb r3, r0, #0

101 ands r3, r3, #0x1C

102 beq congruent_aligned32

103 cmp r3, r2

104 andhi r3, r2, #0x1C

105

106 /* conditionally copies 0 to 7 words (byte count in r3) */

107 movs r12, r3, lsl #28

108 ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */

109 ldmmi r1!, {r8, r9} /* 8 bytes */

110 stmcs r0!, {r4, r5, r6, r7}

111 stmmi r0!, {r8, r9}

112 tst r3, #0x4

113 ldrne r10,[r1], #4 /* 4 bytes */

114 strne r10,[r0], #4

115 sub r2, r2, r3

116

117 congruent_aligned32:

118 /*

119 * here the destination is 32-byte aligned, unless the copy is too short to reach that alignment.

120 */

121

122 cached_aligned32:

123 subs r2, r2, #32

124 blo less_than_32_left

125

126 /*

127 * We preload a cache-line up to 64 bytes ahead. On the 926, this will

128 * stall only until the requested word is fetched, but the linefill

129 * continues in the background.

130 * While the linefill is going, we write our previous cache-line

131 * into the write-buffer (which should have some free space).

132 * When the linefill is done, the writebuffer will

133 * start dumping its content into memory

134 *

135 * While all this is going, we then load a full cache line into

136 * 8 registers, this cache line should be in the cache by now

137 * (or partly in the cache).

138 *

139 * This code should work well regardless of the source/dest alignment.

140 *

141 */

142

143 /* Align the preload register to a cache-line because the cpu does

144 * "critical word first" (the first word requested is loaded first).

145 */

146 @ bic r12, r1, #0x1F

147 @ add r12, r12, #64

148

149 1: ldmia r1!, { r4-r11 }

150 subs r2, r2, #32

151

152 /*

153 * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi

154 * for ARM9 preload will not be safely guarded by the preceding subs.

155 * When it is safely guarded the only possibility to have SIGSEGV here

156 * is because the caller overstates the length.

157 */

158 @ ldrhi r3, [r12], #32 /* cheap ARM9 preload */

159 stmia r0!, { r4-r11 }

160 bhs 1b

161

162 add r2, r2, #32

163

164 less_than_32_left:

165 /*

166 * less than 32 bytes left at this point (length in r2)

167 */

168

169 /* skip all this if there is nothing to do, which should

170 * be a common case (if not executed the code below takes

171 * about 16 cycles)

172 */

173 tst r2, #0x1F

174 beq 1f

175

176 /* conditionally copies 0 to 31 bytes */

177 movs r12, r2, lsl #28

178 ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */

179 ldmmi r1!, {r8, r9} /* 8 bytes */

180 stmcs r0!, {r4, r5, r6, r7}

181 stmmi r0!, {r8, r9}

182 movs r12, r2, lsl #30

183 ldrcs r3, [r1], #4 /* 4 bytes */

184 ldrhmi r4, [r1], #2 /* 2 bytes */

185 strcs r3, [r0], #4

186 strhmi r4, [r0], #2

187 tst r2, #0x1

188 ldrbne r3, [r1] /* last byte */

189 strbne r3, [r0]

190

191 /* we're done! restore everything and return */

192 1: ldmfd sp!, {r5-r11}

193 ldmfd sp!, {r0, r4, lr}

194 bx lr

195

196 /********************************************************************/

197

198 non_congruent:

199 /*

200 * here source is aligned to 4 bytes

201 * but destination is not.

202 *

203 * in the code below r2 counts the bytes that remain to be read

204 * (fewer bytes have been written than read at any point, because we have

205 * partial words in the shift queue)

206 */

207 cmp r2, #4

208 blo copy_last_3_and_return

209

210 /* Use post-increment mode for stm to spill r5-r11 to reserved stack

211 * frame. Don't update sp.

212 */

213 stmea sp, {r5-r11}

214

215 /* compute shifts needed to align src to dest */

216 rsb r5, r0, #0

217 and r5, r5, #3 /* r5 = # bytes in partial words */

218 mov r12, r5, lsl #3 /* r12 = right */

219 rsb lr, r12, #32 /* lr = left */

220

221 /* read the first word */

222 ldr r3, [r1], #4

223 sub r2, r2, #4

224

225 /* write a partial word (0 to 3 bytes), such that destination

226 * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)

227 */

228 movs r5, r5, lsl #31

229 strbmi r3, [r0], #1

230 movmi r3, r3, lsr #8

231 strbcs r3, [r0], #1

232 movcs r3, r3, lsr #8

233 strbcs r3, [r0], #1

234 movcs r3, r3, lsr #8

235

236 cmp r2, #4

237 blo partial_word_tail

238

239 /* Align destination to 32 bytes (cache line boundary) */

240 1: tst r0, #0x1c

241 beq 2f

242 ldr r5, [r1], #4

243 sub r2, r2, #4

244 orr r4, r3, r5, lsl lr

245 mov r3, r5, lsr r12

246 str r4, [r0], #4

247 cmp r2, #4

248 bhs 1b

249 blo partial_word_tail

250

251 /* copy 32 bytes at a time */

252 2: subs r2, r2, #32

253 blo less_than_thirtytwo

254

255 /* Use immediate mode for the shifts, because there is an extra cycle

256 * for register shifts, which could account for up to 50% of

257 * performance hit.

258 */

259

260 cmp r12, #24

261 beq loop24

262 cmp r12, #8

263 beq loop8

264

265 loop16:

266 ldr r12, [r1], #4

267 1: mov r4, r12

268 ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}

269 subs r2, r2, #32

270 ldrhs r12, [r1], #4

271 orr r3, r3, r4, lsl #16

272 mov r4, r4, lsr #16

273 orr r4, r4, r5, lsl #16

274 mov r5, r5, lsr #16

275 orr r5, r5, r6, lsl #16

276 mov r6, r6, lsr #16

277 orr r6, r6, r7, lsl #16

278 mov r7, r7, lsr #16

279 orr r7, r7, r8, lsl #16

280 mov r8, r8, lsr #16

281 orr r8, r8, r9, lsl #16

282 mov r9, r9, lsr #16

283 orr r9, r9, r10, lsl #16

284 mov r10, r10, lsr #16

285 orr r10, r10, r11, lsl #16

286 stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}

287 mov r3, r11, lsr #16

288 bhs 1b

289 b less_than_thirtytwo

290

291 loop8:

292 ldr r12, [r1], #4

293 1: mov r4, r12

294 ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}

295 subs r2, r2, #32

296 ldrhs r12, [r1], #4

297 orr r3, r3, r4, lsl #24

298 mov r4, r4, lsr #8

299 orr r4, r4, r5, lsl #24

300 mov r5, r5, lsr #8

301 orr r5, r5, r6, lsl #24

302 mov r6, r6, lsr #8

303 orr r6, r6, r7, lsl #24

304 mov r7, r7, lsr #8

305 orr r7, r7, r8, lsl #24

306 mov r8, r8, lsr #8

307 orr r8, r8, r9, lsl #24

308 mov r9, r9, lsr #8

309 orr r9, r9, r10, lsl #24

310 mov r10, r10, lsr #8

311 orr r10, r10, r11, lsl #24

312 stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}

313 mov r3, r11, lsr #8

314 bhs 1b

315 b less_than_thirtytwo

316

317 loop24:

318 ldr r12, [r1], #4

319 1: mov r4, r12

320 ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}

321 subs r2, r2, #32

322 ldrhs r12, [r1], #4

323 orr r3, r3, r4, lsl #8

324 mov r4, r4, lsr #24

325 orr r4, r4, r5, lsl #8

326 mov r5, r5, lsr #24

327 orr r5, r5, r6, lsl #8

328 mov r6, r6, lsr #24

329 orr r6, r6, r7, lsl #8

330 mov r7, r7, lsr #24

331 orr r7, r7, r8, lsl #8

332 mov r8, r8, lsr #24

333 orr r8, r8, r9, lsl #8

334 mov r9, r9, lsr #24

335 orr r9, r9, r10, lsl #8

336 mov r10, r10, lsr #24

337 orr r10, r10, r11, lsl #8

338 stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}

339 mov r3, r11, lsr #24

340 bhs 1b

341

342 less_than_thirtytwo:

343 /* copy the last 0 to 31 bytes of the source */

344 rsb r12, lr, #32 /* we corrupted r12, recompute it */

345 add r2, r2, #32

346 cmp r2, #4

347 blo partial_word_tail

348

349 1: ldr r5, [r1], #4

350 sub r2, r2, #4

351 orr r4, r3, r5, lsl lr

352 mov r3, r5, lsr r12

353 str r4, [r0], #4

354 cmp r2, #4

355 bhs 1b

356

357 partial_word_tail:

358 /* we have a partial word in the input buffer */

359 movs r5, lr, lsl #(31-3)

360 strbmi r3, [r0], #1

361 movmi r3, r3, lsr #8

362 strbcs r3, [r0], #1

363 movcs r3, r3, lsr #8

364 strbcs r3, [r0], #1

365

366 /* Refill spilled registers from the stack. Don't update sp. */

367 ldmfd sp, {r5-r11}

368

369 copy_last_3_and_return:

370 movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */

371 ldrbmi r2, [r1], #1

372 ldrbcs r3, [r1], #1

373 ldrbcs r12,[r1]

374 strbmi r2, [r0], #1

375 strbcs r3, [r0], #1

376 strbcs r12,[r0]

377

378 /* we're done! restore sp and spilled registers and return */

379 add sp, sp, #28

380 ldmfd sp!, {r0, r4, lr}

381 bx lr

382

383 #endif

OLD	NEW

« no previous file with comments | « fusl/src/setjmp/mips/setjmp.s ('k') | fusl/src/string/arm/memcpy_le.s » ('j') | no next file with comments »