OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (C) 2008 The Android Open Source Project |
| 3 * All rights reserved. |
| 4 * |
| 5 * Redistribution and use in source and binary forms, with or without |
| 6 * modification, are permitted provided that the following conditions |
| 7 * are met: |
| 8 * * Redistributions of source code must retain the above copyright |
| 9 * notice, this list of conditions and the following disclaimer. |
| 10 * * Redistributions in binary form must reproduce the above copyright |
| 11 * notice, this list of conditions and the following disclaimer in |
| 12 * the documentation and/or other materials provided with the |
| 13 * distribution. |
| 14 * |
| 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| 18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| 19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| 21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
| 22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| 23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| 25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 26 * SUCH DAMAGE. |
| 27 */ |
| 28 |
| 29 |
| 30 /* |
| 31 * Optimized memcpy() for ARM. |
| 32 * |
| 33 * note that memcpy() always returns the destination pointer, |
| 34 * so we have to preserve R0. |
| 35 */ |
| 36 |
| 37 /* |
| 38 * This file has been modified from the original for use in musl libc. |
| 39 * The main changes are: addition of .type memcpy,%function to make the |
| 40 * code safely callable from thumb mode, adjusting the return |
| 41 * instructions to be compatible with pre-thumb ARM cpus, and removal |
| 42 * of prefetch code that is not compatible with older cpus. |
| 43 */ |
| 44 |
| 45 .syntax unified |
| 46 |
| 47 .global memcpy |
| 48 .type memcpy,%function |
| 49 memcpy: |
 | 50 /* The stack must always be 64-bit aligned to be compliant with the |
| 51 * ARM ABI. Since we have to save R0, we might as well save R4 |
| 52 * which we can use for better pipelining of the reads below |
| 53 */ |
| 54 .fnstart |
| 55 .save {r0, r4, lr} |
| 56 stmfd sp!, {r0, r4, lr} |
| 57 /* Making room for r5-r11 which will be spilled later */ |
| 58 .pad #28 |
| 59 sub sp, sp, #28 |
| 60 |
| 61 /* it simplifies things to take care of len<4 early */ |
| 62 cmp r2, #4 |
| 63 blo copy_last_3_and_return |
| 64 |
| 65 /* compute the offset to align the source |
| 66 * offset = (4-(src&3))&3 = -src & 3 |
| 67 */ |
| 68 rsb r3, r1, #0 |
| 69 ands r3, r3, #3 |
| 70 beq src_aligned |
| 71 |
| 72 /* align source to 32 bits. We need to insert 2 instructions between |
| 73 * a ldr[b|h] and str[b|h] because byte and half-word instructions |
| 74 * stall 2 cycles. |
| 75 */ |
| 76 movs r12, r3, lsl #31 |
 | 77 sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */ |
| 78 ldrbmi r3, [r1], #1 |
| 79 ldrbcs r4, [r1], #1 |
| 80 ldrbcs r12,[r1], #1 |
| 81 strbmi r3, [r0], #1 |
| 82 strbcs r4, [r0], #1 |
| 83 strbcs r12,[r0], #1 |
| 84 |
| 85 src_aligned: |
| 86 |
| 87 /* see if src and dst are aligned together (congruent) */ |
| 88 eor r12, r0, r1 |
| 89 tst r12, #3 |
| 90 bne non_congruent |
| 91 |
 | 92 /* Use post-increment mode for stm to spill r5-r11 to the reserved stack |
| 93 * frame. Don't update sp. |
| 94 */ |
| 95 stmea sp, {r5-r11} |
| 96 |
| 97 /* align the destination to a cache-line */ |
| 98 rsb r3, r0, #0 |
| 99 ands r3, r3, #0x1C |
| 100 beq congruent_aligned32 |
| 101 cmp r3, r2 |
| 102 andhi r3, r2, #0x1C |
| 103 |
 | 104 /* conditionally copies 0 to 7 words (length in r3) */ |
| 105 movs r12, r3, lsl #28 |
| 106 ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */ |
| 107 ldmmi r1!, {r8, r9} /* 8 bytes */ |
| 108 stmcs r0!, {r4, r5, r6, r7} |
| 109 stmmi r0!, {r8, r9} |
| 110 tst r3, #0x4 |
| 111 ldrne r10,[r1], #4 /* 4 bytes */ |
| 112 strne r10,[r0], #4 |
| 113 sub r2, r2, r3 |
| 114 |
| 115 congruent_aligned32: |
| 116 /* |
| 117 * here source is aligned to 32 bytes. |
| 118 */ |
| 119 |
| 120 cached_aligned32: |
| 121 subs r2, r2, #32 |
| 122 blo less_than_32_left |
| 123 |
| 124 /* |
| 125 * We preload a cache-line up to 64 bytes ahead. On the 926, this will |
 | 126 * stall only until the requested word is fetched, but the linefill |
 | 127 * continues in the background. |
| 128 * While the linefill is going, we write our previous cache-line |
| 129 * into the write-buffer (which should have some free space). |
| 130 * When the linefill is done, the writebuffer will |
| 131 * start dumping its content into memory |
| 132 * |
| 133 * While all this is going, we then load a full cache line into |
| 134 * 8 registers, this cache line should be in the cache by now |
| 135 * (or partly in the cache). |
| 136 * |
| 137 * This code should work well regardless of the source/dest alignment. |
| 138 * |
| 139 */ |
| 140 |
| 141 /* Align the preload register to a cache-line because the cpu does |
| 142 * "critical word first" (the first word requested is loaded first). |
| 143 */ |
| 144 @ bic r12, r1, #0x1F |
| 145 @ add r12, r12, #64 |
| 146 |
| 147 1: ldmia r1!, { r4-r11 } |
| 148 subs r2, r2, #32 |
| 149 |
| 150 /* |
 | 151 * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi |
 | 152 * for the ARM9 preload will not be safely guarded by the preceding subs. |
 | 153 * When it is safely guarded, the only way to get a SIGSEGV here |
 | 154 * is if the caller overstates the length. |
| 155 */ |
| 156 @ ldrhi r3, [r12], #32 /* cheap ARM9 preload */ |
| 157 stmia r0!, { r4-r11 } |
| 158 bhs 1b |
| 159 |
| 160 add r2, r2, #32 |
| 161 |
| 162 less_than_32_left: |
| 163 /* |
| 164 * less than 32 bytes left at this point (length in r2) |
| 165 */ |
| 166 |
| 167 /* skip all this if there is nothing to do, which should |
| 168 * be a common case (if not executed the code below takes |
| 169 * about 16 cycles) |
| 170 */ |
| 171 tst r2, #0x1F |
| 172 beq 1f |
| 173 |
 | 174 /* conditionally copies 0 to 31 bytes */ |
| 175 movs r12, r2, lsl #28 |
| 176 ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */ |
| 177 ldmmi r1!, {r8, r9} /* 8 bytes */ |
| 178 stmcs r0!, {r4, r5, r6, r7} |
| 179 stmmi r0!, {r8, r9} |
| 180 movs r12, r2, lsl #30 |
| 181 ldrcs r3, [r1], #4 /* 4 bytes */ |
| 182 ldrhmi r4, [r1], #2 /* 2 bytes */ |
| 183 strcs r3, [r0], #4 |
| 184 strhmi r4, [r0], #2 |
| 185 tst r2, #0x1 |
| 186 ldrbne r3, [r1] /* last byte */ |
| 187 strbne r3, [r0] |
| 188 |
| 189 /* we're done! restore everything and return */ |
| 190 1: ldmfd sp!, {r5-r11} |
| 191 ldmfd sp!, {r0, r4, lr} |
| 192 bx lr |
| 193 |
| 194 /********************************************************************/ |
| 195 |
| 196 non_congruent: |
| 197 /* |
| 198 * here source is aligned to 4 bytes |
| 199 * but destination is not. |
| 200 * |
| 201 * in the code below r2 is the number of bytes read |
| 202 * (the number of bytes written is always smaller, because we have |
| 203 * partial words in the shift queue) |
| 204 */ |
| 205 cmp r2, #4 |
| 206 blo copy_last_3_and_return |
| 207 |
 | 208 /* Use post-increment mode for stm to spill r5-r11 to the reserved stack |
| 209 * frame. Don't update sp. |
| 210 */ |
| 211 stmea sp, {r5-r11} |
| 212 |
| 213 /* compute shifts needed to align src to dest */ |
| 214 rsb r5, r0, #0 |
 | 215 and r5, r5, #3 /* r5 = # bytes in partial words */ |
| 216 mov r12, r5, lsl #3 /* r12 = right */ |
| 217 rsb lr, r12, #32 /* lr = left */ |
| 218 |
| 219 /* read the first word */ |
| 220 ldr r3, [r1], #4 |
| 221 sub r2, r2, #4 |
| 222 |
| 223 /* write a partial word (0 to 3 bytes), such that destination |
 | 224 * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment) |
| 225 */ |
| 226 movs r5, r5, lsl #31 |
| 227 strbmi r3, [r0], #1 |
| 228 movmi r3, r3, lsr #8 |
| 229 strbcs r3, [r0], #1 |
| 230 movcs r3, r3, lsr #8 |
| 231 strbcs r3, [r0], #1 |
| 232 movcs r3, r3, lsr #8 |
| 233 |
| 234 cmp r2, #4 |
| 235 blo partial_word_tail |
| 236 |
| 237 /* Align destination to 32 bytes (cache line boundary) */ |
| 238 1: tst r0, #0x1c |
| 239 beq 2f |
| 240 ldr r5, [r1], #4 |
| 241 sub r2, r2, #4 |
| 242 orr r4, r3, r5, lsl lr |
| 243 mov r3, r5, lsr r12 |
| 244 str r4, [r0], #4 |
| 245 cmp r2, #4 |
| 246 bhs 1b |
| 247 blo partial_word_tail |
| 248 |
| 249 /* copy 32 bytes at a time */ |
| 250 2: subs r2, r2, #32 |
| 251 blo less_than_thirtytwo |
| 252 |
| 253 /* Use immediate mode for the shifts, because there is an extra cycle |
 | 254 * for register shifts, which can amount to as much as a 50% |
 | 255 * performance hit. |
| 256 */ |
| 257 |
| 258 cmp r12, #24 |
| 259 beq loop24 |
| 260 cmp r12, #8 |
| 261 beq loop8 |
| 262 |
| 263 loop16: |
| 264 ldr r12, [r1], #4 |
| 265 1: mov r4, r12 |
| 266 ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} |
| 267 subs r2, r2, #32 |
| 268 ldrhs r12, [r1], #4 |
| 269 orr r3, r3, r4, lsl #16 |
| 270 mov r4, r4, lsr #16 |
| 271 orr r4, r4, r5, lsl #16 |
| 272 mov r5, r5, lsr #16 |
| 273 orr r5, r5, r6, lsl #16 |
| 274 mov r6, r6, lsr #16 |
| 275 orr r6, r6, r7, lsl #16 |
| 276 mov r7, r7, lsr #16 |
| 277 orr r7, r7, r8, lsl #16 |
| 278 mov r8, r8, lsr #16 |
| 279 orr r8, r8, r9, lsl #16 |
| 280 mov r9, r9, lsr #16 |
| 281 orr r9, r9, r10, lsl #16 |
| 282 mov r10, r10, lsr #16 |
| 283 orr r10, r10, r11, lsl #16 |
| 284 stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} |
| 285 mov r3, r11, lsr #16 |
| 286 bhs 1b |
| 287 b less_than_thirtytwo |
| 288 |
| 289 loop8: |
| 290 ldr r12, [r1], #4 |
| 291 1: mov r4, r12 |
| 292 ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} |
| 293 subs r2, r2, #32 |
| 294 ldrhs r12, [r1], #4 |
| 295 orr r3, r3, r4, lsl #24 |
| 296 mov r4, r4, lsr #8 |
| 297 orr r4, r4, r5, lsl #24 |
| 298 mov r5, r5, lsr #8 |
| 299 orr r5, r5, r6, lsl #24 |
| 300 mov r6, r6, lsr #8 |
| 301 orr r6, r6, r7, lsl #24 |
| 302 mov r7, r7, lsr #8 |
| 303 orr r7, r7, r8, lsl #24 |
| 304 mov r8, r8, lsr #8 |
| 305 orr r8, r8, r9, lsl #24 |
| 306 mov r9, r9, lsr #8 |
| 307 orr r9, r9, r10, lsl #24 |
| 308 mov r10, r10, lsr #8 |
| 309 orr r10, r10, r11, lsl #24 |
| 310 stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} |
| 311 mov r3, r11, lsr #8 |
| 312 bhs 1b |
| 313 b less_than_thirtytwo |
| 314 |
| 315 loop24: |
| 316 ldr r12, [r1], #4 |
| 317 1: mov r4, r12 |
| 318 ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} |
| 319 subs r2, r2, #32 |
| 320 ldrhs r12, [r1], #4 |
| 321 orr r3, r3, r4, lsl #8 |
| 322 mov r4, r4, lsr #24 |
| 323 orr r4, r4, r5, lsl #8 |
| 324 mov r5, r5, lsr #24 |
| 325 orr r5, r5, r6, lsl #8 |
| 326 mov r6, r6, lsr #24 |
| 327 orr r6, r6, r7, lsl #8 |
| 328 mov r7, r7, lsr #24 |
| 329 orr r7, r7, r8, lsl #8 |
| 330 mov r8, r8, lsr #24 |
| 331 orr r8, r8, r9, lsl #8 |
| 332 mov r9, r9, lsr #24 |
| 333 orr r9, r9, r10, lsl #8 |
| 334 mov r10, r10, lsr #24 |
| 335 orr r10, r10, r11, lsl #8 |
| 336 stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} |
| 337 mov r3, r11, lsr #24 |
| 338 bhs 1b |
| 339 |
| 340 less_than_thirtytwo: |
| 341 /* copy the last 0 to 31 bytes of the source */ |
| 342 rsb r12, lr, #32 /* we corrupted r12, recompute it */ |
| 343 add r2, r2, #32 |
| 344 cmp r2, #4 |
| 345 blo partial_word_tail |
| 346 |
| 347 1: ldr r5, [r1], #4 |
| 348 sub r2, r2, #4 |
| 349 orr r4, r3, r5, lsl lr |
| 350 mov r3, r5, lsr r12 |
| 351 str r4, [r0], #4 |
| 352 cmp r2, #4 |
| 353 bhs 1b |
| 354 |
| 355 partial_word_tail: |
| 356 /* we have a partial word in the input buffer */ |
| 357 movs r5, lr, lsl #(31-3) |
| 358 strbmi r3, [r0], #1 |
| 359 movmi r3, r3, lsr #8 |
| 360 strbcs r3, [r0], #1 |
| 361 movcs r3, r3, lsr #8 |
| 362 strbcs r3, [r0], #1 |
| 363 |
| 364 /* Refill spilled registers from the stack. Don't update sp. */ |
| 365 ldmfd sp, {r5-r11} |
| 366 |
| 367 copy_last_3_and_return: |
| 368 movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */ |
| 369 ldrbmi r2, [r1], #1 |
| 370 ldrbcs r3, [r1], #1 |
| 371 ldrbcs r12,[r1] |
| 372 strbmi r2, [r0], #1 |
| 373 strbcs r3, [r0], #1 |
| 374 strbcs r12,[r0] |
| 375 |
| 376 /* we're done! restore sp and spilled registers and return */ |
| 377 add sp, sp, #28 |
| 378 ldmfd sp!, {r0, r4, lr} |
| 379 bx lr |
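
A few C sketches may help when reading the assembly above. None of them are part of the patch, and all helper names (copy_head, copy_congruent, copy_shift_queue) are hypothetical.

First, the head-alignment step (the code before src_aligned): the offset needed to word-align the source is computed as -src & 3, which equals (4 - (src & 3)) & 3, and "movs r12, r3, lsl #31" places bit 0 of that count in the N flag and bit 1 in the carry flag, so the conditional ldrb/strb pairs copy exactly 1, 2 or 3 bytes. A minimal sketch, assuming n >= 4 as the assembly does:

    /* Illustrative only: shows the -src & 3 identity and the bit-0 / bit-1
     * dispatch that the flag-setting shift performs. */
    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    static void copy_head(unsigned char **d, const unsigned char **s, size_t *n)
    {
        size_t off = (size_t)(-(uintptr_t)*s & 3);      /* == (4 - (src & 3)) & 3 */
        assert(off == ((4 - ((uintptr_t)*s & 3)) & 3));
        *n -= off;                                      /* safe because *n >= 4   */
        if (off & 1) *(*d)++ = *(*s)++;                 /* N set  -> copy 1 byte  */
        if (off & 2) { *(*d)++ = *(*s)++;               /* C set  -> copy 2 bytes */
                       *(*d)++ = *(*s)++; }
    }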
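Second, the congruent path moves 32 bytes per iteration with one ldmia/stmia pair over r4-r11, then peels the remaining 0 to 31 bytes with the lsl #28 / lsl #30 flag tests. A rough C outline of that structure, for orientation only:

    /* Sketch of the congruent path: 32-byte blocks, then a 16/8/4/2/1-byte
     * tail selected by the same bit tests the condition flags perform. */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void copy_congruent(uint32_t *d, const uint32_t *s, size_t n)
    {
        while (n >= 32) {                         /* ldmia r1!, {r4-r11}; stmia r0! */
            for (int i = 0; i < 8; i++) d[i] = s[i];
            d += 8; s += 8; n -= 32;
        }
        unsigned char *db = (unsigned char *)d;
        const unsigned char *sb = (const unsigned char *)s;
        if (n & 16) { memcpy(db, sb, 16); db += 16; sb += 16; }   /* ldmcs/stmcs   */
        if (n & 8)  { memcpy(db, sb, 8);  db += 8;  sb += 8;  }   /* ldmmi/stmmi   */
        if (n & 4)  { memcpy(db, sb, 4);  db += 4;  sb += 4;  }   /* ldrcs/strcs   */
        if (n & 2)  { memcpy(db, sb, 2);  db += 2;  sb += 2;  }   /* ldrhmi/strhmi */
        if (n & 1)  *db = *sb;                                    /* ldrbne/strbne */
    }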
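Finally, the non_congruent path keeps a partial word in r3 and builds each output word from two neighbouring source words, shifting by lr ("left") and r12 ("right"); the loop16/loop8/loop24 bodies are the same idea with the shift amounts hard-coded to avoid the extra cycle of register-specified shifts. A little-endian sketch of the steady-state word step (it skips the head byte stores and the partial_word_tail, and assumes dst has already been word-aligned):

    /* Sketch of the shift queue: right = 8 * (-dst & 3) before alignment,
     * left = 32 - right, so right/left are 8, 16 or 24 and no shift is UB. */
    #include <stddef.h>
    #include <stdint.h>

    static void copy_shift_queue(uint32_t *d, const uint32_t *s,
                                 size_t nwords, unsigned right)
    {
        unsigned left = 32 - right;
        uint32_t carry = *s++ >> right;      /* leftover bytes, like r3        */
        while (nwords--) {
            uint32_t w = *s++;
            *d++ = carry | (w << left);      /* orr r4, r3, r5, lsl lr; str r4 */
            carry = w >> right;              /* mov r3, r5, lsr r12            */
        }
    }

The fixed-shift loops trade code size for speed: on the targeted cores a register-specified shift costs an extra cycle per instruction, which adds up quickly in a 16-instruction inner loop.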