| OLD | NEW |
| (Empty) |
| 1 /*************************************************************************** | |
| 2 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license that can be | |
| 5 * found in the LICENSE file. | |
| 6 ***************************************************************************/ | |
| 7 | |
/***************************************************************************
  Neon memset16: Fills a buffer with a 16-bit value, using Neon registers
                 for the bulk of the work when the buffer is large enough.
  Inputs:
    s: The buffer to write to
    c: The 16-bit value to write to the buffer
    n: The size_t count of 16-bit items (not bytes) to write
  Outputs:
    None. r0 still holds s when the function returns.
 ***************************************************************************/
| 17 | |
| 18 .syntax unified | |
| 19 .code 32 | |
| 20 .fpu neon | |
| 21 .align 4 | |
| 22 .globl memset16_neon | |
| 23 .hidden memset16_neon | |
| 24 | |
| 25 memset16_neon: | |
| 26 cmp r2, #0 | |
| 27 bxeq lr | |
| 28 | |
| 29 /* Keep in mind that r2 -- the count argument -- is for the | |
| 30 * number of 16-bit items to copy. | |
| 31 */ | |
| 32 lsl r2, r2, #1 | |
| 33 | |
| 34 push {r0} | |
| 35 | |
| 36 /* If we have < 8 bytes, just do a quick loop to handle that */ | |
| 37 cmp r2, #8 | |
| 38 bgt memset_gt4 | |
| 39 memset_smallcopy_loop: | |
| 40 strh r1, [r0], #2 | |
| 41 subs r2, r2, #2 | |
| 42 bne memset_smallcopy_loop | |
| 43 memset_smallcopy_done: | |
| 44 pop {r0} | |
| 45 bx lr | |
| 46 | |
| 47 memset_gt4: | |
| 48 /* | |
| 49 * Duplicate the r1 lowest 16-bits across r1. The idea is to have | |
| 50 * a register with two 16-bit-values we can copy. We do this by | |
| 51 * duplicating lowest 16-bits of r1 to upper 16-bits. | |
| 52 */ | |
| 53 orr r1, r1, r1, lsl #16 | |
| 54 /* | |
| 55 * If we're copying > 64 bytes, then we may want to get | |
| 56 * onto a 16-byte boundary to improve speed even more. | |
| 57 */ | |
| 58 cmp r2, #64 | |
| 59 blt memset_route | |
| 60 ands r12, r0, #0xf | |
| 61 beq memset_route | |
| 62 /* | |
| 63 * Determine the number of bytes to move forward to get to the 16-byte | |
| 64 * boundary. Note that this will be a multiple of 4, since we | |
| 65 * already are word-aligned. | |
| 66 */ | |
| 67 rsb r12, r12, #16 | |
| 68 sub r2, r2, r12 | |
| 69 lsls r12, r12, #29 | |
| 70 strmi r1, [r0], #4 | |
| 71 strcs r1, [r0], #4 | |
| 72 strcs r1, [r0], #4 | |
| 73 lsls r12, r12, #2 | |
| 74 strhcs r1, [r0], #2 | |
| 75 memset_route: | |
| 76 /* | |
| 77 * Decide where to route for the maximum copy sizes. Note that we | |
| 78 * build q0 and q1 depending on if we'll need it, so that's | |
| 79 * interwoven here as well. | |
| 80 */ | |
| 81 vdup.u32 d0, r1 | |
| 82 cmp r2, #16 | |
| 83 blt memset_8 | |
| 84 vmov d1, d0 | |
| 85 cmp r2, #64 | |
| 86 blt memset_16 | |
| 87 vmov q1, q0 | |
| 88 cmp r2, #128 | |
| 89 blt memset_32 | |
| 90 memset_128: | |
| 91 mov r12, r2, lsr #7 | |
| 92 memset_128_loop: | |
| 93 vst1.64 {q0, q1}, [r0]! | |
| 94 vst1.64 {q0, q1}, [r0]! | |
| 95 vst1.64 {q0, q1}, [r0]! | |
| 96 vst1.64 {q0, q1}, [r0]! | |
| 97 subs r12, r12, #1 | |
| 98 bne memset_128_loop | |
| 99 ands r2, r2, #0x7f | |
| 100 beq memset_end | |
| 101 memset_32: | |
| 102 movs r12, r2, lsr #5 | |
| 103 beq memset_16 | |
| 104 memset_32_loop: | |
| 105 subs r12, r12, #1 | |
| 106 vst1.64 {q0, q1}, [r0]! | |
| 107 bne memset_32_loop | |
| 108 ands r2, r2, #0x1f | |
| 109 beq memset_end | |
| 110 memset_16: | |
| 111 movs r12, r2, lsr #4 | |
| 112 beq memset_8 | |
| 113 memset_16_loop: | |
| 114 subs r12, r12, #1 | |
| 115 vst1.32 {q0}, [r0]! | |
| 116 bne memset_16_loop | |
| 117 ands r2, r2, #0xf | |
| 118 beq memset_end | |
| 119 /* | |
| 120 * memset_8 isn't a loop, since we try to do our loops at 16 | |
| 121 * bytes and above. We should loop there, then drop down here | |
| 122 * to finish the <16-byte versions. Same for memset_4 and | |
| 123 * memset_1. | |
| 124 */ | |
| 125 memset_8: | |
| 126 cmp r2, #8 | |
| 127 blt memset_4 | |
| 128 subs r2, r2, #8 | |
| 129 vst1.32 {d0}, [r0]! | |
| 130 memset_4: | |
| 131 cmp r2, #4 | |
| 132 blt memset_2 | |
| 133 subs r2, r2, #4 | |
| 134 str r1, [r0], #4 | |
| 135 memset_2: | |
| 136 cmp r2, #0 | |
| 137 ble memset_end | |
| 138 strh r1, [r0], #2 | |
| 139 memset_end: | |
| 140 pop {r0} | |
| 141 bx lr | |
| 142 | |
| 143 .end | |
| OLD | NEW |