OLD | NEW |
| (Empty) |
1 /*************************************************************************** | |
2 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license that can be | |
5 * found in the LICENSE file. | |
6 ***************************************************************************/ | |
7 | |
/***************************************************************************
  memset16_neon: fill a buffer with a repeated 16-bit value, using NEON
  stores for the bulk of the work and scalar stores for the head and tail.

  Target: ARM state (A32), NEON, GNU as unified syntax, AAPCS argument
  registers.

  Inputs:
    r0 (s): The buffer to write to. Assumed to be halfword (2-byte)
            aligned: the alignment fix-up below only handles even
            distances to the next 16-byte boundary.
    r1 (c): The 16-bit value to write. Only the low 16 bits are used.
    r2 (n): The number of 16-bit items (NOT bytes) to write. n * 2 must
            fit in 32 bits.
  Outputs:
    r0: The original buffer pointer s, unchanged.
  Clobbers:
    r1, r2, r12, q0, q1, condition flags.
 ***************************************************************************/

    .syntax unified
    .code 32
    .fpu neon
    .text
    .align 4
    .globl memset16_neon
    .hidden memset16_neon
    /* Mark the symbol as a function so the linker can interwork calls
     * coming from Thumb code into this ARM-state routine.
     */
    .type memset16_neon, %function

memset16_neon:
    cmp         r2, #0
    bxeq        lr                      /* n == 0: nothing to write */

    /* r2 arrives as a count of 16-bit items; from here on it holds the
     * number of bytes still to be written.
     */
    lsl         r2, r2, #1

    /* r0 is used as the write cursor, so keep the original pointer on the
     * stack to hand it back to the caller.
     */
    push        {r0}

    /* 8 bytes or fewer: a plain halfword loop is enough. */
    cmp         r2, #8
    bhi         .Lmemset16_large
.Lmemset16_small_loop:
    strh        r1, [r0], #2
    subs        r2, r2, #2
    bne         .Lmemset16_small_loop
    pop         {r0}
    bx          lr

.Lmemset16_large:
    /*
     * Build a word holding the 16-bit value twice. Shifting the value up
     * first discards whatever was in the upper half of r1, so the result
     * is correct even if the caller did not zero-extend the argument.
     */
    lsl         r1, r1, #16
    orr         r1, r1, r1, lsr #16

    /*
     * For 64 bytes or more it is worth getting onto a 16-byte boundary
     * before the vector stores.
     */
    cmp         r2, #64
    blo         .Lmemset16_route
    ands        r12, r0, #0xf
    beq         .Lmemset16_route

    /*
     * r12 = number of bytes up to the next 16-byte boundary. For a
     * halfword-aligned buffer this is an even value in 2..14. Store the
     * halfword first so that the word stores that follow are themselves
     * word-aligned.
     */
    rsb         r12, r12, #16
    sub         r2, r2, r12
    tst         r12, #2
    strhne      r1, [r0], #2            /* bit 1 set: 2 bytes */
    lsls        r12, r12, #29           /* N = bit 2 (4 bytes), C = bit 3 (8 bytes) */
    strmi       r1, [r0], #4
    strcs       r1, [r0], #4
    strcs       r1, [r0], #4

.Lmemset16_route:
    /*
     * Pick the widest store loop the remaining size allows. d0, q0 and q1
     * are only filled in on the paths that will actually use them.
     */
    vdup.32     d0, r1
    cmp         r2, #16
    blo         .Lmemset16_8
    vmov        d1, d0
    cmp         r2, #64
    blo         .Lmemset16_16
    vmov        q1, q0
    cmp         r2, #128
    blo         .Lmemset16_32

    /* 128 bytes per iteration; r2 >= 128 here so r12 >= 1. */
    lsr         r12, r2, #7
.Lmemset16_128_loop:
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    subs        r12, r12, #1
    bne         .Lmemset16_128_loop
    ands        r2, r2, #0x7f           /* bytes left after the 128-byte chunks */
    beq         .Lmemset16_end

.Lmemset16_32:
    /* 32 bytes per iteration. */
    lsrs        r12, r2, #5
    beq         .Lmemset16_16
.Lmemset16_32_loop:
    subs        r12, r12, #1
    vst1.64     {q0, q1}, [r0]!
    bne         .Lmemset16_32_loop
    ands        r2, r2, #0x1f
    beq         .Lmemset16_end

.Lmemset16_16:
    /* 16 bytes per iteration. */
    lsrs        r12, r2, #4
    beq         .Lmemset16_8
.Lmemset16_16_loop:
    subs        r12, r12, #1
    vst1.32     {q0}, [r0]!
    bne         .Lmemset16_16_loop
    ands        r2, r2, #0xf
    beq         .Lmemset16_end

    /*
     * Fewer than 16 bytes remain. No loops from here on: at most one
     * 8-byte, one 4-byte and one 2-byte store finish the job.
     */
.Lmemset16_8:
    cmp         r2, #8
    blo         .Lmemset16_4
    sub         r2, r2, #8
    vst1.32     {d0}, [r0]!
.Lmemset16_4:
    cmp         r2, #4
    blo         .Lmemset16_2
    sub         r2, r2, #4
    str         r1, [r0], #4
.Lmemset16_2:
    cmp         r2, #2
    blo         .Lmemset16_end
    strh        r1, [r0], #2
.Lmemset16_end:
    pop         {r0}                    /* return the original buffer pointer */
    bx          lr

    .size memset16_neon, . - memset16_neon

    .end
OLD | NEW |