Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(246)

Side by Side Diff: src/opts/memset16_neon.S

Issue 1075003002: Replace NEON assembly memset16 and memset32 with intrinsic versions. (Closed) Base URL: https://skia.googlesource.com/skia@master
Patch Set: Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
(Empty)
/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
 memset16_neon: fill a buffer with a repeated 16-bit value, using NEON
 registers for the bulk of the stores when the buffer is large enough.

 Syntax: GNU as, unified ARM syntax, ARM (A32) instruction set, ELF.

 Inputs:
   r0 (s): The buffer to write to. Expected to be 2-byte aligned.
   r1 (c): The 16-bit value to write to the buffer. The large-buffer path
           ORs the low half into the high half, so the upper 16 bits of
           r1 are expected to be zero on entry.
   r2 (n): The size_t count of 16-bit items (not bytes). It is doubled
           into a 32-bit byte count, so n * 2 must fit in 32 bits.
 Outputs:
   r0:     The original buffer pointer (saved on entry, restored on exit).
 Clobbers:
   r1, r2, r12, q0, q1, condition flags.
 ***************************************************************************/

    .syntax unified
    .code 32
    .fpu neon
    .text
    .align 4
    .globl memset16_neon
    .hidden memset16_neon
    .type memset16_neon, %function

memset16_neon:
    cmp     r2, #0
    bxeq    lr                      /* n == 0: nothing to write */

    /* Keep in mind that r2 -- the count argument -- is for the
     * number of 16-bit items to copy.  From here on r2 holds the
     * number of BYTES remaining.
     */
    lsl     r2, r2, #1

    push    {r0}                    /* restored before every return */

    /* If we have 8 bytes or fewer, just do a quick halfword loop.
     * The byte count is unsigned (size_t), hence bhi rather than bgt.
     */
    cmp     r2, #8
    bhi     memset_gt4
memset_smallcopy_loop:
    strh    r1, [r0], #2
    subs    r2, r2, #2
    bne     memset_smallcopy_loop
memset_smallcopy_done:
    pop     {r0}
    bx      lr

memset_gt4:
    /*
     * Duplicate the r1 lowest 16-bits across r1. The idea is to have
     * a register with two 16-bit-values we can copy. We do this by
     * duplicating lowest 16-bits of r1 to upper 16-bits.
     */
    orr     r1, r1, r1, lsl #16
    /*
     * If we're copying >= 64 bytes, then we may want to get
     * onto a 16-byte boundary to improve speed even more.
     */
    cmp     r2, #64
    blo     memset_route
    ands    r12, r0, #0xf
    beq     memset_route
    /*
     * Determine the number of bytes to move forward to get to the 16-byte
     * boundary.  The destination is only guaranteed to be halfword
     * aligned, so this is a multiple of 2 (2..14), not of 4.
     *
     * The halfword (bit 1) is stored first so that the word stores that
     * follow land on 4-byte-aligned addresses.
     */
    rsb     r12, r12, #16
    sub     r2, r2, r12
    tst     r12, #2
    strhne  r1, [r0], #2            /* bit 1 set: 2 bytes */
    lsls    r12, r12, #29           /* C = bit 3, N = bit 2 of the advance */
    strmi   r1, [r0], #4            /* bit 2 set: 4 bytes */
    strcs   r1, [r0], #4            /* bit 3 set: 8 bytes */
    strcs   r1, [r0], #4
memset_route:
    /*
     * Decide where to route for the maximum copy sizes. Note that we
     * build q0 and q1 depending on if we'll need it, so that's
     * interwoven here as well.
     */
    vdup.32 d0, r1
    cmp     r2, #16
    blo     memset_8
    vmov    d1, d0
    cmp     r2, #64
    blo     memset_16
    vmov    q1, q0
    cmp     r2, #128
    blo     memset_32
memset_128:
    mov     r12, r2, lsr #7         /* r12 = number of 128-byte chunks */
memset_128_loop:
    vst1.64 {q0, q1}, [r0]!
    vst1.64 {q0, q1}, [r0]!
    vst1.64 {q0, q1}, [r0]!
    vst1.64 {q0, q1}, [r0]!
    subs    r12, r12, #1
    bne     memset_128_loop
    ands    r2, r2, #0x7f           /* bytes left after the 128-byte chunks */
    beq     memset_end
memset_32:
    movs    r12, r2, lsr #5         /* r12 = number of 32-byte chunks */
    beq     memset_16
memset_32_loop:
    subs    r12, r12, #1
    vst1.64 {q0, q1}, [r0]!         /* NEON store leaves the flags intact */
    bne     memset_32_loop
    ands    r2, r2, #0x1f
    beq     memset_end
memset_16:
    movs    r12, r2, lsr #4         /* r12 = number of 16-byte chunks */
    beq     memset_8
memset_16_loop:
    subs    r12, r12, #1
    vst1.32 {q0}, [r0]!
    bne     memset_16_loop
    ands    r2, r2, #0xf
    beq     memset_end
    /*
     * memset_8 isn't a loop, since we try to do our loops at 16
     * bytes and above. We should loop there, then drop down here
     * to finish the <16-byte versions. Same for memset_4 and
     * memset_2.
     */
memset_8:
    cmp     r2, #8
    blo     memset_4
    subs    r2, r2, #8
    vst1.32 {d0}, [r0]!
memset_4:
    cmp     r2, #4
    blo     memset_2
    subs    r2, r2, #4
    str     r1, [r0], #4
memset_2:
    cmp     r2, #0
    beq     memset_end
    strh    r1, [r0], #2
memset_end:
    pop     {r0}
    bx      lr

    .size memset16_neon, . - memset16_neon

    .end
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698