OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
71 | 71 |
72 sub r4, r3, r3, lsl #2 ; -dst_stride * 3 | 72 sub r4, r3, r3, lsl #2 ; -dst_stride * 3 |
73 add r4, r4, #4 ; -dst_stride * 3 + 4 | 73 add r4, r4, #4 ; -dst_stride * 3 + 4 |
74 | 74 |
75 rsb r9, r6, r1, lsl #2 ; reset src for outer loop | 75 rsb r9, r6, r1, lsl #2 ; reset src for outer loop |
76 sub r9, r9, #7 | 76 sub r9, r9, #7 |
77 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop | 77 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop |
78 | 78 |
79 mov r10, r6 ; w loop counter | 79 mov r10, r6 ; w loop counter |
80 | 80 |
81 loop_horiz_v | 81 vp9_convolve8_loop_horiz_v |
82 vld1.8 {d24}, [r0], r1 | 82 vld1.8 {d24}, [r0], r1 |
83 vld1.8 {d25}, [r0], r1 | 83 vld1.8 {d25}, [r0], r1 |
84 vld1.8 {d26}, [r0], r1 | 84 vld1.8 {d26}, [r0], r1 |
85 vld1.8 {d27}, [r0], r8 | 85 vld1.8 {d27}, [r0], r8 |
86 | 86 |
87 vtrn.16 q12, q13 | 87 vtrn.16 q12, q13 |
88 vtrn.8 d24, d25 | 88 vtrn.8 d24, d25 |
89 vtrn.8 d26, d27 | 89 vtrn.8 d26, d27 |
90 | 90 |
91 pld [r0, r1, lsl #2] | 91 pld [r0, r1, lsl #2] |
92 | 92 |
93 vmovl.u8 q8, d24 | 93 vmovl.u8 q8, d24 |
94 vmovl.u8 q9, d25 | 94 vmovl.u8 q9, d25 |
95 vmovl.u8 q10, d26 | 95 vmovl.u8 q10, d26 |
96 vmovl.u8 q11, d27 | 96 vmovl.u8 q11, d27 |
97 | 97 |
98 ; save a few instructions in the inner loop | 98 ; save a few instructions in the inner loop |
99 vswp d17, d18 | 99 vswp d17, d18 |
100 vmov d23, d21 | 100 vmov d23, d21 |
101 | 101 |
102 add r0, r0, #3 | 102 add r0, r0, #3 |
103 | 103 |
104 loop_horiz | 104 vp9_convolve8_loop_horiz |
105 add r5, r0, #64 | 105 add r5, r0, #64 |
106 | 106 |
107 vld1.32 {d28[]}, [r0], r1 | 107 vld1.32 {d28[]}, [r0], r1 |
108 vld1.32 {d29[]}, [r0], r1 | 108 vld1.32 {d29[]}, [r0], r1 |
109 vld1.32 {d31[]}, [r0], r1 | 109 vld1.32 {d31[]}, [r0], r1 |
110 vld1.32 {d30[]}, [r0], r8 | 110 vld1.32 {d30[]}, [r0], r8 |
111 | 111 |
112 pld [r5] | 112 pld [r5] |
113 | 113 |
114 vtrn.16 d28, d31 | 114 vtrn.16 d28, d31 |
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
152 vst1.u32 {d3[0]}, [r2@32], r3 | 152 vst1.u32 {d3[0]}, [r2@32], r3 |
153 vst1.u32 {d2[1]}, [r2@32], r3 | 153 vst1.u32 {d2[1]}, [r2@32], r3 |
154 vst1.u32 {d3[1]}, [r2@32], r4 | 154 vst1.u32 {d3[1]}, [r2@32], r4 |
155 | 155 |
156 vmov q8, q9 | 156 vmov q8, q9 |
157 vmov d20, d23 | 157 vmov d20, d23 |
158 vmov q11, q12 | 158 vmov q11, q12 |
159 vmov q9, q13 | 159 vmov q9, q13 |
160 | 160 |
161 subs r6, r6, #4 ; w -= 4 | 161 subs r6, r6, #4 ; w -= 4 |
162 bgt loop_horiz | 162 bgt vp9_convolve8_loop_horiz |
163 | 163 |
164 ; outer loop | 164 ; outer loop |
165 mov r6, r10 ; restore w counter | 165 mov r6, r10 ; restore w counter |
166 add r0, r0, r9 ; src += src_stride * 4 - w | 166 add r0, r0, r9 ; src += src_stride * 4 - w |
167 add r2, r2, r12 ; dst += dst_stride * 4 - w | 167 add r2, r2, r12 ; dst += dst_stride * 4 - w |
168 subs r7, r7, #4 ; h -= 4 | 168 subs r7, r7, #4 ; h -= 4 |
169 bgt loop_horiz_v | 169 bgt vp9_convolve8_loop_horiz_v |
170 | 170 |
171 pop {r4-r10, pc} | 171 pop {r4-r10, pc} |
172 | 172 |
173 ENDP | 173 ENDP |
174 | 174 |
175 |vp9_convolve8_vert_neon| PROC | 175 |vp9_convolve8_vert_neon| PROC |
176 ldr r12, [sp, #12] | 176 ldr r12, [sp, #12] |
177 cmp r12, #16 | 177 cmp r12, #16 |
178 bne vp9_convolve8_vert_c | 178 bne vp9_convolve8_vert_c |
179 | 179 |
180 push {r4-r8, lr} | 180 push {r4-r8, lr} |
181 | 181 |
182 ; adjust for taps | 182 ; adjust for taps |
183 sub r0, r0, r1 | 183 sub r0, r0, r1 |
184 sub r0, r0, r1, lsl #1 | 184 sub r0, r0, r1, lsl #1 |
185 | 185 |
186 ldr r4, [sp, #32] ; filter_y | 186 ldr r4, [sp, #32] ; filter_y |
187 ldr r6, [sp, #40] ; w | 187 ldr r6, [sp, #40] ; w |
188 ldr lr, [sp, #44] ; h | 188 ldr lr, [sp, #44] ; h |
189 | 189 |
190 vld1.s16 {q0}, [r4] ; filter_y | 190 vld1.s16 {q0}, [r4] ; filter_y |
191 | 191 |
192 lsl r1, r1, #1 | 192 lsl r1, r1, #1 |
193 lsl r3, r3, #1 | 193 lsl r3, r3, #1 |
194 | 194 |
195 loop_vert_h | 195 vp9_convolve8_loop_vert_h |
196 mov r4, r0 | 196 mov r4, r0 |
197 add r7, r0, r1, asr #1 | 197 add r7, r0, r1, asr #1 |
198 mov r5, r2 | 198 mov r5, r2 |
199 add r8, r2, r3, asr #1 | 199 add r8, r2, r3, asr #1 |
200 mov r12, lr ; h loop counter | 200 mov r12, lr ; h loop counter |
201 | 201 |
202 vld1.u32 {d16[0]}, [r4], r1 | 202 vld1.u32 {d16[0]}, [r4], r1 |
203 vld1.u32 {d16[1]}, [r7], r1 | 203 vld1.u32 {d16[1]}, [r7], r1 |
204 vld1.u32 {d18[0]}, [r4], r1 | 204 vld1.u32 {d18[0]}, [r4], r1 |
205 vld1.u32 {d18[1]}, [r7], r1 | 205 vld1.u32 {d18[1]}, [r7], r1 |
206 vld1.u32 {d20[0]}, [r4], r1 | 206 vld1.u32 {d20[0]}, [r4], r1 |
207 vld1.u32 {d20[1]}, [r7], r1 | 207 vld1.u32 {d20[1]}, [r7], r1 |
208 vld1.u32 {d22[0]}, [r4], r1 | 208 vld1.u32 {d22[0]}, [r4], r1 |
209 | 209 |
210 vmovl.u8 q8, d16 | 210 vmovl.u8 q8, d16 |
211 vmovl.u8 q9, d18 | 211 vmovl.u8 q9, d18 |
212 vmovl.u8 q10, d20 | 212 vmovl.u8 q10, d20 |
213 vmovl.u8 q11, d22 | 213 vmovl.u8 q11, d22 |
214 | 214 |
215 loop_vert | 215 vp9_convolve8_loop_vert |
216 ; always process a 4x4 block at a time | 216 ; always process a 4x4 block at a time |
217 vld1.u32 {d24[0]}, [r7], r1 | 217 vld1.u32 {d24[0]}, [r7], r1 |
218 vld1.u32 {d26[0]}, [r4], r1 | 218 vld1.u32 {d26[0]}, [r4], r1 |
219 vld1.u32 {d26[1]}, [r7], r1 | 219 vld1.u32 {d26[1]}, [r7], r1 |
220 vld1.u32 {d24[1]}, [r4], r1 | 220 vld1.u32 {d24[1]}, [r4], r1 |
221 | 221 |
222 ; extract to s16 | 222 ; extract to s16 |
223 vmovl.u8 q12, d24 | 223 vmovl.u8 q12, d24 |
224 vmovl.u8 q13, d26 | 224 vmovl.u8 q13, d26 |
225 | 225 |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
259 vst1.u32 {d3[0]}, [r5@32], r3 | 259 vst1.u32 {d3[0]}, [r5@32], r3 |
260 vst1.u32 {d3[1]}, [r8@32], r3 | 260 vst1.u32 {d3[1]}, [r8@32], r3 |
261 | 261 |
262 vmov q8, q10 | 262 vmov q8, q10 |
263 vmov d18, d22 | 263 vmov d18, d22 |
264 vmov d19, d24 | 264 vmov d19, d24 |
265 vmov q10, q13 | 265 vmov q10, q13 |
266 vmov d22, d25 | 266 vmov d22, d25 |
267 | 267 |
268 subs r12, r12, #4 ; h -= 4 | 268 subs r12, r12, #4 ; h -= 4 |
269 bgt loop_vert | 269 bgt vp9_convolve8_loop_vert |
270 | 270 |
271 ; outer loop | 271 ; outer loop |
272 add r0, r0, #4 | 272 add r0, r0, #4 |
273 add r2, r2, #4 | 273 add r2, r2, #4 |
274 subs r6, r6, #4 ; w -= 4 | 274 subs r6, r6, #4 ; w -= 4 |
275 bgt loop_vert_h | 275 bgt vp9_convolve8_loop_vert_h |
276 | 276 |
277 pop {r4-r8, pc} | 277 pop {r4-r8, pc} |
278 | 278 |
279 ENDP | 279 ENDP |
280 END | 280 END |
OLD | NEW |