OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
71 | 71 |
72 sub r4, r3, r3, lsl #2 ; -dst_stride * 3 | 72 sub r4, r3, r3, lsl #2 ; -dst_stride * 3 |
73 add r4, r4, #4 ; -dst_stride * 3 + 4 | 73 add r4, r4, #4 ; -dst_stride * 3 + 4 |
74 | 74 |
75 rsb r9, r6, r1, lsl #2 ; reset src for outer loop | 75 rsb r9, r6, r1, lsl #2 ; reset src for outer loop |
76 sub r9, r9, #7 | 76 sub r9, r9, #7 |
77 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop | 77 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop |
78 | 78 |
79 mov r10, r6 ; w loop counter | 79 mov r10, r6 ; w loop counter |
80 | 80 |
81 loop_horiz_v | 81 vp9_convolve8_avg_loop_horiz_v |
82 vld1.8 {d24}, [r0], r1 | 82 vld1.8 {d24}, [r0], r1 |
83 vld1.8 {d25}, [r0], r1 | 83 vld1.8 {d25}, [r0], r1 |
84 vld1.8 {d26}, [r0], r1 | 84 vld1.8 {d26}, [r0], r1 |
85 vld1.8 {d27}, [r0], r8 | 85 vld1.8 {d27}, [r0], r8 |
86 | 86 |
87 vtrn.16 q12, q13 | 87 vtrn.16 q12, q13 |
88 vtrn.8 d24, d25 | 88 vtrn.8 d24, d25 |
89 vtrn.8 d26, d27 | 89 vtrn.8 d26, d27 |
90 | 90 |
91 pld [r0, r1, lsl #2] | 91 pld [r0, r1, lsl #2] |
92 | 92 |
93 vmovl.u8 q8, d24 | 93 vmovl.u8 q8, d24 |
94 vmovl.u8 q9, d25 | 94 vmovl.u8 q9, d25 |
95 vmovl.u8 q10, d26 | 95 vmovl.u8 q10, d26 |
96 vmovl.u8 q11, d27 | 96 vmovl.u8 q11, d27 |
97 | 97 |
98 ; save a few instructions in the inner loop | 98 ; save a few instructions in the inner loop |
99 vswp d17, d18 | 99 vswp d17, d18 |
100 vmov d23, d21 | 100 vmov d23, d21 |
101 | 101 |
102 add r0, r0, #3 | 102 add r0, r0, #3 |
103 | 103 |
104 loop_horiz | 104 vp9_convolve8_avg_loop_horiz |
105 add r5, r0, #64 | 105 add r5, r0, #64 |
106 | 106 |
107 vld1.32 {d28[]}, [r0], r1 | 107 vld1.32 {d28[]}, [r0], r1 |
108 vld1.32 {d29[]}, [r0], r1 | 108 vld1.32 {d29[]}, [r0], r1 |
109 vld1.32 {d31[]}, [r0], r1 | 109 vld1.32 {d31[]}, [r0], r1 |
110 vld1.32 {d30[]}, [r0], r8 | 110 vld1.32 {d30[]}, [r0], r8 |
111 | 111 |
112 pld [r5] | 112 pld [r5] |
113 | 113 |
114 vtrn.16 d28, d31 | 114 vtrn.16 d28, d31 |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
163 vst1.u32 {d3[0]}, [r2@32], r3 | 163 vst1.u32 {d3[0]}, [r2@32], r3 |
164 vst1.u32 {d2[1]}, [r2@32], r3 | 164 vst1.u32 {d2[1]}, [r2@32], r3 |
165 vst1.u32 {d3[1]}, [r2@32], r4 | 165 vst1.u32 {d3[1]}, [r2@32], r4 |
166 | 166 |
167 vmov q8, q9 | 167 vmov q8, q9 |
168 vmov d20, d23 | 168 vmov d20, d23 |
169 vmov q11, q12 | 169 vmov q11, q12 |
170 vmov q9, q13 | 170 vmov q9, q13 |
171 | 171 |
172 subs r6, r6, #4 ; w -= 4 | 172 subs r6, r6, #4 ; w -= 4 |
173 bgt loop_horiz | 173 bgt vp9_convolve8_avg_loop_horiz |
174 | 174 |
175 ; outer loop | 175 ; outer loop |
176 mov r6, r10 ; restore w counter | 176 mov r6, r10 ; restore w counter |
177 add r0, r0, r9 ; src += src_stride * 4 - w | 177 add r0, r0, r9 ; src += src_stride * 4 - w |
178 add r2, r2, r12 ; dst += dst_stride * 4 - w | 178 add r2, r2, r12 ; dst += dst_stride * 4 - w |
179 subs r7, r7, #4 ; h -= 4 | 179 subs r7, r7, #4 ; h -= 4 |
180 bgt loop_horiz_v | 180 bgt vp9_convolve8_avg_loop_horiz_v |
181 | 181 |
182 pop {r4-r10, pc} | 182 pop {r4-r10, pc} |
183 | 183 |
184 ENDP | 184 ENDP |
185 | 185 |
186 |vp9_convolve8_avg_vert_neon| PROC | 186 |vp9_convolve8_avg_vert_neon| PROC |
187 ldr r12, [sp, #12] | 187 ldr r12, [sp, #12] |
188 cmp r12, #16 | 188 cmp r12, #16 |
189 bne vp9_convolve8_avg_vert_c | 189 bne vp9_convolve8_avg_vert_c |
190 | 190 |
191 push {r4-r8, lr} | 191 push {r4-r8, lr} |
192 | 192 |
193 ; adjust for taps | 193 ; adjust for taps |
194 sub r0, r0, r1 | 194 sub r0, r0, r1 |
195 sub r0, r0, r1, lsl #1 | 195 sub r0, r0, r1, lsl #1 |
196 | 196 |
197 ldr r4, [sp, #32] ; filter_y | 197 ldr r4, [sp, #32] ; filter_y |
198 ldr r6, [sp, #40] ; w | 198 ldr r6, [sp, #40] ; w |
199 ldr lr, [sp, #44] ; h | 199 ldr lr, [sp, #44] ; h |
200 | 200 |
201 vld1.s16 {q0}, [r4] ; filter_y | 201 vld1.s16 {q0}, [r4] ; filter_y |
202 | 202 |
203 lsl r1, r1, #1 | 203 lsl r1, r1, #1 |
204 lsl r3, r3, #1 | 204 lsl r3, r3, #1 |
205 | 205 |
206 loop_vert_h | 206 vp9_convolve8_avg_loop_vert_h |
207 mov r4, r0 | 207 mov r4, r0 |
208 add r7, r0, r1, asr #1 | 208 add r7, r0, r1, asr #1 |
209 mov r5, r2 | 209 mov r5, r2 |
210 add r8, r2, r3, asr #1 | 210 add r8, r2, r3, asr #1 |
211 mov r12, lr ; h loop counter | 211 mov r12, lr ; h loop counter |
212 | 212 |
213 vld1.u32 {d16[0]}, [r4], r1 | 213 vld1.u32 {d16[0]}, [r4], r1 |
214 vld1.u32 {d16[1]}, [r7], r1 | 214 vld1.u32 {d16[1]}, [r7], r1 |
215 vld1.u32 {d18[0]}, [r4], r1 | 215 vld1.u32 {d18[0]}, [r4], r1 |
216 vld1.u32 {d18[1]}, [r7], r1 | 216 vld1.u32 {d18[1]}, [r7], r1 |
217 vld1.u32 {d20[0]}, [r4], r1 | 217 vld1.u32 {d20[0]}, [r4], r1 |
218 vld1.u32 {d20[1]}, [r7], r1 | 218 vld1.u32 {d20[1]}, [r7], r1 |
219 vld1.u32 {d22[0]}, [r4], r1 | 219 vld1.u32 {d22[0]}, [r4], r1 |
220 | 220 |
221 vmovl.u8 q8, d16 | 221 vmovl.u8 q8, d16 |
222 vmovl.u8 q9, d18 | 222 vmovl.u8 q9, d18 |
223 vmovl.u8 q10, d20 | 223 vmovl.u8 q10, d20 |
224 vmovl.u8 q11, d22 | 224 vmovl.u8 q11, d22 |
225 | 225 |
226 loop_vert | 226 vp9_convolve8_avg_loop_vert |
227 ; always process a 4x4 block at a time | 227 ; always process a 4x4 block at a time |
228 vld1.u32 {d24[0]}, [r7], r1 | 228 vld1.u32 {d24[0]}, [r7], r1 |
229 vld1.u32 {d26[0]}, [r4], r1 | 229 vld1.u32 {d26[0]}, [r4], r1 |
230 vld1.u32 {d26[1]}, [r7], r1 | 230 vld1.u32 {d26[1]}, [r7], r1 |
231 vld1.u32 {d24[1]}, [r4], r1 | 231 vld1.u32 {d24[1]}, [r4], r1 |
232 | 232 |
233 ; extract to s16 | 233 ; extract to s16 |
234 vmovl.u8 q12, d24 | 234 vmovl.u8 q12, d24 |
235 vmovl.u8 q13, d26 | 235 vmovl.u8 q13, d26 |
236 | 236 |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
281 vst1.u32 {d3[0]}, [r5@32], r3 | 281 vst1.u32 {d3[0]}, [r5@32], r3 |
282 vst1.u32 {d3[1]}, [r8@32], r3 | 282 vst1.u32 {d3[1]}, [r8@32], r3 |
283 | 283 |
284 vmov q8, q10 | 284 vmov q8, q10 |
285 vmov d18, d22 | 285 vmov d18, d22 |
286 vmov d19, d24 | 286 vmov d19, d24 |
287 vmov q10, q13 | 287 vmov q10, q13 |
288 vmov d22, d25 | 288 vmov d22, d25 |
289 | 289 |
290 subs r12, r12, #4 ; h -= 4 | 290 subs r12, r12, #4 ; h -= 4 |
291 bgt loop_vert | 291 bgt vp9_convolve8_avg_loop_vert |
292 | 292 |
293 ; outer loop | 293 ; outer loop |
294 add r0, r0, #4 | 294 add r0, r0, #4 |
295 add r2, r2, #4 | 295 add r2, r2, #4 |
296 subs r6, r6, #4 ; w -= 4 | 296 subs r6, r6, #4 ; w -= 4 |
297 bgt loop_vert_h | 297 bgt vp9_convolve8_avg_loop_vert_h |
298 | 298 |
299 pop {r4-r8, pc} | 299 pop {r4-r8, pc} |
300 | 300 |
301 ENDP | 301 ENDP |
302 END | 302 END |
OLD | NEW |