Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(409)

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm

Issue 756673003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD | NEW
1 ; 1 ;
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
71 71
72 sub r4, r3, r3, lsl #2 ; -dst_stride * 3 72 sub r4, r3, r3, lsl #2 ; -dst_stride * 3
73 add r4, r4, #4 ; -dst_stride * 3 + 4 73 add r4, r4, #4 ; -dst_stride * 3 + 4
74 74
75 rsb r9, r6, r1, lsl #2 ; reset src for outer loop 75 rsb r9, r6, r1, lsl #2 ; reset src for outer loop
76 sub r9, r9, #7 76 sub r9, r9, #7
77 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop 77 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
78 78
79 mov r10, r6 ; w loop counter 79 mov r10, r6 ; w loop counter
80 80
81 loop_horiz_v 81 vp9_convolve8_avg_loop_horiz_v
82 vld1.8 {d24}, [r0], r1 82 vld1.8 {d24}, [r0], r1
83 vld1.8 {d25}, [r0], r1 83 vld1.8 {d25}, [r0], r1
84 vld1.8 {d26}, [r0], r1 84 vld1.8 {d26}, [r0], r1
85 vld1.8 {d27}, [r0], r8 85 vld1.8 {d27}, [r0], r8
86 86
87 vtrn.16 q12, q13 87 vtrn.16 q12, q13
88 vtrn.8 d24, d25 88 vtrn.8 d24, d25
89 vtrn.8 d26, d27 89 vtrn.8 d26, d27
90 90
91 pld [r0, r1, lsl #2] 91 pld [r0, r1, lsl #2]
92 92
93 vmovl.u8 q8, d24 93 vmovl.u8 q8, d24
94 vmovl.u8 q9, d25 94 vmovl.u8 q9, d25
95 vmovl.u8 q10, d26 95 vmovl.u8 q10, d26
96 vmovl.u8 q11, d27 96 vmovl.u8 q11, d27
97 97
98 ; save a few instructions in the inner loop 98 ; save a few instructions in the inner loop
99 vswp d17, d18 99 vswp d17, d18
100 vmov d23, d21 100 vmov d23, d21
101 101
102 add r0, r0, #3 102 add r0, r0, #3
103 103
104 loop_horiz 104 vp9_convolve8_avg_loop_horiz
105 add r5, r0, #64 105 add r5, r0, #64
106 106
107 vld1.32 {d28[]}, [r0], r1 107 vld1.32 {d28[]}, [r0], r1
108 vld1.32 {d29[]}, [r0], r1 108 vld1.32 {d29[]}, [r0], r1
109 vld1.32 {d31[]}, [r0], r1 109 vld1.32 {d31[]}, [r0], r1
110 vld1.32 {d30[]}, [r0], r8 110 vld1.32 {d30[]}, [r0], r8
111 111
112 pld [r5] 112 pld [r5]
113 113
114 vtrn.16 d28, d31 114 vtrn.16 d28, d31
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
163 vst1.u32 {d3[0]}, [r2@32], r3 163 vst1.u32 {d3[0]}, [r2@32], r3
164 vst1.u32 {d2[1]}, [r2@32], r3 164 vst1.u32 {d2[1]}, [r2@32], r3
165 vst1.u32 {d3[1]}, [r2@32], r4 165 vst1.u32 {d3[1]}, [r2@32], r4
166 166
167 vmov q8, q9 167 vmov q8, q9
168 vmov d20, d23 168 vmov d20, d23
169 vmov q11, q12 169 vmov q11, q12
170 vmov q9, q13 170 vmov q9, q13
171 171
172 subs r6, r6, #4 ; w -= 4 172 subs r6, r6, #4 ; w -= 4
173 bgt loop_horiz 173 bgt vp9_convolve8_avg_loop_horiz
174 174
175 ; outer loop 175 ; outer loop
176 mov r6, r10 ; restore w counter 176 mov r6, r10 ; restore w counter
177 add r0, r0, r9 ; src += src_stride * 4 - w 177 add r0, r0, r9 ; src += src_stride * 4 - w
178 add r2, r2, r12 ; dst += dst_stride * 4 - w 178 add r2, r2, r12 ; dst += dst_stride * 4 - w
179 subs r7, r7, #4 ; h -= 4 179 subs r7, r7, #4 ; h -= 4
180 bgt loop_horiz_v 180 bgt vp9_convolve8_avg_loop_horiz_v
181 181
182 pop {r4-r10, pc} 182 pop {r4-r10, pc}
183 183
184 ENDP 184 ENDP
185 185
186 |vp9_convolve8_avg_vert_neon| PROC 186 |vp9_convolve8_avg_vert_neon| PROC
187 ldr r12, [sp, #12] 187 ldr r12, [sp, #12]
188 cmp r12, #16 188 cmp r12, #16
189 bne vp9_convolve8_avg_vert_c 189 bne vp9_convolve8_avg_vert_c
190 190
191 push {r4-r8, lr} 191 push {r4-r8, lr}
192 192
193 ; adjust for taps 193 ; adjust for taps
194 sub r0, r0, r1 194 sub r0, r0, r1
195 sub r0, r0, r1, lsl #1 195 sub r0, r0, r1, lsl #1
196 196
197 ldr r4, [sp, #32] ; filter_y 197 ldr r4, [sp, #32] ; filter_y
198 ldr r6, [sp, #40] ; w 198 ldr r6, [sp, #40] ; w
199 ldr lr, [sp, #44] ; h 199 ldr lr, [sp, #44] ; h
200 200
201 vld1.s16 {q0}, [r4] ; filter_y 201 vld1.s16 {q0}, [r4] ; filter_y
202 202
203 lsl r1, r1, #1 203 lsl r1, r1, #1
204 lsl r3, r3, #1 204 lsl r3, r3, #1
205 205
206 loop_vert_h 206 vp9_convolve8_avg_loop_vert_h
207 mov r4, r0 207 mov r4, r0
208 add r7, r0, r1, asr #1 208 add r7, r0, r1, asr #1
209 mov r5, r2 209 mov r5, r2
210 add r8, r2, r3, asr #1 210 add r8, r2, r3, asr #1
211 mov r12, lr ; h loop counter 211 mov r12, lr ; h loop counter
212 212
213 vld1.u32 {d16[0]}, [r4], r1 213 vld1.u32 {d16[0]}, [r4], r1
214 vld1.u32 {d16[1]}, [r7], r1 214 vld1.u32 {d16[1]}, [r7], r1
215 vld1.u32 {d18[0]}, [r4], r1 215 vld1.u32 {d18[0]}, [r4], r1
216 vld1.u32 {d18[1]}, [r7], r1 216 vld1.u32 {d18[1]}, [r7], r1
217 vld1.u32 {d20[0]}, [r4], r1 217 vld1.u32 {d20[0]}, [r4], r1
218 vld1.u32 {d20[1]}, [r7], r1 218 vld1.u32 {d20[1]}, [r7], r1
219 vld1.u32 {d22[0]}, [r4], r1 219 vld1.u32 {d22[0]}, [r4], r1
220 220
221 vmovl.u8 q8, d16 221 vmovl.u8 q8, d16
222 vmovl.u8 q9, d18 222 vmovl.u8 q9, d18
223 vmovl.u8 q10, d20 223 vmovl.u8 q10, d20
224 vmovl.u8 q11, d22 224 vmovl.u8 q11, d22
225 225
226 loop_vert 226 vp9_convolve8_avg_loop_vert
227 ; always process a 4x4 block at a time 227 ; always process a 4x4 block at a time
228 vld1.u32 {d24[0]}, [r7], r1 228 vld1.u32 {d24[0]}, [r7], r1
229 vld1.u32 {d26[0]}, [r4], r1 229 vld1.u32 {d26[0]}, [r4], r1
230 vld1.u32 {d26[1]}, [r7], r1 230 vld1.u32 {d26[1]}, [r7], r1
231 vld1.u32 {d24[1]}, [r4], r1 231 vld1.u32 {d24[1]}, [r4], r1
232 232
233 ; extract to s16 233 ; extract to s16
234 vmovl.u8 q12, d24 234 vmovl.u8 q12, d24
235 vmovl.u8 q13, d26 235 vmovl.u8 q13, d26
236 236
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
281 vst1.u32 {d3[0]}, [r5@32], r3 281 vst1.u32 {d3[0]}, [r5@32], r3
282 vst1.u32 {d3[1]}, [r8@32], r3 282 vst1.u32 {d3[1]}, [r8@32], r3
283 283
284 vmov q8, q10 284 vmov q8, q10
285 vmov d18, d22 285 vmov d18, d22
286 vmov d19, d24 286 vmov d19, d24
287 vmov q10, q13 287 vmov q10, q13
288 vmov d22, d25 288 vmov d22, d25
289 289
290 subs r12, r12, #4 ; h -= 4 290 subs r12, r12, #4 ; h -= 4
291 bgt loop_vert 291 bgt vp9_convolve8_avg_loop_vert
292 292
293 ; outer loop 293 ; outer loop
294 add r0, r0, #4 294 add r0, r0, #4
295 add r2, r2, #4 295 add r2, r2, #4
296 subs r6, r6, #4 ; w -= 4 296 subs r6, r6, #4 ; w -= 4
297 bgt loop_vert_h 297 bgt vp9_convolve8_avg_loop_vert_h
298 298
299 pop {r4-r8, pc} 299 pop {r4-r8, pc}
300 300
301 ENDP 301 ENDP
302 END 302 END
OLD | NEW
« no previous file with comments | « source/libvpx/vp8/vp8cx_arm.mk ('k') | source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698