Chromium Code Reviews

Unified Diff: source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm

Issue 756673003: libvpx: Pull from upstream (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years ago
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

(...skipping 60 matching lines...)

    sub r4, r3, r3, lsl #2         ; -dst_stride * 3
    add r4, r4, #4                 ; -dst_stride * 3 + 4

    rsb r9, r6, r1, lsl #2         ; reset src for outer loop
    sub r9, r9, #7
    rsb r12, r6, r3, lsl #2        ; reset dst for outer loop

    mov r10, r6                    ; w loop counter

-loop_horiz_v
+vp9_convolve8_loop_horiz_v
    vld1.8 {d24}, [r0], r1
    vld1.8 {d25}, [r0], r1
    vld1.8 {d26}, [r0], r1
    vld1.8 {d27}, [r0], r8

    vtrn.16 q12, q13
    vtrn.8 d24, d25
    vtrn.8 d26, d27

    pld [r0, r1, lsl #2]

    vmovl.u8 q8, d24
    vmovl.u8 q9, d25
    vmovl.u8 q10, d26
    vmovl.u8 q11, d27

    ; save a few instructions in the inner loop
    vswp d17, d18
    vmov d23, d21

    add r0, r0, #3

-loop_horiz
+vp9_convolve8_loop_horiz
    add r5, r0, #64

    vld1.32 {d28[]}, [r0], r1
    vld1.32 {d29[]}, [r0], r1
    vld1.32 {d31[]}, [r0], r1
    vld1.32 {d30[]}, [r0], r8

    pld [r5]

    vtrn.16 d28, d31
(...skipping 37 matching lines...)
    vst1.u32 {d3[0]}, [r2@32], r3
    vst1.u32 {d2[1]}, [r2@32], r3
    vst1.u32 {d3[1]}, [r2@32], r4

    vmov q8, q9
    vmov d20, d23
    vmov q11, q12
    vmov q9, q13

    subs r6, r6, #4                ; w -= 4
-    bgt loop_horiz
+    bgt vp9_convolve8_loop_horiz

    ; outer loop
    mov r6, r10                    ; restore w counter
    add r0, r0, r9                 ; src += src_stride * 4 - w
    add r2, r2, r12                ; dst += dst_stride * 4 - w
    subs r7, r7, #4                ; h -= 4
-    bgt loop_horiz_v
+    bgt vp9_convolve8_loop_horiz_v

    pop {r4-r10, pc}

    ENDP

|vp9_convolve8_vert_neon| PROC
    ldr r12, [sp, #12]
    cmp r12, #16
    bne vp9_convolve8_vert_c

    push {r4-r8, lr}

    ; adjust for taps
    sub r0, r0, r1
    sub r0, r0, r1, lsl #1

    ldr r4, [sp, #32]              ; filter_y
    ldr r6, [sp, #40]              ; w
    ldr lr, [sp, #44]              ; h

    vld1.s16 {q0}, [r4]            ; filter_y

    lsl r1, r1, #1
    lsl r3, r3, #1

-loop_vert_h
+vp9_convolve8_loop_vert_h
    mov r4, r0
    add r7, r0, r1, asr #1
    mov r5, r2
    add r8, r2, r3, asr #1
    mov r12, lr                    ; h loop counter

    vld1.u32 {d16[0]}, [r4], r1
    vld1.u32 {d16[1]}, [r7], r1
    vld1.u32 {d18[0]}, [r4], r1
    vld1.u32 {d18[1]}, [r7], r1
    vld1.u32 {d20[0]}, [r4], r1
    vld1.u32 {d20[1]}, [r7], r1
    vld1.u32 {d22[0]}, [r4], r1

    vmovl.u8 q8, d16
    vmovl.u8 q9, d18
    vmovl.u8 q10, d20
    vmovl.u8 q11, d22

-loop_vert
+vp9_convolve8_loop_vert
    ; always process a 4x4 block at a time
    vld1.u32 {d24[0]}, [r7], r1
    vld1.u32 {d26[0]}, [r4], r1
    vld1.u32 {d26[1]}, [r7], r1
    vld1.u32 {d24[1]}, [r4], r1

    ; extract to s16
    vmovl.u8 q12, d24
    vmovl.u8 q13, d26

(...skipping 33 matching lines...)
    vst1.u32 {d3[0]}, [r5@32], r3
    vst1.u32 {d3[1]}, [r8@32], r3

    vmov q8, q10
    vmov d18, d22
    vmov d19, d24
    vmov q10, q13
    vmov d22, d25

    subs r12, r12, #4              ; h -= 4
-    bgt loop_vert
+    bgt vp9_convolve8_loop_vert

    ; outer loop
    add r0, r0, #4
    add r2, r2, #4
    subs r6, r6, #4                ; w -= 4
-    bgt loop_vert_h
+    bgt vp9_convolve8_loop_vert_h

    pop {r4-r8, pc}

    ENDP
    END
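
For context, the patch itself only renames the local loop labels to vp9_convolve8_-prefixed names; the routines compute VP9's 8-tap sub-pixel filter, where each output pixel is a rounded 7-bit fixed-point dot product of eight neighbouring source pixels, processed four at a time with NEON. Below is a minimal C sketch of the horizontal case; the function name, prototype, and clipping helper are illustrative assumptions for this review, not the exact vp9_convolve8_horiz_c implementation.

#include <stdint.h>

/* Saturate the filter accumulator back to an 8-bit pixel. */
static uint8_t clip_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* Hypothetical reference for the horizontal 8-tap convolve.
 * filter[] holds 7-bit fixed-point coefficients, so each sum is
 * rounded and shifted right by 7 before being clipped to a byte. */
static void convolve8_horiz_sketch(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   const int16_t filter[8], int w, int h) {
  src -= 3;  /* 8 taps centred on the pixel: reach back 3 samples */
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int sum = 0;
      for (int k = 0; k < 8; ++k)
        sum += filter[k] * src[x + k];
      dst[x] = clip_u8((sum + 64) >> 7);
    }
    src += src_stride;
    dst += dst_stride;
  }
}

The vertical routine is the same dot product taken down a column, stepping by src_stride instead of by one pixel; that is why the "adjust for taps" sequence in vp9_convolve8_vert_neon (sub r0, r0, r1 followed by sub r0, r0, r1, lsl #1) backs the source pointer up by three strides before the loops begin.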