Chromium Code Reviews

Unified Diff: source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm

Issue 23600008: libvpx: Pull from upstream (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 3 months ago
--- a/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
@@ -1,10 +1,10 @@
 ;
 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 ;
 ; Use of this source code is governed by a BSD-style license
 ; that can be found in the LICENSE file in the root of the source
 ; tree. An additional intellectual property rights grant can be found
 ; in the file PATENTS. All contributing project authors may
 ; be found in the AUTHORS file in the root of the source tree.
 ;
 
@@ -45,233 +45,258 @@
 ; r2      uint8_t *dst
 ; r3      int dst_stride
 ; sp[]    const int16_t *filter_x
 ; sp[]    int x_step_q4
 ; sp[]    const int16_t *filter_y  ; unused
 ; sp[]    int y_step_q4            ; unused
 ; sp[]    int w
 ; sp[]    int h
 
 |vp9_convolve8_avg_horiz_neon| PROC
+    ldr             r12, [sp, #4]           ; x_step_q4
+    cmp             r12, #16
+    bne             vp9_convolve8_avg_horiz_c
+
     push            {r4-r10, lr}
 
     sub             r0, r0, #3              ; adjust for taps
 
-    ldr             r4, [sp, #36]           ; x_step_q4
     ldr             r5, [sp, #32]           ; filter_x
-    cmp             r4, #16
-    bne             call_horiz_c_convolve   ; x_step_q4 != 16
-
     ldr             r6, [sp, #48]           ; w
     ldr             r7, [sp, #52]           ; h
 
     vld1.s16        {q0}, [r5]              ; filter_x
 
-    add             r8, r1, r1, lsl #1      ; src_stride * 3
-    add             r8, r8, #4              ; src_stride * 3 + 4
-    rsb             r8, r8, #0              ; reset for src
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4
 
-    add             r4, r3, r3, lsl #1      ; dst_stride * 3
-    sub             r4, r4, #4              ; dst_stride * 3 - 4
-    rsb             r4, r4, #0              ; reset for dst
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4
 
-    sub             r9, r1, #8              ; post increment for src load
-
-    rsb             r1, r6, r1, lsl #2      ; reset src for outer loop
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7
     rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
 
     mov             r10, r6                 ; w loop counter
 
-loop_horiz
-    vld4.u8         {d24[0], d25[0], d26[0], d27[0]}, [r0]!
-    vld4.u8         {d24[4], d25[4], d26[4], d27[4]}, [r0]!
-    vld3.u8         {d28[0], d29[0], d30[0]}, [r0], r9
+loop_horiz_v
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8
 
-    vld4.u8         {d24[1], d25[1], d26[1], d27[1]}, [r0]!
-    vld4.u8         {d24[5], d25[5], d26[5], d27[5]}, [r0]!
-    vld3.u8         {d28[1], d29[1], d30[1]}, [r0], r9
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
 
-    vld4.u8         {d24[2], d25[2], d26[2], d27[2]}, [r0]!
-    vld4.u8         {d24[6], d25[6], d26[6], d27[6]}, [r0]!
-    vld3.u8         {d28[2], d29[2], d30[2]}, [r0], r9
+    pld             [r0, r1, lsl #2]
 
-    vld4.u8         {d24[3], d25[3], d26[3], d27[3]}, [r0]!
-    vld4.u8         {d24[7], d25[7], d26[7], d27[7]}, [r0]!
-    vld3.u8         {d28[3], d29[3], d30[3]}, [r0], r8
-
-    ; extract to s16
     vmovl.u8        q8, d24
     vmovl.u8        q9, d25
     vmovl.u8        q10, d26
     vmovl.u8        q11, d27
-    vtrn.32         d28, d29                ; only the first half is populated
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+loop_horiz
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
     vmovl.u8        q12, d28
-    vmovl.u8        q13, d30
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
 
     ; slightly out of order load to match the existing data
     vld1.u32        {d6[0]}, [r2], r3
     vld1.u32        {d7[0]}, [r2], r3
     vld1.u32        {d6[1]}, [r2], r3
     vld1.u32        {d7[1]}, [r2], r3
 
     sub             r2, r2, r3, lsl #2      ; reset for store
 
     ; src[] * filter_x
-    MULTIPLY_BY_Q0  q1, d16, d18, d20, d22, d17, d19, d21, d23
-    MULTIPLY_BY_Q0  q2, d18, d20, d22, d17, d19, d21, d23, d24
-    MULTIPLY_BY_Q0  q14, d20, d22, d17, d19, d21, d23, d24, d25
-    MULTIPLY_BY_Q0  q15, d22, d17, d19, d21, d23, d24, d25, d26
+    MULTIPLY_BY_Q0  q1, d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2, d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]
 
     ; += 64 >> 7
     vqrshrun.s32    d2, q1, #7
     vqrshrun.s32    d3, q2, #7
     vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7
 
     ; saturate
-    vqshrn.u16      d2, q1, #0
-    vqshrn.u16      d3, q2, #0
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
 
     ; transpose
     vtrn.16         d2, d3
     vtrn.32         d2, d3
     vtrn.8          d2, d3
 
     ; average the new value and the dst value
-    vaddl.u8        q8, d2, d6
-    vaddl.u8        q9, d3, d7
-    vqrshrn.u16     d2, q8, #1
-    vqrshrn.u16     d3, q9, #1
+    vrhadd.u8       q1, q1, q3
 
-    vst1.u32        {d2[0]}, [r2], r3
-    vst1.u32        {d3[0]}, [r2], r3
-    vst1.u32        {d2[1]}, [r2], r3
-    vst1.u32        {d3[1]}, [r2], r4
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
+
+    vmov            q8, q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9, q13
 
     subs            r6, r6, #4              ; w -= 4
     bgt             loop_horiz
 
     ; outer loop
     mov             r6, r10                 ; restore w counter
-    add             r0, r0, r1              ; src += src_stride * 4 - w
+    add             r0, r0, r9              ; src += src_stride * 4 - w
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
     subs            r7, r7, #4              ; h -= 4
-    bgt             loop_horiz
+    bgt             loop_horiz_v
 
     pop             {r4-r10, pc}
 
-call_horiz_c_convolve
-    pop             {r4-r10, lr}
-    add             r0, r0, #3              ; un-adjust for taps
-    b               vp9_convolve8_avg_horiz_c
-
-
     ENDP
 
 |vp9_convolve8_avg_vert_neon| PROC
-    push            {r4-r10, lr}
+    ldr             r12, [sp, #12]
+    cmp             r12, #16
+    bne             vp9_convolve8_avg_vert_c
+
+    push            {r4-r8, lr}
 
     ; adjust for taps
     sub             r0, r0, r1
     sub             r0, r0, r1, lsl #1
 
-    ldr             r6, [sp, #44]           ; y_step_q4
-    ldr             r7, [sp, #40]           ; filter_y
-    cmp             r6, #16
-    bne             call_vert_c_convolve    ; y_step_q4 != 16
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h
 
-    ldr             r8, [sp, #48]           ; w
-    ldr             r9, [sp, #52]           ; h
+    vld1.s16        {q0}, [r4]              ; filter_y
 
-    vld1.s16        {q0}, [r7]              ; filter_y
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1
 
-    mov             r5, r1, lsl #1          ; src_stride * 2
-    add             r5, r5, r1, lsl #3      ; src_stride * 10
-    sub             r5, r5, #4              ; src_stride * 10 + 4
-    rsb             r5, r5, #0              ; reset for src
+loop_vert_h
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 ; h loop counter
 
-    add             r6, r3, r3, lsl #1      ; dst_stride * 3
-    sub             r6, r6, #4              ; dst_stride * 3 - 4
-    rsb             r6, r6, #0              ; reset for dst
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
 
-    rsb             r7, r8, r1, lsl #2      ; reset src for outer loop
-    rsb             r12, r8, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r8                 ; w loop counter
-
-loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d16[0]}, [r0], r1
-    vld1.u32        {d16[1]}, [r0], r1
-    vld1.u32        {d18[0]}, [r0], r1
-    vld1.u32        {d18[1]}, [r0], r1
-    vld1.u32        {d20[0]}, [r0], r1
-    vld1.u32        {d20[1]}, [r0], r1
-    vld1.u32        {d22[0]}, [r0], r1
-    vld1.u32        {d22[1]}, [r0], r1
-    vld1.u32        {d24[0]}, [r0], r1
-    vld1.u32        {d24[1]}, [r0], r1
-    vld1.u32        {d26[0]}, [r0], r5
-
-    ; extract to s16
     vmovl.u8        q8, d16
     vmovl.u8        q9, d18
     vmovl.u8        q10, d20
     vmovl.u8        q11, d22
+
+loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
     vmovl.u8        q12, d24
     vmovl.u8        q13, d26
 
-    vld1.u32        {d6[0]}, [r2], r3
-    vld1.u32        {d6[1]}, [r2], r3
-    vld1.u32        {d7[0]}, [r2], r3
-    vld1.u32        {d7[1]}, [r2], r3
+    vld1.u32        {d6[0]}, [r5@32], r3
+    vld1.u32        {d6[1]}, [r8@32], r3
+    vld1.u32        {d7[0]}, [r5@32], r3
+    vld1.u32        {d7[1]}, [r8@32], r3
 
-    sub             r2, r2, r3, lsl #2      ; reset for store
+    pld             [r7]
+    pld             [r4]
 
     ; src[] * filter_y
-    MULTIPLY_BY_Q0  q1, d16, d17, d18, d19, d20, d21, d22, d23
-    MULTIPLY_BY_Q0  q2, d17, d18, d19, d20, d21, d22, d23, d24
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d23, d24, d25
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d23, d24, d25, d26
+    MULTIPLY_BY_Q0  q1, d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q2, d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
 
     ; += 64 >> 7
     vqrshrun.s32    d2, q1, #7
     vqrshrun.s32    d3, q2, #7
     vqrshrun.s32    d4, q14, #7
     vqrshrun.s32    d5, q15, #7
 
     ; saturate
-    vqshrn.u16      d2, q1, #0
-    vqshrn.u16      d3, q2, #0
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
 
     ; average the new value and the dst value
-    vaddl.u8        q8, d2, d6
-    vaddl.u8        q9, d3, d7
-    vqrshrn.u16     d2, q8, #1
-    vqrshrn.u16     d3, q9, #1
+    vrhadd.u8       q1, q1, q3
 
-    vst1.u32        {d2[0]}, [r2], r3
-    vst1.u32        {d2[1]}, [r2], r3
-    vst1.u32        {d3[0]}, [r2], r3
-    vst1.u32        {d3[1]}, [r2], r6
+    sub             r5, r5, r3, lsl #1      ; reset for store
+    sub             r8, r8, r3, lsl #1
 
-    subs            r8, r8, #4              ; w -= 4
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
+
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
     bgt             loop_vert
 
     ; outer loop
-    mov             r8, r10                 ; restore w counter
-    add             r0, r0, r7              ; src += 4 * src_stride - w
-    add             r2, r2, r12             ; dst += 4 * dst_stride - w
-    subs            r9, r9, #4              ; h -= 4
-    bgt             loop_vert
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              ; w -= 4
+    bgt             loop_vert_h
 
-    pop             {r4-r10, pc}
-
-call_vert_c_convolve
-    pop             {r4-r10, lr}
-    ; un-adjust for taps
-    add             r0, r0, r1
-    add             r0, r0, r1, lsl #1
-    b               vp9_convolve8_avg_vert_c
-
     ENDP
     END
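
A note on the arithmetic, for reviewers reading the NEON: both procedures run an 8-tap filter (MULTIPLY_BY_Q0 multiplies eight neighboring samples by the taps held in q0), round with the "+= 64 >> 7" step, saturate to 8 bits, and then average the result with the bytes already in dst. Below is a minimal scalar sketch in C of the horizontal case, assuming the usual VP9 convention of 8-tap filters with 7-bit precision; the function and helper names are illustrative, not libvpx's API:

#include <stdint.h>

/* Hypothetical helper: clamp an int to the uint8_t range. */
static uint8_t clip_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar model of the convolve-and-average arithmetic above. */
static void convolve8_avg_horiz_sketch(const uint8_t *src, int src_stride,
                                       uint8_t *dst, int dst_stride,
                                       const int16_t filter[8], int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int sum = 0;
      /* Taps span src[x - 3 .. x + 4]; the asm pre-biases the pointer
       * with "sub r0, r0, #3 ; adjust for taps". */
      for (int k = 0; k < 8; ++k)
        sum += src[y * src_stride + x + k - 3] * filter[k];
      /* "+= 64 >> 7" then "saturate" (vqrshrun / vqmovn in the asm). */
      const int res = clip_u8((sum + 64) >> 7);
      /* "average the new value and the dst value". */
      uint8_t *d = &dst[y * dst_stride + x];
      *d = (uint8_t)((*d + res + 1) >> 1);
    }
  }
}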
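
The averaging step itself is what changed on both paths: the old code widened with vaddl.u8 and narrowed back with a rounding narrowing shift by 1 (vqrshrn.u16 ..., #1); the new code does it in a single vrhadd.u8. Both compute the round-to-nearest average (a + b + 1) >> 1 in every unsigned 8-bit lane, and the old path's saturation can never fire, since (255 + 255 + 1) >> 1 = 255. A small exhaustive check of that equivalence, as a sketch:

#include <assert.h>
#include <stdint.h>

int main(void) {
  for (int a = 0; a < 256; ++a) {
    for (int b = 0; b < 256; ++b) {
      const uint16_t wide = (uint16_t)(a + b);             /* vaddl.u8   */
      const uint8_t old_way = (uint8_t)((wide + 1) >> 1);  /* vqrshrn #1 */
      const uint8_t new_way = (uint8_t)((a + b + 1) >> 1); /* vrhadd.u8  */
      assert(old_way == new_way);
    }
  }
  return 0;
}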