Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(459)

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm

Issue 23600008: libvpx: Pull from upstream (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 ; 1 ;
2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
45 ; r2 uint8_t *dst 45 ; r2 uint8_t *dst
46 ; r3 int dst_stride 46 ; r3 int dst_stride
47 ; sp[]const int16_t *filter_x 47 ; sp[]const int16_t *filter_x
48 ; sp[]int x_step_q4 48 ; sp[]int x_step_q4
49 ; sp[]const int16_t *filter_y ; unused 49 ; sp[]const int16_t *filter_y ; unused
50 ; sp[]int y_step_q4 ; unused 50 ; sp[]int y_step_q4 ; unused
51 ; sp[]int w 51 ; sp[]int w
52 ; sp[]int h 52 ; sp[]int h
53 53
54 |vp9_convolve8_horiz_neon| PROC 54 |vp9_convolve8_horiz_neon| PROC
55 ldr r12, [sp, #4] ; x_step_q4
56 cmp r12, #16
57 bne vp9_convolve8_horiz_c
58
55 push {r4-r10, lr} 59 push {r4-r10, lr}
56 60
57 sub r0, r0, #3 ; adjust for taps 61 sub r0, r0, #3 ; adjust for taps
58 62
59 ldr r4, [sp, #36] ; x_step_q4
60 ldr r5, [sp, #32] ; filter_x 63 ldr r5, [sp, #32] ; filter_x
61 cmp r4, #16
62 bne call_horiz_c_convolve ; x_step_q4 != 16
63
64 ldr r6, [sp, #48] ; w 64 ldr r6, [sp, #48] ; w
65 ldr r7, [sp, #52] ; h 65 ldr r7, [sp, #52] ; h
66 66
67 vld1.s16 {q0}, [r5] ; filter_x 67 vld1.s16 {q0}, [r5] ; filter_x
68 68
69 add r8, r1, r1, lsl #1 ; src_stride * 3 69 sub r8, r1, r1, lsl #2 ; -src_stride * 3
70 add r8, r8, #4 ; src_stride * 3 + 4 70 add r8, r8, #4 ; -src_stride * 3 + 4
71 rsb r8, r8, #0 ; reset for src
72 71
73 add r4, r3, r3, lsl #1 ; dst_stride * 3 72 sub r4, r3, r3, lsl #2 ; -dst_stride * 3
74 sub r4, r4, #4 ; dst_stride * 3 - 4 73 add r4, r4, #4 ; -dst_stride * 3 + 4
75 rsb r4, r4, #0 ; reset for dst
76 74
77 sub r9, r1, #8 ; post increment for src load 75 rsb r9, r6, r1, lsl #2 ; reset src for outer loop
78 76 sub r9, r9, #7
79 rsb r1, r6, r1, lsl #2 ; reset src for outer loop
80 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop 77 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
81 78
82 mov r10, r6 ; w loop counter 79 mov r10, r6 ; w loop counter
83 80
84 loop_horiz 81 loop_horiz_v
85 vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]! 82 vld1.8 {d24}, [r0], r1
86 vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]! 83 vld1.8 {d25}, [r0], r1
87 vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 84 vld1.8 {d26}, [r0], r1
85 vld1.8 {d27}, [r0], r8
88 86
89 vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]! 87 vtrn.16 q12, q13
90 vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]! 88 vtrn.8 d24, d25
91 vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 89 vtrn.8 d26, d27
92 90
93 vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]! 91 pld [r0, r1, lsl #2]
94 vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]!
95 vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9
96 92
97 vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]!
98 vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]!
99 vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8
100
101 ; extract to s16
102 vmovl.u8 q8, d24 93 vmovl.u8 q8, d24
103 vmovl.u8 q9, d25 94 vmovl.u8 q9, d25
104 vmovl.u8 q10, d26 95 vmovl.u8 q10, d26
105 vmovl.u8 q11, d27 96 vmovl.u8 q11, d27
106 vtrn.32 d28, d29 ; only the first half is populated 97
98 ; save a few instructions in the inner loop
99 vswp d17, d18
100 vmov d23, d21
101
102 add r0, r0, #3
103
104 loop_horiz
105 add r5, r0, #64
106
107 vld1.32 {d28[]}, [r0], r1
108 vld1.32 {d29[]}, [r0], r1
109 vld1.32 {d31[]}, [r0], r1
110 vld1.32 {d30[]}, [r0], r8
111
112 pld [r5]
113
114 vtrn.16 d28, d31
115 vtrn.16 d29, d30
116 vtrn.8 d28, d29
117 vtrn.8 d31, d30
118
119 pld [r5, r1]
120
121 ; extract to s16
122 vtrn.32 q14, q15
107 vmovl.u8 q12, d28 123 vmovl.u8 q12, d28
108 vmovl.u8 q13, d30 124 vmovl.u8 q13, d29
125
126 pld [r5, r1, lsl #1]
109 127
110 ; src[] * filter_x 128 ; src[] * filter_x
111 MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23 129 MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
112 MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24 130 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
113 MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25 131 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
114 MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26 132 MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
133
134 pld [r5, -r8]
115 135
116 ; += 64 >> 7 136 ; += 64 >> 7
117 vqrshrun.s32 d2, q1, #7 137 vqrshrun.s32 d2, q1, #7
118 vqrshrun.s32 d3, q2, #7 138 vqrshrun.s32 d3, q2, #7
119 vqrshrun.s32 d4, q14, #7 139 vqrshrun.s32 d4, q14, #7
120 vqrshrun.s32 d5, q15, #7 140 vqrshrun.s32 d5, q15, #7
121 141
122 ; saturate 142 ; saturate
123 vqshrn.u16 d2, q1, #0 143 vqmovn.u16 d2, q1
124 vqshrn.u16 d3, q2, #0 144 vqmovn.u16 d3, q2
125 145
126 ; transpose 146 ; transpose
127 vtrn.16 d2, d3 147 vtrn.16 d2, d3
128 vtrn.32 d2, d3 148 vtrn.32 d2, d3
129 vtrn.8 d2, d3 149 vtrn.8 d2, d3
130 150
131 vst1.u32 {d2[0]}, [r2], r3 151 vst1.u32 {d2[0]}, [r2@32], r3
132 vst1.u32 {d3[0]}, [r2], r3 152 vst1.u32 {d3[0]}, [r2@32], r3
133 vst1.u32 {d2[1]}, [r2], r3 153 vst1.u32 {d2[1]}, [r2@32], r3
134 vst1.u32 {d3[1]}, [r2], r4 154 vst1.u32 {d3[1]}, [r2@32], r4
155
156 vmov q8, q9
157 vmov d20, d23
158 vmov q11, q12
159 vmov q9, q13
135 160
136 subs r6, r6, #4 ; w -= 4 161 subs r6, r6, #4 ; w -= 4
137 bgt loop_horiz 162 bgt loop_horiz
138 163
139 ; outer loop 164 ; outer loop
140 mov r6, r10 ; restore w counter 165 mov r6, r10 ; restore w counter
141 add r0, r0, r1 ; src += src_stride * 4 - w 166 add r0, r0, r9 ; src += src_stride * 4 - w
142 add r2, r2, r12 ; dst += dst_stride * 4 - w 167 add r2, r2, r12 ; dst += dst_stride * 4 - w
143 subs r7, r7, #4 ; h -= 4 168 subs r7, r7, #4 ; h -= 4
144 bgt loop_horiz 169 bgt loop_horiz_v
145 170
146 pop {r4-r10, pc} 171 pop {r4-r10, pc}
147 172
148 call_horiz_c_convolve
149 pop {r4-r10, lr}
150 add r0, r0, #3 ; un-adjust for taps
151 b vp9_convolve8_horiz_c
152
153
154 ENDP 173 ENDP
155 174
156 |vp9_convolve8_vert_neon| PROC 175 |vp9_convolve8_vert_neon| PROC
157 push {r4-r10, lr} 176 ldr r12, [sp, #12]
177 cmp r12, #16
178 bne vp9_convolve8_vert_c
179
180 push {r4-r8, lr}
158 181
159 ; adjust for taps 182 ; adjust for taps
160 sub r0, r0, r1 183 sub r0, r0, r1
161 sub r0, r0, r1, lsl #1 184 sub r0, r0, r1, lsl #1
162 185
163 ldr r6, [sp, #44] ; y_step_q4 186 ldr r4, [sp, #32] ; filter_y
164 ldr r7, [sp, #40] ; filter_y 187 ldr r6, [sp, #40] ; w
165 cmp r6, #16 188 ldr lr, [sp, #44] ; h
166 bne call_vert_c_convolve ; y_step_q4 != 16
167 189
168 ldr r8, [sp, #48] ; w 190 vld1.s16 {q0}, [r4] ; filter_y
169 ldr r9, [sp, #52] ; h
170 191
171 vld1.s16 {q0}, [r7] ; filter_y 192 lsl r1, r1, #1
193 lsl r3, r3, #1
172 194
173 mov r5, r1, lsl #1 ; src_stride * 2 195 loop_vert_h
174 add r5, r5, r1, lsl #3 ; src_stride * 10 196 mov r4, r0
175 sub r5, r5, #4 ; src_stride * 10 + 4 197 add r7, r0, r1, asr #1
176 rsb r5, r5, #0 ; reset for src 198 mov r5, r2
199 add r8, r2, r3, asr #1
200 mov r12, lr ; h loop counter
177 201
178 add r6, r3, r3, lsl #1 ; dst_stride * 3 202 vld1.u32 {d16[0]}, [r4], r1
179 sub r6, r6, #4 ; dst_stride * 3 - 4 203 vld1.u32 {d16[1]}, [r7], r1
180 rsb r6, r6, #0 ; reset for dst 204 vld1.u32 {d18[0]}, [r4], r1
205 vld1.u32 {d18[1]}, [r7], r1
206 vld1.u32 {d20[0]}, [r4], r1
207 vld1.u32 {d20[1]}, [r7], r1
208 vld1.u32 {d22[0]}, [r4], r1
181 209
182 rsb r7, r8, r1, lsl #2 ; reset src for outer loop
183 rsb r12, r8, r3, lsl #2 ; reset dst for outer loop
184
185 mov r10, r8 ; w loop counter
186
187 loop_vert
188 ; always process a 4x4 block at a time
189 vld1.u32 {d16[0]}, [r0], r1
190 vld1.u32 {d16[1]}, [r0], r1
191 vld1.u32 {d18[0]}, [r0], r1
192 vld1.u32 {d18[1]}, [r0], r1
193 vld1.u32 {d20[0]}, [r0], r1
194 vld1.u32 {d20[1]}, [r0], r1
195 vld1.u32 {d22[0]}, [r0], r1
196 vld1.u32 {d22[1]}, [r0], r1
197 vld1.u32 {d24[0]}, [r0], r1
198 vld1.u32 {d24[1]}, [r0], r1
199 vld1.u32 {d26[0]}, [r0], r5
200
201 ; extract to s16
202 vmovl.u8 q8, d16 210 vmovl.u8 q8, d16
203 vmovl.u8 q9, d18 211 vmovl.u8 q9, d18
204 vmovl.u8 q10, d20 212 vmovl.u8 q10, d20
205 vmovl.u8 q11, d22 213 vmovl.u8 q11, d22
214
215 loop_vert
216 ; always process a 4x4 block at a time
217 vld1.u32 {d24[0]}, [r7], r1
218 vld1.u32 {d26[0]}, [r4], r1
219 vld1.u32 {d26[1]}, [r7], r1
220 vld1.u32 {d24[1]}, [r4], r1
221
222 ; extract to s16
206 vmovl.u8 q12, d24 223 vmovl.u8 q12, d24
207 vmovl.u8 q13, d26 224 vmovl.u8 q13, d26
208 225
226 pld [r5]
227 pld [r8]
228
209 ; src[] * filter_y 229 ; src[] * filter_y
210 MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23 230 MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
211 MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24 231
212 MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25 232 pld [r5, r3]
213 MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26 233 pld [r8, r3]
234
235 MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
236
237 pld [r7]
238 pld [r4]
239
240 MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
241
242 pld [r7, r1]
243 pld [r4, r1]
244
245 MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
214 246
215 ; += 64 >> 7 247 ; += 64 >> 7
216 vqrshrun.s32 d2, q1, #7 248 vqrshrun.s32 d2, q1, #7
217 vqrshrun.s32 d3, q2, #7 249 vqrshrun.s32 d3, q2, #7
218 vqrshrun.s32 d4, q14, #7 250 vqrshrun.s32 d4, q14, #7
219 vqrshrun.s32 d5, q15, #7 251 vqrshrun.s32 d5, q15, #7
220 252
221 ; saturate 253 ; saturate
222 vqshrn.u16 d2, q1, #0 254 vqmovn.u16 d2, q1
223 vqshrn.u16 d3, q2, #0 255 vqmovn.u16 d3, q2
224 256
225 vst1.u32 {d2[0]}, [r2], r3 257 vst1.u32 {d2[0]}, [r5@32], r3
226 vst1.u32 {d2[1]}, [r2], r3 258 vst1.u32 {d2[1]}, [r8@32], r3
227 vst1.u32 {d3[0]}, [r2], r3 259 vst1.u32 {d3[0]}, [r5@32], r3
228 vst1.u32 {d3[1]}, [r2], r6 260 vst1.u32 {d3[1]}, [r8@32], r3
229 261
230 subs r8, r8, #4 ; w -= 4 262 vmov q8, q10
263 vmov d18, d22
264 vmov d19, d24
265 vmov q10, q13
266 vmov d22, d25
267
268 subs r12, r12, #4 ; h -= 4
231 bgt loop_vert 269 bgt loop_vert
232 270
233 ; outer loop 271 ; outer loop
234 mov r8, r10 ; restore w counter 272 add r0, r0, #4
235 add r0, r0, r7 ; src += 4 * src_stride - w 273 add r2, r2, #4
236 add r2, r2, r12 ; dst += 4 * dst_stride - w 274 subs r6, r6, #4 ; w -= 4
237 subs r9, r9, #4 ; h -= 4 275 bgt loop_vert_h
238 bgt loop_vert
239 276
240 pop {r4-r10, pc} 277 pop {r4-r8, pc}
241
242 call_vert_c_convolve
243 pop {r4-r10, lr}
244 ; un-adjust for taps
245 add r0, r0, r1
246 add r0, r0, r1, lsl #1
247 b vp9_convolve8_vert_c
248 278
249 ENDP 279 ENDP
250 END 280 END
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm ('k') | source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698