Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.c

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 5 years, 11 months ago
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stddef.h>
#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_ports/mem.h"

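/* C fallback prototypes: the NEON paths below hand off to these whenever
 * the subpel step is not 16 (1.0 in Q4), i.e. whenever scaling is
 * requested. */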
void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h);
void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h);

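/* 8-tap FIR dot product: multiplies eight neighboring samples by the eight
 * filter taps held in q0s16, widening to 32 bits via vmull/vmlal_lane so no
 * coefficient shuffling is needed. Scalar equivalent for one output pixel
 * (the rounding/shift happens later, in vqrshrun_n_s32(..., 7), since VP9
 * filter taps are Q7 fixed point):
 *   sum = 0; for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
 *   out = clip_to_u8((sum + 64) >> 7);
 */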
static INLINE int32x4_t MULTIPLY_BY_Q0(
    int16x4_t dsrc0,
    int16x4_t dsrc1,
    int16x4_t dsrc2,
    int16x4_t dsrc3,
    int16x4_t dsrc4,
    int16x4_t dsrc5,
    int16x4_t dsrc6,
    int16x4_t dsrc7,
    int16x8_t q0s16) {
  int32x4_t qdst;
  int16x4_t d0s16, d1s16;

  d0s16 = vget_low_s16(q0s16);
  d1s16 = vget_high_s16(q0s16);

  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
  return qdst;
}

void vp9_convolve8_avg_horiz_neon(
    uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const int16_t *filter_x,
    int x_step_q4,
    const int16_t *filter_y,  // unused
    int y_step_q4,            // unused
    int w,
    int h) {
  int width;
  uint8_t *s, *d;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  if (x_step_q4 != 16) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4, w, h);
    return;
  }

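  /* The eight x taps are loaded once and reused for every output pixel. */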
  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4) {  // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

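    /* Partial byte transpose of the four rows (vtrn on u16 halves, then on
     * u8 lanes) so that each d-register holds one sample phase and the
     * horizontal filter can run lane-wise across four rows at once. */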
    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
                        vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);

    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    src += 7;
    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w;
         width > 0;
         width -= 4, src += 4, dst += 4) {  // loop_horiz
      s = src;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(src + 64);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
                         vreinterpret_u16_u32(d31u32));
      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
                         vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(src + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
                          vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(src + 64 + src_stride * 2);

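      /* Load the four 4-pixel destination rows that will be averaged with
       * the filtered result (vrhaddq_u8 below). */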
      d = dst;
      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
      d += dst_stride;
      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
                             d18s16, d19s16, d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
                             d19s16, d23s16, d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
                              d23s16, d24s16, d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
                              d24s16, d26s16, d27s16, d25s16, q0s16);

      __builtin_prefetch(src + 64 + src_stride * 3);

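      /* Round (+64), shift right by FILTER_BITS (7) and saturate the 32-bit
       * accumulators down to unsigned 16-bit; vqmovn_u16 below then
       * saturates to 8-bit. */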
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
                         vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));

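      /* The "avg" step: vrhaddq_u8 computes (filtered + dst + 1) >> 1,
       * matching ROUND_POWER_OF_TWO(dst + res, 1) in the C reference. */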
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));

      d = dst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

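      /* Slide the sample window: keep the columns already loaded so the
       * next 4-wide block only fetches four fresh source columns. */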
      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
    src += src_stride * 4 - w - 7;
    dst += dst_stride * 4 - w;
  }
  return;
}

void vp9_convolve8_avg_vert_neon(
    uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const int16_t *filter_x,  // unused
    int x_step_q4,            // unused
    const int16_t *filter_y,
    int y_step_q4,
    int w,
    int h) {
  int height;
  uint8_t *s, *d;
  uint8x8_t d2u8, d3u8;
  uint32x2_t d2u32, d3u32, d6u32, d7u32;
  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
  uint8x16_t q1u8, q3u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;

  if (y_step_q4 != 16) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4, w, h);
    return;
  }

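  /* Back up three rows: each output row is filtered from the three rows
   * above it, its own row, and the four rows below (8 taps). */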
  src -= src_stride * 3;
  q0s16 = vld1q_s16(filter_y);
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
    s = src;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
    s += src_stride;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
    s += src_stride;
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;

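    /* The first seven rows of this 4-wide column are widened to 16 bits
     * once, outside the height loop; each iteration below then loads only
     * the four new rows it needs. */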
    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d = dst;
    for (height = h; height > 0; height -= 4) {  // loop_vert
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
      s += src_stride;
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
      s += src_stride;

      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
      d += dst_stride;
      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
      d -= dst_stride * 3;

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
                             d20s16, d21s16, d22s16, d24s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
                             d21s16, d22s16, d24s16, d26s16, q0s16);
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
                              d22s16, d24s16, d26s16, d27s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
                              d24s16, d26s16, d27s16, d25s16, q0s16);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      q1u8 = vcombine_u8(d2u8, d3u8);
      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));

      q1u8 = vrhaddq_u8(q1u8, q3u8);

      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));

      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);
      d += dst_stride;

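      /* Rotate the eight-row window down four rows for the next
       * iteration. */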
      q8u16 = q10u16;
      d18s16 = d22s16;
      d19s16 = d24s16;
      q10u16 = q13u16;
      d22s16 = d25s16;
    }
  }
  return;
}
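
Reviewer note, not part of the patch: a minimal caller sketch, under the
assumptions the code itself makes (w and h multiples of 4, unity step
x_step_q4 == 16, and at least three readable pixels to the left of src).
The wrapper name and the tap set are illustrative only.

  /* Hypothetical example, not from libvpx. The taps sum to 128 (1.0 in Q7),
   * as real VP9 filter rows do; this particular set passes the source
   * through unchanged, so the call reduces to dst = avg(dst, src). */
  static const int16_t kIdentityTaps[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };

  static void avg_copy_block(uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             int w, int h) {
    vp9_convolve8_avg_horiz_neon(src, src_stride, dst, dst_stride,
                                 kIdentityTaps, 16, kIdentityTaps, 16, w, h);
  }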