Chromium Code Reviews

Side by Side Diff: source/libvpx/vp8/common/arm/neon/loopfilter_neon.c

Issue 554673004: libvpx: Pull from upstream (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago

/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"

static INLINE void vp8_loop_filter_neon(
        uint8x16_t qblimit,  // flimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p3
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q5r,     // p1
        uint8x16_t *q6r,     // p0
        uint8x16_t *q7r,     // q0
        uint8x16_t *q8r) {   // q1
    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q2s16, q11s16;
    uint16x8_t q4u16;
    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
    int8x8_t d2s8, d3s8;

    // Filter mask: every difference between neighboring pixels on the same
    // side of the edge must be <= limit.
    q11u8 = vabdq_u8(q3, q4);   // |p3 - p2|
    q12u8 = vabdq_u8(q4, q5);   // |p2 - p1|
    q13u8 = vabdq_u8(q5, q6);   // |p1 - p0|
    q14u8 = vabdq_u8(q8, q7);   // |q1 - q0|
    q3 = vabdq_u8(q9, q8);      // |q2 - q1|
    q4 = vabdq_u8(q10, q9);     // |q3 - q2|

    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q3 = vmaxq_u8(q3, q4);
    q15u8 = vmaxq_u8(q11u8, q12u8);

    q9 = vabdq_u8(q6, q7);      // |p0 - q0|

    // vp8_hevmask: high edge variance where |p1 - p0| or |q1 - q0| > thresh
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q3);

    q2u8 = vabdq_u8(q5, q8);    // |p1 - q1|
    q9 = vqaddq_u8(q9, q9);     // 2 * |p0 - q0|

    q15u8 = vcgeq_u8(qlimit, q15u8);

    // vp8_filter() function
    // convert to signed
    q10 = vdupq_n_u8(0x80);
    q8 = veorq_u8(q8, q10);
    q7 = veorq_u8(q7, q10);
    q6 = veorq_u8(q6, q10);
    q5 = veorq_u8(q5, q10);

    // blimit check: 2 * |p0 - q0| + |p1 - q1| / 2 must be <= blimit
    q2u8 = vshrq_n_u8(q2u8, 1);
    q9 = vqaddq_u8(q9, q2u8);

    q10 = vdupq_n_u8(3);

    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));

    q9 = vcgeq_u8(qblimit, q9);

    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                     vreinterpretq_s8_u8(q8));   // clamp(ps1 - qs1)

    q14u8 = vorrq_u8(q13u8, q14u8);              // hev mask

    q4u16 = vmovl_u8(vget_low_u8(q10));
    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));    // 3 * (qs0 - ps0), low
    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));  // 3 * (qs0 - ps0), high

    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
    q15u8 = vandq_u8(q15u8, q9);                 // combined filter mask

    q1s8 = vreinterpretq_s8_u8(q1u8);
    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

    q9 = vdupq_n_u8(4);
    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2s8 = vqmovn_s16(q2s16);
    d3s8 = vqmovn_s16(q11s16);
    q1s8 = vcombine_s8(d2s8, d3s8);
    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
    q1s8 = vreinterpretq_s8_u8(q1u8);

    // Filter2 = clamp(vp8_filter + 3) >> 3, Filter1 = clamp(vp8_filter + 4) >> 3
    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q1s8 = vshrq_n_s8(q1s8, 3);

    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);  // p0 += Filter2
    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);  // q0 -= Filter1

    // Outer taps get (Filter1 + 1) >> 1, applied only where hev is not set.
    q1s8 = vrshrq_n_s8(q1s8, 1);
    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);  // p1
    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);  // q1

    // convert back to unsigned
    q0u8 = vdupq_n_u8(0x80);
    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
    return;
}

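/* Reviewer illustration, not part of this patch: the per-pixel arithmetic that
 * the intrinsics above vectorise, sketched in scalar C. The names clamp_s8 and
 * loop_filter_pixel_sketch are hypothetical; mask and hev stand for the 0 / -1
 * per-pixel flags produced by the compares in vp8_loop_filter_neon(). */
static signed char clamp_s8(int t) {
    return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

static void loop_filter_pixel_sketch(unsigned char *p1, unsigned char *p0,
                                     unsigned char *q0, unsigned char *q1,
                                     int mask, int hev) {
    // Work on values centred at zero, as the XOR with 0x80 does above.
    signed char ps1 = (signed char)(*p1 ^ 0x80), ps0 = (signed char)(*p0 ^ 0x80);
    signed char qs0 = (signed char)(*q0 ^ 0x80), qs1 = (signed char)(*q1 ^ 0x80);
    signed char filter, Filter1, Filter2;

    filter = clamp_s8(ps1 - qs1) & hev;               // outer-tap term only where hev
    filter = clamp_s8(filter + 3 * (qs0 - ps0)) & mask;

    Filter1 = clamp_s8(filter + 4) >> 3;              // applied to q0
    Filter2 = clamp_s8(filter + 3) >> 3;              // applied to p0
    *q0 = (unsigned char)(clamp_s8(qs0 - Filter1) ^ 0x80);
    *p0 = (unsigned char)(clamp_s8(ps0 + Filter2) ^ 0x80);

    filter = (signed char)(((Filter1 + 1) >> 1) & ~hev);  // outer taps where not hev
    *q1 = (unsigned char)(clamp_s8(qs1 - filter) ^ 0x80);
    *p1 = (unsigned char)(clamp_s8(ps1 + filter) ^ 0x80);
}
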
void vp8_loop_filter_horizontal_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);
    src -= (pitch << 2);  // src points at row q0; back up four rows to p3

    q3 = vld1q_u8(src);
    src += pitch;
    q4 = vld1q_u8(src);
    src += pitch;
    q5 = vld1q_u8(src);
    src += pitch;
    q6 = vld1q_u8(src);
    src += pitch;
    q7 = vld1q_u8(src);
    src += pitch;
    q8 = vld1q_u8(src);
    src += pitch;
    q9 = vld1q_u8(src);
    src += pitch;
    q10 = vld1q_u8(src);

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    // Write back the four modified rows p1, p0, q0 and q1.
    src -= (pitch * 5);
    vst1q_u8(src, q5);
    src += pitch;
    vst1q_u8(src, q6);
    src += pitch;
    vst1q_u8(src, q7);
    src += pitch;
    vst1q_u8(src, q8);
    return;
}

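/* Context only, not part of this change: a hypothetical caller showing how the
 * function above is typically used, filtering the three inner horizontal edges
 * of one 16x16 luma macroblock (rows 4, 8 and 12). y_ptr, y_stride, blim, lim
 * and hev_thr stand in for the real decoder state. */
static void filter_inner_horizontal_edges_sketch(unsigned char *y_ptr,
                                                 int y_stride,
                                                 unsigned char blim,
                                                 unsigned char lim,
                                                 unsigned char hev_thr) {
    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride,
                                           blim, lim, hev_thr);
    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride,
                                           blim, lim, hev_thr);
    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride,
                                           blim, lim, hev_thr);
}
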
void vp8_loop_filter_horizontal_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    u -= (pitch << 2);
    v -= (pitch << 2);

    d6 = vld1_u8(u);
    u += pitch;
    d7 = vld1_u8(v);
    v += pitch;
    d8 = vld1_u8(u);
    u += pitch;
    d9 = vld1_u8(v);
    v += pitch;
    d10 = vld1_u8(u);
    u += pitch;
    d11 = vld1_u8(v);
    v += pitch;
    d12 = vld1_u8(u);
    u += pitch;
    d13 = vld1_u8(v);
    v += pitch;
    d14 = vld1_u8(u);
    u += pitch;
    d15 = vld1_u8(v);
    v += pitch;
    d16 = vld1_u8(u);
    u += pitch;
    d17 = vld1_u8(v);
    v += pitch;
    d18 = vld1_u8(u);
    u += pitch;
    d19 = vld1_u8(v);
    v += pitch;
    d20 = vld1_u8(u);
    d21 = vld1_u8(v);

    // U rows fill the low half of each vector, V rows the high half.
    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    u -= (pitch * 5);
    vst1_u8(u, vget_low_u8(q5));
    u += pitch;
    vst1_u8(u, vget_low_u8(q6));
    u += pitch;
    vst1_u8(u, vget_low_u8(q7));
    u += pitch;
    vst1_u8(u, vget_low_u8(q8));

    v -= (pitch * 5);
    vst1_u8(v, vget_high_u8(q5));
    v += pitch;
    vst1_u8(v, vget_high_u8(q6));
    v += pitch;
    vst1_u8(v, vget_high_u8(q7));
    v += pitch;
    vst1_u8(v, vget_high_u8(q8));
    return;
}

static INLINE void write_4x8(unsigned char *dst, int pitch,
                             const uint8x8x4_t result) {
#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
    // Interleaving vst4_lane_u8 stores are used when building with GCC >= 4.7;
    // other builds take the manual transpose-and-store path below.
    vst4_lane_u8(dst, result, 0);
    dst += pitch;
    vst4_lane_u8(dst, result, 1);
    dst += pitch;
    vst4_lane_u8(dst, result, 2);
    dst += pitch;
    vst4_lane_u8(dst, result, 3);
    dst += pitch;
    vst4_lane_u8(dst, result, 4);
    dst += pitch;
    vst4_lane_u8(dst, result, 5);
    dst += pitch;
    vst4_lane_u8(dst, result, 6);
    dst += pitch;
    vst4_lane_u8(dst, result, 7);
#else
    /*
     * uint8x8x4_t result
     *   00 01 02 03 | 04 05 06 07
     *   10 11 12 13 | 14 15 16 17
     *   20 21 22 23 | 24 25 26 27
     *   30 31 32 33 | 34 35 36 37
     * ---
     * after vtrn_u16
     *   00 01 20 21 | 04 05 24 25
     *   02 03 22 23 | 06 07 26 27
     *   10 11 30 31 | 14 15 34 35
     *   12 13 32 33 | 16 17 36 37
     * ---
     * after vtrn_u8
     *   00 10 20 30 | 04 14 24 34
     *   01 11 21 31 | 05 15 25 35
     *   02 12 22 32 | 06 16 26 36
     *   03 13 23 33 | 07 17 27 37
     */
    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
                                          vreinterpret_u16_u8(result.val[2]));
    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
                                          vreinterpret_u16_u8(result.val[3]));
    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                       vreinterpret_u8_u16(r13_u16.val[0]));
    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                       vreinterpret_u8_u16(r13_u16.val[1]));
    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#endif
}

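/* Both branches above store the same result: result.val[0..3] hold the four
 * filtered columns (p1, p0, q0, q1) across eight rows, and write_4x8() writes
 * them back as four bytes per row. A scalar sketch of that effect, for
 * illustration only (the name and the cols[][] layout are hypothetical): */
static void write_4x8_scalar_sketch(unsigned char *dst, int pitch,
                                    const unsigned char cols[4][8]) {
    int r, c;
    for (r = 0; r < 8; ++r) {
        for (c = 0; c < 4; ++c)
            dst[c] = cols[c][r];  // cols[c][r] == byte r of result.val[c]
        dst += pitch;
    }
}
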
void vp8_loop_filter_vertical_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    unsigned char *s, *d;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    // Load 16 rows of 8 pixels straddling the vertical edge (4 on each side).
    s = src - 4;
    d6 = vld1_u8(s);
    s += pitch;
    d8 = vld1_u8(s);
    s += pitch;
    d10 = vld1_u8(s);
    s += pitch;
    d12 = vld1_u8(s);
    s += pitch;
    d14 = vld1_u8(s);
    s += pitch;
    d16 = vld1_u8(s);
    s += pitch;
    d18 = vld1_u8(s);
    s += pitch;
    d20 = vld1_u8(s);
    s += pitch;
    d7 = vld1_u8(s);
    s += pitch;
    d9 = vld1_u8(s);
    s += pitch;
    d11 = vld1_u8(s);
    s += pitch;
    d13 = vld1_u8(s);
    s += pitch;
    d15 = vld1_u8(s);
    s += pitch;
    d17 = vld1_u8(s);
    s += pitch;
    d19 = vld1_u8(s);
    s += pitch;
    d21 = vld1_u8(s);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    // Transpose the 16x8 tile so that each q register holds one pixel column
    // across the 16 rows: q3..q10 become p3, p2, p1, p0, q0, q1, q2, q3.
    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    q4ResultL.val[0] = vget_low_u8(q5);   // d10
    q4ResultL.val[1] = vget_low_u8(q6);   // d12
    q4ResultL.val[2] = vget_low_u8(q7);   // d14
    q4ResultL.val[3] = vget_low_u8(q8);   // d16
    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17

    // Store the four filtered columns (p1, p0, q0, q1) back, eight rows at a time.
    d = src - 2;
    write_4x8(d, pitch, q4ResultL);
    d += pitch * 8;
    write_4x8(d, pitch, q4ResultH);
}

void vp8_loop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *us, *ud;
    unsigned char *vs, *vd;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    us = u - 4;
    d6 = vld1_u8(us);
    us += pitch;
    d8 = vld1_u8(us);
    us += pitch;
    d10 = vld1_u8(us);
    us += pitch;
    d12 = vld1_u8(us);
    us += pitch;
    d14 = vld1_u8(us);
    us += pitch;
    d16 = vld1_u8(us);
    us += pitch;
    d18 = vld1_u8(us);
    us += pitch;
    d20 = vld1_u8(us);

    vs = v - 4;
    d7 = vld1_u8(vs);
    vs += pitch;
    d9 = vld1_u8(vs);
    vs += pitch;
    d11 = vld1_u8(vs);
    vs += pitch;
    d13 = vld1_u8(vs);
    vs += pitch;
    d15 = vld1_u8(vs);
    vs += pitch;
    d17 = vld1_u8(vs);
    vs += pitch;
    d19 = vld1_u8(vs);
    vs += pitch;
    d21 = vld1_u8(vs);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    q4ResultL.val[0] = vget_low_u8(q5);  // d10
    q4ResultL.val[1] = vget_low_u8(q6);  // d12
    q4ResultL.val[2] = vget_low_u8(q7);  // d14
    q4ResultL.val[3] = vget_low_u8(q8);  // d16
    ud = u - 2;
    write_4x8(ud, pitch, q4ResultL);

    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17
    vd = v - 2;
    write_4x8(vd, pitch, q4ResultH);
}