OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
12 #include "./vpx_config.h" | 12 #include "./vpx_config.h" |
| 13 #include "vpx_ports/arm.h" |
13 | 14 |
14 static INLINE void vp8_loop_filter_neon( | 15 static INLINE void vp8_loop_filter_neon( |
15 uint8x16_t qblimit, // flimit | 16 uint8x16_t qblimit, // flimit |
16 uint8x16_t qlimit, // limit | 17 uint8x16_t qlimit, // limit |
17 uint8x16_t qthresh, // thresh | 18 uint8x16_t qthresh, // thresh |
18 uint8x16_t q3, // p3 | 19 uint8x16_t q3, // p3 |
19 uint8x16_t q4, // p2 | 20 uint8x16_t q4, // p2 |
20 uint8x16_t q5, // p1 | 21 uint8x16_t q5, // p1 |
21 uint8x16_t q6, // p0 | 22 uint8x16_t q6, // p0 |
22 uint8x16_t q7, // q0 | 23 uint8x16_t q7, // q0 |
(...skipping 221 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
244 vst1_u8(v, vget_high_u8(q5)); | 245 vst1_u8(v, vget_high_u8(q5)); |
245 v += pitch; | 246 v += pitch; |
246 vst1_u8(v, vget_high_u8(q6)); | 247 vst1_u8(v, vget_high_u8(q6)); |
247 v += pitch; | 248 v += pitch; |
248 vst1_u8(v, vget_high_u8(q7)); | 249 vst1_u8(v, vget_high_u8(q7)); |
249 v += pitch; | 250 v += pitch; |
250 vst1_u8(v, vget_high_u8(q8)); | 251 vst1_u8(v, vget_high_u8(q8)); |
251 return; | 252 return; |
252 } | 253 } |
253 | 254 |
254 #if (__GNUC__ == 4 && (__GNUC_MINOR__ == 6)) | |
255 #warning Using GCC 4.6 is not recommended | |
256 // Some versions of gcc4.6 do not correctly process vst4_lane_u8. When built | |
257 // with any gcc4.6, use the C code. | |
258 extern void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p, | |
259 const unsigned char *blimit, | |
260 const unsigned char *limit, | |
261 const unsigned char *thresh, | |
262 int count); | |
263 | |
264 void vp8_loop_filter_vertical_edge_y_neon( | |
265 unsigned char *src, | |
266 int pitch, | |
267 unsigned char blimit, | |
268 unsigned char limit, | |
269 unsigned char thresh) { | |
270 vp8_loop_filter_vertical_edge_c(src, pitch, &blimit, &limit, &thresh, 2); | |
271 } | |
272 | |
273 void vp8_loop_filter_vertical_edge_uv_neon( | |
274 unsigned char *u, | |
275 int pitch, | |
276 unsigned char blimit, | |
277 unsigned char limit, | |
278 unsigned char thresh, | |
279 unsigned char *v) { | |
280 vp8_loop_filter_vertical_edge_c(u, pitch, &blimit, &limit, &thresh, 1); | |
281 vp8_loop_filter_vertical_edge_c(v, pitch, &blimit, &limit, &thresh, 1); | |
282 } | |
283 #else | |
284 static INLINE void write_4x8(unsigned char *dst, int pitch, | 255 static INLINE void write_4x8(unsigned char *dst, int pitch, |
285 const uint8x8x4_t result) { | 256 const uint8x8x4_t result) { |
| 257 #ifdef VPX_INCOMPATIBLE_GCC |
| 258 /* |
| 259 * uint8x8x4_t result |
| 260 00 01 02 03 | 04 05 06 07 |
| 261 10 11 12 13 | 14 15 16 17 |
| 262 20 21 22 23 | 24 25 26 27 |
| 263 30 31 32 33 | 34 35 36 37 |
| 264 --- |
| 265 * after vtrn_u16 |
| 266 00 01 20 21 | 04 05 24 25 |
| 267 02 03 22 23 | 06 07 26 27 |
| 268 10 11 30 31 | 14 15 34 35 |
| 269 12 13 32 33 | 16 17 36 37 |
| 270 --- |
| 271 * after vtrn_u8 |
| 272 00 10 20 30 | 04 14 24 34 |
| 273 01 11 21 31 | 05 15 25 35 |
| 274 02 12 22 32 | 06 16 26 36 |
| 275 03 13 23 33 | 07 17 27 37 |
| 276 */ |
| 277 const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), |
| 278 vreinterpret_u16_u8(result.val[2])); |
| 279 const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), |
| 280 vreinterpret_u16_u8(result.val[3])); |
| 281 const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), |
| 282 vreinterpret_u8_u16(r13_u16.val[0])); |
| 283 const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), |
| 284 vreinterpret_u8_u16(r13_u16.val[1])); |
| 285 const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); |
| 286 const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); |
| 287 const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); |
| 288 const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); |
| 289 vst1_lane_u32((uint32_t *)dst, x_0_4, 0); |
| 290 dst += pitch; |
| 291 vst1_lane_u32((uint32_t *)dst, x_1_5, 0); |
| 292 dst += pitch; |
| 293 vst1_lane_u32((uint32_t *)dst, x_2_6, 0); |
| 294 dst += pitch; |
| 295 vst1_lane_u32((uint32_t *)dst, x_3_7, 0); |
| 296 dst += pitch; |
| 297 vst1_lane_u32((uint32_t *)dst, x_0_4, 1); |
| 298 dst += pitch; |
| 299 vst1_lane_u32((uint32_t *)dst, x_1_5, 1); |
| 300 dst += pitch; |
| 301 vst1_lane_u32((uint32_t *)dst, x_2_6, 1); |
| 302 dst += pitch; |
| 303 vst1_lane_u32((uint32_t *)dst, x_3_7, 1); |
| 304 #else |
286 vst4_lane_u8(dst, result, 0); | 305 vst4_lane_u8(dst, result, 0); |
287 dst += pitch; | 306 dst += pitch; |
288 vst4_lane_u8(dst, result, 1); | 307 vst4_lane_u8(dst, result, 1); |
289 dst += pitch; | 308 dst += pitch; |
290 vst4_lane_u8(dst, result, 2); | 309 vst4_lane_u8(dst, result, 2); |
291 dst += pitch; | 310 dst += pitch; |
292 vst4_lane_u8(dst, result, 3); | 311 vst4_lane_u8(dst, result, 3); |
293 dst += pitch; | 312 dst += pitch; |
294 vst4_lane_u8(dst, result, 4); | 313 vst4_lane_u8(dst, result, 4); |
295 dst += pitch; | 314 dst += pitch; |
296 vst4_lane_u8(dst, result, 5); | 315 vst4_lane_u8(dst, result, 5); |
297 dst += pitch; | 316 dst += pitch; |
298 vst4_lane_u8(dst, result, 6); | 317 vst4_lane_u8(dst, result, 6); |
299 dst += pitch; | 318 dst += pitch; |
300 vst4_lane_u8(dst, result, 7); | 319 vst4_lane_u8(dst, result, 7); |
| 320 #endif // VPX_INCOMPATIBLE_GCC |
301 } | 321 } |
302 | 322 |
303 void vp8_loop_filter_vertical_edge_y_neon( | 323 void vp8_loop_filter_vertical_edge_y_neon( |
304 unsigned char *src, | 324 unsigned char *src, |
305 int pitch, | 325 int pitch, |
306 unsigned char blimit, | 326 unsigned char blimit, |
307 unsigned char limit, | 327 unsigned char limit, |
308 unsigned char thresh) { | 328 unsigned char thresh) { |
309 unsigned char *s, *d; | 329 unsigned char *s, *d; |
310 uint8x16_t qblimit, qlimit, qthresh, q3, q4; | 330 uint8x16_t qblimit, qlimit, qthresh, q3, q4; |
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
521 ud = u - 2; | 541 ud = u - 2; |
522 write_4x8(ud, pitch, q4ResultL); | 542 write_4x8(ud, pitch, q4ResultL); |
523 | 543 |
524 q4ResultH.val[0] = vget_high_u8(q5); // d11 | 544 q4ResultH.val[0] = vget_high_u8(q5); // d11 |
525 q4ResultH.val[1] = vget_high_u8(q6); // d13 | 545 q4ResultH.val[1] = vget_high_u8(q6); // d13 |
526 q4ResultH.val[2] = vget_high_u8(q7); // d15 | 546 q4ResultH.val[2] = vget_high_u8(q7); // d15 |
527 q4ResultH.val[3] = vget_high_u8(q8); // d17 | 547 q4ResultH.val[3] = vget_high_u8(q8); // d17 |
528 vd = v - 2; | 548 vd = v - 2; |
529 write_4x8(vd, pitch, q4ResultH); | 549 write_4x8(vd, pitch, q4ResultH); |
530 } | 550 } |
531 #endif // (__GNUC__ == 4 && (__GNUC_MINOR__ == 6)) | |
OLD | NEW |