source/libvpx/vp8/common/arm/neon/loopfilter_neon.c - Issue 668403002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp8/common/arm/neon/loopfilter_neon.c

Issue 668403002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <arm_neon.h>	11 #include <arm_neon.h>

12 #include "./vpx_config.h"	12 #include "./vpx_config.h"

	13 #include "vpx_ports/arm.h"

13	14

14 static INLINE void vp8_loop_filter_neon(	15 static INLINE void vp8_loop_filter_neon(

15 uint8x16_t qblimit, // flimit	16 uint8x16_t qblimit, // flimit

16 uint8x16_t qlimit, // limit	17 uint8x16_t qlimit, // limit

17 uint8x16_t qthresh, // thresh	18 uint8x16_t qthresh, // thresh

18 uint8x16_t q3, // p3	19 uint8x16_t q3, // p3

19 uint8x16_t q4, // p2	20 uint8x16_t q4, // p2

20 uint8x16_t q5, // p1	21 uint8x16_t q5, // p1

21 uint8x16_t q6, // p0	22 uint8x16_t q6, // p0

22 uint8x16_t q7, // q0	23 uint8x16_t q7, // q0

(...skipping 221 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
244 vst1_u8(v, vget_high_u8(q5));	245 vst1_u8(v, vget_high_u8(q5));

245 v += pitch;	246 v += pitch;

246 vst1_u8(v, vget_high_u8(q6));	247 vst1_u8(v, vget_high_u8(q6));

247 v += pitch;	248 v += pitch;

248 vst1_u8(v, vget_high_u8(q7));	249 vst1_u8(v, vget_high_u8(q7));

249 v += pitch;	250 v += pitch;

250 vst1_u8(v, vget_high_u8(q8));	251 vst1_u8(v, vget_high_u8(q8));

251 return;	252 return;

252 }	253 }

253	254

254 #if (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))

255 #warning Using GCC 4.6 is not recommended

256 // Some versions of gcc4.6 do not correctly process vst4_lane_u8. When built

257 // with any gcc4.6, use the C code.

258 extern void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p,

259 const unsigned char *blimit,

260 const unsigned char *limit,

261 const unsigned char *thresh,

262 int count);

263

264 void vp8_loop_filter_vertical_edge_y_neon(

265 unsigned char *src,

266 int pitch,

267 unsigned char blimit,

268 unsigned char limit,

269 unsigned char thresh) {

270 vp8_loop_filter_vertical_edge_c(src, pitch, &blimit, &limit, &thresh, 2);

271 }

272

273 void vp8_loop_filter_vertical_edge_uv_neon(

274 unsigned char *u,

275 int pitch,

276 unsigned char blimit,

277 unsigned char limit,

278 unsigned char thresh,

279 unsigned char *v) {

280 vp8_loop_filter_vertical_edge_c(u, pitch, &blimit, &limit, &thresh, 1);

281 vp8_loop_filter_vertical_edge_c(v, pitch, &blimit, &limit, &thresh, 1);

282 }

283 #else

284 static INLINE void write_4x8(unsigned char *dst, int pitch,	255 static INLINE void write_4x8(unsigned char *dst, int pitch,

285 const uint8x8x4_t result) {	256 const uint8x8x4_t result) {

	257 #ifdef VPX_INCOMPATIBLE_GCC

	258 /*

	259 * uint8x8x4_t result

	260 00 01 02 03 \| 04 05 06 07

	261 10 11 12 13 \| 14 15 16 17

	262 20 21 22 23 \| 24 25 26 27

	263 30 31 32 33 \| 34 35 36 37

	264 ---

	265 * after vtrn_u16

	266 00 01 20 21 \| 04 05 24 25

	267 02 03 22 23 \| 06 07 26 27

	268 10 11 30 31 \| 14 15 34 35

	269 12 13 32 33 \| 16 17 36 37

	270 ---

	271 * after vtrn_u8

	272 00 10 20 30 \| 04 14 24 34

	273 01 11 21 31 \| 05 15 25 35

	274 02 12 22 32 \| 06 16 26 36

	275 03 13 23 33 \| 07 17 27 37

	276 */

	277 const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),

	278 vreinterpret_u16_u8(result.val[2]));

	279 const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),

	280 vreinterpret_u16_u8(result.val[3]));

	281 const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),

	282 vreinterpret_u8_u16(r13_u16.val[0]));

	283 const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),

	284 vreinterpret_u8_u16(r13_u16.val[1]));

	285 const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);

	286 const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);

	287 const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);

	288 const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);

	289 vst1_lane_u32((uint32_t *)dst, x_0_4, 0);

	290 dst += pitch;

	291 vst1_lane_u32((uint32_t *)dst, x_1_5, 0);

	292 dst += pitch;

	293 vst1_lane_u32((uint32_t *)dst, x_2_6, 0);

	294 dst += pitch;

	295 vst1_lane_u32((uint32_t *)dst, x_3_7, 0);

	296 dst += pitch;

	297 vst1_lane_u32((uint32_t *)dst, x_0_4, 1);

	298 dst += pitch;

	299 vst1_lane_u32((uint32_t *)dst, x_1_5, 1);

	300 dst += pitch;

	301 vst1_lane_u32((uint32_t *)dst, x_2_6, 1);

	302 dst += pitch;

	303 vst1_lane_u32((uint32_t *)dst, x_3_7, 1);

	304 #else

286 vst4_lane_u8(dst, result, 0);	305 vst4_lane_u8(dst, result, 0);

287 dst += pitch;	306 dst += pitch;

288 vst4_lane_u8(dst, result, 1);	307 vst4_lane_u8(dst, result, 1);

289 dst += pitch;	308 dst += pitch;

290 vst4_lane_u8(dst, result, 2);	309 vst4_lane_u8(dst, result, 2);

291 dst += pitch;	310 dst += pitch;

292 vst4_lane_u8(dst, result, 3);	311 vst4_lane_u8(dst, result, 3);

293 dst += pitch;	312 dst += pitch;

294 vst4_lane_u8(dst, result, 4);	313 vst4_lane_u8(dst, result, 4);

295 dst += pitch;	314 dst += pitch;

296 vst4_lane_u8(dst, result, 5);	315 vst4_lane_u8(dst, result, 5);

297 dst += pitch;	316 dst += pitch;

298 vst4_lane_u8(dst, result, 6);	317 vst4_lane_u8(dst, result, 6);

299 dst += pitch;	318 dst += pitch;

300 vst4_lane_u8(dst, result, 7);	319 vst4_lane_u8(dst, result, 7);

	320 #endif // VPX_INCOMPATIBLE_GCC

301 }	321 }

302	322

303 void vp8_loop_filter_vertical_edge_y_neon(	323 void vp8_loop_filter_vertical_edge_y_neon(

304 unsigned char *src,	324 unsigned char *src,

305 int pitch,	325 int pitch,

306 unsigned char blimit,	326 unsigned char blimit,

307 unsigned char limit,	327 unsigned char limit,

308 unsigned char thresh) {	328 unsigned char thresh) {

309 unsigned char s, d;	329 unsigned char s, d;

310 uint8x16_t qblimit, qlimit, qthresh, q3, q4;	330 uint8x16_t qblimit, qlimit, qthresh, q3, q4;

(...skipping 210 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
521 ud = u - 2;	541 ud = u - 2;

522 write_4x8(ud, pitch, q4ResultL);	542 write_4x8(ud, pitch, q4ResultL);

523	543

524 q4ResultH.val[0] = vget_high_u8(q5); // d11	544 q4ResultH.val[0] = vget_high_u8(q5); // d11

525 q4ResultH.val[1] = vget_high_u8(q6); // d13	545 q4ResultH.val[1] = vget_high_u8(q6); // d13

526 q4ResultH.val[2] = vget_high_u8(q7); // d15	546 q4ResultH.val[2] = vget_high_u8(q7); // d15

527 q4ResultH.val[3] = vget_high_u8(q8); // d17	547 q4ResultH.val[3] = vget_high_u8(q8); // d17

528 vd = v - 2;	548 vd = v - 2;

529 write_4x8(vd, pitch, q4ResultH);	549 write_4x8(vd, pitch, q4ResultH);

530 }	550 }

531 #endif // (__GNUC__ == 4 && (__GNUC_MINOR__ == 6))

OLD	NEW

« no previous file with comments | « source/libvpx/tools_common.c ('k') | source/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c » ('j') | no next file with comments »