OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include <arm_neon.h> |
| 12 #include "./vpx_config.h" |
| 13 |
/* Core VP8 normal loop-filter kernel, operating on 16 pixel lanes at once.
 *
 * Inputs are the eight rows/columns straddling the edge between p0 and q0:
 * q3..q6 hold p3..p0 and q7..q10 hold q0..q3.  Only p1/p0/q0/q1 can be
 * modified; they are written back through q5r/q6r/q7r/q8r.
 *
 * qblimit/qlimit/qthresh are the blimit/limit/thresh filter thresholds
 * broadcast across all 16 lanes.
 */
static INLINE void vp8_loop_filter_neon(
        uint8x16_t qblimit,  // flimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p3
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q5r,     // p1
        uint8x16_t *q6r,     // p0
        uint8x16_t *q7r,     // q0
        uint8x16_t *q8r) {   // q1
    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q2s16, q11s16;
    uint16x8_t q4u16;
    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
    int8x8_t d2s8, d3s8;

    // Absolute differences between neighbouring rows; these feed the
    // filter mask ("is the area smooth enough to filter?").
    q11u8 = vabdq_u8(q3, q4);   // |p3 - p2|
    q12u8 = vabdq_u8(q4, q5);   // |p2 - p1|
    q13u8 = vabdq_u8(q5, q6);   // |p1 - p0|
    q14u8 = vabdq_u8(q8, q7);   // |q1 - q0|
    q3 = vabdq_u8(q9, q8);      // |q2 - q1| (q3/q4 now reused as scratch)
    q4 = vabdq_u8(q10, q9);     // |q3 - q2|

    // Reduce the six differences to their per-lane maximum.
    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q3 = vmaxq_u8(q3, q4);
    q15u8 = vmaxq_u8(q11u8, q12u8);

    q9 = vabdq_u8(q6, q7);      // |p0 - q0| (q9 reused as scratch)

    // vp8_hevmask: high edge variance if |p1-p0| or |q1-q0| exceeds thresh.
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q3);

    q2u8 = vabdq_u8(q5, q8);    // |p1 - q1|
    q9 = vqaddq_u8(q9, q9);     // 2 * |p0 - q0| (saturating)

    // Filter mask: every neighbouring difference must be <= limit.
    q15u8 = vcgeq_u8(qlimit, q15u8);

    // vp8_filter() function
    // convert to signed: XOR with 0x80 maps u8 pixels to biased s8
    q10 = vdupq_n_u8(0x80);
    q8 = veorq_u8(q8, q10);
    q7 = veorq_u8(q7, q10);
    q6 = veorq_u8(q6, q10);
    q5 = veorq_u8(q5, q10);

    q2u8 = vshrq_n_u8(q2u8, 1);  // |p1 - q1| / 2
    q9 = vqaddq_u8(q9, q2u8);    // 2*|p0-q0| + |p1-q1|/2

    q10 = vdupq_n_u8(3);         // multiplier for 3 * (qs0 - ps0)

    // qs0 - ps0, widened to 16 bits so the *3 below cannot overflow.
    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));

    // Edge-strength test: 2*|p0-q0| + |p1-q1|/2 <= blimit.
    q9 = vcgeq_u8(qblimit, q9);

    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                     vreinterpretq_s8_u8(q8));  // clamp(ps1 - qs1)

    q14u8 = vorrq_u8(q13u8, q14u8);  // combined hev mask

    q4u16 = vmovl_u8(vget_low_u8(q10));
    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));    // 3*(qs0-ps0) low
    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));  // 3*(qs0-ps0) high

    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);  // keep (ps1-qs1) only where hev
    q15u8 = vandq_u8(q15u8, q9);  // final mask = smoothness mask & edge test

    q1s8 = vreinterpretq_s8_u8(q1u8);
    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

    q9 = vdupq_n_u8(4);
    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2s8 = vqmovn_s16(q2s16);   // saturating narrow back to s8
    d3s8 = vqmovn_s16(q11s16);
    q1s8 = vcombine_s8(d2s8, d3s8);
    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);  // zero lanes with mask off
    q1s8 = vreinterpretq_s8_u8(q1u8);

    // Filter2 = clamp(vp8_filter + 3) >> 3 (q2s8);
    // Filter1 = clamp(vp8_filter + 4) >> 3 (q1s8).
    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q1s8 = vshrq_n_s8(q1s8, 3);

    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);  // ps0 + Filter2
    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);  // qs0 - Filter1

    // Outer-tap adjustment: (Filter1 + 1) >> 1, suppressed where hev.
    q1s8 = vrshrq_n_s8(q1s8, 1);
    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);  // ps1 + adjustment
    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);  // qs1 - adjustment

    // Convert back to unsigned (undo the 0x80 bias) and write results.
    q0u8 = vdupq_n_u8(0x80);
    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
    return;
}
| 125 |
/* Normal-strength filter across a horizontal luma edge: loads the 8
 * rows straddling the edge (p3..q3), runs the shared 16-wide kernel,
 * and writes back the 4 rows it may modify (p1, p0, q0, q1).
 */
void vp8_loop_filter_horizontal_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    uint8x16_t rows[8];  /* p3 p2 p1 p0 q0 q1 q2 q3 */
    int i;
    const uint8x16_t qblimit = vdupq_n_u8(blimit);
    const uint8x16_t qlimit = vdupq_n_u8(limit);
    const uint8x16_t qthresh = vdupq_n_u8(thresh);

    src -= (pitch << 2);  /* rewind 4 rows, to p3 */
    for (i = 0; i < 8; i++) {
        rows[i] = vld1q_u8(src);
        src += pitch;
    }

    vp8_loop_filter_neon(qblimit, qlimit, qthresh,
                         rows[0], rows[1], rows[2], rows[3],
                         rows[4], rows[5], rows[6], rows[7],
                         &rows[2], &rows[3], &rows[4], &rows[5]);

    src -= (pitch * 6);  /* back to the p1 row */
    for (i = 0; i < 4; i++) {
        vst1q_u8(src, rows[2 + i]);
        src += pitch;
    }
}
| 170 |
/* Normal-strength filter across a horizontal chroma edge.  The U rows
 * are packed into the low halves of the q registers and the V rows into
 * the high halves, so a single 16-lane filter pass covers both planes.
 */
void vp8_loop_filter_horizontal_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    uint8x16_t rows[8];  /* p3 p2 p1 p0 q0 q1 q2 q3, U low | V high */
    int i;
    const uint8x16_t qblimit = vdupq_n_u8(blimit);
    const uint8x16_t qlimit = vdupq_n_u8(limit);
    const uint8x16_t qthresh = vdupq_n_u8(thresh);

    u -= (pitch << 2);  /* rewind 4 rows, to p3 */
    v -= (pitch << 2);

    for (i = 0; i < 8; i++) {
        const uint8x8_t u_row = vld1_u8(u);
        const uint8x8_t v_row = vld1_u8(v);
        rows[i] = vcombine_u8(u_row, v_row);
        u += pitch;
        v += pitch;
    }

    vp8_loop_filter_neon(qblimit, qlimit, qthresh,
                         rows[0], rows[1], rows[2], rows[3],
                         rows[4], rows[5], rows[6], rows[7],
                         &rows[2], &rows[3], &rows[4], &rows[5]);

    /* Write back the four modified rows (p1 p0 q0 q1) in each plane. */
    u -= (pitch * 6);
    v -= (pitch * 6);
    for (i = 0; i < 4; i++) {
        vst1_u8(u, vget_low_u8(rows[2 + i]));
        vst1_u8(v, vget_high_u8(rows[2 + i]));
        u += pitch;
        v += pitch;
    }
}
| 253 |
/* Store a 4x8 block produced by the vertical filter: result holds four
 * 8-lane vectors (one per output column p1/p0/q0/q1), and each of the
 * 8 destination rows receives 4 consecutive bytes.
 */
static INLINE void write_4x8(unsigned char *dst, int pitch,
                             const uint8x8x4_t result) {
#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
    // NOTE(review): this gate presumably selects toolchains whose
    // vst4_lane_u8 on a uint8x8x4_t is available/correct — confirm.
    // Interleaving store: lane i of all four vectors -> 4 bytes at row i.
    vst4_lane_u8(dst, result, 0);
    dst += pitch;
    vst4_lane_u8(dst, result, 1);
    dst += pitch;
    vst4_lane_u8(dst, result, 2);
    dst += pitch;
    vst4_lane_u8(dst, result, 3);
    dst += pitch;
    vst4_lane_u8(dst, result, 4);
    dst += pitch;
    vst4_lane_u8(dst, result, 5);
    dst += pitch;
    vst4_lane_u8(dst, result, 6);
    dst += pitch;
    vst4_lane_u8(dst, result, 7);
#else
    // Fallback: transpose the 4x8 block in registers with vtrn steps,
    // then store one 32-bit word per destination row.
    /*
     * uint8x8x4_t result
    00 01 02 03 | 04 05 06 07
    10 11 12 13 | 14 15 16 17
    20 21 22 23 | 24 25 26 27
    30 31 32 33 | 34 35 36 37
    ---
    * after vtrn_u16
    00 01 20 21 | 04 05 24 25
    02 03 22 23 | 06 07 26 27
    10 11 30 31 | 14 15 34 35
    12 13 32 33 | 16 17 36 37
    ---
    * after vtrn_u8
    00 10 20 30 | 04 14 24 34
    01 11 21 31 | 05 15 25 35
    02 12 22 32 | 06 16 26 36
    03 13 23 33 | 07 17 27 37
    */
    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
                                          vreinterpret_u16_u8(result.val[2]));
    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
                                          vreinterpret_u16_u8(result.val[3]));
    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                       vreinterpret_u8_u16(r13_u16.val[0]));
    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                       vreinterpret_u8_u16(r13_u16.val[1]));
    // Each 32-bit lane now holds the 4 output bytes for one row:
    // lane 0 = rows 0..3, lane 1 = rows 4..7.
    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
    // NOTE(review): the (uint32_t *) casts assume dst tolerates 4-byte
    // lane stores at arbitrary alignment — vst1_lane_u32 typically does
    // on ARM, but verify against the project's build flags.
    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#endif
}
| 321 |
/* Normal-strength filter across a vertical luma edge.  Sixteen rows are
 * loaded 8 bytes wide (4 pixels either side of the edge), transposed in
 * registers so each q vector holds one pixel column for all 16 rows,
 * filtered with the shared kernel, then the 4 modified columns are
 * transposed back to memory order by write_4x8().
 */
void vp8_loop_filter_vertical_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    unsigned char *s, *d;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    // Load rows 0-7 into the even d registers (d6..d20) and rows 8-15
    // into the odd ones (d7..d21), starting 4 pixels left of the edge.
    s = src - 4;
    d6 = vld1_u8(s);
    s += pitch;
    d8 = vld1_u8(s);
    s += pitch;
    d10 = vld1_u8(s);
    s += pitch;
    d12 = vld1_u8(s);
    s += pitch;
    d14 = vld1_u8(s);
    s += pitch;
    d16 = vld1_u8(s);
    s += pitch;
    d18 = vld1_u8(s);
    s += pitch;
    d20 = vld1_u8(s);
    s += pitch;
    d7 = vld1_u8(s);
    s += pitch;
    d9 = vld1_u8(s);
    s += pitch;
    d11 = vld1_u8(s);
    s += pitch;
    d13 = vld1_u8(s);
    s += pitch;
    d15 = vld1_u8(s);
    s += pitch;
    d17 = vld1_u8(s);
    s += pitch;
    d19 = vld1_u8(s);
    s += pitch;
    d21 = vld1_u8(s);

    // Pair the halves: low half of each q = a row from 0-7, high half
    // the matching row from 8-15.
    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    // 16x8 -> 8x16 transpose: vtrn at 32-, then 16-, then 8-bit
    // granularity turns "one row per register" into "one column per
    // register".
    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    // After the transpose: q3..q10 are columns p3 p2 p1 p0 q0 q1 q2 q3.
    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    // Re-pack the four filtered columns (p1 p0 q0 q1) for the 4x8
    // stores: L covers rows 0-7, H covers rows 8-15.
    q4ResultL.val[0] = vget_low_u8(q5);   // d10
    q4ResultL.val[1] = vget_low_u8(q6);   // d12
    q4ResultL.val[2] = vget_low_u8(q7);   // d14
    q4ResultL.val[3] = vget_low_u8(q8);   // d16
    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17

    // Stores cover the 4 pixels p1..q1, i.e. 2 left of the edge.
    d = src - 2;
    write_4x8(d, pitch, q4ResultL);
    d += pitch * 8;
    write_4x8(d, pitch, q4ResultH);
}
| 434 |
/* Normal-strength filter across a vertical chroma edge.  Eight rows of
 * the U plane fill the even d registers and eight rows of V the odd
 * ones, so after the in-register transpose each q vector holds one
 * pixel column across both planes (U in the low half, V in the high),
 * letting a single 16-lane filter pass handle U and V together.
 */
void vp8_loop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *us, *ud;
    unsigned char *vs, *vd;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    // Load 8 U rows (even d registers), 4 pixels left of the edge.
    us = u - 4;
    d6 = vld1_u8(us);
    us += pitch;
    d8 = vld1_u8(us);
    us += pitch;
    d10 = vld1_u8(us);
    us += pitch;
    d12 = vld1_u8(us);
    us += pitch;
    d14 = vld1_u8(us);
    us += pitch;
    d16 = vld1_u8(us);
    us += pitch;
    d18 = vld1_u8(us);
    us += pitch;
    d20 = vld1_u8(us);

    // Load 8 V rows (odd d registers).
    vs = v - 4;
    d7 = vld1_u8(vs);
    vs += pitch;
    d9 = vld1_u8(vs);
    vs += pitch;
    d11 = vld1_u8(vs);
    vs += pitch;
    d13 = vld1_u8(vs);
    vs += pitch;
    d15 = vld1_u8(vs);
    vs += pitch;
    d17 = vld1_u8(vs);
    vs += pitch;
    d19 = vld1_u8(vs);
    vs += pitch;
    d21 = vld1_u8(vs);

    // Pair the planes: low half of each q = U row, high half = V row.
    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    // 16x8 -> 8x16 transpose via 32-, 16- and 8-bit vtrn steps, turning
    // "one row per register" into "one column per register".
    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    // After the transpose: q3..q10 are columns p3 p2 p1 p0 q0 q1 q2 q3.
    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    // Low halves = U plane: store the 4 filtered columns (p1..q1),
    // starting 2 pixels left of the edge.
    q4ResultL.val[0] = vget_low_u8(q5);   // d10
    q4ResultL.val[1] = vget_low_u8(q6);   // d12
    q4ResultL.val[2] = vget_low_u8(q7);   // d14
    q4ResultL.val[3] = vget_low_u8(q8);   // d16
    ud = u - 2;
    write_4x8(ud, pitch, q4ResultL);

    // High halves = V plane.
    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17
    vd = v - 2;
    write_4x8(vd, pitch, q4ResultH);
}
OLD | NEW |