| 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // |
| 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- |
| 9 // |
| 10 // ARM NEON version of dsp functions and loop filtering. |
| 11 // |
| 12 // Authors: Somnath Banerjee (somnath@google.com) |
| 13 // Johann Koenig (johannkoenig@google.com) |
| 14 |
| 15 #include "./dsp.h" |
| 16 |
| 17 #if defined(WEBP_USE_NEON) |
| 18 |
| 19 #include "./neon.h" |
| 20 #include "../dec/vp8i.h" |
| 21 |
| 22 //------------------------------------------------------------------------------ |
| 23 // NxM Loading functions |
| 24 |
| 25 // Load/Store vertical edge |
| 26 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ |
| 27 "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \ |
| 28 "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \ |
| 29 "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \ |
| 30 "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \ |
| 31 "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \ |
| 32 "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \ |
| 33 "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \ |
| 34 "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n" |
| 35 |
| 36 #define STORE8x2(c1, c2, p, stride) \ |
| 37 "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \ |
| 38 "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \ |
| 39 "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \ |
| 40 "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \ |
| 41 "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \ |
| 42 "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \ |
| 43 "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \ |
| 44 "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n" |
| 45 |
| 46 #if !defined(WORK_AROUND_GCC) |
| 47 |
| 48 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation |
| 49 // (register alloc, probably). The variants somewhat mitigate the problem, but |
| 50 // not quite. HFilter16i() remains problematic. |
| 51 static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) { |
| 52 const uint8x8_t zero = vdup_n_u8(0); |
| 53 uint8x8x4_t out; |
| 54 INIT_VECTOR4(out, zero, zero, zero, zero); |
| 55 out = vld4_lane_u8(src + 0 * stride, out, 0); |
| 56 out = vld4_lane_u8(src + 1 * stride, out, 1); |
| 57 out = vld4_lane_u8(src + 2 * stride, out, 2); |
| 58 out = vld4_lane_u8(src + 3 * stride, out, 3); |
| 59 out = vld4_lane_u8(src + 4 * stride, out, 4); |
| 60 out = vld4_lane_u8(src + 5 * stride, out, 5); |
| 61 out = vld4_lane_u8(src + 6 * stride, out, 6); |
| 62 out = vld4_lane_u8(src + 7 * stride, out, 7); |
| 63 return out; |
| 64 } |
| 65 |
| 66 static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride, |
| 67 uint8x16_t* const p1, uint8x16_t* const p0, |
| 68 uint8x16_t* const q0, uint8x16_t* const q1) { |
| 69 // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7] |
| 70 // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15] |
| 71 const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride); |
| 72 const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride); |
| 73 *p1 = vcombine_u8(row0.val[0], row8.val[0]); |
| 74 *p0 = vcombine_u8(row0.val[1], row8.val[1]); |
| 75 *q0 = vcombine_u8(row0.val[2], row8.val[2]); |
| 76 *q1 = vcombine_u8(row0.val[3], row8.val[3]); |
| 77 } |
| 78 |
| 79 #else // WORK_AROUND_GCC |
| 80 |
| 81 #define LOADQ_LANE_32b(VALUE, LANE) do { \ |
| 82 (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \ |
| 83 src += stride; \ |
| 84 } while (0) |
| 85 |
| 86 static WEBP_INLINE void Load4x16(const uint8_t* src, int stride, |
| 87 uint8x16_t* const p1, uint8x16_t* const p0, |
| 88 uint8x16_t* const q0, uint8x16_t* const q1) { |
| 89 const uint32x4_t zero = vdupq_n_u32(0); |
| 90 uint32x4x4_t in; |
| 91 INIT_VECTOR4(in, zero, zero, zero, zero); |
| 92 src -= 2; |
| 93 LOADQ_LANE_32b(in.val[0], 0); |
| 94 LOADQ_LANE_32b(in.val[1], 0); |
| 95 LOADQ_LANE_32b(in.val[2], 0); |
| 96 LOADQ_LANE_32b(in.val[3], 0); |
| 97 LOADQ_LANE_32b(in.val[0], 1); |
| 98 LOADQ_LANE_32b(in.val[1], 1); |
| 99 LOADQ_LANE_32b(in.val[2], 1); |
| 100 LOADQ_LANE_32b(in.val[3], 1); |
| 101 LOADQ_LANE_32b(in.val[0], 2); |
| 102 LOADQ_LANE_32b(in.val[1], 2); |
| 103 LOADQ_LANE_32b(in.val[2], 2); |
| 104 LOADQ_LANE_32b(in.val[3], 2); |
| 105 LOADQ_LANE_32b(in.val[0], 3); |
| 106 LOADQ_LANE_32b(in.val[1], 3); |
| 107 LOADQ_LANE_32b(in.val[2], 3); |
| 108 LOADQ_LANE_32b(in.val[3], 3); |
| 109 // Transpose four 4x4 parts: |
| 110 { |
| 111 const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]), |
| 112 vreinterpretq_u8_u32(in.val[1])); |
| 113 const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]), |
| 114 vreinterpretq_u8_u32(in.val[3])); |
| 115 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]), |
| 116 vreinterpretq_u16_u8(row23.val[0])); |
| 117 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]), |
| 118 vreinterpretq_u16_u8(row23.val[1])); |
| 119 *p1 = vreinterpretq_u8_u16(row02.val[0]); |
| 120 *p0 = vreinterpretq_u8_u16(row13.val[0]); |
| 121 *q0 = vreinterpretq_u8_u16(row02.val[1]); |
| 122 *q1 = vreinterpretq_u8_u16(row13.val[1]); |
| 123 } |
| 124 } |
| 125 #undef LOADQ_LANE_32b |
| 126 |
| 127 #endif // !WORK_AROUND_GCC |
| 128 |
| 129 static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride, |
| 130 uint8x16_t* const p3, uint8x16_t* const p2, |
| 131 uint8x16_t* const p1, uint8x16_t* const p0, |
| 132 uint8x16_t* const q0, uint8x16_t* const q1, |
| 133 uint8x16_t* const q2, uint8x16_t* const q3) { |
| 134 Load4x16(src - 2, stride, p3, p2, p1, p0); |
| 135 Load4x16(src + 2, stride, q0, q1, q2, q3); |
| 136 } |
| 137 |
| 138 static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride, |
| 139 uint8x16_t* const p1, uint8x16_t* const p0, |
| 140 uint8x16_t* const q0, uint8x16_t* const q1) { |
| 141 *p1 = vld1q_u8(src - 2 * stride); |
| 142 *p0 = vld1q_u8(src - 1 * stride); |
| 143 *q0 = vld1q_u8(src + 0 * stride); |
| 144 *q1 = vld1q_u8(src + 1 * stride); |
| 145 } |
| 146 |
| 147 static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride, |
| 148 uint8x16_t* const p3, uint8x16_t* const p2, |
| 149 uint8x16_t* const p1, uint8x16_t* const p0, |
| 150 uint8x16_t* const q0, uint8x16_t* const q1, |
| 151 uint8x16_t* const q2, uint8x16_t* const q3) { |
| 152 Load16x4(src - 2 * stride, stride, p3, p2, p1, p0); |
| 153 Load16x4(src + 2 * stride, stride, q0, q1, q2, q3); |
| 154 } |
| 155 |
| 156 static WEBP_INLINE void Load8x8x2(const uint8_t* const u, |
| 157 const uint8_t* const v, |
| 158 int stride, |
| 159 uint8x16_t* const p3, uint8x16_t* const p2, |
| 160 uint8x16_t* const p1, uint8x16_t* const p0, |
| 161 uint8x16_t* const q0, uint8x16_t* const q1, |
| 162 uint8x16_t* const q2, uint8x16_t* const q3) { |
| 163 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination |
| 164 // and the v-samples in the upper half.
| 165 *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride)); |
| 166 *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride)); |
| 167 *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride)); |
| 168 *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride)); |
| 169 *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride)); |
| 170 *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride)); |
| 171 *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride)); |
| 172 *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride)); |
| 173 } |
| 174 |
| 175 #if !defined(WORK_AROUND_GCC) |
| 176 |
| 177 #define LOAD_UV_8(ROW) \ |
| 178 vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride)) |
| 179 |
| 180 static WEBP_INLINE void Load8x8x2T(const uint8_t* const u, |
| 181 const uint8_t* const v, |
| 182 int stride, |
| 183 uint8x16_t* const p3, uint8x16_t* const p2, |
| 184 uint8x16_t* const p1, uint8x16_t* const p0, |
| 185 uint8x16_t* const q0, uint8x16_t* const q1, |
| 186 uint8x16_t* const q2, uint8x16_t* const q3) { |
| 187 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination |
| 188 // and the v-samples in the upper half.
| 189 const uint8x16_t row0 = LOAD_UV_8(0); |
| 190 const uint8x16_t row1 = LOAD_UV_8(1); |
| 191 const uint8x16_t row2 = LOAD_UV_8(2); |
| 192 const uint8x16_t row3 = LOAD_UV_8(3); |
| 193 const uint8x16_t row4 = LOAD_UV_8(4); |
| 194 const uint8x16_t row5 = LOAD_UV_8(5); |
| 195 const uint8x16_t row6 = LOAD_UV_8(6); |
| 196 const uint8x16_t row7 = LOAD_UV_8(7); |
| 197 // Perform two side-by-side 8x8 transposes |
| 198 // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07 |
| 199 // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ... |
| 200 // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ... |
| 201 // u30 u31 u32 u33 u34 u35 u36 u37 | ... |
| 202 // u40 u41 u42 u43 u44 u45 u46 u47 | ... |
| 203 // u50 u51 u52 u53 u54 u55 u56 u57 | ... |
| 204 // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ... |
| 205 // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ... |
| 206 const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ... |
| 207 // u01 u11 u03 u13 ... |
| 208 const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ... |
| 209 // u21 u31 u23 u33 ... |
| 210 const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ... |
| 211 const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ... |
| 212 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]), |
| 213 vreinterpretq_u16_u8(row23.val[0])); |
| 214 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]), |
| 215 vreinterpretq_u16_u8(row23.val[1])); |
| 216 const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]), |
| 217 vreinterpretq_u16_u8(row67.val[0])); |
| 218 const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]), |
| 219 vreinterpretq_u16_u8(row67.val[1])); |
| 220 const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]), |
| 221 vreinterpretq_u32_u16(row46.val[0])); |
| 222 const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]), |
| 223 vreinterpretq_u32_u16(row46.val[1])); |
| 224 const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]), |
| 225 vreinterpretq_u32_u16(row57.val[0])); |
| 226 const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]), |
| 227 vreinterpretq_u32_u16(row57.val[1])); |
| 228 *p3 = vreinterpretq_u8_u32(row04.val[0]); |
| 229 *p2 = vreinterpretq_u8_u32(row15.val[0]); |
| 230 *p1 = vreinterpretq_u8_u32(row26.val[0]); |
| 231 *p0 = vreinterpretq_u8_u32(row37.val[0]); |
| 232 *q0 = vreinterpretq_u8_u32(row04.val[1]); |
| 233 *q1 = vreinterpretq_u8_u32(row15.val[1]); |
| 234 *q2 = vreinterpretq_u8_u32(row26.val[1]); |
| 235 *q3 = vreinterpretq_u8_u32(row37.val[1]); |
| 236 } |
| 237 #undef LOAD_UV_8 |
| 238 |
| 239 #endif // !WORK_AROUND_GCC |
| 240 |
| 241 static WEBP_INLINE void Store2x8(const uint8x8x2_t v, |
| 242 uint8_t* const dst, int stride) { |
| 243 vst2_lane_u8(dst + 0 * stride, v, 0); |
| 244 vst2_lane_u8(dst + 1 * stride, v, 1); |
| 245 vst2_lane_u8(dst + 2 * stride, v, 2); |
| 246 vst2_lane_u8(dst + 3 * stride, v, 3); |
| 247 vst2_lane_u8(dst + 4 * stride, v, 4); |
| 248 vst2_lane_u8(dst + 5 * stride, v, 5); |
| 249 vst2_lane_u8(dst + 6 * stride, v, 6); |
| 250 vst2_lane_u8(dst + 7 * stride, v, 7); |
| 251 } |
| 252 |
| 253 static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0, |
| 254 uint8_t* const dst, int stride) { |
| 255 uint8x8x2_t lo, hi; |
| 256 lo.val[0] = vget_low_u8(p0); |
| 257 lo.val[1] = vget_low_u8(q0); |
| 258 hi.val[0] = vget_high_u8(p0); |
| 259 hi.val[1] = vget_high_u8(q0); |
| 260 Store2x8(lo, dst - 1 + 0 * stride, stride); |
| 261 Store2x8(hi, dst - 1 + 8 * stride, stride); |
| 262 } |
| 263 |
| 264 #if !defined(WORK_AROUND_GCC) |
| 265 static WEBP_INLINE void Store4x8(const uint8x8x4_t v, |
| 266 uint8_t* const dst, int stride) { |
| 267 vst4_lane_u8(dst + 0 * stride, v, 0); |
| 268 vst4_lane_u8(dst + 1 * stride, v, 1); |
| 269 vst4_lane_u8(dst + 2 * stride, v, 2); |
| 270 vst4_lane_u8(dst + 3 * stride, v, 3); |
| 271 vst4_lane_u8(dst + 4 * stride, v, 4); |
| 272 vst4_lane_u8(dst + 5 * stride, v, 5); |
| 273 vst4_lane_u8(dst + 6 * stride, v, 6); |
| 274 vst4_lane_u8(dst + 7 * stride, v, 7); |
| 275 } |
| 276 |
| 277 static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0, |
| 278 const uint8x16_t q0, const uint8x16_t q1, |
| 279 uint8_t* const dst, int stride) { |
| 280 uint8x8x4_t lo, hi; |
| 281 INIT_VECTOR4(lo, |
| 282 vget_low_u8(p1), vget_low_u8(p0), |
| 283 vget_low_u8(q0), vget_low_u8(q1)); |
| 284 INIT_VECTOR4(hi, |
| 285 vget_high_u8(p1), vget_high_u8(p0), |
| 286 vget_high_u8(q0), vget_high_u8(q1)); |
| 287 Store4x8(lo, dst - 2 + 0 * stride, stride); |
| 288 Store4x8(hi, dst - 2 + 8 * stride, stride); |
| 289 } |
| 290 #endif // !WORK_AROUND_GCC |
| 291 |
| 292 static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0, |
| 293 uint8_t* const dst, int stride) { |
| 294 vst1q_u8(dst - stride, p0); |
| 295 vst1q_u8(dst, q0); |
| 296 } |
| 297 |
| 298 static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0, |
| 299 const uint8x16_t q0, const uint8x16_t q1, |
| 300 uint8_t* const dst, int stride) { |
| 301 Store16x2(p1, p0, dst - stride, stride); |
| 302 Store16x2(q0, q1, dst + stride, stride); |
| 303 } |
| 304 |
| 305 static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0, |
| 306 uint8_t* const u, uint8_t* const v, |
| 307 int stride) { |
| 308 // p0 and q0 contain the u+v samples packed in low/high halves. |
| 309 vst1_u8(u - stride, vget_low_u8(p0)); |
| 310 vst1_u8(u, vget_low_u8(q0)); |
| 311 vst1_u8(v - stride, vget_high_u8(p0)); |
| 312 vst1_u8(v, vget_high_u8(q0)); |
| 313 } |
| 314 |
| 315 static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0, |
| 316 const uint8x16_t q0, const uint8x16_t q1, |
| 317 uint8_t* const u, uint8_t* const v, |
| 318 int stride) { |
| 319 // The p1...q1 registers contain the u+v samples packed in low/high halves. |
| 320 Store8x2x2(p1, p0, u - stride, v - stride, stride); |
| 321 Store8x2x2(q0, q1, u + stride, v + stride, stride); |
| 322 } |
| 323 |
| 324 #if !defined(WORK_AROUND_GCC) |
| 325 |
| 326 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \ |
| 327 vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \ |
| 328 vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \ |
| 329 (DST) += stride; \ |
| 330 } while (0) |
| 331 |
| 332 static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1, |
| 333 const uint8x16_t p0, const uint8x16_t q0, |
| 334 const uint8x16_t q1, const uint8x16_t q2, |
| 335 uint8_t* u, uint8_t* v, |
| 336 int stride) { |
| 337 uint8x8x3_t u0, u1, v0, v1; |
| 338 INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0)); |
| 339 INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2)); |
| 340 INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0)); |
| 341 INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2)); |
| 342 STORE6_LANE(u, u0, u1, 0); |
| 343 STORE6_LANE(u, u0, u1, 1); |
| 344 STORE6_LANE(u, u0, u1, 2); |
| 345 STORE6_LANE(u, u0, u1, 3); |
| 346 STORE6_LANE(u, u0, u1, 4); |
| 347 STORE6_LANE(u, u0, u1, 5); |
| 348 STORE6_LANE(u, u0, u1, 6); |
| 349 STORE6_LANE(u, u0, u1, 7); |
| 350 STORE6_LANE(v, v0, v1, 0); |
| 351 STORE6_LANE(v, v0, v1, 1); |
| 352 STORE6_LANE(v, v0, v1, 2); |
| 353 STORE6_LANE(v, v0, v1, 3); |
| 354 STORE6_LANE(v, v0, v1, 4); |
| 355 STORE6_LANE(v, v0, v1, 5); |
| 356 STORE6_LANE(v, v0, v1, 6); |
| 357 STORE6_LANE(v, v0, v1, 7); |
| 358 } |
| 359 #undef STORE6_LANE |
| 360 |
| 361 static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0, |
| 362 const uint8x16_t q0, const uint8x16_t q1, |
| 363 uint8_t* const u, uint8_t* const v, |
| 364 int stride) { |
| 365 uint8x8x4_t u0, v0; |
| 366 INIT_VECTOR4(u0, |
| 367 vget_low_u8(p1), vget_low_u8(p0), |
| 368 vget_low_u8(q0), vget_low_u8(q1)); |
| 369 INIT_VECTOR4(v0, |
| 370 vget_high_u8(p1), vget_high_u8(p0), |
| 371 vget_high_u8(q0), vget_high_u8(q1)); |
| 372 vst4_lane_u8(u - 2 + 0 * stride, u0, 0); |
| 373 vst4_lane_u8(u - 2 + 1 * stride, u0, 1); |
| 374 vst4_lane_u8(u - 2 + 2 * stride, u0, 2); |
| 375 vst4_lane_u8(u - 2 + 3 * stride, u0, 3); |
| 376 vst4_lane_u8(u - 2 + 4 * stride, u0, 4); |
| 377 vst4_lane_u8(u - 2 + 5 * stride, u0, 5); |
| 378 vst4_lane_u8(u - 2 + 6 * stride, u0, 6); |
| 379 vst4_lane_u8(u - 2 + 7 * stride, u0, 7); |
| 380 vst4_lane_u8(v - 2 + 0 * stride, v0, 0); |
| 381 vst4_lane_u8(v - 2 + 1 * stride, v0, 1); |
| 382 vst4_lane_u8(v - 2 + 2 * stride, v0, 2); |
| 383 vst4_lane_u8(v - 2 + 3 * stride, v0, 3); |
| 384 vst4_lane_u8(v - 2 + 4 * stride, v0, 4); |
| 385 vst4_lane_u8(v - 2 + 5 * stride, v0, 5); |
| 386 vst4_lane_u8(v - 2 + 6 * stride, v0, 6); |
| 387 vst4_lane_u8(v - 2 + 7 * stride, v0, 7); |
| 388 } |
| 389 |
| 390 #endif // !WORK_AROUND_GCC |
| 391 |
| 392 // Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
| 393 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { |
| 394 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); |
| 395 } |
| 396 |
| 397 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result |
| 398 // to the corresponding rows of 'dst'. |
| 399 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, |
| 400 const int16x8_t dst01, |
| 401 const int16x8_t dst23) { |
| 402 // Unsigned saturate to 8b. |
| 403 const uint8x8_t dst01_u8 = vqmovun_s16(dst01); |
| 404 const uint8x8_t dst23_u8 = vqmovun_s16(dst23); |
| 405 |
| 406 // Store the results. |
| 407 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0); |
| 408 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1); |
| 409 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0); |
| 410 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); |
| 411 } |
| 412 |
| 413 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, |
| 414 uint8_t* const dst) { |
| 415 uint32x2_t dst01 = vdup_n_u32(0); |
| 416 uint32x2_t dst23 = vdup_n_u32(0); |
| 417 |
| 418 // Load the source pixels. |
| 419 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0); |
| 420 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0); |
| 421 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1); |
| 422 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1); |
| 423 |
| 424 { |
| 425 // Convert to 16b. |
| 426 const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); |
| 427 const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); |
| 428 |
| 429 // Descale with rounding. |
| 430 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); |
| 431 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); |
| 432 // Add the inverse transform. |
| 433 SaturateAndStore4x4(dst, out01, out23); |
| 434 } |
| 435 } |
| 436 |
| 437 //----------------------------------------------------------------------------- |
| 438 // Simple In-loop filtering (Paragraph 15.2) |
| 439 |
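| // Simple-filter decision: 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh.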
| 440 static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0, |
| 441 const uint8x16_t q0, const uint8x16_t q1, |
| 442 int thresh) { |
| 443 const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh); |
| 444 const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0) |
| 445 const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1) |
| 446 const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0) |
| 447 const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2 |
| 448 const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2); |
| 449 const uint8x16_t mask = vcgeq_u8(thresh_v, sum); |
| 450 return mask; |
| 451 } |
| 452 |
| 453 static int8x16_t FlipSign(const uint8x16_t v) { |
| 454 const uint8x16_t sign_bit = vdupq_n_u8(0x80); |
| 455 return vreinterpretq_s8_u8(veorq_u8(v, sign_bit)); |
| 456 } |
| 457 |
| 458 static uint8x16_t FlipSignBack(const int8x16_t v) { |
| 459 const int8x16_t sign_bit = vdupq_n_s8(0x80); |
| 460 return vreinterpretq_u8_s8(veorq_s8(v, sign_bit)); |
| 461 } |
| 462 |
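| // Base filter value: saturating (p1 - q1) + 3 * (q0 - p0), as used by the
| // common VP8 filter step.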
| 463 static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0, |
| 464 const int8x16_t q0, const int8x16_t q1) { |
| 465 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) |
| 466 const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1) |
| 467 const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0) |
| 468 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0) |
| 469 const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0) |
| 470 return s3; |
| 471 } |
| 472 |
| 473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) { |
| 474 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0) |
| 475 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0) |
| 476 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0) |
| 477 return s2; |
| 478 } |
| 479 |
| 480 //------------------------------------------------------------------------------ |
| 481 |
| 482 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s, |
| 483 const int8x16_t delta, |
| 484 uint8x16_t* const op0, uint8x16_t* const oq0) { |
| 485 const int8x16_t kCst3 = vdupq_n_s8(0x03); |
| 486 const int8x16_t kCst4 = vdupq_n_s8(0x04); |
| 487 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3); |
| 488 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4); |
| 489 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3); |
| 490 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3); |
| 491 const int8x16_t sp0 = vqaddq_s8(p0s, delta3); |
| 492 const int8x16_t sq0 = vqsubq_s8(q0s, delta4); |
| 493 *op0 = FlipSignBack(sp0); |
| 494 *oq0 = FlipSignBack(sq0); |
| 495 } |
| 496 |
| 497 #if defined(USE_INTRINSICS) |
| 498 |
| 499 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0, |
| 500 const uint8x16_t q0, const uint8x16_t q1, |
| 501 const uint8x16_t mask, |
| 502 uint8x16_t* const op0, uint8x16_t* const oq0) { |
| 503 const int8x16_t p1s = FlipSign(p1); |
| 504 const int8x16_t p0s = FlipSign(p0); |
| 505 const int8x16_t q0s = FlipSign(q0); |
| 506 const int8x16_t q1s = FlipSign(q1); |
| 507 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); |
| 508 const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask)); |
| 509 ApplyFilter2(p0s, q0s, delta1, op0, oq0); |
| 510 } |
| 511 |
| 512 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { |
| 513 uint8x16_t p1, p0, q0, q1, op0, oq0; |
| 514 Load16x4(p, stride, &p1, &p0, &q0, &q1); |
| 515 { |
| 516 const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); |
| 517 DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); |
| 518 } |
| 519 Store16x2(op0, oq0, p, stride); |
| 520 } |
| 521 |
| 522 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { |
| 523 uint8x16_t p1, p0, q0, q1, oq0, op0; |
| 524 Load4x16(p, stride, &p1, &p0, &q0, &q1); |
| 525 { |
| 526 const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh); |
| 527 DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0); |
| 528 } |
| 529 Store2x16(op0, oq0, p, stride); |
| 530 } |
| 531 |
| 532 #else |
| 533 |
| 534 #define QRegs "q0", "q1", "q2", "q3", \ |
| 535 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
| 536 |
| 537 #define FLIP_SIGN_BIT2(a, b, s) \ |
| 538 "veor " #a "," #a "," #s " \n" \ |
| 539 "veor " #b "," #b "," #s " \n" \ |
| 540 |
| 541 #define FLIP_SIGN_BIT4(a, b, c, d, s) \ |
| 542 FLIP_SIGN_BIT2(a, b, s) \ |
| 543 FLIP_SIGN_BIT2(c, d, s) \ |
| 544 |
| 545 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \ |
| 546 "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \ |
| 547 "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \ |
| 548 "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \ |
| 549 "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \ |
| 550 "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ |
| 551 "vdup.8 q14, " #thresh " \n" \ |
| 552 "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */ |
| 553 |
| 554 #define GET_BASE_DELTA(p1, p0, q0, q1, o) \ |
| 555 "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \ |
| 556 "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \ |
| 557 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \ |
| 558 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \ |
| 559 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */ |
| 560 |
| 561 #define DO_SIMPLE_FILTER(p0, q0, fl) \ |
| 562 "vmov.i8 q15, #0x03 \n" \ |
| 563 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \ |
| 564 "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \ |
| 565 "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \ |
| 566 \ |
| 567 "vmov.i8 q15, #0x04 \n" \ |
| 568 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \ |
| 569 "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \ |
| 570 "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */ |
| 571 |
| 572 // Applies filter on 2 pixels (p0 and q0) |
| 573 #define DO_FILTER2(p1, p0, q0, q1, thresh) \ |
| 574 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \ |
| 575 "vmov.i8 q10, #0x80 \n" /* sign bit */ \ |
| 576 FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \ |
| 577 GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \ |
| 578 "vand q9, q9, q11 \n" /* apply filter mask */ \ |
| 579 DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \ |
| 580 FLIP_SIGN_BIT2(p0, q0, q10) |
| 581 |
| 582 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { |
| 583 __asm__ volatile ( |
| 584 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride |
| 585 |
| 586 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1 |
| 587 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0 |
| 588 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0 |
| 589 "vld1.u8 {q12}, [%[p]] \n" // q1 |
| 590 |
| 591 DO_FILTER2(q1, q2, q3, q12, %[thresh]) |
| 592 |
| 593 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride |
| 594 |
| 595 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0 |
| 596 "vst1.u8 {q3}, [%[p]] \n" // store oq0 |
| 597 : [p] "+r"(p) |
| 598 : [stride] "r"(stride), [thresh] "r"(thresh) |
| 599 : "memory", QRegs |
| 600 ); |
| 601 } |
| 602 |
| 603 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { |
| 604 __asm__ volatile ( |
| 605 "sub r4, %[p], #2 \n" // base1 = p - 2 |
| 606 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride |
| 607 "add r5, r4, %[stride] \n" // base2 = base1 + stride |
| 608 |
| 609 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6) |
| 610 LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6) |
| 611 "vswp d3, d24 \n" // p1:q1 p0:q3 |
| 612 "vswp d5, d26 \n" // q0:q2 q1:q4 |
| 613 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4 |
| 614 |
| 615 DO_FILTER2(q1, q2, q12, q13, %[thresh]) |
| 616 |
| 617 "sub %[p], %[p], #1 \n" // p - 1 |
| 618 |
| 619 "vswp d5, d24 \n" |
| 620 STORE8x2(d4, d5, [%[p]], %[stride]) |
| 621 STORE8x2(d24, d25, [%[p]], %[stride]) |
| 622 |
| 623 : [p] "+r"(p) |
| 624 : [stride] "r"(stride), [thresh] "r"(thresh) |
| 625 : "memory", "r4", "r5", "r6", QRegs |
| 626 ); |
| 627 } |
| 628 |
| 629 #endif // USE_INTRINSICS |
| 630 |
| 631 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { |
| 632 uint32_t k; |
| 633 for (k = 3; k != 0; --k) { |
| 634 p += 4 * stride; |
| 635 SimpleVFilter16(p, stride, thresh); |
| 636 } |
| 637 } |
| 638 |
| 639 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { |
| 640 uint32_t k; |
| 641 for (k = 3; k != 0; --k) { |
| 642 p += 4; |
| 643 SimpleHFilter16(p, stride, thresh); |
| 644 } |
| 645 } |
| 646 |
| 647 //------------------------------------------------------------------------------ |
| 648 // Complex In-loop filtering (Paragraph 15.3) |
| 649 |
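| // "High edge variance" test: abs(p1 - p0) > hev_thresh or abs(q1 - q0) > hev_thresh.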
| 650 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0, |
| 651 const uint8x16_t q0, const uint8x16_t q1, |
| 652 int hev_thresh) { |
| 653 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh); |
| 654 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) |
| 655 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) |
| 656 const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v); |
| 657 const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v); |
| 658 const uint8x16_t mask = vorrq_u8(mask1, mask2); |
| 659 return mask; |
| 660 } |
| 661 |
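| // Complex-filter mask: every neighbouring absolute difference among
| // p3..p0 and q0..q3 must be <= ithresh, combined with NeedsFilter().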
| 662 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2, |
| 663 const uint8x16_t p1, const uint8x16_t p0, |
| 664 const uint8x16_t q0, const uint8x16_t q1, |
| 665 const uint8x16_t q2, const uint8x16_t q3, |
| 666 int ithresh, int thresh) { |
| 667 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh); |
| 668 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2) |
| 669 const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1) |
| 670 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0) |
| 671 const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2) |
| 672 const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1) |
| 673 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0) |
| 674 const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1); |
| 675 const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2); |
| 676 const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0); |
| 677 const uint8x16_t max12 = vmaxq_u8(max1, max2); |
| 678 const uint8x16_t max123 = vmaxq_u8(max12, max3); |
| 679 const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123); |
| 680 const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh); |
| 681 const uint8x16_t mask = vandq_u8(mask1, mask2); |
| 682 return mask; |
| 683 } |
| 684 |
| 685 // 4-points filter |
| 686 |
| 687 static void ApplyFilter4( |
| 688 const int8x16_t p1, const int8x16_t p0, |
| 689 const int8x16_t q0, const int8x16_t q1, |
| 690 const int8x16_t delta0, |
| 691 uint8x16_t* const op1, uint8x16_t* const op0, |
| 692 uint8x16_t* const oq0, uint8x16_t* const oq1) { |
| 693 const int8x16_t kCst3 = vdupq_n_s8(0x03); |
| 694 const int8x16_t kCst4 = vdupq_n_s8(0x04); |
| 695 const int8x16_t delta1 = vqaddq_s8(delta0, kCst4); |
| 696 const int8x16_t delta2 = vqaddq_s8(delta0, kCst3); |
| 697 const int8x16_t a1 = vshrq_n_s8(delta1, 3); |
| 698 const int8x16_t a2 = vshrq_n_s8(delta2, 3); |
| 699 const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1 |
| 700 *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2) |
| 701 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1) |
| 702 *op1 = FlipSignBack(vqaddq_s8(p1, a3)); // clip(p1 + a3) |
| 703 *oq1 = FlipSignBack(vqsubq_s8(q1, a3)); // clip(q1 - a3) |
| 704 } |
| 705 |
| 706 static void DoFilter4( |
| 707 const uint8x16_t p1, const uint8x16_t p0, |
| 708 const uint8x16_t q0, const uint8x16_t q1, |
| 709 const uint8x16_t mask, const uint8x16_t hev_mask, |
| 710 uint8x16_t* const op1, uint8x16_t* const op0, |
| 711 uint8x16_t* const oq0, uint8x16_t* const oq1) { |
| 712 // This is a fused version of DoFilter2() calling ApplyFilter2 directly |
| 713 const int8x16_t p1s = FlipSign(p1); |
| 714 int8x16_t p0s = FlipSign(p0); |
| 715 int8x16_t q0s = FlipSign(q0); |
| 716 const int8x16_t q1s = FlipSign(q1); |
| 717 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); |
| 718 |
| 719 // do_filter2 part (simple loopfilter on pixels with hev) |
| 720 { |
| 721 const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s); |
| 722 const int8x16_t simple_lf_delta = |
| 723 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask)); |
| 724 uint8x16_t tmp_p0, tmp_q0; |
| 725 ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0); |
| 726 // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here |
| 727 p0s = FlipSign(tmp_p0); |
| 728 q0s = FlipSign(tmp_q0); |
| 729 } |
| 730 |
| 731 // do_filter4 part (complex loopfilter on pixels without hev) |
| 732 { |
| 733 const int8x16_t delta0 = GetBaseDelta0(p0s, q0s); |
| 734 // we use: (mask & hev_mask) ^ mask = mask & ~hev_mask
| 735 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); |
| 736 const int8x16_t complex_lf_delta = |
| 737 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); |
| 738 ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1); |
| 739 } |
| 740 } |
| 741 |
| 742 // 6-points filter |
| 743 |
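| // Strong-filter deltas: a1 = (27 * w + 63) >> 7, a2 = (18 * w + 63) >> 7,
| // a3 = (9 * w + 63) >> 7, where 'w' is the clamped base delta.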
| 744 static void ApplyFilter6( |
| 745 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0, |
| 746 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2, |
| 747 const int8x16_t delta, |
| 748 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, |
| 749 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { |
| 750 const int16x8_t kCst63 = vdupq_n_s16(63); |
| 751 const int8x8_t kCst27 = vdup_n_s8(27); |
| 752 const int8x8_t kCst18 = vdup_n_s8(18); |
| 753 const int8x8_t kCst9 = vdup_n_s8(9); |
| 754 const int8x8_t delta_lo = vget_low_s8(delta); |
| 755 const int8x8_t delta_hi = vget_high_s8(delta); |
| 756 const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a |
| 757 const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a |
| 758 const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a |
| 759 const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a |
| 760 const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a |
| 761 const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a |
| 762 const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7); |
| 763 const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7); |
| 764 const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7); |
| 765 const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7); |
| 766 const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7); |
| 767 const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7); |
| 768 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi); |
| 769 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi); |
| 770 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi); |
| 771 |
| 772 *op0 = FlipSignBack(vqaddq_s8(p0, a1)); // clip(p0 + a1) |
| 773 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1)
| 774 *oq1 = FlipSignBack(vqsubq_s8(q1, a2)); // clip(q1 - a2) |
| 775 *op1 = FlipSignBack(vqaddq_s8(p1, a2)); // clip(p1 + a2) |
| 776 *oq2 = FlipSignBack(vqsubq_s8(q2, a3)); // clip(q2 - a3) |
| 777 *op2 = FlipSignBack(vqaddq_s8(p2, a3)); // clip(p2 + a3) |
| 778 } |
| 779 |
| 780 static void DoFilter6( |
| 781 const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, |
| 782 const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, |
| 783 const uint8x16_t mask, const uint8x16_t hev_mask, |
| 784 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0, |
| 785 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) { |
| 786 // This is a fused version of DoFilter2() calling ApplyFilter2 directly |
| 787 const int8x16_t p2s = FlipSign(p2); |
| 788 const int8x16_t p1s = FlipSign(p1); |
| 789 int8x16_t p0s = FlipSign(p0); |
| 790 int8x16_t q0s = FlipSign(q0); |
| 791 const int8x16_t q1s = FlipSign(q1); |
| 792 const int8x16_t q2s = FlipSign(q2); |
| 793 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask); |
| 794 const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s); |
| 795 |
| 796 // do_filter2 part (simple loopfilter on pixels with hev) |
| 797 { |
| 798 const int8x16_t simple_lf_delta = |
| 799 vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask)); |
| 800 uint8x16_t tmp_p0, tmp_q0; |
| 801 ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0); |
| 802 // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here |
| 803 p0s = FlipSign(tmp_p0); |
| 804 q0s = FlipSign(tmp_q0); |
| 805 } |
| 806 |
| 807 // do_filter6 part (complex loopfilter on pixels without hev) |
| 808 { |
| 809 // we use: (mask & hev_mask) ^ mask = mask & ~hev_mask
| 810 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask); |
| 811 const int8x16_t complex_lf_delta = |
| 812 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask)); |
| 813 ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta, |
| 814 op2, op1, op0, oq0, oq1, oq2); |
| 815 } |
| 816 } |
| 817 |
| 818 // on macroblock edges |
| 819 |
| 820 static void VFilter16(uint8_t* p, int stride, |
| 821 int thresh, int ithresh, int hev_thresh) { |
| 822 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; |
| 823 Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); |
| 824 { |
| 825 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, |
| 826 ithresh, thresh); |
| 827 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); |
| 828 uint8x16_t op2, op1, op0, oq0, oq1, oq2; |
| 829 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, |
| 830 &op2, &op1, &op0, &oq0, &oq1, &oq2); |
| 831 Store16x2(op2, op1, p - 2 * stride, stride); |
| 832 Store16x2(op0, oq0, p + 0 * stride, stride); |
| 833 Store16x2(oq1, oq2, p + 2 * stride, stride); |
| 834 } |
| 835 } |
| 836 |
| 837 static void HFilter16(uint8_t* p, int stride, |
| 838 int thresh, int ithresh, int hev_thresh) { |
| 839 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; |
| 840 Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); |
| 841 { |
| 842 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, |
| 843 ithresh, thresh); |
| 844 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); |
| 845 uint8x16_t op2, op1, op0, oq0, oq1, oq2; |
| 846 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, |
| 847 &op2, &op1, &op0, &oq0, &oq1, &oq2); |
| 848 Store2x16(op2, op1, p - 2, stride); |
| 849 Store2x16(op0, oq0, p + 0, stride); |
| 850 Store2x16(oq1, oq2, p + 2, stride); |
| 851 } |
| 852 } |
| 853 |
| 854 // on three inner edges |
| 855 static void VFilter16i(uint8_t* p, int stride, |
| 856 int thresh, int ithresh, int hev_thresh) { |
| 857 uint32_t k; |
| 858 uint8x16_t p3, p2, p1, p0; |
| 859 Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0); |
| 860 for (k = 3; k != 0; --k) { |
| 861 uint8x16_t q0, q1, q2, q3; |
| 862 p += 4 * stride; |
| 863 Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3); |
| 864 { |
| 865 const uint8x16_t mask = |
| 866 NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); |
| 867 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); |
| 868 // p3 and p2 are not just temporary variables here: they will be |
| 869 // re-used for the next span. And q2/q3 will become p1/p0 accordingly.
| 870 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); |
| 871 Store16x4(p1, p0, p3, p2, p, stride); |
| 872 p1 = q2; |
| 873 p0 = q3; |
| 874 } |
| 875 } |
| 876 } |
| 877 |
| 878 #if !defined(WORK_AROUND_GCC) |
| 879 static void HFilter16i(uint8_t* p, int stride, |
| 880 int thresh, int ithresh, int hev_thresh) { |
| 881 uint32_t k; |
| 882 uint8x16_t p3, p2, p1, p0; |
| 883 Load4x16(p + 2, stride, &p3, &p2, &p1, &p0); |
| 884 for (k = 3; k != 0; --k) { |
| 885 uint8x16_t q0, q1, q2, q3; |
| 886 p += 4; |
| 887 Load4x16(p + 2, stride, &q0, &q1, &q2, &q3); |
| 888 { |
| 889 const uint8x16_t mask = |
| 890 NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh); |
| 891 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); |
| 892 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2); |
| 893 Store4x16(p1, p0, p3, p2, p, stride); |
| 894 p1 = q2; |
| 895 p0 = q3; |
| 896 } |
| 897 } |
| 898 } |
| 899 #endif // !WORK_AROUND_GCC |
| 900 |
| 901 // 8-pixels wide variant, for chroma filtering |
| 902 static void VFilter8(uint8_t* u, uint8_t* v, int stride, |
| 903 int thresh, int ithresh, int hev_thresh) { |
| 904 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; |
| 905 Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); |
| 906 { |
| 907 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, |
| 908 ithresh, thresh); |
| 909 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); |
| 910 uint8x16_t op2, op1, op0, oq0, oq1, oq2; |
| 911 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, |
| 912 &op2, &op1, &op0, &oq0, &oq1, &oq2); |
| 913 Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride); |
| 914 Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride); |
| 915 Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride); |
| 916 } |
| 917 } |
| 918 static void VFilter8i(uint8_t* u, uint8_t* v, int stride, |
| 919 int thresh, int ithresh, int hev_thresh) { |
| 920 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; |
| 921 u += 4 * stride; |
| 922 v += 4 * stride; |
| 923 Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); |
| 924 { |
| 925 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, |
| 926 ithresh, thresh); |
| 927 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); |
| 928 uint8x16_t op1, op0, oq0, oq1; |
| 929 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); |
| 930 Store8x4x2(op1, op0, oq0, oq1, u, v, stride); |
| 931 } |
| 932 } |
| 933 |
| 934 #if !defined(WORK_AROUND_GCC) |
| 935 static void HFilter8(uint8_t* u, uint8_t* v, int stride, |
| 936 int thresh, int ithresh, int hev_thresh) { |
| 937 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; |
| 938 Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); |
| 939 { |
| 940 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, |
| 941 ithresh, thresh); |
| 942 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); |
| 943 uint8x16_t op2, op1, op0, oq0, oq1, oq2; |
| 944 DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask, |
| 945 &op2, &op1, &op0, &oq0, &oq1, &oq2); |
| 946 Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride); |
| 947 } |
| 948 } |
| 949 |
| 950 static void HFilter8i(uint8_t* u, uint8_t* v, int stride, |
| 951 int thresh, int ithresh, int hev_thresh) { |
| 952 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3; |
| 953 u += 4; |
| 954 v += 4; |
| 955 Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); |
| 956 { |
| 957 const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, |
| 958 ithresh, thresh); |
| 959 const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh); |
| 960 uint8x16_t op1, op0, oq0, oq1; |
| 961 DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1); |
| 962 Store4x8x2(op1, op0, oq0, oq1, u, v, stride); |
| 963 } |
| 964 } |
| 965 #endif // !WORK_AROUND_GCC |
| 966 |
| 967 //----------------------------------------------------------------------------- |
| 968 // Inverse transforms (Paragraph 14.4) |
| 969 |
| 970 // Technically these are unsigned but vqdmulh is only available in signed. |
| 971 // vqdmulh returns high half (effectively >> 16) but also doubles the value, |
| 972 // changing the >> 16 to >> 15 and requiring an additional >> 1. |
| 973 // We use this to our advantage with kC2. The canonical value is 35468. |
| 974 // However, the high bit is set so treating it as signed will give incorrect |
| 975 // results. We avoid this by down shifting by 1 here to clear the highest bit. |
| 976 // Combined with the doubling effect of vqdmulh we get >> 16. |
| 977 // This cannot be applied to kC1 because the lowest bit is set. Down-shifting
| 978 // the constant would reduce precision. |
| 979 |
| 980 // libwebp uses a trick to avoid some extra addition that libvpx does. |
| 981 // Instead of: |
| 982 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); |
| 983 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the |
| 984 // same issue with kC1 and vqdmulh that we work around by down-shifting kC2.
| 985 |
| 986 static const int16_t kC1 = 20091; |
| 987 static const int16_t kC2 = 17734; // half of the canonical kC2 (35468). See comment above.
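| // Example: vqdmulh(x, kC2) returns (2 * x * 17734) >> 16 == (x * 35468) >> 16,
| // i.e. the canonical multiply. kC1 = 20091 is odd and cannot be halved the
| // same way, hence the extra ">> 1" applied after vqdmulh in the transform
| // passes below.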
| 988 |
| 989 #if defined(USE_INTRINSICS) |
| 990 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, |
| 991 int16x8x2_t* const out) { |
| 992 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 |
| 993 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 |
| 994 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... |
| 995 // b0 d0 b1 d1 b2 d2 ... |
| 996 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); |
| 997 } |
| 998 |
| 999 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { |
| 1000 // {rows} = in0 | in4 |
| 1001 // in8 | in12 |
| 1002 // B1 = in4 | in12 |
| 1003 const int16x8_t B1 = |
| 1004 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1])); |
| 1005 // C0 = kC1 * in4 | kC1 * in12 |
| 1006 // C1 = kC2 * in4 | kC2 * in12 |
| 1007 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1); |
| 1008 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2); |
| 1009 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]), |
| 1010 vget_low_s16(rows->val[1])); // in0 + in8 |
| 1011 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]), |
| 1012 vget_low_s16(rows->val[1])); // in0 - in8 |
| 1013 // c = kC2 * in4 - kC1 * in12 |
| 1014 // d = kC1 * in4 + kC2 * in12 |
| 1015 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0)); |
| 1016 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1)); |
| 1017 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b |
| 1018 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c |
| 1019 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c |
| 1020 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c |
| 1021 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); |
| 1022 Transpose8x2(E0, E1, rows); |
| 1023 } |
| 1024 |
| 1025 static void TransformOne(const int16_t* in, uint8_t* dst) { |
| 1026 int16x8x2_t rows; |
| 1027 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); |
| 1028 TransformPass(&rows); |
| 1029 TransformPass(&rows); |
| 1030 Add4x4(rows.val[0], rows.val[1], dst); |
| 1031 } |
| 1032 |
| 1033 #else |
| 1034 |
| 1035 static void TransformOne(const int16_t* in, uint8_t* dst) { |
| 1036 const int kBPS = BPS; |
| 1037 // kC1, kC2. Padded because vld1.16 loads 8 bytes |
| 1038 const int16_t constants[4] = { kC1, kC2, 0, 0 }; |
| 1039 /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */ |
| 1040 __asm__ volatile ( |
| 1041 "vld1.16 {q1, q2}, [%[in]] \n" |
| 1042 "vld1.16 {d0}, [%[constants]] \n" |
| 1043 |
| 1044 /* d2: in[0] |
| 1045 * d3: in[8] |
| 1046 * d4: in[4] |
| 1047 * d5: in[12] |
| 1048 */ |
| 1049 "vswp d3, d4 \n" |
| 1050 |
| 1051 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16 |
| 1052 * q9 = {in[4], in[12]} * kC2 >> 16 |
| 1053 */ |
| 1054 "vqdmulh.s16 q8, q2, d0[0] \n" |
| 1055 "vqdmulh.s16 q9, q2, d0[1] \n" |
| 1056 |
| 1057 /* d22 = a = in[0] + in[8] |
| 1058 * d23 = b = in[0] - in[8] |
| 1059 */ |
| 1060 "vqadd.s16 d22, d2, d3 \n" |
| 1061 "vqsub.s16 d23, d2, d3 \n" |
| 1062 |
| 1063 /* The multiplication should be x * kC1 >> 16 |
| 1064 * However, with vqdmulh we get x * kC1 * 2 >> 16 |
| 1065 * (multiply, double, return high half) |
| 1066 * We avoided this in kC2 by pre-shifting the constant. |
| 1067 * q8 = in[4]/[12] * kC1 >> 16 |
| 1068 */ |
| 1069 "vshr.s16 q8, q8, #1 \n" |
| 1070 |
| 1071 /* Add {in[4], in[12]} back after the multiplication. This is handled by |
| 1072 * adding 1 << 16 to kC1 in the libwebp C code. |
| 1073 */ |
| 1074 "vqadd.s16 q8, q2, q8 \n" |
| 1075 |
| 1076 /* d20 = c = in[4]*kC2 - in[12]*kC1 |
| 1077 * d21 = d = in[4]*kC1 + in[12]*kC2 |
| 1078 */ |
| 1079 "vqsub.s16 d20, d18, d17 \n" |
| 1080 "vqadd.s16 d21, d19, d16 \n" |
| 1081 |
| 1082 /* d2 = tmp[0] = a + d |
| 1083 * d3 = tmp[1] = b + c |
| 1084 * d4 = tmp[2] = b - c |
| 1085 * d5 = tmp[3] = a - d |
| 1086 */ |
| 1087 "vqadd.s16 d2, d22, d21 \n" |
| 1088 "vqadd.s16 d3, d23, d20 \n" |
| 1089 "vqsub.s16 d4, d23, d20 \n" |
| 1090 "vqsub.s16 d5, d22, d21 \n" |
| 1091 |
| 1092 "vzip.16 q1, q2 \n" |
| 1093 "vzip.16 q1, q2 \n" |
| 1094 |
| 1095 "vswp d3, d4 \n" |
| 1096 |
| 1097 /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 |
| 1098 * q9 = {tmp[4], tmp[12]} * kC2 >> 16 |
| 1099 */ |
| 1100 "vqdmulh.s16 q8, q2, d0[0] \n" |
| 1101 "vqdmulh.s16 q9, q2, d0[1] \n" |
| 1102 |
| 1103 /* d22 = a = tmp[0] + tmp[8] |
| 1104 * d23 = b = tmp[0] - tmp[8] |
| 1105 */ |
| 1106 "vqadd.s16 d22, d2, d3 \n" |
| 1107 "vqsub.s16 d23, d2, d3 \n" |
| 1108 |
| 1109 /* See long winded explanations prior */ |
| 1110 "vshr.s16 q8, q8, #1 \n" |
| 1111 "vqadd.s16 q8, q2, q8 \n" |
| 1112 |
| 1113 /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
| 1114 * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
| 1115 */ |
| 1116 "vqsub.s16 d20, d18, d17 \n" |
| 1117 "vqadd.s16 d21, d19, d16 \n" |
| 1118 |
| 1119 /* d2 = tmp[0] = a + d |
| 1120 * d3 = tmp[1] = b + c |
| 1121 * d4 = tmp[2] = b - c |
| 1122 * d5 = tmp[3] = a - d |
| 1123 */ |
| 1124 "vqadd.s16 d2, d22, d21 \n" |
| 1125 "vqadd.s16 d3, d23, d20 \n" |
| 1126 "vqsub.s16 d4, d23, d20 \n" |
| 1127 "vqsub.s16 d5, d22, d21 \n" |
| 1128 |
| 1129 "vld1.32 d6[0], [%[dst]], %[kBPS] \n" |
| 1130 "vld1.32 d6[1], [%[dst]], %[kBPS] \n" |
| 1131 "vld1.32 d7[0], [%[dst]], %[kBPS] \n" |
| 1132 "vld1.32 d7[1], [%[dst]], %[kBPS] \n" |
| 1133 |
| 1134 "sub %[dst], %[dst], %[kBPS], lsl #2 \n" |
| 1135 |
| 1136 /* (val) + 4 >> 3 */ |
| 1137 "vrshr.s16 d2, d2, #3 \n" |
| 1138 "vrshr.s16 d3, d3, #3 \n" |
| 1139 "vrshr.s16 d4, d4, #3 \n" |
| 1140 "vrshr.s16 d5, d5, #3 \n" |
| 1141 |
| 1142 "vzip.16 q1, q2 \n" |
| 1143 "vzip.16 q1, q2 \n" |
| 1144 |
| 1145 /* Must accumulate before saturating */ |
| 1146 "vmovl.u8 q8, d6 \n" |
| 1147 "vmovl.u8 q9, d7 \n" |
| 1148 |
| 1149 "vqadd.s16 q1, q1, q8 \n" |
| 1150 "vqadd.s16 q2, q2, q9 \n" |
| 1151 |
| 1152 "vqmovun.s16 d0, q1 \n" |
| 1153 "vqmovun.s16 d1, q2 \n" |
| 1154 |
| 1155 "vst1.32 d0[0], [%[dst]], %[kBPS] \n" |
| 1156 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" |
| 1157 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" |
| 1158 "vst1.32 d1[1], [%[dst]] \n" |
| 1159 |
| 1160 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ |
| 1161 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ |
| 1162 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ |
| 1163 ); |
| 1164 } |
| 1165 |
| 1166 #endif // USE_INTRINSICS |
| 1167 |
| 1168 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { |
| 1169 TransformOne(in, dst); |
| 1170 if (do_two) { |
| 1171 TransformOne(in + 16, dst + 4); |
| 1172 } |
| 1173 } |
| 1174 |
| 1175 static void TransformDC(const int16_t* in, uint8_t* dst) { |
| 1176 const int16x8_t DC = vdupq_n_s16(in[0]); |
| 1177 Add4x4(DC, DC, dst); |
| 1178 } |
| 1179 |
| 1180 //------------------------------------------------------------------------------ |
| 1181 |
| 1182 #define STORE_WHT(dst, col, rows) do { \ |
| 1183 *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \ |
| 1184 *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \ |
| 1185 *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \ |
| 1186 *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \ |
| 1187 } while (0) |
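| // Each STORE_WHT() call scatters one column of the result to every 16th
| // coefficient, i.e. into the DC slot of four consecutive 4x4 blocks.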
| 1188 |
| 1189 static void TransformWHT(const int16_t* in, int16_t* out) { |
| 1190 int32x4x4_t tmp; |
| 1191 |
| 1192 { |
| 1193 // Load the source. |
| 1194 const int16x4_t in00_03 = vld1_s16(in + 0); |
| 1195 const int16x4_t in04_07 = vld1_s16(in + 4); |
| 1196 const int16x4_t in08_11 = vld1_s16(in + 8); |
| 1197 const int16x4_t in12_15 = vld1_s16(in + 12); |
| 1198 const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15] |
| 1199 const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11] |
| 1200 const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11] |
| 1201 const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15] |
| 1202 tmp.val[0] = vaddq_s32(a0, a1); |
| 1203 tmp.val[1] = vaddq_s32(a3, a2); |
| 1204 tmp.val[2] = vsubq_s32(a0, a1); |
| 1205 tmp.val[3] = vsubq_s32(a3, a2); |
| 1206 // Arrange the temporary results column-wise. |
| 1207 tmp = Transpose4x4(tmp); |
| 1208 } |
| 1209 |
| 1210 { |
| 1211 const int32x4_t kCst3 = vdupq_n_s32(3); |
| 1212 const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder |
| 1213 const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]); |
| 1214 const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]); |
| 1215 const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]); |
| 1216 const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]); |
| 1217 |
| 1218 tmp.val[0] = vaddq_s32(a0, a1); |
| 1219 tmp.val[1] = vaddq_s32(a3, a2); |
| 1220 tmp.val[2] = vsubq_s32(a0, a1); |
| 1221 tmp.val[3] = vsubq_s32(a3, a2); |
| 1222 |
| 1223 // right shift the results by 3. |
| 1224 tmp.val[0] = vshrq_n_s32(tmp.val[0], 3); |
| 1225 tmp.val[1] = vshrq_n_s32(tmp.val[1], 3); |
| 1226 tmp.val[2] = vshrq_n_s32(tmp.val[2], 3); |
| 1227 tmp.val[3] = vshrq_n_s32(tmp.val[3], 3); |
| 1228 |
| 1229 STORE_WHT(out, 0, tmp); |
| 1230 STORE_WHT(out, 1, tmp); |
| 1231 STORE_WHT(out, 2, tmp); |
| 1232 STORE_WHT(out, 3, tmp); |
| 1233 } |
| 1234 } |
| 1235 |
| 1236 #undef STORE_WHT |
| 1237 |
| 1238 //------------------------------------------------------------------------------ |
| 1239 |
| 1240 #define MUL(a, b) (((a) * (b)) >> 16) |
| 1241 static void TransformAC3(const int16_t* in, uint8_t* dst) { |
| 1242 static const int kC1_full = 20091 + (1 << 16); |
| 1243 static const int kC2_full = 35468; |
| 1244 const int16x4_t A = vdup_n_s16(in[0]); |
| 1245 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full)); |
| 1246 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full)); |
| 1247 const int c1 = MUL(in[1], kC2_full); |
| 1248 const int d1 = MUL(in[1], kC1_full); |
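|   // Pack {d1, c1, -c1, -d1} into the four 16-bit lanes of CD below.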
| 1249 const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 | |
| 1250 (uint64_t)( c1 & 0xffff) << 16 | |
| 1251 (uint64_t)(-c1 & 0xffff) << 32 | |
| 1252 (uint64_t)(-d1 & 0xffff) << 48; |
| 1253 const int16x4_t CD = vcreate_s16(cd); |
| 1254 const int16x4_t B = vqadd_s16(A, CD); |
| 1255 const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4)); |
| 1256 const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4)); |
| 1257 Add4x4(m0_m1, m2_m3, dst); |
| 1258 } |
| 1259 #undef MUL |
| 1260 |
| 1261 #endif // WEBP_USE_NEON |
| 1262 |
| 1263 //------------------------------------------------------------------------------ |
| 1264 // Entry point |
| 1265 |
| 1266 extern void VP8DspInitNEON(void); |
| 1267 |
| 1268 void VP8DspInitNEON(void) { |
| 1269 #if defined(WEBP_USE_NEON) |
| 1270 VP8Transform = TransformTwo; |
| 1271 VP8TransformAC3 = TransformAC3; |
| 1272 VP8TransformDC = TransformDC; |
| 1273 VP8TransformWHT = TransformWHT; |
| 1274 |
| 1275 VP8VFilter16 = VFilter16; |
| 1276 VP8VFilter16i = VFilter16i; |
| 1277 VP8HFilter16 = HFilter16; |
| 1278 #if !defined(WORK_AROUND_GCC) |
| 1279 VP8HFilter16i = HFilter16i; |
| 1280 #endif |
| 1281 VP8VFilter8 = VFilter8; |
| 1282 VP8VFilter8i = VFilter8i; |
| 1283 #if !defined(WORK_AROUND_GCC) |
| 1284 VP8HFilter8 = HFilter8; |
| 1285 VP8HFilter8i = HFilter8i; |
| 1286 #endif |
| 1287 VP8SimpleVFilter16 = SimpleVFilter16; |
| 1288 VP8SimpleHFilter16 = SimpleHFilter16; |
| 1289 VP8SimpleVFilter16i = SimpleVFilter16i; |
| 1290 VP8SimpleHFilter16i = SimpleHFilter16i; |
| 1291 #endif // WEBP_USE_NEON |
| 1292 } |