OLD | NEW |
| 1 // Copyright 2012 Google Inc. All Rights Reserved. |
| 2 // |
| 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- |
| 9 // |
| 10 // ARM NEON version of speed-critical encoding functions. |
| 11 // |
| 12 // adapted from libvpx (http://www.webmproject.org/code/) |
| 13 |
| 14 #include "./dsp.h" |
| 15 |
| 16 #if defined(WEBP_USE_NEON) |
| 17 |
| 18 #include <assert.h> |
| 19 |
| 20 #include "./neon.h" |
| 21 #include "../enc/vp8enci.h" |
| 22 |
| 23 //------------------------------------------------------------------------------ |
| 24 // Transforms (Paragraph 14.4) |
| 25 |
| 26 // Inverse transform. |
| 27 // This code is pretty much the same as TransformOne in dec_neon.c, except that |
| 28 // the reconstruction reads from *ref. See the comments there for algorithmic explanations. |
| 29 |
| 30 static const int16_t kC1 = 20091; |
| 31 static const int16_t kC2 = 17734; // half of the real constant (35468), because vqdmulh doubles the product. See dec_neon.c. |
| 32 |
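Note (reviewer addition, not part of the patch): 20091 and 17734 are the VP8 inverse-transform multipliers in Q16 fixed point, the second stored halved because vqdmulh doubles its product. In scalar terms, the multiplies used below amount to roughly the following sketch (names are illustrative):

    /* Illustrative helpers only, mirroring the portable decoder math. */
    static int MulC1(int x) { return x + ((x * 20091) >> 16); }  /* ~ x * sqrt(2)*cos(pi/8) */
    static int MulC2(int x) { return (x * 35468) >> 16; }        /* ~ x * sqrt(2)*sin(pi/8) */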
| 33 // This code works but is *slower* than the inline-asm version below |
| 34 // (with gcc-4.6), so it is disabled for now. Later, it will be conditional on |
| 35 // the USE_INTRINSICS define. |
| 36 // With gcc-4.8, it is a little faster than the inline assembly. |
| 37 #if defined(USE_INTRINSICS) |
| 38 |
| 39 // Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t. |
| 40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { |
| 41 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); |
| 42 } |
| 43 |
| 44 // Performs unsigned 8b saturation on 'dst01' and 'dst23', storing the result |
| 45 // in the corresponding rows of 'dst'. |
| 46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, |
| 47 const int16x8_t dst01, |
| 48 const int16x8_t dst23) { |
| 49 // Unsigned saturate to 8b. |
| 50 const uint8x8_t dst01_u8 = vqmovun_s16(dst01); |
| 51 const uint8x8_t dst23_u8 = vqmovun_s16(dst23); |
| 52 |
| 53 // Store the results. |
| 54 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0); |
| 55 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1); |
| 56 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0); |
| 57 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); |
| 58 } |
| 59 |
| 60 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, |
| 61 const uint8_t* const ref, uint8_t* const dst) { |
| 62 uint32x2_t dst01 = vdup_n_u32(0); |
| 63 uint32x2_t dst23 = vdup_n_u32(0); |
| 64 |
| 65 // Load the reference pixels. |
| 66 dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0); |
| 67 dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0); |
| 68 dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1); |
| 69 dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1); |
| 70 |
| 71 { |
| 72 // Convert to 16b. |
| 73 const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); |
| 74 const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); |
| 75 |
| 76 // Descale with rounding and add to the reference: ref + ((row + 4) >> 3). |
| 77 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); |
| 78 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); |
| 79 // Saturate to 8b and store the result. |
| 80 SaturateAndStore4x4(dst, out01, out23); |
| 81 } |
| 82 } |
| 83 |
| 84 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, |
| 85 int16x8x2_t* const out) { |
| 86 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 |
| 87 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 |
| 88 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... |
| 89 // b0 d0 b1 d1 b2 d2 ... |
| 90 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); |
| 91 } |
| 92 |
| 93 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { |
| 94 // {rows} = in0 | in4 |
| 95 // in8 | in12 |
| 96 // B1 = in4 | in12 |
| 97 const int16x8_t B1 = |
| 98 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1])); |
| 99 // C0 = kC1 * in4 | kC1 * in12 |
| 100 // C1 = kC2 * in4 | kC2 * in12 |
| 101 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1); |
| 102 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2); |
| 103 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]), |
| 104 vget_low_s16(rows->val[1])); // in0 + in8 |
| 105 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]), |
| 106 vget_low_s16(rows->val[1])); // in0 - in8 |
| 107 // c = kC2 * in4 - kC1 * in12 |
| 108 // d = kC1 * in4 + kC2 * in12 |
| 109 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0)); |
| 110 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1)); |
| 111 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b |
| 112 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c |
| 113 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c |
| 114 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c |
| 115 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); |
| 116 Transpose8x2(E0, E1, rows); |
| 117 } |
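As a reading aid (reviewer sketch, not part of the patch), one TransformPass() computes the following per column, with MulC1/MulC2 as in the note above; Transpose8x2() provides the transpose that lets the second pass reuse the same code:

    /* Scalar sketch of one pass; illustrative only. */
    static void TransformPassRef(const int16_t* in, int16_t* tmp) {
      int i;
      for (i = 0; i < 4; ++i, ++in, tmp += 4) {    /* one column at a time */
        const int a = in[0] + in[8];
        const int b = in[0] - in[8];
        const int c = MulC2(in[4]) - MulC1(in[12]);
        const int d = MulC1(in[4]) + MulC2(in[12]);
        tmp[0] = (int16_t)(a + d);                 /* output comes out transposed */
        tmp[1] = (int16_t)(b + c);
        tmp[2] = (int16_t)(b - c);
        tmp[3] = (int16_t)(a - d);
      }
    }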
| 118 |
| 119 static void ITransformOne(const uint8_t* ref, |
| 120 const int16_t* in, uint8_t* dst) { |
| 121 int16x8x2_t rows; |
| 122 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); |
| 123 TransformPass(&rows); |
| 124 TransformPass(&rows); |
| 125 Add4x4(rows.val[0], rows.val[1], ref, dst); |
| 126 } |
| 127 |
| 128 #else |
| 129 |
| 130 static void ITransformOne(const uint8_t* ref, |
| 131 const int16_t* in, uint8_t* dst) { |
| 132 const int kBPS = BPS; |
| 133 const int16_t kC1C2[] = { kC1, kC2, 0, 0 }; |
| 134 |
| 135 __asm__ volatile ( |
| 136 "vld1.16 {q1, q2}, [%[in]] \n" |
| 137 "vld1.16 {d0}, [%[kC1C2]] \n" |
| 138 |
| 139 // d2: in[0] |
| 140 // d3: in[8] |
| 141 // d4: in[4] |
| 142 // d5: in[12] |
| 143 "vswp d3, d4 \n" |
| 144 |
| 145 // q8 = {in[4], in[12]} * kC1 * 2 >> 16 |
| 146 // q9 = {in[4], in[12]} * kC2 >> 16 |
| 147 "vqdmulh.s16 q8, q2, d0[0] \n" |
| 148 "vqdmulh.s16 q9, q2, d0[1] \n" |
| 149 |
| 150 // d22 = a = in[0] + in[8] |
| 151 // d23 = b = in[0] - in[8] |
| 152 "vqadd.s16 d22, d2, d3 \n" |
| 153 "vqsub.s16 d23, d2, d3 \n" |
| 154 |
| 155 // q8 = in[4]/[12] * kC1 >> 16 |
| 156 "vshr.s16 q8, q8, #1 \n" |
| 157 |
| 158 // Add {in[4], in[12]} back after the multiplication. |
| 159 "vqadd.s16 q8, q2, q8 \n" |
| 160 |
| 161 // d20 = c = in[4]*kC2 - in[12]*kC1 |
| 162 // d21 = d = in[4]*kC1 + in[12]*kC2 |
| 163 "vqsub.s16 d20, d18, d17 \n" |
| 164 "vqadd.s16 d21, d19, d16 \n" |
| 165 |
| 166 // d2 = tmp[0] = a + d |
| 167 // d3 = tmp[1] = b + c |
| 168 // d4 = tmp[2] = b - c |
| 169 // d5 = tmp[3] = a - d |
| 170 "vqadd.s16 d2, d22, d21 \n" |
| 171 "vqadd.s16 d3, d23, d20 \n" |
| 172 "vqsub.s16 d4, d23, d20 \n" |
| 173 "vqsub.s16 d5, d22, d21 \n" |
| 174 |
| 175 "vzip.16 q1, q2 \n" |
| 176 "vzip.16 q1, q2 \n" |
| 177 |
| 178 "vswp d3, d4 \n" |
| 179 |
| 180 // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 |
| 181 // q9 = {tmp[4], tmp[12]} * kC2 >> 16 |
| 182 "vqdmulh.s16 q8, q2, d0[0] \n" |
| 183 "vqdmulh.s16 q9, q2, d0[1] \n" |
| 184 |
| 185 // d22 = a = tmp[0] + tmp[8] |
| 186 // d23 = b = tmp[0] - tmp[8] |
| 187 "vqadd.s16 d22, d2, d3 \n" |
| 188 "vqsub.s16 d23, d2, d3 \n" |
| 189 |
| 190 "vshr.s16 q8, q8, #1 \n" |
| 191 "vqadd.s16 q8, q2, q8 \n" |
| 192 |
| 193 // d20 = c = tmp[4]*kC2 - tmp[12]*kC1 |
| 194 // d21 = d = tmp[4]*kC1 + tmp[12]*kC2 |
| 195 "vqsub.s16 d20, d18, d17 \n" |
| 196 "vqadd.s16 d21, d19, d16 \n" |
| 197 |
| 198 // d2 = tmp[0] = a + d |
| 199 // d3 = tmp[1] = b + c |
| 200 // d4 = tmp[2] = b - c |
| 201 // d5 = tmp[3] = a - d |
| 202 "vqadd.s16 d2, d22, d21 \n" |
| 203 "vqadd.s16 d3, d23, d20 \n" |
| 204 "vqsub.s16 d4, d23, d20 \n" |
| 205 "vqsub.s16 d5, d22, d21 \n" |
| 206 |
| 207 "vld1.32 d6[0], [%[ref]], %[kBPS] \n" |
| 208 "vld1.32 d6[1], [%[ref]], %[kBPS] \n" |
| 209 "vld1.32 d7[0], [%[ref]], %[kBPS] \n" |
| 210 "vld1.32 d7[1], [%[ref]], %[kBPS] \n" |
| 211 |
| 212 "sub %[ref], %[ref], %[kBPS], lsl #2 \n" |
| 213 |
| 214 // ((val) + 4) >> 3 |
| 215 "vrshr.s16 d2, d2, #3 \n" |
| 216 "vrshr.s16 d3, d3, #3 \n" |
| 217 "vrshr.s16 d4, d4, #3 \n" |
| 218 "vrshr.s16 d5, d5, #3 \n" |
| 219 |
| 220 "vzip.16 q1, q2 \n" |
| 221 "vzip.16 q1, q2 \n" |
| 222 |
| 223 // Must accumulate before saturating |
| 224 "vmovl.u8 q8, d6 \n" |
| 225 "vmovl.u8 q9, d7 \n" |
| 226 |
| 227 "vqadd.s16 q1, q1, q8 \n" |
| 228 "vqadd.s16 q2, q2, q9 \n" |
| 229 |
| 230 "vqmovun.s16 d0, q1 \n" |
| 231 "vqmovun.s16 d1, q2 \n" |
| 232 |
| 233 "vst1.32 d0[0], [%[dst]], %[kBPS] \n" |
| 234 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" |
| 235 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" |
| 236 "vst1.32 d1[1], [%[dst]] \n" |
| 237 |
| 238 : [in] "+r"(in), [dst] "+r"(dst) // modified registers |
| 239 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants |
| 240 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered |
| 241 ); |
| 242 } |
| 243 |
| 244 #endif // USE_INTRINSICS |
| 245 |
| 246 static void ITransform(const uint8_t* ref, |
| 247 const int16_t* in, uint8_t* dst, int do_two) { |
| 248 ITransformOne(ref, in, dst); |
| 249 if (do_two) { |
| 250 ITransformOne(ref + 4, in + 16, dst + 4); |
| 251 } |
| 252 } |
| 253 |
| 254 // Load all 4x4 pixels into a single uint8x16_t variable. |
| 255 static uint8x16_t Load4x4(const uint8_t* src) { |
| 256 uint32x4_t out = vdupq_n_u32(0); |
| 257 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); |
| 258 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); |
| 259 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); |
| 260 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); |
| 261 return vreinterpretq_u8_u32(out); |
| 262 } |
| 263 |
| 264 // Forward transform. |
| 265 |
| 266 #if defined(USE_INTRINSICS) |
| 267 |
| 268 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, |
| 269 const int16x4_t C, const int16x4_t D, |
| 270 int16x8_t* const out01, |
| 271 int16x8_t* const out32) { |
| 272 const int16x4x2_t AB = vtrn_s16(A, B); |
| 273 const int16x4x2_t CD = vtrn_s16(C, D); |
| 274 const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]), |
| 275 vreinterpret_s32_s16(CD.val[0])); |
| 276 const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]), |
| 277 vreinterpret_s32_s16(CD.val[1])); |
| 278 *out01 = vreinterpretq_s16_s64( |
| 279 vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]), |
| 280 vreinterpret_s64_s32(tmp13.val[0]))); |
| 281 *out32 = vreinterpretq_s16_s64( |
| 282 vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]), |
| 283 vreinterpret_s64_s32(tmp02.val[1]))); |
| 284 } |
| 285 |
| 286 static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a, |
| 287 const uint8x8_t b) { |
| 288 return vreinterpretq_s16_u16(vsubl_u8(a, b)); |
| 289 } |
| 290 |
| 291 static void FTransform(const uint8_t* src, const uint8_t* ref, |
| 292 int16_t* out) { |
| 293 int16x8_t d0d1, d3d2; // working 4x4 int16 variables |
| 294 { |
| 295 const uint8x16_t S0 = Load4x4(src); |
| 296 const uint8x16_t R0 = Load4x4(ref); |
| 297 const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0)); |
| 298 const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0)); |
| 299 const int16x4_t D0 = vget_low_s16(D0D1); |
| 300 const int16x4_t D1 = vget_high_s16(D0D1); |
| 301 const int16x4_t D2 = vget_low_s16(D2D3); |
| 302 const int16x4_t D3 = vget_high_s16(D2D3); |
| 303 Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2); |
| 304 } |
| 305 { // 1st pass |
| 306 const int32x4_t kCst937 = vdupq_n_s32(937); |
| 307 const int32x4_t kCst1812 = vdupq_n_s32(1812); |
| 308 const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1) |
| 309 const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2) |
| 310 const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3); |
| 311 const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2), |
| 312 vget_high_s16(a0a1_2)); |
| 313 const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2), |
| 314 vget_high_s16(a0a1_2)); |
| 315 const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217); |
| 316 const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217); |
| 317 const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352); |
| 318 const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352); |
| 319 const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9); |
| 320 const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9); |
| 321 Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2); |
| 322 } |
| 323 { // 2nd pass |
| 324 // the (1<<16) in the bias implements '+ (a3 != 0)' as 1 - (a3 == 0): the vceq mask (0 or -1) is added back below |
| 325 const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16)); |
| 326 const int32x4_t kCst51000 = vdupq_n_s32(51000); |
| 327 const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1) |
| 328 const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2) |
| 329 const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7)); |
| 330 const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4); |
| 331 const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4); |
| 332 const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217); |
| 333 const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217); |
| 334 const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352); |
| 335 const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352); |
| 336 const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000); |
| 337 const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000); |
| 338 const int16x4_t a3_eq_0 = |
| 339 vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0))); |
| 340 const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0); |
| 341 vst1_s16(out + 0, out0); |
| 342 vst1_s16(out + 4, out1); |
| 343 vst1_s16(out + 8, out2); |
| 344 vst1_s16(out + 12, out3); |
| 345 } |
| 346 } |
| 347 |
| 348 #else |
| 349 |
| 350 // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm |
| 351 static const int16_t kCoeff16[] = { |
| 352 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217 |
| 353 }; |
| 354 static const int32_t kCoeff32[] = { |
| 355 1812, 1812, 1812, 1812, |
| 356 937, 937, 937, 937, |
| 357 12000, 12000, 12000, 12000, |
| 358 51000, 51000, 51000, 51000 |
| 359 }; |
| 360 |
| 361 static void FTransform(const uint8_t* src, const uint8_t* ref, |
| 362 int16_t* out) { |
| 363 const int kBPS = BPS; |
| 364 const uint8_t* src_ptr = src; |
| 365 const uint8_t* ref_ptr = ref; |
| 366 const int16_t* coeff16 = kCoeff16; |
| 367 const int32_t* coeff32 = kCoeff32; |
| 368 |
| 369 __asm__ volatile ( |
| 370 // load src into q4, q5 in high half |
| 371 "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n" |
| 372 "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n" |
| 373 "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n" |
| 374 "vld1.8 {d11}, [%[src_ptr]] \n" |
| 375 |
| 376 // load ref into q6, q7 in high half |
| 377 "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n" |
| 378 "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n" |
| 379 "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n" |
| 380 "vld1.8 {d15}, [%[ref_ptr]] \n" |
| 381 |
| 382 // Pack the high values into q4 and q6 |
| 383 "vtrn.32 q4, q5 \n" |
| 384 "vtrn.32 q6, q7 \n" |
| 385 |
| 386 // d[0-3] = src - ref |
| 387 "vsubl.u8 q0, d8, d12 \n" |
| 388 "vsubl.u8 q1, d9, d13 \n" |
| 389 |
| 390 // load coeff16 into q8(d16=5352, d17=2217) |
| 391 "vld1.16 {q8}, [%[coeff16]] \n" |
| 392 |
| 393 // load the first half of coeff32 into q9 = 1812, q10 = 937 |
| 394 "vld1.32 {q9, q10}, [%[coeff32]]! \n" |
| 395 |
| 396 // load the second half of coeff32 into q11 = 12000, q12 = 51000 |
| 397 "vld1.32 {q11,q12}, [%[coeff32]] \n" |
| 398 |
| 399 // part 1 |
| 400 // Transpose. Register dN is the same as dN in C |
| 401 "vtrn.32 d0, d2 \n" |
| 402 "vtrn.32 d1, d3 \n" |
| 403 "vtrn.16 d0, d1 \n" |
| 404 "vtrn.16 d2, d3 \n" |
| 405 |
| 406 "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3 |
| 407 "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2 |
| 408 "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2 |
| 409 "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3 |
| 410 |
| 411 "vadd.s16 d0, d4, d5 \n" // a0 + a1 |
| 412 "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3 |
| 413 "vsub.s16 d2, d4, d5 \n" // a0 - a1 |
| 414 "vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3 |
| 415 |
| 416 "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812 |
| 417 "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937 |
| 418 "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812 |
| 419 "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352 |
| 420 |
| 421 // temp[1+i*4] = (a2*2217 + a3*5352 + 1812) >> 9 |
| 422 // temp[3+i*4] = (a3*2217 + 937 - a2*5352) >> 9 |
| 423 "vshrn.s32 d1, q9, #9 \n" |
| 424 "vshrn.s32 d3, q10, #9 \n" |
| 425 |
| 426 // part 2 |
| 427 // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] |
| 428 "vtrn.32 d0, d2 \n" |
| 429 "vtrn.32 d1, d3 \n" |
| 430 "vtrn.16 d0, d1 \n" |
| 431 "vtrn.16 d2, d3 \n" |
| 432 |
| 433 "vmov.s16 d26, #7 \n" |
| 434 |
| 435 "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12] |
| 436 "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8] |
| 437 "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8] |
| 438 "vadd.s16 d4, d4, d26 \n" // a1 + 7 |
| 439 "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12] |
| 440 |
| 441 "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7 |
| 442 "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7 |
| 443 |
| 444 "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000 |
| 445 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 |
| 446 |
| 447 "vceq.s16 d4, d7, #0 \n" |
| 448 |
| 449 "vshr.s16 d0, d0, #4 \n" |
| 450 "vshr.s16 d2, d2, #4 \n" |
| 451 |
| 452 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 |
| 453 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 |
| 454 |
| 455 "vmvn d4, d4 \n" // !(d1 == 0) |
| 456 // op[4] = (c1*2217 + d1*5352 + 12000)>>16 |
| 457 "vshrn.s32 d1, q11, #16 \n" |
| 458 // op[4] += (d1!=0) |
| 459 "vsub.s16 d1, d1, d4 \n" |
| 460 // op[12]= (d1*2217 - c1*5352 + 51000)>>16 |
| 461 "vshrn.s32 d3, q12, #16 \n" |
| 462 |
| 463 // store the result in the 'out' array |
| 464 "vst1.16 {q0, q1}, [%[out]] \n" |
| 465 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), |
| 466 [coeff32] "+r"(coeff32) // modified registers |
| 467 : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16), |
| 468 [out] "r"(out) // constants |
| 469 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", |
| 470 "q10", "q11", "q12", "q13" // clobbered |
| 471 ); |
| 472 } |
| 473 |
| 474 #endif |
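For reference while reviewing either FTransform path above, the per-4x4-block computation is, paraphrasing the portable encoder code (sketch only; the constants match the 5352/2217/1812/937/12000/51000 values used in both kernels):

    /* Paraphrased scalar reference; for review only, not part of this file. */
    static void FTransformRef(const uint8_t* src, const uint8_t* ref, int16_t* out) {
      int i;
      int tmp[16];
      for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {     /* horizontal pass */
        const int d0 = src[0] - ref[0];
        const int d1 = src[1] - ref[1];
        const int d2 = src[2] - ref[2];
        const int d3 = src[3] - ref[3];
        const int a0 = d0 + d3, a1 = d1 + d2;
        const int a2 = d1 - d2, a3 = d0 - d3;
        tmp[0 + i * 4] = (a0 + a1) * 8;
        tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
        tmp[2 + i * 4] = (a0 - a1) * 8;
        tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
      }
      for (i = 0; i < 4; ++i) {                             /* vertical pass */
        const int a0 = tmp[0 + i] + tmp[12 + i];
        const int a1 = tmp[4 + i] + tmp[ 8 + i];
        const int a2 = tmp[4 + i] - tmp[ 8 + i];
        const int a3 = tmp[0 + i] - tmp[12 + i];
        out[0 + i]  = (a0 + a1 + 7) >> 4;
        out[4 + i]  = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
        out[8 + i]  = (a0 - a1 + 7) >> 4;
        out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
      }
    }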
| 475 |
| 476 #define LOAD_LANE_16b(VALUE, LANE) do { \ |
| 477 (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \ |
| 478 src += stride; \ |
| 479 } while (0) |
| 480 |
| 481 static void FTransformWHT(const int16_t* src, int16_t* out) { |
| 482 const int stride = 16; |
| 483 const int16x4_t zero = vdup_n_s16(0); |
| 484 int32x4x4_t tmp0; |
| 485 int16x4x4_t in; |
| 486 INIT_VECTOR4(in, zero, zero, zero, zero); |
| 487 LOAD_LANE_16b(in.val[0], 0); |
| 488 LOAD_LANE_16b(in.val[1], 0); |
| 489 LOAD_LANE_16b(in.val[2], 0); |
| 490 LOAD_LANE_16b(in.val[3], 0); |
| 491 LOAD_LANE_16b(in.val[0], 1); |
| 492 LOAD_LANE_16b(in.val[1], 1); |
| 493 LOAD_LANE_16b(in.val[2], 1); |
| 494 LOAD_LANE_16b(in.val[3], 1); |
| 495 LOAD_LANE_16b(in.val[0], 2); |
| 496 LOAD_LANE_16b(in.val[1], 2); |
| 497 LOAD_LANE_16b(in.val[2], 2); |
| 498 LOAD_LANE_16b(in.val[3], 2); |
| 499 LOAD_LANE_16b(in.val[0], 3); |
| 500 LOAD_LANE_16b(in.val[1], 3); |
| 501 LOAD_LANE_16b(in.val[2], 3); |
| 502 LOAD_LANE_16b(in.val[3], 3); |
| 503 |
| 504 { |
| 505 // a0 = in[0 * 16] + in[2 * 16] |
| 506 // a1 = in[1 * 16] + in[3 * 16] |
| 507 // a2 = in[1 * 16] - in[3 * 16] |
| 508 // a3 = in[0 * 16] - in[2 * 16] |
| 509 const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]); |
| 510 const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]); |
| 511 const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]); |
| 512 const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]); |
| 513 tmp0.val[0] = vaddq_s32(a0, a1); |
| 514 tmp0.val[1] = vaddq_s32(a3, a2); |
| 515 tmp0.val[2] = vsubq_s32(a3, a2); |
| 516 tmp0.val[3] = vsubq_s32(a0, a1); |
| 517 } |
| 518 { |
| 519 const int32x4x4_t tmp1 = Transpose4x4(tmp0); |
| 520 // a0 = tmp[0 + i] + tmp[ 8 + i] |
| 521 // a1 = tmp[4 + i] + tmp[12 + i] |
| 522 // a2 = tmp[4 + i] - tmp[12 + i] |
| 523 // a3 = tmp[0 + i] - tmp[ 8 + i] |
| 524 const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]); |
| 525 const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]); |
| 526 const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]); |
| 527 const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]); |
| 528 const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1 |
| 529 const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1 |
| 530 const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1 |
| 531 const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1 |
| 532 const int16x4_t out0 = vmovn_s32(b0); |
| 533 const int16x4_t out1 = vmovn_s32(b1); |
| 534 const int16x4_t out2 = vmovn_s32(b2); |
| 535 const int16x4_t out3 = vmovn_s32(b3); |
| 536 |
| 537 vst1_s16(out + 0, out0); |
| 538 vst1_s16(out + 4, out1); |
| 539 vst1_s16(out + 8, out2); |
| 540 vst1_s16(out + 12, out3); |
| 541 } |
| 542 } |
| 543 #undef LOAD_LANE_16b |
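Reviewer note on the loads above (an assumption drawn from how 'src' advances): after the sixteen LOAD_LANE_16b() calls, lane j of in.val[k] holds src[16 * k + 64 * j], presumably the per-block DC coefficients stored 16 apart, so the arithmetic matches a scalar version reading in[0 * 16] .. in[3 * 16] with the row pointer stepping by 64. In plain C the load pattern is:

    /* Sketch of the load pattern only. */
    int16_t lanes[4][4];            /* lanes[k][j] mirrors lane j of in.val[k] */
    int j, k;
    for (j = 0; j < 4; ++j) {
      for (k = 0; k < 4; ++k) {
        lanes[k][j] = src[16 * k + 64 * j];
      }
    }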
| 544 |
| 545 //------------------------------------------------------------------------------ |
| 546 // Texture distortion |
| 547 // |
| 548 // We try to match the spectral content (weighted) between source and |
| 549 // reconstructed samples. |
| 550 |
| 551 // This code works but is *slower* than the inline-asm version below |
| 552 // (with gcc-4.6), so it is disabled for now. Later, it will be conditional on |
| 553 // the USE_INTRINSICS define. |
| 554 // With gcc-4.8, it is only slightly slower than the inline assembly. |
| 555 #if defined(USE_INTRINSICS) |
| 556 |
| 557 // Zero-extends a uint16x4_t 'v' to an int32x4_t. |
| 558 static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) { |
| 559 return vreinterpretq_s32_u32(vmovl_u16(v)); |
| 560 } |
| 561 |
| 562 // Does a regular 4x4 transpose followed by a swap of the upper halves of the |
| 563 // two inner rows, to restore the source order of the differences, |
| 564 // i.e., a0 - a1 | a3 - a2. |
| 565 static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) { |
| 566 int32x4x4_t out = Transpose4x4(rows); |
| 567 // restore source order in the columns containing differences. |
| 568 const int32x2_t r1h = vget_high_s32(out.val[1]); |
| 569 const int32x2_t r2h = vget_high_s32(out.val[2]); |
| 570 out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h); |
| 571 out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h); |
| 572 return out; |
| 573 } |
| 574 |
| 575 static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1, |
| 576 const uint8x8_t r2r3) { |
| 577 // a0 = in[0] + in[2] | a1 = in[1] + in[3] |
| 578 const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3); |
| 579 // a3 = in[0] - in[2] | a2 = in[1] - in[3] |
| 580 const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3); |
| 581 const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1)); // a0 + a1 |
| 582 const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2)); // a3 + a2 |
| 583 // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations. |
| 584 // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 |
| 585 // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 |
| 586 const int16x8x2_t transpose = |
| 587 vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2)); |
| 588 // tmp[3] = a0 - a1 | tmp[2] = a3 - a2 |
| 589 const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]), |
| 590 vget_low_s16(transpose.val[1])); |
| 591 const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]), |
| 592 vget_high_s16(transpose.val[1])); |
| 593 // [0]: tmp[3] [1]: tmp[2] |
| 594 const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2); |
| 595 const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } }; |
| 596 return res; |
| 597 } |
| 598 |
| 599 static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) { |
| 600 // a0 = tmp[0 + i] + tmp[8 + i]; |
| 601 const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]); |
| 602 // a1 = tmp[4 + i] + tmp[12+ i]; |
| 603 const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]); |
| 604 // a2 = tmp[4 + i] - tmp[12+ i]; |
| 605 const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]); |
| 606 // a3 = tmp[0 + i] - tmp[8 + i]; |
| 607 const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]); |
| 608 const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1)); // abs(a0 + a1) |
| 609 const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2)); // abs(a3 + a2) |
| 610 const int32x4_t b2 = vabdq_s32(a3, a2); // abs(a3 - a2) |
| 611 const int32x4_t b3 = vabdq_s32(a0, a1); // abs(a0 - a1) |
| 612 const int32x4x4_t res = { { b0, b1, b2, b3 } }; |
| 613 return res; |
| 614 } |
| 615 |
| 616 // Calculate the weighted sum of the rows in 'b'. |
| 617 static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b, |
| 618 const int32x4_t w0, const int32x4_t w1, |
| 619 const int32x4_t w2, const int32x4_t w3) { |
| 620 const int32x4_t s0 = vmulq_s32(w0, b.val[0]); |
| 621 const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]); |
| 622 const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]); |
| 623 const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]); |
| 624 const int64x2_t sum1 = vpaddlq_s32(s3); |
| 625 const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1)); |
| 626 return sum2; |
| 627 } |
| 628 |
| 629 #define LOAD_LANE_32b(src, VALUE, LANE) \ |
| 630 (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) |
| 631 |
| 632 // Hadamard transform |
| 633 // Returns the weighted sum of the absolute value of transformed coefficients. |
| 634 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, |
| 635 const uint16_t* const w) { |
| 636 uint32x4_t d0d1 = { 0, 0, 0, 0 }; |
| 637 uint32x4_t d2d3 = { 0, 0, 0, 0 }; |
| 638 LOAD_LANE_32b(a + 0 * BPS, d0d1, 0); // a00 a01 a02 a03 |
| 639 LOAD_LANE_32b(a + 1 * BPS, d0d1, 1); // a10 a11 a12 a13 |
| 640 LOAD_LANE_32b(b + 0 * BPS, d0d1, 2); // b00 b01 b02 b03 |
| 641 LOAD_LANE_32b(b + 1 * BPS, d0d1, 3); // b10 b11 b12 b13 |
| 642 LOAD_LANE_32b(a + 2 * BPS, d2d3, 0); // a20 a21 a22 a23 |
| 643 LOAD_LANE_32b(a + 3 * BPS, d2d3, 1); // a30 a31 a32 a33 |
| 644 LOAD_LANE_32b(b + 2 * BPS, d2d3, 2); // b20 b21 b22 b23 |
| 645 LOAD_LANE_32b(b + 3 * BPS, d2d3, 3); // b30 b31 b32 b33 |
| 646 |
| 647 { |
| 648 // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31 |
| 649 // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33 |
| 650 const uint16x8x2_t tmp = |
| 651 vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3)); |
| 652 const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]); |
| 653 const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]); |
| 654 const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8), |
| 655 vget_low_u8(d2d3u8)); |
| 656 const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8), |
| 657 vget_high_u8(d2d3u8)); |
| 658 const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a); |
| 659 const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b); |
| 660 const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a); |
| 661 const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b); |
| 662 const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0)); |
| 663 const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4)); |
| 664 const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8)); |
| 665 const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12)); |
| 666 const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3); |
| 667 const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3); |
| 668 const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1), |
| 669 vreinterpret_s32_s64(sum2)); |
| 670 const int32x2_t res = vshr_n_s32(diff, 5); |
| 671 return vget_lane_s32(res, 0); |
| 672 } |
| 673 } |
| 674 |
| 675 #undef LOAD_LANE_32b |
| 676 |
| 677 #else |
| 678 |
| 679 // Hadamard transform |
| 680 // Returns the weighted sum of the absolute value of transformed coefficients. |
| 681 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, |
| 682 const uint16_t* const w) { |
| 683 const int kBPS = BPS; |
| 684 const uint8_t* A = a; |
| 685 const uint8_t* B = b; |
| 686 const uint16_t* W = w; |
| 687 int sum; |
| 688 __asm__ volatile ( |
| 689 "vld1.32 d0[0], [%[a]], %[kBPS] \n" |
| 690 "vld1.32 d0[1], [%[a]], %[kBPS] \n" |
| 691 "vld1.32 d2[0], [%[a]], %[kBPS] \n" |
| 692 "vld1.32 d2[1], [%[a]] \n" |
| 693 |
| 694 "vld1.32 d1[0], [%[b]], %[kBPS] \n" |
| 695 "vld1.32 d1[1], [%[b]], %[kBPS] \n" |
| 696 "vld1.32 d3[0], [%[b]], %[kBPS] \n" |
| 697 "vld1.32 d3[1], [%[b]] \n" |
| 698 |
| 699 // a d0/d2, b d1/d3 |
| 700 // d0/d1: 01 01 01 01 |
| 701 // d2/d3: 23 23 23 23 |
| 702 // But: it goes 01 45 23 67 |
| 703 // Notice the middle values are transposed |
| 704 "vtrn.16 q0, q1 \n" |
| 705 |
| 706 // {a0, a1} = {in[0] + in[2], in[1] + in[3]} |
| 707 "vaddl.u8 q2, d0, d2 \n" |
| 708 "vaddl.u8 q10, d1, d3 \n" |
| 709 // {a3, a2} = {in[0] - in[2], in[1] - in[3]} |
| 710 "vsubl.u8 q3, d0, d2 \n" |
| 711 "vsubl.u8 q11, d1, d3 \n" |
| 712 |
| 713 // tmp[0] = a0 + a1 |
| 714 "vpaddl.s16 q0, q2 \n" |
| 715 "vpaddl.s16 q8, q10 \n" |
| 716 |
| 717 // tmp[1] = a3 + a2 |
| 718 "vpaddl.s16 q1, q3 \n" |
| 719 "vpaddl.s16 q9, q11 \n" |
| 720 |
| 721 // No pair subtract |
| 722 // q2 = {a0, a3} |
| 723 // q3 = {a1, a2} |
| 724 "vtrn.16 q2, q3 \n" |
| 725 "vtrn.16 q10, q11 \n" |
| 726 |
| 727 // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2} |
| 728 "vsubl.s16 q12, d4, d6 \n" |
| 729 "vsubl.s16 q13, d5, d7 \n" |
| 730 "vsubl.s16 q14, d20, d22 \n" |
| 731 "vsubl.s16 q15, d21, d23 \n" |
| 732 |
| 733 // separate tmp[3] and tmp[2] |
| 734 // q12 = tmp[3] |
| 735 // q13 = tmp[2] |
| 736 "vtrn.32 q12, q13 \n" |
| 737 "vtrn.32 q14, q15 \n" |
| 738 |
| 739 // Transpose tmp for a |
| 740 "vswp d1, d26 \n" // vtrn.64 |
| 741 "vswp d3, d24 \n" // vtrn.64 |
| 742 "vtrn.32 q0, q1 \n" |
| 743 "vtrn.32 q13, q12 \n" |
| 744 |
| 745 // Transpose tmp for b |
| 746 "vswp d17, d30 \n" // vtrn.64 |
| 747 "vswp d19, d28 \n" // vtrn.64 |
| 748 "vtrn.32 q8, q9 \n" |
| 749 "vtrn.32 q15, q14 \n" |
| 750 |
| 751 // The first Q register is a, the second b. |
| 752 // q0/8 tmp[0-3] |
| 753 // q13/15 tmp[4-7] |
| 754 // q1/9 tmp[8-11] |
| 755 // q12/14 tmp[12-15] |
| 756 |
| 757 // These are still in 01 45 23 67 order. We fix it easily in the addition |
| 758 // case, but the subtractions propagate the swapped order. |
| 759 "vswp d3, d27 \n" |
| 760 "vswp d19, d31 \n" |
| 761 |
| 762 // a0 = tmp[0] + tmp[8] |
| 763 "vadd.s32 q2, q0, q1 \n" |
| 764 "vadd.s32 q3, q8, q9 \n" |
| 765 |
| 766 // a1 = tmp[4] + tmp[12] |
| 767 "vadd.s32 q10, q13, q12 \n" |
| 768 "vadd.s32 q11, q15, q14 \n" |
| 769 |
| 770 // a2 = tmp[4] - tmp[12] |
| 771 "vsub.s32 q13, q13, q12 \n" |
| 772 "vsub.s32 q15, q15, q14 \n" |
| 773 |
| 774 // a3 = tmp[0] - tmp[8] |
| 775 "vsub.s32 q0, q0, q1 \n" |
| 776 "vsub.s32 q8, q8, q9 \n" |
| 777 |
| 778 // b0 = a0 + a1 |
| 779 "vadd.s32 q1, q2, q10 \n" |
| 780 "vadd.s32 q9, q3, q11 \n" |
| 781 |
| 782 // b1 = a3 + a2 |
| 783 "vadd.s32 q12, q0, q13 \n" |
| 784 "vadd.s32 q14, q8, q15 \n" |
| 785 |
| 786 // b2 = a3 - a2 |
| 787 "vsub.s32 q0, q0, q13 \n" |
| 788 "vsub.s32 q8, q8, q15 \n" |
| 789 |
| 790 // b3 = a0 - a1 |
| 791 "vsub.s32 q2, q2, q10 \n" |
| 792 "vsub.s32 q3, q3, q11 \n" |
| 793 |
| 794 "vld1.64 {q10, q11}, [%[w]] \n" |
| 795 |
| 796 // abs(b0) |
| 797 "vabs.s32 q1, q1 \n" |
| 798 "vabs.s32 q9, q9 \n" |
| 799 // abs(b1) |
| 800 "vabs.s32 q12, q12 \n" |
| 801 "vabs.s32 q14, q14 \n" |
| 802 // abs(b2) |
| 803 "vabs.s32 q0, q0 \n" |
| 804 "vabs.s32 q8, q8 \n" |
| 805 // abs(b3) |
| 806 "vabs.s32 q2, q2 \n" |
| 807 "vabs.s32 q3, q3 \n" |
| 808 |
| 809 // expand w before using. |
| 810 "vmovl.u16 q13, d20 \n" |
| 811 "vmovl.u16 q15, d21 \n" |
| 812 |
| 813 // w[0] * abs(b0) |
| 814 "vmul.u32 q1, q1, q13 \n" |
| 815 "vmul.u32 q9, q9, q13 \n" |
| 816 |
| 817 // w[4] * abs(b1) |
| 818 "vmla.u32 q1, q12, q15 \n" |
| 819 "vmla.u32 q9, q14, q15 \n" |
| 820 |
| 821 // expand w before using. |
| 822 "vmovl.u16 q13, d22 \n" |
| 823 "vmovl.u16 q15, d23 \n" |
| 824 |
| 825 // w[8] * abs(b2) |
| 826 "vmla.u32 q1, q0, q13 \n" |
| 827 "vmla.u32 q9, q8, q13 \n" |
| 828 |
| 829 // w[12] * abs(b3) |
| 830 "vmla.u32 q1, q2, q15 \n" |
| 831 "vmla.u32 q9, q3, q15 \n" |
| 832 |
| 833 // Sum the arrays |
| 834 "vpaddl.u32 q1, q1 \n" |
| 835 "vpaddl.u32 q9, q9 \n" |
| 836 "vadd.u64 d2, d3 \n" |
| 837 "vadd.u64 d18, d19 \n" |
| 838 |
| 839 // Hadamard transform needs 4 bits of extra precision (2 bits in each |
| 840 // direction) of dynamic range. Weights w[] are at most 16 bits, so the |
| 841 // maximum precision for coeff is 8 bits of input + 4 bits of Hadamard |
| 842 // transform + 16 bits for w[] + 2 bits of abs() summation. |
| 843 // |
| 844 // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is |
| 845 // A-OK. |
| 846 |
| 847 // sum2 - sum1 |
| 848 "vsub.u32 d0, d2, d18 \n" |
| 849 // abs(sum2 - sum1) |
| 850 "vabs.s32 d0, d0 \n" |
| 851 // abs(sum2 - sum1) >> 5 |
| 852 "vshr.u32 d0, #5 \n" |
| 853 |
| 854 // It would be better to move the value straight into r0 but I'm not |
| 855 // entirely sure how this works with inline assembly. |
| 856 "vmov.32 %[sum], d0[0] \n" |
| 857 |
| 858 : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W) |
| 859 : [kBPS] "r"(kBPS) |
| 860 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", |
| 861 "q10", "q11", "q12", "q13", "q14", "q15" // clobbered |
| 862 ); |
| 863 |
| 864 return sum; |
| 865 } |
| 866 |
| 867 #endif // USE_INTRINSICS |
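For context on both Disto4x4 paths (a paraphrase of the portable code, shown for review only): each 4x4 block goes through a Hadamard-like transform, the absolute transformed coefficients are weighted by w[] and summed, and the score is the scaled difference between the two blocks:

    /* Sketch only, paraphrasing the portable TTransform; abs() is from <stdlib.h>. */
    static int TTransformRef(const uint8_t* in, const uint16_t* w) {
      int sum = 0;
      int tmp[16];
      int i;
      for (i = 0; i < 4; ++i, in += BPS) {                  /* horizontal pass */
        const int a0 = in[0] + in[2], a1 = in[1] + in[3];
        const int a2 = in[1] - in[3], a3 = in[0] - in[2];
        tmp[0 + i * 4] = a0 + a1;
        tmp[1 + i * 4] = a3 + a2;
        tmp[2 + i * 4] = a3 - a2;
        tmp[3 + i * 4] = a0 - a1;
      }
      for (i = 0; i < 4; ++i) {                             /* vertical pass */
        const int a0 = tmp[0 + i] + tmp[8 + i],  a1 = tmp[4 + i] + tmp[12 + i];
        const int a2 = tmp[4 + i] - tmp[12 + i], a3 = tmp[0 + i] - tmp[8 + i];
        sum += w[ 0 + i] * abs(a0 + a1) + w[ 4 + i] * abs(a3 + a2)
             + w[ 8 + i] * abs(a3 - a2) + w[12 + i] * abs(a0 - a1);
      }
      return sum;
    }
    /* Disto4x4(a, b, w) is then roughly abs(TTransformRef(b, w) - TTransformRef(a, w)) >> 5. */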
| 868 |
| 869 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, |
| 870 const uint16_t* const w) { |
| 871 int D = 0; |
| 872 int x, y; |
| 873 for (y = 0; y < 16 * BPS; y += 4 * BPS) { |
| 874 for (x = 0; x < 16; x += 4) { |
| 875 D += Disto4x4(a + x + y, b + x + y, w); |
| 876 } |
| 877 } |
| 878 return D; |
| 879 } |
| 880 |
| 881 //------------------------------------------------------------------------------ |
| 882 |
| 883 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, |
| 884 int start_block, int end_block, |
| 885 VP8Histogram* const histo) { |
| 886 const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); |
| 887 int j; |
| 888 for (j = start_block; j < end_block; ++j) { |
| 889 int16_t out[16]; |
| 890 FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); |
| 891 { |
| 892 int k; |
| 893 const int16x8_t a0 = vld1q_s16(out + 0); |
| 894 const int16x8_t b0 = vld1q_s16(out + 8); |
| 895 const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0)); |
| 896 const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0)); |
| 897 const uint16x8_t a2 = vshrq_n_u16(a1, 3); |
| 898 const uint16x8_t b2 = vshrq_n_u16(b1, 3); |
| 899 const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh); |
| 900 const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh); |
| 901 vst1q_s16(out + 0, vreinterpretq_s16_u16(a3)); |
| 902 vst1q_s16(out + 8, vreinterpretq_s16_u16(b3)); |
| 903 // Convert coefficients to bin indices. |
| 904 for (k = 0; k < 16; ++k) { |
| 905 histo->distribution[out[k]]++; |
| 906 } |
| 907 } |
| 908 } |
| 909 } |
| 910 |
| 911 //------------------------------------------------------------------------------ |
| 912 |
| 913 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, |
| 914 const uint8_t* const b, |
| 915 uint32x4_t* const sum) { |
| 916 const uint8x16_t a0 = vld1q_u8(a); |
| 917 const uint8x16_t b0 = vld1q_u8(b); |
| 918 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
| 919 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); |
| 920 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); |
| 921 *sum = vpadalq_u16(*sum, prod); // pair-wise add and accumulate |
| 922 } |
| 923 |
| 924 // Horizontal sum of all four uint32_t values in 'sum'. |
| 925 static int SumToInt(uint32x4_t sum) { |
| 926 const uint64x2_t sum2 = vpaddlq_u32(sum); |
| 927 const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); |
| 928 return (int)sum3; |
| 929 } |
| 930 |
| 931 static int SSE16x16(const uint8_t* a, const uint8_t* b) { |
| 932 uint32x4_t sum = vdupq_n_u32(0); |
| 933 int y; |
| 934 for (y = 0; y < 16; ++y) { |
| 935 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
| 936 } |
| 937 return SumToInt(sum); |
| 938 } |
| 939 |
| 940 static int SSE16x8(const uint8_t* a, const uint8_t* b) { |
| 941 uint32x4_t sum = vdupq_n_u32(0); |
| 942 int y; |
| 943 for (y = 0; y < 8; ++y) { |
| 944 AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); |
| 945 } |
| 946 return SumToInt(sum); |
| 947 } |
| 948 |
| 949 static int SSE8x8(const uint8_t* a, const uint8_t* b) { |
| 950 uint32x4_t sum = vdupq_n_u32(0); |
| 951 int y; |
| 952 for (y = 0; y < 8; ++y) { |
| 953 const uint8x8_t a0 = vld1_u8(a + y * BPS); |
| 954 const uint8x8_t b0 = vld1_u8(b + y * BPS); |
| 955 const uint8x8_t abs_diff = vabd_u8(a0, b0); |
| 956 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); |
| 957 sum = vpadalq_u16(sum, prod); |
| 958 } |
| 959 return SumToInt(sum); |
| 960 } |
| 961 |
| 962 static int SSE4x4(const uint8_t* a, const uint8_t* b) { |
| 963 const uint8x16_t a0 = Load4x4(a); |
| 964 const uint8x16_t b0 = Load4x4(b); |
| 965 const uint8x16_t abs_diff = vabdq_u8(a0, b0); |
| 966 uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); |
| 967 prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); |
| 968 return SumToInt(vpaddlq_u16(prod)); |
| 969 } |
| 970 |
| 971 //------------------------------------------------------------------------------ |
| 972 |
| 973 // Compilation with gcc-4.6.x is problematic for now. |
| 974 #if !defined(WORK_AROUND_GCC) |
| 975 |
| 976 static int16x8_t Quantize(int16_t* const in, |
| 977 const VP8Matrix* const mtx, int offset) { |
| 978 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); |
| 979 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); |
| 980 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); |
| 981 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); |
| 982 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); |
| 983 |
| 984 const int16x8_t a = vld1q_s16(in + offset); // in |
| 985 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) |
| 986 const int16x8_t sign = vshrq_n_s16(a, 15); // sign |
| 987 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen |
| 988 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); |
| 989 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); |
| 990 const uint32x4_t m2 = vhaddq_u32(m0, bias0); |
| 991 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 |
| 992 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), |
| 993 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 |
| 994 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); |
| 995 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); |
| 996 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign |
| 997 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); |
| 998 vst1q_s16(in + offset, c4); |
| 999 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 |
| 1000 return c3; |
| 1001 } |
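For comparison (a paraphrase of the portable quantizer, shown for review only), each lane above performs the following per-coefficient step, with QFIX == 17; the vhaddq halving-add folds one bit of that shift before the narrowing shift by 16:

    /* Scalar sketch of one coefficient; illustrative only. */
    static int QuantizeCoeffRef(int16_t* const in, const VP8Matrix* const mtx, int j) {
      const int sign = (in[j] < 0);
      const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
      int level = (int)((coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * mtx->q_[j];   /* dequantized value, written back */
      return level;                 /* quantized level, stored in zigzag order */
    }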
| 1002 |
| 1003 static const uint8_t kShuffles[4][8] = { |
| 1004 { 0, 1, 2, 3, 8, 9, 16, 17 }, |
| 1005 { 10, 11, 4, 5, 6, 7, 12, 13 }, |
| 1006 { 18, 19, 24, 25, 26, 27, 20, 21 }, |
| 1007 { 14, 15, 22, 23, 28, 29, 30, 31 } |
| 1008 }; |
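Reviewer note (a derivation, assuming the usual VP8 zigzag scan used by the portable quantizer): since coefficient k of out0|out1 occupies bytes 2*k and 2*k+1 of the concatenated 32-byte table, kShuffles[] is just the zigzag order expressed as byte indices:

    /* Sketch: how the table above can be regenerated. */
    static const uint8_t kZigzagRef[16] = {
      0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
    };
    /* kShuffles[i][2 * j + 0] == 2 * kZigzagRef[4 * i + j]     */
    /* kShuffles[i][2 * j + 1] == 2 * kZigzagRef[4 * i + j] + 1 */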
| 1009 |
| 1010 static int QuantizeBlock(int16_t in[16], int16_t out[16], |
| 1011 const VP8Matrix* const mtx) { |
| 1012 const int16x8_t out0 = Quantize(in, mtx, 0); |
| 1013 const int16x8_t out1 = Quantize(in, mtx, 8); |
| 1014 uint8x8x4_t shuffles; |
| 1015 // vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there. |
| 1016 #if defined(__APPLE__) && defined(__aarch64__) && \ |
| 1017 defined(__apple_build_version__) |
| 1018 uint8x16x2_t all_out; |
| 1019 INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); |
| 1020 INIT_VECTOR4(shuffles, |
| 1021 vtbl2q_u8(all_out, vld1_u8(kShuffles[0])), |
| 1022 vtbl2q_u8(all_out, vld1_u8(kShuffles[1])), |
| 1023 vtbl2q_u8(all_out, vld1_u8(kShuffles[2])), |
| 1024 vtbl2q_u8(all_out, vld1_u8(kShuffles[3]))); |
| 1025 #else |
| 1026 uint8x8x4_t all_out; |
| 1027 INIT_VECTOR4(all_out, |
| 1028 vreinterpret_u8_s16(vget_low_s16(out0)), |
| 1029 vreinterpret_u8_s16(vget_high_s16(out0)), |
| 1030 vreinterpret_u8_s16(vget_low_s16(out1)), |
| 1031 vreinterpret_u8_s16(vget_high_s16(out1))); |
| 1032 INIT_VECTOR4(shuffles, |
| 1033 vtbl4_u8(all_out, vld1_u8(kShuffles[0])), |
| 1034 vtbl4_u8(all_out, vld1_u8(kShuffles[1])), |
| 1035 vtbl4_u8(all_out, vld1_u8(kShuffles[2])), |
| 1036 vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); |
| 1037 #endif |
| 1038 // Zigzag reordering |
| 1039 vst1_u8((uint8_t*)(out + 0), shuffles.val[0]); |
| 1040 vst1_u8((uint8_t*)(out + 4), shuffles.val[1]); |
| 1041 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); |
| 1042 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); |
| 1043 // test zeros |
| 1044 if (*(uint64_t*)(out + 0) != 0) return 1; |
| 1045 if (*(uint64_t*)(out + 4) != 0) return 1; |
| 1046 if (*(uint64_t*)(out + 8) != 0) return 1; |
| 1047 if (*(uint64_t*)(out + 12) != 0) return 1; |
| 1048 return 0; |
| 1049 } |
| 1050 |
| 1051 #endif // !WORK_AROUND_GCC |
| 1052 |
| 1053 #endif // WEBP_USE_NEON |
| 1054 |
| 1055 //------------------------------------------------------------------------------ |
| 1056 // Entry point |
| 1057 |
| 1058 extern void VP8EncDspInitNEON(void); |
| 1059 |
| 1060 void VP8EncDspInitNEON(void) { |
| 1061 #if defined(WEBP_USE_NEON) |
| 1062 VP8ITransform = ITransform; |
| 1063 VP8FTransform = FTransform; |
| 1064 |
| 1065 VP8FTransformWHT = FTransformWHT; |
| 1066 |
| 1067 VP8TDisto4x4 = Disto4x4; |
| 1068 VP8TDisto16x16 = Disto16x16; |
| 1069 VP8CollectHistogram = CollectHistogram; |
| 1070 VP8SSE16x16 = SSE16x16; |
| 1071 VP8SSE16x8 = SSE16x8; |
| 1072 VP8SSE8x8 = SSE8x8; |
| 1073 VP8SSE4x4 = SSE4x4; |
| 1074 #if !defined(WORK_AROUND_GCC) |
| 1075 VP8EncQuantizeBlock = QuantizeBlock; |
| 1076 #endif |
| 1077 #endif // WEBP_USE_NEON |
| 1078 } |