third_party/libwebp/dsp/dec_neon.c - Issue 116213006: Update libwebp to 0.4.0

Side by Side Diff: third_party/libwebp/dsp/dec_neon.c

Issue 116213006: Update libwebp to 0.4.0 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2012 Google Inc. All Rights Reserved.	1 // Copyright 2012 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // ARM NEON version of dsp functions and loop filtering.	10 // ARM NEON version of dsp functions and loop filtering.

11 //	11 //

12 // Authors: Somnath Banerjee (somnath@google.com)	12 // Authors: Somnath Banerjee (somnath@google.com)

13 // Johann Koenig (johannkoenig@google.com)	13 // Johann Koenig (johannkoenig@google.com)

14	14

15 #include "./dsp.h"	15 #include "./dsp.h"

16	16

17 #if defined(__cplusplus) || defined(c_plusplus)

18 extern "C" {

19 #endif

20

21 #if defined(WEBP_USE_NEON)	17 #if defined(WEBP_USE_NEON)

22	18

23 #include "../dec/vp8i.h"	19 #include "../dec/vp8i.h"

24	20

25 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \	21 #define QRegs "q0", "q1", "q2", "q3", \

26 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"	22 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

27	23

28 #define FLIP_SIGN_BIT2(a, b, s) \	24 #define FLIP_SIGN_BIT2(a, b, s) \

29 "veor " #a "," #a "," #s " \n" \	25 "veor " #a "," #a "," #s " \n" \

30 "veor " #b "," #b "," #s " \n" \	26 "veor " #b "," #b "," #s " \n" \

31	27

32 #define FLIP_SIGN_BIT4(a, b, c, d, s) \	28 #define FLIP_SIGN_BIT4(a, b, c, d, s) \

33 FLIP_SIGN_BIT2(a, b, s) \	29 FLIP_SIGN_BIT2(a, b, s) \

34 FLIP_SIGN_BIT2(c, d, s) \	30 FLIP_SIGN_BIT2(c, d, s) \

35	31

(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
94 //-----------------------------------------------------------------------------	90 //-----------------------------------------------------------------------------

95 // Simple In-loop filtering (Paragraph 15.2)	91 // Simple In-loop filtering (Paragraph 15.2)

96	92

97 static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {	93 static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {

98 __asm__ volatile (	94 __asm__ volatile (

99 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride	95 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride

100	96

101 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1	97 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1

102 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0	98 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0

103 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0	99 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0

104 "vld1.u8 {q4}, [%[p]] \n" // q1	100 "vld1.u8 {q12}, [%[p]] \n" // q1

105	101

106 DO_FILTER2(q1, q2, q3, q4, %[thresh])	102 DO_FILTER2(q1, q2, q3, q12, %[thresh])

107	103

108 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride	104 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride

109	105

110 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0	106 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0

111 "vst1.u8 {q3}, [%[p]] \n" // store oq0	107 "vst1.u8 {q3}, [%[p]] \n" // store oq0

112 : [p] "+r"(p)	108 : [p] "+r"(p)

113 : [stride] "r"(stride), [thresh] "r"(thresh)	109 : [stride] "r"(stride), [thresh] "r"(thresh)

114 : "memory", QRegs	110 : "memory", QRegs

115 );	111 );

116 }	112 }

117	113

118 static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {	114 static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {

119 __asm__ volatile (	115 __asm__ volatile (

120 "sub r4, %[p], #2 \n" // base1 = p - 2	116 "sub r4, %[p], #2 \n" // base1 = p - 2

121 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride	117 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride

122 "add r5, r4, %[stride] \n" // base2 = base1 + stride	118 "add r5, r4, %[stride] \n" // base2 = base1 + stride

123	119

124 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)	120 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)

125 LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)	121 LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)

126 "vswp d3, d6 \n" // p1:q1 p0:q3	122 "vswp d3, d24 \n" // p1:q1 p0:q3

127 "vswp d5, d8 \n" // q0:q2 q1:q4	123 "vswp d5, d26 \n" // q0:q2 q1:q4

128 "vswp q2, q3 \n" // p1:q1 p0:q2 q0:q3 q1:q4	124 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4

129	125

130 DO_FILTER2(q1, q2, q3, q4, %[thresh])	126 DO_FILTER2(q1, q2, q12, q13, %[thresh])

131	127

132 "sub %[p], %[p], #1 \n" // p - 1	128 "sub %[p], %[p], #1 \n" // p - 1

133	129

134 "vswp d5, d6 \n"	130 "vswp d5, d24 \n"

135 STORE8x2(d4, d5, [%[p]], %[stride])	131 STORE8x2(d4, d5, [%[p]], %[stride])

136 STORE8x2(d6, d7, [%[p]], %[stride])	132 STORE8x2(d24, d25, [%[p]], %[stride])

137	133

138 : [p] "+r"(p)	134 : [p] "+r"(p)

139 : [stride] "r"(stride), [thresh] "r"(thresh)	135 : [stride] "r"(stride), [thresh] "r"(thresh)

140 : "memory", "r4", "r5", "r6", QRegs	136 : "memory", "r4", "r5", "r6", QRegs

141 );	137 );

142 }	138 }

143	139

144 static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {	140 static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {

145 int k;	141 int k;

146 for (k = 3; k > 0; --k) {	142 for (k = 3; k > 0; --k) {

147 p += 4 * stride;	143 p += 4 * stride;

148 SimpleVFilter16NEON(p, stride, thresh);	144 SimpleVFilter16NEON(p, stride, thresh);

149 }	145 }

150 }	146 }

151	147

152 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {	148 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {

153 int k;	149 int k;

154 for (k = 3; k > 0; --k) {	150 for (k = 3; k > 0; --k) {

155 p += 4;	151 p += 4;

156 SimpleHFilter16NEON(p, stride, thresh);	152 SimpleHFilter16NEON(p, stride, thresh);

157 }	153 }

158 }	154 }

159	155

160 //-----------------------------------------------------------------------------	156 //-----------------------------------------------------------------------------

161 // Inverse transforms (Paragraph 14.4)	157 // Inverse transforms (Paragraph 14.4)

162	158

163 static void TransformOneNEON(const int16_t *in, uint8_t *dst) {	159 static void TransformOne(const int16_t* in, uint8_t* dst) {

164 const int kBPS = BPS;	160 const int kBPS = BPS;

165 const int16_t constants[] = {20091, 17734, 0, 0};	161 const int16_t constants[] = {20091, 17734, 0, 0};

166 /* kC1, kC2. Padded because vld1.16 loads 8 bytes	162 /* kC1, kC2. Padded because vld1.16 loads 8 bytes

167 * Technically these are unsigned but vqdmulh is only available in signed.	163 * Technically these are unsigned but vqdmulh is only available in signed.

168 * vqdmulh returns high half (effectively >> 16) but also doubles the value,	164 * vqdmulh returns high half (effectively >> 16) but also doubles the value,

169 * changing the >> 16 to >> 15 and requiring an additional >> 1.	165 * changing the >> 16 to >> 15 and requiring an additional >> 1.

170 * We use this to our advantage with kC2. The canonical value is 35468.	166 * We use this to our advantage with kC2. The canonical value is 35468.

171 * However, the high bit is set so treating it as signed will give incorrect	167 * However, the high bit is set so treating it as signed will give incorrect

172 * results. We avoid this by down shifting by 1 here to clear the highest bit.	168 * results. We avoid this by down shifting by 1 here to clear the highest bit.

173 * Combined with the doubling effect of vqdmulh we get >> 16.	169 * Combined with the doubling effect of vqdmulh we get >> 16.

(...skipping 128 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
302 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"	298 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"

303 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"	299 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"

304 "vst1.32 d1[1], [%[dst]] \n"	300 "vst1.32 d1[1], [%[dst]] \n"

305	301

306 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */	302 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */

307 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */	303 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */

308 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */	304 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */

309 );	305 );

310 }	306 }

311	307

312 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {	308 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {

313 TransformOneNEON(in, dst);	309 TransformOne(in, dst);

314 if (do_two) {	310 if (do_two) {

315 TransformOneNEON(in + 16, dst + 4);	311 TransformOne(in + 16, dst + 4);

316 }	312 }

317 }	313 }

318	314

	315 static void TransformDC(const int16_t* in, uint8_t* dst) {

	316 const int DC = (in[0] + 4) >> 3;

	317 const int kBPS = BPS;

	318 __asm__ volatile (

	319 "vdup.16 q1, %[DC] \n"

	320

	321 "vld1.32 d0[0], [%[dst]], %[kBPS] \n"

	322 "vld1.32 d1[0], [%[dst]], %[kBPS] \n"

	323 "vld1.32 d0[1], [%[dst]], %[kBPS] \n"

	324 "vld1.32 d1[1], [%[dst]], %[kBPS] \n"

	325

	326 "sub %[dst], %[dst], %[kBPS], lsl #2 \n"

	327

	328 // add DC and convert to s16.

	329 "vaddw.u8 q2, q1, d0 \n"

	330 "vaddw.u8 q3, q1, d1 \n"

	331 // convert back to u8 with saturation

	332 "vqmovun.s16 d0, q2 \n"

	333 "vqmovun.s16 d1, q3 \n"

	334

	335 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"

	336 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"

	337 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"

	338 "vst1.32 d1[1], [%[dst]] \n"

	339 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */

	340 : [kBPS] "r"(kBPS), /* constants */

	341 [DC] "r"(DC)

	342 : "memory", "q0", "q1", "q2", "q3" /* clobbered */

	343 );

	344 }

	345

319 static void TransformWHT(const int16_t* in, int16_t* out) {	346 static void TransformWHT(const int16_t* in, int16_t* out) {

320 const int kStep = 32; // The store is only incrementing the pointer as if we	347 const int kStep = 32; // The store is only incrementing the pointer as if we

321 // had stored a single byte.	348 // had stored a single byte.

322 __asm__ volatile (	349 __asm__ volatile (

323 // part 1	350 // part 1

324 // load data into q0, q1	351 // load data into q0, q1

325 "vld1.16 {q0, q1}, [%[in]] \n"	352 "vld1.16 {q0, q1}, [%[in]] \n"

326	353

327 "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]	354 "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]

328 "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]	355 "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]

329 "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8]	356 "vsubl.s16 q10, d1, d2 \n" // a2 = in[4] - in[8]

330 "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12]	357 "vsubl.s16 q11, d0, d3 \n" // a3 = in[0] - in[12]

331	358

332 "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1	359 "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1

333 "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1	360 "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1

334 "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2	361 "vadd.s32 q1, q11, q10 \n" // tmp[4] = a3 + a2

335 "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2	362 "vsub.s32 q3, q11, q10 \n" // tmp[12] = a3 - a2

336	363

337 // Transpose	364 // Transpose

338 // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]	365 // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]

339 // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]	366 // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]

340 "vswp d1, d4 \n" // vtrn.64 q0, q2	367 "vswp d1, d4 \n" // vtrn.64 q0, q2

341 "vswp d3, d6 \n" // vtrn.64 q1, q3	368 "vswp d3, d6 \n" // vtrn.64 q1, q3

342 "vtrn.32 q0, q1 \n"	369 "vtrn.32 q0, q1 \n"

343 "vtrn.32 q2, q3 \n"	370 "vtrn.32 q2, q3 \n"

344	371

345 "vmov.s32 q4, #3 \n" // dc = 3	372 "vmov.s32 q10, #3 \n" // dc = 3

346 "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3	373 "vadd.s32 q0, q0, q10 \n" // dc = tmp[0] + 3

347 "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3]	374 "vadd.s32 q12, q0, q3 \n" // a0 = dc + tmp[3]

348 "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2]	375 "vadd.s32 q13, q1, q2 \n" // a1 = tmp[1] + tmp[2]

349 "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]	376 "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]

350 "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]	377 "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]

351	378

352 "vadd.s32 q0, q6, q7 \n"	379 "vadd.s32 q0, q12, q13 \n"

353 "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3	380 "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3

354 "vadd.s32 q1, q9, q8 \n"	381 "vadd.s32 q1, q9, q8 \n"

355 "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3	382 "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3

356 "vsub.s32 q2, q6, q7 \n"	383 "vsub.s32 q2, q12, q13 \n"

357 "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3	384 "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3

358 "vsub.s32 q3, q9, q8 \n"	385 "vsub.s32 q3, q9, q8 \n"

359 "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3	386 "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3

360	387

361 // set the results to output	388 // set the results to output

362 "vst1.16 d0[0], [%[out]], %[kStep] \n"	389 "vst1.16 d0[0], [%[out]], %[kStep] \n"

363 "vst1.16 d1[0], [%[out]], %[kStep] \n"	390 "vst1.16 d1[0], [%[out]], %[kStep] \n"

364 "vst1.16 d2[0], [%[out]], %[kStep] \n"	391 "vst1.16 d2[0], [%[out]], %[kStep] \n"

365 "vst1.16 d3[0], [%[out]], %[kStep] \n"	392 "vst1.16 d3[0], [%[out]], %[kStep] \n"

366 "vst1.16 d0[1], [%[out]], %[kStep] \n"	393 "vst1.16 d0[1], [%[out]], %[kStep] \n"

367 "vst1.16 d1[1], [%[out]], %[kStep] \n"	394 "vst1.16 d1[1], [%[out]], %[kStep] \n"

368 "vst1.16 d2[1], [%[out]], %[kStep] \n"	395 "vst1.16 d2[1], [%[out]], %[kStep] \n"

369 "vst1.16 d3[1], [%[out]], %[kStep] \n"	396 "vst1.16 d3[1], [%[out]], %[kStep] \n"

370 "vst1.16 d0[2], [%[out]], %[kStep] \n"	397 "vst1.16 d0[2], [%[out]], %[kStep] \n"

371 "vst1.16 d1[2], [%[out]], %[kStep] \n"	398 "vst1.16 d1[2], [%[out]], %[kStep] \n"

372 "vst1.16 d2[2], [%[out]], %[kStep] \n"	399 "vst1.16 d2[2], [%[out]], %[kStep] \n"

373 "vst1.16 d3[2], [%[out]], %[kStep] \n"	400 "vst1.16 d3[2], [%[out]], %[kStep] \n"

374 "vst1.16 d0[3], [%[out]], %[kStep] \n"	401 "vst1.16 d0[3], [%[out]], %[kStep] \n"

375 "vst1.16 d1[3], [%[out]], %[kStep] \n"	402 "vst1.16 d1[3], [%[out]], %[kStep] \n"

376 "vst1.16 d2[3], [%[out]], %[kStep] \n"	403 "vst1.16 d2[3], [%[out]], %[kStep] \n"

377 "vst1.16 d3[3], [%[out]], %[kStep] \n"	404 "vst1.16 d3[3], [%[out]], %[kStep] \n"

378	405

379 : [out] "+r"(out) // modified registers	406 : [out] "+r"(out) // modified registers

380 : [in] "r"(in), [kStep] "r"(kStep) // constants	407 : [in] "r"(in), [kStep] "r"(kStep) // constants

381 : "memory", "q0", "q1", "q2", "q3", "q4",	408 : "memory", "q0", "q1", "q2", "q3",

382 "q5", "q6", "q7", "q8", "q9" // clobbered	409 "q8", "q9", "q10", "q11", "q12", "q13" // clobbered

383 );	410 );

384 }	411 }

385	412

386 #endif // WEBP_USE_NEON	413 #endif // WEBP_USE_NEON

387	414

388 //------------------------------------------------------------------------------	415 //------------------------------------------------------------------------------

389 // Entry point	416 // Entry point

390	417

391 extern void VP8DspInitNEON(void);	418 extern void VP8DspInitNEON(void);

392	419

393 void VP8DspInitNEON(void) {	420 void VP8DspInitNEON(void) {

394 #if defined(WEBP_USE_NEON)	421 #if defined(WEBP_USE_NEON)

395 VP8Transform = TransformTwoNEON;	422 VP8Transform = TransformTwo;

	423 VP8TransformAC3 = TransformOne; // no special code here

	424 VP8TransformDC = TransformDC;

396 VP8TransformWHT = TransformWHT;	425 VP8TransformWHT = TransformWHT;

397	426

398 VP8SimpleVFilter16 = SimpleVFilter16NEON;	427 VP8SimpleVFilter16 = SimpleVFilter16NEON;

399 VP8SimpleHFilter16 = SimpleHFilter16NEON;	428 VP8SimpleHFilter16 = SimpleHFilter16NEON;

400 VP8SimpleVFilter16i = SimpleVFilter16iNEON;	429 VP8SimpleVFilter16i = SimpleVFilter16iNEON;

401 VP8SimpleHFilter16i = SimpleHFilter16iNEON;	430 VP8SimpleHFilter16i = SimpleHFilter16iNEON;

402 #endif // WEBP_USE_NEON	431 #endif // WEBP_USE_NEON

403 }	432 }

404	433

405 #if defined(__cplusplus) || defined(c_plusplus)

406 } // extern "C"

407 #endif

OLD	NEW

« third_party/libwebp/README.chromium ('K') | « third_party/libwebp/dsp/dec.c ('k') | third_party/libwebp/dsp/dec_sse2.c » ('j') | no next file with comments »