third_party/libwebp/dsp/dec_neon.c - Issue 12942006: libwebp: update snapshot to v0.3.0-rc6

Side by Side Diff: third_party/libwebp/dsp/dec_neon.c

Issue 12942006: libwebp: update snapshot to v0.3.0-rc6 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: local webkit layout expectations Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2012 Google Inc. All Rights Reserved.	1 // Copyright 2012 Google Inc. All Rights Reserved.

2 //	2 //

3 // This code is licensed under the same terms as WebM:	3 // This code is licensed under the same terms as WebM:

4 // Software License Agreement: http://www.webmproject.org/license/software/	4 // Software License Agreement: http://www.webmproject.org/license/software/

5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/	5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/

6 // -----------------------------------------------------------------------------	6 // -----------------------------------------------------------------------------

7 //	7 //

8 // ARM NEON version of dsp functions and loop filtering.	8 // ARM NEON version of dsp functions and loop filtering.

9 //	9 //

10 // Authors: Somnath Banerjee (somnath@google.com)	10 // Authors: Somnath Banerjee (somnath@google.com)

11 // Johann Koenig (johannkoenig@google.com)	11 // Johann Koenig (johannkoenig@google.com)

12	12

13 #include "./dsp.h"	13 #include "./dsp.h"

14	14

	15 #if defined(__cplusplus) \|\| defined(c_plusplus)
	fbarchard 2013/03/22 19:08:27: pick one? | jzern 2013/03/22 19:18:25: On 2013/03/22 19:08:27, fbarchard wrote: > pick one? -- skal was the source of that way back. Actually, I don't know which compiler uses the latter...
	16 extern "C" {

	17 #endif

	18

15 #if defined(WEBP_USE_NEON)	19 #if defined(WEBP_USE_NEON)

16	20

17 #include "../dec/vp8i.h"	21 #include "../dec/vp8i.h"

18	22

19 #if defined(__cplusplus) \|\| defined(c_plusplus)

20 extern "C" {

21 #endif

22

23 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \	23 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \

24 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"	24 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

25	25

26 #define FLIP_SIGN_BIT2(a, b, s) \	26 #define FLIP_SIGN_BIT2(a, b, s) \

27 "veor " #a "," #a "," #s " \n" \	27 "veor " #a "," #a "," #s " \n" \

28 "veor " #b "," #b "," #s " \n" \	28 "veor " #b "," #b "," #s " \n" \

29	29

30 #define FLIP_SIGN_BIT4(a, b, c, d, s) \	30 #define FLIP_SIGN_BIT4(a, b, c, d, s) \

31 FLIP_SIGN_BIT2(a, b, s) \	31 FLIP_SIGN_BIT2(a, b, s) \

32 FLIP_SIGN_BIT2(c, d, s) \	32 FLIP_SIGN_BIT2(c, d, s) \

(...skipping 115 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
148 }	148 }

149	149

150 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {	150 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {

151 int k;	151 int k;

152 for (k = 3; k > 0; --k) {	152 for (k = 3; k > 0; --k) {

153 p += 4;	153 p += 4;

154 SimpleHFilter16NEON(p, stride, thresh);	154 SimpleHFilter16NEON(p, stride, thresh);

155 }	155 }

156 }	156 }

157	157

	158 //-----------------------------------------------------------------------------

	159 // Inverse transforms (Paragraph 14.4)

	160

158 static void TransformOneNEON(const int16_t in, uint8_t dst) {	161 static void TransformOneNEON(const int16_t in, uint8_t dst) {

159 const int kBPS = BPS;	162 const int kBPS = BPS;

160 const int16_t constants[] = {20091, 17734, 0, 0};	163 const int16_t constants[] = {20091, 17734, 0, 0};

161 /* kC1, kC2. Padded because vld1.16 loads 8 bytes	164 /* kC1, kC2. Padded because vld1.16 loads 8 bytes

162 * Technically these are unsigned but vqdmulh is only available in signed.	165 * Technically these are unsigned but vqdmulh is only available in signed.

163 * vqdmulh returns high half (effectively >> 16) but also doubles the value,	166 * vqdmulh returns high half (effectively >> 16) but also doubles the value,

164 * changing the >> 16 to >> 15 and requiring an additional >> 1.	167 * changing the >> 16 to >> 15 and requiring an additional >> 1.

165 * We use this to our advantage with kC2. The canonical value is 35468.	168 * We use this to our advantage with kC2. The canonical value is 35468.

166 * However, the high bit is set so treating it as signed will give incorrect	169 * However, the high bit is set so treating it as signed will give incorrect

167 * results. We avoid this by down shifting by 1 here to clear the highest bit.	170 * results. We avoid this by down shifting by 1 here to clear the highest bit.

(...skipping 136 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
304 );	307 );

305 }	308 }

306	309

307 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {	310 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {

308 TransformOneNEON(in, dst);	311 TransformOneNEON(in, dst);

309 if (do_two) {	312 if (do_two) {

310 TransformOneNEON(in + 16, dst + 4);	313 TransformOneNEON(in + 16, dst + 4);

311 }	314 }

312 }	315 }

313	316

	317 static void TransformWHT(const int16_t* in, int16_t* out) {

	318 const int kStep = 32; // The store is only incrementing the pointer as if we

	319 // had stored a single byte.

	320 __asm__ volatile (

	321 // part 1

	322 // load data into q0, q1

	323 "vld1.16 {q0, q1}, [%[in]] \n"

	324

	325 "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]

	326 "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]

	327 "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8]

	328 "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12]

	329

	330 "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1

	331 "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1

	332 "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2

	333 "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2

	334

	335 // Transpose

	336 // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]

	337 // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]

	338 "vswp d1, d4 \n" // vtrn.64 q0, q2

	339 "vswp d3, d6 \n" // vtrn.64 q1, q3

	340 "vtrn.32 q0, q1 \n"

	341 "vtrn.32 q2, q3 \n"

	342

	343 "vmov.s32 q4, #3 \n" // dc = 3

	344 "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3
	fbarchard 2013/03/22 19:08:27: vadd can do immediate 3, no? If you must waste a register, initialize it further up to avoid interlock. Avoid q4. | Johann 2013/03/22 19:19:15: On 2013/03/22 19:08:27, fbarchard wrote: > vadd can do immediate 3, no? -- Not that I'm aware of: http://infocenter.arm.com/help/topic/com.arm.doc.dui0204j/CIHJCAAG.html > If you must waste a register, initialize it further up to avoid interlock. > avoid q4. -- Yes, it could be moved up some.
	345 "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3]

	346 "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2]

	347 "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]

	348 "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]

	349

	350 "vadd.s32 q0, q6, q7 \n"

	351 "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3
	fbarchard 2013/03/22 19:08:27: consider rounding... it's free. | Johann 2013/03/22 19:19:15: On 2013/03/22 19:08:27, fbarchard wrote: > consider rounding.. its free. -- It wouldn't match the intended output.
	352 "vadd.s32 q1, q9, q8 \n"

	353 "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3

	354 "vsub.s32 q2, q6, q7 \n"

	355 "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3

	356 "vsub.s32 q3, q9, q8 \n"

	357 "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3

	358

	359 // set the results to output

	360 "vst1.16 d0[0], [%[out]], %[kStep] \n"
	fbarchard 2013/03/22 19:08:27: Can this be done with 1 vst? out continuous, vdup value. | Johann 2013/03/22 19:19:15: On 2013/03/22 19:08:27, fbarchard wrote: > Can this be done with 1 vst? out continuous, vdup value. -- I'm not aware of a vst which takes kStep.
	361 "vst1.16 d1[0], [%[out]], %[kStep] \n"

	362 "vst1.16 d2[0], [%[out]], %[kStep] \n"

	363 "vst1.16 d3[0], [%[out]], %[kStep] \n"

	364 "vst1.16 d0[1], [%[out]], %[kStep] \n"

	365 "vst1.16 d1[1], [%[out]], %[kStep] \n"

	366 "vst1.16 d2[1], [%[out]], %[kStep] \n"

	367 "vst1.16 d3[1], [%[out]], %[kStep] \n"

	368 "vst1.16 d0[2], [%[out]], %[kStep] \n"

	369 "vst1.16 d1[2], [%[out]], %[kStep] \n"

	370 "vst1.16 d2[2], [%[out]], %[kStep] \n"

	371 "vst1.16 d3[2], [%[out]], %[kStep] \n"

	372 "vst1.16 d0[3], [%[out]], %[kStep] \n"

	373 "vst1.16 d1[3], [%[out]], %[kStep] \n"

	374 "vst1.16 d2[3], [%[out]], %[kStep] \n"

	375 "vst1.16 d3[3], [%[out]], %[kStep] \n"

	376

	377 : [out] "+r"(out) // modified registers

	378 : [in] "r"(in), [kStep] "r"(kStep) // constants

	379 : "memory", "q0", "q1", "q2", "q3", "q4",

	380 "q5", "q6", "q7", "q8", "q9" // clobbered
	fbarchard 2013/03/22 19:08:27: avoid q4-q7. use q0-q3, q8-q15. | Johann 2013/03/22 19:19:15: On 2013/03/22 19:08:27, fbarchard wrote: > avoid q4-q7. use q0-q3, q8-q15. -- Yes, that should have been done.
	381 );

	382 }

	383

	384 #endif // WEBP_USE_NEON

	385

	386 //------------------------------------------------------------------------------

	387 // Entry point

	388

314 extern void VP8DspInitNEON(void);	389 extern void VP8DspInitNEON(void);

315	390

316 void VP8DspInitNEON(void) {	391 void VP8DspInitNEON(void) {

	392 #if defined(WEBP_USE_NEON)

317 VP8Transform = TransformTwoNEON;	393 VP8Transform = TransformTwoNEON;

	394 VP8TransformWHT = TransformWHT;

318	395

319 VP8SimpleVFilter16 = SimpleVFilter16NEON;	396 VP8SimpleVFilter16 = SimpleVFilter16NEON;

320 VP8SimpleHFilter16 = SimpleHFilter16NEON;	397 VP8SimpleHFilter16 = SimpleHFilter16NEON;

321 VP8SimpleVFilter16i = SimpleVFilter16iNEON;	398 VP8SimpleVFilter16i = SimpleVFilter16iNEON;

322 VP8SimpleHFilter16i = SimpleHFilter16iNEON;	399 VP8SimpleHFilter16i = SimpleHFilter16iNEON;

	400 #endif // WEBP_USE_NEON

323 }	401 }

324	402

325 #if defined(__cplusplus) \|\| defined(c_plusplus)	403 #if defined(__cplusplus) \|\| defined(c_plusplus)

326 } // extern "C"	404 } // extern "C"

327 #endif	405 #endif

328

329 #endif // WEBP_USE_NEON

OLD	NEW

« third_party/libwebp/dsp/dec.c ('K') | « third_party/libwebp/dsp/dec.c ('k') | third_party/libwebp/dsp/dec_sse2.c » ('j') | no next file with comments »