Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(966)

Side by Side Diff: third_party/libwebp/dsp/dec_neon.c

Issue 12942006: libwebp: update snapshot to v0.3.0-rc6 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: local webkit layout expectations Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD | NEW
1 // Copyright 2012 Google Inc. All Rights Reserved. 1 // Copyright 2012 Google Inc. All Rights Reserved.
2 // 2 //
3 // This code is licensed under the same terms as WebM: 3 // This code is licensed under the same terms as WebM:
4 // Software License Agreement: http://www.webmproject.org/license/software/ 4 // Software License Agreement: http://www.webmproject.org/license/software/
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/
6 // ----------------------------------------------------------------------------- 6 // -----------------------------------------------------------------------------
7 // 7 //
8 // ARM NEON version of dsp functions and loop filtering. 8 // ARM NEON version of dsp functions and loop filtering.
9 // 9 //
10 // Authors: Somnath Banerjee (somnath@google.com) 10 // Authors: Somnath Banerjee (somnath@google.com)
11 // Johann Koenig (johannkoenig@google.com) 11 // Johann Koenig (johannkoenig@google.com)
12 12
13 #include "./dsp.h" 13 #include "./dsp.h"
14 14
15 #if defined(__cplusplus) || defined(c_plusplus)
fbarchard 2013/03/22 19:08:27 pick one?
jzern 2013/03/22 19:18:25 skal was the source of that way back. actually I d
16 extern "C" {
17 #endif
18
15 #if defined(WEBP_USE_NEON) 19 #if defined(WEBP_USE_NEON)
16 20
17 #include "../dec/vp8i.h" 21 #include "../dec/vp8i.h"
18 22
19 #if defined(__cplusplus) || defined(c_plusplus)
20 extern "C" {
21 #endif
22
23 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \ 23 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \
24 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 24 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
25 25
26 #define FLIP_SIGN_BIT2(a, b, s) \ 26 #define FLIP_SIGN_BIT2(a, b, s) \
27 "veor " #a "," #a "," #s " \n" \ 27 "veor " #a "," #a "," #s " \n" \
28 "veor " #b "," #b "," #s " \n" \ 28 "veor " #b "," #b "," #s " \n" \
29 29
30 #define FLIP_SIGN_BIT4(a, b, c, d, s) \ 30 #define FLIP_SIGN_BIT4(a, b, c, d, s) \
31 FLIP_SIGN_BIT2(a, b, s) \ 31 FLIP_SIGN_BIT2(a, b, s) \
32 FLIP_SIGN_BIT2(c, d, s) \ 32 FLIP_SIGN_BIT2(c, d, s) \
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
148 } 148 }
149 149
150 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { 150 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
151 int k; 151 int k;
152 for (k = 3; k > 0; --k) { 152 for (k = 3; k > 0; --k) {
153 p += 4; 153 p += 4;
154 SimpleHFilter16NEON(p, stride, thresh); 154 SimpleHFilter16NEON(p, stride, thresh);
155 } 155 }
156 } 156 }
157 157
158 //-----------------------------------------------------------------------------
159 // Inverse transforms (Paragraph 14.4)
160
158 static void TransformOneNEON(const int16_t *in, uint8_t *dst) { 161 static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
159 const int kBPS = BPS; 162 const int kBPS = BPS;
160 const int16_t constants[] = {20091, 17734, 0, 0}; 163 const int16_t constants[] = {20091, 17734, 0, 0};
161 /* kC1, kC2. Padded because vld1.16 loads 8 bytes 164 /* kC1, kC2. Padded because vld1.16 loads 8 bytes
162 * Technically these are unsigned but vqdmulh is only available in signed. 165 * Technically these are unsigned but vqdmulh is only available in signed.
163 * vqdmulh returns high half (effectively >> 16) but also doubles the value, 166 * vqdmulh returns high half (effectively >> 16) but also doubles the value,
164 * changing the >> 16 to >> 15 and requiring an additional >> 1. 167 * changing the >> 16 to >> 15 and requiring an additional >> 1.
165 * We use this to our advantage with kC2. The canonical value is 35468. 168 * We use this to our advantage with kC2. The canonical value is 35468.
166 * However, the high bit is set so treating it as signed will give incorrect 169 * However, the high bit is set so treating it as signed will give incorrect
167 * results. We avoid this by down shifting by 1 here to clear the highest bit. 170 * results. We avoid this by down shifting by 1 here to clear the highest bit.
(...skipping 136 matching lines...) Expand 10 before | Expand all | Expand 10 after
304 ); 307 );
305 } 308 }
306 309
307 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { 310 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
308 TransformOneNEON(in, dst); 311 TransformOneNEON(in, dst);
309 if (do_two) { 312 if (do_two) {
310 TransformOneNEON(in + 16, dst + 4); 313 TransformOneNEON(in + 16, dst + 4);
311 } 314 }
312 } 315 }
313 316
317 static void TransformWHT(const int16_t* in, int16_t* out) {
318 const int kStep = 32; // The store is only incrementing the pointer as if we
319 // had stored a single byte.
320 __asm__ volatile (
321 // part 1
322 // load data into q0, q1
323 "vld1.16 {q0, q1}, [%[in]] \n"
324
325 "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]
326 "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]
327 "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8]
328 "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12]
329
330 "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1
331 "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1
332 "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2
333 "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2
334
335 // Transpose
336 // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
337 // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
338 "vswp d1, d4 \n" // vtrn.64 q0, q2
339 "vswp d3, d6 \n" // vtrn.64 q1, q3
340 "vtrn.32 q0, q1 \n"
341 "vtrn.32 q2, q3 \n"
342
343 "vmov.s32 q4, #3 \n" // dc = 3
344 "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3
fbarchard 2013/03/22 19:08:27 vadd can do immediate 3, no? If you must waste a r
Johann 2013/03/22 19:19:15 Not that I'm aware of: http://infocenter.arm.com/h
345 "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3]
346 "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2]
347 "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]
348 "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]
349
350 "vadd.s32 q0, q6, q7 \n"
351 "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3
fbarchard 2013/03/22 19:08:27 consider rounding.. it's free.
Johann 2013/03/22 19:19:15 It wouldn't match the intended output.
352 "vadd.s32 q1, q9, q8 \n"
353 "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3
354 "vsub.s32 q2, q6, q7 \n"
355 "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3
356 "vsub.s32 q3, q9, q8 \n"
357 "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3
358
359 // set the results to output
360 "vst1.16 d0[0], [%[out]], %[kStep] \n"
fbarchard 2013/03/22 19:08:27 Can this be done with 1 vst? out continuous, vdup
Johann 2013/03/22 19:19:15 I'm not aware of a vst which takes kStep.
361 "vst1.16 d1[0], [%[out]], %[kStep] \n"
362 "vst1.16 d2[0], [%[out]], %[kStep] \n"
363 "vst1.16 d3[0], [%[out]], %[kStep] \n"
364 "vst1.16 d0[1], [%[out]], %[kStep] \n"
365 "vst1.16 d1[1], [%[out]], %[kStep] \n"
366 "vst1.16 d2[1], [%[out]], %[kStep] \n"
367 "vst1.16 d3[1], [%[out]], %[kStep] \n"
368 "vst1.16 d0[2], [%[out]], %[kStep] \n"
369 "vst1.16 d1[2], [%[out]], %[kStep] \n"
370 "vst1.16 d2[2], [%[out]], %[kStep] \n"
371 "vst1.16 d3[2], [%[out]], %[kStep] \n"
372 "vst1.16 d0[3], [%[out]], %[kStep] \n"
373 "vst1.16 d1[3], [%[out]], %[kStep] \n"
374 "vst1.16 d2[3], [%[out]], %[kStep] \n"
375 "vst1.16 d3[3], [%[out]], %[kStep] \n"
376
377 : [out] "+r"(out) // modified registers
378 : [in] "r"(in), [kStep] "r"(kStep) // constants
379 : "memory", "q0", "q1", "q2", "q3", "q4",
380 "q5", "q6", "q7", "q8", "q9" // clobbered
fbarchard 2013/03/22 19:08:27 avoid q4-q7. use q0-q3, q8-q15.
Johann 2013/03/22 19:19:15 Yes that should have been done.
381 );
382 }
383
384 #endif // WEBP_USE_NEON
385
386 //------------------------------------------------------------------------------
387 // Entry point
388
314 extern void VP8DspInitNEON(void); 389 extern void VP8DspInitNEON(void);
315 390
316 void VP8DspInitNEON(void) { 391 void VP8DspInitNEON(void) {
392 #if defined(WEBP_USE_NEON)
317 VP8Transform = TransformTwoNEON; 393 VP8Transform = TransformTwoNEON;
394 VP8TransformWHT = TransformWHT;
318 395
319 VP8SimpleVFilter16 = SimpleVFilter16NEON; 396 VP8SimpleVFilter16 = SimpleVFilter16NEON;
320 VP8SimpleHFilter16 = SimpleHFilter16NEON; 397 VP8SimpleHFilter16 = SimpleHFilter16NEON;
321 VP8SimpleVFilter16i = SimpleVFilter16iNEON; 398 VP8SimpleVFilter16i = SimpleVFilter16iNEON;
322 VP8SimpleHFilter16i = SimpleHFilter16iNEON; 399 VP8SimpleHFilter16i = SimpleHFilter16iNEON;
400 #endif // WEBP_USE_NEON
323 } 401 }
324 402
325 #if defined(__cplusplus) || defined(c_plusplus) 403 #if defined(__cplusplus) || defined(c_plusplus)
326 } // extern "C" 404 } // extern "C"
327 #endif 405 #endif
328
329 #endif // WEBP_USE_NEON
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698