OLD | NEW |
---|---|
1 // Copyright 2012 Google Inc. All Rights Reserved. | 1 // Copyright 2012 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // This code is licensed under the same terms as WebM: | 3 // This code is licensed under the same terms as WebM: |
4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // Software License Agreement: http://www.webmproject.org/license/software/ |
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ |
6 // ----------------------------------------------------------------------------- | 6 // ----------------------------------------------------------------------------- |
7 // | 7 // |
8 // ARM NEON version of dsp functions and loop filtering. | 8 // ARM NEON version of dsp functions and loop filtering. |
9 // | 9 // |
10 // Authors: Somnath Banerjee (somnath@google.com) | 10 // Authors: Somnath Banerjee (somnath@google.com) |
11 // Johann Koenig (johannkoenig@google.com) | 11 // Johann Koenig (johannkoenig@google.com) |
12 | 12 |
13 #include "./dsp.h" | 13 #include "./dsp.h" |
14 | 14 |
15 #if defined(__cplusplus) || defined(c_plusplus) | |
fbarchard
2013/03/22 19:08:27
pick one?
jzern
2013/03/22 19:18:25
skal was the source of that way back. actually I d
| |
16 extern "C" { | |
17 #endif | |
18 | |
15 #if defined(WEBP_USE_NEON) | 19 #if defined(WEBP_USE_NEON) |
16 | 20 |
17 #include "../dec/vp8i.h" | 21 #include "../dec/vp8i.h" |
18 | 22 |
19 #if defined(__cplusplus) || defined(c_plusplus) | |
20 extern "C" { | |
21 #endif | |
22 | |
23 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \ | 23 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \ |
24 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | 24 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
25 | 25 |
26 #define FLIP_SIGN_BIT2(a, b, s) \ | 26 #define FLIP_SIGN_BIT2(a, b, s) \ |
27 "veor " #a "," #a "," #s " \n" \ | 27 "veor " #a "," #a "," #s " \n" \ |
28 "veor " #b "," #b "," #s " \n" \ | 28 "veor " #b "," #b "," #s " \n" \ |
29 | 29 |
30 #define FLIP_SIGN_BIT4(a, b, c, d, s) \ | 30 #define FLIP_SIGN_BIT4(a, b, c, d, s) \ |
31 FLIP_SIGN_BIT2(a, b, s) \ | 31 FLIP_SIGN_BIT2(a, b, s) \ |
32 FLIP_SIGN_BIT2(c, d, s) \ | 32 FLIP_SIGN_BIT2(c, d, s) \ |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
148 } | 148 } |
149 | 149 |
150 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { | 150 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { |
151 int k; | 151 int k; |
152 for (k = 3; k > 0; --k) { | 152 for (k = 3; k > 0; --k) { |
153 p += 4; | 153 p += 4; |
154 SimpleHFilter16NEON(p, stride, thresh); | 154 SimpleHFilter16NEON(p, stride, thresh); |
155 } | 155 } |
156 } | 156 } |
157 | 157 |
158 //----------------------------------------------------------------------------- | |
159 // Inverse transforms (Paragraph 14.4) | |
160 | |
158 static void TransformOneNEON(const int16_t *in, uint8_t *dst) { | 161 static void TransformOneNEON(const int16_t *in, uint8_t *dst) { |
159 const int kBPS = BPS; | 162 const int kBPS = BPS; |
160 const int16_t constants[] = {20091, 17734, 0, 0}; | 163 const int16_t constants[] = {20091, 17734, 0, 0}; |
161 /* kC1, kC2. Padded because vld1.16 loads 8 bytes | 164 /* kC1, kC2. Padded because vld1.16 loads 8 bytes |
162 * Technically these are unsigned but vqdmulh is only available in signed. | 165 * Technically these are unsigned but vqdmulh is only available in signed. |
163 * vqdmulh returns high half (effectively >> 16) but also doubles the value, | 166 * vqdmulh returns high half (effectively >> 16) but also doubles the value, |
164 * changing the >> 16 to >> 15 and requiring an additional >> 1. | 167 * changing the >> 16 to >> 15 and requiring an additional >> 1. |
165 * We use this to our advantage with kC2. The canonical value is 35468. | 168 * We use this to our advantage with kC2. The canonical value is 35468. |
166 * However, the high bit is set so treating it as signed will give incorrect | 169 * However, the high bit is set so treating it as signed will give incorrect |
167 * results. We avoid this by down shifting by 1 here to clear the highest bit. | 170 * results. We avoid this by down shifting by 1 here to clear the highest bit. |
(...skipping 136 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
304 ); | 307 ); |
305 } | 308 } |
306 | 309 |
307 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { | 310 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { |
308 TransformOneNEON(in, dst); | 311 TransformOneNEON(in, dst); |
309 if (do_two) { | 312 if (do_two) { |
310 TransformOneNEON(in + 16, dst + 4); | 313 TransformOneNEON(in + 16, dst + 4); |
311 } | 314 } |
312 } | 315 } |
313 | 316 |
317 static void TransformWHT(const int16_t* in, int16_t* out) { | |
318 const int kStep = 32; // The store is only incrementing the pointer as if we | |
319 // had stored a single byte. | |
320 __asm__ volatile ( | |
321 // part 1 | |
322 // load data into q0, q1 | |
323 "vld1.16 {q0, q1}, [%[in]] \n" | |
324 | |
325 "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] | |
326 "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] | |
327 "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8] | |
328 "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12] | |
329 | |
330 "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 | |
331 "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 | |
332 "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2 | |
333 "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2 | |
334 | |
335 // Transpose | |
336 // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14] | |
337 // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15] | |
338 "vswp d1, d4 \n" // vtrn.64 q0, q2 | |
339 "vswp d3, d6 \n" // vtrn.64 q1, q3 | |
340 "vtrn.32 q0, q1 \n" | |
341 "vtrn.32 q2, q3 \n" | |
342 | |
343 "vmov.s32 q4, #3 \n" // dc = 3 | |
344 "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3 | |
fbarchard
2013/03/22 19:08:27
vadd can do immediate 3, no?
If you must waste a r
Johann
2013/03/22 19:19:15
Not that I'm aware of:
http://infocenter.arm.com/h
| |
345 "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3] | |
346 "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2] | |
347 "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] | |
348 "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] | |
349 | |
350 "vadd.s32 q0, q6, q7 \n" | |
351 "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3 | |
fbarchard
2013/03/22 19:08:27
consider rounding... it's free.
Johann
2013/03/22 19:19:15
It wouldn't match the intended output.
| |
352 "vadd.s32 q1, q9, q8 \n" | |
353 "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 | |
354 "vsub.s32 q2, q6, q7 \n" | |
355 "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 | |
356 "vsub.s32 q3, q9, q8 \n" | |
357 "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 | |
358 | |
359 // set the results to output | |
360 "vst1.16 d0[0], [%[out]], %[kStep] \n" | |
fbarchard
2013/03/22 19:08:27
Can this be done with 1 vst? out continuous, vdup
Johann
2013/03/22 19:19:15
I'm not aware of a vst which takes kStep.
| |
361 "vst1.16 d1[0], [%[out]], %[kStep] \n" | |
362 "vst1.16 d2[0], [%[out]], %[kStep] \n" | |
363 "vst1.16 d3[0], [%[out]], %[kStep] \n" | |
364 "vst1.16 d0[1], [%[out]], %[kStep] \n" | |
365 "vst1.16 d1[1], [%[out]], %[kStep] \n" | |
366 "vst1.16 d2[1], [%[out]], %[kStep] \n" | |
367 "vst1.16 d3[1], [%[out]], %[kStep] \n" | |
368 "vst1.16 d0[2], [%[out]], %[kStep] \n" | |
369 "vst1.16 d1[2], [%[out]], %[kStep] \n" | |
370 "vst1.16 d2[2], [%[out]], %[kStep] \n" | |
371 "vst1.16 d3[2], [%[out]], %[kStep] \n" | |
372 "vst1.16 d0[3], [%[out]], %[kStep] \n" | |
373 "vst1.16 d1[3], [%[out]], %[kStep] \n" | |
374 "vst1.16 d2[3], [%[out]], %[kStep] \n" | |
375 "vst1.16 d3[3], [%[out]], %[kStep] \n" | |
376 | |
377 : [out] "+r"(out) // modified registers | |
378 : [in] "r"(in), [kStep] "r"(kStep) // constants | |
379 : "memory", "q0", "q1", "q2", "q3", "q4", | |
380 "q5", "q6", "q7", "q8", "q9" // clobbered | |
fbarchard
2013/03/22 19:08:27
avoid q4-q7. use q0-q3, q8-q15.
Johann
2013/03/22 19:19:15
Yes that should have been done.
| |
381 ); | |
382 } | |
383 | |
384 #endif // WEBP_USE_NEON | |
385 | |
386 //------------------------------------------------------------------------------ | |
387 // Entry point | |
388 | |
314 extern void VP8DspInitNEON(void); | 389 extern void VP8DspInitNEON(void); |
315 | 390 |
316 void VP8DspInitNEON(void) { | 391 void VP8DspInitNEON(void) { |
392 #if defined(WEBP_USE_NEON) | |
317 VP8Transform = TransformTwoNEON; | 393 VP8Transform = TransformTwoNEON; |
394 VP8TransformWHT = TransformWHT; | |
318 | 395 |
319 VP8SimpleVFilter16 = SimpleVFilter16NEON; | 396 VP8SimpleVFilter16 = SimpleVFilter16NEON; |
320 VP8SimpleHFilter16 = SimpleHFilter16NEON; | 397 VP8SimpleHFilter16 = SimpleHFilter16NEON; |
321 VP8SimpleVFilter16i = SimpleVFilter16iNEON; | 398 VP8SimpleVFilter16i = SimpleVFilter16iNEON; |
322 VP8SimpleHFilter16i = SimpleHFilter16iNEON; | 399 VP8SimpleHFilter16i = SimpleHFilter16iNEON; |
400 #endif // WEBP_USE_NEON | |
323 } | 401 } |
324 | 402 |
325 #if defined(__cplusplus) || defined(c_plusplus) | 403 #if defined(__cplusplus) || defined(c_plusplus) |
326 } // extern "C" | 404 } // extern "C" |
327 #endif | 405 #endif |
328 | |
329 #endif // WEBP_USE_NEON | |
OLD | NEW |