Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(129)

Side by Side Diff: third_party/libwebp/dsp/dec_neon.c

Issue 116213006: Update libwebp to 0.4.0 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: After Blink Roll (created 6 years, 11 months ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/libwebp/dsp/dec.c ('k') | third_party/libwebp/dsp/dec_sse2.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLD | NEW
1 // Copyright 2012 Google Inc. All Rights Reserved. 1 // Copyright 2012 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // ARM NEON version of dsp functions and loop filtering. 10 // ARM NEON version of dsp functions and loop filtering.
11 // 11 //
12 // Authors: Somnath Banerjee (somnath@google.com) 12 // Authors: Somnath Banerjee (somnath@google.com)
13 // Johann Koenig (johannkoenig@google.com) 13 // Johann Koenig (johannkoenig@google.com)
14 14
15 #include "./dsp.h" 15 #include "./dsp.h"
16 16
17 #if defined(__cplusplus) || defined(c_plusplus)
18 extern "C" {
19 #endif
20
21 #if defined(WEBP_USE_NEON) 17 #if defined(WEBP_USE_NEON)
22 18
23 #include "../dec/vp8i.h" 19 #include "../dec/vp8i.h"
24 20
25 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \ 21 #define QRegs "q0", "q1", "q2", "q3", \
26 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 22 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
27 23
28 #define FLIP_SIGN_BIT2(a, b, s) \ 24 #define FLIP_SIGN_BIT2(a, b, s) \
29 "veor " #a "," #a "," #s " \n" \ 25 "veor " #a "," #a "," #s " \n" \
30 "veor " #b "," #b "," #s " \n" \ 26 "veor " #b "," #b "," #s " \n" \
31 27
32 #define FLIP_SIGN_BIT4(a, b, c, d, s) \ 28 #define FLIP_SIGN_BIT4(a, b, c, d, s) \
33 FLIP_SIGN_BIT2(a, b, s) \ 29 FLIP_SIGN_BIT2(a, b, s) \
34 FLIP_SIGN_BIT2(c, d, s) \ 30 FLIP_SIGN_BIT2(c, d, s) \
35 31
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after
94 //----------------------------------------------------------------------------- 90 //-----------------------------------------------------------------------------
95 // Simple In-loop filtering (Paragraph 15.2) 91 // Simple In-loop filtering (Paragraph 15.2)
96 92
97 static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) { 93 static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
98 __asm__ volatile ( 94 __asm__ volatile (
99 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride 95 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
100 96
101 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1 97 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
102 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0 98 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
103 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0 99 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
104 "vld1.u8 {q4}, [%[p]] \n" // q1 100 "vld1.u8 {q12}, [%[p]] \n" // q1
105 101
106 DO_FILTER2(q1, q2, q3, q4, %[thresh]) 102 DO_FILTER2(q1, q2, q3, q12, %[thresh])
107 103
108 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride 104 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
109 105
110 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0 106 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
111 "vst1.u8 {q3}, [%[p]] \n" // store oq0 107 "vst1.u8 {q3}, [%[p]] \n" // store oq0
112 : [p] "+r"(p) 108 : [p] "+r"(p)
113 : [stride] "r"(stride), [thresh] "r"(thresh) 109 : [stride] "r"(stride), [thresh] "r"(thresh)
114 : "memory", QRegs 110 : "memory", QRegs
115 ); 111 );
116 } 112 }
117 113
118 static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) { 114 static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
119 __asm__ volatile ( 115 __asm__ volatile (
120 "sub r4, %[p], #2 \n" // base1 = p - 2 116 "sub r4, %[p], #2 \n" // base1 = p - 2
121 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride 117 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride
122 "add r5, r4, %[stride] \n" // base2 = base1 + stride 118 "add r5, r4, %[stride] \n" // base2 = base1 + stride
123 119
124 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6) 120 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
125 LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6) 121 LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
126 "vswp d3, d6 \n" // p1:q1 p0:q3 122 "vswp d3, d24 \n" // p1:q1 p0:q3
127 "vswp d5, d8 \n" // q0:q2 q1:q4 123 "vswp d5, d26 \n" // q0:q2 q1:q4
128 "vswp q2, q3 \n" // p1:q1 p0:q2 q0:q3 q1:q4 124 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
129 125
130 DO_FILTER2(q1, q2, q3, q4, %[thresh]) 126 DO_FILTER2(q1, q2, q12, q13, %[thresh])
131 127
132 "sub %[p], %[p], #1 \n" // p - 1 128 "sub %[p], %[p], #1 \n" // p - 1
133 129
134 "vswp d5, d6 \n" 130 "vswp d5, d24 \n"
135 STORE8x2(d4, d5, [%[p]], %[stride]) 131 STORE8x2(d4, d5, [%[p]], %[stride])
136 STORE8x2(d6, d7, [%[p]], %[stride]) 132 STORE8x2(d24, d25, [%[p]], %[stride])
137 133
138 : [p] "+r"(p) 134 : [p] "+r"(p)
139 : [stride] "r"(stride), [thresh] "r"(thresh) 135 : [stride] "r"(stride), [thresh] "r"(thresh)
140 : "memory", "r4", "r5", "r6", QRegs 136 : "memory", "r4", "r5", "r6", QRegs
141 ); 137 );
142 } 138 }
143 139
144 static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) { 140 static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
145 int k; 141 int k;
146 for (k = 3; k > 0; --k) { 142 for (k = 3; k > 0; --k) {
147 p += 4 * stride; 143 p += 4 * stride;
148 SimpleVFilter16NEON(p, stride, thresh); 144 SimpleVFilter16NEON(p, stride, thresh);
149 } 145 }
150 } 146 }
151 147
152 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { 148 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
153 int k; 149 int k;
154 for (k = 3; k > 0; --k) { 150 for (k = 3; k > 0; --k) {
155 p += 4; 151 p += 4;
156 SimpleHFilter16NEON(p, stride, thresh); 152 SimpleHFilter16NEON(p, stride, thresh);
157 } 153 }
158 } 154 }
159 155
160 //----------------------------------------------------------------------------- 156 //-----------------------------------------------------------------------------
161 // Inverse transforms (Paragraph 14.4) 157 // Inverse transforms (Paragraph 14.4)
162 158
163 static void TransformOneNEON(const int16_t *in, uint8_t *dst) { 159 static void TransformOne(const int16_t* in, uint8_t* dst) {
164 const int kBPS = BPS; 160 const int kBPS = BPS;
165 const int16_t constants[] = {20091, 17734, 0, 0}; 161 const int16_t constants[] = {20091, 17734, 0, 0};
166 /* kC1, kC2. Padded because vld1.16 loads 8 bytes 162 /* kC1, kC2. Padded because vld1.16 loads 8 bytes
167 * Technically these are unsigned but vqdmulh is only available in signed. 163 * Technically these are unsigned but vqdmulh is only available in signed.
168 * vqdmulh returns high half (effectively >> 16) but also doubles the value, 164 * vqdmulh returns high half (effectively >> 16) but also doubles the value,
169 * changing the >> 16 to >> 15 and requiring an additional >> 1. 165 * changing the >> 16 to >> 15 and requiring an additional >> 1.
170 * We use this to our advantage with kC2. The canonical value is 35468. 166 * We use this to our advantage with kC2. The canonical value is 35468.
171 * However, the high bit is set so treating it as signed will give incorrect 167 * However, the high bit is set so treating it as signed will give incorrect
172 * results. We avoid this by down shifting by 1 here to clear the highest bit. 168 * results. We avoid this by down shifting by 1 here to clear the highest bit.
173 * Combined with the doubling effect of vqdmulh we get >> 16. 169 * Combined with the doubling effect of vqdmulh we get >> 16.
(...skipping 128 matching lines...) Expand 10 before | Expand all | Expand 10 after
302 "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 298 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
303 "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 299 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
304 "vst1.32 d1[1], [%[dst]] \n" 300 "vst1.32 d1[1], [%[dst]] \n"
305 301
306 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ 302 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
307 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */ 303 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
308 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */ 304 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */
309 ); 305 );
310 } 306 }
311 307
312 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { 308 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
313 TransformOneNEON(in, dst); 309 TransformOne(in, dst);
314 if (do_two) { 310 if (do_two) {
315 TransformOneNEON(in + 16, dst + 4); 311 TransformOne(in + 16, dst + 4);
316 } 312 }
317 } 313 }
318 314
315 static void TransformDC(const int16_t* in, uint8_t* dst) {
316 const int DC = (in[0] + 4) >> 3;
317 const int kBPS = BPS;
318 __asm__ volatile (
319 "vdup.16 q1, %[DC] \n"
320
321 "vld1.32 d0[0], [%[dst]], %[kBPS] \n"
322 "vld1.32 d1[0], [%[dst]], %[kBPS] \n"
323 "vld1.32 d0[1], [%[dst]], %[kBPS] \n"
324 "vld1.32 d1[1], [%[dst]], %[kBPS] \n"
325
326 "sub %[dst], %[dst], %[kBPS], lsl #2 \n"
327
328 // add DC and convert to s16.
329 "vaddw.u8 q2, q1, d0 \n"
330 "vaddw.u8 q3, q1, d1 \n"
331 // convert back to u8 with saturation
332 "vqmovun.s16 d0, q2 \n"
333 "vqmovun.s16 d1, q3 \n"
334
335 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
336 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
337 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
338 "vst1.32 d1[1], [%[dst]] \n"
339 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
340 : [kBPS] "r"(kBPS), /* constants */
341 [DC] "r"(DC)
342 : "memory", "q0", "q1", "q2", "q3" /* clobbered */
343 );
344 }
345
319 static void TransformWHT(const int16_t* in, int16_t* out) { 346 static void TransformWHT(const int16_t* in, int16_t* out) {
320 const int kStep = 32; // The store is only incrementing the pointer as if we 347 const int kStep = 32; // The store is only incrementing the pointer as if we
321 // had stored a single byte. 348 // had stored a single byte.
322 __asm__ volatile ( 349 __asm__ volatile (
323 // part 1 350 // part 1
324 // load data into q0, q1 351 // load data into q0, q1
325 "vld1.16 {q0, q1}, [%[in]] \n" 352 "vld1.16 {q0, q1}, [%[in]] \n"
326 353
327 "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] 354 "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]
328 "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] 355 "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]
329 "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8] 356 "vsubl.s16 q10, d1, d2 \n" // a2 = in[4] - in[8]
330 "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12] 357 "vsubl.s16 q11, d0, d3 \n" // a3 = in[0] - in[12]
331 358
332 "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 359 "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1
333 "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 360 "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1
334 "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2 361 "vadd.s32 q1, q11, q10 \n" // tmp[4] = a3 + a2
335 "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2 362 "vsub.s32 q3, q11, q10 \n" // tmp[12] = a3 - a2
336 363
337 // Transpose 364 // Transpose
338 // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14] 365 // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
339 // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15] 366 // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
340 "vswp d1, d4 \n" // vtrn.64 q0, q2 367 "vswp d1, d4 \n" // vtrn.64 q0, q2
341 "vswp d3, d6 \n" // vtrn.64 q1, q3 368 "vswp d3, d6 \n" // vtrn.64 q1, q3
342 "vtrn.32 q0, q1 \n" 369 "vtrn.32 q0, q1 \n"
343 "vtrn.32 q2, q3 \n" 370 "vtrn.32 q2, q3 \n"
344 371
345 "vmov.s32 q4, #3 \n" // dc = 3 372 "vmov.s32 q10, #3 \n" // dc = 3
346 "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3 373 "vadd.s32 q0, q0, q10 \n" // dc = tmp[0] + 3
347 "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3] 374 "vadd.s32 q12, q0, q3 \n" // a0 = dc + tmp[3]
348 "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2] 375 "vadd.s32 q13, q1, q2 \n" // a1 = tmp[1] + tmp[2]
349 "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] 376 "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]
350 "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] 377 "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]
351 378
352 "vadd.s32 q0, q6, q7 \n" 379 "vadd.s32 q0, q12, q13 \n"
353 "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3 380 "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3
354 "vadd.s32 q1, q9, q8 \n" 381 "vadd.s32 q1, q9, q8 \n"
355 "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 382 "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3
356 "vsub.s32 q2, q6, q7 \n" 383 "vsub.s32 q2, q12, q13 \n"
357 "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 384 "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3
358 "vsub.s32 q3, q9, q8 \n" 385 "vsub.s32 q3, q9, q8 \n"
359 "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 386 "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3
360 387
361 // set the results to output 388 // set the results to output
362 "vst1.16 d0[0], [%[out]], %[kStep] \n" 389 "vst1.16 d0[0], [%[out]], %[kStep] \n"
363 "vst1.16 d1[0], [%[out]], %[kStep] \n" 390 "vst1.16 d1[0], [%[out]], %[kStep] \n"
364 "vst1.16 d2[0], [%[out]], %[kStep] \n" 391 "vst1.16 d2[0], [%[out]], %[kStep] \n"
365 "vst1.16 d3[0], [%[out]], %[kStep] \n" 392 "vst1.16 d3[0], [%[out]], %[kStep] \n"
366 "vst1.16 d0[1], [%[out]], %[kStep] \n" 393 "vst1.16 d0[1], [%[out]], %[kStep] \n"
367 "vst1.16 d1[1], [%[out]], %[kStep] \n" 394 "vst1.16 d1[1], [%[out]], %[kStep] \n"
368 "vst1.16 d2[1], [%[out]], %[kStep] \n" 395 "vst1.16 d2[1], [%[out]], %[kStep] \n"
369 "vst1.16 d3[1], [%[out]], %[kStep] \n" 396 "vst1.16 d3[1], [%[out]], %[kStep] \n"
370 "vst1.16 d0[2], [%[out]], %[kStep] \n" 397 "vst1.16 d0[2], [%[out]], %[kStep] \n"
371 "vst1.16 d1[2], [%[out]], %[kStep] \n" 398 "vst1.16 d1[2], [%[out]], %[kStep] \n"
372 "vst1.16 d2[2], [%[out]], %[kStep] \n" 399 "vst1.16 d2[2], [%[out]], %[kStep] \n"
373 "vst1.16 d3[2], [%[out]], %[kStep] \n" 400 "vst1.16 d3[2], [%[out]], %[kStep] \n"
374 "vst1.16 d0[3], [%[out]], %[kStep] \n" 401 "vst1.16 d0[3], [%[out]], %[kStep] \n"
375 "vst1.16 d1[3], [%[out]], %[kStep] \n" 402 "vst1.16 d1[3], [%[out]], %[kStep] \n"
376 "vst1.16 d2[3], [%[out]], %[kStep] \n" 403 "vst1.16 d2[3], [%[out]], %[kStep] \n"
377 "vst1.16 d3[3], [%[out]], %[kStep] \n" 404 "vst1.16 d3[3], [%[out]], %[kStep] \n"
378 405
379 : [out] "+r"(out) // modified registers 406 : [out] "+r"(out) // modified registers
380 : [in] "r"(in), [kStep] "r"(kStep) // constants 407 : [in] "r"(in), [kStep] "r"(kStep) // constants
381 : "memory", "q0", "q1", "q2", "q3", "q4", 408 : "memory", "q0", "q1", "q2", "q3",
382 "q5", "q6", "q7", "q8", "q9" // clobbered 409 "q8", "q9", "q10", "q11", "q12", "q13" // clobbered
383 ); 410 );
384 } 411 }
385 412
386 #endif // WEBP_USE_NEON 413 #endif // WEBP_USE_NEON
387 414
388 //------------------------------------------------------------------------------ 415 //------------------------------------------------------------------------------
389 // Entry point 416 // Entry point
390 417
391 extern void VP8DspInitNEON(void); 418 extern void VP8DspInitNEON(void);
392 419
393 void VP8DspInitNEON(void) { 420 void VP8DspInitNEON(void) {
394 #if defined(WEBP_USE_NEON) 421 #if defined(WEBP_USE_NEON)
395 VP8Transform = TransformTwoNEON; 422 VP8Transform = TransformTwo;
423 VP8TransformAC3 = TransformOne; // no special code here
424 VP8TransformDC = TransformDC;
396 VP8TransformWHT = TransformWHT; 425 VP8TransformWHT = TransformWHT;
397 426
398 VP8SimpleVFilter16 = SimpleVFilter16NEON; 427 VP8SimpleVFilter16 = SimpleVFilter16NEON;
399 VP8SimpleHFilter16 = SimpleHFilter16NEON; 428 VP8SimpleHFilter16 = SimpleHFilter16NEON;
400 VP8SimpleVFilter16i = SimpleVFilter16iNEON; 429 VP8SimpleVFilter16i = SimpleVFilter16iNEON;
401 VP8SimpleHFilter16i = SimpleHFilter16iNEON; 430 VP8SimpleHFilter16i = SimpleHFilter16iNEON;
402 #endif // WEBP_USE_NEON 431 #endif // WEBP_USE_NEON
403 } 432 }
404 433
405 #if defined(__cplusplus) || defined(c_plusplus)
406 } // extern "C"
407 #endif
OLD | NEW
« no previous file with comments | « third_party/libwebp/dsp/dec.c ('k') | third_party/libwebp/dsp/dec_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698