Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(73)

Side by Side Diff: third_party/libwebp/dsp/dec_neon.c

Issue 10832153: libwebp: update snapshot to v0.2.0-rc1 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2011 Google Inc. 1 // Copyright 2012 Google Inc. All Rights Reserved.
2 // 2 //
3 // This code is licensed under the same terms as WebM: 3 // This code is licensed under the same terms as WebM:
4 // Software License Agreement: http://www.webmproject.org/license/software/ 4 // Software License Agreement: http://www.webmproject.org/license/software/
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/
6 // ----------------------------------------------------------------------------- 6 // -----------------------------------------------------------------------------
7 // 7 //
8 // ARM NEON version of dsp functions and loop filtering. 8 // ARM NEON version of dsp functions and loop filtering.
9 // 9 //
10 // Author: somnath@google.com (Somnath Banerjee) 10 // Authors: Somnath Banerjee (somnath@google.com)
11 // Johann Koenig (johannkoenig@google.com)
11 12
12 #if defined(__GNUC__) && defined(__ARM_NEON__) 13 #include "./dsp.h"
14
15 #if defined(WEBP_USE_NEON)
13 16
14 #include "../dec/vp8i.h" 17 #include "../dec/vp8i.h"
15 18
16 #if defined(__cplusplus) || defined(c_plusplus) 19 #if defined(__cplusplus) || defined(c_plusplus)
17 extern "C" { 20 extern "C" {
18 #endif 21 #endif
19 22
20 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \ 23 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \
21 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 24 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
22 25
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
145 } 148 }
146 149
/* Calls SimpleHFilter16NEON three times, advancing p by 4 before each call
 * (so at p + 4, p + 8 and p + 12); stride and thresh are passed through
 * unchanged.
 */
147 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { 150 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
148 int k; 151 int k;
149 for (k = 3; k > 0; --k) { 152 for (k = 3; k > 0; --k) {
150 p += 4; 153 p += 4;
151 SimpleHFilter16NEON(p, stride, thresh); 154 SimpleHFilter16NEON(p, stride, thresh);
152 } 155 }
153 } 156 }
154 157
/* Inverse 4x4 transform of the 16 int16_t coefficients at 'in'. The result is
 * rounded ((val + 4) >> 3), added to the 4x4 block of bytes at 'dst' (rows are
 * BPS bytes apart) and stored back with unsigned saturation.
 * The asm uses q0-q3 and q8-q11; all of them must appear in the clobber list.
 */
158 static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
159 const int kBPS = BPS;
160 const int16_t constants[] = {20091, 17734, 0, 0};
161 /* kC1, kC2. Padded because vld1.16 loads 8 bytes.
162 * Technically these are unsigned but vqdmulh is only available in signed.
163 * vqdmulh returns high half (effectively >> 16) but also doubles the value,
164 * changing the >> 16 to >> 15 and requiring an additional >> 1.
165 * We use this to our advantage with kC2. The canonical value is 35468.
166 * However, the high bit is set so treating it as signed will give incorrect
167 * results. We avoid this by down shifting by 1 here to clear the highest bit.
168 * Combined with the doubling effect of vqdmulh we get >> 16.
169 * This cannot be applied to kC1 because the lowest bit is set. Down shifting
170 * the constant would reduce precision.
171 */
172
173 /* libwebp uses a trick to avoid some extra addition that libvpx does.
174 * Instead of:
175 * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
176 * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
177 * same issue with kC1 and vqdmulh that we work around by down shifting kC2.
178 */
179
180 /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
181 __asm__ volatile (
182 "vld1.16 {q1, q2}, [%[in]] \n"
183 "vld1.16 {d0}, [%[constants]] \n"
184
185 /* d2: in[0]
186 * d3: in[8]
187 * d4: in[4]
188 * d5: in[12]
189 */
190 "vswp d3, d4 \n"
191
192 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
193 * q9 = {in[4], in[12]} * kC2 >> 16
194 */
195 "vqdmulh.s16 q8, q2, d0[0] \n"
196 "vqdmulh.s16 q9, q2, d0[1] \n"
197
198 /* d22 = a = in[0] + in[8]
199 * d23 = b = in[0] - in[8]
200 */
201 "vqadd.s16 d22, d2, d3 \n"
202 "vqsub.s16 d23, d2, d3 \n"
203
204 /* The multiplication should be x * kC1 >> 16
205 * However, with vqdmulh we get x * kC1 * 2 >> 16
206 * (multiply, double, return high half)
207 * We avoided this in kC2 by pre-shifting the constant.
208 * q8 = in[4]/[12] * kC1 >> 16
209 */
210 "vshr.s16 q8, q8, #1 \n"
211
212 /* Add {in[4], in[12]} back after the multiplication. This is handled by
213 * adding 1 << 16 to kC1 in the libwebp C code.
214 */
215 "vqadd.s16 q8, q2, q8 \n"
216
217 /* d20 = c = in[4]*kC2 - in[12]*kC1
218 * d21 = d = in[4]*kC1 + in[12]*kC2
219 */
220 "vqsub.s16 d20, d18, d17 \n"
221 "vqadd.s16 d21, d19, d16 \n"
222
223 /* d2 = tmp[0] = a + d
224 * d3 = tmp[1] = b + c
225 * d4 = tmp[2] = b - c
226 * d5 = tmp[3] = a - d
227 */
228 "vqadd.s16 d2, d22, d21 \n"
229 "vqadd.s16 d3, d23, d20 \n"
230 "vqsub.s16 d4, d23, d20 \n"
231 "vqsub.s16 d5, d22, d21 \n"
232
233 "vzip.16 q1, q2 \n"
234 "vzip.16 q1, q2 \n"
235
236 "vswp d3, d4 \n"
237
238 /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
239 * q9 = {tmp[4], tmp[12]} * kC2 >> 16
240 */
241 "vqdmulh.s16 q8, q2, d0[0] \n"
242 "vqdmulh.s16 q9, q2, d0[1] \n"
243
244 /* d22 = a = tmp[0] + tmp[8]
245 * d23 = b = tmp[0] - tmp[8]
246 */
247 "vqadd.s16 d22, d2, d3 \n"
248 "vqsub.s16 d23, d2, d3 \n"
249
250 /* See long-winded explanations prior */
251 "vshr.s16 q8, q8, #1 \n"
252 "vqadd.s16 q8, q2, q8 \n"
253
254 /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
255 * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
256 */
257 "vqsub.s16 d20, d18, d17 \n"
258 "vqadd.s16 d21, d19, d16 \n"
259
260 /* d2 = tmp[0] = a + d
261 * d3 = tmp[1] = b + c
262 * d4 = tmp[2] = b - c
263 * d5 = tmp[3] = a - d
264 */
265 "vqadd.s16 d2, d22, d21 \n"
266 "vqadd.s16 d3, d23, d20 \n"
267 "vqsub.s16 d4, d23, d20 \n"
268 "vqsub.s16 d5, d22, d21 \n"
269
270 "vld1.32 d6[0], [%[dst]], %[kBPS] \n"
271 "vld1.32 d6[1], [%[dst]], %[kBPS] \n"
272 "vld1.32 d7[0], [%[dst]], %[kBPS] \n"
273 "vld1.32 d7[1], [%[dst]], %[kBPS] \n"
274
275 "sub %[dst], %[dst], %[kBPS], lsl #2 \n"
276
277 /* (val) + 4 >> 3 */
278 "vrshr.s16 d2, d2, #3 \n"
279 "vrshr.s16 d3, d3, #3 \n"
280 "vrshr.s16 d4, d4, #3 \n"
281 "vrshr.s16 d5, d5, #3 \n"
282
283 "vzip.16 q1, q2 \n"
284 "vzip.16 q1, q2 \n"
285
286 /* Must accumulate before saturating */
287 "vmovl.u8 q8, d6 \n"
288 "vmovl.u8 q9, d7 \n"
289
290 "vqadd.s16 q1, q1, q8 \n"
291 "vqadd.s16 q2, q2, q9 \n"
292
293 "vqmovun.s16 d0, q1 \n"
294 "vqmovun.s16 d1, q2 \n"
295
296 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
297 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
298 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
299 "vst1.32 d1[1], [%[dst]] \n"
300
301 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
302 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
303 : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" /* clobbered */
304 );
305 }
306
/* Applies TransformOneNEON to (in, dst). When do_two is non-zero, it also
 * applies it to the following 16 coefficients (in + 16), writing at dst + 4.
 */
307 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
308 TransformOneNEON(in, dst);
309 if (do_two) {
310 TransformOneNEON(in + 16, dst + 4);
311 }
312 }
313
155 extern void VP8DspInitNEON(void); 314 extern void VP8DspInitNEON(void);
156 315
/* Assigns the NEON routines to the dsp hooks: VP8Transform (new in this patch
 * set) and the four VP8Simple{V,H}Filter16 / VP8Simple{V,H}Filter16i hooks.
 */
157 void VP8DspInitNEON(void) { 316 void VP8DspInitNEON(void) {
317 VP8Transform = TransformTwoNEON;
318
158 VP8SimpleVFilter16 = SimpleVFilter16NEON; 319 VP8SimpleVFilter16 = SimpleVFilter16NEON;
159 VP8SimpleHFilter16 = SimpleHFilter16NEON; 320 VP8SimpleHFilter16 = SimpleHFilter16NEON;
160 VP8SimpleVFilter16i = SimpleVFilter16iNEON; 321 VP8SimpleVFilter16i = SimpleVFilter16iNEON;
161 VP8SimpleHFilter16i = SimpleHFilter16iNEON; 322 VP8SimpleHFilter16i = SimpleHFilter16iNEON;
162 } 323 }
163 324
164 #if defined(__cplusplus) || defined(c_plusplus) 325 #if defined(__cplusplus) || defined(c_plusplus)
165 } // extern "C" 326 } // extern "C"
166 #endif 327 #endif
167 328
168 #endif // __GNUC__ && __ARM_NEON__ 329 #endif // WEBP_USE_NEON
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698