Chromium Code Reviews

Side by Side Diff: third_party/libwebp/dsp/lossless_neon.c

Issue 2651883004: libwebp-0.6.0-rc1 (Closed)
Patch Set: Created 3 years, 11 months ago

// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of methods for lossless decoder
(...skipping 121 matching lines...)
    vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
    vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
    dst += 8 * 3;
  }
  VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst);  // left-overs
}

#endif  // !WORK_AROUND_GCC

//------------------------------------------------------------------------------
// Predictor Transform

#define LOAD_U32_AS_U8(IN) vreinterpret_u8_u32(vdup_n_u32((IN)))
#define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN)))
#define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN)))
#define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN)))
#define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0);
#define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0);
#define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN)));
#define ROTATE32_LEFT(L) vextq_u8((L), (L), 12)  // D|C|B|A -> C|B|A|D
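
// A minimal usage sketch of these reinterpret helpers (illustrative values,
// shown as a comment only): a 32-bit ARGB pixel moves into a byte vector,
// is operated on per channel, and comes back out of lane 0. Note that
// GET_U8_AS_U32 already ends with a semicolon.
//
//   const uint32_t argb = 0xff336699u;
//   const uint8x8_t v = LOAD_U32_AS_U8(argb);  // pixel copied to both halves
//   const uint8x8_t w = vadd_u8(v, v);         // per-channel add, mod 256
//   const uint32_t back = GET_U8_AS_U32(w)     // lane 0 as a packed pixel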

static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) {
  const uint8x8_t A0 = LOAD_U32_AS_U8(a0);
  const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
  return vhadd_u8(A0, A1);
}

static WEBP_INLINE uint32_t ClampedAddSubtractHalf_NEON(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const uint8x8_t avg = Average2_u8_NEON(c0, c1);
  // Subtract one from c2 in each lane where it is bigger than avg.
  const uint8x8_t C2 = LOAD_U32_AS_U8(c2);
  const uint8x8_t cmp = vcgt_u8(C2, avg);
  const uint8x8_t C2_1 = vadd_u8(C2, cmp);  // cmp is all-ones (-1) per lane
  // Compute half of the difference between avg and c2.
  const int8x8_t diff_avg = vreinterpret_s8_u8(vhsub_u8(avg, C2_1));
  // Compute the sum with avg and saturate.
  const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(avg));
  const uint8x8_t res = vqmovun_s16(vaddw_s8(avg_16, diff_avg));
  const uint32_t output = GET_U8_AS_U32(res);
  return output;
}
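
// For reference, a plain-C sketch of the per-channel operation this kernel
// vectorizes (names are illustrative, not libwebp API): with
// avg = (c0 + c1) / 2, each output channel is clip(avg + (avg - c2) / 2).
// The (c2 > avg ? c2 - 1 : c2) adjustment above makes the truncating NEON
// halving subtract match this division's rounding toward zero.
static WEBP_INLINE uint8_t Clip255Sketch(int v) {
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
}

static WEBP_INLINE uint32_t ClampedAddSubtractHalfSketch(uint32_t c0,
                                                         uint32_t c1,
                                                         uint32_t c2) {
  uint32_t out = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const int avg =
        (int)((((c0 >> shift) & 0xff) + ((c1 >> shift) & 0xff)) >> 1);
    const int b = (int)((c2 >> shift) & 0xff);
    out |= (uint32_t)Clip255Sketch(avg + (avg - b) / 2) << shift;
  }
  return out;
}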

static WEBP_INLINE uint32_t Average2_NEON(uint32_t a0, uint32_t a1) {
  const uint8x8_t avg_u8x8 = Average2_u8_NEON(a0, a1);
  const uint32_t avg = GET_U8_AS_U32(avg_u8x8);
  return avg;
}

static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
                                          uint32_t a2) {
  const uint8x8_t avg0 = Average2_u8_NEON(a0, a2);
  const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
  const uint32_t avg = GET_U8_AS_U32(vhadd_u8(avg0, A1));
  return avg;
}

static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
  return Average3_NEON(left, top[0], top[1]);
}
static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
  return Average2_NEON(left, top[-1]);
}
static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
  return Average2_NEON(left, top[0]);
}
static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
}

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t res = vaddq_u8(src, black);
    STOREQ_U8_AS_U32P(&out[i], res);
  }
  VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
}

// Predictor1: left.
static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  const uint8x16_t zero = LOADQ_U32_AS_U8(0);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    // 0 | a | b | c
    const uint8x16_t shift0 = vextq_u8(zero, src, 12);
    // a | a + b | b + c | c + d
    const uint8x16_t sum0 = vaddq_u8(src, shift0);
    // 0 | 0 | a | a + b
    const uint8x16_t shift1 = vextq_u8(zero, sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const uint8x16_t sum1 = vaddq_u8(sum0, shift1);
    const uint8x16_t prev = LOADQ_U32_AS_U8(out[i - 1]);
    const uint8x16_t res = vaddq_u8(sum1, prev);
    STOREQ_U8_AS_U32P(&out[i], res);
  }
  VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
}
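
// The two vextq/vaddq pairs above form a log-time inclusive prefix sum:
// after the first pair each pixel holds itself plus its left neighbour,
// after the second pair the sum of up to four pixels. Adding the broadcast
// out[i - 1] then yields all four results of the serial recurrence
// out[k] = in[k] + out[k - 1] (per byte, mod 256) at once. A plain-C sketch
// of that recurrence (illustrative, not the VP8LPredictorsAdd_C fallback
// itself; like the kernel, it assumes out[-1] is the already-reconstructed
// left pixel):
static void PredictorAdd1Sketch(const uint32_t* in, int num_pixels,
                                uint32_t* out) {
  int i, shift;
  for (i = 0; i < num_pixels; ++i) {
    uint32_t res = 0;
    for (shift = 0; shift < 32; shift += 8) {
      const uint32_t sum = ((in[i] >> shift) & 0xff) +
                           ((out[i - 1] >> shift) & 0xff);
      res |= (sum & 0xff) << shift;  // each channel wraps mod 256
    }
    out[i] = res;
  }
}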

// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8-bit channel.
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorAdd##X##_NEON(const uint32_t* in, \
                                   const uint32_t* upper, int num_pixels, \
                                   uint32_t* out) { \
  int i; \
  for (i = 0; i + 4 <= num_pixels; i += 4) { \
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
    const uint8x16_t other = LOADQ_U32P_AS_U8(&(IN)); \
    const uint8x16_t res = vaddq_u8(src, other); \
    STOREQ_U8_AS_U32P(&out[i], res); \
  } \
  VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
}
// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1
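
// Pixel by pixel, GENERATE_PREDICTOR_1(2, upper[i]) computes
// out[i] = in[i] (+) upper[i], where (+) adds each 8-bit channel mod 256.
// A scalar SWAR sketch of that pixel addition (illustrative name; libwebp's
// own helper for this is VP8LAddPixels):
static WEBP_INLINE uint32_t AddPixelsSketch(uint32_t a, uint32_t b) {
  // Add alpha/green and red/blue in separate fields so a carry out of one
  // channel cannot spill into the next, then mask the carries away.
  const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u);
  const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);
  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}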

// Predictor5: average(average(left, TR), T)
#define DO_PRED5(LANE) do { \
  const uint8x16_t avgLTR = vhaddq_u8(L, TR); \
  const uint8x16_t avg = vhaddq_u8(avgLTR, T); \
  const uint8x16_t res = vaddq_u8(avg, src); \
  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
  L = ROTATE32_LEFT(res); \
} while (0)

static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i + 0]);
    const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
    DO_PRED5(0);
    DO_PRED5(1);
    DO_PRED5(2);
    DO_PRED5(3);
  }
  VP8LPredictorsAdd_C[5](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED5
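
// Note on the DO_PRED5 macro above and the DO_PRED67/DO_PRED10/DO_PRED11
// macros below: each reconstructed pixel is the left neighbour of the next
// one, so the four lanes of a block cannot be computed independently. Each
// DO_PRED step therefore stores a single lane and rotates the freshly
// computed pixel into L with ROTATE32_LEFT, so that the next step reads the
// correct left pixel.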

#define DO_PRED67(LANE) do { \
  const uint8x16_t avg = vhaddq_u8(L, top); \
  const uint8x16_t res = vaddq_u8(avg, src); \
  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
  L = ROTATE32_LEFT(res); \
} while (0)

// Predictor6: average(left, TL)
static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i - 1]);
    DO_PRED67(0);
    DO_PRED67(1);
    DO_PRED67(2);
    DO_PRED67(3);
  }
  VP8LPredictorsAdd_C[6](in + i, upper + i, num_pixels - i, out + i);
}

// Predictor7: average(left, T)
static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i]);
    DO_PRED67(0);
    DO_PRED67(1);
    DO_PRED67(2);
    DO_PRED67(3);
  }
  VP8LPredictorsAdd_C[7](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED67

#define GENERATE_PREDICTOR_2(X, IN) \
static void PredictorAdd##X##_NEON(const uint32_t* in, \
                                   const uint32_t* upper, int num_pixels, \
                                   uint32_t* out) { \
  int i; \
  for (i = 0; i + 4 <= num_pixels; i += 4) { \
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
    const uint8x16_t Tother = LOADQ_U32P_AS_U8(&(IN)); \
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); \
    const uint8x16_t avg = vhaddq_u8(T, Tother); \
    const uint8x16_t res = vaddq_u8(avg, src); \
    STOREQ_U8_AS_U32P(&out[i], res); \
  } \
  VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
}
// Predictor8: average(TL, T).
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average(T, TR).
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2
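
// GENERATE_PREDICTOR_2 predicts each pixel as the per-channel rounding-down
// average of its two top neighbours (what vhaddq_u8 computes), added to the
// residual mod 256. A scalar sketch of one pixel (illustrative name):
static WEBP_INLINE uint32_t AvgPredAddSketch(uint32_t in_pix, uint32_t t0,
                                             uint32_t t1) {
  uint32_t out = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const uint32_t avg =
        (((t0 >> shift) & 0xff) + ((t1 >> shift) & 0xff)) >> 1;
    const uint32_t sum = ((in_pix >> shift) & 0xff) + avg;
    out |= (sum & 0xff) << shift;
  }
  return out;
}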

// Predictor10: average of (average of (L, TL), average of (T, TR)).
#define DO_PRED10(LANE) do { \
  const uint8x16_t avgLTL = vhaddq_u8(L, TL); \
  const uint8x16_t avg = vhaddq_u8(avgTTR, avgLTL); \
  const uint8x16_t res = vaddq_u8(avg, src); \
  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
  L = ROTATE32_LEFT(res); \
} while (0)

static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
    const uint8x16_t avgTTR = vhaddq_u8(T, TR);
    DO_PRED10(0);
    DO_PRED10(1);
    DO_PRED10(2);
    DO_PRED10(3);
  }
  VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED10

// Predictor11: select.
#define DO_PRED11(LANE) do { \
  const uint8x16_t sumLin = vaddq_u8(L, src); /* in + L */ \
  const uint8x16_t pLTL = vabdq_u8(L, TL); /* |L - TL| */ \
  const uint16x8_t sum_LTL = vpaddlq_u8(pLTL); \
  const uint32x4_t pa = vpaddlq_u16(sum_LTL); \
  const uint32x4_t mask = vcleq_u32(pa, pb); \
  const uint8x16_t res = vbslq_u8(vreinterpretq_u8_u32(mask), sumTin, sumLin); \
  vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
  L = ROTATE32_LEFT(res); \
} while (0)

static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    const uint8x16_t pTTL = vabdq_u8(T, TL);  // |T - TL|
    const uint16x8_t sum_TTL = vpaddlq_u8(pTTL);
    const uint32x4_t pb = vpaddlq_u16(sum_TTL);
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t sumTin = vaddq_u8(T, src);  // in + T
    DO_PRED11(0);
    DO_PRED11(1);
    DO_PRED11(2);
    DO_PRED11(3);
  }
  VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED11
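
// DO_PRED11 above vectorizes the 'select' predictor: per pixel, sum the
// per-channel distances |L - TL| (pa, via two pairwise add-long steps) and
// |T - TL| (pb, hoisted out of the macro), pick T when pa <= pb and L
// otherwise, then add the residual mod 256. A scalar sketch of the
// selection (illustrative name, not libwebp API):
static WEBP_INLINE uint32_t Select11Sketch(uint32_t L, uint32_t T,
                                           uint32_t TL) {
  int pa = 0, pb = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const int l = (int)((L >> shift) & 0xff);
    const int t = (int)((T >> shift) & 0xff);
    const int tl = (int)((TL >> shift) & 0xff);
    pa += (l > tl) ? l - tl : tl - l;
    pb += (t > tl) ? t - tl : tl - t;
  }
  return (pa <= pb) ? T : L;  // the caller adds this prediction to in[i]
}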

// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE) do { \
  const uint8x8_t pred = \
      vqmovun_s16(vaddq_s16(vreinterpretq_s16_u16(L), (DIFF))); \
  const uint8x8_t res = \
      vadd_u8(pred, (LANE <= 1) ? vget_low_u8(src) : vget_high_u8(src)); \
  const uint16x8_t res16 = vmovl_u8(res); \
  vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \
  /* rotate in the left predictor for next iteration */ \
  L = vextq_u16(res16, res16, 4); \
} while (0)

static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // load four pixels of source
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    // precompute the difference T - TL once for all, stored as s16
    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    const int16x8_t diff_lo =
        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), vget_low_u8(TL)));
    const int16x8_t diff_hi =
        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), vget_high_u8(TL)));
    // loop over the four reconstructed pixels
    DO_PRED12(diff_lo, 0);
    DO_PRED12(diff_lo, 1);
    DO_PRED12(diff_hi, 2);
    DO_PRED12(diff_hi, 3);
  }
  VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED12
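
// DO_PRED12 above vectorizes the ClampedAddSubtractFull predictor: per
// channel, predict clip(L + T - TL) to [0, 255], then add the residual
// in[i] mod 256. Keeping L widened to 16-bit lanes lets the intermediate go
// negative or above 255 before vqmovun_s16 saturates. A per-pixel scalar
// sketch of the prediction (illustrative; reuses Clip255Sketch from above):
static WEBP_INLINE uint32_t ClampedAddSubtractFullSketch(uint32_t L,
                                                         uint32_t T,
                                                         uint32_t TL) {
  uint32_t out = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const int v = (int)((L >> shift) & 0xff) + (int)((T >> shift) & 0xff) -
                  (int)((TL >> shift) & 0xff);
    out |= (uint32_t)Clip255Sketch(v) << shift;
  }
  return out;
}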

// Predictor13: ClampedAddSubtractHalf
#define DO_PRED13(LANE, LOW_OR_HI) do { \
  const uint8x16_t avg = vhaddq_u8(L, T); \
  const uint8x16_t cmp = vcgtq_u8(TL, avg); \
  const uint8x16_t TL_1 = vaddq_u8(TL, cmp); \
  /* Compute half of the difference between avg and TL'. */ \
  const int8x8_t diff_avg = \
      vreinterpret_s8_u8(LOW_OR_HI(vhsubq_u8(avg, TL_1))); \
  /* Compute the sum with avg and saturate. */ \
  const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(LOW_OR_HI(avg))); \
  const uint8x8_t delta = vqmovun_s16(vaddw_s8(avg_16, diff_avg)); \
  const uint8x8_t res = vadd_u8(LOW_OR_HI(src), delta); \
  const uint8x16_t res2 = vcombine_u8(res, res); \
  vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \
  L = ROTATE32_LEFT(res2); \
} while (0)

static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
    const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
    const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
    DO_PRED13(0, vget_low_u8);
    DO_PRED13(1, vget_low_u8);
    DO_PRED13(2, vget_high_u8);
    DO_PRED13(3, vget_high_u8);
  }
  VP8LPredictorsAdd_C[13](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED13

#undef LOAD_U32_AS_U8
#undef LOAD_U32P_AS_U8
#undef LOADQ_U32_AS_U8
#undef LOADQ_U32P_AS_U8
#undef GET_U8_AS_U32
#undef GETQ_U8_AS_U32
#undef STOREQ_U8_AS_U32P
#undef ROTATE32_LEFT

//------------------------------------------------------------------------------
// Subtract-Green Transform

// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
#define USE_VTBLQ
#endif

(...skipping 12 matching lines...)
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };

static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
                                             const uint8x8_t shuffle) {
  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                     vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif  // USE_VTBLQ

static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
                                 uint32_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~3);
#ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
  for (; src < end; src += 4, dst += 4) {
    const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst);
}
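
// Per pixel, the shuffle builds a (0 | g | 0 | g) vector so a single
// vaddq_u8 adds green to both red and blue, mod 256. A scalar sketch of the
// same step (illustrative name; the C fallback is
// VP8LAddGreenToBlueAndRed_C):
static WEBP_INLINE uint32_t AddGreenSketch(uint32_t argb) {
  const uint32_t green = (argb >> 8) & 0xff;
  uint32_t red_blue = argb & 0x00ff00ffu;
  red_blue += (green << 16) | green;  // add green to red and to blue
  red_blue &= 0x00ff00ffu;            // drop the inter-channel carries
  return (argb & 0xff00ff00u) | red_blue;
}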

//------------------------------------------------------------------------------
// Color Transform

static void TransformColorInverse(const VP8LMultipliers* const m,
                                  const uint32_t* const src, int num_pixels,
                                  uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
  const int16_t rb[8] = {
    CST(green_to_blue_), CST(green_to_red_),
    CST(green_to_blue_), CST(green_to_red_),
    CST(green_to_blue_), CST(green_to_red_),
    CST(green_to_blue_), CST(green_to_red_)
  };
  const int16x8_t mults_rb = vld1q_s16(rb);
  const int16_t b2[8] = {
    0, CST(red_to_blue_), 0, CST(red_to_blue_),
    0, CST(red_to_blue_), 0, CST(red_to_blue_),
  };
  const int16x8_t mults_b2 = vld1q_s16(b2);
#undef CST
#ifdef USE_VTBLQ
  static const uint8_t kg0g0[16] = {
    255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
  };
  const uint8x16_t shuffle = vld1q_u8(kg0g0);
#else
  static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
  const uint8x8_t shuffle = vld1_u8(k0g0g);
#endif
  const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
    const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
    // 0 g 0 g
    const uint8x16_t greens = DoGreenShuffle(in, shuffle);
    // x dr x db1
    const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
    // x r' x b'
    const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in),
                                 vreinterpretq_s8_s16(A));
    // r' 0 b' 0
    const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8);
    // x db2 0 0
    const int16x8_t D = vqdmulhq_s16(C, mults_b2);
    // 0 x db2 0
    const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8);
    // r' x b'' 0
    const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E),
                                 vreinterpretq_s8_s16(C));
    // 0 r' 0 b''
    const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
    const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
    vst1q_u32(dst + i, out);
  }
  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
}
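
// The CST() constants above hold the signed 8-bit multipliers pre-shifted
// so that vqdmulhq_s16, which returns (a * b * 2) >> 16, reproduces the
// reference delta (multiplier * green) >> 5. A per-pixel scalar sketch of
// the whole inverse transform (illustrative names; the C fallback is
// VP8LTransformColorInverse_C):
static WEBP_INLINE int ColorTransformDeltaSketch(int8_t color_pred,
                                                 int8_t color) {
  return ((int)color_pred * (int)color) >> 5;
}

static WEBP_INLINE uint32_t TransformColorInverseSketch(
    const VP8LMultipliers* const m, uint32_t argb) {
  const int8_t green = (int8_t)(argb >> 8);
  int new_red = (int)((argb >> 16) & 0xff);
  int new_blue = (int)(argb & 0xff);
  new_red += ColorTransformDeltaSketch((int8_t)m->green_to_red_, green);
  new_red &= 0xff;
  new_blue += ColorTransformDeltaSketch((int8_t)m->green_to_blue_, green);
  new_blue += ColorTransformDeltaSketch((int8_t)m->red_to_blue_,
                                        (int8_t)new_red);
  new_blue &= 0xff;
  return (argb & 0xff00ff00u) | ((uint32_t)new_red << 16) |
         (uint32_t)new_blue;
}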

#undef USE_VTBLQ

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
  VP8LPredictors[5] = Predictor5_NEON;
  VP8LPredictors[6] = Predictor6_NEON;
  VP8LPredictors[7] = Predictor7_NEON;
  VP8LPredictors[13] = Predictor13_NEON;

  VP8LPredictorsAdd[0] = PredictorAdd0_NEON;
  VP8LPredictorsAdd[1] = PredictorAdd1_NEON;
  VP8LPredictorsAdd[2] = PredictorAdd2_NEON;
  VP8LPredictorsAdd[3] = PredictorAdd3_NEON;
  VP8LPredictorsAdd[4] = PredictorAdd4_NEON;
  VP8LPredictorsAdd[5] = PredictorAdd5_NEON;
  VP8LPredictorsAdd[6] = PredictorAdd6_NEON;
  VP8LPredictorsAdd[7] = PredictorAdd7_NEON;
  VP8LPredictorsAdd[8] = PredictorAdd8_NEON;
  VP8LPredictorsAdd[9] = PredictorAdd9_NEON;
  VP8LPredictorsAdd[10] = PredictorAdd10_NEON;
  VP8LPredictorsAdd[11] = PredictorAdd11_NEON;
  VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
  VP8LPredictorsAdd[13] = PredictorAdd13_NEON;

  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
  VP8LTransformColorInverse = TransformColorInverse;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8LDspInitNEON)

#endif  // WEBP_USE_NEON
