| OLD | NEW |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. | 1 // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // NEON variant of methods for lossless decoder | 10 // NEON variant of methods for lossless decoder |
| (...skipping 121 matching lines...) |
| 132 vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); | 132 vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); |
| 133 vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); | 133 vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); |
| 134 vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); | 134 vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); |
| 135 dst += 8 * 3; | 135 dst += 8 * 3; |
| 136 } | 136 } |
| 137 VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs | 137 VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs |
| 138 } | 138 } |
| 139 | 139 |
| 140 #endif // !WORK_AROUND_GCC | 140 #endif // !WORK_AROUND_GCC |
| 141 | 141 |
| 142 |
| 143 //------------------------------------------------------------------------------ |
| 144 // Predictor Transform |
| 145 |
| 146 #define LOAD_U32_AS_U8(IN) vreinterpret_u8_u32(vdup_n_u32((IN))) |
| 147 #define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN))) |
| 148 #define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN))) |
| 149 #define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN))) |
| 150 #define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0); |
| 151 #define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0); |
| 152 #define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN))); |
| 153 #define ROTATE32_LEFT(L) vextq_u8((L), (L), 12) // D|C|B|A -> C|B|A|D |
| 154 |
| 155 static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) { |
| 156 const uint8x8_t A0 = LOAD_U32_AS_U8(a0); |
| 157 const uint8x8_t A1 = LOAD_U32_AS_U8(a1); |
| 158 return vhadd_u8(A0, A1); |
| 159 } |
| 160 |
| 161 static WEBP_INLINE uint32_t ClampedAddSubtractHalf_NEON(uint32_t c0, |
| 162 uint32_t c1, |
| 163 uint32_t c2) { |
| 164 const uint8x8_t avg = Average2_u8_NEON(c0, c1); |
| 165 // Subtract one from c2 when it is bigger than avg. |
| 166 const uint8x8_t C2 = LOAD_U32_AS_U8(c2); |
| 167 const uint8x8_t cmp = vcgt_u8(C2, avg); |
| 168 const uint8x8_t C2_1 = vadd_u8(C2, cmp); |
| 169 // Compute half of the difference between avg and c2. |
| 170 const int8x8_t diff_avg = vreinterpret_s8_u8(vhsub_u8(avg, C2_1)); |
| 171 // Compute the sum with avg and saturate. |
| 172 const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(avg)); |
| 173 const uint8x8_t res = vqmovun_s16(vaddw_s8(avg_16, diff_avg)); |
| 174 const uint32_t output = GET_U8_AS_U32(res); |
| 175 return output; |
| 176 } |
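For reference, what this helper computes per 8-bit channel is clamp(avg + (avg - c2) / 2) with avg = (c0 + c1) / 2; the vcgt/vadd pair nudges c2 down by one when c2 > avg so that the flooring vhsub_u8 matches a truncating division. Predictor13 below feeds it (left, top, top-left). A minimal scalar sketch, assuming this mirrors the plain-C fallback (helper names here are illustrative):

```c
#include <stdint.h>

static uint8_t Clip255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Per channel: avg = (c0 + c1) / 2, then clamp(avg + (avg - c2) / 2).
 * Scalar counterpart of ClampedAddSubtractHalf_NEON above. */
static uint8_t ClampedAddSubtractHalfChannel(uint8_t c0, uint8_t c1,
                                             uint8_t c2) {
  const int avg = (c0 + c1) >> 1;
  return Clip255(avg + (avg - c2) / 2);  /* C division truncates toward zero */
}
```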
| 177 |
| 178 static WEBP_INLINE uint32_t Average2_NEON(uint32_t a0, uint32_t a1) { |
| 179 const uint8x8_t avg_u8x8 = Average2_u8_NEON(a0, a1); |
| 180 const uint32_t avg = GET_U8_AS_U32(avg_u8x8); |
| 181 return avg; |
| 182 } |
| 183 |
| 184 static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1, |
| 185 uint32_t a2) { |
| 186 const uint8x8_t avg0 = Average2_u8_NEON(a0, a2); |
| 187 const uint8x8_t A1 = LOAD_U32_AS_U8(a1); |
| 188 const uint32_t avg = GET_U8_AS_U32(vhadd_u8(avg0, A1)); |
| 189 return avg; |
| 190 } |
| 191 |
| 192 static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) { |
| 193 return Average3_NEON(left, top[0], top[1]); |
| 194 } |
| 195 static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) { |
| 196 return Average2_NEON(left, top[-1]); |
| 197 } |
| 198 static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) { |
| 199 return Average2_NEON(left, top[0]); |
| 200 } |
| 201 static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) { |
| 202 return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]); |
| 203 } |
| 204 |
| 205 // Batch versions of those functions. |
| 206 |
| 207 // Predictor0: ARGB_BLACK. |
| 208 static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper, |
| 209 int num_pixels, uint32_t* out) { |
| 210 int i; |
| 211 const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK)); |
| 212 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 213 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 214 const uint8x16_t res = vaddq_u8(src, black); |
| 215 STOREQ_U8_AS_U32P(&out[i], res); |
| 216 } |
| 217 VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i); |
| 218 } |
| 219 |
| 220 // Predictor1: left. |
| 221 static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper, |
| 222 int num_pixels, uint32_t* out) { |
| 223 int i; |
| 224 const uint8x16_t zero = LOADQ_U32_AS_U8(0); |
| 225 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 226 // a | b | c | d |
| 227 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 228 // 0 | a | b | c |
| 229 const uint8x16_t shift0 = vextq_u8(zero, src, 12); |
| 230 // a | a + b | b + c | c + d |
| 231 const uint8x16_t sum0 = vaddq_u8(src, shift0); |
| 232 // 0 | 0 | a | a + b |
| 233 const uint8x16_t shift1 = vextq_u8(zero, sum0, 8); |
| 234 // a | a + b | a + b + c | a + b + c + d |
| 235 const uint8x16_t sum1 = vaddq_u8(sum0, shift1); |
| 236 const uint8x16_t prev = LOADQ_U32_AS_U8(out[i - 1]); |
| 237 const uint8x16_t res = vaddq_u8(sum1, prev); |
| 238 STOREQ_U8_AS_U32P(&out[i], res); |
| 239 } |
| 240 VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i); |
| 241 } |
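The two vextq/vaddq pairs above amount to a two-step inclusive prefix sum (a Hillis-Steele scan) over the four pixels of the block; adding the broadcast out[i - 1] then yields the left predictor for all four lanes at once. A hedged scalar sketch of the scan, on plain integers rather than packed channels (illustrative only):

```c
#include <stdint.h>

/* Two-step inclusive prefix sum over four values, mirroring the shift-by-one
 * and shift-by-two vextq/vaddq steps above. The NEON code does the same thing
 * per 8-bit channel, mod 256. */
static void PrefixSum4(const uint32_t x[4], uint32_t s[4]) {
  uint32_t t[4];
  int k;
  for (k = 0; k < 4; ++k) t[k] = x[k] + (k >= 1 ? x[k - 1] : 0);  /* shift0 */
  for (k = 0; k < 4; ++k) s[k] = t[k] + (k >= 2 ? t[k - 2] : 0);  /* shift1 */
  /* s[k] == x[0] + ... + x[k]; PredictorAdd1 then adds out[i - 1] to each
   * lane, so out[i + k] = out[i - 1] + in[i] + ... + in[i + k] per channel. */
}
```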
| 242 |
| 243 // Macro that adds the 32-bit pixels from 'in' and from IN using |
| 244 // mod-256 arithmetic per 8-bit channel. |
| 245 #define GENERATE_PREDICTOR_1(X, IN) \ |
| 246 static void PredictorAdd##X##_NEON(const uint32_t* in, \ |
| 247 const uint32_t* upper, int num_pixels, \ |
| 248 uint32_t* out) { \ |
| 249 int i; \ |
| 250 for (i = 0; i + 4 <= num_pixels; i += 4) { \ |
| 251 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \ |
| 252 const uint8x16_t other = LOADQ_U32P_AS_U8(&(IN)); \ |
| 253 const uint8x16_t res = vaddq_u8(src, other); \ |
| 254 STOREQ_U8_AS_U32P(&out[i], res); \ |
| 255 } \ |
| 256 VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ |
| 257 } |
| 258 // Predictor2: Top. |
| 259 GENERATE_PREDICTOR_1(2, upper[i]) |
| 260 // Predictor3: Top-right. |
| 261 GENERATE_PREDICTOR_1(3, upper[i + 1]) |
| 262 // Predictor4: Top-left. |
| 263 GENERATE_PREDICTOR_1(4, upper[i - 1]) |
| 264 #undef GENERATE_PREDICTOR_1 |
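The vaddq_u8 in the macro wraps every byte independently. A scalar sketch of that per-channel, mod-256 pixel addition (the helper name is illustrative, not a claim about the upstream C helper):

```c
#include <stdint.h>

/* Add two ARGB pixels channel by channel, mod 256: masking two channels at a
 * time keeps each byte's carry from spilling into its neighbor. */
static uint32_t AddPixelsMod256(uint32_t a, uint32_t b) {
  const uint32_t ag = (a & 0xff00ff00u) + (b & 0xff00ff00u);  /* alpha, green */
  const uint32_t rb = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);  /* red, blue */
  return (ag & 0xff00ff00u) | (rb & 0x00ff00ffu);
}

/* GENERATE_PREDICTOR_1(2, upper[i]) thus computes, pixel by pixel:
 *   out[i] = AddPixelsMod256(in[i], upper[i]);     // Predictor2: top
 * and Predictor3/4 use upper[i + 1] / upper[i - 1] instead. */
```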
| 265 |
| 266 // Predictor5: average(average(left, TR), T) |
| 267 #define DO_PRED5(LANE) do { \ |
| 268 const uint8x16_t avgLTR = vhaddq_u8(L, TR); \ |
| 269 const uint8x16_t avg = vhaddq_u8(avgLTR, T); \ |
| 270 const uint8x16_t res = vaddq_u8(avg, src); \ |
| 271 vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \ |
| 272 L = ROTATE32_LEFT(res); \ |
| 273 } while (0) |
| 274 |
| 275 static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper, |
| 276 int num_pixels, uint32_t* out) { |
| 277 int i; |
| 278 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 279 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 280 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 281 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i + 0]); |
| 282 const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]); |
| 283 DO_PRED5(0); |
| 284 DO_PRED5(1); |
| 285 DO_PRED5(2); |
| 286 DO_PRED5(3); |
| 287 } |
| 288 VP8LPredictorsAdd_C[5](in + i, upper + i, num_pixels - i, out + i); |
| 289 } |
| 290 #undef DO_PRED5 |
| 291 |
| 292 #define DO_PRED67(LANE) do { \ |
| 293 const uint8x16_t avg = vhaddq_u8(L, top); \ |
| 294 const uint8x16_t res = vaddq_u8(avg, src); \ |
| 295 vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \ |
| 296 L = ROTATE32_LEFT(res); \ |
| 297 } while (0) |
| 298 |
| 299 // Predictor6: average(left, TL) |
| 300 static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper, |
| 301 int num_pixels, uint32_t* out) { |
| 302 int i; |
| 303 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 304 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 305 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 306 const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 307 DO_PRED67(0); |
| 308 DO_PRED67(1); |
| 309 DO_PRED67(2); |
| 310 DO_PRED67(3); |
| 311 } |
| 312 VP8LPredictorsAdd_C[6](in + i, upper + i, num_pixels - i, out + i); |
| 313 } |
| 314 |
| 315 // Predictor7: average(left, T) |
| 316 static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper, |
| 317 int num_pixels, uint32_t* out) { |
| 318 int i; |
| 319 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 320 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 321 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 322 const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i]); |
| 323 DO_PRED67(0); |
| 324 DO_PRED67(1); |
| 325 DO_PRED67(2); |
| 326 DO_PRED67(3); |
| 327 } |
| 328 VP8LPredictorsAdd_C[7](in + i, upper + i, num_pixels - i, out + i); |
| 329 } |
| 330 #undef DO_PRED67 |
| 331 |
| 332 #define GENERATE_PREDICTOR_2(X, IN) \ |
| 333 static void PredictorAdd##X##_NEON(const uint32_t* in, \ |
| 334 const uint32_t* upper, int num_pixels, \ |
| 335 uint32_t* out) { \ |
| 336 int i; \ |
| 337 for (i = 0; i + 4 <= num_pixels; i += 4) { \ |
| 338 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \ |
| 339 const uint8x16_t Tother = LOADQ_U32P_AS_U8(&(IN)); \ |
| 340 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); \ |
| 341 const uint8x16_t avg = vhaddq_u8(T, Tother); \ |
| 342 const uint8x16_t res = vaddq_u8(avg, src); \ |
| 343 STOREQ_U8_AS_U32P(&out[i], res); \ |
| 344 } \ |
| 345 VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ |
| 346 } |
| 347 // Predictor8: average(TL, T). |
| 348 GENERATE_PREDICTOR_2(8, upper[i - 1]) |
| 349 // Predictor9: average(T, TR). |
| 350 GENERATE_PREDICTOR_2(9, upper[i + 1]) |
| 351 #undef GENERATE_PREDICTOR_2 |
| 352 |
| 353 // Predictor10: average of (average of (L,TL), average of (T, TR)). |
| 354 #define DO_PRED10(LANE) do { \ |
| 355 const uint8x16_t avgLTL = vhaddq_u8(L, TL); \ |
| 356 const uint8x16_t avg = vhaddq_u8(avgTTR, avgLTL); \ |
| 357 const uint8x16_t res = vaddq_u8(avg, src); \ |
| 358 vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \ |
| 359 L = ROTATE32_LEFT(res); \ |
| 360 } while (0) |
| 361 |
| 362 static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper, |
| 363 int num_pixels, uint32_t* out) { |
| 364 int i; |
| 365 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 366 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 367 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 368 const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 369 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); |
| 370 const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]); |
| 371 const uint8x16_t avgTTR = vhaddq_u8(T, TR); |
| 372 DO_PRED10(0); |
| 373 DO_PRED10(1); |
| 374 DO_PRED10(2); |
| 375 DO_PRED10(3); |
| 376 } |
| 377 VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i); |
| 378 } |
| 379 #undef DO_PRED10 |
| 380 |
| 381 // Predictor11: select. |
| 382 #define DO_PRED11(LANE) do { \ |
| 383 const uint8x16_t sumLin = vaddq_u8(L, src); /* in + L */ \ |
| 384 const uint8x16_t pLTL = vabdq_u8(L, TL); /* |L - TL| */ \ |
| 385 const uint16x8_t sum_LTL = vpaddlq_u8(pLTL); \ |
| 386 const uint32x4_t pa = vpaddlq_u16(sum_LTL); \ |
| 387 const uint32x4_t mask = vcleq_u32(pa, pb); \ |
| 388 const uint8x16_t res = vbslq_u8(vreinterpretq_u8_u32(mask), sumTin, sumLin); \ |
| 389 vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \ |
| 390 L = ROTATE32_LEFT(res); \ |
| 391 } while (0) |
| 392 |
| 393 static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper, |
| 394 int num_pixels, uint32_t* out) { |
| 395 int i; |
| 396 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 397 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 398 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); |
| 399 const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 400 const uint8x16_t pTTL = vabdq_u8(T, TL); // |T - TL| |
| 401 const uint16x8_t sum_TTL = vpaddlq_u8(pTTL); |
| 402 const uint32x4_t pb = vpaddlq_u16(sum_TTL); |
| 403 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 404 const uint8x16_t sumTin = vaddq_u8(T, src); // in + T |
| 405 DO_PRED11(0); |
| 406 DO_PRED11(1); |
| 407 DO_PRED11(2); |
| 408 DO_PRED11(3); |
| 409 } |
| 410 VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i); |
| 411 } |
| 412 #undef DO_PRED11 |
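Predictor11 ("select") keeps whichever neighbor tracks the pixel above-left more closely: pa accumulates the per-channel |L - TL|, pb the per-channel |T - TL|, and the vbslq mask picks T + in when pa <= pb, L + in otherwise. A scalar sketch of the selection, read directly from the code above (names illustrative):

```c
#include <stdint.h>
#include <stdlib.h>

/* Sum of per-channel absolute differences between two ARGB pixels. */
static int SumAbsDiff(uint32_t a, uint32_t b) {
  int sum = 0, shift;
  for (shift = 0; shift < 32; shift += 8) {
    sum += abs((int)((a >> shift) & 0xff) - (int)((b >> shift) & 0xff));
  }
  return sum;
}

/* Select predictor: use the top pixel when the left pixel is the closer match
 * to the top-left one, otherwise use the left pixel. */
static uint32_t Select(uint32_t L, uint32_t T, uint32_t TL) {
  const int pa = SumAbsDiff(L, TL);
  const int pb = SumAbsDiff(T, TL);
  return (pa <= pb) ? T : L;
}
```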
| 413 |
| 414 // Predictor12: ClampedAddSubtractFull. |
| 415 #define DO_PRED12(DIFF, LANE) do { \ |
| 416 const uint8x8_t pred = \ |
| 417 vqmovun_s16(vaddq_s16(vreinterpretq_s16_u16(L), (DIFF))); \ |
| 418 const uint8x8_t res = \ |
| 419 vadd_u8(pred, (LANE <= 1) ? vget_low_u8(src) : vget_high_u8(src)); \ |
| 420 const uint16x8_t res16 = vmovl_u8(res); \ |
| 421 vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \ |
| 422 /* rotate in the left predictor for next iteration */ \ |
| 423 L = vextq_u16(res16, res16, 4); \ |
| 424 } while (0) |
| 425 |
| 426 static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper, |
| 427 int num_pixels, uint32_t* out) { |
| 428 int i; |
| 429 uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1])); |
| 430 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 431 // load four pixels of source |
| 432 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 433 // precompute the difference T - TL once for all four pixels, stored as s16 |
| 434 const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 435 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); |
| 436 const int16x8_t diff_lo = |
| 437 vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), vget_low_u8(TL))); |
| 438 const int16x8_t diff_hi = |
| 439 vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), vget_high_u8(TL))); |
| 440 // loop over the four reconstructed pixels |
| 441 DO_PRED12(diff_lo, 0); |
| 442 DO_PRED12(diff_lo, 1); |
| 443 DO_PRED12(diff_hi, 2); |
| 444 DO_PRED12(diff_hi, 3); |
| 445 } |
| 446 VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i); |
| 447 } |
| 448 #undef DO_PRED12 |
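Predictor12 widens everything to 16-bit lanes so that L + (T - TL) can go below 0 or above 255 before vqmovun_s16 saturates it back to a byte: per channel the predictor is clamp(L + T - TL). A minimal scalar sketch:

```c
#include <stdint.h>

static uint8_t Clip255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Per 8-bit channel: the "full" clamped add-subtract predictor used above. */
static uint8_t ClampedAddSubtractFullChannel(uint8_t L, uint8_t T, uint8_t TL) {
  return Clip255((int)L + (int)T - (int)TL);
}
```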
| 449 |
| 450 // Predictor13: ClampedAddSubtractHalf |
| 451 #define DO_PRED13(LANE, LOW_OR_HI) do { \ |
| 452 const uint8x16_t avg = vhaddq_u8(L, T); \ |
| 453 const uint8x16_t cmp = vcgtq_u8(TL, avg); \ |
| 454 const uint8x16_t TL_1 = vaddq_u8(TL, cmp); \ |
| 455 /* Compute half of the difference between avg and TL'. */ \ |
| 456 const int8x8_t diff_avg = \ |
| 457 vreinterpret_s8_u8(LOW_OR_HI(vhsubq_u8(avg, TL_1))); \ |
| 458 /* Compute the sum with avg and saturate. */ \ |
| 459 const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(LOW_OR_HI(avg))); \ |
| 460 const uint8x8_t delta = vqmovun_s16(vaddw_s8(avg_16, diff_avg)); \ |
| 461 const uint8x8_t res = vadd_u8(LOW_OR_HI(src), delta); \ |
| 462 const uint8x16_t res2 = vcombine_u8(res, res); \ |
| 463 vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \ |
| 464 L = ROTATE32_LEFT(res2); \ |
| 465 } while (0) |
| 466 |
| 467 static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper, |
| 468 int num_pixels, uint32_t* out) { |
| 469 int i; |
| 470 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 471 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 472 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 473 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); |
| 474 const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 475 DO_PRED13(0, vget_low_u8); |
| 476 DO_PRED13(1, vget_low_u8); |
| 477 DO_PRED13(2, vget_high_u8); |
| 478 DO_PRED13(3, vget_high_u8); |
| 479 } |
| 480 VP8LPredictorsAdd_C[13](in + i, upper + i, num_pixels - i, out + i); |
| 481 } |
| 482 #undef DO_PRED13 |
| 483 |
| 484 #undef LOAD_U32_AS_U8 |
| 485 #undef LOAD_U32P_AS_U8 |
| 486 #undef LOADQ_U32_AS_U8 |
| 487 #undef LOADQ_U32P_AS_U8 |
| 488 #undef GET_U8_AS_U32 |
| 489 #undef GETQ_U8_AS_U32 |
| 490 #undef STOREQ_U8_AS_U32P |
| 491 #undef ROTATE32_LEFT |
| 492 |
| 142 //------------------------------------------------------------------------------ | 493 //------------------------------------------------------------------------------ |
| 143 // Subtract-Green Transform | 494 // Subtract-Green Transform |
| 144 | 495 |
| 145 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use | 496 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use |
| 146 // non-standard versions there. | 497 // non-standard versions there. |
| 147 #if defined(__APPLE__) && defined(__aarch64__) && \ | 498 #if defined(__APPLE__) && defined(__aarch64__) && \ |
| 148 defined(__apple_build_version__) && (__apple_build_version__ < 6020037) | 499 defined(__apple_build_version__) && (__apple_build_version__ < 6020037) |
| 149 #define USE_VTBLQ | 500 #define USE_VTBLQ |
| 150 #endif | 501 #endif |
| 151 | 502 |
| (...skipping 12 matching lines...) |
| 164 // 255 = byte will be zeroed | 515 // 255 = byte will be zeroed |
| 165 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; | 516 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; |
| 166 | 517 |
| 167 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, | 518 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, |
| 168 const uint8x8_t shuffle) { | 519 const uint8x8_t shuffle) { |
| 169 return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), | 520 return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), |
| 170 vtbl1_u8(vget_high_u8(argb), shuffle)); | 521 vtbl1_u8(vget_high_u8(argb), shuffle)); |
| 171 } | 522 } |
| 172 #endif // USE_VTBLQ | 523 #endif // USE_VTBLQ |
| 173 | 524 |
| 174 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { | 525 static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, |
| 175 const uint32_t* const end = argb_data + (num_pixels & ~3); | 526 uint32_t* dst) { |
| 527 const uint32_t* const end = src + (num_pixels & ~3); |
| 176 #ifdef USE_VTBLQ | 528 #ifdef USE_VTBLQ |
| 177 const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); | 529 const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); |
| 178 #else | 530 #else |
| 179 const uint8x8_t shuffle = vld1_u8(kGreenShuffle); | 531 const uint8x8_t shuffle = vld1_u8(kGreenShuffle); |
| 180 #endif | 532 #endif |
| 181 for (; argb_data < end; argb_data += 4) { | 533 for (; src < end; src += 4, dst += 4) { |
| 182 const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); | 534 const uint8x16_t argb = vld1q_u8((const uint8_t*)src); |
| 183 const uint8x16_t greens = DoGreenShuffle(argb, shuffle); | 535 const uint8x16_t greens = DoGreenShuffle(argb, shuffle); |
| 184 vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); | 536 vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens)); |
| 185 } | 537 } |
| 186 // fallthrough and finish off with plain-C | 538 // fallthrough and finish off with plain-C |
| 187 VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); | 539 VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst); |
| 188 } | 540 } |
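In kGreenShuffle, index 1 (and 5) copies each pixel's green byte into the red and blue byte positions while 255 yields zero, so the single vaddq_u8 adds green back to both channels mod 256 and leaves alpha and green untouched. A per-pixel scalar sketch of this inverse subtract-green step (helper name illustrative):

```c
#include <stdint.h>

/* Add the green channel back to red and blue, mod 256 (ARGB layout). */
static uint32_t AddGreenToBlueAndRedPixel(uint32_t argb) {
  const uint32_t green = (argb >> 8) & 0xff;
  uint32_t red_blue = argb & 0x00ff00ffu;
  red_blue += (green << 16) | green;  /* add green to red (bits 16..23) and blue */
  red_blue &= 0x00ff00ffu;            /* per-channel mod-256 wrap */
  return (argb & 0xff00ff00u) | red_blue;
}
```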
| 189 | 541 |
| 190 //------------------------------------------------------------------------------ | 542 //------------------------------------------------------------------------------ |
| 191 // Color Transform | 543 // Color Transform |
| 192 | 544 |
| 193 static void TransformColorInverse(const VP8LMultipliers* const m, | 545 static void TransformColorInverse(const VP8LMultipliers* const m, |
| 194 uint32_t* argb_data, int num_pixels) { | 546 const uint32_t* const src, int num_pixels, |
| 195 // sign-extended multiplying constants, pre-shifted by 6. | 547 uint32_t* dst) { |
| 548 // sign-extended multiplying constants, pre-shifted by 6. |
| 196 #define CST(X) (((int16_t)(m->X << 8)) >> 6) | 549 #define CST(X) (((int16_t)(m->X << 8)) >> 6) |
| 197 const int16_t rb[8] = { | 550 const int16_t rb[8] = { |
| 198 CST(green_to_blue_), CST(green_to_red_), | 551 CST(green_to_blue_), CST(green_to_red_), |
| 199 CST(green_to_blue_), CST(green_to_red_), | 552 CST(green_to_blue_), CST(green_to_red_), |
| 200 CST(green_to_blue_), CST(green_to_red_), | 553 CST(green_to_blue_), CST(green_to_red_), |
| 201 CST(green_to_blue_), CST(green_to_red_) | 554 CST(green_to_blue_), CST(green_to_red_) |
| 202 }; | 555 }; |
| 203 const int16x8_t mults_rb = vld1q_s16(rb); | 556 const int16x8_t mults_rb = vld1q_s16(rb); |
| 204 const int16_t b2[8] = { | 557 const int16_t b2[8] = { |
| 205 0, CST(red_to_blue_), 0, CST(red_to_blue_), | 558 0, CST(red_to_blue_), 0, CST(red_to_blue_), |
| 206 0, CST(red_to_blue_), 0, CST(red_to_blue_), | 559 0, CST(red_to_blue_), 0, CST(red_to_blue_), |
| 207 }; | 560 }; |
| 208 const int16x8_t mults_b2 = vld1q_s16(b2); | 561 const int16x8_t mults_b2 = vld1q_s16(b2); |
| 209 #undef CST | 562 #undef CST |
| 210 #ifdef USE_VTBLQ | 563 #ifdef USE_VTBLQ |
| 211 static const uint8_t kg0g0[16] = { | 564 static const uint8_t kg0g0[16] = { |
| 212 255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13 | 565 255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13 |
| 213 }; | 566 }; |
| 214 const uint8x16_t shuffle = vld1q_u8(kg0g0); | 567 const uint8x16_t shuffle = vld1q_u8(kg0g0); |
| 215 #else | 568 #else |
| 216 static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 }; | 569 static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 }; |
| 217 const uint8x8_t shuffle = vld1_u8(k0g0g); | 570 const uint8x8_t shuffle = vld1_u8(k0g0g); |
| 218 #endif | 571 #endif |
| 219 const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u); | 572 const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u); |
| 220 int i; | 573 int i; |
| 221 for (i = 0; i + 4 <= num_pixels; i += 4) { | 574 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 222 const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i)); | 575 const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i)); |
| 223 const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag); | 576 const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag); |
| 224 // 0 g 0 g | 577 // 0 g 0 g |
| 225 const uint8x16_t greens = DoGreenShuffle(in, shuffle); | 578 const uint8x16_t greens = DoGreenShuffle(in, shuffle); |
| 226 // x dr x db1 | 579 // x dr x db1 |
| 227 const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb); | 580 const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb); |
| 228 // x r' x b' | 581 // x r' x b' |
| 229 const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in), | 582 const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in), |
| 230 vreinterpretq_s8_s16(A)); | 583 vreinterpretq_s8_s16(A)); |
| 231 // r' 0 b' 0 | 584 // r' 0 b' 0 |
| 232 const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8); | 585 const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8); |
| 233 // x db2 0 0 | 586 // x db2 0 0 |
| 234 const int16x8_t D = vqdmulhq_s16(C, mults_b2); | 587 const int16x8_t D = vqdmulhq_s16(C, mults_b2); |
| 235 // 0 x db2 0 | 588 // 0 x db2 0 |
| 236 const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8); | 589 const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8); |
| 237 // r' x b'' 0 | 590 // r' x b'' 0 |
| 238 const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E), | 591 const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E), |
| 239 vreinterpretq_s8_s16(C)); | 592 vreinterpretq_s8_s16(C)); |
| 240 // 0 r' 0 b'' | 593 // 0 r' 0 b'' |
| 241 const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8); | 594 const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8); |
| 242 const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0); | 595 const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0); |
| 243 vst1q_u32(argb_data + i, out); | 596 vst1q_u32(dst + i, out); |
| 244 } | 597 } |
| 245 // Fall-back to C-version for left-overs. | 598 // Fall-back to C-version for left-overs. |
| 246 VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i); | 599 VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); |
| 247 } | 600 } |
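The CST constants hold each transform delta sign-extended and pre-shifted so that vqdmulhq_s16 (which computes (a * b * 2) >> 16) applied to a lane carrying the green byte in its upper half reproduces the spec's (delta * value) >> 5. A hedged per-pixel scalar sketch of the inverse color transform; the three multiplier parameters stand in for the green_to_red_, green_to_blue_ and red_to_blue_ fields used above:

```c
#include <stdint.h>

/* (delta * value) >> 5, both taken as signed 8-bit values; an arithmetic
 * right shift is assumed, as in the plain-C fallback. */
static int ColorTransformDelta(int8_t delta, int8_t value) {
  return ((int)delta * (int)value) >> 5;
}

/* Illustrative per-pixel inverse color transform (ARGB layout). */
static uint32_t TransformColorInversePixel(uint32_t argb, uint8_t green_to_red,
                                           uint8_t green_to_blue,
                                           uint8_t red_to_blue) {
  const int8_t green = (int8_t)(argb >> 8);
  int red = (argb >> 16) & 0xff;
  int blue = argb & 0xff;
  red = (red + ColorTransformDelta((int8_t)green_to_red, green)) & 0xff;
  blue = (blue + ColorTransformDelta((int8_t)green_to_blue, green)) & 0xff;
  blue = (blue + ColorTransformDelta((int8_t)red_to_blue, (int8_t)red)) & 0xff;
  return (argb & 0xff00ff00u) | ((uint32_t)red << 16) | (uint32_t)blue;
}
```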
| 248 | 601 |
| 249 #undef USE_VTBLQ | 602 #undef USE_VTBLQ |
| 250 | 603 |
| 251 //------------------------------------------------------------------------------ | 604 //------------------------------------------------------------------------------ |
| 252 // Entry point | 605 // Entry point |
| 253 | 606 |
| 254 extern void VP8LDspInitNEON(void); | 607 extern void VP8LDspInitNEON(void); |
| 255 | 608 |
| 256 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) { | 609 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) { |
| 610 VP8LPredictors[5] = Predictor5_NEON; |
| 611 VP8LPredictors[6] = Predictor6_NEON; |
| 612 VP8LPredictors[7] = Predictor7_NEON; |
| 613 VP8LPredictors[13] = Predictor13_NEON; |
| 614 |
| 615 VP8LPredictorsAdd[0] = PredictorAdd0_NEON; |
| 616 VP8LPredictorsAdd[1] = PredictorAdd1_NEON; |
| 617 VP8LPredictorsAdd[2] = PredictorAdd2_NEON; |
| 618 VP8LPredictorsAdd[3] = PredictorAdd3_NEON; |
| 619 VP8LPredictorsAdd[4] = PredictorAdd4_NEON; |
| 620 VP8LPredictorsAdd[5] = PredictorAdd5_NEON; |
| 621 VP8LPredictorsAdd[6] = PredictorAdd6_NEON; |
| 622 VP8LPredictorsAdd[7] = PredictorAdd7_NEON; |
| 623 VP8LPredictorsAdd[8] = PredictorAdd8_NEON; |
| 624 VP8LPredictorsAdd[9] = PredictorAdd9_NEON; |
| 625 VP8LPredictorsAdd[10] = PredictorAdd10_NEON; |
| 626 VP8LPredictorsAdd[11] = PredictorAdd11_NEON; |
| 627 VP8LPredictorsAdd[12] = PredictorAdd12_NEON; |
| 628 VP8LPredictorsAdd[13] = PredictorAdd13_NEON; |
| 629 |
| 257 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; | 630 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; |
| 258 VP8LConvertBGRAToBGR = ConvertBGRAToBGR; | 631 VP8LConvertBGRAToBGR = ConvertBGRAToBGR; |
| 259 VP8LConvertBGRAToRGB = ConvertBGRAToRGB; | 632 VP8LConvertBGRAToRGB = ConvertBGRAToRGB; |
| 260 | 633 |
| 261 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; | 634 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; |
| 262 VP8LTransformColorInverse = TransformColorInverse; | 635 VP8LTransformColorInverse = TransformColorInverse; |
| 263 } | 636 } |
| 264 | 637 |
| 265 #else // !WEBP_USE_NEON | 638 #else // !WEBP_USE_NEON |
| 266 | 639 |
| 267 WEBP_DSP_INIT_STUB(VP8LDspInitNEON) | 640 WEBP_DSP_INIT_STUB(VP8LDspInitNEON) |
| 268 | 641 |
| 269 #endif // WEBP_USE_NEON | 642 #endif // WEBP_USE_NEON |