| OLD | NEW |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. | 1 // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // NEON variant of methods for lossless decoder | 10 // NEON variant of methods for lossless decoder |
| (...skipping 121 matching lines...) |
| 132 vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); | 132 vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); |
| 133 vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); | 133 vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); |
| 134 vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); | 134 vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); |
| 135 dst += 8 * 3; | 135 dst += 8 * 3; |
| 136 } | 136 } |
| 137 VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs | 137 VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs |
| 138 } | 138 } |
| 139 | 139 |
| 140 #endif // !WORK_AROUND_GCC | 140 #endif // !WORK_AROUND_GCC |
| 141 | 141 |
| 142 |
| 143 //------------------------------------------------------------------------------ |
| 144 // Predictor Transform |
| 145 |
| 146 #define LOAD_U32_AS_U8(IN) vreinterpret_u8_u32(vdup_n_u32((IN))) |
| 147 #define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN))) |
| 148 #define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN))) |
| 149 #define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN))) |
| 150 #define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0); |
| 151 #define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0); |
| 152 #define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN))); |
| 153 #define ROTATE32_LEFT(L) vextq_u8((L), (L), 12) // D|C|B|A -> C|B|A|D |
| 154 |
| 155 static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) { |
| 156 const uint8x8_t A0 = LOAD_U32_AS_U8(a0); |
| 157 const uint8x8_t A1 = LOAD_U32_AS_U8(a1); |
| 158 return vhadd_u8(A0, A1); |
| 159 } |
| 160 |
| 161 static WEBP_INLINE uint32_t ClampedAddSubtractHalf_NEON(uint32_t c0, |
| 162 uint32_t c1, |
| 163 uint32_t c2) { |
| 164 const uint8x8_t avg = Average2_u8_NEON(c0, c1); |
| 165 // Subtract one from c2 when it is bigger than avg. |
| 166 const uint8x8_t C2 = LOAD_U32_AS_U8(c2); |
| 167 const uint8x8_t cmp = vcgt_u8(C2, avg); |
| 168 const uint8x8_t C2_1 = vadd_u8(C2, cmp); |
| 169 // Compute half of the difference between avg and c2. |
| 170 const int8x8_t diff_avg = vreinterpret_s8_u8(vhsub_u8(avg, C2_1)); |
| 171 // Compute the sum with avg and saturate. |
| 172 const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(avg)); |
| 173 const uint8x8_t res = vqmovun_s16(vaddw_s8(avg_16, diff_avg)); |
| 174 const uint32_t output = GET_U8_AS_U32(res); |
| 175 return output; |
| 176 } |
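For reference, what this helper computes per 8-bit channel is clamp(avg + (avg - c2) / 2) with avg = (c0 + c1) / 2; the vcgt/vadd pair nudges c2 down by one when c2 > avg so that the flooring vhsub_u8 matches a truncating division. Predictor13 below feeds it (left, top, top-left). A minimal scalar sketch, assuming this mirrors the plain-C fallback (helper names here are illustrative):

```c
#include <stdint.h>

static uint8_t Clip255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Per channel: avg = (c0 + c1) / 2, then clamp(avg + (avg - c2) / 2).
 * Scalar counterpart of ClampedAddSubtractHalf_NEON above. */
static uint8_t ClampedAddSubtractHalfChannel(uint8_t c0, uint8_t c1,
                                             uint8_t c2) {
  const int avg = (c0 + c1) >> 1;
  return Clip255(avg + (avg - c2) / 2);  /* C division truncates toward zero */
}
```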
| 177 |
| 178 static WEBP_INLINE uint32_t Average2_NEON(uint32_t a0, uint32_t a1) { |
| 179 const uint8x8_t avg_u8x8 = Average2_u8_NEON(a0, a1); |
| 180 const uint32_t avg = GET_U8_AS_U32(avg_u8x8); |
| 181 return avg; |
| 182 } |
| 183 |
| 184 static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1, |
| 185 uint32_t a2) { |
| 186 const uint8x8_t avg0 = Average2_u8_NEON(a0, a2); |
| 187 const uint8x8_t A1 = LOAD_U32_AS_U8(a1); |
| 188 const uint32_t avg = GET_U8_AS_U32(vhadd_u8(avg0, A1)); |
| 189 return avg; |
| 190 } |
| 191 |
| 192 static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) { |
| 193 return Average3_NEON(left, top[0], top[1]); |
| 194 } |
| 195 static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) { |
| 196 return Average2_NEON(left, top[-1]); |
| 197 } |
| 198 static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) { |
| 199 return Average2_NEON(left, top[0]); |
| 200 } |
| 201 static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) { |
| 202 return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]); |
| 203 } |
| 204 |
| 205 // Batch versions of those functions. |
| 206 |
| 207 // Predictor0: ARGB_BLACK. |
| 208 static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper, |
| 209 int num_pixels, uint32_t* out) { |
| 210 int i; |
| 211 const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK)); |
| 212 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 213 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 214 const uint8x16_t res = vaddq_u8(src, black); |
| 215 STOREQ_U8_AS_U32P(&out[i], res); |
| 216 } |
| 217 VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i); |
| 218 } |
| 219 |
| 220 // Predictor1: left. |
| 221 static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper, |
| 222 int num_pixels, uint32_t* out) { |
| 223 int i; |
| 224 const uint8x16_t zero = LOADQ_U32_AS_U8(0); |
| 225 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 226 // a | b | c | d |
| 227 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 228 // 0 | a | b | c |
| 229 const uint8x16_t shift0 = vextq_u8(zero, src, 12); |
| 230 // a | a + b | b + c | c + d |
| 231 const uint8x16_t sum0 = vaddq_u8(src, shift0); |
| 232 // 0 | 0 | a | a + b |
| 233 const uint8x16_t shift1 = vextq_u8(zero, sum0, 8); |
| 234 // a | a + b | a + b + c | a + b + c + d |
| 235 const uint8x16_t sum1 = vaddq_u8(sum0, shift1); |
| 236 const uint8x16_t prev = LOADQ_U32_AS_U8(out[i - 1]); |
| 237 const uint8x16_t res = vaddq_u8(sum1, prev); |
| 238 STOREQ_U8_AS_U32P(&out[i], res); |
| 239 } |
| 240 VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i); |
| 241 } |
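The two vextq/vaddq pairs above amount to a two-step inclusive prefix sum (a Hillis-Steele scan) over the four pixels of the block; adding the broadcast out[i - 1] then yields the left predictor for all four lanes at once. A hedged scalar sketch of the scan, on plain integers rather than packed channels (illustrative only):

```c
#include <stdint.h>

/* Two-step inclusive prefix sum over four values, mirroring the shift-by-one
 * and shift-by-two vextq/vaddq steps above. The NEON code does the same thing
 * per 8-bit channel, mod 256. */
static void PrefixSum4(const uint32_t x[4], uint32_t s[4]) {
  uint32_t t[4];
  int k;
  for (k = 0; k < 4; ++k) t[k] = x[k] + (k >= 1 ? x[k - 1] : 0);  /* shift0 */
  for (k = 0; k < 4; ++k) s[k] = t[k] + (k >= 2 ? t[k - 2] : 0);  /* shift1 */
  /* s[k] == x[0] + ... + x[k]; PredictorAdd1 then adds out[i - 1] to each
   * lane, so out[i + k] = out[i - 1] + in[i] + ... + in[i + k] per channel. */
}
```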
| 242 |
| 243 // Macro that adds the 32-bit pixels from 'in' and from IN using |
| 244 // mod-256 arithmetic per 8-bit channel. |
| 245 #define GENERATE_PREDICTOR_1(X, IN) \ |
| 246 static void PredictorAdd##X##_NEON(const uint32_t* in, \ |
| 247 const uint32_t* upper, int num_pixels, \ |
| 248 uint32_t* out) { \ |
| 249 int i; \ |
| 250 for (i = 0; i + 4 <= num_pixels; i += 4) { \ |
| 251 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \ |
| 252 const uint8x16_t other = LOADQ_U32P_AS_U8(&(IN)); \ |
| 253 const uint8x16_t res = vaddq_u8(src, other); \ |
| 254 STOREQ_U8_AS_U32P(&out[i], res); \ |
| 255 } \ |
| 256 VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ |
| 257 } |
| 258 // Predictor2: Top. |
| 259 GENERATE_PREDICTOR_1(2, upper[i]) |
| 260 // Predictor3: Top-right. |
| 261 GENERATE_PREDICTOR_1(3, upper[i + 1]) |
| 262 // Predictor4: Top-left. |
| 263 GENERATE_PREDICTOR_1(4, upper[i - 1]) |
| 264 #undef GENERATE_PREDICTOR_1 |
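The vaddq_u8 in the macro wraps every byte independently. A scalar sketch of that per-channel, mod-256 pixel addition (the helper name is illustrative, not a claim about the upstream C helper):

```c
#include <stdint.h>

/* Add two ARGB pixels channel by channel, mod 256: masking two channels at a
 * time keeps each byte's carry from spilling into its neighbor. */
static uint32_t AddPixelsMod256(uint32_t a, uint32_t b) {
  const uint32_t ag = (a & 0xff00ff00u) + (b & 0xff00ff00u);  /* alpha, green */
  const uint32_t rb = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);  /* red, blue */
  return (ag & 0xff00ff00u) | (rb & 0x00ff00ffu);
}

/* GENERATE_PREDICTOR_1(2, upper[i]) thus computes, pixel by pixel:
 *   out[i] = AddPixelsMod256(in[i], upper[i]);     // Predictor2: top
 * and Predictor3/4 use upper[i + 1] / upper[i - 1] instead. */
```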
| 265 |
| 266 // Predictor5: average(average(left, TR), T) |
| 267 #define DO_PRED5(LANE) do { \ |
| 268 const uint8x16_t avgLTR = vhaddq_u8(L, TR); \ |
| 269 const uint8x16_t avg = vhaddq_u8(avgLTR, T); \ |
| 270 const uint8x16_t res = vaddq_u8(avg, src); \ |
| 271 vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \ |
| 272 L = ROTATE32_LEFT(res); \ |
| 273 } while (0) |
| 274 |
| 275 static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper, |
| 276 int num_pixels, uint32_t* out) { |
| 277 int i; |
| 278 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 279 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 280 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 281 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i + 0]); |
| 282 const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]); |
| 283 DO_PRED5(0); |
| 284 DO_PRED5(1); |
| 285 DO_PRED5(2); |
| 286 DO_PRED5(3); |
| 287 } |
| 288 VP8LPredictorsAdd_C[5](in + i, upper + i, num_pixels - i, out + i); |
| 289 } |
| 290 #undef DO_PRED5 |
| 291 |
| 292 #define DO_PRED67(LANE) do { \ |
| 293 const uint8x16_t avg = vhaddq_u8(L, top); \ |
| 294 const uint8x16_t res = vaddq_u8(avg, src); \ |
| 295 vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \ |
| 296 L = ROTATE32_LEFT(res); \ |
| 297 } while (0) |
| 298 |
| 299 // Predictor6: average(left, TL) |
| 300 static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper, |
| 301 int num_pixels, uint32_t* out) { |
| 302 int i; |
| 303 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 304 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 305 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 306 const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 307 DO_PRED67(0); |
| 308 DO_PRED67(1); |
| 309 DO_PRED67(2); |
| 310 DO_PRED67(3); |
| 311 } |
| 312 VP8LPredictorsAdd_C[6](in + i, upper + i, num_pixels - i, out + i); |
| 313 } |
| 314 |
| 315 // Predictor7: average(left, T) |
| 316 static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper, |
| 317 int num_pixels, uint32_t* out) { |
| 318 int i; |
| 319 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 320 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 321 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 322 const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i]); |
| 323 DO_PRED67(0); |
| 324 DO_PRED67(1); |
| 325 DO_PRED67(2); |
| 326 DO_PRED67(3); |
| 327 } |
| 328 VP8LPredictorsAdd_C[7](in + i, upper + i, num_pixels - i, out + i); |
| 329 } |
| 330 #undef DO_PRED67 |
| 331 |
| 332 #define GENERATE_PREDICTOR_2(X, IN) \ |
| 333 static void PredictorAdd##X##_NEON(const uint32_t* in, \ |
| 334 const uint32_t* upper, int num_pixels, \ |
| 335 uint32_t* out) { \ |
| 336 int i; \ |
| 337 for (i = 0; i + 4 <= num_pixels; i += 4) { \ |
| 338 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \ |
| 339 const uint8x16_t Tother = LOADQ_U32P_AS_U8(&(IN)); \ |
| 340 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); \ |
| 341 const uint8x16_t avg = vhaddq_u8(T, Tother); \ |
| 342 const uint8x16_t res = vaddq_u8(avg, src); \ |
| 343 STOREQ_U8_AS_U32P(&out[i], res); \ |
| 344 } \ |
| 345 VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ |
| 346 } |
| 347 // Predictor8: average(TL, T). |
| 348 GENERATE_PREDICTOR_2(8, upper[i - 1]) |
| 349 // Predictor9: average(T, TR). |
| 350 GENERATE_PREDICTOR_2(9, upper[i + 1]) |
| 351 #undef GENERATE_PREDICTOR_2 |
| 352 |
| 353 // Predictor10: average of (average of (L,TL), average of (T, TR)). |
| 354 #define DO_PRED10(LANE) do { \ |
| 355 const uint8x16_t avgLTL = vhaddq_u8(L, TL); \ |
| 356 const uint8x16_t avg = vhaddq_u8(avgTTR, avgLTL); \ |
| 357 const uint8x16_t res = vaddq_u8(avg, src); \ |
| 358 vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \ |
| 359 L = ROTATE32_LEFT(res); \ |
| 360 } while (0) |
| 361 |
| 362 static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper, |
| 363 int num_pixels, uint32_t* out) { |
| 364 int i; |
| 365 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 366 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 367 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 368 const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 369 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); |
| 370 const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]); |
| 371 const uint8x16_t avgTTR = vhaddq_u8(T, TR); |
| 372 DO_PRED10(0); |
| 373 DO_PRED10(1); |
| 374 DO_PRED10(2); |
| 375 DO_PRED10(3); |
| 376 } |
| 377 VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i); |
| 378 } |
| 379 #undef DO_PRED10 |
| 380 |
| 381 // Predictor11: select. |
| 382 #define DO_PRED11(LANE) do { \ |
| 383 const uint8x16_t sumLin = vaddq_u8(L, src); /* in + L */ \ |
| 384 const uint8x16_t pLTL = vabdq_u8(L, TL); /* |L - TL| */ \ |
| 385 const uint16x8_t sum_LTL = vpaddlq_u8(pLTL); \ |
| 386 const uint32x4_t pa = vpaddlq_u16(sum_LTL); \ |
| 387 const uint32x4_t mask = vcleq_u32(pa, pb); \ |
| 388 const uint8x16_t res = vbslq_u8(vreinterpretq_u8_u32(mask), sumTin, sumLin); \ |
| 389 vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \ |
| 390 L = ROTATE32_LEFT(res); \ |
| 391 } while (0) |
| 392 |
| 393 static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper, |
| 394 int num_pixels, uint32_t* out) { |
| 395 int i; |
| 396 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 397 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 398 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); |
| 399 const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 400 const uint8x16_t pTTL = vabdq_u8(T, TL); // |T - TL| |
| 401 const uint16x8_t sum_TTL = vpaddlq_u8(pTTL); |
| 402 const uint32x4_t pb = vpaddlq_u16(sum_TTL); |
| 403 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 404 const uint8x16_t sumTin = vaddq_u8(T, src); // in + T |
| 405 DO_PRED11(0); |
| 406 DO_PRED11(1); |
| 407 DO_PRED11(2); |
| 408 DO_PRED11(3); |
| 409 } |
| 410 VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i); |
| 411 } |
| 412 #undef DO_PRED11 |
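Predictor11 ("select") keeps whichever neighbor tracks the pixel above-left more closely: pa accumulates the per-channel |L - TL|, pb the per-channel |T - TL|, and the vbslq mask picks T + in when pa <= pb, L + in otherwise. A scalar sketch of the selection, read directly from the code above (names illustrative):

```c
#include <stdint.h>
#include <stdlib.h>

/* Sum of per-channel absolute differences between two ARGB pixels. */
static int SumAbsDiff(uint32_t a, uint32_t b) {
  int sum = 0, shift;
  for (shift = 0; shift < 32; shift += 8) {
    sum += abs((int)((a >> shift) & 0xff) - (int)((b >> shift) & 0xff));
  }
  return sum;
}

/* Select predictor: use the top pixel when the left pixel is the closer match
 * to the top-left one, otherwise use the left pixel. */
static uint32_t Select(uint32_t L, uint32_t T, uint32_t TL) {
  const int pa = SumAbsDiff(L, TL);
  const int pb = SumAbsDiff(T, TL);
  return (pa <= pb) ? T : L;
}
```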
| 413 |
| 414 // Predictor12: ClampedAddSubtractFull. |
| 415 #define DO_PRED12(DIFF, LANE) do { \ |
| 416 const uint8x8_t pred = \ |
| 417 vqmovun_s16(vaddq_s16(vreinterpretq_s16_u16(L), (DIFF))); \ |
| 418 const uint8x8_t res = \ |
| 419 vadd_u8(pred, (LANE <= 1) ? vget_low_u8(src) : vget_high_u8(src)); \ |
| 420 const uint16x8_t res16 = vmovl_u8(res); \ |
| 421 vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \ |
| 422 /* rotate in the left predictor for next iteration */ \ |
| 423 L = vextq_u16(res16, res16, 4); \ |
| 424 } while (0) |
| 425 |
| 426 static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper, |
| 427 int num_pixels, uint32_t* out) { |
| 428 int i; |
| 429 uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1])); |
| 430 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 431 // load four pixels of source |
| 432 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 433 // precompute the difference T - TL once for all four pixels, stored as s16 |
| 434 const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 435 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); |
| 436 const int16x8_t diff_lo = |
| 437 vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), vget_low_u8(TL))); |
| 438 const int16x8_t diff_hi = |
| 439 vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), vget_high_u8(TL))); |
| 440 // loop over the four reconstructed pixels |
| 441 DO_PRED12(diff_lo, 0); |
| 442 DO_PRED12(diff_lo, 1); |
| 443 DO_PRED12(diff_hi, 2); |
| 444 DO_PRED12(diff_hi, 3); |
| 445 } |
| 446 VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i); |
| 447 } |
| 448 #undef DO_PRED12 |
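Predictor12 widens everything to 16-bit lanes so that L + (T - TL) can go below 0 or above 255 before vqmovun_s16 saturates it back to a byte: per channel the predictor is clamp(L + T - TL). A minimal scalar sketch:

```c
#include <stdint.h>

static uint8_t Clip255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Per 8-bit channel: the "full" clamped add-subtract predictor used above. */
static uint8_t ClampedAddSubtractFullChannel(uint8_t L, uint8_t T, uint8_t TL) {
  return Clip255((int)L + (int)T - (int)TL);
}
```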
| 449 |
| 450 // Predictor13: ClampedAddSubtractHalf |
| 451 #define DO_PRED13(LANE, LOW_OR_HI) do { \ |
| 452 const uint8x16_t avg = vhaddq_u8(L, T); \ |
| 453 const uint8x16_t cmp = vcgtq_u8(TL, avg); \ |
| 454 const uint8x16_t TL_1 = vaddq_u8(TL, cmp); \ |
| 455 /* Compute half of the difference between avg and TL'. */ \ |
| 456 const int8x8_t diff_avg = \ |
| 457 vreinterpret_s8_u8(LOW_OR_HI(vhsubq_u8(avg, TL_1))); \ |
| 458 /* Compute the sum with avg and saturate. */ \ |
| 459 const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(LOW_OR_HI(avg))); \ |
| 460 const uint8x8_t delta = vqmovun_s16(vaddw_s8(avg_16, diff_avg)); \ |
| 461 const uint8x8_t res = vadd_u8(LOW_OR_HI(src), delta); \ |
| 462 const uint8x16_t res2 = vcombine_u8(res, res); \ |
| 463 vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \ |
| 464 L = ROTATE32_LEFT(res2); \ |
| 465 } while (0) |
| 466 |
| 467 static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper, |
| 468 int num_pixels, uint32_t* out) { |
| 469 int i; |
| 470 uint8x16_t L = LOADQ_U32_AS_U8(out[-1]); |
| 471 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 472 const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); |
| 473 const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); |
| 474 const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]); |
| 475 DO_PRED13(0, vget_low_u8); |
| 476 DO_PRED13(1, vget_low_u8); |
| 477 DO_PRED13(2, vget_high_u8); |
| 478 DO_PRED13(3, vget_high_u8); |
| 479 } |
| 480 VP8LPredictorsAdd_C[13](in + i, upper + i, num_pixels - i, out + i); |
| 481 } |
| 482 #undef DO_PRED13 |
| 483 |
| 484 #undef LOAD_U32_AS_U8 |
| 485 #undef LOAD_U32P_AS_U8 |
| 486 #undef LOADQ_U32_AS_U8 |
| 487 #undef LOADQ_U32P_AS_U8 |
| 488 #undef GET_U8_AS_U32 |
| 489 #undef GETQ_U8_AS_U32 |
| 490 #undef STOREQ_U8_AS_U32P |
| 491 #undef ROTATE32_LEFT |
| 492 |
| 142 //------------------------------------------------------------------------------ | 493 //------------------------------------------------------------------------------ |
| 143 // Subtract-Green Transform | 494 // Subtract-Green Transform |
| 144 | 495 |
| 145 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use | 496 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use |
| 146 // non-standard versions there. | 497 // non-standard versions there. |
| 147 #if defined(__APPLE__) && defined(__aarch64__) && \ | 498 #if defined(__APPLE__) && defined(__aarch64__) && \ |
| 148 defined(__apple_build_version__) && (__apple_build_version__ < 6020037) | 499 defined(__apple_build_version__) && (__apple_build_version__ < 6020037) |
| 149 #define USE_VTBLQ | 500 #define USE_VTBLQ |
| 150 #endif | 501 #endif |
| 151 | 502 |
| (...skipping 12 matching lines...) |
| 164 // 255 = byte will be zeroed | 515 // 255 = byte will be zeroed |
| 165 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; | 516 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; |
| 166 | 517 |
| 167 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, | 518 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, |
| 168 const uint8x8_t shuffle) { | 519 const uint8x8_t shuffle) { |
| 169 return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), | 520 return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), |
| 170 vtbl1_u8(vget_high_u8(argb), shuffle)); | 521 vtbl1_u8(vget_high_u8(argb), shuffle)); |
| 171 } | 522 } |
| 172 #endif // USE_VTBLQ | 523 #endif // USE_VTBLQ |
| 173 | 524 |
| 174 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { | 525 static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels, |
| 175 const uint32_t* const end = argb_data + (num_pixels & ~3); | 526 uint32_t* dst) { |
| 527 const uint32_t* const end = src + (num_pixels & ~3); |
| 176 #ifdef USE_VTBLQ | 528 #ifdef USE_VTBLQ |
| 177 const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); | 529 const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); |
| 178 #else | 530 #else |
| 179 const uint8x8_t shuffle = vld1_u8(kGreenShuffle); | 531 const uint8x8_t shuffle = vld1_u8(kGreenShuffle); |
| 180 #endif | 532 #endif |
| 181 for (; argb_data < end; argb_data += 4) { | 533 for (; src < end; src += 4, dst += 4) { |
| 182 const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); | 534 const uint8x16_t argb = vld1q_u8((const uint8_t*)src); |
| 183 const uint8x16_t greens = DoGreenShuffle(argb, shuffle); | 535 const uint8x16_t greens = DoGreenShuffle(argb, shuffle); |
| 184 vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); | 536 vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens)); |
| 185 } | 537 } |
| 186 // fallthrough and finish off with plain-C | 538 // fallthrough and finish off with plain-C |
| 187 VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); | 539 VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst); |
| 188 } | 540 } |
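In kGreenShuffle, index 1 (and 5) copies each pixel's green byte into the red and blue byte positions while 255 yields zero, so the single vaddq_u8 adds green back to both channels mod 256 and leaves alpha and green untouched. A per-pixel scalar sketch of this inverse subtract-green step (helper name illustrative):

```c
#include <stdint.h>

/* Add the green channel back to red and blue, mod 256 (ARGB layout). */
static uint32_t AddGreenToBlueAndRedPixel(uint32_t argb) {
  const uint32_t green = (argb >> 8) & 0xff;
  uint32_t red_blue = argb & 0x00ff00ffu;
  red_blue += (green << 16) | green;  /* add green to red (bits 16..23) and blue */
  red_blue &= 0x00ff00ffu;            /* per-channel mod-256 wrap */
  return (argb & 0xff00ff00u) | red_blue;
}
```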
| 189 | 541 |
| 190 //------------------------------------------------------------------------------ | 542 //------------------------------------------------------------------------------ |
| 191 // Color Transform | 543 // Color Transform |
| 192 | 544 |
| 193 static void TransformColorInverse(const VP8LMultipliers* const m, | 545 static void TransformColorInverse(const VP8LMultipliers* const m, |
| 194 uint32_t* argb_data, int num_pixels) { | 546 const uint32_t* const src, int num_pixels, |
| 195 // sign-extended multiplying constants, pre-shifted by 6. | 547 uint32_t* dst) { |
| 548 // sign-extended multiplying constants, pre-shifted by 6. |
| 196 #define CST(X) (((int16_t)(m->X << 8)) >> 6) | 549 #define CST(X) (((int16_t)(m->X << 8)) >> 6) |
| 197 const int16_t rb[8] = { | 550 const int16_t rb[8] = { |
| 198 CST(green_to_blue_), CST(green_to_red_), | 551 CST(green_to_blue_), CST(green_to_red_), |
| 199 CST(green_to_blue_), CST(green_to_red_), | 552 CST(green_to_blue_), CST(green_to_red_), |
| 200 CST(green_to_blue_), CST(green_to_red_), | 553 CST(green_to_blue_), CST(green_to_red_), |
| 201 CST(green_to_blue_), CST(green_to_red_) | 554 CST(green_to_blue_), CST(green_to_red_) |
| 202 }; | 555 }; |
| 203 const int16x8_t mults_rb = vld1q_s16(rb); | 556 const int16x8_t mults_rb = vld1q_s16(rb); |
| 204 const int16_t b2[8] = { | 557 const int16_t b2[8] = { |
| 205 0, CST(red_to_blue_), 0, CST(red_to_blue_), | 558 0, CST(red_to_blue_), 0, CST(red_to_blue_), |
| 206 0, CST(red_to_blue_), 0, CST(red_to_blue_), | 559 0, CST(red_to_blue_), 0, CST(red_to_blue_), |
| 207 }; | 560 }; |
| 208 const int16x8_t mults_b2 = vld1q_s16(b2); | 561 const int16x8_t mults_b2 = vld1q_s16(b2); |
| 209 #undef CST | 562 #undef CST |
| 210 #ifdef USE_VTBLQ | 563 #ifdef USE_VTBLQ |
| 211 static const uint8_t kg0g0[16] = { | 564 static const uint8_t kg0g0[16] = { |
| 212 255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13 | 565 255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13 |
| 213 }; | 566 }; |
| 214 const uint8x16_t shuffle = vld1q_u8(kg0g0); | 567 const uint8x16_t shuffle = vld1q_u8(kg0g0); |
| 215 #else | 568 #else |
| 216 static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 }; | 569 static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 }; |
| 217 const uint8x8_t shuffle = vld1_u8(k0g0g); | 570 const uint8x8_t shuffle = vld1_u8(k0g0g); |
| 218 #endif | 571 #endif |
| 219 const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u); | 572 const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u); |
| 220 int i; | 573 int i; |
| 221 for (i = 0; i + 4 <= num_pixels; i += 4) { | 574 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 222 const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i)); | 575 const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i)); |
| 223 const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag); | 576 const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag); |
| 224 // 0 g 0 g | 577 // 0 g 0 g |
| 225 const uint8x16_t greens = DoGreenShuffle(in, shuffle); | 578 const uint8x16_t greens = DoGreenShuffle(in, shuffle); |
| 226 // x dr x db1 | 579 // x dr x db1 |
| 227 const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb); | 580 const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb); |
| 228 // x r' x b' | 581 // x r' x b' |
| 229 const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in), | 582 const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in), |
| 230 vreinterpretq_s8_s16(A)); | 583 vreinterpretq_s8_s16(A)); |
| 231 // r' 0 b' 0 | 584 // r' 0 b' 0 |
| 232 const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8); | 585 const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8); |
| 233 // x db2 0 0 | 586 // x db2 0 0 |
| 234 const int16x8_t D = vqdmulhq_s16(C, mults_b2); | 587 const int16x8_t D = vqdmulhq_s16(C, mults_b2); |
| 235 // 0 x db2 0 | 588 // 0 x db2 0 |
| 236 const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8); | 589 const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8); |
| 237 // r' x b'' 0 | 590 // r' x b'' 0 |
| 238 const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E), | 591 const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E), |
| 239 vreinterpretq_s8_s16(C)); | 592 vreinterpretq_s8_s16(C)); |
| 240 // 0 r' 0 b'' | 593 // 0 r' 0 b'' |
| 241 const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8); | 594 const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8); |
| 242 const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0); | 595 const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0); |
| 243 vst1q_u32(argb_data + i, out); | 596 vst1q_u32(dst + i, out); |
| 244 } | 597 } |
| 245 // Fall-back to C-version for left-overs. | 598 // Fall-back to C-version for left-overs. |
| 246 VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i); | 599 VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); |
| 247 } | 600 } |
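The CST constants hold each transform delta sign-extended and pre-shifted so that vqdmulhq_s16 (which computes (a * b * 2) >> 16) applied to a lane carrying the green byte in its upper half reproduces the spec's (delta * value) >> 5. A hedged per-pixel scalar sketch of the inverse color transform; the three multiplier parameters stand in for the green_to_red_, green_to_blue_ and red_to_blue_ fields used above:

```c
#include <stdint.h>

/* (delta * value) >> 5, both taken as signed 8-bit values; an arithmetic
 * right shift is assumed, as in the plain-C fallback. */
static int ColorTransformDelta(int8_t delta, int8_t value) {
  return ((int)delta * (int)value) >> 5;
}

/* Illustrative per-pixel inverse color transform (ARGB layout). */
static uint32_t TransformColorInversePixel(uint32_t argb, uint8_t green_to_red,
                                           uint8_t green_to_blue,
                                           uint8_t red_to_blue) {
  const int8_t green = (int8_t)(argb >> 8);
  int red = (argb >> 16) & 0xff;
  int blue = argb & 0xff;
  red = (red + ColorTransformDelta((int8_t)green_to_red, green)) & 0xff;
  blue = (blue + ColorTransformDelta((int8_t)green_to_blue, green)) & 0xff;
  blue = (blue + ColorTransformDelta((int8_t)red_to_blue, (int8_t)red)) & 0xff;
  return (argb & 0xff00ff00u) | ((uint32_t)red << 16) | (uint32_t)blue;
}
```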
| 248 | 601 |
| 249 #undef USE_VTBLQ | 602 #undef USE_VTBLQ |
| 250 | 603 |
| 251 //------------------------------------------------------------------------------ | 604 //------------------------------------------------------------------------------ |
| 252 // Entry point | 605 // Entry point |
| 253 | 606 |
| 254 extern void VP8LDspInitNEON(void); | 607 extern void VP8LDspInitNEON(void); |
| 255 | 608 |
| 256 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) { | 609 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) { |
| 610 VP8LPredictors[5] = Predictor5_NEON; |
| 611 VP8LPredictors[6] = Predictor6_NEON; |
| 612 VP8LPredictors[7] = Predictor7_NEON; |
| 613 VP8LPredictors[13] = Predictor13_NEON; |
| 614 |
| 615 VP8LPredictorsAdd[0] = PredictorAdd0_NEON; |
| 616 VP8LPredictorsAdd[1] = PredictorAdd1_NEON; |
| 617 VP8LPredictorsAdd[2] = PredictorAdd2_NEON; |
| 618 VP8LPredictorsAdd[3] = PredictorAdd3_NEON; |
| 619 VP8LPredictorsAdd[4] = PredictorAdd4_NEON; |
| 620 VP8LPredictorsAdd[5] = PredictorAdd5_NEON; |
| 621 VP8LPredictorsAdd[6] = PredictorAdd6_NEON; |
| 622 VP8LPredictorsAdd[7] = PredictorAdd7_NEON; |
| 623 VP8LPredictorsAdd[8] = PredictorAdd8_NEON; |
| 624 VP8LPredictorsAdd[9] = PredictorAdd9_NEON; |
| 625 VP8LPredictorsAdd[10] = PredictorAdd10_NEON; |
| 626 VP8LPredictorsAdd[11] = PredictorAdd11_NEON; |
| 627 VP8LPredictorsAdd[12] = PredictorAdd12_NEON; |
| 628 VP8LPredictorsAdd[13] = PredictorAdd13_NEON; |
| 629 |
| 257 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; | 630 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; |
| 258 VP8LConvertBGRAToBGR = ConvertBGRAToBGR; | 631 VP8LConvertBGRAToBGR = ConvertBGRAToBGR; |
| 259 VP8LConvertBGRAToRGB = ConvertBGRAToRGB; | 632 VP8LConvertBGRAToRGB = ConvertBGRAToRGB; |
| 260 | 633 |
| 261 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; | 634 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; |
| 262 VP8LTransformColorInverse = TransformColorInverse; | 635 VP8LTransformColorInverse = TransformColorInverse; |
| 263 } | 636 } |
| 264 | 637 |
| 265 #else // !WEBP_USE_NEON | 638 #else // !WEBP_USE_NEON |
| 266 | 639 |
| 267 WEBP_DSP_INIT_STUB(VP8LDspInitNEON) | 640 WEBP_DSP_INIT_STUB(VP8LDspInitNEON) |
| 268 | 641 |
| 269 #endif // WEBP_USE_NEON | 642 #endif // WEBP_USE_NEON |