| OLD | NEW |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. | 1 // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // NEON variant of methods for lossless decoder | 10 // NEON variant of methods for lossless decoder |
| (...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 133 vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); | 133 vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); |
| 134 vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); | 134 vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); |
| 135 dst += 8 * 3; | 135 dst += 8 * 3; |
| 136 } | 136 } |
| 137 VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs | 137 VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs |
| 138 } | 138 } |
| 139 | 139 |
| 140 #endif // !WORK_AROUND_GCC | 140 #endif // !WORK_AROUND_GCC |
| 141 | 141 |
| 142 //------------------------------------------------------------------------------ | 142 //------------------------------------------------------------------------------ |
| 143 | |
| 144 #ifdef USE_INTRINSICS | |
| 145 | |
// Per-channel truncating average (a + b) / 2 of two packed 32-bit pixels.
// vhadd_u8 halves the byte-wise sum without needing a widening step.
static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
                                     const uint32_t* const b) {
  const uint8x8_t A = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t B = vreinterpret_u8_u64(vcreate_u64(*b));
  return vget_lane_u32(vreinterpret_u32_u8(vhadd_u8(A, B)), 0);
}
| 153 | |
// Per-channel Average2(Average2(*a, *c), *b) of three packed pixels.
static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c) {
  const uint8x8_t A = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t B = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t C = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t ac_avg = vhadd_u8(A, C);
  return vget_lane_u32(vreinterpret_u32_u8(vhadd_u8(ac_avg, B)), 0);
}
| 164 | |
// Per-channel Average2(Average2(*a, *b), Average2(*c, *d)) of four pixels.
static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c,
                                     const uint32_t* const d) {
  const uint8x8_t A = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t B = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t C = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t D = vreinterpret_u8_u64(vcreate_u64(*d));
  const uint8x8_t ab_avg = vhadd_u8(A, B);
  const uint8x8_t cd_avg = vhadd_u8(C, D);
  return vget_lane_u32(vreinterpret_u32_u8(vhadd_u8(ab_avg, cd_avg)), 0);
}
| 178 | |
// Predictor #5: Average3 of left, top[0] and top[1].
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average3(&left, &top[0], &top[1]);
  return pred;
}
| 182 | |
// Predictor #6: Average2 of left and top[-1].
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(&left, &top[-1]);
  return pred;
}
| 186 | |
// Predictor #7: Average2 of left and top[0].
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(&left, &top[0]);
  return pred;
}
| 190 | |
// Predictor #8: Average2 of top[-1] and top[0]; 'left' is not used.
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(&top[-1], &top[0]);
  (void)left;
  return pred;
}
| 195 | |
// Predictor #9: Average2 of top[0] and top[1]; 'left' is not used.
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(&top[0], &top[1]);
  (void)left;
  return pred;
}
| 200 | |
// Predictor #10: Average4 of left, top[-1], top[0] and top[1].
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average4(&left, &top[-1], &top[0], &top[1]);
  return pred;
}
| 204 | |
| 205 //------------------------------------------------------------------------------ | |
| 206 | |
// Selects between *c0 and *c1: returns *c0 when the summed per-channel
// absolute difference |*c1 - *c2| is no larger than |*c0 - *c2|, else *c1.
// The byte-wise absolute differences are widened via pairwise adds so the
// final signed comparison cannot overflow.
static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
                                   const uint32_t* const c1,
                                   const uint32_t* const c2) {
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t bc = vabd_u8(p1, p2);   // |b-c|
  const uint8x8_t ac = vabd_u8(p0, p2);   // |a-c|
  // Pairwise-accumulate the byte differences into 16-bit lanes; the values
  // are small enough that the s16 reinterpretation cannot go negative here.
  const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc));
  const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac));
  const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac));
  const int32_t pa_minus_pb = vget_lane_s32(diff, 0);
  return (pa_minus_pb <= 0) ? *c0 : *c1;
}
| 221 | |
// Predictor #11: Select() between top[0] and left, using top[-1] as anchor.
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
  return Select(&top[0], &left, &top[-1]);
}
| 225 | |
// Per-channel clamped *c0 + *c1 - *c2. The addition is widened to 16 bits,
// the subtraction saturates at 0 (vqsubq_u16) and the narrowing saturates
// at 255 (vqmovn_u16), so each channel stays within [0, 255].
static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0,
                                                   const uint32_t* const c1,
                                                   const uint32_t* const c2) {
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint16x8_t sum0 = vaddl_u8(p0, p1);                // add and widen
  const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2));  // widen and subtract
  const uint8x8_t out = vqmovn_u16(sum1);                  // narrow and clamp
  return vget_lane_u32(vreinterpret_u32_u8(out), 0);
}
| 237 | |
// Predictor #12: ClampedAddSubtractFull(left, top[0], top[-1]).
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractFull(&left, &top[0], &top[-1]);
}
| 241 | |
// Per-channel avg + (avg - *c2) / 2, clamped to [0, 255], where
// avg = Average2(*c0, *c1). The pair of saturated subtractions keeps only
// the positive half-difference in each direction, so 'ab' or 'ba' is zero
// depending on the sign of (avg - *c2) and the final vqadd/vqsub clamp.
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
                                                   const uint32_t* const c1,
                                                   const uint32_t* const c2) {
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t avg = vhadd_u8(p0, p1);                  // Average(c0,c1)
  const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1);    // (a-b)>>1 saturated
  const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1);    // (b-a)>>1 saturated
  const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
  return vget_lane_u32(vreinterpret_u32_u8(out), 0);
}
| 254 | |
// Predictor #13: ClampedAddSubtractHalf(left, top[0], top[-1]).
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractHalf(&left, &top[0], &top[-1]);
}
| 258 | |
| 259 //------------------------------------------------------------------------------ | |
| 260 // Subtract-Green Transform | 143 // Subtract-Green Transform |
| 261 | 144 |
| 262 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use | 145 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use |
| 263 // non-standard versions there. | 146 // non-standard versions there. |
| 264 #if defined(__APPLE__) && defined(__aarch64__) && \ | 147 #if defined(__APPLE__) && defined(__aarch64__) && \ |
| 265 defined(__apple_build_version__) && (__apple_build_version__< 6020037) | 148 defined(__apple_build_version__) && (__apple_build_version__< 6020037) |
| 266 #define USE_VTBLQ | 149 #define USE_VTBLQ |
| 267 #endif | 150 #endif |
| 268 | 151 |
| 269 #ifdef USE_VTBLQ | 152 #ifdef USE_VTBLQ |
| (...skipping 11 matching lines...) Expand all Loading... |
| 281 // 255 = byte will be zeroed | 164 // 255 = byte will be zeroed |
| 282 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; | 165 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; |
| 283 | 166 |
| 284 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, | 167 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, |
| 285 const uint8x8_t shuffle) { | 168 const uint8x8_t shuffle) { |
| 286 return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), | 169 return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), |
| 287 vtbl1_u8(vget_high_u8(argb), shuffle)); | 170 vtbl1_u8(vget_high_u8(argb), shuffle)); |
| 288 } | 171 } |
| 289 #endif // USE_VTBLQ | 172 #endif // USE_VTBLQ |
| 290 | 173 |
// Subtracts the green channel from the red and blue channels of each pixel,
// processing 4 pixels per iteration; the remaining (num_pixels & 3) pixels
// are handled by the plain-C implementation.
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  uint32_t* p = argb_data;
  uint32_t* const end = p + (num_pixels & ~3);
#ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
  while (p < end) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)p);
    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)p, vsubq_u8(argb, greens));
    p += 4;
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(p, num_pixels & 3);
}
| 306 | |
// Adds the green channel back into the red and blue channels of each pixel
// (inverse of the subtract-green transform), 4 pixels per iteration; the
// remaining (num_pixels & 3) pixels are handled by the plain-C version.
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  uint32_t* p = argb_data;
  uint32_t* const end = p + (num_pixels & ~3);
#ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
  while (p < end) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)p);
    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)p, vaddq_u8(argb, greens));
    p += 4;
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(p, num_pixels & 3);
}
| 322 | 189 |
| 190 //------------------------------------------------------------------------------ |
| 191 // Color Transform |
| 192 |
// Inverse color transform: adds the green-derived (then red-derived) deltas
// back into the red and blue channels, 4 pixels at a time. The per-lane
// comments track the byte layout of each intermediate vector. Left-over
// pixels (num_pixels % 4) fall back to the C implementation.
static void TransformColorInverse(const VP8LMultipliers* const m,
                                  uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
// NOTE(review): the pre-shift pairs with vqdmulhq_s16's doubling high-half
// multiply to produce the transform delta — confirm against the C version.
#define CST(X)  (((int16_t)(m->X << 8)) >> 6)
  const int16_t rb[8] = {
    CST(green_to_blue_), CST(green_to_red_),
    CST(green_to_blue_), CST(green_to_red_),
    CST(green_to_blue_), CST(green_to_red_),
    CST(green_to_blue_), CST(green_to_red_)
  };
  const int16x8_t mults_rb = vld1q_s16(rb);
  const int16_t b2[8] = {
    0, CST(red_to_blue_), 0, CST(red_to_blue_),
    0, CST(red_to_blue_), 0, CST(red_to_blue_),
  };
  const int16x8_t mults_b2 = vld1q_s16(b2);
#undef CST
#ifdef USE_VTBLQ
  // 255 = byte will be zeroed; spreads each pixel's green byte into a 0g0g
  // pattern across the full 16-byte vector.
  static const uint8_t kg0g0[16] = {
    255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
  };
  const uint8x16_t shuffle = vld1q_u8(kg0g0);
#else
  static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
  const uint8x8_t shuffle = vld1_u8(k0g0g);
#endif
  // Mask selecting the alpha and green bytes, which pass through unchanged.
  const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
    const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
    // 0 g 0 g
    const uint8x16_t greens = DoGreenShuffle(in, shuffle);
    // x dr  x db1
    const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
    // x r'  x   b'
    const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in),
                                 vreinterpretq_s8_s16(A));
    // r' 0   b' 0
    const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8);
    // x db2  0  0
    const int16x8_t D = vqdmulhq_s16(C, mults_b2);
    // 0  x db2  0
    const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8);
    // r' x  b'' 0
    const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E),
                                 vreinterpretq_s8_s16(C));
    // 0 r'  0 b''
    const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
    const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
    vst1q_u32(argb_data + i, out);
  }
  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
| 248 |
| 323 #undef USE_VTBLQ | 249 #undef USE_VTBLQ |
| 324 | 250 |
| 325 #endif // USE_INTRINSICS | |
| 326 | |
| 327 #endif // WEBP_USE_NEON | |
| 328 | |
| 329 //------------------------------------------------------------------------------ | 251 //------------------------------------------------------------------------------ |
| 252 // Entry point |
| 330 | 253 |
extern void VP8LDspInitNEON(void);

// Entry point: installs the NEON implementations into the VP8L function
// pointer tables. When NEON support is compiled out this is a no-op and the
// C defaults remain in place; the intrinsics-based predictors and transforms
// are additionally gated on USE_INTRINSICS.
void VP8LDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;

#ifdef USE_INTRINSICS
  VP8LPredictors[5] = Predictor5;
  VP8LPredictors[6] = Predictor6;
  VP8LPredictors[7] = Predictor7;
  VP8LPredictors[8] = Predictor8;
  VP8LPredictors[9] = Predictor9;
  VP8LPredictors[10] = Predictor10;
  VP8LPredictors[11] = Predictor11;
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;

  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
#endif

#endif  // WEBP_USE_NEON
}
| 356 | 264 |
| 357 //------------------------------------------------------------------------------ | 265 #else // !WEBP_USE_NEON |
| 266 |
| 267 WEBP_DSP_INIT_STUB(VP8LDspInitNEON) |
| 268 |
| 269 #endif // WEBP_USE_NEON |
| OLD | NEW |