OLD | NEW |
1 // Copyright 2014 Google Inc. All Rights Reserved. | 1 // Copyright 2014 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
9 // | 9 // |
10 // NEON variant of methods for lossless decoder | 10 // NEON variant of methods for lossless decoder |
(...skipping 122 matching lines...)
133 vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); | 133 vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); |
134 vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); | 134 vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); |
135 dst += 8 * 3; | 135 dst += 8 * 3; |
136 } | 136 } |
137 VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs | 137 VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs |
138 } | 138 } |
139 | 139 |
140 #endif // !WORK_AROUND_GCC | 140 #endif // !WORK_AROUND_GCC |
141 | 141 |
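The converter above processes eight pixels per iteration with vtbl4_u8 byte shuffles and hands the remaining num_pixels & 7 pixels to VP8LConvertBGRAToRGB_C. A scalar sketch of the per-pixel conversion being vectorized, assuming the usual VP8L layout of one ARGB pixel per uint32_t (blue in the low byte):

  #include <stdint.h>

  // Per-pixel scalar equivalent of ConvertBGRAToRGB(): emit 3 RGB bytes for
  // every 32-bit ARGB input value, dropping alpha.
  static void ConvertBGRAToRGB_Scalar(const uint32_t* src, int num_pixels,
                                      uint8_t* dst) {
    int i;
    for (i = 0; i < num_pixels; ++i) {
      const uint32_t argb = src[i];
      dst[3 * i + 0] = (argb >> 16) & 0xff;  // R
      dst[3 * i + 1] = (argb >>  8) & 0xff;  // G
      dst[3 * i + 2] = (argb >>  0) & 0xff;  // B
    }
  }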
142 //------------------------------------------------------------------------------ | 142 //------------------------------------------------------------------------------ |
143 | |
144 #ifdef USE_INTRINSICS | |
145 | |
146 static WEBP_INLINE uint32_t Average2(const uint32_t* const a, | |
147 const uint32_t* const b) { | |
148 const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a)); | |
149 const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b)); | |
150 const uint8x8_t avg = vhadd_u8(a0, b0); | |
151 return vget_lane_u32(vreinterpret_u32_u8(avg), 0); | |
152 } | |
153 | |
154 static WEBP_INLINE uint32_t Average3(const uint32_t* const a, | |
155 const uint32_t* const b, | |
156 const uint32_t* const c) { | |
157 const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a)); | |
158 const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b)); | |
159 const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c)); | |
160 const uint8x8_t avg1 = vhadd_u8(a0, c0); | |
161 const uint8x8_t avg2 = vhadd_u8(avg1, b0); | |
162 return vget_lane_u32(vreinterpret_u32_u8(avg2), 0); | |
163 } | |
164 | |
165 static WEBP_INLINE uint32_t Average4(const uint32_t* const a, | |
166 const uint32_t* const b, | |
167 const uint32_t* const c, | |
168 const uint32_t* const d) { | |
169 const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a)); | |
170 const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b)); | |
171 const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c)); | |
172 const uint8x8_t d0 = vreinterpret_u8_u64(vcreate_u64(*d)); | |
173 const uint8x8_t avg1 = vhadd_u8(a0, b0); | |
174 const uint8x8_t avg2 = vhadd_u8(c0, d0); | |
175 const uint8x8_t avg3 = vhadd_u8(avg1, avg2); | |
176 return vget_lane_u32(vreinterpret_u32_u8(avg3), 0); | |
177 } | |
178 | |
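Average2/3/4 above operate on all four channels of a packed pixel at once: vhadd_u8 computes the per-byte halving add (a + b) >> 1 with no intermediate overflow. A scalar per-channel sketch of Average2, with the others expressed in terms of it exactly as the NEON code chains its vhadd_u8 calls:

  #include <stdint.h>

  // Scalar equivalent of Average2(): per 8-bit channel, (a + b) >> 1.
  static uint32_t Average2_Scalar(uint32_t a, uint32_t b) {
    uint32_t out = 0;
    int shift;
    for (shift = 0; shift < 32; shift += 8) {
      const uint32_t ca = (a >> shift) & 0xff;
      const uint32_t cb = (b >> shift) & 0xff;
      out |= ((ca + cb) >> 1) << shift;
    }
    return out;
  }
  // Average3(a, b, c) is then Average2_Scalar(Average2_Scalar(a, c), b), and
  // Average4(a, b, c, d) is Average2_Scalar(Average2_Scalar(a, b),
  //                                         Average2_Scalar(c, d)).

Predictor5 through Predictor10 below simply bind these averages to particular combinations of the left pixel and the top-left/top/top-right neighbours.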
179 static uint32_t Predictor5(uint32_t left, const uint32_t* const top) { | |
180 return Average3(&left, top + 0, top + 1); | |
181 } | |
182 | |
183 static uint32_t Predictor6(uint32_t left, const uint32_t* const top) { | |
184 return Average2(&left, top - 1); | |
185 } | |
186 | |
187 static uint32_t Predictor7(uint32_t left, const uint32_t* const top) { | |
188 return Average2(&left, top + 0); | |
189 } | |
190 | |
191 static uint32_t Predictor8(uint32_t left, const uint32_t* const top) { | |
192 (void)left; | |
193 return Average2(top - 1, top + 0); | |
194 } | |
195 | |
196 static uint32_t Predictor9(uint32_t left, const uint32_t* const top) { | |
197 (void)left; | |
198 return Average2(top + 0, top + 1); | |
199 } | |
200 | |
201 static uint32_t Predictor10(uint32_t left, const uint32_t* const top) { | |
202 return Average4(&left, top - 1, top + 0, top + 1); | |
203 } | |
204 | |
205 //------------------------------------------------------------------------------ | |
206 | |
207 static WEBP_INLINE uint32_t Select(const uint32_t* const c0, | |
208 const uint32_t* const c1, | |
209 const uint32_t* const c2) { | |
210 const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); | |
211 const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); | |
212 const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); | |
213 const uint8x8_t bc = vabd_u8(p1, p2); // |b-c| | |
214 const uint8x8_t ac = vabd_u8(p0, p2); // |a-c| | |
215 const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc)); | |
216 const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac)); | |
217 const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac)); | |
218 const int32_t pa_minus_pb = vget_lane_s32(diff, 0); | |
219 return (pa_minus_pb <= 0) ? *c0 : *c1; | |
220 } | |
221 | |
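Select() picks whichever of c0 and c1 is closer to the reference pixel c2 by summing absolute per-channel differences; Predictor11 passes c0 = top, c1 = left, c2 = top-left. A scalar equivalent of the NEON reduction:

  #include <stdint.h>
  #include <stdlib.h>  /* abs */

  // Scalar equivalent of Select(): accumulate |c1 - c2| - |c0 - c2| over the
  // four channels and keep c0 when the total is <= 0, c1 otherwise.
  static uint32_t Select_Scalar(uint32_t c0, uint32_t c1, uint32_t c2) {
    int pa_minus_pb = 0;
    int shift;
    for (shift = 0; shift < 32; shift += 8) {
      const int a = (c0 >> shift) & 0xff;
      const int b = (c1 >> shift) & 0xff;
      const int c = (c2 >> shift) & 0xff;
      pa_minus_pb += abs(b - c) - abs(a - c);
    }
    return (pa_minus_pb <= 0) ? c0 : c1;
  }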
222 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) { | |
223 return Select(top + 0, &left, top - 1); | |
224 } | |
225 | |
226 static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0, | |
227 const uint32_t* const c1, | |
228 const uint32_t* const c2) { | |
229 const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); | |
230 const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); | |
231 const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); | |
232 const uint16x8_t sum0 = vaddl_u8(p0, p1); // add and widen | |
233 const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2)); // widen and subtract | |
234 const uint8x8_t out = vqmovn_u16(sum1); // narrow and clamp | |
235 return vget_lane_u32(vreinterpret_u32_u8(out), 0); | |
236 } | |
237 | |
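ClampedAddSubtractFull() computes, per channel, c0 + c1 - c2 clamped to [0, 255]: the widening add keeps the 9-bit sum exact, the saturating u16 subtract clamps at 0, and the saturating narrow clamps at 255. Scalar sketch (Predictor12 passes c0 = left, c1 = top, c2 = top-left):

  #include <stdint.h>

  // Scalar equivalent of ClampedAddSubtractFull().
  static uint32_t ClampedAddSubtractFull_Scalar(uint32_t c0, uint32_t c1,
                                                uint32_t c2) {
    uint32_t out = 0;
    int shift;
    for (shift = 0; shift < 32; shift += 8) {
      int v = (int)((c0 >> shift) & 0xff) + (int)((c1 >> shift) & 0xff) -
              (int)((c2 >> shift) & 0xff);
      if (v < 0) v = 0;
      if (v > 255) v = 255;
      out |= (uint32_t)v << shift;
    }
    return out;
  }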
238 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) { | |
239 return ClampedAddSubtractFull(&left, top + 0, top - 1); | |
240 } | |
241 | |
242 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0, | |
243 const uint32_t* const c1, | |
244 const uint32_t* const c2) { | |
245 const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); | |
246 const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); | |
247 const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); | |
248 const uint8x8_t avg = vhadd_u8(p0, p1); // Average(c0,c1) | |
249 const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1); // (a-b)>>1 saturated | |
250 const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1); // (b-a)>>1 saturated | |
251 const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba); | |
252 return vget_lane_u32(vreinterpret_u32_u8(out), 0); | |
253 } | |
254 | |
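ClampedAddSubtractHalf() first averages c0 and c1, then moves that average half of its distance from c2 further away from c2, saturating every intermediate step to [0, 255]. A literal per-byte rendering of the saturated NEON sequence (Predictor13 passes c0 = left, c1 = top, c2 = top-left):

  #include <stdint.h>

  static uint8_t SatSub(int a, int b) { return (a > b) ? (uint8_t)(a - b) : 0; }
  static uint8_t SatAdd(int a, int b) { return (a + b > 255) ? 255 : (uint8_t)(a + b); }

  // Scalar equivalent of the vhadd/vqsub/vqadd chain above.
  static uint32_t ClampedAddSubtractHalf_Scalar(uint32_t c0, uint32_t c1,
                                                uint32_t c2) {
    uint32_t out = 0;
    int shift;
    for (shift = 0; shift < 32; shift += 8) {
      const int a = (c0 >> shift) & 0xff;
      const int b = (c1 >> shift) & 0xff;
      const int c = (c2 >> shift) & 0xff;
      const int avg = (a + b) >> 1;          // vhadd_u8
      const int ab  = SatSub(avg, c) >> 1;   // (avg - c) >> 1, saturated at 0
      const int ba  = SatSub(c, avg) >> 1;   // (c - avg) >> 1, saturated at 0
      out |= (uint32_t)SatSub(SatAdd(avg, ab), ba) << shift;
    }
    return out;
  }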
255 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { | |
256 return ClampedAddSubtractHalf(&left, top + 0, top - 1); | |
257 } | |
258 | |
259 //------------------------------------------------------------------------------ | |
260 // Subtract-Green Transform | 143 // Subtract-Green Transform |
261 | 144 |
262 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use | 145 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use |
263 // non-standard versions there. | 146 // non-standard versions there. |
264 #if defined(__APPLE__) && defined(__aarch64__) && \ | 147 #if defined(__APPLE__) && defined(__aarch64__) && \ |
265 defined(__apple_build_version__) && (__apple_build_version__< 6020037) | 148 defined(__apple_build_version__) && (__apple_build_version__< 6020037) |
266 #define USE_VTBLQ | 149 #define USE_VTBLQ |
267 #endif | 150 #endif |
268 | 151 |
269 #ifdef USE_VTBLQ | 152 #ifdef USE_VTBLQ |
(...skipping 11 matching lines...)
281 // 255 = byte will be zeroed | 164 // 255 = byte will be zeroed |
282 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; | 165 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; |
283 | 166 |
284 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, | 167 static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, |
285 const uint8x8_t shuffle) { | 168 const uint8x8_t shuffle) { |
286 return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), | 169 return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), |
287 vtbl1_u8(vget_high_u8(argb), shuffle)); | 170 vtbl1_u8(vget_high_u8(argb), shuffle)); |
288 } | 171 } |
289 #endif // USE_VTBLQ | 172 #endif // USE_VTBLQ |
290 | 173 |
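The shuffle works because vtbl1_u8 (and the 16-byte table variant used when USE_VTBLQ is defined) returns 0 for any out-of-range index, which is what the 255 entries in kGreenShuffle exploit: for each pixel the lookup produces the bytes (g, 0, g, 0), lining up with the blue and red positions of the little-endian BGRA byte layout. A scalar model of that lookup:

  #include <stdint.h>

  // Scalar model of the NEON table lookup: each output byte is
  // table[idx[i]], and an index past the end of the table yields 0.
  static void Shuffle_Scalar(const uint8_t* table, int table_size,
                             const uint8_t* idx, int n, uint8_t* dst) {
    int i;
    for (i = 0; i < n; ++i) {
      dst[i] = (idx[i] < table_size) ? table[idx[i]] : 0;
    }
  }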
291 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { | 174 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { |
292 const uint32_t* const end = argb_data + (num_pixels & ~3); | 175 const uint32_t* const end = argb_data + (num_pixels & ~3); |
293 #ifdef USE_VTBLQ | 176 #ifdef USE_VTBLQ |
294 const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); | 177 const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); |
295 #else | 178 #else |
296 const uint8x8_t shuffle = vld1_u8(kGreenShuffle); | 179 const uint8x8_t shuffle = vld1_u8(kGreenShuffle); |
297 #endif | 180 #endif |
298 for (; argb_data < end; argb_data += 4) { | 181 for (; argb_data < end; argb_data += 4) { |
299 const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); | 182 const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); |
300 const uint8x16_t greens = DoGreenShuffle(argb, shuffle); | 183 const uint8x16_t greens = DoGreenShuffle(argb, shuffle); |
301 vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens)); | |
302 } | |
303 // fallthrough and finish off with plain-C | |
304 VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3); | |
305 } | |
306 | |
307 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { | |
308 const uint32_t* const end = argb_data + (num_pixels & ~3); | |
309 #ifdef USE_VTBLQ | |
310 const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); | |
311 #else | |
312 const uint8x8_t shuffle = vld1_u8(kGreenShuffle); | |
313 #endif | |
314 for (; argb_data < end; argb_data += 4) { | |
315 const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); | |
316 const uint8x16_t greens = DoGreenShuffle(argb, shuffle); | |
317 vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); | 184 vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); |
318 } | 185 } |
319 // fallthrough and finish off with plain-C | 186 // fallthrough and finish off with plain-C |
320 VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); | 187 VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); |
321 } | 188 } |
322 | 189 |
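AddGreenToBlueAndRed() is the decoder-side inverse of the subtract-green transform: the green channel is added back, modulo 256, to both blue and red, while green and alpha are left untouched (the per-byte vaddq_u8 wraps naturally). A scalar sketch of the same step:

  #include <stdint.h>

  // Scalar equivalent of AddGreenToBlueAndRed().
  static void AddGreenToBlueAndRed_Scalar(uint32_t* argb_data, int num_pixels) {
    int i;
    for (i = 0; i < num_pixels; ++i) {
      const uint32_t argb = argb_data[i];
      const uint32_t green = (argb >> 8) & 0xff;
      uint32_t red_blue = argb & 0x00ff00ffu;
      red_blue += (green << 16) | green;  // add green to red and blue...
      red_blue &= 0x00ff00ffu;            // ...and keep the mod-256 wrap-around
      argb_data[i] = (argb & 0xff00ff00u) | red_blue;
    }
  }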
| 190 //------------------------------------------------------------------------------ |
| 191 // Color Transform |
| 192 |
| 193 static void TransformColorInverse(const VP8LMultipliers* const m, |
| 194 uint32_t* argb_data, int num_pixels) { |
| 195 // sign-extended multiplying constants, pre-shifted by 6. |
| 196 #define CST(X) (((int16_t)(m->X << 8)) >> 6) |
| 197 const int16_t rb[8] = { |
| 198 CST(green_to_blue_), CST(green_to_red_), |
| 199 CST(green_to_blue_), CST(green_to_red_), |
| 200 CST(green_to_blue_), CST(green_to_red_), |
| 201 CST(green_to_blue_), CST(green_to_red_) |
| 202 }; |
| 203 const int16x8_t mults_rb = vld1q_s16(rb); |
| 204 const int16_t b2[8] = { |
| 205 0, CST(red_to_blue_), 0, CST(red_to_blue_), |
| 206 0, CST(red_to_blue_), 0, CST(red_to_blue_), |
| 207 }; |
| 208 const int16x8_t mults_b2 = vld1q_s16(b2); |
| 209 #undef CST |
| 210 #ifdef USE_VTBLQ |
| 211 static const uint8_t kg0g0[16] = { |
| 212 255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13 |
| 213 }; |
| 214 const uint8x16_t shuffle = vld1q_u8(kg0g0); |
| 215 #else |
| 216 static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 }; |
| 217 const uint8x8_t shuffle = vld1_u8(k0g0g); |
| 218 #endif |
| 219 const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u); |
| 220 int i; |
| 221 for (i = 0; i + 4 <= num_pixels; i += 4) { |
| 222 const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i)); |
| 223 const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag); |
| 224 // 0 g 0 g |
| 225 const uint8x16_t greens = DoGreenShuffle(in, shuffle); |
| 226 // x dr x db1 |
| 227 const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb); |
| 228 // x r' x b' |
| 229 const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in), |
| 230 vreinterpretq_s8_s16(A)); |
| 231 // r' 0 b' 0 |
| 232 const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8); |
| 233 // x db2 0 0 |
| 234 const int16x8_t D = vqdmulhq_s16(C, mults_b2); |
| 235 // 0 x db2 0 |
| 236 const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8); |
| 237 // r' x b'' 0 |
| 238 const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E), |
| 239 vreinterpretq_s8_s16(C)); |
| 240 // 0 r' 0 b'' |
| 241 const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8); |
| 242 const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0); |
| 243 vst1q_u32(argb_data + i, out); |
| 244 } |
| 245 // Fall-back to C-version for left-overs. |
| 246 VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i); |
| 247 } |
| 248 |
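TransformColorInverse() undoes the VP8L color transform. CST(X) sign-extends the stored 8-bit multiplier and scales it by 4, so that vqdmulhq_s16, which returns (2 * a * b) >> 16, applied to a green value placed in the high byte of a 16-bit lane (g << 8) yields ((int8_t)m * (int8_t)g) >> 5, the standard color-transform delta. A scalar sketch of the whole inverse step, written from the transform's definition rather than copied from VP8LTransformColorInverse_C:

  #include <stdint.h>

  // Color-transform delta: product of two values interpreted as signed bytes,
  // arithmetically shifted right by 5.
  static int ColorDelta(uint8_t t, uint8_t c) {
    return ((int8_t)t * (int8_t)c) >> 5;
  }

  // Scalar sketch of TransformColorInverse(): red is corrected from green,
  // blue from green and from the already-corrected red; alpha and green pass
  // through unchanged.
  static void TransformColorInverse_Scalar(const VP8LMultipliers* const m,
                                           uint32_t* argb_data, int num_pixels) {
    int i;
    for (i = 0; i < num_pixels; ++i) {
      const uint32_t argb = argb_data[i];
      const uint8_t green = (argb >> 8) & 0xff;
      int red  = (argb >> 16) & 0xff;
      int blue = (argb >>  0) & 0xff;
      red  = (red  + ColorDelta(m->green_to_red_, green)) & 0xff;
      blue = (blue + ColorDelta(m->green_to_blue_, green)) & 0xff;
      blue = (blue + ColorDelta(m->red_to_blue_, (uint8_t)red)) & 0xff;
      argb_data[i] = (argb & 0xff00ff00u) |
                     ((uint32_t)red << 16) | (uint32_t)blue;
    }
  }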
323 #undef USE_VTBLQ | 249 #undef USE_VTBLQ |
324 | 250 |
325 #endif // USE_INTRINSICS | |
326 | |
327 #endif // WEBP_USE_NEON | |
328 | |
329 //------------------------------------------------------------------------------ | 251 //------------------------------------------------------------------------------ |
| 252 // Entry point |
330 | 253 |
331 extern void VP8LDspInitNEON(void); | 254 extern void VP8LDspInitNEON(void); |
332 | 255 |
333 void VP8LDspInitNEON(void) { | 256 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) { |
334 #if defined(WEBP_USE_NEON) | |
335 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; | 257 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA; |
336 VP8LConvertBGRAToBGR = ConvertBGRAToBGR; | 258 VP8LConvertBGRAToBGR = ConvertBGRAToBGR; |
337 VP8LConvertBGRAToRGB = ConvertBGRAToRGB; | 259 VP8LConvertBGRAToRGB = ConvertBGRAToRGB; |
338 | 260 |
339 #ifdef USE_INTRINSICS | |
340 VP8LPredictors[5] = Predictor5; | |
341 VP8LPredictors[6] = Predictor6; | |
342 VP8LPredictors[7] = Predictor7; | |
343 VP8LPredictors[8] = Predictor8; | |
344 VP8LPredictors[9] = Predictor9; | |
345 VP8LPredictors[10] = Predictor10; | |
346 VP8LPredictors[11] = Predictor11; | |
347 VP8LPredictors[12] = Predictor12; | |
348 VP8LPredictors[13] = Predictor13; | |
349 | |
350 VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; | |
351 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; | 261 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; |
352 #endif | 262 VP8LTransformColorInverse = TransformColorInverse; |
353 | |
354 #endif // WEBP_USE_NEON | |
355 } | 263 } |
356 | 264 |
357 //------------------------------------------------------------------------------ | 265 #else // !WEBP_USE_NEON |
| 266 |
| 267 WEBP_DSP_INIT_STUB(VP8LDspInitNEON) |
| 268 |
| 269 #endif // WEBP_USE_NEON |
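The NEW layout guards the entire implementation with WEBP_USE_NEON at the top of the file and, when NEON is unavailable, falls back to WEBP_DSP_INIT_STUB so callers can still invoke VP8LDspInitNEON() unconditionally. Roughly what that stub provides (a sketch; the actual macro in dsp.h may differ in details such as the TSAN annotation):

  // Approximate expansion of WEBP_DSP_INIT_STUB(VP8LDspInitNEON) on builds
  // without NEON support: an init function that installs nothing.
  extern void VP8LDspInitNEON(void);
  void VP8LDspInitNEON(void) {}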