Index: third_party/libwebp/dsp/dec_neon.c |
diff --git a/third_party/libwebp/dsp/dec_neon.c b/third_party/libwebp/dsp/dec_neon.c |
index ec824b790ba3d87a7996c61639cb94e5aa080faa..5d7cff151c81f34bd0e2374f6b1fe8cb93ff5cf6 100644 |
--- a/third_party/libwebp/dsp/dec_neon.c |
+++ b/third_party/libwebp/dsp/dec_neon.c |
@@ -12,14 +12,14 @@ |
#include "./dsp.h" |
-#if defined(WEBP_USE_NEON) |
- |
-#include "../dec/vp8i.h" |
- |
#if defined(__cplusplus) || defined(c_plusplus) |
extern "C" { |
#endif |
+#if defined(WEBP_USE_NEON) |
+ |
+#include "../dec/vp8i.h" |
+ |
#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \ |
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
@@ -155,6 +155,9 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { |
} |
} |
+//----------------------------------------------------------------------------- |
+// Inverse transforms (Paragraph 14.4) |
+ |
static void TransformOneNEON(const int16_t *in, uint8_t *dst) { |
const int kBPS = BPS; |
const int16_t constants[] = {20091, 17734, 0, 0}; |
@@ -311,19 +314,92 @@ static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { |
} |
} |
+static void TransformWHT(const int16_t* in, int16_t* out) { |
+ const int kStep = 32; // Byte post-increment applied to 'out' after each store: |
+ // 32 bytes = 16 int16_t between consecutive results. |
+ __asm__ volatile ( |
+ // Pass 1: butterflies across the four groups in[0..3], in[4..7], in[8..11], in[12..15] |
+ // load all 16 coefficients: q0 = in[0..7], q1 = in[8..15] |
+ "vld1.16 {q0, q1}, [%[in]] \n" |
+ |
+ "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] (widened to 32 bits) |
+ "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] |
+ "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8] |
+ "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12] |
+ |
+ "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 |
+ "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 |
+ "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2 |
+ "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2 |
+ |
+ // Transpose the 4x4 block of 32-bit values; afterwards: |
+ // q0 = tmp[0, 4, 8, 12], q1 = tmp[1, 5, 9, 13] |
+ // q2 = tmp[2, 6, 10, 14], q3 = tmp[3, 7, 11, 15] |
+ "vswp d1, d4 \n" // vtrn.64 q0, q2 |
+ "vswp d3, d6 \n" // vtrn.64 q1, q3 |
+ "vtrn.32 q0, q1 \n" |
+ "vtrn.32 q2, q3 \n" |
+ |
+ "vmov.s32 q4, #3 \n" // rounding constant 3 in every lane |
+ "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3 |
+ "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3] |
+ "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2] |
+ "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] |
+ "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] |
+ |
+ "vadd.s32 q0, q6, q7 \n" |
+ "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3, narrowed to 16 bits |
+ "vadd.s32 q1, q9, q8 \n" |
+ "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 |
+ "vsub.s32 q2, q6, q7 \n" |
+ "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 |
+ "vsub.s32 q3, q9, q8 \n" |
+ "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 |
+ |
+ // write the 16 results one 16-bit lane at a time, kStep bytes apart |
+ "vst1.16 d0[0], [%[out]], %[kStep] \n" |
+ "vst1.16 d1[0], [%[out]], %[kStep] \n" |
+ "vst1.16 d2[0], [%[out]], %[kStep] \n" |
+ "vst1.16 d3[0], [%[out]], %[kStep] \n" |
+ "vst1.16 d0[1], [%[out]], %[kStep] \n" |
+ "vst1.16 d1[1], [%[out]], %[kStep] \n" |
+ "vst1.16 d2[1], [%[out]], %[kStep] \n" |
+ "vst1.16 d3[1], [%[out]], %[kStep] \n" |
+ "vst1.16 d0[2], [%[out]], %[kStep] \n" |
+ "vst1.16 d1[2], [%[out]], %[kStep] \n" |
+ "vst1.16 d2[2], [%[out]], %[kStep] \n" |
+ "vst1.16 d3[2], [%[out]], %[kStep] \n" |
+ "vst1.16 d0[3], [%[out]], %[kStep] \n" |
+ "vst1.16 d1[3], [%[out]], %[kStep] \n" |
+ "vst1.16 d2[3], [%[out]], %[kStep] \n" |
+ "vst1.16 d3[3], [%[out]], %[kStep] \n" |
+ |
+ : [out] "+r"(out) // modified registers |
+ : [in] "r"(in), [kStep] "r"(kStep) // constants |
+ : "memory", "q0", "q1", "q2", "q3", "q4", |
+ "q5", "q6", "q7", "q8", "q9" // clobbered |
+ ); |
+} |
+ |
+#endif // WEBP_USE_NEON |
+ |
+//------------------------------------------------------------------------------ |
+// Entry point: assigns the NEON routines to the VP8* hooks (no-op without NEON) |
+ |
extern void VP8DspInitNEON(void); |
void VP8DspInitNEON(void) { |
+#if defined(WEBP_USE_NEON) |
VP8Transform = TransformTwoNEON; |
+ VP8TransformWHT = TransformWHT; |
VP8SimpleVFilter16 = SimpleVFilter16NEON; |
VP8SimpleHFilter16 = SimpleHFilter16NEON; |
VP8SimpleVFilter16i = SimpleVFilter16iNEON; |
VP8SimpleHFilter16i = SimpleHFilter16iNEON; |
+#endif // WEBP_USE_NEON |
} |
#if defined(__cplusplus) || defined(c_plusplus) |
} // extern "C" |
#endif |
- |
-#endif // WEBP_USE_NEON |