Index: third_party/libwebp/dsp/dec_neon.c
diff --git a/third_party/libwebp/dsp/dec_neon.c b/third_party/libwebp/dsp/dec_neon.c
index 5dcd3b7e5eb8cc4446ad093c590f314e2800064b..9c3d8cc0165001327432636db8bb5335b8a15f90 100644
--- a/third_party/libwebp/dsp/dec_neon.c
+++ b/third_party/libwebp/dsp/dec_neon.c
@@ -14,15 +14,11 @@
#include "./dsp.h"
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
#if defined(WEBP_USE_NEON)
#include "../dec/vp8i.h"
-#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \
+#define QRegs "q0", "q1", "q2", "q3", \
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
#define FLIP_SIGN_BIT2(a, b, s) \
@@ -101,9 +97,9 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
"vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
"vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
"vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
- "vld1.u8 {q4}, [%[p]] \n" // q1
+ "vld1.u8 {q12}, [%[p]] \n" // q1
- DO_FILTER2(q1, q2, q3, q4, %[thresh])
+ DO_FILTER2(q1, q2, q3, q12, %[thresh])
"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
@@ -122,18 +118,18 @@ static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
"add r5, r4, %[stride] \n" // base2 = base1 + stride
LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
- LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
- "vswp d3, d6 \n" // p1:q1 p0:q3
- "vswp d5, d8 \n" // q0:q2 q1:q4
- "vswp q2, q3 \n" // p1:q1 p0:q2 q0:q3 q1:q4
+ LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
+ "vswp d3, d24 \n" // p1:q1 p0:q3
+ "vswp d5, d26 \n" // q0:q2 q1:q4
+ "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
- DO_FILTER2(q1, q2, q3, q4, %[thresh])
+ DO_FILTER2(q1, q2, q12, q13, %[thresh])
"sub %[p], %[p], #1 \n" // p - 1
- "vswp d5, d6 \n"
+ "vswp d5, d24 \n"
STORE8x2(d4, d5, [%[p]], %[stride])
- STORE8x2(d6, d7, [%[p]], %[stride])
+ STORE8x2(d24, d25, [%[p]], %[stride])
: [p] "+r"(p)
: [stride] "r"(stride), [thresh] "r"(thresh)
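The renames in the two filter hunks above, together with the shortened QRegs clobber list, appear to move the NEON temporaries off q4-q7 (aliases of d8-d15), which the ARM AAPCS treats as callee-saved; caller-saved registers such as q0-q3 and q8-q15 only need to be named in the clobber list. A minimal sketch of that pattern, assuming a NEON-capable toolchain (FillWithNEON is a hypothetical helper, not part of the patch):

#include <stdint.h>

#if defined(__ARM_NEON__)
// Illustrative only: q12 (d24/d25) is caller-saved, so listing it as a
// clobber is enough; q4-q7 (d8-d15) would have to be saved and restored.
static void FillWithNEON(uint8_t dst[16], uint8_t value) {
  __asm__ volatile (
    "vdup.8 q12, %[v] \n"        // broadcast 'value' into all 16 lanes
    "vst1.8 {q12}, [%[out]] \n"  // store the 16 bytes to dst
    :
    : [out] "r"(dst), [v] "r"(value)
    : "memory", "q12"
  );
}
#endif  // __ARM_NEON__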
@@ -160,7 +156,7 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)
-static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
+static void TransformOne(const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
const int16_t constants[] = {20091, 17734, 0, 0};
/* kC1, kC2. Padded because vld1.16 loads 8 bytes
@@ -309,13 +305,44 @@ static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
);
}
-static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
- TransformOneNEON(in, dst);
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
if (do_two) {
- TransformOneNEON(in + 16, dst + 4);
+ TransformOne(in + 16, dst + 4);
}
}
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+ const int DC = (in[0] + 4) >> 3;
+ const int kBPS = BPS;
+ __asm__ volatile (
+ "vdup.16 q1, %[DC] \n"
+
+ "vld1.32 d0[0], [%[dst]], %[kBPS] \n"
+ "vld1.32 d1[0], [%[dst]], %[kBPS] \n"
+ "vld1.32 d0[1], [%[dst]], %[kBPS] \n"
+ "vld1.32 d1[1], [%[dst]], %[kBPS] \n"
+
+ "sub %[dst], %[dst], %[kBPS], lsl #2 \n"
+
+ // add DC and convert to s16.
+ "vaddw.u8 q2, q1, d0 \n"
+ "vaddw.u8 q3, q1, d1 \n"
+ // convert back to u8 with saturation
+ "vqmovun.s16 d0, q2 \n"
+ "vqmovun.s16 d1, q3 \n"
+
+ "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
+ "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
+ "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
+ "vst1.32 d1[1], [%[dst]] \n"
+ : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
+ : [kBPS] "r"(kBPS), /* constants */
+ [DC] "r"(DC)
+ : "memory", "q0", "q1", "q2", "q3" /* clobbered */
+ );
+}
+
static void TransformWHT(const int16_t* in, int16_t* out) {
const int kStep = 32; // The store is only incrementing the pointer as if we
// had stored a single byte.
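The TransformDC routine added above handles blocks whose only non-zero coefficient is the DC term: the rounded value (in[0] + 4) >> 3 is added to every pixel of the 4x4 destination area with unsigned saturation. A plain-C sketch of the same computation (TransformDC_C_Sketch, ClampToU8 and the explicit stride parameter are illustrative stand-ins; the NEON code uses the BPS constant from dsp.h):

#include <stdint.h>

// Illustrative sketch, not libwebp's reference implementation.
static uint8_t ClampToU8(int v) {
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
}

static void TransformDC_C_Sketch(const int16_t* in, uint8_t* dst, int stride) {
  const int dc = (in[0] + 4) >> 3;       // rounded DC coefficient
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      dst[x] = ClampToU8(dst[x] + dc);   // add DC, saturate like vqmovun.s16
    }
    dst += stride;
  }
}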
@@ -324,39 +351,39 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
// load data into q0, q1
"vld1.16 {q0, q1}, [%[in]] \n"
- "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]
- "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]
- "vsubl.s16 q4, d1, d2 \n" // a2 = in[4] - in[8]
- "vsubl.s16 q5, d0, d3 \n" // a3 = in[0] - in[12]
+ "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12]
+ "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8]
+ "vsubl.s16 q10, d1, d2 \n" // a2 = in[4] - in[8]
+ "vsubl.s16 q11, d0, d3 \n" // a3 = in[0] - in[12]
- "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1
- "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1
- "vadd.s32 q1, q5, q4 \n" // tmp[4] = a3 + a2
- "vsub.s32 q3, q5, q4 \n" // tmp[12] = a3 - a2
+ "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1
+ "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1
+ "vadd.s32 q1, q11, q10 \n" // tmp[4] = a3 + a2
+ "vsub.s32 q3, q11, q10 \n" // tmp[12] = a3 - a2
// Transpose
// q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
// q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
- "vswp d1, d4 \n" // vtrn.64 q0, q2
- "vswp d3, d6 \n" // vtrn.64 q1, q3
+ "vswp d1, d4 \n" // vtrn.64 q0, q2
+ "vswp d3, d6 \n" // vtrn.64 q1, q3
"vtrn.32 q0, q1 \n"
"vtrn.32 q2, q3 \n"
- "vmov.s32 q4, #3 \n" // dc = 3
- "vadd.s32 q0, q0, q4 \n" // dc = tmp[0] + 3
- "vadd.s32 q6, q0, q3 \n" // a0 = dc + tmp[3]
- "vadd.s32 q7, q1, q2 \n" // a1 = tmp[1] + tmp[2]
- "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]
- "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]
+ "vmov.s32 q10, #3 \n" // dc = 3
+ "vadd.s32 q0, q0, q10 \n" // dc = tmp[0] + 3
+ "vadd.s32 q12, q0, q3 \n" // a0 = dc + tmp[3]
+ "vadd.s32 q13, q1, q2 \n" // a1 = tmp[1] + tmp[2]
+ "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2]
+ "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3]
- "vadd.s32 q0, q6, q7 \n"
- "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3
+ "vadd.s32 q0, q12, q13 \n"
+ "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3
"vadd.s32 q1, q9, q8 \n"
- "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3
- "vsub.s32 q2, q6, q7 \n"
- "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3
+ "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3
+ "vsub.s32 q2, q12, q13 \n"
+ "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3
"vsub.s32 q3, q9, q8 \n"
- "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3
+ "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3
// set the results to output
"vst1.16 d0[0], [%[out]], %[kStep] \n"
@@ -378,8 +405,8 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
: [out] "+r"(out) // modified registers
: [in] "r"(in), [kStep] "r"(kStep) // constants
- : "memory", "q0", "q1", "q2", "q3", "q4",
- "q5", "q6", "q7", "q8", "q9" // clobbered
+ : "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13" // clobbered
);
}
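The TransformWHT hunk above only renumbers registers; the arithmetic is unchanged. Reconstructed from the comments in the assembly, the inverse Walsh-Hadamard transform amounts to roughly the following C sketch (TransformWHT_C_Sketch is illustrative, not libwebp's reference code; the out[0]/out[16]/out[32]/out[48] spacing mirrors the kStep = 32-byte post-increment of the vst1.16 stores):

#include <stdint.h>

// Illustrative sketch pieced together from the assembly comments above.
static void TransformWHT_C_Sketch(const int16_t* in, int16_t* out) {
  int32_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {                    // vertical pass, one column per i
    const int32_t a0 = in[0 + i] + in[12 + i];
    const int32_t a1 = in[4 + i] + in[8 + i];
    const int32_t a2 = in[4 + i] - in[8 + i];
    const int32_t a3 = in[0 + i] - in[12 + i];
    tmp[0 + i]  = a0 + a1;
    tmp[8 + i]  = a0 - a1;
    tmp[4 + i]  = a3 + a2;
    tmp[12 + i] = a3 - a2;
  }
  for (i = 0; i < 4; ++i) {                    // horizontal pass, one row per i
    const int32_t dc = tmp[0 + i * 4] + 3;     // rounder folded into dc
    const int32_t a0 = dc + tmp[3 + i * 4];
    const int32_t a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
    const int32_t a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
    const int32_t a3 = dc - tmp[3 + i * 4];
    out[0]  = (int16_t)((a0 + a1) >> 3);
    out[16] = (int16_t)((a3 + a2) >> 3);
    out[32] = (int16_t)((a0 - a1) >> 3);
    out[48] = (int16_t)((a3 - a2) >> 3);
    out += 64;                                 // next row of results
  }
}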
@@ -392,7 +419,9 @@ extern void VP8DspInitNEON(void);
void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON)
- VP8Transform = TransformTwoNEON;
+ VP8Transform = TransformTwo;
+ VP8TransformAC3 = TransformOne; // no special code here
+ VP8TransformDC = TransformDC;
VP8TransformWHT = TransformWHT;
VP8SimpleVFilter16 = SimpleVFilter16NEON;
@@ -402,6 +431,3 @@ void VP8DspInitNEON(void) {
#endif // WEBP_USE_NEON
}
-#if defined(__cplusplus) || defined(c_plusplus)
-} // extern "C"
-#endif
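With this change VP8DspInitNEON() registers two additional transform hooks, VP8TransformDC and VP8TransformAC3, alongside VP8Transform. A minimal usage sketch, assuming only the hook names and signatures visible above (ReconstructBlock is a hypothetical caller, not the decoder's actual reconstruction path):

#include <stdint.h>
#include "./dsp.h"   // declares VP8DspInit() and the VP8Transform* hooks

// Hypothetical caller, for illustration: the decoder reaches the NEON code
// only through these function pointers once the dsp init has run.
static void ReconstructBlock(const int16_t coeffs[16], uint8_t* dst,
                             int has_ac) {
  if (has_ac) {
    VP8Transform(coeffs, dst, 0);   // full inverse transform, one block
  } else {
    VP8TransformDC(coeffs, dst);    // DC-only shortcut added by this patch
  }
}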