Index: simd/jsimd_arm_neon.S |
=================================================================== |
--- simd/jsimd_arm_neon.S (revision 106486) |
+++ simd/jsimd_arm_neon.S (working copy) |
@@ -3,7 +3,7 @@ |
* |
* Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
* All rights reserved. |
- * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com> |
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
* |
* This software is provided 'as-is', without any express or implied |
* warranty. In no event will the authors be held liable for any damages |
@@ -62,17 +62,630 @@ |
vtrn.32 \x1, \x3 |
.endm |
+#define CENTERJSAMPLE 128 |
+ |
/*****************************************************************************/ |
/* |
+ * Perform dequantization and inverse DCT on one block of coefficients. |
+ * |
+ * GLOBAL(void) |
+ * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, |
+ * JSAMPARRAY output_buf, JDIMENSION output_col) |
+ */ |
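+
+/*
+ * Scaling overview (assuming the usual jidctint.c conventions, CONST_BITS = 13
+ * and PASS1_BITS = 2): pass 1 keeps PASS1_BITS bits of extra precision, so its
+ * results are descaled by CONST_BITS - PASS1_BITS = 11 (the 'vrshrn #11'
+ * instructions below), while pass 2 descales by CONST_BITS + PASS1_BITS + 3 =
+ * 18 (a 'vshrn #16' followed by a 'vqrshrn #2') and then adds
+ * CENTERJSAMPLE = 128 to produce the final 8-bit samples.
+ */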
+ |
+#define FIX_0_298631336 (2446) |
+#define FIX_0_390180644 (3196) |
+#define FIX_0_541196100 (4433) |
+#define FIX_0_765366865 (6270) |
+#define FIX_0_899976223 (7373) |
+#define FIX_1_175875602 (9633) |
+#define FIX_1_501321110 (12299) |
+#define FIX_1_847759065 (15137) |
+#define FIX_1_961570560 (16069) |
+#define FIX_2_053119869 (16819) |
+#define FIX_2_562915447 (20995) |
+#define FIX_3_072711026 (25172) |
+ |
+#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) |
+#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) |
+#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) |
+#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) |
+#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) |
+#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) |
+#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) |
+#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) |
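+
+/*
+ * The FIX_* values above are the CONST_BITS = 13 fixed-point constants used
+ * by jpeglib's jidctint.c, i.e. FIX(x) = round(x * 2^13).  A minimal C check
+ * of that correspondence:
+ *
+ *   #include <assert.h>
+ *
+ *   #define CONST_BITS 13
+ *   #define FIX(x) ((int) ((x) * (1 << CONST_BITS) + 0.5))
+ *
+ *   int main (void)
+ *   {
+ *     assert (FIX(0.298631336) == 2446);
+ *     assert (FIX(1.175875602) == 9633);
+ *     assert (FIX(2.562915447) == 20995);
+ *     assert (FIX(3.072711026) == 25172);
+ *     return 0;
+ *   }
+ */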
+ |
+/* |
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. |
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm' |
+ */ |
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ |
+{ \ |
+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ |
+ INT32 q1, q2, q3, q4, q5, q6, q7; \ |
+ INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ |
+ \ |
+ /* 1-D iDCT input data */ \ |
+ row0 = xrow0; \ |
+ row1 = xrow1; \ |
+ row2 = xrow2; \ |
+ row3 = xrow3; \ |
+ row4 = xrow4; \ |
+ row5 = xrow5; \ |
+ row6 = xrow6; \ |
+ row7 = xrow7; \ |
+ \ |
+ q5 = row7 + row3; \ |
+ q4 = row5 + row1; \ |
+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ |
+ MULTIPLY(q4, FIX_1_175875602); \ |
+ q7 = MULTIPLY(q5, FIX_1_175875602) + \ |
+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ |
+ q2 = MULTIPLY(row2, FIX_0_541196100) + \ |
+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ |
+ q4 = q6; \ |
+ q3 = ((INT32) row0 - (INT32) row4) << 13; \ |
+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \ |
+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ |
+ /* now we can use q1 (reloadable constants have been used up) */ \ |
+ q1 = q3 + q2; \ |
+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ |
+ MULTIPLY(row1, -FIX_0_899976223); \ |
+ q5 = q7; \ |
+ q1 = q1 + q6; \ |
+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \ |
+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ |
+ \ |
+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ |
+ tmp11_plus_tmp2 = q1; \ |
+ row1 = 0; \ |
+ \ |
+ q1 = q1 - q6; \ |
+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ |
+ MULTIPLY(row3, -FIX_2_562915447); \ |
+ q1 = q1 - q6; \ |
+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ |
+ MULTIPLY(row6, FIX_0_541196100); \ |
+ q3 = q3 - q2; \ |
+ \ |
+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ |
+ tmp11_minus_tmp2 = q1; \ |
+ \ |
+ q1 = ((INT32) row0 + (INT32) row4) << 13; \ |
+ q2 = q1 + q6; \ |
+ q1 = q1 - q6; \ |
+ \ |
+ /* pick up the results */ \ |
+ tmp0 = q4; \ |
+ tmp1 = q5; \ |
+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ |
+ tmp3 = q7; \ |
+ tmp10 = q2; \ |
+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ |
+ tmp12 = q3; \ |
+ tmp13 = q1; \ |
+} |
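+
+/*
+ * The reference macro above uses a few jpeglib-style names that are not
+ * defined in this file (in jpeglib they come from jmorecfg.h, jdct.h and
+ * jidctint.c).  A minimal set of stand-in definitions for a standalone test
+ * harness could look like the sketch below; tmp0..tmp3 and tmp10..tmp13 are
+ * additionally expected to be INT32 variables in the enclosing scope:
+ *
+ *   typedef short DCTELEM;                // 16-bit coefficients
+ *   typedef int INT32;                    // at least 32 bits wide
+ *   #define MULTIPLY(var, cnst)  ((INT32) (var) * (INT32) (cnst))
+ */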
+ |
+#define XFIX_0_899976223 d0[0] |
+#define XFIX_0_541196100 d0[1] |
+#define XFIX_2_562915447 d0[2] |
+#define XFIX_0_298631336_MINUS_0_899976223 d0[3] |
+#define XFIX_1_501321110_MINUS_0_899976223 d1[0] |
+#define XFIX_2_053119869_MINUS_2_562915447 d1[1] |
+#define XFIX_0_541196100_PLUS_0_765366865 d1[2] |
+#define XFIX_1_175875602 d1[3] |
+#define XFIX_1_175875602_MINUS_0_390180644 d2[0] |
+#define XFIX_0_541196100_MINUS_1_847759065 d2[1] |
+#define XFIX_3_072711026_MINUS_2_562915447 d2[2] |
+#define XFIX_1_175875602_MINUS_1_961570560 d2[3] |
+ |
+.balign 16 |
+jsimd_idct_islow_neon_consts: |
+ .short FIX_0_899976223 /* d0[0] */ |
+ .short FIX_0_541196100 /* d0[1] */ |
+ .short FIX_2_562915447 /* d0[2] */ |
+ .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ |
+ .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ |
+ .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ |
+ .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ |
+ .short FIX_1_175875602 /* d1[3] */ |
+ /* reloadable constants */ |
+ .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ |
+ .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ |
+ .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ |
+ .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ |
+ |
+asm_function jsimd_idct_islow_neon |
+ |
+ DCT_TABLE .req r0 |
+ COEF_BLOCK .req r1 |
+ OUTPUT_BUF .req r2 |
+ OUTPUT_COL .req r3 |
+ TMP1 .req r0 |
+ TMP2 .req r1 |
+ TMP3 .req r2 |
+ TMP4 .req ip |
+ |
+ ROW0L .req d16 |
+ ROW0R .req d17 |
+ ROW1L .req d18 |
+ ROW1R .req d19 |
+ ROW2L .req d20 |
+ ROW2R .req d21 |
+ ROW3L .req d22 |
+ ROW3R .req d23 |
+ ROW4L .req d24 |
+ ROW4R .req d25 |
+ ROW5L .req d26 |
+ ROW5R .req d27 |
+ ROW6L .req d28 |
+ ROW6R .req d29 |
+ ROW7L .req d30 |
+ ROW7R .req d31 |
+ |
+ /* Load and dequantize coefficients into NEON registers |
+ * with the following allocation: |
+ * 0 1 2 3 | 4 5 6 7 |
+ * ---------+-------- |
+ * 0 | d16 | d17 ( q8 ) |
+ * 1 | d18 | d19 ( q9 ) |
+ * 2 | d20 | d21 ( q10 ) |
+ * 3 | d22 | d23 ( q11 ) |
+ * 4 | d24 | d25 ( q12 ) |
+ * 5 | d26 | d27 ( q13 ) |
+ * 6 | d28 | d29 ( q14 ) |
+ * 7 | d30 | d31 ( q15 ) |
+ */ |
+ adr ip, jsimd_idct_islow_neon_consts |
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! |
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! |
+ vmul.s16 q8, q8, q0 |
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
+ vmul.s16 q9, q9, q1 |
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! |
+ vmul.s16 q10, q10, q2 |
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
+ vmul.s16 q11, q11, q3 |
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] |
+ vmul.s16 q12, q12, q0 |
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
+ vmul.s16 q14, q14, q2 |
+ vmul.s16 q13, q13, q1 |
+ vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ |
+ add ip, ip, #16 |
+ vmul.s16 q15, q15, q3 |
+ vpush {d8-d15} /* save NEON registers */ |
+ /* 1-D IDCT, pass 1, left 4x8 half */ |
+ vadd.s16 d4, ROW7L, ROW3L |
+ vadd.s16 d5, ROW5L, ROW1L |
+ vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 |
+ vmlal.s16 q6, d5, XFIX_1_175875602 |
+ vmull.s16 q7, d4, XFIX_1_175875602 |
+ /* Check for zero coefficients in the right 4x8 half */
+ push {r4, r5} |
+ vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 |
+ vsubl.s16 q3, ROW0L, ROW4L |
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] |
+ vmull.s16 q2, ROW2L, XFIX_0_541196100 |
+ vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 |
+ orr r0, r4, r5 |
+ vmov q4, q6 |
+ vmlsl.s16 q6, ROW5L, XFIX_2_562915447 |
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] |
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
+ vshl.s32 q3, q3, #13 |
+ orr r0, r0, r4 |
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
+ orr r0, r0, r5 |
+ vadd.s32 q1, q3, q2 |
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] |
+ vmov q5, q7 |
+ vadd.s32 q1, q1, q6 |
+ orr r0, r0, r4 |
+ vmlsl.s16 q7, ROW7L, XFIX_0_899976223 |
+ orr r0, r0, r5 |
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
+ vrshrn.s32 ROW1L, q1, #11 |
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] |
+ vsub.s32 q1, q1, q6 |
+ vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 |
+ orr r0, r0, r4 |
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
+ orr r0, r0, r5 |
+ vsub.s32 q1, q1, q6 |
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] |
+ vmlal.s16 q6, ROW6L, XFIX_0_541196100 |
+ vsub.s32 q3, q3, q2 |
+ orr r0, r0, r4 |
+ vrshrn.s32 ROW6L, q1, #11 |
+ orr r0, r0, r5 |
+ vadd.s32 q1, q3, q5 |
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] |
+ vsub.s32 q3, q3, q5 |
+ vaddl.s16 q5, ROW0L, ROW4L |
+ orr r0, r0, r4 |
+ vrshrn.s32 ROW2L, q1, #11 |
+ orr r0, r0, r5 |
+ vrshrn.s32 ROW5L, q3, #11 |
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] |
+ vshl.s32 q5, q5, #13 |
+ vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 |
+ orr r0, r0, r4 |
+ vadd.s32 q2, q5, q6 |
+ orrs r0, r0, r5 |
+ vsub.s32 q1, q5, q6 |
+ vadd.s32 q6, q2, q7 |
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] |
+ vsub.s32 q2, q2, q7 |
+ vadd.s32 q5, q1, q4 |
+ orr r0, r4, r5 |
+ vsub.s32 q3, q1, q4 |
+ pop {r4, r5} |
+ vrshrn.s32 ROW7L, q2, #11 |
+ vrshrn.s32 ROW3L, q5, #11 |
+ vrshrn.s32 ROW0L, q6, #11 |
+ vrshrn.s32 ROW4L, q3, #11 |
+ |
+ beq 3f /* Go to the special handling of the sparse right 4x8 half */
+ |
+ /* 1-D IDCT, pass 1, right 4x8 half */ |
+ vld1.s16 {d2}, [ip, :64] /* reload constants */ |
+ vadd.s16 d10, ROW7R, ROW3R |
+ vadd.s16 d8, ROW5R, ROW1R |
+ /* Transpose left 4x8 half */ |
+ vtrn.16 ROW6L, ROW7L |
+ vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 |
+ vmlal.s16 q6, d8, XFIX_1_175875602 |
+ vtrn.16 ROW2L, ROW3L |
+ vmull.s16 q7, d10, XFIX_1_175875602 |
+ vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 |
+ vtrn.16 ROW0L, ROW1L |
+ vsubl.s16 q3, ROW0R, ROW4R |
+ vmull.s16 q2, ROW2R, XFIX_0_541196100 |
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 |
+ vtrn.16 ROW4L, ROW5L |
+ vmov q4, q6 |
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447 |
+ vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 |
+ vtrn.32 ROW1L, ROW3L |
+ vshl.s32 q3, q3, #13 |
+ vmlsl.s16 q4, ROW1R, XFIX_0_899976223 |
+ vtrn.32 ROW4L, ROW6L |
+ vadd.s32 q1, q3, q2 |
+ vmov q5, q7 |
+ vadd.s32 q1, q1, q6 |
+ vtrn.32 ROW0L, ROW2L |
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223 |
+ vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 |
+ vrshrn.s32 ROW1R, q1, #11 |
+ vtrn.32 ROW5L, ROW7L |
+ vsub.s32 q1, q1, q6 |
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 |
+ vmlsl.s16 q5, ROW3R, XFIX_2_562915447 |
+ vsub.s32 q1, q1, q6 |
+ vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 |
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100 |
+ vsub.s32 q3, q3, q2 |
+ vrshrn.s32 ROW6R, q1, #11 |
+ vadd.s32 q1, q3, q5 |
+ vsub.s32 q3, q3, q5 |
+ vaddl.s16 q5, ROW0R, ROW4R |
+ vrshrn.s32 ROW2R, q1, #11 |
+ vrshrn.s32 ROW5R, q3, #11 |
+ vshl.s32 q5, q5, #13 |
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 |
+ vadd.s32 q2, q5, q6 |
+ vsub.s32 q1, q5, q6 |
+ vadd.s32 q6, q2, q7 |
+ vsub.s32 q2, q2, q7 |
+ vadd.s32 q5, q1, q4 |
+ vsub.s32 q3, q1, q4 |
+ vrshrn.s32 ROW7R, q2, #11 |
+ vrshrn.s32 ROW3R, q5, #11 |
+ vrshrn.s32 ROW0R, q6, #11 |
+ vrshrn.s32 ROW4R, q3, #11 |
+ /* Transpose right 4x8 half */ |
+ vtrn.16 ROW6R, ROW7R |
+ vtrn.16 ROW2R, ROW3R |
+ vtrn.16 ROW0R, ROW1R |
+ vtrn.16 ROW4R, ROW5R |
+ vtrn.32 ROW1R, ROW3R |
+ vtrn.32 ROW4R, ROW6R |
+ vtrn.32 ROW0R, ROW2R |
+ vtrn.32 ROW5R, ROW7R |
+ |
+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ |
+ vld1.s16 {d2}, [ip, :64] /* reload constants */ |
+ vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ |
+ vmlal.s16 q6, ROW1L, XFIX_1_175875602 |
+ vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ |
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 |
+ vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ |
+ vmlal.s16 q7, ROW3L, XFIX_1_175875602 |
+ vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ |
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 |
+ vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ |
+ vmull.s16 q2, ROW2L, XFIX_0_541196100 |
+ vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ |
+ vmov q4, q6 |
+ vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ |
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
+ vshl.s32 q3, q3, #13 |
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
+ vadd.s32 q1, q3, q2 |
+ vmov q5, q7 |
+ vadd.s32 q1, q1, q6 |
+ vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ |
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
+ vshrn.s32 ROW1L, q1, #16 |
+ vsub.s32 q1, q1, q6 |
+ vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ |
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
+ vsub.s32 q1, q1, q6 |
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
+ vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ |
+ vsub.s32 q3, q3, q2 |
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ |
+ vadd.s32 q1, q3, q5 |
+ vsub.s32 q3, q3, q5 |
+ vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ |
+ vshrn.s32 ROW2L, q1, #16 |
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ |
+ vshl.s32 q5, q5, #13 |
+ vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ |
+ vadd.s32 q2, q5, q6 |
+ vsub.s32 q1, q5, q6 |
+ vadd.s32 q6, q2, q7 |
+ vsub.s32 q2, q2, q7 |
+ vadd.s32 q5, q1, q4 |
+ vsub.s32 q3, q1, q4 |
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ |
+ vshrn.s32 ROW3L, q5, #16 |
+ vshrn.s32 ROW0L, q6, #16 |
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ |
+ /* 1-D IDCT, pass 2, right 4x8 half */ |
+ vld1.s16 {d2}, [ip, :64] /* reload constants */ |
+ vmull.s16 q6, ROW5R, XFIX_1_175875602 |
+ vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ |
+ vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 |
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ |
+ vmull.s16 q7, ROW7R, XFIX_1_175875602 |
+ vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ |
+ vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 |
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ |
+ vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ |
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ |
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 |
+ vmov q4, q6 |
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447 |
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ |
+ vshl.s32 q3, q3, #13 |
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ |
+ vadd.s32 q1, q3, q2 |
+ vmov q5, q7 |
+ vadd.s32 q1, q1, q6 |
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223 |
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ |
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ |
+ vsub.s32 q1, q1, q6 |
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 |
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ |
+ vsub.s32 q1, q1, q6 |
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ |
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100 |
+ vsub.s32 q3, q3, q2 |
+ vshrn.s32 ROW6R, q1, #16 |
+ vadd.s32 q1, q3, q5 |
+ vsub.s32 q3, q3, q5 |
+ vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ |
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ |
+ vshrn.s32 ROW5R, q3, #16 |
+ vshl.s32 q5, q5, #13 |
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 |
+ vadd.s32 q2, q5, q6 |
+ vsub.s32 q1, q5, q6 |
+ vadd.s32 q6, q2, q7 |
+ vsub.s32 q2, q2, q7 |
+ vadd.s32 q5, q1, q4 |
+ vsub.s32 q3, q1, q4 |
+ vshrn.s32 ROW7R, q2, #16 |
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ |
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ |
+ vshrn.s32 ROW4R, q3, #16 |
+ |
+2: /* Descale to 8-bit and range limit */ |
+ vqrshrn.s16 d16, q8, #2 |
+ vqrshrn.s16 d17, q9, #2 |
+ vqrshrn.s16 d18, q10, #2 |
+ vqrshrn.s16 d19, q11, #2 |
+ vpop {d8-d15} /* restore NEON registers */ |
+ vqrshrn.s16 d20, q12, #2 |
+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */ |
+ vtrn.16 q8, q9 |
+ vqrshrn.s16 d21, q13, #2 |
+ vqrshrn.s16 d22, q14, #2 |
+ vmov.u8 q0, #(CENTERJSAMPLE) |
+ vqrshrn.s16 d23, q15, #2 |
+ vtrn.8 d16, d17 |
+ vtrn.8 d18, d19 |
+ vadd.u8 q8, q8, q0 |
+ vadd.u8 q9, q9, q0 |
+ vtrn.16 q10, q11 |
+ /* Store results to the output buffer */ |
+ ldmia OUTPUT_BUF!, {TMP1, TMP2} |
+ add TMP1, TMP1, OUTPUT_COL |
+ add TMP2, TMP2, OUTPUT_COL |
+ vst1.8 {d16}, [TMP1] |
+ vtrn.8 d20, d21 |
+ vst1.8 {d17}, [TMP2] |
+ ldmia OUTPUT_BUF!, {TMP1, TMP2} |
+ add TMP1, TMP1, OUTPUT_COL |
+ add TMP2, TMP2, OUTPUT_COL |
+ vst1.8 {d18}, [TMP1] |
+ vadd.u8 q10, q10, q0 |
+ vst1.8 {d19}, [TMP2] |
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} |
+ add TMP1, TMP1, OUTPUT_COL |
+ add TMP2, TMP2, OUTPUT_COL |
+ add TMP3, TMP3, OUTPUT_COL |
+ add TMP4, TMP4, OUTPUT_COL |
+ vtrn.8 d22, d23 |
+ vst1.8 {d20}, [TMP1] |
+ vadd.u8 q11, q11, q0 |
+ vst1.8 {d21}, [TMP2] |
+ vst1.8 {d22}, [TMP3] |
+ vst1.8 {d23}, [TMP4] |
+ bx lr |
+ |
+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ |
+ |
+ /* Transpose left 4x8 half */ |
+ vtrn.16 ROW6L, ROW7L |
+ vtrn.16 ROW2L, ROW3L |
+ vtrn.16 ROW0L, ROW1L |
+ vtrn.16 ROW4L, ROW5L |
+ vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ |
+ vtrn.32 ROW1L, ROW3L |
+ vtrn.32 ROW4L, ROW6L |
+ vtrn.32 ROW0L, ROW2L |
+ vtrn.32 ROW5L, ROW7L |
+ |
+ cmp r0, #0 |
+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ |
+ |
+ /* Only row 0 is non-zero for the right 4x8 half */ |
+ vdup.s16 ROW1R, ROW0R[1] |
+ vdup.s16 ROW2R, ROW0R[2] |
+ vdup.s16 ROW3R, ROW0R[3] |
+ vdup.s16 ROW4R, ROW0R[0] |
+ vdup.s16 ROW5R, ROW0R[1] |
+ vdup.s16 ROW6R, ROW0R[2] |
+ vdup.s16 ROW7R, ROW0R[3] |
+ vdup.s16 ROW0R, ROW0R[0] |
+ b 1b /* Go to 'normal' second pass */ |
+ |
+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ |
+ vld1.s16 {d2}, [ip, :64] /* reload constants */ |
+ vmull.s16 q6, ROW1L, XFIX_1_175875602 |
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 |
+ vmull.s16 q7, ROW3L, XFIX_1_175875602 |
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 |
+ vmull.s16 q2, ROW2L, XFIX_0_541196100 |
+ vshll.s16 q3, ROW0L, #13 |
+ vmov q4, q6 |
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
+ vadd.s32 q1, q3, q2 |
+ vmov q5, q7 |
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
+ vadd.s32 q1, q1, q6 |
+ vadd.s32 q6, q6, q6 |
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
+ vshrn.s32 ROW1L, q1, #16 |
+ vsub.s32 q1, q1, q6 |
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
+ vsub.s32 q3, q3, q2 |
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ |
+ vadd.s32 q1, q3, q5 |
+ vsub.s32 q3, q3, q5 |
+ vshll.s16 q5, ROW0L, #13 |
+ vshrn.s32 ROW2L, q1, #16 |
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ |
+ vadd.s32 q2, q5, q6 |
+ vsub.s32 q1, q5, q6 |
+ vadd.s32 q6, q2, q7 |
+ vsub.s32 q2, q2, q7 |
+ vadd.s32 q5, q1, q4 |
+ vsub.s32 q3, q1, q4 |
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ |
+ vshrn.s32 ROW3L, q5, #16 |
+ vshrn.s32 ROW0L, q6, #16 |
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ |
+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ |
+ vld1.s16 {d2}, [ip, :64] /* reload constants */ |
+ vmull.s16 q6, ROW5L, XFIX_1_175875602 |
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 |
+ vmull.s16 q7, ROW7L, XFIX_1_175875602 |
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 |
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 |
+ vshll.s16 q3, ROW4L, #13 |
+ vmov q4, q6 |
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 |
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 |
+ vadd.s32 q1, q3, q2 |
+ vmov q5, q7 |
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 |
+ vadd.s32 q1, q1, q6 |
+ vadd.s32 q6, q6, q6 |
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 |
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ |
+ vsub.s32 q1, q1, q6 |
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 |
+ vsub.s32 q3, q3, q2 |
+ vshrn.s32 ROW6R, q1, #16 |
+ vadd.s32 q1, q3, q5 |
+ vsub.s32 q3, q3, q5 |
+ vshll.s16 q5, ROW4L, #13 |
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ |
+ vshrn.s32 ROW5R, q3, #16 |
+ vadd.s32 q2, q5, q6 |
+ vsub.s32 q1, q5, q6 |
+ vadd.s32 q6, q2, q7 |
+ vsub.s32 q2, q2, q7 |
+ vadd.s32 q5, q1, q4 |
+ vsub.s32 q3, q1, q4 |
+ vshrn.s32 ROW7R, q2, #16 |
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ |
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ |
+ vshrn.s32 ROW4R, q3, #16 |
+ b 2b /* Go to epilogue */ |
+ |
+ .unreq DCT_TABLE |
+ .unreq COEF_BLOCK |
+ .unreq OUTPUT_BUF |
+ .unreq OUTPUT_COL |
+ .unreq TMP1 |
+ .unreq TMP2 |
+ .unreq TMP3 |
+ .unreq TMP4 |
+ |
+ .unreq ROW0L |
+ .unreq ROW0R |
+ .unreq ROW1L |
+ .unreq ROW1R |
+ .unreq ROW2L |
+ .unreq ROW2R |
+ .unreq ROW3L |
+ .unreq ROW3R |
+ .unreq ROW4L |
+ .unreq ROW4R |
+ .unreq ROW5L |
+ .unreq ROW5R |
+ .unreq ROW6L |
+ .unreq ROW6R |
+ .unreq ROW7L |
+ .unreq ROW7R |
+.endfunc |
+ |
+/*****************************************************************************/ |
+ |
+/* |
* jsimd_idct_ifast_neon |
* |
* This function contains a fast, not so accurate integer implementation of |
* the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
- * and produces exactly the same output as IJG's original 'jpeg_idct_fast' |
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
* function from jidctfst.c |
* |
- * TODO: a bit better instructions scheduling is needed. |
+ * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * But in the ARM NEON case some extra additions are required because the
+ * VQDMULH instruction can't handle constants larger than 1. So expressions
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition. Overall, there are 6 extra additions
+ * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
*/ |
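+
+/*
+ * For reference, VQDMULH.S16 computes saturate((2 * a * b) >> 16), i.e. it
+ * multiplies by a Q15 fraction.  A scalar C sketch of the trick described
+ * above (the helper name is illustrative; the constant is XFIX_1_847759065
+ * from the table below):
+ *
+ *   #include <stdint.h>
+ *
+ *   static int16_t qdmulh_s16 (int16_t a, int16_t b)
+ *   {
+ *     int32_t p = ((int32_t) a * (int32_t) b) >> 15;  // == (2*a*b) >> 16
+ *     return (int16_t) (p > 32767 ? 32767 : p);       // saturation only hits
+ *   }                                                 // for a == b == -32768
+ *
+ *   // x * 1.847759065 is then approximated as x * 0.847759065 + x:
+ *   //   int16_t y = qdmulh_s16 (x, 473 * 128 - 256 * 128) + x;
+ *   // where (473 * 128 - 256 * 128) / 32768.0 == 0.84765625.
+ */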
#define XFIX_1_082392200 d0[0] |
@@ -87,166 +700,200 @@ |
.short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
.short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
-/* 1-D IDCT helper macro */ |
- |
-.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \ |
- t10, t11, t12, t13, t14 |
- |
- vsub.s16 \t10, \x0, \x4 |
- vadd.s16 \x4, \x0, \x4 |
- vswp.s16 \t10, \x0 |
- vsub.s16 \t11, \x2, \x6 |
- vadd.s16 \x6, \x2, \x6 |
- vswp.s16 \t11, \x2 |
- vsub.s16 \t10, \x3, \x5 |
- vadd.s16 \x5, \x3, \x5 |
- vswp.s16 \t10, \x3 |
- vsub.s16 \t11, \x1, \x7 |
- vadd.s16 \x7, \x1, \x7 |
- vswp.s16 \t11, \x1 |
- |
- vqdmulh.s16 \t13, \x2, d0[1] |
- vadd.s16 \t12, \x3, \x3 |
- vadd.s16 \x2, \x2, \t13 |
- vqdmulh.s16 \t13, \x3, d0[3] |
- vsub.s16 \t10, \x1, \x3 |
- vadd.s16 \t12, \t12, \t13 |
- vqdmulh.s16 \t13, \t10, d0[2] |
- vsub.s16 \t11, \x7, \x5 |
- vadd.s16 \t10, \t10, \t13 |
- vqdmulh.s16 \t13, \t11, d0[1] |
- vadd.s16 \t11, \t11, \t13 |
- |
- vqdmulh.s16 \t13, \x1, d0[0] |
- vsub.s16 \x2, \x6, \x2 |
- vsub.s16 \t14, \x0, \x2 |
- vadd.s16 \x2, \x0, \x2 |
- vadd.s16 \x0, \x4, \x6 |
- vsub.s16 \x4, \x4, \x6 |
- vadd.s16 \x1, \x1, \t13 |
- vadd.s16 \t13, \x7, \x5 |
- vsub.s16 \t12, \t13, \t12 |
- vsub.s16 \t12, \t12, \t10 |
- vadd.s16 \t11, \t12, \t11 |
- vsub.s16 \t10, \x1, \t10 |
- vadd.s16 \t10, \t10, \t11 |
- |
- vsub.s16 \x7, \x0, \t13 |
- vadd.s16 \x0, \x0, \t13 |
- vadd.s16 \x6, \t14, \t12 |
- vsub.s16 \x1, \t14, \t12 |
- vsub.s16 \x5, \x2, \t11 |
- vadd.s16 \x2, \x2, \t11 |
- vsub.s16 \x3, \x4, \t10 |
- vadd.s16 \x4, \x4, \t10 |
-.endm |
- |
asm_function jsimd_idct_ifast_neon |
DCT_TABLE .req r0 |
COEF_BLOCK .req r1 |
OUTPUT_BUF .req r2 |
OUTPUT_COL .req r3 |
- TMP .req ip |
+ TMP1 .req r0 |
+ TMP2 .req r1 |
+ TMP3 .req r2 |
+ TMP4 .req ip |
- vpush {d8-d15} |
- |
- /* Load constants */ |
- adr TMP, jsimd_idct_ifast_neon_consts |
- vld1.16 {d0}, [TMP, :64] |
- |
- /* Load all COEF_BLOCK into NEON registers with the following allocation: |
+ /* Load and dequantize coefficients into NEON registers |
+ * with the following allocation: |
* 0 1 2 3 | 4 5 6 7 |
* ---------+-------- |
- * 0 | d4 | d5 |
- * 1 | d6 | d7 |
- * 2 | d8 | d9 |
- * 3 | d10 | d11 |
- * 4 | d12 | d13 |
- * 5 | d14 | d15 |
- * 6 | d16 | d17 |
- * 7 | d18 | d19 |
+ * 0 | d16 | d17 ( q8 ) |
+ * 1 | d18 | d19 ( q9 ) |
+ * 2 | d20 | d21 ( q10 ) |
+ * 3 | d22 | d23 ( q11 ) |
+ * 4 | d24 | d25 ( q12 ) |
+ * 5 | d26 | d27 ( q13 ) |
+ * 6 | d28 | d29 ( q14 ) |
+ * 7 | d30 | d31 ( q15 ) |
*/ |
- vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]! |
- vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]! |
- vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]! |
- vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]! |
- /* Dequantize */ |
- vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! |
- vmul.s16 q2, q2, q10 |
- vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]! |
- vmul.s16 q3, q3, q11 |
- vmul.s16 q4, q4, q12 |
- vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]! |
- vmul.s16 q5, q5, q13 |
- vmul.s16 q6, q6, q14 |
- vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! |
- vmul.s16 q7, q7, q15 |
- vmul.s16 q8, q8, q10 |
- vmul.s16 q9, q9, q11 |
- |
- /* Pass 1 */ |
- idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 |
- /* Transpose */ |
- transpose_4x4 d4, d6, d8, d10 |
- transpose_4x4 d5, d7, d9, d11 |
- transpose_4x4 d12, d14, d16, d18 |
- transpose_4x4 d13, d15, d17, d19 |
- vswp d12, d5 |
- vswp d14, d7 |
- vswp d16, d9 |
- vswp d18, d11 |
- |
- /* Pass 2 */ |
- idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 |
- /* Transpose */ |
- transpose_4x4 d4, d6, d8, d10 |
- transpose_4x4 d5, d7, d9, d11 |
- transpose_4x4 d12, d14, d16, d18 |
- transpose_4x4 d13, d15, d17, d19 |
- vswp d12, d5 |
- vswp d14, d7 |
- vswp d16, d9 |
- vswp d18, d11 |
- |
- /* Descale and range limit */ |
- vmov.s16 q15, #(0x80 << 5) |
- vqadd.s16 q2, q2, q15 |
- vqadd.s16 q3, q3, q15 |
- vqadd.s16 q4, q4, q15 |
- vqadd.s16 q5, q5, q15 |
- vqadd.s16 q6, q6, q15 |
- vqadd.s16 q7, q7, q15 |
- vqadd.s16 q8, q8, q15 |
- vqadd.s16 q9, q9, q15 |
- vqshrun.s16 d4, q2, #5 |
- vqshrun.s16 d6, q3, #5 |
- vqshrun.s16 d8, q4, #5 |
- vqshrun.s16 d10, q5, #5 |
- vqshrun.s16 d12, q6, #5 |
- vqshrun.s16 d14, q7, #5 |
- vqshrun.s16 d16, q8, #5 |
- vqshrun.s16 d18, q9, #5 |
- |
- /* Store results to the output buffer */ |
- .irp x, d4, d6, d8, d10, d12, d14, d16, d18 |
- ldr TMP, [OUTPUT_BUF], #4 |
- add TMP, TMP, OUTPUT_COL |
- vst1.8 {\x}, [TMP]! |
- .endr |
- |
- vpop {d8-d15} |
+ adr ip, jsimd_idct_ifast_neon_consts |
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! |
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! |
+ vmul.s16 q8, q8, q0 |
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
+ vmul.s16 q9, q9, q1 |
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! |
+ vmul.s16 q10, q10, q2 |
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
+ vmul.s16 q11, q11, q3 |
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] |
+ vmul.s16 q12, q12, q0 |
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
+ vmul.s16 q14, q14, q2 |
+ vmul.s16 q13, q13, q1 |
+ vld1.16 {d0}, [ip, :64] /* load constants */ |
+ vmul.s16 q15, q15, q3 |
+ vpush {d8-d13} /* save NEON registers */ |
+ /* 1-D IDCT, pass 1 */ |
+ vsub.s16 q2, q10, q14 |
+ vadd.s16 q14, q10, q14 |
+ vsub.s16 q1, q11, q13 |
+ vadd.s16 q13, q11, q13 |
+ vsub.s16 q5, q9, q15 |
+ vadd.s16 q15, q9, q15 |
+ vqdmulh.s16 q4, q2, XFIX_1_414213562 |
+ vqdmulh.s16 q6, q1, XFIX_2_613125930 |
+ vadd.s16 q3, q1, q1 |
+ vsub.s16 q1, q5, q1 |
+ vadd.s16 q10, q2, q4 |
+ vqdmulh.s16 q4, q1, XFIX_1_847759065 |
+ vsub.s16 q2, q15, q13 |
+ vadd.s16 q3, q3, q6 |
+ vqdmulh.s16 q6, q2, XFIX_1_414213562 |
+ vadd.s16 q1, q1, q4 |
+ vqdmulh.s16 q4, q5, XFIX_1_082392200 |
+ vsub.s16 q10, q10, q14 |
+ vadd.s16 q2, q2, q6 |
+ vsub.s16 q6, q8, q12 |
+ vadd.s16 q12, q8, q12 |
+ vadd.s16 q9, q5, q4 |
+ vadd.s16 q5, q6, q10 |
+ vsub.s16 q10, q6, q10 |
+ vadd.s16 q6, q15, q13 |
+ vadd.s16 q8, q12, q14 |
+ vsub.s16 q3, q6, q3 |
+ vsub.s16 q12, q12, q14 |
+ vsub.s16 q3, q3, q1 |
+ vsub.s16 q1, q9, q1 |
+ vadd.s16 q2, q3, q2 |
+ vsub.s16 q15, q8, q6 |
+ vadd.s16 q1, q1, q2 |
+ vadd.s16 q8, q8, q6 |
+ vadd.s16 q14, q5, q3 |
+ vsub.s16 q9, q5, q3 |
+ vsub.s16 q13, q10, q2 |
+ vadd.s16 q10, q10, q2 |
+ /* Transpose */ |
+ vtrn.16 q8, q9 |
+ vsub.s16 q11, q12, q1 |
+ vtrn.16 q14, q15 |
+ vadd.s16 q12, q12, q1 |
+ vtrn.16 q10, q11 |
+ vtrn.16 q12, q13 |
+ vtrn.32 q9, q11 |
+ vtrn.32 q12, q14 |
+ vtrn.32 q8, q10 |
+ vtrn.32 q13, q15 |
+ vswp d28, d21 |
+ vswp d26, d19 |
+ /* 1-D IDCT, pass 2 */ |
+ vsub.s16 q2, q10, q14 |
+ vswp d30, d23 |
+ vadd.s16 q14, q10, q14 |
+ vswp d24, d17 |
+ vsub.s16 q1, q11, q13 |
+ vadd.s16 q13, q11, q13 |
+ vsub.s16 q5, q9, q15 |
+ vadd.s16 q15, q9, q15 |
+ vqdmulh.s16 q4, q2, XFIX_1_414213562 |
+ vqdmulh.s16 q6, q1, XFIX_2_613125930 |
+ vadd.s16 q3, q1, q1 |
+ vsub.s16 q1, q5, q1 |
+ vadd.s16 q10, q2, q4 |
+ vqdmulh.s16 q4, q1, XFIX_1_847759065 |
+ vsub.s16 q2, q15, q13 |
+ vadd.s16 q3, q3, q6 |
+ vqdmulh.s16 q6, q2, XFIX_1_414213562 |
+ vadd.s16 q1, q1, q4 |
+ vqdmulh.s16 q4, q5, XFIX_1_082392200 |
+ vsub.s16 q10, q10, q14 |
+ vadd.s16 q2, q2, q6 |
+ vsub.s16 q6, q8, q12 |
+ vadd.s16 q12, q8, q12 |
+ vadd.s16 q9, q5, q4 |
+ vadd.s16 q5, q6, q10 |
+ vsub.s16 q10, q6, q10 |
+ vadd.s16 q6, q15, q13 |
+ vadd.s16 q8, q12, q14 |
+ vsub.s16 q3, q6, q3 |
+ vsub.s16 q12, q12, q14 |
+ vsub.s16 q3, q3, q1 |
+ vsub.s16 q1, q9, q1 |
+ vadd.s16 q2, q3, q2 |
+ vsub.s16 q15, q8, q6 |
+ vadd.s16 q1, q1, q2 |
+ vadd.s16 q8, q8, q6 |
+ vadd.s16 q14, q5, q3 |
+ vsub.s16 q9, q5, q3 |
+ vsub.s16 q13, q10, q2 |
+ vpop {d8-d13} /* restore NEON registers */ |
+ vadd.s16 q10, q10, q2 |
+ vsub.s16 q11, q12, q1 |
+ vadd.s16 q12, q12, q1 |
+ /* Descale to 8-bit and range limit */ |
+ vmov.u8 q0, #0x80 |
+ vqshrn.s16 d16, q8, #5 |
+ vqshrn.s16 d17, q9, #5 |
+ vqshrn.s16 d18, q10, #5 |
+ vqshrn.s16 d19, q11, #5 |
+ vqshrn.s16 d20, q12, #5 |
+ vqshrn.s16 d21, q13, #5 |
+ vqshrn.s16 d22, q14, #5 |
+ vqshrn.s16 d23, q15, #5 |
+ vadd.u8 q8, q8, q0 |
+ vadd.u8 q9, q9, q0 |
+ vadd.u8 q10, q10, q0 |
+ vadd.u8 q11, q11, q0 |
+ /* Transpose the final 8-bit samples */ |
+ vtrn.16 q8, q9 |
+ vtrn.16 q10, q11 |
+ vtrn.32 q8, q10 |
+ vtrn.32 q9, q11 |
+ vtrn.8 d16, d17 |
+ vtrn.8 d18, d19 |
+ /* Store results to the output buffer */ |
+ ldmia OUTPUT_BUF!, {TMP1, TMP2} |
+ add TMP1, TMP1, OUTPUT_COL |
+ add TMP2, TMP2, OUTPUT_COL |
+ vst1.8 {d16}, [TMP1] |
+ vst1.8 {d17}, [TMP2] |
+ ldmia OUTPUT_BUF!, {TMP1, TMP2} |
+ add TMP1, TMP1, OUTPUT_COL |
+ add TMP2, TMP2, OUTPUT_COL |
+ vst1.8 {d18}, [TMP1] |
+ vtrn.8 d20, d21 |
+ vst1.8 {d19}, [TMP2] |
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} |
+ add TMP1, TMP1, OUTPUT_COL |
+ add TMP2, TMP2, OUTPUT_COL |
+ add TMP3, TMP3, OUTPUT_COL |
+ add TMP4, TMP4, OUTPUT_COL |
+ vst1.8 {d20}, [TMP1] |
+ vtrn.8 d22, d23 |
+ vst1.8 {d21}, [TMP2] |
+ vst1.8 {d22}, [TMP3] |
+ vst1.8 {d23}, [TMP4] |
bx lr |
.unreq DCT_TABLE |
.unreq COEF_BLOCK |
.unreq OUTPUT_BUF |
.unreq OUTPUT_COL |
- .unreq TMP |
+ .unreq TMP1 |
+ .unreq TMP2 |
+ .unreq TMP3 |
+ .unreq TMP4 |
.endfunc |
-.purgem idct_helper |
- |
/*****************************************************************************/ |
/* |
@@ -631,12 +1278,12 @@ |
.macro do_load size |
.if \size == 8 |
- vld1.8 {d4}, [U]! |
- vld1.8 {d5}, [V]! |
- vld1.8 {d0}, [Y]! |
- pld [Y, #64] |
+ vld1.8 {d4}, [U, :64]! |
+ vld1.8 {d5}, [V, :64]! |
+ vld1.8 {d0}, [Y, :64]! |
pld [U, #64] |
pld [V, #64] |
+ pld [Y, #64] |
.elseif \size == 4 |
vld1.8 {d4[0]}, [U]! |
vld1.8 {d4[1]}, [U]! |
@@ -706,7 +1353,11 @@ |
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
-.macro do_yuv_to_rgb |
+/* |
+ * 2-stage pipelined YCbCr->RGB conversion
+ */ |
+ |
+.macro do_yuv_to_rgb_stage1 |
vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ |
vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
@@ -717,6 +1368,9 @@ |
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
+.endm |
+ |
+.macro do_yuv_to_rgb_stage2 |
vrshrn.s32 d20, q10, #15 |
vrshrn.s32 d21, q11, #15 |
vrshrn.s32 d24, q12, #14 |
@@ -731,6 +1385,43 @@ |
vqmovun.s16 d1\b_offs, q14 |
.endm |
+.macro do_yuv_to_rgb_stage2_store_load_stage1 |
+ vld1.8 {d4}, [U, :64]! |
+ vrshrn.s32 d20, q10, #15 |
+ vrshrn.s32 d21, q11, #15 |
+ vrshrn.s32 d24, q12, #14 |
+ vrshrn.s32 d25, q13, #14 |
+ vrshrn.s32 d28, q14, #14 |
+ vld1.8 {d5}, [V, :64]! |
+ vrshrn.s32 d29, q15, #14 |
+ vaddw.u8 q10, q10, d0 |
+ vaddw.u8 q12, q12, d0 |
+ vaddw.u8 q14, q14, d0 |
+ vqmovun.s16 d1\g_offs, q10 |
+ vld1.8 {d0}, [Y, :64]! |
+ vqmovun.s16 d1\r_offs, q12 |
+ pld [U, #64] |
+ pld [V, #64] |
+ pld [Y, #64] |
+ vqmovun.s16 d1\b_offs, q14 |
+ vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
+ vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
+ do_store \bpp, 8 |
+ vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
+ vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ |
+ vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ |
+ vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ |
+ vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ |
+ vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
+ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
+ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
+.endm |
+ |
+.macro do_yuv_to_rgb |
+ do_yuv_to_rgb_stage1 |
+ do_yuv_to_rgb_stage2 |
+.endm |
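+
+/*
+ * The inner loop below is software pipelined: stage 2 of one 8-pixel block is
+ * overlapped with the loads and stage 1 of the next block.  A schematic C
+ * sketch of the control flow (the helper names are stand-ins for the macros
+ * above, not real functions; the RGB->YCbCr converters added later in this
+ * patch use the same loop structure):
+ *
+ *   void load_8_pixels (void) { }
+ *   void store_8_pixels (void) { }
+ *   void stage1 (void) { }
+ *   void stage2 (void) { }
+ *   void stage2_store_load_stage1 (void) { }
+ *
+ *   void inner_loop (int n)            // n = pixels left in the current row
+ *   {
+ *     n -= 8;
+ *     if (n < 0) goto leftovers;       // fewer than 8 pixels: no pipelining
+ *     load_8_pixels ();                // prologue: prime the pipeline
+ *     stage1 ();
+ *     n -= 8;
+ *     while (n >= 0) {                 // steady state: finish block i while
+ *       stage2_store_load_stage1 ();   // already starting block i + 1
+ *       n -= 8;
+ *     }
+ *     stage2 ();                       // epilogue: drain the pipeline
+ *     store_8_pixels ();
+ *   leftovers:
+ *     ;                                // 1..7 remaining pixels are handled
+ *   }                                  // by the 4/2/1 pixel tail code
+ */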
+ |
/* Apple gas crashes on adrl, work around that by using adr. |
* But this requires a copy of these constants for each function. |
*/ |
@@ -791,16 +1482,21 @@ |
/* Inner loop over pixels */ |
subs N, N, #8 |
+ blt 3f |
+ do_load 8 |
+ do_yuv_to_rgb_stage1 |
+ subs N, N, #8 |
blt 2f |
1: |
- do_load 8 |
- do_yuv_to_rgb |
- do_store \bpp, 8 |
+ do_yuv_to_rgb_stage2_store_load_stage1 |
subs N, N, #8 |
bge 1b |
+2: |
+ do_yuv_to_rgb_stage2 |
+ do_store \bpp, 8 |
tst N, #7 |
beq 8f |
-2: |
+3: |
tst N, #4 |
beq 3f |
do_load 4 |
@@ -848,6 +1544,9 @@ |
.endfunc |
.purgem do_yuv_to_rgb |
+.purgem do_yuv_to_rgb_stage1 |
+.purgem do_yuv_to_rgb_stage2 |
+.purgem do_yuv_to_rgb_stage2_store_load_stage1 |
.endm |
@@ -863,3 +1562,598 @@ |
.purgem do_store |
/*****************************************************************************/ |
+ |
+/* |
+ * jsimd_extrgb_ycc_convert_neon |
+ * jsimd_extbgr_ycc_convert_neon |
+ * jsimd_extrgbx_ycc_convert_neon |
+ * jsimd_extbgrx_ycc_convert_neon |
+ * jsimd_extxbgr_ycc_convert_neon |
+ * jsimd_extxrgb_ycc_convert_neon |
+ * |
+ * Colorspace conversion RGB -> YCbCr |
+ */ |
+ |
+.macro do_store size |
+ .if \size == 8 |
+ vst1.8 {d20}, [Y]! |
+ vst1.8 {d21}, [U]! |
+ vst1.8 {d22}, [V]! |
+ .elseif \size == 4 |
+ vst1.8 {d20[0]}, [Y]! |
+ vst1.8 {d20[1]}, [Y]! |
+ vst1.8 {d20[2]}, [Y]! |
+ vst1.8 {d20[3]}, [Y]! |
+ vst1.8 {d21[0]}, [U]! |
+ vst1.8 {d21[1]}, [U]! |
+ vst1.8 {d21[2]}, [U]! |
+ vst1.8 {d21[3]}, [U]! |
+ vst1.8 {d22[0]}, [V]! |
+ vst1.8 {d22[1]}, [V]! |
+ vst1.8 {d22[2]}, [V]! |
+ vst1.8 {d22[3]}, [V]! |
+ .elseif \size == 2 |
+ vst1.8 {d20[4]}, [Y]! |
+ vst1.8 {d20[5]}, [Y]! |
+ vst1.8 {d21[4]}, [U]! |
+ vst1.8 {d21[5]}, [U]! |
+ vst1.8 {d22[4]}, [V]! |
+ vst1.8 {d22[5]}, [V]! |
+ .elseif \size == 1 |
+ vst1.8 {d20[6]}, [Y]! |
+ vst1.8 {d21[6]}, [U]! |
+ vst1.8 {d22[6]}, [V]! |
+ .else |
+ .error unsupported macroblock size |
+ .endif |
+.endm |
+ |
+.macro do_load bpp, size |
+ .if \bpp == 24 |
+ .if \size == 8 |
+ vld3.8 {d10, d11, d12}, [RGB]! |
+ pld [RGB, #128] |
+ .elseif \size == 4 |
+ vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! |
+ vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! |
+ vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! |
+ vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! |
+ .elseif \size == 2 |
+ vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! |
+ vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! |
+ .elseif \size == 1 |
+ vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! |
+ .else |
+ .error unsupported macroblock size |
+ .endif |
+ .elseif \bpp == 32 |
+ .if \size == 8 |
+ vld4.8 {d10, d11, d12, d13}, [RGB]! |
+ pld [RGB, #128] |
+ .elseif \size == 4 |
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! |
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! |
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! |
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! |
+ .elseif \size == 2 |
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! |
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! |
+ .elseif \size == 1 |
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! |
+ .else |
+ .error unsupported macroblock size |
+ .endif |
+ .else |
+ .error unsupported bpp |
+ .endif |
+.endm |
+ |
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
+ |
+/* |
+ * 2-stage pipelined RGB->YCbCr conversion
+ */ |
+ |
+.macro do_rgb_to_yuv_stage1 |
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ |
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ |
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ |
+ vmull.u16 q7, d4, d0[0] |
+ vmlal.u16 q7, d6, d0[1] |
+ vmlal.u16 q7, d8, d0[2] |
+ vmull.u16 q8, d5, d0[0] |
+ vmlal.u16 q8, d7, d0[1] |
+ vmlal.u16 q8, d9, d0[2] |
+ vrev64.32 q9, q1 |
+ vrev64.32 q13, q1 |
+ vmlsl.u16 q9, d4, d0[3] |
+ vmlsl.u16 q9, d6, d1[0] |
+ vmlal.u16 q9, d8, d1[1] |
+ vmlsl.u16 q13, d5, d0[3] |
+ vmlsl.u16 q13, d7, d1[0] |
+ vmlal.u16 q13, d9, d1[1] |
+ vrev64.32 q14, q1 |
+ vrev64.32 q15, q1 |
+ vmlal.u16 q14, d4, d1[1] |
+ vmlsl.u16 q14, d6, d1[2] |
+ vmlsl.u16 q14, d8, d1[3] |
+ vmlal.u16 q15, d5, d1[1] |
+ vmlsl.u16 q15, d7, d1[2] |
+ vmlsl.u16 q15, d9, d1[3] |
+.endm |
+ |
+.macro do_rgb_to_yuv_stage2 |
+ vrshrn.u32 d20, q7, #16 |
+ vrshrn.u32 d21, q8, #16 |
+ vshrn.u32 d22, q9, #16 |
+ vshrn.u32 d23, q13, #16 |
+ vshrn.u32 d24, q14, #16 |
+ vshrn.u32 d25, q15, #16 |
+ vmovn.u16 d20, q10 /* d20 = y */ |
+ vmovn.u16 d21, q11 /* d21 = u */ |
+ vmovn.u16 d22, q12 /* d22 = v */ |
+.endm |
+ |
+.macro do_rgb_to_yuv |
+ do_rgb_to_yuv_stage1 |
+ do_rgb_to_yuv_stage2 |
+.endm |
+ |
+.macro do_rgb_to_yuv_stage2_store_load_stage1 |
+ vrshrn.u32 d20, q7, #16 |
+ vrshrn.u32 d21, q8, #16 |
+ vshrn.u32 d22, q9, #16 |
+ vrev64.32 q9, q1 |
+ vshrn.u32 d23, q13, #16 |
+ vrev64.32 q13, q1 |
+ vshrn.u32 d24, q14, #16 |
+ vshrn.u32 d25, q15, #16 |
+ do_load \bpp, 8 |
+ vmovn.u16 d20, q10 /* d20 = y */ |
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ |
+ vmovn.u16 d21, q11 /* d21 = u */ |
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ |
+ vmovn.u16 d22, q12 /* d22 = v */ |
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ |
+ vmull.u16 q7, d4, d0[0] |
+ vmlal.u16 q7, d6, d0[1] |
+ vmlal.u16 q7, d8, d0[2] |
+ vst1.8 {d20}, [Y]! |
+ vmull.u16 q8, d5, d0[0] |
+ vmlal.u16 q8, d7, d0[1] |
+ vmlal.u16 q8, d9, d0[2] |
+ vmlsl.u16 q9, d4, d0[3] |
+ vmlsl.u16 q9, d6, d1[0] |
+ vmlal.u16 q9, d8, d1[1] |
+ vst1.8 {d21}, [U]! |
+ vmlsl.u16 q13, d5, d0[3] |
+ vmlsl.u16 q13, d7, d1[0] |
+ vmlal.u16 q13, d9, d1[1] |
+ vrev64.32 q14, q1 |
+ vrev64.32 q15, q1 |
+ vmlal.u16 q14, d4, d1[1] |
+ vmlsl.u16 q14, d6, d1[2] |
+ vmlsl.u16 q14, d8, d1[3] |
+ vst1.8 {d22}, [V]! |
+ vmlal.u16 q15, d5, d1[1] |
+ vmlsl.u16 q15, d7, d1[2] |
+ vmlsl.u16 q15, d9, d1[3] |
+.endm |
+ |
+.balign 16 |
+jsimd_\colorid\()_ycc_neon_consts: |
+ .short 19595, 38470, 7471, 11059 |
+ .short 21709, 32768, 27439, 5329 |
+ .short 32767, 128, 32767, 128 |
+ .short 32767, 128, 32767, 128 |
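+
+/*
+ * The first two rows above are the JFIF RGB->YCbCr coefficients
+ *
+ *   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *   Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ *   Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * rounded to Q16 fixed point, and the last two rows pack the +128 centering
+ * and rounding bias that gets preloaded into the Cb/Cr accumulators (each
+ * 32-bit lane of q1 is (128 << 16) + 32767).  A small C check of that
+ * reading:
+ *
+ *   #include <assert.h>
+ *
+ *   #define Q16(x) ((int) ((x) * 65536.0 + 0.5))
+ *
+ *   int main (void)
+ *   {
+ *     assert (Q16(0.29900) == 19595 && Q16(0.58700) == 38470);
+ *     assert (Q16(0.11400) == 7471  && Q16(0.16874) == 11059);
+ *     assert (Q16(0.33126) == 21709 && Q16(0.50000) == 32768);
+ *     assert (Q16(0.41869) == 27439 && Q16(0.08131) == 5329);
+ *     assert ((128 << 16) + 32767 == Q16(128.5) - 1);
+ *     return 0;
+ *   }
+ */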
+ |
+asm_function jsimd_\colorid\()_ycc_convert_neon |
+ OUTPUT_WIDTH .req r0 |
+ INPUT_BUF .req r1 |
+ OUTPUT_BUF .req r2 |
+ OUTPUT_ROW .req r3 |
+ NUM_ROWS .req r4 |
+ |
+ OUTPUT_BUF0 .req r5 |
+ OUTPUT_BUF1 .req r6 |
+ OUTPUT_BUF2 .req OUTPUT_BUF |
+ |
+ RGB .req r7 |
+ Y .req r8 |
+ U .req r9 |
+ V .req r10 |
+ N .req ip |
+ |
+ /* Load constants to d0, d1, d2, d3 */ |
+ adr ip, jsimd_\colorid\()_ycc_neon_consts |
+ vld1.16 {d0, d1, d2, d3}, [ip, :128] |
+ |
+ /* Save ARM registers and handle input arguments */ |
+ push {r4, r5, r6, r7, r8, r9, r10, lr} |
+ ldr NUM_ROWS, [sp, #(4 * 8)] |
+ ldr OUTPUT_BUF0, [OUTPUT_BUF] |
+ ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] |
+ ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] |
+ .unreq OUTPUT_BUF |
+ |
+ /* Save NEON registers */ |
+ vpush {d8-d15} |
+ |
+ /* Outer loop over scanlines */ |
+ cmp NUM_ROWS, #1 |
+ blt 9f |
+0: |
+ ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] |
+ ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] |
+ mov N, OUTPUT_WIDTH |
+ ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] |
+ add OUTPUT_ROW, OUTPUT_ROW, #1 |
+ ldr RGB, [INPUT_BUF], #4 |
+ |
+ /* Inner loop over pixels */ |
+ subs N, N, #8 |
+ blt 3f |
+ do_load \bpp, 8 |
+ do_rgb_to_yuv_stage1 |
+ subs N, N, #8 |
+ blt 2f |
+1: |
+ do_rgb_to_yuv_stage2_store_load_stage1 |
+ subs N, N, #8 |
+ bge 1b |
+2: |
+ do_rgb_to_yuv_stage2 |
+ do_store 8 |
+ tst N, #7 |
+ beq 8f |
+3: |
+ tst N, #4 |
+ beq 3f |
+ do_load \bpp, 4 |
+3: |
+ tst N, #2 |
+ beq 4f |
+ do_load \bpp, 2 |
+4: |
+ tst N, #1 |
+ beq 5f |
+ do_load \bpp, 1 |
+5: |
+ do_rgb_to_yuv |
+ tst N, #4 |
+ beq 6f |
+ do_store 4 |
+6: |
+ tst N, #2 |
+ beq 7f |
+ do_store 2 |
+7: |
+ tst N, #1 |
+ beq 8f |
+ do_store 1 |
+8: |
+ subs NUM_ROWS, NUM_ROWS, #1 |
+ bgt 0b |
+9: |
+ /* Restore all registers and return */ |
+ vpop {d8-d15} |
+ pop {r4, r5, r6, r7, r8, r9, r10, pc} |
+ |
+ .unreq OUTPUT_WIDTH |
+ .unreq OUTPUT_ROW |
+ .unreq INPUT_BUF |
+ .unreq NUM_ROWS |
+ .unreq OUTPUT_BUF0 |
+ .unreq OUTPUT_BUF1 |
+ .unreq OUTPUT_BUF2 |
+ .unreq RGB |
+ .unreq Y |
+ .unreq U |
+ .unreq V |
+ .unreq N |
+.endfunc |
+ |
+.purgem do_rgb_to_yuv |
+.purgem do_rgb_to_yuv_stage1 |
+.purgem do_rgb_to_yuv_stage2 |
+.purgem do_rgb_to_yuv_stage2_store_load_stage1 |
+ |
+.endm |
+ |
+/*--------------------------------- id ----- bpp R G B */ |
+generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 |
+generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 |
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 |
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 |
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 |
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 |
+ |
+.purgem do_load |
+.purgem do_store |
+ |
+/*****************************************************************************/ |
+ |
+/* |
+ * Load data into workspace, applying unsigned->signed conversion |
+ * |
+ * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get |
+ * rid of VST1.16 instructions |
+ */ |
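+
+/*
+ * For reference, the scalar operation being vectorized is roughly the sketch
+ * below (modeled on the sample conversion step in jpeglib's jcdctmgr.c; the
+ * helper name and the plain C types are stand-ins for the jpeglib typedefs):
+ *
+ *   void convsamp_ref (unsigned char *sample_data[8],
+ *                      unsigned start_col, short *workspace)
+ *   {
+ *     for (int row = 0; row < 8; row++)
+ *       for (int col = 0; col < 8; col++)
+ *         workspace[row * 8 + col] =
+ *           (short) sample_data[row][start_col + col] - 128;  // CENTERJSAMPLE
+ *   }
+ */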
+ |
+asm_function jsimd_convsamp_neon |
+ SAMPLE_DATA .req r0 |
+ START_COL .req r1 |
+ WORKSPACE .req r2 |
+ TMP1 .req r3 |
+ TMP2 .req r4 |
+ TMP3 .req r5 |
+ TMP4 .req ip |
+ |
+ push {r4, r5} |
+ vmov.u8 d0, #128 |
+ |
+ ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} |
+ add TMP1, TMP1, START_COL |
+ add TMP2, TMP2, START_COL |
+ add TMP3, TMP3, START_COL |
+ add TMP4, TMP4, START_COL |
+ vld1.8 {d16}, [TMP1] |
+ vsubl.u8 q8, d16, d0 |
+ vld1.8 {d18}, [TMP2] |
+ vsubl.u8 q9, d18, d0 |
+ vld1.8 {d20}, [TMP3] |
+ vsubl.u8 q10, d20, d0 |
+ vld1.8 {d22}, [TMP4] |
+ ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} |
+ vsubl.u8 q11, d22, d0 |
+ vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! |
+ add TMP1, TMP1, START_COL |
+ add TMP2, TMP2, START_COL |
+ vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! |
+ add TMP3, TMP3, START_COL |
+ add TMP4, TMP4, START_COL |
+ vld1.8 {d24}, [TMP1] |
+ vsubl.u8 q12, d24, d0 |
+ vld1.8 {d26}, [TMP2] |
+ vsubl.u8 q13, d26, d0 |
+ vld1.8 {d28}, [TMP3] |
+ vsubl.u8 q14, d28, d0 |
+ vld1.8 {d30}, [TMP4] |
+ vsubl.u8 q15, d30, d0 |
+ vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! |
+ vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! |
+ pop {r4, r5} |
+ bx lr |
+ |
+ .unreq SAMPLE_DATA |
+ .unreq START_COL |
+ .unreq WORKSPACE |
+ .unreq TMP1 |
+ .unreq TMP2 |
+ .unreq TMP3 |
+ .unreq TMP4 |
+.endfunc |
+ |
+/*****************************************************************************/ |
+ |
+/* |
+ * jsimd_fdct_ifast_neon |
+ * |
+ * This function contains a fast, not so accurate integer implementation of |
+ * the forward DCT (Discrete Cosine Transform). It uses the same calculations |
+ * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' |
+ * function from jfdctfst.c |
+ * |
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get |
+ * rid of a bunch of VLD1.16 instructions |
+ */ |
+ |
+#define XFIX_0_382683433 d0[0] |
+#define XFIX_0_541196100 d0[1] |
+#define XFIX_0_707106781 d0[2] |
+#define XFIX_1_306562965 d0[3] |
+ |
+.balign 16 |
+jsimd_fdct_ifast_neon_consts: |
+ .short (98 * 128) /* XFIX_0_382683433 */ |
+ .short (139 * 128) /* XFIX_0_541196100 */ |
+ .short (181 * 128) /* XFIX_0_707106781 */ |
+ .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ |
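+
+/*
+ * The values above use the same encoding trick as the jsimd_idct_ifast_neon
+ * constants: each factor is rounded to a multiple of 1/256 and stored as a
+ * Q15 value for VQDMULH, e.g. 181 * 128 == 23168 and 23168 / 32768.0 ==
+ * 0.70703 ~= 0.707106781.  The one factor larger than 1 is stored minus 1.0
+ * (the '- 256 * 128' term), with the missing 'x' term added back by an extra
+ * addition: (334 * 128 - 256 * 128) / 32768.0 + 1.0 == 1.3046875 ~= 1.306562965.
+ */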
+ |
+asm_function jsimd_fdct_ifast_neon |
+ |
+ DATA .req r0 |
+ TMP .req ip |
+ |
+ vpush {d8-d15} |
+ |
+ /* Load constants */ |
+ adr TMP, jsimd_fdct_ifast_neon_consts |
+ vld1.16 {d0}, [TMP, :64] |
+ |
+ /* Load all DATA into NEON registers with the following allocation: |
+ * 0 1 2 3 | 4 5 6 7 |
+ * ---------+-------- |
+ * 0 | d16 | d17 | q8 |
+ * 1 | d18 | d19 | q9 |
+ * 2 | d20 | d21 | q10 |
+ * 3 | d22 | d23 | q11 |
+ * 4 | d24 | d25 | q12 |
+ * 5 | d26 | d27 | q13 |
+ * 6 | d28 | d29 | q14 |
+ * 7 | d30 | d31 | q15 |
+ */ |
+ |
+ vld1.16 {d16, d17, d18, d19}, [DATA, :128]! |
+ vld1.16 {d20, d21, d22, d23}, [DATA, :128]! |
+ vld1.16 {d24, d25, d26, d27}, [DATA, :128]! |
+ vld1.16 {d28, d29, d30, d31}, [DATA, :128] |
+ sub DATA, DATA, #(128 - 32) |
+ |
+ mov TMP, #2 |
+1: |
+ /* Transpose */ |
+ vtrn.16 q12, q13 |
+ vtrn.16 q10, q11 |
+ vtrn.16 q8, q9 |
+ vtrn.16 q14, q15 |
+ vtrn.32 q9, q11 |
+ vtrn.32 q13, q15 |
+ vtrn.32 q8, q10 |
+ vtrn.32 q12, q14 |
+ vswp d30, d23 |
+ vswp d24, d17 |
+ vswp d26, d19 |
+ /* 1-D FDCT */ |
+ vadd.s16 q2, q11, q12 |
+ vswp d28, d21 |
+ vsub.s16 q12, q11, q12 |
+ vsub.s16 q6, q10, q13 |
+ vadd.s16 q10, q10, q13 |
+ vsub.s16 q7, q9, q14 |
+ vadd.s16 q9, q9, q14 |
+ vsub.s16 q1, q8, q15 |
+ vadd.s16 q8, q8, q15 |
+ vsub.s16 q4, q9, q10 |
+ vsub.s16 q5, q8, q2 |
+ vadd.s16 q3, q9, q10 |
+ vadd.s16 q4, q4, q5 |
+ vadd.s16 q2, q8, q2 |
+ vqdmulh.s16 q4, q4, XFIX_0_707106781 |
+ vadd.s16 q11, q12, q6 |
+ vadd.s16 q8, q2, q3 |
+ vsub.s16 q12, q2, q3 |
+ vadd.s16 q3, q6, q7 |
+ vadd.s16 q7, q7, q1 |
+ vqdmulh.s16 q3, q3, XFIX_0_707106781 |
+ vsub.s16 q6, q11, q7 |
+ vadd.s16 q10, q5, q4 |
+ vqdmulh.s16 q6, q6, XFIX_0_382683433 |
+ vsub.s16 q14, q5, q4 |
+ vqdmulh.s16 q11, q11, XFIX_0_541196100 |
+ vqdmulh.s16 q5, q7, XFIX_1_306562965 |
+ vadd.s16 q4, q1, q3 |
+ vsub.s16 q3, q1, q3 |
+ vadd.s16 q7, q7, q6 |
+ vadd.s16 q11, q11, q6 |
+ vadd.s16 q7, q7, q5 |
+ vadd.s16 q13, q3, q11 |
+ vsub.s16 q11, q3, q11 |
+ vadd.s16 q9, q4, q7 |
+ vsub.s16 q15, q4, q7 |
+ subs TMP, TMP, #1 |
+ bne 1b |
+ |
+ /* store results */ |
+ vst1.16 {d16, d17, d18, d19}, [DATA, :128]! |
+ vst1.16 {d20, d21, d22, d23}, [DATA, :128]! |
+ vst1.16 {d24, d25, d26, d27}, [DATA, :128]! |
+ vst1.16 {d28, d29, d30, d31}, [DATA, :128] |
+ |
+ vpop {d8-d15} |
+ bx lr |
+ |
+ .unreq DATA |
+ .unreq TMP |
+.endfunc |
+ |
+/*****************************************************************************/ |
+ |
+/* |
+ * GLOBAL(void) |
+ * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, |
+ * DCTELEM * workspace); |
+ * |
+ * Note: the code uses 2-stage pipelining in order to improve instruction
+ * scheduling and eliminate stalls (this provides ~15% better
+ * performance for this function on both ARM Cortex-A8 and
+ * ARM Cortex-A9 when compared to the non-pipelined variant).
+ * The instructions which belong to the second stage use different
+ * indentation for better readability.
+ */ |
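+
+/*
+ * For reference, the per-coefficient operation is roughly the sketch below,
+ * assuming the 'divisors' table holds a reciprocal, a rounding correction and
+ * a right-shift count per coefficient at the byte offsets 0, 64*2 and 64*6
+ * used in the code (the shift is stored as a positive count and negated for
+ * VSHL; the helper name is illustrative):
+ *
+ *   short quantize_one (short coef, unsigned short recip,
+ *                       unsigned short corr, unsigned short shift)
+ *   {
+ *     short sign = coef >> 15;                         // 0 or -1
+ *     unsigned short t = (unsigned short) (coef < 0 ? -coef : coef);
+ *     t = (unsigned short) (t + corr);                 // pre-bias for rounding
+ *     t = (unsigned short) (((unsigned) t * recip) >> 16);
+ *     t = (unsigned short) (t >> shift);
+ *     return (short) ((t ^ sign) - sign);              // re-apply the sign
+ *   }
+ */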
+asm_function jsimd_quantize_neon |
+ |
+ COEF_BLOCK .req r0 |
+ DIVISORS .req r1 |
+ WORKSPACE .req r2 |
+ |
+ RECIPROCAL .req DIVISORS |
+ CORRECTION .req r3 |
+ SHIFT .req ip |
+ LOOP_COUNT .req r4 |
+ |
+ vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! |
+ vabs.s16 q12, q0 |
+ add CORRECTION, DIVISORS, #(64 * 2) |
+ add SHIFT, DIVISORS, #(64 * 6) |
+ vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! |
+ vabs.s16 q13, q1 |
+ vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! |
+ vadd.u16 q12, q12, q10 /* add correction */ |
+ vadd.u16 q13, q13, q11 |
+ vmull.u16 q10, d24, d16 /* multiply by reciprocal */ |
+ vmull.u16 q11, d25, d17 |
+ vmull.u16 q8, d26, d18 |
+ vmull.u16 q9, d27, d19 |
+ vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! |
+ vshrn.u32 d20, q10, #16 |
+ vshrn.u32 d21, q11, #16 |
+ vshrn.u32 d22, q8, #16 |
+ vshrn.u32 d23, q9, #16 |
+ vneg.s16 q12, q12 |
+ vneg.s16 q13, q13 |
+ vshr.s16 q2, q0, #15 /* extract sign */ |
+ vshr.s16 q3, q1, #15 |
+ vshl.u16 q14, q10, q12 /* shift */ |
+ vshl.u16 q15, q11, q13 |
+ |
+ push {r4, r5} |
+ mov LOOP_COUNT, #3 |
+1: |
+ vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! |
+ veor.u16 q14, q14, q2 /* restore sign */ |
+ vabs.s16 q12, q0 |
+ vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! |
+ vabs.s16 q13, q1 |
+ veor.u16 q15, q15, q3 |
+ vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! |
+ vadd.u16 q12, q12, q10 /* add correction */ |
+ vadd.u16 q13, q13, q11 |
+ vmull.u16 q10, d24, d16 /* multiply by reciprocal */ |
+ vmull.u16 q11, d25, d17 |
+ vmull.u16 q8, d26, d18 |
+ vmull.u16 q9, d27, d19 |
+ vsub.u16 q14, q14, q2 |
+ vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! |
+ vsub.u16 q15, q15, q3 |
+ vshrn.u32 d20, q10, #16 |
+ vshrn.u32 d21, q11, #16 |
+ vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! |
+ vshrn.u32 d22, q8, #16 |
+ vshrn.u32 d23, q9, #16 |
+ vneg.s16 q12, q12 |
+ vneg.s16 q13, q13 |
+ vshr.s16 q2, q0, #15 /* extract sign */ |
+ vshr.s16 q3, q1, #15 |
+ vshl.u16 q14, q10, q12 /* shift */ |
+ vshl.u16 q15, q11, q13 |
+ subs LOOP_COUNT, LOOP_COUNT, #1 |
+ bne 1b |
+ pop {r4, r5} |
+ |
+ veor.u16 q14, q14, q2 /* restore sign */ |
+ veor.u16 q15, q15, q3 |
+ vsub.u16 q14, q14, q2 |
+ vsub.u16 q15, q15, q3 |
+ vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! |
+ |
+ bx lr /* return */ |
+ |
+ .unreq COEF_BLOCK |
+ .unreq DIVISORS |
+ .unreq WORKSPACE |
+ .unreq RECIPROCAL |
+ .unreq CORRECTION |
+ .unreq SHIFT |
+ .unreq LOOP_COUNT |
+.endfunc |