| OLD | NEW |
| 1 /* | 1 /* |
| 2 * ARMv7 NEON optimizations for libjpeg-turbo | 2 * ARMv7 NEON optimizations for libjpeg-turbo |
| 3 * | 3 * |
| 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
| 5 * All rights reserved. | 5 * All rights reserved. |
| 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
| 7 * Copyright (C) 2014 Siarhei Siamashka. All Rights Reserved. |
| 7 * Copyright (C) 2014 Linaro Limited. All Rights Reserved. | 8 * Copyright (C) 2014 Linaro Limited. All Rights Reserved. |
| 9 * Copyright (C) 2015 D. R. Commander. All Rights Reserved. |
| 10 * Copyright (C) 2015-2016 Matthieu Darbois. All Rights Reserved. |
| 8 * | 11 * |
| 9 * This software is provided 'as-is', without any express or implied | 12 * This software is provided 'as-is', without any express or implied |
| 10 * warranty. In no event will the authors be held liable for any damages | 13 * warranty. In no event will the authors be held liable for any damages |
| 11 * arising from the use of this software. | 14 * arising from the use of this software. |
| 12 * | 15 * |
| 13 * Permission is granted to anyone to use this software for any purpose, | 16 * Permission is granted to anyone to use this software for any purpose, |
| 14 * including commercial applications, and to alter it and redistribute it | 17 * including commercial applications, and to alter it and redistribute it |
| 15 * freely, subject to the following restrictions: | 18 * freely, subject to the following restrictions: |
| 16 * | 19 * |
| 17 * 1. The origin of this software must not be misrepresented; you must not | 20 * 1. The origin of this software must not be misrepresented; you must not |
| 18 * claim that you wrote the original software. If you use this software | 21 * claim that you wrote the original software. If you use this software |
| 19 * in a product, an acknowledgment in the product documentation would be | 22 * in a product, an acknowledgment in the product documentation would be |
| 20 * appreciated but is not required. | 23 * appreciated but is not required. |
| 21 * 2. Altered source versions must be plainly marked as such, and must not be | 24 * 2. Altered source versions must be plainly marked as such, and must not be |
| 22 * misrepresented as being the original software. | 25 * misrepresented as being the original software. |
| 23 * 3. This notice may not be removed or altered from any source distribution. | 26 * 3. This notice may not be removed or altered from any source distribution. |
| 24 */ | 27 */ |
| 25 | 28 |
| 26 #if defined(__linux__) && defined(__ELF__) | 29 #if defined(__linux__) && defined(__ELF__) |
| 27 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ | 30 .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ |
| 28 #endif | 31 #endif |
| 29 | 32 |
| 30 .text | 33 .text |
| 31 .fpu neon | 34 .fpu neon |
| 32 .arch armv7a | 35 .arch armv7a |
| 33 .object_arch armv4 | 36 .object_arch armv4 |
| 34 .arm | 37 .arm |
| 38 .syntax unified |
| 35 | 39 |
| 36 | 40 |
| 37 #define RESPECT_STRICT_ALIGNMENT 1 | 41 #define RESPECT_STRICT_ALIGNMENT 1 |
| 38 | 42 |
| 39 | 43 |
| 40 /*****************************************************************************/ | 44 /*****************************************************************************/ |
| 41 | 45 |
| 42 /* Supplementary macro for setting function attributes */ | 46 /* Supplementary macro for setting function attributes */ |
| 43 .macro asm_function fname | 47 .macro asm_function fname |
| 44 #ifdef __APPLE__ | 48 #ifdef __APPLE__ |
| 45 .globl _\fname | 49 .globl _\fname |
| 46 _\fname: | 50 _\fname: |
| 47 #else | 51 #else |
| 48 .global \fname | 52 .global \fname |
| 49 #ifdef __ELF__ | 53 #ifdef __ELF__ |
| 50 .hidden \fname | 54 .hidden \fname |
| 51 .type \fname, %function | 55 .type \fname, %function |
| 52 #endif | 56 #endif |
| 53 \fname: | 57 \fname: |
| 54 #endif | 58 #endif |
| 55 .endm | 59 .endm |
| 56 | 60 |
| 57 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ | 61 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ |
| 58 .macro transpose_4x4 x0, x1, x2, x3 | 62 .macro transpose_4x4 x0, x1, x2, x3 |
| 59 vtrn.16 \x0, \x1 | 63 vtrn.16 \x0, \x1 |
| 60 vtrn.16 \x2, \x3 | 64 vtrn.16 \x2, \x3 |
| 61 vtrn.32 \x0, \x2 | 65 vtrn.32 \x0, \x2 |
| 62 vtrn.32 \x1, \x3 | 66 vtrn.32 \x1, \x3 |
| 63 .endm | 67 .endm |
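A minimal scalar C model of this macro (illustration only, not part of the change): vtrn.16 exchanges the odd 16-bit lanes of the first register with the even lanes of the second, vtrn.32 does the same with 32-bit halves, and chaining them in the order above yields a full 4x4 transpose.

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { int16_t e[4]; } d_reg;      /* models one 64-bit NEON D register */

    static void vtrn16(d_reg *a, d_reg *b)       /* vtrn.16 a, b */
    {
      int16_t t;
      t = a->e[1]; a->e[1] = b->e[0]; b->e[0] = t;
      t = a->e[3]; a->e[3] = b->e[2]; b->e[2] = t;
    }

    static void vtrn32(d_reg *a, d_reg *b)       /* vtrn.32 a, b */
    {
      int16_t t0 = a->e[2], t1 = a->e[3];
      a->e[2] = b->e[0]; a->e[3] = b->e[1];
      b->e[0] = t0; b->e[1] = t1;
    }

    int main(void)
    {
      d_reg x0 = {{ 0,  1,  2,  3}}, x1 = {{ 4,  5,  6,  7}};
      d_reg x2 = {{ 8,  9, 10, 11}}, x3 = {{12, 13, 14, 15}};

      vtrn16(&x0, &x1); vtrn16(&x2, &x3);        /* same order as the macro */
      vtrn32(&x0, &x2); vtrn32(&x1, &x3);

      /* rows have become columns: x0 = {0, 4, 8, 12}, x1 = {1, 5, 9, 13}, ... */
      printf("%d %d %d %d\n", x0.e[0], x0.e[1], x0.e[2], x0.e[3]);
      return 0;
    }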
| 64 | 68 |
| 65 | 69 |
| 66 #define CENTERJSAMPLE 128 | 70 #define CENTERJSAMPLE 128 |
| 67 | 71 |
| 68 /*****************************************************************************/ | 72 /*****************************************************************************/ |
| 69 | 73 |
| 70 /* | 74 /* |
| 71 * Perform dequantization and inverse DCT on one block of coefficients. | 75 * Perform dequantization and inverse DCT on one block of coefficients. |
| 72 * | 76 * |
| 73 * GLOBAL(void) | 77 * GLOBAL(void) |
| 74 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, | 78 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block, |
| 75 * JSAMPARRAY output_buf, JDIMENSION output_col) | 79 * JSAMPARRAY output_buf, JDIMENSION output_col) |
| 76 */ | 80 */ |
| 77 | 81 |
| 78 #define FIX_0_298631336 (2446) | 82 #define FIX_0_298631336 (2446) |
| 79 #define FIX_0_390180644 (3196) | 83 #define FIX_0_390180644 (3196) |
| 80 #define FIX_0_541196100 (4433) | 84 #define FIX_0_541196100 (4433) |
| 81 #define FIX_0_765366865 (6270) | 85 #define FIX_0_765366865 (6270) |
| 82 #define FIX_0_899976223 (7373) | 86 #define FIX_0_899976223 (7373) |
| 83 #define FIX_1_175875602 (9633) | 87 #define FIX_1_175875602 (9633) |
| 84 #define FIX_1_501321110 (12299) | 88 #define FIX_1_501321110 (12299) |
| 85 #define FIX_1_847759065 (15137) | 89 #define FIX_1_847759065 (15137) |
| 86 #define FIX_1_961570560 (16069) | 90 #define FIX_1_961570560 (16069) |
| 87 #define FIX_2_053119869 (16819) | 91 #define FIX_2_053119869 (16819) |
| 88 #define FIX_2_562915447 (20995) | 92 #define FIX_2_562915447 (20995) |
| 89 #define FIX_3_072711026 (25172) | 93 #define FIX_3_072711026 (25172) |
| 90 | 94 |
| 91 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) | 95 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) |
| 92 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) | 96 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) |
| 93 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) | 97 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) |
| 94 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) | 98 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) |
| 95 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) | 99 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) |
| 96 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) | 100 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) |
| 97 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) | 101 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) |
| 98 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) | 102 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) |
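These table values follow the usual ISLOW fixed-point convention of scaling by 2^CONST_BITS with CONST_BITS = 13 (defined further down in this file); a small sketch with a hypothetical FIX() helper shows how they are obtained:

    #include <stdio.h>

    #define CONST_BITS 13
    #define FIX(x) ((int)((x) * (1 << CONST_BITS) + 0.5))    /* round to nearest */

    int main(void)
    {
      printf("%d\n", FIX(0.541196100));                      /* 4433  == FIX_0_541196100 */
      printf("%d\n", FIX(1.175875602));                      /* 9633  == FIX_1_175875602 */
      printf("%d\n", FIX(0.541196100) + FIX(0.765366865));   /* 10703 == FIX_0_541196100_PLUS_0_765366865 */
      return 0;
    }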
| 99 | 103 |
| 100 /* | 104 /* |
| 101 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. | 105 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. |
| 102 * Uses some ideas from the comments in 'simd/jiss2int-64.asm' | 106 * Uses some ideas from the comments in 'simd/jiss2int-64.asm' |
| 103 */ | 107 */ |
| 104 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ | 108 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ |
| 105 { \ | 109 { \ |
| 106 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ | 110 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ |
| 107 INT32 q1, q2, q3, q4, q5, q6, q7; \ | 111 JLONG q1, q2, q3, q4, q5, q6, q7; \ |
| 108 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ | 112 JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \ |
| 109 \ | 113 \ |
| 110 /* 1-D iDCT input data */ \ | 114 /* 1-D iDCT input data */ \ |
| 111 row0 = xrow0; \ | 115 row0 = xrow0; \ |
| 112 row1 = xrow1; \ | 116 row1 = xrow1; \ |
| 113 row2 = xrow2; \ | 117 row2 = xrow2; \ |
| 114 row3 = xrow3; \ | 118 row3 = xrow3; \ |
| 115 row4 = xrow4; \ | 119 row4 = xrow4; \ |
| 116 row5 = xrow5; \ | 120 row5 = xrow5; \ |
| 117 row6 = xrow6; \ | 121 row6 = xrow6; \ |
| 118 row7 = xrow7; \ | 122 row7 = xrow7; \ |
| 119 \ | 123 \ |
| 120 q5 = row7 + row3; \ | 124 q5 = row7 + row3; \ |
| 121 q4 = row5 + row1; \ | 125 q4 = row5 + row1; \ |
| 122 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ | 126 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ |
| 123 MULTIPLY(q4, FIX_1_175875602); \ | 127 MULTIPLY(q4, FIX_1_175875602); \ |
| 124 q7 = MULTIPLY(q5, FIX_1_175875602) + \ | 128 q7 = MULTIPLY(q5, FIX_1_175875602) + \ |
| 125 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ | 129 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ |
| 126 q2 = MULTIPLY(row2, FIX_0_541196100) + \ | 130 q2 = MULTIPLY(row2, FIX_0_541196100) + \ |
| 127 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ | 131 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ |
| 128 q4 = q6; \ | 132 q4 = q6; \ |
| 129 q3 = ((INT32) row0 - (INT32) row4) << 13; \ | 133 q3 = ((JLONG) row0 - (JLONG) row4) << 13; \ |
| 130 q6 += MULTIPLY(row5, -FIX_2_562915447) + \ | 134 q6 += MULTIPLY(row5, -FIX_2_562915447) + \ |
| 131 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ | 135 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ |
| 132 /* now we can use q1 (reloadable constants have been used up) */ \ | 136 /* now we can use q1 (reloadable constants have been used up) */ \ |
| 133 q1 = q3 + q2; \ | 137 q1 = q3 + q2; \ |
| 134 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ | 138 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ |
| 135 MULTIPLY(row1, -FIX_0_899976223); \ | 139 MULTIPLY(row1, -FIX_0_899976223); \ |
| 136 q5 = q7; \ | 140 q5 = q7; \ |
| 137 q1 = q1 + q6; \ | 141 q1 = q1 + q6; \ |
| 138 q7 += MULTIPLY(row7, -FIX_0_899976223) + \ | 142 q7 += MULTIPLY(row7, -FIX_0_899976223) + \ |
| 139 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ | 143 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ |
| 140 \ | 144 \ |
| 141 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ | 145 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ |
| 142 tmp11_plus_tmp2 = q1; \ | 146 tmp11_plus_tmp2 = q1; \ |
| 143 row1 = 0; \ | 147 row1 = 0; \ |
| 144 \ | 148 \ |
| 145 q1 = q1 - q6; \ | 149 q1 = q1 - q6; \ |
| 146 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ | 150 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ |
| 147 MULTIPLY(row3, -FIX_2_562915447); \ | 151 MULTIPLY(row3, -FIX_2_562915447); \ |
| 148 q1 = q1 - q6; \ | 152 q1 = q1 - q6; \ |
| 149 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ | 153 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ |
| 150 MULTIPLY(row6, FIX_0_541196100); \ | 154 MULTIPLY(row6, FIX_0_541196100); \ |
| 151 q3 = q3 - q2; \ | 155 q3 = q3 - q2; \ |
| 152 \ | 156 \ |
| 153 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ | 157 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ |
| 154 tmp11_minus_tmp2 = q1; \ | 158 tmp11_minus_tmp2 = q1; \ |
| 155 \ | 159 \ |
| 156 q1 = ((INT32) row0 + (INT32) row4) << 13; \ | 160 q1 = ((JLONG) row0 + (JLONG) row4) << 13; \ |
| 157 q2 = q1 + q6; \ | 161 q2 = q1 + q6; \ |
| 158 q1 = q1 - q6; \ | 162 q1 = q1 - q6; \ |
| 159 \ | 163 \ |
| 160 /* pick up the results */ \ | 164 /* pick up the results */ \ |
| 161 tmp0 = q4; \ | 165 tmp0 = q4; \ |
| 162 tmp1 = q5; \ | 166 tmp1 = q5; \ |
| 163 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ | 167 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ |
| 164 tmp3 = q7; \ | 168 tmp3 = q7; \ |
| 165 tmp10 = q2; \ | 169 tmp10 = q2; \ |
| 166 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ | 170 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ |
| 167 tmp12 = q3; \ | 171 tmp12 = q3; \ |
| 168 tmp13 = q1; \ | 172 tmp13 = q1; \ |
| 169 } | 173 } |
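The shift counts used by the two NEON passes below can be read off from the standard ISLOW scaling, assuming CONST_BITS = 13 and PASS1_BITS = 2 (both values appear elsewhere in this file):

    #define CONST_BITS 13
    #define PASS1_BITS 2

    /* pass 1 descales by CONST_BITS - PASS1_BITS = 11      -> vrshrn.s32 ..., #11     */
    /* pass 2 descales by CONST_BITS + PASS1_BITS + 3 = 18  -> vshrn.s32 ..., #16,     */
    /*                                                         then vqrshrn.s16 ..., #2 */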
| 170 | 174 |
| 171 #define XFIX_0_899976223 d0[0] | 175 #define XFIX_0_899976223 d0[0] |
| 172 #define XFIX_0_541196100 d0[1] | 176 #define XFIX_0_541196100 d0[1] |
| 173 #define XFIX_2_562915447 d0[2] | 177 #define XFIX_2_562915447 d0[2] |
| 174 #define XFIX_0_298631336_MINUS_0_899976223 d0[3] | 178 #define XFIX_0_298631336_MINUS_0_899976223 d0[3] |
| 175 #define XFIX_1_501321110_MINUS_0_899976223 d1[0] | 179 #define XFIX_1_501321110_MINUS_0_899976223 d1[0] |
| 176 #define XFIX_2_053119869_MINUS_2_562915447 d1[1] | 180 #define XFIX_2_053119869_MINUS_2_562915447 d1[1] |
| 177 #define XFIX_0_541196100_PLUS_0_765366865 d1[2] | 181 #define XFIX_0_541196100_PLUS_0_765366865 d1[2] |
| 178 #define XFIX_1_175875602 d1[3] | 182 #define XFIX_1_175875602 d1[3] |
| 179 #define XFIX_1_175875602_MINUS_0_390180644 d2[0] | 183 #define XFIX_1_175875602_MINUS_0_390180644 d2[0] |
| 180 #define XFIX_0_541196100_MINUS_1_847759065 d2[1] | 184 #define XFIX_0_541196100_MINUS_1_847759065 d2[1] |
| 181 #define XFIX_3_072711026_MINUS_2_562915447 d2[2] | 185 #define XFIX_3_072711026_MINUS_2_562915447 d2[2] |
| 182 #define XFIX_1_175875602_MINUS_1_961570560 d2[3] | 186 #define XFIX_1_175875602_MINUS_1_961570560 d2[3] |
| 183 | 187 |
| 184 .balign 16 | 188 .balign 16 |
| 185 jsimd_idct_islow_neon_consts: | 189 jsimd_idct_islow_neon_consts: |
| 186 .short FIX_0_899976223 /* d0[0] */ | 190 .short FIX_0_899976223 /* d0[0] */ |
| 187 .short FIX_0_541196100 /* d0[1] */ | 191 .short FIX_0_541196100 /* d0[1] */ |
| 188 .short FIX_2_562915447 /* d0[2] */ | 192 .short FIX_2_562915447 /* d0[2] */ |
| 189 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ | 193 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ |
| 190 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ | 194 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ |
| 191 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ | 195 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ |
| 192 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ | 196 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ |
| 193 .short FIX_1_175875602 /* d1[3] */ | 197 .short FIX_1_175875602 /* d1[3] */ |
| 194 /* reloadable constants */ | 198 /* reloadable constants */ |
| 195 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ | 199 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ |
| 196 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ | 200 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ |
| 197 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ | 201 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ |
| 198 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ | 202 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ |
| 199 | 203 |
| 200 asm_function jsimd_idct_islow_neon | 204 asm_function jsimd_idct_islow_neon |
| 201 | 205 |
| 202 DCT_TABLE .req r0 | 206 DCT_TABLE .req r0 |
| 203 COEF_BLOCK .req r1 | 207 COEF_BLOCK .req r1 |
| 204 OUTPUT_BUF .req r2 | 208 OUTPUT_BUF .req r2 |
| 205 OUTPUT_COL .req r3 | 209 OUTPUT_COL .req r3 |
| 206 TMP1 .req r0 | 210 TMP1 .req r0 |
| 207 TMP2 .req r1 | 211 TMP2 .req r1 |
| 208 TMP3 .req r2 | 212 TMP3 .req r2 |
| (...skipping 38 matching lines...) |
| 247 vmul.s16 q9, q9, q1 | 251 vmul.s16 q9, q9, q1 |
| 248 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! | 252 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! |
| 249 vmul.s16 q10, q10, q2 | 253 vmul.s16 q10, q10, q2 |
| 250 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! | 254 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
| 251 vmul.s16 q11, q11, q3 | 255 vmul.s16 q11, q11, q3 |
| 252 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] | 256 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] |
| 253 vmul.s16 q12, q12, q0 | 257 vmul.s16 q12, q12, q0 |
| 254 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! | 258 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
| 255 vmul.s16 q14, q14, q2 | 259 vmul.s16 q14, q14, q2 |
| 256 vmul.s16 q13, q13, q1 | 260 vmul.s16 q13, q13, q1 |
| 257 vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ | 261 vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ |
| 258 add ip, ip, #16 | 262 add ip, ip, #16 |
| 259 vmul.s16 q15, q15, q3 | 263 vmul.s16 q15, q15, q3 |
| 260 vpush {d8-d15} /* save NEON registers */ | 264 vpush {d8-d15} /* save NEON registers */ |
| 261 /* 1-D IDCT, pass 1, left 4x8 half */ | 265 /* 1-D IDCT, pass 1, left 4x8 half */ |
| 262 vadd.s16 d4, ROW7L, ROW3L | 266 vadd.s16 d4, ROW7L, ROW3L |
| 263 vadd.s16 d5, ROW5L, ROW1L | 267 vadd.s16 d5, ROW5L, ROW1L |
| 264 vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 | 268 vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 |
| 265 vmlal.s16 q6, d5, XFIX_1_175875602 | 269 vmlal.s16 q6, d5, XFIX_1_175875602 |
| 266 vmull.s16 q7, d4, XFIX_1_175875602 | 270 vmull.s16 q7, d4, XFIX_1_175875602 |
| 267 /* Check for the zero coefficients in the right 4x8 half */ | 271 /* Check for the zero coefficients in the right 4x8 half */ |
| 268 push {r4, r5} | 272 push {r4, r5} |
| 269 vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 | 273 vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 |
| 270 vsubl.s16 q3, ROW0L, ROW4L | 274 vsubl.s16 q3, ROW0L, ROW4L |
| 271 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] | 275 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] |
| 272 vmull.s16 q2, ROW2L, XFIX_0_541196100 | 276 vmull.s16 q2, ROW2L, XFIX_0_541196100 |
| 273 vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 | 277 vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 |
| 274 orr r0, r4, r5 | 278 orr r0, r4, r5 |
| 275 vmov q4, q6 | 279 vmov q4, q6 |
| 276 vmlsl.s16 q6, ROW5L, XFIX_2_562915447 | 280 vmlsl.s16 q6, ROW5L, XFIX_2_562915447 |
| 277 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] | 281 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] |
| 278 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 | 282 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
| 279 vshl.s32 q3, q3, #13 | 283 vshl.s32 q3, q3, #13 |
| 280 orr r0, r0, r4 | 284 orr r0, r0, r4 |
| 281 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 | 285 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
| 282 orr r0, r0, r5 | 286 orr r0, r0, r5 |
| 283 vadd.s32 q1, q3, q2 | 287 vadd.s32 q1, q3, q2 |
| 284 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] | 288 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] |
| 285 vmov q5, q7 | 289 vmov q5, q7 |
| 286 vadd.s32 q1, q1, q6 | 290 vadd.s32 q1, q1, q6 |
| 287 orr r0, r0, r4 | 291 orr r0, r0, r4 |
| 288 vmlsl.s16 q7, ROW7L, XFIX_0_899976223 | 292 vmlsl.s16 q7, ROW7L, XFIX_0_899976223 |
| 289 orr r0, r0, r5 | 293 orr r0, r0, r5 |
| 290 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 | 294 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
| 291 vrshrn.s32 ROW1L, q1, #11 | 295 vrshrn.s32 ROW1L, q1, #11 |
| 292 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] | 296 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] |
| 293 vsub.s32 q1, q1, q6 | 297 vsub.s32 q1, q1, q6 |
| 294 vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 | 298 vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 |
| 295 orr r0, r0, r4 | 299 orr r0, r0, r4 |
| 296 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 | 300 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
| 297 orr r0, r0, r5 | 301 orr r0, r0, r5 |
| 298 vsub.s32 q1, q1, q6 | 302 vsub.s32 q1, q1, q6 |
| 299 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 | 303 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
| 300 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] | 304 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] |
| 301 vmlal.s16 q6, ROW6L, XFIX_0_541196100 | 305 vmlal.s16 q6, ROW6L, XFIX_0_541196100 |
| 302 vsub.s32 q3, q3, q2 | 306 vsub.s32 q3, q3, q2 |
| 303 orr r0, r0, r4 | 307 orr r0, r0, r4 |
| 304 vrshrn.s32 ROW6L, q1, #11 | 308 vrshrn.s32 ROW6L, q1, #11 |
| 305 orr r0, r0, r5 | 309 orr r0, r0, r5 |
| 306 vadd.s32 q1, q3, q5 | 310 vadd.s32 q1, q3, q5 |
| 307 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] | 311 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] |
| 308 vsub.s32 q3, q3, q5 | 312 vsub.s32 q3, q3, q5 |
| 309 vaddl.s16 q5, ROW0L, ROW4L | 313 vaddl.s16 q5, ROW0L, ROW4L |
| 310 orr r0, r0, r4 | 314 orr r0, r0, r4 |
| 311 vrshrn.s32 ROW2L, q1, #11 | 315 vrshrn.s32 ROW2L, q1, #11 |
| 312 orr r0, r0, r5 | 316 orr r0, r0, r5 |
| 313 vrshrn.s32 ROW5L, q3, #11 | 317 vrshrn.s32 ROW5L, q3, #11 |
| 314 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] | 318 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] |
| 315 vshl.s32 q5, q5, #13 | 319 vshl.s32 q5, q5, #13 |
| 316 vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 | 320 vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 |
| 317 orr r0, r0, r4 | 321 orr r0, r0, r4 |
| 318 vadd.s32 q2, q5, q6 | 322 vadd.s32 q2, q5, q6 |
| 319 orrs r0, r0, r5 | 323 orrs r0, r0, r5 |
| 320 vsub.s32 q1, q5, q6 | 324 vsub.s32 q1, q5, q6 |
| 321 vadd.s32 q6, q2, q7 | 325 vadd.s32 q6, q2, q7 |
| 322 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] | 326 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] |
| 323 vsub.s32 q2, q2, q7 | 327 vsub.s32 q2, q2, q7 |
| 324 vadd.s32 q5, q1, q4 | 328 vadd.s32 q5, q1, q4 |
| 325 orr r0, r4, r5 | 329 orr r0, r4, r5 |
| 326 vsub.s32 q3, q1, q4 | 330 vsub.s32 q3, q1, q4 |
| 327 pop {r4, r5} | 331 pop {r4, r5} |
| 328 vrshrn.s32 ROW7L, q2, #11 | 332 vrshrn.s32 ROW7L, q2, #11 |
| 329 vrshrn.s32 ROW3L, q5, #11 | 333 vrshrn.s32 ROW3L, q5, #11 |
| 330 vrshrn.s32 ROW0L, q6, #11 | 334 vrshrn.s32 ROW0L, q6, #11 |
| 331 vrshrn.s32 ROW4L, q3, #11 | 335 vrshrn.s32 ROW4L, q3, #11 |
| 332 | 336 |
| 333 beq 3f /* Go to do some special handling for the sparse right 4x8 half */ | 337 beq 3f /* Go to do some special handling for the sparse |
| 338 right 4x8 half */ |
| 334 | 339 |
| 335 /* 1-D IDCT, pass 1, right 4x8 half */ | 340 /* 1-D IDCT, pass 1, right 4x8 half */ |
| 336 vld1.s16 {d2}, [ip, :64] /* reload constants */ | 341 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 337 vadd.s16 d10, ROW7R, ROW3R | 342 vadd.s16 d10, ROW7R, ROW3R |
| 338 vadd.s16 d8, ROW5R, ROW1R | 343 vadd.s16 d8, ROW5R, ROW1R |
| 339 /* Transpose left 4x8 half */ | 344 /* Transpose left 4x8 half */ |
| 340 vtrn.16 ROW6L, ROW7L | 345 vtrn.16 ROW6L, ROW7L |
| 341 vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 | 346 vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 |
| 342 vmlal.s16 q6, d8, XFIX_1_175875602 | 347 vmlal.s16 q6, d8, XFIX_1_175875602 |
| 343 vtrn.16 ROW2L, ROW3L | 348 vtrn.16 ROW2L, ROW3L |
| 344 vmull.s16 q7, d10, XFIX_1_175875602 | 349 vmull.s16 q7, d10, XFIX_1_175875602 |
| 345 vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 | 350 vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 |
| 346 vtrn.16 ROW0L, ROW1L | 351 vtrn.16 ROW0L, ROW1L |
| 347 vsubl.s16 q3, ROW0R, ROW4R | 352 vsubl.s16 q3, ROW0R, ROW4R |
| 348 vmull.s16 q2, ROW2R, XFIX_0_541196100 | 353 vmull.s16 q2, ROW2R, XFIX_0_541196100 |
| 349 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 | 354 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 |
| 350 vtrn.16 ROW4L, ROW5L | 355 vtrn.16 ROW4L, ROW5L |
| 351 vmov q4, q6 | 356 vmov q4, q6 |
| 352 vmlsl.s16 q6, ROW5R, XFIX_2_562915447 | 357 vmlsl.s16 q6, ROW5R, XFIX_2_562915447 |
| 353 vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 | 358 vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 |
| 354 vtrn.32 ROW1L, ROW3L | 359 vtrn.32 ROW1L, ROW3L |
| 355 vshl.s32 q3, q3, #13 | 360 vshl.s32 q3, q3, #13 |
| 356 vmlsl.s16 q4, ROW1R, XFIX_0_899976223 | 361 vmlsl.s16 q4, ROW1R, XFIX_0_899976223 |
| 357 vtrn.32 ROW4L, ROW6L | 362 vtrn.32 ROW4L, ROW6L |
| 358 vadd.s32 q1, q3, q2 | 363 vadd.s32 q1, q3, q2 |
| 359 vmov q5, q7 | 364 vmov q5, q7 |
| 360 vadd.s32 q1, q1, q6 | 365 vadd.s32 q1, q1, q6 |
| 361 vtrn.32 ROW0L, ROW2L | 366 vtrn.32 ROW0L, ROW2L |
| 362 vmlsl.s16 q7, ROW7R, XFIX_0_899976223 | 367 vmlsl.s16 q7, ROW7R, XFIX_0_899976223 |
| 363 vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 | 368 vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 |
| 364 vrshrn.s32 ROW1R, q1, #11 | 369 vrshrn.s32 ROW1R, q1, #11 |
| 365 vtrn.32 ROW5L, ROW7L | 370 vtrn.32 ROW5L, ROW7L |
| 366 vsub.s32 q1, q1, q6 | 371 vsub.s32 q1, q1, q6 |
| 367 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 | 372 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 |
| 368 vmlsl.s16 q5, ROW3R, XFIX_2_562915447 | 373 vmlsl.s16 q5, ROW3R, XFIX_2_562915447 |
| 369 vsub.s32 q1, q1, q6 | 374 vsub.s32 q1, q1, q6 |
| 370 vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 | 375 vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 |
| 371 vmlal.s16 q6, ROW6R, XFIX_0_541196100 | 376 vmlal.s16 q6, ROW6R, XFIX_0_541196100 |
| 372 vsub.s32 q3, q3, q2 | 377 vsub.s32 q3, q3, q2 |
| 373 vrshrn.s32 ROW6R, q1, #11 | 378 vrshrn.s32 ROW6R, q1, #11 |
| 374 vadd.s32 q1, q3, q5 | 379 vadd.s32 q1, q3, q5 |
| 375 vsub.s32 q3, q3, q5 | 380 vsub.s32 q3, q3, q5 |
| 376 vaddl.s16 q5, ROW0R, ROW4R | 381 vaddl.s16 q5, ROW0R, ROW4R |
| 377 vrshrn.s32 ROW2R, q1, #11 | 382 vrshrn.s32 ROW2R, q1, #11 |
| 378 vrshrn.s32 ROW5R, q3, #11 | 383 vrshrn.s32 ROW5R, q3, #11 |
| 379 vshl.s32 q5, q5, #13 | 384 vshl.s32 q5, q5, #13 |
| 380 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 | 385 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 |
| 381 vadd.s32 q2, q5, q6 | 386 vadd.s32 q2, q5, q6 |
| 382 vsub.s32 q1, q5, q6 | 387 vsub.s32 q1, q5, q6 |
| 383 vadd.s32 q6, q2, q7 | 388 vadd.s32 q6, q2, q7 |
| 384 vsub.s32 q2, q2, q7 | 389 vsub.s32 q2, q2, q7 |
| 385 vadd.s32 q5, q1, q4 | 390 vadd.s32 q5, q1, q4 |
| 386 vsub.s32 q3, q1, q4 | 391 vsub.s32 q3, q1, q4 |
| 387 vrshrn.s32 ROW7R, q2, #11 | 392 vrshrn.s32 ROW7R, q2, #11 |
| 388 vrshrn.s32 ROW3R, q5, #11 | 393 vrshrn.s32 ROW3R, q5, #11 |
| 389 vrshrn.s32 ROW0R, q6, #11 | 394 vrshrn.s32 ROW0R, q6, #11 |
| 390 vrshrn.s32 ROW4R, q3, #11 | 395 vrshrn.s32 ROW4R, q3, #11 |
| 391 /* Transpose right 4x8 half */ | 396 /* Transpose right 4x8 half */ |
| 392 vtrn.16 ROW6R, ROW7R | 397 vtrn.16 ROW6R, ROW7R |
| 393 vtrn.16 ROW2R, ROW3R | 398 vtrn.16 ROW2R, ROW3R |
| 394 vtrn.16 ROW0R, ROW1R | 399 vtrn.16 ROW0R, ROW1R |
| 395 vtrn.16 ROW4R, ROW5R | 400 vtrn.16 ROW4R, ROW5R |
| 396 vtrn.32 ROW1R, ROW3R | 401 vtrn.32 ROW1R, ROW3R |
| 397 vtrn.32 ROW4R, ROW6R | 402 vtrn.32 ROW4R, ROW6R |
| 398 vtrn.32 ROW0R, ROW2R | 403 vtrn.32 ROW0R, ROW2R |
| 399 vtrn.32 ROW5R, ROW7R | 404 vtrn.32 ROW5R, ROW7R |
| 400 | 405 |
| 401 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ | 406 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ |
| 402 vld1.s16 {d2}, [ip, :64] /* reload constants */ | 407 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 403 vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ | 408 vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ |
| 404 vmlal.s16 q6, ROW1L, XFIX_1_175875602 | 409 vmlal.s16 q6, ROW1L, XFIX_1_175875602 |
| 405 vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ | 410 vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ |
| 406 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 | 411 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 |
| 407 vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ | 412 vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ |
| 408 vmlal.s16 q7, ROW3L, XFIX_1_175875602 | 413 vmlal.s16 q7, ROW3L, XFIX_1_175875602 |
| 409 vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ | 414 vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ |
| 410 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 | 415 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 |
| 411 vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ | 416 vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ |
| 412 vmull.s16 q2, ROW2L, XFIX_0_541196100 | 417 vmull.s16 q2, ROW2L, XFIX_0_541196100 |
| 413 vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ | 418 vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ |
| 414 vmov q4, q6 | 419 vmov q4, q6 |
| 415 vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ | 420 vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ |
| 416 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 | 421 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
| 417 vshl.s32 q3, q3, #13 | 422 vshl.s32 q3, q3, #13 |
| 418 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 | 423 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
| 419 vadd.s32 q1, q3, q2 | 424 vadd.s32 q1, q3, q2 |
| 420 vmov q5, q7 | 425 vmov q5, q7 |
| 421 vadd.s32 q1, q1, q6 | 426 vadd.s32 q1, q1, q6 |
| 422 vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ | 427 vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ |
| 423 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 | 428 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
| 424 vshrn.s32 ROW1L, q1, #16 | 429 vshrn.s32 ROW1L, q1, #16 |
| 425 vsub.s32 q1, q1, q6 | 430 vsub.s32 q1, q1, q6 |
| 426 vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ | 431 vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ |
| 427 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 | 432 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
| 428 vsub.s32 q1, q1, q6 | 433 vsub.s32 q1, q1, q6 |
| 429 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 | 434 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
| 430 vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ | 435 vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ |
| 431 vsub.s32 q3, q3, q2 | 436 vsub.s32 q3, q3, q2 |
| 432 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ | 437 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ |
| 433 vadd.s32 q1, q3, q5 | 438 vadd.s32 q1, q3, q5 |
| 434 vsub.s32 q3, q3, q5 | 439 vsub.s32 q3, q3, q5 |
| 435 vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ | 440 vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ |
| 436 vshrn.s32 ROW2L, q1, #16 | 441 vshrn.s32 ROW2L, q1, #16 |
| 437 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ | 442 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ |
| 438 vshl.s32 q5, q5, #13 | 443 vshl.s32 q5, q5, #13 |
| 439 vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ | 444 vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ |
| 440 vadd.s32 q2, q5, q6 | 445 vadd.s32 q2, q5, q6 |
| 441 vsub.s32 q1, q5, q6 | 446 vsub.s32 q1, q5, q6 |
| 442 vadd.s32 q6, q2, q7 | 447 vadd.s32 q6, q2, q7 |
| 443 vsub.s32 q2, q2, q7 | 448 vsub.s32 q2, q2, q7 |
| 444 vadd.s32 q5, q1, q4 | 449 vadd.s32 q5, q1, q4 |
| 445 vsub.s32 q3, q1, q4 | 450 vsub.s32 q3, q1, q4 |
| 446 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ | 451 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ |
| 447 vshrn.s32 ROW3L, q5, #16 | 452 vshrn.s32 ROW3L, q5, #16 |
| 448 vshrn.s32 ROW0L, q6, #16 | 453 vshrn.s32 ROW0L, q6, #16 |
| 449 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ | 454 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ |
| 450 /* 1-D IDCT, pass 2, right 4x8 half */ | 455 /* 1-D IDCT, pass 2, right 4x8 half */ |
| 451 vld1.s16 {d2}, [ip, :64] /* reload constants */ | 456 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 452 vmull.s16 q6, ROW5R, XFIX_1_175875602 | 457 vmull.s16 q6, ROW5R, XFIX_1_175875602 |
| 453 vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ | 458 vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ |
| 454 vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 | 459 vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 |
| 455 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ | 460 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ |
| 456 vmull.s16 q7, ROW7R, XFIX_1_175875602 | 461 vmull.s16 q7, ROW7R, XFIX_1_175875602 |
| 457 vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ | 462 vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ |
| 458 vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 | 463 vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 |
| 459 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ | 464 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ |
| 460 vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ | 465 vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ |
| 461 vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ | 466 vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ |
| 462 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 | 467 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 |
| 463 vmov q4, q6 | 468 vmov q4, q6 |
| 464 vmlsl.s16 q6, ROW5R, XFIX_2_562915447 | 469 vmlsl.s16 q6, ROW5R, XFIX_2_562915447 |
| 465 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ | 470 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ |
| 466 vshl.s32 q3, q3, #13 | 471 vshl.s32 q3, q3, #13 |
| 467 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ | 472 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ |
| 468 vadd.s32 q1, q3, q2 | 473 vadd.s32 q1, q3, q2 |
| 469 vmov q5, q7 | 474 vmov q5, q7 |
| 470 vadd.s32 q1, q1, q6 | 475 vadd.s32 q1, q1, q6 |
| 471 vmlsl.s16 q7, ROW7R, XFIX_0_899976223 | 476 vmlsl.s16 q7, ROW7R, XFIX_0_899976223 |
| 472 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ | 477 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ |
| 473 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ | 478 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ |
| 474 vsub.s32 q1, q1, q6 | 479 vsub.s32 q1, q1, q6 |
| 475 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 | 480 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 |
| 476 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ | 481 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ |
| 477 vsub.s32 q1, q1, q6 | 482 vsub.s32 q1, q1, q6 |
| 478 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ | 483 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ |
| 479 vmlal.s16 q6, ROW6R, XFIX_0_541196100 | 484 vmlal.s16 q6, ROW6R, XFIX_0_541196100 |
| 480 vsub.s32 q3, q3, q2 | 485 vsub.s32 q3, q3, q2 |
| 481 vshrn.s32 ROW6R, q1, #16 | 486 vshrn.s32 ROW6R, q1, #16 |
| 482 vadd.s32 q1, q3, q5 | 487 vadd.s32 q1, q3, q5 |
| 483 vsub.s32 q3, q3, q5 | 488 vsub.s32 q3, q3, q5 |
| 484 vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ | 489 vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ |
| 485 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ | 490 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ |
| 486 vshrn.s32 ROW5R, q3, #16 | 491 vshrn.s32 ROW5R, q3, #16 |
| 487 vshl.s32 q5, q5, #13 | 492 vshl.s32 q5, q5, #13 |
| 488 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 | 493 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 |
| 489 vadd.s32 q2, q5, q6 | 494 vadd.s32 q2, q5, q6 |
| 490 vsub.s32 q1, q5, q6 | 495 vsub.s32 q1, q5, q6 |
| 491 vadd.s32 q6, q2, q7 | 496 vadd.s32 q6, q2, q7 |
| 492 vsub.s32 q2, q2, q7 | 497 vsub.s32 q2, q2, q7 |
| 493 vadd.s32 q5, q1, q4 | 498 vadd.s32 q5, q1, q4 |
| 494 vsub.s32 q3, q1, q4 | 499 vsub.s32 q3, q1, q4 |
| 495 vshrn.s32 ROW7R, q2, #16 | 500 vshrn.s32 ROW7R, q2, #16 |
| 496 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ | 501 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ |
| 497 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ | 502 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ |
| 498 vshrn.s32 ROW4R, q3, #16 | 503 vshrn.s32 ROW4R, q3, #16 |
| 499 | 504 |
| 500 2: /* Descale to 8-bit and range limit */ | 505 2: /* Descale to 8-bit and range limit */ |
| 501 vqrshrn.s16 d16, q8, #2 | 506 vqrshrn.s16 d16, q8, #2 |
| 502 vqrshrn.s16 d17, q9, #2 | 507 vqrshrn.s16 d17, q9, #2 |
| 503 vqrshrn.s16 d18, q10, #2 | 508 vqrshrn.s16 d18, q10, #2 |
| 504 vqrshrn.s16 d19, q11, #2 | 509 vqrshrn.s16 d19, q11, #2 |
| 505 vpop {d8-d15} /* restore NEON registers */ | 510 vpop {d8-d15} /* restore NEON registers */ |
| 506 vqrshrn.s16 d20, q12, #2 | 511 vqrshrn.s16 d20, q12, #2 |
| 507 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ | 512 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ |
| 508 vtrn.16 q8, q9 | 513 vtrn.16 q8, q9 |
| 509 vqrshrn.s16 d21, q13, #2 | 514 vqrshrn.s16 d21, q13, #2 |
| 510 vqrshrn.s16 d22, q14, #2 | 515 vqrshrn.s16 d22, q14, #2 |
| 511 vmov.u8 q0, #(CENTERJSAMPLE) | 516 vmov.u8 q0, #(CENTERJSAMPLE) |
| 512 vqrshrn.s16 d23, q15, #2 | 517 vqrshrn.s16 d23, q15, #2 |
| 513 vtrn.8 d16, d17 | 518 vtrn.8 d16, d17 |
| 514 vtrn.8 d18, d19 | 519 vtrn.8 d18, d19 |
| 515 vadd.u8 q8, q8, q0 | 520 vadd.u8 q8, q8, q0 |
| 516 vadd.u8 q9, q9, q0 | 521 vadd.u8 q9, q9, q0 |
| 517 vtrn.16 q10, q11 | 522 vtrn.16 q10, q11 |
| 518 /* Store results to the output buffer */ | 523 /* Store results to the output buffer */ |
| 519 ldmia OUTPUT_BUF!, {TMP1, TMP2} | 524 ldmia OUTPUT_BUF!, {TMP1, TMP2} |
| 520 add TMP1, TMP1, OUTPUT_COL | 525 add TMP1, TMP1, OUTPUT_COL |
| 521 add TMP2, TMP2, OUTPUT_COL | 526 add TMP2, TMP2, OUTPUT_COL |
| 522 vst1.8 {d16}, [TMP1] | 527 vst1.8 {d16}, [TMP1] |
| 523 vtrn.8 d20, d21 | 528 vtrn.8 d20, d21 |
| 524 vst1.8 {d17}, [TMP2] | 529 vst1.8 {d17}, [TMP2] |
| 525 ldmia OUTPUT_BUF!, {TMP1, TMP2} | 530 ldmia OUTPUT_BUF!, {TMP1, TMP2} |
| 526 add TMP1, TMP1, OUTPUT_COL | 531 add TMP1, TMP1, OUTPUT_COL |
| 527 add TMP2, TMP2, OUTPUT_COL | 532 add TMP2, TMP2, OUTPUT_COL |
| 528 vst1.8 {d18}, [TMP1] | 533 vst1.8 {d18}, [TMP1] |
| 529 vadd.u8 q10, q10, q0 | 534 vadd.u8 q10, q10, q0 |
| 530 vst1.8 {d19}, [TMP2] | 535 vst1.8 {d19}, [TMP2] |
| 531 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} | 536 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} |
| 532 add TMP1, TMP1, OUTPUT_COL | 537 add TMP1, TMP1, OUTPUT_COL |
| 533 add TMP2, TMP2, OUTPUT_COL | 538 add TMP2, TMP2, OUTPUT_COL |
| 534 add TMP3, TMP3, OUTPUT_COL | 539 add TMP3, TMP3, OUTPUT_COL |
| 535 add TMP4, TMP4, OUTPUT_COL | 540 add TMP4, TMP4, OUTPUT_COL |
| 536 vtrn.8 d22, d23 | 541 vtrn.8 d22, d23 |
| 537 vst1.8 {d20}, [TMP1] | 542 vst1.8 {d20}, [TMP1] |
| 538 vadd.u8 q11, q11, q0 | 543 vadd.u8 q11, q11, q0 |
| 539 vst1.8 {d21}, [TMP2] | 544 vst1.8 {d21}, [TMP2] |
| 540 vst1.8 {d22}, [TMP3] | 545 vst1.8 {d22}, [TMP3] |
| 541 vst1.8 {d23}, [TMP4] | 546 vst1.8 {d23}, [TMP4] |
| 542 bx lr | 547 bx lr |
| 543 | 548 |
| 544 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ | 549 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ |
| 545 | 550 |
| 546 /* Transpose left 4x8 half */ | 551 /* Transpose left 4x8 half */ |
| 547 vtrn.16 ROW6L, ROW7L | 552 vtrn.16 ROW6L, ROW7L |
| 548 vtrn.16 ROW2L, ROW3L | 553 vtrn.16 ROW2L, ROW3L |
| 549 vtrn.16 ROW0L, ROW1L | 554 vtrn.16 ROW0L, ROW1L |
| 550 vtrn.16 ROW4L, ROW5L | 555 vtrn.16 ROW4L, ROW5L |
| 551 vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ | 556 vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ |
| 552 vtrn.32 ROW1L, ROW3L | 557 vtrn.32 ROW1L, ROW3L |
| 553 vtrn.32 ROW4L, ROW6L | 558 vtrn.32 ROW4L, ROW6L |
| 554 vtrn.32 ROW0L, ROW2L | 559 vtrn.32 ROW0L, ROW2L |
| 555 vtrn.32 ROW5L, ROW7L | 560 vtrn.32 ROW5L, ROW7L |
| 556 | 561 |
| 557 cmp r0, #0 | 562 cmp r0, #0 |
| 558 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ | 563 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second |
| 564 pass */ |
| 559 | 565 |
| 560 /* Only row 0 is non-zero for the right 4x8 half */ | 566 /* Only row 0 is non-zero for the right 4x8 half */ |
| 561 vdup.s16 ROW1R, ROW0R[1] | 567 vdup.s16 ROW1R, ROW0R[1] |
| 562 vdup.s16 ROW2R, ROW0R[2] | 568 vdup.s16 ROW2R, ROW0R[2] |
| 563 vdup.s16 ROW3R, ROW0R[3] | 569 vdup.s16 ROW3R, ROW0R[3] |
| 564 vdup.s16 ROW4R, ROW0R[0] | 570 vdup.s16 ROW4R, ROW0R[0] |
| 565 vdup.s16 ROW5R, ROW0R[1] | 571 vdup.s16 ROW5R, ROW0R[1] |
| 566 vdup.s16 ROW6R, ROW0R[2] | 572 vdup.s16 ROW6R, ROW0R[2] |
| 567 vdup.s16 ROW7R, ROW0R[3] | 573 vdup.s16 ROW7R, ROW0R[3] |
| 568 vdup.s16 ROW0R, ROW0R[0] | 574 vdup.s16 ROW0R, ROW0R[0] |
| 569 b 1b /* Go to 'normal' second pass */ | 575 b 1b /* Go to 'normal' second pass */ |
| 570 | 576 |
| 571 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ | 577 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ |
| 572 vld1.s16 {d2}, [ip, :64] /* reload constants */ | 578 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 573 vmull.s16 q6, ROW1L, XFIX_1_175875602 | 579 vmull.s16 q6, ROW1L, XFIX_1_175875602 |
| 574 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 | 580 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 |
| 575 vmull.s16 q7, ROW3L, XFIX_1_175875602 | 581 vmull.s16 q7, ROW3L, XFIX_1_175875602 |
| 576 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 | 582 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 |
| 577 vmull.s16 q2, ROW2L, XFIX_0_541196100 | 583 vmull.s16 q2, ROW2L, XFIX_0_541196100 |
| 578 vshll.s16 q3, ROW0L, #13 | 584 vshll.s16 q3, ROW0L, #13 |
| 579 vmov q4, q6 | 585 vmov q4, q6 |
| 580 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 | 586 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
| 581 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 | 587 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
| 582 vadd.s32 q1, q3, q2 | 588 vadd.s32 q1, q3, q2 |
| 583 vmov q5, q7 | 589 vmov q5, q7 |
| 584 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 | 590 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
| 585 vadd.s32 q1, q1, q6 | 591 vadd.s32 q1, q1, q6 |
| 586 vadd.s32 q6, q6, q6 | 592 vadd.s32 q6, q6, q6 |
| 587 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 | 593 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
| 588 vshrn.s32 ROW1L, q1, #16 | 594 vshrn.s32 ROW1L, q1, #16 |
| 589 vsub.s32 q1, q1, q6 | 595 vsub.s32 q1, q1, q6 |
| 590 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 | 596 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
| 591 vsub.s32 q3, q3, q2 | 597 vsub.s32 q3, q3, q2 |
| 592 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ | 598 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ |
| 593 vadd.s32 q1, q3, q5 | 599 vadd.s32 q1, q3, q5 |
| 594 vsub.s32 q3, q3, q5 | 600 vsub.s32 q3, q3, q5 |
| 595 vshll.s16 q5, ROW0L, #13 | 601 vshll.s16 q5, ROW0L, #13 |
| 596 vshrn.s32 ROW2L, q1, #16 | 602 vshrn.s32 ROW2L, q1, #16 |
| 597 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ | 603 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ |
| 598 vadd.s32 q2, q5, q6 | 604 vadd.s32 q2, q5, q6 |
| 599 vsub.s32 q1, q5, q6 | 605 vsub.s32 q1, q5, q6 |
| 600 vadd.s32 q6, q2, q7 | 606 vadd.s32 q6, q2, q7 |
| 601 vsub.s32 q2, q2, q7 | 607 vsub.s32 q2, q2, q7 |
| 602 vadd.s32 q5, q1, q4 | 608 vadd.s32 q5, q1, q4 |
| 603 vsub.s32 q3, q1, q4 | 609 vsub.s32 q3, q1, q4 |
| 604 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ | 610 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ |
| 605 vshrn.s32 ROW3L, q5, #16 | 611 vshrn.s32 ROW3L, q5, #16 |
| 606 vshrn.s32 ROW0L, q6, #16 | 612 vshrn.s32 ROW0L, q6, #16 |
| 607 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ | 613 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ |
| 608 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ | 614 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ |
| 609 vld1.s16 {d2}, [ip, :64] /* reload constants */ | 615 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 610 vmull.s16 q6, ROW5L, XFIX_1_175875602 | 616 vmull.s16 q6, ROW5L, XFIX_1_175875602 |
| 611 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 | 617 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 |
| 612 vmull.s16 q7, ROW7L, XFIX_1_175875602 | 618 vmull.s16 q7, ROW7L, XFIX_1_175875602 |
| 613 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 | 619 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 |
| 614 vmull.s16 q2, ROW6L, XFIX_0_541196100 | 620 vmull.s16 q2, ROW6L, XFIX_0_541196100 |
| 615 vshll.s16 q3, ROW4L, #13 | 621 vshll.s16 q3, ROW4L, #13 |
| 616 vmov q4, q6 | 622 vmov q4, q6 |
| 617 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 | 623 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 |
| 618 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 | 624 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 |
| 619 vadd.s32 q1, q3, q2 | 625 vadd.s32 q1, q3, q2 |
| 620 vmov q5, q7 | 626 vmov q5, q7 |
| 621 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 | 627 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 |
| 622 vadd.s32 q1, q1, q6 | 628 vadd.s32 q1, q1, q6 |
| 623 vadd.s32 q6, q6, q6 | 629 vadd.s32 q6, q6, q6 |
| 624 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 | 630 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 |
| 625 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ | 631 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ |
| 626 vsub.s32 q1, q1, q6 | 632 vsub.s32 q1, q1, q6 |
| 627 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 | 633 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 |
| 628 vsub.s32 q3, q3, q2 | 634 vsub.s32 q3, q3, q2 |
| 629 vshrn.s32 ROW6R, q1, #16 | 635 vshrn.s32 ROW6R, q1, #16 |
| 630 vadd.s32 q1, q3, q5 | 636 vadd.s32 q1, q3, q5 |
| 631 vsub.s32 q3, q3, q5 | 637 vsub.s32 q3, q3, q5 |
| 632 vshll.s16 q5, ROW4L, #13 | 638 vshll.s16 q5, ROW4L, #13 |
| 633 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ | 639 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ |
| 634 vshrn.s32 ROW5R, q3, #16 | 640 vshrn.s32 ROW5R, q3, #16 |
| 635 vadd.s32 q2, q5, q6 | 641 vadd.s32 q2, q5, q6 |
| 636 vsub.s32 q1, q5, q6 | 642 vsub.s32 q1, q5, q6 |
| 637 vadd.s32 q6, q2, q7 | 643 vadd.s32 q6, q2, q7 |
| 638 vsub.s32 q2, q2, q7 | 644 vsub.s32 q2, q2, q7 |
| 639 vadd.s32 q5, q1, q4 | 645 vadd.s32 q5, q1, q4 |
| 640 vsub.s32 q3, q1, q4 | 646 vsub.s32 q3, q1, q4 |
| 641 vshrn.s32 ROW7R, q2, #16 | 647 vshrn.s32 ROW7R, q2, #16 |
| 642 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ | 648 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ |
| 643 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ | 649 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ |
| 644 vshrn.s32 ROW4R, q3, #16 | 650 vshrn.s32 ROW4R, q3, #16 |
| 645 b 2b /* Go to epilogue */ | 651 b 2b /* Go to epilogue */ |
| 646 | 652 |
| 647 .unreq DCT_TABLE | 653 .unreq DCT_TABLE |
| 648 .unreq COEF_BLOCK | 654 .unreq COEF_BLOCK |
| 649 .unreq OUTPUT_BUF | 655 .unreq OUTPUT_BUF |
| 650 .unreq OUTPUT_COL | 656 .unreq OUTPUT_COL |
| 651 .unreq TMP1 | 657 .unreq TMP1 |
| 652 .unreq TMP2 | 658 .unreq TMP2 |
| 653 .unreq TMP3 | 659 .unreq TMP3 |
| 654 .unreq TMP4 | 660 .unreq TMP4 |
| 655 | 661 |
| (...skipping 33 matching lines...) |
| 689 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. | 695 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. |
| 690 */ | 696 */ |
| 691 | 697 |
| 692 #define XFIX_1_082392200 d0[0] | 698 #define XFIX_1_082392200 d0[0] |
| 693 #define XFIX_1_414213562 d0[1] | 699 #define XFIX_1_414213562 d0[1] |
| 694 #define XFIX_1_847759065 d0[2] | 700 #define XFIX_1_847759065 d0[2] |
| 695 #define XFIX_2_613125930 d0[3] | 701 #define XFIX_2_613125930 d0[3] |
| 696 | 702 |
| 697 .balign 16 | 703 .balign 16 |
| 698 jsimd_idct_ifast_neon_consts: | 704 jsimd_idct_ifast_neon_consts: |
| 699 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ | 705 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
| 700 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ | 706 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
| 701 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ | 707 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
| 702 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ | 708 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
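A scalar sketch of how these Q15 constants are applied (assuming the usual vqdmulh.s16 semantics, (a * b * 2) >> 16): a factor 1 < c < 2 is realised as x + x * (c - 1), and a factor c > 2 as 2 * x + x * (c - 2), which is why the last entry subtracts 512 * 128 rather than 256 * 128.

    #include <stdint.h>
    #include <stdio.h>

    static int16_t vqdmulh_s16(int16_t a, int16_t b)     /* scalar model of vqdmulh.s16 */
    {
      return (int16_t)(((int32_t)a * b * 2) >> 16);
    }

    int main(void)
    {
      int16_t x = 1000;
      int16_t xfix_1_414213562 = 362 * 128 - 256 * 128;  /* ~(sqrt(2) - 1) in Q15 */

      /* x * 1.414213562 is computed in the code below as vqdmulh + vadd */
      int16_t y = x + vqdmulh_s16(x, xfix_1_414213562);
      printf("%d\n", y);                                  /* prints 1414 */
      return 0;
    }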
| 703 | 709 |
| 704 asm_function jsimd_idct_ifast_neon | 710 asm_function jsimd_idct_ifast_neon |
| 705 | 711 |
| 706 DCT_TABLE .req r0 | 712 DCT_TABLE .req r0 |
| 707 COEF_BLOCK .req r1 | 713 COEF_BLOCK .req r1 |
| 708 OUTPUT_BUF .req r2 | 714 OUTPUT_BUF .req r2 |
| 709 OUTPUT_COL .req r3 | 715 OUTPUT_COL .req r3 |
| 710 TMP1 .req r0 | 716 TMP1 .req r0 |
| 711 TMP2 .req r1 | 717 TMP2 .req r1 |
| 712 TMP3 .req r2 | 718 TMP3 .req r2 |
| 713 TMP4 .req ip | 719 TMP4 .req ip |
| 714 | 720 |
| 715 /* Load and dequantize coefficients into NEON registers | 721 /* Load and dequantize coefficients into NEON registers |
| 716 * with the following allocation: | 722 * with the following allocation: |
| 717 * 0 1 2 3 | 4 5 6 7 | 723 * 0 1 2 3 | 4 5 6 7 |
| 718 * ---------+-------- | 724 * ---------+-------- |
| 719 * 0 | d16 | d17 ( q8 ) | 725 * 0 | d16 | d17 ( q8 ) |
| 720 * 1 | d18 | d19 ( q9 ) | 726 * 1 | d18 | d19 ( q9 ) |
| 721 * 2 | d20 | d21 ( q10 ) | 727 * 2 | d20 | d21 ( q10 ) |
| 722 * 3 | d22 | d23 ( q11 ) | 728 * 3 | d22 | d23 ( q11 ) |
| 723 * 4 | d24 | d25 ( q12 ) | 729 * 4 | d24 | d25 ( q12 ) |
| 724 * 5 | d26 | d27 ( q13 ) | 730 * 5 | d26 | d27 ( q13 ) |
| 725 * 6 | d28 | d29 ( q14 ) | 731 * 6 | d28 | d29 ( q14 ) |
| 726 * 7 | d30 | d31 ( q15 ) | 732 * 7 | d30 | d31 ( q15 ) |
| 727 */ | 733 */ |
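In scalar terms, the interleaved load/multiply block that follows is just an element-wise dequantization of the 8x8 coefficient block (sketch; the function and parameter names are illustrative, not taken from the library):

    #include <stdint.h>

    static void dequantize_block(const int16_t coef_block[64],
                                 const int16_t dct_table[64],
                                 int16_t block[64])
    {
      /* each vmul.s16 below performs 8 of these multiplies at once */
      for (int i = 0; i < 64; i++)
        block[i] = (int16_t)(coef_block[i] * dct_table[i]);
    }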
| 728 adr ip, jsimd_idct_ifast_neon_consts | 734 adr ip, jsimd_idct_ifast_neon_consts |
| 729 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! | 735 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! |
| 730 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! | 736 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
| 731 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! | 737 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! |
| 732 vmul.s16 q8, q8, q0 | 738 vmul.s16 q8, q8, q0 |
| 733 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! | 739 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
| 734 vmul.s16 q9, q9, q1 | 740 vmul.s16 q9, q9, q1 |
| 735 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! | 741 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! |
| 736 vmul.s16 q10, q10, q2 | 742 vmul.s16 q10, q10, q2 |
| 737 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! | 743 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
| 738 vmul.s16 q11, q11, q3 | 744 vmul.s16 q11, q11, q3 |
| 739 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] | 745 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] |
| 740 vmul.s16 q12, q12, q0 | 746 vmul.s16 q12, q12, q0 |
| 741 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! | 747 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
| 742 vmul.s16 q14, q14, q2 | 748 vmul.s16 q14, q14, q2 |
| 743 vmul.s16 q13, q13, q1 | 749 vmul.s16 q13, q13, q1 |
| 744 vld1.16 {d0}, [ip, :64] /* load constants */ | 750 vld1.16 {d0}, [ip, :64] /* load constants */ |
| 745 vmul.s16 q15, q15, q3 | 751 vmul.s16 q15, q15, q3 |
| 746 vpush {d8-d13} /* save NEON registers */ | 752 vpush {d8-d13} /* save NEON registers */ |
| 747 /* 1-D IDCT, pass 1 */ | 753 /* 1-D IDCT, pass 1 */ |
| 748 vsub.s16 q2, q10, q14 | 754 vsub.s16 q2, q10, q14 |
| 749 vadd.s16 q14, q10, q14 | 755 vadd.s16 q14, q10, q14 |
| 750 vsub.s16 q1, q11, q13 | 756 vsub.s16 q1, q11, q13 |
| 751 vadd.s16 q13, q11, q13 | 757 vadd.s16 q13, q11, q13 |
| 752 vsub.s16 q5, q9, q15 | 758 vsub.s16 q5, q9, q15 |
| 753 vadd.s16 q15, q9, q15 | 759 vadd.s16 q15, q9, q15 |
| 754 vqdmulh.s16 q4, q2, XFIX_1_414213562 | 760 vqdmulh.s16 q4, q2, XFIX_1_414213562 |
| 755 vqdmulh.s16 q6, q1, XFIX_2_613125930 | 761 vqdmulh.s16 q6, q1, XFIX_2_613125930 |
| 756 vadd.s16 q3, q1, q1 | 762 vadd.s16 q3, q1, q1 |
| 757 vsub.s16 q1, q5, q1 | 763 vsub.s16 q1, q5, q1 |
| 758 vadd.s16 q10, q2, q4 | 764 vadd.s16 q10, q2, q4 |
| 759 vqdmulh.s16 q4, q1, XFIX_1_847759065 | 765 vqdmulh.s16 q4, q1, XFIX_1_847759065 |
| 760 vsub.s16 q2, q15, q13 | 766 vsub.s16 q2, q15, q13 |
| 761 vadd.s16 q3, q3, q6 | 767 vadd.s16 q3, q3, q6 |
| 762 vqdmulh.s16 q6, q2, XFIX_1_414213562 | 768 vqdmulh.s16 q6, q2, XFIX_1_414213562 |
| 763 vadd.s16 q1, q1, q4 | 769 vadd.s16 q1, q1, q4 |
| 764 vqdmulh.s16 q4, q5, XFIX_1_082392200 | 770 vqdmulh.s16 q4, q5, XFIX_1_082392200 |
| 765 vsub.s16 q10, q10, q14 | 771 vsub.s16 q10, q10, q14 |
| 766 vadd.s16 q2, q2, q6 | 772 vadd.s16 q2, q2, q6 |
| 767 vsub.s16 q6, q8, q12 | 773 vsub.s16 q6, q8, q12 |
| 768 vadd.s16 q12, q8, q12 | 774 vadd.s16 q12, q8, q12 |
| 769 vadd.s16 q9, q5, q4 | 775 vadd.s16 q9, q5, q4 |
| 770 vadd.s16 q5, q6, q10 | 776 vadd.s16 q5, q6, q10 |
| 771 vsub.s16 q10, q6, q10 | 777 vsub.s16 q10, q6, q10 |
| 772 vadd.s16 q6, q15, q13 | 778 vadd.s16 q6, q15, q13 |
| 773 vadd.s16 q8, q12, q14 | 779 vadd.s16 q8, q12, q14 |
| 774 vsub.s16 q3, q6, q3 | 780 vsub.s16 q3, q6, q3 |
| 775 vsub.s16 q12, q12, q14 | 781 vsub.s16 q12, q12, q14 |
| 776 vsub.s16 q3, q3, q1 | 782 vsub.s16 q3, q3, q1 |
| 777 vsub.s16 q1, q9, q1 | 783 vsub.s16 q1, q9, q1 |
| 778 vadd.s16 q2, q3, q2 | 784 vadd.s16 q2, q3, q2 |
| 779 vsub.s16 q15, q8, q6 | 785 vsub.s16 q15, q8, q6 |
| 780 vadd.s16 q1, q1, q2 | 786 vadd.s16 q1, q1, q2 |
| 781 vadd.s16 q8, q8, q6 | 787 vadd.s16 q8, q8, q6 |
| 782 vadd.s16 q14, q5, q3 | 788 vadd.s16 q14, q5, q3 |
| 783 vsub.s16 q9, q5, q3 | 789 vsub.s16 q9, q5, q3 |
| 784 vsub.s16 q13, q10, q2 | 790 vsub.s16 q13, q10, q2 |
| 785 vadd.s16 q10, q10, q2 | 791 vadd.s16 q10, q10, q2 |
| 786 /* Transpose */ | 792 /* Transpose */ |
| 787 vtrn.16 q8, q9 | 793 vtrn.16 q8, q9 |
| 788 vsub.s16 q11, q12, q1 | 794 vsub.s16 q11, q12, q1 |
| 789 vtrn.16 q14, q15 | 795 vtrn.16 q14, q15 |
| 790 vadd.s16 q12, q12, q1 | 796 vadd.s16 q12, q12, q1 |
| 791 vtrn.16 q10, q11 | 797 vtrn.16 q10, q11 |
| 792 vtrn.16 q12, q13 | 798 vtrn.16 q12, q13 |
| 793 vtrn.32 q9, q11 | 799 vtrn.32 q9, q11 |
| 794 vtrn.32 q12, q14 | 800 vtrn.32 q12, q14 |
| 795 vtrn.32 q8, q10 | 801 vtrn.32 q8, q10 |
| 796 vtrn.32 q13, q15 | 802 vtrn.32 q13, q15 |
| 797 vswp d28, d21 | 803 vswp d28, d21 |
| 798 vswp d26, d19 | 804 vswp d26, d19 |
| 799 /* 1-D IDCT, pass 2 */ | 805 /* 1-D IDCT, pass 2 */ |
| 800 vsub.s16 q2, q10, q14 | 806 vsub.s16 q2, q10, q14 |
| 801 vswp d30, d23 | 807 vswp d30, d23 |
| 802 vadd.s16 q14, q10, q14 | 808 vadd.s16 q14, q10, q14 |
| 803 vswp d24, d17 | 809 vswp d24, d17 |
| 804 vsub.s16 q1, q11, q13 | 810 vsub.s16 q1, q11, q13 |
| 805 vadd.s16 q13, q11, q13 | 811 vadd.s16 q13, q11, q13 |
| 806 vsub.s16 q5, q9, q15 | 812 vsub.s16 q5, q9, q15 |
| 807 vadd.s16 q15, q9, q15 | 813 vadd.s16 q15, q9, q15 |
| 808 vqdmulh.s16 q4, q2, XFIX_1_414213562 | 814 vqdmulh.s16 q4, q2, XFIX_1_414213562 |
| 809 vqdmulh.s16 q6, q1, XFIX_2_613125930 | 815 vqdmulh.s16 q6, q1, XFIX_2_613125930 |
| 810 vadd.s16 q3, q1, q1 | 816 vadd.s16 q3, q1, q1 |
| 811 vsub.s16 q1, q5, q1 | 817 vsub.s16 q1, q5, q1 |
| 812 vadd.s16 q10, q2, q4 | 818 vadd.s16 q10, q2, q4 |
| 813 vqdmulh.s16 q4, q1, XFIX_1_847759065 | 819 vqdmulh.s16 q4, q1, XFIX_1_847759065 |
| 814 vsub.s16 q2, q15, q13 | 820 vsub.s16 q2, q15, q13 |
| 815 vadd.s16 q3, q3, q6 | 821 vadd.s16 q3, q3, q6 |
| 816 vqdmulh.s16 q6, q2, XFIX_1_414213562 | 822 vqdmulh.s16 q6, q2, XFIX_1_414213562 |
| 817 vadd.s16 q1, q1, q4 | 823 vadd.s16 q1, q1, q4 |
| 818 vqdmulh.s16 q4, q5, XFIX_1_082392200 | 824 vqdmulh.s16 q4, q5, XFIX_1_082392200 |
| 819 vsub.s16 q10, q10, q14 | 825 vsub.s16 q10, q10, q14 |
| 820 vadd.s16 q2, q2, q6 | 826 vadd.s16 q2, q2, q6 |
| 821 vsub.s16 q6, q8, q12 | 827 vsub.s16 q6, q8, q12 |
| 822 vadd.s16 q12, q8, q12 | 828 vadd.s16 q12, q8, q12 |
| 823 vadd.s16 q9, q5, q4 | 829 vadd.s16 q9, q5, q4 |
| 824 vadd.s16 q5, q6, q10 | 830 vadd.s16 q5, q6, q10 |
| 825 vsub.s16 q10, q6, q10 | 831 vsub.s16 q10, q6, q10 |
| 826 vadd.s16 q6, q15, q13 | 832 vadd.s16 q6, q15, q13 |
| 827 vadd.s16 q8, q12, q14 | 833 vadd.s16 q8, q12, q14 |
| 828 vsub.s16 q3, q6, q3 | 834 vsub.s16 q3, q6, q3 |
| 829 vsub.s16 q12, q12, q14 | 835 vsub.s16 q12, q12, q14 |
| 830 vsub.s16 q3, q3, q1 | 836 vsub.s16 q3, q3, q1 |
| 831 vsub.s16 q1, q9, q1 | 837 vsub.s16 q1, q9, q1 |
| 832 vadd.s16 q2, q3, q2 | 838 vadd.s16 q2, q3, q2 |
| 833 vsub.s16 q15, q8, q6 | 839 vsub.s16 q15, q8, q6 |
| 834 vadd.s16 q1, q1, q2 | 840 vadd.s16 q1, q1, q2 |
| 835 vadd.s16 q8, q8, q6 | 841 vadd.s16 q8, q8, q6 |
| 836 vadd.s16 q14, q5, q3 | 842 vadd.s16 q14, q5, q3 |
| 837 vsub.s16 q9, q5, q3 | 843 vsub.s16 q9, q5, q3 |
| 838 vsub.s16 q13, q10, q2 | 844 vsub.s16 q13, q10, q2 |
| 839 vpop {d8-d13} /* restore NEON registers */ | 845 vpop {d8-d13} /* restore NEON registers */ |
| 840 vadd.s16 q10, q10, q2 | 846 vadd.s16 q10, q10, q2 |
| 841 vsub.s16 q11, q12, q1 | 847 vsub.s16 q11, q12, q1 |
| 842 vadd.s16 q12, q12, q1 | 848 vadd.s16 q12, q12, q1 |
| 843 /* Descale to 8-bit and range limit */ | 849 /* Descale to 8-bit and range limit */ |
| 844 vmov.u8 q0, #0x80 | 850 vmov.u8 q0, #0x80 |
| 845 vqshrn.s16 d16, q8, #5 | 851 vqshrn.s16 d16, q8, #5 |
| 846 vqshrn.s16 d17, q9, #5 | 852 vqshrn.s16 d17, q9, #5 |
| 847 vqshrn.s16 d18, q10, #5 | 853 vqshrn.s16 d18, q10, #5 |
| 848 vqshrn.s16 d19, q11, #5 | 854 vqshrn.s16 d19, q11, #5 |
| 849 vqshrn.s16 d20, q12, #5 | 855 vqshrn.s16 d20, q12, #5 |
| 850 vqshrn.s16 d21, q13, #5 | 856 vqshrn.s16 d21, q13, #5 |
| 851 vqshrn.s16 d22, q14, #5 | 857 vqshrn.s16 d22, q14, #5 |
| 852 vqshrn.s16 d23, q15, #5 | 858 vqshrn.s16 d23, q15, #5 |
| 853 vadd.u8 q8, q8, q0 | 859 vadd.u8 q8, q8, q0 |
| 854 vadd.u8 q9, q9, q0 | 860 vadd.u8 q9, q9, q0 |
| 855 vadd.u8 q10, q10, q0 | 861 vadd.u8 q10, q10, q0 |
| 856 vadd.u8 q11, q11, q0 | 862 vadd.u8 q11, q11, q0 |
| 857 /* Transpose the final 8-bit samples */ | 863 /* Transpose the final 8-bit samples */ |
| 858 vtrn.16 q8, q9 | 864 vtrn.16 q8, q9 |
| 859 vtrn.16 q10, q11 | 865 vtrn.16 q10, q11 |
| 860 vtrn.32 q8, q10 | 866 vtrn.32 q8, q10 |
| 861 vtrn.32 q9, q11 | 867 vtrn.32 q9, q11 |
| 862 vtrn.8 d16, d17 | 868 vtrn.8 d16, d17 |
| 863 vtrn.8 d18, d19 | 869 vtrn.8 d18, d19 |
| 864 /* Store results to the output buffer */ | 870 /* Store results to the output buffer */ |
| 865 ldmia OUTPUT_BUF!, {TMP1, TMP2} | 871 ldmia OUTPUT_BUF!, {TMP1, TMP2} |
| 866 add TMP1, TMP1, OUTPUT_COL | 872 add TMP1, TMP1, OUTPUT_COL |
| 867 add TMP2, TMP2, OUTPUT_COL | 873 add TMP2, TMP2, OUTPUT_COL |
| 868 vst1.8 {d16}, [TMP1] | 874 vst1.8 {d16}, [TMP1] |
| 869 vst1.8 {d17}, [TMP2] | 875 vst1.8 {d17}, [TMP2] |
| 870 ldmia OUTPUT_BUF!, {TMP1, TMP2} | 876 ldmia OUTPUT_BUF!, {TMP1, TMP2} |
| 871 add TMP1, TMP1, OUTPUT_COL | 877 add TMP1, TMP1, OUTPUT_COL |
| (...skipping 38 matching lines...) |
| 910 * The primary purpose of this particular NEON optimized function is | 916 * The primary purpose of this particular NEON optimized function is |
| 911 * bit exact compatibility with jpeg-6b. | 917 * bit exact compatibility with jpeg-6b. |
| 912 * | 918 * |
| 913 * TODO: slightly better instruction scheduling can be achieved by expanding | 919 * TODO: slightly better instruction scheduling can be achieved by expanding |
| 914 * the idct_helper/transpose_4x4 macros and reordering instructions, | 920 * the idct_helper/transpose_4x4 macros and reordering instructions, |
| 915 * but readability will suffer somewhat. | 921 * but readability will suffer somewhat. |
| 916 */ | 922 */ |
| 917 | 923 |
| 918 #define CONST_BITS 13 | 924 #define CONST_BITS 13 |
| 919 | 925 |
| 920 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ | 926 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ |
| 921 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ | 927 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ |
| 922 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ | 928 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ |
| 923 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ | 929 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ |
| 924 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ | 930 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ |
| 925 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ | 931 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ |
| 926 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ | 932 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ |
| 927 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ | 933 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ |
| 928 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ | 934 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ |
| 929 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ | 935 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ |
| 930 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ | 936 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ |
| 931 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ | 937 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ |
| 932 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ | 938 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ |
| 933 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ | 939 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ |
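These values follow the usual libjpeg convention of scaling each constant to Q13 fixed point, i.e. FIX(x) is x * 2^CONST_BITS rounded to the nearest integer. A minimal C sketch of that convention (the FIX macro itself is not part of this file; it is shown only to make the table above self-explanatory):

    #define CONST_BITS  13
    /* Round a floating-point constant to Q13 fixed point, as in jpeg-6b. */
    #define FIX(x)  ((int)((x) * (1 << CONST_BITS) + 0.5))

    /* Example: FIX(0.211164243) = (int)(0.211164243 * 8192 + 0.5) = 1730 */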
| 934 | 940 |
| 935 .balign 16 | 941 .balign 16 |
| 936 jsimd_idct_4x4_neon_consts: | 942 jsimd_idct_4x4_neon_consts: |
| 937 .short FIX_1_847759065 /* d0[0] */ | 943 .short FIX_1_847759065 /* d0[0] */ |
| 938 .short -FIX_0_765366865 /* d0[1] */ | 944 .short -FIX_0_765366865 /* d0[1] */ |
| 939 .short -FIX_0_211164243 /* d0[2] */ | 945 .short -FIX_0_211164243 /* d0[2] */ |
| 940 .short FIX_1_451774981 /* d0[3] */ | 946 .short FIX_1_451774981 /* d0[3] */ |
| 941 .short -FIX_2_172734803 /* d1[0] */ | 947 .short -FIX_2_172734803 /* d1[0] */ |
| 942 .short FIX_1_061594337 /* d1[1] */ | 948 .short FIX_1_061594337 /* d1[1] */ |
| 943 .short -FIX_0_509795579 /* d1[2] */ | 949 .short -FIX_0_509795579 /* d1[2] */ |
| 944 .short -FIX_0_601344887 /* d1[3] */ | 950 .short -FIX_0_601344887 /* d1[3] */ |
| 945 .short FIX_0_899976223 /* d2[0] */ | 951 .short FIX_0_899976223 /* d2[0] */ |
| 946 .short FIX_2_562915447 /* d2[1] */ | 952 .short FIX_2_562915447 /* d2[1] */ |
| 947 .short 1 << (CONST_BITS+1) /* d2[2] */ | 953 .short 1 << (CONST_BITS+1) /* d2[2] */ |
| 948 .short 0 /* d2[3] */ | 954 .short 0 /* d2[3] */ |
| 949 | 955 |
| 950 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 | 956 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 |
| 951 vmull.s16 q14, \x4, d2[2] | 957 vmull.s16 q14, \x4, d2[2] |
| 952 vmlal.s16 q14, \x8, d0[0] | 958 vmlal.s16 q14, \x8, d0[0] |
| 953 vmlal.s16 q14, \x14, d0[1] | 959 vmlal.s16 q14, \x14, d0[1] |
| 954 | 960 |
| 955 vmull.s16 q13, \x16, d1[2] | 961 vmull.s16 q13, \x16, d1[2] |
| 956 vmlal.s16 q13, \x12, d1[3] | 962 vmlal.s16 q13, \x12, d1[3] |
| 957 vmlal.s16 q13, \x10, d2[0] | 963 vmlal.s16 q13, \x10, d2[0] |
| 958 vmlal.s16 q13, \x6, d2[1] | 964 vmlal.s16 q13, \x6, d2[1] |
| 959 | 965 |
| 960 vmull.s16 q15, \x4, d2[2] | 966 vmull.s16 q15, \x4, d2[2] |
| 961 vmlsl.s16 q15, \x8, d0[0] | 967 vmlsl.s16 q15, \x8, d0[0] |
| 962 vmlsl.s16 q15, \x14, d0[1] | 968 vmlsl.s16 q15, \x14, d0[1] |
| 963 | 969 |
| 964 vmull.s16 q12, \x16, d0[2] | 970 vmull.s16 q12, \x16, d0[2] |
| 965 vmlal.s16 q12, \x12, d0[3] | 971 vmlal.s16 q12, \x12, d0[3] |
| 966 vmlal.s16 q12, \x10, d1[0] | 972 vmlal.s16 q12, \x10, d1[0] |
| 967 vmlal.s16 q12, \x6, d1[1] | 973 vmlal.s16 q12, \x6, d1[1] |
| 968 | 974 |
| 969 vadd.s32 q10, q14, q13 | 975 vadd.s32 q10, q14, q13 |
| 970 vsub.s32 q14, q14, q13 | 976 vsub.s32 q14, q14, q13 |
| 971 | 977 |
| 972 .if \shift > 16 | 978 .if \shift > 16 |
| 973 vrshr.s32 q10, q10, #\shift | 979 vrshr.s32 q10, q10, #\shift |
| 974 vrshr.s32 q14, q14, #\shift | 980 vrshr.s32 q14, q14, #\shift |
| 975 vmovn.s32 \y26, q10 | 981 vmovn.s32 \y26, q10 |
| 976 vmovn.s32 \y29, q14 | 982 vmovn.s32 \y29, q14 |
| 977 .else | 983 .else |
| 978 vrshrn.s32 \y26, q10, #\shift | 984 vrshrn.s32 \y26, q10, #\shift |
| 979 vrshrn.s32 \y29, q14, #\shift | 985 vrshrn.s32 \y29, q14, #\shift |
| 980 .endif | 986 .endif |
| 981 | 987 |
| 982 vadd.s32 q10, q15, q12 | 988 vadd.s32 q10, q15, q12 |
| 983 vsub.s32 q15, q15, q12 | 989 vsub.s32 q15, q15, q12 |
| 984 | 990 |
| 985 .if \shift > 16 | 991 .if \shift > 16 |
| 986 vrshr.s32 q10, q10, #\shift | 992 vrshr.s32 q10, q10, #\shift |
| 987 vrshr.s32 q15, q15, #\shift | 993 vrshr.s32 q15, q15, #\shift |
| 988 vmovn.s32 \y27, q10 | 994 vmovn.s32 \y27, q10 |
| 989 vmovn.s32 \y28, q15 | 995 vmovn.s32 \y28, q15 |
| 990 .else | 996 .else |
| 991 vrshrn.s32 \y27, q10, #\shift | 997 vrshrn.s32 \y27, q10, #\shift |
| 992 vrshrn.s32 \y28, q15, #\shift | 998 vrshrn.s32 \y28, q15, #\shift |
| 993 .endif | 999 .endif |
| 994 | |
| 995 .endm | 1000 .endm |
| 996 | 1001 |
| 997 asm_function jsimd_idct_4x4_neon | 1002 asm_function jsimd_idct_4x4_neon |
| 998 | 1003 |
| 999 DCT_TABLE .req r0 | 1004 DCT_TABLE .req r0 |
| 1000 COEF_BLOCK .req r1 | 1005 COEF_BLOCK .req r1 |
| 1001 OUTPUT_BUF .req r2 | 1006 OUTPUT_BUF .req r2 |
| 1002 OUTPUT_COL .req r3 | 1007 OUTPUT_COL .req r3 |
| 1003 TMP1 .req r0 | 1008 TMP1 .req r0 |
| 1004 TMP2 .req r1 | 1009 TMP2 .req r1 |
| (...skipping 115 matching lines...) |
| 1120 * function from jpeg-6b (jidctred.c). | 1125 * function from jpeg-6b (jidctred.c). |
| 1121 * | 1126 * |
| 1122 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which | 1127 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which |
| 1123 * requires far fewer arithmetic operations and hence should be faster. | 1128 * requires far fewer arithmetic operations and hence should be faster. |
| 1124 * The primary purpose of this particular NEON optimized function is | 1129 * The primary purpose of this particular NEON optimized function is |
| 1125 * bit exact compatibility with jpeg-6b. | 1130 * bit exact compatibility with jpeg-6b. |
| 1126 */ | 1131 */ |
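As a rough scalar sketch of what the idct_helper macro below computes for one column, reconstructed from the NEON code and the FIX_* values defined above (function and variable names are illustrative; dequantization and boundary handling are omitted):

    /* One column of the 2x2 reduced IDCT, Q13 fixed point (CONST_BITS = 13).
       c0, c1, c3, c5, c7 are the coefficients of rows 0, 1, 3, 5 and 7;
       shift is 13 in pass 1 and 20 in pass 2. */
    #define RDESCALE(x, n)  (((x) + (1L << ((n) - 1))) >> (n))   /* rounded shift */

    static void idct2x2_column(int c0, int c1, int c3, int c5, int c7,
                               int shift, int *out0, int *out1)
    {
      long tmp10 = (long)c0 << 15;                    /* DC term, pre-scaled   */
      long tmp0  = (long)c1 * FIX_3_624509785
                 - (long)c3 * FIX_1_272758580
                 + (long)c5 * FIX_0_850430095
                 - (long)c7 * FIX_0_720959822;        /* weighted odd rows     */
      *out0 = (int)RDESCALE(tmp10 + tmp0, shift);
      *out1 = (int)RDESCALE(tmp10 - tmp0, shift);
    }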
| 1127 | 1132 |
| 1128 .balign 8 | 1133 .balign 8 |
| 1129 jsimd_idct_2x2_neon_consts: | 1134 jsimd_idct_2x2_neon_consts: |
| 1130 .short -FIX_0_720959822 /* d0[0] */ | 1135 .short -FIX_0_720959822 /* d0[0] */ |
| 1131 .short FIX_0_850430095 /* d0[1] */ | 1136 .short FIX_0_850430095 /* d0[1] */ |
| 1132 .short -FIX_1_272758580 /* d0[2] */ | 1137 .short -FIX_1_272758580 /* d0[2] */ |
| 1133 .short FIX_3_624509785 /* d0[3] */ | 1138 .short FIX_3_624509785 /* d0[3] */ |
| 1134 | 1139 |
| 1135 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 | 1140 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 |
| 1136 vshll.s16 q14, \x4, #15 | 1141 vshll.s16 q14, \x4, #15 |
| 1137 vmull.s16 q13, \x6, d0[3] | 1142 vmull.s16 q13, \x6, d0[3] |
| 1138 vmlal.s16 q13, \x10, d0[2] | 1143 vmlal.s16 q13, \x10, d0[2] |
| 1139 vmlal.s16 q13, \x12, d0[1] | 1144 vmlal.s16 q13, \x12, d0[1] |
| 1140 vmlal.s16 q13, \x16, d0[0] | 1145 vmlal.s16 q13, \x16, d0[0] |
| 1141 | 1146 |
| 1142 vadd.s32 q10, q14, q13 | 1147 vadd.s32 q10, q14, q13 |
| 1143 vsub.s32 q14, q14, q13 | 1148 vsub.s32 q14, q14, q13 |
| 1144 | 1149 |
| 1145 .if \shift > 16 | 1150 .if \shift > 16 |
| 1146 vrshr.s32 q10, q10, #\shift | 1151 vrshr.s32 q10, q10, #\shift |
| 1147 vrshr.s32 q14, q14, #\shift | 1152 vrshr.s32 q14, q14, #\shift |
| 1148 vmovn.s32 \y26, q10 | 1153 vmovn.s32 \y26, q10 |
| 1149 vmovn.s32 \y27, q14 | 1154 vmovn.s32 \y27, q14 |
| 1150 .else | 1155 .else |
| 1151 vrshrn.s32 \y26, q10, #\shift | 1156 vrshrn.s32 \y26, q10, #\shift |
| 1152 vrshrn.s32 \y27, q14, #\shift | 1157 vrshrn.s32 \y27, q14, #\shift |
| 1153 .endif | 1158 .endif |
| 1154 | |
| 1155 .endm | 1159 .endm |
| 1156 | 1160 |
| 1157 asm_function jsimd_idct_2x2_neon | 1161 asm_function jsimd_idct_2x2_neon |
| 1158 | 1162 |
| 1159 DCT_TABLE .req r0 | 1163 DCT_TABLE .req r0 |
| 1160 COEF_BLOCK .req r1 | 1164 COEF_BLOCK .req r1 |
| 1161 OUTPUT_BUF .req r2 | 1165 OUTPUT_BUF .req r2 |
| 1162 OUTPUT_COL .req r3 | 1166 OUTPUT_COL .req r3 |
| 1163 TMP1 .req r0 | 1167 TMP1 .req r0 |
| 1164 TMP2 .req ip | 1168 TMP2 .req ip |
| (...skipping 33 matching lines...) |
| 1198 add DCT_TABLE, DCT_TABLE, #16 | 1202 add DCT_TABLE, DCT_TABLE, #16 |
| 1199 vld1.16 {d26, d27}, [DCT_TABLE, :128]! | 1203 vld1.16 {d26, d27}, [DCT_TABLE, :128]! |
| 1200 vmul.s16 q6, q6, q13 | 1204 vmul.s16 q6, q6, q13 |
| 1201 add DCT_TABLE, DCT_TABLE, #16 | 1205 add DCT_TABLE, DCT_TABLE, #16 |
| 1202 vld1.16 {d30, d31}, [DCT_TABLE, :128]! | 1206 vld1.16 {d30, d31}, [DCT_TABLE, :128]! |
| 1203 vmul.s16 q8, q8, q15 | 1207 vmul.s16 q8, q8, q15 |
| 1204 | 1208 |
| 1205 /* Pass 1 */ | 1209 /* Pass 1 */ |
| 1206 #if 0 | 1210 #if 0 |
| 1207 idct_helper d4, d6, d10, d12, d16, 13, d4, d6 | 1211 idct_helper d4, d6, d10, d12, d16, 13, d4, d6 |
| 1208 transpose_4x4 d4, d6, d8, d10 | 1212 transpose_4x4 d4, d6, d8, d10 |
| 1209 idct_helper d5, d7, d11, d13, d17, 13, d5, d7 | 1213 idct_helper d5, d7, d11, d13, d17, 13, d5, d7 |
| 1210 transpose_4x4 d5, d7, d9, d11 | 1214 transpose_4x4 d5, d7, d9, d11 |
| 1211 #else | 1215 #else |
| 1212 vmull.s16 q13, d6, d0[3] | 1216 vmull.s16 q13, d6, d0[3] |
| 1213 vmlal.s16 q13, d10, d0[2] | 1217 vmlal.s16 q13, d10, d0[2] |
| 1214 vmlal.s16 q13, d12, d0[1] | 1218 vmlal.s16 q13, d12, d0[1] |
| 1215 vmlal.s16 q13, d16, d0[0] | 1219 vmlal.s16 q13, d16, d0[0] |
| 1216 vmull.s16 q12, d7, d0[3] | 1220 vmull.s16 q12, d7, d0[3] |
| 1217 vmlal.s16 q12, d11, d0[2] | 1221 vmlal.s16 q12, d11, d0[2] |
| 1218 vmlal.s16 q12, d13, d0[1] | 1222 vmlal.s16 q12, d13, d0[1] |
| 1219 vmlal.s16 q12, d17, d0[0] | 1223 vmlal.s16 q12, d17, d0[0] |
| 1220 vshll.s16 q14, d4, #15 | 1224 vshll.s16 q14, d4, #15 |
| 1221 vshll.s16 q15, d5, #15 | 1225 vshll.s16 q15, d5, #15 |
| 1222 vadd.s32 q10, q14, q13 | 1226 vadd.s32 q10, q14, q13 |
| 1223 vsub.s32 q14, q14, q13 | 1227 vsub.s32 q14, q14, q13 |
| 1224 vrshrn.s32 d4, q10, #13 | 1228 vrshrn.s32 d4, q10, #13 |
| 1225 vrshrn.s32 d6, q14, #13 | 1229 vrshrn.s32 d6, q14, #13 |
| 1226 vadd.s32 q10, q15, q12 | 1230 vadd.s32 q10, q15, q12 |
| 1227 vsub.s32 q14, q15, q12 | 1231 vsub.s32 q14, q15, q12 |
| 1228 vrshrn.s32 d5, q10, #13 | 1232 vrshrn.s32 d5, q10, #13 |
| 1229 vrshrn.s32 d7, q14, #13 | 1233 vrshrn.s32 d7, q14, #13 |
| 1230 vtrn.16 q2, q3 | 1234 vtrn.16 q2, q3 |
| 1231 vtrn.32 q3, q5 | 1235 vtrn.32 q3, q5 |
| 1232 #endif | 1236 #endif |
| 1233 | 1237 |
| 1234 /* Pass 2 */ | 1238 /* Pass 2 */ |
| 1235 idct_helper d4, d6, d10, d7, d11, 20, d26, d27 | 1239 idct_helper d4, d6, d10, d7, d11, 20, d26, d27 |
| 1236 | 1240 |
| 1237 /* Range limit */ | 1241 /* Range limit */ |
| 1238 vmov.u16 q15, #0x80 | 1242 vmov.u16 q15, #0x80 |
| 1239 vadd.s16 q13, q13, q15 | 1243 vadd.s16 q13, q13, q15 |
| 1240 vqmovun.s16 d26, q13 | 1244 vqmovun.s16 d26, q13 |
| 1241 vqmovun.s16 d27, q13 | 1245 vqmovun.s16 d27, q13 |
| (...skipping 29 matching lines...) |
| 1271 * jsimd_ycc_extrgbx_convert_neon | 1275 * jsimd_ycc_extrgbx_convert_neon |
| 1272 * jsimd_ycc_extbgrx_convert_neon | 1276 * jsimd_ycc_extbgrx_convert_neon |
| 1273 * jsimd_ycc_extxbgr_convert_neon | 1277 * jsimd_ycc_extxbgr_convert_neon |
| 1274 * jsimd_ycc_extxrgb_convert_neon | 1278 * jsimd_ycc_extxrgb_convert_neon |
| 1275 * | 1279 * |
| 1276 * Colorspace conversion YCbCr -> RGB | 1280 * Colorspace conversion YCbCr -> RGB |
| 1277 */ | 1281 */ |
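The fixed-point multipliers in the per-function constant tables below (22971, -11277, -23401, 29033) appear to be the textbook JPEG YCbCr->RGB coefficients scaled to Q14/Q15, which the vrshrn #14 / #15 narrowing shifts then undo:

    R = Y + 1.40200 * (Cr - 128)         22971 ≈  1.40200 * 2^14  (narrowed with #14)
    G = Y - 0.34414 * (Cb - 128)        -11277 ≈ -0.34414 * 2^15  (narrowed with #15)
          - 0.71414 * (Cr - 128)        -23401 ≈ -0.71414 * 2^15  (narrowed with #15)
    B = Y + 1.77200 * (Cb - 128)         29033 ≈  1.77200 * 2^14  (narrowed with #14)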
| 1278 | 1282 |
| 1279 | 1283 |
| 1280 .macro do_load size | 1284 .macro do_load size |
| 1281 .if \size == 8 | 1285 .if \size == 8 |
| 1282 vld1.8 {d4}, [U, :64]! | 1286 vld1.8 {d4}, [U, :64]! |
| 1283 vld1.8 {d5}, [V, :64]! | 1287 vld1.8 {d5}, [V, :64]! |
| 1284 vld1.8 {d0}, [Y, :64]! | 1288 vld1.8 {d0}, [Y, :64]! |
| 1285 pld [U, #64] | 1289 pld [U, #64] |
| 1286 pld [V, #64] | 1290 pld [V, #64] |
| 1287 pld [Y, #64] | 1291 pld [Y, #64] |
| 1288 .elseif \size == 4 | 1292 .elseif \size == 4 |
| 1289 vld1.8 {d4[0]}, [U]! | 1293 vld1.8 {d4[0]}, [U]! |
| 1290 vld1.8 {d4[1]}, [U]! | 1294 vld1.8 {d4[1]}, [U]! |
| 1291 vld1.8 {d4[2]}, [U]! | 1295 vld1.8 {d4[2]}, [U]! |
| 1292 vld1.8 {d4[3]}, [U]! | 1296 vld1.8 {d4[3]}, [U]! |
| 1293 vld1.8 {d5[0]}, [V]! | 1297 vld1.8 {d5[0]}, [V]! |
| 1294 vld1.8 {d5[1]}, [V]! | 1298 vld1.8 {d5[1]}, [V]! |
| 1295 vld1.8 {d5[2]}, [V]! | 1299 vld1.8 {d5[2]}, [V]! |
| 1296 vld1.8 {d5[3]}, [V]! | 1300 vld1.8 {d5[3]}, [V]! |
| 1297 vld1.8 {d0[0]}, [Y]! | 1301 vld1.8 {d0[0]}, [Y]! |
| 1298 vld1.8 {d0[1]}, [Y]! | 1302 vld1.8 {d0[1]}, [Y]! |
| 1299 vld1.8 {d0[2]}, [Y]! | 1303 vld1.8 {d0[2]}, [Y]! |
| 1300 vld1.8 {d0[3]}, [Y]! | 1304 vld1.8 {d0[3]}, [Y]! |
| 1301 .elseif \size == 2 | 1305 .elseif \size == 2 |
| 1302 vld1.8 {d4[4]}, [U]! | 1306 vld1.8 {d4[4]}, [U]! |
| 1303 vld1.8 {d4[5]}, [U]! | 1307 vld1.8 {d4[5]}, [U]! |
| 1304 vld1.8 {d5[4]}, [V]! | 1308 vld1.8 {d5[4]}, [V]! |
| 1305 vld1.8 {d5[5]}, [V]! | 1309 vld1.8 {d5[5]}, [V]! |
| 1306 vld1.8 {d0[4]}, [Y]! | 1310 vld1.8 {d0[4]}, [Y]! |
| 1307 vld1.8 {d0[5]}, [Y]! | 1311 vld1.8 {d0[5]}, [Y]! |
| 1308 .elseif \size == 1 | 1312 .elseif \size == 1 |
| 1309 vld1.8 {d4[6]}, [U]! | 1313 vld1.8 {d4[6]}, [U]! |
| 1310 vld1.8 {d5[6]}, [V]! | 1314 vld1.8 {d5[6]}, [V]! |
| 1311 vld1.8 {d0[6]}, [Y]! | 1315 vld1.8 {d0[6]}, [Y]! |
| 1312 .else | 1316 .else |
| 1313 .error unsupported macroblock size | 1317 .error unsupported macroblock size |
| 1314 .endif | 1318 .endif |
| 1315 .endm | 1319 .endm |
| 1316 | 1320 |
| 1317 .macro do_store bpp, size | 1321 .macro do_store bpp, size |
| 1318 .if \bpp == 24 | 1322 .if \bpp == 24 |
| 1319 .if \size == 8 | 1323 .if \size == 8 |
| 1320 vst3.8 {d10, d11, d12}, [RGB]! | 1324 vst3.8 {d10, d11, d12}, [RGB]! |
| 1321 .elseif \size == 4 | 1325 .elseif \size == 4 |
| 1322 vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! | 1326 vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! |
| 1323 vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! | 1327 vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! |
| 1324 vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! | 1328 vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! |
| 1325 vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! | 1329 vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! |
| 1326 .elseif \size == 2 | 1330 .elseif \size == 2 |
| 1327 vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! | 1331 vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! |
| 1328 vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! | 1332 vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! |
| 1329 .elseif \size == 1 | 1333 .elseif \size == 1 |
| 1330 vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! | 1334 vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! |
| 1331 .else | |
| 1332 .error unsupported macroblock size | |
| 1333 .endif | |
| 1334 .elseif \bpp == 32 | |
| 1335 .if \size == 8 | |
| 1336 vst4.8 {d10, d11, d12, d13}, [RGB]! | |
| 1337 .elseif \size == 4 | |
| 1338 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! | |
| 1339 vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! | |
| 1340 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! | |
| 1341 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! | |
| 1342 .elseif \size == 2 | |
| 1343 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! | |
| 1344 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! | |
| 1345 .elseif \size == 1 | |
| 1346 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! | |
| 1347 .else | |
| 1348 .error unsupported macroblock size | |
| 1349 .endif | |
| 1350 .elseif \bpp == 16 | |
| 1351 .if \size == 8 | |
| 1352 vst1.16 {q15}, [RGB]! | |
| 1353 .elseif \size == 4 | |
| 1354 vst1.16 {d30}, [RGB]! | |
| 1355 .elseif \size == 2 | |
| 1356 vst1.16 {d31[0]}, [RGB]! | |
| 1357 vst1.16 {d31[1]}, [RGB]! | |
| 1358 .elseif \size == 1 | |
| 1359 vst1.16 {d31[2]}, [RGB]! | |
| 1360 .else | |
| 1361 .error unsupported macroblock size | |
| 1362 .endif | |
| 1363 .else | 1335 .else |
| 1364 .error unsupported bpp | 1336 .error unsupported macroblock size |
| 1365 .endif | 1337 .endif |
| 1338 .elseif \bpp == 32 |
| 1339 .if \size == 8 |
| 1340 vst4.8 {d10, d11, d12, d13}, [RGB]! |
| 1341 .elseif \size == 4 |
| 1342 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! |
| 1343 vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! |
| 1344 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! |
| 1345 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! |
| 1346 .elseif \size == 2 |
| 1347 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! |
| 1348 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! |
| 1349 .elseif \size == 1 |
| 1350 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! |
| 1351 .else |
| 1352 .error unsupported macroblock size |
| 1353 .endif |
| 1354 .elseif \bpp == 16 |
| 1355 .if \size == 8 |
| 1356 vst1.16 {q15}, [RGB]! |
| 1357 .elseif \size == 4 |
| 1358 vst1.16 {d30}, [RGB]! |
| 1359 .elseif \size == 2 |
| 1360 vst1.16 {d31[0]}, [RGB]! |
| 1361 vst1.16 {d31[1]}, [RGB]! |
| 1362 .elseif \size == 1 |
| 1363 vst1.16 {d31[2]}, [RGB]! |
| 1364 .else |
| 1365 .error unsupported macroblock size |
| 1366 .endif |
| 1367 .else |
| 1368 .error unsupported bpp |
| 1369 .endif |
| 1366 .endm | 1370 .endm |
| 1367 | 1371 |
| 1368 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs | 1372 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
| 1369 | 1373 |
| 1370 /* | 1374 /* |
| 1371 * 2 stage pipelined YCbCr->RGB conversion | 1375 * 2-stage pipelined YCbCr->RGB conversion |
| 1372 */ | 1376 */ |
| 1373 | 1377 |
| 1374 .macro do_yuv_to_rgb_stage1 | 1378 .macro do_yuv_to_rgb_stage1 |
| 1375 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ | 1379 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
| 1376 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ | 1380 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ |
| 1377 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ | 1381 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
| 1378 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ | 1382 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ |
| 1379 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ | 1383 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ |
| 1380 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ | 1384 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ |
| 1381 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ | 1385 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ |
| 1382 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ | 1386 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
| 1383 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ | 1387 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
| 1384 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ | 1388 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
| 1385 .endm | 1389 .endm |
| 1386 | 1390 |
| 1387 .macro do_yuv_to_rgb_stage2 | 1391 .macro do_yuv_to_rgb_stage2 |
| 1388 vrshrn.s32 d20, q10, #15 | 1392 vrshrn.s32 d20, q10, #15 |
| 1389 vrshrn.s32 d21, q11, #15 | 1393 vrshrn.s32 d21, q11, #15 |
| 1390 vrshrn.s32 d24, q12, #14 | 1394 vrshrn.s32 d24, q12, #14 |
| 1391 vrshrn.s32 d25, q13, #14 | 1395 vrshrn.s32 d25, q13, #14 |
| 1392 vrshrn.s32 d28, q14, #14 | 1396 vrshrn.s32 d28, q14, #14 |
| 1393 vrshrn.s32 d29, q15, #14 | 1397 vrshrn.s32 d29, q15, #14 |
| 1394 vaddw.u8 q11, q10, d0 | 1398 vaddw.u8 q11, q10, d0 |
| 1395 vaddw.u8 q12, q12, d0 | 1399 vaddw.u8 q12, q12, d0 |
| 1396 vaddw.u8 q14, q14, d0 | 1400 vaddw.u8 q14, q14, d0 |
| 1397 .if \bpp != 16 | 1401 .if \bpp != 16 |
| 1398 vqmovun.s16 d1\g_offs, q11 | 1402 vqmovun.s16 d1\g_offs, q11 |
| 1399 vqmovun.s16 d1\r_offs, q12 | 1403 vqmovun.s16 d1\r_offs, q12 |
| 1400 vqmovun.s16 d1\b_offs, q14 | 1404 vqmovun.s16 d1\b_offs, q14 |
| 1401 .else /* rgb565 */ | 1405 .else /* rgb565 */ |
| 1402 vqshlu.s16 q13, q11, #8 | 1406 vqshlu.s16 q13, q11, #8 |
| 1403 vqshlu.s16 q15, q12, #8 | 1407 vqshlu.s16 q15, q12, #8 |
| 1404 vqshlu.s16 q14, q14, #8 | 1408 vqshlu.s16 q14, q14, #8 |
| 1405 vsri.u16 q15, q13, #5 | 1409 vsri.u16 q15, q13, #5 |
| 1406 vsri.u16 q15, q14, #11 | 1410 vsri.u16 q15, q14, #11 |
| 1407 .endif | 1411 .endif |
| 1408 .endm | 1412 .endm |
| 1409 | 1413 |
| 1410 .macro do_yuv_to_rgb_stage2_store_load_stage1 | 1414 .macro do_yuv_to_rgb_stage2_store_load_stage1 |
| 1411 /* "do_yuv_to_rgb_stage2" and "store" */ | 1415 /* "do_yuv_to_rgb_stage2" and "store" */ |
| 1412 vrshrn.s32 d20, q10, #15 | 1416 vrshrn.s32 d20, q10, #15 |
| 1413 /* "load" and "do_yuv_to_rgb_stage1" */ | 1417 /* "load" and "do_yuv_to_rgb_stage1" */ |
| 1414 pld [U, #64] | 1418 pld [U, #64] |
| 1415 vrshrn.s32 d21, q11, #15 | 1419 vrshrn.s32 d21, q11, #15 |
| 1416 pld [V, #64] | 1420 pld [V, #64] |
| 1417 vrshrn.s32 d24, q12, #14 | 1421 vrshrn.s32 d24, q12, #14 |
| 1418 vrshrn.s32 d25, q13, #14 | 1422 vrshrn.s32 d25, q13, #14 |
| 1419 vld1.8 {d4}, [U, :64]! | 1423 vld1.8 {d4}, [U, :64]! |
| 1420 vrshrn.s32 d28, q14, #14 | 1424 vrshrn.s32 d28, q14, #14 |
| 1421 vld1.8 {d5}, [V, :64]! | 1425 vld1.8 {d5}, [V, :64]! |
| 1422 vrshrn.s32 d29, q15, #14 | 1426 vrshrn.s32 d29, q15, #14 |
| 1423 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ | 1427 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
| 1424 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ | 1428 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ |
| 1425 vaddw.u8 q11, q10, d0 | 1429 vaddw.u8 q11, q10, d0 |
| 1426 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ | 1430 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
| 1427 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ | 1431 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ |
| 1428 vaddw.u8 q12, q12, d0 | 1432 vaddw.u8 q12, q12, d0 |
| 1429 vaddw.u8 q14, q14, d0 | 1433 vaddw.u8 q14, q14, d0 |
| 1430 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ | 1434 .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ |
| 1431 vqmovun.s16 d1\g_offs, q11 | 1435 vqmovun.s16 d1\g_offs, q11 |
| 1432 pld [Y, #64] | 1436 pld [Y, #64] |
| 1433 vqmovun.s16 d1\r_offs, q12 | 1437 vqmovun.s16 d1\r_offs, q12 |
| 1434 vld1.8 {d0}, [Y, :64]! | 1438 vld1.8 {d0}, [Y, :64]! |
| 1435 vqmovun.s16 d1\b_offs, q14 | 1439 vqmovun.s16 d1\b_offs, q14 |
| 1436 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ | 1440 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ |
| 1437 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ | 1441 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ |
| 1438 do_store \bpp, 8 | 1442 do_store \bpp, 8 |
| 1439 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ | 1443 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ |
| 1440 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ | 1444 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
| 1441 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ | 1445 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
| 1442 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ | 1446 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
| 1443 .else /**************************** rgb565 ***********************************/ | 1447 .else /**************************** rgb565 ********************************/ |
| 1444 vqshlu.s16 q13, q11, #8 | 1448 vqshlu.s16 q13, q11, #8 |
| 1445 pld [Y, #64] | 1449 pld [Y, #64] |
| 1446 vqshlu.s16 q15, q12, #8 | 1450 vqshlu.s16 q15, q12, #8 |
| 1447 vqshlu.s16 q14, q14, #8 | 1451 vqshlu.s16 q14, q14, #8 |
| 1448 vld1.8 {d0}, [Y, :64]! | 1452 vld1.8 {d0}, [Y, :64]! |
| 1449 vmull.s16 q11, d7, d1[1] | 1453 vmull.s16 q11, d7, d1[1] |
| 1450 vmlal.s16 q11, d9, d1[2] | 1454 vmlal.s16 q11, d9, d1[2] |
| 1451 vsri.u16 q15, q13, #5 | 1455 vsri.u16 q15, q13, #5 |
| 1452 vmull.s16 q12, d8, d1[0] | 1456 vmull.s16 q12, d8, d1[0] |
| 1453 vsri.u16 q15, q14, #11 | 1457 vsri.u16 q15, q14, #11 |
| 1454 vmull.s16 q13, d9, d1[0] | 1458 vmull.s16 q13, d9, d1[0] |
| 1455 vmull.s16 q14, d6, d1[3] | 1459 vmull.s16 q14, d6, d1[3] |
| 1456 do_store \bpp, 8 | 1460 do_store \bpp, 8 |
| 1457 vmull.s16 q15, d7, d1[3] | 1461 vmull.s16 q15, d7, d1[3] |
| 1458 .endif | 1462 .endif |
| 1459 .endm | 1463 .endm |
| 1460 | 1464 |
| 1461 .macro do_yuv_to_rgb | 1465 .macro do_yuv_to_rgb |
| 1462 do_yuv_to_rgb_stage1 | 1466 do_yuv_to_rgb_stage1 |
| 1463 do_yuv_to_rgb_stage2 | 1467 do_yuv_to_rgb_stage2 |
| 1464 .endm | 1468 .endm |
| 1465 | 1469 |
| 1466 /* Apple gas crashes on adrl, so work around that by using adr. | 1470 /* Apple gas crashes on adrl, so work around that by using adr. |
| 1467 * But this requires a copy of these constants for each function. | 1471 * But this requires a copy of these constants for each function. |
| 1468 */ | 1472 */ |
| 1469 | 1473 |
| 1470 .balign 16 | 1474 .balign 16 |
| 1471 jsimd_ycc_\colorid\()_neon_consts: | 1475 jsimd_ycc_\colorid\()_neon_consts: |
| 1472 .short 0, 0, 0, 0 | 1476 .short 0, 0, 0, 0 |
| 1473 .short 22971, -11277, -23401, 29033 | 1477 .short 22971, -11277, -23401, 29033 |
| 1474 .short -128, -128, -128, -128 | 1478 .short -128, -128, -128, -128 |
| 1475 .short -128, -128, -128, -128 | 1479 .short -128, -128, -128, -128 |
| 1476 | 1480 |
| 1477 asm_function jsimd_ycc_\colorid\()_convert_neon | 1481 asm_function jsimd_ycc_\colorid\()_convert_neon |
| 1478 OUTPUT_WIDTH .req r0 | 1482 OUTPUT_WIDTH .req r0 |
| 1479 INPUT_BUF .req r1 | 1483 INPUT_BUF .req r1 |
| 1480 INPUT_ROW .req r2 | 1484 INPUT_ROW .req r2 |
| 1481 OUTPUT_BUF .req r3 | 1485 OUTPUT_BUF .req r3 |
| 1482 NUM_ROWS .req r4 | 1486 NUM_ROWS .req r4 |
| 1483 | 1487 |
| 1484 INPUT_BUF0 .req r5 | 1488 INPUT_BUF0 .req r5 |
| 1485 INPUT_BUF1 .req r6 | 1489 INPUT_BUF1 .req r6 |
| (...skipping 124 matching lines...) |
| 1610 * jsimd_extbgr_ycc_convert_neon | 1614 * jsimd_extbgr_ycc_convert_neon |
| 1611 * jsimd_extrgbx_ycc_convert_neon | 1615 * jsimd_extrgbx_ycc_convert_neon |
| 1612 * jsimd_extbgrx_ycc_convert_neon | 1616 * jsimd_extbgrx_ycc_convert_neon |
| 1613 * jsimd_extxbgr_ycc_convert_neon | 1617 * jsimd_extxbgr_ycc_convert_neon |
| 1614 * jsimd_extxrgb_ycc_convert_neon | 1618 * jsimd_extxrgb_ycc_convert_neon |
| 1615 * | 1619 * |
| 1616 * Colorspace conversion RGB -> YCbCr | 1620 * Colorspace conversion RGB -> YCbCr |
| 1617 */ | 1621 */ |
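For reference, the Q16 constants loaded further down (19595, 38470, 7471, 11059, 21709, 32768, 27439, 5329) appear to be the standard JPEG RGB->YCbCr coefficients scaled by 2^16, to within a unit of rounding; the accumulators are narrowed with #16 shifts:

    Y  =  0.29900 R + 0.58700 G + 0.11400 B
          19595 ≈ 0.29900 * 2^16,  38470 ≈ 0.58700 * 2^16,  7471 ≈ 0.11400 * 2^16
    Cb = -0.16874 R - 0.33126 G + 0.50000 B + 128
          11059 ≈ 0.16874 * 2^16,  21709 ≈ 0.33126 * 2^16,  32768 = 0.50000 * 2^16
    Cr =  0.50000 R - 0.41869 G - 0.08131 B + 128
          27439 ≈ 0.41869 * 2^16,   5329 ≈ 0.08131 * 2^16

The .short 32767, 128 pairs in the same table appear to seed the chroma accumulators with the +128 bias plus a rounding term suited to the truncating vshrn used for Cb/Cr.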
| 1618 | 1622 |
| 1619 .macro do_store size | 1623 .macro do_store size |
| 1620 .if \size == 8 | 1624 .if \size == 8 |
| 1621 vst1.8 {d20}, [Y]! | 1625 vst1.8 {d20}, [Y]! |
| 1622 vst1.8 {d21}, [U]! | 1626 vst1.8 {d21}, [U]! |
| 1623 vst1.8 {d22}, [V]! | 1627 vst1.8 {d22}, [V]! |
| 1624 .elseif \size == 4 | 1628 .elseif \size == 4 |
| 1625 vst1.8 {d20[0]}, [Y]! | 1629 vst1.8 {d20[0]}, [Y]! |
| 1626 vst1.8 {d20[1]}, [Y]! | 1630 vst1.8 {d20[1]}, [Y]! |
| 1627 vst1.8 {d20[2]}, [Y]! | 1631 vst1.8 {d20[2]}, [Y]! |
| 1628 vst1.8 {d20[3]}, [Y]! | 1632 vst1.8 {d20[3]}, [Y]! |
| 1629 vst1.8 {d21[0]}, [U]! | 1633 vst1.8 {d21[0]}, [U]! |
| 1630 vst1.8 {d21[1]}, [U]! | 1634 vst1.8 {d21[1]}, [U]! |
| 1631 vst1.8 {d21[2]}, [U]! | 1635 vst1.8 {d21[2]}, [U]! |
| 1632 vst1.8 {d21[3]}, [U]! | 1636 vst1.8 {d21[3]}, [U]! |
| 1633 vst1.8 {d22[0]}, [V]! | 1637 vst1.8 {d22[0]}, [V]! |
| 1634 vst1.8 {d22[1]}, [V]! | 1638 vst1.8 {d22[1]}, [V]! |
| 1635 vst1.8 {d22[2]}, [V]! | 1639 vst1.8 {d22[2]}, [V]! |
| 1636 vst1.8 {d22[3]}, [V]! | 1640 vst1.8 {d22[3]}, [V]! |
| 1637 .elseif \size == 2 | 1641 .elseif \size == 2 |
| 1638 vst1.8 {d20[4]}, [Y]! | 1642 vst1.8 {d20[4]}, [Y]! |
| 1639 vst1.8 {d20[5]}, [Y]! | 1643 vst1.8 {d20[5]}, [Y]! |
| 1640 vst1.8 {d21[4]}, [U]! | 1644 vst1.8 {d21[4]}, [U]! |
| 1641 vst1.8 {d21[5]}, [U]! | 1645 vst1.8 {d21[5]}, [U]! |
| 1642 vst1.8 {d22[4]}, [V]! | 1646 vst1.8 {d22[4]}, [V]! |
| 1643 vst1.8 {d22[5]}, [V]! | 1647 vst1.8 {d22[5]}, [V]! |
| 1644 .elseif \size == 1 | 1648 .elseif \size == 1 |
| 1645 vst1.8 {d20[6]}, [Y]! | 1649 vst1.8 {d20[6]}, [Y]! |
| 1646 vst1.8 {d21[6]}, [U]! | 1650 vst1.8 {d21[6]}, [U]! |
| 1647 vst1.8 {d22[6]}, [V]! | 1651 vst1.8 {d22[6]}, [V]! |
| 1648 .else | 1652 .else |
| 1649 .error unsupported macroblock size | 1653 .error unsupported macroblock size |
| 1650 .endif | 1654 .endif |
| 1651 .endm | 1655 .endm |
| 1652 | 1656 |
| 1653 .macro do_load bpp, size | 1657 .macro do_load bpp, size |
| 1654 .if \bpp == 24 | 1658 .if \bpp == 24 |
| 1655 .if \size == 8 | 1659 .if \size == 8 |
| 1656 vld3.8 {d10, d11, d12}, [RGB]! | 1660 vld3.8 {d10, d11, d12}, [RGB]! |
| 1657 pld [RGB, #128] | 1661 pld [RGB, #128] |
| 1658 .elseif \size == 4 | 1662 .elseif \size == 4 |
| 1659 vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! | 1663 vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! |
| 1660 vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! | 1664 vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! |
| 1661 vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! | 1665 vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! |
| 1662 vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! | 1666 vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! |
| 1663 .elseif \size == 2 | 1667 .elseif \size == 2 |
| 1664 vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! | 1668 vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! |
| 1665 vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! | 1669 vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! |
| 1666 .elseif \size == 1 | 1670 .elseif \size == 1 |
| 1667 vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! | 1671 vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! |
| 1668 .else | |
| 1669 .error unsupported macroblock size | |
| 1670 .endif | |
| 1671 .elseif \bpp == 32 | |
| 1672 .if \size == 8 | |
| 1673 vld4.8 {d10, d11, d12, d13}, [RGB]! | |
| 1674 pld [RGB, #128] | |
| 1675 .elseif \size == 4 | |
| 1676 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! | |
| 1677 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! | |
| 1678 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! | |
| 1679 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! | |
| 1680 .elseif \size == 2 | |
| 1681 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! | |
| 1682 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! | |
| 1683 .elseif \size == 1 | |
| 1684 vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! | |
| 1685 .else | |
| 1686 .error unsupported macroblock size | |
| 1687 .endif | |
| 1688 .else | 1672 .else |
| 1689 .error unsupported bpp | 1673 .error unsupported macroblock size |
| 1690 .endif | 1674 .endif |
| 1675 .elseif \bpp == 32 |
| 1676 .if \size == 8 |
| 1677 vld4.8 {d10, d11, d12, d13}, [RGB]! |
| 1678 pld [RGB, #128] |
| 1679 .elseif \size == 4 |
| 1680 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! |
| 1681 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! |
| 1682 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! |
| 1683 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! |
| 1684 .elseif \size == 2 |
| 1685 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! |
| 1686 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! |
| 1687 .elseif \size == 1 |
| 1688 vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! |
| 1689 .else |
| 1690 .error unsupported macroblock size |
| 1691 .endif |
| 1692 .else |
| 1693 .error unsupported bpp |
| 1694 .endif |
| 1691 .endm | 1695 .endm |
| 1692 | 1696 |
| 1693 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs | 1697 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
| 1694 | 1698 |
| 1695 /* | 1699 /* |
| 1696 * 2 stage pipelined RGB->YCbCr conversion | 1700 * 2-stage pipelined RGB->YCbCr conversion |
| 1697 */ | 1701 */ |
| 1698 | 1702 |
| 1699 .macro do_rgb_to_yuv_stage1 | 1703 .macro do_rgb_to_yuv_stage1 |
| 1700 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ | 1704 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ |
| 1701 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ | 1705 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ |
| 1702 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ | 1706 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ |
| 1703 vmull.u16 q7, d4, d0[0] | 1707 vmull.u16 q7, d4, d0[0] |
| 1704 vmlal.u16 q7, d6, d0[1] | 1708 vmlal.u16 q7, d6, d0[1] |
| 1705 vmlal.u16 q7, d8, d0[2] | 1709 vmlal.u16 q7, d8, d0[2] |
| 1706 vmull.u16 q8, d5, d0[0] | 1710 vmull.u16 q8, d5, d0[0] |
| 1707 vmlal.u16 q8, d7, d0[1] | 1711 vmlal.u16 q8, d7, d0[1] |
| 1708 vmlal.u16 q8, d9, d0[2] | 1712 vmlal.u16 q8, d9, d0[2] |
| 1709 vrev64.32 q9, q1 | 1713 vrev64.32 q9, q1 |
| 1710 vrev64.32 q13, q1 | 1714 vrev64.32 q13, q1 |
| 1711 vmlsl.u16 q9, d4, d0[3] | 1715 vmlsl.u16 q9, d4, d0[3] |
| 1712 vmlsl.u16 q9, d6, d1[0] | 1716 vmlsl.u16 q9, d6, d1[0] |
| 1713 vmlal.u16 q9, d8, d1[1] | 1717 vmlal.u16 q9, d8, d1[1] |
| 1714 vmlsl.u16 q13, d5, d0[3] | 1718 vmlsl.u16 q13, d5, d0[3] |
| 1715 vmlsl.u16 q13, d7, d1[0] | 1719 vmlsl.u16 q13, d7, d1[0] |
| 1716 vmlal.u16 q13, d9, d1[1] | 1720 vmlal.u16 q13, d9, d1[1] |
| 1717 vrev64.32 q14, q1 | 1721 vrev64.32 q14, q1 |
| 1718 vrev64.32 q15, q1 | 1722 vrev64.32 q15, q1 |
| 1719 vmlal.u16 q14, d4, d1[1] | 1723 vmlal.u16 q14, d4, d1[1] |
| 1720 vmlsl.u16 q14, d6, d1[2] | 1724 vmlsl.u16 q14, d6, d1[2] |
| 1721 vmlsl.u16 q14, d8, d1[3] | 1725 vmlsl.u16 q14, d8, d1[3] |
| 1722 vmlal.u16 q15, d5, d1[1] | 1726 vmlal.u16 q15, d5, d1[1] |
| 1723 vmlsl.u16 q15, d7, d1[2] | 1727 vmlsl.u16 q15, d7, d1[2] |
| 1724 vmlsl.u16 q15, d9, d1[3] | 1728 vmlsl.u16 q15, d9, d1[3] |
| 1725 .endm | 1729 .endm |
| 1726 | 1730 |
| 1727 .macro do_rgb_to_yuv_stage2 | 1731 .macro do_rgb_to_yuv_stage2 |
| 1728 vrshrn.u32 d20, q7, #16 | 1732 vrshrn.u32 d20, q7, #16 |
| 1729 vrshrn.u32 d21, q8, #16 | 1733 vrshrn.u32 d21, q8, #16 |
| 1730 vshrn.u32 d22, q9, #16 | 1734 vshrn.u32 d22, q9, #16 |
| 1731 vshrn.u32 d23, q13, #16 | 1735 vshrn.u32 d23, q13, #16 |
| 1732 vshrn.u32 d24, q14, #16 | 1736 vshrn.u32 d24, q14, #16 |
| 1733 vshrn.u32 d25, q15, #16 | 1737 vshrn.u32 d25, q15, #16 |
| 1734 vmovn.u16 d20, q10 /* d20 = y */ | 1738 vmovn.u16 d20, q10 /* d20 = y */ |
| 1735 vmovn.u16 d21, q11 /* d21 = u */ | 1739 vmovn.u16 d21, q11 /* d21 = u */ |
| 1736 vmovn.u16 d22, q12 /* d22 = v */ | 1740 vmovn.u16 d22, q12 /* d22 = v */ |
| 1737 .endm | 1741 .endm |
| 1738 | 1742 |
| 1739 .macro do_rgb_to_yuv | 1743 .macro do_rgb_to_yuv |
| 1740 do_rgb_to_yuv_stage1 | 1744 do_rgb_to_yuv_stage1 |
| 1741 do_rgb_to_yuv_stage2 | 1745 do_rgb_to_yuv_stage2 |
| 1742 .endm | 1746 .endm |
| 1743 | 1747 |
| 1744 .macro do_rgb_to_yuv_stage2_store_load_stage1 | 1748 .macro do_rgb_to_yuv_stage2_store_load_stage1 |
| 1745 vrshrn.u32 d20, q7, #16 | 1749 vrshrn.u32 d20, q7, #16 |
| 1746 vrshrn.u32 d21, q8, #16 | 1750 vrshrn.u32 d21, q8, #16 |
| 1747 vshrn.u32 d22, q9, #16 | 1751 vshrn.u32 d22, q9, #16 |
| 1748 vrev64.32 q9, q1 | 1752 vrev64.32 q9, q1 |
| 1749 vshrn.u32 d23, q13, #16 | 1753 vshrn.u32 d23, q13, #16 |
| 1750 vrev64.32 q13, q1 | 1754 vrev64.32 q13, q1 |
| 1751 vshrn.u32 d24, q14, #16 | 1755 vshrn.u32 d24, q14, #16 |
| 1752 vshrn.u32 d25, q15, #16 | 1756 vshrn.u32 d25, q15, #16 |
| 1753 do_load \bpp, 8 | 1757 do_load \bpp, 8 |
| 1754 vmovn.u16 d20, q10 /* d20 = y */ | 1758 vmovn.u16 d20, q10 /* d20 = y */ |
| 1755 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ | 1759 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ |
| 1756 vmovn.u16 d21, q11 /* d21 = u */ | 1760 vmovn.u16 d21, q11 /* d21 = u */ |
| 1757 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ | 1761 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ |
| 1758 vmovn.u16 d22, q12 /* d22 = v */ | 1762 vmovn.u16 d22, q12 /* d22 = v */ |
| 1759 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ | 1763 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ |
| 1760 vmull.u16 q7, d4, d0[0] | 1764 vmull.u16 q7, d4, d0[0] |
| 1761 vmlal.u16 q7, d6, d0[1] | 1765 vmlal.u16 q7, d6, d0[1] |
| 1762 vmlal.u16 q7, d8, d0[2] | 1766 vmlal.u16 q7, d8, d0[2] |
| 1763 vst1.8 {d20}, [Y]! | 1767 vst1.8 {d20}, [Y]! |
| 1764 vmull.u16 q8, d5, d0[0] | 1768 vmull.u16 q8, d5, d0[0] |
| 1765 vmlal.u16 q8, d7, d0[1] | 1769 vmlal.u16 q8, d7, d0[1] |
| 1766 vmlal.u16 q8, d9, d0[2] | 1770 vmlal.u16 q8, d9, d0[2] |
| 1767 vmlsl.u16 q9, d4, d0[3] | 1771 vmlsl.u16 q9, d4, d0[3] |
| 1768 vmlsl.u16 q9, d6, d1[0] | 1772 vmlsl.u16 q9, d6, d1[0] |
| 1769 vmlal.u16 q9, d8, d1[1] | 1773 vmlal.u16 q9, d8, d1[1] |
| 1770 vst1.8 {d21}, [U]! | 1774 vst1.8 {d21}, [U]! |
| 1771 vmlsl.u16 q13, d5, d0[3] | 1775 vmlsl.u16 q13, d5, d0[3] |
| 1772 vmlsl.u16 q13, d7, d1[0] | 1776 vmlsl.u16 q13, d7, d1[0] |
| 1773 vmlal.u16 q13, d9, d1[1] | 1777 vmlal.u16 q13, d9, d1[1] |
| 1774 vrev64.32 q14, q1 | 1778 vrev64.32 q14, q1 |
| 1775 vrev64.32 q15, q1 | 1779 vrev64.32 q15, q1 |
| 1776 vmlal.u16 q14, d4, d1[1] | 1780 vmlal.u16 q14, d4, d1[1] |
| 1777 vmlsl.u16 q14, d6, d1[2] | 1781 vmlsl.u16 q14, d6, d1[2] |
| 1778 vmlsl.u16 q14, d8, d1[3] | 1782 vmlsl.u16 q14, d8, d1[3] |
| 1779 vst1.8 {d22}, [V]! | 1783 vst1.8 {d22}, [V]! |
| 1780 vmlal.u16 q15, d5, d1[1] | 1784 vmlal.u16 q15, d5, d1[1] |
| 1781 vmlsl.u16 q15, d7, d1[2] | 1785 vmlsl.u16 q15, d7, d1[2] |
| 1782 vmlsl.u16 q15, d9, d1[3] | 1786 vmlsl.u16 q15, d9, d1[3] |
| 1783 .endm | 1787 .endm |
| 1784 | 1788 |
| 1785 .balign 16 | 1789 .balign 16 |
| 1786 jsimd_\colorid\()_ycc_neon_consts: | 1790 jsimd_\colorid\()_ycc_neon_consts: |
| 1787 .short 19595, 38470, 7471, 11059 | 1791 .short 19595, 38470, 7471, 11059 |
| 1788 .short 21709, 32768, 27439, 5329 | 1792 .short 21709, 32768, 27439, 5329 |
| 1789 .short 32767, 128, 32767, 128 | 1793 .short 32767, 128, 32767, 128 |
| 1790 .short 32767, 128, 32767, 128 | 1794 .short 32767, 128, 32767, 128 |
| 1791 | 1795 |
| 1792 asm_function jsimd_\colorid\()_ycc_convert_neon | 1796 asm_function jsimd_\colorid\()_ycc_convert_neon |
| 1793 OUTPUT_WIDTH .req r0 | 1797 OUTPUT_WIDTH .req r0 |
| 1794 INPUT_BUF .req r1 | 1798 INPUT_BUF .req r1 |
| 1795 OUTPUT_BUF .req r2 | 1799 OUTPUT_BUF .req r2 |
| 1796 OUTPUT_ROW .req r3 | 1800 OUTPUT_ROW .req r3 |
| 1797 NUM_ROWS .req r4 | 1801 NUM_ROWS .req r4 |
| 1798 | 1802 |
| 1799 OUTPUT_BUF0 .req r5 | 1803 OUTPUT_BUF0 .req r5 |
| 1800 OUTPUT_BUF1 .req r6 | 1804 OUTPUT_BUF1 .req r6 |
| (...skipping 189 matching lines...) |
| 1990 * rid of a bunch of VLD1.16 instructions | 1994 * rid of a bunch of VLD1.16 instructions |
| 1991 */ | 1995 */ |
| 1992 | 1996 |
| 1993 #define XFIX_0_382683433 d0[0] | 1997 #define XFIX_0_382683433 d0[0] |
| 1994 #define XFIX_0_541196100 d0[1] | 1998 #define XFIX_0_541196100 d0[1] |
| 1995 #define XFIX_0_707106781 d0[2] | 1999 #define XFIX_0_707106781 d0[2] |
| 1996 #define XFIX_1_306562965 d0[3] | 2000 #define XFIX_1_306562965 d0[3] |
| 1997 | 2001 |
| 1998 .balign 16 | 2002 .balign 16 |
| 1999 jsimd_fdct_ifast_neon_consts: | 2003 jsimd_fdct_ifast_neon_consts: |
| 2000 .short (98 * 128) /* XFIX_0_382683433 */ | 2004 .short (98 * 128) /* XFIX_0_382683433 */ |
| 2001 .short (139 * 128) /* XFIX_0_541196100 */ | 2005 .short (139 * 128) /* XFIX_0_541196100 */ |
| 2002 .short (181 * 128) /* XFIX_0_707106781 */ | 2006 .short (181 * 128) /* XFIX_0_707106781 */ |
| 2003 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ | 2007 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ |
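These appear to be the 8-bit scaled constants from jpeg-6b's jfdctfst.c, pre-multiplied by 128 so that vqdmulh.s16 (which effectively computes (a * b) >> 15) reproduces the original (a * c) >> 8 descale, minus its rounding term. For example:

    98  ≈ 0.382683433 * 2^8,  so (98 * 128) / 2^15  ≈ 0.382683433
    181 ≈ 0.707106781 * 2^8,  so (181 * 128) / 2^15 ≈ 0.707106781

XFIX_1_306562965 keeps only the fractional part, (334 - 256) * 128, because a Q15 multiplier cannot represent values >= 1.0; the integer part is added back with a separate vadd in the loop below.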
| 2004 | 2008 |
| 2005 asm_function jsimd_fdct_ifast_neon | 2009 asm_function jsimd_fdct_ifast_neon |
| 2006 | 2010 |
| 2007 DATA .req r0 | 2011 DATA .req r0 |
| 2008 TMP .req ip | 2012 TMP .req ip |
| 2009 | 2013 |
| 2010 vpush {d8-d15} | 2014 vpush {d8-d15} |
| 2011 | 2015 |
| 2012 /* Load constants */ | 2016 /* Load constants */ |
| 2013 adr TMP, jsimd_fdct_ifast_neon_consts | 2017 adr TMP, jsimd_fdct_ifast_neon_consts |
| (...skipping 16 matching lines...) |
| 2030 vld1.16 {d20, d21, d22, d23}, [DATA, :128]! | 2034 vld1.16 {d20, d21, d22, d23}, [DATA, :128]! |
| 2031 vld1.16 {d24, d25, d26, d27}, [DATA, :128]! | 2035 vld1.16 {d24, d25, d26, d27}, [DATA, :128]! |
| 2032 vld1.16 {d28, d29, d30, d31}, [DATA, :128] | 2036 vld1.16 {d28, d29, d30, d31}, [DATA, :128] |
| 2033 sub DATA, DATA, #(128 - 32) | 2037 sub DATA, DATA, #(128 - 32) |
| 2034 | 2038 |
| 2035 mov TMP, #2 | 2039 mov TMP, #2 |
| 2036 1: | 2040 1: |
| 2037 /* Transpose */ | 2041 /* Transpose */ |
| 2038 vtrn.16 q12, q13 | 2042 vtrn.16 q12, q13 |
| 2039 vtrn.16 q10, q11 | 2043 vtrn.16 q10, q11 |
| 2040 vtrn.16 q8, q9 | 2044 vtrn.16 q8, q9 |
| 2041 vtrn.16 q14, q15 | 2045 vtrn.16 q14, q15 |
| 2042 vtrn.32 q9, q11 | 2046 vtrn.32 q9, q11 |
| 2043 vtrn.32 q13, q15 | 2047 vtrn.32 q13, q15 |
| 2044 vtrn.32 q8, q10 | 2048 vtrn.32 q8, q10 |
| 2045 vtrn.32 q12, q14 | 2049 vtrn.32 q12, q14 |
| 2046 vswp d30, d23 | 2050 vswp d30, d23 |
| 2047 vswp d24, d17 | 2051 vswp d24, d17 |
| 2048 vswp d26, d19 | 2052 vswp d26, d19 |
| 2049 /* 1-D FDCT */ | 2053 /* 1-D FDCT */ |
| 2050 vadd.s16 q2, q11, q12 | 2054 vadd.s16 q2, q11, q12 |
| 2051 vswp d28, d21 | 2055 vswp d28, d21 |
| 2052 vsub.s16 q12, q11, q12 | 2056 vsub.s16 q12, q11, q12 |
| 2053 vsub.s16 q6, q10, q13 | 2057 vsub.s16 q6, q10, q13 |
| 2054 vadd.s16 q10, q10, q13 | 2058 vadd.s16 q10, q10, q13 |
| 2055 vsub.s16 q7, q9, q14 | 2059 vsub.s16 q7, q9, q14 |
| 2056 vadd.s16 q9, q9, q14 | 2060 vadd.s16 q9, q9, q14 |
| 2057 vsub.s16 q1, q8, q15 | 2061 vsub.s16 q1, q8, q15 |
| 2058 vadd.s16 q8, q8, q15 | 2062 vadd.s16 q8, q8, q15 |
| 2059 vsub.s16 q4, q9, q10 | 2063 vsub.s16 q4, q9, q10 |
| 2060 vsub.s16 q5, q8, q2 | 2064 vsub.s16 q5, q8, q2 |
| 2061 vadd.s16 q3, q9, q10 | 2065 vadd.s16 q3, q9, q10 |
| 2062 vadd.s16 q4, q4, q5 | 2066 vadd.s16 q4, q4, q5 |
| 2063 vadd.s16 q2, q8, q2 | 2067 vadd.s16 q2, q8, q2 |
| 2064 vqdmulh.s16 q4, q4, XFIX_0_707106781 | 2068 vqdmulh.s16 q4, q4, XFIX_0_707106781 |
| 2065 vadd.s16 q11, q12, q6 | 2069 vadd.s16 q11, q12, q6 |
| 2066 vadd.s16 q8, q2, q3 | 2070 vadd.s16 q8, q2, q3 |
| 2067 vsub.s16 q12, q2, q3 | 2071 vsub.s16 q12, q2, q3 |
| 2068 vadd.s16 q3, q6, q7 | 2072 vadd.s16 q3, q6, q7 |
| 2069 vadd.s16 q7, q7, q1 | 2073 vadd.s16 q7, q7, q1 |
| 2070 vqdmulh.s16 q3, q3, XFIX_0_707106781 | 2074 vqdmulh.s16 q3, q3, XFIX_0_707106781 |
| 2071 vsub.s16 q6, q11, q7 | 2075 vsub.s16 q6, q11, q7 |
| 2072 vadd.s16 q10, q5, q4 | 2076 vadd.s16 q10, q5, q4 |
| 2073 vqdmulh.s16 q6, q6, XFIX_0_382683433 | 2077 vqdmulh.s16 q6, q6, XFIX_0_382683433 |
| 2074 vsub.s16 q14, q5, q4 | 2078 vsub.s16 q14, q5, q4 |
| 2075 vqdmulh.s16 q11, q11, XFIX_0_541196100 | 2079 vqdmulh.s16 q11, q11, XFIX_0_541196100 |
| 2076 vqdmulh.s16 q5, q7, XFIX_1_306562965 | 2080 vqdmulh.s16 q5, q7, XFIX_1_306562965 |
| 2077 vadd.s16 q4, q1, q3 | 2081 vadd.s16 q4, q1, q3 |
| 2078 vsub.s16 q3, q1, q3 | 2082 vsub.s16 q3, q1, q3 |
| 2079 vadd.s16 q7, q7, q6 | 2083 vadd.s16 q7, q7, q6 |
| 2080 vadd.s16 q11, q11, q6 | 2084 vadd.s16 q11, q11, q6 |
| 2081 vadd.s16 q7, q7, q5 | 2085 vadd.s16 q7, q7, q5 |
| 2082 vadd.s16 q13, q3, q11 | 2086 vadd.s16 q13, q3, q11 |
| 2083 vsub.s16 q11, q3, q11 | 2087 vsub.s16 q11, q3, q11 |
| 2084 vadd.s16 q9, q4, q7 | 2088 vadd.s16 q9, q4, q7 |
| 2085 vsub.s16 q15, q4, q7 | 2089 vsub.s16 q15, q4, q7 |
| 2086 subs TMP, TMP, #1 | 2090 subs TMP, TMP, #1 |
| 2087 bne 1b | 2091 bne 1b |
| 2088 | 2092 |
| 2089 /* store results */ | 2093 /* store results */ |
| 2090 vst1.16 {d16, d17, d18, d19}, [DATA, :128]! | 2094 vst1.16 {d16, d17, d18, d19}, [DATA, :128]! |
| 2091 vst1.16 {d20, d21, d22, d23}, [DATA, :128]! | 2095 vst1.16 {d20, d21, d22, d23}, [DATA, :128]! |
| 2092 vst1.16 {d24, d25, d26, d27}, [DATA, :128]! | 2096 vst1.16 {d24, d25, d26, d27}, [DATA, :128]! |
| 2093 vst1.16 {d28, d29, d30, d31}, [DATA, :128] | 2097 vst1.16 {d28, d29, d30, d31}, [DATA, :128] |
| 2094 | 2098 |
| 2095 vpop {d8-d15} | 2099 vpop {d8-d15} |
| 2096 bx lr | 2100 bx lr |
| 2097 | 2101 |
| 2098 .unreq DATA | 2102 .unreq DATA |
| 2099 .unreq TMP | 2103 .unreq TMP |
| 2100 | 2104 |
| 2101 | 2105 |
| 2102 /*****************************************************************************/ | 2106 /*****************************************************************************/ |
| 2103 | 2107 |
| 2104 /* | 2108 /* |
| 2105 * GLOBAL(void) | 2109 * GLOBAL(void) |
| 2106 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, | 2110 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors, |
| 2107 * DCTELEM * workspace); | 2111 * DCTELEM *workspace); |
| 2108 * | 2112 * |
| 2109 * Note: the code uses 2-stage pipelining in order to improve instruction | 2113 * Note: the code uses 2-stage pipelining in order to improve instruction |
| 2110 * scheduling and eliminate stalls (this provides ~15% better | 2114 * scheduling and eliminate stalls (this provides ~15% better |
| 2111 * performance for this function on both ARM Cortex-A8 and | 2115 * performance for this function on both ARM Cortex-A8 and |
| 2112 * ARM Cortex-A9 when compared to the non-pipelined variant). | 2116 * ARM Cortex-A9 when compared to the non-pipelined variant). |
| 2113 * The instructions which belong to the second stage use different | 2117 * The instructions which belong to the second stage use different |
| 2114 * indentation for better readability. | 2118 * indentation for better readability. |
| 2115 */ | 2119 */ |
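A rough scalar model of the per-coefficient work done by the loop below, assuming the divisor table is laid out as four 64-element blocks (reciprocal, correction, an unused-here block, shift), as the 64*2 / 64*6 byte offsets suggest; the type and variable names are illustrative, not quoted from the jsimd headers:

    typedef short DCTELEM;

    static void quantize_scalar(DCTELEM *coef_block, const unsigned short *reciprocal,
                                const unsigned short *correction,
                                const unsigned short *shift, const DCTELEM *workspace)
    {
      for (int i = 0; i < 64; i++) {
        DCTELEM x = workspace[i];
        unsigned int t = (x < 0) ? -x : x;                  /* vabs.s16               */
        t += correction[i];                                 /* add correction         */
        t = (t * (unsigned int)reciprocal[i]) >> 16;        /* multiply by reciprocal */
        t >>= shift[i];                                     /* per-coefficient shift  */
        coef_block[i] = (x < 0) ? -(DCTELEM)t : (DCTELEM)t; /* restore the sign       */
      }
    }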
| 2116 asm_function jsimd_quantize_neon | 2120 asm_function jsimd_quantize_neon |
| 2117 | 2121 |
| 2118 COEF_BLOCK .req r0 | 2122 COEF_BLOCK .req r0 |
| 2119 DIVISORS .req r1 | 2123 DIVISORS .req r1 |
| 2120 WORKSPACE .req r2 | 2124 WORKSPACE .req r2 |
| 2121 | 2125 |
| 2122 RECIPROCAL .req DIVISORS | 2126 RECIPROCAL .req DIVISORS |
| 2123 CORRECTION .req r3 | 2127 CORRECTION .req r3 |
| 2124 SHIFT .req ip | 2128 SHIFT .req ip |
| 2125 LOOP_COUNT .req r4 | 2129 LOOP_COUNT .req r4 |
| 2126 | 2130 |
| 2127 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! | 2131 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! |
| 2128 vabs.s16 q12, q0 | 2132 vabs.s16 q12, q0 |
| 2129 add CORRECTION, DIVISORS, #(64 * 2) | 2133 add CORRECTION, DIVISORS, #(64 * 2) |
| 2130 add SHIFT, DIVISORS, #(64 * 6) | 2134 add SHIFT, DIVISORS, #(64 * 6) |
| 2131 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! | 2135 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! |
| 2132 vabs.s16 q13, q1 | 2136 vabs.s16 q13, q1 |
| 2133 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! | 2137 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! |
| 2134 vadd.u16 q12, q12, q10 /* add correction */ | 2138 vadd.u16 q12, q12, q10 /* add correction */ |
| 2135 vadd.u16 q13, q13, q11 | 2139 vadd.u16 q13, q13, q11 |
| 2136 vmull.u16 q10, d24, d16 /* multiply by reciprocal */ | 2140 vmull.u16 q10, d24, d16 /* multiply by reciprocal */ |
| 2137 vmull.u16 q11, d25, d17 | 2141 vmull.u16 q11, d25, d17 |
| 2138 vmull.u16 q8, d26, d18 | 2142 vmull.u16 q8, d26, d18 |
| 2139 vmull.u16 q9, d27, d19 | 2143 vmull.u16 q9, d27, d19 |
| 2140 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! | 2144 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! |
| 2141 vshrn.u32 d20, q10, #16 | 2145 vshrn.u32 d20, q10, #16 |
| 2142 vshrn.u32 d21, q11, #16 | 2146 vshrn.u32 d21, q11, #16 |
| 2143 vshrn.u32 d22, q8, #16 | 2147 vshrn.u32 d22, q8, #16 |
| 2144 vshrn.u32 d23, q9, #16 | 2148 vshrn.u32 d23, q9, #16 |
| 2145 vneg.s16 q12, q12 | 2149 vneg.s16 q12, q12 |
| 2146 vneg.s16 q13, q13 | 2150 vneg.s16 q13, q13 |
| 2147 vshr.s16 q2, q0, #15 /* extract sign */ | 2151 vshr.s16 q2, q0, #15 /* extract sign */ |
| 2148 vshr.s16 q3, q1, #15 | 2152 vshr.s16 q3, q1, #15 |
| 2149 vshl.u16 q14, q10, q12 /* shift */ | 2153 vshl.u16 q14, q10, q12 /* shift */ |
| 2150 vshl.u16 q15, q11, q13 | 2154 vshl.u16 q15, q11, q13 |
| 2151 | 2155 |
| 2152 push {r4, r5} | 2156 push {r4, r5} |
| 2153 mov LOOP_COUNT, #3 | 2157 mov LOOP_COUNT, #3 |
| 2154 1: | 2158 1: |
| 2155 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! | 2159 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]! |
| 2156 veor.u16 q14, q14, q2 /* restore sign */ | 2160 veor.u16 q14, q14, q2 /* restore sign */ |
| 2157 vabs.s16 q12, q0 | 2161 vabs.s16 q12, q0 |
| 2158 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! | 2162 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]! |
| 2159 vabs.s16 q13, q1 | 2163 vabs.s16 q13, q1 |
| 2160 veor.u16 q15, q15, q3 | 2164 veor.u16 q15, q15, q3 |
| 2161 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! | 2165 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]! |
| 2162 vadd.u16 q12, q12, q10 /* add correction */ | 2166 vadd.u16 q12, q12, q10 /* add correction */ |
| 2163 vadd.u16 q13, q13, q11 | 2167 vadd.u16 q13, q13, q11 |
| 2164 vmull.u16 q10, d24, d16 /* multiply by reciprocal */ | 2168 vmull.u16 q10, d24, d16 /* multiply by reciprocal */ |
| 2165 vmull.u16 q11, d25, d17 | 2169 vmull.u16 q11, d25, d17 |
| 2166 vmull.u16 q8, d26, d18 | 2170 vmull.u16 q8, d26, d18 |
| 2167 vmull.u16 q9, d27, d19 | 2171 vmull.u16 q9, d27, d19 |
| 2168 vsub.u16 q14, q14, q2 | 2172 vsub.u16 q14, q14, q2 |
| 2169 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! | 2173 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]! |
| 2170 vsub.u16 q15, q15, q3 | 2174 vsub.u16 q15, q15, q3 |
| 2171 vshrn.u32 d20, q10, #16 | 2175 vshrn.u32 d20, q10, #16 |
| 2172 vshrn.u32 d21, q11, #16 | 2176 vshrn.u32 d21, q11, #16 |
| 2173 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! | 2177 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! |
| 2174 vshrn.u32 d22, q8, #16 | 2178 vshrn.u32 d22, q8, #16 |
| 2175 vshrn.u32 d23, q9, #16 | 2179 vshrn.u32 d23, q9, #16 |
| 2176 vneg.s16 q12, q12 | 2180 vneg.s16 q12, q12 |
| 2177 vneg.s16 q13, q13 | 2181 vneg.s16 q13, q13 |
| 2178 vshr.s16 q2, q0, #15 /* extract sign */ | 2182 vshr.s16 q2, q0, #15 /* extract sign */ |
| 2179 vshr.s16 q3, q1, #15 | 2183 vshr.s16 q3, q1, #15 |
| 2180 vshl.u16 q14, q10, q12 /* shift */ | 2184 vshl.u16 q14, q10, q12 /* shift */ |
| 2181 vshl.u16 q15, q11, q13 | 2185 vshl.u16 q15, q11, q13 |
| 2182 subs LOOP_COUNT, LOOP_COUNT, #1 | 2186 subs LOOP_COUNT, LOOP_COUNT, #1 |
| 2183 bne 1b | 2187 bne 1b |
| 2184 pop {r4, r5} | 2188 pop {r4, r5} |
| 2185 | 2189 |
| 2186 veor.u16 q14, q14, q2 /* restore sign */ | 2190 veor.u16 q14, q14, q2 /* restore sign */ |
| 2187 veor.u16 q15, q15, q3 | 2191 veor.u16 q15, q15, q3 |
| 2188 vsub.u16 q14, q14, q2 | 2192 vsub.u16 q14, q14, q2 |
| 2189 vsub.u16 q15, q15, q3 | 2193 vsub.u16 q15, q15, q3 |
| 2190 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! | 2194 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]! |
| 2191 | 2195 |
| 2192 bx lr /* return */ | 2196 bx lr /* return */ |
| 2193 | 2197 |
| 2194 .unreq COEF_BLOCK | 2198 .unreq COEF_BLOCK |
| 2195 .unreq DIVISORS | 2199 .unreq DIVISORS |
| 2196 .unreq WORKSPACE | 2200 .unreq WORKSPACE |
| 2197 .unreq RECIPROCAL | 2201 .unreq RECIPROCAL |
| 2198 .unreq CORRECTION | 2202 .unreq CORRECTION |
| 2199 .unreq SHIFT | 2203 .unreq SHIFT |
| 2200 .unreq LOOP_COUNT | 2204 .unreq LOOP_COUNT |
| 2201 | 2205 |
| 2202 | 2206 |
| 2203 /*****************************************************************************/ | 2207 /*****************************************************************************/ |
| 2204 | 2208 |
| 2205 /* | 2209 /* |
| 2206 * GLOBAL(void) | 2210 * GLOBAL(void) |
| 2207 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, | 2211 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, |
| 2208 * JDIMENSION downsampled_width, | 2212 * JDIMENSION downsampled_width, |
| 2209 * JSAMPARRAY input_data, | 2213 * JSAMPARRAY input_data, |
| 2210 * JSAMPARRAY * output_data_ptr); | 2214 * JSAMPARRAY *output_data_ptr); |
| 2211 * | 2215 * |
| 2212 * Note: the use of unaligned writes is the main remaining bottleneck in | 2216 * Note: the use of unaligned writes is the main remaining bottleneck in |
| 2213 * this code; addressing it could potentially yield a performance | 2217 * this code; addressing it could potentially yield a performance |
| 2214 * improvement of up to tens of percent on Cortex-A8/Cortex-A9. | 2218 * improvement of up to tens of percent on Cortex-A8/Cortex-A9. |
| 2215 */ | 2219 */ |
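For orientation, the algorithm being vectorized is the usual libjpeg "triangle filter" h2v1 fancy upsampling (cf. h2v1_fancy_upsample() in jdsample.c). A simplified scalar sketch of the inner loop for one row; the first and last output columns are special-cased separately, as the code below also does:

    /* in: width samples; out: 2 * width samples (inner columns only). */
    for (int i = 1; i < width - 1; i++) {
      int v = in[i] * 3;
      out[2 * i]     = (unsigned char)((v + in[i - 1] + 1) >> 2);  /* leans left  */
      out[2 * i + 1] = (unsigned char)((v + in[i + 1] + 2) >> 2);  /* leans right */
    }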
| 2216 | 2220 |
| 2217 /* | 2221 /* |
| 2218 * Upsample 16 source pixels to 32 destination pixels. The new 16 source | 2222 * Upsample 16 source pixels to 32 destination pixels. The new 16 source |
| 2219 * pixels are loaded to q0. The previous 16 source pixels are in q1. The | 2223 * pixels are loaded to q0. The previous 16 source pixels are in q1. The |
| 2220 * shifted-by-one source pixels are constructed in q2 by using q0 and q1. | 2224 * shifted-by-one source pixels are constructed in q2 by using q0 and q1. |
| 2221 * Register d28 is used for multiplication by 3. Register q15 is used | 2225 * Register d28 is used for multiplication by 3. Register q15 is used |
| 2222 * for adding +1 bias. | 2226 * for adding +1 bias. |
| 2223 */ | 2227 */ |
| 2224 .macro upsample16 OUTPTR, INPTR | 2228 .macro upsample16 OUTPTR, INPTR |
| 2225 vld1.8 {q0}, [\INPTR]! | 2229 vld1.8 {q0}, [\INPTR]! |
| 2226 vmovl.u8 q8, d0 | 2230 vmovl.u8 q8, d0 |
| 2227 vext.8 q2, q1, q0, #15 | 2231 vext.8 q2, q1, q0, #15 |
| 2228 vmovl.u8 q9, d1 | 2232 vmovl.u8 q9, d1 |
| 2229 vaddw.u8 q10, q15, d4 | 2233 vaddw.u8 q10, q15, d4 |
| 2230 vaddw.u8 q11, q15, d5 | 2234 vaddw.u8 q11, q15, d5 |
| 2231 vmlal.u8 q8, d4, d28 | 2235 vmlal.u8 q8, d4, d28 |
| 2232 vmlal.u8 q9, d5, d28 | 2236 vmlal.u8 q9, d5, d28 |
| 2233 vmlal.u8 q10, d0, d28 | 2237 vmlal.u8 q10, d0, d28 |
| 2234 vmlal.u8 q11, d1, d28 | 2238 vmlal.u8 q11, d1, d28 |
| 2235 vmov q1, q0 /* backup source pixels to q1 */ | 2239 vmov q1, q0 /* backup source pixels to q1 */ |
| 2236 vrshrn.u16 d6, q8, #2 | 2240 vrshrn.u16 d6, q8, #2 |
| 2237 vrshrn.u16 d7, q9, #2 | 2241 vrshrn.u16 d7, q9, #2 |
| 2238 vshrn.u16 d8, q10, #2 | 2242 vshrn.u16 d8, q10, #2 |
| 2239 vshrn.u16 d9, q11, #2 | 2243 vshrn.u16 d9, q11, #2 |
| 2240 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! | 2244 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! |
| 2241 .endm | 2245 .endm |
| 2242 | 2246 |
| 2243 /* | 2247 /* |
| 2244 * Upsample 32 source pixels to 64 destination pixels. Compared to the | 2248 * Upsample 32 source pixels to 64 destination pixels. Compared to the |
| 2245 * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for | 2249 * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for |
| 2246 * the even and odd groups of 16 pixels, which is why the "vmov q1, q0" | 2250 * the even and odd groups of 16 pixels, which is why the "vmov q1, q0" |
| 2247 * instructions are not needed. This unrolling also allows loads and stores | 2251 * instructions are not needed. This unrolling also allows loads and stores |
| 2248 * to be reordered to hide multiplication latency and reduce stalls. | 2252 * to be reordered to hide multiplication latency and reduce stalls. |
| 2249 */ | 2253 */ |
| 2250 .macro upsample32 OUTPTR, INPTR | 2254 .macro upsample32 OUTPTR, INPTR |
| 2251 /* even 16 pixels group */ | 2255 /* even 16 pixels group */ |
| 2252 vld1.8 {q0}, [\INPTR]! | 2256 vld1.8 {q0}, [\INPTR]! |
| 2253 vmovl.u8 q8, d0 | 2257 vmovl.u8 q8, d0 |
| 2254 vext.8 q2, q1, q0, #15 | 2258 vext.8 q2, q1, q0, #15 |
| 2255 vmovl.u8 q9, d1 | 2259 vmovl.u8 q9, d1 |
| 2256 vaddw.u8 q10, q15, d4 | 2260 vaddw.u8 q10, q15, d4 |
| 2257 vaddw.u8 q11, q15, d5 | 2261 vaddw.u8 q11, q15, d5 |
| 2258 vmlal.u8 q8, d4, d28 | 2262 vmlal.u8 q8, d4, d28 |
| 2259 vmlal.u8 q9, d5, d28 | 2263 vmlal.u8 q9, d5, d28 |
| 2260 vmlal.u8 q10, d0, d28 | 2264 vmlal.u8 q10, d0, d28 |
| 2261 vmlal.u8 q11, d1, d28 | 2265 vmlal.u8 q11, d1, d28 |
| 2262 /* odd 16 pixels group */ | 2266 /* odd 16 pixels group */ |
| 2263 vld1.8 {q1}, [\INPTR]! | 2267 vld1.8 {q1}, [\INPTR]! |
| 2264 vrshrn.u16 d6, q8, #2 | 2268 vrshrn.u16 d6, q8, #2 |
| 2265 vrshrn.u16 d7, q9, #2 | 2269 vrshrn.u16 d7, q9, #2 |
| 2266 vshrn.u16 d8, q10, #2 | 2270 vshrn.u16 d8, q10, #2 |
| 2267 vshrn.u16 d9, q11, #2 | 2271 vshrn.u16 d9, q11, #2 |
| 2268 vmovl.u8 q8, d2 | 2272 vmovl.u8 q8, d2 |
| 2269 vext.8 q2, q0, q1, #15 | 2273 vext.8 q2, q0, q1, #15 |
| 2270 vmovl.u8 q9, d3 | 2274 vmovl.u8 q9, d3 |
| 2271 vaddw.u8 q10, q15, d4 | 2275 vaddw.u8 q10, q15, d4 |
| 2272 vaddw.u8 q11, q15, d5 | 2276 vaddw.u8 q11, q15, d5 |
| 2273 vmlal.u8 q8, d4, d28 | 2277 vmlal.u8 q8, d4, d28 |
| 2274 vmlal.u8 q9, d5, d28 | 2278 vmlal.u8 q9, d5, d28 |
| 2275 vmlal.u8 q10, d2, d28 | 2279 vmlal.u8 q10, d2, d28 |
| 2276 vmlal.u8 q11, d3, d28 | 2280 vmlal.u8 q11, d3, d28 |
| 2277 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! | 2281 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! |
| 2278 vrshrn.u16 d6, q8, #2 | 2282 vrshrn.u16 d6, q8, #2 |
| 2279 vrshrn.u16 d7, q9, #2 | 2283 vrshrn.u16 d7, q9, #2 |
| 2280 vshrn.u16 d8, q10, #2 | 2284 vshrn.u16 d8, q10, #2 |
| 2281 vshrn.u16 d9, q11, #2 | 2285 vshrn.u16 d9, q11, #2 |
| 2282 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! | 2286 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! |
| 2283 .endm | 2287 .endm |
| 2284 | 2288 |
| 2285 /* | 2289 /* |
| 2286 * Upsample a row of WIDTH pixels from INPTR to OUTPTR. | 2290 * Upsample a row of WIDTH pixels from INPTR to OUTPTR. |
| 2287 */ | 2291 */ |
| 2288 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 | 2292 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 |
| 2289 /* special case for the first and last pixels */ | 2293 /* special case for the first and last pixels */ |
| 2290 sub \WIDTH, \WIDTH, #1 | 2294 sub \WIDTH, \WIDTH, #1 |
| 2291 add \OUTPTR, \OUTPTR, #1 | 2295 add \OUTPTR, \OUTPTR, #1 |
| 2292 ldrb \TMP1, [\INPTR, \WIDTH] | 2296 ldrb \TMP1, [\INPTR, \WIDTH] |
| (...skipping 40 matching lines...) |
| 2333 vld1.8 {d0[3]}, [\INPTR] | 2337 vld1.8 {d0[3]}, [\INPTR] |
| 2334 sub \INPTR, \INPTR, #1 | 2338 sub \INPTR, \INPTR, #1 |
| 2335 vld1.8 {d0[2]}, [\INPTR] | 2339 vld1.8 {d0[2]}, [\INPTR] |
| 2336 sub \INPTR, \INPTR, #1 | 2340 sub \INPTR, \INPTR, #1 |
| 2337 vld1.8 {d0[1]}, [\INPTR] | 2341 vld1.8 {d0[1]}, [\INPTR] |
| 2338 sub \INPTR, \INPTR, #1 | 2342 sub \INPTR, \INPTR, #1 |
| 2339 vld1.8 {d0[0]}, [\INPTR] | 2343 vld1.8 {d0[0]}, [\INPTR] |
| 2340 2: | 2344 2: |
| 2341 tst \WIDTH, #8 | 2345 tst \WIDTH, #8 |
| 2342 beq 2f | 2346 beq 2f |
| 2343 vmov d1, d0 | 2347 vmov d1, d0 |
| 2344 sub \INPTR, \INPTR, #8 | 2348 sub \INPTR, \INPTR, #8 |
| 2345 vld1.8 {d0}, [\INPTR] | 2349 vld1.8 {d0}, [\INPTR] |
| 2346 2: /* upsample the remaining pixels */ | 2350 2: /* upsample the remaining pixels */ |
| 2347 vmovl.u8 q8, d0 | 2351 vmovl.u8 q8, d0 |
| 2348 vext.8 q2, q1, q0, #15 | 2352 vext.8 q2, q1, q0, #15 |
| 2349 vmovl.u8 q9, d1 | 2353 vmovl.u8 q9, d1 |
| 2350 vaddw.u8 q10, q15, d4 | 2354 vaddw.u8 q10, q15, d4 |
| 2351 vaddw.u8 q11, q15, d5 | 2355 vaddw.u8 q11, q15, d5 |
| 2352 vmlal.u8 q8, d4, d28 | 2356 vmlal.u8 q8, d4, d28 |
| 2353 vmlal.u8 q9, d5, d28 | 2357 vmlal.u8 q9, d5, d28 |
| 2354 vmlal.u8 q10, d0, d28 | 2358 vmlal.u8 q10, d0, d28 |
| 2355 vmlal.u8 q11, d1, d28 | 2359 vmlal.u8 q11, d1, d28 |
| 2356 vrshrn.u16 d10, q8, #2 | 2360 vrshrn.u16 d10, q8, #2 |
| 2357 vrshrn.u16 d12, q9, #2 | 2361 vrshrn.u16 d12, q9, #2 |
| 2358 vshrn.u16 d11, q10, #2 | 2362 vshrn.u16 d11, q10, #2 |
| 2359 vshrn.u16 d13, q11, #2 | 2363 vshrn.u16 d13, q11, #2 |
| 2360 vzip.8 d10, d11 | 2364 vzip.8 d10, d11 |
| 2361 vzip.8 d12, d13 | 2365 vzip.8 d12, d13 |
| 2362 /* store the remaining pixels */ | 2366 /* store the remaining pixels */ |
| 2363 tst \WIDTH, #8 | 2367 tst \WIDTH, #8 |
| 2364 beq 2f | 2368 beq 2f |
| 2365 vst1.8 {d10, d11}, [\OUTPTR]! | 2369 vst1.8 {d10, d11}, [\OUTPTR]! |
| 2366 vmov q5, q6 | 2370 vmov q5, q6 |
| 2367 2: | 2371 2: |
| 2368 tst \WIDTH, #4 | 2372 tst \WIDTH, #4 |
| 2369 beq 2f | 2373 beq 2f |
| 2370 vst1.8 {d10}, [\OUTPTR]! | 2374 vst1.8 {d10}, [\OUTPTR]! |
| 2371 vmov d10, d11 | 2375 vmov d10, d11 |
| 2372 2: | 2376 2: |
| 2373 tst \WIDTH, #2 | 2377 tst \WIDTH, #2 |
| 2374 beq 2f | 2378 beq 2f |
| 2375 vst1.8 {d10[0]}, [\OUTPTR]! | 2379 vst1.8 {d10[0]}, [\OUTPTR]! |
| 2376 vst1.8 {d10[1]}, [\OUTPTR]! | 2380 vst1.8 {d10[1]}, [\OUTPTR]! |
| 2377 vst1.8 {d10[2]}, [\OUTPTR]! | 2381 vst1.8 {d10[2]}, [\OUTPTR]! |
| 2378 vst1.8 {d10[3]}, [\OUTPTR]! | 2382 vst1.8 {d10[3]}, [\OUTPTR]! |
| 2379 vext.8 d10, d10, d10, #4 | 2383 vext.8 d10, d10, d10, #4 |
| 2380 2: | 2384 2: |
| 2381 tst \WIDTH, #1 | 2385 tst \WIDTH, #1 |
| (...skipping 43 matching lines...) |
| 2425 .unreq DOWNSAMPLED_WIDTH | 2429 .unreq DOWNSAMPLED_WIDTH |
| 2426 .unreq INPUT_DATA | 2430 .unreq INPUT_DATA |
| 2427 .unreq OUTPUT_DATA_PTR | 2431 .unreq OUTPUT_DATA_PTR |
| 2428 .unreq OUTPUT_DATA | 2432 .unreq OUTPUT_DATA |
| 2429 | 2433 |
| 2430 .unreq OUTPTR | 2434 .unreq OUTPTR |
| 2431 .unreq INPTR | 2435 .unreq INPTR |
| 2432 .unreq WIDTH | 2436 .unreq WIDTH |
| 2433 .unreq TMP | 2437 .unreq TMP |
| 2434 | 2438 |
| 2435 | |
| 2436 .purgem upsample16 | 2439 .purgem upsample16 |
| 2437 .purgem upsample32 | 2440 .purgem upsample32 |
| 2438 .purgem upsample_row | 2441 .purgem upsample_row |
| 2442 |
| 2443 |
| 2444 /*****************************************************************************/ |
| 2445 |
| 2446 /* |
| 2447 * GLOBAL(JOCTET*) |
| 2448 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, |
| 2449 * JCOEFPTR block, int last_dc_val, |
| 2450 * c_derived_tbl *dctbl, c_derived_tbl *actbl) |
| 2451 * |
| 2452 */ |
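
For orientation: this routine implements the usual sequential JPEG entropy coding of one 8x8 block -- encode the DC difference, then walk the 63 AC coefficients in zig-zag order, counting runs of zeros, emitting a (run, size) Huffman symbol plus the coefficient's value bits, a ZRL symbol per run of 16 zeros, and an EOB symbol if the block ends in zeros. The sketch below is a simplified scalar outline of that scheme; emit_bits() and the code/size arrays are hypothetical stand-ins for the real bit buffer and the c_derived_tbl lookups.

    #include <stdio.h>

    /* Hypothetical stand-in for the real bit buffer: print (code, size). */
    static void emit_bits(unsigned code, int size)
    {
      printf("%u/%d ", code, size);
    }

    /* Simplified scalar outline of encoding one block (zz[] is already in
     * zig-zag order; dc_/ac_ arrays stand in for c_derived_tbl fields). */
    static void encode_block_sketch(const short zz[64], int last_dc_val,
                                    const unsigned *dc_code, const char *dc_size,
                                    const unsigned *ac_code, const char *ac_size)
    {
      int k, r = 0, temp, temp2, nbits;

      /* DC: encode the difference from the previous block's DC value */
      temp = temp2 = zz[0] - last_dc_val;
      if (temp < 0) { temp = -temp; temp2--; }  /* value bits of a negative
                                                   number: one's complement */
      for (nbits = 0; temp; temp >>= 1) nbits++;
      emit_bits(dc_code[nbits], dc_size[nbits]);
      if (nbits) emit_bits(temp2 & ((1 << nbits) - 1), nbits);

      /* AC: zero-run length + (run, size) symbol + value bits */
      for (k = 1; k < 64; k++) {
        if ((temp = zz[k]) == 0) { r++; continue; }
        while (r > 15) {                        /* ZRL for runs of 16 zeros */
          emit_bits(ac_code[0xF0], ac_size[0xF0]);
          r -= 16;
        }
        temp2 = temp;
        if (temp < 0) { temp = -temp; temp2--; }
        for (nbits = 1; temp >>= 1; ) nbits++;
        emit_bits(ac_code[(r << 4) + nbits], ac_size[(r << 4) + nbits]);
        emit_bits(temp2 & ((1 << nbits) - 1), nbits);
        r = 0;
      }
      if (r > 0)                                /* trailing zeros: EOB */
        emit_bits(ac_code[0], ac_size[0]);
    }

The NEON code below does the per-coefficient work (zig-zag reordering, nbits, value bits, zero bitmap) in vector registers up front and keeps only the bit emission and the run walking in scalar code.
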
| 2453 |
| 2454 .macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP |
| 2455 sub \PUT_BITS, \PUT_BITS, #0x8 |
| 2456 lsr \TMP, \PUT_BUFFER, \PUT_BITS |
| 2457 uxtb \TMP, \TMP |
| 2458 strb \TMP, [\BUFFER, #1]! |
| 2459 cmp \TMP, #0xff |
| 2460 /*it eq*/ |
| 2461 strbeq \ZERO, [\BUFFER, #1]! |
| 2462 .endm |
| 2463 |
| 2464 .macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE |
| 2465 /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/ |
| 2466 add \PUT_BITS, \SIZE |
| 2467 /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/ |
| 2468 orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE |
| 2469 .endm |
| 2470 |
| 2471 .macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP |
| 2472 cmp \PUT_BITS, #0x10 |
| 2473 blt 15f |
| 2474 eor \ZERO, \ZERO, \ZERO |
| 2475 emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP |
| 2476 emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP |
| 2477 15: |
| 2478 .endm |
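
Taken together, the three macros above implement the standard JPEG bit buffer: put_bits shifts the pending bits left and ORs in a new code, emit_byte flushes one byte from the top of the buffer and appends a 0x00 stuff byte after any 0xFF (marker escaping), and checkbuf15 flushes two bytes once 16 or more bits are pending. A minimal C model, assuming a plain output pointer instead of the pre-indexed buffer-1 pointer the assembly keeps in r10:

    /* Hypothetical C model of the bit buffer maintained by the macros. */
    typedef struct {
      unsigned put_buffer;      /* pending bits; newest code in the low bits */
      int put_bits;             /* number of valid bits in put_buffer */
      unsigned char *out;       /* next output byte */
    } bitbuf;

    static void put_bits_c(bitbuf *b, unsigned code, int size)
    {
      b->put_bits += size;
      b->put_buffer = (b->put_buffer << size) | code;
    }

    static void emit_byte_c(bitbuf *b)
    {
      b->put_bits -= 8;
      unsigned char c = (unsigned char)(b->put_buffer >> b->put_bits);
      *b->out++ = c;
      if (c == 0xFF)            /* JPEG byte stuffing: escape 0xFF with 0x00 */
        *b->out++ = 0;
    }

    static void checkbuf15_c(bitbuf *b)
    {
      if (b->put_bits >= 16) {  /* flush two bytes once 16+ bits are pending */
        emit_byte_c(b);
        emit_byte_c(b);
      }
    }
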
| 2479 |
| 2480 .balign 16 |
| 2481 jsimd_huff_encode_one_block_neon_consts: |
| 2482 .byte 0x01 |
| 2483 .byte 0x02 |
| 2484 .byte 0x04 |
| 2485 .byte 0x08 |
| 2486 .byte 0x10 |
| 2487 .byte 0x20 |
| 2488 .byte 0x40 |
| 2489 .byte 0x80 |
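
This eight-byte table (1, 2, 4, ..., 128) is loaded into d26 and ANDed with the vceq.i16/vmovn.i16 results further down, so each lane of a "coefficient == 0" mask keeps exactly one bit; three rounds of vpadd.i8 then fold the 64 lanes into a 64-bit bitmap, one bit per zig-zag position (the scattered vld1.16 lane loads perform the zig-zag reordering). A scalar sketch of that packing, with a hypothetical helper name:

    #include <stdint.h>

    /* Hypothetical scalar model: pack the "coefficient is zero" flags into
     * a 64-bit map using the 0x01..0x80 weights; bit i <=> coefficient i. */
    static uint64_t pack_zero_bitmap(const int16_t coef[64])
    {
      uint64_t map = 0;
      for (int i = 0; i < 64; i++) {
        uint8_t flag = (coef[i] == 0) ? 0xFF : 0x00;       /* vceq + vmovn */
        uint8_t bit  = flag & (uint8_t)(1u << (i & 7));    /* vand with d26 */
        map |= (uint64_t)bit << ((i >> 3) * 8);            /* vpadd.i8 tree */
      }
      return map;
    }
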
| 2490 |
| 2491 asm_function jsimd_huff_encode_one_block_neon |
| 2492 push {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| 2493 add r7, sp, #0x1c |
| 2494 sub r4, sp, #0x40 |
| 2495 bfc r4, #0, #5 |
| 2496 mov sp, r4 /* align sp on 32 bytes */ |
| 2497 vst1.64 {d8, d9, d10, d11}, [r4, :128]! |
| 2498 vst1.64 {d12, d13, d14, d15}, [r4, :128] |
| 2499 sub sp, #0x140 /* reserve 320 bytes */ |
| 2500 str r0, [sp, #0x18] /* working state > sp + 0x18 */ |
| 2501 add r4, sp, #0x20 /* r4 = t1 */ |
| 2502 ldr lr, [r7, #0x8] /* lr = dctbl */ |
| 2503 sub r10, r1, #0x1 /* r10=buffer-- */ |
| 2504 ldrsh r1, [r2] |
| 2505 mov r9, #0x10 |
| 2506 mov r8, #0x1 |
| 2507 adr r5, jsimd_huff_encode_one_block_neon_consts |
| 2508 /* prepare data */ |
| 2509 vld1.8 {d26}, [r5, :64] |
| 2510 veor q8, q8, q8 |
| 2511 veor q9, q9, q9 |
| 2512 vdup.16 q14, r9 |
| 2513 vdup.16 q15, r8 |
| 2514 veor q10, q10, q10 |
| 2515 veor q11, q11, q11 |
| 2516 sub r1, r1, r3 |
| 2517 add r9, r2, #0x22 |
| 2518 add r8, r2, #0x18 |
| 2519 add r3, r2, #0x36 |
| 2520 vmov.16 d0[0], r1 |
| 2521 vld1.16 {d2[0]}, [r9, :16] |
| 2522 vld1.16 {d4[0]}, [r8, :16] |
| 2523 vld1.16 {d6[0]}, [r3, :16] |
| 2524 add r1, r2, #0x2 |
| 2525 add r9, r2, #0x30 |
| 2526 add r8, r2, #0x26 |
| 2527 add r3, r2, #0x28 |
| 2528 vld1.16 {d0[1]}, [r1, :16] |
| 2529 vld1.16 {d2[1]}, [r9, :16] |
| 2530 vld1.16 {d4[1]}, [r8, :16] |
| 2531 vld1.16 {d6[1]}, [r3, :16] |
| 2532 add r1, r2, #0x10 |
| 2533 add r9, r2, #0x40 |
| 2534 add r8, r2, #0x34 |
| 2535 add r3, r2, #0x1a |
| 2536 vld1.16 {d0[2]}, [r1, :16] |
| 2537 vld1.16 {d2[2]}, [r9, :16] |
| 2538 vld1.16 {d4[2]}, [r8, :16] |
| 2539 vld1.16 {d6[2]}, [r3, :16] |
| 2540 add r1, r2, #0x20 |
| 2541 add r9, r2, #0x32 |
| 2542 add r8, r2, #0x42 |
| 2543 add r3, r2, #0xc |
| 2544 vld1.16 {d0[3]}, [r1, :16] |
| 2545 vld1.16 {d2[3]}, [r9, :16] |
| 2546 vld1.16 {d4[3]}, [r8, :16] |
| 2547 vld1.16 {d6[3]}, [r3, :16] |
| 2548 add r1, r2, #0x12 |
| 2549 add r9, r2, #0x24 |
| 2550 add r8, r2, #0x50 |
| 2551 add r3, r2, #0xe |
| 2552 vld1.16 {d1[0]}, [r1, :16] |
| 2553 vld1.16 {d3[0]}, [r9, :16] |
| 2554 vld1.16 {d5[0]}, [r8, :16] |
| 2555 vld1.16 {d7[0]}, [r3, :16] |
| 2556 add r1, r2, #0x4 |
| 2557 add r9, r2, #0x16 |
| 2558 add r8, r2, #0x60 |
| 2559 add r3, r2, #0x1c |
| 2560 vld1.16 {d1[1]}, [r1, :16] |
| 2561 vld1.16 {d3[1]}, [r9, :16] |
| 2562 vld1.16 {d5[1]}, [r8, :16] |
| 2563 vld1.16 {d7[1]}, [r3, :16] |
| 2564 add r1, r2, #0x6 |
| 2565 add r9, r2, #0x8 |
| 2566 add r8, r2, #0x52 |
| 2567 add r3, r2, #0x2a |
| 2568 vld1.16 {d1[2]}, [r1, :16] |
| 2569 vld1.16 {d3[2]}, [r9, :16] |
| 2570 vld1.16 {d5[2]}, [r8, :16] |
| 2571 vld1.16 {d7[2]}, [r3, :16] |
| 2572 add r1, r2, #0x14 |
| 2573 add r9, r2, #0xa |
| 2574 add r8, r2, #0x44 |
| 2575 add r3, r2, #0x38 |
| 2576 vld1.16 {d1[3]}, [r1, :16] |
| 2577 vld1.16 {d3[3]}, [r9, :16] |
| 2578 vld1.16 {d5[3]}, [r8, :16] |
| 2579 vld1.16 {d7[3]}, [r3, :16] |
| 2580 vcgt.s16 q8, q8, q0 |
| 2581 vcgt.s16 q9, q9, q1 |
| 2582 vcgt.s16 q10, q10, q2 |
| 2583 vcgt.s16 q11, q11, q3 |
| 2584 vabs.s16 q0, q0 |
| 2585 vabs.s16 q1, q1 |
| 2586 vabs.s16 q2, q2 |
| 2587 vabs.s16 q3, q3 |
| 2588 veor q8, q8, q0 |
| 2589 veor q9, q9, q1 |
| 2590 veor q10, q10, q2 |
| 2591 veor q11, q11, q3 |
| 2592 add r9, r4, #0x20 |
| 2593 add r8, r4, #0x80 |
| 2594 add r3, r4, #0xa0 |
| 2595 vclz.i16 q0, q0 |
| 2596 vclz.i16 q1, q1 |
| 2597 vclz.i16 q2, q2 |
| 2598 vclz.i16 q3, q3 |
| 2599 vsub.i16 q0, q14, q0 |
| 2600 vsub.i16 q1, q14, q1 |
| 2601 vsub.i16 q2, q14, q2 |
| 2602 vsub.i16 q3, q14, q3 |
| 2603 vst1.16 {d0, d1, d2, d3}, [r4, :256] |
| 2604 vst1.16 {d4, d5, d6, d7}, [r9, :256] |
| 2605 vshl.s16 q0, q15, q0 |
| 2606 vshl.s16 q1, q15, q1 |
| 2607 vshl.s16 q2, q15, q2 |
| 2608 vshl.s16 q3, q15, q3 |
| 2609 vsub.i16 q0, q0, q15 |
| 2610 vsub.i16 q1, q1, q15 |
| 2611 vsub.i16 q2, q2, q15 |
| 2612 vsub.i16 q3, q3, q15 |
| 2613 vand q8, q8, q0 |
| 2614 vand q9, q9, q1 |
| 2615 vand q10, q10, q2 |
| 2616 vand q11, q11, q3 |
| 2617 vst1.16 {d16, d17, d18, d19}, [r8, :256] |
| 2618 vst1.16 {d20, d21, d22, d23}, [r3, :256] |
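
The block above processes the first 32 zig-zag coefficients sixteen at a time and stores, for each one, its magnitude category ("nbits", written to the t1 area at r4) and its masked value bits (written to the t2 area at r4 + 0x80), using a branch-free sign-mask trick. A scalar model of one lane (hypothetical helper name); the same sequence is repeated below for the remaining 32 coefficients:

    #include <stdint.h>

    /* Hypothetical scalar model of one lane of the vcgt/vabs/veor/vclz block. */
    static void prepare_coef(int16_t x, uint16_t *nbits, uint16_t *valbits)
    {
      uint16_t sign = (x < 0) ? 0xFFFF : 0x0000;        /* vcgt.s16  0 > x   */
      uint16_t mag  = (uint16_t)(x < 0 ? -x : x);       /* vabs.s16          */
      uint16_t ones = sign ^ mag;                       /* veor: ~mag if neg */
      int lz = 16;                                      /* vclz.i16          */
      for (uint16_t t = mag; t; t >>= 1) lz--;
      *nbits   = (uint16_t)(16 - lz);                   /* vsub from q14(16) */
      *valbits = ones & (uint16_t)((1u << *nbits) - 1); /* vshl/vsub/vand    */
    }
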
| 2619 add r1, r2, #0x46 |
| 2620 add r9, r2, #0x3a |
| 2621 add r8, r2, #0x74 |
| 2622 add r3, r2, #0x6a |
| 2623 vld1.16 {d8[0]}, [r1, :16] |
| 2624 vld1.16 {d10[0]}, [r9, :16] |
| 2625 vld1.16 {d12[0]}, [r8, :16] |
| 2626 vld1.16 {d14[0]}, [r3, :16] |
| 2627 veor q8, q8, q8 |
| 2628 veor q9, q9, q9 |
| 2629 veor q10, q10, q10 |
| 2630 veor q11, q11, q11 |
| 2631 add r1, r2, #0x54 |
| 2632 add r9, r2, #0x2c |
| 2633 add r8, r2, #0x76 |
| 2634 add r3, r2, #0x78 |
| 2635 vld1.16 {d8[1]}, [r1, :16] |
| 2636 vld1.16 {d10[1]}, [r9, :16] |
| 2637 vld1.16 {d12[1]}, [r8, :16] |
| 2638 vld1.16 {d14[1]}, [r3, :16] |
| 2639 add r1, r2, #0x62 |
| 2640 add r9, r2, #0x1e |
| 2641 add r8, r2, #0x68 |
| 2642 add r3, r2, #0x7a |
| 2643 vld1.16 {d8[2]}, [r1, :16] |
| 2644 vld1.16 {d10[2]}, [r9, :16] |
| 2645 vld1.16 {d12[2]}, [r8, :16] |
| 2646 vld1.16 {d14[2]}, [r3, :16] |
| 2647 add r1, r2, #0x70 |
| 2648 add r9, r2, #0x2e |
| 2649 add r8, r2, #0x5a |
| 2650 add r3, r2, #0x6c |
| 2651 vld1.16 {d8[3]}, [r1, :16] |
| 2652 vld1.16 {d10[3]}, [r9, :16] |
| 2653 vld1.16 {d12[3]}, [r8, :16] |
| 2654 vld1.16 {d14[3]}, [r3, :16] |
| 2655 add r1, r2, #0x72 |
| 2656 add r9, r2, #0x3c |
| 2657 add r8, r2, #0x4c |
| 2658 add r3, r2, #0x5e |
| 2659 vld1.16 {d9[0]}, [r1, :16] |
| 2660 vld1.16 {d11[0]}, [r9, :16] |
| 2661 vld1.16 {d13[0]}, [r8, :16] |
| 2662 vld1.16 {d15[0]}, [r3, :16] |
| 2663 add r1, r2, #0x64 |
| 2664 add r9, r2, #0x4a |
| 2665 add r8, r2, #0x3e |
| 2666 add r3, r2, #0x6e |
| 2667 vld1.16 {d9[1]}, [r1, :16] |
| 2668 vld1.16 {d11[1]}, [r9, :16] |
| 2669 vld1.16 {d13[1]}, [r8, :16] |
| 2670 vld1.16 {d15[1]}, [r3, :16] |
| 2671 add r1, r2, #0x56 |
| 2672 add r9, r2, #0x58 |
| 2673 add r8, r2, #0x4e |
| 2674 add r3, r2, #0x7c |
| 2675 vld1.16 {d9[2]}, [r1, :16] |
| 2676 vld1.16 {d11[2]}, [r9, :16] |
| 2677 vld1.16 {d13[2]}, [r8, :16] |
| 2678 vld1.16 {d15[2]}, [r3, :16] |
| 2679 add r1, r2, #0x48 |
| 2680 add r9, r2, #0x66 |
| 2681 add r8, r2, #0x5c |
| 2682 add r3, r2, #0x7e |
| 2683 vld1.16 {d9[3]}, [r1, :16] |
| 2684 vld1.16 {d11[3]}, [r9, :16] |
| 2685 vld1.16 {d13[3]}, [r8, :16] |
| 2686 vld1.16 {d15[3]}, [r3, :16] |
| 2687 vcgt.s16 q8, q8, q4 |
| 2688 vcgt.s16 q9, q9, q5 |
| 2689 vcgt.s16 q10, q10, q6 |
| 2690 vcgt.s16 q11, q11, q7 |
| 2691 vabs.s16 q4, q4 |
| 2692 vabs.s16 q5, q5 |
| 2693 vabs.s16 q6, q6 |
| 2694 vabs.s16 q7, q7 |
| 2695 veor q8, q8, q4 |
| 2696 veor q9, q9, q5 |
| 2697 veor q10, q10, q6 |
| 2698 veor q11, q11, q7 |
| 2699 add r1, r4, #0x40 |
| 2700 add r9, r4, #0x60 |
| 2701 add r8, r4, #0xc0 |
| 2702 add r3, r4, #0xe0 |
| 2703 vclz.i16 q4, q4 |
| 2704 vclz.i16 q5, q5 |
| 2705 vclz.i16 q6, q6 |
| 2706 vclz.i16 q7, q7 |
| 2707 vsub.i16 q4, q14, q4 |
| 2708 vsub.i16 q5, q14, q5 |
| 2709 vsub.i16 q6, q14, q6 |
| 2710 vsub.i16 q7, q14, q7 |
| 2711 vst1.16 {d8, d9, d10, d11}, [r1, :256] |
| 2712 vst1.16 {d12, d13, d14, d15}, [r9, :256] |
| 2713 vshl.s16 q4, q15, q4 |
| 2714 vshl.s16 q5, q15, q5 |
| 2715 vshl.s16 q6, q15, q6 |
| 2716 vshl.s16 q7, q15, q7 |
| 2717 vsub.i16 q4, q4, q15 |
| 2718 vsub.i16 q5, q5, q15 |
| 2719 vsub.i16 q6, q6, q15 |
| 2720 vsub.i16 q7, q7, q15 |
| 2721 vand q8, q8, q4 |
| 2722 vand q9, q9, q5 |
| 2723 vand q10, q10, q6 |
| 2724 vand q11, q11, q7 |
| 2725 vst1.16 {d16, d17, d18, d19}, [r8, :256] |
| 2726 vst1.16 {d20, d21, d22, d23}, [r3, :256] |
| 2727 ldr r12, [r7, #0xc] /* r12 = actbl */ |
| 2728 add r1, lr, #0x400 /* r1 = dctbl->ehufsi */ |
| 2729 mov r9, r12 /* r9 = actbl */ |
| 2730 add r6, r4, #0x80 /* r6 = t2 */ |
| 2731 ldr r11, [r0, #0x8] /* r11 = put_buffer */ |
| 2732 ldr r4, [r0, #0xc] /* r4 = put_bits */ |
| 2733 ldrh r2, [r6, #-128] /* r2 = nbits */ |
| 2734 ldrh r3, [r6] /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1; */ |
| 2735 ldr r0, [lr, r2, lsl #2] |
| 2736 ldrb r5, [r1, r2] |
| 2737 put_bits r11, r4, r0, r5 |
| 2738 checkbuf15 r10, r11, r4, r5, r0 |
| 2739 put_bits r11, r4, r3, r2 |
| 2740 checkbuf15 r10, r11, r4, r5, r0 |
| 2741 mov lr, r6 /* lr = t2 */ |
| 2742 add r5, r9, #0x400 /* r5 = actbl->ehufsi */ |
| 2743 ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */ |
| 2744 veor q8, q8, q8 |
| 2745 vceq.i16 q0, q0, q8 |
| 2746 vceq.i16 q1, q1, q8 |
| 2747 vceq.i16 q2, q2, q8 |
| 2748 vceq.i16 q3, q3, q8 |
| 2749 vceq.i16 q4, q4, q8 |
| 2750 vceq.i16 q5, q5, q8 |
| 2751 vceq.i16 q6, q6, q8 |
| 2752 vceq.i16 q7, q7, q8 |
| 2753 vmovn.i16 d0, q0 |
| 2754 vmovn.i16 d2, q1 |
| 2755 vmovn.i16 d4, q2 |
| 2756 vmovn.i16 d6, q3 |
| 2757 vmovn.i16 d8, q4 |
| 2758 vmovn.i16 d10, q5 |
| 2759 vmovn.i16 d12, q6 |
| 2760 vmovn.i16 d14, q7 |
| 2761 vand d0, d0, d26 |
| 2762 vand d2, d2, d26 |
| 2763 vand d4, d4, d26 |
| 2764 vand d6, d6, d26 |
| 2765 vand d8, d8, d26 |
| 2766 vand d10, d10, d26 |
| 2767 vand d12, d12, d26 |
| 2768 vand d14, d14, d26 |
| 2769 vpadd.i8 d0, d0, d2 |
| 2770 vpadd.i8 d4, d4, d6 |
| 2771 vpadd.i8 d8, d8, d10 |
| 2772 vpadd.i8 d12, d12, d14 |
| 2773 vpadd.i8 d0, d0, d4 |
| 2774 vpadd.i8 d8, d8, d12 |
| 2775 vpadd.i8 d0, d0, d8 |
| 2776 vmov.32 r1, d0[1] |
| 2777 vmov.32 r8, d0[0] |
| 2778 mvn r1, r1 |
| 2779 mvn r8, r8 |
| 2780 lsrs r1, r1, #0x1 |
| 2781 rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */ |
| 2782 rbit r1, r1 /* r1 = index1 */ |
| 2783 rbit r8, r8 /* r8 = index0 */ |
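
At this point r8 (index0) and r1 (index1, spilled to sp + 0x14) hold bit-reversed bitmaps of the nonzero coefficients in the two halves of the block, with the DC bit already shifted out, so the length of each zero run is just a count of leading zeros. The two loops that follow (local labels 1:, 2:, 3:) walk these maps with clz instead of testing coefficients one by one. A scalar sketch of the idea, with hypothetical names and a GCC/Clang builtin:

    #include <stdint.h>

    /* Hypothetical sketch: derive zero-run lengths from a bitmap of nonzero
     * coefficients with count-leading-zeros, as the clz loops below do. */
    static void walk_nonzero(uint32_t index, void (*encode)(int run))
    {
      while (index != 0) {
        int run = __builtin_clz(index); /* zeros skipped before next nonzero */
        index <<= run;                  /* discard the skipped zero bits     */
        encode(run);                    /* emit ZRL/(run,size) + value bits  */
        index <<= 1;                    /* consume the nonzero coefficient   */
      }
    }

Runs of 16 or more are handled inside the loops by emitting ZRL symbols (the cmp r2, #0x10 path), and the second loop seeds its run count with the trailing zeros left over from the first half (the sub r12, r12, lr arithmetic).
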
| 2784 ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */ |
| 2785 str r1, [sp, #0x14] /* index1 > sp + 0x14 */ |
| 2786 cmp r8, #0x0 |
| 2787 beq 6f |
| 2788 1: |
| 2789 clz r2, r8 |
| 2790 add lr, lr, r2, lsl #1 |
| 2791 lsl r8, r8, r2 |
| 2792 ldrh r1, [lr, #-126] |
| 2793 2: |
| 2794 cmp r2, #0x10 |
| 2795 blt 3f |
| 2796 sub r2, r2, #0x10 |
| 2797 put_bits r11, r4, r0, r6 |
| 2798 cmp r4, #0x10 |
| 2799 blt 2b |
| 2800 eor r3, r3, r3 |
| 2801 emit_byte r10, r11, r4, r3, r12 |
| 2802 emit_byte r10, r11, r4, r3, r12 |
| 2803 b 2b |
| 2804 3: |
| 2805 add r2, r1, r2, lsl #4 |
| 2806 ldrh r3, [lr, #2]! |
| 2807 ldr r12, [r9, r2, lsl #2] |
| 2808 ldrb r2, [r5, r2] |
| 2809 put_bits r11, r4, r12, r2 |
| 2810 checkbuf15 r10, r11, r4, r2, r12 |
| 2811 put_bits r11, r4, r3, r1 |
| 2812 checkbuf15 r10, r11, r4, r2, r12 |
| 2813 lsls r8, r8, #0x1 |
| 2814 bne 1b |
| 2815 6: |
| 2816 add r12, sp, #0x20 /* r12 = t1 */ |
| 2817 ldr r8, [sp, #0x14] /* r8 = index1 */ |
| 2818 adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */ |
| 2819 cmp r8, #0x0 |
| 2820 beq 6f |
| 2821 clz r2, r8 |
| 2822 sub r12, r12, lr |
| 2823 lsl r8, r8, r2 |
| 2824 add r2, r2, r12, lsr #1 |
| 2825 add lr, lr, r2, lsl #1 |
| 2826 b 7f |
| 2827 1: |
| 2828 clz r2, r8 |
| 2829 add lr, lr, r2, lsl #1 |
| 2830 lsl r8, r8, r2 |
| 2831 7: |
| 2832 ldrh r1, [lr, #-126] |
| 2833 2: |
| 2834 cmp r2, #0x10 |
| 2835 blt 3f |
| 2836 sub r2, r2, #0x10 |
| 2837 put_bits r11, r4, r0, r6 |
| 2838 cmp r4, #0x10 |
| 2839 blt 2b |
| 2840 eor r3, r3, r3 |
| 2841 emit_byte r10, r11, r4, r3, r12 |
| 2842 emit_byte r10, r11, r4, r3, r12 |
| 2843 b 2b |
| 2844 3: |
| 2845 add r2, r1, r2, lsl #4 |
| 2846 ldrh r3, [lr, #2]! |
| 2847 ldr r12, [r9, r2, lsl #2] |
| 2848 ldrb r2, [r5, r2] |
| 2849 put_bits r11, r4, r12, r2 |
| 2850 checkbuf15 r10, r11, r4, r2, r12 |
| 2851 put_bits r11, r4, r3, r1 |
| 2852 checkbuf15 r10, r11, r4, r2, r12 |
| 2853 lsls r8, r8, #0x1 |
| 2854 bne 1b |
| 2855 6: |
| 2856 add r0, sp, #0x20 |
| 2857 add r0, #0xfe |
| 2858 cmp lr, r0 |
| 2859 bhs 1f |
| 2860 ldr r1, [r9] |
| 2861 ldrb r0, [r5] |
| 2862 put_bits r11, r4, r1, r0 |
| 2863 checkbuf15 r10, r11, r4, r0, r1 |
| 2864 1: |
| 2865 ldr r12, [sp, #0x18] |
| 2866 str r11, [r12, #0x8] |
| 2867 str r4, [r12, #0xc] |
| 2868 add r0, r10, #0x1 |
| 2869 add r4, sp, #0x140 |
| 2870 vld1.64 {d8, d9, d10, d11}, [r4, :128]! |
| 2871 vld1.64 {d12, d13, d14, d15}, [r4, :128] |
| 2872 sub r4, r7, #0x1c |
| 2873 mov sp, r4 |
| 2874 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} |
| 2875 |
| 2876 .purgem emit_byte |
| 2877 .purgem put_bits |
| 2878 .purgem checkbuf15 |