OLD | NEW |
1 /* | 1 /* |
2 * ARM NEON optimizations for libjpeg-turbo | 2 * ARM NEON optimizations for libjpeg-turbo |
3 * | 3 * |
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
5 * All rights reserved. | 5 * All rights reserved. |
6 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
7 * | 7 * |
8 * This software is provided 'as-is', without any express or implied | 8 * This software is provided 'as-is', without any express or implied |
9 * warranty. In no event will the authors be held liable for any damages | 9 * warranty. In no event will the authors be held liable for any damages |
10 * arising from the use of this software. | 10 * arising from the use of this software. |
11 * | 11 * |
12 * Permission is granted to anyone to use this software for any purpose, | 12 * Permission is granted to anyone to use this software for any purpose, |
13 * including commercial applications, and to alter it and redistribute it | 13 * including commercial applications, and to alter it and redistribute it |
14 * freely, subject to the following restrictions: | 14 * freely, subject to the following restrictions: |
15 * | 15 * |
16 * 1. The origin of this software must not be misrepresented; you must not | 16 * 1. The origin of this software must not be misrepresented; you must not |
(...skipping 38 matching lines...)
55 .endm | 55 .endm |
56 | 56 |
57 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ | 57 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ |
58 .macro transpose_4x4 x0, x1, x2, x3 | 58 .macro transpose_4x4 x0, x1, x2, x3 |
59 vtrn.16 \x0, \x1 | 59 vtrn.16 \x0, \x1 |
60 vtrn.16 \x2, \x3 | 60 vtrn.16 \x2, \x3 |
61 vtrn.32 \x0, \x2 | 61 vtrn.32 \x0, \x2 |
62 vtrn.32 \x1, \x3 | 62 vtrn.32 \x1, \x3 |
63 .endm | 63 .endm |
64 | 64 |
| 65 #define CENTERJSAMPLE 128 |
| 66 |
| 67 /*****************************************************************************/ |
| 68 |
| 69 /* |
| 70 * Perform dequantization and inverse DCT on one block of coefficients. |
| 71 * |
| 72 * GLOBAL(void) |
| 73 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, |
| 74 * JSAMPARRAY output_buf, JDIMENSION output_col) |
| 75 */ |
| 76 |
| 77 #define FIX_0_298631336 (2446) |
| 78 #define FIX_0_390180644 (3196) |
| 79 #define FIX_0_541196100 (4433) |
| 80 #define FIX_0_765366865 (6270) |
| 81 #define FIX_0_899976223 (7373) |
| 82 #define FIX_1_175875602 (9633) |
| 83 #define FIX_1_501321110 (12299) |
| 84 #define FIX_1_847759065 (15137) |
| 85 #define FIX_1_961570560 (16069) |
| 86 #define FIX_2_053119869 (16819) |
| 87 #define FIX_2_562915447 (20995) |
| 88 #define FIX_3_072711026 (25172) |
| 89 |
| 90 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) |
| 91 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) |
| 92 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) |
| 93 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) |
| 94 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) |
| 95 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) |
| 96 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) |
| 97 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) |
| 98 |
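/* The FIX_* values above follow libjpeg's usual 13-bit fixed-point
 * convention from jidctint.c (CONST_BITS = 13), i.e. FIX(x) = round(x * 2^13).
 * A minimal sketch of that convention, assuming the standard CONST_BITS:
 *
 *   #define CONST_BITS 13
 *   #define FIX(x) ((INT32) ((x) * (((INT32) 1) << CONST_BITS) + 0.5))
 *
 * e.g. FIX(0.298631336) = (INT32) (0.298631336 * 8192 + 0.5) = 2446.
 */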
| 99 /* |
| 100 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. |
| 101 * Uses some ideas from the comments in 'simd/jiss2int-64.asm' |
| 102 */ |
| 103 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ |
| 104 { \ |
| 105 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ |
| 106 INT32 q1, q2, q3, q4, q5, q6, q7; \ |
| 107 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ |
| 108 \ |
| 109 /* 1-D iDCT input data */ \ |
| 110 row0 = xrow0; \ |
| 111 row1 = xrow1; \ |
| 112 row2 = xrow2; \ |
| 113 row3 = xrow3; \ |
| 114 row4 = xrow4; \ |
| 115 row5 = xrow5; \ |
| 116 row6 = xrow6; \ |
| 117 row7 = xrow7; \ |
| 118 \ |
| 119 q5 = row7 + row3; \ |
| 120 q4 = row5 + row1; \ |
| 121 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ |
| 122 MULTIPLY(q4, FIX_1_175875602); \ |
| 123 q7 = MULTIPLY(q5, FIX_1_175875602) + \ |
| 124 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ |
| 125 q2 = MULTIPLY(row2, FIX_0_541196100) + \ |
| 126 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ |
| 127 q4 = q6; \ |
| 128 q3 = ((INT32) row0 - (INT32) row4) << 13; \ |
| 129 q6 += MULTIPLY(row5, -FIX_2_562915447) + \ |
| 130 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ |
| 131 /* now we can use q1 (reloadable constants have been used up) */ \ |
| 132 q1 = q3 + q2; \ |
| 133 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ |
| 134 MULTIPLY(row1, -FIX_0_899976223); \ |
| 135 q5 = q7; \ |
| 136 q1 = q1 + q6; \ |
| 137 q7 += MULTIPLY(row7, -FIX_0_899976223) + \ |
| 138 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ |
| 139 \ |
| 140 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ |
| 141 tmp11_plus_tmp2 = q1; \ |
| 142 row1 = 0; \ |
| 143 \ |
| 144 q1 = q1 - q6; \ |
| 145 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ |
| 146 MULTIPLY(row3, -FIX_2_562915447); \ |
| 147 q1 = q1 - q6; \ |
| 148 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ |
| 149 MULTIPLY(row6, FIX_0_541196100); \ |
| 150 q3 = q3 - q2; \ |
| 151 \ |
| 152 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ |
| 153 tmp11_minus_tmp2 = q1; \ |
| 154 \ |
| 155 q1 = ((INT32) row0 + (INT32) row4) << 13; \ |
| 156 q2 = q1 + q6; \ |
| 157 q1 = q1 - q6; \ |
| 158 \ |
| 159 /* pick up the results */ \ |
| 160 tmp0 = q4; \ |
| 161 tmp1 = q5; \ |
| 162 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ |
| 163 tmp3 = q7; \ |
| 164 tmp10 = q2; \ |
| 165 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ |
| 166 tmp12 = q3; \ |
| 167 tmp13 = q1; \ |
| 168 } |
| 169 |
| 170 #define XFIX_0_899976223 d0[0] |
| 171 #define XFIX_0_541196100 d0[1] |
| 172 #define XFIX_2_562915447 d0[2] |
| 173 #define XFIX_0_298631336_MINUS_0_899976223 d0[3] |
| 174 #define XFIX_1_501321110_MINUS_0_899976223 d1[0] |
| 175 #define XFIX_2_053119869_MINUS_2_562915447 d1[1] |
| 176 #define XFIX_0_541196100_PLUS_0_765366865 d1[2] |
| 177 #define XFIX_1_175875602 d1[3] |
| 178 #define XFIX_1_175875602_MINUS_0_390180644 d2[0] |
| 179 #define XFIX_0_541196100_MINUS_1_847759065 d2[1] |
| 180 #define XFIX_3_072711026_MINUS_2_562915447 d2[2] |
| 181 #define XFIX_1_175875602_MINUS_1_961570560 d2[3] |
| 182 |
| 183 .balign 16 |
| 184 jsimd_idct_islow_neon_consts: |
| 185 .short FIX_0_899976223 /* d0[0] */ |
| 186 .short FIX_0_541196100 /* d0[1] */ |
| 187 .short FIX_2_562915447 /* d0[2] */ |
| 188 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ |
| 189 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ |
| 190 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ |
| 191 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ |
| 192 .short FIX_1_175875602 /* d1[3] */ |
| 193 /* reloadable constants */ |
| 194 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ |
| 195 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ |
| 196 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ |
| 197 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ |
| 198 |
| 199 asm_function jsimd_idct_islow_neon |
| 200 |
| 201 DCT_TABLE .req r0 |
| 202 COEF_BLOCK .req r1 |
| 203 OUTPUT_BUF .req r2 |
| 204 OUTPUT_COL .req r3 |
| 205 TMP1 .req r0 |
| 206 TMP2 .req r1 |
| 207 TMP3 .req r2 |
| 208 TMP4 .req ip |
| 209 |
| 210 ROW0L .req d16 |
| 211 ROW0R .req d17 |
| 212 ROW1L .req d18 |
| 213 ROW1R .req d19 |
| 214 ROW2L .req d20 |
| 215 ROW2R .req d21 |
| 216 ROW3L .req d22 |
| 217 ROW3R .req d23 |
| 218 ROW4L .req d24 |
| 219 ROW4R .req d25 |
| 220 ROW5L .req d26 |
| 221 ROW5R .req d27 |
| 222 ROW6L .req d28 |
| 223 ROW6R .req d29 |
| 224 ROW7L .req d30 |
| 225 ROW7R .req d31 |
| 226 |
| 227 /* Load and dequantize coefficients into NEON registers |
| 228 * with the following allocation: |
| 229 * 0 1 2 3 | 4 5 6 7 |
| 230 * ---------+-------- |
| 231 * 0 | d16 | d17 ( q8 ) |
| 232 * 1 | d18 | d19 ( q9 ) |
| 233 * 2 | d20 | d21 ( q10 ) |
| 234 * 3 | d22 | d23 ( q11 ) |
| 235 * 4 | d24 | d25 ( q12 ) |
| 236 * 5 | d26 | d27 ( q13 ) |
| 237 * 6 | d28 | d29 ( q14 ) |
| 238 * 7 | d30 | d31 ( q15 ) |
| 239 */ |
| 240 adr ip, jsimd_idct_islow_neon_consts |
| 241 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! |
| 242 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
| 243 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! |
| 244 vmul.s16 q8, q8, q0 |
| 245 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
| 246 vmul.s16 q9, q9, q1 |
| 247 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! |
| 248 vmul.s16 q10, q10, q2 |
| 249 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
| 250 vmul.s16 q11, q11, q3 |
| 251 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] |
| 252 vmul.s16 q12, q12, q0 |
| 253 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
| 254 vmul.s16 q14, q14, q2 |
| 255 vmul.s16 q13, q13, q1 |
| 256 vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ |
| 257 add ip, ip, #16 |
| 258 vmul.s16 q15, q15, q3 |
| 259 vpush {d8-d15} /* save NEON registers */ |
| 260 /* 1-D IDCT, pass 1, left 4x8 half */ |
| 261 vadd.s16 d4, ROW7L, ROW3L |
| 262 vadd.s16 d5, ROW5L, ROW1L |
| 263 vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 |
| 264 vmlal.s16 q6, d5, XFIX_1_175875602 |
| 265 vmull.s16 q7, d4, XFIX_1_175875602 |
| 266 /* Check for the zero coefficients in the right 4x8 half */ |
| 267 push {r4, r5} |
| 268 vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 |
| 269 vsubl.s16 q3, ROW0L, ROW4L |
| 270 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] |
| 271 vmull.s16 q2, ROW2L, XFIX_0_541196100 |
| 272 vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 |
| 273 orr r0, r4, r5 |
| 274 vmov q4, q6 |
| 275 vmlsl.s16 q6, ROW5L, XFIX_2_562915447 |
| 276 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] |
| 277 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
| 278 vshl.s32 q3, q3, #13 |
| 279 orr r0, r0, r4 |
| 280 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
| 281 orr r0, r0, r5 |
| 282 vadd.s32 q1, q3, q2 |
| 283 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] |
| 284 vmov q5, q7 |
| 285 vadd.s32 q1, q1, q6 |
| 286 orr r0, r0, r4 |
| 287 vmlsl.s16 q7, ROW7L, XFIX_0_899976223 |
| 288 orr r0, r0, r5 |
| 289 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
| 290 vrshrn.s32 ROW1L, q1, #11 |
| 291 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] |
| 292 vsub.s32 q1, q1, q6 |
| 293 vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 |
| 294 orr r0, r0, r4 |
| 295 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
| 296 orr r0, r0, r5 |
| 297 vsub.s32 q1, q1, q6 |
| 298 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
| 299 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] |
| 300 vmlal.s16 q6, ROW6L, XFIX_0_541196100 |
| 301 vsub.s32 q3, q3, q2 |
| 302 orr r0, r0, r4 |
| 303 vrshrn.s32 ROW6L, q1, #11 |
| 304 orr r0, r0, r5 |
| 305 vadd.s32 q1, q3, q5 |
| 306 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] |
| 307 vsub.s32 q3, q3, q5 |
| 308 vaddl.s16 q5, ROW0L, ROW4L |
| 309 orr r0, r0, r4 |
| 310 vrshrn.s32 ROW2L, q1, #11 |
| 311 orr r0, r0, r5 |
| 312 vrshrn.s32 ROW5L, q3, #11 |
| 313 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] |
| 314 vshl.s32 q5, q5, #13 |
| 315 vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 |
| 316 orr r0, r0, r4 |
| 317 vadd.s32 q2, q5, q6 |
| 318 orrs r0, r0, r5 |
| 319 vsub.s32 q1, q5, q6 |
| 320 vadd.s32 q6, q2, q7 |
| 321 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] |
| 322 vsub.s32 q2, q2, q7 |
| 323 vadd.s32 q5, q1, q4 |
| 324 orr r0, r4, r5 |
| 325 vsub.s32 q3, q1, q4 |
| 326 pop {r4, r5} |
| 327 vrshrn.s32 ROW7L, q2, #11 |
| 328 vrshrn.s32 ROW3L, q5, #11 |
| 329 vrshrn.s32 ROW0L, q6, #11 |
| 330 vrshrn.s32 ROW4L, q3, #11 |
| 331 |
| 332 beq 3f /* Go to do some special handling for the sparse right 4x8 half */ |
| 333 |
| 334 /* 1-D IDCT, pass 1, right 4x8 half */ |
| 335 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 336 vadd.s16 d10, ROW7R, ROW3R |
| 337 vadd.s16 d8, ROW5R, ROW1R |
| 338 /* Transpose left 4x8 half */ |
| 339 vtrn.16 ROW6L, ROW7L |
| 340 vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 |
| 341 vmlal.s16 q6, d8, XFIX_1_175875602 |
| 342 vtrn.16 ROW2L, ROW3L |
| 343 vmull.s16 q7, d10, XFIX_1_175875602 |
| 344 vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 |
| 345 vtrn.16 ROW0L, ROW1L |
| 346 vsubl.s16 q3, ROW0R, ROW4R |
| 347 vmull.s16 q2, ROW2R, XFIX_0_541196100 |
| 348 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 |
| 349 vtrn.16 ROW4L, ROW5L |
| 350 vmov q4, q6 |
| 351 vmlsl.s16 q6, ROW5R, XFIX_2_562915447 |
| 352 vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 |
| 353 vtrn.32 ROW1L, ROW3L |
| 354 vshl.s32 q3, q3, #13 |
| 355 vmlsl.s16 q4, ROW1R, XFIX_0_899976223 |
| 356 vtrn.32 ROW4L, ROW6L |
| 357 vadd.s32 q1, q3, q2 |
| 358 vmov q5, q7 |
| 359 vadd.s32 q1, q1, q6 |
| 360 vtrn.32 ROW0L, ROW2L |
| 361 vmlsl.s16 q7, ROW7R, XFIX_0_899976223 |
| 362 vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 |
| 363 vrshrn.s32 ROW1R, q1, #11 |
| 364 vtrn.32 ROW5L, ROW7L |
| 365 vsub.s32 q1, q1, q6 |
| 366 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 |
| 367 vmlsl.s16 q5, ROW3R, XFIX_2_562915447 |
| 368 vsub.s32 q1, q1, q6 |
| 369 vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 |
| 370 vmlal.s16 q6, ROW6R, XFIX_0_541196100 |
| 371 vsub.s32 q3, q3, q2 |
| 372 vrshrn.s32 ROW6R, q1, #11 |
| 373 vadd.s32 q1, q3, q5 |
| 374 vsub.s32 q3, q3, q5 |
| 375 vaddl.s16 q5, ROW0R, ROW4R |
| 376 vrshrn.s32 ROW2R, q1, #11 |
| 377 vrshrn.s32 ROW5R, q3, #11 |
| 378 vshl.s32 q5, q5, #13 |
| 379 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 |
| 380 vadd.s32 q2, q5, q6 |
| 381 vsub.s32 q1, q5, q6 |
| 382 vadd.s32 q6, q2, q7 |
| 383 vsub.s32 q2, q2, q7 |
| 384 vadd.s32 q5, q1, q4 |
| 385 vsub.s32 q3, q1, q4 |
| 386 vrshrn.s32 ROW7R, q2, #11 |
| 387 vrshrn.s32 ROW3R, q5, #11 |
| 388 vrshrn.s32 ROW0R, q6, #11 |
| 389 vrshrn.s32 ROW4R, q3, #11 |
| 390 /* Transpose right 4x8 half */ |
| 391 vtrn.16 ROW6R, ROW7R |
| 392 vtrn.16 ROW2R, ROW3R |
| 393 vtrn.16 ROW0R, ROW1R |
| 394 vtrn.16 ROW4R, ROW5R |
| 395 vtrn.32 ROW1R, ROW3R |
| 396 vtrn.32 ROW4R, ROW6R |
| 397 vtrn.32 ROW0R, ROW2R |
| 398 vtrn.32 ROW5R, ROW7R |
| 399 |
| 400 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ |
| 401 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 402 vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ |
| 403 vmlal.s16 q6, ROW1L, XFIX_1_175875602 |
| 404 vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ |
| 405 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 |
| 406 vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ |
| 407 vmlal.s16 q7, ROW3L, XFIX_1_175875602 |
| 408 vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ |
| 409 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 |
| 410 vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ |
| 411 vmull.s16 q2, ROW2L, XFIX_0_541196100 |
| 412 vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ |
| 413 vmov q4, q6 |
| 414 vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ |
| 415 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
| 416 vshl.s32 q3, q3, #13 |
| 417 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
| 418 vadd.s32 q1, q3, q2 |
| 419 vmov q5, q7 |
| 420 vadd.s32 q1, q1, q6 |
| 421 vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ |
| 422 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
| 423 vshrn.s32 ROW1L, q1, #16 |
| 424 vsub.s32 q1, q1, q6 |
| 425 vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ |
| 426 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
| 427 vsub.s32 q1, q1, q6 |
| 428 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
| 429 vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ |
| 430 vsub.s32 q3, q3, q2 |
| 431 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ |
| 432 vadd.s32 q1, q3, q5 |
| 433 vsub.s32 q3, q3, q5 |
| 434 vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ |
| 435 vshrn.s32 ROW2L, q1, #16 |
| 436 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ |
| 437 vshl.s32 q5, q5, #13 |
| 438 vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ |
| 439 vadd.s32 q2, q5, q6 |
| 440 vsub.s32 q1, q5, q6 |
| 441 vadd.s32 q6, q2, q7 |
| 442 vsub.s32 q2, q2, q7 |
| 443 vadd.s32 q5, q1, q4 |
| 444 vsub.s32 q3, q1, q4 |
| 445 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ |
| 446 vshrn.s32 ROW3L, q5, #16 |
| 447 vshrn.s32 ROW0L, q6, #16 |
| 448 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ |
| 449 /* 1-D IDCT, pass 2, right 4x8 half */ |
| 450 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 451 vmull.s16 q6, ROW5R, XFIX_1_175875602 |
| 452 vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ |
| 453 vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 |
| 454 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ |
| 455 vmull.s16 q7, ROW7R, XFIX_1_175875602 |
| 456 vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ |
| 457 vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 |
| 458 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ |
| 459 vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ |
| 460 vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ |
| 461 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 |
| 462 vmov q4, q6 |
| 463 vmlsl.s16 q6, ROW5R, XFIX_2_562915447 |
| 464 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ |
| 465 vshl.s32 q3, q3, #13 |
| 466 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ |
| 467 vadd.s32 q1, q3, q2 |
| 468 vmov q5, q7 |
| 469 vadd.s32 q1, q1, q6 |
| 470 vmlsl.s16 q7, ROW7R, XFIX_0_899976223 |
| 471 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ |
| 472 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ |
| 473 vsub.s32 q1, q1, q6 |
| 474 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 |
| 475 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ |
| 476 vsub.s32 q1, q1, q6 |
| 477 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ |
| 478 vmlal.s16 q6, ROW6R, XFIX_0_541196100 |
| 479 vsub.s32 q3, q3, q2 |
| 480 vshrn.s32 ROW6R, q1, #16 |
| 481 vadd.s32 q1, q3, q5 |
| 482 vsub.s32 q3, q3, q5 |
| 483 vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ |
| 484 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ |
| 485 vshrn.s32 ROW5R, q3, #16 |
| 486 vshl.s32 q5, q5, #13 |
| 487 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 |
| 488 vadd.s32 q2, q5, q6 |
| 489 vsub.s32 q1, q5, q6 |
| 490 vadd.s32 q6, q2, q7 |
| 491 vsub.s32 q2, q2, q7 |
| 492 vadd.s32 q5, q1, q4 |
| 493 vsub.s32 q3, q1, q4 |
| 494 vshrn.s32 ROW7R, q2, #16 |
| 495 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ |
| 496 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ |
| 497 vshrn.s32 ROW4R, q3, #16 |
| 498 |
| 499 2: /* Descale to 8-bit and range limit */ |
| 500 vqrshrn.s16 d16, q8, #2 |
| 501 vqrshrn.s16 d17, q9, #2 |
| 502 vqrshrn.s16 d18, q10, #2 |
| 503 vqrshrn.s16 d19, q11, #2 |
| 504 vpop {d8-d15} /* restore NEON registers */ |
| 505 vqrshrn.s16 d20, q12, #2 |
| 506 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ |
| 507 vtrn.16 q8, q9 |
| 508 vqrshrn.s16 d21, q13, #2 |
| 509 vqrshrn.s16 d22, q14, #2 |
| 510 vmov.u8 q0, #(CENTERJSAMPLE) |
| 511 vqrshrn.s16 d23, q15, #2 |
| 512 vtrn.8 d16, d17 |
| 513 vtrn.8 d18, d19 |
| 514 vadd.u8 q8, q8, q0 |
| 515 vadd.u8 q9, q9, q0 |
| 516 vtrn.16 q10, q11 |
| 517 /* Store results to the output buffer */ |
| 518 ldmia OUTPUT_BUF!, {TMP1, TMP2} |
| 519 add TMP1, TMP1, OUTPUT_COL |
| 520 add TMP2, TMP2, OUTPUT_COL |
| 521 vst1.8 {d16}, [TMP1] |
| 522 vtrn.8 d20, d21 |
| 523 vst1.8 {d17}, [TMP2] |
| 524 ldmia OUTPUT_BUF!, {TMP1, TMP2} |
| 525 add TMP1, TMP1, OUTPUT_COL |
| 526 add TMP2, TMP2, OUTPUT_COL |
| 527 vst1.8 {d18}, [TMP1] |
| 528 vadd.u8 q10, q10, q0 |
| 529 vst1.8 {d19}, [TMP2] |
| 530 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} |
| 531 add TMP1, TMP1, OUTPUT_COL |
| 532 add TMP2, TMP2, OUTPUT_COL |
| 533 add TMP3, TMP3, OUTPUT_COL |
| 534 add TMP4, TMP4, OUTPUT_COL |
| 535 vtrn.8 d22, d23 |
| 536 vst1.8 {d20}, [TMP1] |
| 537 vadd.u8 q11, q11, q0 |
| 538 vst1.8 {d21}, [TMP2] |
| 539 vst1.8 {d22}, [TMP3] |
| 540 vst1.8 {d23}, [TMP4] |
| 541 bx lr |
| 542 |
| 543 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ |
| 544 |
| 545 /* Transpose left 4x8 half */ |
| 546 vtrn.16 ROW6L, ROW7L |
| 547 vtrn.16 ROW2L, ROW3L |
| 548 vtrn.16 ROW0L, ROW1L |
| 549 vtrn.16 ROW4L, ROW5L |
| 550 vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ |
| 551 vtrn.32 ROW1L, ROW3L |
| 552 vtrn.32 ROW4L, ROW6L |
| 553 vtrn.32 ROW0L, ROW2L |
| 554 vtrn.32 ROW5L, ROW7L |
| 555 |
| 556 cmp r0, #0 |
beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ |
| 558 |
| 559 /* Only row 0 is non-zero for the right 4x8 half */ |
| 560 vdup.s16 ROW1R, ROW0R[1] |
| 561 vdup.s16 ROW2R, ROW0R[2] |
| 562 vdup.s16 ROW3R, ROW0R[3] |
| 563 vdup.s16 ROW4R, ROW0R[0] |
| 564 vdup.s16 ROW5R, ROW0R[1] |
| 565 vdup.s16 ROW6R, ROW0R[2] |
| 566 vdup.s16 ROW7R, ROW0R[3] |
| 567 vdup.s16 ROW0R, ROW0R[0] |
| 568 b 1b /* Go to 'normal' second pass */ |
| 569 |
| 570 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ |
| 571 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 572 vmull.s16 q6, ROW1L, XFIX_1_175875602 |
| 573 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 |
| 574 vmull.s16 q7, ROW3L, XFIX_1_175875602 |
| 575 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 |
| 576 vmull.s16 q2, ROW2L, XFIX_0_541196100 |
| 577 vshll.s16 q3, ROW0L, #13 |
| 578 vmov q4, q6 |
| 579 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 |
| 580 vmlsl.s16 q4, ROW1L, XFIX_0_899976223 |
| 581 vadd.s32 q1, q3, q2 |
| 582 vmov q5, q7 |
| 583 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 |
| 584 vadd.s32 q1, q1, q6 |
| 585 vadd.s32 q6, q6, q6 |
| 586 vmlsl.s16 q5, ROW3L, XFIX_2_562915447 |
| 587 vshrn.s32 ROW1L, q1, #16 |
| 588 vsub.s32 q1, q1, q6 |
| 589 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 |
| 590 vsub.s32 q3, q3, q2 |
| 591 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ |
| 592 vadd.s32 q1, q3, q5 |
| 593 vsub.s32 q3, q3, q5 |
| 594 vshll.s16 q5, ROW0L, #13 |
| 595 vshrn.s32 ROW2L, q1, #16 |
| 596 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ |
| 597 vadd.s32 q2, q5, q6 |
| 598 vsub.s32 q1, q5, q6 |
| 599 vadd.s32 q6, q2, q7 |
| 600 vsub.s32 q2, q2, q7 |
| 601 vadd.s32 q5, q1, q4 |
| 602 vsub.s32 q3, q1, q4 |
| 603 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ |
| 604 vshrn.s32 ROW3L, q5, #16 |
| 605 vshrn.s32 ROW0L, q6, #16 |
| 606 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ |
| 607 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ |
| 608 vld1.s16 {d2}, [ip, :64] /* reload constants */ |
| 609 vmull.s16 q6, ROW5L, XFIX_1_175875602 |
| 610 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 |
| 611 vmull.s16 q7, ROW7L, XFIX_1_175875602 |
| 612 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 |
| 613 vmull.s16 q2, ROW6L, XFIX_0_541196100 |
| 614 vshll.s16 q3, ROW4L, #13 |
| 615 vmov q4, q6 |
| 616 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 |
| 617 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 |
| 618 vadd.s32 q1, q3, q2 |
| 619 vmov q5, q7 |
| 620 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 |
| 621 vadd.s32 q1, q1, q6 |
| 622 vadd.s32 q6, q6, q6 |
| 623 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 |
| 624 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ |
| 625 vsub.s32 q1, q1, q6 |
| 626 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 |
| 627 vsub.s32 q3, q3, q2 |
| 628 vshrn.s32 ROW6R, q1, #16 |
| 629 vadd.s32 q1, q3, q5 |
| 630 vsub.s32 q3, q3, q5 |
| 631 vshll.s16 q5, ROW4L, #13 |
| 632 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ |
| 633 vshrn.s32 ROW5R, q3, #16 |
| 634 vadd.s32 q2, q5, q6 |
| 635 vsub.s32 q1, q5, q6 |
| 636 vadd.s32 q6, q2, q7 |
| 637 vsub.s32 q2, q2, q7 |
| 638 vadd.s32 q5, q1, q4 |
| 639 vsub.s32 q3, q1, q4 |
| 640 vshrn.s32 ROW7R, q2, #16 |
| 641 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ |
| 642 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ |
| 643 vshrn.s32 ROW4R, q3, #16 |
| 644 b 2b /* Go to epilogue */ |
| 645 |
| 646 .unreq DCT_TABLE |
| 647 .unreq COEF_BLOCK |
| 648 .unreq OUTPUT_BUF |
| 649 .unreq OUTPUT_COL |
| 650 .unreq TMP1 |
| 651 .unreq TMP2 |
| 652 .unreq TMP3 |
| 653 .unreq TMP4 |
| 654 |
| 655 .unreq ROW0L |
| 656 .unreq ROW0R |
| 657 .unreq ROW1L |
| 658 .unreq ROW1R |
| 659 .unreq ROW2L |
| 660 .unreq ROW2R |
| 661 .unreq ROW3L |
| 662 .unreq ROW3R |
| 663 .unreq ROW4L |
| 664 .unreq ROW4R |
| 665 .unreq ROW5L |
| 666 .unreq ROW5R |
| 667 .unreq ROW6L |
| 668 .unreq ROW6R |
| 669 .unreq ROW7L |
| 670 .unreq ROW7R |
| 671 .endfunc |
| 672 |
65 /*****************************************************************************/ | 673 /*****************************************************************************/ |
66 | 674 |
67 /* | 675 /* |
68 * jsimd_idct_ifast_neon | 676 * jsimd_idct_ifast_neon |
69 * | 677 * |
70 * This function contains a fast, not so accurate integer implementation of | 678 * This function contains a fast, not so accurate integer implementation of |
71 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | 679 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
72 * and produces exactly the same output as IJG's original 'jpeg_idct_fast' | 680 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
73 * function from jidctfst.c | 681 * function from jidctfst.c |
74 * | 682 * |
75 * TODO: a bit better instructions scheduling is needed. | 683 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. |
| 684 * But in the ARM NEON case some extra additions are required, because the |
| 685 * VQDMULH instruction can't handle constants larger than 1. So expressions |
| 686 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", |
| 687 * which introduces an extra addition. Overall, there are 6 extra additions |
| 688 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions. |
76 */ | 689 */ |
77 | 690 |
78 #define XFIX_1_082392200 d0[0] | 691 #define XFIX_1_082392200 d0[0] |
79 #define XFIX_1_414213562 d0[1] | 692 #define XFIX_1_414213562 d0[1] |
80 #define XFIX_1_847759065 d0[2] | 693 #define XFIX_1_847759065 d0[2] |
81 #define XFIX_2_613125930 d0[3] | 694 #define XFIX_2_613125930 d0[3] |
82 | 695 |
83 .balign 16 | 696 .balign 16 |
84 jsimd_idct_ifast_neon_consts: | 697 jsimd_idct_ifast_neon_consts: |
85 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ | 698 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
86 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ | 699 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
87 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ | 700 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
88 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ | 701 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
89 | 702 |
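/* How the Q15 constants above are applied: VQDMULH.S16 computes roughly
 * (a * b * 2) >> 16 with saturation, so a multiplier must be < 1 and only
 * its fractional part is stored (hence the "- 256 * 128" terms).
 * A scalar sketch, ignoring saturation:
 *
 *   static int16_t vqdmulh_s16(int16_t a, int16_t b)
 *   {
 *       return (int16_t) (((int32_t) a * b * 2) >> 16);
 *   }
 *
 *   int16_t mul_1_082392200(int16_t x)            // x * 1.082392200
 *   {
 *       const int16_t c = 277 * 128 - 256 * 128;  // ~0.082392200 in Q15
 *       return (int16_t) (x + vqdmulh_s16(x, c)); // x * 0.082392200 + x
 *   }
 */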
90 /* 1-D IDCT helper macro */ | |
91 | |
92 .macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \ | |
93 t10, t11, t12, t13, t14 | |
94 | |
95 vsub.s16 \t10, \x0, \x4 | |
96 vadd.s16 \x4, \x0, \x4 | |
97 vswp.s16 \t10, \x0 | |
98 vsub.s16 \t11, \x2, \x6 | |
99 vadd.s16 \x6, \x2, \x6 | |
100 vswp.s16 \t11, \x2 | |
101 vsub.s16 \t10, \x3, \x5 | |
102 vadd.s16 \x5, \x3, \x5 | |
103 vswp.s16 \t10, \x3 | |
104 vsub.s16 \t11, \x1, \x7 | |
105 vadd.s16 \x7, \x1, \x7 | |
106 vswp.s16 \t11, \x1 | |
107 | |
108 vqdmulh.s16 \t13, \x2, d0[1] | |
109 vadd.s16 \t12, \x3, \x3 | |
110 vadd.s16 \x2, \x2, \t13 | |
111 vqdmulh.s16 \t13, \x3, d0[3] | |
112 vsub.s16 \t10, \x1, \x3 | |
113 vadd.s16 \t12, \t12, \t13 | |
114 vqdmulh.s16 \t13, \t10, d0[2] | |
115 vsub.s16 \t11, \x7, \x5 | |
116 vadd.s16 \t10, \t10, \t13 | |
117 vqdmulh.s16 \t13, \t11, d0[1] | |
118 vadd.s16 \t11, \t11, \t13 | |
119 | |
120 vqdmulh.s16 \t13, \x1, d0[0] | |
121 vsub.s16 \x2, \x6, \x2 | |
122 vsub.s16 \t14, \x0, \x2 | |
123 vadd.s16 \x2, \x0, \x2 | |
124 vadd.s16 \x0, \x4, \x6 | |
125 vsub.s16 \x4, \x4, \x6 | |
126 vadd.s16 \x1, \x1, \t13 | |
127 vadd.s16 \t13, \x7, \x5 | |
128 vsub.s16 \t12, \t13, \t12 | |
129 vsub.s16 \t12, \t12, \t10 | |
130 vadd.s16 \t11, \t12, \t11 | |
131 vsub.s16 \t10, \x1, \t10 | |
132 vadd.s16 \t10, \t10, \t11 | |
133 | |
134 vsub.s16 \x7, \x0, \t13 | |
135 vadd.s16 \x0, \x0, \t13 | |
136 vadd.s16 \x6, \t14, \t12 | |
137 vsub.s16 \x1, \t14, \t12 | |
138 vsub.s16 \x5, \x2, \t11 | |
139 vadd.s16 \x2, \x2, \t11 | |
140 vsub.s16 \x3, \x4, \t10 | |
141 vadd.s16 \x4, \x4, \t10 | |
142 .endm | |
143 | |
144 asm_function jsimd_idct_ifast_neon | 703 asm_function jsimd_idct_ifast_neon |
145 | 704 |
146 DCT_TABLE .req r0 | 705 DCT_TABLE .req r0 |
147 COEF_BLOCK .req r1 | 706 COEF_BLOCK .req r1 |
148 OUTPUT_BUF .req r2 | 707 OUTPUT_BUF .req r2 |
149 OUTPUT_COL .req r3 | 708 OUTPUT_COL .req r3 |
150 TMP .req ip | 709 TMP1 .req r0 |
151 | 710 TMP2 .req r1 |
152 vpush {d8-d15} | 711 TMP3 .req r2 |
153 | 712 TMP4 .req ip |
154 /* Load constants */ | 713 |
155 adr TMP, jsimd_idct_ifast_neon_consts | 714 /* Load and dequantize coefficients into NEON registers |
156 vld1.16 {d0}, [TMP, :64] | 715 * with the following allocation: |
157 | |
158 /* Load all COEF_BLOCK into NEON registers with the following allocation: | |
159 * 0 1 2 3 | 4 5 6 7 | 716 * 0 1 2 3 | 4 5 6 7 |
160 * ---------+-------- | 717 * ---------+-------- |
161 * 0 | d4 | d5 | 718 * 0 | d16 | d17 ( q8 ) |
162 * 1 | d6 | d7 | 719 * 1 | d18 | d19 ( q9 ) |
163 * 2 | d8 | d9 | 720 * 2 | d20 | d21 ( q10 ) |
164 * 3 | d10 | d11 | 721 * 3 | d22 | d23 ( q11 ) |
165 * 4 | d12 | d13 | 722 * 4 | d24 | d25 ( q12 ) |
166 * 5 | d14 | d15 | 723 * 5 | d26 | d27 ( q13 ) |
167 * 6 | d16 | d17 | 724 * 6 | d28 | d29 ( q14 ) |
168 * 7 | d18 | d19 | 725 * 7 | d30 | d31 ( q15 ) |
169 */ | 726 */ |
170 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]! | 727 adr ip, jsimd_idct_ifast_neon_consts |
171 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]! | 728 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! |
172 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]! | 729 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
173 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]! | 730 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! |
174 /* Dequantize */ | 731 vmul.s16 q8, q8, q0 |
175 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! | 732 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
176 vmul.s16 q2, q2, q10 | 733 vmul.s16 q9, q9, q1 |
177 vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]! | 734 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! |
178 vmul.s16 q3, q3, q11 | 735 vmul.s16 q10, q10, q2 |
179 vmul.s16 q4, q4, q12 | 736 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! |
180 vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]! | 737 vmul.s16 q11, q11, q3 |
181 vmul.s16 q5, q5, q13 | 738 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] |
182 vmul.s16 q6, q6, q14 | 739 vmul.s16 q12, q12, q0 |
183 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! | 740 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! |
184 vmul.s16 q7, q7, q15 | 741 vmul.s16 q14, q14, q2 |
185 vmul.s16 q8, q8, q10 | 742 vmul.s16 q13, q13, q1 |
186 vmul.s16 q9, q9, q11 | 743 vld1.16 {d0}, [ip, :64] /* load constants */ |
187 | 744 vmul.s16 q15, q15, q3 |
188 /* Pass 1 */ | 745 vpush {d8-d13} /* save NEON registers */ |
189 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 | 746 /* 1-D IDCT, pass 1 */ |
190 /* Transpose */ | 747 vsub.s16 q2, q10, q14 |
191 transpose_4x4 d4, d6, d8, d10 | 748 vadd.s16 q14, q10, q14 |
192 transpose_4x4 d5, d7, d9, d11 | 749 vsub.s16 q1, q11, q13 |
193 transpose_4x4 d12, d14, d16, d18 | 750 vadd.s16 q13, q11, q13 |
194 transpose_4x4 d13, d15, d17, d19 | 751 vsub.s16 q5, q9, q15 |
195 vswp d12, d5 | 752 vadd.s16 q15, q9, q15 |
196 vswp d14, d7 | 753 vqdmulh.s16 q4, q2, XFIX_1_414213562 |
197 vswp d16, d9 | 754 vqdmulh.s16 q6, q1, XFIX_2_613125930 |
198 vswp d18, d11 | 755 vadd.s16 q3, q1, q1 |
199 | 756 vsub.s16 q1, q5, q1 |
200 /* Pass 2 */ | 757 vadd.s16 q10, q2, q4 |
201 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 | 758 vqdmulh.s16 q4, q1, XFIX_1_847759065 |
202 /* Transpose */ | 759 vsub.s16 q2, q15, q13 |
203 transpose_4x4 d4, d6, d8, d10 | 760 vadd.s16 q3, q3, q6 |
204 transpose_4x4 d5, d7, d9, d11 | 761 vqdmulh.s16 q6, q2, XFIX_1_414213562 |
205 transpose_4x4 d12, d14, d16, d18 | 762 vadd.s16 q1, q1, q4 |
206 transpose_4x4 d13, d15, d17, d19 | 763 vqdmulh.s16 q4, q5, XFIX_1_082392200 |
207 vswp d12, d5 | 764 vsub.s16 q10, q10, q14 |
208 vswp d14, d7 | 765 vadd.s16 q2, q2, q6 |
209 vswp d16, d9 | 766 vsub.s16 q6, q8, q12 |
210 vswp d18, d11 | 767 vadd.s16 q12, q8, q12 |
211 | 768 vadd.s16 q9, q5, q4 |
212 /* Descale and range limit */ | 769 vadd.s16 q5, q6, q10 |
213 vmov.s16 q15, #(0x80 << 5) | 770 vsub.s16 q10, q6, q10 |
214 vqadd.s16 q2, q2, q15 | 771 vadd.s16 q6, q15, q13 |
215 vqadd.s16 q3, q3, q15 | 772 vadd.s16 q8, q12, q14 |
216 vqadd.s16 q4, q4, q15 | 773 vsub.s16 q3, q6, q3 |
217 vqadd.s16 q5, q5, q15 | 774 vsub.s16 q12, q12, q14 |
218 vqadd.s16 q6, q6, q15 | 775 vsub.s16 q3, q3, q1 |
219 vqadd.s16 q7, q7, q15 | 776 vsub.s16 q1, q9, q1 |
220 vqadd.s16 q8, q8, q15 | 777 vadd.s16 q2, q3, q2 |
221 vqadd.s16 q9, q9, q15 | 778 vsub.s16 q15, q8, q6 |
222 vqshrun.s16 d4, q2, #5 | 779 vadd.s16 q1, q1, q2 |
223 vqshrun.s16 d6, q3, #5 | 780 vadd.s16 q8, q8, q6 |
224 vqshrun.s16 d8, q4, #5 | 781 vadd.s16 q14, q5, q3 |
225 vqshrun.s16 d10, q5, #5 | 782 vsub.s16 q9, q5, q3 |
226 vqshrun.s16 d12, q6, #5 | 783 vsub.s16 q13, q10, q2 |
227 vqshrun.s16 d14, q7, #5 | 784 vadd.s16 q10, q10, q2 |
228 vqshrun.s16 d16, q8, #5 | 785 /* Transpose */ |
229 vqshrun.s16 d18, q9, #5 | 786 vtrn.16 q8, q9 |
230 | 787 vsub.s16 q11, q12, q1 |
231 /* Store results to the output buffer */ | 788 vtrn.16 q14, q15 |
232 .irp x, d4, d6, d8, d10, d12, d14, d16, d18 | 789 vadd.s16 q12, q12, q1 |
233 ldr TMP, [OUTPUT_BUF], #4 | 790 vtrn.16 q10, q11 |
234 add TMP, TMP, OUTPUT_COL | 791 vtrn.16 q12, q13 |
235 vst1.8 {\x}, [TMP]! | 792 vtrn.32 q9, q11 |
236 .endr | 793 vtrn.32 q12, q14 |
237 | 794 vtrn.32 q8, q10 |
238 vpop {d8-d15} | 795 vtrn.32 q13, q15 |
| 796 vswp d28, d21 |
| 797 vswp d26, d19 |
| 798 /* 1-D IDCT, pass 2 */ |
| 799 vsub.s16 q2, q10, q14 |
| 800 vswp d30, d23 |
| 801 vadd.s16 q14, q10, q14 |
| 802 vswp d24, d17 |
| 803 vsub.s16 q1, q11, q13 |
| 804 vadd.s16 q13, q11, q13 |
| 805 vsub.s16 q5, q9, q15 |
| 806 vadd.s16 q15, q9, q15 |
| 807 vqdmulh.s16 q4, q2, XFIX_1_414213562 |
| 808 vqdmulh.s16 q6, q1, XFIX_2_613125930 |
| 809 vadd.s16 q3, q1, q1 |
| 810 vsub.s16 q1, q5, q1 |
| 811 vadd.s16 q10, q2, q4 |
| 812 vqdmulh.s16 q4, q1, XFIX_1_847759065 |
| 813 vsub.s16 q2, q15, q13 |
| 814 vadd.s16 q3, q3, q6 |
| 815 vqdmulh.s16 q6, q2, XFIX_1_414213562 |
| 816 vadd.s16 q1, q1, q4 |
| 817 vqdmulh.s16 q4, q5, XFIX_1_082392200 |
| 818 vsub.s16 q10, q10, q14 |
| 819 vadd.s16 q2, q2, q6 |
| 820 vsub.s16 q6, q8, q12 |
| 821 vadd.s16 q12, q8, q12 |
| 822 vadd.s16 q9, q5, q4 |
| 823 vadd.s16 q5, q6, q10 |
| 824 vsub.s16 q10, q6, q10 |
| 825 vadd.s16 q6, q15, q13 |
| 826 vadd.s16 q8, q12, q14 |
| 827 vsub.s16 q3, q6, q3 |
| 828 vsub.s16 q12, q12, q14 |
| 829 vsub.s16 q3, q3, q1 |
| 830 vsub.s16 q1, q9, q1 |
| 831 vadd.s16 q2, q3, q2 |
| 832 vsub.s16 q15, q8, q6 |
| 833 vadd.s16 q1, q1, q2 |
| 834 vadd.s16 q8, q8, q6 |
| 835 vadd.s16 q14, q5, q3 |
| 836 vsub.s16 q9, q5, q3 |
| 837 vsub.s16 q13, q10, q2 |
| 838 vpop {d8-d13} /* restore NEON registers */ |
| 839 vadd.s16 q10, q10, q2 |
| 840 vsub.s16 q11, q12, q1 |
| 841 vadd.s16 q12, q12, q1 |
| 842 /* Descale to 8-bit and range limit */ |
| 843 vmov.u8 q0, #0x80 |
| 844 vqshrn.s16 d16, q8, #5 |
| 845 vqshrn.s16 d17, q9, #5 |
| 846 vqshrn.s16 d18, q10, #5 |
| 847 vqshrn.s16 d19, q11, #5 |
| 848 vqshrn.s16 d20, q12, #5 |
| 849 vqshrn.s16 d21, q13, #5 |
| 850 vqshrn.s16 d22, q14, #5 |
| 851 vqshrn.s16 d23, q15, #5 |
| 852 vadd.u8 q8, q8, q0 |
| 853 vadd.u8 q9, q9, q0 |
| 854 vadd.u8 q10, q10, q0 |
| 855 vadd.u8 q11, q11, q0 |
| 856 /* Transpose the final 8-bit samples */ |
| 857 vtrn.16 q8, q9 |
| 858 vtrn.16 q10, q11 |
| 859 vtrn.32 q8, q10 |
| 860 vtrn.32 q9, q11 |
| 861 vtrn.8 d16, d17 |
| 862 vtrn.8 d18, d19 |
| 863 /* Store results to the output buffer */ |
| 864 ldmia OUTPUT_BUF!, {TMP1, TMP2} |
| 865 add TMP1, TMP1, OUTPUT_COL |
| 866 add TMP2, TMP2, OUTPUT_COL |
| 867 vst1.8 {d16}, [TMP1] |
| 868 vst1.8 {d17}, [TMP2] |
| 869 ldmia OUTPUT_BUF!, {TMP1, TMP2} |
| 870 add TMP1, TMP1, OUTPUT_COL |
| 871 add TMP2, TMP2, OUTPUT_COL |
| 872 vst1.8 {d18}, [TMP1] |
| 873 vtrn.8 d20, d21 |
| 874 vst1.8 {d19}, [TMP2] |
| 875 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} |
| 876 add TMP1, TMP1, OUTPUT_COL |
| 877 add TMP2, TMP2, OUTPUT_COL |
| 878 add TMP3, TMP3, OUTPUT_COL |
| 879 add TMP4, TMP4, OUTPUT_COL |
| 880 vst1.8 {d20}, [TMP1] |
| 881 vtrn.8 d22, d23 |
| 882 vst1.8 {d21}, [TMP2] |
| 883 vst1.8 {d22}, [TMP3] |
| 884 vst1.8 {d23}, [TMP4] |
239 bx lr | 885 bx lr |
240 | 886 |
241 .unreq DCT_TABLE | 887 .unreq DCT_TABLE |
242 .unreq COEF_BLOCK | 888 .unreq COEF_BLOCK |
243 .unreq OUTPUT_BUF | 889 .unreq OUTPUT_BUF |
244 .unreq OUTPUT_COL | 890 .unreq OUTPUT_COL |
245 .unreq TMP | 891 .unreq TMP1 |
| 892 .unreq TMP2 |
| 893 .unreq TMP3 |
| 894 .unreq TMP4 |
246 .endfunc | 895 .endfunc |
247 | 896 |
248 .purgem idct_helper | |
249 | |
250 /*****************************************************************************/ | 897 /*****************************************************************************/ |
251 | 898 |
252 /* | 899 /* |
253 * jsimd_idct_4x4_neon | 900 * jsimd_idct_4x4_neon |
254 * | 901 * |
255 * This function contains inverse-DCT code for getting reduced-size | 902 * This function contains inverse-DCT code for getting reduced-size |
256 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations | 903 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations |
257 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | 904 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' |
258 * function from jpeg-6b (jidctred.c). | 905 * function from jpeg-6b (jidctred.c). |
259 * | 906 * |
(...skipping 364 matching lines...)
624 * jsimd_ycc_extbgrx_convert_neon | 1271 * jsimd_ycc_extbgrx_convert_neon |
625 * jsimd_ycc_extxbgr_convert_neon | 1272 * jsimd_ycc_extxbgr_convert_neon |
626 * jsimd_ycc_extxrgb_convert_neon | 1273 * jsimd_ycc_extxrgb_convert_neon |
627 * | 1274 * |
628 * Colorspace conversion YCbCr -> RGB | 1275 * Colorspace conversion YCbCr -> RGB |
629 */ | 1276 */ |
630 | 1277 |
631 | 1278 |
632 .macro do_load size | 1279 .macro do_load size |
633 .if \size == 8 | 1280 .if \size == 8 |
634 vld1.8 {d4}, [U]! | 1281 vld1.8 {d4}, [U, :64]! |
635 vld1.8 {d5}, [V]! | 1282 vld1.8 {d5}, [V, :64]! |
636 vld1.8 {d0}, [Y]! | 1283 vld1.8 {d0}, [Y, :64]! |
637 pld [Y, #64] | |
638 pld [U, #64] | 1284 pld [U, #64] |
639 pld [V, #64] | 1285 pld [V, #64] |
| 1286 pld [Y, #64] |
640 .elseif \size == 4 | 1287 .elseif \size == 4 |
641 vld1.8 {d4[0]}, [U]! | 1288 vld1.8 {d4[0]}, [U]! |
642 vld1.8 {d4[1]}, [U]! | 1289 vld1.8 {d4[1]}, [U]! |
643 vld1.8 {d4[2]}, [U]! | 1290 vld1.8 {d4[2]}, [U]! |
644 vld1.8 {d4[3]}, [U]! | 1291 vld1.8 {d4[3]}, [U]! |
645 vld1.8 {d5[0]}, [V]! | 1292 vld1.8 {d5[0]}, [V]! |
646 vld1.8 {d5[1]}, [V]! | 1293 vld1.8 {d5[1]}, [V]! |
647 vld1.8 {d5[2]}, [V]! | 1294 vld1.8 {d5[2]}, [V]! |
648 vld1.8 {d5[3]}, [V]! | 1295 vld1.8 {d5[3]}, [V]! |
649 vld1.8 {d0[0]}, [Y]! | 1296 vld1.8 {d0[0]}, [Y]! |
(...skipping 49 matching lines...)
699 .else | 1346 .else |
700 .error unsupported macroblock size | 1347 .error unsupported macroblock size |
701 .endif | 1348 .endif |
702 .else | 1349 .else |
703 .error unsupported bpp | 1350 .error unsupported bpp |
704 .endif | 1351 .endif |
705 .endm | 1352 .endm |
706 | 1353 |
707 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs | 1354 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
708 | 1355 |
709 .macro do_yuv_to_rgb | 1356 /* |
| 1357 * 2 stage pipelined YCbCr->RGB conversion |
| 1358 */ |
| 1359 |
| 1360 .macro do_yuv_to_rgb_stage1 |
710 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ | 1361 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
711 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ | 1362 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ |
712 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ | 1363 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
713 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ | 1364 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ |
714 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ | 1365 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ |
715 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ | 1366 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ |
716 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ | 1367 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ |
717 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ | 1368 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
718 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ | 1369 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
719 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ | 1370 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
| 1371 .endm |
| 1372 |
| 1373 .macro do_yuv_to_rgb_stage2 |
720 vrshrn.s32 d20, q10, #15 | 1374 vrshrn.s32 d20, q10, #15 |
721 vrshrn.s32 d21, q11, #15 | 1375 vrshrn.s32 d21, q11, #15 |
722 vrshrn.s32 d24, q12, #14 | 1376 vrshrn.s32 d24, q12, #14 |
723 vrshrn.s32 d25, q13, #14 | 1377 vrshrn.s32 d25, q13, #14 |
724 vrshrn.s32 d28, q14, #14 | 1378 vrshrn.s32 d28, q14, #14 |
725 vrshrn.s32 d29, q15, #14 | 1379 vrshrn.s32 d29, q15, #14 |
726 vaddw.u8 q10, q10, d0 | 1380 vaddw.u8 q10, q10, d0 |
727 vaddw.u8 q12, q12, d0 | 1381 vaddw.u8 q12, q12, d0 |
728 vaddw.u8 q14, q14, d0 | 1382 vaddw.u8 q14, q14, d0 |
729 vqmovun.s16 d1\g_offs, q10 | 1383 vqmovun.s16 d1\g_offs, q10 |
730 vqmovun.s16 d1\r_offs, q12 | 1384 vqmovun.s16 d1\r_offs, q12 |
731 vqmovun.s16 d1\b_offs, q14 | 1385 vqmovun.s16 d1\b_offs, q14 |
732 .endm | 1386 .endm |
733 | 1387 |
| 1388 .macro do_yuv_to_rgb_stage2_store_load_stage1 |
| 1389 vld1.8 {d4}, [U, :64]! |
| 1390 vrshrn.s32 d20, q10, #15 |
| 1391 vrshrn.s32 d21, q11, #15 |
| 1392 vrshrn.s32 d24, q12, #14 |
| 1393 vrshrn.s32 d25, q13, #14 |
| 1394 vrshrn.s32 d28, q14, #14 |
| 1395 vld1.8 {d5}, [V, :64]! |
| 1396 vrshrn.s32 d29, q15, #14 |
| 1397 vaddw.u8 q10, q10, d0 |
| 1398 vaddw.u8 q12, q12, d0 |
| 1399 vaddw.u8 q14, q14, d0 |
| 1400 vqmovun.s16 d1\g_offs, q10 |
| 1401 vld1.8 {d0}, [Y, :64]! |
| 1402 vqmovun.s16 d1\r_offs, q12 |
| 1403 pld [U, #64] |
| 1404 pld [V, #64] |
| 1405 pld [Y, #64] |
| 1406 vqmovun.s16 d1\b_offs, q14 |
| 1407 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
| 1408 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ |
| 1409 do_store \bpp, 8 |
| 1410 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
| 1411 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ |
| 1412 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ |
| 1413 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ |
| 1414 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ |
| 1415 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
| 1416 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
| 1417 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
| 1418 .endm |
| 1419 |
| 1420 .macro do_yuv_to_rgb |
| 1421 do_yuv_to_rgb_stage1 |
| 1422 do_yuv_to_rgb_stage2 |
| 1423 .endm |
| 1424 |
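/* The inner loop below is software pipelined using the stage macros above:
 * stage 2 of block N overlaps with the loads and stage 1 of block N+1, which
 * hides the VLD1/VMULL latencies. In pseudocode (a sketch of the control
 * flow, not the exact instruction interleaving):
 *
 *   if (N >= 8) {
 *       load(block); stage1(block);              // prologue
 *       while ((N -= 8) >= 8)
 *           stage2_store_load_stage1();          // steady state, overlapped
 *       stage2(); store(block);                  // epilogue
 *   }
 *   handle the remaining (N & 7) pixels unpipelined;
 */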
734 /* Apple gas crashes on adrl, work around that by using adr. | 1425 /* Apple gas crashes on adrl, work around that by using adr. |
735 * But this requires a copy of these constants for each function. | 1426 * But this requires a copy of these constants for each function. |
736 */ | 1427 */ |
737 | 1428 |
738 .balign 16 | 1429 .balign 16 |
739 jsimd_ycc_\colorid\()_neon_consts: | 1430 jsimd_ycc_\colorid\()_neon_consts: |
740 .short 0, 0, 0, 0 | 1431 .short 0, 0, 0, 0 |
741 .short 22971, -11277, -23401, 29033 | 1432 .short 22971, -11277, -23401, 29033 |
742 .short -128, -128, -128, -128 | 1433 .short -128, -128, -128, -128 |
743 .short -128, -128, -128, -128 | 1434 .short -128, -128, -128, -128 |
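/* These are the JFIF YCbCr->RGB coefficients in fixed point:
 *   R = Y + 1.40200 * (Cr - 128)        1.40200 * 2^14 = 22971
 *   G = Y - 0.34414 * (Cb - 128)        0.34414 * 2^15 = 11277
 *         - 0.71414 * (Cr - 128)        0.71414 * 2^15 = 23401
 *   B = Y + 1.77200 * (Cb - 128)        1.77200 * 2^14 = 29033
 * which is why the G accumulators are narrowed with #15 and the R/B
 * accumulators with #14. A scalar sketch for one pixel:
 *
 *   int cb128 = cb - 128, cr128 = cr - 128;
 *   int r = y + ((22971 * cr128 + (1 << 13)) >> 14);
 *   int g = y + ((-11277 * cb128 - 23401 * cr128 + (1 << 14)) >> 15);
 *   int b = y + ((29033 * cb128 + (1 << 13)) >> 14);
 *   // then clamp r, g, b to [0, 255] (VQMOVUN does this in the NEON code)
 */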
(...skipping 40 matching lines...)
784 0: | 1475 0: |
785 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] | 1476 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] |
786 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] | 1477 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] |
787 mov N, OUTPUT_WIDTH | 1478 mov N, OUTPUT_WIDTH |
788 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] | 1479 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] |
789 add INPUT_ROW, INPUT_ROW, #1 | 1480 add INPUT_ROW, INPUT_ROW, #1 |
790 ldr RGB, [OUTPUT_BUF], #4 | 1481 ldr RGB, [OUTPUT_BUF], #4 |
791 | 1482 |
792 /* Inner loop over pixels */ | 1483 /* Inner loop over pixels */ |
793 subs N, N, #8 | 1484 subs N, N, #8 |
| 1485 blt 3f |
| 1486 do_load 8 |
| 1487 do_yuv_to_rgb_stage1 |
| 1488 subs N, N, #8 |
794 blt 2f | 1489 blt 2f |
795 1: | 1490 1: |
796 do_load 8 | 1491 do_yuv_to_rgb_stage2_store_load_stage1 |
797 do_yuv_to_rgb | |
798 do_store \bpp, 8 | |
799 subs N, N, #8 | 1492 subs N, N, #8 |
800 bge 1b | 1493 bge 1b |
| 1494 2: |
| 1495 do_yuv_to_rgb_stage2 |
| 1496 do_store \bpp, 8 |
801 tst N, #7 | 1497 tst N, #7 |
802 beq 8f | 1498 beq 8f |
803 2: | 1499 3: |
804 tst N, #4 | 1500 tst N, #4 |
805 beq 3f | 1501 beq 3f |
806 do_load 4 | 1502 do_load 4 |
807 3: | 1503 3: |
808 tst N, #2 | 1504 tst N, #2 |
809 beq 4f | 1505 beq 4f |
810 do_load 2 | 1506 do_load 2 |
811 4: | 1507 4: |
812 tst N, #1 | 1508 tst N, #1 |
813 beq 5f | 1509 beq 5f |
(...skipping 27 matching lines...)
841 .unreq INPUT_BUF1 | 1537 .unreq INPUT_BUF1 |
842 .unreq INPUT_BUF2 | 1538 .unreq INPUT_BUF2 |
843 .unreq RGB | 1539 .unreq RGB |
844 .unreq Y | 1540 .unreq Y |
845 .unreq U | 1541 .unreq U |
846 .unreq V | 1542 .unreq V |
847 .unreq N | 1543 .unreq N |
848 .endfunc | 1544 .endfunc |
849 | 1545 |
850 .purgem do_yuv_to_rgb | 1546 .purgem do_yuv_to_rgb |
| 1547 .purgem do_yuv_to_rgb_stage1 |
| 1548 .purgem do_yuv_to_rgb_stage2 |
| 1549 .purgem do_yuv_to_rgb_stage2_store_load_stage1 |
851 | 1550 |
852 .endm | 1551 .endm |
853 | 1552 |
854 /*--------------------------------- id ----- bpp R G B */ | 1553 /*--------------------------------- id ----- bpp R G B */ |
855 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 | 1554 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 |
856 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 | 1555 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 |
857 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 | 1556 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 |
858 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 | 1557 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 |
859 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 | 1558 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 |
860 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 | 1559 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 |
861 | 1560 |
862 .purgem do_load | 1561 .purgem do_load |
863 .purgem do_store | 1562 .purgem do_store |
864 | 1563 |
865 /*****************************************************************************/ | 1564 /*****************************************************************************/ |
| 1565 |
| 1566 /* |
| 1567 * jsimd_extrgb_ycc_convert_neon |
| 1568 * jsimd_extbgr_ycc_convert_neon |
| 1569 * jsimd_extrgbx_ycc_convert_neon |
| 1570 * jsimd_extbgrx_ycc_convert_neon |
| 1571 * jsimd_extxbgr_ycc_convert_neon |
| 1572 * jsimd_extxrgb_ycc_convert_neon |
| 1573 * |
| 1574 * Colorspace conversion RGB -> YCbCr |
| 1575 */ |
| 1576 |
| 1577 .macro do_store size |
| 1578 .if \size == 8 |
| 1579 vst1.8 {d20}, [Y]! |
| 1580 vst1.8 {d21}, [U]! |
| 1581 vst1.8 {d22}, [V]! |
| 1582 .elseif \size == 4 |
| 1583 vst1.8 {d20[0]}, [Y]! |
| 1584 vst1.8 {d20[1]}, [Y]! |
| 1585 vst1.8 {d20[2]}, [Y]! |
| 1586 vst1.8 {d20[3]}, [Y]! |
| 1587 vst1.8 {d21[0]}, [U]! |
| 1588 vst1.8 {d21[1]}, [U]! |
| 1589 vst1.8 {d21[2]}, [U]! |
| 1590 vst1.8 {d21[3]}, [U]! |
| 1591 vst1.8 {d22[0]}, [V]! |
| 1592 vst1.8 {d22[1]}, [V]! |
| 1593 vst1.8 {d22[2]}, [V]! |
| 1594 vst1.8 {d22[3]}, [V]! |
| 1595 .elseif \size == 2 |
| 1596 vst1.8 {d20[4]}, [Y]! |
| 1597 vst1.8 {d20[5]}, [Y]! |
| 1598 vst1.8 {d21[4]}, [U]! |
| 1599 vst1.8 {d21[5]}, [U]! |
| 1600 vst1.8 {d22[4]}, [V]! |
| 1601 vst1.8 {d22[5]}, [V]! |
| 1602 .elseif \size == 1 |
| 1603 vst1.8 {d20[6]}, [Y]! |
| 1604 vst1.8 {d21[6]}, [U]! |
| 1605 vst1.8 {d22[6]}, [V]! |
| 1606 .else |
| 1607 .error unsupported macroblock size |
| 1608 .endif |
| 1609 .endm |
| 1610 |
| 1611 .macro do_load bpp, size |
| 1612 .if \bpp == 24 |
| 1613 .if \size == 8 |
| 1614 vld3.8 {d10, d11, d12}, [RGB]! |
| 1615 pld [RGB, #128] |
| 1616 .elseif \size == 4 |
| 1617 vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! |
| 1618 vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! |
| 1619 vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! |
| 1620 vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! |
| 1621 .elseif \size == 2 |
| 1622 vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! |
| 1623 vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! |
| 1624 .elseif \size == 1 |
| 1625 vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! |
| 1626 .else |
| 1627 .error unsupported macroblock size |
| 1628 .endif |
| 1629 .elseif \bpp == 32 |
| 1630 .if \size == 8 |
| 1631 vld4.8 {d10, d11, d12, d13}, [RGB]! |
| 1632 pld [RGB, #128] |
| 1633 .elseif \size == 4 |
| 1634 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! |
| 1635 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! |
| 1636 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! |
| 1637 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! |
| 1638 .elseif \size == 2 |
| 1639 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! |
| 1640 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! |
| 1641 .elseif \size == 1 |
| 1642 vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! |
| 1643 .else |
| 1644 .error unsupported macroblock size |
| 1645 .endif |
| 1646 .else |
| 1647 .error unsupported bpp |
| 1648 .endif |
| 1649 .endm |
| 1650 |
| 1651 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
| 1652 |
| 1653 /* |
| 1654 * 2 stage pipelined RGB->YCbCr conversion |
| 1655 */ |
| 1656 |
| 1657 .macro do_rgb_to_yuv_stage1 |
| 1658 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ |
| 1659 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ |
| 1660 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ |
| 1661 vmull.u16 q7, d4, d0[0] |
| 1662 vmlal.u16 q7, d6, d0[1] |
| 1663 vmlal.u16 q7, d8, d0[2] |
| 1664 vmull.u16 q8, d5, d0[0] |
| 1665 vmlal.u16 q8, d7, d0[1] |
| 1666 vmlal.u16 q8, d9, d0[2] |
| 1667 vrev64.32 q9, q1 |
| 1668 vrev64.32 q13, q1 |
| 1669 vmlsl.u16 q9, d4, d0[3] |
| 1670 vmlsl.u16 q9, d6, d1[0] |
| 1671 vmlal.u16 q9, d8, d1[1] |
| 1672 vmlsl.u16 q13, d5, d0[3] |
| 1673 vmlsl.u16 q13, d7, d1[0] |
| 1674 vmlal.u16 q13, d9, d1[1] |
| 1675 vrev64.32 q14, q1 |
| 1676 vrev64.32 q15, q1 |
| 1677 vmlal.u16 q14, d4, d1[1] |
| 1678 vmlsl.u16 q14, d6, d1[2] |
| 1679 vmlsl.u16 q14, d8, d1[3] |
| 1680 vmlal.u16 q15, d5, d1[1] |
| 1681 vmlsl.u16 q15, d7, d1[2] |
| 1682 vmlsl.u16 q15, d9, d1[3] |
| 1683 .endm |
| 1684 |
| 1685 .macro do_rgb_to_yuv_stage2 |
| 1686 vrshrn.u32 d20, q7, #16 |
| 1687 vrshrn.u32 d21, q8, #16 |
| 1688 vshrn.u32 d22, q9, #16 |
| 1689 vshrn.u32 d23, q13, #16 |
| 1690 vshrn.u32 d24, q14, #16 |
| 1691 vshrn.u32 d25, q15, #16 |
| 1692 vmovn.u16 d20, q10 /* d20 = y */ |
| 1693 vmovn.u16 d21, q11 /* d21 = u */ |
| 1694 vmovn.u16 d22, q12 /* d22 = v */ |
| 1695 .endm |
| 1696 |
| 1697 .macro do_rgb_to_yuv |
| 1698 do_rgb_to_yuv_stage1 |
| 1699 do_rgb_to_yuv_stage2 |
| 1700 .endm |
| 1701 |
| 1702 .macro do_rgb_to_yuv_stage2_store_load_stage1 |
| 1703 vrshrn.u32 d20, q7, #16 |
| 1704 vrshrn.u32 d21, q8, #16 |
| 1705 vshrn.u32 d22, q9, #16 |
| 1706 vrev64.32 q9, q1 |
| 1707 vshrn.u32 d23, q13, #16 |
| 1708 vrev64.32 q13, q1 |
| 1709 vshrn.u32 d24, q14, #16 |
| 1710 vshrn.u32 d25, q15, #16 |
| 1711 do_load \bpp, 8 |
| 1712 vmovn.u16 d20, q10 /* d20 = y */ |
| 1713 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ |
| 1714 vmovn.u16 d21, q11 /* d21 = u */ |
| 1715 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ |
| 1716 vmovn.u16 d22, q12 /* d22 = v */ |
| 1717 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ |
| 1718 vmull.u16 q7, d4, d0[0] |
| 1719 vmlal.u16 q7, d6, d0[1] |
| 1720 vmlal.u16 q7, d8, d0[2] |
| 1721 vst1.8 {d20}, [Y]! |
| 1722 vmull.u16 q8, d5, d0[0] |
| 1723 vmlal.u16 q8, d7, d0[1] |
| 1724 vmlal.u16 q8, d9, d0[2] |
| 1725 vmlsl.u16 q9, d4, d0[3] |
| 1726 vmlsl.u16 q9, d6, d1[0] |
| 1727 vmlal.u16 q9, d8, d1[1] |
| 1728 vst1.8 {d21}, [U]! |
| 1729 vmlsl.u16 q13, d5, d0[3] |
| 1730 vmlsl.u16 q13, d7, d1[0] |
| 1731 vmlal.u16 q13, d9, d1[1] |
| 1732 vrev64.32 q14, q1 |
| 1733 vrev64.32 q15, q1 |
| 1734 vmlal.u16 q14, d4, d1[1] |
| 1735 vmlsl.u16 q14, d6, d1[2] |
| 1736 vmlsl.u16 q14, d8, d1[3] |
| 1737 vst1.8 {d22}, [V]! |
| 1738 vmlal.u16 q15, d5, d1[1] |
| 1739 vmlsl.u16 q15, d7, d1[2] |
| 1740 vmlsl.u16 q15, d9, d1[3] |
| 1741 .endm |
| 1742 |
| 1743 .balign 16 |
| 1744 jsimd_\colorid\()_ycc_neon_consts: |
| 1745 .short 19595, 38470, 7471, 11059 |
| 1746 .short 21709, 32768, 27439, 5329 |
| 1747 .short 32767, 128, 32767, 128 |
| 1748 .short 32767, 128, 32767, 128 |
| 1749 |
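/* These are the JFIF RGB->YCbCr coefficients in Q16 fixed point
 * (e.g. 0.29900 * 65536 = 19595), with 32767 + (128 << 16) preloaded
 * into q1 as the combined rounding/level-shift bias for Cb and Cr.
 * A scalar sketch for one pixel:
 *
 *   y  = (19595 * r + 38470 * g +  7471 * b + 32768) >> 16;
 *   cb = ((128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b) >> 16;
 *   cr = ((128 << 16) + 32767 + 32768 * r - 27439 * g -  5329 * b) >> 16;
 */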
| 1750 asm_function jsimd_\colorid\()_ycc_convert_neon |
| 1751 OUTPUT_WIDTH .req r0 |
| 1752 INPUT_BUF .req r1 |
| 1753 OUTPUT_BUF .req r2 |
| 1754 OUTPUT_ROW .req r3 |
| 1755 NUM_ROWS .req r4 |
| 1756 |
| 1757 OUTPUT_BUF0 .req r5 |
| 1758 OUTPUT_BUF1 .req r6 |
| 1759 OUTPUT_BUF2 .req OUTPUT_BUF |
| 1760 |
| 1761 RGB .req r7 |
| 1762 Y .req r8 |
| 1763 U .req r9 |
| 1764 V .req r10 |
| 1765 N .req ip |
| 1766 |
| 1767 /* Load constants to d0, d1, d2, d3 */ |
| 1768 adr ip, jsimd_\colorid\()_ycc_neon_consts |
| 1769 vld1.16 {d0, d1, d2, d3}, [ip, :128] |
| 1770 |
| 1771 /* Save ARM registers and handle input arguments */ |
| 1772 push {r4, r5, r6, r7, r8, r9, r10, lr} |
| 1773 ldr NUM_ROWS, [sp, #(4 * 8)] |
| 1774 ldr OUTPUT_BUF0, [OUTPUT_BUF] |
| 1775 ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] |
| 1776 ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] |
| 1777 .unreq OUTPUT_BUF |
| 1778 |
| 1779 /* Save NEON registers */ |
| 1780 vpush {d8-d15} |
| 1781 |
| 1782 /* Outer loop over scanlines */ |
| 1783 cmp NUM_ROWS, #1 |
| 1784 blt 9f |
| 1785 0: |
| 1786 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] |
| 1787 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] |
| 1788 mov N, OUTPUT_WIDTH |
| 1789 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] |
| 1790 add OUTPUT_ROW, OUTPUT_ROW, #1 |
| 1791 ldr RGB, [INPUT_BUF], #4 |
| 1792 |
    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
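
/*
 * Each expansion above instantiates the converter for one pixel layout:
 * 'bpp' is the source pixel size in bits, and the R/G/B columns give the
 * byte offset of each component within a pixel. An illustrative scalar
 * sketch of one converted scanline (hypothetical helper, not part of the
 * library), mirroring the 8-pixel main loop plus the 4/2/1 tail handling:
 *
 *   static void rgb_ycc_convert_row(const unsigned char *rgb, int bpp,
 *                                   int r_off, int g_off, int b_off,
 *                                   unsigned char *y, unsigned char *cb,
 *                                   unsigned char *cr, int width)
 *   {
 *       for (int i = 0; i < width; i++) {
 *           const unsigned char *p = rgb + i * (bpp / 8);
 *           int r = p[r_off], g = p[g_off], b = p[b_off];
 *           y[i]  = (unsigned char)((19595 * r + 38470 * g + 7471 * b
 *                                    + 32768) >> 16);
 *           cb[i] = (unsigned char)((32768 * b - 11059 * r - 21709 * g
 *                                    + (128 << 16) + 32767) >> 16);
 *           cr[i] = (unsigned char)((32768 * r - 27439 * g - 5329 * b
 *                                    + (128 << 16) + 32767) >> 16);
 *       }
 *   }
 */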

.purgem do_load
.purgem do_store

/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 * rid of VST1.16 instructions
 */
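
/*
 * A minimal C sketch of what the interleaved code below computes
 * (hypothetical helper; the real function operates on the library's
 * JSAMPARRAY / DCTELEM types):
 *
 *   static void convsamp_ref(const unsigned char *sample_data[8],
 *                            unsigned start_col, short *workspace)
 *   {
 *       for (int row = 0; row < 8; row++)
 *           for (int col = 0; col < 8; col++)
 *               *workspace++ =
 *                   (short)(sample_data[row][start_col + col] - 128);
 *   }
 */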

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req r0
    START_COL       .req r1
    WORKSPACE       .req r2
    TMP1            .req r3
    TMP2            .req r4
    TMP3            .req r5
    TMP4            .req ip

    push            {r4, r5}
    vmov.u8         d0, #128

    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d16}, [TMP1]
    vsubl.u8        q8, d16, d0
    vld1.8          {d18}, [TMP2]
    vsubl.u8        q9, d18, d0
    vld1.8          {d20}, [TMP3]
    vsubl.u8        q10, d20, d0
    vld1.8          {d22}, [TMP4]
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8        q11, d22, d0
    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d24}, [TMP1]
    vsubl.u8        q12, d24, d0
    vld1.8          {d26}, [TMP2]
    vsubl.u8        q13, d26, d0
    vld1.8          {d28}, [TMP3]
    vsubl.u8        q14, d28, d0
    vld1.8          {d30}, [TMP4]
    vsubl.u8        q15, d30, d0
    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop             {r4, r5}
    bx              lr

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc

/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, less accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)               /* XFIX_0_382683433 */
    .short (139 * 128)              /* XFIX_0_541196100 */
    .short (181 * 128)              /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
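
/*
 * For reference: VQDMULH.S16 computes approximately (a * b * 2) >> 16,
 * i.e. a Q15 fixed-point multiply, so the (value * 128) constants above
 * are 8-bit-precision cosine fractions promoted to Q15 (e.g. 98 / 256 ~=
 * 0.382683433). A minimal C sketch of the scalar equivalent (truncating;
 * VQDMULH also saturates, which this sketch ignores):
 *
 *   static inline short multiply_q15(short x, short xfix)
 *   {
 *       return (short)(((int)x * xfix) >> 15);
 *   }
 *
 * Q15 only covers [-1, 1), so 1.306562965 cannot be stored directly: the
 * table keeps just its fractional part (334 * 128 - 256 * 128), and the
 * code adds the operand back in with a separate VADD.
 */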

asm_function jsimd_fdct_ifast_neon

    DATA            .req r0
    TMP             .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_fdct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | q8
     *   1  | d18    | d19    | q9
     *   2  | d20    | d21    | q10
     *   3  | d22    | d23    | q11
     *   4  | d24    | d25    | q12
     *   5  | d26    | d27    | q13
     *   6  | d28    | d29    | q14
     *   7  | d30    | d31    | q15
     */

    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
    sub             DATA, DATA, #(128 - 32)
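    /* The three post-incremented loads above advanced DATA by 96 bytes;
     * rewind to the start of the 128-byte coefficient block
     * (128 - 32 = 96). */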

    mov             TMP, #2
1:
    /* Transpose */
    vtrn.16         q12, q13
    vtrn.16         q10, q11
    vtrn.16         q8, q9
    vtrn.16         q14, q15
    vtrn.32         q9, q11
    vtrn.32         q13, q15
    vtrn.32         q8, q10
    vtrn.32         q12, q14
    vswp            d30, d23
    vswp            d24, d17
    vswp            d26, d19
    /* 1-D FDCT */
    vadd.s16        q2, q11, q12
    vswp            d28, d21
    vsub.s16        q12, q11, q12
    vsub.s16        q6, q10, q13
    vadd.s16        q10, q10, q13
    vsub.s16        q7, q9, q14
    vadd.s16        q9, q9, q14
    vsub.s16        q1, q8, q15
    vadd.s16        q8, q8, q15
    vsub.s16        q4, q9, q10
    vsub.s16        q5, q8, q2
    vadd.s16        q3, q9, q10
    vadd.s16        q4, q4, q5
    vadd.s16        q2, q8, q2
    vqdmulh.s16     q4, q4, XFIX_0_707106781
    vadd.s16        q11, q12, q6
    vadd.s16        q8, q2, q3
    vsub.s16        q12, q2, q3
    vadd.s16        q3, q6, q7
    vadd.s16        q7, q7, q1
    vqdmulh.s16     q3, q3, XFIX_0_707106781
    vsub.s16        q6, q11, q7
    vadd.s16        q10, q5, q4
    vqdmulh.s16     q6, q6, XFIX_0_382683433
    vsub.s16        q14, q5, q4
    vqdmulh.s16     q11, q11, XFIX_0_541196100
    vqdmulh.s16     q5, q7, XFIX_1_306562965
    vadd.s16        q4, q1, q3
    vsub.s16        q3, q1, q3
    vadd.s16        q7, q7, q6
    vadd.s16        q11, q11, q6
    vadd.s16        q7, q7, q5
    vadd.s16        q13, q3, q11
    vsub.s16        q11, q3, q11
    vadd.s16        q9, q4, q7
    vsub.s16        q15, q4, q7
    subs            TMP, TMP, #1
    bne             1b

    /* store results */
    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16         {d28, d29, d30, d31}, [DATA, :128]

    vpop            {d8-d15}
    bx              lr

    .unreq          DATA
    .unreq          TMP
.endfunc

/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
 *                      DCTELEM * workspace);
 *
 * Note: the code uses two-stage pipelining in order to improve instruction
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
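
/*
 * A minimal scalar sketch of the per-coefficient math below (hypothetical
 * helper; 'recip', 'corr' and 'shift' come from the divisor table, which
 * the code addresses at byte offsets 0, 64*2 and 64*6 respectively):
 *
 *   static short quantize_one(short coef, unsigned short recip,
 *                             unsigned short corr, unsigned short shift)
 *   {
 *       short sign = coef >> 15;                   /* 0 or -1 */
 *       unsigned short tmp = (unsigned short)
 *           ((coef ^ sign) - sign) + corr;         /* abs + correction */
 *       unsigned short q = (unsigned short)
 *           (((unsigned)tmp * recip) >> 16);       /* reciprocal multiply */
 *       q = (unsigned short)(q >> shift);          /* VSHL by -shift */
 *       return (short)((q ^ sign) - sign);         /* restore sign */
 *   }
 */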
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req r0
    DIVISORS        .req r1
    WORKSPACE       .req r2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req r3
    SHIFT           .req ip
    LOOP_COUNT      .req r4

    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16        q12, q0
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10   /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16   /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15     /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12   /* shift */
    vshl.u16        q15, q11, q13

    push            {r4, r5}
    mov             LOOP_COUNT, #3
1:
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16        q14, q14, q2  /* restore sign */
    vabs.s16        q12, q0
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
      veor.u16        q15, q15, q3
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10   /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16   /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
      vsub.u16        q14, q14, q2
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16        q15, q15, q3
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15     /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12   /* shift */
    vshl.u16        q15, q11, q13
    subs            LOOP_COUNT, LOOP_COUNT, #1
    bne             1b
    pop             {r4, r5}

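    /* Drain the pipeline: restore the sign of, and store, the last 16
     * coefficients produced by the final loop iteration. */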
    veor.u16        q14, q14, q2    /* restore sign */
    veor.u16        q15, q15, q3
    vsub.u16        q14, q14, q2
    vsub.u16        q15, q15, q3
    vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx              lr              /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT
.endfunc