OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * ARM NEON optimizations for libjpeg-turbo |
| 3 * |
| 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
| 5 * All rights reserved. |
| 6 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com> |
| 7 * |
| 8 * This software is provided 'as-is', without any express or implied |
| 9 * warranty. In no event will the authors be held liable for any damages |
| 10 * arising from the use of this software. |
| 11 * |
| 12 * Permission is granted to anyone to use this software for any purpose, |
| 13 * including commercial applications, and to alter it and redistribute it |
| 14 * freely, subject to the following restrictions: |
| 15 * |
| 16 * 1. The origin of this software must not be misrepresented; you must not |
| 17 * claim that you wrote the original software. If you use this software |
| 18 * in a product, an acknowledgment in the product documentation would be |
| 19 * appreciated but is not required. |
| 20 * 2. Altered source versions must be plainly marked as such, and must not be |
| 21 * misrepresented as being the original software. |
| 22 * 3. This notice may not be removed or altered from any source distribution. |
| 23 */ |
| 24 |
| 25 #if defined(__linux__) && defined(__ELF__) |
| 26 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ |
| 27 #endif |
| 28 |
| 29 .text |
| 30 .fpu neon /* accept NEON (Advanced SIMD) mnemonics */ |
| 31 .arch armv7a |
| 32 .object_arch armv4 /* architecture recorded in the object file, independent of the .arch used for assembling; presumably so the object can be linked into builds for older cores that select this code at run time - TODO confirm */ |
| 33 .arm /* assemble ARM (not Thumb) encodings */ |
| 34 |
| 35 /* When non-zero, jsimd_idct_4x4_neon stores its output one byte at a time instead of using 32-bit lane stores to possibly unaligned addresses */ |
| 36 #define RESPECT_STRICT_ALIGNMENT 1 |
| 37 |
| 38 /*****************************************************************************/ |
| 39 |
| 40 /* Supplementary macro for setting function attributes: opens a .func scope, makes the symbol global and defines its entry label. The matching .endfunc is written at the end of each function body. */ |
| 41 .macro asm_function fname |
| 42 #ifdef __APPLE__ |
| 43 .func _\fname |
| 44 .globl _\fname /* Apple branch: the symbol gets a leading underscore */ |
| 45 _\fname: |
| 46 #else |
| 47 .func \fname |
| 48 .global \fname |
| 49 #ifdef __ELF__ |
| 50 .hidden \fname /* ELF only: hidden visibility */ |
| 51 .type \fname, %function /* ELF only: mark the symbol as a function */ |
| 52 #endif |
| 53 \fname: |
| 54 #endif |
| 55 .endm |
| 56 |
| 57 /* Transpose a block of 4x4 coefficients in four 64-bit registers (each register is one row of four 16-bit elements; the transpose is done in place) */ |
| 58 .macro transpose_4x4 x0, x1, x2, x3 |
| 59 vtrn.16 \x0, \x1 /* transpose the 2x2 groups of 16-bit elements in rows 0 and 1 */ |
| 60 vtrn.16 \x2, \x3 /* same for rows 2 and 3 */ |
| 61 vtrn.32 \x0, \x2 /* exchange the off-diagonal 32-bit pairs between rows 0 and 2 */ |
| 62 vtrn.32 \x1, \x3 /* same for rows 1 and 3 */ |
| 63 .endm |
| 64 |
| 65 /*****************************************************************************/ |
| 66 |
| 67 /* |
| 68 * jsimd_idct_ifast_neon |
| 69 * |
| 70 * This function contains a fast, not so accurate integer implementation of |
| 71 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
| 72 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
| 73 * function from jidctfst.c. |
| 74 * |
| 75 * TODO: the instruction scheduling could still be improved. |
| 76 */ |
| 77 /* Lane aliases for the table below once it has been loaded into d0. NOTE(review): the idct_helper macro that follows addresses the lanes as d0[n] directly rather than through these names. */ |
| 78 #define XFIX_1_082392200 d0[0] |
| 79 #define XFIX_1_414213562 d0[1] |
| 80 #define XFIX_1_847759065 d0[2] |
| 81 #define XFIX_2_613125930 d0[3] |
| 82 /* Each entry is the fractional part of a multiplier (277/256, 362/256, 473/256, and 669/256 minus 2) scaled by 2^15, which is the fixed-point scale vqdmulh.s16 applies. The integer part (1x or 2x the operand) is added back by the code that uses the table. */ |
| 83 .balign 16 |
| 84 jsimd_idct_ifast_neon_consts: |
| 85 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
| 86 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
| 87 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
| 88 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
| 89 |
| 90 /* 1-D IDCT helper macro: transforms x0..x7 in place (eight inputs in, eight results out). t10..t14 are scratch registers and are clobbered. Expects the multiplier table in d0. */ |
| 91 |
| 92 .macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \ |
| 93 t10, t11, t12, t13, t14 |
| 94 /* Sum/difference butterflies: each vsub/vadd/vswp triple leaves the sum in the second register of the pair and the difference in the first */ |
| 95 vsub.s16 \t10, \x0, \x4 |
| 96 vadd.s16 \x4, \x0, \x4 /* x4 = x0 + x4 */ |
| 97 vswp.s16 \t10, \x0 /* x0 = old x0 - old x4 */ |
| 98 vsub.s16 \t11, \x2, \x6 |
| 99 vadd.s16 \x6, \x2, \x6 /* x6 = x2 + x6 */ |
| 100 vswp.s16 \t11, \x2 /* x2 = old x2 - old x6 */ |
| 101 vsub.s16 \t10, \x3, \x5 |
| 102 vadd.s16 \x5, \x3, \x5 /* x5 = x3 + x5 */ |
| 103 vswp.s16 \t10, \x3 /* x3 = old x3 - old x5 */ |
| 104 vsub.s16 \t11, \x1, \x7 |
| 105 vadd.s16 \x7, \x1, \x7 /* x7 = x1 + x7 */ |
| 106 vswp.s16 \t11, \x1 /* x1 = old x1 - old x7 */ |
| 107 /* Fixed-point multiplies: vqdmulh yields the fractional part of the product, the operand itself is added back for the integer part. Register names below refer to the values after the butterflies. */ |
| 108 vqdmulh.s16 \t13, \x2, d0[1] |
| 109 vadd.s16 \t12, \x3, \x3 /* t12 = 2 * x3, the integer part of the 669/256 multiplier */ |
| 110 vadd.s16 \x2, \x2, \t13 /* x2 = x2 * 362/256 (approximately) */ |
| 111 vqdmulh.s16 \t13, \x3, d0[3] |
| 112 vsub.s16 \t10, \x1, \x3 |
| 113 vadd.s16 \t12, \t12, \t13 /* t12 = x3 * 669/256 (approximately) */ |
| 114 vqdmulh.s16 \t13, \t10, d0[2] |
| 115 vsub.s16 \t11, \x7, \x5 |
| 116 vadd.s16 \t10, \t10, \t13 /* t10 = (x1 - x3) * 473/256 (approximately) */ |
| 117 vqdmulh.s16 \t13, \t11, d0[1] |
| 118 vadd.s16 \t11, \t11, \t13 /* t11 = (x7 - x5) * 362/256 (approximately) */ |
| 119 |
| 120 vqdmulh.s16 \t13, \x1, d0[0] /* fractional part of x1 * 277/256; added to x1 a few lines below */ |
| 121 vsub.s16 \x2, \x6, \x2 |
| 122 vsub.s16 \t14, \x0, \x2 |
| 123 vadd.s16 \x2, \x0, \x2 |
| 124 vadd.s16 \x0, \x4, \x6 |
| 125 vsub.s16 \x4, \x4, \x6 |
| 126 vadd.s16 \x1, \x1, \t13 /* x1 = x1 * 277/256 (approximately) */ |
| 127 vadd.s16 \t13, \x7, \x5 |
| 128 vsub.s16 \t12, \t13, \t12 |
| 129 vsub.s16 \t12, \t12, \t10 |
| 130 vadd.s16 \t11, \t12, \t11 |
| 131 vsub.s16 \t10, \x1, \t10 |
| 132 vadd.s16 \t10, \t10, \t11 |
| 133 /* Final butterflies: write the eight results back to x0..x7 */ |
| 134 vsub.s16 \x7, \x0, \t13 |
| 135 vadd.s16 \x0, \x0, \t13 |
| 136 vadd.s16 \x6, \t14, \t12 |
| 137 vsub.s16 \x1, \t14, \t12 |
| 138 vsub.s16 \x5, \x2, \t11 |
| 139 vadd.s16 \x2, \x2, \t11 |
| 140 vsub.s16 \x3, \x4, \t10 |
| 141 vadd.s16 \x4, \x4, \t10 |
| 142 .endm |
| 143 |
| 144 asm_function jsimd_idct_ifast_neon |
| 145 /* Dequantizes an 8x8 block of 16-bit coefficients, applies the 1-D IDCT helper twice (with an 8x8 transpose after each pass) and stores eight rows of eight unsigned bytes. Arguments arrive in r0-r3 and are named below. */ |
| 146 DCT_TABLE .req r0 /* 64 16-bit multipliers, one per coefficient */ |
| 147 COEF_BLOCK .req r1 /* 64 16-bit coefficients */ |
| 148 OUTPUT_BUF .req r2 /* array of 32-bit row pointers, one read per output row */ |
| 149 OUTPUT_COL .req r3 /* byte offset added to every row pointer */ |
| 150 TMP .req ip /* scratch: table address, then current output row address */ |
| 151 |
| 152 vpush {d8-d15} /* d8-d15 are callee-saved under the ARM procedure call standard and are used below */ |
| 153 |
| 154 /* Load constants */ |
| 155 adr TMP, jsimd_idct_ifast_neon_consts |
| 156 vld1.16 {d0}, [TMP, :64] |
| 157 |
| 158 /* Load all COEF_BLOCK into NEON registers with the following allocation: |
| 159 * 0 1 2 3 | 4 5 6 7 |
| 160 * ---------+-------- |
| 161 * 0 | d4 | d5 |
| 162 * 1 | d6 | d7 |
| 163 * 2 | d8 | d9 |
| 164 * 3 | d10 | d11 |
| 165 * 4 | d12 | d13 |
| 166 * 5 | d14 | d15 |
| 167 * 6 | d16 | d17 |
| 168 * 7 | d18 | d19 |
| 169 */ |
| 170 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]! |
| 171 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]! |
| 172 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]! |
| 173 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]! |
| 174 /* Dequantize: multiply every coefficient by the matching DCT_TABLE entry. Table loads are interleaved with the multiplies; q10/q11 are reloaded for the last two rows. */ |
| 175 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! |
| 176 vmul.s16 q2, q2, q10 |
| 177 vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]! |
| 178 vmul.s16 q3, q3, q11 |
| 179 vmul.s16 q4, q4, q12 |
| 180 vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]! |
| 181 vmul.s16 q5, q5, q13 |
| 182 vmul.s16 q6, q6, q14 |
| 183 vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]! |
| 184 vmul.s16 q7, q7, q15 |
| 185 vmul.s16 q8, q8, q10 |
| 186 vmul.s16 q9, q9, q11 |
| 187 |
| 188 /* Pass 1 */ |
| 189 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 |
| 190 /* Transpose the full 8x8 block: transpose each 4x4 quadrant, then swap the two off-diagonal quadrants */ |
| 191 transpose_4x4 d4, d6, d8, d10 |
| 192 transpose_4x4 d5, d7, d9, d11 |
| 193 transpose_4x4 d12, d14, d16, d18 |
| 194 transpose_4x4 d13, d15, d17, d19 |
| 195 vswp d12, d5 |
| 196 vswp d14, d7 |
| 197 vswp d16, d9 |
| 198 vswp d18, d11 |
| 199 |
| 200 /* Pass 2 */ |
| 201 idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 |
| 202 /* Transpose again (same sequence as after pass 1) */ |
| 203 transpose_4x4 d4, d6, d8, d10 |
| 204 transpose_4x4 d5, d7, d9, d11 |
| 205 transpose_4x4 d12, d14, d16, d18 |
| 206 transpose_4x4 d13, d15, d17, d19 |
| 207 vswp d12, d5 |
| 208 vswp d14, d7 |
| 209 vswp d16, d9 |
| 210 vswp d18, d11 |
| 211 |
| 212 /* Descale and range limit: add a bias of 128 scaled by 2^5 (saturating), then shift right by 5 with saturation to unsigned 8-bit */ |
| 213 vmov.s16 q15, #(0x80 << 5) |
| 214 vqadd.s16 q2, q2, q15 |
| 215 vqadd.s16 q3, q3, q15 |
| 216 vqadd.s16 q4, q4, q15 |
| 217 vqadd.s16 q5, q5, q15 |
| 218 vqadd.s16 q6, q6, q15 |
| 219 vqadd.s16 q7, q7, q15 |
| 220 vqadd.s16 q8, q8, q15 |
| 221 vqadd.s16 q9, q9, q15 |
| 222 vqshrun.s16 d4, q2, #5 /* each row now occupies the low half (even d register) of its q register */ |
| 223 vqshrun.s16 d6, q3, #5 |
| 224 vqshrun.s16 d8, q4, #5 |
| 225 vqshrun.s16 d10, q5, #5 |
| 226 vqshrun.s16 d12, q6, #5 |
| 227 vqshrun.s16 d14, q7, #5 |
| 228 vqshrun.s16 d16, q8, #5 |
| 229 vqshrun.s16 d18, q9, #5 |
| 230 |
| 231 /* Store results to the output buffer: for each row, fetch the next row pointer, add the column offset and write 8 bytes */ |
| 232 .irp x, d4, d6, d8, d10, d12, d14, d16, d18 |
| 233 ldr TMP, [OUTPUT_BUF], #4 |
| 234 add TMP, TMP, OUTPUT_COL |
| 235 vst1.8 {\x}, [TMP]! |
| 236 .endr |
| 237 |
| 238 vpop {d8-d15} |
| 239 bx lr |
| 240 |
| 241 .unreq DCT_TABLE |
| 242 .unreq COEF_BLOCK |
| 243 .unreq OUTPUT_BUF |
| 244 .unreq OUTPUT_COL |
| 245 .unreq TMP |
| 246 .endfunc |
| 247 /* Drop this idct_helper so the name can be redefined for the next function */ |
| 248 .purgem idct_helper |
| 249 |
| 250 /*****************************************************************************/ |
| 251 |
| 252 /* |
| 253 * jsimd_idct_4x4_neon |
| 254 * |
| 255 * This function contains inverse-DCT code for getting reduced-size |
| 256 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations |
| 257 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' |
| 258 * function from jpeg-6b (jidctred.c). |
| 259 * |
| 260 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which |
| 261 * requires far fewer arithmetic operations and hence should be faster. |
| 262 * The primary purpose of this particular NEON-optimized function is |
| 263 * bit-exact compatibility with jpeg-6b. |
| 264 * |
| 265 * TODO: slightly better instruction scheduling can be achieved by expanding |
| 266 * the idct_helper/transpose_4x4 macros and reordering instructions, |
| 267 * but readability will suffer somewhat. |
| 268 */ |
| 269 |
| 270 #define CONST_BITS 13 |
| 271 /* Each FIX_ value below is the real number in its comment multiplied by 2^CONST_BITS and rounded to an integer */ |
| 272 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */ |
| 273 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */ |
| 274 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */ |
| 275 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */ |
| 276 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */ |
| 277 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */ |
| 278 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */ |
| 279 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ |
| 280 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ |
| 281 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ |
| 282 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ |
| 283 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ |
| 284 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ |
| 285 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ |
| 286 /* Multiplier table for jsimd_idct_4x4_neon; the comment on each entry names the NEON lane it is loaded into */ |
| 287 .balign 16 |
| 288 jsimd_idct_4x4_neon_consts: |
| 289 .short FIX_1_847759065 /* d0[0] */ |
| 290 .short -FIX_0_765366865 /* d0[1] */ |
| 291 .short -FIX_0_211164243 /* d0[2] */ |
| 292 .short FIX_1_451774981 /* d0[3] */ |
| 293 .short -FIX_2_172734803 /* d1[0] */ |
| 294 .short FIX_1_061594337 /* d1[1] */ |
| 295 .short -FIX_0_509795579 /* d1[2] */ |
| 296 .short -FIX_0_601344887 /* d1[3] */ |
| 297 .short FIX_0_899976223 /* d2[0] */ |
| 298 .short FIX_2_562915447 /* d2[1] */ |
| 299 .short 1 << (CONST_BITS+1) /* d2[2]: multiplying by this lane is a left shift by CONST_BITS+1 */ |
| 300 .short 0 /* d2[3] */ |
| 301 |
| 302 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 /* 1-D reduced IDCT on four 16-bit lanes at a time: seven 64-bit inputs, four 64-bit outputs y26..y29, each rounded and shifted right by shift. Expects the table in d0-d2; clobbers q10 and q12-q15. */ |
| 303 vmull.s16 q14, \x4, d2[2] /* q14 = x4 * 2^(CONST_BITS+1) + x8 * d0[0] + x14 * d0[1] */ |
| 304 vmlal.s16 q14, \x8, d0[0] |
| 305 vmlal.s16 q14, \x14, d0[1] |
| 306 /* q13 = weighted sum of x16, x12, x10, x6 (pairs with q14) */ |
| 307 vmull.s16 q13, \x16, d1[2] |
| 308 vmlal.s16 q13, \x12, d1[3] |
| 309 vmlal.s16 q13, \x10, d2[0] |
| 310 vmlal.s16 q13, \x6, d2[1] |
| 311 |
| 312 vmull.s16 q15, \x4, d2[2] /* q15 = x4 * 2^(CONST_BITS+1) - x8 * d0[0] - x14 * d0[1] */ |
| 313 vmlsl.s16 q15, \x8, d0[0] |
| 314 vmlsl.s16 q15, \x14, d0[1] |
| 315 /* q12 = weighted sum of x16, x12, x10, x6 (pairs with q15) */ |
| 316 vmull.s16 q12, \x16, d0[2] |
| 317 vmlal.s16 q12, \x12, d0[3] |
| 318 vmlal.s16 q12, \x10, d1[0] |
| 319 vmlal.s16 q12, \x6, d1[1] |
| 320 /* y26 = q14 + q13, y29 = q14 - q13 (after rounding shift and narrowing) */ |
| 321 vadd.s32 q10, q14, q13 |
| 322 vsub.s32 q14, q14, q13 |
| 323 /* The narrowing shift vrshrn.s32 accepts shift counts only up to 16, so larger shifts use a full-width vrshr followed by vmovn */ |
| 324 .if \shift > 16 |
| 325 vrshr.s32 q10, q10, #\shift |
| 326 vrshr.s32 q14, q14, #\shift |
| 327 vmovn.s32 \y26, q10 |
| 328 vmovn.s32 \y29, q14 |
| 329 .else |
| 330 vrshrn.s32 \y26, q10, #\shift |
| 331 vrshrn.s32 \y29, q14, #\shift |
| 332 .endif |
| 333 /* y27 = q15 + q12, y28 = q15 - q12 (after rounding shift and narrowing) */ |
| 334 vadd.s32 q10, q15, q12 |
| 335 vsub.s32 q15, q15, q12 |
| 336 |
| 337 .if \shift > 16 |
| 338 vrshr.s32 q10, q10, #\shift |
| 339 vrshr.s32 q15, q15, #\shift |
| 340 vmovn.s32 \y27, q10 |
| 341 vmovn.s32 \y28, q15 |
| 342 .else |
| 343 vrshrn.s32 \y27, q10, #\shift |
| 344 vrshrn.s32 \y28, q15, #\shift |
| 345 .endif |
| 346 |
| 347 .endm |
| 348 |
| 349 asm_function jsimd_idct_4x4_neon |
| 350 /* Dequantizes rows 0-3 and 5-7 of an 8x8 coefficient block (row 4 is skipped), runs the reduced IDCT helper in two passes and stores four rows of four unsigned bytes. Arguments arrive in r0-r3 and are named below. */ |
| 351 DCT_TABLE .req r0 /* 16-bit multipliers, one per coefficient */ |
| 352 COEF_BLOCK .req r1 /* 16-bit coefficients, 8 per row */ |
| 353 OUTPUT_BUF .req r2 /* array of four 32-bit row pointers */ |
| 354 OUTPUT_COL .req r3 /* byte offset added to every row pointer */ |
| 355 TMP1 .req r0 /* TMP1-TMP3 reuse r0-r2; they are only written once the arguments have been consumed */ |
| 356 TMP2 .req r1 |
| 357 TMP3 .req r2 |
| 358 TMP4 .req ip |
| 359 |
| 360 vpush {d8-d15} /* d8-d15 are callee-saved under the ARM procedure call standard and are used below */ |
| 361 |
| 362 /* Load constants (d3 is just used for padding). NOTE(review): the table is 24 bytes while this load reads 32, so d3 receives whatever bytes follow the table. */ |
| 363 adr TMP4, jsimd_idct_4x4_neon_consts |
| 364 vld1.16 {d0, d1, d2, d3}, [TMP4, :128] |
| 365 |
| 366 /* Load all COEF_BLOCK into NEON registers with the following allocation: |
| 367 * 0 1 2 3 | 4 5 6 7 |
| 368 * ---------+-------- |
| 369 * 0 | d4 | d5 |
| 370 * 1 | d6 | d7 |
| 371 * 2 | d8 | d9 |
| 372 * 3 | d10 | d11 |
| 373 * 4 | - | - |
| 374 * 5 | d12 | d13 |
| 375 * 6 | d14 | d15 |
| 376 * 7 | d16 | d17 |
| 377 */ |
| 378 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! |
| 379 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! |
| 380 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 (8 coefficients of 2 bytes) */ |
| 381 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! |
| 382 vld1.16 {d16, d17}, [COEF_BLOCK, :128]! |
| 383 /* dequantize: multiply each loaded row by the matching DCT_TABLE row */ |
| 384 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! |
| 385 vmul.s16 q2, q2, q9 |
| 386 vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! |
| 387 vmul.s16 q3, q3, q10 |
| 388 vmul.s16 q4, q4, q11 |
| 389 add DCT_TABLE, DCT_TABLE, #16 /* skip the table row that belongs to row 4 */ |
| 390 vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! |
| 391 vmul.s16 q5, q5, q12 |
| 392 vmul.s16 q6, q6, q13 |
| 393 vld1.16 {d30, d31}, [DCT_TABLE, :128]! |
| 394 vmul.s16 q7, q7, q14 |
| 395 vmul.s16 q8, q8, q15 |
| 396 |
| 397 /* Pass 1: left half (columns 0-3), then right half (columns 4-7); results overwrite d4/d6/d8/d10 and d5/d7/d9/d11 and each half is transposed */ |
| 398 idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 |
| 399 transpose_4x4 d4, d6, d8, d10 |
| 400 idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 |
| 401 transpose_4x4 d5, d7, d9, d11 |
| 402 |
| 403 /* Pass 2: d5 is not passed to the helper, matching the skipped position 4 of pass 1; results go to d26-d29 */ |
| 404 idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 |
| 405 transpose_4x4 d26, d27, d28, d29 |
| 406 |
| 407 /* Range limit: add 128, then saturate to unsigned 8-bit. Afterwards d26 holds rows 0 and 1, d27 holds rows 2 and 3 (four bytes each). */ |
| 408 vmov.u16 q15, #0x80 |
| 409 vadd.s16 q13, q13, q15 |
| 410 vadd.s16 q14, q14, q15 |
| 411 vqmovun.s16 d26, q13 |
| 412 vqmovun.s16 d27, q14 |
| 413 |
| 414 /* Store results to the output buffer: fetch the four row pointers (this overwrites r0-r2 and ip) and add the column offset */ |
| 415 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} |
| 416 add TMP1, TMP1, OUTPUT_COL |
| 417 add TMP2, TMP2, OUTPUT_COL |
| 418 add TMP3, TMP3, OUTPUT_COL |
| 419 add TMP4, TMP4, OUTPUT_COL |
| 420 |
| 421 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT |
| 422 /* We can use far fewer instructions on little endian systems if the |
| 423 * OS kernel is not configured to trap unaligned memory accesses |
| 424 */ |
| 425 vst1.32 {d26[0]}, [TMP1]! |
| 426 vst1.32 {d27[0]}, [TMP3]! |
| 427 vst1.32 {d26[1]}, [TMP2]! |
| 428 vst1.32 {d27[1]}, [TMP4]! |
| 429 #else |
| 430 vst1.8 {d26[0]}, [TMP1]! |
| 431 vst1.8 {d27[0]}, [TMP3]! |
| 432 vst1.8 {d26[1]}, [TMP1]! |
| 433 vst1.8 {d27[1]}, [TMP3]! |
| 434 vst1.8 {d26[2]}, [TMP1]! |
| 435 vst1.8 {d27[2]}, [TMP3]! |
| 436 vst1.8 {d26[3]}, [TMP1]! |
| 437 vst1.8 {d27[3]}, [TMP3]! |
| 438 /* bytes 4-7 of d26 and d27 go to the second and fourth row pointers */ |
| 439 vst1.8 {d26[4]}, [TMP2]! |
| 440 vst1.8 {d27[4]}, [TMP4]! |
| 441 vst1.8 {d26[5]}, [TMP2]! |
| 442 vst1.8 {d27[5]}, [TMP4]! |
| 443 vst1.8 {d26[6]}, [TMP2]! |
| 444 vst1.8 {d27[6]}, [TMP4]! |
| 445 vst1.8 {d26[7]}, [TMP2]! |
| 446 vst1.8 {d27[7]}, [TMP4]! |
| 447 #endif |
| 448 |
| 449 vpop {d8-d15} |
| 450 bx lr |
| 451 |
| 452 .unreq DCT_TABLE |
| 453 .unreq COEF_BLOCK |
| 454 .unreq OUTPUT_BUF |
| 455 .unreq OUTPUT_COL |
| 456 .unreq TMP1 |
| 457 .unreq TMP2 |
| 458 .unreq TMP3 |
| 459 .unreq TMP4 |
| 460 .endfunc |
| 461 /* Drop this idct_helper so the name can be redefined for the next function */ |
| 462 .purgem idct_helper |
| 463 |
| 464 /*****************************************************************************/ |
| 465 |
| 466 /* |
| 467 * jsimd_idct_2x2_neon |
| 468 * |
| 469 * This function contains inverse-DCT code for getting reduced-size |
| 470 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations |
| 471 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' |
| 472 * function from jpeg-6b (jidctred.c). |
| 473 * |
| 474 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which |
| 475 * requires far fewer arithmetic operations and hence should be faster. |
| 476 * The primary purpose of this particular NEON-optimized function is |
| 477 * bit-exact compatibility with jpeg-6b. |
| 478 */ |
| 479 |
| 480 .balign 8 /* 8-byte alignment, matching the :64 hint on the load in jsimd_idct_2x2_neon */ |
| 481 jsimd_idct_2x2_neon_consts: /* four 16-bit multipliers built from the FIX_ values defined for the 4x4 IDCT; the comments name the lane each one is loaded into */ |
| 482 .short -FIX_0_720959822 /* d0[0] */ |
| 483 .short FIX_0_850430095 /* d0[1] */ |
| 484 .short -FIX_1_272758580 /* d0[2] */ |
| 485 .short FIX_3_624509785 /* d0[3] */ |
| 486 |
| 487 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 /* 1-D reduced IDCT on four 16-bit lanes at a time: five 64-bit inputs, two 64-bit outputs, each rounded and shifted right by shift. Expects the table in d0; clobbers q10, q13, q14. */ |
| 488 vshll.s16 q14, \x4, #15 /* q14 = x4 widened to 32 bits and shifted left by 15 */ |
| 489 vmull.s16 q13, \x6, d0[3] /* q13 = x6 * d0[3] + x10 * d0[2] + x12 * d0[1] + x16 * d0[0] */ |
| 490 vmlal.s16 q13, \x10, d0[2] |
| 491 vmlal.s16 q13, \x12, d0[1] |
| 492 vmlal.s16 q13, \x16, d0[0] |
| 493 /* y26 = q14 + q13, y27 = q14 - q13 (after rounding shift and narrowing) */ |
| 494 vadd.s32 q10, q14, q13 |
| 495 vsub.s32 q14, q14, q13 |
| 496 /* The narrowing shift vrshrn.s32 accepts shift counts only up to 16, so larger shifts use a full-width vrshr followed by vmovn */ |
| 497 .if \shift > 16 |
| 498 vrshr.s32 q10, q10, #\shift |
| 499 vrshr.s32 q14, q14, #\shift |
| 500 vmovn.s32 \y26, q10 |
| 501 vmovn.s32 \y27, q14 |
| 502 .else |
| 503 vrshrn.s32 \y26, q10, #\shift |
| 504 vrshrn.s32 \y27, q14, #\shift |
| 505 .endif |
| 506 |
| 507 .endm |
| 508 |
| 509 asm_function jsimd_idct_2x2_neon |
| 510 /* Dequantizes rows 0, 1, 3, 5 and 7 of an 8x8 coefficient block, runs the reduced IDCT in two passes and stores two rows of two unsigned bytes. Arguments arrive in r0-r3 and are named below. */ |
| 511 DCT_TABLE .req r0 /* 16-bit multipliers, one per coefficient */ |
| 512 COEF_BLOCK .req r1 /* 16-bit coefficients, 8 per row */ |
| 513 OUTPUT_BUF .req r2 /* array of two 32-bit row pointers */ |
| 514 OUTPUT_COL .req r3 /* byte offset added to both row pointers */ |
| 515 TMP1 .req r0 /* reuses r0 once DCT_TABLE has been consumed */ |
| 516 TMP2 .req ip |
| 517 |
| 518 vpush {d8-d15} /* d8-d15 are callee-saved under the ARM procedure call standard; d10-d13 are used below */ |
| 519 |
| 520 /* Load constants */ |
| 521 adr TMP2, jsimd_idct_2x2_neon_consts |
| 522 vld1.16 {d0}, [TMP2, :64] |
| 523 |
| 524 /* Load all COEF_BLOCK into NEON registers with the following allocation: |
| 525 * 0 1 2 3 | 4 5 6 7 |
| 526 * ---------+-------- |
| 527 * 0 | d4 | d5 |
| 528 * 1 | d6 | d7 |
| 529 * 2 | - | - |
| 530 * 3 | d10 | d11 |
| 531 * 4 | - | - |
| 532 * 5 | d12 | d13 |
| 533 * 6 | - | - |
| 534 * 7 | d16 | d17 |
| 535 */ |
| 536 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! |
| 537 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 2 */ |
| 538 vld1.16 {d10, d11}, [COEF_BLOCK, :128]! |
| 539 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 4 */ |
| 540 vld1.16 {d12, d13}, [COEF_BLOCK, :128]! |
| 541 add COEF_BLOCK, COEF_BLOCK, #16 /* skip row 6 */ |
| 542 vld1.16 {d16, d17}, [COEF_BLOCK, :128]! |
| 543 /* Dequantize: multiply each loaded row by the matching DCT_TABLE row, skipping the table rows for rows 2, 4 and 6 */ |
| 544 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! |
| 545 vmul.s16 q2, q2, q9 |
| 546 vmul.s16 q3, q3, q10 |
| 547 add DCT_TABLE, DCT_TABLE, #16 |
| 548 vld1.16 {d24, d25}, [DCT_TABLE, :128]! |
| 549 vmul.s16 q5, q5, q12 |
| 550 add DCT_TABLE, DCT_TABLE, #16 |
| 551 vld1.16 {d26, d27}, [DCT_TABLE, :128]! |
| 552 vmul.s16 q6, q6, q13 |
| 553 add DCT_TABLE, DCT_TABLE, #16 |
| 554 vld1.16 {d30, d31}, [DCT_TABLE, :128]! |
| 555 vmul.s16 q8, q8, q15 |
| 556 |
| 557 /* Pass 1: the #if 0 branch is a disabled macro-based variant; the #else branch is what gets assembled */ |
| 558 #if 0 |
| 559 idct_helper d4, d6, d10, d12, d16, 13, d4, d6 |
| 560 transpose_4x4 d4, d6, d8, d10 |
| 561 idct_helper d5, d7, d11, d13, d17, 13, d5, d7 |
| 562 transpose_4x4 d5, d7, d9, d11 |
| 563 #else |
| 564 vmull.s16 q13, d6, d0[3] /* q13 = weighted sum of rows 1, 3, 5, 7 for columns 0-3 */ |
| 565 vmlal.s16 q13, d10, d0[2] |
| 566 vmlal.s16 q13, d12, d0[1] |
| 567 vmlal.s16 q13, d16, d0[0] |
| 568 vmull.s16 q12, d7, d0[3] /* q12 = the same sum for columns 4-7 */ |
| 569 vmlal.s16 q12, d11, d0[2] |
| 570 vmlal.s16 q12, d13, d0[1] |
| 571 vmlal.s16 q12, d17, d0[0] |
| 572 vshll.s16 q14, d4, #15 /* row 0 widened and shifted left by 15, columns 0-3 */ |
| 573 vshll.s16 q15, d5, #15 /* same for columns 4-7 */ |
| 574 vadd.s32 q10, q14, q13 |
| 575 vsub.s32 q14, q14, q13 |
| 576 vrshrn.s32 d4, q10, #13 /* d4 = rounded sum, columns 0-3 */ |
| 577 vrshrn.s32 d6, q14, #13 /* d6 = rounded difference, columns 0-3 */ |
| 578 vadd.s32 q10, q15, q12 |
| 579 vsub.s32 q14, q15, q12 |
| 580 vrshrn.s32 d5, q10, #13 /* d5 = rounded sum, columns 4-7 */ |
| 581 vrshrn.s32 d7, q14, #13 /* d7 = rounded difference, columns 4-7 */ |
| 582 vtrn.16 q2, q3 /* these two transposes regroup the pass-1 results into the registers pass 2 reads (d4, d6, d10, d7, d11) */ |
| 583 vtrn.32 q3, q5 |
| 584 #endif |
| 585 |
| 586 /* Pass 2: results go to d26 (sums) and d27 (differences) */ |
| 587 idct_helper d4, d6, d10, d7, d11, 20, d26, d27 |
| 588 |
| 589 /* Range limit: add 128 and saturate to unsigned 8-bit. The second vqmovun runs after d26 (the low half of q13) has been overwritten; only its bytes 4-7, which come from the still intact 16-bit lanes of d27, are used by the stores below. */ |
| 590 vmov.u16 q15, #0x80 |
| 591 vadd.s16 q13, q13, q15 |
| 592 vqmovun.s16 d26, q13 |
| 593 vqmovun.s16 d27, q13 |
| 594 |
| 595 /* Store results to the output buffer: fetch the two row pointers (this overwrites r0 and ip) and add the column offset */ |
| 596 ldmia OUTPUT_BUF, {TMP1, TMP2} |
| 597 add TMP1, TMP1, OUTPUT_COL |
| 598 add TMP2, TMP2, OUTPUT_COL |
| 599 |
| 600 vst1.8 {d26[0]}, [TMP1]! |
| 601 vst1.8 {d27[4]}, [TMP1]! |
| 602 vst1.8 {d26[1]}, [TMP2]! |
| 603 vst1.8 {d27[5]}, [TMP2]! |
| 604 |
| 605 vpop {d8-d15} |
| 606 bx lr |
| 607 |
| 608 .unreq DCT_TABLE |
| 609 .unreq COEF_BLOCK |
| 610 .unreq OUTPUT_BUF |
| 611 .unreq OUTPUT_COL |
| 612 .unreq TMP1 |
| 613 .unreq TMP2 |
| 614 .endfunc |
| 615 /* Drop this idct_helper definition now that the function is complete */ |
| 616 .purgem idct_helper |
| 617 |
| 618 /*****************************************************************************/ |
| 619 |
| 620 /* |
| 621 * jsimd_ycc_extrgb_convert_neon |
| 622 * jsimd_ycc_extbgr_convert_neon |
| 623 * jsimd_ycc_extrgbx_convert_neon |
| 624 * jsimd_ycc_extbgrx_convert_neon |
| 625 * jsimd_ycc_extxbgr_convert_neon |
| 626 * jsimd_ycc_extxrgb_convert_neon |
| 627 * |
| 628 * Colorspace conversion YCbCr -> RGB |
| 629 */ |
| 630 |
| 631 |
| 632 .macro do_load size /* load size (8, 4, 2 or 1) bytes from each of U, V and Y into d4, d5 and d0, advancing the three pointers. The 4-, 2- and 1-byte forms fill lanes 0-3, 4-5 and 6, so up to seven leftover bytes can be gathered into one register set. */ |
| 633 .if \size == 8 |
| 634 vld1.8 {d4}, [U]! |
| 635 vld1.8 {d5}, [V]! |
| 636 vld1.8 {d0}, [Y]! |
| 637 pld [Y, #64] /* prefetch hints 64 bytes ahead of each input pointer */ |
| 638 pld [U, #64] |
| 639 pld [V, #64] |
| 640 .elseif \size == 4 |
| 641 vld1.8 {d4[0]}, [U]! |
| 642 vld1.8 {d4[1]}, [U]! |
| 643 vld1.8 {d4[2]}, [U]! |
| 644 vld1.8 {d4[3]}, [U]! |
| 645 vld1.8 {d5[0]}, [V]! |
| 646 vld1.8 {d5[1]}, [V]! |
| 647 vld1.8 {d5[2]}, [V]! |
| 648 vld1.8 {d5[3]}, [V]! |
| 649 vld1.8 {d0[0]}, [Y]! |
| 650 vld1.8 {d0[1]}, [Y]! |
| 651 vld1.8 {d0[2]}, [Y]! |
| 652 vld1.8 {d0[3]}, [Y]! |
| 653 .elseif \size == 2 |
| 654 vld1.8 {d4[4]}, [U]! |
| 655 vld1.8 {d4[5]}, [U]! |
| 656 vld1.8 {d5[4]}, [V]! |
| 657 vld1.8 {d5[5]}, [V]! |
| 658 vld1.8 {d0[4]}, [Y]! |
| 659 vld1.8 {d0[5]}, [Y]! |
| 660 .elseif \size == 1 |
| 661 vld1.8 {d4[6]}, [U]! |
| 662 vld1.8 {d5[6]}, [V]! |
| 663 vld1.8 {d0[6]}, [Y]! |
| 664 .else |
| 665 .error unsupported macroblock size |
| 666 .endif |
| 667 .endm |
| 668 |
| 669 .macro do_store bpp, size /* store size (8, 4, 2 or 1) pixels to RGB, advancing the pointer: 24 bpp interleaves d10, d11, d12; 32 bpp interleaves d10, d11, d12, d13. The partial sizes read lanes 0-3, 4-5 and 6, the same lanes do_load fills. */ |
| 670 .if \bpp == 24 |
| 671 .if \size == 8 |
| 672 vst3.8 {d10, d11, d12}, [RGB]! |
| 673 .elseif \size == 4 |
| 674 vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! |
| 675 vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! |
| 676 vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! |
| 677 vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! |
| 678 .elseif \size == 2 |
| 679 vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! |
| 680 vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! |
| 681 .elseif \size == 1 |
| 682 vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! |
| 683 .else |
| 684 .error unsupported macroblock size |
| 685 .endif |
| 686 .elseif \bpp == 32 |
| 687 .if \size == 8 |
| 688 vst4.8 {d10, d11, d12, d13}, [RGB]! |
| 689 .elseif \size == 4 |
| 690 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! |
| 691 vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! |
| 692 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! |
| 693 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! |
| 694 .elseif \size == 2 |
| 695 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! |
| 696 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! |
| 697 .elseif \size == 1 |
| 698 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! |
| 699 .else |
| 700 .error unsupported macroblock size |
| 701 .endif |
| 702 .else |
| 703 .error unsupported bpp |
| 704 .endif |
| 705 .endm |
| 706 |
| 707 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
| 708 /* Emits one constant table and one function jsimd_ycc_<colorid>_convert_neon. bpp is 24 or 32; r_offs, g_offs and b_offs are single digits n that select d1n (d10..d13) as the destination register of each colour channel. */ |
| 709 .macro do_yuv_to_rgb |
| 710 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
| 711 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ |
| 712 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
| 713 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ |
| 714 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ |
| 715 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ |
| 716 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ |
| 717 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
| 718 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
| 719 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
| 720 vrshrn.s32 d20, q10, #15 /* q10 = green offset: rounded (u, v) term scaled down by 2^15 */ |
| 721 vrshrn.s32 d21, q11, #15 |
| 722 vrshrn.s32 d24, q12, #14 /* q12 = red offset: rounded v term scaled down by 2^14 */ |
| 723 vrshrn.s32 d25, q13, #14 |
| 724 vrshrn.s32 d28, q14, #14 /* q14 = blue offset: rounded u term scaled down by 2^14 */ |
| 725 vrshrn.s32 d29, q15, #14 |
| 726 vaddw.u8 q10, q10, d0 /* add the Y samples to each offset */ |
| 727 vaddw.u8 q12, q12, d0 |
| 728 vaddw.u8 q14, q14, d0 |
| 729 vqmovun.s16 d1\g_offs, q10 /* saturate to unsigned 8-bit and place in the register selected for this channel */ |
| 730 vqmovun.s16 d1\r_offs, q12 |
| 731 vqmovun.s16 d1\b_offs, q14 |
| 732 .endm |
| 733 |
| 734 /* Apple gas crashes on adrl, work around that by using adr. |
| 735 * But this requires a copy of these constants for each function. |
| 736 */ |
| 737 |
| 738 .balign 16 |
| 739 jsimd_ycc_\colorid\()_neon_consts: |
| 740 .short 0, 0, 0, 0 /* loaded into d0: padding only, d0 later receives the Y samples */ |
| 741 .short 22971, -11277, -23401, 29033 /* loaded into d1: about 1.402 * 2^14, -0.3441 * 2^15, -0.7141 * 2^15, 1.772 * 2^14 */ |
| 742 .short -128, -128, -128, -128 /* loaded into d2 and d3 (q1): the -128 bias added to U and V */ |
| 743 .short -128, -128, -128, -128 |
| 744 |
| 745 asm_function jsimd_ycc_\colorid\()_convert_neon |
| 746 OUTPUT_WIDTH .req r0 /* pixels per row */ |
| 747 INPUT_BUF .req r1 /* array of three pointers, one per input plane */ |
| 748 INPUT_ROW .req r2 /* index of the first input row */ |
| 749 OUTPUT_BUF .req r3 /* array of output row pointers */ |
| 750 NUM_ROWS .req r4 /* fifth argument, loaded from the stack below */ |
| 751 |
| 752 INPUT_BUF0 .req r5 /* row-pointer array the Y pointer is read from */ |
| 753 INPUT_BUF1 .req r6 /* row-pointer array the U pointer is read from */ |
| 754 INPUT_BUF2 .req INPUT_BUF /* row-pointer array the V pointer is read from; reuses r1, so it is loaded last */ |
| 755 |
| 756 RGB .req r7 |
| 757 Y .req r8 |
| 758 U .req r9 |
| 759 V .req r10 |
| 760 N .req ip |
| 761 |
| 762 /* Load constants to d1, d2, d3 (d0 is just used for padding) */ |
| 763 adr ip, jsimd_ycc_\colorid\()_neon_consts |
| 764 vld1.16 {d0, d1, d2, d3}, [ip, :128] |
| 765 |
| 766 /* Save ARM registers and handle input arguments */ |
| 767 push {r4, r5, r6, r7, r8, r9, r10, lr} |
| 768 ldr NUM_ROWS, [sp, #(4 * 8)] /* the push above moved sp down by 8 words, so the first stack argument is at sp + 32 */ |
| 769 ldr INPUT_BUF0, [INPUT_BUF] |
| 770 ldr INPUT_BUF1, [INPUT_BUF, #4] |
| 771 ldr INPUT_BUF2, [INPUT_BUF, #8] |
| 772 .unreq INPUT_BUF |
| 773 |
| 774 /* Save NEON registers */ |
| 775 vpush {d8-d15} |
| 776 |
| 777 /* Initially set d10, d11, d12, d13 to 0xFF; the conversion overwrites only three of them, so in 32 bpp formats the fourth byte of every pixel stays 0xFF */ |
| 778 vmov.u8 q5, #255 |
| 779 vmov.u8 q6, #255 |
| 780 |
| 781 /* Outer loop over scanlines (skipped entirely when NUM_ROWS is less than 1) */ |
| 782 cmp NUM_ROWS, #1 |
| 783 blt 9f |
| 784 0: |
| 785 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] |
| 786 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] |
| 787 mov N, OUTPUT_WIDTH |
| 788 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] |
| 789 add INPUT_ROW, INPUT_ROW, #1 |
| 790 ldr RGB, [OUTPUT_BUF], #4 |
| 791 |
| 792 /* Inner loop over pixels: N is kept at (pixels remaining - 8), so a negative N means fewer than 8 pixels are left */ |
| 793 subs N, N, #8 |
| 794 blt 2f |
| 795 1: |
| 796 do_load 8 |
| 797 do_yuv_to_rgb |
| 798 do_store \bpp, 8 |
| 799 subs N, N, #8 |
| 800 bge 1b |
| 801 tst N, #7 /* the low three bits of N equal the number of leftover pixels; zero means the row is done */ |
| 802 beq 8f |
| 803 2: /* tail: gather 4, 2 and 1 leftover pixels into separate lanes, convert once, then store the same groups */ |
| 804 tst N, #4 |
| 805 beq 3f |
| 806 do_load 4 |
| 807 3: |
| 808 tst N, #2 |
| 809 beq 4f |
| 810 do_load 2 |
| 811 4: |
| 812 tst N, #1 |
| 813 beq 5f |
| 814 do_load 1 |
| 815 5: |
| 816 do_yuv_to_rgb |
| 817 tst N, #4 |
| 818 beq 6f |
| 819 do_store \bpp, 4 |
| 820 6: |
| 821 tst N, #2 |
| 822 beq 7f |
| 823 do_store \bpp, 2 |
| 824 7: |
| 825 tst N, #1 |
| 826 beq 8f |
| 827 do_store \bpp, 1 |
| 828 8: |
| 829 subs NUM_ROWS, NUM_ROWS, #1 |
| 830 bgt 0b |
| 831 9: |
| 832 /* Restore all registers and return (popping the saved lr into pc performs the return) */ |
| 833 vpop {d8-d15} |
| 834 pop {r4, r5, r6, r7, r8, r9, r10, pc} |
| 835 |
| 836 .unreq OUTPUT_WIDTH |
| 837 .unreq INPUT_ROW |
| 838 .unreq OUTPUT_BUF |
| 839 .unreq NUM_ROWS |
| 840 .unreq INPUT_BUF0 |
| 841 .unreq INPUT_BUF1 |
| 842 .unreq INPUT_BUF2 |
| 843 .unreq RGB |
| 844 .unreq Y |
| 845 .unreq U |
| 846 .unreq V |
| 847 .unreq N |
| 848 .endfunc |
| 849 /* Remove the nested macro so the next instantiation can define it again with its own channel offsets */ |
| 850 .purgem do_yuv_to_rgb |
| 851 |
| 852 .endm |
| 853 |
| 854 /*--------------------------------- id ----- bpp R G B */ |
| 855 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 /* bytes in memory: R, G, B */ |
| 856 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 /* bytes in memory: B, G, R */ |
| 857 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 /* bytes in memory: R, G, B, then d13 (left at 0xFF) */ |
| 858 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 /* bytes in memory: B, G, R, then d13 (left at 0xFF) */ |
| 859 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 /* bytes in memory: d10 (left at 0xFF), then B, G, R */ |
| 860 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 /* bytes in memory: d10 (left at 0xFF), then R, G, B */ |
| 861 /* The load/store helper macros are only needed by the instantiations above */ |
| 862 .purgem do_load |
| 863 .purgem do_store |
| 864 |
| 865 /*****************************************************************************/ |
OLD | NEW |