| OLD | NEW | 
|---|
| (Empty) |  | 
|  | 1 /* | 
|  | 2  * ARM NEON optimizations for libjpeg-turbo | 
|  | 3  * | 
|  | 4  * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 
|  | 5  * All rights reserved. | 
|  | 6  * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com> | 
|  | 7  * | 
|  | 8  * This software is provided 'as-is', without any express or implied | 
|  | 9  * warranty.  In no event will the authors be held liable for any damages | 
|  | 10  * arising from the use of this software. | 
|  | 11  * | 
|  | 12  * Permission is granted to anyone to use this software for any purpose, | 
|  | 13  * including commercial applications, and to alter it and redistribute it | 
|  | 14  * freely, subject to the following restrictions: | 
|  | 15  * | 
|  | 16  * 1. The origin of this software must not be misrepresented; you must not | 
|  | 17  *    claim that you wrote the original software. If you use this software | 
|  | 18  *    in a product, an acknowledgment in the product documentation would be | 
|  | 19  *    appreciated but is not required. | 
|  | 20  * 2. Altered source versions must be plainly marked as such, and must not be | 
|  | 21  *    misrepresented as being the original software. | 
|  | 22  * 3. This notice may not be removed or altered from any source distribution. | 
|  | 23  */ | 
|  | 24 | 
|  | 25 #if defined(__linux__) && defined(__ELF__) | 
|  | 26 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ | 
|  | 27 #endif | 
|  | 28 | 
|  | 29 .text | 
|  | 30 .fpu neon | 
|  | 31 .arch armv7a | 
|  | 32 .object_arch armv4 | 
|  | 33 .arm | 
|  | 34 | 
|  | 35 | 
|  | 36 #define RESPECT_STRICT_ALIGNMENT 1 | 
|  | 37 | 
|  | 38 /*****************************************************************************/ | 
|  | 39 | 
|  | 40 /* Supplementary macro for setting function attributes: opens a .func scope and defines the global entry label for fname */ | 
|  | 41 .macro asm_function fname | 
|  | 42 #ifdef __APPLE__ | 
|  | 43     .func _\fname | 
|  | 44     .globl _\fname | 
|  | 45 _\fname: /* Apple builds: the symbol name carries a leading underscore */ | 
|  | 46 #else | 
|  | 47     .func \fname | 
|  | 48     .global \fname | 
|  | 49 #ifdef __ELF__ | 
|  | 50     .hidden \fname /* ELF builds only: mark the symbol hidden and typed as a function */ | 
|  | 51     .type \fname, %function | 
|  | 52 #endif | 
|  | 53 \fname: | 
|  | 54 #endif | 
|  | 55 .endm | 
|  | 56 | 
|  | 57 /* Transpose a block of 4x4 coefficients in four 64-bit registers (x0-x3 hold the four rows and are transposed in place) */ | 
|  | 58 .macro transpose_4x4 x0, x1, x2, x3 | 
|  | 59     vtrn.16 \x0, \x1 /* transpose the 2x2 groups of 16-bit elements in rows 0/1 */ | 
|  | 60     vtrn.16 \x2, \x3 /* same for rows 2/3 */ | 
|  | 61     vtrn.32 \x0, \x2 /* then exchange 32-bit halves between rows 0/2 */ | 
|  | 62     vtrn.32 \x1, \x3 /* and between rows 1/3 */ | 
|  | 63 .endm | 
|  | 64 | 
|  | 65 /*****************************************************************************/ | 
|  | 66 | 
|  | 67 /* | 
|  | 68  * jsimd_idct_ifast_neon | 
|  | 69  * | 
|  | 70  * This function contains a fast, not so accurate integer implementation of | 
|  | 71  * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | 
|  | 72  * and produces exactly the same output as IJG's original 'jpeg_idct_fast' | 
|  | 73  * function from jidctfst.c | 
|  | 74  * | 
|  | 75  * TODO: better instruction scheduling is needed. | 
|  | 76  */ | 
|  | 77 | 
|  | 78 #define XFIX_1_082392200 d0[0] /* the XFIX_* names map to the four 16-bit lanes of d0 that receive the table below */ | 
|  | 79 #define XFIX_1_414213562 d0[1] | 
|  | 80 #define XFIX_1_847759065 d0[2] | 
|  | 81 #define XFIX_2_613125930 d0[3] | 
|  | 82 | 
|  | 83 .balign 16 /* the table is loaded into d0 with a 64-bit alignment hint */ | 
|  | 84 jsimd_idct_ifast_neon_consts: | 
|  | 85     .short (277 * 128 - 256 * 128) /* XFIX_1_082392200: (277/256 - 1) scaled by 2^15; the integer part is added back separately */ | 
|  | 86     .short (362 * 128 - 256 * 128) /* XFIX_1_414213562: (362/256 - 1) scaled by 2^15 */ | 
|  | 87     .short (473 * 128 - 256 * 128) /* XFIX_1_847759065: (473/256 - 1) scaled by 2^15 */ | 
|  | 88     .short (669 * 128 - 512 * 128) /* XFIX_2_613125930: (669/256 - 2) scaled by 2^15 */ | 
|  | 89 | 
|  | 90 /* 1-D IDCT helper macro: x0-x7 are the eight input vectors and are overwritten with the eight results; t10-t14 are scratch; the constants table must already be in d0 */ | 
|  | 91 | 
|  | 92 .macro idct_helper  x0, x1, x2, x3, x4, x5, x6, x7, \ | 
|  | 93                     t10, t11, t12, t13, t14 | 
|  | 94 | 
|  | 95     vsub.s16        \t10, \x0, \x4 /* first stage: x0 <- x0 - x4, x4 <- x0 + x4 */ | 
|  | 96     vadd.s16        \x4,  \x0, \x4 | 
|  | 97     vswp.s16        \t10, \x0 /* the swap moves the difference into x0 */ | 
|  | 98     vsub.s16        \t11, \x2, \x6 /* x2 <- x2 - x6, x6 <- x2 + x6 */ | 
|  | 99     vadd.s16        \x6,  \x2, \x6 | 
|  | 100     vswp.s16        \t11, \x2 | 
|  | 101     vsub.s16        \t10, \x3, \x5 /* x3 <- x3 - x5, x5 <- x3 + x5 */ | 
|  | 102     vadd.s16        \x5,  \x3, \x5 | 
|  | 103     vswp.s16        \t10, \x3 | 
|  | 104     vsub.s16        \t11, \x1, \x7 /* x1 <- x1 - x7, x7 <- x1 + x7 */ | 
|  | 105     vadd.s16        \x7,  \x1, \x7 | 
|  | 106     vswp.s16        \t11, \x1 | 
|  | 107 | 
|  | 108     vqdmulh.s16     \t13, \x2,  d0[1] /* fractional part of x2 * 1.414 (high half of the doubled product) */ | 
|  | 109     vadd.s16        \t12, \x3,  \x3 /* t12 = 2 * x3 */ | 
|  | 110     vadd.s16        \x2,  \x2,  \t13 /* x2 ~= x2 * 1.414 */ | 
|  | 111     vqdmulh.s16     \t13, \x3,  d0[3] /* fractional part of x3 * 2.613 */ | 
|  | 112     vsub.s16        \t10,  \x1, \x3 | 
|  | 113     vadd.s16        \t12, \t12, \t13 /* t12 ~= x3 * 2.613 */ | 
|  | 114     vqdmulh.s16     \t13, \t10, d0[2] | 
|  | 115     vsub.s16        \t11, \x7,  \x5 | 
|  | 116     vadd.s16        \t10, \t10, \t13 /* t10 ~= (x1 - x3) * 1.847 */ | 
|  | 117     vqdmulh.s16     \t13, \t11, d0[1] | 
|  | 118     vadd.s16        \t11, \t11, \t13 /* t11 ~= (x7 - x5) * 1.414 */ | 
|  | 119 | 
|  | 120     vqdmulh.s16     \t13, \x1,  d0[0] /* fractional part of x1 * 1.082, added to x1 a few lines below */ | 
|  | 121     vsub.s16        \x2,  \x6,  \x2 | 
|  | 122     vsub.s16        \t14, \x0,  \x2 | 
|  | 123     vadd.s16        \x2,  \x0,  \x2 | 
|  | 124     vadd.s16        \x0,  \x4,  \x6 | 
|  | 125     vsub.s16        \x4,  \x4,  \x6 | 
|  | 126     vadd.s16        \x1,  \x1,  \t13 /* x1 ~= x1 * 1.082 */ | 
|  | 127     vadd.s16        \t13, \x7,  \x5 | 
|  | 128     vsub.s16        \t12, \t13, \t12 | 
|  | 129     vsub.s16        \t12, \t12, \t10 | 
|  | 130     vadd.s16        \t11, \t12, \t11 | 
|  | 131     vsub.s16        \t10, \x1,  \t10 | 
|  | 132     vadd.s16        \t10, \t10, \t11 | 
|  | 133 | 
|  | 134     vsub.s16        \x7,  \x0,  \t13 /* final stage: sums and differences written back into x0-x7 */ | 
|  | 135     vadd.s16        \x0,  \x0,  \t13 | 
|  | 136     vadd.s16        \x6,  \t14, \t12 | 
|  | 137     vsub.s16        \x1,  \t14, \t12 | 
|  | 138     vsub.s16        \x5,  \x2,  \t11 | 
|  | 139     vadd.s16        \x2,  \x2,  \t11 | 
|  | 140     vsub.s16        \x3,  \x4,  \t10 | 
|  | 141     vadd.s16        \x4,  \x4,  \t10 | 
|  | 142 .endm | 
|  | 143 | 
|  | 144 asm_function jsimd_idct_ifast_neon | 
|  | 145 | 
|  | 146     DCT_TABLE       .req r0 /* r0: multiplier table, read as 64 halfwords */ | 
|  | 147     COEF_BLOCK      .req r1 /* r1: coefficient block, read as 64 halfwords */ | 
|  | 148     OUTPUT_BUF      .req r2 /* r2: array of eight output row pointers */ | 
|  | 149     OUTPUT_COL      .req r3 /* r3: byte offset added to every row pointer */ | 
|  | 150     TMP             .req ip | 
|  | 151 | 
|  | 152     vpush           {d8-d15} /* d8-d15 are overwritten below, so preserve them for the caller */ | 
|  | 153 | 
|  | 154     /* Load constants */ | 
|  | 155     adr             TMP, jsimd_idct_ifast_neon_consts | 
|  | 156     vld1.16         {d0}, [TMP, :64] | 
|  | 157 | 
|  | 158     /* Load all COEF_BLOCK into NEON registers with the following allocation: | 
|  | 159      *       0 1 2 3 | 4 5 6 7 | 
|  | 160      *      ---------+-------- | 
|  | 161      *   0 | d4      | d5 | 
|  | 162      *   1 | d6      | d7 | 
|  | 163      *   2 | d8      | d9 | 
|  | 164      *   3 | d10     | d11 | 
|  | 165      *   4 | d12     | d13 | 
|  | 166      *   5 | d14     | d15 | 
|  | 167      *   6 | d16     | d17 | 
|  | 168      *   7 | d18     | d19 | 
|  | 169      */ | 
|  | 170     vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]! | 
|  | 171     vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]! | 
|  | 172     vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]! | 
|  | 173     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK]! | 
|  | 174     /* Dequantize: multiply each coefficient row element-wise by the matching DCT_TABLE row; table loads are interleaved with the multiplies */ | 
|  | 175     vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]! | 
|  | 176     vmul.s16        q2, q2, q10 | 
|  | 177     vld1.16         {d24, d25, d26, d27}, [DCT_TABLE]! | 
|  | 178     vmul.s16        q3, q3, q11 | 
|  | 179     vmul.s16        q4, q4, q12 | 
|  | 180     vld1.16         {d28, d29, d30, d31}, [DCT_TABLE]! | 
|  | 181     vmul.s16        q5, q5, q13 | 
|  | 182     vmul.s16        q6, q6, q14 | 
|  | 183     vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]! /* q10/q11 are reused for the last two table rows */ | 
|  | 184     vmul.s16        q7, q7, q15 | 
|  | 185     vmul.s16        q8, q8, q10 | 
|  | 186     vmul.s16        q9, q9, q11 | 
|  | 187 | 
|  | 188     /* Pass 1: q2-q9 hold rows 0-7, q10-q14 are scratch */ | 
|  | 189     idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 | 
|  | 190     /* Transpose the 8x8 block: transpose each 4x4 quadrant, then swap the two off-diagonal quadrants */ | 
|  | 191     transpose_4x4   d4,  d6,  d8,  d10 | 
|  | 192     transpose_4x4   d5,  d7,  d9,  d11 | 
|  | 193     transpose_4x4   d12, d14, d16, d18 | 
|  | 194     transpose_4x4   d13, d15, d17, d19 | 
|  | 195     vswp            d12, d5 | 
|  | 196     vswp            d14, d7 | 
|  | 197     vswp            d16, d9 | 
|  | 198     vswp            d18, d11 | 
|  | 199 | 
|  | 200     /* Pass 2: same helper applied to the transposed data */ | 
|  | 201     idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14 | 
|  | 202     /* Transpose back (same quadrant scheme as after pass 1) */ | 
|  | 203     transpose_4x4   d4,  d6,  d8,  d10 | 
|  | 204     transpose_4x4   d5,  d7,  d9,  d11 | 
|  | 205     transpose_4x4   d12, d14, d16, d18 | 
|  | 206     transpose_4x4   d13, d15, d17, d19 | 
|  | 207     vswp            d12, d5 | 
|  | 208     vswp            d14, d7 | 
|  | 209     vswp            d16, d9 | 
|  | 210     vswp            d18, d11 | 
|  | 211 | 
|  | 212     /* Descale and range limit */ | 
|  | 213     vmov.s16        q15, #(0x80 << 5) /* bias of 0x80, pre-scaled by the 5-bit shift applied below */ | 
|  | 214     vqadd.s16       q2, q2, q15 /* saturating add of the bias to every row */ | 
|  | 215     vqadd.s16       q3, q3, q15 | 
|  | 216     vqadd.s16       q4, q4, q15 | 
|  | 217     vqadd.s16       q5, q5, q15 | 
|  | 218     vqadd.s16       q6, q6, q15 | 
|  | 219     vqadd.s16       q7, q7, q15 | 
|  | 220     vqadd.s16       q8, q8, q15 | 
|  | 221     vqadd.s16       q9, q9, q15 | 
|  | 222     vqshrun.s16     d4, q2, #5 /* shift right by 5 and narrow to 8 bits with unsigned saturation */ | 
|  | 223     vqshrun.s16     d6, q3, #5 | 
|  | 224     vqshrun.s16     d8, q4, #5 | 
|  | 225     vqshrun.s16     d10, q5, #5 | 
|  | 226     vqshrun.s16     d12, q6, #5 | 
|  | 227     vqshrun.s16     d14, q7, #5 | 
|  | 228     vqshrun.s16     d16, q8, #5 | 
|  | 229     vqshrun.s16     d18, q9, #5 | 
|  | 230 | 
|  | 231     /* Store results to the output buffer: the .irp body is expanded once per row register */ | 
|  | 232     .irp            x, d4, d6, d8, d10, d12, d14, d16, d18 | 
|  | 233     ldr             TMP, [OUTPUT_BUF], #4 /* fetch the next row pointer and advance OUTPUT_BUF */ | 
|  | 234     add             TMP, TMP, OUTPUT_COL | 
|  | 235     vst1.8          {\x}, [TMP]! /* write the 8 bytes of this row */ | 
|  | 236     .endr | 
|  | 237 | 
|  | 238     vpop            {d8-d15} | 
|  | 239     bx              lr | 
|  | 240 | 
|  | 241     .unreq          DCT_TABLE | 
|  | 242     .unreq          COEF_BLOCK | 
|  | 243     .unreq          OUTPUT_BUF | 
|  | 244     .unreq          OUTPUT_COL | 
|  | 245     .unreq          TMP | 
|  | 246 .endfunc | 
|  | 247 | 
|  | 248 .purgem idct_helper | 
|  | 249 | 
|  | 250 /*****************************************************************************/ | 
|  | 251 | 
|  | 252 /* | 
|  | 253  * jsimd_idct_4x4_neon | 
|  | 254  * | 
|  | 255  * This function contains inverse-DCT code for getting reduced-size | 
|  | 256  * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations | 
|  | 257  * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | 
|  | 258  * function from jpeg-6b (jidctred.c). | 
|  | 259  * | 
|  | 260  * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which | 
|  | 261  *       requires far fewer arithmetic operations and hence should be faster. | 
|  | 262  *       The primary purpose of this particular NEON optimized function is | 
|  | 263  *       bit exact compatibility with jpeg-6b. | 
|  | 264  * | 
|  | 265  * TODO: somewhat better instruction scheduling can be achieved by expanding | 
|  | 266  *       idct_helper/transpose_4x4 macros and reordering instructions, | 
|  | 267  *       but readability will suffer somewhat. | 
|  | 268  */ | 
|  | 269 | 
|  | 270 #define CONST_BITS  13 /* fixed-point fraction bits: each FIX_* value below is its constant times 2^13, rounded */ | 
|  | 271 | 
|  | 272 #define FIX_0_211164243  (1730)  /* FIX(0.211164243) */ | 
|  | 273 #define FIX_0_509795579  (4176)  /* FIX(0.509795579) */ | 
|  | 274 #define FIX_0_601344887  (4926)  /* FIX(0.601344887) */ | 
|  | 275 #define FIX_0_720959822  (5906)  /* FIX(0.720959822) */ | 
|  | 276 #define FIX_0_765366865  (6270)  /* FIX(0.765366865) */ | 
|  | 277 #define FIX_0_850430095  (6967)  /* FIX(0.850430095) */ | 
|  | 278 #define FIX_0_899976223  (7373)  /* FIX(0.899976223) */ | 
|  | 279 #define FIX_1_061594337  (8697)  /* FIX(1.061594337) */ | 
|  | 280 #define FIX_1_272758580  (10426) /* FIX(1.272758580) */ | 
|  | 281 #define FIX_1_451774981  (11893) /* FIX(1.451774981) */ | 
|  | 282 #define FIX_1_847759065  (15137) /* FIX(1.847759065) */ | 
|  | 283 #define FIX_2_172734803  (17799) /* FIX(2.172734803) */ | 
|  | 284 #define FIX_2_562915447  (20995) /* FIX(2.562915447) */ | 
|  | 285 #define FIX_3_624509785  (29692) /* FIX(3.624509785) */ | 
|  | 286 | 
|  | 287 .balign 16 /* the 4x4 routine loads this table with a 128-bit alignment hint */ | 
|  | 288 jsimd_idct_4x4_neon_consts: | 
|  | 289     .short     FIX_1_847759065     /* d0[0] */ | 
|  | 290     .short     -FIX_0_765366865    /* d0[1] */ | 
|  | 291     .short     -FIX_0_211164243    /* d0[2] */ | 
|  | 292     .short     FIX_1_451774981     /* d0[3] */ | 
|  | 293     .short     -FIX_2_172734803    /* d1[0] */ | 
|  | 294     .short     FIX_1_061594337     /* d1[1] */ | 
|  | 295     .short     -FIX_0_509795579    /* d1[2] */ | 
|  | 296     .short     -FIX_0_601344887    /* d1[3] */ | 
|  | 297     .short     FIX_0_899976223     /* d2[0] */ | 
|  | 298     .short     FIX_2_562915447     /* d2[1] */ | 
|  | 299     .short     1 << (CONST_BITS+1) /* d2[2]: 2^14, the multiplier applied to the first helper input */ | 
|  | 300     .short     0                   /* d2[3]: padding, not referenced by the helper */ | 
|  | 301 | 
|  | 302 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 /* 1-D reduced IDCT: seven 64-bit inputs, four 64-bit outputs y26-y29; needs the constants in d0-d2 and clobbers q10, q12-q15 */ | 
|  | 303     vmull.s16       q14, \x4,  d2[2] /* q14 = x4 * 2^14 + x8 * d0[0] + x14 * d0[1] */ | 
|  | 304     vmlal.s16       q14, \x8,  d0[0] | 
|  | 305     vmlal.s16       q14, \x14, d0[1] | 
|  | 306 | 
|  | 307     vmull.s16       q13, \x16, d1[2] /* q13 = x16 * d1[2] + x12 * d1[3] + x10 * d2[0] + x6 * d2[1] */ | 
|  | 308     vmlal.s16       q13, \x12, d1[3] | 
|  | 309     vmlal.s16       q13, \x10, d2[0] | 
|  | 310     vmlal.s16       q13, \x6,  d2[1] | 
|  | 311 | 
|  | 312     vmull.s16       q15, \x4,  d2[2] /* q15 = x4 * 2^14 - x8 * d0[0] - x14 * d0[1] */ | 
|  | 313     vmlsl.s16       q15, \x8,  d0[0] | 
|  | 314     vmlsl.s16       q15, \x14, d0[1] | 
|  | 315 | 
|  | 316     vmull.s16       q12, \x16, d0[2] /* q12 = x16 * d0[2] + x12 * d0[3] + x10 * d1[0] + x6 * d1[1] */ | 
|  | 317     vmlal.s16       q12, \x12, d0[3] | 
|  | 318     vmlal.s16       q12, \x10, d1[0] | 
|  | 319     vmlal.s16       q12, \x6,  d1[1] | 
|  | 320 | 
|  | 321     vadd.s32        q10, q14, q13 /* y26 comes from q14 + q13, y29 from q14 - q13 */ | 
|  | 322     vsub.s32        q14, q14, q13 | 
|  | 323 | 
|  | 324 .if \shift > 16 /* shifts above 16 are done as a rounding shift followed by a separate narrow */ | 
|  | 325     vrshr.s32       q10,  q10, #\shift | 
|  | 326     vrshr.s32       q14,  q14, #\shift | 
|  | 327     vmovn.s32       \y26, q10 | 
|  | 328     vmovn.s32       \y29, q14 | 
|  | 329 .else | 
|  | 330     vrshrn.s32      \y26, q10, #\shift /* rounding shift right and narrow in one instruction */ | 
|  | 331     vrshrn.s32      \y29, q14, #\shift | 
|  | 332 .endif | 
|  | 333 | 
|  | 334     vadd.s32        q10, q15, q12 /* y27 comes from q15 + q12, y28 from q15 - q12 */ | 
|  | 335     vsub.s32        q15, q15, q12 | 
|  | 336 | 
|  | 337 .if \shift > 16 | 
|  | 338     vrshr.s32       q10,  q10, #\shift | 
|  | 339     vrshr.s32       q15,  q15, #\shift | 
|  | 340     vmovn.s32       \y27, q10 | 
|  | 341     vmovn.s32       \y28, q15 | 
|  | 342 .else | 
|  | 343     vrshrn.s32      \y27, q10, #\shift | 
|  | 344     vrshrn.s32      \y28, q15, #\shift | 
|  | 345 .endif | 
|  | 346 | 
|  | 347 .endm | 
|  | 348 | 
|  | 349 asm_function jsimd_idct_4x4_neon | 
|  | 350 | 
|  | 351     DCT_TABLE       .req r0 /* r0: multiplier table */ | 
|  | 352     COEF_BLOCK      .req r1 /* r1: coefficient block */ | 
|  | 353     OUTPUT_BUF      .req r2 /* r2: array of four output row pointers */ | 
|  | 354     OUTPUT_COL      .req r3 /* r3: byte offset added to every row pointer */ | 
|  | 355     TMP1            .req r0 /* TMP1-TMP3 alias r0-r2; they are only written by the final ldmia */ | 
|  | 356     TMP2            .req r1 | 
|  | 357     TMP3            .req r2 | 
|  | 358     TMP4            .req ip | 
|  | 359 | 
|  | 360     vpush           {d8-d15} /* d8-d15 are overwritten below, so preserve them for the caller */ | 
|  | 361 | 
|  | 362     /* Load constants (d3 is just used for padding) */ | 
|  | 363     adr             TMP4, jsimd_idct_4x4_neon_consts | 
|  | 364     vld1.16         {d0, d1, d2, d3}, [TMP4, :128] /* NOTE(review): the table defines only 12 halfwords (d0-d2); d3 receives whatever 8 bytes follow it */ | 
|  | 365 | 
|  | 366     /* Load all COEF_BLOCK into NEON registers with the following allocation: | 
|  | 367      *       0 1 2 3 | 4 5 6 7 | 
|  | 368      *      ---------+-------- | 
|  | 369      *   0 | d4      | d5 | 
|  | 370      *   1 | d6      | d7 | 
|  | 371      *   2 | d8      | d9 | 
|  | 372      *   3 | d10     | d11 | 
|  | 373      *   4 | -       | - | 
|  | 374      *   5 | d12     | d13 | 
|  | 375      *   6 | d14     | d15 | 
|  | 376      *   7 | d16     | d17 | 
|  | 377      */ | 
|  | 378     vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]! | 
|  | 379     vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]! | 
|  | 380     add COEF_BLOCK, COEF_BLOCK, #16 /* skip the 16 bytes of row 4, which is not used */ | 
|  | 381     vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]! | 
|  | 382     vld1.16         {d16, d17}, [COEF_BLOCK, :128]! | 
|  | 383     /* dequantize: element-wise multiply by the matching DCT_TABLE rows */ | 
|  | 384     vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]! | 
|  | 385     vmul.s16        q2, q2, q9 | 
|  | 386     vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]! | 
|  | 387     vmul.s16        q3, q3, q10 | 
|  | 388     vmul.s16        q4, q4, q11 | 
|  | 389     add             DCT_TABLE, DCT_TABLE, #16 /* skip table row 4 as well */ | 
|  | 390     vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]! | 
|  | 391     vmul.s16        q5, q5, q12 | 
|  | 392     vmul.s16        q6, q6, q13 | 
|  | 393     vld1.16         {d30, d31}, [DCT_TABLE, :128]! | 
|  | 394     vmul.s16        q7, q7, q14 | 
|  | 395     vmul.s16        q8, q8, q15 | 
|  | 396 | 
|  | 397     /* Pass 1: one helper call per 4-column half, results written back over rows 0-3 and transposed */ | 
|  | 398     idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 | 
|  | 399     transpose_4x4   d4, d6, d8, d10 | 
|  | 400     idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 | 
|  | 401     transpose_4x4   d5, d7, d9, d11 | 
|  | 402 | 
|  | 403     /* Pass 2: d5 is not an input here; results go to d26-d29 with the larger shift of 19 */ | 
|  | 404     idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 | 
|  | 405     transpose_4x4   d26, d27, d28, d29 | 
|  | 406 | 
|  | 407     /* Range limit: add 0x80, then narrow to 8 bits with unsigned saturation */ | 
|  | 408     vmov.u16        q15, #0x80 | 
|  | 409     vadd.s16        q13, q13, q15 | 
|  | 410     vadd.s16        q14, q14, q15 | 
|  | 411     vqmovun.s16     d26, q13 /* d26 = output rows 0 (bytes 0-3) and 1 (bytes 4-7) */ | 
|  | 412     vqmovun.s16     d27, q14 /* d27 = output rows 2 (bytes 0-3) and 3 (bytes 4-7) */ | 
|  | 413 | 
|  | 414     /* Store results to the output buffer */ | 
|  | 415     ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} /* load the four row pointers; this overwrites r0-r2 */ | 
|  | 416     add             TMP1, TMP1, OUTPUT_COL | 
|  | 417     add             TMP2, TMP2, OUTPUT_COL | 
|  | 418     add             TMP3, TMP3, OUTPUT_COL | 
|  | 419     add             TMP4, TMP4, OUTPUT_COL | 
|  | 420 | 
|  | 421 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT | 
|  | 422     /* We can use far fewer instructions on little endian systems if the | 
|  | 423      * OS kernel is not configured to trap unaligned memory accesses | 
|  | 424      */ | 
|  | 425     vst1.32         {d26[0]}, [TMP1]! | 
|  | 426     vst1.32         {d27[0]}, [TMP3]! | 
|  | 427     vst1.32         {d26[1]}, [TMP2]! | 
|  | 428     vst1.32         {d27[1]}, [TMP4]! | 
|  | 429 #else | 
|  | 430     vst1.8          {d26[0]}, [TMP1]! /* byte-wise stores: lanes 0-3 go to rows 0 and 2 */ | 
|  | 431     vst1.8          {d27[0]}, [TMP3]! | 
|  | 432     vst1.8          {d26[1]}, [TMP1]! | 
|  | 433     vst1.8          {d27[1]}, [TMP3]! | 
|  | 434     vst1.8          {d26[2]}, [TMP1]! | 
|  | 435     vst1.8          {d27[2]}, [TMP3]! | 
|  | 436     vst1.8          {d26[3]}, [TMP1]! | 
|  | 437     vst1.8          {d27[3]}, [TMP3]! | 
|  | 438 | 
|  | 439     vst1.8          {d26[4]}, [TMP2]! /* lanes 4-7 go to rows 1 and 3 */ | 
|  | 440     vst1.8          {d27[4]}, [TMP4]! | 
|  | 441     vst1.8          {d26[5]}, [TMP2]! | 
|  | 442     vst1.8          {d27[5]}, [TMP4]! | 
|  | 443     vst1.8          {d26[6]}, [TMP2]! | 
|  | 444     vst1.8          {d27[6]}, [TMP4]! | 
|  | 445     vst1.8          {d26[7]}, [TMP2]! | 
|  | 446     vst1.8          {d27[7]}, [TMP4]! | 
|  | 447 #endif | 
|  | 448 | 
|  | 449     vpop            {d8-d15} | 
|  | 450     bx              lr | 
|  | 451 | 
|  | 452     .unreq          DCT_TABLE | 
|  | 453     .unreq          COEF_BLOCK | 
|  | 454     .unreq          OUTPUT_BUF | 
|  | 455     .unreq          OUTPUT_COL | 
|  | 456     .unreq          TMP1 | 
|  | 457     .unreq          TMP2 | 
|  | 458     .unreq          TMP3 | 
|  | 459     .unreq          TMP4 | 
|  | 460 .endfunc | 
|  | 461 | 
|  | 462 .purgem idct_helper | 
|  | 463 | 
|  | 464 /*****************************************************************************/ | 
|  | 465 | 
|  | 466 /* | 
|  | 467  * jsimd_idct_2x2_neon | 
|  | 468  * | 
|  | 469  * This function contains inverse-DCT code for getting reduced-size | 
|  | 470  * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations | 
|  | 471  * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' | 
|  | 472  * function from jpeg-6b (jidctred.c). | 
|  | 473  * | 
|  | 474  * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which | 
|  | 475  *       requires far fewer arithmetic operations and hence should be faster. | 
|  | 476  *       The primary purpose of this particular NEON optimized function is | 
|  | 477  *       bit exact compatibility with jpeg-6b. | 
|  | 478  */ | 
|  | 479 | 
|  | 480 .balign 8 /* the 2x2 routine loads this table into d0 with a 64-bit alignment hint */ | 
|  | 481 jsimd_idct_2x2_neon_consts: | 
|  | 482     .short     -FIX_0_720959822    /* d0[0] */ | 
|  | 483     .short     FIX_0_850430095     /* d0[1] */ | 
|  | 484     .short     -FIX_1_272758580    /* d0[2] */ | 
|  | 485     .short     FIX_3_624509785     /* d0[3] */ | 
|  | 486 | 
|  | 487 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 /* 1-D reduced IDCT: five 64-bit inputs, two 64-bit outputs; needs the constants in d0 and clobbers q10, q13, q14 */ | 
|  | 488     vshll.s16  q14,  \x4,  #15 /* q14 = x4 widened to 32 bits and shifted left by 15 */ | 
|  | 489     vmull.s16  q13,  \x6,  d0[3] /* q13 = x6 * d0[3] + x10 * d0[2] + x12 * d0[1] + x16 * d0[0] */ | 
|  | 490     vmlal.s16  q13,  \x10, d0[2] | 
|  | 491     vmlal.s16  q13,  \x12, d0[1] | 
|  | 492     vmlal.s16  q13,  \x16, d0[0] | 
|  | 493 | 
|  | 494     vadd.s32   q10,  q14,  q13 /* y26 comes from q14 + q13, y27 from q14 - q13 */ | 
|  | 495     vsub.s32   q14,  q14,  q13 | 
|  | 496 | 
|  | 497 .if \shift > 16 /* shifts above 16 are done as a rounding shift followed by a separate narrow */ | 
|  | 498     vrshr.s32  q10,  q10,  #\shift | 
|  | 499     vrshr.s32  q14,  q14,  #\shift | 
|  | 500     vmovn.s32  \y26, q10 | 
|  | 501     vmovn.s32  \y27, q14 | 
|  | 502 .else | 
|  | 503     vrshrn.s32 \y26, q10,  #\shift /* rounding shift right and narrow in one instruction */ | 
|  | 504     vrshrn.s32 \y27, q14,  #\shift | 
|  | 505 .endif | 
|  | 506 | 
|  | 507 .endm | 
|  | 508 | 
|  | 509 asm_function jsimd_idct_2x2_neon | 
|  | 510 | 
|  | 511     DCT_TABLE       .req r0 /* r0: multiplier table */ | 
|  | 512     COEF_BLOCK      .req r1 /* r1: coefficient block */ | 
|  | 513     OUTPUT_BUF      .req r2 /* r2: array of two output row pointers */ | 
|  | 514     OUTPUT_COL      .req r3 /* r3: byte offset added to both row pointers */ | 
|  | 515     TMP1            .req r0 /* aliases r0; only written by the final ldmia */ | 
|  | 516     TMP2            .req ip | 
|  | 517 | 
|  | 518     vpush           {d8-d15} /* d10-d13 are overwritten below; the whole callee-range d8-d15 is saved */ | 
|  | 519 | 
|  | 520     /* Load constants */ | 
|  | 521     adr             TMP2, jsimd_idct_2x2_neon_consts | 
|  | 522     vld1.16         {d0}, [TMP2, :64] | 
|  | 523 | 
|  | 524     /* Load all COEF_BLOCK into NEON registers with the following allocation: | 
|  | 525      *       0 1 2 3 | 4 5 6 7 | 
|  | 526      *      ---------+-------- | 
|  | 527      *   0 | d4      | d5 | 
|  | 528      *   1 | d6      | d7 | 
|  | 529      *   2 | -       | - | 
|  | 530      *   3 | d10     | d11 | 
|  | 531      *   4 | -       | - | 
|  | 532      *   5 | d12     | d13 | 
|  | 533      *   6 | -       | - | 
|  | 534      *   7 | d16     | d17 | 
|  | 535      */ | 
|  | 536     vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]! | 
|  | 537     add             COEF_BLOCK, COEF_BLOCK, #16 /* each add skips one unused 16-byte row (2, 4 and 6) */ | 
|  | 538     vld1.16         {d10, d11}, [COEF_BLOCK, :128]! | 
|  | 539     add             COEF_BLOCK, COEF_BLOCK, #16 | 
|  | 540     vld1.16         {d12, d13}, [COEF_BLOCK, :128]! | 
|  | 541     add             COEF_BLOCK, COEF_BLOCK, #16 | 
|  | 542     vld1.16         {d16, d17}, [COEF_BLOCK, :128]! | 
|  | 543     /* Dequantize: element-wise multiply by the matching DCT_TABLE rows, skipping the same rows */ | 
|  | 544     vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]! | 
|  | 545     vmul.s16        q2, q2, q9 | 
|  | 546     vmul.s16        q3, q3, q10 | 
|  | 547     add             DCT_TABLE, DCT_TABLE, #16 | 
|  | 548     vld1.16         {d24, d25}, [DCT_TABLE, :128]! | 
|  | 549     vmul.s16        q5, q5, q12 | 
|  | 550     add             DCT_TABLE, DCT_TABLE, #16 | 
|  | 551     vld1.16         {d26, d27}, [DCT_TABLE, :128]! | 
|  | 552     vmul.s16        q6, q6, q13 | 
|  | 553     add             DCT_TABLE, DCT_TABLE, #16 | 
|  | 554     vld1.16         {d30, d31}, [DCT_TABLE, :128]! | 
|  | 555     vmul.s16        q8, q8, q15 | 
|  | 556 | 
|  | 557     /* Pass 1 */ | 
|  | 558 #if 0 /* disabled variant built from the helper macros; only the #else branch is assembled */ | 
|  | 559     idct_helper     d4, d6, d10, d12, d16, 13, d4, d6 | 
|  | 560     transpose_4x4   d4, d6, d8,  d10 | 
|  | 561     idct_helper     d5, d7, d11, d13, d17, 13, d5, d7 | 
|  | 562     transpose_4x4   d5, d7, d9,  d11 | 
|  | 563 #else | 
|  | 564     vmull.s16       q13, d6,  d0[3] /* q13: weighted sum of rows 1, 3, 5, 7 for columns 0-3 */ | 
|  | 565     vmlal.s16       q13, d10, d0[2] | 
|  | 566     vmlal.s16       q13, d12, d0[1] | 
|  | 567     vmlal.s16       q13, d16, d0[0] | 
|  | 568     vmull.s16       q12, d7,  d0[3] /* q12: the same weighted sum for columns 4-7 */ | 
|  | 569     vmlal.s16       q12, d11, d0[2] | 
|  | 570     vmlal.s16       q12, d13, d0[1] | 
|  | 571     vmlal.s16       q12, d17, d0[0] | 
|  | 572     vshll.s16       q14, d4,  #15 /* row 0 widened and shifted left by 15 (columns 0-3) */ | 
|  | 573     vshll.s16       q15, d5,  #15 /* row 0 widened and shifted left by 15 (columns 4-7) */ | 
|  | 574     vadd.s32        q10, q14, q13 | 
|  | 575     vsub.s32        q14, q14, q13 | 
|  | 576     vrshrn.s32      d4,  q10, #13 /* d4 = sum, d6 = difference, both rounded and narrowed (columns 0-3) */ | 
|  | 577     vrshrn.s32      d6,  q14, #13 | 
|  | 578     vadd.s32        q10, q15, q12 | 
|  | 579     vsub.s32        q14, q15, q12 | 
|  | 580     vrshrn.s32      d5,  q10, #13 /* d5 = sum, d7 = difference (columns 4-7) */ | 
|  | 581     vrshrn.s32      d7,  q14, #13 | 
|  | 582     vtrn.16         q2,  q3 /* rearrange the pass 1 results into the registers pass 2 reads */ | 
|  | 583     vtrn.32         q3,  q5 | 
|  | 584 #endif | 
|  | 585 | 
|  | 586     /* Pass 2: results go to d26/d27 with the larger shift of 20 */ | 
|  | 587     idct_helper     d4, d6, d10, d7, d11, 20, d26, d27 | 
|  | 588 | 
|  | 589     /* Range limit: add 0x80, then narrow to 8 bits with unsigned saturation */ | 
|  | 590     vmov.u16        q15, #0x80 | 
|  | 591     vadd.s16        q13, q13, q15 | 
|  | 592     vqmovun.s16     d26, q13 | 
|  | 593     vqmovun.s16     d27, q13 /* narrows q13 again after d26 was rewritten; lanes 4-7 of the result still come from the old d27, and only lanes 4-5 are stored */ | 
|  | 594 | 
|  | 595     /* Store results to the output buffer */ | 
|  | 596     ldmia           OUTPUT_BUF, {TMP1, TMP2} /* load the two row pointers; this overwrites r0 */ | 
|  | 597     add             TMP1, TMP1, OUTPUT_COL | 
|  | 598     add             TMP2, TMP2, OUTPUT_COL | 
|  | 599 | 
|  | 600     vst1.8          {d26[0]}, [TMP1]! /* two bytes to the first row pointer */ | 
|  | 601     vst1.8          {d27[4]}, [TMP1]! | 
|  | 602     vst1.8          {d26[1]}, [TMP2]! /* two bytes to the second row pointer */ | 
|  | 603     vst1.8          {d27[5]}, [TMP2]! | 
|  | 604 | 
|  | 605     vpop            {d8-d15} | 
|  | 606     bx              lr | 
|  | 607 | 
|  | 608     .unreq          DCT_TABLE | 
|  | 609     .unreq          COEF_BLOCK | 
|  | 610     .unreq          OUTPUT_BUF | 
|  | 611     .unreq          OUTPUT_COL | 
|  | 612     .unreq          TMP1 | 
|  | 613     .unreq          TMP2 | 
|  | 614 .endfunc | 
|  | 615 | 
|  | 616 .purgem idct_helper | 
|  | 617 | 
|  | 618 /*****************************************************************************/ | 
|  | 619 | 
|  | 620 /* | 
|  | 621  * jsimd_ycc_extrgb_convert_neon | 
|  | 622  * jsimd_ycc_extbgr_convert_neon | 
|  | 623  * jsimd_ycc_extrgbx_convert_neon | 
|  | 624  * jsimd_ycc_extbgrx_convert_neon | 
|  | 625  * jsimd_ycc_extxbgr_convert_neon | 
|  | 626  * jsimd_ycc_extxrgb_convert_neon | 
|  | 627  * | 
|  | 628  * Colorspace conversion YCbCr -> RGB | 
|  | 629  */ | 
|  | 630 | 
|  | 631 | 
|  | 632 .macro do_load size /* load size bytes from each of the U, V and Y pointers into d4, d5 and d0, post-incrementing the pointers; U, V and Y must be register aliases defined by the user of this macro */ | 
|  | 633     .if \size == 8 | 
|  | 634         vld1.8  {d4}, [U]! | 
|  | 635         vld1.8  {d5}, [V]! | 
|  | 636         vld1.8  {d0}, [Y]! | 
|  | 637         pld     [Y, #64] /* prefetch 64 bytes ahead on each of the three pointers */ | 
|  | 638         pld     [U, #64] | 
|  | 639         pld     [V, #64] | 
|  | 640     .elseif \size == 4 /* fills byte lanes 0-3 only */ | 
|  | 641         vld1.8  {d4[0]}, [U]! | 
|  | 642         vld1.8  {d4[1]}, [U]! | 
|  | 643         vld1.8  {d4[2]}, [U]! | 
|  | 644         vld1.8  {d4[3]}, [U]! | 
|  | 645         vld1.8  {d5[0]}, [V]! | 
|  | 646         vld1.8  {d5[1]}, [V]! | 
|  | 647         vld1.8  {d5[2]}, [V]! | 
|  | 648         vld1.8  {d5[3]}, [V]! | 
|  | 649         vld1.8  {d0[0]}, [Y]! | 
|  | 650         vld1.8  {d0[1]}, [Y]! | 
|  | 651         vld1.8  {d0[2]}, [Y]! | 
|  | 652         vld1.8  {d0[3]}, [Y]! | 
|  | 653     .elseif \size == 2 /* fills byte lanes 4-5, so it does not disturb a preceding size-4 load */ | 
|  | 654         vld1.8  {d4[4]}, [U]! | 
|  | 655         vld1.8  {d4[5]}, [U]! | 
|  | 656         vld1.8  {d5[4]}, [V]! | 
|  | 657         vld1.8  {d5[5]}, [V]! | 
|  | 658         vld1.8  {d0[4]}, [Y]! | 
|  | 659         vld1.8  {d0[5]}, [Y]! | 
|  | 660     .elseif \size == 1 /* fills byte lane 6 */ | 
|  | 661         vld1.8  {d4[6]}, [U]! | 
|  | 662         vld1.8  {d5[6]}, [V]! | 
|  | 663         vld1.8  {d0[6]}, [Y]! | 
|  | 664     .else | 
|  | 665         .error unsupported macroblock size | 
|  | 666     .endif | 
|  | 667 .endm | 
|  | 668 | 
|  | 669 .macro do_store bpp, size /* store size pixels from d10-d12 (24 bpp) or d10-d13 (32 bpp) through the RGB pointer, interleaved and post-incremented; RGB must be a register alias defined by the user of this macro */ | 
|  | 670     .if \bpp == 24 /* three interleaved bytes per pixel */ | 
|  | 671         .if \size == 8 | 
|  | 672             vst3.8  {d10, d11, d12}, [RGB]! | 
|  | 673         .elseif \size == 4 /* byte lanes 0-3 */ | 
|  | 674             vst3.8  {d10[0], d11[0], d12[0]}, [RGB]! | 
|  | 675             vst3.8  {d10[1], d11[1], d12[1]}, [RGB]! | 
|  | 676             vst3.8  {d10[2], d11[2], d12[2]}, [RGB]! | 
|  | 677             vst3.8  {d10[3], d11[3], d12[3]}, [RGB]! | 
|  | 678         .elseif \size == 2 /* byte lanes 4-5 */ | 
|  | 679             vst3.8  {d10[4], d11[4], d12[4]}, [RGB]! | 
|  | 680             vst3.8  {d10[5], d11[5], d12[5]}, [RGB]! | 
|  | 681         .elseif \size == 1 /* byte lane 6 */ | 
|  | 682             vst3.8  {d10[6], d11[6], d12[6]}, [RGB]! | 
|  | 683         .else | 
|  | 684             .error unsupported macroblock size | 
|  | 685         .endif | 
|  | 686     .elseif \bpp == 32 /* four interleaved bytes per pixel; d13 supplies the fourth */ | 
|  | 687         .if \size == 8 | 
|  | 688             vst4.8  {d10, d11, d12, d13}, [RGB]! | 
|  | 689         .elseif \size == 4 | 
|  | 690             vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]! | 
|  | 691             vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]! | 
|  | 692             vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]! | 
|  | 693             vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]! | 
|  | 694         .elseif \size == 2 | 
|  | 695             vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]! | 
|  | 696             vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]! | 
|  | 697         .elseif \size == 1 | 
|  | 698             vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]! | 
|  | 699         .else | 
|  | 700             .error unsupported macroblock size | 
|  | 701         .endif | 
|  | 702     .else | 
|  | 703         .error unsupported bpp | 
|  | 704     .endif | 
|  | 705 .endm | 
|  | 706 | 
|  | 707 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs | 
|  | 708 | 
|  | 709 .macro do_yuv_to_rgb /* convert 8 pixels: Y in d0 (bytes), U in d4, V in d5; expects the -128 constants in q1 and the multipliers in d1 */ | 
|  | 710     vaddw.u8        q3, q1, d4     /* q3 = u - 128 */ | 
|  | 711     vaddw.u8        q4, q1, d5     /* q4 = v - 128 */ | 
|  | 712     vmull.s16       q10, d6, d1[1] /* multiply by -11277 */ | 
|  | 713     vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */ | 
|  | 714     vmull.s16       q11, d7, d1[1] /* multiply by -11277 */ | 
|  | 715     vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */ | 
|  | 716     vmull.s16       q12, d8, d1[0] /* multiply by 22971 */ | 
|  | 717     vmull.s16       q13, d9, d1[0] /* multiply by 22971 */ | 
|  | 718     vmull.s16       q14, d6, d1[3] /* multiply by 29033 */ | 
|  | 719     vmull.s16       q15, d7, d1[3] /* multiply by 29033 */ | 
|  | 720     vrshrn.s32      d20, q10, #15 /* rounding shift and narrow: the two-term sum uses a shift of 15 */ | 
|  | 721     vrshrn.s32      d21, q11, #15 | 
|  | 722     vrshrn.s32      d24, q12, #14 /* the single-term products use a shift of 14 */ | 
|  | 723     vrshrn.s32      d25, q13, #14 | 
|  | 724     vrshrn.s32      d28, q14, #14 | 
|  | 725     vrshrn.s32      d29, q15, #14 | 
|  | 726     vaddw.u8        q10, q10, d0 /* add the widened Y bytes to each of the three terms */ | 
|  | 727     vaddw.u8        q12, q12, d0 | 
|  | 728     vaddw.u8        q14, q14, d0 | 
|  | 729     vqmovun.s16     d1\g_offs, q10 /* narrow with unsigned saturation; the destination register number is formed from the enclosing macro's offset arguments */ | 
|  | 730     vqmovun.s16     d1\r_offs, q12 | 
|  | 731     vqmovun.s16     d1\b_offs, q14 | 
|  | 732 .endm | 
|  | 733 | 
|  | 734 /* Apple gas crashes on adrl, work around that by using adr. | 
|  | 735  * But this requires a copy of these constants for each function. | 
|  | 736  */ | 
|  | 737 | 
|  | 738 .balign 16 | 
|  | 739 jsimd_ycc_\colorid\()_neon_consts: | 
|  | 740     .short          0,      0,     0,      0 | 
|  | 741     .short          22971, -11277, -23401, 29033 | 
|  | 742     .short          -128,  -128,   -128,   -128 | 
|  | 743     .short          -128,  -128,   -128,   -128 | 
|  | 744 | 
|  | 745 asm_function jsimd_ycc_\colorid\()_convert_neon | 
|  | 746     OUTPUT_WIDTH    .req r0 | 
|  | 747     INPUT_BUF       .req r1 | 
|  | 748     INPUT_ROW       .req r2 | 
|  | 749     OUTPUT_BUF      .req r3 | 
|  | 750     NUM_ROWS        .req r4 | 
|  | 751 | 
|  | 752     INPUT_BUF0      .req r5 | 
|  | 753     INPUT_BUF1      .req r6 | 
|  | 754     INPUT_BUF2      .req INPUT_BUF | 
|  | 755 | 
|  | 756     RGB             .req r7 | 
|  | 757     Y               .req r8 | 
|  | 758     U               .req r9 | 
|  | 759     V               .req r10 | 
|  | 760     N               .req ip | 
|  | 761 | 
|  | 762     /* Load constants to d1, d2, d3 (d0 is just used for padding) */ | 
|  | 763     adr             ip, jsimd_ycc_\colorid\()_neon_consts | 
|  | 764     vld1.16         {d0, d1, d2, d3}, [ip, :128] | 
|  | 765 | 
|  | 766     /* Save ARM registers and handle input arguments */ | 
|  | 767     push            {r4, r5, r6, r7, r8, r9, r10, lr} | 
|  | 768     ldr             NUM_ROWS, [sp, #(4 * 8)] | 
|  | 769     ldr             INPUT_BUF0, [INPUT_BUF] | 
|  | 770     ldr             INPUT_BUF1, [INPUT_BUF, #4] | 
|  | 771     ldr             INPUT_BUF2, [INPUT_BUF, #8] | 
|  | 772     .unreq          INPUT_BUF | 
|  | 773 | 
|  | 774     /* Save NEON registers */ | 
|  | 775     vpush           {d8-d15} | 
|  | 776 | 
|  | 777     /* Initially set d10, d11, d12, d13 to 0xFF */ | 
|  | 778     vmov.u8         q5, #255 | 
|  | 779     vmov.u8         q6, #255 | 
|  | 780 | 
|  | 781     /* Outer loop over scanlines */ | 
|  | 782     cmp             NUM_ROWS, #1 | 
|  | 783     blt             9f | 
|  | 784 0: | 
|  | 785     ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2] | 
|  | 786     ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2] | 
|  | 787     mov             N, OUTPUT_WIDTH | 
|  | 788     ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2] | 
|  | 789     add             INPUT_ROW, INPUT_ROW, #1 | 
|  | 790     ldr             RGB, [OUTPUT_BUF], #4 | 
|  | 791 | 
|  | 792     /* Inner loop over pixels */ | 
|  | 793     subs            N, N, #8 | 
|  | 794     blt             2f | 
|  | 795 1: | 
|  | 796     do_load         8 | 
|  | 797     do_yuv_to_rgb | 
|  | 798     do_store        \bpp, 8 | 
|  | 799     subs            N, N, #8 | 
|  | 800     bge             1b | 
|  | 801     tst             N, #7 | 
|  | 802     beq             8f | 
|  | 803 2: | 
|  | 804     tst             N, #4 | 
|  | 805     beq             3f | 
|  | 806     do_load         4 | 
|  | 807 3: | 
|  | 808     tst             N, #2 | 
|  | 809     beq             4f | 
|  | 810     do_load         2 | 
|  | 811 4: | 
|  | 812     tst             N, #1 | 
|  | 813     beq             5f | 
|  | 814     do_load         1 | 
|  | 815 5: | 
|  | 816     do_yuv_to_rgb | 
|  | 817     tst             N, #4 | 
|  | 818     beq             6f | 
|  | 819     do_store        \bpp, 4 | 
|  | 820 6: | 
|  | 821     tst             N, #2 | 
|  | 822     beq             7f | 
|  | 823     do_store        \bpp, 2 | 
|  | 824 7: | 
|  | 825     tst             N, #1 | 
|  | 826     beq             8f | 
|  | 827     do_store        \bpp, 1 | 
|  | 828 8: | 
|  | 829     subs            NUM_ROWS, NUM_ROWS, #1 | 
|  | 830     bgt             0b | 
|  | 831 9: | 
|  | 832     /* Restore all registers and return */ | 
|  | 833     vpop            {d8-d15} | 
|  | 834     pop             {r4, r5, r6, r7, r8, r9, r10, pc} | 
|  | 835 | 
|  | 836     .unreq          OUTPUT_WIDTH | 
|  | 837     .unreq          INPUT_ROW | 
|  | 838     .unreq          OUTPUT_BUF | 
|  | 839     .unreq          NUM_ROWS | 
|  | 840     .unreq          INPUT_BUF0 | 
|  | 841     .unreq          INPUT_BUF1 | 
|  | 842     .unreq          INPUT_BUF2 | 
|  | 843     .unreq          RGB | 
|  | 844     .unreq          Y | 
|  | 845     .unreq          U | 
|  | 846     .unreq          V | 
|  | 847     .unreq          N | 
|  | 848 .endfunc | 
|  | 849 | 
|  | 850 .purgem do_yuv_to_rgb | 
|  | 851 | 
|  | 852 .endm | 
|  | 853 /* One converter per output layout. The R, G, B columns are the digits appended to d1 to pick each channel's register among d10..d13. */ | 
|  | 854 /*--------------------------------- id ----- bpp R  G  B */ | 
|  | 855 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2 | 
|  | 856 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0 | 
|  | 857 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 /* d13 is never written, so it keeps its 0xFF preset */ | 
|  | 858 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 /* d13 keeps 0xFF here as well */ | 
|  | 859 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 /* d10 is never written, so it keeps its 0xFF preset */ | 
|  | 860 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 /* d10 keeps 0xFF here as well */ | 
|  | 861 /* Drop the load/store helper macros that the generated converters above expand */ | 
|  | 862 .purgem do_load | 
|  | 863 .purgem do_store | 
|  | 864 | 
|  | 865 /*****************************************************************************/ | 
| OLD | NEW | 
|---|