Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * ARMv8 NEON optimizations for libjpeg-turbo | 2 * ARMv8 NEON optimizations for libjpeg-turbo |
| 3 * | 3 * |
| 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
| 5 * All rights reserved. | 5 * All rights reserved. |
| 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
| 7 * Copyright (C) 2013-2014, Linaro Limited | 7 * Copyright (C) 2013-2014, Linaro Limited |
| 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> | 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> |
| 9 * | 9 * |
| 10 * This software is provided 'as-is', without any express or implied | 10 * This software is provided 'as-is', without any express or implied |
| (...skipping 11 matching lines...) Expand all Loading... | |
| 22 * 2. Altered source versions must be plainly marked as such, and must not be | 22 * 2. Altered source versions must be plainly marked as such, and must not be |
| 23 * misrepresented as being the original software. | 23 * misrepresented as being the original software. |
| 24 * 3. This notice may not be removed or altered from any source distribution. | 24 * 3. This notice may not be removed or altered from any source distribution. |
| 25 */ | 25 */ |
| 26 | 26 |
| 27 #if defined(__linux__) && defined(__ELF__) | 27 #if defined(__linux__) && defined(__ELF__) |
| 28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ | 28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ |
| 29 #endif | 29 #endif |
| 30 | 30 |
| 31 .text | 31 .text |
| 32 #ifndef __clang__ | |
| 32 .arch armv8-a+fp+simd | 33 .arch armv8-a+fp+simd |
|
Nico
2016/04/20 20:53:30
again, can we please fix this in clang instead?
| |
| 34 #endif | |
| 33 | 35 |
| 34 | 36 |
| 35 #define RESPECT_STRICT_ALIGNMENT 1 | 37 #define RESPECT_STRICT_ALIGNMENT 1 |
| 36 | 38 |
| 37 | 39 |
| 38 /*****************************************************************************/ | 40 /*****************************************************************************/ |
| 39 | 41 |
| 40 /* Supplementary macro for setting function attributes */ | 42 /* Supplementary macro for setting function attributes */ |
| 41 .macro asm_function fname | 43 .macro asm_function fname |
| 42 #ifdef __APPLE__ | 44 #ifdef __APPLE__ |
| (...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 190 tmp0 = q4; \ | 192 tmp0 = q4; \ |
| 191 tmp1 = q5; \ | 193 tmp1 = q5; \ |
| 192 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ | 194 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ |
| 193 tmp3 = q7; \ | 195 tmp3 = q7; \ |
| 194 tmp10 = q2; \ | 196 tmp10 = q2; \ |
| 195 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ | 197 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ |
| 196 tmp12 = q3; \ | 198 tmp12 = q3; \ |
| 197 tmp13 = q1; \ | 199 tmp13 = q1; \ |
| 198 } | 200 } |
| 199 | 201 |
| 200 #define XFIX_0_899976223 v0.4h[0] | 202 #define XFIX_0_899976223 v0.h[0] |
| 201 #define XFIX_0_541196100 v0.4h[1] | 203 #define XFIX_0_541196100 v0.h[1] |
| 202 #define XFIX_2_562915447 v0.4h[2] | 204 #define XFIX_2_562915447 v0.h[2] |
| 203 #define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] | 205 #define XFIX_0_298631336_MINUS_0_899976223 v0.h[3] |
| 204 #define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] | 206 #define XFIX_1_501321110_MINUS_0_899976223 v1.h[0] |
| 205 #define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] | 207 #define XFIX_2_053119869_MINUS_2_562915447 v1.h[1] |
| 206 #define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] | 208 #define XFIX_0_541196100_PLUS_0_765366865 v1.h[2] |
| 207 #define XFIX_1_175875602 v1.4h[3] | 209 #define XFIX_1_175875602 v1.h[3] |
| 208 #define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] | 210 #define XFIX_1_175875602_MINUS_0_390180644 v2.h[0] |
| 209 #define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] | 211 #define XFIX_0_541196100_MINUS_1_847759065 v2.h[1] |
| 210 #define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] | 212 #define XFIX_3_072711026_MINUS_2_562915447 v2.h[2] |
| 211 #define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] | 213 #define XFIX_1_175875602_MINUS_1_961570560 v2.h[3] |
| 212 | 214 |
| 213 .balign 16 | 215 .balign 16 |
| 214 jsimd_idct_islow_neon_consts: | 216 jsimd_idct_islow_neon_consts: |
| 215 .short FIX_0_899976223 /* d0[0] */ | 217 .short FIX_0_899976223 /* d0[0] */ |
| 216 .short FIX_0_541196100 /* d0[1] */ | 218 .short FIX_0_541196100 /* d0[1] */ |
| 217 .short FIX_2_562915447 /* d0[2] */ | 219 .short FIX_2_562915447 /* d0[2] */ |
| 218 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ | 220 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ |
| 219 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ | 221 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ |
| 220 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ | 222 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ |
| 221 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ | 223 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 263 st1 {v12.8b - v15.8b}, [sp], 32 | 265 st1 {v12.8b - v15.8b}, [sp], 32 |
| 264 st1 {v16.8b - v19.8b}, [sp], 32 | 266 st1 {v16.8b - v19.8b}, [sp], 32 |
| 265 st1 {v20.8b - v23.8b}, [sp], 32 | 267 st1 {v20.8b - v23.8b}, [sp], 32 |
| 266 st1 {v24.8b - v27.8b}, [sp], 32 | 268 st1 {v24.8b - v27.8b}, [sp], 32 |
| 267 st1 {v28.8b - v31.8b}, [sp], 32 | 269 st1 {v28.8b - v31.8b}, [sp], 32 |
| 268 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 | 270 ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 |
| 269 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | 271 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 |
| 270 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 | 272 ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 |
| 271 mul v16.4h, v16.4h, v0.4h | 273 mul v16.4h, v16.4h, v0.4h |
| 272 mul v17.4h, v17.4h, v1.4h | 274 mul v17.4h, v17.4h, v1.4h |
| 273 ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ | 275 ins v16.d[1], v17.d[0] /* 128 bit q8 */ |
| 274 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | 276 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 |
| 275 mul v18.4h, v18.4h, v2.4h | 277 mul v18.4h, v18.4h, v2.4h |
| 276 mul v19.4h, v19.4h, v3.4h | 278 mul v19.4h, v19.4h, v3.4h |
| 277 ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ | 279 ins v18.d[1], v19.d[0] /* 128 bit q9 */ |
| 278 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 | 280 ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 |
| 279 mul v20.4h, v20.4h, v4.4h | 281 mul v20.4h, v20.4h, v4.4h |
| 280 mul v21.4h, v21.4h, v5.4h | 282 mul v21.4h, v21.4h, v5.4h |
| 281 ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ | 283 ins v20.d[1], v21.d[0] /* 128 bit q10 */ |
| 282 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 | 284 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 |
| 283 mul v22.4h, v22.4h, v6.4h | 285 mul v22.4h, v22.4h, v6.4h |
| 284 mul v23.4h, v23.4h, v7.4h | 286 mul v23.4h, v23.4h, v7.4h |
| 285 ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ | 287 ins v22.d[1], v23.d[0] /* 128 bit q11 */ |
| 286 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] | 288 ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] |
| 287 mul v24.4h, v24.4h, v0.4h | 289 mul v24.4h, v24.4h, v0.4h |
| 288 mul v25.4h, v25.4h, v1.4h | 290 mul v25.4h, v25.4h, v1.4h |
| 289 ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ | 291 ins v24.d[1], v25.d[0] /* 128 bit q12 */ |
| 290 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 | 292 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 |
| 291 mul v28.4h, v28.4h, v4.4h | 293 mul v28.4h, v28.4h, v4.4h |
| 292 mul v29.4h, v29.4h, v5.4h | 294 mul v29.4h, v29.4h, v5.4h |
| 293 ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ | 295 ins v28.d[1], v29.d[0] /* 128 bit q14 */ |
| 294 mul v26.4h, v26.4h, v2.4h | 296 mul v26.4h, v26.4h, v2.4h |
| 295 mul v27.4h, v27.4h, v3.4h | 297 mul v27.4h, v27.4h, v3.4h |
| 296 ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ | 298 ins v26.d[1], v27.d[0] /* 128 bit q13 */ |
| 297 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ | 299 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ |
| 298 add x15, x15, #16 | 300 add x15, x15, #16 |
| 299 mul v30.4h, v30.4h, v6.4h | 301 mul v30.4h, v30.4h, v6.4h |
| 300 mul v31.4h, v31.4h, v7.4h | 302 mul v31.4h, v31.4h, v7.4h |
| 301 ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ | 303 ins v30.d[1], v31.d[0] /* 128 bit q15 */ |
| 302 /* Go to the bottom of the stack */ | 304 /* Go to the bottom of the stack */ |
| 303 sub sp, sp, 352 | 305 sub sp, sp, 352 |
| 304 stp x4, x5, [sp], 16 | 306 stp x4, x5, [sp], 16 |
| 305 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ | 307 st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ |
| 306 st1 {v12.4h - v15.4h}, [sp], 32 | 308 st1 {v12.4h - v15.4h}, [sp], 32 |
| 307 /* 1-D IDCT, pass 1, left 4x8 half */ | 309 /* 1-D IDCT, pass 1, left 4x8 half */ |
| 308 add v4.4h, ROW7L.4h, ROW3L.4h | 310 add v4.4h, ROW7L.4h, ROW3L.4h |
| 309 add v5.4h, ROW5L.4h, ROW1L.4h | 311 add v5.4h, ROW5L.4h, ROW1L.4h |
| 310 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 | 312 smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 311 smlal v12.4s, v5.4h, XFIX_1_175875602 | 313 smlal v12.4s, v5.4h, XFIX_1_175875602 |
| (...skipping 234 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 546 add v12.4s, v4.4s, v14.4s | 548 add v12.4s, v4.4s, v14.4s |
| 547 sub v4.4s, v4.4s, v14.4s | 549 sub v4.4s, v4.4s, v14.4s |
| 548 add v10.4s, v2.4s, v8.4s | 550 add v10.4s, v2.4s, v8.4s |
| 549 sub v6.4s, v2.4s, v8.4s | 551 sub v6.4s, v2.4s, v8.4s |
| 550 shrn ROW7R.4h, v4.4s, #16 | 552 shrn ROW7R.4h, v4.4s, #16 |
| 551 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ | 553 shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
| 552 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ | 554 shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
| 553 shrn ROW4R.4h, v6.4s, #16 | 555 shrn ROW4R.4h, v6.4s, #16 |
| 554 | 556 |
| 555 2: /* Descale to 8-bit and range limit */ | 557 2: /* Descale to 8-bit and range limit */ |
| 556 ins v16.2d[1], v17.2d[0] | 558 ins v16.d[1], v17.d[0] |
| 557 ins v18.2d[1], v19.2d[0] | 559 ins v18.d[1], v19.d[0] |
| 558 ins v20.2d[1], v21.2d[0] | 560 ins v20.d[1], v21.d[0] |
| 559 ins v22.2d[1], v23.2d[0] | 561 ins v22.d[1], v23.d[0] |
| 560 sqrshrn v16.8b, v16.8h, #2 | 562 sqrshrn v16.8b, v16.8h, #2 |
| 561 sqrshrn2 v16.16b, v18.8h, #2 | 563 sqrshrn2 v16.16b, v18.8h, #2 |
| 562 sqrshrn v18.8b, v20.8h, #2 | 564 sqrshrn v18.8b, v20.8h, #2 |
| 563 sqrshrn2 v18.16b, v22.8h, #2 | 565 sqrshrn2 v18.16b, v22.8h, #2 |
| 564 | 566 |
| 565 /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */ | 567 /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */ |
| 566 ld1 {v8.4h - v11.4h}, [sp], 32 | 568 ld1 {v8.4h - v11.4h}, [sp], 32 |
| 567 ld1 {v12.4h - v15.4h}, [sp], 32 | 569 ld1 {v12.4h - v15.4h}, [sp], 32 |
| 568 ins v24.2d[1], v25.2d[0] | 570 ins v24.d[1], v25.d[0] |
| 569 | 571 |
| 570 sqrshrn v20.8b, v24.8h, #2 | 572 sqrshrn v20.8b, v24.8h, #2 |
| 571 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ | 573 /* Transpose the final 8-bit samples and do signed->unsigned conversion */ |
| 572 /* trn1 v16.8h, v16.8h, v18.8h */ | 574 /* trn1 v16.8h, v16.8h, v18.8h */ |
| 573 transpose v16, v18, v3, .16b, .8h | 575 transpose v16, v18, v3, .16b, .8h |
| 574 ins v26.2d[1], v27.2d[0] | 576 ins v26.d[1], v27.d[0] |
| 575 ins v28.2d[1], v29.2d[0] | 577 ins v28.d[1], v29.d[0] |
| 576 ins v30.2d[1], v31.2d[0] | 578 ins v30.d[1], v31.d[0] |
| 577 sqrshrn2 v20.16b, v26.8h, #2 | 579 sqrshrn2 v20.16b, v26.8h, #2 |
| 578 sqrshrn v22.8b, v28.8h, #2 | 580 sqrshrn v22.8b, v28.8h, #2 |
| 579 movi v0.16b, #(CENTERJSAMPLE) | 581 movi v0.16b, #(CENTERJSAMPLE) |
| 580 sqrshrn2 v22.16b, v30.8h, #2 | 582 sqrshrn2 v22.16b, v30.8h, #2 |
| 581 transpose_single v16, v17, v3, .2d, .8b | 583 transpose_single v16, v17, v3, .d, .8b |
| 582 transpose_single v18, v19, v3, .2d, .8b | 584 transpose_single v18, v19, v3, .d, .8b |
| 583 add v16.8b, v16.8b, v0.8b | 585 add v16.8b, v16.8b, v0.8b |
| 584 add v17.8b, v17.8b, v0.8b | 586 add v17.8b, v17.8b, v0.8b |
| 585 add v18.8b, v18.8b, v0.8b | 587 add v18.8b, v18.8b, v0.8b |
| 586 add v19.8b, v19.8b, v0.8b | 588 add v19.8b, v19.8b, v0.8b |
| 587 transpose v20, v22, v3, .16b, .8h | 589 transpose v20, v22, v3, .16b, .8h |
| 588 /* Store results to the output buffer */ | 590 /* Store results to the output buffer */ |
| 589 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 591 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 590 add TMP1, TMP1, OUTPUT_COL | 592 add TMP1, TMP1, OUTPUT_COL |
| 591 add TMP2, TMP2, OUTPUT_COL | 593 add TMP2, TMP2, OUTPUT_COL |
| 592 st1 {v16.8b}, [TMP1] | 594 st1 {v16.8b}, [TMP1] |
| 593 transpose_single v20, v21, v3, .2d, .8b | 595 transpose_single v20, v21, v3, .d, .8b |
| 594 st1 {v17.8b}, [TMP2] | 596 st1 {v17.8b}, [TMP2] |
| 595 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 597 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 596 add TMP1, TMP1, OUTPUT_COL | 598 add TMP1, TMP1, OUTPUT_COL |
| 597 add TMP2, TMP2, OUTPUT_COL | 599 add TMP2, TMP2, OUTPUT_COL |
| 598 st1 {v18.8b}, [TMP1] | 600 st1 {v18.8b}, [TMP1] |
| 599 add v20.8b, v20.8b, v0.8b | 601 add v20.8b, v20.8b, v0.8b |
| 600 add v21.8b, v21.8b, v0.8b | 602 add v21.8b, v21.8b, v0.8b |
| 601 st1 {v19.8b}, [TMP2] | 603 st1 {v19.8b}, [TMP2] |
| 602 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 604 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 603 ldp TMP3, TMP4, [OUTPUT_BUF] | 605 ldp TMP3, TMP4, [OUTPUT_BUF] |
| 604 add TMP1, TMP1, OUTPUT_COL | 606 add TMP1, TMP1, OUTPUT_COL |
| 605 add TMP2, TMP2, OUTPUT_COL | 607 add TMP2, TMP2, OUTPUT_COL |
| 606 add TMP3, TMP3, OUTPUT_COL | 608 add TMP3, TMP3, OUTPUT_COL |
| 607 add TMP4, TMP4, OUTPUT_COL | 609 add TMP4, TMP4, OUTPUT_COL |
| 608 transpose_single v22, v23, v3, .2d, .8b | 610 transpose_single v22, v23, v3, .d, .8b |
| 609 st1 {v20.8b}, [TMP1] | 611 st1 {v20.8b}, [TMP1] |
| 610 add v22.8b, v22.8b, v0.8b | 612 add v22.8b, v22.8b, v0.8b |
| 611 add v23.8b, v23.8b, v0.8b | 613 add v23.8b, v23.8b, v0.8b |
| 612 st1 {v21.8b}, [TMP2] | 614 st1 {v21.8b}, [TMP2] |
| 613 st1 {v22.8b}, [TMP3] | 615 st1 {v22.8b}, [TMP3] |
| 614 st1 {v23.8b}, [TMP4] | 616 st1 {v23.8b}, [TMP4] |
| 615 ldr x15, [sp], 16 | 617 ldr x15, [sp], 16 |
| 616 ld1 {v0.8b - v3.8b}, [sp], 32 | 618 ld1 {v0.8b - v3.8b}, [sp], 32 |
| 617 ld1 {v4.8b - v7.8b}, [sp], 32 | 619 ld1 {v4.8b - v7.8b}, [sp], 32 |
| 618 ld1 {v8.8b - v11.8b}, [sp], 32 | 620 ld1 {v8.8b - v11.8b}, [sp], 32 |
| (...skipping 13 matching lines...) Expand all Loading... | |
| 632 transpose ROW4L, ROW5L, v3, .16b, .4h | 634 transpose ROW4L, ROW5L, v3, .16b, .4h |
| 633 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ | 635 shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ |
| 634 transpose ROW1L, ROW3L, v3, .16b, .2s | 636 transpose ROW1L, ROW3L, v3, .16b, .2s |
| 635 transpose ROW4L, ROW6L, v3, .16b, .2s | 637 transpose ROW4L, ROW6L, v3, .16b, .2s |
| 636 transpose ROW0L, ROW2L, v3, .16b, .2s | 638 transpose ROW0L, ROW2L, v3, .16b, .2s |
| 637 transpose ROW5L, ROW7L, v3, .16b, .2s | 639 transpose ROW5L, ROW7L, v3, .16b, .2s |
| 638 cmp x0, #0 | 640 cmp x0, #0 |
| 639 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ | 641 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ |
| 640 | 642 |
| 641 /* Only row 0 is non-zero for the right 4x8 half */ | 643 /* Only row 0 is non-zero for the right 4x8 half */ |
| 642 dup ROW1R.4h, ROW0R.4h[1] | 644 dup ROW1R.4h, ROW0R.h[1] |
| 643 dup ROW2R.4h, ROW0R.4h[2] | 645 dup ROW2R.4h, ROW0R.h[2] |
| 644 dup ROW3R.4h, ROW0R.4h[3] | 646 dup ROW3R.4h, ROW0R.h[3] |
| 645 dup ROW4R.4h, ROW0R.4h[0] | 647 dup ROW4R.4h, ROW0R.h[0] |
| 646 dup ROW5R.4h, ROW0R.4h[1] | 648 dup ROW5R.4h, ROW0R.h[1] |
| 647 dup ROW6R.4h, ROW0R.4h[2] | 649 dup ROW6R.4h, ROW0R.h[2] |
| 648 dup ROW7R.4h, ROW0R.4h[3] | 650 dup ROW7R.4h, ROW0R.h[3] |
| 649 dup ROW0R.4h, ROW0R.4h[0] | 651 dup ROW0R.4h, ROW0R.h[0] |
| 650 b 1b /* Go to 'normal' second pass */ | 652 b 1b /* Go to 'normal' second pass */ |
| 651 | 653 |
| 652 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ | 654 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ |
| 653 ld1 {v2.4h}, [x15] /* reload constants */ | 655 ld1 {v2.4h}, [x15] /* reload constants */ |
| 654 smull v12.4s, ROW1L.4h, XFIX_1_175875602 | 656 smull v12.4s, ROW1L.4h, XFIX_1_175875602 |
| 655 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 | 657 smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 656 smull v14.4s, ROW3L.4h, XFIX_1_175875602 | 658 smull v14.4s, ROW3L.4h, XFIX_1_175875602 |
| 657 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 | 659 smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 658 smull v4.4s, ROW2L.4h, XFIX_0_541196100 | 660 smull v4.4s, ROW2L.4h, XFIX_0_541196100 |
| 659 sshll v6.4s, ROW0L.4h, #13 | 661 sshll v6.4s, ROW0L.4h, #13 |
| (...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 763 * function from jidctfst.c | 765 * function from jidctfst.c |
| 764 * | 766 * |
| 765 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. | 767 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. |
| 766 * But in ARM NEON case some extra additions are required because VQDMULH | 768 * But in ARM NEON case some extra additions are required because VQDMULH |
| 767 * instruction can't handle the constants larger than 1. So the expressions | 769 * instruction can't handle the constants larger than 1. So the expressions |
| 768 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", | 770 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", |
| 769 * which introduces an extra addition. Overall, there are 6 extra additions | 771 * which introduces an extra addition. Overall, there are 6 extra additions |
| 770 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. | 772 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. |
| 771 */ | 773 */ |
| 772 | 774 |
| 773 #define XFIX_1_082392200 v0.4h[0] | 775 #define XFIX_1_082392200 v0.h[0] |
| 774 #define XFIX_1_414213562 v0.4h[1] | 776 #define XFIX_1_414213562 v0.h[1] |
| 775 #define XFIX_1_847759065 v0.4h[2] | 777 #define XFIX_1_847759065 v0.h[2] |
| 776 #define XFIX_2_613125930 v0.4h[3] | 778 #define XFIX_2_613125930 v0.h[3] |
| 777 | 779 |
| 778 .balign 16 | 780 .balign 16 |
| 779 jsimd_idct_ifast_neon_consts: | 781 jsimd_idct_ifast_neon_consts: |
| 780 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ | 782 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
| 781 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ | 783 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
| 782 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ | 784 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
| 783 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ | 785 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
| 784 | 786 |
| 785 asm_function jsimd_idct_ifast_neon | 787 asm_function jsimd_idct_ifast_neon |
| 786 | 788 |
| (...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 902 /* Transpose q8-q10 */ | 904 /* Transpose q8-q10 */ |
| 903 mov v18.16b, v8.16b | 905 mov v18.16b, v8.16b |
| 904 trn1 v8.4s, v8.4s, v10.4s | 906 trn1 v8.4s, v8.4s, v10.4s |
| 905 trn2 v10.4s, v18.4s, v10.4s | 907 trn2 v10.4s, v18.4s, v10.4s |
| 906 /* Transpose q13-q15 */ | 908 /* Transpose q13-q15 */ |
| 907 mov v18.16b, v13.16b | 909 mov v18.16b, v13.16b |
| 908 trn1 v13.4s, v13.4s, v15.4s | 910 trn1 v13.4s, v13.4s, v15.4s |
| 909 trn2 v15.4s, v18.4s, v15.4s | 911 trn2 v15.4s, v18.4s, v15.4s |
| 910 /* vswp v14.4h, v10-MSB.4h */ | 912 /* vswp v14.4h, v10-MSB.4h */ |
| 911 umov x22, v14.d[0] | 913 umov x22, v14.d[0] |
| 912 ins v14.2d[0], v10.2d[1] | 914 ins v14.d[0], v10.d[1] |
| 913 ins v10.2d[1], x22 | 915 ins v10.d[1], x22 |
| 914 /* vswp v13.4h, v9MSB.4h */ | 916 /* vswp v13.4h, v9MSB.4h */ |
| 915 | 917 |
| 916 umov x22, v13.d[0] | 918 umov x22, v13.d[0] |
| 917 ins v13.2d[0], v9.2d[1] | 919 ins v13.d[0], v9.d[1] |
| 918 ins v9.2d[1], x22 | 920 ins v9.d[1], x22 |
| 919 /* 1-D IDCT, pass 2 */ | 921 /* 1-D IDCT, pass 2 */ |
| 920 sub v2.8h, v10.8h, v14.8h | 922 sub v2.8h, v10.8h, v14.8h |
| 921 /* vswp v15.4h, v11MSB.4h */ | 923 /* vswp v15.4h, v11MSB.4h */ |
| 922 umov x22, v15.d[0] | 924 umov x22, v15.d[0] |
| 923 ins v15.2d[0], v11.2d[1] | 925 ins v15.d[0], v11.d[1] |
| 924 ins v11.2d[1], x22 | 926 ins v11.d[1], x22 |
| 925 add v14.8h, v10.8h, v14.8h | 927 add v14.8h, v10.8h, v14.8h |
| 926 /* vswp v12.4h, v8-MSB.4h */ | 928 /* vswp v12.4h, v8-MSB.4h */ |
| 927 umov x22, v12.d[0] | 929 umov x22, v12.d[0] |
| 928 ins v12.2d[0], v8.2d[1] | 930 ins v12.d[0], v8.d[1] |
| 929 ins v8.2d[1], x22 | 931 ins v8.d[1], x22 |
| 930 sub v1.8h, v11.8h, v13.8h | 932 sub v1.8h, v11.8h, v13.8h |
| 931 add v13.8h, v11.8h, v13.8h | 933 add v13.8h, v11.8h, v13.8h |
| 932 sub v5.8h, v9.8h, v15.8h | 934 sub v5.8h, v9.8h, v15.8h |
| 933 add v15.8h, v9.8h, v15.8h | 935 add v15.8h, v9.8h, v15.8h |
| 934 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 | 936 sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
| 935 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 | 937 sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
| 936 add v3.8h, v1.8h, v1.8h | 938 add v3.8h, v1.8h, v1.8h |
| 937 sub v1.8h, v5.8h, v1.8h | 939 sub v1.8h, v5.8h, v1.8h |
| 938 add v10.8h, v2.8h, v4.8h | 940 add v10.8h, v2.8h, v4.8h |
| 939 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 | 941 sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 990 trn2 v11.8h, v18.8h, v11.8h | 992 trn2 v11.8h, v18.8h, v11.8h |
| 991 /* Transpose q8-q10 */ | 993 /* Transpose q8-q10 */ |
| 992 mov v18.16b, v8.16b | 994 mov v18.16b, v8.16b |
| 993 trn1 v8.4s, v8.4s, v10.4s | 995 trn1 v8.4s, v8.4s, v10.4s |
| 994 trn2 v10.4s, v18.4s, v10.4s | 996 trn2 v10.4s, v18.4s, v10.4s |
| 995 /* Transpose q9-q11 */ | 997 /* Transpose q9-q11 */ |
| 996 mov v18.16b, v9.16b | 998 mov v18.16b, v9.16b |
| 997 trn1 v9.4s, v9.4s, v11.4s | 999 trn1 v9.4s, v9.4s, v11.4s |
| 998 trn2 v11.4s, v18.4s, v11.4s | 1000 trn2 v11.4s, v18.4s, v11.4s |
| 999 /* make copy */ | 1001 /* make copy */ |
| 1000 ins v17.2d[0], v8.2d[1] | 1002 ins v17.d[0], v8.d[1] |
| 1001 /* Transpose d16-d17-msb */ | 1003 /* Transpose d16-d17-msb */ |
| 1002 mov v18.16b, v8.16b | 1004 mov v18.16b, v8.16b |
| 1003 trn1 v8.8b, v8.8b, v17.8b | 1005 trn1 v8.8b, v8.8b, v17.8b |
| 1004 trn2 v17.8b, v18.8b, v17.8b | 1006 trn2 v17.8b, v18.8b, v17.8b |
| 1005 /* make copy */ | 1007 /* make copy */ |
| 1006 ins v19.2d[0], v9.2d[1] | 1008 ins v19.d[0], v9.d[1] |
| 1007 mov v18.16b, v9.16b | 1009 mov v18.16b, v9.16b |
| 1008 trn1 v9.8b, v9.8b, v19.8b | 1010 trn1 v9.8b, v9.8b, v19.8b |
| 1009 trn2 v19.8b, v18.8b, v19.8b | 1011 trn2 v19.8b, v18.8b, v19.8b |
| 1010 /* Store results to the output buffer */ | 1012 /* Store results to the output buffer */ |
| 1011 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1013 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 1012 add TMP1, TMP1, OUTPUT_COL | 1014 add TMP1, TMP1, OUTPUT_COL |
| 1013 add TMP2, TMP2, OUTPUT_COL | 1015 add TMP2, TMP2, OUTPUT_COL |
| 1014 st1 {v8.8b}, [TMP1] | 1016 st1 {v8.8b}, [TMP1] |
| 1015 st1 {v17.8b}, [TMP2] | 1017 st1 {v17.8b}, [TMP2] |
| 1016 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1018 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 1017 add TMP1, TMP1, OUTPUT_COL | 1019 add TMP1, TMP1, OUTPUT_COL |
| 1018 add TMP2, TMP2, OUTPUT_COL | 1020 add TMP2, TMP2, OUTPUT_COL |
| 1019 st1 {v9.8b}, [TMP1] | 1021 st1 {v9.8b}, [TMP1] |
| 1020 /* make copy */ | 1022 /* make copy */ |
| 1021 ins v7.2d[0], v10.2d[1] | 1023 ins v7.d[0], v10.d[1] |
| 1022 mov v18.16b, v10.16b | 1024 mov v18.16b, v10.16b |
| 1023 trn1 v10.8b, v10.8b, v7.8b | 1025 trn1 v10.8b, v10.8b, v7.8b |
| 1024 trn2 v7.8b, v18.8b, v7.8b | 1026 trn2 v7.8b, v18.8b, v7.8b |
| 1025 st1 {v19.8b}, [TMP2] | 1027 st1 {v19.8b}, [TMP2] |
| 1026 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1028 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 1027 ldp TMP4, TMP5, [OUTPUT_BUF], 16 | 1029 ldp TMP4, TMP5, [OUTPUT_BUF], 16 |
| 1028 add TMP1, TMP1, OUTPUT_COL | 1030 add TMP1, TMP1, OUTPUT_COL |
| 1029 add TMP2, TMP2, OUTPUT_COL | 1031 add TMP2, TMP2, OUTPUT_COL |
| 1030 add TMP4, TMP4, OUTPUT_COL | 1032 add TMP4, TMP4, OUTPUT_COL |
| 1031 add TMP5, TMP5, OUTPUT_COL | 1033 add TMP5, TMP5, OUTPUT_COL |
| 1032 st1 {v10.8b}, [TMP1] | 1034 st1 {v10.8b}, [TMP1] |
| 1033 /* make copy */ | 1035 /* make copy */ |
| 1034 ins v16.2d[0], v11.2d[1] | 1036 ins v16.d[0], v11.d[1] |
| 1035 mov v18.16b, v11.16b | 1037 mov v18.16b, v11.16b |
| 1036 trn1 v11.8b, v11.8b, v16.8b | 1038 trn1 v11.8b, v11.8b, v16.8b |
| 1037 trn2 v16.8b, v18.8b, v16.8b | 1039 trn2 v16.8b, v18.8b, v16.8b |
| 1038 st1 {v7.8b}, [TMP2] | 1040 st1 {v7.8b}, [TMP2] |
| 1039 st1 {v11.8b}, [TMP4] | 1041 st1 {v11.8b}, [TMP4] |
| 1040 st1 {v16.8b}, [TMP5] | 1042 st1 {v16.8b}, [TMP5] |
| 1041 sub sp, sp, #176 | 1043 sub sp, sp, #176 |
| 1042 ldp x22, x23, [sp], 16 | 1044 ldp x22, x23, [sp], 16 |
| 1043 ld1 {v0.8b - v3.8b}, [sp], 32 | 1045 ld1 {v0.8b - v3.8b}, [sp], 32 |
| 1044 ld1 {v4.8b - v7.8b}, [sp], 32 | 1046 ld1 {v4.8b - v7.8b}, [sp], 32 |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1089 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ | 1091 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */ |
| 1090 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ | 1092 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */ |
| 1091 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ | 1093 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */ |
| 1092 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ | 1094 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */ |
| 1093 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ | 1095 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */ |
| 1094 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ | 1096 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */ |
| 1095 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ | 1097 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */ |
| 1096 | 1098 |
| 1097 .balign 16 | 1099 .balign 16 |
| 1098 jsimd_idct_4x4_neon_consts: | 1100 jsimd_idct_4x4_neon_consts: |
| 1099 .short FIX_1_847759065 /* v0.4h[0] */ | 1101 .short FIX_1_847759065 /* v0.h[0] */ |
| 1100 .short -FIX_0_765366865 /* v0.4h[1] */ | 1102 .short -FIX_0_765366865 /* v0.h[1] */ |
| 1101 .short -FIX_0_211164243 /* v0.4h[2] */ | 1103 .short -FIX_0_211164243 /* v0.h[2] */ |
| 1102 .short FIX_1_451774981 /* v0.4h[3] */ | 1104 .short FIX_1_451774981 /* v0.h[3] */ |
| 1103 .short -FIX_2_172734803 /* d1[0] */ | 1105 .short -FIX_2_172734803 /* d1[0] */ |
| 1104 .short FIX_1_061594337 /* d1[1] */ | 1106 .short FIX_1_061594337 /* d1[1] */ |
| 1105 .short -FIX_0_509795579 /* d1[2] */ | 1107 .short -FIX_0_509795579 /* d1[2] */ |
| 1106 .short -FIX_0_601344887 /* d1[3] */ | 1108 .short -FIX_0_601344887 /* d1[3] */ |
| 1107 .short FIX_0_899976223 /* v2.4h[0] */ | 1109 .short FIX_0_899976223 /* v2.h[0] */ |
| 1108 .short FIX_2_562915447 /* v2.4h[1] */ | 1110 .short FIX_2_562915447 /* v2.h[1] */ |
| 1109 .short 1 << (CONST_BITS+1) /* v2.4h[2] */ | 1111 .short 1 << (CONST_BITS+1) /* v2.h[2] */ |
| 1110 .short 0 /* v2.4h[3] */ | 1112 .short 0 /* v2.h[3] */ |
| 1111 | 1113 |
| 1112 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 | 1114 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 |
| 1113 smull v28.4s, \x4, v2.4h[2] | 1115 smull v28.4s, \x4, v2.h[2] |
| 1114 smlal v28.4s, \x8, v0.4h[0] | 1116 smlal v28.4s, \x8, v0.h[0] |
| 1115 smlal v28.4s, \x14, v0.4h[1] | 1117 smlal v28.4s, \x14, v0.h[1] |
| 1116 | 1118 |
| 1117 smull v26.4s, \x16, v1.4h[2] | 1119 smull v26.4s, \x16, v1.h[2] |
| 1118 smlal v26.4s, \x12, v1.4h[3] | 1120 smlal v26.4s, \x12, v1.h[3] |
| 1119 smlal v26.4s, \x10, v2.4h[0] | 1121 smlal v26.4s, \x10, v2.h[0] |
| 1120 smlal v26.4s, \x6, v2.4h[1] | 1122 smlal v26.4s, \x6, v2.h[1] |
| 1121 | 1123 |
| 1122 smull v30.4s, \x4, v2.4h[2] | 1124 smull v30.4s, \x4, v2.h[2] |
| 1123 smlsl v30.4s, \x8, v0.4h[0] | 1125 smlsl v30.4s, \x8, v0.h[0] |
| 1124 smlsl v30.4s, \x14, v0.4h[1] | 1126 smlsl v30.4s, \x14, v0.h[1] |
| 1125 | 1127 |
| 1126 smull v24.4s, \x16, v0.4h[2] | 1128 smull v24.4s, \x16, v0.h[2] |
| 1127 smlal v24.4s, \x12, v0.4h[3] | 1129 smlal v24.4s, \x12, v0.h[3] |
| 1128 smlal v24.4s, \x10, v1.4h[0] | 1130 smlal v24.4s, \x10, v1.h[0] |
| 1129 smlal v24.4s, \x6, v1.4h[1] | 1131 smlal v24.4s, \x6, v1.h[1] |
| 1130 | 1132 |
| 1131 add v20.4s, v28.4s, v26.4s | 1133 add v20.4s, v28.4s, v26.4s |
| 1132 sub v28.4s, v28.4s, v26.4s | 1134 sub v28.4s, v28.4s, v26.4s |
| 1133 | 1135 |
| 1134 .if \shift > 16 | 1136 .if \shift > 16 |
| 1135 srshr v20.4s, v20.4s, #\shift | 1137 srshr v20.4s, v20.4s, #\shift |
| 1136 srshr v28.4s, v28.4s, #\shift | 1138 srshr v28.4s, v28.4s, #\shift |
| 1137 xtn \y26, v20.4s | 1139 xtn \y26, v20.4s |
| 1138 xtn \y29, v28.4s | 1140 xtn \y29, v28.4s |
| 1139 .else | 1141 .else |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1196 */ | 1198 */ |
| 1197 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 | 1199 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
| 1198 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 | 1200 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 |
| 1199 add COEF_BLOCK, COEF_BLOCK, #16 | 1201 add COEF_BLOCK, COEF_BLOCK, #16 |
| 1200 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 | 1202 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 |
| 1201 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | 1203 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
| 1202 /* dequantize */ | 1204 /* dequantize */ |
| 1203 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | 1205 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
| 1204 mul v4.4h, v4.4h, v18.4h | 1206 mul v4.4h, v4.4h, v18.4h |
| 1205 mul v5.4h, v5.4h, v19.4h | 1207 mul v5.4h, v5.4h, v19.4h |
| 1206 ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ | 1208 ins v4.d[1], v5.d[0] /* 128 bit q4 */ |
| 1207 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 | 1209 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 |
| 1208 mul v6.4h, v6.4h, v20.4h | 1210 mul v6.4h, v6.4h, v20.4h |
| 1209 mul v7.4h, v7.4h, v21.4h | 1211 mul v7.4h, v7.4h, v21.4h |
| 1210 ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ | 1212 ins v6.d[1], v7.d[0] /* 128 bit q6 */ |
| 1211 mul v8.4h, v8.4h, v22.4h | 1213 mul v8.4h, v8.4h, v22.4h |
| 1212 mul v9.4h, v9.4h, v23.4h | 1214 mul v9.4h, v9.4h, v23.4h |
| 1213 ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ | 1215 ins v8.d[1], v9.d[0] /* 128 bit q8 */ |
| 1214 add DCT_TABLE, DCT_TABLE, #16 | 1216 add DCT_TABLE, DCT_TABLE, #16 |
| 1215 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 | 1217 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 |
| 1216 mul v10.4h, v10.4h, v24.4h | 1218 mul v10.4h, v10.4h, v24.4h |
| 1217 mul v11.4h, v11.4h, v25.4h | 1219 mul v11.4h, v11.4h, v25.4h |
| 1218 ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ | 1220 ins v10.d[1], v11.d[0] /* 128 bit q10 */ |
| 1219 mul v12.4h, v12.4h, v26.4h | 1221 mul v12.4h, v12.4h, v26.4h |
| 1220 mul v13.4h, v13.4h, v27.4h | 1222 mul v13.4h, v13.4h, v27.4h |
| 1221 ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ | 1223 ins v12.d[1], v13.d[0] /* 128 bit q12 */ |
| 1222 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | 1224 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
| 1223 mul v14.4h, v14.4h, v28.4h | 1225 mul v14.4h, v14.4h, v28.4h |
| 1224 mul v15.4h, v15.4h, v29.4h | 1226 mul v15.4h, v15.4h, v29.4h |
| 1225 ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ | 1227 ins v14.d[1], v15.d[0] /* 128 bit q14 */ |
| 1226 mul v16.4h, v16.4h, v30.4h | 1228 mul v16.4h, v16.4h, v30.4h |
| 1227 mul v17.4h, v17.4h, v31.4h | 1229 mul v17.4h, v17.4h, v31.4h |
| 1228 ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ | 1230 ins v16.d[1], v17.d[0] /* 128 bit q16 */ |
| 1229 | 1231 |
| 1230 /* Pass 1 */ | 1232 /* Pass 1 */ |
| 1231 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h | 1233 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h |
| 1232 transpose_4x4 v4, v6, v8, v10, v3 | 1234 transpose_4x4 v4, v6, v8, v10, v3 |
| 1233 ins v10.2d[1], v11.2d[0] | 1235 ins v10.d[1], v11.d[0] |
| 1234 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h | 1236 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h |
| 1235 transpose_4x4 v5, v7, v9, v11, v3 | 1237 transpose_4x4 v5, v7, v9, v11, v3 |
| 1236 ins v10.2d[1], v11.2d[0] | 1238 ins v10.d[1], v11.d[0] |
| 1237 /* Pass 2 */ | 1239 /* Pass 2 */ |
| 1238 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h | 1240 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h |
| 1239 transpose_4x4 v26, v27, v28, v29, v3 | 1241 transpose_4x4 v26, v27, v28, v29, v3 |
| 1240 | 1242 |
| 1241 /* Range limit */ | 1243 /* Range limit */ |
| 1242 movi v30.8h, #0x80 | 1244 movi v30.8h, #0x80 |
| 1243 ins v26.2d[1], v27.2d[0] | 1245 ins v26.d[1], v27.d[0] |
| 1244 ins v28.2d[1], v29.2d[0] | 1246 ins v28.d[1], v29.d[0] |
| 1245 add v26.8h, v26.8h, v30.8h | 1247 add v26.8h, v26.8h, v30.8h |
| 1246 add v28.8h, v28.8h, v30.8h | 1248 add v28.8h, v28.8h, v30.8h |
| 1247 sqxtun v26.8b, v26.8h | 1249 sqxtun v26.8b, v26.8h |
| 1248 sqxtun v27.8b, v28.8h | 1250 sqxtun v27.8b, v28.8h |
| 1249 | 1251 |
| 1250 /* Store results to the output buffer */ | 1252 /* Store results to the output buffer */ |
| 1251 ldp TMP1, TMP2, [OUTPUT_BUF], 16 | 1253 ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 1252 ldp TMP3, TMP4, [OUTPUT_BUF] | 1254 ldp TMP3, TMP4, [OUTPUT_BUF] |
| 1253 add TMP1, TMP1, OUTPUT_COL | 1255 add TMP1, TMP1, OUTPUT_COL |
| 1254 add TMP2, TMP2, OUTPUT_COL | 1256 add TMP2, TMP2, OUTPUT_COL |
| (...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1326 | 1328 |
| 1327 .balign 8 | 1329 .balign 8 |
| 1328 jsimd_idct_2x2_neon_consts: | 1330 jsimd_idct_2x2_neon_consts: |
| 1329 .short -FIX_0_720959822 /* v14[0] */ | 1331 .short -FIX_0_720959822 /* v14[0] */ |
| 1330 .short FIX_0_850430095 /* v14[1] */ | 1332 .short FIX_0_850430095 /* v14[1] */ |
| 1331 .short -FIX_1_272758580 /* v14[2] */ | 1333 .short -FIX_1_272758580 /* v14[2] */ |
| 1332 .short FIX_3_624509785 /* v14[3] */ | 1334 .short FIX_3_624509785 /* v14[3] */ |
| 1333 | 1335 |
| 1334 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 | 1336 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 |
| 1335 sshll v15.4s, \x4, #15 | 1337 sshll v15.4s, \x4, #15 |
| 1336 smull v26.4s, \x6, v14.4h[3] | 1338 smull v26.4s, \x6, v14.h[3] |
| 1337 smlal v26.4s, \x10, v14.4h[2] | 1339 smlal v26.4s, \x10, v14.h[2] |
| 1338 smlal v26.4s, \x12, v14.4h[1] | 1340 smlal v26.4s, \x12, v14.h[1] |
| 1339 smlal v26.4s, \x16, v14.4h[0] | 1341 smlal v26.4s, \x16, v14.h[0] |
| 1340 | 1342 |
| 1341 add v20.4s, v15.4s, v26.4s | 1343 add v20.4s, v15.4s, v26.4s |
| 1342 sub v15.4s, v15.4s, v26.4s | 1344 sub v15.4s, v15.4s, v26.4s |
| 1343 | 1345 |
| 1344 .if \shift > 16 | 1346 .if \shift > 16 |
| 1345 srshr v20.4s, v20.4s, #\shift | 1347 srshr v20.4s, v20.4s, #\shift |
| 1346 srshr v15.4s, v15.4s, #\shift | 1348 srshr v15.4s, v15.4s, #\shift |
| 1347 xtn \y26, v20.4s | 1349 xtn \y26, v20.4s |
| 1348 xtn \y27, v15.4s | 1350 xtn \y27, v15.4s |
| 1349 .else | 1351 .else |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1393 add COEF_BLOCK, COEF_BLOCK, #16 | 1395 add COEF_BLOCK, COEF_BLOCK, #16 |
| 1394 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 | 1396 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 |
| 1395 add COEF_BLOCK, COEF_BLOCK, #16 | 1397 add COEF_BLOCK, COEF_BLOCK, #16 |
| 1396 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 | 1398 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 |
| 1397 add COEF_BLOCK, COEF_BLOCK, #16 | 1399 add COEF_BLOCK, COEF_BLOCK, #16 |
| 1398 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 | 1400 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
| 1399 /* Dequantize */ | 1401 /* Dequantize */ |
| 1400 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 | 1402 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
| 1401 mul v4.4h, v4.4h, v18.4h | 1403 mul v4.4h, v4.4h, v18.4h |
| 1402 mul v5.4h, v5.4h, v19.4h | 1404 mul v5.4h, v5.4h, v19.4h |
| 1403 ins v4.2d[1], v5.2d[0] | 1405 ins v4.d[1], v5.d[0] |
| 1404 mul v6.4h, v6.4h, v20.4h | 1406 mul v6.4h, v6.4h, v20.4h |
| 1405 mul v7.4h, v7.4h, v21.4h | 1407 mul v7.4h, v7.4h, v21.4h |
| 1406 ins v6.2d[1], v7.2d[0] | 1408 ins v6.d[1], v7.d[0] |
| 1407 add DCT_TABLE, DCT_TABLE, #16 | 1409 add DCT_TABLE, DCT_TABLE, #16 |
| 1408 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 | 1410 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 |
| 1409 mul v10.4h, v10.4h, v24.4h | 1411 mul v10.4h, v10.4h, v24.4h |
| 1410 mul v11.4h, v11.4h, v25.4h | 1412 mul v11.4h, v11.4h, v25.4h |
| 1411 ins v10.2d[1], v11.2d[0] | 1413 ins v10.d[1], v11.d[0] |
| 1412 add DCT_TABLE, DCT_TABLE, #16 | 1414 add DCT_TABLE, DCT_TABLE, #16 |
| 1413 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 | 1415 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 |
| 1414 mul v12.4h, v12.4h, v26.4h | 1416 mul v12.4h, v12.4h, v26.4h |
| 1415 mul v13.4h, v13.4h, v27.4h | 1417 mul v13.4h, v13.4h, v27.4h |
| 1416 ins v12.2d[1], v13.2d[0] | 1418 ins v12.d[1], v13.d[0] |
| 1417 add DCT_TABLE, DCT_TABLE, #16 | 1419 add DCT_TABLE, DCT_TABLE, #16 |
| 1418 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 | 1420 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
| 1419 mul v16.4h, v16.4h, v30.4h | 1421 mul v16.4h, v16.4h, v30.4h |
| 1420 mul v17.4h, v17.4h, v31.4h | 1422 mul v17.4h, v17.4h, v31.4h |
| 1421 ins v16.2d[1], v17.2d[0] | 1423 ins v16.d[1], v17.d[0] |
| 1422 | 1424 |
| 1423 /* Pass 1 */ | 1425 /* Pass 1 */ |
| 1424 #if 0 | 1426 #if 0 |
| 1425 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h | 1427 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h |
| 1426 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h | 1428 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h |
| 1427 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h | 1429 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h |
| 1428 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h | 1430 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h |
| 1429 #else | 1431 #else |
| 1430 smull v26.4s, v6.4h, v14.4h[3] | 1432 smull v26.4s, v6.4h, v14.h[3] |
| 1431 smlal v26.4s, v10.4h, v14.4h[2] | 1433 smlal v26.4s, v10.4h, v14.h[2] |
| 1432 smlal v26.4s, v12.4h, v14.4h[1] | 1434 smlal v26.4s, v12.4h, v14.h[1] |
| 1433 smlal v26.4s, v16.4h, v14.4h[0] | 1435 smlal v26.4s, v16.4h, v14.h[0] |
| 1434 smull v24.4s, v7.4h, v14.4h[3] | 1436 smull v24.4s, v7.4h, v14.h[3] |
| 1435 smlal v24.4s, v11.4h, v14.4h[2] | 1437 smlal v24.4s, v11.4h, v14.h[2] |
| 1436 smlal v24.4s, v13.4h, v14.4h[1] | 1438 smlal v24.4s, v13.4h, v14.h[1] |
| 1437 smlal v24.4s, v17.4h, v14.4h[0] | 1439 smlal v24.4s, v17.4h, v14.h[0] |
| 1438 sshll v15.4s, v4.4h, #15 | 1440 sshll v15.4s, v4.4h, #15 |
| 1439 sshll v30.4s, v5.4h, #15 | 1441 sshll v30.4s, v5.4h, #15 |
| 1440 add v20.4s, v15.4s, v26.4s | 1442 add v20.4s, v15.4s, v26.4s |
| 1441 sub v15.4s, v15.4s, v26.4s | 1443 sub v15.4s, v15.4s, v26.4s |
| 1442 rshrn v4.4h, v20.4s, #13 | 1444 rshrn v4.4h, v20.4s, #13 |
| 1443 rshrn v6.4h, v15.4s, #13 | 1445 rshrn v6.4h, v15.4s, #13 |
| 1444 add v20.4s, v30.4s, v24.4s | 1446 add v20.4s, v30.4s, v24.4s |
| 1445 sub v15.4s, v30.4s, v24.4s | 1447 sub v15.4s, v30.4s, v24.4s |
| 1446 rshrn v5.4h, v20.4s, #13 | 1448 rshrn v5.4h, v20.4s, #13 |
| 1447 rshrn v7.4h, v15.4s, #13 | 1449 rshrn v7.4h, v15.4s, #13 |
| 1448 ins v4.2d[1], v5.2d[0] | 1450 ins v4.d[1], v5.d[0] |
| 1449 ins v6.2d[1], v7.2d[0] | 1451 ins v6.d[1], v7.d[0] |
| 1450 transpose v4, v6, v3, .16b, .8h | 1452 transpose v4, v6, v3, .16b, .8h |
| 1451 transpose v6, v10, v3, .16b, .4s | 1453 transpose v6, v10, v3, .16b, .4s |
| 1452 ins v11.2d[0], v10.2d[1] | 1454 ins v11.d[0], v10.d[1] |
| 1453 ins v7.2d[0], v6.2d[1] | 1455 ins v7.d[0], v6.d[1] |
| 1454 #endif | 1456 #endif |
| 1455 | 1457 |
| 1456 /* Pass 2 */ | 1458 /* Pass 2 */ |
| 1457 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h | 1459 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h |
| 1458 | 1460 |
| 1459 /* Range limit */ | 1461 /* Range limit */ |
| 1460 movi v30.8h, #0x80 | 1462 movi v30.8h, #0x80 |
| 1461 ins v26.2d[1], v27.2d[0] | 1463 ins v26.d[1], v27.d[0] |
| 1462 add v26.8h, v26.8h, v30.8h | 1464 add v26.8h, v26.8h, v30.8h |
| 1463 sqxtun v30.8b, v26.8h | 1465 sqxtun v30.8b, v26.8h |
| 1464 ins v26.2d[0], v30.2d[0] | 1466 ins v26.d[0], v30.d[0] |
| 1465 sqxtun v27.8b, v26.8h | 1467 sqxtun v27.8b, v26.8h |
| 1466 | 1468 |
| 1467 /* Store results to the output buffer */ | 1469 /* Store results to the output buffer */ |
| 1468 ldp TMP1, TMP2, [OUTPUT_BUF] | 1470 ldp TMP1, TMP2, [OUTPUT_BUF] |
| 1469 add TMP1, TMP1, OUTPUT_COL | 1471 add TMP1, TMP1, OUTPUT_COL |
| 1470 add TMP2, TMP2, OUTPUT_COL | 1472 add TMP2, TMP2, OUTPUT_COL |
| 1471 | 1473 |
| 1472 st1 {v26.b}[0], [TMP1], 1 | 1474 st1 {v26.b}[0], [TMP1], 1 |
| 1473 st1 {v27.b}[4], [TMP1], 1 | 1475 st1 {v27.b}[4], [TMP1], 1 |
| 1474 st1 {v26.b}[1], [TMP2], 1 | 1476 st1 {v26.b}[1], [TMP2], 1 |
| (...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1597 .endif | 1599 .endif |
| 1598 .endm | 1600 .endm |
| 1599 | 1601 |
| 1600 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize | 1602 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize |
| 1601 | 1603 |
| 1602 /* | 1604 /* |
| 1603 * 2-stage pipelined YCbCr->RGB conversion | 1605 * 2-stage pipelined YCbCr->RGB conversion |
| 1604 */ | 1606 */ |
| 1605 | 1607 |
| 1606 .macro do_yuv_to_rgb_stage1 | 1608 .macro do_yuv_to_rgb_stage1 |
| 1607 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ | 1609 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ |
| 1608 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ | 1610 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ |
| 1609 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1611 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
| 1610 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1612 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
| 1611 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1613 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
| 1612 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1614 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
| 1613 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1615 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
| 1614 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1616 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
| 1615 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | 1617 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ |
| 1616 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | 1618 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ |
| 1617 .endm | 1619 .endm |
| 1618 | 1620 |
| 1619 .macro do_yuv_to_rgb_stage2 | 1621 .macro do_yuv_to_rgb_stage2 |
| 1620 rshrn v20.4h, v20.4s, #15 | 1622 rshrn v20.4h, v20.4s, #15 |
| 1621 rshrn2 v20.8h, v22.4s, #15 | 1623 rshrn2 v20.8h, v22.4s, #15 |
| 1622 rshrn v24.4h, v24.4s, #14 | 1624 rshrn v24.4h, v24.4s, #14 |
| 1623 rshrn2 v24.8h, v26.4s, #14 | 1625 rshrn2 v24.8h, v26.4s, #14 |
| 1624 rshrn v28.4h, v28.4s, #14 | 1626 rshrn v28.4h, v28.4s, #14 |
| 1625 rshrn2 v28.8h, v30.4s, #14 | 1627 rshrn2 v28.8h, v30.4s, #14 |
| 1626 uaddw v20.8h, v20.8h, v0.8b | 1628 uaddw v20.8h, v20.8h, v0.8b |
| (...skipping 26 matching lines...) Expand all Loading... | |
| 1653 uaddw v24.8h, v24.8h, v0.8b | 1655 uaddw v24.8h, v24.8h, v0.8b |
| 1654 uaddw v28.8h, v28.8h, v0.8b | 1656 uaddw v28.8h, v28.8h, v0.8b |
| 1655 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ | 1657 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ |
| 1656 sqxtun v1\g_offs\defsize, v20.8h | 1658 sqxtun v1\g_offs\defsize, v20.8h |
| 1657 ld1 {v0.8b}, [Y], 8 | 1659 ld1 {v0.8b}, [Y], 8 |
| 1658 sqxtun v1\r_offs\defsize, v24.8h | 1660 sqxtun v1\r_offs\defsize, v24.8h |
| 1659 prfm PLDL1KEEP, [U, #64] | 1661 prfm PLDL1KEEP, [U, #64] |
| 1660 prfm PLDL1KEEP, [V, #64] | 1662 prfm PLDL1KEEP, [V, #64] |
| 1661 prfm PLDL1KEEP, [Y, #64] | 1663 prfm PLDL1KEEP, [Y, #64] |
| 1662 sqxtun v1\b_offs\defsize, v28.8h | 1664 sqxtun v1\b_offs\defsize, v28.8h |
| 1663 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ | 1665 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
| 1664 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ | 1666 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ |
| 1665 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1667 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
| 1666 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1668 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
| 1667 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1669 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
| 1668 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1670 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
| 1669 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1671 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
| 1670 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1672 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
| 1671 .else /**************************** rgb565 ***********************************/ | 1673 .else /**************************** rgb565 ***********************************/ |
| 1672 sqshlu v21.8h, v20.8h, #8 | 1674 sqshlu v21.8h, v20.8h, #8 |
| 1673 sqshlu v25.8h, v24.8h, #8 | 1675 sqshlu v25.8h, v24.8h, #8 |
| 1674 sqshlu v29.8h, v28.8h, #8 | 1676 sqshlu v29.8h, v28.8h, #8 |
| 1675 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ | 1677 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
| 1676 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ | 1678 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ |
| 1677 ld1 {v0.8b}, [Y], 8 | 1679 ld1 {v0.8b}, [Y], 8 |
| 1678 smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ | 1680 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ |
| 1679 smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ | 1681 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ |
| 1680 smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ | 1682 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ |
| 1681 smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ | 1683 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ |
| 1682 sri v25.8h, v21.8h, #5 | 1684 sri v25.8h, v21.8h, #5 |
| 1683 smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ | 1685 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ |
| 1684 smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ | 1686 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ |
| 1685 prfm PLDL1KEEP, [U, #64] | 1687 prfm PLDL1KEEP, [U, #64] |
| 1686 prfm PLDL1KEEP, [V, #64] | 1688 prfm PLDL1KEEP, [V, #64] |
| 1687 prfm PLDL1KEEP, [Y, #64] | 1689 prfm PLDL1KEEP, [Y, #64] |
| 1688 sri v25.8h, v29.8h, #11 | 1690 sri v25.8h, v29.8h, #11 |
| 1689 .endif | 1691 .endif |
| 1690 do_store \bpp, 8 | 1692 do_store \bpp, 8 |
| 1691 smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ | 1693 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ |
| 1692 smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ | 1694 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ |
| 1693 .endm | 1695 .endm |
| 1694 | 1696 |
| 1695 .macro do_yuv_to_rgb | 1697 .macro do_yuv_to_rgb |
| 1696 do_yuv_to_rgb_stage1 | 1698 do_yuv_to_rgb_stage1 |
| 1697 do_yuv_to_rgb_stage2 | 1699 do_yuv_to_rgb_stage2 |
| 1698 .endm | 1700 .endm |
| 1699 | 1701 |
| 1700 /* Apple gas crashes on adrl, work around that by using adr. | 1702 /* Apple gas crashes on adrl, work around that by using adr. |
| 1701 * But this requires a copy of these constants for each function. | 1703 * But this requires a copy of these constants for each function. |
| 1702 */ | 1704 */ |
| (...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1852 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */ | 1854 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */ |
| 1853 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b | 1855 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b |
| 1854 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b | 1856 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b |
| 1855 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b | 1857 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b |
| 1856 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b | 1858 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b |
| 1857 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b | 1859 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b |
| 1858 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b | 1860 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b |
| 1859 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b | 1861 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b |
| 1860 .purgem do_load | 1862 .purgem do_load |
| 1861 .purgem do_store | 1863 .purgem do_store |
| OLD | NEW |